diff options
author | trav90 <travawine@palemoon.org> | 2018-10-19 23:00:02 -0500 |
---|---|---|
committer | trav90 <travawine@palemoon.org> | 2018-10-19 23:00:02 -0500 |
commit | b8df135c97a854c2ff9b4394b016649c601177fa (patch) | |
tree | 802b7de5ad245f1a12adbcef835ab0d0687c1bf8 /third_party | |
parent | a4d3c59dcac642f6b9557dc09b60eda40b517630 (diff) | |
download | UXP-b8df135c97a854c2ff9b4394b016649c601177fa.tar UXP-b8df135c97a854c2ff9b4394b016649c601177fa.tar.gz UXP-b8df135c97a854c2ff9b4394b016649c601177fa.tar.lz UXP-b8df135c97a854c2ff9b4394b016649c601177fa.tar.xz UXP-b8df135c97a854c2ff9b4394b016649c601177fa.zip |
Update libaom to rev b25610052a1398032320008d69b51d2da94f5928
Diffstat (limited to 'third_party')
210 files changed, 16335 insertions, 5545 deletions
diff --git a/third_party/aom/.cmake-format.py b/third_party/aom/.cmake-format.py index cebad0742..aa7354c2a 100644 --- a/third_party/aom/.cmake-format.py +++ b/third_party/aom/.cmake-format.py @@ -1,3 +1,4 @@ +# Generated with cmake-format 0.3.6 # How wide to allow formatted cmake files line_width = 80 diff --git a/third_party/aom/CHANGELOG b/third_party/aom/CHANGELOG index 7510dc660..d84aa0249 100644 --- a/third_party/aom/CHANGELOG +++ b/third_party/aom/CHANGELOG @@ -1,631 +1,5 @@ -Next Release - - Incompatible changes: - The AV1 encoder's default keyframe interval changed to 128 from 9999. - Support for armv6 was removed. +2018-06-28 v1.0.0 + AOMedia Codec Workgroup Approved version 1.0 2016-04-07 v0.1.0 "AOMedia Codec 1" This release is the first Alliance for Open Media codec. -2015-11-09 v1.5.0 "Javan Whistling Duck" - This release improves upon the VP9 encoder and speeds up the encoding and - decoding processes. - - - Upgrading: - This release is ABI incompatible with 1.4.0. It drops deprecated VP8 - controls and adds a variety of VP9 controls for testing. - - The vpxenc utility now prefers VP9 by default. - - - Enhancements: - Faster VP9 encoding and decoding - Smaller library size by combining functions used by VP8 and VP9 - - - Bug Fixes: - A variety of fuzzing issues - -2015-04-03 v1.4.0 "Indian Runner Duck" - This release includes significant improvements to the VP9 codec. - - - Upgrading: - This release is ABI incompatible with 1.3.0. It drops the compatibility - layer, requiring VPX_IMG_FMT_* instead of IMG_FMT_*, and adds several codec - controls for VP9. - - - Enhancements: - Faster VP9 encoding and decoding - Multithreaded VP9 decoding (tile and frame-based) - Multithreaded VP9 encoding - on by default - YUV 4:2:2 and 4:4:4 support in VP9 - 10 and 12bit support in VP9 - 64bit ARM support by replacing ARM assembly with intrinsics - - - Bug Fixes: - Fixes a VP9 bitstream issue in Profile 1. This only affected non-YUV 4:2:0 - files. - - - Known Issues: - Frame Parallel decoding fails for segmented and non-420 files. - -2013-11-15 v1.3.0 "Forest" - This release introduces the VP9 codec in a backward-compatible way. - All existing users of VP8 can continue to use the library without - modification. However, some VP8 options do not map to VP9 in the same manner. - - The VP9 encoder in this release is not feature complete. Users interested in - the encoder are advised to use the git master branch and discuss issues on - libvpx mailing lists. - - - Upgrading: - This release is ABI and API compatible with Duclair (v1.0.0). Users - of older releases should refer to the Upgrading notes in this document - for that release. - - - Enhancements: - Get rid of bashisms in the main build scripts - Added usage info on command line options - Add lossless compression mode - Dll build of libvpx - Add additional Mac OS X targets: 10.7, 10.8 and 10.9 (darwin11-13) - Add option to disable documentation - configure: add --enable-external-build support - make: support V=1 as short form of verbose=yes - configure: support mingw-w64 - configure: support hardfloat armv7 CHOSTS - configure: add support for android x86 - Add estimated completion time to vpxenc - Don't exit on decode errors in vpxenc - vpxenc: support scaling prior to encoding - vpxdec: support scaling output - vpxenc: improve progress indicators with --skip - msvs: Don't link to winmm.lib - Add a new script for producing vcxproj files - Produce Visual Studio 10 and 11 project files - Produce Windows Phone project files - msvs-build: use msbuild for vs >= 2005 - configure: default configure log to config.log - Add encoding option --static-thresh - - - Speed: - Miscellaneous speed optimizations for VP8 and VP9. - - - Quality: - In general, quality is consistent with the Eider release. - - - Bug Fixes: - This release represents approximately a year of engineering effort, - and contains multiple bug fixes. Please refer to git history for details. - - -2012-12-21 v1.2.0 - This release acts as a checkpoint for a large amount of internal refactoring - and testing. It also contains a number of small bugfixes, so all users are - encouraged to upgrade. - - - Upgrading: - This release is ABI and API compatible with Duclair (v1.0.0). Users - of older releases should refer to the Upgrading notes in this - document for that release. - - - Enhancements: - VP8 optimizations for MIPS dspr2 - vpxenc: add -quiet option - - - Speed: - Encoder and decoder speed is consistent with the Eider release. - - - Quality: - In general, quality is consistent with the Eider release. - - Minor tweaks to ARNR filtering - Minor improvements to real time encoding with multiple temporal layers - - - Bug Fixes: - Fixes multithreaded encoder race condition in loopfilter - Fixes multi-resolution threaded encoding - Fix potential encoder dead-lock after picture resize - - -2012-05-09 v1.1.0 "Eider" - This introduces a number of enhancements, mostly focused on real-time - encoding. In addition, it fixes a decoder bug (first introduced in - Duclair) so all users of that release are encouraged to upgrade. - - - Upgrading: - This release is ABI and API compatible with Duclair (v1.0.0). Users - of older releases should refer to the Upgrading notes in this - document for that release. - - This release introduces a new temporal denoiser, controlled by the - VP8E_SET_NOISE_SENSITIVITY control. The temporal denoiser does not - currently take a strength parameter, so the control is effectively - a boolean - zero (off) or non-zero (on). For compatibility with - existing applications, the values accepted are the same as those - for the spatial denoiser (0-6). The temporal denoiser is enabled - by default, and the older spatial denoiser may be restored by - configuring with --disable-temporal-denoising. The temporal denoiser - is more computationally intensive than the spatial one. - - This release removes support for a legacy, decode only API that was - supported, but deprecated, at the initial release of libvpx - (v0.9.0). This is not expected to have any impact. If you are - impacted, you can apply a reversion to commit 2bf8fb58 locally. - Please update to the latest libvpx API if you are affected. - - - Enhancements: - Adds a motion compensated temporal denoiser to the encoder, which - gives higher quality than the older spatial denoiser. (See above - for notes on upgrading). - - In addition, support for new compilers and platforms were added, - including: - improved support for XCode - Android x86 NDK build - OS/2 support - SunCC support - - Changing resolution with vpx_codec_enc_config_set() is now - supported. Previously, reinitializing the codec was required to - change the input resolution. - - The vpxenc application has initial support for producing multiple - encodes from the same input in one call. Resizing is not yet - supported, but varying other codec parameters is. Use -- to - delineate output streams. Options persist from one stream to the - next. - - Also, the vpxenc application will now use a keyframe interval of - 5 seconds by default. Use the --kf-max-dist option to override. - - - Speed: - Decoder performance improved 2.5% versus Duclair. Encoder speed is - consistent with Duclair for most material. Two pass encoding of - slideshow-like material will see significant improvements. - - Large realtime encoding speed gains at a small quality expense are - possible by configuring the on-the-fly bitpacking experiment with - --enable-onthefly-bitpacking. Realtime encoder can be up to 13% - faster (ARM) depending on the number of threads and bitrate - settings. This technique sees constant gain over the 5-16 speed - range. For VC style input the loss seen is up to 0.2dB. See commit - 52cf4dca for further details. - - - Quality: - On the whole, quality is consistent with the Duclair release. Some - tweaks: - - Reduced blockiness in easy sections by applying a penalty to - intra modes. - - Improved quality of static sections (like slideshows) with - two pass encoding. - - Improved keyframe sizing with multiple temporal layers - - - Bug Fixes: - Corrected alt-ref contribution to frame rate for visible updates - to the alt-ref buffer. This affected applications making manual - usage of the frame reference flags, or temporal layers. - - Additional constraints were added to disable multi-frame quality - enhancement (MFQE) in sections of the frame where there is motion. - (#392) - - Fixed corruption issues when vpx_codec_enc_config_set() was called - with spatial resampling enabled. - - Fixed a decoder error introduced in Duclair where the segmentation - map was not being reinitialized on keyframes (#378) - - -2012-01-27 v1.0.0 "Duclair" - Our fourth named release, focused on performance and features related to - real-time encoding. It also fixes a decoder crash bug introduced in - v0.9.7, so all users of that release are encouraged to upgrade. - - - Upgrading: - This release is ABI incompatible with prior releases of libvpx, so the - "major" version number has been bumped to 1. You must recompile your - applications against the latest version of the libvpx headers. The - API remains compatible, and this should not require code changes in most - applications. - - - Enhancements: - This release introduces several substantial new features to the encoder, - of particular interest to real time streaming applications. - - Temporal scalability allows the encoder to produce a stream that can - be decimated to different frame rates, with independent rate targetting - for each substream. - - Multiframe quality enhancement postprocessing can make visual quality - more consistent in the presence of frames that are substantially - different quality than the surrounding frames, as in the temporal - scalability case and in some forced keyframe scenarios. - - Multiple-resolution encoding support allows the encoding of the - same content at different resolutions faster than encoding them - separately. - - - Speed: - Optimization targets for this release included the decoder and the real- - time modes of the encoder. Decoder speed on x86 has improved 10.5% with - this release. Encoder improvements followed a curve where speeds 1-3 - improved 4.0%-1.5%, speeds 4-8 improved <1%, and speeds 9-16 improved - 1.5% to 10.5%, respectively. "Best" mode speed is consistent with the - Cayuga release. - - - Quality: - Encoder quality in the single stream case is consistent with the Cayuga - release. - - - Bug Fixes: - This release fixes an OOB read decoder crash bug present in v0.9.7 - related to the clamping of motion vectors in SPLITMV blocks. This - behavior could be triggered by corrupt input or by starting - decoding from a P-frame. - - -2011-08-15 v0.9.7-p1 "Cayuga" patch 1 - This is an incremental bugfix release against Cayuga. All users of that - release are strongly encouraged to upgrade. - - - Fix potential OOB reads (cdae03a) - - An unbounded out of bounds read was discovered when the - decoder was requested to perform error concealment (new in - Cayuga) given a frame with corrupt partition sizes. - - A bounded out of bounds read was discovered affecting all - versions of libvpx. Given an multipartition input frame that - is truncated between the mode/mv partition and the first - residiual paritition (in the block of partition offsets), up - to 3 extra bytes could have been read from the source buffer. - The code will not take any action regardless of the contents - of these undefined bytes, as the truncated buffer is detected - immediately following the read based on the calculated - starting position of the coefficient partition. - - - Fix potential error concealment crash when the very first frame - is missing or corrupt (a609be5) - - - Fix significant artifacts in error concealment (a4c2211, 99d870a) - - - Revert 1-pass CBR rate control changes (e961317) - Further testing showed this change produced undesirable visual - artifacts, rolling back for now. - - -2011-08-02 v0.9.7 "Cayuga" - Our third named release, focused on a faster, higher quality, encoder. - - - Upgrading: - This release is backwards compatible with Aylesbury (v0.9.5) and - Bali (v0.9.6). Users of older releases should refer to the Upgrading - notes in this document for that release. - - - Enhancements: - Stereo 3D format support for vpxenc - Runtime detection of available processor cores. - Allow specifying --end-usage by enum name - vpxdec: test for frame corruption - vpxenc: add quantizer histogram display - vpxenc: add rate histogram display - Set VPX_FRAME_IS_DROPPABLE - update configure for ios sdk 4.3 - Avoid text relocations in ARM vp8 decoder - Generate a vpx.pc file for pkg-config. - New ways of passing encoded data between encoder and decoder. - - - Speed: - This release includes across-the-board speed improvements to the - encoder. On x86, these measure at approximately 11.5% in Best mode, - 21.5% in Good mode (speed 0), and 22.5% in Realtime mode (speed 6). - On ARM Cortex A9 with Neon extensions, real-time encoding of video - telephony content is 35% faster than Bali on single core and 48% - faster on multi-core. On the NVidia Tegra2 platform, real time - encoding is 40% faster than Bali. - - Decoder speed was not a priority for this release, but improved - approximately 8.4% on x86. - - Reduce motion vector search on alt-ref frame. - Encoder loopfilter running in its own thread - Reworked loopfilter to precalculate more parameters - SSE2/SSSE3 optimizations for build_predictors_mbuv{,_s}(). - Make hor UV predict ~2x faster (73 vs 132 cycles) using SSSE3. - Removed redundant checks - Reduced structure sizes - utilize preload in ARMv6 MC/LPF/Copy routines - ARM optimized quantization, dfct, variance, subtract - Increase chrow row alignment to 16 bytes. - disable trellis optimization for first pass - Write SSSE3 sub-pixel filter function - Improve SSE2 half-pixel filter funtions - Add vp8_sub_pixel_variance16x8_ssse3 function - Reduce unnecessary distortion computation - Use diamond search to replace full search - Preload reference area in sub-pixel motion search (real-time mode) - - - Quality: - This release focused primarily on one-pass use cases, including - video conferencing. Low latency data rate control was significantly - improved, improving streamability over bandwidth constrained links. - Added support for error concealment, allowing frames to maintain - visual quality in the presence of substantial packet loss. - - Add rc_max_intra_bitrate_pct control - Limit size of initial keyframe in one-pass. - Improve framerate adaptation - Improved 1-pass CBR rate control - Improved KF insertion after fades to still. - Improved key frame detection. - Improved activity masking (lower PSNR impact for same SSIM boost) - Improved interaction between GF and ARFs - Adding error-concealment to the decoder. - Adding support for independent partitions - Adjusted rate-distortion constants - - - - Bug Fixes: - Removed firstpass motion map - Fix parallel make install - Fix multithreaded encoding for 1 MB wide frame - Fixed iwalsh_neon build problems with RVDS4.1 - Fix semaphore emulation, spin-wait intrinsics on Windows - Fix build with xcode4 and simplify GLOBAL. - Mark ARM asm objects as allowing a non-executable stack. - Fix vpxenc encoding incorrect webm file header on big endian - - -2011-03-07 v0.9.6 "Bali" - Our second named release, focused on a faster, higher quality, encoder. - - - Upgrading: - This release is backwards compatible with Aylesbury (v0.9.5). Users - of older releases should refer to the Upgrading notes in this - document for that release. - - - Enhancements: - vpxenc --psnr shows a summary when encode completes - --tune=ssim option to enable activity masking - improved postproc visualizations for development - updated support for Apple iOS to SDK 4.2 - query decoder to determine which reference frames were updated - implemented error tracking in the decoder - fix pipe support on windows - - - Speed: - Primary focus was on good quality mode, speed 0. Average improvement - on x86 about 40%, up to 100% on user-generated content at that speed. - Best quality mode speed improved 35%, and realtime speed 10-20%. This - release also saw significant improvement in realtime encoding speed - on ARM platforms. - - Improved encoder threading - Dont pick encoder filter level when loopfilter is disabled. - Avoid double copying of key frames into alt and golden buffer - FDCT optimizations. - x86 sse2 temporal filter - SSSE3 version of fast quantizer - vp8_rd_pick_best_mbsegmentation code restructure - Adjusted breakout RD for SPLITMV - Changed segmentation check order - Improved rd_pick_intra4x4block - Adds armv6 optimized variance calculation - ARMv6 optimized sad16x16 - ARMv6 optimized half pixel variance calculations - Full search SAD function optimization in SSE4.1 - Improve MV prediction accuracy to achieve performance gain - Improve MV prediction in vp8_pick_inter_mode() for speed>3 - - - Quality: - Best quality mode improved PSNR 6.3%, and SSIM 6.1%. This release - also includes support for "activity masking," which greatly improves - SSIM at the expense of PSNR. For now, this feature is available with - the --tune=ssim option. Further experimentation in this area - is ongoing. This release also introduces a new rate control mode - called "CQ," which changes the allocation of bits within a clip to - the sections where they will have the most visual impact. - - Tuning for the more exact quantizer. - Relax rate control for last few frames - CQ Mode - Limit key frame quantizer for forced key frames. - KF/GF Pulsing - Add simple version of activity masking. - make rdmult adaptive for intra in quantizer RDO - cap the best quantizer for 2nd order DC - change the threshold of DC check for encode breakout - - - Bug Fixes: - Fix crash on Sparc Solaris. - Fix counter of fixed keyframe distance - ARNR filter pointer update bug fix - Fixed use of motion percentage in KF/GF group calc - Changed condition for using RD in Intra Mode - Fix encoder real-time only configuration. - Fix ARM encoder crash with multiple token partitions - Fixed bug first cluster timecode of webm file is wrong. - Fixed various encoder bugs with odd-sized images - vp8e_get_preview fixed when spatial resampling enabled - quantizer: fix assertion in fast quantizer path - Allocate source buffers to be multiples of 16 - Fix for manual Golden frame frequency - Fix drastic undershoot in long form content - - -2010-10-28 v0.9.5 "Aylesbury" - Our first named release, focused on a faster decoder, and a better encoder. - - - Upgrading: - This release incorporates backwards-incompatible changes to the - ivfenc and ivfdec tools. These tools are now called vpxenc and vpxdec. - - vpxdec - * the -q (quiet) option has been removed, and replaced with - -v (verbose). the output is quiet by default. Use -v to see - the version number of the binary. - - * The default behavior is now to write output to a single file - instead of individual frames. The -y option has been removed. - Y4M output is the default. - - * For raw I420/YV12 output instead of Y4M, the --i420 or --yv12 - options must be specified. - - $ ivfdec -o OUTPUT INPUT - $ vpxdec --i420 -o OUTPUT INPUT - - * If an output file is not specified, the default is to write - Y4M to stdout. This makes piping more natural. - - $ ivfdec -y -o - INPUT | ... - $ vpxdec INPUT | ... - - * The output file has additional flexibility for formatting the - filename. It supports escape characters for constructing a - filename from the width, height, and sequence number. This - replaces the -p option. To get the equivalent: - - $ ivfdec -p frame INPUT - $ vpxdec --i420 -o frame-%wx%h-%4.i420 INPUT - - vpxenc - * The output file must be specified with -o, rather than as the - last argument. - - $ ivfenc <options> INPUT OUTPUT - $ vpxenc <options> -o OUTPUT INPUT - - * The output defaults to webm. To get IVF output, use the --ivf - option. - - $ ivfenc <options> INPUT OUTPUT.ivf - $ vpxenc <options> -o OUTPUT.ivf --ivf INPUT - - - - Enhancements: - ivfenc and ivfdec have been renamed to vpxenc, vpxdec. - vpxdec supports .webm input - vpxdec writes .y4m by default - vpxenc writes .webm output by default - vpxenc --psnr now shows the average/overall PSNR at the end - ARM platforms now support runtime cpu detection - vpxdec visualizations added for motion vectors, block modes, references - vpxdec now silent by default - vpxdec --progress shows frame-by-frame timing information - vpxenc supports the distinction between --fps and --timebase - NASM is now a supported assembler - configure: enable PIC for shared libs by default - configure: add --enable-small - configure: support for ppc32-linux-gcc - configure: support for sparc-solaris-gcc - - - Bugs: - Improve handling of invalid frames - Fix valgrind errors in the NEON loop filters. - Fix loopfilter delta zero transitions - Fix valgrind errors in vp8_sixtap_predict8x4_armv6(). - Build fixes for darwin-icc - - - Speed: - 20-40% (average 28%) improvement in libvpx decoder speed, - including: - Rewrite vp8_short_walsh4x4_sse2() - Optimizations on the loopfilters. - Miscellaneous improvements for Atom - Add 4-tap version of 2nd-pass ARMv6 MC filter. - Improved multithread utilization - Better instruction choices on x86 - reorder data to use wider instructions - Update NEON wide idcts - Make block access to frame buffer sequential - Improved subset block search - Bilinear subpixel optimizations for ssse3. - Decrease memory footprint - - Encoder speed improvements (percentage gain not measured): - Skip unnecessary search of identical frames - Add SSE2 subtract functions - Improve bounds checking in vp8_diamond_search_sadx4() - Added vp8_fast_quantize_b_sse2 - - - Quality: - Over 7% overall PSNR improvement (6.3% SSIM) in "best" quality - encoding mode, and up to 60% improvement on very noisy, still - or slow moving source video - - Motion compensated temporal filter for Alt-Ref Noise Reduction - Improved use of trellis quantization on 2nd order Y blocks - Tune effect of motion on KF/GF boost in two pass - Allow coefficient optimization for good quality speed 0. - Improved control of active min quantizer for two pass. - Enable ARFs for non-lagged compress - -2010-09-02 v0.9.2 - - Enhancements: - Disable frame dropping by default - Improved multithreaded performance - Improved Force Key Frame Behaviour - Increased rate control buffer level precision - Fix bug in 1st pass motion compensation - ivfenc: correct fixed kf interval, --disable-kf - - Speed: - Changed above and left context data layout - Rework idct calling structure. - Removed unnecessary MB_MODE_INFO copies - x86: SSSE3 sixtap prediction - Reworked IDCT to include reconstruction (add) step - Swap alt/gold/new/last frame buffer ptrs instead of copying. - Improve SSE2 loopfilter functions - Change bitreader to use a larger window. - Avoid loopfilter reinitialization when possible - - Quality: - Normalize quantizer's zero bin and rounding factors - Add trellis quantization. - Make the quantizer exact. - Updates to ARNR filtering algorithm - Fix breakout thresh computation for golden & AltRef frames - Redo the forward 4x4 dct - Improve the accuracy of forward walsh-hadamard transform - Further adjustment of RD behaviour with Q and Zbin. - - Build System: - Allow linking of libs built with MinGW to MSVC - Fix target auto-detection on mingw32 - Allow --cpu= to work for x86. - configure: pass original arguments through to make dist - Fix builds without runtime CPU detection - msvs: fix install of codec sources - msvs: Change devenv.com command line for better msys support - msvs: Add vs9 targets. - Add x86_64-linux-icc target - - Bugs: - Potential crashes on older MinGW builds - Fix two-pass framrate for Y4M input. - Fixed simple loop filter, other crashes on ARM v6 - arm: fix missing dependency with --enable-shared - configure: support directories containing .o - Replace pinsrw (SSE) with MMX instructions - apple: include proper mach primatives - Fixed rate control bug with long key frame interval. - Fix DSO link errors on x86-64 when not using a version script - Fixed buffer selection for UV in AltRef filtering - - -2010-06-17 v0.9.1 - - Enhancements: - * ivfenc/ivfdec now support YUV4MPEG2 input and pipe I/O - * Speed optimizations - - Bugfixes: - * Rate control - * Prevent out-of-bounds accesses on invalid data - - Build system updates: - * Detect toolchain to be used automatically for native builds - * Support building shared libraries - * Better autotools emulation (--prefix, --libdir, DESTDIR) - - Updated LICENSE - * http://webmproject.blogspot.com/2010/06/changes-to-webm-open-source-license.html - - -2010-05-18 v0.9.0 - - Initial open source release. Welcome to WebM and VP8! - diff --git a/third_party/aom/CMakeLists.txt b/third_party/aom/CMakeLists.txt index 0f6a37ffb..0b445722d 100644 --- a/third_party/aom/CMakeLists.txt +++ b/third_party/aom/CMakeLists.txt @@ -186,11 +186,9 @@ list(APPEND AOM_ENCODER_APP_UTIL_SOURCES "${AOM_ROOT}/examples/encoder_util.h" "${AOM_ROOT}/examples/encoder_util.c") -if (ENABLE_EXAMPLES) - list(APPEND AOM_ENCODER_STATS_SOURCES "${AOM_ROOT}/stats/aomstats.c" - "${AOM_ROOT}/stats/aomstats.h" "${AOM_ROOT}/stats/rate_hist.c" - "${AOM_ROOT}/stats/rate_hist.h") -endif () +list(APPEND AOM_ENCODER_STATS_SOURCES "${AOM_ROOT}/stats/aomstats.c" + "${AOM_ROOT}/stats/aomstats.h" "${AOM_ROOT}/stats/rate_hist.c" + "${AOM_ROOT}/stats/rate_hist.h") list(APPEND AOM_PKG_CONFIG_SOURCES "${AOM_CONFIG_DIR}/aom.pc") diff --git a/third_party/aom/aom/aomcx.h b/third_party/aom/aom/aomcx.h index 4cdb5d332..e77e5f693 100644 --- a/third_party/aom/aom/aomcx.h +++ b/third_party/aom/aom/aomcx.h @@ -854,6 +854,12 @@ enum aome_enc_control_id { /*!\brief Codec control function to set the path to the film grain parameters */ AV1E_SET_FILM_GRAIN_TABLE, + + /*!\brief Sets the noise level */ + AV1E_SET_DENOISE_NOISE_LEVEL, + + /*!\brief Sets the denoisers block size */ + AV1E_SET_DENOISE_BLOCK_SIZE, }; /*!\brief aom 1-D scaling mode @@ -1165,6 +1171,14 @@ AOM_CTRL_USE_TYPE(AV1E_SET_FILM_GRAIN_TABLE, const char *) AOM_CTRL_USE_TYPE(AV1E_SET_CDF_UPDATE_MODE, int) #define AOM_CTRL_AV1E_SET_CDF_UPDATE_MODE +#ifdef CONFIG_DENOISE +AOM_CTRL_USE_TYPE(AV1E_SET_DENOISE_NOISE_LEVEL, int); +#define AOM_CTRL_AV1E_SET_DENOISE_NOISE_LEVEL + +AOM_CTRL_USE_TYPE(AV1E_SET_DENOISE_BLOCK_SIZE, unsigned int); +#define AOM_CTRL_AV1E_SET_DENOISE_BLOCK_SIZE +#endif + /*!\endcond */ /*! @} - end defgroup aom_encoder */ #ifdef __cplusplus diff --git a/third_party/aom/aom/aomdx.h b/third_party/aom/aom/aomdx.h index 7ff21a59b..50ff22410 100644 --- a/third_party/aom/aom/aomdx.h +++ b/third_party/aom/aom/aomdx.h @@ -119,6 +119,12 @@ enum aom_dec_control_id { /** control function to get the bit depth of the stream. */ AV1D_GET_BIT_DEPTH, + /** control function to get the image format of the stream. */ + AV1D_GET_IMG_FORMAT, + + /** control function to get the size of the tile. */ + AV1D_GET_TILE_SIZE, + /** control function to set the byte alignment of the planes in the reference * buffers. Valid values are power of 2, from 32 to 1024. A value of 0 sets * legacy alignment. I.e. Y plane is aligned to 32 bytes, U plane directly @@ -187,6 +193,12 @@ enum aom_dec_control_id { */ AV1D_EXT_TILE_DEBUG, + /** control function to enable the row based multi-threading of decoding. A + * value that is equal to 1 indicates that row based multi-threading is + * enabled. + */ + AV1D_SET_ROW_MT, + /** control function to indicate whether bitstream is in Annex-B format. */ AV1D_SET_IS_ANNEXB, @@ -238,6 +250,10 @@ AOM_CTRL_USE_TYPE(AV1D_GET_DISPLAY_SIZE, int *) #define AOM_CTRL_AV1D_GET_DISPLAY_SIZE AOM_CTRL_USE_TYPE(AV1D_GET_BIT_DEPTH, unsigned int *) #define AOM_CTRL_AV1D_GET_BIT_DEPTH +AOM_CTRL_USE_TYPE(AV1D_GET_IMG_FORMAT, aom_img_fmt_t *) +#define AOM_CTRL_AV1D_GET_IMG_FORMAT +AOM_CTRL_USE_TYPE(AV1D_GET_TILE_SIZE, unsigned int *) +#define AOM_CTRL_AV1D_GET_TILE_SIZE AOM_CTRL_USE_TYPE(AV1D_GET_FRAME_SIZE, int *) #define AOM_CTRL_AV1D_GET_FRAME_SIZE AOM_CTRL_USE_TYPE(AV1_INVERT_TILE_DECODE_ORDER, int) @@ -258,6 +274,8 @@ AOM_CTRL_USE_TYPE(AV1D_SET_EXT_REF_PTR, av1_ext_ref_frame_t *) #define AOM_CTRL_AV1D_SET_EXT_REF_PTR AOM_CTRL_USE_TYPE(AV1D_EXT_TILE_DEBUG, unsigned int) #define AOM_CTRL_AV1D_EXT_TILE_DEBUG +AOM_CTRL_USE_TYPE(AV1D_SET_ROW_MT, unsigned int) +#define AOM_CTRL_AV1D_SET_ROW_MT AOM_CTRL_USE_TYPE(AV1D_SET_IS_ANNEXB, unsigned int) #define AOM_CTRL_AV1D_SET_IS_ANNEXB AOM_CTRL_USE_TYPE(AV1D_SET_OPERATING_POINT, int) diff --git a/third_party/aom/aom/internal/aom_codec_internal.h b/third_party/aom/aom/internal/aom_codec_internal.h index 84ea4eefa..88bf78ef2 100644 --- a/third_party/aom/aom/internal/aom_codec_internal.h +++ b/third_party/aom/aom/internal/aom_codec_internal.h @@ -417,7 +417,7 @@ struct aom_internal_error_info { aom_codec_err_t error_code; int has_detail; char detail[80]; - int setjmp; + int setjmp; // Boolean: whether 'jmp' is valid. jmp_buf jmp; }; diff --git a/third_party/aom/aom_dsp/aom_dsp.cmake b/third_party/aom/aom_dsp/aom_dsp.cmake index 768875f7d..7c0111a69 100644 --- a/third_party/aom/aom_dsp/aom_dsp.cmake +++ b/third_party/aom/aom_dsp/aom_dsp.cmake @@ -83,6 +83,7 @@ list(APPEND AOM_DSP_COMMON_INTRIN_SSE4_1 list(APPEND AOM_DSP_COMMON_INTRIN_AVX2 "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c" "${AOM_ROOT}/aom_dsp/x86/common_avx2.h" + "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h" "${AOM_ROOT}/aom_dsp/x86/convolve_avx2.h" "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c" @@ -190,13 +191,16 @@ if(CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/x86/ssim_opt_x86_64.asm") list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2 + "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/subtract_avx2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_avx2.c" "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c" "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c" "${AOM_ROOT}/aom_dsp/x86/sad_highbd_avx2.c" "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c" "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c") + "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c") list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64 "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm") @@ -205,9 +209,11 @@ if(CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/x86/quantize_avx_x86_64.asm") list(APPEND AOM_DSP_ENCODER_INTRIN_SSSE3 + "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.h" "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c" "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.h" "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/variance_impl_ssse3.c" "${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c" "${AOM_ROOT}/aom_dsp/x86/jnt_sad_ssse3.c") diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd.c b/third_party/aom/aom_dsp/aom_dsp_rtcd.c index 5d7d4515b..1514bd64e 100644 --- a/third_party/aom/aom_dsp/aom_dsp_rtcd.c +++ b/third_party/aom/aom_dsp/aom_dsp_rtcd.c @@ -15,4 +15,4 @@ #include "aom_ports/aom_once.h" -void aom_dsp_rtcd() { once(setup_rtcd_internal); } +void aom_dsp_rtcd() { aom_once(setup_rtcd_internal); } diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl index a8ac5eb5c..1a9ac3184 100755 --- a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl @@ -377,7 +377,7 @@ add_proto qw/void aom_lpf_vertical_14_dual/, "uint8_t *s, int pitch, const uint8 specialize qw/aom_lpf_vertical_14_dual sse2/; add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_vertical_6 sse2/; +specialize qw/aom_lpf_vertical_6 sse2 neon/; add_proto qw/void aom_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/aom_lpf_vertical_8 sse2 neon/; @@ -386,13 +386,13 @@ add_proto qw/void aom_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_ specialize qw/aom_lpf_vertical_8_dual sse2/; add_proto qw/void aom_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_vertical_4 sse2/; +specialize qw/aom_lpf_vertical_4 sse2 neon/; add_proto qw/void aom_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/aom_lpf_vertical_4_dual sse2/; add_proto qw/void aom_lpf_horizontal_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_horizontal_14 sse2/; +specialize qw/aom_lpf_horizontal_14 sse2 neon/; add_proto qw/void aom_lpf_horizontal_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/aom_lpf_horizontal_14_dual sse2/; @@ -410,7 +410,7 @@ add_proto qw/void aom_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint specialize qw/aom_lpf_horizontal_8_dual sse2/; add_proto qw/void aom_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_horizontal_4 sse2/; +specialize qw/aom_lpf_horizontal_4 sse2 neon/; add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/aom_lpf_horizontal_4_dual sse2/; @@ -564,7 +564,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { # Block subtraction # add_proto qw/void aom_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; - specialize qw/aom_subtract_block neon msa sse2/; + specialize qw/aom_subtract_block neon msa sse2 avx2/; add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd"; specialize qw/aom_highbd_subtract_block sse2/; @@ -732,14 +732,14 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { foreach (@block_sizes) { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_masked_sad${w}x${h}", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask"; - specialize "aom_masked_sad${w}x${h}", qw/ssse3/; + specialize "aom_masked_sad${w}x${h}", qw/ssse3 avx2/; } foreach (@block_sizes) { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask"; - specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3/; + specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3 avx2/; } @@ -750,7 +750,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) { - specialize "aom_obmc_sad${w}x${h}", qw/sse4_1/; + specialize "aom_obmc_sad${w}x${h}", qw/sse4_1 avx2/; } } @@ -759,7 +759,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) { - specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1/; + specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2/; } } @@ -1102,6 +1102,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; specialize "aom_obmc_variance${w}x${h}", q/sse4_1/; + specialize "aom_obmc_sub_pixel_variance${w}x${h}", q/sse4_1/; } @@ -1539,9 +1540,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/aom_comp_mask_pred ssse3 avx2/; add_proto qw/void aom_highbd_comp_mask_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; - add_proto qw/void aom_highbd_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, int bd"; - + specialize qw/aom_highbd_comp_mask_pred avx2/; } # CONFIG_AV1_ENCODER diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon.c b/third_party/aom/aom_dsp/arm/intrapred_neon.c index 69470eeb0..c85b1e910 100644 --- a/third_party/aom/aom_dsp/arm/intrapred_neon.c +++ b/third_party/aom/aom_dsp/arm/intrapred_neon.c @@ -528,3 +528,63 @@ void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, } } } + +static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, + const uint16_t *above, + const uint16_t *left) { + assert(bw >= 4); + assert(IS_POWER_OF_TWO(bw)); + int expected_dc, sum = 0; + const int count = bw * 2; + uint32x4_t sum_q = vdupq_n_u32(0); + uint32x2_t sum_d; + uint16_t *dst_1; + if (bw >= 8) { + for (int i = 0; i < bw; i += 8) { + sum_q = vpadalq_u16(sum_q, vld1q_u16(above)); + sum_q = vpadalq_u16(sum_q, vld1q_u16(left)); + above += 8; + left += 8; + } + sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q)); + sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0); + expected_dc = (sum + (count >> 1)) / count; + const uint16x8_t dc = vdupq_n_u16((uint16_t)expected_dc); + for (int r = 0; r < bw; r++) { + dst_1 = dst; + for (int i = 0; i < bw; i += 8) { + vst1q_u16(dst_1, dc); + dst_1 += 8; + } + dst += stride; + } + } else { // 4x4 + sum_q = vaddl_u16(vld1_u16(above), vld1_u16(left)); + sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q)); + sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0); + expected_dc = (sum + (count >> 1)) / count; + const uint16x4_t dc = vdup_n_u16((uint16_t)expected_dc); + for (int r = 0; r < bw; r++) { + vst1_u16(dst, dc); + dst += stride; + } + } +} + +#define intra_pred_highbd_sized_neon(type, width) \ + void aom_highbd_##type##_predictor_##width##x##width##_neon( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + (void)bd; \ + highbd_##type##_predictor(dst, stride, width, above, left); \ + } + +#define intra_pred_square(type) \ + intra_pred_highbd_sized_neon(type, 4); \ + intra_pred_highbd_sized_neon(type, 8); \ + intra_pred_highbd_sized_neon(type, 16); \ + intra_pred_highbd_sized_neon(type, 32); \ + intra_pred_highbd_sized_neon(type, 64); + +intra_pred_square(dc); +#undef intra_pred_square diff --git a/third_party/aom/aom_dsp/arm/loopfilter_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_neon.c index ee1a3c78f..bdc67626d 100644 --- a/third_party/aom/aom_dsp/arm/loopfilter_neon.c +++ b/third_party/aom/aom_dsp/arm/loopfilter_neon.c @@ -52,6 +52,36 @@ static INLINE uint8x8_t lpf_mask(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1, return mask_8x8; } +static INLINE uint8x8_t lpf_mask2(uint8x8_t p1q1, uint8x8_t p0q0, + const uint8_t blimit, const uint8_t limit) { + uint32x2x2_t p0q0_p1q1; + uint16x8_t temp_16x8; + uint16x4_t temp0_16x4, temp1_16x4; + const uint16x4_t blimit_16x4 = vdup_n_u16(blimit); + const uint8x8_t limit_8x8 = vdup_n_u8(limit); + uint8x8_t mask_8x8, temp_8x8; + + mask_8x8 = vabd_u8(p1q1, p0q0); + mask_8x8 = vcle_u8(mask_8x8, limit_8x8); + + temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8))); + mask_8x8 = vand_u8(mask_8x8, temp_8x8); + + p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1)); + temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]), + vreinterpret_u8_u32(p0q0_p1q1.val[1])); + temp_16x8 = vmovl_u8(temp_8x8); + temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1); + temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1); + temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4); + temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4); + temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4)); + + mask_8x8 = vand_u8(mask_8x8, temp_8x8); + + return mask_8x8; +} + static INLINE uint8x8_t lpf_flat_mask4(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1, uint8x8_t p0q0) { const uint8x8_t thresh_8x8 = vdup_n_u8(1); // for bd==8 threshold is always 1 @@ -523,6 +553,68 @@ static void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0, } } +static void lpf_4_neon(uint8x8_t *p1q1, uint8x8_t *p0q0, const uint8_t blimit, + const uint8_t limit, const uint8_t thresh) { + int32x2x2_t ps0_qs0, ps1_qs1; + int16x8_t filter_s16; + const uint8x8_t thresh_f4 = vdup_n_u8(thresh); + uint8x8_t mask_8x8, temp0_8x8, temp1_8x8; + int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; + int8x8_t op0, oq0, op1, oq1; + int8x8_t pq_s0, pq_s1; + int8x8_t filter_s8, filter1_s8, filter2_s8; + int8x8_t hev_8x8; + const int8x8_t sign_mask = vdup_n_s8(0x80); + const int8x8_t val_4 = vdup_n_s8(4); + const int8x8_t val_3 = vdup_n_s8(3); + + // Calculate filter mask + mask_8x8 = lpf_mask2(*p1q1, *p0q0, blimit, limit); + + pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); + pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); + + ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); + ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); + ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); + qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); + ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); + qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); + + // hev_mask + temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); + temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); + hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); + + // add outer taps if we have high edge variance + filter_s8 = vqsub_s8(ps1_s8, qs1_s8); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + // inner taps + temp_s8 = vqsub_s8(qs0_s8, ps0_s8); + filter_s16 = vmovl_s8(filter_s8); + filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); + filter_s8 = vqmovn_s16(filter_s16); + filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); + + filter1_s8 = vqadd_s8(filter_s8, val_4); + filter2_s8 = vqadd_s8(filter_s8, val_3); + filter1_s8 = vshr_n_s8(filter1_s8, 3); + filter2_s8 = vshr_n_s8(filter2_s8, 3); + + oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); + op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); + + filter_s8 = vrshr_n_s8(filter1_s8, 1); + filter_s8 = vbic_s8(filter_s8, hev_8x8); + + oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); + op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); + + *p0q0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); + *p1q1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); +} + void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { uint8x16_t row0, row1, row2, row3; @@ -646,6 +738,125 @@ void aom_lpf_vertical_8_neon(uint8_t *src, int stride, const uint8_t *blimit, store_u8_8x4(src - 4, stride, p3q0, p2q1, p1q2, p0q3); } +void aom_lpf_vertical_6_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint32x2x2_t p2q2_p1q1, pxqy_p0q0; + uint32x2_t pq_rev; + uint8x8_t pxq0, p2q1, p1q2, p0qy; + uint8x8_t p0q0, p1q1, p2q2, pxqy; + + // row0: px p2 p1 p0 | q0 q1 q2 qy + // row1: px p2 p1 p0 | q0 q1 q2 qy + // row2: px p2 p1 p0 | q0 q1 q2 qy + // row3: px p2 p1 p0 | q0 q1 q2 qy + load_u8_8x4(src - 4, stride, &pxq0, &p2q1, &p1q2, &p0qy); + + transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p0qy)); + pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxq0), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2)); + p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev); + + p0q0 = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1])); + p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); + p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); + pxqy = vreinterpret_u8_u32(pxqy_p0q0.val[0]); + + lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0)); + pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxqy), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1)); + p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev); + + p0qy = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1])); + p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); + p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); + pxq0 = vreinterpret_u8_u32(pxqy_p0q0.val[0]); + transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy); + + store_u8_8x4(src - 4, stride, pxq0, p2q1, p1q2, p0qy); +} + +void aom_lpf_vertical_4_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint32x2x2_t p1q0_p0q1, p1q1_p0q0, p1p0_q1q0; + uint32x2_t pq_rev; + uint8x8_t UNINITIALIZED_IS_SAFE(p1p0), q0q1, p0q0, p1q1; + + // row0: p1 p0 | q0 q1 + // row1: p1 p0 | q0 q1 + // row2: p1 p0 | q0 q1 + // row3: p1 p0 | q0 q1 + load_u8_4x1(src - 2, &p1p0, 0); + load_u8_4x1((src - 2) + 1 * stride, &p1p0, 1); + load_u8_4x1((src - 2) + 2 * stride, &q0q1, 0); + load_u8_4x1((src - 2) + 3 * stride, &q0q1, 1); + + transpose_u8_4x4(&p1p0, &q0q1); + + p1q0_p0q1 = vtrn_u32(vreinterpret_u32_u8(p1p0), vreinterpret_u32_u8(q0q1)); + + pq_rev = vrev64_u32(p1q0_p0q1.val[1]); + p1q1_p0q0 = vtrn_u32(p1q0_p0q1.val[0], pq_rev); + + p1q1 = vreinterpret_u8_u32(p1q1_p0q0.val[0]); + p0q0 = vreinterpret_u8_u32(p1q1_p0q0.val[1]); + + lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh); + + p1p0_q1q0 = vtrn_u32(vreinterpret_u32_u8(p1q1), vreinterpret_u32_u8(p0q0)); + + p1p0 = vreinterpret_u8_u32(p1p0_q1q0.val[0]); + q0q1 = vreinterpret_u8_u32(vrev64_u32(p1p0_q1q0.val[1])); + + transpose_u8_4x4(&p1p0, &q0q1); + + store_u8_4x1(src - 2, p1p0, 0); + store_u8_4x1((src - 2) + 1 * stride, q0q1, 0); + store_u8_4x1((src - 2) + 2 * stride, p1p0, 1); + store_u8_4x1((src - 2) + 3 * stride, q0q1, 1); +} + +void aom_lpf_horizontal_14_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, UNINITIALIZED_IS_SAFE(p6q6); + + load_u8_4x1(src - 7 * stride, &p6q6, 0); + load_u8_4x1(src - 6 * stride, &p5q5, 0); + load_u8_4x1(src - 5 * stride, &p4q4, 0); + load_u8_4x1(src - 4 * stride, &p3q3, 0); + load_u8_4x1(src - 3 * stride, &p2q2, 0); + load_u8_4x1(src - 2 * stride, &p1q1, 0); + load_u8_4x1(src - 1 * stride, &p0q0, 0); + load_u8_4x1(src + 0 * stride, &p0q0, 1); + load_u8_4x1(src + 1 * stride, &p1q1, 1); + load_u8_4x1(src + 2 * stride, &p2q2, 1); + load_u8_4x1(src + 3 * stride, &p3q3, 1); + load_u8_4x1(src + 4 * stride, &p4q4, 1); + load_u8_4x1(src + 5 * stride, &p5q5, 1); + load_u8_4x1(src + 6 * stride, &p6q6, 1); + + lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, + *thresh); + + store_u8_4x1(src - 6 * stride, p5q5, 0); + store_u8_4x1(src - 5 * stride, p4q4, 0); + store_u8_4x1(src - 4 * stride, p3q3, 0); + store_u8_4x1(src - 3 * stride, p2q2, 0); + store_u8_4x1(src - 2 * stride, p1q1, 0); + store_u8_4x1(src - 1 * stride, p0q0, 0); + store_u8_4x1(src + 0 * stride, p0q0, 1); + store_u8_4x1(src + 1 * stride, p1q1, 1); + store_u8_4x1(src + 2 * stride, p2q2, 1); + store_u8_4x1(src + 3 * stride, p3q3, 1); + store_u8_4x1(src + 4 * stride, p4q4, 1); + store_u8_4x1(src + 5 * stride, p5q5, 1); +} + void aom_lpf_horizontal_8_neon(uint8_t *src, int stride, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { uint8x8_t p0q0, p1q1, p2q2, p3q3; @@ -698,3 +909,20 @@ void aom_lpf_horizontal_6_neon(uint8_t *src, int stride, const uint8_t *blimit, vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1); vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1); } + +void aom_lpf_horizontal_4_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t p0q0, UNINITIALIZED_IS_SAFE(p1q1); + + load_u8_4x1(src - 2 * stride, &p1q1, 0); + load_u8_4x1(src - 1 * stride, &p0q0, 0); + load_u8_4x1(src + 0 * stride, &p0q0, 1); + load_u8_4x1(src + 1 * stride, &p1q1, 1); + + lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh); + + store_u8_4x1(src - 2 * stride, p1q1, 0); + store_u8_4x1(src - 1 * stride, p0q0, 0); + store_u8_4x1(src + 0 * stride, p0q0, 1); + store_u8_4x1(src + 1 * stride, p1q1, 1); +} diff --git a/third_party/aom/aom_dsp/bitreader_buffer.c b/third_party/aom/aom_dsp/bitreader_buffer.c index 68fc381f2..02b5ef924 100644 --- a/third_party/aom/aom_dsp/bitreader_buffer.c +++ b/third_party/aom/aom_dsp/bitreader_buffer.c @@ -8,11 +8,14 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ + +#include <assert.h> + #include "config/aom_config.h" #include "aom_dsp/bitreader_buffer.h" -size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb) { +size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb) { return (rb->bit_offset + 7) >> 3; } @@ -31,6 +34,7 @@ int aom_rb_read_bit(struct aom_read_bit_buffer *rb) { } int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) { + assert(bits <= 31); int value = 0, bit; for (bit = bits - 1; bit >= 0; bit--) value |= aom_rb_read_bit(rb) << bit; return value; @@ -38,6 +42,7 @@ int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) { uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb, int bits) { + assert(bits <= 32); uint32_t value = 0; int bit; for (bit = bits - 1; bit >= 0; bit--) diff --git a/third_party/aom/aom_dsp/bitreader_buffer.h b/third_party/aom/aom_dsp/bitreader_buffer.h index 2dafe11ad..5c94ab883 100644 --- a/third_party/aom/aom_dsp/bitreader_buffer.h +++ b/third_party/aom/aom_dsp/bitreader_buffer.h @@ -31,7 +31,7 @@ struct aom_read_bit_buffer { aom_rb_error_handler error_handler; }; -size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb); +size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb); int aom_rb_read_bit(struct aom_read_bit_buffer *rb); diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.c b/third_party/aom/aom_dsp/bitwriter_buffer.c index 21314eb2a..a563bf684 100644 --- a/third_party/aom/aom_dsp/bitwriter_buffer.c +++ b/third_party/aom/aom_dsp/bitwriter_buffer.c @@ -9,6 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include <assert.h> #include <limits.h> #include <stdlib.h> @@ -49,12 +50,14 @@ void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit) { } void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits) { + assert(bits <= 31); int bit; for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1); } void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb, uint32_t data, int bits) { + assert(bits <= 32); int bit; for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1); } diff --git a/third_party/aom/aom_dsp/grain_synthesis.c b/third_party/aom/aom_dsp/grain_synthesis.c index fcb6c290e..ff1ec41a2 100644 --- a/third_party/aom/aom_dsp/grain_synthesis.c +++ b/third_party/aom/aom_dsp/grain_synthesis.c @@ -17,6 +17,7 @@ #include <stdio.h> #include <string.h> #include <stdlib.h> +#include <assert.h> #include "aom_dsp/grain_synthesis.h" #include "aom_mem/aom_mem.h" @@ -237,7 +238,7 @@ static int grain_max; static uint16_t random_register = 0; // random number generator register -static void init_arrays(aom_film_grain_t *params, int luma_stride, +static void init_arrays(const aom_film_grain_t *params, int luma_stride, int chroma_stride, int ***pred_pos_luma_p, int ***pred_pos_chroma_p, int **luma_grain_block, int **cb_grain_block, int **cr_grain_block, @@ -331,7 +332,7 @@ static void init_arrays(aom_film_grain_t *params, int luma_stride, (int *)aom_malloc(sizeof(**cr_grain_block) * chroma_grain_samples); } -static void dealloc_arrays(aom_film_grain_t *params, int ***pred_pos_luma, +static void dealloc_arrays(const aom_film_grain_t *params, int ***pred_pos_luma, int ***pred_pos_chroma, int **luma_grain_block, int **cb_grain_block, int **cr_grain_block, int **y_line_buf, int **cb_line_buf, @@ -396,10 +397,14 @@ static void init_random_generator(int luma_line, uint16_t seed) { } static void generate_luma_grain_block( - aom_film_grain_t *params, int **pred_pos_luma, int *luma_grain_block, + const aom_film_grain_t *params, int **pred_pos_luma, int *luma_grain_block, int luma_block_size_y, int luma_block_size_x, int luma_grain_stride, int left_pad, int top_pad, int right_pad, int bottom_pad) { - if (params->num_y_points == 0) return; + if (params->num_y_points == 0) { + memset(luma_grain_block, 0, + sizeof(*luma_grain_block) * luma_block_size_y * luma_grain_stride); + return; + } int bit_depth = params->bit_depth; int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift; @@ -431,7 +436,7 @@ static void generate_luma_grain_block( } static void generate_chroma_grain_blocks( - aom_film_grain_t *params, + const aom_film_grain_t *params, // int** pred_pos_luma, int **pred_pos_chroma, int *luma_grain_block, int *cb_grain_block, int *cr_grain_block, int luma_grain_stride, int chroma_block_size_y, @@ -443,7 +448,7 @@ static void generate_chroma_grain_blocks( int num_pos_chroma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1); if (params->num_y_points > 0) ++num_pos_chroma; int rounding_offset = (1 << (params->ar_coeff_shift - 1)); - int chroma_grain_samples = chroma_block_size_y * chroma_block_size_x; + int chroma_grain_block_size = chroma_block_size_y * chroma_grain_stride; if (params->num_cb_points || params->chroma_scaling_from_luma) { init_random_generator(7 << 5, params->random_seed); @@ -455,7 +460,8 @@ static void generate_chroma_grain_blocks( ((1 << gauss_sec_shift) >> 1)) >> gauss_sec_shift; } else { - memset(cr_grain_block, 0, sizeof(*cr_grain_block) * chroma_grain_samples); + memset(cb_grain_block, 0, + sizeof(*cb_grain_block) * chroma_grain_block_size); } if (params->num_cr_points || params->chroma_scaling_from_luma) { @@ -468,7 +474,8 @@ static void generate_chroma_grain_blocks( ((1 << gauss_sec_shift) >> 1)) >> gauss_sec_shift; } else { - memset(cb_grain_block, 0, sizeof(*cb_grain_block) * chroma_grain_samples); + memset(cr_grain_block, 0, + sizeof(*cr_grain_block) * chroma_grain_block_size); } for (int i = top_pad; i < chroma_block_size_y - bottom_pad; i++) @@ -522,7 +529,7 @@ static void generate_chroma_grain_blocks( } } -static void init_scaling_function(int scaling_points[][2], int num_points, +static void init_scaling_function(const int scaling_points[][2], int num_points, int scaling_lut[]) { if (num_points == 0) return; @@ -559,7 +566,7 @@ static int scale_LUT(int *scaling_lut, int index, int bit_depth) { (bit_depth - 8)); } -static void add_noise_to_block(aom_film_grain_t *params, uint8_t *luma, +static void add_noise_to_block(const aom_film_grain_t *params, uint8_t *luma, uint8_t *cb, uint8_t *cr, int luma_stride, int chroma_stride, int *luma_grain, int *cb_grain, int *cr_grain, @@ -675,7 +682,7 @@ static void add_noise_to_block(aom_film_grain_t *params, uint8_t *luma, } static void add_noise_to_block_hbd( - aom_film_grain_t *params, uint16_t *luma, uint16_t *cb, uint16_t *cr, + const aom_film_grain_t *params, uint16_t *luma, uint16_t *cb, uint16_t *cr, int luma_stride, int chroma_stride, int *luma_grain, int *cb_grain, int *cr_grain, int luma_grain_stride, int chroma_grain_stride, int half_luma_height, int half_luma_width, int bit_depth, @@ -903,7 +910,7 @@ static void hor_boundary_overlap(int *top_block, int top_stride, } } -void av1_add_film_grain(aom_film_grain_t *params, aom_image_t *src, +void av1_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src, aom_image_t *dst) { uint8_t *luma, *cb, *cr; int height, width, luma_stride, chroma_stride; @@ -950,6 +957,11 @@ void av1_add_film_grain(aom_film_grain_t *params, aom_image_t *src, exit(1); } + assert(params->bit_depth == src->bit_depth); + + dst->fmt = src->fmt; + dst->bit_depth = src->bit_depth; + dst->r_w = src->r_w; dst->r_h = src->r_h; dst->d_w = src->d_w; @@ -999,15 +1011,13 @@ void av1_add_film_grain(aom_film_grain_t *params, aom_image_t *src, luma_stride = dst->stride[AOM_PLANE_Y] >> use_high_bit_depth; chroma_stride = dst->stride[AOM_PLANE_U] >> use_high_bit_depth; - params->bit_depth = dst->bit_depth; - av1_add_film_grain_run(params, luma, cb, cr, height, width, luma_stride, chroma_stride, use_high_bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity); return; } -void av1_add_film_grain_run(aom_film_grain_t *params, uint8_t *luma, +void av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma, uint8_t *cb, uint8_t *cr, int height, int width, int luma_stride, int chroma_stride, int use_high_bit_depth, int chroma_subsamp_y, diff --git a/third_party/aom/aom_dsp/grain_synthesis.h b/third_party/aom/aom_dsp/grain_synthesis.h index 016cb12d7..65feb6068 100644 --- a/third_party/aom/aom_dsp/grain_synthesis.h +++ b/third_party/aom/aom_dsp/grain_synthesis.h @@ -72,7 +72,7 @@ typedef struct { int clip_to_restricted_range; - int bit_depth; // video bit depth + unsigned int bit_depth; // video bit depth int chroma_scaling_from_luma; @@ -94,7 +94,7 @@ typedef struct { * \param[in] luma_stride luma plane stride * \param[in] chroma_stride chroma plane stride */ -void av1_add_film_grain_run(aom_film_grain_t *grain_params, uint8_t *luma, +void av1_add_film_grain_run(const aom_film_grain_t *grain_params, uint8_t *luma, uint8_t *cb, uint8_t *cr, int height, int width, int luma_stride, int chroma_stride, int use_high_bit_depth, int chroma_subsamp_y, @@ -106,10 +106,10 @@ void av1_add_film_grain_run(aom_film_grain_t *grain_params, uint8_t *luma, * * \param[in] grain_params Grain parameters * \param[in] src Source image - * \param[in] dst Resulting image with grain + * \param[out] dst Resulting image with grain */ -void av1_add_film_grain(aom_film_grain_t *grain_params, aom_image_t *src, - aom_image_t *dst); +void av1_add_film_grain(const aom_film_grain_t *grain_params, + const aom_image_t *src, aom_image_t *dst); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/aom_dsp/noise_model.c b/third_party/aom/aom_dsp/noise_model.c index a1287f74f..5975c62e8 100644 --- a/third_party/aom/aom_dsp/noise_model.c +++ b/third_party/aom/aom_dsp/noise_model.c @@ -1458,3 +1458,189 @@ int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3], } return init_success; } + +struct aom_denoise_and_model_t { + int block_size; + int bit_depth; + float noise_level; + + // Size of current denoised buffer and flat_block buffer + int width; + int height; + int y_stride; + int uv_stride; + int num_blocks_w; + int num_blocks_h; + + // Buffers for image and noise_psd allocated on the fly + float *noise_psd[3]; + uint8_t *denoised[3]; + uint8_t *flat_blocks; + + aom_flat_block_finder_t flat_block_finder; + aom_noise_model_t noise_model; +}; + +struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth, + int block_size, + float noise_level) { + struct aom_denoise_and_model_t *ctx = + (struct aom_denoise_and_model_t *)aom_malloc( + sizeof(struct aom_denoise_and_model_t)); + if (!ctx) { + fprintf(stderr, "Unable to allocate denoise_and_model struct\n"); + return NULL; + } + memset(ctx, 0, sizeof(*ctx)); + + ctx->block_size = block_size; + ctx->noise_level = noise_level; + ctx->bit_depth = bit_depth; + + ctx->noise_psd[0] = + aom_malloc(sizeof(*ctx->noise_psd[0]) * block_size * block_size); + ctx->noise_psd[1] = + aom_malloc(sizeof(*ctx->noise_psd[1]) * block_size * block_size); + ctx->noise_psd[2] = + aom_malloc(sizeof(*ctx->noise_psd[2]) * block_size * block_size); + if (!ctx->noise_psd[0] || !ctx->noise_psd[1] || !ctx->noise_psd[2]) { + fprintf(stderr, "Unable to allocate noise PSD buffers\n"); + aom_denoise_and_model_free(ctx); + return NULL; + } + return ctx; +} + +void aom_denoise_and_model_free(struct aom_denoise_and_model_t *ctx) { + aom_free(ctx->flat_blocks); + for (int i = 0; i < 3; ++i) { + aom_free(ctx->denoised[i]); + aom_free(ctx->noise_psd[i]); + } + aom_noise_model_free(&ctx->noise_model); + aom_flat_block_finder_free(&ctx->flat_block_finder); + aom_free(ctx); +} + +static int denoise_and_model_realloc_if_necessary( + struct aom_denoise_and_model_t *ctx, YV12_BUFFER_CONFIG *sd) { + if (ctx->width == sd->y_width && ctx->height == sd->y_height && + ctx->y_stride == sd->y_stride && ctx->uv_stride == sd->uv_stride) + return 1; + const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0; + const int block_size = ctx->block_size; + + ctx->width = sd->y_width; + ctx->height = sd->y_height; + ctx->y_stride = sd->y_stride; + ctx->uv_stride = sd->uv_stride; + + for (int i = 0; i < 3; ++i) { + aom_free(ctx->denoised[i]); + ctx->denoised[i] = NULL; + } + aom_free(ctx->flat_blocks); + ctx->flat_blocks = NULL; + + ctx->denoised[0] = aom_malloc((sd->y_stride * sd->y_height) << use_highbd); + ctx->denoised[1] = aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd); + ctx->denoised[2] = aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd); + if (!ctx->denoised[0] || !ctx->denoised[1] || !ctx->denoised[2]) { + fprintf(stderr, "Unable to allocate denoise buffers\n"); + return 0; + } + ctx->num_blocks_w = (sd->y_width + ctx->block_size - 1) / ctx->block_size; + ctx->num_blocks_h = (sd->y_height + ctx->block_size - 1) / ctx->block_size; + ctx->flat_blocks = aom_malloc(ctx->num_blocks_w * ctx->num_blocks_h); + + aom_flat_block_finder_free(&ctx->flat_block_finder); + if (!aom_flat_block_finder_init(&ctx->flat_block_finder, ctx->block_size, + ctx->bit_depth, use_highbd)) { + fprintf(stderr, "Unable to init flat block finder\n"); + return 0; + } + + const aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3, + ctx->bit_depth, use_highbd }; + aom_noise_model_free(&ctx->noise_model); + if (!aom_noise_model_init(&ctx->noise_model, params)) { + fprintf(stderr, "Unable to init noise model\n"); + return 0; + } + + // Simply use a flat PSD (although we could use the flat blocks to estimate + // PSD) those to estimate an actual noise PSD) + const float y_noise_level = + aom_noise_psd_get_default_value(ctx->block_size, ctx->noise_level); + const float uv_noise_level = aom_noise_psd_get_default_value( + ctx->block_size >> sd->subsampling_x, ctx->noise_level); + for (int i = 0; i < block_size * block_size; ++i) { + ctx->noise_psd[0][i] = y_noise_level; + ctx->noise_psd[1][i] = ctx->noise_psd[2][i] = uv_noise_level; + } + return 1; +} + +int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx, + YV12_BUFFER_CONFIG *sd, + aom_film_grain_t *film_grain) { + const int block_size = ctx->block_size; + const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0; + uint8_t *raw_data[3] = { + use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->y_buffer) : sd->y_buffer, + use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->u_buffer) : sd->u_buffer, + use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->v_buffer) : sd->v_buffer, + }; + const uint8_t *const data[3] = { raw_data[0], raw_data[1], raw_data[2] }; + int strides[3] = { sd->y_stride, sd->uv_stride, sd->uv_stride }; + int chroma_sub_log2[2] = { sd->subsampling_x, sd->subsampling_y }; + + if (!denoise_and_model_realloc_if_necessary(ctx, sd)) { + fprintf(stderr, "Unable to realloc buffers\n"); + return 0; + } + + aom_flat_block_finder_run(&ctx->flat_block_finder, data[0], sd->y_width, + sd->y_height, strides[0], ctx->flat_blocks); + + if (!aom_wiener_denoise_2d(data, ctx->denoised, sd->y_width, sd->y_height, + strides, chroma_sub_log2, ctx->noise_psd, + block_size, ctx->bit_depth, use_highbd)) { + fprintf(stderr, "Unable to denoise image\n"); + return 0; + } + + const aom_noise_status_t status = aom_noise_model_update( + &ctx->noise_model, data, (const uint8_t *const *)ctx->denoised, + sd->y_width, sd->y_height, strides, chroma_sub_log2, ctx->flat_blocks, + block_size); + int have_noise_estimate = 0; + if (status == AOM_NOISE_STATUS_OK) { + have_noise_estimate = 1; + } else if (status == AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE) { + aom_noise_model_save_latest(&ctx->noise_model); + have_noise_estimate = 1; + } else { + // Unable to update noise model; proceed if we have a previous estimate. + have_noise_estimate = + (ctx->noise_model.combined_state[0].strength_solver.num_equations > 0); + } + + film_grain->apply_grain = 0; + if (have_noise_estimate) { + if (!aom_noise_model_get_grain_parameters(&ctx->noise_model, film_grain)) { + fprintf(stderr, "Unable to get grain parameters.\n"); + return 0; + } + if (!film_grain->random_seed) { + film_grain->random_seed = 1071; + } + memcpy(raw_data[0], ctx->denoised[0], + (strides[0] * sd->y_height) << use_highbd); + memcpy(raw_data[1], ctx->denoised[1], + (strides[1] * sd->uv_height) << use_highbd); + memcpy(raw_data[2], ctx->denoised[2], + (strides[2] * sd->uv_height) << use_highbd); + } + return 1; +} diff --git a/third_party/aom/aom_dsp/noise_model.h b/third_party/aom/aom_dsp/noise_model.h index dabeacc14..b07bf8617 100644 --- a/third_party/aom/aom_dsp/noise_model.h +++ b/third_party/aom/aom_dsp/noise_model.h @@ -18,6 +18,7 @@ extern "C" { #include <stdint.h> #include "aom_dsp/grain_synthesis.h" +#include "aom_scale/yv12config.h" /*!\brief Wrapper of data required to represent linear system of eqns and soln. */ @@ -280,6 +281,42 @@ int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3], int w, int h, int stride[3], int chroma_sub_log2[2], float *noise_psd[3], int block_size, int bit_depth, int use_highbd); + +struct aom_denoise_and_model_t; + +/*!\brief Denoise the buffer and model the residual noise. + * + * This is meant to be called sequentially on input frames. The input buffer + * is denoised and the residual noise is modelled. The current noise estimate + * is populated in film_grain. Returns true on success. The grain.apply_grain + * parameter will be true when the input buffer was successfully denoised and + * grain was modelled. Returns false on error. + * + * \param[in] ctx Struct allocated with aom_denoise_and_model_alloc + * that holds some buffers for denoising and the current + * noise estimate. + * \param[in/out] buf The raw input buffer to be denoised. + * \param[out] grain Output film grain parameters + */ +int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx, + YV12_BUFFER_CONFIG *buf, aom_film_grain_t *grain); + +/*!\brief Allocates a context that can be used for denoising and noise modeling. + * + * \param[in] bit_depth Bit depth of buffers this will be run on. + * \param[in] block_size Block size for noise modeling and flat block + * estimation + * \param[in] noise_level The noise_level (2.5 for moderate noise, and 5 for + * higher levels of noise) + */ +struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth, + int block_size, + float noise_level); + +/*!\brief Frees the denoise context allocated with aom_denoise_and_model_alloc + */ +void aom_denoise_and_model_free(struct aom_denoise_and_model_t *denoise_model); + #ifdef __cplusplus } // extern "C" #endif // __cplusplus diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics.h b/third_party/aom/aom_dsp/simd/v256_intrinsics.h index 0e5ae5b68..4b70cc57b 100644 --- a/third_party/aom/aom_dsp/simd/v256_intrinsics.h +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics.h @@ -289,6 +289,15 @@ SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) { SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) { return c_v256_shr_s32(a, c); } +SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) { + return c_v256_shl_64(a, c); +} +SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) { + return c_v256_shr_u64(a, c); +} +SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) { + return c_v256_shr_s64(a, c); +} SIMD_INLINE v256 v256_shr_n_byte(v256 a, unsigned int n) { return c_v256_shr_n_byte(a, n); diff --git a/third_party/aom/aom_dsp/variance.c b/third_party/aom/aom_dsp/variance.c index d367905bc..817ebe15d 100644 --- a/third_party/aom/aom_dsp/variance.c +++ b/third_party/aom/aom_dsp/variance.c @@ -386,7 +386,7 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, } } - const InterpFilterParams filter = + const InterpFilterParams *filter = av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); if (!subpel_x_q3 && !subpel_y_q3) { @@ -413,12 +413,12 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); const int intermediate_height = - (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps; + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); - aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1), ref_stride, - temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, - intermediate_height); - aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1), + aom_convolve8_horiz(ref - ref_stride * ((filter->taps >> 1) - 1), + ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, + width, intermediate_height); + aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1), MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16, width, height); } @@ -974,7 +974,7 @@ void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, } } - const InterpFilterParams filter = + const InterpFilterParams *filter = av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); if (!subpel_x_q3 && !subpel_y_q3) { @@ -1004,14 +1004,14 @@ void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); const int intermediate_height = - (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps; + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); - aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter.taps >> 1) - 1), + aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter->taps >> 1) - 1), ref_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd); aom_highbd_convolve8_vert( - CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)), + CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)), MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y, 16, width, height, bd); } @@ -1185,29 +1185,18 @@ void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8, } } -void aom_highbd_comp_mask_upsampled_pred_c( +void aom_highbd_comp_mask_upsampled_pred( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, int bd) { - int i, j; - - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, bd); - for (i = 0; i < height; ++i) { - for (j = 0; j < width; ++j) { - if (!invert_mask) - comp_pred[j] = AOM_BLEND_A64(mask[j], comp_pred[j], pred[j]); - else - comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], comp_pred[j]); - } - comp_pred += width; - pred += width; - mask += mask_stride; - } + aom_highbd_comp_mask_pred(comp_pred, pred8, width, height, + CONVERT_TO_BYTEPTR(comp_pred), width, mask, + mask_stride, invert_mask); } #define HIGHBD_MASK_SUBPIX_VAR(W, H) \ diff --git a/third_party/aom/aom_dsp/variance.h b/third_party/aom/aom_dsp/variance.h index 544dda944..b954470de 100644 --- a/third_party/aom/aom_dsp/variance.h +++ b/third_party/aom/aom_dsp/variance.h @@ -76,6 +76,13 @@ void aom_comp_mask_upsampled_pred( int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask); +void aom_highbd_comp_mask_upsampled_pred( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, + int bd); + typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride, const int32_t *wsrc, const int32_t *msk); diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c index af45a03ac..f3fe50372 100644 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c @@ -41,20 +41,290 @@ #define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) #endif // __clang__ +static INLINE void xx_storeu2_epi32(const uint8_t *output_ptr, + const ptrdiff_t stride, const __m256i *a) { + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a)); + *((uint32_t *)(output_ptr + stride)) = + _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1)); +} + +static INLINE __m256i xx_loadu2_epi64(const void *hi, const void *lo) { + __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo))); + a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1); + return a; +} + +static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr, + const ptrdiff_t stride, const __m256i *a) { + _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a)); + _mm_storel_epi64((__m128i *)(output_ptr + stride), + _mm256_extractf128_si256(*a, 1)); +} + +static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) { + __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo))); + a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1); + return a; +} + +static INLINE void xx_store2_mi128(const uint8_t *output_ptr, + const ptrdiff_t stride, const __m256i *a) { + _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a)); + _mm_store_si128((__m128i *)(output_ptr + stride), + _mm256_extractf128_si256(*a, 1)); +} + +static void aom_filter_block1d4_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt1Reg, filt2Reg; + __m256i firstFilters, secondFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2; + __m256i srcReg32b1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 32 bits + firstFilters = _mm256_shuffle_epi32(filtersReg32, 0); + // duplicate only the second 32 bits + secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55); + + filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32)); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + + // filter the source buffer + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + srcRegFilt32b1_1 = + _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. + // process only 4 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + __m128i srcRegFilt2; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + + // filter the source buffer + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 4 bytes + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); + } +} + +static void aom_filter_block1d8_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + + filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. + // process only 8 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); + + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 8 bytes + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1); + } +} + static void aom_filter_block1d16_h8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; - __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m256i firstFilters, secondFilters, thirdFilters, forthFilters; __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; __m256i srcReg32b1, srcReg32b2, filtersReg32; unsigned int i; ptrdiff_t src_stride, dst_stride; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg = _mm_packs_epi16(filtersReg, filtersReg); @@ -74,22 +344,17 @@ static void aom_filter_block1d16_h8_avx2( // across 256 bit register forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); - filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); - filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); - filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); - filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); + filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); // multiple the size of the source and destination stride by two src_stride = src_pixels_per_line << 1; dst_stride = output_pitch << 1; for (i = output_height; i > 1; i -= 2) { // load the 2 strides of source - srcReg32b1 = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3))); - srcReg32b1 = _mm256_inserti128_si256( - srcReg32b1, - _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)), - 1); + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); // filter the source buffer srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); @@ -110,22 +375,13 @@ static void aom_filter_block1d16_h8_avx2( srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16( - srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23); // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) srcReg32b2 = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5))); - srcReg32b2 = _mm256_inserti128_si256( - srcReg32b2, - _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)), - 1); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16( - srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); + xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8); // filter the source buffer srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); @@ -148,32 +404,21 @@ static void aom_filter_block1d16_h8_avx2( // add and saturate the results together srcRegFilt32b2_1 = _mm256_adds_epi16( - srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); - srcRegFilt32b2_1 = _mm256_adds_epi16( - srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); - - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); + srcRegFilt32b2_1, _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2)); - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); - srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6); // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result + // convolve result and the second lane contain the second convolve result srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); src_ptr += src_stride; - // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, - _mm256_castsi256_si128(srcRegFilt32b1_1)); - - // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + output_pitch), - _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1); output_ptr += dst_stride; } @@ -183,7 +428,7 @@ static void aom_filter_block1d16_h8_avx2( __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; __m128i srcRegFilt2, srcRegFilt3; - srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); // filter the source buffer srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); @@ -210,15 +455,11 @@ static void aom_filter_block1d16_h8_avx2( // add and saturate the results together srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); // reading the next 16 bytes // (part of it was being read by earlier read) - srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); - - // add and saturate the results together - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); // filter the source buffer srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); @@ -245,19 +486,16 @@ static void aom_filter_block1d16_h8_avx2( // add and saturate the results together srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); - srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + _mm_adds_epi16(srcRegFilt2_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); + // shift by 6 bit each 16 bit srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64)); + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64)); - - // shift by 7 bit each 16 bit - srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); - srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); + _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve @@ -269,11 +507,163 @@ static void aom_filter_block1d16_h8_avx2( } } +static void aom_filter_block1d8_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32; + __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; + __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; + __m256i srcReg32b11, srcReg32b12, filtersReg32; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + // load 16 bytes 7 times in stride of src_pitch + srcReg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr); + srcReg32b3 = + xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg32b5 = + xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4); + srcReg32b7 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); + + // have each consecutive loads on the same 256 register + srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21); + srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21); + srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21); + // merge every two consecutive registers except the last one + srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); + srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); + srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg32b8 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7))); + srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, + _mm256_castsi256_si128(srcReg32b8), 1); + srcReg32b9 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8))); + srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, + _mm256_castsi256_si128(srcReg32b9), 1); + + // merge every two consecutive registers + // save + srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_adds_epi16(srcReg32b8, srcReg32b12)); + + // shift by 6 bit each 16 bit + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32); + srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcReg32b1 = _mm256_packus_epi16(srcReg32b10, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi64(output_ptr, out_pitch, &srcReg32b1); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg32b10 = srcReg32b11; + srcReg32b11 = srcReg32b2; + srcReg32b2 = srcReg32b4; + srcReg32b7 = srcReg32b9; + } + if (i > 0) { + __m128i srcRegFilt1, srcRegFilt4, srcRegFilt6, srcRegFilt8; + // load the last 16 bytes + srcRegFilt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); + + // merge the last 2 results together + srcRegFilt4 = + _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt4 = + _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), + _mm256_castsi256_si128(secondFilters)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6)); + + // shift by 6 bit each 16 bit + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, _mm_setzero_si128()); + + // save 8 bytes + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1); + } +} + static void aom_filter_block1d16_v8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; - __m256i addFilterReg64; + __m256i addFilterReg32; __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; __m256i srcReg32b11, srcReg32b12, filtersReg32; @@ -281,11 +671,11 @@ static void aom_filter_block1d16_v8_avx2( unsigned int i; ptrdiff_t src_stride, dst_stride; - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); + addFilterReg32 = _mm256_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); @@ -308,49 +698,26 @@ static void aom_filter_block1d16_v8_avx2( dst_stride = out_pitch << 1; // load 16 bytes 7 times in stride of src_pitch - srcReg32b1 = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr))); - srcReg32b2 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); - srcReg32b3 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); - srcReg32b4 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); - srcReg32b5 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); - srcReg32b6 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr); + srcReg32b3 = + xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg32b5 = + xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4); srcReg32b7 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); // have each consecutive loads on the same 256 register - srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, - _mm256_castsi256_si128(srcReg32b2), 1); - srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, - _mm256_castsi256_si128(srcReg32b3), 1); - srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, - _mm256_castsi256_si128(srcReg32b4), 1); - srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, - _mm256_castsi256_si128(srcReg32b5), 1); - srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, - _mm256_castsi256_si128(srcReg32b6), 1); - srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, - _mm256_castsi256_si128(srcReg32b7), 1); - + srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21); + srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21); + srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21); // merge every two consecutive registers except the last one srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); // save srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); - - // save srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); - - // save srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); - - // save srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); for (i = output_height; i > 1; i -= 2) { @@ -383,9 +750,7 @@ static void aom_filter_block1d16_v8_avx2( // add and saturate the results together srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); + _mm256_adds_epi16(srcReg32b8, srcReg32b12)); // multiply 2 adjacent elements with the filter and add the result srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); @@ -399,16 +764,13 @@ static void aom_filter_block1d16_v8_avx2( // add and saturate the results together srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); - - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); + _mm256_adds_epi16(srcReg32b8, srcReg32b12)); - // shift by 7 bit each 16 bit - srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); - srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); + // shift by 6 bit each 16 bit + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg32); + srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6); + srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve @@ -417,12 +779,7 @@ static void aom_filter_block1d16_v8_avx2( src_ptr += src_stride; - // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcReg32b1)); - - // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + out_pitch), - _mm256_extractf128_si256(srcReg32b1, 1)); + xx_store2_mi128(output_ptr, out_pitch, &srcReg32b1); output_ptr += dst_stride; @@ -475,24 +832,17 @@ static void aom_filter_block1d16_v8_avx2( // add and saturate the results together srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6)); + _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6)); srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7)); + _mm_adds_epi16(srcRegFilt3, _mm_adds_epi16(srcRegFilt5, srcRegFilt7)); - // add and saturate the results together + // shift by 6 bit each 16 bit srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6)); + _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32)); srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7)); - - srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64)); - srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64)); - - // shift by 7 bit each 16 bit - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); + _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6); + srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve @@ -506,21 +856,6 @@ static void aom_filter_block1d16_v8_avx2( #if HAVE_AVX2 && HAVE_SSSE3 filter8_1dfunction aom_filter_block1d4_v8_ssse3; -#if ARCH_X86_64 -filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3; -filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3; -#define aom_filter_block1d8_v8_avx2 aom_filter_block1d8_v8_intrin_ssse3 -#define aom_filter_block1d8_h8_avx2 aom_filter_block1d8_h8_intrin_ssse3 -#define aom_filter_block1d4_h8_avx2 aom_filter_block1d4_h8_intrin_ssse3 -#else // ARCH_X86 -filter8_1dfunction aom_filter_block1d8_v8_ssse3; -filter8_1dfunction aom_filter_block1d8_h8_ssse3; -filter8_1dfunction aom_filter_block1d4_h8_ssse3; -#define aom_filter_block1d8_v8_avx2 aom_filter_block1d8_v8_ssse3 -#define aom_filter_block1d8_h8_avx2 aom_filter_block1d8_h8_ssse3 -#define aom_filter_block1d4_h8_avx2 aom_filter_block1d4_h8_ssse3 -#endif // ARCH_X86_64 filter8_1dfunction aom_filter_block1d16_v2_ssse3; filter8_1dfunction aom_filter_block1d16_h2_ssse3; filter8_1dfunction aom_filter_block1d8_v2_ssse3; diff --git a/third_party/aom/aom_dsp/x86/convolve_avx2.h b/third_party/aom/aom_dsp/x86/convolve_avx2.h index 7790baf2e..72fabd236 100644 --- a/third_party/aom/aom_dsp/x86/convolve_avx2.h +++ b/third_party/aom/aom_dsp/x86/convolve_avx2.h @@ -13,31 +13,27 @@ #define AOM_DSP_X86_CONVOLVE_AVX2_H_ // filters for 16 -DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, + 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5, + 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, + 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7, + 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 }; -DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 -}; - -DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, + 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, + 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, }; static INLINE void prepare_coeffs_lowbd( const InterpFilterParams *const filter_params, const int subpel_q4, __m256i *const coeffs /* [4] */) { const int16_t *const filter = av1_get_interp_filter_subpel_kernel( - *filter_params, subpel_q4 & SUBPEL_MASK); + filter_params, subpel_q4 & SUBPEL_MASK); const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); @@ -65,7 +61,7 @@ static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, const int subpel_q4, __m256i *const coeffs /* [4] */) { const int16_t *filter = av1_get_interp_filter_subpel_kernel( - *filter_params, subpel_q4 & SUBPEL_MASK); + filter_params, subpel_q4 & SUBPEL_MASK); const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); diff --git a/third_party/aom/aom_dsp/x86/convolve_sse2.h b/third_party/aom/aom_dsp/x86/convolve_sse2.h index 846fe7bb4..399df5d6d 100644 --- a/third_party/aom/aom_dsp/x86/convolve_sse2.h +++ b/third_party/aom/aom_dsp/x86/convolve_sse2.h @@ -19,7 +19,7 @@ static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, const int subpel_q4, __m128i *const coeffs /* [4] */) { const int16_t *filter = av1_get_interp_filter_subpel_kernel( - *filter_params, subpel_q4 & SUBPEL_MASK); + filter_params, subpel_q4 & SUBPEL_MASK); const __m128i coeff = _mm_loadu_si128((__m128i *)filter); // coeffs 0 1 0 1 0 1 0 1 diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c index e5e3238d5..099fcf7fc 100644 --- a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c +++ b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c @@ -105,8 +105,8 @@ void aom_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride, void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { int i, j; @@ -254,8 +254,8 @@ void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { int i, j; diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c index f7ac9b496..e7b33d1c4 100644 --- a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c +++ b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c @@ -18,8 +18,8 @@ void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { @@ -166,8 +166,8 @@ void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c index fdfadc886..131c16aa9 100644 --- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c @@ -676,7 +676,7 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, } } - const InterpFilterParams filter = + const InterpFilterParams *filter = av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); if (!subpel_x_q3 && !subpel_y_q3) { @@ -726,14 +726,14 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); const int intermediate_height = - (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps; + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); - aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter.taps >> 1) - 1), + aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter->taps >> 1) - 1), ref_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd); aom_highbd_convolve8_vert( - CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)), + CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)), MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y, 16, width, height, bd); } diff --git a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c index 9801e285c..eaf1f347b 100644 --- a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c +++ b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c @@ -22,118 +22,12 @@ void aom_var_filter_block2d_bil_first_pass_ssse3( const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, - unsigned int output_width, const uint8_t *filter) { - // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow - // in computation using _mm_maddubs_epi16. - // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow. - const int16_t round = (1 << (FILTER_BITS - 1)) >> 1; - const __m128i r = _mm_set1_epi16(round); - const uint8_t f0 = filter[0] >> 1; - const uint8_t f1 = filter[1] >> 1; - const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1, - f0, f1, f0, f1, f0, f1); - const __m128i shuffle_mask = - _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); - unsigned int i, j; - (void)pixel_step; - - if (output_width >= 8) { - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 8) { - // load source - __m128i source_low = xx_loadl_64(a); - __m128i source_hi = _mm_setzero_si128(); - - // avoid load undefined memory - if (a + 8 != NULL) source_hi = xx_loadl_64(a + 8); - __m128i source = _mm_unpacklo_epi64(source_low, source_hi); - - // shuffle to: - // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4], - // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] } - __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask); - - // b[i] = a[i] * filter[0] + a[i + 1] * filter[1] - __m128i res = _mm_maddubs_epi16(source_shuffle, filters); - - // round - res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1); - - xx_storeu_128(b, res); - - a += 8; - b += 8; - } - - a += src_pixels_per_line - output_width; - } - } else { - for (i = 0; i < output_height; ++i) { - // load source, only first 5 values are meaningful: - // { a[0], a[1], a[2], a[3], a[4], xxxx } - __m128i source = xx_loadl_64(a); - - // shuffle, up to the first 8 are useful - // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4], - // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] } - __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask); - - __m128i res = _mm_maddubs_epi16(source_shuffle, filters); - res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1); - - xx_storel_64(b, res); - - a += src_pixels_per_line; - b += output_width; - } - } -} + unsigned int output_width, const uint8_t *filter); void aom_var_filter_block2d_bil_second_pass_ssse3( const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, - unsigned int output_width, const uint8_t *filter) { - const int16_t round = (1 << FILTER_BITS) >> 1; - const __m128i r = _mm_set1_epi32(round); - const __m128i filters = - _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0], - filter[1], filter[0], filter[1]); - const __m128i shuffle_mask = - _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); - const __m128i mask = - _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - unsigned int i, j; - - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 4) { - // load source as: - // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] } - __m128i source1 = xx_loadl_64(a); - __m128i source2 = xx_loadl_64(a + pixel_step); - __m128i source = _mm_unpacklo_epi64(source1, source2); - - // shuffle source to: - // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] } - __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask); - - // b[i] = a[i] * filter[0] + a[w + i] * filter[1] - __m128i res = _mm_madd_epi16(source_shuffle, filters); - - // round - res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS); - - // shuffle to get each lower 8 bit of every 32 bit - res = _mm_shuffle_epi8(res, mask); - - xx_storel_32(b, res); - - a += 4; - b += 4; - } - - a += src_pixels_per_line - output_width; - } -} + unsigned int output_width, const uint8_t *filter); static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1, const __m128i *w, const __m128i *r, diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c new file mode 100644 index 000000000..6538e4d5e --- /dev/null +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c @@ -0,0 +1,390 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stdio.h> +#include <tmmintrin.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/blend.h" +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86//masked_sad_intrin_ssse3.h" + +static INLINE unsigned int masked_sad32xh_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, + int width, int height) { + int x, y; + __m256i res = _mm256_setzero_si256(); + const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_scale = + _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 32) { + const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]); + const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]); + const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]); + const __m256i m = _mm256_lddqu_si256((const __m256i *)&m_ptr[x]); + const __m256i m_inv = _mm256_sub_epi8(mask_max, m); + + // Calculate 16 predicted pixels. + // Note that the maximum value of any entry of 'pred_l' or 'pred_r' + // is 64 * 255, so we have plenty of space to add rounding constants. + const __m256i data_l = _mm256_unpacklo_epi8(a, b); + const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv); + __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l); + pred_l = _mm256_mulhrs_epi16(pred_l, round_scale); + + const __m256i data_r = _mm256_unpackhi_epi8(a, b); + const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv); + __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r); + pred_r = _mm256_mulhrs_epi16(pred_r, round_scale); + + const __m256i pred = _mm256_packus_epi16(pred_l, pred_r); + res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src)); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. + res = _mm256_shuffle_epi32(res, 0xd8); + res = _mm256_permute4x64_epi64(res, 0xd8); + res = _mm256_hadd_epi32(res, res); + res = _mm256_hadd_epi32(res, res); + int32_t sad = _mm256_extract_epi32(res, 0); + return (sad + 31) >> 6; +} + +static INLINE __m256i xx_loadu2_m128i(const void *hi, const void *lo) { + __m128i a0 = _mm_lddqu_si128((const __m128i *)(lo)); + __m128i a1 = _mm_lddqu_si128((const __m128i *)(hi)); + __m256i a = _mm256_castsi128_si256(a0); + return _mm256_inserti128_si256(a, a1, 1); +} + +static INLINE unsigned int masked_sad16xh_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, + int height) { + int y; + __m256i res = _mm256_setzero_si256(); + const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_scale = + _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + for (y = 0; y < height; y += 2) { + const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr); + const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr); + const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr); + const __m256i m = xx_loadu2_m128i(m_ptr + m_stride, m_ptr); + const __m256i m_inv = _mm256_sub_epi8(mask_max, m); + + // Calculate 16 predicted pixels. + // Note that the maximum value of any entry of 'pred_l' or 'pred_r' + // is 64 * 255, so we have plenty of space to add rounding constants. + const __m256i data_l = _mm256_unpacklo_epi8(a, b); + const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv); + __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l); + pred_l = _mm256_mulhrs_epi16(pred_l, round_scale); + + const __m256i data_r = _mm256_unpackhi_epi8(a, b); + const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv); + __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r); + pred_r = _mm256_mulhrs_epi16(pred_r, round_scale); + + const __m256i pred = _mm256_packus_epi16(pred_l, pred_r); + res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src)); + + src_ptr += src_stride << 1; + a_ptr += a_stride << 1; + b_ptr += b_stride << 1; + m_ptr += m_stride << 1; + } + // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. + res = _mm256_shuffle_epi32(res, 0xd8); + res = _mm256_permute4x64_epi64(res, 0xd8); + res = _mm256_hadd_epi32(res, res); + res = _mm256_hadd_epi32(res, res); + int32_t sad = _mm256_extract_epi32(res, 0); + return (sad + 31) >> 6; +} + +static INLINE unsigned int aom_masked_sad_avx2( + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, + int invert_mask, int m, int n) { + unsigned int sad; + if (!invert_mask) { + switch (m) { + case 4: + sad = aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + case 8: + sad = aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + case 16: + sad = masked_sad16xh_avx2(src, src_stride, ref, ref_stride, second_pred, + m, msk, msk_stride, n); + break; + default: + sad = masked_sad32xh_avx2(src, src_stride, ref, ref_stride, second_pred, + m, msk, msk_stride, m, n); + break; + } + } else { + switch (m) { + case 4: + sad = aom_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + case 8: + sad = aom_masked_sad8xh_ssse3(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + case 16: + sad = masked_sad16xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + default: + sad = masked_sad32xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, m, n); + break; + } + } + return sad; +} + +#define MASKSADMXN_AVX2(m, n) \ + unsigned int aom_masked_sad##m##x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ + int invert_mask) { \ + return aom_masked_sad_avx2(src, src_stride, ref, ref_stride, second_pred, \ + msk, msk_stride, invert_mask, m, n); \ + } + +MASKSADMXN_AVX2(4, 4) +MASKSADMXN_AVX2(4, 8) +MASKSADMXN_AVX2(8, 4) +MASKSADMXN_AVX2(8, 8) +MASKSADMXN_AVX2(8, 16) +MASKSADMXN_AVX2(16, 8) +MASKSADMXN_AVX2(16, 16) +MASKSADMXN_AVX2(16, 32) +MASKSADMXN_AVX2(32, 16) +MASKSADMXN_AVX2(32, 32) +MASKSADMXN_AVX2(32, 64) +MASKSADMXN_AVX2(64, 32) +MASKSADMXN_AVX2(64, 64) +MASKSADMXN_AVX2(64, 128) +MASKSADMXN_AVX2(128, 64) +MASKSADMXN_AVX2(128, 128) +MASKSADMXN_AVX2(4, 16) +MASKSADMXN_AVX2(16, 4) +MASKSADMXN_AVX2(8, 32) +MASKSADMXN_AVX2(32, 8) +MASKSADMXN_AVX2(16, 64) +MASKSADMXN_AVX2(64, 16) + +static INLINE unsigned int highbd_masked_sad8xh_avx2( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, + int height) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); + int y; + __m256i res = _mm256_setzero_si256(); + const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_const = + _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m256i one = _mm256_set1_epi16(1); + + for (y = 0; y < height; y += 2) { + const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr); + const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr); + const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr); + // Zero-extend mask to 16 bits + const __m256i m = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(m_ptr)), + _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride)))); + const __m256i m_inv = _mm256_sub_epi16(mask_max, m); + + const __m256i data_l = _mm256_unpacklo_epi16(a, b); + const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv); + __m256i pred_l = _mm256_madd_epi16(data_l, mask_l); + pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m256i data_r = _mm256_unpackhi_epi16(a, b); + const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv); + __m256i pred_r = _mm256_madd_epi16(data_r, mask_r); + pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, + // so it is safe to do signed saturation here. + const __m256i pred = _mm256_packs_epi32(pred_l, pred_r); + // There is no 16-bit SAD instruction, so we have to synthesize + // an 8-element SAD. We do this by storing 4 32-bit partial SADs, + // and accumulating them at the end + const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src)); + res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one)); + + src_ptr += src_stride << 1; + a_ptr += a_stride << 1; + b_ptr += b_stride << 1; + m_ptr += m_stride << 1; + } + // At this point, we have four 32-bit partial SADs stored in 'res'. + res = _mm256_hadd_epi32(res, res); + res = _mm256_hadd_epi32(res, res); + int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4); + return (sad + 31) >> 6; +} + +static INLINE unsigned int highbd_masked_sad16xh_avx2( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, + int width, int height) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); + int x, y; + __m256i res = _mm256_setzero_si256(); + const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_const = + _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m256i one = _mm256_set1_epi16(1); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]); + const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]); + const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]); + // Zero-extend mask to 16 bits + const __m256i m = + _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)&m_ptr[x])); + const __m256i m_inv = _mm256_sub_epi16(mask_max, m); + + const __m256i data_l = _mm256_unpacklo_epi16(a, b); + const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv); + __m256i pred_l = _mm256_madd_epi16(data_l, mask_l); + pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m256i data_r = _mm256_unpackhi_epi16(a, b); + const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv); + __m256i pred_r = _mm256_madd_epi16(data_r, mask_r); + pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, + // so it is safe to do signed saturation here. + const __m256i pred = _mm256_packs_epi32(pred_l, pred_r); + // There is no 16-bit SAD instruction, so we have to synthesize + // an 8-element SAD. We do this by storing 4 32-bit partial SADs, + // and accumulating them at the end + const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src)); + res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one)); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // At this point, we have four 32-bit partial SADs stored in 'res'. + res = _mm256_hadd_epi32(res, res); + res = _mm256_hadd_epi32(res, res); + int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4); + return (sad + 31) >> 6; +} + +static INLINE unsigned int aom_highbd_masked_sad_avx2( + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, + int invert_mask, int m, int n) { + unsigned int sad; + if (!invert_mask) { + switch (m) { + case 4: + sad = + aom_highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + case 8: + sad = highbd_masked_sad8xh_avx2(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + default: + sad = highbd_masked_sad16xh_avx2(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, m, n); + break; + } + } else { + switch (m) { + case 4: + sad = + aom_highbd_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + case 8: + sad = highbd_masked_sad8xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + default: + sad = highbd_masked_sad16xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, m, n); + break; + } + } + return sad; +} + +#define HIGHBD_MASKSADMXN_AVX2(m, n) \ + unsigned int aom_highbd_masked_sad##m##x##n##_avx2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ + int msk_stride, int invert_mask) { \ + return aom_highbd_masked_sad_avx2(src8, src_stride, ref8, ref_stride, \ + second_pred8, msk, msk_stride, \ + invert_mask, m, n); \ + } + +HIGHBD_MASKSADMXN_AVX2(4, 4); +HIGHBD_MASKSADMXN_AVX2(4, 8); +HIGHBD_MASKSADMXN_AVX2(8, 4); +HIGHBD_MASKSADMXN_AVX2(8, 8); +HIGHBD_MASKSADMXN_AVX2(8, 16); +HIGHBD_MASKSADMXN_AVX2(16, 8); +HIGHBD_MASKSADMXN_AVX2(16, 16); +HIGHBD_MASKSADMXN_AVX2(16, 32); +HIGHBD_MASKSADMXN_AVX2(32, 16); +HIGHBD_MASKSADMXN_AVX2(32, 32); +HIGHBD_MASKSADMXN_AVX2(32, 64); +HIGHBD_MASKSADMXN_AVX2(64, 32); +HIGHBD_MASKSADMXN_AVX2(64, 64); +HIGHBD_MASKSADMXN_AVX2(64, 128); +HIGHBD_MASKSADMXN_AVX2(128, 64); +HIGHBD_MASKSADMXN_AVX2(128, 128); +HIGHBD_MASKSADMXN_AVX2(4, 16); +HIGHBD_MASKSADMXN_AVX2(16, 4); +HIGHBD_MASKSADMXN_AVX2(8, 32); +HIGHBD_MASKSADMXN_AVX2(32, 8); +HIGHBD_MASKSADMXN_AVX2(16, 64); +HIGHBD_MASKSADMXN_AVX2(64, 16); diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c index 1f42eec2f..493f9bd8f 100644 --- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c @@ -19,6 +19,8 @@ #include "aom/aom_integer.h" #include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86//masked_sad_intrin_ssse3.h" + // For width a multiple of 16 static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, int src_stride, @@ -27,16 +29,6 @@ static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, const uint8_t *m_ptr, int m_stride, int width, int height); -static INLINE unsigned int masked_sad8xh_ssse3( - const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, - const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, - int height); - -static INLINE unsigned int masked_sad4xh_ssse3( - const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, - const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, - int height); - #define MASKSADMXN_SSSE3(m, n) \ unsigned int aom_masked_sad##m##x##n##_ssse3( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ @@ -56,11 +48,11 @@ static INLINE unsigned int masked_sad4xh_ssse3( const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ int invert_mask) { \ if (!invert_mask) \ - return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, \ - second_pred, 8, msk, msk_stride, n); \ + return aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, \ + second_pred, 8, msk, msk_stride, n); \ else \ - return masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref, \ - ref_stride, msk, msk_stride, n); \ + return aom_masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref, \ + ref_stride, msk, msk_stride, n); \ } #define MASKSAD4XN_SSSE3(n) \ @@ -69,11 +61,11 @@ static INLINE unsigned int masked_sad4xh_ssse3( const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ int invert_mask) { \ if (!invert_mask) \ - return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, \ - second_pred, 4, msk, msk_stride, n); \ + return aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, \ + second_pred, 4, msk, msk_stride, n); \ else \ - return masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref, \ - ref_stride, msk, msk_stride, n); \ + return aom_masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref, \ + ref_stride, msk, msk_stride, n); \ } MASKSADMXN_SSSE3(128, 128) @@ -145,10 +137,11 @@ static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, return (sad + 31) >> 6; } -static INLINE unsigned int masked_sad8xh_ssse3( - const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, - const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, - int height) { +unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height) { int y; __m128i res = _mm_setzero_si128(); const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); @@ -189,10 +182,11 @@ static INLINE unsigned int masked_sad8xh_ssse3( return (sad + 31) >> 6; } -static INLINE unsigned int masked_sad4xh_ssse3( - const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, - const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, - int height) { +unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height) { int y; __m128i res = _mm_setzero_si128(); const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); @@ -238,11 +232,6 @@ static INLINE unsigned int highbd_masked_sad_ssse3( const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, int width, int height); -static INLINE unsigned int highbd_masked_sad4xh_ssse3( - const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, - int height); - #define HIGHBD_MASKSADMXN_SSSE3(m, n) \ unsigned int aom_highbd_masked_sad##m##x##n##_ssse3( \ const uint8_t *src8, int src_stride, const uint8_t *ref8, \ @@ -262,11 +251,13 @@ static INLINE unsigned int highbd_masked_sad4xh_ssse3( int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ int msk_stride, int invert_mask) { \ if (!invert_mask) \ - return highbd_masked_sad4xh_ssse3(src8, src_stride, ref8, ref_stride, \ - second_pred8, 4, msk, msk_stride, n); \ + return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, ref8, \ + ref_stride, second_pred8, 4, msk, \ + msk_stride, n); \ else \ - return highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4, \ - ref8, ref_stride, msk, msk_stride, n); \ + return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4, \ + ref8, ref_stride, msk, msk_stride, \ + n); \ } HIGHBD_MASKSADMXN_SSSE3(128, 128) @@ -350,10 +341,11 @@ static INLINE unsigned int highbd_masked_sad_ssse3( return (sad + 31) >> 6; } -static INLINE unsigned int highbd_masked_sad4xh_ssse3( - const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, - int height) { +unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride, + const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height) { const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h new file mode 100644 index 000000000..19b429d91 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H +#define _AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H + +unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height); + +unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height); + +unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride, + const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height); + +#endif diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c b/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c new file mode 100644 index 000000000..2aa2a0555 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <immintrin.h> + +#include "config/aom_config.h" + +#include "aom_ports/mem.h" +#include "aom/aom_integer.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/x86/obmc_intrinsic_ssse3.h" +#include "aom_dsp/x86/synonyms.h" + +//////////////////////////////////////////////////////////////////////////////// +// 8 bit +//////////////////////////////////////////////////////////////////////////////// + +static INLINE unsigned int obmc_sad_w4_avx2(const uint8_t *pre, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, + const int height) { + int n = 0; + __m256i v_sad_d = _mm256_setzero_si256(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + + do { + const __m128i v_p_b_0 = xx_loadl_32(pre); + const __m128i v_p_b_1 = xx_loadl_32(pre + pre_stride); + const __m128i v_p_b = _mm_unpacklo_epi32(v_p_b_0, v_p_b_1); + const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n)); + const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); + + const __m256i v_p_d = _mm256_cvtepu8_epi32(v_p_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d); + + const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d); + const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d); + + // Rounded absolute difference + const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d); + const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12); + + v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d); + + n += 8; + pre += pre_stride << 1; + } while (n < 8 * (height >> 1)); + + __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); + __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); + v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); + return xx_hsum_epi32_si32(v_sad_d_0); +} + +static INLINE unsigned int obmc_sad_w8n_avx2( + const uint8_t *pre, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, const int width, const int height) { + const int pre_step = pre_stride - width; + int n = 0; + __m256i v_sad_d = _mm256_setzero_si256(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + assert(width >= 8); + assert(IS_POWER_OF_TWO(width)); + + do { + const __m128i v_p0_b = xx_loadl_64(pre + n); + const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n)); + const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); + + const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p0_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d); + + const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d); + const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d); + + // Rounded absolute difference + const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d); + const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12); + + v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d); + + n += 8; + + if ((n & (width - 1)) == 0) pre += pre_step; + } while (n < width * height); + + __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); + __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); + v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); + return xx_hsum_epi32_si32(v_sad_d_0); +} + +#define OBMCSADWXH(w, h) \ + unsigned int aom_obmc_sad##w##x##h##_avx2( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *msk) { \ + if (w == 4) { \ + return obmc_sad_w4_avx2(pre, pre_stride, wsrc, msk, h); \ + } else { \ + return obmc_sad_w8n_avx2(pre, pre_stride, wsrc, msk, w, h); \ + } \ + } + +OBMCSADWXH(128, 128) +OBMCSADWXH(128, 64) +OBMCSADWXH(64, 128) +OBMCSADWXH(64, 64) +OBMCSADWXH(64, 32) +OBMCSADWXH(32, 64) +OBMCSADWXH(32, 32) +OBMCSADWXH(32, 16) +OBMCSADWXH(16, 32) +OBMCSADWXH(16, 16) +OBMCSADWXH(16, 8) +OBMCSADWXH(8, 16) +OBMCSADWXH(8, 8) +OBMCSADWXH(8, 4) +OBMCSADWXH(4, 8) +OBMCSADWXH(4, 4) +OBMCSADWXH(4, 16) +OBMCSADWXH(16, 4) +OBMCSADWXH(8, 32) +OBMCSADWXH(32, 8) +OBMCSADWXH(16, 64) +OBMCSADWXH(64, 16) + +//////////////////////////////////////////////////////////////////////////////// +// High bit-depth +//////////////////////////////////////////////////////////////////////////////// + +static INLINE unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, + const int height) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + int n = 0; + __m256i v_sad_d = _mm256_setzero_si256(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + do { + const __m128i v_p_w_0 = xx_loadl_64(pre); + const __m128i v_p_w_1 = xx_loadl_64(pre + pre_stride); + const __m128i v_p_w = _mm_unpacklo_epi64(v_p_w_0, v_p_w_1); + const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n)); + const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); + + const __m256i v_p_d = _mm256_cvtepu16_epi32(v_p_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d); + + const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d); + const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d); + + // Rounded absolute difference + + const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d); + const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12); + + v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d); + + n += 8; + + pre += pre_stride << 1; + } while (n < 8 * (height >> 1)); + + __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); + __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); + v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); + return xx_hsum_epi32_si32(v_sad_d_0); +} + +static INLINE unsigned int hbd_obmc_sad_w8n_avx2( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, const int width, const int height) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - width; + int n = 0; + __m256i v_sad_d = _mm256_setzero_si256(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + + assert(width >= 8); + assert(IS_POWER_OF_TWO(width)); + + do { + const __m128i v_p0_w = _mm_lddqu_si128((__m128i *)(pre + n)); + const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n)); + const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); + + const __m256i v_p0_d = _mm256_cvtepu16_epi32(v_p0_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d); + + const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d); + const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d); + + // Rounded absolute difference + const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d); + const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12); + + v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d); + + n += 8; + + if (n % width == 0) pre += pre_step; + } while (n < width * height); + + __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); + __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); + v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); + return xx_hsum_epi32_si32(v_sad_d_0); +} + +#define HBD_OBMCSADWXH(w, h) \ + unsigned int aom_highbd_obmc_sad##w##x##h##_avx2( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask) { \ + if (w == 4) { \ + return hbd_obmc_sad_w4_avx2(pre, pre_stride, wsrc, mask, h); \ + } else { \ + return hbd_obmc_sad_w8n_avx2(pre, pre_stride, wsrc, mask, w, h); \ + } \ + } + +HBD_OBMCSADWXH(128, 128) +HBD_OBMCSADWXH(128, 64) +HBD_OBMCSADWXH(64, 128) +HBD_OBMCSADWXH(64, 64) +HBD_OBMCSADWXH(64, 32) +HBD_OBMCSADWXH(32, 64) +HBD_OBMCSADWXH(32, 32) +HBD_OBMCSADWXH(32, 16) +HBD_OBMCSADWXH(16, 32) +HBD_OBMCSADWXH(16, 16) +HBD_OBMCSADWXH(16, 8) +HBD_OBMCSADWXH(8, 16) +HBD_OBMCSADWXH(8, 8) +HBD_OBMCSADWXH(8, 4) +HBD_OBMCSADWXH(4, 8) +HBD_OBMCSADWXH(4, 4) +HBD_OBMCSADWXH(4, 16) +HBD_OBMCSADWXH(16, 4) +HBD_OBMCSADWXH(8, 32) +HBD_OBMCSADWXH(32, 8) +HBD_OBMCSADWXH(16, 64) +HBD_OBMCSADWXH(64, 16) diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c index 571aa770b..2e2f6e09f 100644 --- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c +++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c @@ -26,6 +26,16 @@ // 8 bit //////////////////////////////////////////////////////////////////////////////// +void aom_var_filter_block2d_bil_first_pass_ssse3( + const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter); + +void aom_var_filter_block2d_bil_second_pass_ssse3( + const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter); + static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *const sse, int *const sum, @@ -152,6 +162,46 @@ OBMCVARWXH(32, 8) OBMCVARWXH(16, 64) OBMCVARWXH(64, 16) +#include "config/aom_dsp_rtcd.h" + +#define OBMC_SUBPIX_VAR(W, H) \ + uint32_t aom_obmc_sub_pixel_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + aom_var_filter_block2d_bil_first_pass_ssse3( \ + pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_ssse3( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_obmc_variance##W##x##H##_sse4_1(temp2, W, wsrc, mask, sse); \ + } + +OBMC_SUBPIX_VAR(128, 128) +OBMC_SUBPIX_VAR(128, 64) +OBMC_SUBPIX_VAR(64, 128) +OBMC_SUBPIX_VAR(64, 64) +OBMC_SUBPIX_VAR(64, 32) +OBMC_SUBPIX_VAR(32, 64) +OBMC_SUBPIX_VAR(32, 32) +OBMC_SUBPIX_VAR(32, 16) +OBMC_SUBPIX_VAR(16, 32) +OBMC_SUBPIX_VAR(16, 16) +OBMC_SUBPIX_VAR(16, 8) +OBMC_SUBPIX_VAR(8, 16) +OBMC_SUBPIX_VAR(8, 8) +OBMC_SUBPIX_VAR(8, 4) +OBMC_SUBPIX_VAR(4, 8) +OBMC_SUBPIX_VAR(4, 4) +OBMC_SUBPIX_VAR(4, 16) +OBMC_SUBPIX_VAR(16, 4) +OBMC_SUBPIX_VAR(8, 32) +OBMC_SUBPIX_VAR(32, 8) +OBMC_SUBPIX_VAR(16, 64) +OBMC_SUBPIX_VAR(64, 16) + //////////////////////////////////////////////////////////////////////////////// // High bit-depth //////////////////////////////////////////////////////////////////////////////// diff --git a/third_party/aom/aom_dsp/x86/subtract_avx2.c b/third_party/aom/aom_dsp/x86/subtract_avx2.c new file mode 100644 index 000000000..4389d123d --- /dev/null +++ b/third_party/aom/aom_dsp/x86/subtract_avx2.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" + +static INLINE void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr, + const uint8_t *pred_ptr) { + __m256i s = _mm256_lddqu_si256((__m256i *)(src_ptr)); + __m256i p = _mm256_lddqu_si256((__m256i *)(pred_ptr)); + __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s)); + __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1)); + __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p)); + __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1)); + const __m256i d_0 = _mm256_sub_epi16(s_0, p_0); + const __m256i d_1 = _mm256_sub_epi16(s_1, p_1); + _mm256_store_si256((__m256i *)(diff_ptr), d_0); + _mm256_store_si256((__m256i *)(diff_ptr + 16), d_1); +} + +static INLINE void aom_subtract_block_16xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + for (int32_t j = 0; j < rows; ++j) { + __m128i s = _mm_lddqu_si128((__m128i *)(src_ptr)); + __m128i p = _mm_lddqu_si128((__m128i *)(pred_ptr)); + __m256i s_0 = _mm256_cvtepu8_epi16(s); + __m256i p_0 = _mm256_cvtepu8_epi16(p); + const __m256i d_0 = _mm256_sub_epi16(s_0, p_0); + _mm256_store_si256((__m256i *)(diff_ptr), d_0); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +static INLINE void aom_subtract_block_32xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + for (int32_t j = 0; j < rows; ++j) { + subtract32_avx2(diff_ptr, src_ptr, pred_ptr); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +static INLINE void aom_subtract_block_64xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + for (int32_t j = 0; j < rows; ++j) { + subtract32_avx2(diff_ptr, src_ptr, pred_ptr); + subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +static INLINE void aom_subtract_block_128xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + for (int32_t j = 0; j < rows; ++j) { + subtract32_avx2(diff_ptr, src_ptr, pred_ptr); + subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32); + subtract32_avx2(diff_ptr + 64, src_ptr + 64, pred_ptr + 64); + subtract32_avx2(diff_ptr + 96, src_ptr + 96, pred_ptr + 96); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +void aom_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride) { + switch (cols) { + case 16: + aom_subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + case 32: + aom_subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + case 64: + aom_subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + case 128: + aom_subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + default: + aom_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + } +} diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h new file mode 100644 index 000000000..bdff64b8f --- /dev/null +++ b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_TXFM_COMMON_AVX2_H_ +#define AOM_DSP_X86_TXFM_COMMON_AVX2_H_ + +#include <emmintrin.h> +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output, + int8_t cos_bit); + +static INLINE __m256i pair_set_w16_epi16(int16_t a, int16_t b) { + return _mm256_set1_epi32( + (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))); +} + +static INLINE void btf_16_w16_avx2(const __m256i w0, const __m256i w1, + __m256i *in0, __m256i *in1, const __m256i _r, + const int32_t cos_bit) { + __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1); + __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1); + __m256i u0 = _mm256_madd_epi16(t0, w0); + __m256i u1 = _mm256_madd_epi16(t1, w0); + __m256i v0 = _mm256_madd_epi16(t0, w1); + __m256i v1 = _mm256_madd_epi16(t1, w1); + + __m256i a0 = _mm256_add_epi32(u0, _r); + __m256i a1 = _mm256_add_epi32(u1, _r); + __m256i b0 = _mm256_add_epi32(v0, _r); + __m256i b1 = _mm256_add_epi32(v1, _r); + + __m256i c0 = _mm256_srai_epi32(a0, cos_bit); + __m256i c1 = _mm256_srai_epi32(a1, cos_bit); + __m256i d0 = _mm256_srai_epi32(b0, cos_bit); + __m256i d1 = _mm256_srai_epi32(b1, cos_bit); + + *in0 = _mm256_packs_epi32(c0, c1); + *in1 = _mm256_packs_epi32(d0, d1); +} + +static INLINE void btf_16_adds_subs_avx2(__m256i *in0, __m256i *in1) { + const __m256i _in0 = *in0; + const __m256i _in1 = *in1; + *in0 = _mm256_adds_epi16(_in0, _in1); + *in1 = _mm256_subs_epi16(_in0, _in1); +} + +static INLINE void btf_32_add_sub_avx2(__m256i *in0, __m256i *in1) { + const __m256i _in0 = *in0; + const __m256i _in1 = *in1; + *in0 = _mm256_add_epi32(_in0, _in1); + *in1 = _mm256_sub_epi32(_in0, _in1); +} + +static INLINE void btf_16_adds_subs_out_avx2(__m256i *out0, __m256i *out1, + __m256i in0, __m256i in1) { + const __m256i _in0 = in0; + const __m256i _in1 = in1; + *out0 = _mm256_adds_epi16(_in0, _in1); + *out1 = _mm256_subs_epi16(_in0, _in1); +} + +static INLINE void btf_32_add_sub_out_avx2(__m256i *out0, __m256i *out1, + __m256i in0, __m256i in1) { + const __m256i _in0 = in0; + const __m256i _in1 = in1; + *out0 = _mm256_add_epi32(_in0, _in1); + *out1 = _mm256_sub_epi32(_in0, _in1); +} + +static INLINE __m256i load_16bit_to_16bit_avx2(const int16_t *a) { + return _mm256_load_si256((const __m256i *)a); +} + +static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in, + int stride, __m256i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = load_16bit_to_16bit_avx2(in + i * stride); + } +} + +static INLINE void load_buffer_16bit_to_16bit_flip_avx2(const int16_t *in, + int stride, + __m256i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[out_size - i - 1] = load_16bit_to_16bit_avx2(in + i * stride); + } +} + +static INLINE __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) { + const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a); + const __m256i b = _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8)); + return _mm256_permute4x64_epi64(b, 0xD8); +} + +static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in, + int stride, __m256i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = load_32bit_to_16bit_w16_avx2(in + i * stride); + } +} + +static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in, + __m256i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 08 09 0a 0b 04 05 06 07 0c 0d 0e 0f + // in[1]: 10 11 12 13 18 19 1a 1b 14 15 16 17 1c 1d 1e 1f + // in[2]: 20 21 22 23 28 29 2a 2b 24 25 26 27 2c 2d 2e 2f + // in[3]: 30 31 32 33 38 39 3a 3b 34 35 36 37 3c 3d 3e 3f + // in[4]: 40 41 42 43 48 49 4a 4b 44 45 46 47 4c 4d 4e 4f + // in[5]: 50 51 52 53 58 59 5a 5b 54 55 56 57 5c 5d 5e 5f + // in[6]: 60 61 62 63 68 69 6a 6b 64 65 66 67 6c 6d 6e 6f + // in[7]: 70 71 72 73 78 79 7a 7b 74 75 76 77 7c 7d 7e 7f + // in[8]: 80 81 82 83 88 89 8a 8b 84 85 86 87 8c 8d 8e 8f + // to: + // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + // ... + __m256i a[16]; + for (int i = 0; i < 16; i += 2) { + a[i / 2 + 0] = _mm256_unpacklo_epi16(in[i], in[i + 1]); + a[i / 2 + 8] = _mm256_unpackhi_epi16(in[i], in[i + 1]); + } + __m256i b[16]; + for (int i = 0; i < 16; i += 2) { + b[i / 2 + 0] = _mm256_unpacklo_epi32(a[i], a[i + 1]); + b[i / 2 + 8] = _mm256_unpackhi_epi32(a[i], a[i + 1]); + } + __m256i c[16]; + for (int i = 0; i < 16; i += 2) { + c[i / 2 + 0] = _mm256_unpacklo_epi64(b[i], b[i + 1]); + c[i / 2 + 8] = _mm256_unpackhi_epi64(b[i], b[i + 1]); + } + out[0 + 0] = _mm256_permute2x128_si256(c[0], c[1], 0x20); + out[1 + 0] = _mm256_permute2x128_si256(c[8], c[9], 0x20); + out[2 + 0] = _mm256_permute2x128_si256(c[4], c[5], 0x20); + out[3 + 0] = _mm256_permute2x128_si256(c[12], c[13], 0x20); + + out[0 + 8] = _mm256_permute2x128_si256(c[0], c[1], 0x31); + out[1 + 8] = _mm256_permute2x128_si256(c[8], c[9], 0x31); + out[2 + 8] = _mm256_permute2x128_si256(c[4], c[5], 0x31); + out[3 + 8] = _mm256_permute2x128_si256(c[12], c[13], 0x31); + + out[4 + 0] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x20); + out[5 + 0] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x20); + out[6 + 0] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x20); + out[7 + 0] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x20); + + out[4 + 8] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x31); + out[5 + 8] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x31); + out[6 + 8] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x31); + out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31); +} + +static INLINE void flip_buf_avx2(__m256i *in, __m256i *out, int size) { + for (int i = 0; i < size; ++i) { + out[size - i - 1] = in[i]; + } +} + +static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) { + if (bit < 0) { + bit = -bit; + __m256i round = _mm256_set1_epi16(1 << (bit - 1)); + for (int i = 0; i < size; ++i) { + in[i] = _mm256_adds_epi16(in[i], round); + in[i] = _mm256_srai_epi16(in[i], bit); + } + } else if (bit > 0) { + for (int i = 0; i < size; ++i) { + in[i] = _mm256_slli_epi16(in[i], bit); + } + } +} + +#ifdef __cplusplus +} +#endif + +#endif // AOM_DSP_X86_TXFM_COMMON_AVX2_H_ diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c index 7d6b7d287..a7ac2c93d 100644 --- a/third_party/aom/aom_dsp/x86/variance_avx2.c +++ b/third_party/aom/aom_dsp/x86/variance_avx2.c @@ -324,6 +324,12 @@ static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) { return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1); } +static INLINE __m256i mm256_loadu2_16(const uint16_t *p0, const uint16_t *p1) { + const __m256i d = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1)); + return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1); +} + static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1, const __m256i a, uint8_t *comp_pred) { @@ -401,3 +407,110 @@ void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, } while (i < height); } } + +static INLINE __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0, + const __m256i s1, + const __m256i a) { + const __m256i alpha_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_const = + _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m256i a_inv = _mm256_sub_epi16(alpha_max, a); + + const __m256i s_lo = _mm256_unpacklo_epi16(s0, s1); + const __m256i a_lo = _mm256_unpacklo_epi16(a, a_inv); + const __m256i pred_lo = _mm256_madd_epi16(s_lo, a_lo); + const __m256i pred_l = _mm256_srai_epi32( + _mm256_add_epi32(pred_lo, round_const), AOM_BLEND_A64_ROUND_BITS); + + const __m256i s_hi = _mm256_unpackhi_epi16(s0, s1); + const __m256i a_hi = _mm256_unpackhi_epi16(a, a_inv); + const __m256i pred_hi = _mm256_madd_epi16(s_hi, a_hi); + const __m256i pred_h = _mm256_srai_epi32( + _mm256_add_epi32(pred_hi, round_const), AOM_BLEND_A64_ROUND_BITS); + + const __m256i comp = _mm256_packs_epi32(pred_l, pred_h); + + return comp; +} + +void aom_highbd_comp_mask_pred_avx2(uint16_t *comp_pred, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + int i = 0; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + const uint16_t *src0 = invert_mask ? pred : ref; + const uint16_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? ref_stride : width; + const __m256i zero = _mm256_setzero_si256(); + + if (width == 8) { + do { + const __m256i s0 = mm256_loadu2_16(src0 + stride0, src0); + const __m256i s1 = mm256_loadu2_16(src1 + stride1, src1); + + const __m128i m_l = _mm_loadl_epi64((const __m128i *)mask); + const __m128i m_h = _mm_loadl_epi64((const __m128i *)(mask + 8)); + + __m256i m = _mm256_castsi128_si256(m_l); + m = _mm256_insertf128_si256(m, m_h, 1); + const __m256i m_16 = _mm256_unpacklo_epi8(m, zero); + + const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16); + + _mm_storeu_si128((__m128i *)(comp_pred), _mm256_castsi256_si128(comp)); + + _mm_storeu_si128((__m128i *)(comp_pred + width), + _mm256_extractf128_si256(comp, 1)); + + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + comp_pred += (width << 1); + i += 2; + } while (i < height); + } else if (width == 16) { + do { + const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0)); + const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1)); + const __m256i m_16 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask)); + + const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16); + + _mm256_storeu_si256((__m256i *)comp_pred, comp); + + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } else if (width == 32) { + do { + const __m256i s0 = _mm256_loadu_si256((const __m256i *)src0); + const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + 16)); + const __m256i s1 = _mm256_loadu_si256((const __m256i *)src1); + const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + 16)); + + const __m256i m01_16 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask)); + const __m256i m23_16 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + 16))); + + const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16); + const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16); + + _mm256_storeu_si256((__m256i *)comp_pred, comp); + _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1); + + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } +} diff --git a/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c b/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c new file mode 100644 index 000000000..66b0d7d84 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <tmmintrin.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" + +void aom_var_filter_block2d_bil_first_pass_ssse3( + const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { + // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow + // in computation using _mm_maddubs_epi16. + // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow. + const int16_t round = (1 << (FILTER_BITS - 1)) >> 1; + const __m128i r = _mm_set1_epi16(round); + const uint8_t f0 = filter[0] >> 1; + const uint8_t f1 = filter[1] >> 1; + const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1, + f0, f1, f0, f1, f0, f1); + unsigned int i, j; + (void)pixel_step; + + if (output_width >= 8) { + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 8) { + // load source + __m128i source_low = xx_loadl_64(a); + __m128i source_hi = xx_loadl_64(a + 1); + + // unpack to: + // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4], + // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] } + __m128i source = _mm_unpacklo_epi8(source_low, source_hi); + + // b[i] = a[i] * filter[0] + a[i + 1] * filter[1] + __m128i res = _mm_maddubs_epi16(source, filters); + + // round + res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1); + + xx_storeu_128(b, res); + + a += 8; + b += 8; + } + + a += src_pixels_per_line - output_width; + } + } else { + const __m128i shuffle_mask = + _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + for (i = 0; i < output_height; ++i) { + // load source, only first 5 values are meaningful: + // { a[0], a[1], a[2], a[3], a[4], xxxx } + __m128i source = xx_loadl_64(a); + + // shuffle, up to the first 8 are useful + // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4], + // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] } + __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask); + + __m128i res = _mm_maddubs_epi16(source_shuffle, filters); + res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1); + + xx_storel_64(b, res); + + a += src_pixels_per_line; + b += output_width; + } + } +} + +void aom_var_filter_block2d_bil_second_pass_ssse3( + const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { + const int16_t round = (1 << FILTER_BITS) >> 1; + const __m128i r = _mm_set1_epi32(round); + const __m128i filters = + _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0], + filter[1], filter[0], filter[1]); + const __m128i shuffle_mask = + _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); + const __m128i mask = + _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 4) { + // load source as: + // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] } + __m128i source1 = xx_loadl_64(a); + __m128i source2 = xx_loadl_64(a + pixel_step); + __m128i source = _mm_unpacklo_epi64(source1, source2); + + // shuffle source to: + // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] } + __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask); + + // b[i] = a[i] * filter[0] + a[w + i] * filter[1] + __m128i res = _mm_madd_epi16(source_shuffle, filters); + + // round + res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS); + + // shuffle to get each lower 8 bit of every 32 bit + res = _mm_shuffle_epi8(res, mask); + + xx_storel_32(b, res); + + a += 4; + b += 4; + } + + a += src_pixels_per_line - output_width; + } +} diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c index c8c90a7dc..7e3c5d5db 100644 --- a/third_party/aom/aom_dsp/x86/variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/variance_sse2.c @@ -569,7 +569,7 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, } } - const InterpFilterParams filter = + const InterpFilterParams *filter = av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); if (!subpel_x_q3 && !subpel_y_q3) { @@ -633,12 +633,12 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); const int intermediate_height = - (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps; + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); - aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1), ref_stride, - temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, - intermediate_height); - aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1), + aom_convolve8_horiz(ref - ref_stride * ((filter->taps >> 1) - 1), + ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, + width, intermediate_height); + aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1), MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16, width, height); } diff --git a/third_party/aom/aom_ports/aom_once.h b/third_party/aom/aom_ports/aom_once.h index bb1e21366..8e04f8583 100644 --- a/third_party/aom/aom_ports/aom_once.h +++ b/third_party/aom/aom_ports/aom_once.h @@ -17,7 +17,7 @@ /* Implement a function wrapper to guarantee initialization * thread-safety for library singletons. * - * NOTE: These functions use static locks, and can only be + * NOTE: This function uses static locks, and can only be * used with one common argument per compilation unit. So * * file1.c: @@ -25,8 +25,8 @@ * ... * aom_once(foo); * - * file2.c: - * aom_once(bar); + * file2.c: + * aom_once(bar); * * will ensure foo() and bar() are each called only once, but in * @@ -46,19 +46,19 @@ * local initializers are not thread-safe in MSVC prior to Visual * Studio 2015. * - * As a static, once_state will be zero-initialized as program start. + * As a static, aom_once_state will be zero-initialized as program start. */ -static LONG once_state; -static void once(void (*func)(void)) { - /* Try to advance once_state from its initial value of 0 to 1. +static LONG aom_once_state; +static void aom_once(void (*func)(void)) { + /* Try to advance aom_once_state from its initial value of 0 to 1. * Only one thread can succeed in doing so. */ - if (InterlockedCompareExchange(&once_state, 1, 0) == 0) { - /* We're the winning thread, having set once_state to 1. + if (InterlockedCompareExchange(&aom_once_state, 1, 0) == 0) { + /* We're the winning thread, having set aom_once_state to 1. * Call our function. */ func(); - /* Now advance once_state to 2, unblocking any other threads. */ - InterlockedIncrement(&once_state); + /* Now advance aom_once_state to 2, unblocking any other threads. */ + InterlockedIncrement(&aom_once_state); return; } @@ -66,10 +66,10 @@ static void once(void (*func)(void)) { * the state variable so we don't return before func() * has finished executing elsewhere. * - * Try to advance once_state from 2 to 2, which is only possible + * Try to advance aom_once_state from 2 to 2, which is only possible * after the winning thead advances it from 1 to 2. */ - while (InterlockedCompareExchange(&once_state, 2, 2) != 2) { + while (InterlockedCompareExchange(&aom_once_state, 2, 2) != 2) { /* State isn't yet 2. Try again. * * We are used for singleton initialization functions, @@ -83,8 +83,8 @@ static void once(void (*func)(void)) { Sleep(0); } - /* We've seen once_state advance to 2, so we know func() - * has been called. And we've left once_state as we found it, + /* We've seen aom_once_state advance to 2, so we know func() + * has been called. And we've left aom_once_state as we found it, * so other threads will have the same experience. * * It's safe to return now. @@ -95,7 +95,7 @@ static void once(void (*func)(void)) { #elif CONFIG_MULTITHREAD && defined(__OS2__) #define INCL_DOS #include <os2.h> -static void once(void (*func)(void)) { +static void aom_once(void (*func)(void)) { static int done; /* If the initialization is complete, return early. */ @@ -117,18 +117,15 @@ static void once(void (*func)(void)) { #elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H #include <pthread.h> -static void once(void (*func)(void)) { +static void aom_once(void (*func)(void)) { static pthread_once_t lock = PTHREAD_ONCE_INIT; pthread_once(&lock, func); } #else -/* No-op version that performs no synchronization. *_rtcd() is idempotent, - * so as long as your platform provides atomic loads/stores of pointers - * no synchronization is strictly necessary. - */ +/* Default version that performs no synchronization. */ -static void once(void (*func)(void)) { +static void aom_once(void (*func)(void)) { static int done; if (!done) { diff --git a/third_party/aom/aom_scale/aom_scale_rtcd.c b/third_party/aom/aom_scale/aom_scale_rtcd.c index 08f1a376d..a04e053b0 100644 --- a/third_party/aom/aom_scale/aom_scale_rtcd.c +++ b/third_party/aom/aom_scale/aom_scale_rtcd.c @@ -15,4 +15,4 @@ #include "aom_ports/aom_once.h" -void aom_scale_rtcd() { once(setup_rtcd_internal); } +void aom_scale_rtcd() { aom_once(setup_rtcd_internal); } diff --git a/third_party/aom/aom_scale/generic/yv12config.c b/third_party/aom/aom_scale/generic/yv12config.c index cce915165..ca5b69066 100644 --- a/third_party/aom/aom_scale/generic/yv12config.c +++ b/third_party/aom/aom_scale/generic/yv12config.c @@ -51,6 +51,10 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, aom_codec_frame_buffer_t *fb, aom_get_frame_buffer_cb_fn_t cb, void *cb_priv) { if (ybf) { +#if CONFIG_SIZE_LIMIT + if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) return -1; +#endif + const int aom_byte_align = (byte_alignment == 0) ? 1 : byte_alignment; const int aligned_width = (width + 7) & ~7; const int aligned_height = (height + 7) & ~7; @@ -154,7 +158,7 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, (uv_border_h * uv_stride) + uv_border_w, aom_byte_align); - ybf->use_external_refernce_buffers = 0; + ybf->use_external_reference_buffers = 0; if (use_highbitdepth) { if (ybf->y_buffer_8bit) aom_free(ybf->y_buffer_8bit); diff --git a/third_party/aom/aom_scale/yv12config.h b/third_party/aom/aom_scale/yv12config.h index 8f1c60069..2b4f597b0 100644 --- a/third_party/aom/aom_scale/yv12config.h +++ b/third_party/aom/aom_scale/yv12config.h @@ -81,7 +81,7 @@ typedef struct yv12_buffer_config { // Indicate whether y_buffer, u_buffer, and v_buffer points to the internally // allocated memory or external buffers. - int use_external_refernce_buffers; + int use_external_reference_buffers; // This is needed to store y_buffer, u_buffer, and v_buffer when set reference // uses an external refernece, and restore those buffer pointers after the // external reference frame is no longer used. diff --git a/third_party/aom/aom_util/aom_thread.h b/third_party/aom/aom_util/aom_thread.h index 3b22ac70c..fdb724d0c 100644 --- a/third_party/aom/aom_util/aom_thread.h +++ b/third_party/aom/aom_util/aom_thread.h @@ -369,7 +369,8 @@ typedef enum { } AVxWorkerStatus; // Function to be called by the worker thread. Takes two opaque pointers as -// arguments (data1 and data2), and should return false in case of error. +// arguments (data1 and data2). Should return true on success and return false +// in case of error. typedef int (*AVxWorkerHook)(void *, void *); // Platform-dependent implementation details for the worker. @@ -382,7 +383,7 @@ typedef struct { AVxWorkerHook hook; // hook to call void *data1; // first argument passed to 'hook' void *data2; // second argument passed to 'hook' - int had_error; // return value of the last call to 'hook' + int had_error; // true if a call to 'hook' returned false } AVxWorker; // The interface for all thread-worker related functions. All these functions diff --git a/third_party/aom/apps/aomdec.c b/third_party/aom/apps/aomdec.c index 6c4d724a4..48952586f 100644 --- a/third_party/aom/apps/aomdec.c +++ b/third_party/aom/apps/aomdec.c @@ -83,6 +83,8 @@ static const arg_def_t outputfile = ARG_DEF("o", "output", 1, "Output file name pattern (see below)"); static const arg_def_t threadsarg = ARG_DEF("t", "threads", 1, "Max threads to use"); +static const arg_def_t rowmtarg = + ARG_DEF(NULL, "row-mt", 1, "Enable row based multi-threading"); static const arg_def_t verbosearg = ARG_DEF("v", "verbose", 0, "Show version string"); static const arg_def_t scalearg = @@ -114,12 +116,12 @@ static const arg_def_t outallarg = ARG_DEF( NULL, "all-layers", 0, "Output all decoded frames of a scalable bitstream"); static const arg_def_t *all_args[] = { - &help, &codecarg, &use_yv12, &use_i420, &flipuvarg, - &rawvideo, &noblitarg, &progressarg, &limitarg, &skiparg, - &postprocarg, &summaryarg, &outputfile, &threadsarg, &verbosearg, - &scalearg, &fb_arg, &md5arg, &framestatsarg, &continuearg, - &outbitdeptharg, &tilem, &tiler, &tilec, &isannexb, - &oppointarg, &outallarg, NULL + &help, &codecarg, &use_yv12, &use_i420, &flipuvarg, + &rawvideo, &noblitarg, &progressarg, &limitarg, &skiparg, + &postprocarg, &summaryarg, &outputfile, &threadsarg, &rowmtarg, + &verbosearg, &scalearg, &fb_arg, &md5arg, &framestatsarg, + &continuearg, &outbitdeptharg, &tilem, &tiler, &tilec, + &isannexb, &oppointarg, &outallarg, NULL }; #if CONFIG_LIBYUV @@ -512,6 +514,7 @@ static int main_loop(int argc, const char **argv_) { int do_scale = 0; int operating_point = 0; int output_all_layers = 0; + unsigned int row_mt = 0; aom_image_t *scaled_img = NULL; aom_image_t *img_shifted = NULL; int frame_avail, got_data, flush_decoder = 0; @@ -601,6 +604,15 @@ static int main_loop(int argc, const char **argv_) { summary = 1; } else if (arg_match(&arg, &threadsarg, argi)) { cfg.threads = arg_parse_uint(&arg); +#if !CONFIG_MULTITHREAD + if (cfg.threads > 1) { + die("Error: --threads=%d is not supported when CONFIG_MULTITHREAD = " + "0.\n", + cfg.threads); + } +#endif + } else if (arg_match(&arg, &rowmtarg, argi)) { + row_mt = arg_parse_uint(&arg); } else if (arg_match(&arg, &verbosearg, argi)) { quiet = 0; } else if (arg_match(&arg, &scalearg, argi)) { @@ -763,6 +775,11 @@ static int main_loop(int argc, const char **argv_) { aom_codec_error(&decoder)); goto fail; } + + if (aom_codec_control(&decoder, AV1D_SET_ROW_MT, row_mt)) { + fprintf(stderr, "Failed to set row_mt: %s\n", aom_codec_error(&decoder)); + goto fail; + } #endif if (arg_skip) fprintf(stderr, "Skipping first %d frames.\n", arg_skip); @@ -910,9 +927,8 @@ static int main_loop(int argc, const char **argv_) { // Shift up or down if necessary if (output_bit_depth != 0) { const aom_img_fmt_t shifted_fmt = - output_bit_depth == 8 - ? img->fmt ^ (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) - : img->fmt | AOM_IMG_FMT_HIGHBITDEPTH; + output_bit_depth == 8 ? img->fmt & ~AOM_IMG_FMT_HIGHBITDEPTH + : img->fmt | AOM_IMG_FMT_HIGHBITDEPTH; if (shifted_fmt != img->fmt || output_bit_depth != img->bit_depth) { if (img_shifted && diff --git a/third_party/aom/apps/aomenc.c b/third_party/aom/apps/aomenc.c index db0910220..31cb662e4 100644 --- a/third_party/aom/apps/aomenc.c +++ b/third_party/aom/apps/aomenc.c @@ -475,6 +475,13 @@ static const arg_def_t film_grain_test = static const arg_def_t film_grain_table = ARG_DEF(NULL, "film-grain-table", 1, "Path to file containing film grain parameters"); +#if CONFIG_DENOISE +static const arg_def_t denoise_noise_level = + ARG_DEF(NULL, "denoise-noise-level", 1, + "Amount of noise (from 0 = don't denoise, to 50)"); +static const arg_def_t denoise_block_size = + ARG_DEF(NULL, "denoise-block-size", 1, "Denoise block size (default = 32)"); +#endif static const arg_def_t enable_ref_frame_mvs = ARG_DEF(NULL, "enable-ref-frame-mvs", 1, "Enable temporal mv prediction (default is 1)"); @@ -656,6 +663,10 @@ static const arg_def_t *av1_args[] = { &cpu_used_av1, &timing_info, &film_grain_test, &film_grain_table, +#if CONFIG_DENOISE + &denoise_noise_level, + &denoise_block_size, +#endif &enable_ref_frame_mvs, &bitdeptharg, &inbitdeptharg, @@ -708,6 +719,10 @@ static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED, AV1E_SET_TIMING_INFO_TYPE, AV1E_SET_FILM_GRAIN_TEST_VECTOR, AV1E_SET_FILM_GRAIN_TABLE, +#if CONFIG_DENOISE + AV1E_SET_DENOISE_NOISE_LEVEL, + AV1E_SET_DENOISE_BLOCK_SIZE, +#endif AV1E_SET_ENABLE_REF_FRAME_MVS, AV1E_SET_ENABLE_DF, AV1E_SET_ENABLE_ORDER_HINT, diff --git a/third_party/aom/av1/av1.cmake b/third_party/aom/av1/av1.cmake index 1c7f937e1..4c4f542fe 100644 --- a/third_party/aom/av1/av1.cmake +++ b/third_party/aom/av1/av1.cmake @@ -45,7 +45,6 @@ list(APPEND AOM_AV1_COMMON_SOURCES "${AOM_ROOT}/av1/common/entropymv.c" "${AOM_ROOT}/av1/common/entropymv.h" "${AOM_ROOT}/av1/common/enums.h" - "${AOM_ROOT}/av1/common/filter.c" "${AOM_ROOT}/av1/common/filter.h" "${AOM_ROOT}/av1/common/frame_buffers.c" "${AOM_ROOT}/av1/common/frame_buffers.h" @@ -274,7 +273,10 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1 list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2 "${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c" "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_avx2.c" - "${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c") + "${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_avx2.h" + "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c") list(APPEND AOM_AV1_ENCODER_INTRIN_NEON "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c") @@ -296,7 +298,9 @@ list(APPEND AOM_AV1_COMMON_INTRIN_NEON "${AOM_ROOT}/av1/common/arm/blend_a64_vmask_neon.c" "${AOM_ROOT}/av1/common/arm/reconinter_neon.c" "${AOM_ROOT}/av1/common/arm/wiener_convolve_neon.c" - "${AOM_ROOT}/av1/common/arm/intrapred_neon.c" + "${AOM_ROOT}/av1/common/arm/selfguided_neon.c" + "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c" + "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.h" "${AOM_ROOT}/av1/common/cdef_block_neon.c") list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2 diff --git a/third_party/aom/av1/av1_cx_iface.c b/third_party/aom/av1/av1_cx_iface.c index 9d5414c1e..3bc4804c9 100644 --- a/third_party/aom/av1/av1_cx_iface.c +++ b/third_party/aom/av1/av1_cx_iface.c @@ -94,6 +94,10 @@ struct av1_extracfg { int enable_warped_motion; // sequence level int allow_warped_motion; // frame level int enable_superres; +#if CONFIG_DENOISE + float noise_level; + int noise_block_size; +#endif }; static struct av1_extracfg default_extra_cfg = { @@ -160,6 +164,10 @@ static struct av1_extracfg default_extra_cfg = { 1, // enable_warped_motion at sequence level 1, // allow_warped_motion at frame level 1, // superres +#if CONFIG_DENOISE + 0, // noise_level + 32, // noise_block_size +#endif }; struct aom_codec_alg_priv { @@ -464,7 +472,7 @@ static aom_codec_err_t set_encoder_config( oxcf->buffer_model.num_units_in_decoding_tick = cfg->g_timebase.num; oxcf->timing_info.equal_picture_interval = 0; oxcf->decoder_model_info_present_flag = 1; - oxcf->buffer_removal_delay_present = 1; + oxcf->buffer_removal_time_present = 1; oxcf->display_model_info_present_flag = 1; } if (oxcf->init_framerate > 180) { @@ -612,6 +620,10 @@ static aom_codec_err_t set_encoder_config( oxcf->film_grain_test_vector = extra_cfg->film_grain_test_vector; oxcf->film_grain_table_filename = extra_cfg->film_grain_table_filename; } +#if CONFIG_DENOISE + oxcf->noise_level = extra_cfg->noise_level; + oxcf->noise_block_size = extra_cfg->noise_block_size; +#endif oxcf->large_scale_tile = cfg->large_scale_tile; oxcf->single_tile_decoding = (oxcf->large_scale_tile) ? extra_cfg->single_tile_decoding : 0; @@ -710,7 +722,7 @@ static aom_codec_err_t encoder_set_config(aom_codec_alg_priv_t *ctx, ctx->cfg = *cfg; set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); // On profile change, request a key frame - force_key |= ctx->cpi->common.profile != ctx->oxcf.profile; + force_key |= ctx->cpi->common.seq_params.profile != ctx->oxcf.profile; av1_change_config(ctx->cpi, &ctx->oxcf); } @@ -1055,6 +1067,23 @@ static aom_codec_err_t ctrl_set_film_grain_table(aom_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +#if CONFIG_DENOISE +static aom_codec_err_t ctrl_set_denoise_noise_level(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.noise_level = + ((float)CAST(AV1E_SET_DENOISE_NOISE_LEVEL, args)) / 10.0f; + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_denoise_block_size(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.noise_block_size = CAST(AV1E_SET_DENOISE_BLOCK_SIZE, args); + return update_extra_cfg(ctx, &extra_cfg); +} +#endif + static aom_codec_err_t ctrl_set_deltaq_mode(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; @@ -1119,7 +1148,7 @@ static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx, } priv->extra_cfg = default_extra_cfg; - once(av1_initialize_enc); + aom_once(av1_initialize_enc); res = validate_config(priv, &priv->cfg, &priv->extra_cfg); @@ -1200,6 +1229,9 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, volatile aom_enc_frame_flags_t flags = enc_flags; + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. if (setjmp(cpi->common.error.jmp)) { cpi->common.error.setjmp = 0; res = update_error_state(ctx, &cpi->common.error); @@ -1259,7 +1291,6 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, if (cx_data_sz < ctx->cx_data_sz / 2) { aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, "Compressed data buffer too small"); - return AOM_CODEC_ERROR; } } @@ -1275,8 +1306,8 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, !img, timebase)) { if (cpi->common.seq_params.frame_id_numbers_present_flag) { if (cpi->common.invalid_delta_frame_id_minus_1) { - ctx->base.err_detail = "Invalid delta_frame_id_minus_1"; - return AOM_CODEC_ERROR; + aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, + "Invalid delta_frame_id_minus_1"); } } cpi->seq_params_locked = 1; @@ -1305,7 +1336,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, // OBUs are preceded/succeeded by an unsigned leb128 coded integer. if (write_uleb_obu_size(obu_header_size, obu_payload_size, ctx->pending_cx_data) != AOM_CODEC_OK) { - return AOM_CODEC_ERROR; + aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL); } frame_size += obu_header_size + obu_payload_size + length_field_size; @@ -1315,7 +1346,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, size_t curr_frame_size = frame_size; if (av1_convert_sect5obus_to_annexb(cx_data, &curr_frame_size) != AOM_CODEC_OK) { - return AOM_CODEC_ERROR; + aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL); } frame_size = curr_frame_size; @@ -1327,7 +1358,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, } if (write_uleb_obu_size(0, (uint32_t)frame_size, cx_data) != AOM_CODEC_OK) { - return AOM_CODEC_ERROR; + aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL); } frame_size += length_field_size; } @@ -1358,7 +1389,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, } if (write_uleb_obu_size(0, (uint32_t)tu_size, ctx->pending_cx_data) != AOM_CODEC_OK) { - return AOM_CODEC_ERROR; + aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL); } ctx->pending_cx_data_sz += length_field_size; } @@ -1710,6 +1741,10 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { AV1E_SET_SINGLE_TILE_DECODING, ctrl_set_single_tile_decoding }, { AV1E_SET_FILM_GRAIN_TEST_VECTOR, ctrl_set_film_grain_test_vector }, { AV1E_SET_FILM_GRAIN_TABLE, ctrl_set_film_grain_table }, +#if CONFIG_DENOISE + { AV1E_SET_DENOISE_NOISE_LEVEL, ctrl_set_denoise_noise_level }, + { AV1E_SET_DENOISE_BLOCK_SIZE, ctrl_set_denoise_block_size }, +#endif // CONFIG_FILM_GRAIN { AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test }, // Getters @@ -1728,7 +1763,7 @@ static aom_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { { // NOLINT 0, // g_usage - 8, // g_threads + 0, // g_threads 0, // g_profile 320, // g_width @@ -1810,7 +1845,7 @@ CODEC_INTERFACE(aom_codec_av1_cx) = { NULL, // aom_codec_peek_si_fn_t NULL, // aom_codec_get_si_fn_t NULL, // aom_codec_decode_fn_t - NULL, // aom_codec_frame_get_fn_t + NULL, // aom_codec_get_frame_fn_t NULL // aom_codec_set_fb_fn_t }, { diff --git a/third_party/aom/av1/av1_dx_iface.c b/third_party/aom/av1/av1_dx_iface.c index db338f7e3..f42572019 100644 --- a/third_party/aom/av1/av1_dx_iface.c +++ b/third_party/aom/av1/av1_dx_iface.c @@ -50,6 +50,7 @@ struct aom_codec_alg_priv { int decode_tile_col; unsigned int tile_mode; unsigned int ext_tile_debug; + unsigned int row_mt; EXTERNAL_REFERENCES ext_refs; unsigned int is_annexb; int operating_point; @@ -61,7 +62,7 @@ struct aom_codec_alg_priv { int last_submit_worker_id; int next_output_worker_id; int available_threads; - aom_image_t *image_with_grain; + aom_image_t *image_with_grain[MAX_NUM_SPATIAL_LAYERS]; int need_resync; // wait for key/intra-only frame // BufferPool that holds all reference frames. Shared by all the FrameWorkers. BufferPool *buffer_pool; @@ -101,7 +102,7 @@ static aom_codec_err_t decoder_init(aom_codec_ctx_t *ctx, // default values priv->cfg.cfg.ext_partition = 1; } - priv->image_with_grain = NULL; + av1_zero(priv->image_with_grain); } return AOM_CODEC_OK; @@ -139,7 +140,9 @@ static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) { aom_free(ctx->frame_workers); aom_free(ctx->buffer_pool); - if (ctx->image_with_grain) aom_img_free(ctx->image_with_grain); + for (int i = 0; i < MAX_NUM_SPATIAL_LAYERS; i++) { + if (ctx->image_with_grain[i]) aom_img_free(ctx->image_with_grain[i]); + } aom_free(ctx); return AOM_CODEC_OK; } @@ -339,16 +342,16 @@ static int frame_worker_hook(void *arg1, void *arg2) { const uint8_t *data = frame_worker_data->data; (void)arg2; - frame_worker_data->result = av1_receive_compressed_data( - frame_worker_data->pbi, frame_worker_data->data_size, &data); + int result = av1_receive_compressed_data(frame_worker_data->pbi, + frame_worker_data->data_size, &data); frame_worker_data->data_end = data; - if (frame_worker_data->result != 0) { + if (result != 0) { // Check decode result in serial decode. frame_worker_data->pbi->cur_buf->buf.corrupted = 1; frame_worker_data->pbi->need_resync = 1; } - return !frame_worker_data->result; + return !result; } static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) { @@ -429,6 +432,7 @@ static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) { frame_worker_data->pbi->operating_point = ctx->operating_point; frame_worker_data->pbi->output_all_layers = ctx->output_all_layers; frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug; + frame_worker_data->pbi->row_mt = ctx->row_mt; worker->hook = (AVxWorkerHook)frame_worker_hook; if (!winterface->reset(worker)) { @@ -489,6 +493,7 @@ static aom_codec_err_t decode_one(aom_codec_alg_priv_t *ctx, frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row; frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col; frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug; + frame_worker_data->pbi->row_mt = ctx->row_mt; frame_worker_data->pbi->ext_refs = ctx->ext_refs; frame_worker_data->pbi->common.is_annexb = ctx->is_annexb; @@ -592,21 +597,31 @@ static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx, return res; } -aom_image_t *add_grain_if_needed(aom_image_t *img, aom_image_t *grain_img_buf, - aom_film_grain_t *grain_params) { +// If grain_params->apply_grain is false, returns img. Otherwise, adds film +// grain to img, saves the result in *grain_img_ptr (allocating *grain_img_ptr +// if necessary), and returns *grain_img_ptr. +static aom_image_t *add_grain_if_needed(aom_image_t *img, + aom_image_t **grain_img_ptr, + aom_film_grain_t *grain_params) { if (!grain_params->apply_grain) return img; - if (grain_img_buf && - (img->d_w != grain_img_buf->d_w || img->d_h != grain_img_buf->d_h || - img->fmt != grain_img_buf->fmt || !(img->d_h % 2) || !(img->d_w % 2))) { - aom_img_free(grain_img_buf); - grain_img_buf = NULL; + aom_image_t *grain_img_buf = *grain_img_ptr; + + const int w_even = ALIGN_POWER_OF_TWO(img->d_w, 1); + const int h_even = ALIGN_POWER_OF_TWO(img->d_h, 1); + + if (grain_img_buf) { + const int alloc_w = ALIGN_POWER_OF_TWO(grain_img_buf->d_w, 1); + const int alloc_h = ALIGN_POWER_OF_TWO(grain_img_buf->d_h, 1); + if (w_even != alloc_w || h_even != alloc_h || + img->fmt != grain_img_buf->fmt) { + aom_img_free(grain_img_buf); + grain_img_buf = NULL; + } } if (!grain_img_buf) { - int w_even = img->d_w % 2 ? img->d_w + 1 : img->d_w; - int h_even = img->d_h % 2 ? img->d_h + 1 : img->d_h; grain_img_buf = aom_img_alloc(NULL, img->fmt, w_even, h_even, 16); - grain_img_buf->bit_depth = img->bit_depth; + *grain_img_ptr = grain_img_buf; } av1_add_film_grain(grain_params, img, grain_img_buf); @@ -649,8 +664,6 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx, aom_film_grain_t *grain_params; if (av1_get_raw_frame(frame_worker_data->pbi, *index, &sd, &grain_params) == 0) { - *index += 1; // Advance the iterator to point to the next image - AV1Decoder *const pbi = frame_worker_data->pbi; AV1_COMMON *const cm = &pbi->common; RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; @@ -659,6 +672,7 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx, yuvconfig2image(&ctx->img, sd, frame_worker_data->user_priv); if (!pbi->ext_tile_debug && cm->large_scale_tile) { + *index += 1; // Advance the iterator to point to the next image img = &ctx->img; img->img_data = pbi->tile_list_output; img->sz = pbi->tile_list_size; @@ -688,11 +702,14 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx, const int tile_col = AOMMIN(pbi->dec_tile_col, cm->tile_cols - 1); const int mi_col = tile_col * cm->tile_width; const int ssx = ctx->img.x_chroma_shift; + const int is_hbd = + (ctx->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0; int plane; - ctx->img.planes[0] += mi_col * MI_SIZE; + ctx->img.planes[0] += mi_col * MI_SIZE * (1 + is_hbd); if (num_planes > 1) { for (plane = 1; plane < MAX_MB_PLANE; ++plane) { - ctx->img.planes[plane] += mi_col * (MI_SIZE >> ssx); + ctx->img.planes[plane] += + mi_col * (MI_SIZE >> ssx) * (1 + is_hbd); } } ctx->img.d_w = @@ -703,7 +720,10 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx, img = &ctx->img; img->temporal_id = cm->temporal_layer_id; img->spatial_id = cm->spatial_layer_id; - return add_grain_if_needed(img, ctx->image_with_grain, grain_params); + aom_image_t *res = add_grain_if_needed( + img, &ctx->image_with_grain[*index], grain_params); + *index += 1; // Advance the iterator to point to the next image + return res; } } else { // Decoding failed. Release the worker thread. @@ -999,7 +1019,7 @@ static aom_codec_err_t ctrl_get_bit_depth(aom_codec_alg_priv_t *ctx, FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; const AV1_COMMON *const cm = &frame_worker_data->pbi->common; - *bit_depth = cm->bit_depth; + *bit_depth = cm->seq_params.bit_depth; return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; @@ -1009,6 +1029,64 @@ static aom_codec_err_t ctrl_get_bit_depth(aom_codec_alg_priv_t *ctx, return AOM_CODEC_INVALID_PARAM; } +static aom_img_fmt_t get_img_format(int subsampling_x, int subsampling_y, + int use_highbitdepth) { + aom_img_fmt_t fmt = 0; + + if (subsampling_x == 0 && subsampling_y == 0) + fmt = AOM_IMG_FMT_I444; + else if (subsampling_x == 1 && subsampling_y == 0) + fmt = AOM_IMG_FMT_I422; + else if (subsampling_x == 1 && subsampling_y == 1) + fmt = AOM_IMG_FMT_I420; + + if (use_highbitdepth) fmt |= AOM_IMG_FMT_HIGHBITDEPTH; + return fmt; +} + +static aom_codec_err_t ctrl_get_img_format(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_img_fmt_t *const img_fmt = va_arg(args, aom_img_fmt_t *); + AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; + + if (img_fmt) { + if (worker) { + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1_COMMON *const cm = &frame_worker_data->pbi->common; + + *img_fmt = get_img_format(cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y, + cm->seq_params.use_highbitdepth); + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_tile_size(aom_codec_alg_priv_t *ctx, + va_list args) { + unsigned int *const tile_size = va_arg(args, unsigned int *); + AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; + + if (tile_size) { + if (worker) { + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1_COMMON *const cm = &frame_worker_data->pbi->common; + *tile_size = + ((cm->tile_width * MI_SIZE) << 16) + cm->tile_height * MI_SIZE; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + return AOM_CODEC_INVALID_PARAM; +} + static aom_codec_err_t ctrl_set_invert_tile_order(aom_codec_alg_priv_t *ctx, va_list args) { ctx->invert_tile_order = va_arg(args, int); @@ -1124,6 +1202,12 @@ static aom_codec_err_t ctrl_ext_tile_debug(aom_codec_alg_priv_t *ctx, return AOM_CODEC_OK; } +static aom_codec_err_t ctrl_set_row_mt(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->row_mt = va_arg(args, unsigned int); + return AOM_CODEC_OK; +} + static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { AV1_COPY_REFERENCE, ctrl_copy_reference }, @@ -1145,6 +1229,7 @@ static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { AV1D_SET_OUTPUT_ALL_LAYERS, ctrl_set_output_all_layers }, { AV1_SET_INSPECTION_CALLBACK, ctrl_set_inspection_callback }, { AV1D_EXT_TILE_DEBUG, ctrl_ext_tile_debug }, + { AV1D_SET_ROW_MT, ctrl_set_row_mt }, { AV1D_SET_EXT_REF_PTR, ctrl_set_ext_ref_ptr }, // Getters @@ -1152,6 +1237,8 @@ static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { AOMD_GET_LAST_QUANTIZER, ctrl_get_last_quantizer }, { AOMD_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates }, { AV1D_GET_BIT_DEPTH, ctrl_get_bit_depth }, + { AV1D_GET_IMG_FORMAT, ctrl_get_img_format }, + { AV1D_GET_TILE_SIZE, ctrl_get_tile_size }, { AV1D_GET_DISPLAY_SIZE, ctrl_get_render_size }, { AV1D_GET_FRAME_SIZE, ctrl_get_frame_size }, { AV1_GET_ACCOUNTING, ctrl_get_accounting }, @@ -1180,7 +1267,7 @@ CODEC_INTERFACE(aom_codec_av1_dx) = { decoder_peek_si, // aom_codec_peek_si_fn_t decoder_get_si, // aom_codec_get_si_fn_t decoder_decode, // aom_codec_decode_fn_t - decoder_get_frame, // aom_codec_frame_get_fn_t + decoder_get_frame, // aom_codec_get_frame_fn_t decoder_set_fb_fn, // aom_codec_set_fb_fn_t }, { diff --git a/third_party/aom/av1/common/alloccommon.c b/third_party/aom/av1/common/alloccommon.c index 49902cc7d..1bf81c91d 100644 --- a/third_party/aom/av1/common/alloccommon.c +++ b/third_party/aom/av1/common/alloccommon.c @@ -137,11 +137,11 @@ void av1_alloc_restoration_buffers(AV1_COMMON *cm) { // Now we need to allocate enough space to store the line buffers for the // stripes const int frame_w = cm->superres_upscaled_width; - const int use_highbd = cm->use_highbitdepth ? 1 : 0; + const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0; for (int p = 0; p < num_planes; ++p) { const int is_uv = p > 0; - const int ss_x = is_uv && cm->subsampling_x; + const int ss_x = is_uv && cm->seq_params.subsampling_x; const int plane_w = ((frame_w + ss_x) >> ss_x) + 2 * RESTORATION_EXTRA_HORZ; const int stride = ALIGN_POWER_OF_TWO(plane_w, 5); const int buf_size = num_stripes * stride * RESTORATION_CTX_VERT diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c new file mode 100644 index 000000000..51c991498 --- /dev/null +++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c @@ -0,0 +1,844 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "av1/common/av1_inv_txfm1d.h" +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/av1_txfm.h" +#include "av1/common/enums.h" +#include "av1/common/idct.h" +#include "av1/common/arm/av1_inv_txfm_neon.h" + +static INLINE TxSetType find_TxSetType(TX_SIZE tx_size) { + const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size]; + TxSetType tx_set_type; + if (tx_size_sqr_up > TX_32X32) { + tx_set_type = EXT_TX_SET_DCTONLY; + } else if (tx_size_sqr_up == TX_32X32) { + tx_set_type = EXT_TX_SET_DCT_IDTX; + } else { + tx_set_type = EXT_TX_SET_ALL16; + } + return tx_set_type; +} + +// 1D itx types +typedef enum ATTRIBUTE_PACKED { + IDCT_1D, + IADST_1D, + IFLIPADST_1D = IADST_1D, + IIDENTITY_1D, + ITX_TYPES_1D, +} ITX_TYPE_1D; + +static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = { + IDCT_1D, IADST_1D, IDCT_1D, IADST_1D, + IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D, + IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D, + IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D, +}; + +static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = { + IDCT_1D, IDCT_1D, IADST_1D, IADST_1D, + IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D, + IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D, + IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D, +}; + +// 1D functions +static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = { + { av1_idct4_new, av1_iadst4_new, av1_iidentity4_c }, + { av1_idct8_new, av1_iadst8_new, av1_iidentity8_c }, + { av1_idct16_new, av1_iadst16_new, av1_iidentity16_c }, + { av1_idct32_new, NULL, NULL }, + { av1_idct64_new, NULL, NULL }, +}; + +// Functions for blocks with eob at DC and within +// topleft 8x8, 16x16, 32x32 corner +static const transform_1d_neon + lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { av1_idct4_new, av1_idct4_new, NULL, NULL }, + { av1_iadst4_new, av1_iadst4_new, NULL, NULL }, + { av1_iidentity4_c, av1_iidentity4_c, NULL, NULL }, + }, + { { av1_idct8_new, av1_idct8_new, NULL, NULL }, + { av1_iadst8_new, av1_iadst8_new, NULL, NULL }, + { av1_iidentity8_c, av1_iidentity8_c, NULL, NULL } }, + { + { av1_idct16_new, av1_idct16_new, av1_idct16_new, NULL }, + { av1_iadst16_new, av1_iadst16_new, av1_iadst16_new, NULL }, + { av1_iidentity16_c, av1_iidentity16_c, av1_iidentity16_c, NULL }, + }, + { { av1_idct32_new, av1_idct32_new, av1_idct32_new, av1_idct32_new }, + { NULL, NULL, NULL, NULL }, + { av1_iidentity32_c, av1_iidentity32_c, av1_iidentity32_c, + av1_iidentity32_c } }, + { { av1_idct64_new, av1_idct64_new, av1_idct64_new, av1_idct64_new }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; +static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]); + int32_t *temp_in = txfm_buf; + + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; + int r, bd = 8; + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + + // row tx + int row_start = (buf_size_nonzero_h_div8 * 8); + for (int i = 0; i < row_start; i++) { + if (abs(rect_type) == 1) { + for (int j = 0; j < txfm_size_col; j++) + temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits); + row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range); + } else { + row_txfm(input, buf_ptr, cos_bit_row, stage_range); + } + av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); + input += txfm_size_col; + buf_ptr += txfm_size_col; + } + + // Doing memset for the rows which are not processed in row transform. + memset(buf_ptr, 0, + sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start)); + + // col tx + for (int c = 0; c < txfm_size_col; c++) { + for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + c]; + + col_txfm(temp_in, temp_out, cos_bit_col, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); + } + } +} + +static INLINE void lowbd_inv_txfm2d_add_v_identity_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]); + int32_t *temp_in = txfm_buf; + + int eobx, eoby; + get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; + int r, bd = 8; + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // row tx + int row_start = (buf_size_nonzero_h_div8 * 8); + for (int i = 0; i < row_start; i++) { + if (abs(rect_type) == 1) { + for (int j = 0; j < txfm_size_col; j++) + temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits); + row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range); + } else { + row_txfm(input, buf_ptr, cos_bit_row, stage_range); + } + av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); + input += txfm_size_col; + buf_ptr += txfm_size_col; + } + // Doing memset for the rows which are not processed in row transform. + memset(buf_ptr, 0, + sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start)); + + // col tx + for (int c = 0; c < txfm_size_col; c++) { + if (lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + col_txfm(temp_in, temp_out, cos_bit_col, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = highbd_clip_pixel_add( + output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); + } + } + } +} + +static INLINE void lowbd_inv_txfm2d_add_h_identity_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]); + int32_t *temp_in = txfm_buf; + + int eobx, eoby; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; + int r, bd = 8; + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // row tx + int row_start = (buf_size_nonzero_h_div8 * 8); + for (int i = 0; i < row_start; i++) { + if (abs(rect_type) == 1) { + for (int j = 0; j < txfm_size_col; j++) + temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits); + row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range); + } else { + row_txfm(input, buf_ptr, cos_bit_row, stage_range); + } + av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); + input += txfm_size_col; + buf_ptr += txfm_size_col; + } + // Doing memset for the rows which are not processed in row transform. + memset(buf_ptr, 0, + sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start)); + + // col tx + for (int c = 0; c < txfm_size_col; c++) { + if (lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + col_txfm(temp_in, temp_out, cos_bit_col, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = highbd_clip_pixel_add( + output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); + } + } + } +} + +static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + (void)eob; + DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 8 + 8]); + int32_t *temp_in = txfm_buf; + + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; + int r, bd = 8; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < txfm_size_row; i++) { + row_txfm(input, buf_ptr, cos_bit_row, stage_range); + + input += txfm_size_col; + buf_ptr += txfm_size_col; + } + + for (int c = 0; c < txfm_size_col; ++c) { + if (lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + col_txfm(temp_in, temp_out, cos_bit_col, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = highbd_clip_pixel_add( + output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); + } + } + } +} + +void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, TX_SIZE tx_size, + int eob) { + (void)eob; + DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]); + int32_t *temp_in = txfm_buf; + + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; + int r, bd = 8; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < txfm_size_row; i++) { + for (int j = 0; j < txfm_size_col; j++) + temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits); + + row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range); + input += txfm_size_col; + buf_ptr += txfm_size_col; + } + + for (int c = 0; c < txfm_size_col; ++c) { + if (lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + col_txfm(temp_in, temp_out, cos_bit_col, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = highbd_clip_pixel_add( + output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); + } + } + } +} + +void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, TX_SIZE tx_size, + int eob) { + (void)eob; + DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]); + int32_t *temp_in = txfm_buf; + + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; + int r, bd = 8; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < txfm_size_row; i++) { + for (int j = 0; j < txfm_size_col; j++) + temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits); + + row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range); + input += txfm_size_col; + buf_ptr += txfm_size_col; + } + + for (int c = 0; c < txfm_size_col; ++c) { + if (lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + col_txfm(temp_in, temp_out, cos_bit_col, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = highbd_clip_pixel_add( + output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); + } + } + } +} + +void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + (void)eob; + DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]); + int32_t *temp_in = txfm_buf; + + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; + int r, bd = 8; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < txfm_size_row; i++) { + row_txfm(input, buf_ptr, cos_bit_row, stage_range); + av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); + input += txfm_size_col; + buf_ptr += txfm_size_col; + } + + for (int c = 0; c < txfm_size_col; ++c) { + if (lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + col_txfm(temp_in, temp_out, cos_bit_col, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = highbd_clip_pixel_add( + output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); + } + } + } +} + +void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + (void)eob; + + DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]); + int32_t *temp_in = txfm_buf; + + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; + int r, bd = 8; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < txfm_size_row; i++) { + row_txfm(input, buf_ptr, cos_bit_row, stage_range); + av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); + input += txfm_size_col; + buf_ptr += txfm_size_col; + } + + for (int c = 0; c < txfm_size_col; ++c) { + if (lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + col_txfm(temp_in, temp_out, cos_bit_col, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = highbd_clip_pixel_add( + output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); + } + } + } +} + +static INLINE void lowbd_inv_txfm2d_add_no_identity_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + DECLARE_ALIGNED(32, int, txfm_buf[64 * 64 + 64 + 64]); + int32_t *temp_in = txfm_buf; + + int eobx, eoby, ud_flip, lr_flip, row_start; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; + const int bd = 8; + int r; + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + row_start = (buf_size_nonzero_h_div8 << 3); + + for (int i = 0; i < row_start; i++) { + if (abs(rect_type) == 1) { + for (int j = 0; j < txfm_size_col; j++) + temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits); + row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range); + } else { + row_txfm(input, buf_ptr, cos_bit_row, stage_range); + } + av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); + input += txfm_size_col; + buf_ptr += txfm_size_col; + } + + // Doing memset for the rows which are not processed in row transform. + memset(buf_ptr, 0, + sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start)); + + for (int c = 0; c < txfm_size_col; c++) { + if (lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + col_txfm(temp_in, temp_out, cos_bit_col, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = highbd_clip_pixel_add( + output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); + } + } + } +} + +static INLINE void lowbd_inv_txfm2d_add_universe_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + switch (tx_type) { + case IDTX: + lowbd_inv_txfm2d_add_idtx_neon(input, output, stride, tx_type, tx_size, + eob); + break; + + case H_DCT: + case H_ADST: + case H_FLIPADST: + lowbd_inv_txfm2d_add_v_identity_neon(input, output, stride, tx_type, + tx_size, eob); + break; + + case V_DCT: + case V_ADST: + case V_FLIPADST: + lowbd_inv_txfm2d_add_h_identity_neon(input, output, stride, tx_type, + tx_size, eob); + break; + + default: + lowbd_inv_txfm2d_add_no_identity_neon(input, output, stride, tx_type, + tx_size, eob); + break; + } +} +void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, TX_SIZE tx_size, + int eob) { + int row; + switch (tx_size) { + case TX_4X4: + lowbd_inv_txfm2d_add_4x4_neon(input, output, stride, tx_type, tx_size, + eob); + break; + + case TX_4X8: + lowbd_inv_txfm2d_add_4x8_neon(input, output, stride, tx_type, tx_size, + eob); + break; + + case TX_8X4: + lowbd_inv_txfm2d_add_8x4_neon(input, output, stride, tx_type, tx_size, + eob); + break; + + case TX_4X16: + lowbd_inv_txfm2d_add_4x16_neon(input, output, stride, tx_type, tx_size, + eob); + break; + + case TX_16X4: + lowbd_inv_txfm2d_add_16x4_neon(input, output, stride, tx_type, tx_size, + eob); + break; + + case TX_16X64: { + lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type, + tx_size, eob); + } break; + + case TX_64X16: { + int32_t mod_input[64 * 16]; + for (row = 0; row < 16; ++row) { + memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input)); + memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input)); + } + lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type, + tx_size, eob); + } break; + + case TX_32X64: { + lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type, + tx_size, eob); + } break; + + case TX_64X32: { + int32_t mod_input[64 * 32]; + for (row = 0; row < 32; ++row) { + memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input)); + memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input)); + } + lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type, + tx_size, eob); + } break; + + case TX_64X64: { + int32_t mod_input[64 * 64]; + for (row = 0; row < 32; ++row) { + memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input)); + memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input)); + } + lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type, + tx_size, eob); + } break; + + default: + lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type, + tx_size, eob); + break; + } +} +void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride, + const TxfmParam *txfm_param) { + const TX_TYPE tx_type = txfm_param->tx_type; + if (!txfm_param->lossless) { + av1_lowbd_inv_txfm2d_add_neon(dqcoeff, dst, stride, tx_type, + txfm_param->tx_size, txfm_param->eob); + } else { + av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); + } +} diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h new file mode 100644 index 000000000..6af2d61e7 --- /dev/null +++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ +#define AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "av1/common/enums.h" +#include "av1/common/av1_inv_txfm1d.h" +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/av1_txfm.h" + +typedef void (*transform_1d_neon)(const int32_t *input, int32_t *output, + const int8_t cos_bit, + const int8_t *stage_ptr); + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_16x16_default[16]) = { + 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, + 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_32x32_default[32]) = { + 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, + 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = { + 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_16x32_default[32]) = { + 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, + 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, + 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, + 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_32x16_default[16]) = { + 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, + 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, + 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07, + 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, + 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = { + 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f, +}; + +DECLARE_ALIGNED(16, static const int16_t *, + av1_eob_to_eobxy_default[TX_SIZES_ALL]) = { + NULL, + av1_eob_to_eobxy_8x8_default, + av1_eob_to_eobxy_16x16_default, + av1_eob_to_eobxy_32x32_default, + av1_eob_to_eobxy_32x32_default, + NULL, + NULL, + av1_eob_to_eobxy_8x16_default, + av1_eob_to_eobxy_16x8_default, + av1_eob_to_eobxy_16x32_default, + av1_eob_to_eobxy_32x16_default, + av1_eob_to_eobxy_32x32_default, + av1_eob_to_eobxy_32x32_default, + NULL, + NULL, + av1_eob_to_eobxy_8x32_default, + av1_eob_to_eobxy_32x8_default, + av1_eob_to_eobxy_16x32_default, + av1_eob_to_eobxy_32x16_default, +}; + +static const int lowbd_txfm_all_1d_zeros_idx[32] = { + 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +}; + +// Transform block width in log2 for eob (size of 64 map to 32) +static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = { + 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5, +}; + +static int eob_fill[32] = { + 0, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, +}; + +static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + if (eob == 1) { + *eobx = 0; + *eoby = 0; + return; + } + + const int tx_w_log2 = tx_size_wide_log2_eob[tx_size]; + const int eob_row = (eob - 1) >> tx_w_log2; + const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row]; + *eobx = eobxy & 0xFF; + *eoby = eobxy >> 8; +} + +static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + eob -= 1; + const int txfm_size_row = tx_size_high[tx_size]; + const int eoby_max = AOMMIN(32, txfm_size_row) - 1; + *eobx = eob / (eoby_max + 1); + *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob]; +} + +static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + eob -= 1; + const int txfm_size_col = tx_size_wide[tx_size]; + const int eobx_max = AOMMIN(32, txfm_size_col) - 1; + *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob]; + const int temp_eoby = eob / (eobx_max + 1); + assert(temp_eoby < 32); + *eoby = eob_fill[temp_eoby]; +} + +#endif // AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ diff --git a/third_party/aom/av1/common/arm/convolve_neon.c b/third_party/aom/av1/common/arm/convolve_neon.c index 86a25e109..f15744c94 100644 --- a/third_party/aom/av1/common/arm/convolve_neon.c +++ b/third_party/aom/av1/common/arm/convolve_neon.c @@ -164,8 +164,8 @@ static INLINE uint8x8_t convolve8_vert_8x4_s32( void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { const uint8_t horiz_offset = filter_params_x->taps / 2 - 1; @@ -182,7 +182,7 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_q4 & SUBPEL_MASK); const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0); const int16x8_t shift_by_bits = vdupq_n_s16(-bits); @@ -485,8 +485,8 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { const int vert_offset = filter_params_y->taps / 2 - 1; @@ -502,7 +502,7 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_q4 & SUBPEL_MASK); if (w <= 4) { uint8x8_t d01, d23; @@ -680,8 +680,8 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { int im_dst_stride; @@ -711,7 +711,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits); const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_q4 & SUBPEL_MASK); int16_t x_filter_tmp[8]; int16x8_t filter_x_coef = vld1q_s16(x_filter); @@ -896,7 +896,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1)); const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_q4 & SUBPEL_MASK); const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1)); const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits); @@ -1086,8 +1086,8 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, } void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { (void)filter_params_x; diff --git a/third_party/aom/av1/common/arm/intrapred_neon.c b/third_party/aom/av1/common/arm/intrapred_neon.c deleted file mode 100644 index 799355553..000000000 --- a/third_party/aom/av1/common/arm/intrapred_neon.c +++ /dev/null @@ -1,79 +0,0 @@ -/* - * - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ -#include <arm_neon.h> -#include <assert.h> - -#include "aom_mem/aom_mem.h" -#include "aom_ports/mem.h" -#include "av1/common/arm/mem_neon.h" -#include "config/aom_dsp_rtcd.h" - -static INLINE void highbd_dc_predictor_neon(uint16_t *dst, ptrdiff_t stride, - int bw, const uint16_t *above, - const uint16_t *left) { - assert(bw >= 4); - assert(IS_POWER_OF_TWO(bw)); - int expected_dc, sum = 0; - const int count = bw * 2; - uint32x4_t sum_q = vdupq_n_u32(0); - uint32x2_t sum_d; - uint16_t *dst_1; - if (bw >= 8) { - for (int i = 0; i < bw; i += 8) { - sum_q = vpadalq_u16(sum_q, vld1q_u16(above)); - sum_q = vpadalq_u16(sum_q, vld1q_u16(left)); - above += 8; - left += 8; - } - sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q)); - sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0); - expected_dc = (sum + (count >> 1)) / count; - const uint16x8_t dc = vdupq_n_u16((uint16_t)expected_dc); - for (int r = 0; r < bw; r++) { - dst_1 = dst; - for (int i = 0; i < bw; i += 8) { - vst1q_u16(dst_1, dc); - dst_1 += 8; - } - dst += stride; - } - } else { // 4x4 - sum_q = vaddl_u16(vld1_u16(above), vld1_u16(left)); - sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q)); - sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0); - expected_dc = (sum + (count >> 1)) / count; - const uint16x4_t dc = vdup_n_u16((uint16_t)expected_dc); - for (int r = 0; r < bw; r++) { - vst1_u16(dst, dc); - dst += stride; - } - } -} - -#define intra_pred_highbd_sized(type, width) \ - void aom_highbd_##type##_predictor_##width##x##width##_neon( \ - uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ - const uint16_t *left, int bd) { \ - (void)bd; \ - highbd_##type##_predictor_neon(dst, stride, width, above, left); \ - } - -#define intra_pred_square(type) \ - intra_pred_highbd_sized(type, 4); \ - intra_pred_highbd_sized(type, 8); \ - intra_pred_highbd_sized(type, 16); \ - intra_pred_highbd_sized(type, 32); \ - intra_pred_highbd_sized(type, 64); - -intra_pred_square(dc); - -#undef intra_pred_square diff --git a/third_party/aom/av1/common/arm/jnt_convolve_neon.c b/third_party/aom/av1/common/arm/jnt_convolve_neon.c index 992be4a9e..4015082b4 100644 --- a/third_party/aom/av1/common/arm/jnt_convolve_neon.c +++ b/third_party/aom/av1/common/arm/jnt_convolve_neon.c @@ -515,8 +515,8 @@ static INLINE void jnt_convolve_2d_vert_neon( void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { assert(!(w % 4)); @@ -532,9 +532,9 @@ void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8, const int round_0 = conv_params->round_0 - 1; const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_q4 & SUBPEL_MASK); const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_q4 & SUBPEL_MASK); int16_t x_filter_tmp[8]; int16x8_t filter_x_coef = vld1q_s16(x_filter); @@ -553,8 +553,8 @@ void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8, void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { uint8x8_t res0_8, res1_8, res2_8, res3_8, tmp_shift0, tmp_shift1, tmp_shift2, @@ -679,8 +679,8 @@ void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride, void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { assert(!(w % 4)); @@ -705,7 +705,7 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, // horizontal filter const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_q4 & SUBPEL_MASK); const uint8_t *src_ptr = src - horiz_offset; @@ -1013,8 +1013,8 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { assert(!(w % 4)); @@ -1040,7 +1040,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, // vertical filter const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_q4 & SUBPEL_MASK); const uint8_t *src_ptr = src - (vert_offset * src_stride); diff --git a/third_party/aom/av1/common/arm/mem_neon.h b/third_party/aom/av1/common/arm/mem_neon.h index 214b14bf7..4bf45a52c 100644 --- a/third_party/aom/av1/common/arm/mem_neon.h +++ b/third_party/aom/av1/common/arm/mem_neon.h @@ -22,6 +22,14 @@ static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0, s += p; } +/* These intrinsics require immediate values, so we must use #defines + to enforce that. */ +#define load_u8_4x1(s, s0, lane) \ + do { \ + *(s0) = vreinterpret_u8_u32( \ + vld1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(*(s0)), lane)); \ + } while (0) + static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p, uint8x8_t *const s0, uint8x8_t *const s1, uint8x8_t *const s2, uint8x8_t *const s3, @@ -128,6 +136,13 @@ static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p, *s3 = vld1_s16(s); } +/* These intrinsics require immediate values, so we must use #defines + to enforce that. */ +#define store_u8_4x1(s, s0, lane) \ + do { \ + vst1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(s0), lane); \ + } while (0) + static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0, const uint8x8_t s1, const uint8x8_t s2, const uint8x8_t s3, const uint8x8_t s4, @@ -242,6 +257,30 @@ static INLINE void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride, vst1q_s16(s, s7); } +static INLINE void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride, + const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3) { + vst1_s16(s, s0); + s += dst_stride; + vst1_s16(s, s1); + s += dst_stride; + vst1_s16(s, s2); + s += dst_stride; + vst1_s16(s, s3); +} + +static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride, + const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3) { + vst1q_s16(s, s0); + s += dst_stride; + vst1q_s16(s, s1); + s += dst_stride; + vst1q_s16(s, s2); + s += dst_stride; + vst1q_s16(s, s3); +} + static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p, int16x8_t *const s0, int16x8_t *const s1, int16x8_t *const s2, int16x8_t *const s3, @@ -398,4 +437,49 @@ static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride, *tu1 = vsetq_lane_u64(a, *tu1, 1); } +static INLINE void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1, + int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) { + *s1 = vld1q_s32(s); + s += p; + *s2 = vld1q_s32(s); + s += p; + *s3 = vld1q_s32(s); + s += p; + *s4 = vld1q_s32(s); +} + +static INLINE void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1, + int32x4_t s2, int32x4_t s3, int32x4_t s4) { + vst1q_s32(s, s1); + s += p; + vst1q_s32(s, s2); + s += p; + vst1q_s32(s, s3); + s += p; + vst1q_s32(s, s4); +} + +static INLINE void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1, + uint32x4_t *s2, uint32x4_t *s3, + uint32x4_t *s4) { + *s1 = vld1q_u32(s); + s += p; + *s2 = vld1q_u32(s); + s += p; + *s3 = vld1q_u32(s); + s += p; + *s4 = vld1q_u32(s); +} + +static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1, + uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) { + vst1q_u32(s, s1); + s += p; + vst1q_u32(s, s2); + s += p; + vst1q_u32(s, s3); + s += p; + vst1q_u32(s, s4); +} + #endif // AV1_COMMON_ARM_MEM_NEON_H_ diff --git a/third_party/aom/av1/common/arm/selfguided_neon.c b/third_party/aom/av1/common/arm/selfguided_neon.c new file mode 100644 index 000000000..b4808a972 --- /dev/null +++ b/third_party/aom/av1/common/arm/selfguided_neon.c @@ -0,0 +1,1506 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/txfm_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "av1/common/common.h" +#include "av1/common/onyxc_int.h" +#include "av1/common/resize.h" +#include "av1/common/restoration.h" +#include "av1/common/arm/mem_neon.h" +#include "av1/common/arm/transpose_neon.h" + +// Constants used for right shift in final_filter calculation. +#define NB_EVEN 5 +#define NB_ODD 4 + +static INLINE void calc_ab_fast_internal_common( + uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4, + uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, int32x4_t sr4, int32x4_t sr5, + int32x4_t sr6, int32x4_t sr7, uint32x4_t const_n_val, uint32x4_t s_vec, + uint32x4_t const_val, uint32x4_t one_by_n_minus_1_vec, + uint16x4_t sgrproj_sgr, int32_t *src1, uint16_t *dst_A16, int32_t *src2, + const int buf_stride) { + uint32x4_t q0, q1, q2, q3; + uint32x4_t p0, p1, p2, p3; + uint16x4_t d0, d1, d2, d3; + + s0 = vmulq_u32(s0, const_n_val); + s1 = vmulq_u32(s1, const_n_val); + s2 = vmulq_u32(s2, const_n_val); + s3 = vmulq_u32(s3, const_n_val); + + q0 = vmulq_u32(s4, s4); + q1 = vmulq_u32(s5, s5); + q2 = vmulq_u32(s6, s6); + q3 = vmulq_u32(s7, s7); + + p0 = vcleq_u32(q0, s0); + p1 = vcleq_u32(q1, s1); + p2 = vcleq_u32(q2, s2); + p3 = vcleq_u32(q3, s3); + + q0 = vsubq_u32(s0, q0); + q1 = vsubq_u32(s1, q1); + q2 = vsubq_u32(s2, q2); + q3 = vsubq_u32(s3, q3); + + p0 = vandq_u32(p0, q0); + p1 = vandq_u32(p1, q1); + p2 = vandq_u32(p2, q2); + p3 = vandq_u32(p3, q3); + + p0 = vmulq_u32(p0, s_vec); + p1 = vmulq_u32(p1, s_vec); + p2 = vmulq_u32(p2, s_vec); + p3 = vmulq_u32(p3, s_vec); + + p0 = vrshrq_n_u32(p0, SGRPROJ_MTABLE_BITS); + p1 = vrshrq_n_u32(p1, SGRPROJ_MTABLE_BITS); + p2 = vrshrq_n_u32(p2, SGRPROJ_MTABLE_BITS); + p3 = vrshrq_n_u32(p3, SGRPROJ_MTABLE_BITS); + + p0 = vminq_u32(p0, const_val); + p1 = vminq_u32(p1, const_val); + p2 = vminq_u32(p2, const_val); + p3 = vminq_u32(p3, const_val); + + { + store_u32_4x4((uint32_t *)src1, buf_stride, p0, p1, p2, p3); + + for (int x = 0; x < 4; x++) { + for (int y = 0; y < 4; y++) { + dst_A16[x * buf_stride + y] = x_by_xplus1[src1[x * buf_stride + y]]; + } + } + load_u16_4x4(dst_A16, buf_stride, &d0, &d1, &d2, &d3); + } + p0 = vsubl_u16(sgrproj_sgr, d0); + p1 = vsubl_u16(sgrproj_sgr, d1); + p2 = vsubl_u16(sgrproj_sgr, d2); + p3 = vsubl_u16(sgrproj_sgr, d3); + + s4 = vmulq_u32(vreinterpretq_u32_s32(sr4), one_by_n_minus_1_vec); + s5 = vmulq_u32(vreinterpretq_u32_s32(sr5), one_by_n_minus_1_vec); + s6 = vmulq_u32(vreinterpretq_u32_s32(sr6), one_by_n_minus_1_vec); + s7 = vmulq_u32(vreinterpretq_u32_s32(sr7), one_by_n_minus_1_vec); + + s4 = vmulq_u32(s4, p0); + s5 = vmulq_u32(s5, p1); + s6 = vmulq_u32(s6, p2); + s7 = vmulq_u32(s7, p3); + + p0 = vrshrq_n_u32(s4, SGRPROJ_RECIP_BITS); + p1 = vrshrq_n_u32(s5, SGRPROJ_RECIP_BITS); + p2 = vrshrq_n_u32(s6, SGRPROJ_RECIP_BITS); + p3 = vrshrq_n_u32(s7, SGRPROJ_RECIP_BITS); + + store_s32_4x4(src2, buf_stride, vreinterpretq_s32_u32(p0), + vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2), + vreinterpretq_s32_u32(p3)); +} +static INLINE void calc_ab_internal_common( + uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4, + uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, uint16x8_t s16_0, + uint16x8_t s16_1, uint16x8_t s16_2, uint16x8_t s16_3, uint16x8_t s16_4, + uint16x8_t s16_5, uint16x8_t s16_6, uint16x8_t s16_7, + uint32x4_t const_n_val, uint32x4_t s_vec, uint32x4_t const_val, + uint16x4_t one_by_n_minus_1_vec, uint16x8_t sgrproj_sgr, int32_t *src1, + uint16_t *dst_A16, int32_t *dst2, const int buf_stride) { + uint16x4_t d0, d1, d2, d3, d4, d5, d6, d7; + uint32x4_t q0, q1, q2, q3, q4, q5, q6, q7; + uint32x4_t p0, p1, p2, p3, p4, p5, p6, p7; + + s0 = vmulq_u32(s0, const_n_val); + s1 = vmulq_u32(s1, const_n_val); + s2 = vmulq_u32(s2, const_n_val); + s3 = vmulq_u32(s3, const_n_val); + s4 = vmulq_u32(s4, const_n_val); + s5 = vmulq_u32(s5, const_n_val); + s6 = vmulq_u32(s6, const_n_val); + s7 = vmulq_u32(s7, const_n_val); + + d0 = vget_low_u16(s16_4); + d1 = vget_low_u16(s16_5); + d2 = vget_low_u16(s16_6); + d3 = vget_low_u16(s16_7); + d4 = vget_high_u16(s16_4); + d5 = vget_high_u16(s16_5); + d6 = vget_high_u16(s16_6); + d7 = vget_high_u16(s16_7); + + q0 = vmull_u16(d0, d0); + q1 = vmull_u16(d1, d1); + q2 = vmull_u16(d2, d2); + q3 = vmull_u16(d3, d3); + q4 = vmull_u16(d4, d4); + q5 = vmull_u16(d5, d5); + q6 = vmull_u16(d6, d6); + q7 = vmull_u16(d7, d7); + + p0 = vcleq_u32(q0, s0); + p1 = vcleq_u32(q1, s1); + p2 = vcleq_u32(q2, s2); + p3 = vcleq_u32(q3, s3); + p4 = vcleq_u32(q4, s4); + p5 = vcleq_u32(q5, s5); + p6 = vcleq_u32(q6, s6); + p7 = vcleq_u32(q7, s7); + + q0 = vsubq_u32(s0, q0); + q1 = vsubq_u32(s1, q1); + q2 = vsubq_u32(s2, q2); + q3 = vsubq_u32(s3, q3); + q4 = vsubq_u32(s4, q4); + q5 = vsubq_u32(s5, q5); + q6 = vsubq_u32(s6, q6); + q7 = vsubq_u32(s7, q7); + + p0 = vandq_u32(p0, q0); + p1 = vandq_u32(p1, q1); + p2 = vandq_u32(p2, q2); + p3 = vandq_u32(p3, q3); + p4 = vandq_u32(p4, q4); + p5 = vandq_u32(p5, q5); + p6 = vandq_u32(p6, q6); + p7 = vandq_u32(p7, q7); + + p0 = vmulq_u32(p0, s_vec); + p1 = vmulq_u32(p1, s_vec); + p2 = vmulq_u32(p2, s_vec); + p3 = vmulq_u32(p3, s_vec); + p4 = vmulq_u32(p4, s_vec); + p5 = vmulq_u32(p5, s_vec); + p6 = vmulq_u32(p6, s_vec); + p7 = vmulq_u32(p7, s_vec); + + p0 = vrshrq_n_u32(p0, SGRPROJ_MTABLE_BITS); + p1 = vrshrq_n_u32(p1, SGRPROJ_MTABLE_BITS); + p2 = vrshrq_n_u32(p2, SGRPROJ_MTABLE_BITS); + p3 = vrshrq_n_u32(p3, SGRPROJ_MTABLE_BITS); + p4 = vrshrq_n_u32(p4, SGRPROJ_MTABLE_BITS); + p5 = vrshrq_n_u32(p5, SGRPROJ_MTABLE_BITS); + p6 = vrshrq_n_u32(p6, SGRPROJ_MTABLE_BITS); + p7 = vrshrq_n_u32(p7, SGRPROJ_MTABLE_BITS); + + p0 = vminq_u32(p0, const_val); + p1 = vminq_u32(p1, const_val); + p2 = vminq_u32(p2, const_val); + p3 = vminq_u32(p3, const_val); + p4 = vminq_u32(p4, const_val); + p5 = vminq_u32(p5, const_val); + p6 = vminq_u32(p6, const_val); + p7 = vminq_u32(p7, const_val); + + { + store_u32_4x4((uint32_t *)src1, buf_stride, p0, p1, p2, p3); + store_u32_4x4((uint32_t *)src1 + 4, buf_stride, p4, p5, p6, p7); + + for (int x = 0; x < 4; x++) { + for (int y = 0; y < 8; y++) { + dst_A16[x * buf_stride + y] = x_by_xplus1[src1[x * buf_stride + y]]; + } + } + load_u16_8x4(dst_A16, buf_stride, &s16_4, &s16_5, &s16_6, &s16_7); + } + + s16_4 = vsubq_u16(sgrproj_sgr, s16_4); + s16_5 = vsubq_u16(sgrproj_sgr, s16_5); + s16_6 = vsubq_u16(sgrproj_sgr, s16_6); + s16_7 = vsubq_u16(sgrproj_sgr, s16_7); + + s0 = vmull_u16(vget_low_u16(s16_0), one_by_n_minus_1_vec); + s1 = vmull_u16(vget_low_u16(s16_1), one_by_n_minus_1_vec); + s2 = vmull_u16(vget_low_u16(s16_2), one_by_n_minus_1_vec); + s3 = vmull_u16(vget_low_u16(s16_3), one_by_n_minus_1_vec); + s4 = vmull_u16(vget_high_u16(s16_0), one_by_n_minus_1_vec); + s5 = vmull_u16(vget_high_u16(s16_1), one_by_n_minus_1_vec); + s6 = vmull_u16(vget_high_u16(s16_2), one_by_n_minus_1_vec); + s7 = vmull_u16(vget_high_u16(s16_3), one_by_n_minus_1_vec); + + s0 = vmulq_u32(s0, vmovl_u16(vget_low_u16(s16_4))); + s1 = vmulq_u32(s1, vmovl_u16(vget_low_u16(s16_5))); + s2 = vmulq_u32(s2, vmovl_u16(vget_low_u16(s16_6))); + s3 = vmulq_u32(s3, vmovl_u16(vget_low_u16(s16_7))); + s4 = vmulq_u32(s4, vmovl_u16(vget_high_u16(s16_4))); + s5 = vmulq_u32(s5, vmovl_u16(vget_high_u16(s16_5))); + s6 = vmulq_u32(s6, vmovl_u16(vget_high_u16(s16_6))); + s7 = vmulq_u32(s7, vmovl_u16(vget_high_u16(s16_7))); + + p0 = vrshrq_n_u32(s0, SGRPROJ_RECIP_BITS); + p1 = vrshrq_n_u32(s1, SGRPROJ_RECIP_BITS); + p2 = vrshrq_n_u32(s2, SGRPROJ_RECIP_BITS); + p3 = vrshrq_n_u32(s3, SGRPROJ_RECIP_BITS); + p4 = vrshrq_n_u32(s4, SGRPROJ_RECIP_BITS); + p5 = vrshrq_n_u32(s5, SGRPROJ_RECIP_BITS); + p6 = vrshrq_n_u32(s6, SGRPROJ_RECIP_BITS); + p7 = vrshrq_n_u32(s7, SGRPROJ_RECIP_BITS); + + store_s32_4x4(dst2, buf_stride, vreinterpretq_s32_u32(p0), + vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2), + vreinterpretq_s32_u32(p3)); + store_s32_4x4(dst2 + 4, buf_stride, vreinterpretq_s32_u32(p4), + vreinterpretq_s32_u32(p5), vreinterpretq_s32_u32(p6), + vreinterpretq_s32_u32(p7)); +} + +static INLINE void boxsum2_square_sum_calc( + int16x4_t t1, int16x4_t t2, int16x4_t t3, int16x4_t t4, int16x4_t t5, + int16x4_t t6, int16x4_t t7, int16x4_t t8, int16x4_t t9, int16x4_t t10, + int16x4_t t11, int32x4_t *r0, int32x4_t *r1, int32x4_t *r2, int32x4_t *r3) { + int32x4_t d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11; + int32x4_t r12, r34, r67, r89, r1011; + int32x4_t r345, r6789, r789; + + d1 = vmull_s16(t1, t1); + d2 = vmull_s16(t2, t2); + d3 = vmull_s16(t3, t3); + d4 = vmull_s16(t4, t4); + d5 = vmull_s16(t5, t5); + d6 = vmull_s16(t6, t6); + d7 = vmull_s16(t7, t7); + d8 = vmull_s16(t8, t8); + d9 = vmull_s16(t9, t9); + d10 = vmull_s16(t10, t10); + d11 = vmull_s16(t11, t11); + + r12 = vaddq_s32(d1, d2); + r34 = vaddq_s32(d3, d4); + r67 = vaddq_s32(d6, d7); + r89 = vaddq_s32(d8, d9); + r1011 = vaddq_s32(d10, d11); + r345 = vaddq_s32(r34, d5); + r6789 = vaddq_s32(r67, r89); + r789 = vsubq_s32(r6789, d6); + *r0 = vaddq_s32(r12, r345); + *r1 = vaddq_s32(r67, r345); + *r2 = vaddq_s32(d5, r6789); + *r3 = vaddq_s32(r789, r1011); +} + +static INLINE void boxsum2(int16_t *src, const int src_stride, int16_t *dst16, + int32_t *dst32, int32_t *dst2, const int dst_stride, + const int width, const int height) { + assert(width > 2 * SGRPROJ_BORDER_HORZ); + assert(height > 2 * SGRPROJ_BORDER_VERT); + + int16_t *dst1_16_ptr, *src_ptr; + int32_t *dst2_ptr; + int h, w, count = 0; + const int dst_stride_2 = (dst_stride << 1); + const int dst_stride_8 = (dst_stride << 3); + + dst1_16_ptr = dst16; + dst2_ptr = dst2; + src_ptr = src; + w = width; + { + int16x8_t t1, t2, t3, t4, t5, t6, t7; + int16x8_t t8, t9, t10, t11, t12; + + int16x8_t q12345, q56789, q34567, q7891011; + int16x8_t q12, q34, q67, q89, q1011; + int16x8_t q345, q6789, q789; + + int32x4_t r12345, r56789, r34567, r7891011; + + do { + h = height; + dst1_16_ptr = dst16 + (count << 3); + dst2_ptr = dst2 + (count << 3); + src_ptr = src + (count << 3); + + dst1_16_ptr += dst_stride_2; + dst2_ptr += dst_stride_2; + do { + load_s16_8x4(src_ptr, src_stride, &t1, &t2, &t3, &t4); + src_ptr += 4 * src_stride; + load_s16_8x4(src_ptr, src_stride, &t5, &t6, &t7, &t8); + src_ptr += 4 * src_stride; + load_s16_8x4(src_ptr, src_stride, &t9, &t10, &t11, &t12); + + q12 = vaddq_s16(t1, t2); + q34 = vaddq_s16(t3, t4); + q67 = vaddq_s16(t6, t7); + q89 = vaddq_s16(t8, t9); + q1011 = vaddq_s16(t10, t11); + q345 = vaddq_s16(q34, t5); + q6789 = vaddq_s16(q67, q89); + q789 = vaddq_s16(q89, t7); + q12345 = vaddq_s16(q12, q345); + q34567 = vaddq_s16(q67, q345); + q56789 = vaddq_s16(t5, q6789); + q7891011 = vaddq_s16(q789, q1011); + + store_s16_8x4(dst1_16_ptr, dst_stride_2, q12345, q34567, q56789, + q7891011); + dst1_16_ptr += dst_stride_8; + + boxsum2_square_sum_calc( + vget_low_s16(t1), vget_low_s16(t2), vget_low_s16(t3), + vget_low_s16(t4), vget_low_s16(t5), vget_low_s16(t6), + vget_low_s16(t7), vget_low_s16(t8), vget_low_s16(t9), + vget_low_s16(t10), vget_low_s16(t11), &r12345, &r34567, &r56789, + &r7891011); + + store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r34567, r56789, r7891011); + + boxsum2_square_sum_calc( + vget_high_s16(t1), vget_high_s16(t2), vget_high_s16(t3), + vget_high_s16(t4), vget_high_s16(t5), vget_high_s16(t6), + vget_high_s16(t7), vget_high_s16(t8), vget_high_s16(t9), + vget_high_s16(t10), vget_high_s16(t11), &r12345, &r34567, &r56789, + &r7891011); + + store_s32_4x4(dst2_ptr + 4, dst_stride_2, r12345, r34567, r56789, + r7891011); + dst2_ptr += (dst_stride_8); + h -= 8; + } while (h > 0); + w -= 8; + count++; + } while (w > 0); + } + + { + int16x4_t s1, s2, s3, s4, s5, s6, s7, s8; + int32x4_t d1, d2, d3, d4, d5, d6, d7, d8; + int32x4_t q12345, q34567, q23456, q45678; + int32x4_t q23, q45, q67; + int32x4_t q2345, q4567; + + int32x4_t r12345, r34567, r23456, r45678; + int32x4_t r23, r45, r67; + int32x4_t r2345, r4567; + + int32_t *src2_ptr, *dst1_32_ptr; + int16_t *src1_ptr; + count = 0; + h = height; + do { + dst1_32_ptr = dst32 + count * dst_stride_8 + (dst_stride_2); + dst2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2); + src1_ptr = dst16 + count * dst_stride_8 + (dst_stride_2); + src2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2); + w = width; + + dst1_32_ptr += 2; + dst2_ptr += 2; + load_s16_4x4(src1_ptr, dst_stride_2, &s1, &s2, &s3, &s4); + transpose_s16_4x4d(&s1, &s2, &s3, &s4); + load_s32_4x4(src2_ptr, dst_stride_2, &d1, &d2, &d3, &d4); + transpose_s32_4x4(&d1, &d2, &d3, &d4); + do { + src1_ptr += 4; + src2_ptr += 4; + load_s16_4x4(src1_ptr, dst_stride_2, &s5, &s6, &s7, &s8); + transpose_s16_4x4d(&s5, &s6, &s7, &s8); + load_s32_4x4(src2_ptr, dst_stride_2, &d5, &d6, &d7, &d8); + transpose_s32_4x4(&d5, &d6, &d7, &d8); + q23 = vaddl_s16(s2, s3); + q45 = vaddl_s16(s4, s5); + q67 = vaddl_s16(s6, s7); + q2345 = vaddq_s32(q23, q45); + q4567 = vaddq_s32(q45, q67); + q12345 = vaddq_s32(vmovl_s16(s1), q2345); + q23456 = vaddq_s32(q2345, vmovl_s16(s6)); + q34567 = vaddq_s32(q4567, vmovl_s16(s3)); + q45678 = vaddq_s32(q4567, vmovl_s16(s8)); + + transpose_s32_4x4(&q12345, &q23456, &q34567, &q45678); + store_s32_4x4(dst1_32_ptr, dst_stride_2, q12345, q23456, q34567, + q45678); + dst1_32_ptr += 4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + + r23 = vaddq_s32(d2, d3); + r45 = vaddq_s32(d4, d5); + r67 = vaddq_s32(d6, d7); + r2345 = vaddq_s32(r23, r45); + r4567 = vaddq_s32(r45, r67); + r12345 = vaddq_s32(d1, r2345); + r23456 = vaddq_s32(r2345, d6); + r34567 = vaddq_s32(r4567, d3); + r45678 = vaddq_s32(r4567, d8); + + transpose_s32_4x4(&r12345, &r23456, &r34567, &r45678); + store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r23456, r34567, r45678); + dst2_ptr += 4; + d1 = d5; + d2 = d6; + d3 = d7; + d4 = d8; + w -= 4; + } while (w > 0); + h -= 8; + count++; + } while (h > 0); + } +} + +static INLINE void calc_ab_internal_lbd(int32_t *A, uint16_t *A16, + uint16_t *B16, int32_t *B, + const int buf_stride, const int width, + const int height, const int r, + const int s, const int ht_inc) { + int32_t *src1, *dst2, count = 0; + uint16_t *dst_A16, *src2; + const uint32_t n = (2 * r + 1) * (2 * r + 1); + const uint32x4_t const_n_val = vdupq_n_u32(n); + const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR); + const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(one_by_x[n - 1]); + const uint32x4_t const_val = vdupq_n_u32(255); + + uint16x8_t s16_0, s16_1, s16_2, s16_3, s16_4, s16_5, s16_6, s16_7; + + uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7; + + const uint32x4_t s_vec = vdupq_n_u32(s); + int w, h = height; + + do { + dst_A16 = A16 + (count << 2) * buf_stride; + src1 = A + (count << 2) * buf_stride; + src2 = B16 + (count << 2) * buf_stride; + dst2 = B + (count << 2) * buf_stride; + w = width; + do { + load_u32_4x4((uint32_t *)src1, buf_stride, &s0, &s1, &s2, &s3); + load_u32_4x4((uint32_t *)src1 + 4, buf_stride, &s4, &s5, &s6, &s7); + load_u16_8x4(src2, buf_stride, &s16_0, &s16_1, &s16_2, &s16_3); + + s16_4 = s16_0; + s16_5 = s16_1; + s16_6 = s16_2; + s16_7 = s16_3; + + calc_ab_internal_common( + s0, s1, s2, s3, s4, s5, s6, s7, s16_0, s16_1, s16_2, s16_3, s16_4, + s16_5, s16_6, s16_7, const_n_val, s_vec, const_val, + one_by_n_minus_1_vec, sgrproj_sgr, src1, dst_A16, dst2, buf_stride); + + w -= 8; + dst2 += 8; + src1 += 8; + src2 += 8; + dst_A16 += 8; + } while (w > 0); + count++; + h -= (ht_inc * 4); + } while (h > 0); +} + +static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16, + uint16_t *B16, int32_t *B, + const int buf_stride, const int width, + const int height, const int bit_depth, + const int r, const int s, + const int ht_inc) { + int32_t *src1, *dst2, count = 0; + uint16_t *dst_A16, *src2; + const uint32_t n = (2 * r + 1) * (2 * r + 1); + const int16x8_t bd_min_2_vec = vdupq_n_s16(-(bit_depth - 8)); + const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1)); + const uint32x4_t const_n_val = vdupq_n_u32(n); + const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR); + const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(one_by_x[n - 1]); + const uint32x4_t const_val = vdupq_n_u32(255); + + int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; + uint16x8_t s16_0, s16_1, s16_2, s16_3; + uint16x8_t s16_4, s16_5, s16_6, s16_7; + uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7; + + const uint32x4_t s_vec = vdupq_n_u32(s); + int w, h = height; + + do { + src1 = A + (count << 2) * buf_stride; + src2 = B16 + (count << 2) * buf_stride; + dst2 = B + (count << 2) * buf_stride; + dst_A16 = A16 + (count << 2) * buf_stride; + w = width; + do { + load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3); + load_s32_4x4(src1 + 4, buf_stride, &sr4, &sr5, &sr6, &sr7); + load_u16_8x4(src2, buf_stride, &s16_0, &s16_1, &s16_2, &s16_3); + + s0 = vrshlq_u32(vreinterpretq_u32_s32(sr0), bd_min_1_vec); + s1 = vrshlq_u32(vreinterpretq_u32_s32(sr1), bd_min_1_vec); + s2 = vrshlq_u32(vreinterpretq_u32_s32(sr2), bd_min_1_vec); + s3 = vrshlq_u32(vreinterpretq_u32_s32(sr3), bd_min_1_vec); + s4 = vrshlq_u32(vreinterpretq_u32_s32(sr4), bd_min_1_vec); + s5 = vrshlq_u32(vreinterpretq_u32_s32(sr5), bd_min_1_vec); + s6 = vrshlq_u32(vreinterpretq_u32_s32(sr6), bd_min_1_vec); + s7 = vrshlq_u32(vreinterpretq_u32_s32(sr7), bd_min_1_vec); + + s16_4 = vrshlq_u16(s16_0, bd_min_2_vec); + s16_5 = vrshlq_u16(s16_1, bd_min_2_vec); + s16_6 = vrshlq_u16(s16_2, bd_min_2_vec); + s16_7 = vrshlq_u16(s16_3, bd_min_2_vec); + + calc_ab_internal_common( + s0, s1, s2, s3, s4, s5, s6, s7, s16_0, s16_1, s16_2, s16_3, s16_4, + s16_5, s16_6, s16_7, const_n_val, s_vec, const_val, + one_by_n_minus_1_vec, sgrproj_sgr, src1, dst_A16, dst2, buf_stride); + + w -= 8; + dst2 += 8; + src1 += 8; + src2 += 8; + dst_A16 += 8; + } while (w > 0); + count++; + h -= (ht_inc * 4); + } while (h > 0); +} + +static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16, + int32_t *B, const int buf_stride, + const int width, const int height, + const int r, const int s, + const int ht_inc) { + int32_t *src1, *src2, count = 0; + uint16_t *dst_A16; + const uint32_t n = (2 * r + 1) * (2 * r + 1); + const uint32x4_t const_n_val = vdupq_n_u32(n); + const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR); + const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(one_by_x[n - 1]); + const uint32x4_t const_val = vdupq_n_u32(255); + + int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; + uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7; + + const uint32x4_t s_vec = vdupq_n_u32(s); + int w, h = height; + + do { + src1 = A + (count << 2) * buf_stride; + src2 = B + (count << 2) * buf_stride; + dst_A16 = A16 + (count << 2) * buf_stride; + w = width; + do { + load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3); + load_s32_4x4(src2, buf_stride, &sr4, &sr5, &sr6, &sr7); + + s0 = vreinterpretq_u32_s32(sr0); + s1 = vreinterpretq_u32_s32(sr1); + s2 = vreinterpretq_u32_s32(sr2); + s3 = vreinterpretq_u32_s32(sr3); + s4 = vreinterpretq_u32_s32(sr4); + s5 = vreinterpretq_u32_s32(sr5); + s6 = vreinterpretq_u32_s32(sr6); + s7 = vreinterpretq_u32_s32(sr7); + + calc_ab_fast_internal_common(s0, s1, s2, s3, s4, s5, s6, s7, sr4, sr5, + sr6, sr7, const_n_val, s_vec, const_val, + one_by_n_minus_1_vec, sgrproj_sgr, src1, + dst_A16, src2, buf_stride); + + w -= 4; + src1 += 4; + src2 += 4; + dst_A16 += 4; + } while (w > 0); + count++; + h -= (ht_inc * 4); + } while (h > 0); +} + +static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16, + int32_t *B, const int buf_stride, + const int width, const int height, + const int bit_depth, const int r, + const int s, const int ht_inc) { + int32_t *src1, *src2, count = 0; + uint16_t *dst_A16; + const uint32_t n = (2 * r + 1) * (2 * r + 1); + const int32x4_t bd_min_2_vec = vdupq_n_s32(-(bit_depth - 8)); + const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1)); + const uint32x4_t const_n_val = vdupq_n_u32(n); + const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR); + const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(one_by_x[n - 1]); + const uint32x4_t const_val = vdupq_n_u32(255); + + int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; + uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7; + + const uint32x4_t s_vec = vdupq_n_u32(s); + int w, h = height; + + do { + src1 = A + (count << 2) * buf_stride; + src2 = B + (count << 2) * buf_stride; + dst_A16 = A16 + (count << 2) * buf_stride; + w = width; + do { + load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3); + load_s32_4x4(src2, buf_stride, &sr4, &sr5, &sr6, &sr7); + + s0 = vrshlq_u32(vreinterpretq_u32_s32(sr0), bd_min_1_vec); + s1 = vrshlq_u32(vreinterpretq_u32_s32(sr1), bd_min_1_vec); + s2 = vrshlq_u32(vreinterpretq_u32_s32(sr2), bd_min_1_vec); + s3 = vrshlq_u32(vreinterpretq_u32_s32(sr3), bd_min_1_vec); + s4 = vrshlq_u32(vreinterpretq_u32_s32(sr4), bd_min_2_vec); + s5 = vrshlq_u32(vreinterpretq_u32_s32(sr5), bd_min_2_vec); + s6 = vrshlq_u32(vreinterpretq_u32_s32(sr6), bd_min_2_vec); + s7 = vrshlq_u32(vreinterpretq_u32_s32(sr7), bd_min_2_vec); + + calc_ab_fast_internal_common(s0, s1, s2, s3, s4, s5, s6, s7, sr4, sr5, + sr6, sr7, const_n_val, s_vec, const_val, + one_by_n_minus_1_vec, sgrproj_sgr, src1, + dst_A16, src2, buf_stride); + + w -= 4; + src1 += 4; + src2 += 4; + dst_A16 += 4; + } while (w > 0); + count++; + h -= (ht_inc * 4); + } while (h > 0); +} + +static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1, + int32_t *dst2, const int dst_stride, const int width, + const int height) { + assert(width > 2 * SGRPROJ_BORDER_HORZ); + assert(height > 2 * SGRPROJ_BORDER_VERT); + + int16_t *src_ptr; + int32_t *dst2_ptr; + uint16_t *dst1_ptr; + int h, w, count = 0; + + w = width; + { + int16x8_t s1, s2, s3, s4, s5, s6, s7, s8; + int16x8_t q23, q34, q56, q234, q345, q456, q567; + int32x4_t r23, r56, r345, r456, r567, r78, r678; + int32x4_t r4_low, r4_high, r34_low, r34_high, r234_low, r234_high; + int32x4_t r2, r3, r5, r6, r7, r8; + int16x8_t q678, q78; + + do { + dst1_ptr = dst1 + (count << 3); + dst2_ptr = dst2 + (count << 3); + src_ptr = src + (count << 3); + h = height; + + load_s16_8x4(src_ptr, src_stride, &s1, &s2, &s3, &s4); + src_ptr += 4 * src_stride; + + q23 = vaddq_s16(s2, s3); + q234 = vaddq_s16(q23, s4); + q34 = vaddq_s16(s3, s4); + dst1_ptr += (dst_stride << 1); + + r2 = vmull_s16(vget_low_s16(s2), vget_low_s16(s2)); + r3 = vmull_s16(vget_low_s16(s3), vget_low_s16(s3)); + r4_low = vmull_s16(vget_low_s16(s4), vget_low_s16(s4)); + r23 = vaddq_s32(r2, r3); + r234_low = vaddq_s32(r23, r4_low); + r34_low = vaddq_s32(r3, r4_low); + + r2 = vmull_s16(vget_high_s16(s2), vget_high_s16(s2)); + r3 = vmull_s16(vget_high_s16(s3), vget_high_s16(s3)); + r4_high = vmull_s16(vget_high_s16(s4), vget_high_s16(s4)); + r23 = vaddq_s32(r2, r3); + r234_high = vaddq_s32(r23, r4_high); + r34_high = vaddq_s32(r3, r4_high); + + dst2_ptr += (dst_stride << 1); + + do { + load_s16_8x4(src_ptr, src_stride, &s5, &s6, &s7, &s8); + src_ptr += 4 * src_stride; + + q345 = vaddq_s16(s5, q34); + q56 = vaddq_s16(s5, s6); + q456 = vaddq_s16(s4, q56); + q567 = vaddq_s16(s7, q56); + q78 = vaddq_s16(s7, s8); + q678 = vaddq_s16(s6, q78); + + store_s16_8x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567); + dst1_ptr += (dst_stride << 2); + + s4 = s8; + q34 = q78; + q234 = q678; + + r5 = vmull_s16(vget_low_s16(s5), vget_low_s16(s5)); + r6 = vmull_s16(vget_low_s16(s6), vget_low_s16(s6)); + r7 = vmull_s16(vget_low_s16(s7), vget_low_s16(s7)); + r8 = vmull_s16(vget_low_s16(s8), vget_low_s16(s8)); + + r345 = vaddq_s32(r5, r34_low); + r56 = vaddq_s32(r5, r6); + r456 = vaddq_s32(r4_low, r56); + r567 = vaddq_s32(r7, r56); + r78 = vaddq_s32(r7, r8); + r678 = vaddq_s32(r6, r78); + store_s32_4x4(dst2_ptr, dst_stride, r234_low, r345, r456, r567); + + r4_low = r8; + r34_low = r78; + r234_low = r678; + + r5 = vmull_s16(vget_high_s16(s5), vget_high_s16(s5)); + r6 = vmull_s16(vget_high_s16(s6), vget_high_s16(s6)); + r7 = vmull_s16(vget_high_s16(s7), vget_high_s16(s7)); + r8 = vmull_s16(vget_high_s16(s8), vget_high_s16(s8)); + + r345 = vaddq_s32(r5, r34_high); + r56 = vaddq_s32(r5, r6); + r456 = vaddq_s32(r4_high, r56); + r567 = vaddq_s32(r7, r56); + r78 = vaddq_s32(r7, r8); + r678 = vaddq_s32(r6, r78); + store_s32_4x4((dst2_ptr + 4), dst_stride, r234_high, r345, r456, r567); + dst2_ptr += (dst_stride << 2); + + r4_high = r8; + r34_high = r78; + r234_high = r678; + + h -= 4; + } while (h > 0); + w -= 8; + count++; + } while (w > 0); + } + + { + int16x4_t d1, d2, d3, d4, d5, d6, d7, d8; + int16x4_t q23, q34, q56, q234, q345, q456, q567; + int32x4_t r23, r56, r234, r345, r456, r567, r34, r78, r678; + int32x4_t r1, r2, r3, r4, r5, r6, r7, r8; + int16x4_t q678, q78; + + int32_t *src2_ptr; + uint16_t *src1_ptr; + count = 0; + h = height; + w = width; + do { + dst1_ptr = dst1 + (count << 2) * dst_stride; + dst2_ptr = dst2 + (count << 2) * dst_stride; + src1_ptr = dst1 + (count << 2) * dst_stride; + src2_ptr = dst2 + (count << 2) * dst_stride; + w = width; + + load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d1, &d2, &d3, &d4); + transpose_s16_4x4d(&d1, &d2, &d3, &d4); + load_s32_4x4(src2_ptr, dst_stride, &r1, &r2, &r3, &r4); + transpose_s32_4x4(&r1, &r2, &r3, &r4); + src1_ptr += 4; + src2_ptr += 4; + + q23 = vadd_s16(d2, d3); + q234 = vadd_s16(q23, d4); + q34 = vadd_s16(d3, d4); + dst1_ptr += 2; + r23 = vaddq_s32(r2, r3); + r234 = vaddq_s32(r23, r4); + r34 = vaddq_s32(r3, r4); + dst2_ptr += 2; + + do { + load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d5, &d6, &d7, &d8); + transpose_s16_4x4d(&d5, &d6, &d7, &d8); + load_s32_4x4(src2_ptr, dst_stride, &r5, &r6, &r7, &r8); + transpose_s32_4x4(&r5, &r6, &r7, &r8); + src1_ptr += 4; + src2_ptr += 4; + + q345 = vadd_s16(d5, q34); + q56 = vadd_s16(d5, d6); + q456 = vadd_s16(d4, q56); + q567 = vadd_s16(d7, q56); + q78 = vadd_s16(d7, d8); + q678 = vadd_s16(d6, q78); + transpose_s16_4x4d(&q234, &q345, &q456, &q567); + store_s16_4x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567); + dst1_ptr += 4; + + d4 = d8; + q34 = q78; + q234 = q678; + + r345 = vaddq_s32(r5, r34); + r56 = vaddq_s32(r5, r6); + r456 = vaddq_s32(r4, r56); + r567 = vaddq_s32(r7, r56); + r78 = vaddq_s32(r7, r8); + r678 = vaddq_s32(r6, r78); + transpose_s32_4x4(&r234, &r345, &r456, &r567); + store_s32_4x4(dst2_ptr, dst_stride, r234, r345, r456, r567); + dst2_ptr += 4; + + r4 = r8; + r34 = r78; + r234 = r678; + w -= 4; + } while (w > 0); + h -= 4; + count++; + } while (h > 0); + } +} + +static INLINE int32x4_t cross_sum_inp_s32(int32_t *buf, int buf_stride) { + int32x4_t xtr, xt, xtl, xl, x, xr, xbr, xb, xbl; + int32x4_t fours, threes, res; + + xtl = vld1q_s32(buf - buf_stride - 1); + xt = vld1q_s32(buf - buf_stride); + xtr = vld1q_s32(buf - buf_stride + 1); + xl = vld1q_s32(buf - 1); + x = vld1q_s32(buf); + xr = vld1q_s32(buf + 1); + xbl = vld1q_s32(buf + buf_stride - 1); + xb = vld1q_s32(buf + buf_stride); + xbr = vld1q_s32(buf + buf_stride + 1); + + fours = vaddq_s32(xl, vaddq_s32(xt, vaddq_s32(xr, vaddq_s32(xb, x)))); + threes = vaddq_s32(xtl, vaddq_s32(xtr, vaddq_s32(xbr, xbl))); + res = vsubq_s32(vshlq_n_s32(vaddq_s32(fours, threes), 2), threes); + return res; +} + +static INLINE void cross_sum_inp_u16(uint16_t *buf, int buf_stride, + int32x4_t *a0, int32x4_t *a1) { + uint16x8_t xtr, xt, xtl, xl, x, xr, xbr, xb, xbl; + uint16x8_t r0, r1; + + xtl = vld1q_u16(buf - buf_stride - 1); + xt = vld1q_u16(buf - buf_stride); + xtr = vld1q_u16(buf - buf_stride + 1); + xl = vld1q_u16(buf - 1); + x = vld1q_u16(buf); + xr = vld1q_u16(buf + 1); + xbl = vld1q_u16(buf + buf_stride - 1); + xb = vld1q_u16(buf + buf_stride); + xbr = vld1q_u16(buf + buf_stride + 1); + + xb = vaddq_u16(xb, x); + xt = vaddq_u16(xt, xr); + xl = vaddq_u16(xl, xb); + xl = vaddq_u16(xl, xt); + + r0 = vshlq_n_u16(xl, 2); + + xbl = vaddq_u16(xbl, xbr); + xtl = vaddq_u16(xtl, xtr); + xtl = vaddq_u16(xtl, xbl); + + r1 = vshlq_n_u16(xtl, 2); + r1 = vsubq_u16(r1, xtl); + + *a0 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_low_u16(r0)), vmovl_u16(vget_low_u16(r1)))); + *a1 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_high_u16(r0)), vmovl_u16(vget_high_u16(r1)))); +} + +static INLINE int32x4_t cross_sum_fast_even_row(int32_t *buf, int buf_stride) { + int32x4_t xtr, xt, xtl, xbr, xb, xbl; + int32x4_t fives, sixes, fives_plus_sixes; + + xtl = vld1q_s32(buf - buf_stride - 1); + xt = vld1q_s32(buf - buf_stride); + xtr = vld1q_s32(buf - buf_stride + 1); + xbl = vld1q_s32(buf + buf_stride - 1); + xb = vld1q_s32(buf + buf_stride); + xbr = vld1q_s32(buf + buf_stride + 1); + + fives = vaddq_s32(xtl, vaddq_s32(xtr, vaddq_s32(xbr, xbl))); + sixes = vaddq_s32(xt, xb); + fives_plus_sixes = vaddq_s32(fives, sixes); + + return vaddq_s32( + vaddq_s32(vshlq_n_s32(fives_plus_sixes, 2), fives_plus_sixes), sixes); +} + +static INLINE void cross_sum_fast_even_row_inp16(uint16_t *buf, int buf_stride, + int32x4_t *a0, int32x4_t *a1) { + uint16x8_t xtr, xt, xtl, xbr, xb, xbl, xb0; + + xtl = vld1q_u16(buf - buf_stride - 1); + xt = vld1q_u16(buf - buf_stride); + xtr = vld1q_u16(buf - buf_stride + 1); + xbl = vld1q_u16(buf + buf_stride - 1); + xb = vld1q_u16(buf + buf_stride); + xbr = vld1q_u16(buf + buf_stride + 1); + + xbr = vaddq_u16(xbr, xbl); + xtr = vaddq_u16(xtr, xtl); + xbr = vaddq_u16(xbr, xtr); + xtl = vshlq_n_u16(xbr, 2); + xbr = vaddq_u16(xtl, xbr); + + xb = vaddq_u16(xb, xt); + xb0 = vshlq_n_u16(xb, 1); + xb = vshlq_n_u16(xb, 2); + xb = vaddq_u16(xb, xb0); + + *a0 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_low_u16(xbr)), vmovl_u16(vget_low_u16(xb)))); + *a1 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_high_u16(xbr)), vmovl_u16(vget_high_u16(xb)))); +} + +static INLINE int32x4_t cross_sum_fast_odd_row(int32_t *buf) { + int32x4_t xl, x, xr; + int32x4_t fives, sixes, fives_plus_sixes; + + xl = vld1q_s32(buf - 1); + x = vld1q_s32(buf); + xr = vld1q_s32(buf + 1); + fives = vaddq_s32(xl, xr); + sixes = x; + fives_plus_sixes = vaddq_s32(fives, sixes); + + return vaddq_s32( + vaddq_s32(vshlq_n_s32(fives_plus_sixes, 2), fives_plus_sixes), sixes); +} + +static INLINE void cross_sum_fast_odd_row_inp16(uint16_t *buf, int32x4_t *a0, + int32x4_t *a1) { + uint16x8_t xl, x, xr; + uint16x8_t x0; + + xl = vld1q_u16(buf - 1); + x = vld1q_u16(buf); + xr = vld1q_u16(buf + 1); + xl = vaddq_u16(xl, xr); + x0 = vshlq_n_u16(xl, 2); + xl = vaddq_u16(xl, x0); + + x0 = vshlq_n_u16(x, 1); + x = vshlq_n_u16(x, 2); + x = vaddq_u16(x, x0); + + *a0 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_low_u16(xl)), vmovl_u16(vget_low_u16(x)))); + *a1 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_high_u16(xl)), vmovl_u16(vget_high_u16(x)))); +} + +void final_filter_fast_internal(uint16_t *A, int32_t *B, const int buf_stride, + int16_t *src, const int src_stride, + int32_t *dst, const int dst_stride, + const int width, const int height) { + int16x8_t s0; + int32_t *B_tmp, *dst_ptr; + uint16_t *A_tmp; + int16_t *src_ptr; + int32x4_t a_res0, a_res1, b_res0, b_res1; + int w, h, count = 0; + assert(SGRPROJ_SGR_BITS == 8); + assert(SGRPROJ_RST_BITS == 4); + + A_tmp = A; + B_tmp = B; + src_ptr = src; + dst_ptr = dst; + h = height; + do { + A_tmp = (A + count * buf_stride); + B_tmp = (B + count * buf_stride); + src_ptr = (src + count * src_stride); + dst_ptr = (dst + count * dst_stride); + w = width; + if (!(count & 1)) { + do { + s0 = vld1q_s16(src_ptr); + cross_sum_fast_even_row_inp16(A_tmp, buf_stride, &a_res0, &a_res1); + a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0); + a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1); + + b_res0 = cross_sum_fast_even_row(B_tmp, buf_stride); + b_res1 = cross_sum_fast_even_row(B_tmp + 4, buf_stride); + a_res0 = vaddq_s32(a_res0, b_res0); + a_res1 = vaddq_s32(a_res1, b_res1); + + a_res0 = + vrshrq_n_s32(a_res0, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS); + a_res1 = + vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS); + + vst1q_s32(dst_ptr, a_res0); + vst1q_s32(dst_ptr + 4, a_res1); + + A_tmp += 8; + B_tmp += 8; + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 0); + } else { + do { + s0 = vld1q_s16(src_ptr); + cross_sum_fast_odd_row_inp16(A_tmp, &a_res0, &a_res1); + a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0); + a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1); + + b_res0 = cross_sum_fast_odd_row(B_tmp); + b_res1 = cross_sum_fast_odd_row(B_tmp + 4); + a_res0 = vaddq_s32(a_res0, b_res0); + a_res1 = vaddq_s32(a_res1, b_res1); + + a_res0 = + vrshrq_n_s32(a_res0, SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS); + a_res1 = + vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS); + + vst1q_s32(dst_ptr, a_res0); + vst1q_s32(dst_ptr + 4, a_res1); + + A_tmp += 8; + B_tmp += 8; + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 0); + } + count++; + h -= 1; + } while (h > 0); +} + +void final_filter_internal(uint16_t *A, int32_t *B, const int buf_stride, + int16_t *src, const int src_stride, int32_t *dst, + const int dst_stride, const int width, + const int height) { + int16x8_t s0; + int32_t *B_tmp, *dst_ptr; + uint16_t *A_tmp; + int16_t *src_ptr; + int32x4_t a_res0, a_res1, b_res0, b_res1; + int w, h, count = 0; + + assert(SGRPROJ_SGR_BITS == 8); + assert(SGRPROJ_RST_BITS == 4); + h = height; + + do { + A_tmp = (A + count * buf_stride); + B_tmp = (B + count * buf_stride); + src_ptr = (src + count * src_stride); + dst_ptr = (dst + count * dst_stride); + w = width; + do { + s0 = vld1q_s16(src_ptr); + cross_sum_inp_u16(A_tmp, buf_stride, &a_res0, &a_res1); + a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0); + a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1); + + b_res0 = cross_sum_inp_s32(B_tmp, buf_stride); + b_res1 = cross_sum_inp_s32(B_tmp + 4, buf_stride); + a_res0 = vaddq_s32(a_res0, b_res0); + a_res1 = vaddq_s32(a_res1, b_res1); + + a_res0 = + vrshrq_n_s32(a_res0, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS); + a_res1 = + vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS); + vst1q_s32(dst_ptr, a_res0); + vst1q_s32(dst_ptr + 4, a_res1); + + A_tmp += 8; + B_tmp += 8; + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 0); + count++; + h -= 1; + } while (h > 0); +} + +static INLINE void restoration_fast_internal(uint16_t *dgd16, int width, + int height, int dgd_stride, + int32_t *dst, int dst_stride, + int bit_depth, int sgr_params_idx, + int radius_idx) { + const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + + const int buf_stride = ((width_ext + 3) & ~3) + 16; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + uint16_t A16_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *square_sum_buf = A_; + int32_t *sum_buf = B_; + uint16_t *tmp16_buf = A16_; + + assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r"); + assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 && + "Need SGRPROJ_BORDER_* >= r+1"); + + assert(radius_idx == 0); + assert(r == 2); + + // input(dgd16) is 16bit. + // sum of pixels 1st stage output will be in 16bit(tmp16_buf). End output is + // kept in 32bit [sum_buf]. sum of squares output is kept in 32bit + // buffer(square_sum_buf). + boxsum2((int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT - + SGRPROJ_BORDER_HORZ), + dgd_stride, (int16_t *)tmp16_buf, sum_buf, square_sum_buf, buf_stride, + width_ext, height_ext); + + square_sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + tmp16_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + + // Calculation of a, b. a output is in 16bit tmp_buf which is in range of + // [1, 256] for all bit depths. b output is kept in 32bit buffer. + + if (8 == bit_depth) { + calc_ab_fast_internal_lbd( + (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1), + (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, r, + params->s[radius_idx], 2); + } else { + calc_ab_fast_internal_hbd( + (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1), + (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, + bit_depth, r, params->s[radius_idx], 2); + } + final_filter_fast_internal(tmp16_buf, sum_buf, buf_stride, (int16_t *)dgd16, + dgd_stride, dst, dst_stride, width, height); +} + +static INLINE void restoration_internal(uint16_t *dgd16, int width, int height, + int dgd_stride, int32_t *dst, + int dst_stride, int bit_depth, + int sgr_params_idx, int radius_idx) { + const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + + int buf_stride = ((width_ext + 3) & ~3) + 16; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + uint16_t A16_[RESTORATION_PROC_UNIT_PELS]; + uint16_t B16_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *square_sum_buf = A_; + uint16_t *sum_buf = B16_; + uint16_t *A16 = A16_; + int32_t *B = B_; + + assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r"); + assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 && + "Need SGRPROJ_BORDER_* >= r+1"); + + assert(radius_idx == 1); + assert(r == 1); + + // input(dgd16) is 16bit. + // sum of pixels output will be in 16bit(sum_buf). + // sum of squares output is kept in 32bit buffer(square_sum_buf). + boxsum1((int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT - + SGRPROJ_BORDER_HORZ), + dgd_stride, sum_buf, square_sum_buf, buf_stride, width_ext, + height_ext); + + square_sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + A16 += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + + // Calculation of a, b. a output is in 16bit tmp_buf which is in range of + // [1, 256] for all bit depths. b output is kept in 32bit buffer. + if (8 == bit_depth) { + calc_ab_internal_lbd((square_sum_buf - buf_stride - 1), + (A16 - buf_stride - 1), (sum_buf - buf_stride - 1), + (B - buf_stride - 1), buf_stride, width + 2, + height + 2, r, params->s[radius_idx], 1); + } else { + calc_ab_internal_hbd((square_sum_buf - buf_stride - 1), + (A16 - buf_stride - 1), (sum_buf - buf_stride - 1), + (B - buf_stride - 1), buf_stride, width + 2, + height + 2, bit_depth, r, params->s[radius_idx], 1); + } + final_filter_internal(A16, B, buf_stride, (int16_t *)dgd16, dgd_stride, dst, + dst_stride, width, height); +} + +static INLINE void src_convert_u8_to_u16(const uint8_t *src, + const int src_stride, uint16_t *dst, + const int dst_stride, const int width, + const int height) { + const uint8_t *src_ptr; + uint16_t *dst_ptr; + int h, w, count = 0; + + uint8x8_t t1, t2, t3, t4; + uint16x8_t s1, s2, s3, s4; + h = height; + do { + src_ptr = src + (count << 2) * src_stride; + dst_ptr = dst + (count << 2) * dst_stride; + w = width; + if (w >= 7) { + do { + load_u8_8x4(src_ptr, src_stride, &t1, &t2, &t3, &t4); + s1 = vmovl_u8(t1); + s2 = vmovl_u8(t2); + s3 = vmovl_u8(t3); + s4 = vmovl_u8(t4); + store_u16_8x4(dst_ptr, dst_stride, s1, s2, s3, s4); + + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 7); + } + + for (int y = 0; y < w; y++) { + dst_ptr[y] = src_ptr[y]; + dst_ptr[y + 1 * dst_stride] = src_ptr[y + 1 * src_stride]; + dst_ptr[y + 2 * dst_stride] = src_ptr[y + 2 * src_stride]; + dst_ptr[y + 3 * dst_stride] = src_ptr[y + 3 * src_stride]; + } + count++; + h -= 4; + } while (h > 3); + + src_ptr = src + (count << 2) * src_stride; + dst_ptr = dst + (count << 2) * dst_stride; + for (int x = 0; x < h; x++) { + for (int y = 0; y < width; y++) { + dst_ptr[y + x * dst_stride] = src_ptr[y + x * src_stride]; + } + } +} + +static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride, + uint16_t *dst, const int dst_stride, + int width, int height) { + const uint16_t *src_ptr; + uint16_t *dst_ptr; + int h, w, count = 0; + uint16x8_t s1, s2, s3, s4; + + h = height; + do { + src_ptr = src + (count << 2) * src_stride; + dst_ptr = dst + (count << 2) * dst_stride; + w = width; + do { + load_u16_8x4(src_ptr, src_stride, &s1, &s2, &s3, &s4); + store_u16_8x4(dst_ptr, dst_stride, s1, s2, s3, s4); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 7); + + for (int y = 0; y < w; y++) { + dst_ptr[y] = src_ptr[y]; + dst_ptr[y + 1 * dst_stride] = src_ptr[y + 1 * src_stride]; + dst_ptr[y + 2 * dst_stride] = src_ptr[y + 2 * src_stride]; + dst_ptr[y + 3 * dst_stride] = src_ptr[y + 3 * src_stride]; + } + count++; + h -= 4; + } while (h > 3); + + src_ptr = src + (count << 2) * src_stride; + dst_ptr = dst + (count << 2) * dst_stride; + + for (int x = 0; x < h; x++) { + memcpy((dst_ptr + x * dst_stride), (src_ptr + x * src_stride), + sizeof(uint16_t) * width); + } +} + +void av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height, + int stride, int32_t *flt0, int32_t *flt1, + int flt_stride, int sgr_params_idx, + int bit_depth, int highbd) { + const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + assert(!(params->r[0] == 0 && params->r[1] == 0)); + + uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS]; + const int dgd16_stride = width + 2 * SGRPROJ_BORDER_HORZ; + uint16_t *dgd16 = + dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + const int dgd_stride = stride; + + if (highbd) { + const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8); + src_convert_hbd_copy( + dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, + dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); + } else { + src_convert_u8_to_u16( + dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, + dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); + } + + if (params->r[0] > 0) + restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, + flt_stride, bit_depth, sgr_params_idx, 0); + if (params->r[1] > 0) + restoration_internal(dgd16, width, height, dgd16_stride, flt1, flt_stride, + bit_depth, sgr_params_idx, 1); +} + +void apply_selfguided_restoration_neon(const uint8_t *dat8, int width, + int height, int stride, int eps, + const int *xqd, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth, int highbd) { + int32_t *flt0 = tmpbuf; + int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; + assert(width * height <= RESTORATION_UNITPELS_MAX); + uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS]; + const int dgd16_stride = width + 2 * SGRPROJ_BORDER_HORZ; + uint16_t *dgd16 = + dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + const int dgd_stride = stride; + const sgr_params_type *const params = &sgr_params[eps]; + int xq[2]; + + assert(!(params->r[0] == 0 && params->r[1] == 0)); + + if (highbd) { + const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8); + src_convert_hbd_copy( + dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, + dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); + } else { + src_convert_u8_to_u16( + dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, + dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); + } + + if (params->r[0] > 0) + restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, width, + bit_depth, eps, 0); + if (params->r[1] > 0) + restoration_internal(dgd16, width, height, dgd16_stride, flt1, width, + bit_depth, eps, 1); + + decode_xq(xqd, xq, params); + + { + int16_t *src_ptr; + uint8_t *dst_ptr; + uint16_t *dst16_ptr; + int16x4_t d0, d4; + int16x8_t r0, s0; + uint16x8_t r4; + int32x4_t u0, u4, v0, v4, f00, f10; + uint8x8_t t0; + int count = 0, w = width, h = height, rc = 0; + + const int32x4_t xq0_vec = vdupq_n_s32(xq[0]); + const int32x4_t xq1_vec = vdupq_n_s32(xq[1]); + const int16x8_t zero = vdupq_n_s16(0); + const uint16x8_t max = vdupq_n_u16((1 << bit_depth) - 1); + uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst8); + dst_ptr = dst8; + src_ptr = (int16_t *)dgd16; + do { + w = width; + count = 0; + dst_ptr = dst8 + rc * dst_stride; + dst16_ptr = dst16 + rc * dst_stride; + do { + s0 = vld1q_s16(src_ptr + count); + + u0 = vshll_n_s16(vget_low_s16(s0), SGRPROJ_RST_BITS); + u4 = vshll_n_s16(vget_high_s16(s0), SGRPROJ_RST_BITS); + + v0 = vshlq_n_s32(u0, SGRPROJ_PRJ_BITS); + v4 = vshlq_n_s32(u4, SGRPROJ_PRJ_BITS); + + if (params->r[0] > 0) { + f00 = vld1q_s32(flt0 + count); + f10 = vld1q_s32(flt0 + count + 4); + + f00 = vsubq_s32(f00, u0); + f10 = vsubq_s32(f10, u4); + + v0 = vmlaq_s32(v0, xq0_vec, f00); + v4 = vmlaq_s32(v4, xq0_vec, f10); + } + + if (params->r[1] > 0) { + f00 = vld1q_s32(flt1 + count); + f10 = vld1q_s32(flt1 + count + 4); + + f00 = vsubq_s32(f00, u0); + f10 = vsubq_s32(f10, u4); + + v0 = vmlaq_s32(v0, xq1_vec, f00); + v4 = vmlaq_s32(v4, xq1_vec, f10); + } + + d0 = vqrshrn_n_s32(v0, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + d4 = vqrshrn_n_s32(v4, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + + r0 = vcombine_s16(d0, d4); + + r4 = vreinterpretq_u16_s16(vmaxq_s16(r0, zero)); + + if (highbd) { + r4 = vminq_u16(r4, max); + vst1q_u16(dst16_ptr, r4); + } else { + t0 = vqmovn_u16(r4); + vst1_u8(dst_ptr, t0); + } + w -= 8; + count += 8; + dst_ptr += 8; + dst16_ptr += 8; + } while (w > 0); + + src_ptr += dgd16_stride; + flt1 += width; + flt0 += width; + rc++; + h--; + } while (h > 0); + } +} diff --git a/third_party/aom/av1/common/arm/transpose_neon.h b/third_party/aom/av1/common/arm/transpose_neon.h index 53727bb43..fe134087b 100644 --- a/third_party/aom/av1/common/arm/transpose_neon.h +++ b/third_party/aom/av1/common/arm/transpose_neon.h @@ -419,4 +419,42 @@ static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1, *a3 = vreinterpret_s16_s32(c1.val[1]); } +static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) { + int32x4x2_t b0; + b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1)); + b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1)); + return b0; +} + +static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1, + int32x4_t *a2, int32x4_t *a3) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + + const int32x4x2_t b0 = vtrnq_s32(*a0, *a1); + const int32x4x2_t b1 = vtrnq_s32(*a2, *a3); + + // Swap 64 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + + const int32x4x2_t c0 = aom_vtrnq_s64_to_s32(b0.val[0], b1.val[0]); + const int32x4x2_t c1 = aom_vtrnq_s64_to_s32(b0.val[1], b1.val[1]); + + *a0 = c0.val[0]; + *a1 = c1.val[0]; + *a2 = c0.val[1]; + *a3 = c1.val[1]; +} + #endif // AV1_COMMON_ARM_TRANSPOSE_NEON_H_ diff --git a/third_party/aom/av1/common/av1_loopfilter.c b/third_party/aom/av1/common/av1_loopfilter.c index 738290fad..9d68b8760 100644 --- a/third_party/aom/av1/common/av1_loopfilter.c +++ b/third_party/aom/av1/common/av1_loopfilter.c @@ -1308,7 +1308,7 @@ static int compare_ref_dst(AV1_COMMON *const cm, uint8_t *ref_buf, end <<= MI_SIZE_LOG2; uint8_t *ref0 = ref_buf; uint8_t *dst0 = dst_buf; - if (cm->use_highbitdepth) { + if (cm->seq_params.use_highbitdepth) { const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref_buf); const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst_buf); for (int j = 0; j < 4; ++j) { @@ -1404,11 +1404,11 @@ void av1_filter_block_plane_ver(AV1_COMMON *const cm, uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff; uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff; - if (cm->use_highbitdepth) + if (cm->seq_params.use_highbitdepth) highbd_filter_selectively_vert_row2( ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1, - &cm->lf_info, lfl, lfl2, (int)cm->bit_depth); + &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth); else filter_selectively_vert_row2(ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0, @@ -1474,10 +1474,11 @@ void av1_filter_block_plane_hor(AV1_COMMON *const cm, mask_8x8 = (mask_8x8 >> shift) & mask_cutoff; mask_4x4 = (mask_4x4 >> shift) & mask_cutoff; - if (cm->use_highbitdepth) - highbd_filter_selectively_horiz( - CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16, - mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->bit_depth); + if (cm->seq_params.use_highbitdepth) + highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf), + dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl, + (int)cm->seq_params.bit_depth); else filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, mask_8x8, mask_4x4, &cm->lf_info, lfl); @@ -1652,6 +1653,8 @@ void av1_filter_block_plane_vert(const AV1_COMMON *const cm, const int dst_stride = plane_ptr->dst.stride; const int y_range = (MAX_MIB_SIZE >> scale_vert); const int x_range = (MAX_MIB_SIZE >> scale_horz); + const int use_highbitdepth = cm->seq_params.use_highbitdepth; + const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth; for (int y = 0; y < y_range; y += row_step) { uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride; for (int x = 0; x < x_range;) { @@ -1677,40 +1680,40 @@ void av1_filter_block_plane_vert(const AV1_COMMON *const cm, switch (params.filter_length) { // apply 4-tap filtering case 4: - if (cm->use_highbitdepth) + if (use_highbitdepth) aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(p), dst_stride, params.mblim, params.lim, params.hev_thr, - cm->bit_depth); + bit_depth); else aom_lpf_vertical_4(p, dst_stride, params.mblim, params.lim, params.hev_thr); break; case 6: // apply 6-tap filter for chroma plane only assert(plane != 0); - if (cm->use_highbitdepth) + if (use_highbitdepth) aom_highbd_lpf_vertical_6(CONVERT_TO_SHORTPTR(p), dst_stride, params.mblim, params.lim, params.hev_thr, - cm->bit_depth); + bit_depth); else aom_lpf_vertical_6(p, dst_stride, params.mblim, params.lim, params.hev_thr); break; // apply 8-tap filtering case 8: - if (cm->use_highbitdepth) + if (use_highbitdepth) aom_highbd_lpf_vertical_8(CONVERT_TO_SHORTPTR(p), dst_stride, params.mblim, params.lim, params.hev_thr, - cm->bit_depth); + bit_depth); else aom_lpf_vertical_8(p, dst_stride, params.mblim, params.lim, params.hev_thr); break; // apply 14-tap filtering case 14: - if (cm->use_highbitdepth) + if (use_highbitdepth) aom_highbd_lpf_vertical_14(CONVERT_TO_SHORTPTR(p), dst_stride, params.mblim, params.lim, params.hev_thr, - cm->bit_depth); + bit_depth); else aom_lpf_vertical_14(p, dst_stride, params.mblim, params.lim, params.hev_thr); @@ -1737,6 +1740,8 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm, const int dst_stride = plane_ptr->dst.stride; const int y_range = (MAX_MIB_SIZE >> scale_vert); const int x_range = (MAX_MIB_SIZE >> scale_horz); + const int use_highbitdepth = cm->seq_params.use_highbitdepth; + const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth; for (int x = 0; x < x_range; x += col_step) { uint8_t *p = dst_ptr + x * MI_SIZE; for (int y = 0; y < y_range;) { @@ -1762,10 +1767,10 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm, switch (params.filter_length) { // apply 4-tap filtering case 4: - if (cm->use_highbitdepth) + if (use_highbitdepth) aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(p), dst_stride, params.mblim, params.lim, - params.hev_thr, cm->bit_depth); + params.hev_thr, bit_depth); else aom_lpf_horizontal_4(p, dst_stride, params.mblim, params.lim, params.hev_thr); @@ -1773,30 +1778,30 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm, // apply 6-tap filtering case 6: assert(plane != 0); - if (cm->use_highbitdepth) + if (use_highbitdepth) aom_highbd_lpf_horizontal_6(CONVERT_TO_SHORTPTR(p), dst_stride, params.mblim, params.lim, - params.hev_thr, cm->bit_depth); + params.hev_thr, bit_depth); else aom_lpf_horizontal_6(p, dst_stride, params.mblim, params.lim, params.hev_thr); break; // apply 8-tap filtering case 8: - if (cm->use_highbitdepth) + if (use_highbitdepth) aom_highbd_lpf_horizontal_8(CONVERT_TO_SHORTPTR(p), dst_stride, params.mblim, params.lim, - params.hev_thr, cm->bit_depth); + params.hev_thr, bit_depth); else aom_lpf_horizontal_8(p, dst_stride, params.mblim, params.lim, params.hev_thr); break; // apply 14-tap filtering case 14: - if (cm->use_highbitdepth) + if (use_highbitdepth) aom_highbd_lpf_horizontal_14(CONVERT_TO_SHORTPTR(p), dst_stride, params.mblim, params.lim, - params.hev_thr, cm->bit_depth); + params.hev_thr, bit_depth); else aom_lpf_horizontal_14(p, dst_stride, params.mblim, params.lim, params.hev_thr); diff --git a/third_party/aom/av1/common/av1_rtcd.c b/third_party/aom/av1/common/av1_rtcd.c index 38e26bee1..a77a4d254 100644 --- a/third_party/aom/av1/common/av1_rtcd.c +++ b/third_party/aom/av1/common/av1_rtcd.c @@ -16,7 +16,7 @@ #include "aom_ports/aom_once.h" void av1_rtcd() { - // TODO(JBB): Remove this once, by insuring that both the encoder and - // decoder setup functions are protected by once(); - once(setup_rtcd_internal); + // TODO(JBB): Remove this aom_once, by insuring that both the encoder and + // decoder setup functions are protected by aom_once(); + aom_once(setup_rtcd_internal); } diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl index 6aa925515..fa8b34981 100755 --- a/third_party/aom/av1/common/av1_rtcd_defs.pl +++ b/third_party/aom/av1/common/av1_rtcd_defs.pl @@ -106,7 +106,7 @@ specialize qw/av1_highbd_convolve8_vert/, "$sse2_x86_64"; #inv txfm add_proto qw/void av1_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_inv_txfm_add ssse3 avx2/; +specialize qw/av1_inv_txfm_add ssse3 avx2 neon/; add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; @@ -181,7 +181,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { #fwd txfm add_proto qw/void av1_lowbd_fwd_txfm/, "const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param"; - specialize qw/av1_lowbd_fwd_txfm sse2 sse4_1/; + specialize qw/av1_lowbd_fwd_txfm sse2 sse4_1 avx2/; add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; @@ -241,11 +241,11 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/av1_txb_init_levels sse4_1/; add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N"; - specialize qw/av1_wedge_sse_from_residuals sse2/; + specialize qw/av1_wedge_sse_from_residuals sse2 avx2/; add_proto qw/int av1_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit"; - specialize qw/av1_wedge_sign_from_residuals sse2/; + specialize qw/av1_wedge_sign_from_residuals sse2 avx2/; add_proto qw/void av1_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N"; - specialize qw/av1_wedge_compute_delta_squares sse2/; + specialize qw/av1_wedge_compute_delta_squares sse2 avx2/; # hash add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, int length"; @@ -288,34 +288,34 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { # LOOP_RESTORATION functions add_proto qw/void apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd"; -specialize qw/apply_selfguided_restoration sse4_1 avx2/; +specialize qw/apply_selfguided_restoration sse4_1 avx2 neon/; add_proto qw/void av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height, int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride, int sgr_params_idx, int bit_depth, int highbd"; -specialize qw/av1_selfguided_restoration sse4_1 avx2/; +specialize qw/av1_selfguided_restoration sse4_1 avx2 neon/; # CONVOLVE_ROUND/COMPOUND_ROUND functions -add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_jnt_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_jnt_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_highbd_convolve_2d_copy_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; -add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; -add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; -add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; -add_proto qw/void av1_highbd_jnt_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; -add_proto qw/void av1_highbd_jnt_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; -add_proto qw/void av1_highbd_jnt_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; -add_proto qw/void av1_highbd_jnt_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; - - add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params"; - add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd"; +add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; +add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; +add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; +add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; +add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; +add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; +add_proto qw/void av1_jnt_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; +add_proto qw/void av1_jnt_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; +add_proto qw/void av1_highbd_convolve_2d_copy_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; +add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; +add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; +add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; +add_proto qw/void av1_highbd_jnt_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; +add_proto qw/void av1_highbd_jnt_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; +add_proto qw/void av1_highbd_jnt_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; +add_proto qw/void av1_highbd_jnt_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; + + add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params"; + add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd"; specialize qw/av1_convolve_2d_sr sse2 avx2 neon/; specialize qw/av1_convolve_2d_copy_sr sse2 avx2 neon/; diff --git a/third_party/aom/av1/common/av1_txfm.h b/third_party/aom/av1/common/av1_txfm.h index 5db3233f5..c9cc79852 100644 --- a/third_party/aom/av1/common/av1_txfm.h +++ b/third_party/aom/av1/common/av1_txfm.h @@ -171,53 +171,6 @@ static INLINE void set_flip_cfg(TX_TYPE tx_type, TXFM_2D_FLIP_CFG *cfg) { get_flip_cfg(tx_type, &cfg->ud_flip, &cfg->lr_flip); } -static INLINE TX_SIZE av1_rotate_tx_size(TX_SIZE tx_size) { - switch (tx_size) { - case TX_4X4: return TX_4X4; - case TX_8X8: return TX_8X8; - case TX_16X16: return TX_16X16; - case TX_32X32: return TX_32X32; - case TX_64X64: return TX_64X64; - case TX_32X64: return TX_64X32; - case TX_64X32: return TX_32X64; - case TX_4X8: return TX_8X4; - case TX_8X4: return TX_4X8; - case TX_8X16: return TX_16X8; - case TX_16X8: return TX_8X16; - case TX_16X32: return TX_32X16; - case TX_32X16: return TX_16X32; - case TX_4X16: return TX_16X4; - case TX_16X4: return TX_4X16; - case TX_8X32: return TX_32X8; - case TX_32X8: return TX_8X32; - case TX_16X64: return TX_64X16; - case TX_64X16: return TX_16X64; - default: assert(0); return TX_INVALID; - } -} - -static INLINE TX_TYPE av1_rotate_tx_type(TX_TYPE tx_type) { - switch (tx_type) { - case DCT_DCT: return DCT_DCT; - case ADST_DCT: return DCT_ADST; - case DCT_ADST: return ADST_DCT; - case ADST_ADST: return ADST_ADST; - case FLIPADST_DCT: return DCT_FLIPADST; - case DCT_FLIPADST: return FLIPADST_DCT; - case FLIPADST_FLIPADST: return FLIPADST_FLIPADST; - case ADST_FLIPADST: return FLIPADST_ADST; - case FLIPADST_ADST: return ADST_FLIPADST; - case IDTX: return IDTX; - case V_DCT: return H_DCT; - case H_DCT: return V_DCT; - case V_ADST: return H_ADST; - case H_ADST: return V_ADST; - case V_FLIPADST: return H_FLIPADST; - case H_FLIPADST: return V_FLIPADST; - default: assert(0); return TX_TYPES; - } -} - // Utility function that returns the log of the ratio of the col and row // sizes. static INLINE int get_rect_tx_log_ratio(int col, int row) { diff --git a/third_party/aom/av1/common/blockd.h b/third_party/aom/av1/common/blockd.h index 3e8d1d6c6..979f13bd9 100644 --- a/third_party/aom/av1/common/blockd.h +++ b/third_party/aom/av1/common/blockd.h @@ -605,6 +605,12 @@ static INLINE int get_bitdepth_data_path_index(const MACROBLOCKD *xd) { return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0; } +static INLINE uint8_t *get_buf_by_bd(const MACROBLOCKD *xd, uint8_t *buf16) { + return (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + ? CONVERT_TO_BYTEPTR(buf16) + : buf16; +} + static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) { switch (bsize) { case BLOCK_4X4: return 0; @@ -674,6 +680,15 @@ static const int av1_ext_tx_used[EXT_TX_SET_TYPES][TX_TYPES] = { { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, }; +static const uint16_t av1_ext_tx_used_flag[EXT_TX_SET_TYPES] = { + 0x0001, // 0000 0000 0000 0001 + 0x0201, // 0000 0010 0000 0001 + 0x020F, // 0000 0010 0000 1111 + 0x0E0F, // 0000 1110 0000 1111 + 0x0FFF, // 0000 1111 1111 1111 + 0xFFFF, // 1111 1111 1111 1111 +}; + static INLINE TxSetType av1_get_ext_tx_set_type(TX_SIZE tx_size, int is_inter, int use_reduced_set) { const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size]; @@ -1145,38 +1160,6 @@ static INLINE PLANE_TYPE get_plane_type(int plane) { return (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV; } -static INLINE void transpose_uint8(uint8_t *dst, int dst_stride, - const uint8_t *src, int src_stride, int w, - int h) { - int r, c; - for (r = 0; r < h; ++r) - for (c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c]; -} - -static INLINE void transpose_uint16(uint16_t *dst, int dst_stride, - const uint16_t *src, int src_stride, int w, - int h) { - int r, c; - for (r = 0; r < h; ++r) - for (c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c]; -} - -static INLINE void transpose_int16(int16_t *dst, int dst_stride, - const int16_t *src, int src_stride, int w, - int h) { - int r, c; - for (r = 0; r < h; ++r) - for (c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c]; -} - -static INLINE void transpose_int32(int32_t *dst, int dst_stride, - const int32_t *src, int src_stride, int w, - int h) { - int r, c; - for (r = 0; r < h; ++r) - for (c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c]; -} - static INLINE int av1_get_max_eob(TX_SIZE tx_size) { if (tx_size == TX_64X64 || tx_size == TX_64X32 || tx_size == TX_32X64) { return 1024; diff --git a/third_party/aom/av1/common/cdef.c b/third_party/aom/av1/common/cdef.c index c9b974900..e9e2b0e42 100644 --- a/third_party/aom/av1/common/cdef.c +++ b/third_party/aom/av1/common/cdef.c @@ -110,7 +110,7 @@ void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, static void copy_sb8_16(AOM_UNUSED AV1_COMMON *cm, uint16_t *dst, int dstride, const uint8_t *src, int src_voffset, int src_hoffset, int sstride, int vsize, int hsize) { - if (cm->use_highbitdepth) { + if (cm->seq_params.use_highbitdepth) { const uint16_t *base = &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset]; copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize); @@ -153,7 +153,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, int mi_high_l2[3]; int xdec[3]; int ydec[3]; - int coeff_shift = AOMMAX(cm->bit_depth - 8, 0); + int coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0); const int nvfb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; const int nhfb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0, @@ -363,7 +363,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); } - if (cm->use_highbitdepth) { + if (cm->seq_params.use_highbitdepth) { cdef_filter_fb( NULL, &CONVERT_TO_SHORTPTR( diff --git a/third_party/aom/av1/common/cfl.c b/third_party/aom/av1/common/cfl.c index ee19f0bcf..ccc59b4eb 100644 --- a/third_party/aom/av1/common/cfl.c +++ b/third_party/aom/av1/common/cfl.c @@ -15,21 +15,14 @@ #include "config/av1_rtcd.h" -void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm) { +void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params) { assert(block_size_wide[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE); assert(block_size_high[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE); - if (!(cm->subsampling_x == 0 && cm->subsampling_y == 0) && - !(cm->subsampling_x == 1 && cm->subsampling_y == 1) && - !(cm->subsampling_x == 1 && cm->subsampling_y == 0)) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, - "Only 4:4:4, 4:2:2 and 4:2:0 are currently supported by " - "CfL, %d %d subsampling is not supported.\n", - cm->subsampling_x, cm->subsampling_y); - } + memset(&cfl->recon_buf_q3, 0, sizeof(cfl->recon_buf_q3)); memset(&cfl->ac_buf_q3, 0, sizeof(cfl->ac_buf_q3)); - cfl->subsampling_x = cm->subsampling_x; - cfl->subsampling_y = cm->subsampling_y; + cfl->subsampling_x = seq_params->subsampling_x; + cfl->subsampling_y = seq_params->subsampling_y; cfl->are_parameters_computed = 0; cfl->store_y = 0; // The DC_PRED cache is disabled by default and is only enabled in diff --git a/third_party/aom/av1/common/convolve.c b/third_party/aom/av1/common/convolve.c index d57f44f8b..ed962c722 100644 --- a/third_party/aom/av1/common/convolve.c +++ b/third_party/aom/av1/common/convolve.c @@ -75,8 +75,8 @@ void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; @@ -91,7 +91,7 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, // horizontal filter const uint8_t *src_horiz = src - fo_vert * src_stride; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_q4 & SUBPEL_MASK); for (int y = 0; y < im_h; ++y) { for (int x = 0; x < w; ++x) { int32_t sum = (1 << (bd + FILTER_BITS - 1)); @@ -107,7 +107,7 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, // vertical filter int16_t *src_vert = im_block + fo_vert * im_stride; const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_q4 & SUBPEL_MASK); const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { @@ -126,8 +126,8 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { const int fo_vert = filter_params_y->taps / 2 - 1; @@ -141,7 +141,7 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, // vertical filter const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_q4 & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; @@ -156,8 +156,8 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { const int fo_horiz = filter_params_x->taps / 2 - 1; @@ -172,7 +172,7 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, // horizontal filter const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_q4 & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; @@ -187,8 +187,8 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { (void)filter_params_x; @@ -204,8 +204,8 @@ void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; @@ -222,7 +222,7 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8, // horizontal filter const uint8_t *src_horiz = src - fo_vert * src_stride; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_q4 & SUBPEL_MASK); for (int y = 0; y < im_h; ++y) { for (int x = 0; x < w; ++x) { int32_t sum = (1 << (bd + FILTER_BITS - 1)); @@ -238,7 +238,7 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8, // vertical filter int16_t *src_vert = im_block + fo_vert * im_stride; const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_q4 & SUBPEL_MASK); const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { @@ -270,8 +270,8 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8, void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; @@ -289,7 +289,7 @@ void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8, // vertical filter const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_q4 & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; @@ -320,8 +320,8 @@ void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8, void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; @@ -339,7 +339,7 @@ void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8, // horizontal filter const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_q4 & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; @@ -370,8 +370,8 @@ void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8, void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; @@ -412,8 +412,8 @@ void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params) { @@ -439,7 +439,7 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8, const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; assert(x_filter_idx < SUBPEL_SHIFTS); const int16_t *x_filter = - av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx); + av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx); int32_t sum = (1 << (bd + FILTER_BITS - 1)); for (int k = 0; k < filter_params_x->taps; ++k) { sum += x_filter[k] * src_x[k - fo_horiz]; @@ -461,7 +461,7 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8, const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; assert(y_filter_idx < SUBPEL_SHIFTS); const int16_t *y_filter = - av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx); + av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx); int32_t sum = 1 << offset_bits; for (int k = 0; k < filter_params_y->taps; ++k) { sum += y_filter[k] * src_y[(k - fo_vert) * im_stride]; @@ -498,8 +498,8 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8, static void convolve_2d_scale_wrapper( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, const int subpel_x_qn, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params) { if (conv_params->is_compound) { @@ -520,25 +520,27 @@ void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, (void)y_step_q4; (void)dst; (void)dst_stride; - - InterpFilterParams filter_params_x, filter_params_y; - av1_get_convolve_filter_params(interp_filters, &filter_params_x, - &filter_params_y, w, h); + InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1); + InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0); + const InterpFilterParams *filter_params_x = + av1_get_interp_filter_params_with_block_size(filter_x, w); + const InterpFilterParams *filter_params_y = + av1_get_interp_filter_params_with_block_size(filter_y, h); if (scaled) convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h, - &filter_params_x, &filter_params_y, subpel_x_q4, + filter_params_x, filter_params_y, subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, conv_params); else sf->convolve[subpel_x_q4 != 0][subpel_y_q4 != 0][conv_params->is_compound]( - src, src_stride, dst, dst_stride, w, h, &filter_params_x, - &filter_params_y, subpel_x_q4, subpel_y_q4, conv_params); + src, src_stride, dst, dst_stride, w, h, filter_params_x, + filter_params_y, subpel_x_q4, subpel_y_q4, conv_params); } void av1_highbd_convolve_2d_copy_sr_c( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, const int subpel_x_q4, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { (void)filter_params_x; (void)filter_params_y; @@ -554,8 +556,8 @@ void av1_highbd_convolve_2d_copy_sr_c( void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { const int fo_horiz = filter_params_x->taps / 2 - 1; @@ -569,7 +571,7 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, // horizontal filter const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_q4 & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; @@ -585,8 +587,8 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { const int fo_vert = filter_params_y->taps / 2 - 1; @@ -599,7 +601,7 @@ void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); // vertical filter const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_q4 & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; @@ -614,8 +616,8 @@ void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; @@ -630,7 +632,7 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, // horizontal filter const uint16_t *src_horiz = src - fo_vert * src_stride; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_q4 & SUBPEL_MASK); for (int y = 0; y < im_h; ++y) { for (int x = 0; x < w; ++x) { int32_t sum = (1 << (bd + FILTER_BITS - 1)); @@ -646,7 +648,7 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, // vertical filter int16_t *src_vert = im_block + fo_vert * im_stride; const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_q4 & SUBPEL_MASK); const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { @@ -666,8 +668,9 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { int x, y, k; @@ -685,7 +688,7 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, // horizontal filter const uint16_t *src_horiz = src - fo_vert * src_stride; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_q4 & SUBPEL_MASK); for (y = 0; y < im_h; ++y) { for (x = 0; x < w; ++x) { int32_t sum = (1 << (bd + FILTER_BITS - 1)); @@ -703,7 +706,7 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, int16_t *src_vert = im_block + fo_vert * im_stride; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_q4 & SUBPEL_MASK); for (y = 0; y < h; ++y) { for (x = 0; x < w; ++x) { int32_t sum = 1 << offset_bits; @@ -734,8 +737,9 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; @@ -753,7 +757,7 @@ void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, assert(bits >= 0); // horizontal filter const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_q4 & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; @@ -784,8 +788,9 @@ void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; @@ -803,7 +808,7 @@ void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, assert(bits >= 0); // vertical filter const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_q4 & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; @@ -834,8 +839,8 @@ void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, void av1_highbd_jnt_convolve_2d_copy_c( const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride, - int w, int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, const int subpel_x_q4, + int w, int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; @@ -875,8 +880,8 @@ void av1_highbd_jnt_convolve_2d_copy_c( void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params, int bd) { @@ -900,7 +905,7 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; assert(x_filter_idx < SUBPEL_SHIFTS); const int16_t *x_filter = - av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx); + av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx); int32_t sum = (1 << (bd + FILTER_BITS - 1)); for (int k = 0; k < filter_params_x->taps; ++k) { sum += x_filter[k] * src_x[k - fo_horiz]; @@ -922,7 +927,7 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; assert(y_filter_idx < SUBPEL_SHIFTS); const int16_t *y_filter = - av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx); + av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx); int32_t sum = 1 << offset_bits; for (int k = 0; k < filter_params_y->taps; ++k) { sum += y_filter[k] * src_y[(k - fo_vert) * im_stride]; @@ -971,9 +976,12 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, (void)dst_stride; const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - InterpFilterParams filter_params_x, filter_params_y; - av1_get_convolve_filter_params(interp_filters, &filter_params_x, - &filter_params_y, w, h); + InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1); + InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0); + const InterpFilterParams *filter_params_x = + av1_get_interp_filter_params_with_block_size(filter_x, w); + const InterpFilterParams *filter_params_y = + av1_get_interp_filter_params_with_block_size(filter_y, h); if (scaled) { uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); @@ -981,16 +989,16 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, assert(conv_params->dst != NULL); } av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, - &filter_params_x, &filter_params_y, - subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, - conv_params, bd); + filter_params_x, filter_params_y, subpel_x_q4, + x_step_q4, subpel_y_q4, y_step_q4, conv_params, + bd); } else { uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); sf->highbd_convolve[subpel_x_q4 != 0][subpel_y_q4 != 0][conv_params->is_compound]( - src, src_stride, dst, dst_stride, w, h, &filter_params_x, - &filter_params_y, subpel_x_q4, subpel_y_q4, conv_params, bd); + src, src_stride, dst, dst_stride, w, h, filter_params_x, + filter_params_y, subpel_x_q4, subpel_y_q4, conv_params, bd); } } diff --git a/third_party/aom/av1/common/convolve.h b/third_party/aom/av1/common/convolve.h index 1b2c2d0d5..bc2d4bccf 100644 --- a/third_party/aom/av1/common/convolve.h +++ b/third_party/aom/av1/common/convolve.h @@ -40,27 +40,17 @@ typedef struct ConvolveParams { typedef void (*aom_convolve_fn_t)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); typedef void (*aom_highbd_convolve_fn_t)( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, const int subpel_x_q4, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); -static INLINE void av1_get_convolve_filter_params(InterpFilters interp_filters, - InterpFilterParams *params_x, - InterpFilterParams *params_y, - int w, int h) { - InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1); - InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0); - *params_x = av1_get_interp_filter_params_with_block_size(filter_x, w); - *params_y = av1_get_interp_filter_params_with_block_size(filter_y, h); -} - struct AV1Common; struct scale_factors; diff --git a/third_party/aom/av1/common/enums.h b/third_party/aom/av1/common/enums.h index a37ee9f24..689c25f30 100644 --- a/third_party/aom/av1/common/enums.h +++ b/third_party/aom/av1/common/enums.h @@ -557,6 +557,7 @@ typedef uint8_t TXFM_CONTEXT; #define BWDREF_FRAME 5 #define ALTREF2_FRAME 6 #define ALTREF_FRAME 7 +#define EXTREF_FRAME REF_FRAMES #define LAST_REF_FRAMES (LAST3_FRAME - LAST_FRAME + 1) #define INTER_REFS_PER_FRAME (ALTREF_FRAME - LAST_FRAME + 1) @@ -607,6 +608,7 @@ typedef enum ATTRIBUTE_PACKED { // In large_scale_tile coding, external references are used. #define MAX_EXTERNAL_REFERENCES 128 +#define MAX_TILES 512 #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/common/filter.c b/third_party/aom/av1/common/filter.c deleted file mode 100644 index a7e67ea4a..000000000 --- a/third_party/aom/av1/common/filter.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> - -#include "av1/common/filter.h" - -DECLARE_ALIGNED(256, static const InterpKernel, - bilinear_filters[SUBPEL_SHIFTS]) = { - { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 }, - { 0, 0, 0, 112, 16, 0, 0, 0 }, { 0, 0, 0, 104, 24, 0, 0, 0 }, - { 0, 0, 0, 96, 32, 0, 0, 0 }, { 0, 0, 0, 88, 40, 0, 0, 0 }, - { 0, 0, 0, 80, 48, 0, 0, 0 }, { 0, 0, 0, 72, 56, 0, 0, 0 }, - { 0, 0, 0, 64, 64, 0, 0, 0 }, { 0, 0, 0, 56, 72, 0, 0, 0 }, - { 0, 0, 0, 48, 80, 0, 0, 0 }, { 0, 0, 0, 40, 88, 0, 0, 0 }, - { 0, 0, 0, 32, 96, 0, 0, 0 }, { 0, 0, 0, 24, 104, 0, 0, 0 }, - { 0, 0, 0, 16, 112, 0, 0, 0 }, { 0, 0, 0, 8, 120, 0, 0, 0 } -}; - -DECLARE_ALIGNED(256, static const InterpKernel, - sub_pel_filters_8[SUBPEL_SHIFTS]) = { - { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, -6, 126, 8, -2, 0, 0 }, - { 0, 2, -10, 122, 18, -4, 0, 0 }, { 0, 2, -12, 116, 28, -8, 2, 0 }, - { 0, 2, -14, 110, 38, -10, 2, 0 }, { 0, 2, -14, 102, 48, -12, 2, 0 }, - { 0, 2, -16, 94, 58, -12, 2, 0 }, { 0, 2, -14, 84, 66, -12, 2, 0 }, - { 0, 2, -14, 76, 76, -14, 2, 0 }, { 0, 2, -12, 66, 84, -14, 2, 0 }, - { 0, 2, -12, 58, 94, -16, 2, 0 }, { 0, 2, -12, 48, 102, -14, 2, 0 }, - { 0, 2, -10, 38, 110, -14, 2, 0 }, { 0, 2, -8, 28, 116, -12, 2, 0 }, - { 0, 0, -4, 18, 122, -10, 2, 0 }, { 0, 0, -2, 8, 126, -6, 2, 0 } -}; - -DECLARE_ALIGNED(256, static const InterpKernel, - sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = { - { 0, 0, 0, 128, 0, 0, 0, 0 }, { -2, 2, -6, 126, 8, -2, 2, 0 }, - { -2, 6, -12, 124, 16, -6, 4, -2 }, { -2, 8, -18, 120, 26, -10, 6, -2 }, - { -4, 10, -22, 116, 38, -14, 6, -2 }, { -4, 10, -22, 108, 48, -18, 8, -2 }, - { -4, 10, -24, 100, 60, -20, 8, -2 }, { -4, 10, -24, 90, 70, -22, 10, -2 }, - { -4, 12, -24, 80, 80, -24, 12, -4 }, { -2, 10, -22, 70, 90, -24, 10, -4 }, - { -2, 8, -20, 60, 100, -24, 10, -4 }, { -2, 8, -18, 48, 108, -22, 10, -4 }, - { -2, 6, -14, 38, 116, -22, 10, -4 }, { -2, 6, -10, 26, 120, -18, 8, -2 }, - { -2, 4, -6, 16, 124, -12, 6, -2 }, { 0, 2, -2, 8, 126, -6, 2, -2 } -}; - -DECLARE_ALIGNED(256, static const InterpKernel, - sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = { - { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, 28, 62, 34, 2, 0, 0 }, - { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 }, - { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 }, - { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, -2, 16, 54, 48, 12, 0, 0 }, - { 0, -2, 14, 52, 52, 14, -2, 0 }, { 0, 0, 12, 48, 54, 16, -2, 0 }, - { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 }, - { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 }, - { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 28, 2, 0 } -}; - -static const InterpFilterParams - av1_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = { - { (const int16_t *)sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS, - EIGHTTAP_REGULAR }, - { (const int16_t *)sub_pel_filters_8smooth, SUBPEL_TAPS, SUBPEL_SHIFTS, - EIGHTTAP_SMOOTH }, - { (const int16_t *)sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS, - MULTITAP_SHARP }, - { (const int16_t *)bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS, - BILINEAR } - }; - -DECLARE_ALIGNED(256, static const InterpKernel, - sub_pel_filters_4[SUBPEL_SHIFTS]) = { - { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -4, 126, 8, -2, 0, 0 }, - { 0, 0, -8, 122, 18, -4, 0, 0 }, { 0, 0, -10, 116, 28, -6, 0, 0 }, - { 0, 0, -12, 110, 38, -8, 0, 0 }, { 0, 0, -12, 102, 48, -10, 0, 0 }, - { 0, 0, -14, 94, 58, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 }, - { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 }, - { 0, 0, -10, 58, 94, -14, 0, 0 }, { 0, 0, -10, 48, 102, -12, 0, 0 }, - { 0, 0, -8, 38, 110, -12, 0, 0 }, { 0, 0, -6, 28, 116, -10, 0, 0 }, - { 0, 0, -4, 18, 122, -8, 0, 0 }, { 0, 0, -2, 8, 126, -4, 0, 0 } -}; -DECLARE_ALIGNED(256, static const InterpKernel, - sub_pel_filters_4smooth[SUBPEL_SHIFTS]) = { - { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 30, 62, 34, 2, 0, 0 }, - { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 }, - { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 }, - { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, 0, 14, 54, 48, 12, 0, 0 }, - { 0, 0, 12, 52, 52, 12, 0, 0 }, { 0, 0, 12, 48, 54, 14, 0, 0 }, - { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 }, - { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 }, - { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 30, 0, 0 } -}; - -static const InterpFilterParams av1_interp_4tap[2] = { - { (const int16_t *)sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS, - EIGHTTAP_REGULAR }, - { (const int16_t *)sub_pel_filters_4smooth, SUBPEL_TAPS, SUBPEL_SHIFTS, - EIGHTTAP_SMOOTH }, -}; - -InterpFilterParams av1_get_interp_filter_params_with_block_size( - const InterpFilter interp_filter, const int w) { - if (w <= 4 && - (interp_filter == MULTITAP_SHARP || interp_filter == EIGHTTAP_REGULAR)) - return av1_interp_4tap[0]; - else if (w <= 4 && interp_filter == EIGHTTAP_SMOOTH) - return av1_interp_4tap[1]; - - return av1_interp_filter_params_list[interp_filter]; -} - -const int16_t *av1_get_interp_filter_kernel(const InterpFilter interp_filter) { - return (const int16_t *)av1_interp_filter_params_list[interp_filter] - .filter_ptr; -} diff --git a/third_party/aom/av1/common/filter.h b/third_party/aom/av1/common/filter.h index 0c24ad9d0..7f8ad583a 100644 --- a/third_party/aom/av1/common/filter.h +++ b/third_party/aom/av1/common/filter.h @@ -64,8 +64,8 @@ static INLINE InterpFilter av1_unswitchable_filter(InterpFilter filter) { return filter == SWITCHABLE ? EIGHTTAP_REGULAR : filter; } -#define LOG_SWITCHABLE_FILTERS \ - 2 /* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */ +/* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */ +#define LOG_SWITCHABLE_FILTERS 2 #define MAX_SUBPEL_TAPS 12 #define SWITCHABLE_FILTER_CONTEXTS ((SWITCHABLE_FILTERS + 1) * 4) @@ -79,14 +79,116 @@ typedef struct InterpFilterParams { InterpFilter interp_filter; } InterpFilterParams; -const int16_t *av1_get_interp_filter_kernel(const InterpFilter interp_filter); +DECLARE_ALIGNED(256, static const InterpKernel, + av1_bilinear_filters[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 }, + { 0, 0, 0, 112, 16, 0, 0, 0 }, { 0, 0, 0, 104, 24, 0, 0, 0 }, + { 0, 0, 0, 96, 32, 0, 0, 0 }, { 0, 0, 0, 88, 40, 0, 0, 0 }, + { 0, 0, 0, 80, 48, 0, 0, 0 }, { 0, 0, 0, 72, 56, 0, 0, 0 }, + { 0, 0, 0, 64, 64, 0, 0, 0 }, { 0, 0, 0, 56, 72, 0, 0, 0 }, + { 0, 0, 0, 48, 80, 0, 0, 0 }, { 0, 0, 0, 40, 88, 0, 0, 0 }, + { 0, 0, 0, 32, 96, 0, 0, 0 }, { 0, 0, 0, 24, 104, 0, 0, 0 }, + { 0, 0, 0, 16, 112, 0, 0, 0 }, { 0, 0, 0, 8, 120, 0, 0, 0 } +}; + +DECLARE_ALIGNED(256, static const InterpKernel, + av1_sub_pel_filters_8[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, -6, 126, 8, -2, 0, 0 }, + { 0, 2, -10, 122, 18, -4, 0, 0 }, { 0, 2, -12, 116, 28, -8, 2, 0 }, + { 0, 2, -14, 110, 38, -10, 2, 0 }, { 0, 2, -14, 102, 48, -12, 2, 0 }, + { 0, 2, -16, 94, 58, -12, 2, 0 }, { 0, 2, -14, 84, 66, -12, 2, 0 }, + { 0, 2, -14, 76, 76, -14, 2, 0 }, { 0, 2, -12, 66, 84, -14, 2, 0 }, + { 0, 2, -12, 58, 94, -16, 2, 0 }, { 0, 2, -12, 48, 102, -14, 2, 0 }, + { 0, 2, -10, 38, 110, -14, 2, 0 }, { 0, 2, -8, 28, 116, -12, 2, 0 }, + { 0, 0, -4, 18, 122, -10, 2, 0 }, { 0, 0, -2, 8, 126, -6, 2, 0 } +}; + +DECLARE_ALIGNED(256, static const InterpKernel, + av1_sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { -2, 2, -6, 126, 8, -2, 2, 0 }, + { -2, 6, -12, 124, 16, -6, 4, -2 }, { -2, 8, -18, 120, 26, -10, 6, -2 }, + { -4, 10, -22, 116, 38, -14, 6, -2 }, { -4, 10, -22, 108, 48, -18, 8, -2 }, + { -4, 10, -24, 100, 60, -20, 8, -2 }, { -4, 10, -24, 90, 70, -22, 10, -2 }, + { -4, 12, -24, 80, 80, -24, 12, -4 }, { -2, 10, -22, 70, 90, -24, 10, -4 }, + { -2, 8, -20, 60, 100, -24, 10, -4 }, { -2, 8, -18, 48, 108, -22, 10, -4 }, + { -2, 6, -14, 38, 116, -22, 10, -4 }, { -2, 6, -10, 26, 120, -18, 8, -2 }, + { -2, 4, -6, 16, 124, -12, 6, -2 }, { 0, 2, -2, 8, 126, -6, 2, -2 } +}; + +DECLARE_ALIGNED(256, static const InterpKernel, + av1_sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, 28, 62, 34, 2, 0, 0 }, + { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 }, + { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 }, + { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, -2, 16, 54, 48, 12, 0, 0 }, + { 0, -2, 14, 52, 52, 14, -2, 0 }, { 0, 0, 12, 48, 54, 16, -2, 0 }, + { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 }, + { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 }, + { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 28, 2, 0 } +}; + +static const InterpFilterParams + av1_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = { + { (const int16_t *)av1_sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS, + EIGHTTAP_REGULAR }, + { (const int16_t *)av1_sub_pel_filters_8smooth, SUBPEL_TAPS, + SUBPEL_SHIFTS, EIGHTTAP_SMOOTH }, + { (const int16_t *)av1_sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS, + MULTITAP_SHARP }, + { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS, + BILINEAR } + }; + +DECLARE_ALIGNED(256, static const InterpKernel, + av1_sub_pel_filters_4[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -4, 126, 8, -2, 0, 0 }, + { 0, 0, -8, 122, 18, -4, 0, 0 }, { 0, 0, -10, 116, 28, -6, 0, 0 }, + { 0, 0, -12, 110, 38, -8, 0, 0 }, { 0, 0, -12, 102, 48, -10, 0, 0 }, + { 0, 0, -14, 94, 58, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 }, + { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 }, + { 0, 0, -10, 58, 94, -14, 0, 0 }, { 0, 0, -10, 48, 102, -12, 0, 0 }, + { 0, 0, -8, 38, 110, -12, 0, 0 }, { 0, 0, -6, 28, 116, -10, 0, 0 }, + { 0, 0, -4, 18, 122, -8, 0, 0 }, { 0, 0, -2, 8, 126, -4, 0, 0 } +}; +DECLARE_ALIGNED(256, static const InterpKernel, + av1_sub_pel_filters_4smooth[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 30, 62, 34, 2, 0, 0 }, + { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 }, + { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 }, + { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, 0, 14, 54, 48, 12, 0, 0 }, + { 0, 0, 12, 52, 52, 12, 0, 0 }, { 0, 0, 12, 48, 54, 14, 0, 0 }, + { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 }, + { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 }, + { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 30, 0, 0 } +}; + +// For w<=4, MULTITAP_SHARP is the same as EIGHTTAP_REGULAR +static const InterpFilterParams av1_interp_4tap[SWITCHABLE_FILTERS + 1] = { + { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS, + EIGHTTAP_REGULAR }, + { (const int16_t *)av1_sub_pel_filters_4smooth, SUBPEL_TAPS, SUBPEL_SHIFTS, + EIGHTTAP_SMOOTH }, + { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS, + EIGHTTAP_REGULAR }, + { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS, + BILINEAR }, +}; + +static INLINE const InterpFilterParams * +av1_get_interp_filter_params_with_block_size(const InterpFilter interp_filter, + const int w) { + if (w <= 4) return &av1_interp_4tap[interp_filter]; + return &av1_interp_filter_params_list[interp_filter]; +} -InterpFilterParams av1_get_interp_filter_params_with_block_size( - const InterpFilter interp_filter, const int w); +static INLINE const int16_t *av1_get_interp_filter_kernel( + const InterpFilter interp_filter) { + return av1_interp_filter_params_list[interp_filter].filter_ptr; +} static INLINE const int16_t *av1_get_interp_filter_subpel_kernel( - const InterpFilterParams filter_params, const int subpel) { - return filter_params.filter_ptr + filter_params.taps * subpel; + const InterpFilterParams *const filter_params, const int subpel) { + return filter_params->filter_ptr + filter_params->taps * subpel; } #ifdef __cplusplus diff --git a/third_party/aom/av1/common/mv.h b/third_party/aom/av1/common/mv.h index a6227f18f..c2495640e 100644 --- a/third_party/aom/av1/common/mv.h +++ b/third_party/aom/av1/common/mv.h @@ -294,9 +294,6 @@ static INLINE void clamp_mv(MV *mv, int min_col, int max_col, int min_row, mv->row = clamp(mv->row, min_row, max_row); } -static INLINE int mv_has_subpel(const MV *mv) { - return (mv->row & SUBPEL_MASK) || (mv->col & SUBPEL_MASK); -} #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/common/mvref_common.h b/third_party/aom/av1/common/mvref_common.h index 716b4a247..f68c159e1 100644 --- a/third_party/aom/av1/common/mvref_common.h +++ b/third_party/aom/av1/common/mvref_common.h @@ -44,7 +44,7 @@ static INLINE int get_relative_dist(const AV1_COMMON *cm, int a, int b) { assert(b >= 0 && b < (1 << bits)); int diff = a - b; - int m = 1 << (bits - 1); + const int m = 1 << (bits - 1); diff = (diff & (m - 1)) - (diff & m); return diff; } diff --git a/third_party/aom/av1/common/onyxc_int.h b/third_party/aom/av1/common/onyxc_int.h index fa5f02e52..6b1bf2d74 100644 --- a/third_party/aom/av1/common/onyxc_int.h +++ b/third_party/aom/av1/common/onyxc_int.h @@ -184,7 +184,10 @@ typedef struct BitstreamLevel { uint8_t minor; } BitstreamLevel; -/* Initial version of sequence header structure */ +// Sequence header structure. +// Note: All syntax elements of sequence_header_obu that need to be +// bit-identical across multiple sequence headers must be part of this struct, +// so that consistency is checked by are_seq_headers_consistent() function. typedef struct SequenceHeader { int num_bits_width; int num_bits_height; @@ -205,7 +208,6 @@ typedef struct SequenceHeader { // 2 - adaptive int still_picture; // Video is a single frame still picture int reduced_still_picture_hdr; // Use reduced header for still picture - int monochrome; // Monochorme video int enable_filter_intra; // enables/disables filterintra int enable_intra_edge_filter; // enables/disables corner/edge/upsampling int enable_interintra_compound; // enables/disables interintra_compound @@ -229,6 +231,9 @@ typedef struct SequenceHeader { // enabled for that frame. int enable_cdef; // To turn on/off CDEF int enable_restoration; // To turn on/off loop restoration + BITSTREAM_PROFILE profile; + + // Operating point info. int operating_points_cnt_minus_1; int operating_point_idc[MAX_NUM_OPERATING_POINTS]; int display_model_info_present_flag; @@ -236,15 +241,26 @@ typedef struct SequenceHeader { BitstreamLevel level[MAX_NUM_OPERATING_POINTS]; uint8_t tier[MAX_NUM_OPERATING_POINTS]; // seq_tier in the spec. One bit: 0 // or 1. -} SequenceHeader; -typedef struct AV1Common { - struct aom_internal_error_info error; + // Color config. + aom_bit_depth_t bit_depth; // AOM_BITS_8 in profile 0 or 1, + // AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3. + int use_highbitdepth; // If true, we need to use 16bit frame buffers. + int monochrome; // Monochorme video aom_color_primaries_t color_primaries; aom_transfer_characteristics_t transfer_characteristics; aom_matrix_coefficients_t matrix_coefficients; - aom_chroma_sample_position_t chroma_sample_position; int color_range; + int subsampling_x; // Chroma subsampling for x + int subsampling_y; // Chroma subsampling for y + aom_chroma_sample_position_t chroma_sample_position; + int separate_uv_delta_q; + + int film_grain_params_present; +} SequenceHeader; + +typedef struct AV1Common { + struct aom_internal_error_info error; int width; int height; int render_width; @@ -253,18 +269,11 @@ typedef struct AV1Common { int last_height; int timing_info_present; aom_timing_info_t timing_info; - int buffer_removal_delay_present; + int buffer_removal_time_present; aom_dec_model_info_t buffer_model; aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1]; aom_op_timing_info_t op_frame_timing[MAX_NUM_OPERATING_POINTS + 1]; - int tu_presentation_delay_flag; - int64_t tu_presentation_delay; - - // TODO(jkoleszar): this implies chroma ss right now, but could vary per - // plane. Revisit as part of the future change to YV12_BUFFER_CONFIG to - // support additional planes. - int subsampling_x; - int subsampling_y; + uint32_t frame_presentation_time; int largest_tile_id; size_t largest_tile_size; @@ -273,8 +282,6 @@ typedef struct AV1Common { // Scale of the current frame with respect to itself. struct scale_factors sf_identity; - // Marks if we need to use 16bit frame buffers (1: yes, 0: no). - int use_highbitdepth; YV12_BUFFER_CONFIG *frame_to_show; RefCntBuffer *prev_frame; @@ -342,8 +349,6 @@ typedef struct AV1Common { int u_ac_delta_q; int v_ac_delta_q; - int separate_uv_delta_q; - // The dequantizers below are true dequntizers used only in the // dequantization process. They have the same coefficient // shift/scale as TX. @@ -447,10 +452,7 @@ typedef struct AV1Common { unsigned int frame_offset; unsigned int current_video_frame; - BITSTREAM_PROFILE profile; - // AOM_BITS_8 in profile 0 or 1, AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3. - aom_bit_depth_t bit_depth; aom_bit_depth_t dequant_bit_depth; // bit_depth of current dequantizer int error_resilient_mode; @@ -494,9 +496,8 @@ typedef struct AV1Common { ENTROPY_CONTEXT **above_context[MAX_MB_PLANE]; TXFM_CONTEXT **above_txfm_context; WarpedMotionParams global_motion[REF_FRAMES]; - aom_film_grain_table_t *film_grain_table; - int film_grain_params_present; aom_film_grain_t film_grain_params; + int cdef_pri_damping; int cdef_sec_damping; int nb_cdef_strengths; @@ -590,7 +591,7 @@ static INLINE int get_free_fb(AV1_COMMON *cm) { if (frame_bufs[i].ref_count == 0) break; if (i != FRAME_BUFFERS) { - if (frame_bufs[i].buf.use_external_refernce_buffers) { + if (frame_bufs[i].buf.use_external_reference_buffers) { // If this frame buffer's y_buffer, u_buffer, and v_buffer point to the // external reference buffers. Restore the buffer pointers to point to the // internally allocated memory. @@ -598,7 +599,7 @@ static INLINE int get_free_fb(AV1_COMMON *cm) { ybf->y_buffer = ybf->store_buf_adr[0]; ybf->u_buffer = ybf->store_buf_adr[1]; ybf->v_buffer = ybf->store_buf_adr[2]; - ybf->use_external_refernce_buffers = 0; + ybf->use_external_reference_buffers = 0; } frame_bufs[i].ref_count = 1; @@ -683,15 +684,7 @@ static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) { } } -static INLINE int mi_cols_aligned_to_sb(const AV1_COMMON *cm) { - return ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2); -} - -static INLINE int mi_rows_aligned_to_sb(const AV1_COMMON *cm) { - return ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2); -} - -void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm); +void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params); static INLINE int av1_num_planes(const AV1_COMMON *cm) { return cm->seq_params.monochrome ? 1 : MAX_MB_PLANE; @@ -734,7 +727,7 @@ static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd, } xd->mi_stride = cm->mi_stride; xd->error_info = &cm->error; - cfl_init(&xd->cfl, cm); + cfl_init(&xd->cfl, &cm->seq_params); } static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col, @@ -1066,17 +1059,18 @@ static INLINE int max_intra_block_height(const MACROBLOCKD *xd, return ALIGN_POWER_OF_TWO(max_blocks_high, tx_size_high_log2[tx_size]); } -static INLINE void av1_zero_above_context(AV1_COMMON *const cm, +static INLINE void av1_zero_above_context(AV1_COMMON *const cm, const MACROBLOCKD *xd, int mi_col_start, int mi_col_end, const int tile_row) { + const SequenceHeader *const seq_params = &cm->seq_params; const int num_planes = av1_num_planes(cm); const int width = mi_col_end - mi_col_start; const int aligned_width = - ALIGN_POWER_OF_TWO(width, cm->seq_params.mib_size_log2); + ALIGN_POWER_OF_TWO(width, seq_params->mib_size_log2); const int offset_y = mi_col_start; const int width_y = aligned_width; - const int offset_uv = offset_y >> cm->subsampling_x; - const int width_uv = width_y >> cm->subsampling_x; + const int offset_uv = offset_y >> seq_params->subsampling_x; + const int width_uv = width_y >> seq_params->subsampling_x; av1_zero_array(cm->above_context[0][tile_row] + offset_y, width_y); if (num_planes > 1) { @@ -1084,7 +1078,7 @@ static INLINE void av1_zero_above_context(AV1_COMMON *const cm, av1_zero_array(cm->above_context[1][tile_row] + offset_uv, width_uv); av1_zero_array(cm->above_context[2][tile_row] + offset_uv, width_uv); } else { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, "Invalid value of planes"); } } diff --git a/third_party/aom/av1/common/quant_common.c b/third_party/aom/av1/common/quant_common.c index 84575d74b..0e14da7a3 100644 --- a/third_party/aom/av1/common/quant_common.c +++ b/third_party/aom/av1/common/quant_common.c @@ -223,29 +223,6 @@ int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) { return av1_ac_quant_Q3(qindex, delta, bit_depth); } -int16_t av1_qindex_from_ac_Q3(int ac_Q3, aom_bit_depth_t bit_depth) { - int i; - const int16_t *tab = ac_qlookup_Q3; - switch (bit_depth) { - case AOM_BITS_10: { - tab = ac_qlookup_10_Q3; - break; - } - case AOM_BITS_12: { - tab = ac_qlookup_12_Q3; - break; - } - default: - assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); - return -1; - } - (void)bit_depth; - for (i = 0; i < QINDEX_RANGE; i++) { - if (ac_Q3 <= tab[i]) return i; - } - return QINDEX_RANGE - 1; -} - int av1_get_qindex(const struct segmentation *seg, int segment_id, int base_qindex) { if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) { diff --git a/third_party/aom/av1/common/quant_common.h b/third_party/aom/av1/common/quant_common.h index f9681036d..ca199e94c 100644 --- a/third_party/aom/av1/common/quant_common.h +++ b/third_party/aom/av1/common/quant_common.h @@ -42,7 +42,6 @@ int16_t av1_dc_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth); int16_t av1_ac_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth); int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth); int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth); -int16_t av1_qindex_from_ac_Q3(int ac_Q3, aom_bit_depth_t bit_depth); int av1_get_qindex(const struct segmentation *seg, int segment_id, int base_qindex); diff --git a/third_party/aom/av1/common/reconinter.c b/third_party/aom/av1/common/reconinter.c index b6ac436fb..b9f0b57f3 100644 --- a/third_party/aom/av1/common/reconinter.c +++ b/third_party/aom/av1/common/reconinter.c @@ -627,9 +627,7 @@ void av1_make_masked_inter_predictor( tmp_buf[INTER_PRED_BYTES_PER_PIXEL * MAX_SB_SQUARE]); #undef INTER_PRED_BYTES_PER_PIXEL - uint8_t *tmp_dst = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - ? CONVERT_TO_BYTEPTR(tmp_buf) - : tmp_buf; + uint8_t *tmp_dst = get_buf_by_bd(xd, tmp_buf); const int tmp_buf_stride = MAX_SB_SIZE; CONV_BUF_TYPE *org_dst = conv_params->dst; @@ -1002,8 +1000,8 @@ void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd, BUFFER_SET default_ctx = { { xd->plane[0].dst.buf, NULL, NULL }, { xd->plane[0].dst.stride, 0, 0 } }; if (!ctx) ctx = &default_ctx; - av1_build_interintra_predictors_sby(cm, xd, xd->plane[0].dst.buf, - xd->plane[0].dst.stride, ctx, bsize); + av1_build_interintra_predictors_sbp(cm, xd, xd->plane[0].dst.buf, + xd->plane[0].dst.stride, ctx, 0, bsize); } } @@ -1609,10 +1607,10 @@ void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm, const int ssy = xd->plane[plane].subsampling_y; BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy); PREDICTION_MODE mode = interintra_to_intra_mode[xd->mi[0]->interintra_mode]; - xd->mi[0]->angle_delta[PLANE_TYPE_Y] = 0; - xd->mi[0]->angle_delta[PLANE_TYPE_UV] = 0; - xd->mi[0]->filter_intra_mode_info.use_filter_intra = 0; - xd->mi[0]->use_intrabc = 0; + assert(xd->mi[0]->angle_delta[PLANE_TYPE_Y] == 0); + assert(xd->mi[0]->angle_delta[PLANE_TYPE_UV] == 0); + assert(xd->mi[0]->filter_intra_mode_info.use_filter_intra == 0); + assert(xd->mi[0]->use_intrabc == 0); av1_predict_intra_block(cm, xd, pd->width, pd->height, max_txsize_rect_lookup[plane_bsize], mode, 0, 0, @@ -1642,42 +1640,23 @@ void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, inter_pred, inter_stride, intra_pred, intra_stride); } -void av1_build_interintra_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd, - uint8_t *ypred, int ystride, - BUFFER_SET *ctx, BLOCK_SIZE bsize) { - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]); - av1_build_intra_predictors_for_interintra( - cm, xd, bsize, 0, ctx, CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE); - av1_combine_interintra(xd, bsize, 0, ypred, ystride, - CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE); - return; - } - { - DECLARE_ALIGNED(16, uint8_t, intrapredictor[MAX_SB_SQUARE]); - av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, ctx, - intrapredictor, MAX_SB_SIZE); - av1_combine_interintra(xd, bsize, 0, ypred, ystride, intrapredictor, - MAX_SB_SIZE); - } -} - -void av1_build_interintra_predictors_sbc(const AV1_COMMON *cm, MACROBLOCKD *xd, - uint8_t *upred, int ustride, +// build interintra_predictors for one plane +void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *pred, int stride, BUFFER_SET *ctx, int plane, BLOCK_SIZE bsize) { if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - DECLARE_ALIGNED(16, uint16_t, uintrapredictor[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]); av1_build_intra_predictors_for_interintra( - cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(uintrapredictor), + cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE); - av1_combine_interintra(xd, bsize, plane, upred, ustride, - CONVERT_TO_BYTEPTR(uintrapredictor), MAX_SB_SIZE); + av1_combine_interintra(xd, bsize, plane, pred, stride, + CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE); } else { - DECLARE_ALIGNED(16, uint8_t, uintrapredictor[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, intrapredictor[MAX_SB_SQUARE]); av1_build_intra_predictors_for_interintra(cm, xd, bsize, plane, ctx, - uintrapredictor, MAX_SB_SIZE); - av1_combine_interintra(xd, bsize, plane, upred, ustride, uintrapredictor, + intrapredictor, MAX_SB_SIZE); + av1_combine_interintra(xd, bsize, plane, pred, stride, intrapredictor, MAX_SB_SIZE); } } @@ -1686,8 +1665,8 @@ void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *upred, uint8_t *vpred, int ustride, int vstride, BUFFER_SET *ctx, BLOCK_SIZE bsize) { - av1_build_interintra_predictors_sbc(cm, xd, upred, ustride, ctx, 1, bsize); - av1_build_interintra_predictors_sbc(cm, xd, vpred, vstride, ctx, 2, bsize); + av1_build_interintra_predictors_sbp(cm, xd, upred, ustride, ctx, 1, bsize); + av1_build_interintra_predictors_sbp(cm, xd, vpred, vstride, ctx, 2, bsize); } void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, @@ -1695,7 +1674,7 @@ void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *vpred, int ystride, int ustride, int vstride, BUFFER_SET *ctx, BLOCK_SIZE bsize) { - av1_build_interintra_predictors_sby(cm, xd, ypred, ystride, ctx, bsize); + av1_build_interintra_predictors_sbp(cm, xd, ypred, ystride, ctx, 0, bsize); av1_build_interintra_predictors_sbuv(cm, xd, upred, vpred, ustride, vstride, ctx, bsize); } @@ -1713,9 +1692,7 @@ static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane, const struct scale_factors *const sf = &xd->block_refs[ref]->sf; struct buf_2d *const pre_buf = &pd->pre[ref]; - const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; - uint8_t *const dst = - (hbd ? CONVERT_TO_BYTEPTR(ext_dst) : ext_dst) + ext_dst_stride * y + x; + uint8_t *const dst = get_buf_by_bd(xd, ext_dst) + ext_dst_stride * y + x; const MV mv = mi->mv[ref].as_mv; ConvolveParams conv_params = get_conv_params(ref, 0, plane, xd->bd); diff --git a/third_party/aom/av1/common/reconinter.h b/third_party/aom/av1/common/reconinter.h index aa3aefc88..6a3def270 100644 --- a/third_party/aom/av1/common/reconinter.h +++ b/third_party/aom/av1/common/reconinter.h @@ -412,12 +412,9 @@ void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, int vstride, BUFFER_SET *ctx, BLOCK_SIZE bsize); -void av1_build_interintra_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd, - uint8_t *ypred, int ystride, - BUFFER_SET *ctx, BLOCK_SIZE bsize); - -void av1_build_interintra_predictors_sbc(const AV1_COMMON *cm, MACROBLOCKD *xd, - uint8_t *upred, int ustride, +// build interintra_predictors for one plane +void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *pred, int stride, BUFFER_SET *ctx, int plane, BLOCK_SIZE bsize); @@ -429,6 +426,7 @@ void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd, void av1_build_intra_predictors_for_interintra( const AV1_COMMON *cm, MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, BUFFER_SET *ctx, uint8_t *intra_pred, int intra_stride); + void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, const uint8_t *inter_pred, int inter_stride, const uint8_t *intra_pred, int intra_stride); diff --git a/third_party/aom/av1/common/reconintra.c b/third_party/aom/av1/common/reconintra.c index 21d1f60b2..71a52e73e 100644 --- a/third_party/aom/av1/common/reconintra.c +++ b/third_party/aom/av1/common/reconintra.c @@ -1071,13 +1071,6 @@ static void filter_intra_edge_corner_high(uint16_t *p_above, uint16_t *p_left) { p_left[-1] = s; } -static int use_intra_edge_upsample(int bs0, int bs1, int delta, int type) { - const int d = abs(delta); - const int blk_wh = bs0 + bs1; - if (d <= 0 || d >= 40) return 0; - return type ? (blk_wh <= 8) : (blk_wh <= 16); -} - void av1_upsample_intra_edge_c(uint8_t *p, int sz) { // interpolate half-sample positions assert(sz <= MAX_UPSAMPLE_SZ); @@ -1284,13 +1277,13 @@ static void build_intra_predictors_high( } } upsample_above = - use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type); + av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type); if (need_above && upsample_above) { const int n_px = txwpx + (need_right ? txhpx : 0); av1_upsample_intra_edge_high(above_row, n_px, xd->bd); } upsample_left = - use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type); + av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type); if (need_left && upsample_left) { const int n_px = txhpx + (need_bottom ? txwpx : 0); av1_upsample_intra_edge_high(left_col, n_px, xd->bd); @@ -1467,13 +1460,13 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, } } upsample_above = - use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type); + av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type); if (need_above && upsample_above) { const int n_px = txwpx + (need_right ? txhpx : 0); av1_upsample_intra_edge(above_row, n_px); } upsample_left = - use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type); + av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type); if (need_left && upsample_left) { const int n_px = txhpx + (need_bottom ? txwpx : 0); av1_upsample_intra_edge(left_col, n_px); @@ -1642,4 +1635,6 @@ void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, dst_stride, dst, dst_stride, blk_col, blk_row, plane); } -void av1_init_intra_predictors(void) { once(init_intra_predictors_internal); } +void av1_init_intra_predictors(void) { + aom_once(init_intra_predictors_internal); +} diff --git a/third_party/aom/av1/common/reconintra.h b/third_party/aom/av1/common/reconintra.h index a7d9e8b79..57638f24e 100644 --- a/third_party/aom/av1/common/reconintra.h +++ b/third_party/aom/av1/common/reconintra.h @@ -12,6 +12,8 @@ #ifndef AV1_COMMON_RECONINTRA_H_ #define AV1_COMMON_RECONINTRA_H_ +#include <stdlib.h> + #include "aom/aom_integer.h" #include "av1/common/blockd.h" #include "av1/common/onyxc_int.h" @@ -103,6 +105,14 @@ static INLINE int av1_get_dy(int angle) { return 1; } } + +static INLINE int av1_use_intra_edge_upsample(int bs0, int bs1, int delta, + int type) { + const int d = abs(delta); + const int blk_wh = bs0 + bs1; + if (d <= 0 || d >= 40) return 0; + return type ? (blk_wh <= 8) : (blk_wh <= 16); +} #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/common/resize.c b/third_party/aom/av1/common/resize.c index 17e6823b1..93d62292a 100644 --- a/third_party/aom/av1/common/resize.c +++ b/third_party/aom/av1/common/resize.c @@ -1100,7 +1100,7 @@ void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int plane, int rows) { const int is_uv = (plane > 0); - const int ss_x = is_uv && cm->subsampling_x; + const int ss_x = is_uv && cm->seq_params.subsampling_x; const int downscaled_plane_width = ROUND_POWER_OF_TWO(cm->width, ss_x); const int upscaled_plane_width = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x); @@ -1141,10 +1141,11 @@ void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src, const int pad_left = (j == 0); const int pad_right = (j == cm->tile_cols - 1); - if (cm->use_highbitdepth) - highbd_upscale_normative_rect( - src_ptr, rows, src_width, src_stride, dst_ptr, rows, dst_width, - dst_stride, x_step_qn, x0_qn, pad_left, pad_right, cm->bit_depth); + if (cm->seq_params.use_highbitdepth) + highbd_upscale_normative_rect(src_ptr, rows, src_width, src_stride, + dst_ptr, rows, dst_width, dst_stride, + x_step_qn, x0_qn, pad_left, pad_right, + cm->seq_params.bit_depth); else upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr, rows, dst_width, dst_stride, x_step_qn, x0_qn, @@ -1175,7 +1176,7 @@ YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm, const int num_planes = av1_num_planes(cm); if (cm->width != unscaled->y_crop_width || cm->height != unscaled->y_crop_height) { - av1_resize_and_extend_frame(unscaled, scaled, (int)cm->bit_depth, + av1_resize_and_extend_frame(unscaled, scaled, (int)cm->seq_params.bit_depth, num_planes); return scaled; } else { @@ -1232,6 +1233,7 @@ static void copy_buffer_config(const YV12_BUFFER_CONFIG *const src, void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) { const int num_planes = av1_num_planes(cm); if (!av1_superres_scaled(cm)) return; + const SequenceHeader *const seq_params = &cm->seq_params; YV12_BUFFER_CONFIG copy_buffer; memset(©_buffer, 0, sizeof(copy_buffer)); @@ -1239,10 +1241,10 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) { YV12_BUFFER_CONFIG *const frame_to_show = get_frame_new_buffer(cm); const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, 3); - if (aom_alloc_frame_buffer(©_buffer, aligned_width, cm->height, - cm->subsampling_x, cm->subsampling_y, - cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, - cm->byte_alignment)) + if (aom_alloc_frame_buffer( + ©_buffer, aligned_width, cm->height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + AOM_BORDER_IN_PIXELS, cm->byte_alignment)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate copy buffer for superres upscaling"); @@ -1269,11 +1271,11 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) { "Failed to free current frame buffer before superres upscaling"); // aom_realloc_frame_buffer() leaves config data for frame_to_show intact - if (aom_realloc_frame_buffer(frame_to_show, cm->superres_upscaled_width, - cm->superres_upscaled_height, - cm->subsampling_x, cm->subsampling_y, - cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, - cm->byte_alignment, fb, cb, cb_priv)) + if (aom_realloc_frame_buffer( + frame_to_show, cm->superres_upscaled_width, + cm->superres_upscaled_height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + AOM_BORDER_IN_PIXELS, cm->byte_alignment, fb, cb, cb_priv)) aom_internal_error( &cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate current frame buffer for superres upscaling"); @@ -1283,10 +1285,11 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) { // Don't use callbacks on the encoder. // aom_alloc_frame_buffer() clears the config data for frame_to_show - if (aom_alloc_frame_buffer(frame_to_show, cm->superres_upscaled_width, - cm->superres_upscaled_height, cm->subsampling_x, - cm->subsampling_y, cm->use_highbitdepth, - AOM_BORDER_IN_PIXELS, cm->byte_alignment)) + if (aom_alloc_frame_buffer( + frame_to_show, cm->superres_upscaled_width, + cm->superres_upscaled_height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + AOM_BORDER_IN_PIXELS, cm->byte_alignment)) aom_internal_error( &cm->error, AOM_CODEC_MEM_ERROR, "Failed to reallocate current frame buffer for superres upscaling"); diff --git a/third_party/aom/av1/common/restoration.c b/third_party/aom/av1/common/restoration.c index 58a5275ca..632967957 100644 --- a/third_party/aom/av1/common/restoration.c +++ b/third_party/aom/av1/common/restoration.c @@ -42,8 +42,8 @@ const sgr_params_type sgr_params[SGRPROJ_PARAMS] = { AV1PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) { AV1PixelRect rect; - int ss_x = is_uv && cm->subsampling_x; - int ss_y = is_uv && cm->subsampling_y; + int ss_x = is_uv && cm->seq_params.subsampling_x; + int ss_y = is_uv && cm->seq_params.subsampling_y; rect.top = 0; rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y); @@ -1146,16 +1146,17 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt, YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, int optimized_lr, int num_planes) { - const int bit_depth = cm->bit_depth; - const int highbd = cm->use_highbitdepth; + const SequenceHeader *const seq_params = &cm->seq_params; + const int bit_depth = seq_params->bit_depth; + const int highbd = seq_params->use_highbitdepth; lr_ctxt->dst = &cm->rst_frame; const int frame_width = frame->crop_widths[0]; const int frame_height = frame->crop_heights[0]; - if (aom_realloc_frame_buffer(lr_ctxt->dst, frame_width, frame_height, - cm->subsampling_x, cm->subsampling_y, - cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, - cm->byte_alignment, NULL, NULL, NULL) < 0) + if (aom_realloc_frame_buffer( + lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x, + seq_params->subsampling_y, highbd, AOM_BORDER_IN_PIXELS, + cm->byte_alignment, NULL, NULL, NULL) < 0) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate restoration dst buffer"); @@ -1180,8 +1181,8 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt, highbd); lr_plane_ctxt->rsi = rsi; - lr_plane_ctxt->ss_x = is_uv && cm->subsampling_x; - lr_plane_ctxt->ss_y = is_uv && cm->subsampling_y; + lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x; + lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y; lr_plane_ctxt->highbd = highbd; lr_plane_ctxt->bit_depth = bit_depth; lr_plane_ctxt->data8 = frame->buffers[plane]; @@ -1337,7 +1338,7 @@ void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane, int32_t *tmpbuf, RestorationLineBuffers *rlbs) { const int is_uv = plane > 0; - const int ss_y = is_uv && cm->subsampling_y; + const int ss_y = is_uv && cm->seq_params.subsampling_y; const RestorationInfo *rsi = &cm->rst_info[plane]; @@ -1350,7 +1351,7 @@ void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane, int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rcol0, int *rcol1, int *rrow0, - int *rrow1, int *tile_tl_idx) { + int *rrow1) { assert(rcol0 && rcol1 && rrow0 && rrow1); if (bsize != cm->seq_params.sb_size) return 0; @@ -1383,8 +1384,8 @@ int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane, const int vert_units = av1_lr_count_units_in_tile(size, tile_h); // The size of an MI-unit on this plane of the image - const int ss_x = is_uv && cm->subsampling_x; - const int ss_y = is_uv && cm->subsampling_y; + const int ss_x = is_uv && cm->seq_params.subsampling_x; + const int ss_y = is_uv && cm->seq_params.subsampling_y; const int mi_size_x = MI_SIZE >> ss_x; const int mi_size_y = MI_SIZE >> ss_y; @@ -1419,9 +1420,6 @@ int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane, *rcol1 = AOMMIN((mi_rel_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units); *rrow1 = AOMMIN((mi_rel_row1 * mi_to_num_y + rnd_y) / denom_y, vert_units); - const int tile_idx = 0; - *tile_tl_idx = tile_idx * rsi->units_per_tile; - return *rcol0 < *rcol1 && *rrow0 < *rrow1; } @@ -1468,7 +1466,7 @@ static void save_deblock_boundary_lines( int upscaled_width; int line_bytes; if (av1_superres_scaled(cm)) { - const int ss_x = is_uv && cm->subsampling_x; + const int ss_x = is_uv && cm->seq_params.subsampling_x; upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x; line_bytes = upscaled_width << use_highbd; if (use_highbd) @@ -1515,7 +1513,7 @@ static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame, // At the point where this function is called, we've already applied // superres. So we don't need to extend the lines here, we can just // pull directly from the topmost row of the upscaled frame. - const int ss_x = is_uv && cm->subsampling_x; + const int ss_x = is_uv && cm->seq_params.subsampling_x; const int upscaled_width = av1_superres_scaled(cm) ? (cm->superres_upscaled_width + ss_x) >> ss_x : src_width; @@ -1535,7 +1533,7 @@ static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame, int use_highbd, int plane, AV1_COMMON *cm, int after_cdef) { const int is_uv = plane > 0; - const int ss_y = is_uv && cm->subsampling_y; + const int ss_y = is_uv && cm->seq_params.subsampling_y; const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y; @@ -1600,7 +1598,7 @@ static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame, void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, int after_cdef) { const int num_planes = av1_num_planes(cm); - const int use_highbd = cm->use_highbitdepth; + const int use_highbd = cm->seq_params.use_highbitdepth; for (int p = 0; p < num_planes; ++p) { save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef); } diff --git a/third_party/aom/av1/common/restoration.h b/third_party/aom/av1/common/restoration.h index 0c4017534..aec37d834 100644 --- a/third_party/aom/av1/common/restoration.h +++ b/third_party/aom/av1/common/restoration.h @@ -346,7 +346,7 @@ void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane, int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rcol0, int *rcol1, int *rrow0, - int *rrow1, int *tile_tl_idx); + int *rrow1); void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, diff --git a/third_party/aom/av1/common/scan.h b/third_party/aom/av1/common/scan.h index c5cebc135..d206586b5 100644 --- a/third_party/aom/av1/common/scan.h +++ b/third_party/aom/av1/common/scan.h @@ -39,13 +39,6 @@ extern const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES]; void av1_deliver_eob_threshold(const AV1_COMMON *cm, MACROBLOCKD *xd); -static INLINE int get_coef_context(const int16_t *neighbors, - const uint8_t *token_cache, int c) { - return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] + - token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> - 1; -} - static INLINE const SCAN_ORDER *get_default_scan(TX_SIZE tx_size, TX_TYPE tx_type) { return &av1_scan_orders[tx_size][tx_type]; diff --git a/third_party/aom/av1/common/thread_common.c b/third_party/aom/av1/common/thread_common.c index 3fa998a91..f9b734b8c 100644 --- a/third_party/aom/av1/common/thread_common.c +++ b/third_party/aom/av1/common/thread_common.c @@ -572,7 +572,7 @@ static void enqueue_lr_jobs(AV1LrSync *lr_sync, AV1LrStruct *lr_ctxt, for (int plane = 0; plane < num_planes; plane++) { if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; const int is_uv = plane > 0; - const int ss_y = is_uv && cm->subsampling_y; + const int ss_y = is_uv && cm->seq_params.subsampling_y; AV1PixelRect tile_rect = ctxt[plane].tile_rect; const int unit_size = ctxt[plane].rsi->restoration_unit_size; diff --git a/third_party/aom/av1/common/tile_common.c b/third_party/aom/av1/common/tile_common.c index 9a43ab29a..026c904b6 100644 --- a/third_party/aom/av1/common/tile_common.c +++ b/third_party/aom/av1/common/tile_common.c @@ -179,8 +179,8 @@ AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm, r.bottom = AOMMIN(r.bottom, frame_h); // Convert to coordinates in the appropriate plane - const int ss_x = is_uv && cm->subsampling_x; - const int ss_y = is_uv && cm->subsampling_y; + const int ss_x = is_uv && cm->seq_params.subsampling_x; + const int ss_y = is_uv && cm->seq_params.subsampling_y; r.left = ROUND_POWER_OF_TWO(r.left, ss_x); r.right = ROUND_POWER_OF_TWO(r.right, ss_x); diff --git a/third_party/aom/av1/common/timing.c b/third_party/aom/av1/common/timing.c index 5ff538ae1..49dbde78f 100644 --- a/third_party/aom/av1/common/timing.c +++ b/third_party/aom/av1/common/timing.c @@ -53,8 +53,8 @@ int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx, void set_aom_dec_model_info(aom_dec_model_info_t *decoder_model) { decoder_model->encoder_decoder_buffer_delay_length = 16; - decoder_model->buffer_removal_delay_length = 10; - decoder_model->frame_presentation_delay_length = 10; + decoder_model->buffer_removal_time_length = 10; + decoder_model->frame_presentation_time_length = 10; } void set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params) { diff --git a/third_party/aom/av1/common/timing.h b/third_party/aom/av1/common/timing.h index d31f4b7fc..1749baa57 100644 --- a/third_party/aom/av1/common/timing.h +++ b/third_party/aom/av1/common/timing.h @@ -27,23 +27,23 @@ typedef struct aom_timing { typedef struct aom_dec_model_info { uint32_t num_units_in_decoding_tick; int encoder_decoder_buffer_delay_length; - int buffer_removal_delay_length; - int frame_presentation_delay_length; + int buffer_removal_time_length; + int frame_presentation_time_length; } aom_dec_model_info_t; typedef struct aom_dec_model_op_parameters { int decoder_model_param_present_flag; int64_t bitrate; int64_t buffer_size; - int decoder_buffer_delay; - int encoder_buffer_delay; + uint32_t decoder_buffer_delay; + uint32_t encoder_buffer_delay; int low_delay_mode_flag; int display_model_param_present_flag; int initial_display_delay; } aom_dec_model_op_parameters_t; typedef struct aom_op_timing_info_t { - int64_t buffer_removal_delay; + uint32_t buffer_removal_time; } aom_op_timing_info_t; void set_aom_dec_model_info(aom_dec_model_info_t *decoder_model); diff --git a/third_party/aom/av1/common/txb_common.h b/third_party/aom/av1/common/txb_common.h index cdac90d9e..f0ab79d0f 100644 --- a/third_party/aom/av1/common/txb_common.h +++ b/third_party/aom/av1/common/txb_common.h @@ -466,31 +466,6 @@ static AOM_FORCE_INLINE int get_nz_mag(const uint8_t *const levels, return mag; } -static INLINE int get_nz_count(const uint8_t *const levels, const int bwl, - const TX_CLASS tx_class) { - int count; - - count = (levels[1] != 0); // { 0, 1 } - count += (levels[(1 << bwl) + TX_PAD_HOR] != 0); // { 1, 0 } - - for (int idx = 0; idx < SIG_REF_DIFF_OFFSET_NUM; ++idx) { - const int row_offset = - ((tx_class == TX_CLASS_2D) ? sig_ref_diff_offset[idx][0] - : ((tx_class == TX_CLASS_VERT) - ? sig_ref_diff_offset_vert[idx][0] - : sig_ref_diff_offset_horiz[idx][0])); - const int col_offset = - ((tx_class == TX_CLASS_2D) ? sig_ref_diff_offset[idx][1] - : ((tx_class == TX_CLASS_VERT) - ? sig_ref_diff_offset_vert[idx][1] - : sig_ref_diff_offset_horiz[idx][1])); - const int nb_pos = - (row_offset << bwl) + (row_offset << TX_PAD_HOR_LOG2) + col_offset; - count += (levels[nb_pos] != 0); - } - return count; -} - #define NZ_MAP_CTX_0 SIG_COEF_CONTEXTS_2D #define NZ_MAP_CTX_5 (NZ_MAP_CTX_0 + 5) #define NZ_MAP_CTX_10 (NZ_MAP_CTX_0 + 10) diff --git a/third_party/aom/av1/common/warped_motion.c b/third_party/aom/av1/common/warped_motion.c index ae6f07657..412d83ed8 100644 --- a/third_party/aom/av1/common/warped_motion.c +++ b/third_party/aom/av1/common/warped_motion.c @@ -92,33 +92,6 @@ static const int error_measure_lut[512] = { }; /* clang-format on */ -void project_points_affine(const int32_t *mat, int *points, int *proj, - const int n, const int stride_points, - const int stride_proj, const int subsampling_x, - const int subsampling_y) { - for (int i = 0; i < n; ++i) { - const int x = *(points++), y = *(points++); - if (subsampling_x) - *(proj++) = ROUND_POWER_OF_TWO_SIGNED( - mat[2] * 2 * x + mat[3] * 2 * y + mat[0] + - (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2, - WARPEDDIFF_PREC_BITS + 1); - else - *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[2] * x + mat[3] * y + mat[0], - WARPEDDIFF_PREC_BITS); - if (subsampling_y) - *(proj++) = ROUND_POWER_OF_TWO_SIGNED( - mat[4] * 2 * x + mat[5] * 2 * y + mat[1] + - (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2, - WARPEDDIFF_PREC_BITS + 1); - else - *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[4] * x + mat[5] * y + mat[1], - WARPEDDIFF_PREC_BITS); - points += stride_points - 2; - proj += stride_proj - 2; - } -} - // For warping, we really use a 6-tap filter, but we do blocks of 8 pixels // at a time. The zoom/rotation/shear in the model are applied to the // "fractional" position of each pixel, which therefore varies within diff --git a/third_party/aom/av1/common/warped_motion.h b/third_party/aom/av1/common/warped_motion.h index f5da36bbb..ce4032ee5 100644 --- a/third_party/aom/av1/common/warped_motion.h +++ b/third_party/aom/av1/common/warped_motion.h @@ -68,11 +68,6 @@ static const uint8_t warp_pad_right[14][16] = { { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 } }; -void project_points_affine(const int32_t *mat, int *points, int *proj, - const int n, const int stride_points, - const int stride_proj, const int subsampling_x, - const int subsampling_y); - // Returns the error between the result of applying motion 'wm' to the frame // described by 'ref' and the frame described by 'dst'. int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd, diff --git a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c index 6747cae01..0c5286f9d 100644 --- a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c +++ b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c @@ -39,7 +39,7 @@ static void hfilter8(const uint8_t *src, int src_stride, int16_t *dst, int w, const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; assert(filter_idx < SUBPEL_SHIFTS); const int16_t *filter = - av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx); + av1_get_interp_filter_subpel_kernel(filter_params, filter_idx); // Load the filter coefficients const __m128i coefflo = _mm_loadu_si128((__m128i *)filter); @@ -140,7 +140,7 @@ static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst, const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; assert(filter_idx < SUBPEL_SHIFTS); const int16_t *filter = - av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx); + av1_get_interp_filter_subpel_kernel(filter_params, filter_idx); const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter); int x; @@ -232,8 +232,8 @@ static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst, } void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params) { @@ -278,7 +278,7 @@ static void highbd_hfilter8(const uint16_t *src, int src_stride, int16_t *dst, const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; assert(filter_idx < SUBPEL_SHIFTS); const int16_t *filter = - av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx); + av1_get_interp_filter_subpel_kernel(filter_params, filter_idx); // Load the filter coefficients const __m128i coefflo = _mm_loadu_si128((__m128i *)filter); @@ -372,7 +372,7 @@ static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst, const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; assert(filter_idx < SUBPEL_SHIFTS); const int16_t *filter = - av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx); + av1_get_interp_filter_subpel_kernel(filter_params, filter_idx); const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter); int x; @@ -472,8 +472,8 @@ static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst, void av1_highbd_convolve_2d_scale_sse4_1( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, const int subpel_x_qn, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params, int bd) { // TODO(yaowu): Move this out of stack diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c index 7415c58df..ae331b40d 100644 --- a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c +++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c @@ -19,49 +19,47 @@ #include "av1/common/x86/av1_inv_txfm_ssse3.h" static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); - btf_16_adds_subs_avx2(x1[0], x1[3]); - btf_16_adds_subs_avx2(x1[1], x1[2]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x1[5], x1[6], x1[5], x1[6]); - - btf_16_adds_subs_avx2(x1[8], x1[11]); - btf_16_adds_subs_avx2(x1[9], x1[10]); - btf_16_subs_adds_avx2(x1[15], x1[12]); - btf_16_subs_adds_avx2(x1[14], x1[13]); + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); + + btf_16_adds_subs_avx2(&x1[8], &x1[11]); + btf_16_adds_subs_avx2(&x1[9], &x1[10]); + btf_16_adds_subs_avx2(&x1[15], &x1[12]); + btf_16_adds_subs_avx2(&x1[14], &x1[13]); } static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); - btf_16_adds_subs_avx2(x[0], x[7]); - btf_16_adds_subs_avx2(x[1], x[6]); - btf_16_adds_subs_avx2(x[2], x[5]); - btf_16_adds_subs_avx2(x[3], x[4]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + btf_16_adds_subs_avx2(&x[0], &x[7]); + btf_16_adds_subs_avx2(&x[1], &x[6]); + btf_16_adds_subs_avx2(&x[2], &x[5]); + btf_16_adds_subs_avx2(&x[3], &x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); } static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) { - btf_16_adds_subs_out_avx2(output[0], output[15], x1[0], x1[15]); - btf_16_adds_subs_out_avx2(output[1], output[14], x1[1], x1[14]); - btf_16_adds_subs_out_avx2(output[2], output[13], x1[2], x1[13]); - btf_16_adds_subs_out_avx2(output[3], output[12], x1[3], x1[12]); - btf_16_adds_subs_out_avx2(output[4], output[11], x1[4], x1[11]); - btf_16_adds_subs_out_avx2(output[5], output[10], x1[5], x1[10]); - btf_16_adds_subs_out_avx2(output[6], output[9], x1[6], x1[9]); - btf_16_adds_subs_out_avx2(output[7], output[8], x1[7], x1[8]); + btf_16_adds_subs_out_avx2(&output[0], &output[15], x1[0], x1[15]); + btf_16_adds_subs_out_avx2(&output[1], &output[14], x1[1], x1[14]); + btf_16_adds_subs_out_avx2(&output[2], &output[13], x1[2], x1[13]); + btf_16_adds_subs_out_avx2(&output[3], &output[12], x1[3], x1[12]); + btf_16_adds_subs_out_avx2(&output[4], &output[11], x1[4], x1[11]); + btf_16_adds_subs_out_avx2(&output[5], &output[10], x1[5], x1[10]); + btf_16_adds_subs_out_avx2(&output[6], &output[9], x1[6], x1[9]); + btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]); } static void idct16_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]); __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]); @@ -103,29 +101,29 @@ static void idct16_new_avx2(const __m256i *input, __m256i *output, x1[15] = input[15]; // stage 2 - btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, x1[8], x1[15], x1[8], x1[15]); - btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, x1[9], x1[14], x1[9], x1[14]); - btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, x1[10], x1[13], x1[10], x1[13]); - btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, x1[11], x1[12], x1[11], x1[12]); + btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit); + btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit); // stage 3 - btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, x1[4], x1[7], x1[4], x1[7]); - btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, x1[5], x1[6], x1[5], x1[6]); - btf_16_adds_subs_avx2(x1[8], x1[9]); - btf_16_subs_adds_avx2(x1[11], x1[10]); - btf_16_adds_subs_avx2(x1[12], x1[13]); - btf_16_subs_adds_avx2(x1[15], x1[14]); + btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); // stage 4 - btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x1[0], x1[1]); - btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, x1[2], x1[3], x1[2], x1[3]); - btf_16_adds_subs_avx2(x1[4], x1[5]); - btf_16_subs_adds_avx2(x1[7], x1[6]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x1[9], x1[14], x1[9], x1[14]); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x1[10], x1[13], x1[10], x1[13]); - - idct16_stage5_avx2(x1, cospi, __rounding, cos_bit); - idct16_stage6_avx2(x1, cospi, __rounding, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); + + idct16_stage5_avx2(x1, cospi, _r, cos_bit); + idct16_stage6_avx2(x1, cospi, _r, cos_bit); idct16_stage7_avx2(output, x1); } @@ -133,7 +131,7 @@ static void idct16_low8_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); @@ -159,21 +157,21 @@ static void idct16_low8_new_avx2(const __m256i *input, __m256i *output, // stage 3 btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]); btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]); - btf_16_adds_subs_avx2(x1[8], x1[9]); - btf_16_subs_adds_avx2(x1[11], x1[10]); - btf_16_adds_subs_avx2(x1[12], x1[13]); - btf_16_subs_adds_avx2(x1[15], x1[14]); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); // stage 4 btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]); btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]); - btf_16_adds_subs_avx2(x1[4], x1[5]); - btf_16_subs_adds_avx2(x1[7], x1[6]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x1[9], x1[14], x1[9], x1[14]); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x1[10], x1[13], x1[10], x1[13]); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); - idct16_stage5_avx2(x1, cospi, __rounding, cos_bit); - idct16_stage6_avx2(x1, cospi, __rounding, cos_bit); + idct16_stage5_avx2(x1, cospi, _r, cos_bit); + idct16_stage6_avx2(x1, cospi, _r, cos_bit); idct16_stage7_avx2(output, x1); } @@ -212,74 +210,71 @@ static void idct16_low1_new_avx2(const __m256i *input, __m256i *output, } static INLINE void iadst16_stage3_avx2(__m256i *x) { - btf_16_adds_subs_avx2(x[0], x[8]); - btf_16_adds_subs_avx2(x[1], x[9]); - btf_16_adds_subs_avx2(x[2], x[10]); - btf_16_adds_subs_avx2(x[3], x[11]); - btf_16_adds_subs_avx2(x[4], x[12]); - btf_16_adds_subs_avx2(x[5], x[13]); - btf_16_adds_subs_avx2(x[6], x[14]); - btf_16_adds_subs_avx2(x[7], x[15]); + btf_16_adds_subs_avx2(&x[0], &x[8]); + btf_16_adds_subs_avx2(&x[1], &x[9]); + btf_16_adds_subs_avx2(&x[2], &x[10]); + btf_16_adds_subs_avx2(&x[3], &x[11]); + btf_16_adds_subs_avx2(&x[4], &x[12]); + btf_16_adds_subs_avx2(&x[5], &x[13]); + btf_16_adds_subs_avx2(&x[6], &x[14]); + btf_16_adds_subs_avx2(&x[7], &x[15]); } static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]); const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]); - btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); - btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]); - btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]); - btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]); + btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x[8], &x[9], _r, cos_bit); + btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x[10], &x[11], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x[12], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x[14], &x[15], _r, cos_bit); } static INLINE void iadst16_stage5_avx2(__m256i *x) { - btf_16_adds_subs_avx2(x[0], x[4]); - btf_16_adds_subs_avx2(x[1], x[5]); - btf_16_adds_subs_avx2(x[2], x[6]); - btf_16_adds_subs_avx2(x[3], x[7]); - btf_16_adds_subs_avx2(x[8], x[12]); - btf_16_adds_subs_avx2(x[9], x[13]); - btf_16_adds_subs_avx2(x[10], x[14]); - btf_16_adds_subs_avx2(x[11], x[15]); + btf_16_adds_subs_avx2(&x[0], &x[4]); + btf_16_adds_subs_avx2(&x[1], &x[5]); + btf_16_adds_subs_avx2(&x[2], &x[6]); + btf_16_adds_subs_avx2(&x[3], &x[7]); + btf_16_adds_subs_avx2(&x[8], &x[12]); + btf_16_adds_subs_avx2(&x[9], &x[13]); + btf_16_adds_subs_avx2(&x[10], &x[14]); + btf_16_adds_subs_avx2(&x[11], &x[15]); } static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]); - btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); - btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); - btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); - btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[4], &x[5], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[6], &x[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[12], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[14], &x[15], _r, cos_bit); } static INLINE void iadst16_stage7_avx2(__m256i *x) { - btf_16_adds_subs_avx2(x[0], x[2]); - btf_16_adds_subs_avx2(x[1], x[3]); - btf_16_adds_subs_avx2(x[4], x[6]); - btf_16_adds_subs_avx2(x[5], x[7]); - btf_16_adds_subs_avx2(x[8], x[10]); - btf_16_adds_subs_avx2(x[9], x[11]); - btf_16_adds_subs_avx2(x[12], x[14]); - btf_16_adds_subs_avx2(x[13], x[15]); + btf_16_adds_subs_avx2(&x[0], &x[2]); + btf_16_adds_subs_avx2(&x[1], &x[3]); + btf_16_adds_subs_avx2(&x[4], &x[6]); + btf_16_adds_subs_avx2(&x[5], &x[7]); + btf_16_adds_subs_avx2(&x[8], &x[10]); + btf_16_adds_subs_avx2(&x[9], &x[11]); + btf_16_adds_subs_avx2(&x[12], &x[14]); + btf_16_adds_subs_avx2(&x[13], &x[15]); } static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); - btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x1[2], x1[3]); - btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x1[6], x1[7]); - btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[10], x1[11], x1[10], x1[11]); - btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[14], x1[15], x1[14], x1[15]); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit); } static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) { @@ -307,7 +302,7 @@ static void iadst16_new_avx2(const __m256i *input, __m256i *output, (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); @@ -346,21 +341,21 @@ static void iadst16_new_avx2(const __m256i *input, __m256i *output, x1[15] = input[14]; // stage 2 - btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, x1[0], x1[1], x1[0], x1[1]); - btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, x1[2], x1[3], x1[2], x1[3]); - btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, x1[4], x1[5], x1[4], x1[5]); - btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, x1[6], x1[7], x1[6], x1[7]); - btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, x1[8], x1[9], x1[8], x1[9]); - btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, x1[10], x1[11], x1[10], x1[11]); - btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, x1[12], x1[13], x1[12], x1[13]); - btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, x1[14], x1[15], x1[14], x1[15]); + btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit); + btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit); + btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit); + btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit); iadst16_stage3_avx2(x1); - iadst16_stage4_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage4_avx2(x1, cospi, _r, cos_bit); iadst16_stage5_avx2(x1); - iadst16_stage6_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage6_avx2(x1, cospi, _r, cos_bit); iadst16_stage7_avx2(x1); - iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage8_avx2(x1, cospi, _r, cos_bit); iadst16_stage9_avx2(output, x1); } @@ -368,7 +363,7 @@ static void iadst16_low8_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); // stage 1 __m256i x1[16]; @@ -392,11 +387,11 @@ static void iadst16_low8_new_avx2(const __m256i *input, __m256i *output, btf_16_w16_0_avx2(cospi[58], cospi[06], x1[14], x1[14], x1[15]); iadst16_stage3_avx2(x1); - iadst16_stage4_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage4_avx2(x1, cospi, _r, cos_bit); iadst16_stage5_avx2(x1); - iadst16_stage6_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage6_avx2(x1, cospi, _r, cos_bit); iadst16_stage7_avx2(x1); - iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage8_avx2(x1, cospi, _r, cos_bit); iadst16_stage9_avx2(output, x1); } @@ -404,7 +399,7 @@ static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); @@ -423,7 +418,7 @@ static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output, x1[9] = x1[1]; // stage 4 - btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, x1[8], x1[9], x1[8], x1[9]); + btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit); // stage 5 x1[4] = x1[0]; @@ -433,8 +428,8 @@ static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output, x1[13] = x1[9]; // stage 6 - btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x1[4], x1[5], x1[4], x1[5]); - btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x1[12], x1[13], x1[12], x1[13]); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit); // stage 7 x1[2] = x1[0]; @@ -446,130 +441,125 @@ static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output, x1[14] = x1[12]; x1[15] = x1[13]; - iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage8_avx2(x1, cospi, _r, cos_bit); iadst16_stage9_avx2(output, x1); } static INLINE void idct32_high16_stage3_avx2(__m256i *x) { - btf_16_adds_subs_avx2(x[16], x[17]); - btf_16_subs_adds_avx2(x[19], x[18]); - btf_16_adds_subs_avx2(x[20], x[21]); - btf_16_subs_adds_avx2(x[23], x[22]); - btf_16_adds_subs_avx2(x[24], x[25]); - btf_16_subs_adds_avx2(x[27], x[26]); - btf_16_adds_subs_avx2(x[28], x[29]); - btf_16_subs_adds_avx2(x[31], x[30]); + btf_16_adds_subs_avx2(&x[16], &x[17]); + btf_16_adds_subs_avx2(&x[19], &x[18]); + btf_16_adds_subs_avx2(&x[20], &x[21]); + btf_16_adds_subs_avx2(&x[23], &x[22]); + btf_16_adds_subs_avx2(&x[24], &x[25]); + btf_16_adds_subs_avx2(&x[27], &x[26]); + btf_16_adds_subs_avx2(&x[28], &x[29]); + btf_16_adds_subs_avx2(&x[31], &x[30]); } static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); - btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); - btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]); - btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]); - btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit); } static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); - btf_16_adds_subs_avx2(x[16], x[19]); - btf_16_adds_subs_avx2(x[17], x[18]); - btf_16_subs_adds_avx2(x[23], x[20]); - btf_16_subs_adds_avx2(x[22], x[21]); - btf_16_adds_subs_avx2(x[24], x[27]); - btf_16_adds_subs_avx2(x[25], x[26]); - btf_16_subs_adds_avx2(x[31], x[28]); - btf_16_subs_adds_avx2(x[30], x[29]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit); + btf_16_adds_subs_avx2(&x[16], &x[19]); + btf_16_adds_subs_avx2(&x[17], &x[18]); + btf_16_adds_subs_avx2(&x[23], &x[20]); + btf_16_adds_subs_avx2(&x[22], &x[21]); + btf_16_adds_subs_avx2(&x[24], &x[27]); + btf_16_adds_subs_avx2(&x[25], &x[26]); + btf_16_adds_subs_avx2(&x[31], &x[28]); + btf_16_adds_subs_avx2(&x[30], &x[29]); } static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); - btf_16_adds_subs_avx2(x[8], x[11]); - btf_16_adds_subs_avx2(x[9], x[10]); - btf_16_subs_adds_avx2(x[15], x[12]); - btf_16_subs_adds_avx2(x[14], x[13]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x[8], &x[11]); + btf_16_adds_subs_avx2(&x[9], &x[10]); + btf_16_adds_subs_avx2(&x[15], &x[12]); + btf_16_adds_subs_avx2(&x[14], &x[13]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit); } static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); - btf_16_adds_subs_avx2(x[0], x[7]); - btf_16_adds_subs_avx2(x[1], x[6]); - btf_16_adds_subs_avx2(x[2], x[5]); - btf_16_adds_subs_avx2(x[3], x[4]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); - btf_16_adds_subs_avx2(x[16], x[23]); - btf_16_adds_subs_avx2(x[17], x[22]); - btf_16_adds_subs_avx2(x[18], x[21]); - btf_16_adds_subs_avx2(x[19], x[20]); - btf_16_subs_adds_avx2(x[31], x[24]); - btf_16_subs_adds_avx2(x[30], x[25]); - btf_16_subs_adds_avx2(x[29], x[26]); - btf_16_subs_adds_avx2(x[28], x[27]); + btf_16_adds_subs_avx2(&x[0], &x[7]); + btf_16_adds_subs_avx2(&x[1], &x[6]); + btf_16_adds_subs_avx2(&x[2], &x[5]); + btf_16_adds_subs_avx2(&x[3], &x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x[16], &x[23]); + btf_16_adds_subs_avx2(&x[17], &x[22]); + btf_16_adds_subs_avx2(&x[18], &x[21]); + btf_16_adds_subs_avx2(&x[19], &x[20]); + btf_16_adds_subs_avx2(&x[31], &x[24]); + btf_16_adds_subs_avx2(&x[30], &x[25]); + btf_16_adds_subs_avx2(&x[29], &x[26]); + btf_16_adds_subs_avx2(&x[28], &x[27]); } static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); - btf_16_adds_subs_avx2(x[0], x[15]); - btf_16_adds_subs_avx2(x[1], x[14]); - btf_16_adds_subs_avx2(x[2], x[13]); - btf_16_adds_subs_avx2(x[3], x[12]); - btf_16_adds_subs_avx2(x[4], x[11]); - btf_16_adds_subs_avx2(x[5], x[10]); - btf_16_adds_subs_avx2(x[6], x[9]); - btf_16_adds_subs_avx2(x[7], x[8]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]); + btf_16_adds_subs_avx2(&x[0], &x[15]); + btf_16_adds_subs_avx2(&x[1], &x[14]); + btf_16_adds_subs_avx2(&x[2], &x[13]); + btf_16_adds_subs_avx2(&x[3], &x[12]); + btf_16_adds_subs_avx2(&x[4], &x[11]); + btf_16_adds_subs_avx2(&x[5], &x[10]); + btf_16_adds_subs_avx2(&x[6], &x[9]); + btf_16_adds_subs_avx2(&x[7], &x[8]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit); } static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) { - btf_16_adds_subs_out_avx2(output[0], output[31], x[0], x[31]); - btf_16_adds_subs_out_avx2(output[1], output[30], x[1], x[30]); - btf_16_adds_subs_out_avx2(output[2], output[29], x[2], x[29]); - btf_16_adds_subs_out_avx2(output[3], output[28], x[3], x[28]); - btf_16_adds_subs_out_avx2(output[4], output[27], x[4], x[27]); - btf_16_adds_subs_out_avx2(output[5], output[26], x[5], x[26]); - btf_16_adds_subs_out_avx2(output[6], output[25], x[6], x[25]); - btf_16_adds_subs_out_avx2(output[7], output[24], x[7], x[24]); - btf_16_adds_subs_out_avx2(output[8], output[23], x[8], x[23]); - btf_16_adds_subs_out_avx2(output[9], output[22], x[9], x[22]); - btf_16_adds_subs_out_avx2(output[10], output[21], x[10], x[21]); - btf_16_adds_subs_out_avx2(output[11], output[20], x[11], x[20]); - btf_16_adds_subs_out_avx2(output[12], output[19], x[12], x[19]); - btf_16_adds_subs_out_avx2(output[13], output[18], x[13], x[18]); - btf_16_adds_subs_out_avx2(output[14], output[17], x[14], x[17]); - btf_16_adds_subs_out_avx2(output[15], output[16], x[15], x[16]); + btf_16_adds_subs_out_avx2(&output[0], &output[31], x[0], x[31]); + btf_16_adds_subs_out_avx2(&output[1], &output[30], x[1], x[30]); + btf_16_adds_subs_out_avx2(&output[2], &output[29], x[2], x[29]); + btf_16_adds_subs_out_avx2(&output[3], &output[28], x[3], x[28]); + btf_16_adds_subs_out_avx2(&output[4], &output[27], x[4], x[27]); + btf_16_adds_subs_out_avx2(&output[5], &output[26], x[5], x[26]); + btf_16_adds_subs_out_avx2(&output[6], &output[25], x[6], x[25]); + btf_16_adds_subs_out_avx2(&output[7], &output[24], x[7], x[24]); + btf_16_adds_subs_out_avx2(&output[8], &output[23], x[8], x[23]); + btf_16_adds_subs_out_avx2(&output[9], &output[22], x[9], x[22]); + btf_16_adds_subs_out_avx2(&output[10], &output[21], x[10], x[21]); + btf_16_adds_subs_out_avx2(&output[11], &output[20], x[11], x[20]); + btf_16_adds_subs_out_avx2(&output[12], &output[19], x[12], x[19]); + btf_16_adds_subs_out_avx2(&output[13], &output[18], x[13], x[18]); + btf_16_adds_subs_out_avx2(&output[14], &output[17], x[14], x[17]); + btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]); } static void idct32_low1_new_avx2(const __m256i *input, __m256i *output, @@ -629,7 +619,7 @@ static void idct32_low8_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); // stage 1 __m256i x[32]; @@ -666,20 +656,20 @@ static void idct32_low8_new_avx2(const __m256i *input, __m256i *output, x[10] = x[11]; x[13] = x[12]; x[14] = x[15]; - idct32_high16_stage4_avx2(x, cospi, __rounding, cos_bit); + idct32_high16_stage4_avx2(x, cospi, _r, cos_bit); // stage 5 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); x[5] = x[4]; x[6] = x[7]; - idct32_high24_stage5_avx2(x, cospi, __rounding, cos_bit); + idct32_high24_stage5_avx2(x, cospi, _r, cos_bit); // stage 6 x[3] = x[0]; x[2] = x[1]; - idct32_high28_stage6_avx2(x, cospi, __rounding, cos_bit); + idct32_high28_stage6_avx2(x, cospi, _r, cos_bit); - idct32_stage7_avx2(x, cospi, __rounding, cos_bit); - idct32_stage8_avx2(x, cospi, __rounding, cos_bit); + idct32_stage7_avx2(x, cospi, _r, cos_bit); + idct32_stage8_avx2(x, cospi, _r, cos_bit); idct32_stage9_avx2(output, x); } @@ -687,7 +677,7 @@ static void idct32_low16_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); // stage 1 __m256i x[32]; @@ -728,25 +718,25 @@ static void idct32_low16_new_avx2(const __m256i *input, __m256i *output, // stage 4 btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]); - btf_16_adds_subs_avx2(x[8], x[9]); - btf_16_subs_adds_avx2(x[11], x[10]); - btf_16_adds_subs_avx2(x[12], x[13]); - btf_16_subs_adds_avx2(x[15], x[14]); - idct32_high16_stage4_avx2(x, cospi, __rounding, cos_bit); + btf_16_adds_subs_avx2(&x[8], &x[9]); + btf_16_adds_subs_avx2(&x[11], &x[10]); + btf_16_adds_subs_avx2(&x[12], &x[13]); + btf_16_adds_subs_avx2(&x[15], &x[14]); + idct32_high16_stage4_avx2(x, cospi, _r, cos_bit); // stage 5 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]); - btf_16_adds_subs_avx2(x[4], x[5]); - btf_16_subs_adds_avx2(x[7], x[6]); - idct32_high24_stage5_avx2(x, cospi, __rounding, cos_bit); + btf_16_adds_subs_avx2(&x[4], &x[5]); + btf_16_adds_subs_avx2(&x[7], &x[6]); + idct32_high24_stage5_avx2(x, cospi, _r, cos_bit); - btf_16_adds_subs_avx2(x[0], x[3]); - btf_16_adds_subs_avx2(x[1], x[2]); - idct32_high28_stage6_avx2(x, cospi, __rounding, cos_bit); + btf_16_adds_subs_avx2(&x[0], &x[3]); + btf_16_adds_subs_avx2(&x[1], &x[2]); + idct32_high28_stage6_avx2(x, cospi, _r, cos_bit); - idct32_stage7_avx2(x, cospi, __rounding, cos_bit); - idct32_stage8_avx2(x, cospi, __rounding, cos_bit); + idct32_stage7_avx2(x, cospi, _r, cos_bit); + idct32_stage8_avx2(x, cospi, _r, cos_bit); idct32_stage9_avx2(output, x); } @@ -754,7 +744,7 @@ static void idct32_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); @@ -825,51 +815,50 @@ static void idct32_new_avx2(const __m256i *input, __m256i *output, x1[31] = input[31]; // stage 2 - btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, x1[16], x1[31], x1[16], x1[31]); - btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, x1[17], x1[30], x1[17], x1[30]); - btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, x1[18], x1[29], x1[18], x1[29]); - btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, x1[19], x1[28], x1[19], x1[28]); - btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, x1[20], x1[27], x1[20], x1[27]); - btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, x1[21], x1[26], x1[21], x1[26]); - btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, x1[22], x1[25], x1[22], x1[25]); - btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, x1[23], x1[24], x1[23], x1[24]); + btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r, cos_bit); + btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r, cos_bit); + btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r, cos_bit); // stage 3 - btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, x1[8], x1[15], x1[8], x1[15]); - btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, x1[9], x1[14], x1[9], x1[14]); - btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, x1[10], x1[13], x1[10], x1[13]); - btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, x1[11], x1[12], x1[11], x1[12]); + btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit); + btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit); idct32_high16_stage3_avx2(x1); // stage 4 - btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, x1[4], x1[7], x1[4], x1[7]); - btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, x1[5], x1[6], x1[5], x1[6]); - btf_16_adds_subs_avx2(x1[8], x1[9]); - btf_16_subs_adds_avx2(x1[11], x1[10]); - btf_16_adds_subs_avx2(x1[12], x1[13]); - btf_16_subs_adds_avx2(x1[15], x1[14]); - idct32_high16_stage4_avx2(x1, cospi, __rounding, cos_bit); + btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + idct32_high16_stage4_avx2(x1, cospi, _r, cos_bit); // stage 5 - btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x1[0], x1[1]); - btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, x1[2], x1[3], x1[2], x1[3]); - btf_16_adds_subs_avx2(x1[4], x1[5]); - btf_16_subs_adds_avx2(x1[7], x1[6]); - idct32_high24_stage5_avx2(x1, cospi, __rounding, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + idct32_high24_stage5_avx2(x1, cospi, _r, cos_bit); // stage 6 - btf_16_adds_subs_avx2(x1[0], x1[3]); - btf_16_adds_subs_avx2(x1[1], x1[2]); - idct32_high28_stage6_avx2(x1, cospi, __rounding, cos_bit); + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + idct32_high28_stage6_avx2(x1, cospi, _r, cos_bit); - idct32_stage7_avx2(x1, cospi, __rounding, cos_bit); - idct32_stage8_avx2(x1, cospi, __rounding, cos_bit); + idct32_stage7_avx2(x1, cospi, _r, cos_bit); + idct32_stage8_avx2(x1, cospi, _r, cos_bit); idct32_stage9_avx2(output, x1); } static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); @@ -883,19 +872,18 @@ static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi, const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]); - btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]); - btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]); - btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]); - btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]); - btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]); - btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]); - btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]); - btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]); + btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit); + btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x[34], &x[61], _r, cos_bit); + btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x[37], &x[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit); + btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x[42], &x[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x[45], &x[50], _r, cos_bit); + btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit); } static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); @@ -903,31 +891,30 @@ static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi, const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); - btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); - btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]); - btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]); - btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); - btf_16_adds_subs_avx2(x[32], x[35]); - btf_16_adds_subs_avx2(x[33], x[34]); - btf_16_subs_adds_avx2(x[39], x[36]); - btf_16_subs_adds_avx2(x[38], x[37]); - btf_16_adds_subs_avx2(x[40], x[43]); - btf_16_adds_subs_avx2(x[41], x[42]); - btf_16_subs_adds_avx2(x[47], x[44]); - btf_16_subs_adds_avx2(x[46], x[45]); - btf_16_adds_subs_avx2(x[48], x[51]); - btf_16_adds_subs_avx2(x[49], x[50]); - btf_16_subs_adds_avx2(x[55], x[52]); - btf_16_subs_adds_avx2(x[54], x[53]); - btf_16_adds_subs_avx2(x[56], x[59]); - btf_16_adds_subs_avx2(x[57], x[58]); - btf_16_subs_adds_avx2(x[63], x[60]); - btf_16_subs_adds_avx2(x[62], x[61]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit); + btf_16_adds_subs_avx2(&x[32], &x[35]); + btf_16_adds_subs_avx2(&x[33], &x[34]); + btf_16_adds_subs_avx2(&x[39], &x[36]); + btf_16_adds_subs_avx2(&x[38], &x[37]); + btf_16_adds_subs_avx2(&x[40], &x[43]); + btf_16_adds_subs_avx2(&x[41], &x[42]); + btf_16_adds_subs_avx2(&x[47], &x[44]); + btf_16_adds_subs_avx2(&x[46], &x[45]); + btf_16_adds_subs_avx2(&x[48], &x[51]); + btf_16_adds_subs_avx2(&x[49], &x[50]); + btf_16_adds_subs_avx2(&x[55], &x[52]); + btf_16_adds_subs_avx2(&x[54], &x[53]); + btf_16_adds_subs_avx2(&x[56], &x[59]); + btf_16_adds_subs_avx2(&x[57], &x[58]); + btf_16_adds_subs_avx2(&x[63], &x[60]); + btf_16_adds_subs_avx2(&x[62], &x[61]); } static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); @@ -935,185 +922,180 @@ static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi, const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); - btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]); - btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]); - btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]); - btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]); - btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]); - btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]); - btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]); - btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[34], &x[61], _r, cos_bit); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[35], &x[60], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[36], &x[59], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[37], &x[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[42], &x[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[43], &x[52], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[44], &x[51], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[45], &x[50], _r, cos_bit); } static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { - btf_16_adds_subs_avx2(x[16], x[19]); - btf_16_adds_subs_avx2(x[17], x[18]); - btf_16_subs_adds_avx2(x[23], x[20]); - btf_16_subs_adds_avx2(x[22], x[21]); - btf_16_adds_subs_avx2(x[24], x[27]); - btf_16_adds_subs_avx2(x[25], x[26]); - btf_16_subs_adds_avx2(x[31], x[28]); - btf_16_subs_adds_avx2(x[30], x[29]); - idct64_stage6_high32_avx2(x, cospi, __rounding, cos_bit); + const __m256i _r, int8_t cos_bit) { + btf_16_adds_subs_avx2(&x[16], &x[19]); + btf_16_adds_subs_avx2(&x[17], &x[18]); + btf_16_adds_subs_avx2(&x[23], &x[20]); + btf_16_adds_subs_avx2(&x[22], &x[21]); + btf_16_adds_subs_avx2(&x[24], &x[27]); + btf_16_adds_subs_avx2(&x[25], &x[26]); + btf_16_adds_subs_avx2(&x[31], &x[28]); + btf_16_adds_subs_avx2(&x[30], &x[29]); + idct64_stage6_high32_avx2(x, cospi, _r, cos_bit); } static INLINE void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]); - btf_16_adds_subs_avx2(x[32], x[39]); - btf_16_adds_subs_avx2(x[33], x[38]); - btf_16_adds_subs_avx2(x[34], x[37]); - btf_16_adds_subs_avx2(x[35], x[36]); - btf_16_subs_adds_avx2(x[47], x[40]); - btf_16_subs_adds_avx2(x[46], x[41]); - btf_16_subs_adds_avx2(x[45], x[42]); - btf_16_subs_adds_avx2(x[44], x[43]); - btf_16_adds_subs_avx2(x[48], x[55]); - btf_16_adds_subs_avx2(x[49], x[54]); - btf_16_adds_subs_avx2(x[50], x[53]); - btf_16_adds_subs_avx2(x[51], x[52]); - btf_16_subs_adds_avx2(x[63], x[56]); - btf_16_subs_adds_avx2(x[62], x[57]); - btf_16_subs_adds_avx2(x[61], x[58]); - btf_16_subs_adds_avx2(x[60], x[59]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit); + btf_16_adds_subs_avx2(&x[32], &x[39]); + btf_16_adds_subs_avx2(&x[33], &x[38]); + btf_16_adds_subs_avx2(&x[34], &x[37]); + btf_16_adds_subs_avx2(&x[35], &x[36]); + btf_16_adds_subs_avx2(&x[47], &x[40]); + btf_16_adds_subs_avx2(&x[46], &x[41]); + btf_16_adds_subs_avx2(&x[45], &x[42]); + btf_16_adds_subs_avx2(&x[44], &x[43]); + btf_16_adds_subs_avx2(&x[48], &x[55]); + btf_16_adds_subs_avx2(&x[49], &x[54]); + btf_16_adds_subs_avx2(&x[50], &x[53]); + btf_16_adds_subs_avx2(&x[51], &x[52]); + btf_16_adds_subs_avx2(&x[63], &x[56]); + btf_16_adds_subs_avx2(&x[62], &x[57]); + btf_16_adds_subs_avx2(&x[61], &x[58]); + btf_16_adds_subs_avx2(&x[60], &x[59]); } static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); - btf_16_adds_subs_avx2(x[16], x[23]); - btf_16_adds_subs_avx2(x[17], x[22]); - btf_16_adds_subs_avx2(x[18], x[21]); - btf_16_adds_subs_avx2(x[19], x[20]); - btf_16_subs_adds_avx2(x[31], x[24]); - btf_16_subs_adds_avx2(x[30], x[25]); - btf_16_subs_adds_avx2(x[29], x[26]); - btf_16_subs_adds_avx2(x[28], x[27]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]); + btf_16_adds_subs_avx2(&x[16], &x[23]); + btf_16_adds_subs_avx2(&x[17], &x[22]); + btf_16_adds_subs_avx2(&x[18], &x[21]); + btf_16_adds_subs_avx2(&x[19], &x[20]); + btf_16_adds_subs_avx2(&x[31], &x[24]); + btf_16_adds_subs_avx2(&x[30], &x[25]); + btf_16_adds_subs_avx2(&x[29], &x[26]); + btf_16_adds_subs_avx2(&x[28], &x[27]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[36], &x[59], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[37], &x[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[38], &x[57], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[39], &x[56], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[40], &x[55], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[41], &x[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[42], &x[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[43], &x[52], _r, cos_bit); } static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); - btf_16_adds_subs_avx2(x[0], x[15]); - btf_16_adds_subs_avx2(x[1], x[14]); - btf_16_adds_subs_avx2(x[2], x[13]); - btf_16_adds_subs_avx2(x[3], x[12]); - btf_16_adds_subs_avx2(x[4], x[11]); - btf_16_adds_subs_avx2(x[5], x[10]); - btf_16_adds_subs_avx2(x[6], x[9]); - btf_16_adds_subs_avx2(x[7], x[8]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]); - btf_16_adds_subs_avx2(x[32], x[47]); - btf_16_adds_subs_avx2(x[33], x[46]); - btf_16_adds_subs_avx2(x[34], x[45]); - btf_16_adds_subs_avx2(x[35], x[44]); - btf_16_adds_subs_avx2(x[36], x[43]); - btf_16_adds_subs_avx2(x[37], x[42]); - btf_16_adds_subs_avx2(x[38], x[41]); - btf_16_adds_subs_avx2(x[39], x[40]); - btf_16_subs_adds_avx2(x[63], x[48]); - btf_16_subs_adds_avx2(x[62], x[49]); - btf_16_subs_adds_avx2(x[61], x[50]); - btf_16_subs_adds_avx2(x[60], x[51]); - btf_16_subs_adds_avx2(x[59], x[52]); - btf_16_subs_adds_avx2(x[58], x[53]); - btf_16_subs_adds_avx2(x[57], x[54]); - btf_16_subs_adds_avx2(x[56], x[55]); + btf_16_adds_subs_avx2(&x[0], &x[15]); + btf_16_adds_subs_avx2(&x[1], &x[14]); + btf_16_adds_subs_avx2(&x[2], &x[13]); + btf_16_adds_subs_avx2(&x[3], &x[12]); + btf_16_adds_subs_avx2(&x[4], &x[11]); + btf_16_adds_subs_avx2(&x[5], &x[10]); + btf_16_adds_subs_avx2(&x[6], &x[9]); + btf_16_adds_subs_avx2(&x[7], &x[8]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit); + btf_16_adds_subs_avx2(&x[32], &x[47]); + btf_16_adds_subs_avx2(&x[33], &x[46]); + btf_16_adds_subs_avx2(&x[34], &x[45]); + btf_16_adds_subs_avx2(&x[35], &x[44]); + btf_16_adds_subs_avx2(&x[36], &x[43]); + btf_16_adds_subs_avx2(&x[37], &x[42]); + btf_16_adds_subs_avx2(&x[38], &x[41]); + btf_16_adds_subs_avx2(&x[39], &x[40]); + btf_16_adds_subs_avx2(&x[63], &x[48]); + btf_16_adds_subs_avx2(&x[62], &x[49]); + btf_16_adds_subs_avx2(&x[61], &x[50]); + btf_16_adds_subs_avx2(&x[60], &x[51]); + btf_16_adds_subs_avx2(&x[59], &x[52]); + btf_16_adds_subs_avx2(&x[58], &x[53]); + btf_16_adds_subs_avx2(&x[57], &x[54]); + btf_16_adds_subs_avx2(&x[56], &x[55]); } static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); - btf_16_adds_subs_avx2(x[0], x[31]); - btf_16_adds_subs_avx2(x[1], x[30]); - btf_16_adds_subs_avx2(x[2], x[29]); - btf_16_adds_subs_avx2(x[3], x[28]); - btf_16_adds_subs_avx2(x[4], x[27]); - btf_16_adds_subs_avx2(x[5], x[26]); - btf_16_adds_subs_avx2(x[6], x[25]); - btf_16_adds_subs_avx2(x[7], x[24]); - btf_16_adds_subs_avx2(x[8], x[23]); - btf_16_adds_subs_avx2(x[9], x[22]); - btf_16_adds_subs_avx2(x[10], x[21]); - btf_16_adds_subs_avx2(x[11], x[20]); - btf_16_adds_subs_avx2(x[12], x[19]); - btf_16_adds_subs_avx2(x[13], x[18]); - btf_16_adds_subs_avx2(x[14], x[17]); - btf_16_adds_subs_avx2(x[15], x[16]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]); + btf_16_adds_subs_avx2(&x[0], &x[31]); + btf_16_adds_subs_avx2(&x[1], &x[30]); + btf_16_adds_subs_avx2(&x[2], &x[29]); + btf_16_adds_subs_avx2(&x[3], &x[28]); + btf_16_adds_subs_avx2(&x[4], &x[27]); + btf_16_adds_subs_avx2(&x[5], &x[26]); + btf_16_adds_subs_avx2(&x[6], &x[25]); + btf_16_adds_subs_avx2(&x[7], &x[24]); + btf_16_adds_subs_avx2(&x[8], &x[23]); + btf_16_adds_subs_avx2(&x[9], &x[22]); + btf_16_adds_subs_avx2(&x[10], &x[21]); + btf_16_adds_subs_avx2(&x[11], &x[20]); + btf_16_adds_subs_avx2(&x[12], &x[19]); + btf_16_adds_subs_avx2(&x[13], &x[18]); + btf_16_adds_subs_avx2(&x[14], &x[17]); + btf_16_adds_subs_avx2(&x[15], &x[16]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[40], &x[55], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[41], &x[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[42], &x[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[43], &x[52], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[44], &x[51], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[45], &x[50], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[46], &x[49], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[47], &x[48], _r, cos_bit); } static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) { - btf_16_adds_subs_out_avx2(output[0], output[63], x[0], x[63]); - btf_16_adds_subs_out_avx2(output[1], output[62], x[1], x[62]); - btf_16_adds_subs_out_avx2(output[2], output[61], x[2], x[61]); - btf_16_adds_subs_out_avx2(output[3], output[60], x[3], x[60]); - btf_16_adds_subs_out_avx2(output[4], output[59], x[4], x[59]); - btf_16_adds_subs_out_avx2(output[5], output[58], x[5], x[58]); - btf_16_adds_subs_out_avx2(output[6], output[57], x[6], x[57]); - btf_16_adds_subs_out_avx2(output[7], output[56], x[7], x[56]); - btf_16_adds_subs_out_avx2(output[8], output[55], x[8], x[55]); - btf_16_adds_subs_out_avx2(output[9], output[54], x[9], x[54]); - btf_16_adds_subs_out_avx2(output[10], output[53], x[10], x[53]); - btf_16_adds_subs_out_avx2(output[11], output[52], x[11], x[52]); - btf_16_adds_subs_out_avx2(output[12], output[51], x[12], x[51]); - btf_16_adds_subs_out_avx2(output[13], output[50], x[13], x[50]); - btf_16_adds_subs_out_avx2(output[14], output[49], x[14], x[49]); - btf_16_adds_subs_out_avx2(output[15], output[48], x[15], x[48]); - btf_16_adds_subs_out_avx2(output[16], output[47], x[16], x[47]); - btf_16_adds_subs_out_avx2(output[17], output[46], x[17], x[46]); - btf_16_adds_subs_out_avx2(output[18], output[45], x[18], x[45]); - btf_16_adds_subs_out_avx2(output[19], output[44], x[19], x[44]); - btf_16_adds_subs_out_avx2(output[20], output[43], x[20], x[43]); - btf_16_adds_subs_out_avx2(output[21], output[42], x[21], x[42]); - btf_16_adds_subs_out_avx2(output[22], output[41], x[22], x[41]); - btf_16_adds_subs_out_avx2(output[23], output[40], x[23], x[40]); - btf_16_adds_subs_out_avx2(output[24], output[39], x[24], x[39]); - btf_16_adds_subs_out_avx2(output[25], output[38], x[25], x[38]); - btf_16_adds_subs_out_avx2(output[26], output[37], x[26], x[37]); - btf_16_adds_subs_out_avx2(output[27], output[36], x[27], x[36]); - btf_16_adds_subs_out_avx2(output[28], output[35], x[28], x[35]); - btf_16_adds_subs_out_avx2(output[29], output[34], x[29], x[34]); - btf_16_adds_subs_out_avx2(output[30], output[33], x[30], x[33]); - btf_16_adds_subs_out_avx2(output[31], output[32], x[31], x[32]); + btf_16_adds_subs_out_avx2(&output[0], &output[63], x[0], x[63]); + btf_16_adds_subs_out_avx2(&output[1], &output[62], x[1], x[62]); + btf_16_adds_subs_out_avx2(&output[2], &output[61], x[2], x[61]); + btf_16_adds_subs_out_avx2(&output[3], &output[60], x[3], x[60]); + btf_16_adds_subs_out_avx2(&output[4], &output[59], x[4], x[59]); + btf_16_adds_subs_out_avx2(&output[5], &output[58], x[5], x[58]); + btf_16_adds_subs_out_avx2(&output[6], &output[57], x[6], x[57]); + btf_16_adds_subs_out_avx2(&output[7], &output[56], x[7], x[56]); + btf_16_adds_subs_out_avx2(&output[8], &output[55], x[8], x[55]); + btf_16_adds_subs_out_avx2(&output[9], &output[54], x[9], x[54]); + btf_16_adds_subs_out_avx2(&output[10], &output[53], x[10], x[53]); + btf_16_adds_subs_out_avx2(&output[11], &output[52], x[11], x[52]); + btf_16_adds_subs_out_avx2(&output[12], &output[51], x[12], x[51]); + btf_16_adds_subs_out_avx2(&output[13], &output[50], x[13], x[50]); + btf_16_adds_subs_out_avx2(&output[14], &output[49], x[14], x[49]); + btf_16_adds_subs_out_avx2(&output[15], &output[48], x[15], x[48]); + btf_16_adds_subs_out_avx2(&output[16], &output[47], x[16], x[47]); + btf_16_adds_subs_out_avx2(&output[17], &output[46], x[17], x[46]); + btf_16_adds_subs_out_avx2(&output[18], &output[45], x[18], x[45]); + btf_16_adds_subs_out_avx2(&output[19], &output[44], x[19], x[44]); + btf_16_adds_subs_out_avx2(&output[20], &output[43], x[20], x[43]); + btf_16_adds_subs_out_avx2(&output[21], &output[42], x[21], x[42]); + btf_16_adds_subs_out_avx2(&output[22], &output[41], x[22], x[41]); + btf_16_adds_subs_out_avx2(&output[23], &output[40], x[23], x[40]); + btf_16_adds_subs_out_avx2(&output[24], &output[39], x[24], x[39]); + btf_16_adds_subs_out_avx2(&output[25], &output[38], x[25], x[38]); + btf_16_adds_subs_out_avx2(&output[26], &output[37], x[26], x[37]); + btf_16_adds_subs_out_avx2(&output[27], &output[36], x[27], x[36]); + btf_16_adds_subs_out_avx2(&output[28], &output[35], x[28], x[35]); + btf_16_adds_subs_out_avx2(&output[29], &output[34], x[29], x[34]); + btf_16_adds_subs_out_avx2(&output[30], &output[33], x[30], x[33]); + btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]); } static void idct64_low1_new_avx2(const __m256i *input, __m256i *output, @@ -1207,7 +1189,7 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); @@ -1260,16 +1242,16 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output, x[22] = x[23]; x[25] = x[24]; x[30] = x[31]; - btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]); - btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]); - btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]); - btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]); + btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit); + btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit); + btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit); // stage 5 x[9] = x[8]; x[14] = x[15]; - btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); - btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit); x[35] = x[32]; x[34] = x[33]; x[36] = x[39]; @@ -1289,7 +1271,7 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output, // stage 6 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit); x[19] = x[16]; x[18] = x[17]; x[20] = x[23]; @@ -1298,7 +1280,7 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output, x[26] = x[25]; x[28] = x[31]; x[29] = x[30]; - idct64_stage6_high32_avx2(x, cospi, __rounding, cos_bit); + idct64_stage6_high32_avx2(x, cospi, _r, cos_bit); // stage 7 x[3] = x[0]; @@ -1307,7 +1289,7 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output, x[10] = x[9]; x[12] = x[15]; x[13] = x[14]; - idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit); + idct64_stage7_high48_avx2(x, cospi, _r, cos_bit); // stage 8 x[7] = x[0]; @@ -1315,12 +1297,12 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output, x[5] = x[2]; x[4] = x[3]; x[9] = x[9]; - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); - idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); + idct64_stage8_high48_avx2(x, cospi, _r, cos_bit); - idct64_stage9_avx2(x, cospi, __rounding, cos_bit); - idct64_stage10_avx2(x, cospi, __rounding, cos_bit); + idct64_stage9_avx2(x, cospi, _r, cos_bit); + idct64_stage10_avx2(x, cospi, _r, cos_bit); idct64_stage11_avx2(output, x); } @@ -1328,7 +1310,7 @@ static void idct64_low16_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); @@ -1398,7 +1380,7 @@ static void idct64_low16_new_avx2(const __m256i *input, __m256i *output, x[26] = x[27]; x[29] = x[28]; x[30] = x[31]; - idct64_stage4_high32_avx2(x, cospi, __rounding, cos_bit); + idct64_stage4_high32_avx2(x, cospi, _r, cos_bit); // stage 5 btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); @@ -1406,37 +1388,37 @@ static void idct64_low16_new_avx2(const __m256i *input, __m256i *output, x[10] = x[11]; x[13] = x[12]; x[14] = x[15]; - idct64_stage5_high48_avx2(x, cospi, __rounding, cos_bit); + idct64_stage5_high48_avx2(x, cospi, _r, cos_bit); // stage 6 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); x[5] = x[4]; x[6] = x[7]; - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); - idct64_stage6_high48_avx2(x, cospi, __rounding, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit); + idct64_stage6_high48_avx2(x, cospi, _r, cos_bit); // stage 7 x[3] = x[0]; x[2] = x[1]; - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); - btf_16_adds_subs_avx2(x[8], x[11]); - btf_16_adds_subs_avx2(x[9], x[10]); - btf_16_subs_adds_avx2(x[15], x[12]); - btf_16_subs_adds_avx2(x[14], x[13]); - idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x[8], &x[11]); + btf_16_adds_subs_avx2(&x[9], &x[10]); + btf_16_adds_subs_avx2(&x[15], &x[12]); + btf_16_adds_subs_avx2(&x[14], &x[13]); + idct64_stage7_high48_avx2(x, cospi, _r, cos_bit); // stage 8 - btf_16_adds_subs_avx2(x[0], x[7]); - btf_16_adds_subs_avx2(x[1], x[6]); - btf_16_adds_subs_avx2(x[2], x[5]); - btf_16_adds_subs_avx2(x[3], x[4]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); - idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit); - - idct64_stage9_avx2(x, cospi, __rounding, cos_bit); - idct64_stage10_avx2(x, cospi, __rounding, cos_bit); + btf_16_adds_subs_avx2(&x[0], &x[7]); + btf_16_adds_subs_avx2(&x[1], &x[6]); + btf_16_adds_subs_avx2(&x[2], &x[5]); + btf_16_adds_subs_avx2(&x[3], &x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); + idct64_stage8_high48_avx2(x, cospi, _r, cos_bit); + + idct64_stage9_avx2(x, cospi, _r, cos_bit); + idct64_stage10_avx2(x, cospi, _r, cos_bit); idct64_stage11_avx2(output, x); } @@ -1444,7 +1426,7 @@ static void idct64_low32_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); @@ -1514,78 +1496,78 @@ static void idct64_low32_new_avx2(const __m256i *input, __m256i *output, btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]); btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]); btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); - btf_16_adds_subs_avx2(x[32], x[33]); - btf_16_subs_adds_avx2(x[35], x[34]); - btf_16_adds_subs_avx2(x[36], x[37]); - btf_16_subs_adds_avx2(x[39], x[38]); - btf_16_adds_subs_avx2(x[40], x[41]); - btf_16_subs_adds_avx2(x[43], x[42]); - btf_16_adds_subs_avx2(x[44], x[45]); - btf_16_subs_adds_avx2(x[47], x[46]); - btf_16_adds_subs_avx2(x[48], x[49]); - btf_16_subs_adds_avx2(x[51], x[50]); - btf_16_adds_subs_avx2(x[52], x[53]); - btf_16_subs_adds_avx2(x[55], x[54]); - btf_16_adds_subs_avx2(x[56], x[57]); - btf_16_subs_adds_avx2(x[59], x[58]); - btf_16_adds_subs_avx2(x[60], x[61]); - btf_16_subs_adds_avx2(x[63], x[62]); + btf_16_adds_subs_avx2(&x[32], &x[33]); + btf_16_adds_subs_avx2(&x[35], &x[34]); + btf_16_adds_subs_avx2(&x[36], &x[37]); + btf_16_adds_subs_avx2(&x[39], &x[38]); + btf_16_adds_subs_avx2(&x[40], &x[41]); + btf_16_adds_subs_avx2(&x[43], &x[42]); + btf_16_adds_subs_avx2(&x[44], &x[45]); + btf_16_adds_subs_avx2(&x[47], &x[46]); + btf_16_adds_subs_avx2(&x[48], &x[49]); + btf_16_adds_subs_avx2(&x[51], &x[50]); + btf_16_adds_subs_avx2(&x[52], &x[53]); + btf_16_adds_subs_avx2(&x[55], &x[54]); + btf_16_adds_subs_avx2(&x[56], &x[57]); + btf_16_adds_subs_avx2(&x[59], &x[58]); + btf_16_adds_subs_avx2(&x[60], &x[61]); + btf_16_adds_subs_avx2(&x[63], &x[62]); // stage 4 btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]); btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]); btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); - btf_16_adds_subs_avx2(x[16], x[17]); - btf_16_subs_adds_avx2(x[19], x[18]); - btf_16_adds_subs_avx2(x[20], x[21]); - btf_16_subs_adds_avx2(x[23], x[22]); - btf_16_adds_subs_avx2(x[24], x[25]); - btf_16_subs_adds_avx2(x[27], x[26]); - btf_16_adds_subs_avx2(x[28], x[29]); - btf_16_subs_adds_avx2(x[31], x[30]); - idct64_stage4_high32_avx2(x, cospi, __rounding, cos_bit); + btf_16_adds_subs_avx2(&x[16], &x[17]); + btf_16_adds_subs_avx2(&x[19], &x[18]); + btf_16_adds_subs_avx2(&x[20], &x[21]); + btf_16_adds_subs_avx2(&x[23], &x[22]); + btf_16_adds_subs_avx2(&x[24], &x[25]); + btf_16_adds_subs_avx2(&x[27], &x[26]); + btf_16_adds_subs_avx2(&x[28], &x[29]); + btf_16_adds_subs_avx2(&x[31], &x[30]); + idct64_stage4_high32_avx2(x, cospi, _r, cos_bit); // stage 5 btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]); - btf_16_adds_subs_avx2(x[8], x[9]); - btf_16_subs_adds_avx2(x[11], x[10]); - btf_16_adds_subs_avx2(x[12], x[13]); - btf_16_subs_adds_avx2(x[15], x[14]); - idct64_stage5_high48_avx2(x, cospi, __rounding, cos_bit); + btf_16_adds_subs_avx2(&x[8], &x[9]); + btf_16_adds_subs_avx2(&x[11], &x[10]); + btf_16_adds_subs_avx2(&x[12], &x[13]); + btf_16_adds_subs_avx2(&x[15], &x[14]); + idct64_stage5_high48_avx2(x, cospi, _r, cos_bit); // stage 6 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]); - btf_16_adds_subs_avx2(x[4], x[5]); - btf_16_subs_adds_avx2(x[7], x[6]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); - idct64_stage6_high48_avx2(x, cospi, __rounding, cos_bit); + btf_16_adds_subs_avx2(&x[4], &x[5]); + btf_16_adds_subs_avx2(&x[7], &x[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit); + idct64_stage6_high48_avx2(x, cospi, _r, cos_bit); // stage 7 - btf_16_adds_subs_avx2(x[0], x[3]); - btf_16_adds_subs_avx2(x[1], x[2]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); - btf_16_adds_subs_avx2(x[8], x[11]); - btf_16_adds_subs_avx2(x[9], x[10]); - btf_16_subs_adds_avx2(x[15], x[12]); - btf_16_subs_adds_avx2(x[14], x[13]); - idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit); + btf_16_adds_subs_avx2(&x[0], &x[3]); + btf_16_adds_subs_avx2(&x[1], &x[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x[8], &x[11]); + btf_16_adds_subs_avx2(&x[9], &x[10]); + btf_16_adds_subs_avx2(&x[15], &x[12]); + btf_16_adds_subs_avx2(&x[14], &x[13]); + idct64_stage7_high48_avx2(x, cospi, _r, cos_bit); // stage 8 - btf_16_adds_subs_avx2(x[0], x[7]); - btf_16_adds_subs_avx2(x[1], x[6]); - btf_16_adds_subs_avx2(x[2], x[5]); - btf_16_adds_subs_avx2(x[3], x[4]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); - idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit); + btf_16_adds_subs_avx2(&x[0], &x[7]); + btf_16_adds_subs_avx2(&x[1], &x[6]); + btf_16_adds_subs_avx2(&x[2], &x[5]); + btf_16_adds_subs_avx2(&x[3], &x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); + idct64_stage8_high48_avx2(x, cospi, _r, cos_bit); // stage 9~11 - idct64_stage9_avx2(x, cospi, __rounding, cos_bit); - idct64_stage10_avx2(x, cospi, __rounding, cos_bit); + idct64_stage9_avx2(x, cospi, _r, cos_bit); + idct64_stage10_avx2(x, cospi, _r, cos_bit); idct64_stage11_avx2(output, x); } @@ -1667,7 +1649,7 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2( if (lr_flip) { for (int j = 0; j < buf_size_w_div16; ++j) { __m256i temp[16]; - flip_buf_av2(buf0 + 16 * j, temp, 16); + flip_buf_avx2(buf0 + 16 * j, temp, 16); int offset = txfm_size_row * (buf_size_w_div16 - 1 - j); transpose_16bit_16x16_avx2(temp, buf1_cur + offset); } @@ -1693,18 +1675,18 @@ static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input, int txw_idx, int rect_type) { const int32_t *input_row = input; const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]); - const __m256i rounding = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) + - (1 << (NewSqrt2Bits - shift - 1))); + const __m256i _r = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) + + (1 << (NewSqrt2Bits - shift - 1))); const __m256i one = _mm256_set1_epi16(1); - const __m256i scale_rounding = _mm256_unpacklo_epi16(scale, rounding); + const __m256i scale__r = _mm256_unpacklo_epi16(scale, _r); if (rect_type != 1 && rect_type != -1) { for (int i = 0; i < height; ++i) { const __m256i src = load_32bit_to_16bit_w16_avx2(input_row); input_row += stride; __m256i lo = _mm256_unpacklo_epi16(src, one); __m256i hi = _mm256_unpackhi_epi16(src, one); - lo = _mm256_madd_epi16(lo, scale_rounding); - hi = _mm256_madd_epi16(hi, scale_rounding); + lo = _mm256_madd_epi16(lo, scale__r); + hi = _mm256_madd_epi16(hi, scale__r); lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift); hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift); out[i] = _mm256_packs_epi32(lo, hi); @@ -1718,8 +1700,8 @@ static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input, input_row += stride; __m256i lo = _mm256_unpacklo_epi16(src, one); __m256i hi = _mm256_unpackhi_epi16(src, one); - lo = _mm256_madd_epi16(lo, scale_rounding); - hi = _mm256_madd_epi16(hi, scale_rounding); + lo = _mm256_madd_epi16(lo, scale__r); + hi = _mm256_madd_epi16(hi, scale__r); lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift); hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift); out[i] = _mm256_packs_epi32(lo, hi); @@ -1731,10 +1713,10 @@ static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride, __m256i *buf, int shift, int height, int txh_idx) { const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]); - const __m256i scale_rounding = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1)); - const __m256i shift_rounding = _mm256_set1_epi32(1 << (-shift - 1)); + const __m256i scale__r = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1)); + const __m256i shift__r = _mm256_set1_epi32(1 << (-shift - 1)); const __m256i one = _mm256_set1_epi16(1); - const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale_rounding); + const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale__r); for (int h = 0; h < height; ++h) { __m256i lo = _mm256_unpacklo_epi16(buf[h], one); __m256i hi = _mm256_unpackhi_epi16(buf[h], one); @@ -1742,8 +1724,8 @@ static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride, hi = _mm256_madd_epi16(hi, scale_coeff); lo = _mm256_srai_epi32(lo, NewSqrt2Bits); hi = _mm256_srai_epi32(hi, NewSqrt2Bits); - lo = _mm256_add_epi32(lo, shift_rounding); - hi = _mm256_add_epi32(hi, shift_rounding); + lo = _mm256_add_epi32(lo, shift__r); + hi = _mm256_add_epi32(hi, shift__r); lo = _mm256_srai_epi32(lo, -shift); hi = _mm256_srai_epi32(hi, -shift); const __m256i x = _mm256_packs_epi32(lo, hi); @@ -1856,7 +1838,7 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2( if (lr_flip) { for (int j = 0; j < buf_size_w_div16; ++j) { __m256i temp[16]; - flip_buf_av2(buf0 + 16 * j, temp, 16); + flip_buf_avx2(buf0 + 16 * j, temp, 16); transpose_16bit_16x16_avx2(temp, _buf1 + 16 * (buf_size_w_div16 - 1 - j)); } diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h index c17f655c5..7b5b29cf8 100644 --- a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h +++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h @@ -19,37 +19,12 @@ #include "aom/aom_integer.h" #include "aom_dsp/x86/transpose_sse2.h" #include "aom_dsp/x86/txfm_common_sse2.h" +#include "aom_dsp/x86/txfm_common_avx2.h" #ifdef __cplusplus extern "C" { #endif -#define pair_set_w16_epi16(a, b) \ - _mm256_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))) - -#define btf_16_w16_avx2(w0, w1, in0, in1, out0, out1) \ - { \ - __m256i t0 = _mm256_unpacklo_epi16(in0, in1); \ - __m256i t1 = _mm256_unpackhi_epi16(in0, in1); \ - __m256i u0 = _mm256_madd_epi16(t0, w0); \ - __m256i u1 = _mm256_madd_epi16(t1, w0); \ - __m256i v0 = _mm256_madd_epi16(t0, w1); \ - __m256i v1 = _mm256_madd_epi16(t1, w1); \ - \ - __m256i a0 = _mm256_add_epi32(u0, __rounding); \ - __m256i a1 = _mm256_add_epi32(u1, __rounding); \ - __m256i b0 = _mm256_add_epi32(v0, __rounding); \ - __m256i b1 = _mm256_add_epi32(v1, __rounding); \ - \ - __m256i c0 = _mm256_srai_epi32(a0, cos_bit); \ - __m256i c1 = _mm256_srai_epi32(a1, cos_bit); \ - __m256i d0 = _mm256_srai_epi32(b0, cos_bit); \ - __m256i d1 = _mm256_srai_epi32(b1, cos_bit); \ - \ - out0 = _mm256_packs_epi32(c0, c1); \ - out1 = _mm256_packs_epi32(d0, d1); \ - } - // half input is zero #define btf_16_w16_0_avx2(w0, w1, in, out0, out1) \ { \ @@ -60,111 +35,6 @@ extern "C" { out1 = _mm256_mulhrs_epi16(_in, _w1); \ } -#define btf_16_adds_subs_avx2(in0, in1) \ - { \ - const __m256i _in0 = in0; \ - const __m256i _in1 = in1; \ - in0 = _mm256_adds_epi16(_in0, _in1); \ - in1 = _mm256_subs_epi16(_in0, _in1); \ - } - -#define btf_16_subs_adds_avx2(in0, in1) \ - { \ - const __m256i _in0 = in0; \ - const __m256i _in1 = in1; \ - in1 = _mm256_subs_epi16(_in0, _in1); \ - in0 = _mm256_adds_epi16(_in0, _in1); \ - } - -#define btf_16_adds_subs_out_avx2(out0, out1, in0, in1) \ - { \ - const __m256i _in0 = in0; \ - const __m256i _in1 = in1; \ - out0 = _mm256_adds_epi16(_in0, _in1); \ - out1 = _mm256_subs_epi16(_in0, _in1); \ - } - -static INLINE __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) { - const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a); - const __m256i b = _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8)); - return _mm256_permute4x64_epi64(b, 0xD8); -} - -static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in, - int stride, __m256i *out, - int out_size) { - for (int i = 0; i < out_size; ++i) { - out[i] = load_32bit_to_16bit_w16_avx2(in + i * stride); - } -} - -static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in, - __m256i *const out) { - // Unpack 16 bit elements. Goes from: - // in[0]: 00 01 02 03 08 09 0a 0b 04 05 06 07 0c 0d 0e 0f - // in[1]: 10 11 12 13 18 19 1a 1b 14 15 16 17 1c 1d 1e 1f - // in[2]: 20 21 22 23 28 29 2a 2b 24 25 26 27 2c 2d 2e 2f - // in[3]: 30 31 32 33 38 39 3a 3b 34 35 36 37 3c 3d 3e 3f - // in[4]: 40 41 42 43 48 49 4a 4b 44 45 46 47 4c 4d 4e 4f - // in[5]: 50 51 52 53 58 59 5a 5b 54 55 56 57 5c 5d 5e 5f - // in[6]: 60 61 62 63 68 69 6a 6b 64 65 66 67 6c 6d 6e 6f - // in[7]: 70 71 72 73 78 79 7a 7b 74 75 76 77 7c 7d 7e 7f - // in[8]: 80 81 82 83 88 89 8a 8b 84 85 86 87 8c 8d 8e 8f - // to: - // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - // ... - __m256i a[16]; - for (int i = 0; i < 16; i += 2) { - a[i / 2 + 0] = _mm256_unpacklo_epi16(in[i], in[i + 1]); - a[i / 2 + 8] = _mm256_unpackhi_epi16(in[i], in[i + 1]); - } - __m256i b[16]; - for (int i = 0; i < 16; i += 2) { - b[i / 2 + 0] = _mm256_unpacklo_epi32(a[i], a[i + 1]); - b[i / 2 + 8] = _mm256_unpackhi_epi32(a[i], a[i + 1]); - } - __m256i c[16]; - for (int i = 0; i < 16; i += 2) { - c[i / 2 + 0] = _mm256_unpacklo_epi64(b[i], b[i + 1]); - c[i / 2 + 8] = _mm256_unpackhi_epi64(b[i], b[i + 1]); - } - out[0 + 0] = _mm256_permute2x128_si256(c[0], c[1], 0x20); - out[1 + 0] = _mm256_permute2x128_si256(c[8], c[9], 0x20); - out[2 + 0] = _mm256_permute2x128_si256(c[4], c[5], 0x20); - out[3 + 0] = _mm256_permute2x128_si256(c[12], c[13], 0x20); - - out[0 + 8] = _mm256_permute2x128_si256(c[0], c[1], 0x31); - out[1 + 8] = _mm256_permute2x128_si256(c[8], c[9], 0x31); - out[2 + 8] = _mm256_permute2x128_si256(c[4], c[5], 0x31); - out[3 + 8] = _mm256_permute2x128_si256(c[12], c[13], 0x31); - - out[4 + 0] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x20); - out[5 + 0] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x20); - out[6 + 0] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x20); - out[7 + 0] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x20); - - out[4 + 8] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x31); - out[5 + 8] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x31); - out[6 + 8] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x31); - out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31); -} - -static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) { - if (bit < 0) { - __m256i scale = _mm256_set1_epi16(1 << (bit + 15)); - for (int i = 0; i < size; ++i) { - in[i] = _mm256_mulhrs_epi16(in[i], scale); - } - } else if (bit > 0) { - for (int i = 0; i < size; ++i) { - in[i] = _mm256_slli_epi16(in[i], bit); - } - } -} - static INLINE void round_shift_avx2(const __m256i *input, __m256i *output, int size) { const __m256i scale = _mm256_set1_epi16(NewInvSqrt2 * 8); @@ -173,12 +43,6 @@ static INLINE void round_shift_avx2(const __m256i *input, __m256i *output, } } -static INLINE void flip_buf_av2(__m256i *in, __m256i *out, int size) { - for (int i = 0; i < size; ++i) { - out[size - i - 1] = in[i]; - } -} - static INLINE void write_recon_w16_avx2(__m256i res, uint8_t *output) { __m128i pred = _mm_loadu_si128((__m128i const *)(output)); __m256i u = _mm256_adds_epi16(_mm256_cvtepu8_epi16(pred), res); @@ -197,9 +61,6 @@ static INLINE void lowbd_write_buffer_16xn_avx2(__m256i *in, uint8_t *output, } } -typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output, - int8_t cos_bit); - void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob); diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.c b/third_party/aom/av1/common/x86/av1_txfm_sse4.c index cccc62f03..90b9879cc 100644 --- a/third_party/aom/av1/common/x86/av1_txfm_sse4.c +++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.c @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + #include "config/aom_dsp_rtcd.h" #include "av1/common/av1_txfm.h" diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.h b/third_party/aom/av1/common/x86/av1_txfm_sse4.h index faf7251fa..367e02096 100644 --- a/third_party/aom/av1/common/x86/av1_txfm_sse4.h +++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + #ifndef AV1_TXFM_SSE4_H_ #define AV1_TXFM_SSE4_H_ diff --git a/third_party/aom/av1/common/x86/convolve_2d_avx2.c b/third_party/aom/av1/common/x86/convolve_2d_avx2.c index fd5e90a2e..1099144fe 100644 --- a/third_party/aom/av1/common/x86/convolve_2d_avx2.c +++ b/third_party/aom/av1/common/x86/convolve_2d_avx2.c @@ -24,8 +24,8 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { const int bd = 8; @@ -46,10 +46,10 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, assert(conv_params->round_0 > 0); - filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); - filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); - filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); - filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); + filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_h); prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_v); @@ -180,8 +180,8 @@ static INLINE void copy_128(const uint8_t *src, uint8_t *dst) { void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { (void)filter_params_x; diff --git a/third_party/aom/av1/common/x86/convolve_2d_sse2.c b/third_party/aom/av1/common/x86/convolve_2d_sse2.c index fc0e65453..637f83cf7 100644 --- a/third_party/aom/av1/common/x86/convolve_2d_sse2.c +++ b/third_party/aom/av1/common/x86/convolve_2d_sse2.c @@ -21,8 +21,8 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { const int bd = 8; @@ -46,7 +46,7 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, /* Horizontal filter */ { const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_q4 & SUBPEL_MASK); const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); // coeffs 0 1 0 1 2 3 2 3 @@ -112,7 +112,7 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, /* Vertical filter */ { const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_q4 & SUBPEL_MASK); const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); // coeffs 0 1 0 1 2 3 2 3 @@ -239,8 +239,8 @@ static INLINE void copy_128(const uint8_t *src, uint8_t *dst) { void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { (void)filter_params_x; @@ -357,8 +357,8 @@ void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { const int bd = 8; diff --git a/third_party/aom/av1/common/x86/convolve_avx2.c b/third_party/aom/av1/common/x86/convolve_avx2.c index 6fdfb0954..0e91ea947 100644 --- a/third_party/aom/av1/common/x86/convolve_avx2.c +++ b/third_party/aom/av1/common/x86/convolve_avx2.c @@ -19,8 +19,8 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { int i, j; @@ -176,8 +176,8 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { int i, j; @@ -187,10 +187,10 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, __m256i filt[4], coeffs[4]; - filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); - filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); - filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); - filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); + filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs); diff --git a/third_party/aom/av1/common/x86/convolve_sse2.c b/third_party/aom/av1/common/x86/convolve_sse2.c index 18fe9ae5a..f66dee37d 100644 --- a/third_party/aom/av1/common/x86/convolve_sse2.c +++ b/third_party/aom/av1/common/x86/convolve_sse2.c @@ -23,7 +23,7 @@ static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, const int subpel_q4, __m128i *const coeffs /* [4] */) { const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params, subpel_q4 & SUBPEL_MASK); + filter_params, subpel_q4 & SUBPEL_MASK); const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); // coeffs 0 1 0 1 2 3 2 3 const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); @@ -78,8 +78,8 @@ static INLINE __m128i convolve_hi_y(const __m128i *const s, void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { const int fo_vert = filter_params_y->taps / 2 - 1; @@ -239,8 +239,8 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { const int fo_horiz = filter_params_x->taps / 2 - 1; diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c index a34c618d0..8444ffa93 100644 --- a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c +++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c @@ -23,8 +23,8 @@ void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { @@ -222,8 +222,8 @@ static INLINE void copy_128(const uint16_t *src, uint16_t *dst) { void av1_highbd_convolve_2d_copy_sr_avx2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, const int subpel_x_q4, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { (void)filter_params_x; (void)filter_params_y; diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c index bdf813fa0..15f8872c1 100644 --- a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c +++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c @@ -73,8 +73,8 @@ static INLINE void copy_128(const uint16_t *src, uint16_t *dst) { void av1_highbd_convolve_2d_copy_sr_sse2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, const int subpel_x_q4, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { (void)filter_params_x; (void)filter_params_y; diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c index 5d2fc465e..eb340523a 100644 --- a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c +++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c @@ -24,8 +24,8 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, const int subpel_x_q4, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; @@ -169,8 +169,8 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1( void av1_highbd_jnt_convolve_2d_sse4_1( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, const int subpel_x_q4, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); @@ -207,7 +207,7 @@ void av1_highbd_jnt_convolve_2d_sse4_1( /* Horizontal filter */ { const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_q4 & SUBPEL_MASK); const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); // coeffs 0 1 0 1 2 3 2 3 @@ -274,7 +274,7 @@ void av1_highbd_jnt_convolve_2d_sse4_1( /* Vertical filter */ { const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_q4 & SUBPEL_MASK); const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); // coeffs 0 1 0 1 2 3 2 3 diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c index a9cf6a4d6..33183fdee 100644 --- a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c +++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c @@ -20,13 +20,11 @@ #include "aom_dsp/x86/convolve_sse2.h" #include "av1/common/convolve.h" -void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, - uint16_t *dst, int dst_stride, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_q4, - const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { +void av1_highbd_convolve_2d_sr_ssse3( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, + const int subpel_y_q4, ConvolveParams *conv_params, int bd) { DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); int im_h = h + filter_params_y->taps - 1; int im_stride = 8; diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c index 89d0ecb1e..608bd88a4 100644 --- a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c +++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c @@ -25,8 +25,8 @@ void av1_highbd_jnt_convolve_2d_copy_avx2( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, const int subpel_x_q4, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; @@ -224,13 +224,11 @@ void av1_highbd_jnt_convolve_2d_copy_avx2( } } -void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride, - uint16_t *dst0, int dst_stride0, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_q4, - const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { +void av1_highbd_jnt_convolve_2d_avx2( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, + const int subpel_y_q4, ConvolveParams *conv_params, int bd) { DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; @@ -459,13 +457,11 @@ void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride, } } -void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, - uint16_t *dst0, int dst_stride0, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_q4, - const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { +void av1_highbd_jnt_convolve_x_avx2( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, + const int subpel_y_q4, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int fo_horiz = filter_params_x->taps / 2 - 1; @@ -628,13 +624,11 @@ void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, } } -void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride, - uint16_t *dst0, int dst_stride0, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_q4, - const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { +void av1_highbd_jnt_convolve_y_avx2( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, + const int subpel_y_q4, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int fo_vert = filter_params_y->taps / 2 - 1; diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c index ccca6b07a..1a29985b5 100644 --- a/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c +++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c @@ -19,8 +19,8 @@ void av1_highbd_jnt_convolve_y_sse4_1( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, const int subpel_x_q4, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; @@ -259,8 +259,8 @@ void av1_highbd_jnt_convolve_y_sse4_1( void av1_highbd_jnt_convolve_x_sse4_1( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, const int subpel_x_q4, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; diff --git a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c index ac1d2c9ca..d1ea26290 100644 --- a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c +++ b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c @@ -23,8 +23,8 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; @@ -53,10 +53,10 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, assert(bits >= 0); assert(conv_params->round_0 > 0); - filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); - filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); - filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); - filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); + filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs); @@ -126,8 +126,8 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; @@ -389,8 +389,8 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; @@ -422,10 +422,10 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, assert(conv_params->round_0 > 0); - filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); - filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); - filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); - filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); + filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_x); prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); @@ -581,8 +581,8 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { const int bd = 8; diff --git a/third_party/aom/av1/common/x86/jnt_convolve_sse2.c b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c index 4df7bd42e..87dc3242e 100644 --- a/third_party/aom/av1/common/x86/jnt_convolve_sse2.c +++ b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c @@ -18,8 +18,8 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { const int bd = 8; @@ -152,8 +152,8 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { const int bd = 8; diff --git a/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c index e4d51ac8d..822772782 100644 --- a/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c +++ b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c @@ -18,8 +18,8 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; @@ -56,7 +56,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, /* Horizontal filter */ { const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_q4 & SUBPEL_MASK); const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); // coeffs 0 1 0 1 2 3 2 3 @@ -124,7 +124,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, /* Vertical filter */ { const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_q4 & SUBPEL_MASK); const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); // coeffs 0 1 0 1 2 3 2 3 diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c index a42c94028..c64150b9d 100644 --- a/third_party/aom/av1/common/x86/selfguided_sse4.c +++ b/third_party/aom/av1/common/x86/selfguided_sse4.c @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + #include <smmintrin.h> #include "config/aom_config.h" diff --git a/third_party/aom/av1/decoder/decodeframe.c b/third_party/aom/av1/decoder/decodeframe.c index e92c6b28c..6dbc4f3eb 100644 --- a/third_party/aom/av1/decoder/decodeframe.c +++ b/third_party/aom/av1/decoder/decodeframe.c @@ -84,15 +84,15 @@ int av1_check_trailing_bits(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) { } // Use only_chroma = 1 to only set the chroma planes -static void set_planes_to_neutral_grey(AV1_COMMON *const cm, +static void set_planes_to_neutral_grey(const SequenceHeader *const seq_params, const YV12_BUFFER_CONFIG *const buf, int only_chroma) { - const int val = 1 << (cm->bit_depth - 1); + const int val = 1 << (seq_params->bit_depth - 1); for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) { const int is_uv = plane > 0; for (int row_idx = 0; row_idx < buf->crop_heights[is_uv]; row_idx++) { - if (cm->use_highbitdepth) { + if (seq_params->use_highbitdepth) { // TODO(yaowu): replace this with aom_memset16() for speed for (int col_idx = 0; col_idx < buf->crop_widths[is_uv]; col_idx++) { uint16_t *base = CONVERT_TO_SHORTPTR(buf->buffers[plane]); @@ -157,16 +157,18 @@ static void inverse_transform_block(MACROBLOCKD *xd, int plane, memset(dqcoeff, 0, (scan_line + 1) * sizeof(dqcoeff[0])); } -static void read_coeffs_tx_intra_block(AV1_COMMON *cm, MACROBLOCKD *const xd, - aom_reader *const r, int plane, int row, - int col, TX_SIZE tx_size) { +static void read_coeffs_tx_intra_block(const AV1_COMMON *const cm, + MACROBLOCKD *const xd, + aom_reader *const r, const int plane, + const int row, const int col, + const TX_SIZE tx_size) { MB_MODE_INFO *mbmi = xd->mi[0]; if (!mbmi->skip) { #if TXCOEFF_TIMER struct aom_usec_timer timer; aom_usec_timer_start(&timer); #endif - av1_read_coeffs_txb_facade(cm, xd, r, row, col, plane, tx_size); + av1_read_coeffs_txb_facade(cm, xd, r, plane, row, col, tx_size); #if TXCOEFF_TIMER aom_usec_timer_mark(&timer); const int64_t elapsed_time = aom_usec_timer_elapsed(&timer); @@ -176,11 +178,38 @@ static void read_coeffs_tx_intra_block(AV1_COMMON *cm, MACROBLOCKD *const xd, } } -static void predict_and_reconstruct_intra_block(AV1_COMMON *cm, - MACROBLOCKD *const xd, - aom_reader *const r, int plane, - int row, int col, - TX_SIZE tx_size) { +static void decode_block_void(const AV1_COMMON *const cm, MACROBLOCKD *const xd, + aom_reader *const r, const int plane, + const int row, const int col, + const TX_SIZE tx_size) { + (void)cm; + (void)xd; + (void)r; + (void)plane; + (void)row; + (void)col; + (void)tx_size; +} + +static void predict_inter_block_void(AV1_COMMON *const cm, + MACROBLOCKD *const xd, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + (void)cm; + (void)xd; + (void)mi_row; + (void)mi_col; + (void)bsize; +} + +static void cfl_store_inter_block_void(AV1_COMMON *const cm, + MACROBLOCKD *const xd) { + (void)cm; + (void)xd; +} + +static void predict_and_reconstruct_intra_block( + const AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_reader *const r, + const int plane, const int row, const int col, const TX_SIZE tx_size) { (void)r; MB_MODE_INFO *mbmi = xd->mi[0]; PLANE_TYPE plane_type = get_plane_type(plane); @@ -208,28 +237,33 @@ static void predict_and_reconstruct_intra_block(AV1_COMMON *cm, static void inverse_transform_inter_block(const AV1_COMMON *const cm, MACROBLOCKD *const xd, - aom_reader *const r, + aom_reader *const r, const int plane, const int blk_row, const int blk_col, - const int plane, const TX_SIZE tx_size) { (void)r; PLANE_TYPE plane_type = get_plane_type(plane); const struct macroblockd_plane *const pd = &xd->plane[plane]; - MB_MODE_INFO *mbmi = xd->mi[0]; // tx_type will be read out in av1_read_coeffs_txb_facade const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, tx_size, cm->reduced_tx_set_used); - if (plane == 0) - update_txk_array(mbmi->txk_type, mbmi->sb_type, blk_row, blk_col, tx_size, - tx_type); - uint8_t *dst = &pd->dst .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]]; inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride, cm->reduced_tx_set_used); +#if CONFIG_MISMATCH_DEBUG + int pixel_c, pixel_r; + BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; + int blk_w = block_size_wide[bsize]; + int blk_h = block_size_high[bsize]; + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, blk_col, blk_row, + pd->subsampling_x, pd->subsampling_y); + mismatch_check_block_tx(dst, pd->dst.stride, cm->frame_offset, plane, pixel_c, + pixel_r, blk_w, blk_h, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); +#endif } static void set_cb_buffer_offsets(MACROBLOCKD *const xd, TX_SIZE tx_size, @@ -239,11 +273,12 @@ static void set_cb_buffer_offsets(MACROBLOCKD *const xd, TX_SIZE tx_size, xd->cb_offset[plane] / (TX_SIZE_W_MIN * TX_SIZE_H_MIN); } -static void decode_reconstruct_tx(AV1_COMMON *cm, MACROBLOCKD *const xd, +static void decode_reconstruct_tx(AV1_COMMON *cm, ThreadData *const td, aom_reader *r, MB_MODE_INFO *const mbmi, int plane, BLOCK_SIZE plane_bsize, int blk_row, int blk_col, int block, TX_SIZE tx_size, int *eob_total) { + MACROBLOCKD *const xd = &td->xd; const struct macroblockd_plane *const pd = &xd->plane[plane]; const TX_SIZE plane_tx_size = plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x, @@ -257,30 +292,11 @@ static void decode_reconstruct_tx(AV1_COMMON *cm, MACROBLOCKD *const xd, if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; if (tx_size == plane_tx_size || plane) { -#if TXCOEFF_TIMER - struct aom_usec_timer timer; - aom_usec_timer_start(&timer); -#endif - av1_read_coeffs_txb_facade(cm, xd, r, blk_row, blk_col, plane, tx_size); -#if TXCOEFF_TIMER - aom_usec_timer_mark(&timer); - const int64_t elapsed_time = aom_usec_timer_elapsed(&timer); - cm->txcoeff_timer += elapsed_time; - ++cm->txb_count; -#endif - inverse_transform_inter_block(cm, xd, r, blk_row, blk_col, plane, tx_size); + td->read_coeffs_tx_inter_block_visit(cm, xd, r, plane, blk_row, blk_col, + tx_size); -#if CONFIG_MISMATCH_DEBUG - int pixel_c, pixel_r; - BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; - int blk_w = block_size_wide[bsize]; - int blk_h = block_size_high[bsize]; - mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, blk_col, blk_row, - pd->subsampling_x, pd->subsampling_y); - mismatch_check_block_tx(dst, pd->dst.stride, cm->frame_offset, plane, - pixel_c, pixel_r, blk_w, blk_h, - xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); -#endif + td->inverse_tx_inter_block_visit(cm, xd, r, plane, blk_row, blk_col, + tx_size); eob_info *eob_data = pd->eob_data + xd->txb_offset[plane]; *eob_total += eob_data->eob; set_cb_buffer_offsets(xd, tx_size, plane); @@ -301,7 +317,7 @@ static void decode_reconstruct_tx(AV1_COMMON *cm, MACROBLOCKD *const xd, if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; - decode_reconstruct_tx(cm, xd, r, mbmi, plane, plane_bsize, offsetr, + decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize, offsetr, offsetc, block, sub_txs, eob_total); block += sub_step; } @@ -352,6 +368,7 @@ static void decode_mbmi_block(AV1Decoder *const pbi, MACROBLOCKD *const xd, int mi_row, int mi_col, aom_reader *r, PARTITION_TYPE partition, BLOCK_SIZE bsize) { AV1_COMMON *const cm = &pbi->common; + const SequenceHeader *const seq_params = &cm->seq_params; const int bw = mi_size_wide[bsize]; const int bh = mi_size_high[bsize]; const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col); @@ -363,9 +380,11 @@ static void decode_mbmi_block(AV1Decoder *const pbi, MACROBLOCKD *const xd, set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis); xd->mi[0]->partition = partition; av1_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis); - if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) { + if (bsize >= BLOCK_8X8 && + (seq_params->subsampling_x || seq_params->subsampling_y)) { const BLOCK_SIZE uv_subsize = - ss_size_lookup[bsize][cm->subsampling_x][cm->subsampling_y]; + ss_size_lookup[bsize][seq_params->subsampling_x] + [seq_params->subsampling_y]; if (uv_subsize == BLOCK_INVALID) aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, "Invalid block size."); @@ -843,8 +862,8 @@ static void dec_build_inter_predictors_sby(const AV1_COMMON *cm, BUFFER_SET default_ctx = { { xd->plane[0].dst.buf, NULL, NULL }, { xd->plane[0].dst.stride, 0, 0 } }; if (!ctx) ctx = &default_ctx; - av1_build_interintra_predictors_sby(cm, xd, xd->plane[0].dst.buf, - xd->plane[0].dst.stride, ctx, bsize); + av1_build_interintra_predictors_sbp(cm, xd, xd->plane[0].dst.buf, + xd->plane[0].dst.stride, ctx, 0, bsize); } } @@ -1052,6 +1071,20 @@ static void predict_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd, dec_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); if (mbmi->motion_mode == OBMC_CAUSAL) dec_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col); +#if CONFIG_MISMATCH_DEBUG + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + int pixel_c, pixel_r; + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, pd->subsampling_x, + pd->subsampling_y); + if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, + pd->subsampling_y)) + continue; + mismatch_check_block_pre(pd->dst.buf, pd->dst.stride, cm->frame_offset, + plane, pixel_c, pixel_r, pd->width, pd->height, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } +#endif } static void set_color_index_map_offset(MACROBLOCKD *const xd, int plane, @@ -1064,42 +1097,19 @@ static void set_color_index_map_offset(MACROBLOCKD *const xd, int plane, xd->color_index_map_offset[plane] += params.plane_width * params.plane_height; } -static void decode_token_and_recon_block(AV1Decoder *const pbi, - MACROBLOCKD *const xd, int mi_row, - int mi_col, aom_reader *r, - BLOCK_SIZE bsize) { +static void decode_token_recon_block(AV1Decoder *const pbi, + ThreadData *const td, int mi_row, + int mi_col, aom_reader *r, + BLOCK_SIZE bsize) { AV1_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &td->xd; const int num_planes = av1_num_planes(cm); - const int bw = mi_size_wide[bsize]; - const int bh = mi_size_high[bsize]; - const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col); - const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row); - set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis); MB_MODE_INFO *mbmi = xd->mi[0]; CFL_CTX *const cfl = &xd->cfl; cfl->is_chroma_reference = is_chroma_reference( mi_row, mi_col, bsize, cfl->subsampling_x, cfl->subsampling_y); - if (cm->delta_q_present_flag) { - for (int i = 0; i < MAX_SEGMENTS; i++) { - const int current_qindex = - av1_get_qindex(&cm->seg, i, xd->current_qindex); - for (int j = 0; j < num_planes; ++j) { - const int dc_delta_q = - j == 0 ? cm->y_dc_delta_q - : (j == 1 ? cm->u_dc_delta_q : cm->v_dc_delta_q); - const int ac_delta_q = - j == 0 ? 0 : (j == 1 ? cm->u_ac_delta_q : cm->v_ac_delta_q); - xd->plane[j].seg_dequant_QTX[i][0] = - av1_dc_quant_QTX(current_qindex, dc_delta_q, cm->bit_depth); - xd->plane[j].seg_dequant_QTX[i][1] = - av1_ac_quant_QTX(current_qindex, ac_delta_q, cm->bit_depth); - } - } - } - if (mbmi->skip) av1_reset_skip_context(xd, mi_row, mi_col, bsize, num_planes); - if (!is_inter_block(mbmi)) { int row, col; assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x, @@ -1135,10 +1145,10 @@ static void decode_token_and_recon_block(AV1Decoder *const pbi, blk_row += stepr) { for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width; blk_col += stepc) { - read_coeffs_tx_intra_block(cm, xd, r, plane, blk_row, blk_col, - tx_size); - predict_and_reconstruct_intra_block(cm, xd, r, plane, blk_row, - blk_col, tx_size); + td->read_coeffs_tx_intra_block_visit(cm, xd, r, plane, blk_row, + blk_col, tx_size); + td->predict_and_recon_intra_block_visit(cm, xd, r, plane, blk_row, + blk_col, tx_size); set_cb_buffer_offsets(xd, tx_size, plane); } } @@ -1146,22 +1156,7 @@ static void decode_token_and_recon_block(AV1Decoder *const pbi, } } } else { - predict_inter_block(cm, xd, mi_row, mi_col, bsize); -#if CONFIG_MISMATCH_DEBUG - for (int plane = 0; plane < num_planes; ++plane) { - const struct macroblockd_plane *pd = &xd->plane[plane]; - int pixel_c, pixel_r; - mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, - pd->subsampling_x, pd->subsampling_y); - if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, - pd->subsampling_y)) - continue; - mismatch_check_block_pre(pd->dst.buf, pd->dst.stride, cm->frame_offset, - plane, pixel_c, pixel_r, pd->width, pd->height, - xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); - } -#endif - + td->predict_inter_block_visit(cm, xd, mi_row, mi_col, bsize); // Reconstruction if (!mbmi->skip) { int eobtotal = 0; @@ -1213,7 +1208,7 @@ static void decode_token_and_recon_block(AV1Decoder *const pbi, blk_row += bh_var_tx) { for (blk_col = col >> pd->subsampling_x; blk_col < unit_width; blk_col += bw_var_tx) { - decode_reconstruct_tx(cm, xd, r, mbmi, plane, plane_bsize, + decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize, blk_row, blk_col, block, max_tx_size, &eobtotal); block += step; @@ -1223,14 +1218,11 @@ static void decode_token_and_recon_block(AV1Decoder *const pbi, } } } - cfl_store_inter_block(cm, xd); + td->cfl_store_inter_block_visit(cm, xd); } av1_visit_palette(pbi, xd, mi_row, mi_col, r, bsize, set_color_index_map_offset); - - int reader_corrupted_flag = aom_reader_has_error(r); - aom_merge_corrupted_flag(&xd->corrupted, reader_corrupted_flag); } static void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi, @@ -1338,15 +1330,17 @@ static TX_SIZE read_tx_size(AV1_COMMON *cm, MACROBLOCKD *xd, int is_inter, } } -static void decode_block(AV1Decoder *const pbi, MACROBLOCKD *const xd, - int mi_row, int mi_col, aom_reader *r, - PARTITION_TYPE partition, BLOCK_SIZE bsize) { +static void parse_decode_block(AV1Decoder *const pbi, ThreadData *const td, + int mi_row, int mi_col, aom_reader *r, + PARTITION_TYPE partition, BLOCK_SIZE bsize) { + MACROBLOCKD *const xd = &td->xd; decode_mbmi_block(pbi, xd, mi_row, mi_col, r, partition, bsize); av1_visit_palette(pbi, xd, mi_row, mi_col, r, bsize, av1_decode_palette_tokens); AV1_COMMON *cm = &pbi->common; + const int num_planes = av1_num_planes(cm); MB_MODE_INFO *mbmi = xd->mi[0]; int inter_block_tx = is_inter_block(mbmi) || is_intrabc_block(mbmi); if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) && @@ -1368,7 +1362,63 @@ static void decode_block(AV1Decoder *const pbi, MACROBLOCKD *const xd, mbmi->skip && is_inter_block(mbmi), xd); } - decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, bsize); + if (cm->delta_q_present_flag) { + for (int i = 0; i < MAX_SEGMENTS; i++) { + const int current_qindex = + av1_get_qindex(&cm->seg, i, xd->current_qindex); + for (int j = 0; j < num_planes; ++j) { + const int dc_delta_q = + j == 0 ? cm->y_dc_delta_q + : (j == 1 ? cm->u_dc_delta_q : cm->v_dc_delta_q); + const int ac_delta_q = + j == 0 ? 0 : (j == 1 ? cm->u_ac_delta_q : cm->v_ac_delta_q); + xd->plane[j].seg_dequant_QTX[i][0] = av1_dc_quant_QTX( + current_qindex, dc_delta_q, cm->seq_params.bit_depth); + xd->plane[j].seg_dequant_QTX[i][1] = av1_ac_quant_QTX( + current_qindex, ac_delta_q, cm->seq_params.bit_depth); + } + } + } + if (mbmi->skip) av1_reset_skip_context(xd, mi_row, mi_col, bsize, num_planes); + + decode_token_recon_block(pbi, td, mi_row, mi_col, r, bsize); + + int reader_corrupted_flag = aom_reader_has_error(r); + aom_merge_corrupted_flag(&xd->corrupted, reader_corrupted_flag); +} + +static void set_offsets_for_pred_and_recon(AV1Decoder *const pbi, + ThreadData *const td, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + AV1_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &td->xd; + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + const int num_planes = av1_num_planes(cm); + + const int offset = mi_row * cm->mi_stride + mi_col; + const TileInfo *const tile = &xd->tile; + + xd->mi = cm->mi_grid_visible + offset; + xd->cfl.mi_row = mi_row; + xd->cfl.mi_col = mi_col; + + set_plane_n4(xd, bw, bh, num_planes); + + // Distance of Mb to the various image edges. These are specified to 8th pel + // as they are always compared to values that are in 1/8th pel units + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); + + av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row, + mi_col, 0, num_planes); +} + +static void decode_block(AV1Decoder *const pbi, ThreadData *const td, + int mi_row, int mi_col, aom_reader *r, + PARTITION_TYPE partition, BLOCK_SIZE bsize) { + (void)partition; + set_offsets_for_pred_and_recon(pbi, td, mi_row, mi_col, bsize); + decode_token_recon_block(pbi, td, mi_row, mi_col, r, bsize); } static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col, @@ -1401,10 +1451,11 @@ static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col, } // TODO(slavarnway): eliminate bsize and subsize in future commits -static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd, +static void decode_partition(AV1Decoder *const pbi, ThreadData *const td, int mi_row, int mi_col, aom_reader *r, - BLOCK_SIZE bsize) { + BLOCK_SIZE bsize, int parse_decode_flag) { AV1_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &td->xd; const int bw = mi_size_wide[bsize]; const int hbs = bw >> 1; PARTITION_TYPE partition; @@ -1416,25 +1467,36 @@ static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd, if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - const int num_planes = av1_num_planes(cm); - for (int plane = 0; plane < num_planes; ++plane) { - int rcol0, rcol1, rrow0, rrow1, tile_tl_idx; - if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize, - &rcol0, &rcol1, &rrow0, &rrow1, - &tile_tl_idx)) { - const int rstride = cm->rst_info[plane].horz_units_per_tile; - for (int rrow = rrow0; rrow < rrow1; ++rrow) { - for (int rcol = rcol0; rcol < rcol1; ++rcol) { - const int runit_idx = tile_tl_idx + rcol + rrow * rstride; - loop_restoration_read_sb_coeffs(cm, xd, r, plane, runit_idx); + // parse_decode_flag takes the following values : + // 01 - do parse only + // 10 - do decode only + // 11 - do parse and decode + static const block_visitor_fn_t block_visit[4] = { + NULL, parse_decode_block, decode_block, parse_decode_block + }; + + if (parse_decode_flag & 1) { + const int num_planes = av1_num_planes(cm); + for (int plane = 0; plane < num_planes; ++plane) { + int rcol0, rcol1, rrow0, rrow1; + if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize, + &rcol0, &rcol1, &rrow0, &rrow1)) { + const int rstride = cm->rst_info[plane].horz_units_per_tile; + for (int rrow = rrow0; rrow < rrow1; ++rrow) { + for (int rcol = rcol0; rcol < rcol1; ++rcol) { + const int runit_idx = rcol + rrow * rstride; + loop_restoration_read_sb_coeffs(cm, xd, r, plane, runit_idx); + } } } } - } - partition = (bsize < BLOCK_8X8) ? PARTITION_NONE - : read_partition(xd, mi_row, mi_col, r, - has_rows, has_cols, bsize); + partition = (bsize < BLOCK_8X8) ? PARTITION_NONE + : read_partition(xd, mi_row, mi_col, r, + has_rows, has_cols, bsize); + } else { + partition = get_partition(cm, mi_row, mi_col, bsize); + } subsize = get_partition_subsize(bsize, partition); // Check the bitstream is conformant: if there is subsampling on the @@ -1442,18 +1504,19 @@ static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd, const struct macroblockd_plane *const pd_u = &xd->plane[1]; if (get_plane_block_size(subsize, pd_u->subsampling_x, pd_u->subsampling_y) == BLOCK_INVALID) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, "Block size %dx%d invalid with this subsampling mode", block_size_wide[subsize], block_size_high[subsize]); } #define DEC_BLOCK_STX_ARG #define DEC_BLOCK_EPT_ARG partition, -#define DEC_BLOCK(db_r, db_c, db_subsize) \ - decode_block(pbi, xd, DEC_BLOCK_STX_ARG(db_r), (db_c), r, \ - DEC_BLOCK_EPT_ARG(db_subsize)) -#define DEC_PARTITION(db_r, db_c, db_subsize) \ - decode_partition(pbi, xd, DEC_BLOCK_STX_ARG(db_r), (db_c), r, (db_subsize)) +#define DEC_BLOCK(db_r, db_c, db_subsize) \ + block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), r, \ + DEC_BLOCK_EPT_ARG(db_subsize)) +#define DEC_PARTITION(db_r, db_c, db_subsize) \ + decode_partition(pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), r, (db_subsize), \ + parse_decode_flag) switch (partition) { case PARTITION_NONE: DEC_BLOCK(mi_row, mi_col, subsize); break; @@ -1513,7 +1576,8 @@ static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd, #undef DEC_BLOCK_EPT_ARG #undef DEC_BLOCK_STX_ARG - update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); + if (parse_decode_flag & 1) + update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); } static void setup_bool_decoder(const uint8_t *data, const uint8_t *data_end, @@ -1650,7 +1714,7 @@ static void decode_restoration_mode(AV1_COMMON *cm, } if (num_planes > 1) { - int s = AOMMIN(cm->subsampling_x, cm->subsampling_y); + int s = AOMMIN(cm->seq_params.subsampling_x, cm->seq_params.subsampling_y); if (s && !chroma_none) { cm->rst_info[1].restoration_unit_size = cm->rst_info[0].restoration_unit_size >> (aom_rb_read_bit(rb) * s); @@ -1872,12 +1936,13 @@ static INLINE int read_delta_q(struct aom_read_bit_buffer *rb) { static void setup_quantization(AV1_COMMON *const cm, struct aom_read_bit_buffer *rb) { + const SequenceHeader *const seq_params = &cm->seq_params; const int num_planes = av1_num_planes(cm); cm->base_qindex = aom_rb_read_literal(rb, QINDEX_BITS); cm->y_dc_delta_q = read_delta_q(rb); if (num_planes > 1) { int diff_uv_delta = 0; - if (cm->separate_uv_delta_q) diff_uv_delta = aom_rb_read_bit(rb); + if (seq_params->separate_uv_delta_q) diff_uv_delta = aom_rb_read_bit(rb); cm->u_dc_delta_q = read_delta_q(rb); cm->u_ac_delta_q = read_delta_q(rb); if (diff_uv_delta) { @@ -1888,12 +1953,12 @@ static void setup_quantization(AV1_COMMON *const cm, cm->v_ac_delta_q = cm->u_ac_delta_q; } } - cm->dequant_bit_depth = cm->bit_depth; + cm->dequant_bit_depth = seq_params->bit_depth; cm->using_qmatrix = aom_rb_read_bit(rb); if (cm->using_qmatrix) { cm->qm_y = aom_rb_read_literal(rb, QM_LEVEL_BITS); cm->qm_u = aom_rb_read_literal(rb, QM_LEVEL_BITS); - if (!cm->separate_uv_delta_q) + if (!seq_params->separate_uv_delta_q) cm->qm_v = cm->qm_u; else cm->qm_v = aom_rb_read_literal(rb, QM_LEVEL_BITS); @@ -1906,6 +1971,7 @@ static void setup_quantization(AV1_COMMON *const cm, // Build y/uv dequant values based on segmentation. static void setup_segmentation_dequant(AV1_COMMON *const cm) { + const int bit_depth = cm->seq_params.bit_depth; const int using_qm = cm->using_qmatrix; // When segmentation is disabled, only the first value is used. The // remaining are don't cares. @@ -1913,16 +1979,16 @@ static void setup_segmentation_dequant(AV1_COMMON *const cm) { for (int i = 0; i < max_segments; ++i) { const int qindex = av1_get_qindex(&cm->seg, i, cm->base_qindex); cm->y_dequant_QTX[i][0] = - av1_dc_quant_QTX(qindex, cm->y_dc_delta_q, cm->bit_depth); - cm->y_dequant_QTX[i][1] = av1_ac_quant_QTX(qindex, 0, cm->bit_depth); + av1_dc_quant_QTX(qindex, cm->y_dc_delta_q, bit_depth); + cm->y_dequant_QTX[i][1] = av1_ac_quant_QTX(qindex, 0, bit_depth); cm->u_dequant_QTX[i][0] = - av1_dc_quant_QTX(qindex, cm->u_dc_delta_q, cm->bit_depth); + av1_dc_quant_QTX(qindex, cm->u_dc_delta_q, bit_depth); cm->u_dequant_QTX[i][1] = - av1_ac_quant_QTX(qindex, cm->u_ac_delta_q, cm->bit_depth); + av1_ac_quant_QTX(qindex, cm->u_ac_delta_q, bit_depth); cm->v_dequant_QTX[i][0] = - av1_dc_quant_QTX(qindex, cm->v_dc_delta_q, cm->bit_depth); + av1_dc_quant_QTX(qindex, cm->v_dc_delta_q, bit_depth); cm->v_dequant_QTX[i][1] = - av1_ac_quant_QTX(qindex, cm->v_ac_delta_q, cm->bit_depth); + av1_ac_quant_QTX(qindex, cm->v_ac_delta_q, bit_depth); const int lossless = qindex == 0 && cm->y_dc_delta_q == 0 && cm->u_dc_delta_q == 0 && cm->u_ac_delta_q == 0 && cm->v_dc_delta_q == 0 && cm->v_ac_delta_q == 0; @@ -1994,9 +2060,15 @@ static void resize_context_buffers(AV1_COMMON *cm, int width, int height) { // Allocations in av1_alloc_context_buffers() depend on individual // dimensions as well as the overall size. if (new_mi_cols > cm->mi_cols || new_mi_rows > cm->mi_rows) { - if (av1_alloc_context_buffers(cm, width, height)) + if (av1_alloc_context_buffers(cm, width, height)) { + // The cm->mi_* values have been cleared and any existing context + // buffers have been freed. Clear cm->width and cm->height to be + // consistent and to force a realloc next time. + cm->width = 0; + cm->height = 0; aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate context buffers"); + } } else { av1_set_mb_mi(cm, width, height); } @@ -2012,21 +2084,22 @@ static void resize_context_buffers(AV1_COMMON *cm, int width, int height) { static void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag, struct aom_read_bit_buffer *rb) { + const SequenceHeader *const seq_params = &cm->seq_params; int width, height; BufferPool *const pool = cm->buffer_pool; if (frame_size_override_flag) { - int num_bits_width = cm->seq_params.num_bits_width; - int num_bits_height = cm->seq_params.num_bits_height; + int num_bits_width = seq_params->num_bits_width; + int num_bits_height = seq_params->num_bits_height; av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height); - if (width > cm->seq_params.max_frame_width || - height > cm->seq_params.max_frame_height) { + if (width > seq_params->max_frame_width || + height > seq_params->max_frame_height) { aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Frame dimensions are larger than the maximum values"); } } else { - width = cm->seq_params.max_frame_width; - height = cm->seq_params.max_frame_height; + width = seq_params->max_frame_width; + height = seq_params->max_frame_height; } setup_superres(cm, rb, &width, &height); @@ -2035,8 +2108,9 @@ static void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag, lock_buffer_pool(pool); if (aom_realloc_frame_buffer( - get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, - cm->subsampling_y, cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, + get_frame_new_buffer(cm), cm->width, cm->height, + seq_params->subsampling_x, seq_params->subsampling_y, + seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->byte_alignment, &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) { @@ -2046,18 +2120,22 @@ static void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag, } unlock_buffer_pool(pool); - pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x; - pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y; - pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; - pool->frame_bufs[cm->new_fb_idx].buf.color_primaries = cm->color_primaries; + pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = + seq_params->subsampling_x; + pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = + seq_params->subsampling_y; + pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = + (unsigned int)seq_params->bit_depth; + pool->frame_bufs[cm->new_fb_idx].buf.color_primaries = + seq_params->color_primaries; pool->frame_bufs[cm->new_fb_idx].buf.transfer_characteristics = - cm->transfer_characteristics; + seq_params->transfer_characteristics; pool->frame_bufs[cm->new_fb_idx].buf.matrix_coefficients = - cm->matrix_coefficients; - pool->frame_bufs[cm->new_fb_idx].buf.monochrome = cm->seq_params.monochrome; + seq_params->matrix_coefficients; + pool->frame_bufs[cm->new_fb_idx].buf.monochrome = seq_params->monochrome; pool->frame_bufs[cm->new_fb_idx].buf.chroma_sample_position = - cm->chroma_sample_position; - pool->frame_bufs[cm->new_fb_idx].buf.color_range = cm->color_range; + seq_params->chroma_sample_position; + pool->frame_bufs[cm->new_fb_idx].buf.color_range = seq_params->color_range; pool->frame_bufs[cm->new_fb_idx].buf.render_width = cm->render_width; pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height; } @@ -2095,9 +2173,10 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm, } } + const SequenceHeader *const seq_params = &cm->seq_params; if (!found) { - int num_bits_width = cm->seq_params.num_bits_width; - int num_bits_height = cm->seq_params.num_bits_height; + int num_bits_width = seq_params->num_bits_width; + int num_bits_height = seq_params->num_bits_height; av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height); setup_superres(cm, rb, &width, &height); @@ -2122,18 +2201,19 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm, "Referenced frame has invalid size"); for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { RefBuffer *const ref_frame = &cm->frame_refs[i]; - if (!valid_ref_frame_img_fmt(ref_frame->buf->bit_depth, - ref_frame->buf->subsampling_x, - ref_frame->buf->subsampling_y, cm->bit_depth, - cm->subsampling_x, cm->subsampling_y)) + if (!valid_ref_frame_img_fmt( + ref_frame->buf->bit_depth, ref_frame->buf->subsampling_x, + ref_frame->buf->subsampling_y, seq_params->bit_depth, + seq_params->subsampling_x, seq_params->subsampling_y)) aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Referenced frame has incompatible color format"); } lock_buffer_pool(pool); if (aom_realloc_frame_buffer( - get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, - cm->subsampling_y, cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, + get_frame_new_buffer(cm), cm->width, cm->height, + seq_params->subsampling_x, seq_params->subsampling_y, + seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->byte_alignment, &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) { @@ -2143,18 +2223,22 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm, } unlock_buffer_pool(pool); - pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x; - pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y; - pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; - pool->frame_bufs[cm->new_fb_idx].buf.color_primaries = cm->color_primaries; + pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = + seq_params->subsampling_x; + pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = + seq_params->subsampling_y; + pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = + (unsigned int)seq_params->bit_depth; + pool->frame_bufs[cm->new_fb_idx].buf.color_primaries = + seq_params->color_primaries; pool->frame_bufs[cm->new_fb_idx].buf.transfer_characteristics = - cm->transfer_characteristics; + seq_params->transfer_characteristics; pool->frame_bufs[cm->new_fb_idx].buf.matrix_coefficients = - cm->matrix_coefficients; - pool->frame_bufs[cm->new_fb_idx].buf.monochrome = cm->seq_params.monochrome; + seq_params->matrix_coefficients; + pool->frame_bufs[cm->new_fb_idx].buf.monochrome = seq_params->monochrome; pool->frame_bufs[cm->new_fb_idx].buf.chroma_sample_position = - cm->chroma_sample_position; - pool->frame_bufs[cm->new_fb_idx].buf.color_range = cm->color_range; + seq_params->chroma_sample_position; + pool->frame_bufs[cm->new_fb_idx].buf.color_range = seq_params->color_range; pool->frame_bufs[cm->new_fb_idx].buf.render_width = cm->render_width; pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height; } @@ -2500,8 +2584,15 @@ static void get_tile_buffers(AV1Decoder *pbi, const uint8_t *data, } } -static void set_cb_buffer(MACROBLOCKD *const xd, CB_BUFFER *cb_buffer, - const int num_planes) { +static void set_cb_buffer(AV1Decoder *pbi, MACROBLOCKD *const xd, + CB_BUFFER *cb_buffer_base, const int num_planes, + int mi_row, int mi_col) { + AV1_COMMON *const cm = &pbi->common; + int mib_size_log2 = cm->seq_params.mib_size_log2; + int stride = (cm->mi_cols >> mib_size_log2) + 1; + int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2); + CB_BUFFER *cb_buffer = cb_buffer_base + offset; + for (int plane = 0; plane < num_planes; ++plane) { xd->plane[plane].dqcoeff_block = cb_buffer->dqcoeff[plane]; xd->plane[plane].eob_data = cb_buffer->eob_data[plane]; @@ -2514,18 +2605,189 @@ static void set_cb_buffer(MACROBLOCKD *const xd, CB_BUFFER *cb_buffer, xd->color_index_map_offset[1] = 0; } +static void decoder_alloc_tile_data(AV1Decoder *pbi, const int n_tiles) { + AV1_COMMON *const cm = &pbi->common; + aom_free(pbi->tile_data); + CHECK_MEM_ERROR(cm, pbi->tile_data, + aom_memalign(32, n_tiles * sizeof(*pbi->tile_data))); + pbi->allocated_tiles = n_tiles; + for (int i = 0; i < n_tiles; i++) { + TileDataDec *const tile_data = pbi->tile_data + i; + av1_zero(tile_data->dec_row_mt_sync); + } + pbi->allocated_row_mt_sync_rows = 0; +} + +// Set up nsync by width. +static INLINE int get_sync_range(int width) { +// nsync numbers are picked by testing. +#if 0 + if (width < 640) + return 1; + else if (width <= 1280) + return 2; + else if (width <= 4096) + return 4; + else + return 8; +#else + (void)width; +#endif + return 1; +} + +// Allocate memory for decoder row synchronization +static void dec_row_mt_alloc(AV1DecRowMTSync *dec_row_mt_sync, AV1_COMMON *cm, + int rows) { + dec_row_mt_sync->allocated_sb_rows = rows; +#if CONFIG_MULTITHREAD + { + int i; + + CHECK_MEM_ERROR(cm, dec_row_mt_sync->mutex_, + aom_malloc(sizeof(*(dec_row_mt_sync->mutex_)) * rows)); + if (dec_row_mt_sync->mutex_) { + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&dec_row_mt_sync->mutex_[i], NULL); + } + } + + CHECK_MEM_ERROR(cm, dec_row_mt_sync->cond_, + aom_malloc(sizeof(*(dec_row_mt_sync->cond_)) * rows)); + if (dec_row_mt_sync->cond_) { + for (i = 0; i < rows; ++i) { + pthread_cond_init(&dec_row_mt_sync->cond_[i], NULL); + } + } + } +#endif // CONFIG_MULTITHREAD + + CHECK_MEM_ERROR(cm, dec_row_mt_sync->cur_sb_col, + aom_malloc(sizeof(*(dec_row_mt_sync->cur_sb_col)) * rows)); + + // Set up nsync. + dec_row_mt_sync->sync_range = get_sync_range(cm->width); +} + +// Deallocate decoder row synchronization related mutex and data +void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync) { + if (dec_row_mt_sync != NULL) { +#if CONFIG_MULTITHREAD + int i; + if (dec_row_mt_sync->mutex_ != NULL) { + for (i = 0; i < dec_row_mt_sync->allocated_sb_rows; ++i) { + pthread_mutex_destroy(&dec_row_mt_sync->mutex_[i]); + } + aom_free(dec_row_mt_sync->mutex_); + } + if (dec_row_mt_sync->cond_ != NULL) { + for (i = 0; i < dec_row_mt_sync->allocated_sb_rows; ++i) { + pthread_cond_destroy(&dec_row_mt_sync->cond_[i]); + } + aom_free(dec_row_mt_sync->cond_); + } +#endif // CONFIG_MULTITHREAD + aom_free(dec_row_mt_sync->cur_sb_col); + + // clear the structure as the source of this call may be a resize in which + // case this call will be followed by an _alloc() which may fail. + av1_zero(*dec_row_mt_sync); + } +} + +static INLINE void sync_read(AV1DecRowMTSync *const dec_row_mt_sync, int r, + int c) { +#if CONFIG_MULTITHREAD + const int nsync = dec_row_mt_sync->sync_range; + + if (r && !(c & (nsync - 1))) { + pthread_mutex_t *const mutex = &dec_row_mt_sync->mutex_[r - 1]; + pthread_mutex_lock(mutex); + + while (c > dec_row_mt_sync->cur_sb_col[r - 1] - nsync) { + pthread_cond_wait(&dec_row_mt_sync->cond_[r - 1], mutex); + } + pthread_mutex_unlock(mutex); + } +#else + (void)dec_row_mt_sync; + (void)r; + (void)c; +#endif // CONFIG_MULTITHREAD +} + +static INLINE void sync_write(AV1DecRowMTSync *const dec_row_mt_sync, int r, + int c, const int sb_cols) { +#if CONFIG_MULTITHREAD + const int nsync = dec_row_mt_sync->sync_range; + int cur; + int sig = 1; + + if (c < sb_cols - 1) { + cur = c; + if (c % nsync) sig = 0; + } else { + cur = sb_cols + nsync; + } + + if (sig) { + pthread_mutex_lock(&dec_row_mt_sync->mutex_[r]); + + dec_row_mt_sync->cur_sb_col[r] = cur; + + pthread_cond_signal(&dec_row_mt_sync->cond_[r]); + pthread_mutex_unlock(&dec_row_mt_sync->mutex_[r]); + } +#else + (void)dec_row_mt_sync; + (void)r; + (void)c; + (void)sb_cols; +#endif // CONFIG_MULTITHREAD +} + +static INLINE int get_sb_rows_in_tile(AV1Decoder *pbi, TileInfo tile) { + AV1_COMMON *cm = &pbi->common; + int mi_rows_aligned_to_sb = ALIGN_POWER_OF_TWO( + tile.mi_row_end - tile.mi_row_start, cm->seq_params.mib_size_log2); + int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2; + + return sb_rows; +} + +static INLINE int get_sb_cols_in_tile(AV1Decoder *pbi, TileInfo tile) { + AV1_COMMON *cm = &pbi->common; + int mi_cols_aligned_to_sb = ALIGN_POWER_OF_TWO( + tile.mi_col_end - tile.mi_col_start, cm->seq_params.mib_size_log2); + int sb_cols = mi_cols_aligned_to_sb >> cm->seq_params.mib_size_log2; + + return sb_cols; +} + static void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td, TileInfo tile_info, const int mi_row) { AV1_COMMON *const cm = &pbi->common; const int num_planes = av1_num_planes(cm); - av1_zero_left_context(&td->xd); + TileDataDec *const tile_data = + pbi->tile_data + tile_info.tile_row * cm->tile_cols + tile_info.tile_col; + const int sb_cols_in_tile = get_sb_cols_in_tile(pbi, tile_info); + const int sb_row_in_tile = + (mi_row - tile_info.mi_row_start) >> cm->seq_params.mib_size_log2; + int sb_col_in_tile = 0; for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end; - mi_col += cm->seq_params.mib_size) { - set_cb_buffer(&td->xd, &td->cb_buffer_base, num_planes); + mi_col += cm->seq_params.mib_size, sb_col_in_tile++) { + set_cb_buffer(pbi, &td->xd, pbi->cb_buffer_base, num_planes, mi_row, + mi_col); + + sync_read(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile); - decode_partition(pbi, &td->xd, mi_row, mi_col, td->bit_reader, - cm->seq_params.sb_size); + // Decoding of the super-block + decode_partition(pbi, td, mi_row, mi_col, td->bit_reader, + cm->seq_params.sb_size, 0x2); + + sync_write(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile, + sb_cols_in_tile); } } @@ -2555,6 +2817,27 @@ static int check_trailing_bits_after_symbol_coder(aom_reader *r) { return 0; } +static void set_decode_func_pointers(ThreadData *td, int parse_decode_flag) { + td->read_coeffs_tx_intra_block_visit = decode_block_void; + td->predict_and_recon_intra_block_visit = decode_block_void; + td->read_coeffs_tx_inter_block_visit = decode_block_void; + td->inverse_tx_inter_block_visit = decode_block_void; + td->predict_inter_block_visit = predict_inter_block_void; + td->cfl_store_inter_block_visit = cfl_store_inter_block_void; + + if (parse_decode_flag & 0x1) { + td->read_coeffs_tx_intra_block_visit = read_coeffs_tx_intra_block; + td->read_coeffs_tx_inter_block_visit = av1_read_coeffs_txb_facade; + } + if (parse_decode_flag & 0x2) { + td->predict_and_recon_intra_block_visit = + predict_and_reconstruct_intra_block; + td->inverse_tx_inter_block_visit = inverse_transform_inter_block; + td->predict_inter_block_visit = predict_inter_block; + td->cfl_store_inter_block_visit = cfl_store_inter_block; + } +} + static void decode_tile(AV1Decoder *pbi, ThreadData *const td, int tile_row, int tile_col) { TileInfo tile_info; @@ -2564,13 +2847,23 @@ static void decode_tile(AV1Decoder *pbi, ThreadData *const td, int tile_row, av1_tile_set_row(&tile_info, cm, tile_row); av1_tile_set_col(&tile_info, cm, tile_col); - av1_zero_above_context(cm, tile_info.mi_col_start, tile_info.mi_col_end, - tile_row); + av1_zero_above_context(cm, &td->xd, tile_info.mi_col_start, + tile_info.mi_col_end, tile_row); + av1_reset_loop_filter_delta(&td->xd, num_planes); av1_reset_loop_restoration(&td->xd, num_planes); for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end; mi_row += cm->seq_params.mib_size) { - decode_tile_sb_row(pbi, td, tile_info, mi_row); + av1_zero_left_context(&td->xd); + + for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end; + mi_col += cm->seq_params.mib_size) { + set_cb_buffer(pbi, &td->xd, &td->cb_buffer_base, num_planes, 0, 0); + + // Bit-stream parsing and decoding of the superblock + decode_partition(pbi, td, mi_row, mi_col, td->bit_reader, + cm->seq_params.sb_size, 0x3); + } } int corrupted = @@ -2582,6 +2875,7 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end, int start_tile, int end_tile) { AV1_COMMON *const cm = &pbi->common; + ThreadData *const td = &pbi->td; const int tile_cols = cm->tile_cols; const int tile_rows = cm->tile_rows; const int n_tiles = tile_cols * tile_rows; @@ -2641,23 +2935,26 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data, get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile); if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) { - aom_free(pbi->tile_data); - CHECK_MEM_ERROR(cm, pbi->tile_data, - aom_memalign(32, n_tiles * (sizeof(*pbi->tile_data)))); - pbi->allocated_tiles = n_tiles; + decoder_alloc_tile_data(pbi, n_tiles); } #if CONFIG_ACCOUNTING if (pbi->acct_enabled) { aom_accounting_reset(&pbi->accounting); } #endif + + set_decode_func_pointers(&pbi->td, 0x3); + // Load all tile information into thread_data. + td->xd = pbi->mb; + td->xd.corrupted = 0; + td->xd.mc_buf[0] = td->mc_buf[0]; + td->xd.mc_buf[1] = td->mc_buf[1]; for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) { const int row = inv_row_order ? tile_rows - 1 - tile_row : tile_row; for (tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) { const int col = inv_col_order ? tile_cols - 1 - tile_col : tile_col; - ThreadData *const td = &pbi->td; TileDataDec *const tile_data = pbi->tile_data + row * cm->tile_cols + col; const TileBufferDec *const tile_bs_buf = &tile_buffers[row][col]; @@ -2665,13 +2962,10 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data, row * cm->tile_cols + col > end_tile) continue; - td->xd = pbi->mb; - td->xd.corrupted = 0; - td->xd.mc_buf[0] = pbi->td.mc_buf[0]; - td->xd.mc_buf[1] = pbi->td.mc_buf[1]; td->bit_reader = &tile_data->bit_reader; av1_zero(td->dqcoeff); av1_tile_init(&td->xd.tile, cm, row, col); + td->xd.current_qindex = cm->base_qindex; setup_bool_decoder(tile_bs_buf->data, data_end, tile_bs_buf->size, &cm->error, td->bit_reader, allow_update_cdf); #if CONFIG_ACCOUNTING @@ -2691,7 +2985,7 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data, td->xd.tile_ctx = &tile_data->tctx; // decode tile - decode_tile(pbi, &pbi->td, row, col); + decode_tile(pbi, td, row, col); aom_merge_corrupted_flag(&pbi->mb.corrupted, td->xd.corrupted); if (pbi->mb.corrupted) aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, @@ -2729,6 +3023,47 @@ static TileJobsDec *get_dec_job_info(AV1DecTileMT *tile_mt_info) { return cur_job_info; } +static void tile_worker_hook_init(AV1Decoder *const pbi, + DecWorkerData *const thread_data, + const TileBufferDec *const tile_buffer, + TileDataDec *const tile_data, + uint8_t allow_update_cdf) { + AV1_COMMON *cm = &pbi->common; + ThreadData *const td = thread_data->td; + int tile_row = tile_data->tile_info.tile_row; + int tile_col = tile_data->tile_info.tile_col; + + td->bit_reader = &tile_data->bit_reader; + av1_zero(td->dqcoeff); + av1_tile_init(&td->xd.tile, cm, tile_row, tile_col); + td->xd.current_qindex = cm->base_qindex; + setup_bool_decoder(tile_buffer->data, thread_data->data_end, + tile_buffer->size, &thread_data->error_info, + td->bit_reader, allow_update_cdf); +#if CONFIG_ACCOUNTING + if (pbi->acct_enabled) { + td->bit_reader->accounting = &pbi->accounting; + td->bit_reader->accounting->last_tell_frac = + aom_reader_tell_frac(td->bit_reader); + } else { + td->bit_reader->accounting = NULL; + } +#endif + av1_init_macroblockd(cm, &td->xd, td->dqcoeff); + td->xd.error_info = &thread_data->error_info; + av1_init_above_context(cm, &td->xd, tile_row); + + // Initialise the tile context from the frame context + tile_data->tctx = *cm->fc; + td->xd.tile_ctx = &tile_data->tctx; +#if CONFIG_ACCOUNTING + if (pbi->acct_enabled) { + tile_data->bit_reader.accounting->last_tell_frac = + aom_reader_tell_frac(&tile_data->bit_reader); + } +#endif +} + static int tile_worker_hook(void *arg1, void *arg2) { DecWorkerData *const thread_data = (DecWorkerData *)arg1; AV1Decoder *const pbi = (AV1Decoder *)arg2; @@ -2736,14 +3071,21 @@ static int tile_worker_hook(void *arg1, void *arg2) { ThreadData *const td = thread_data->td; uint8_t allow_update_cdf; + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. if (setjmp(thread_data->error_info.jmp)) { thread_data->error_info.setjmp = 0; thread_data->td->xd.corrupted = 1; return 0; } + thread_data->error_info.setjmp = 1; + allow_update_cdf = cm->large_scale_tile ? 0 : 1; allow_update_cdf = allow_update_cdf && !cm->disable_cdf_update; + set_decode_func_pointers(td, 0x3); + assert(cm->tile_cols > 0); while (1) { TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info); @@ -2751,46 +3093,248 @@ static int tile_worker_hook(void *arg1, void *arg2) { if (cur_job_info != NULL && !td->xd.corrupted) { const TileBufferDec *const tile_buffer = cur_job_info->tile_buffer; TileDataDec *const tile_data = cur_job_info->tile_data; - volatile int tile_row = tile_data->tile_info.tile_row; - volatile int tile_col = tile_data->tile_info.tile_col; + tile_worker_hook_init(pbi, thread_data, tile_buffer, tile_data, + allow_update_cdf); + // decode tile + int tile_row = tile_data->tile_info.tile_row; + int tile_col = tile_data->tile_info.tile_col; + decode_tile(pbi, td, tile_row, tile_col); + } else { + break; + } + } + thread_data->error_info.setjmp = 0; + return !td->xd.corrupted; +} - td->xd = pbi->mb; - td->xd.corrupted = 0; - td->xd.mc_buf[0] = td->mc_buf[0]; - td->xd.mc_buf[1] = td->mc_buf[1]; - td->bit_reader = &tile_data->bit_reader; - av1_zero(td->dqcoeff); - av1_tile_init(&td->xd.tile, cm, tile_row, tile_col); - setup_bool_decoder(tile_buffer->data, thread_data->data_end, - tile_buffer->size, &cm->error, td->bit_reader, - allow_update_cdf); -#if CONFIG_ACCOUNTING - if (pbi->acct_enabled) { - td->bit_reader->accounting = &pbi->accounting; - td->bit_reader->accounting->last_tell_frac = - aom_reader_tell_frac(td->bit_reader); - } else { - td->bit_reader->accounting = NULL; +static int get_next_job_info(AV1Decoder *const pbi, + AV1DecRowMTJobInfo *next_job_info, + int *end_of_frame) { + AV1_COMMON *cm = &pbi->common; + TileDataDec *tile_data; + AV1DecRowMTSync *dec_row_mt_sync; + AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info; + TileInfo tile_info; + const int tile_rows_start = frame_row_mt_info->tile_rows_start; + const int tile_rows_end = frame_row_mt_info->tile_rows_end; + const int tile_cols_start = frame_row_mt_info->tile_cols_start; + const int tile_cols_end = frame_row_mt_info->tile_cols_end; + const int start_tile = frame_row_mt_info->start_tile; + const int end_tile = frame_row_mt_info->end_tile; + const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size]; + int num_mis_to_decode, num_threads_working; + int num_mis_waiting_for_decode; + int min_threads_working = INT_MAX; + int max_mis_to_decode = 0; + int tile_row_idx, tile_col_idx; + int tile_row = 0; + int tile_col = 0; + + memset(next_job_info, 0, sizeof(*next_job_info)); + + // Frame decode is completed or error is encountered. + *end_of_frame = (frame_row_mt_info->mi_rows_decode_started == + frame_row_mt_info->mi_rows_to_decode) || + (frame_row_mt_info->row_mt_exit == 1); + if (*end_of_frame) { + return 1; + } + + // Decoding cannot start as bit-stream parsing is not complete. + if (frame_row_mt_info->mi_rows_parse_done - + frame_row_mt_info->mi_rows_decode_started == + 0) + return 0; + + // Choose the tile to decode. + for (tile_row_idx = tile_rows_start; tile_row_idx < tile_rows_end; + ++tile_row_idx) { + for (tile_col_idx = tile_cols_start; tile_col_idx < tile_cols_end; + ++tile_col_idx) { + if (tile_row_idx * cm->tile_cols + tile_col_idx < start_tile || + tile_row_idx * cm->tile_cols + tile_col_idx > end_tile) + continue; + + tile_data = pbi->tile_data + tile_row_idx * cm->tile_cols + tile_col_idx; + dec_row_mt_sync = &tile_data->dec_row_mt_sync; + + num_threads_working = dec_row_mt_sync->num_threads_working; + num_mis_waiting_for_decode = (dec_row_mt_sync->mi_rows_parse_done - + dec_row_mt_sync->mi_rows_decode_started) * + dec_row_mt_sync->mi_cols; + num_mis_to_decode = + (dec_row_mt_sync->mi_rows - dec_row_mt_sync->mi_rows_decode_started) * + dec_row_mt_sync->mi_cols; + + assert(num_mis_to_decode >= num_mis_waiting_for_decode); + + // Pick the tile which has minimum number of threads working on it. + if (num_mis_waiting_for_decode > 0) { + if (num_threads_working < min_threads_working) { + min_threads_working = num_threads_working; + max_mis_to_decode = 0; + } + if (num_threads_working == min_threads_working && + num_mis_to_decode > max_mis_to_decode) { + max_mis_to_decode = num_mis_to_decode; + tile_row = tile_row_idx; + tile_col = tile_col_idx; + } } + } + } + + tile_data = pbi->tile_data + tile_row * cm->tile_cols + tile_col; + tile_info = tile_data->tile_info; + dec_row_mt_sync = &tile_data->dec_row_mt_sync; + + next_job_info->tile_row = tile_row; + next_job_info->tile_col = tile_col; + next_job_info->mi_row = + dec_row_mt_sync->mi_rows_decode_started + tile_info.mi_row_start; + + dec_row_mt_sync->num_threads_working++; + dec_row_mt_sync->mi_rows_decode_started += sb_mi_size; + frame_row_mt_info->mi_rows_decode_started += sb_mi_size; + + return 1; +} + +static INLINE void signal_parse_sb_row_done(AV1Decoder *const pbi, + TileDataDec *const tile_data, + const int sb_mi_size) { + AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pbi->row_mt_mutex_); #endif - av1_init_macroblockd(cm, &td->xd, td->dqcoeff); - av1_init_above_context(cm, &td->xd, tile_row); + tile_data->dec_row_mt_sync.mi_rows_parse_done += sb_mi_size; + frame_row_mt_info->mi_rows_parse_done += sb_mi_size; +#if CONFIG_MULTITHREAD + pthread_cond_broadcast(pbi->row_mt_cond_); + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif +} - // Initialise the tile context from the frame context - tile_data->tctx = *cm->fc; - td->xd.tile_ctx = &tile_data->tctx; -#if CONFIG_ACCOUNTING - if (pbi->acct_enabled) { - tile_data->bit_reader.accounting->last_tell_frac = - aom_reader_tell_frac(&tile_data->bit_reader); - } +static int row_mt_worker_hook(void *arg1, void *arg2) { + DecWorkerData *const thread_data = (DecWorkerData *)arg1; + AV1Decoder *const pbi = (AV1Decoder *)arg2; + AV1_COMMON *cm = &pbi->common; + ThreadData *const td = thread_data->td; + uint8_t allow_update_cdf; + const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size]; + AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info; + td->xd.corrupted = 0; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(thread_data->error_info.jmp)) { + thread_data->error_info.setjmp = 0; + thread_data->td->xd.corrupted = 1; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pbi->row_mt_mutex_); #endif + frame_row_mt_info->row_mt_exit = 1; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif + return 0; + } + thread_data->error_info.setjmp = 1; + + const int num_planes = av1_num_planes(cm); + allow_update_cdf = cm->large_scale_tile ? 0 : 1; + allow_update_cdf = allow_update_cdf && !cm->disable_cdf_update; + + assert(cm->tile_cols > 0); + while (1) { + TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info); + + if (cur_job_info != NULL && !td->xd.corrupted) { + const TileBufferDec *const tile_buffer = cur_job_info->tile_buffer; + TileDataDec *const tile_data = cur_job_info->tile_data; + tile_worker_hook_init(pbi, thread_data, tile_buffer, tile_data, + allow_update_cdf); + + set_decode_func_pointers(td, 0x1); + // decode tile - decode_tile(pbi, td, tile_row, tile_col); + TileInfo tile_info = tile_data->tile_info; + int tile_row = tile_info.tile_row; + + av1_zero_above_context(cm, &td->xd, tile_info.mi_col_start, + tile_info.mi_col_end, tile_row); + av1_reset_loop_filter_delta(&td->xd, num_planes); + av1_reset_loop_restoration(&td->xd, num_planes); + + for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end; + mi_row += cm->seq_params.mib_size) { + av1_zero_left_context(&td->xd); + + for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end; + mi_col += cm->seq_params.mib_size) { + set_cb_buffer(pbi, &td->xd, pbi->cb_buffer_base, num_planes, mi_row, + mi_col); + + // Bit-stream parsing of the superblock + decode_partition(pbi, td, mi_row, mi_col, td->bit_reader, + cm->seq_params.sb_size, 0x1); + } + signal_parse_sb_row_done(pbi, tile_data, sb_mi_size); + } + + int corrupted = + (check_trailing_bits_after_symbol_coder(td->bit_reader)) ? 1 : 0; + aom_merge_corrupted_flag(&td->xd.corrupted, corrupted); } else { break; } } + + set_decode_func_pointers(td, 0x2); + + while (1) { + AV1DecRowMTJobInfo next_job_info; + int end_of_frame = 0; + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pbi->row_mt_mutex_); +#endif + while (!get_next_job_info(pbi, &next_job_info, &end_of_frame)) { +#if CONFIG_MULTITHREAD + pthread_cond_wait(pbi->row_mt_cond_, pbi->row_mt_mutex_); +#endif + } +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif + + if (end_of_frame) break; + + int tile_row = next_job_info.tile_row; + int tile_col = next_job_info.tile_col; + int mi_row = next_job_info.mi_row; + + TileDataDec *tile_data = + pbi->tile_data + tile_row * cm->tile_cols + tile_col; + AV1DecRowMTSync *dec_row_mt_sync = &tile_data->dec_row_mt_sync; + TileInfo tile_info = tile_data->tile_info; + + av1_tile_init(&td->xd.tile, cm, tile_row, tile_col); + av1_init_macroblockd(cm, &td->xd, td->dqcoeff); + td->xd.error_info = &thread_data->error_info; + + decode_tile_sb_row(pbi, td, tile_info, mi_row); + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pbi->row_mt_mutex_); +#endif + dec_row_mt_sync->num_threads_working--; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif + } + thread_data->error_info.setjmp = 0; return !td->xd.corrupted; } @@ -2842,8 +3386,7 @@ static void alloc_dec_jobs(AV1DecTileMT *tile_mt_info, AV1_COMMON *cm, aom_malloc(sizeof(*tile_mt_info->job_queue) * num_tiles)); } -void av1_free_mc_tmp_buf(void *td, int use_highbd) { - ThreadData *thread_data = (ThreadData *)td; +void av1_free_mc_tmp_buf(ThreadData *thread_data, int use_highbd) { int ref; for (ref = 0; ref < 2; ref++) { if (use_highbd) @@ -2855,10 +3398,8 @@ void av1_free_mc_tmp_buf(void *td, int use_highbd) { thread_data->mc_buf_size = 0; } -static void allocate_mc_tmp_buf(AV1_COMMON *const cm, void *td, int buf_size, - int use_highbd) { - ThreadData *thread_data = (ThreadData *)td; - +static void allocate_mc_tmp_buf(AV1_COMMON *const cm, ThreadData *thread_data, + int buf_size, int use_highbd) { for (int ref = 0; ref < 2; ref++) { if (use_highbd) { uint16_t *hbd_mc_buf; @@ -2872,11 +3413,130 @@ static void allocate_mc_tmp_buf(AV1_COMMON *const cm, void *td, int buf_size, thread_data->mc_buf_size = buf_size; } +static void reset_dec_workers(AV1Decoder *pbi, AVxWorkerHook worker_hook, + int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + + // Reset tile decoding hook + for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) { + AVxWorker *const worker = &pbi->tile_workers[worker_idx]; + DecWorkerData *const thread_data = pbi->thread_data + worker_idx; + thread_data->td->xd = pbi->mb; + thread_data->td->xd.corrupted = 0; + thread_data->td->xd.mc_buf[0] = thread_data->td->mc_buf[0]; + thread_data->td->xd.mc_buf[1] = thread_data->td->mc_buf[1]; + winterface->sync(worker); + + worker->hook = worker_hook; + worker->data1 = thread_data; + worker->data2 = pbi; + } +#if CONFIG_ACCOUNTING + if (pbi->acct_enabled) { + aom_accounting_reset(&pbi->accounting); + } +#endif +} + +static void launch_dec_workers(AV1Decoder *pbi, const uint8_t *data_end, + int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + + for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) { + AVxWorker *const worker = &pbi->tile_workers[worker_idx]; + DecWorkerData *const thread_data = (DecWorkerData *)worker->data1; + + thread_data->data_end = data_end; + + worker->had_error = 0; + if (worker_idx == num_workers - 1) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } +} + +static void sync_dec_workers(AV1Decoder *pbi, int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int corrupted = 0; + + for (int worker_idx = num_workers; worker_idx > 0; --worker_idx) { + AVxWorker *const worker = &pbi->tile_workers[worker_idx - 1]; + aom_merge_corrupted_flag(&corrupted, !winterface->sync(worker)); + } + + pbi->mb.corrupted = corrupted; +} + +static void decode_mt_init(AV1Decoder *pbi) { + AV1_COMMON *const cm = &pbi->common; + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int worker_idx; + + // Create workers and thread_data + if (pbi->num_workers == 0) { + const int num_threads = pbi->max_threads; + CHECK_MEM_ERROR(cm, pbi->tile_workers, + aom_malloc(num_threads * sizeof(*pbi->tile_workers))); + CHECK_MEM_ERROR(cm, pbi->thread_data, + aom_malloc(num_threads * sizeof(*pbi->thread_data))); + + for (worker_idx = 0; worker_idx < num_threads; ++worker_idx) { + AVxWorker *const worker = &pbi->tile_workers[worker_idx]; + DecWorkerData *const thread_data = pbi->thread_data + worker_idx; + ++pbi->num_workers; + + winterface->init(worker); + if (worker_idx < num_threads - 1 && !winterface->reset(worker)) { + aom_internal_error(&cm->error, AOM_CODEC_ERROR, + "Tile decoder thread creation failed"); + } + + if (worker_idx < num_threads - 1) { + // Allocate thread data. + CHECK_MEM_ERROR(cm, thread_data->td, + aom_memalign(32, sizeof(*thread_data->td))); + av1_zero(*thread_data->td); + } else { + // Main thread acts as a worker and uses the thread data in pbi + thread_data->td = &pbi->td; + } + thread_data->error_info.error_code = AOM_CODEC_OK; + thread_data->error_info.setjmp = 0; + } + } + const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0; + const int buf_size = MC_TEMP_BUF_PELS << use_highbd; + for (worker_idx = 0; worker_idx < pbi->max_threads - 1; ++worker_idx) { + DecWorkerData *const thread_data = pbi->thread_data + worker_idx; + if (thread_data->td->mc_buf_size != buf_size) { + av1_free_mc_tmp_buf(thread_data->td, use_highbd); + allocate_mc_tmp_buf(cm, thread_data->td, buf_size, use_highbd); + } + } +} + +static void tile_mt_queue(AV1Decoder *pbi, int tile_cols, int tile_rows, + int tile_rows_start, int tile_rows_end, + int tile_cols_start, int tile_cols_end, + int start_tile, int end_tile) { + AV1_COMMON *const cm = &pbi->common; + if (pbi->tile_mt_info.alloc_tile_cols != tile_cols || + pbi->tile_mt_info.alloc_tile_rows != tile_rows) { + av1_dealloc_dec_jobs(&pbi->tile_mt_info); + alloc_dec_jobs(&pbi->tile_mt_info, cm, tile_rows, tile_cols); + } + enqueue_tile_jobs(pbi, cm, tile_rows_start, tile_rows_end, tile_cols_start, + tile_cols_end, start_tile, end_tile); + qsort(pbi->tile_mt_info.job_queue, pbi->tile_mt_info.jobs_enqueued, + sizeof(pbi->tile_mt_info.job_queue[0]), compare_tile_buffers); +} + static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end, int start_tile, int end_tile) { AV1_COMMON *const cm = &pbi->common; - const AVxWorkerInterface *const winterface = aom_get_worker_interface(); const int tile_cols = cm->tile_cols; const int tile_rows = cm->tile_rows; const int n_tiles = tile_cols * tile_rows; @@ -2891,7 +3551,6 @@ static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data, int tile_cols_end; int tile_count_tg; int num_workers; - int worker_idx; const uint8_t *raw_data_end = NULL; if (cm->large_scale_tile) { @@ -2923,48 +3582,188 @@ static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data, assert(start_tile <= end_tile); assert(start_tile >= 0 && end_tile < n_tiles); - // Create workers and thread_data - if (pbi->num_workers == 0) { - const int num_threads = pbi->max_threads; - CHECK_MEM_ERROR(cm, pbi->tile_workers, - aom_malloc(num_threads * sizeof(*pbi->tile_workers))); - CHECK_MEM_ERROR(cm, pbi->thread_data, - aom_malloc(num_threads * sizeof(*pbi->thread_data))); + decode_mt_init(pbi); - for (worker_idx = 0; worker_idx < num_threads; ++worker_idx) { - AVxWorker *const worker = &pbi->tile_workers[worker_idx]; - DecWorkerData *const thread_data = pbi->thread_data + worker_idx; - ++pbi->num_workers; + // get tile size in tile group +#if EXT_TILE_DEBUG + if (cm->large_scale_tile) assert(pbi->ext_tile_debug == 1); + if (cm->large_scale_tile) + raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers); + else +#endif // EXT_TILE_DEBUG + get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile); - winterface->init(worker); - if (worker_idx < num_threads - 1 && !winterface->reset(worker)) { - aom_internal_error(&cm->error, AOM_CODEC_ERROR, - "Tile decoder thread creation failed"); - } + if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) { + decoder_alloc_tile_data(pbi, n_tiles); + } - if (worker_idx < num_threads - 1) { - // Allocate thread data. - CHECK_MEM_ERROR(cm, thread_data->td, - aom_memalign(32, sizeof(*thread_data->td))); - av1_zero(*thread_data->td); - } else { - // Main thread acts as a worker and uses the thread data in pbi - thread_data->td = &pbi->td; - } + for (int row = 0; row < tile_rows; row++) { + for (int col = 0; col < tile_cols; col++) { + TileDataDec *tile_data = pbi->tile_data + row * cm->tile_cols + col; + av1_tile_init(&tile_data->tile_info, cm, row, col); } } - const int use_highbd = cm->use_highbitdepth ? 1 : 0; - const int buf_size = MC_TEMP_BUF_PELS << use_highbd; - for (worker_idx = 0; worker_idx < pbi->max_threads - 1; ++worker_idx) { - DecWorkerData *const thread_data = pbi->thread_data + worker_idx; - if (thread_data->td->mc_buf_size != buf_size) { - av1_free_mc_tmp_buf(thread_data->td, use_highbd); - allocate_mc_tmp_buf(cm, thread_data->td, buf_size, use_highbd); + + tile_mt_queue(pbi, tile_cols, tile_rows, tile_rows_start, tile_rows_end, + tile_cols_start, tile_cols_end, start_tile, end_tile); + + reset_dec_workers(pbi, tile_worker_hook, num_workers); + launch_dec_workers(pbi, data_end, num_workers); + sync_dec_workers(pbi, num_workers); + + if (pbi->mb.corrupted) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Failed to decode tile data"); + + if (cm->large_scale_tile) { + if (n_tiles == 1) { + // Find the end of the single tile buffer + return aom_reader_find_end(&pbi->tile_data->bit_reader); } + // Return the end of the last tile buffer + return raw_data_end; + } + TileDataDec *const tile_data = pbi->tile_data + end_tile; + + return aom_reader_find_end(&tile_data->bit_reader); +} + +static void dec_alloc_cb_buf(AV1Decoder *pbi) { + AV1_COMMON *const cm = &pbi->common; + int size = ((cm->mi_rows >> cm->seq_params.mib_size_log2) + 1) * + ((cm->mi_cols >> cm->seq_params.mib_size_log2) + 1); + + if (pbi->cb_buffer_alloc_size < size) { + av1_dec_free_cb_buf(pbi); + CHECK_MEM_ERROR(cm, pbi->cb_buffer_base, + aom_memalign(32, sizeof(*pbi->cb_buffer_base) * size)); + pbi->cb_buffer_alloc_size = size; + } +} + +static void row_mt_frame_init(AV1Decoder *pbi, int tile_rows_start, + int tile_rows_end, int tile_cols_start, + int tile_cols_end, int start_tile, int end_tile, + int max_sb_rows) { + AV1_COMMON *const cm = &pbi->common; + AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info; + + frame_row_mt_info->tile_rows_start = tile_rows_start; + frame_row_mt_info->tile_rows_end = tile_rows_end; + frame_row_mt_info->tile_cols_start = tile_cols_start; + frame_row_mt_info->tile_cols_end = tile_cols_end; + frame_row_mt_info->start_tile = start_tile; + frame_row_mt_info->end_tile = end_tile; + frame_row_mt_info->mi_rows_to_decode = 0; + frame_row_mt_info->mi_rows_parse_done = 0; + frame_row_mt_info->mi_rows_decode_started = 0; + frame_row_mt_info->row_mt_exit = 0; + + for (int tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) { + for (int tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) { + if (tile_row * cm->tile_cols + tile_col < start_tile || + tile_row * cm->tile_cols + tile_col > end_tile) + continue; + + TileDataDec *const tile_data = + pbi->tile_data + tile_row * cm->tile_cols + tile_col; + TileInfo tile_info = tile_data->tile_info; + + tile_data->dec_row_mt_sync.mi_rows_parse_done = 0; + tile_data->dec_row_mt_sync.mi_rows_decode_started = 0; + tile_data->dec_row_mt_sync.num_threads_working = 0; + tile_data->dec_row_mt_sync.mi_rows = + ALIGN_POWER_OF_TWO(tile_info.mi_row_end - tile_info.mi_row_start, + cm->seq_params.mib_size_log2); + tile_data->dec_row_mt_sync.mi_cols = + ALIGN_POWER_OF_TWO(tile_info.mi_col_end - tile_info.mi_col_start, + cm->seq_params.mib_size_log2); + + frame_row_mt_info->mi_rows_to_decode += + tile_data->dec_row_mt_sync.mi_rows; + + // Initialize cur_sb_col to -1 for all SB rows. + memset(tile_data->dec_row_mt_sync.cur_sb_col, -1, + sizeof(*tile_data->dec_row_mt_sync.cur_sb_col) * max_sb_rows); + } + } + +#if CONFIG_MULTITHREAD + if (pbi->row_mt_mutex_ == NULL) { + CHECK_MEM_ERROR(cm, pbi->row_mt_mutex_, + aom_malloc(sizeof(*(pbi->row_mt_mutex_)))); + if (pbi->row_mt_mutex_) { + pthread_mutex_init(pbi->row_mt_mutex_, NULL); + } + } + + if (pbi->row_mt_cond_ == NULL) { + CHECK_MEM_ERROR(cm, pbi->row_mt_cond_, + aom_malloc(sizeof(*(pbi->row_mt_cond_)))); + if (pbi->row_mt_cond_) { + pthread_cond_init(pbi->row_mt_cond_, NULL); + } + } +#endif +} + +static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, + int start_tile, int end_tile) { + AV1_COMMON *const cm = &pbi->common; + const int tile_cols = cm->tile_cols; + const int tile_rows = cm->tile_rows; + const int n_tiles = tile_cols * tile_rows; + TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers; + const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows); + const int single_row = pbi->dec_tile_row >= 0; + const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols); + const int single_col = pbi->dec_tile_col >= 0; + int tile_rows_start; + int tile_rows_end; + int tile_cols_start; + int tile_cols_end; + int tile_count_tg; + int num_workers; + const uint8_t *raw_data_end = NULL; + int max_sb_rows = 0; + + if (cm->large_scale_tile) { + tile_rows_start = single_row ? dec_tile_row : 0; + tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows; + tile_cols_start = single_col ? dec_tile_col : 0; + tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols; + } else { + tile_rows_start = 0; + tile_rows_end = tile_rows; + tile_cols_start = 0; + tile_cols_end = tile_cols; } + tile_count_tg = end_tile - start_tile + 1; + num_workers = pbi->max_threads; - // get tile size in tile group + // No tiles to decode. + if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start || + // First tile is larger than end_tile. + tile_rows_start * tile_cols + tile_cols_start > end_tile || + // Last tile is smaller than start_tile. + (tile_rows_end - 1) * tile_cols + tile_cols_end - 1 < start_tile) + return data; + + assert(tile_rows <= MAX_TILE_ROWS); + assert(tile_cols <= MAX_TILE_COLS); + assert(tile_count_tg > 0); + assert(num_workers > 0); + assert(start_tile <= end_tile); + assert(start_tile >= 0 && end_tile < n_tiles); + + (void)tile_count_tg; + + decode_mt_init(pbi); + + // get tile size in tile group #if EXT_TILE_DEBUG + if (cm->large_scale_tile) assert(pbi->ext_tile_debug == 1); if (cm->large_scale_tile) raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers); else @@ -2972,74 +3771,43 @@ static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data, get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile); if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) { - aom_free(pbi->tile_data); - CHECK_MEM_ERROR(cm, pbi->tile_data, - aom_memalign(32, n_tiles * sizeof(*pbi->tile_data))); - pbi->allocated_tiles = n_tiles; + for (int i = 0; i < pbi->allocated_tiles; i++) { + TileDataDec *const tile_data = pbi->tile_data + i; + av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync); + } + decoder_alloc_tile_data(pbi, n_tiles); } - // Reset tile decoding hook - for (worker_idx = 0; worker_idx < num_workers; ++worker_idx) { - AVxWorker *const worker = &pbi->tile_workers[worker_idx]; - DecWorkerData *const thread_data = pbi->thread_data + worker_idx; - winterface->sync(worker); - - worker->hook = tile_worker_hook; - worker->data1 = thread_data; - worker->data2 = pbi; - } -#if CONFIG_ACCOUNTING - if (pbi->acct_enabled) { - aom_accounting_reset(&pbi->accounting); - } -#endif for (int row = 0; row < tile_rows; row++) { for (int col = 0; col < tile_cols; col++) { TileDataDec *tile_data = pbi->tile_data + row * cm->tile_cols + col; av1_tile_init(&tile_data->tile_info, cm, row, col); + + max_sb_rows = + AOMMAX(max_sb_rows, get_sb_rows_in_tile(pbi, tile_data->tile_info)); } } - if (pbi->tile_mt_info.alloc_tile_cols != tile_cols || - pbi->tile_mt_info.alloc_tile_rows != tile_rows) { - av1_dealloc_dec_jobs(&pbi->tile_mt_info); - alloc_dec_jobs(&pbi->tile_mt_info, cm, tile_rows, tile_cols); + if (pbi->allocated_row_mt_sync_rows != max_sb_rows) { + for (int i = 0; i < n_tiles; ++i) { + TileDataDec *const tile_data = pbi->tile_data + i; + av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync); + dec_row_mt_alloc(&tile_data->dec_row_mt_sync, cm, max_sb_rows); + } + pbi->allocated_row_mt_sync_rows = max_sb_rows; } - enqueue_tile_jobs(pbi, cm, tile_rows_start, tile_rows_end, tile_cols_start, - tile_cols_end, start_tile, end_tile); - qsort(pbi->tile_mt_info.job_queue, pbi->tile_mt_info.jobs_enqueued, - sizeof(pbi->tile_mt_info.job_queue[0]), compare_tile_buffers); - { - const int base = tile_count_tg / num_workers; - const int remain = tile_count_tg % num_workers; - int tile_start = start_tile; - int corrupted = 0; - - for (worker_idx = 0; worker_idx < num_workers; ++worker_idx) { - // compute number of tiles assign to each worker - const int count = base + (remain + worker_idx) / num_workers; - AVxWorker *const worker = &pbi->tile_workers[worker_idx]; - DecWorkerData *const thread_data = (DecWorkerData *)worker->data1; - - thread_data->data_end = data_end; - tile_start += count; + tile_mt_queue(pbi, tile_cols, tile_rows, tile_rows_start, tile_rows_end, + tile_cols_start, tile_cols_end, start_tile, end_tile); - worker->had_error = 0; - if (worker_idx == num_workers - 1) { - winterface->execute(worker); - } else { - winterface->launch(worker); - } - } + dec_alloc_cb_buf(pbi); - for (; worker_idx > 0; --worker_idx) { - AVxWorker *const worker = &pbi->tile_workers[worker_idx - 1]; - aom_merge_corrupted_flag(&corrupted, !winterface->sync(worker)); - } + row_mt_frame_init(pbi, tile_rows_start, tile_rows_end, tile_cols_start, + tile_cols_end, start_tile, end_tile, max_sb_rows); - pbi->mb.corrupted = corrupted; - } + reset_dec_workers(pbi, row_mt_worker_hook, num_workers); + launch_dec_workers(pbi, data_end, num_workers); + sync_dec_workers(pbi, num_workers); if (pbi->mb.corrupted) aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, @@ -3064,17 +3832,20 @@ static void error_handler(void *data) { } // Reads the high_bitdepth and twelve_bit fields in color_config() and sets -// cm->bit_depth based on the values of those fields and cm->profile. Reports -// errors by calling rb->error_handler() or aom_internal_error(). -static void av1_read_bitdepth(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { +// seq_params->bit_depth based on the values of those fields and +// seq_params->profile. Reports errors by calling rb->error_handler() or +// aom_internal_error(). +static void read_bitdepth(struct aom_read_bit_buffer *rb, + SequenceHeader *seq_params, + struct aom_internal_error_info *error_info) { const int high_bitdepth = aom_rb_read_bit(rb); - if (cm->profile == PROFILE_2 && high_bitdepth) { + if (seq_params->profile == PROFILE_2 && high_bitdepth) { const int twelve_bit = aom_rb_read_bit(rb); - cm->bit_depth = twelve_bit ? AOM_BITS_12 : AOM_BITS_10; - } else if (cm->profile <= PROFILE_2) { - cm->bit_depth = high_bitdepth ? AOM_BITS_10 : AOM_BITS_8; + seq_params->bit_depth = twelve_bit ? AOM_BITS_12 : AOM_BITS_10; + } else if (seq_params->profile <= PROFILE_2) { + seq_params->bit_depth = high_bitdepth ? AOM_BITS_10 : AOM_BITS_8; } else { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(error_info, AOM_CODEC_UNSUP_BITSTREAM, "Unsupported profile/bit-depth combination"); } } @@ -3082,6 +3853,7 @@ static void av1_read_bitdepth(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { void av1_read_film_grain_params(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { aom_film_grain_t *pars = &cm->film_grain_params; + const SequenceHeader *const seq_params = &cm->seq_params; pars->apply_grain = aom_rb_read_bit(rb); if (!pars->apply_grain) { @@ -3095,6 +3867,8 @@ void av1_read_film_grain_params(AV1_COMMON *cm, else pars->update_parameters = 1; + pars->bit_depth = seq_params->bit_depth; + if (!pars->update_parameters) { // inherit parameters from a previous reference frame RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; @@ -3129,11 +3903,11 @@ void av1_read_film_grain_params(AV1_COMMON *cm, pars->scaling_points_y[i][1] = aom_rb_read_literal(rb, 8); } - if (!cm->seq_params.monochrome) + if (!seq_params->monochrome) pars->chroma_scaling_from_luma = aom_rb_read_bit(rb); - if (cm->seq_params.monochrome || pars->chroma_scaling_from_luma || - ((cm->subsampling_x == 1) && (cm->subsampling_y == 1) && + if (seq_params->monochrome || pars->chroma_scaling_from_luma || + ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) && (pars->num_y_points == 0))) { pars->num_cb_points = 0; pars->num_cr_points = 0; @@ -3168,7 +3942,7 @@ void av1_read_film_grain_params(AV1_COMMON *cm, pars->scaling_points_cr[i][1] = aom_rb_read_literal(rb, 8); } - if ((cm->subsampling_x == 1) && (cm->subsampling_y == 1) && + if ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) && (((pars->num_cb_points == 0) && (pars->num_cr_points != 0)) || ((pars->num_cb_points != 0) && (pars->num_cr_points == 0)))) aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, @@ -3222,89 +3996,93 @@ void av1_read_film_grain_params(AV1_COMMON *cm, } static void read_film_grain(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { - if (cm->film_grain_params_present && (cm->show_frame || cm->showable_frame)) { + if (cm->seq_params.film_grain_params_present && + (cm->show_frame || cm->showable_frame)) { av1_read_film_grain_params(cm, rb); } else { memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params)); } - cm->film_grain_params.bit_depth = cm->bit_depth; + cm->film_grain_params.bit_depth = cm->seq_params.bit_depth; memcpy(&cm->cur_frame->film_grain_params, &cm->film_grain_params, sizeof(aom_film_grain_t)); } -void av1_read_color_config(AV1_COMMON *cm, struct aom_read_bit_buffer *rb, - int allow_lowbitdepth) { - av1_read_bitdepth(cm, rb); +void av1_read_color_config(struct aom_read_bit_buffer *rb, + int allow_lowbitdepth, SequenceHeader *seq_params, + struct aom_internal_error_info *error_info) { + read_bitdepth(rb, seq_params, error_info); - cm->use_highbitdepth = cm->bit_depth > AOM_BITS_8 || !allow_lowbitdepth; + seq_params->use_highbitdepth = + seq_params->bit_depth > AOM_BITS_8 || !allow_lowbitdepth; // monochrome bit (not needed for PROFILE_1) - const int is_monochrome = cm->profile != PROFILE_1 ? aom_rb_read_bit(rb) : 0; - cm->seq_params.monochrome = is_monochrome; + const int is_monochrome = + seq_params->profile != PROFILE_1 ? aom_rb_read_bit(rb) : 0; + seq_params->monochrome = is_monochrome; int color_description_present_flag = aom_rb_read_bit(rb); if (color_description_present_flag) { - cm->color_primaries = aom_rb_read_literal(rb, 8); - cm->transfer_characteristics = aom_rb_read_literal(rb, 8); - cm->matrix_coefficients = aom_rb_read_literal(rb, 8); + seq_params->color_primaries = aom_rb_read_literal(rb, 8); + seq_params->transfer_characteristics = aom_rb_read_literal(rb, 8); + seq_params->matrix_coefficients = aom_rb_read_literal(rb, 8); } else { - cm->color_primaries = AOM_CICP_CP_UNSPECIFIED; - cm->transfer_characteristics = AOM_CICP_TC_UNSPECIFIED; - cm->matrix_coefficients = AOM_CICP_MC_UNSPECIFIED; + seq_params->color_primaries = AOM_CICP_CP_UNSPECIFIED; + seq_params->transfer_characteristics = AOM_CICP_TC_UNSPECIFIED; + seq_params->matrix_coefficients = AOM_CICP_MC_UNSPECIFIED; } if (is_monochrome) { // [16,235] (including xvycc) vs [0,255] range - cm->color_range = aom_rb_read_bit(rb); - cm->subsampling_y = cm->subsampling_x = 1; - cm->chroma_sample_position = AOM_CSP_UNKNOWN; - cm->separate_uv_delta_q = 0; + seq_params->color_range = aom_rb_read_bit(rb); + seq_params->subsampling_y = seq_params->subsampling_x = 1; + seq_params->chroma_sample_position = AOM_CSP_UNKNOWN; + seq_params->separate_uv_delta_q = 0; return; } - if (cm->color_primaries == AOM_CICP_CP_BT_709 && - cm->transfer_characteristics == AOM_CICP_TC_SRGB && - cm->matrix_coefficients == AOM_CICP_MC_IDENTITY) { // it would be better - // to remove this - // dependency too - cm->subsampling_y = cm->subsampling_x = 0; - cm->color_range = 1; // assume full color-range - if (!(cm->profile == PROFILE_1 || - (cm->profile == PROFILE_2 && cm->bit_depth == AOM_BITS_12))) { + if (seq_params->color_primaries == AOM_CICP_CP_BT_709 && + seq_params->transfer_characteristics == AOM_CICP_TC_SRGB && + seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) { + // It would be good to remove this dependency. + seq_params->subsampling_y = seq_params->subsampling_x = 0; + seq_params->color_range = 1; // assume full color-range + if (!(seq_params->profile == PROFILE_1 || + (seq_params->profile == PROFILE_2 && + seq_params->bit_depth == AOM_BITS_12))) { aom_internal_error( - &cm->error, AOM_CODEC_UNSUP_BITSTREAM, + error_info, AOM_CODEC_UNSUP_BITSTREAM, "sRGB colorspace not compatible with specified profile"); } } else { // [16,235] (including xvycc) vs [0,255] range - cm->color_range = aom_rb_read_bit(rb); - if (cm->profile == PROFILE_0) { + seq_params->color_range = aom_rb_read_bit(rb); + if (seq_params->profile == PROFILE_0) { // 420 only - cm->subsampling_x = cm->subsampling_y = 1; - } else if (cm->profile == PROFILE_1) { + seq_params->subsampling_x = seq_params->subsampling_y = 1; + } else if (seq_params->profile == PROFILE_1) { // 444 only - cm->subsampling_x = cm->subsampling_y = 0; + seq_params->subsampling_x = seq_params->subsampling_y = 0; } else { - assert(cm->profile == PROFILE_2); - if (cm->bit_depth == AOM_BITS_12) { - cm->subsampling_x = aom_rb_read_bit(rb); - if (cm->subsampling_x) - cm->subsampling_y = aom_rb_read_bit(rb); // 422 or 420 + assert(seq_params->profile == PROFILE_2); + if (seq_params->bit_depth == AOM_BITS_12) { + seq_params->subsampling_x = aom_rb_read_bit(rb); + if (seq_params->subsampling_x) + seq_params->subsampling_y = aom_rb_read_bit(rb); // 422 or 420 else - cm->subsampling_y = 0; // 444 + seq_params->subsampling_y = 0; // 444 } else { // 422 - cm->subsampling_x = 1; - cm->subsampling_y = 0; + seq_params->subsampling_x = 1; + seq_params->subsampling_y = 0; } } - if (cm->matrix_coefficients == AOM_CICP_MC_IDENTITY && - (cm->subsampling_x || cm->subsampling_y)) { + if (seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY && + (seq_params->subsampling_x || seq_params->subsampling_y)) { aom_internal_error( - &cm->error, AOM_CODEC_UNSUP_BITSTREAM, + error_info, AOM_CODEC_UNSUP_BITSTREAM, "Identity CICP Matrix incompatible with non 4:4:4 color sampling"); } - if (cm->subsampling_x && cm->subsampling_y) { - cm->chroma_sample_position = aom_rb_read_literal(rb, 2); + if (seq_params->subsampling_x && seq_params->subsampling_y) { + seq_params->chroma_sample_position = aom_rb_read_literal(rb, 2); } } - cm->separate_uv_delta_q = aom_rb_read_bit(rb); + seq_params->separate_uv_delta_q = aom_rb_read_bit(rb); } void av1_read_timing_info_header(AV1_COMMON *cm, @@ -3338,8 +4116,8 @@ void av1_read_decoder_model_info(AV1_COMMON *cm, aom_rb_read_literal(rb, 5) + 1; cm->buffer_model.num_units_in_decoding_tick = aom_rb_read_unsigned_literal( rb, 32); // Number of units in a decoding tick - cm->buffer_model.buffer_removal_delay_length = aom_rb_read_literal(rb, 5) + 1; - cm->buffer_model.frame_presentation_delay_length = + cm->buffer_model.buffer_removal_time_length = aom_rb_read_literal(rb, 5) + 1; + cm->buffer_model.frame_presentation_time_length = aom_rb_read_literal(rb, 5) + 1; } @@ -3352,32 +4130,27 @@ void av1_read_op_parameters_info(AV1_COMMON *const cm, op_num + 1); } - cm->op_params[op_num].decoder_buffer_delay = aom_rb_read_literal( + cm->op_params[op_num].decoder_buffer_delay = aom_rb_read_unsigned_literal( rb, cm->buffer_model.encoder_decoder_buffer_delay_length); - cm->op_params[op_num].encoder_buffer_delay = aom_rb_read_literal( + cm->op_params[op_num].encoder_buffer_delay = aom_rb_read_unsigned_literal( rb, cm->buffer_model.encoder_decoder_buffer_delay_length); cm->op_params[op_num].low_delay_mode_flag = aom_rb_read_bit(rb); } -static void av1_read_tu_pts_info(AV1_COMMON *const cm, - struct aom_read_bit_buffer *rb) { - cm->tu_presentation_delay = - aom_rb_read_literal(rb, cm->buffer_model.frame_presentation_delay_length); -} - -void read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { - // rb->error_handler may be triggered during aom_rb_read_bit(), raising - // internal errors and immediate decoding termination. We use a local variable - // to store the info. as we decode. At the end, if no errors have occurred, - // cm->seq_params is updated. - SequenceHeader sh = cm->seq_params; - SequenceHeader *const seq_params = &sh; - int num_bits_width = aom_rb_read_literal(rb, 4) + 1; - int num_bits_height = aom_rb_read_literal(rb, 4) + 1; - int max_frame_width = aom_rb_read_literal(rb, num_bits_width) + 1; - int max_frame_height = aom_rb_read_literal(rb, num_bits_height) + 1; +static void av1_read_temporal_point_info(AV1_COMMON *const cm, + struct aom_read_bit_buffer *rb) { + cm->frame_presentation_time = aom_rb_read_unsigned_literal( + rb, cm->buffer_model.frame_presentation_time_length); +} + +void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb, + SequenceHeader *seq_params) { + const int num_bits_width = aom_rb_read_literal(rb, 4) + 1; + const int num_bits_height = aom_rb_read_literal(rb, 4) + 1; + const int max_frame_width = aom_rb_read_literal(rb, num_bits_width) + 1; + const int max_frame_height = aom_rb_read_literal(rb, num_bits_height) + 1; seq_params->num_bits_width = num_bits_width; seq_params->num_bits_height = num_bits_height; @@ -3452,7 +4225,6 @@ void read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { seq_params->enable_superres = aom_rb_read_bit(rb); seq_params->enable_cdef = aom_rb_read_bit(rb); seq_params->enable_restoration = aom_rb_read_bit(rb); - cm->seq_params = *seq_params; } static int read_global_motion_params(WarpedMotionParams *params, @@ -3640,9 +4412,12 @@ static void show_existing_frame_reset(AV1Decoder *const pbi, *cm->fc = cm->frame_contexts[existing_frame_idx]; } +// On success, returns 0. On failure, calls aom_internal_error and does not +// return. static int read_uncompressed_header(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) { AV1_COMMON *const cm = &pbi->common; + const SequenceHeader *const seq_params = &cm->seq_params; MACROBLOCKD *const xd = &pbi->mb; BufferPool *const pool = cm->buffer_pool; RefCntBuffer *const frame_bufs = pool->frame_bufs; @@ -3658,7 +4433,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, // NOTE: By default all coded frames to be used as a reference cm->is_reference_frame = 1; - if (cm->seq_params.reduced_still_picture_hdr) { + if (seq_params->reduced_still_picture_hdr) { cm->show_existing_frame = 0; cm->show_frame = 1; cm->frame_type = KEY_FRAME; @@ -3671,12 +4446,12 @@ static int read_uncompressed_header(AV1Decoder *pbi, // Show an existing frame directly. const int existing_frame_idx = aom_rb_read_literal(rb, 3); const int frame_to_show = cm->ref_frame_map[existing_frame_idx]; - if (cm->seq_params.decoder_model_info_present_flag && + if (seq_params->decoder_model_info_present_flag && cm->timing_info.equal_picture_interval == 0) { - av1_read_tu_pts_info(cm, rb); + av1_read_temporal_point_info(cm, rb); } - if (cm->seq_params.frame_id_numbers_present_flag) { - int frame_id_length = cm->seq_params.frame_id_length; + if (seq_params->frame_id_numbers_present_flag) { + int frame_id_length = seq_params->frame_id_length; int display_frame_id = aom_rb_read_literal(rb, frame_id_length); /* Compare display_frame_id with ref_frame_id and check valid for * referencing */ @@ -3719,16 +4494,16 @@ static int read_uncompressed_header(AV1Decoder *pbi, cm->frame_type = (FRAME_TYPE)aom_rb_read_literal(rb, 2); // 2 bits cm->show_frame = aom_rb_read_bit(rb); - if (cm->seq_params.still_picture && + if (seq_params->still_picture && (cm->frame_type != KEY_FRAME || !cm->show_frame)) { aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Still pictures must be coded as shown keyframes"); } cm->showable_frame = cm->frame_type != KEY_FRAME; if (cm->show_frame) { - if (cm->seq_params.decoder_model_info_present_flag && + if (seq_params->decoder_model_info_present_flag && cm->timing_info.equal_picture_interval == 0) - av1_read_tu_pts_info(cm, rb); + av1_read_temporal_point_info(cm, rb); } else { // See if this frame can be used as show_existing_frame in future cm->showable_frame = aom_rb_read_bit(rb); @@ -3742,17 +4517,17 @@ static int read_uncompressed_header(AV1Decoder *pbi, } cm->disable_cdf_update = aom_rb_read_bit(rb); - if (cm->seq_params.force_screen_content_tools == 2) { + if (seq_params->force_screen_content_tools == 2) { cm->allow_screen_content_tools = aom_rb_read_bit(rb); } else { - cm->allow_screen_content_tools = cm->seq_params.force_screen_content_tools; + cm->allow_screen_content_tools = seq_params->force_screen_content_tools; } if (cm->allow_screen_content_tools) { - if (cm->seq_params.force_integer_mv == 2) { + if (seq_params->force_integer_mv == 2) { cm->cur_frame_force_integer_mv = aom_rb_read_bit(rb); } else { - cm->cur_frame_force_integer_mv = cm->seq_params.force_integer_mv; + cm->cur_frame_force_integer_mv = seq_params->force_integer_mv; } } else { cm->cur_frame_force_integer_mv = 0; @@ -3763,10 +4538,10 @@ static int read_uncompressed_header(AV1Decoder *pbi, cm->allow_intrabc = 0; cm->primary_ref_frame = PRIMARY_REF_NONE; - if (!cm->seq_params.reduced_still_picture_hdr) { - if (cm->seq_params.frame_id_numbers_present_flag) { - int frame_id_length = cm->seq_params.frame_id_length; - int diff_len = cm->seq_params.delta_frame_id_length; + if (!seq_params->reduced_still_picture_hdr) { + if (seq_params->frame_id_numbers_present_flag) { + int frame_id_length = seq_params->frame_id_length; + int diff_len = seq_params->delta_frame_id_length; int prev_frame_id = 0; int have_prev_frame_id = !pbi->decoding_first_frame && !(cm->frame_type == KEY_FRAME && cm->show_frame); @@ -3811,7 +4586,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, frame_is_sframe(cm) ? 1 : aom_rb_read_literal(rb, 1); cm->frame_offset = - aom_rb_read_literal(rb, cm->seq_params.order_hint_bits_minus_1 + 1); + aom_rb_read_literal(rb, seq_params->order_hint_bits_minus_1 + 1); cm->current_video_frame = cm->frame_offset; if (!cm->error_resilient_mode && !frame_is_intra_only(cm)) { @@ -3819,27 +4594,27 @@ static int read_uncompressed_header(AV1Decoder *pbi, } } - if (cm->seq_params.decoder_model_info_present_flag) { - cm->buffer_removal_delay_present = aom_rb_read_bit(rb); - if (cm->buffer_removal_delay_present) { + if (seq_params->decoder_model_info_present_flag) { + cm->buffer_removal_time_present = aom_rb_read_bit(rb); + if (cm->buffer_removal_time_present) { for (int op_num = 0; - op_num < cm->seq_params.operating_points_cnt_minus_1 + 1; op_num++) { + op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) { if (cm->op_params[op_num].decoder_model_param_present_flag) { - if ((((cm->seq_params.operating_point_idc[op_num] >> + if ((((seq_params->operating_point_idc[op_num] >> cm->temporal_layer_id) & 0x1) && - ((cm->seq_params.operating_point_idc[op_num] >> + ((seq_params->operating_point_idc[op_num] >> (cm->spatial_layer_id + 8)) & 0x1)) || - cm->seq_params.operating_point_idc[op_num] == 0) { - cm->op_frame_timing[op_num].buffer_removal_delay = - aom_rb_read_literal( - rb, cm->buffer_model.buffer_removal_delay_length); + seq_params->operating_point_idc[op_num] == 0) { + cm->op_frame_timing[op_num].buffer_removal_time = + aom_rb_read_unsigned_literal( + rb, cm->buffer_model.buffer_removal_time_length); } else { - cm->op_frame_timing[op_num].buffer_removal_delay = 0; + cm->op_frame_timing[op_num].buffer_removal_time = 0; } } else { - cm->op_frame_timing[op_num].buffer_removal_delay = 0; + cm->op_frame_timing[op_num].buffer_removal_time = 0; } } } @@ -3882,11 +4657,11 @@ static int read_uncompressed_header(AV1Decoder *pbi, if (!frame_is_intra_only(cm) || pbi->refresh_frame_flags != 0xFF) { // Read all ref frame order hints if error_resilient_mode == 1 - if (cm->error_resilient_mode && cm->seq_params.enable_order_hint) { + if (cm->error_resilient_mode && seq_params->enable_order_hint) { for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) { // Read order hint from bit stream unsigned int frame_offset = - aom_rb_read_literal(rb, cm->seq_params.order_hint_bits_minus_1 + 1); + aom_rb_read_literal(rb, seq_params->order_hint_bits_minus_1 + 1); // Get buffer index int buf_idx = cm->ref_frame_map[ref_idx]; assert(buf_idx < FRAME_BUFFERS); @@ -3906,10 +4681,10 @@ static int read_uncompressed_header(AV1Decoder *pbi, } lock_buffer_pool(pool); if (aom_realloc_frame_buffer( - &frame_bufs[buf_idx].buf, cm->seq_params.max_frame_width, - cm->seq_params.max_frame_height, cm->subsampling_x, - cm->subsampling_y, cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, - cm->byte_alignment, + &frame_bufs[buf_idx].buf, seq_params->max_frame_width, + seq_params->max_frame_height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + AOM_BORDER_IN_PIXELS, cm->byte_alignment, &pool->frame_bufs[buf_idx].raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) { unlock_buffer_pool(pool); @@ -3917,7 +4692,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, "Failed to allocate frame buffer"); } unlock_buffer_pool(pool); - set_planes_to_neutral_grey(cm, &frame_bufs[buf_idx].buf, 0); + set_planes_to_neutral_grey(seq_params, &frame_bufs[buf_idx].buf, 0); cm->ref_frame_map[ref_idx] = buf_idx; frame_bufs[buf_idx].cur_frame_offset = frame_offset; @@ -3937,7 +4712,8 @@ static int read_uncompressed_header(AV1Decoder *pbi, cm->allow_ref_frame_mvs = 0; if (cm->intra_only) { - cm->cur_frame->film_grain_params_present = cm->film_grain_params_present; + cm->cur_frame->film_grain_params_present = + seq_params->film_grain_params_present; setup_frame_size(cm, frame_size_override_flag, rb); if (cm->allow_screen_content_tools && !av1_superres_scaled(cm)) cm->allow_intrabc = aom_rb_read_bit(rb); @@ -3945,7 +4721,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, } else if (pbi->need_resync != 1) { /* Skip if need resync */ // Frame refs short signaling is off when error resilient mode is on. - if (cm->seq_params.enable_order_hint) + if (seq_params->enable_order_hint) cm->frame_refs_short_signaling = aom_rb_read_bit(rb); if (cm->frame_refs_short_signaling) { @@ -3999,9 +4775,9 @@ static int read_uncompressed_header(AV1Decoder *pbi, cm->ref_frame_sign_bias[LAST_FRAME + i] = 0; - if (cm->seq_params.frame_id_numbers_present_flag) { - int frame_id_length = cm->seq_params.frame_id_length; - int diff_len = cm->seq_params.delta_frame_id_length; + if (seq_params->frame_id_numbers_present_flag) { + int frame_id_length = seq_params->frame_id_length; + int diff_len = seq_params->delta_frame_id_length; int delta_frame_id_minus_1 = aom_rb_read_literal(rb, diff_len); int ref_frame_id = ((cm->current_frame_id - (delta_frame_id_minus_1 + 1) + @@ -4064,7 +4840,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, cm->cur_frame->intra_only = cm->frame_type == KEY_FRAME || cm->intra_only; cm->cur_frame->frame_type = cm->frame_type; - if (cm->seq_params.frame_id_numbers_present_flag) { + if (seq_params->frame_id_numbers_present_flag) { /* If bitmask is set, update reference frame id values and mark frames as valid for reference */ int refresh_frame_flags = pbi->refresh_frame_flags; @@ -4077,7 +4853,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, } const int might_bwd_adapt = - !(cm->seq_params.reduced_still_picture_hdr) && !(cm->disable_cdf_update); + !(seq_params->reduced_still_picture_hdr) && !(cm->disable_cdf_update); if (might_bwd_adapt) { cm->refresh_frame_context = aom_rb_read_bit(rb) ? REFRESH_FRAME_CONTEXT_DISABLED @@ -4086,14 +4862,16 @@ static int read_uncompressed_header(AV1Decoder *pbi, cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; } - get_frame_new_buffer(cm)->bit_depth = cm->bit_depth; - get_frame_new_buffer(cm)->color_primaries = cm->color_primaries; + get_frame_new_buffer(cm)->bit_depth = seq_params->bit_depth; + get_frame_new_buffer(cm)->color_primaries = seq_params->color_primaries; get_frame_new_buffer(cm)->transfer_characteristics = - cm->transfer_characteristics; - get_frame_new_buffer(cm)->matrix_coefficients = cm->matrix_coefficients; - get_frame_new_buffer(cm)->monochrome = cm->seq_params.monochrome; - get_frame_new_buffer(cm)->chroma_sample_position = cm->chroma_sample_position; - get_frame_new_buffer(cm)->color_range = cm->color_range; + seq_params->transfer_characteristics; + get_frame_new_buffer(cm)->matrix_coefficients = + seq_params->matrix_coefficients; + get_frame_new_buffer(cm)->monochrome = seq_params->monochrome; + get_frame_new_buffer(cm)->chroma_sample_position = + seq_params->chroma_sample_position; + get_frame_new_buffer(cm)->color_range = seq_params->color_range; get_frame_new_buffer(cm)->render_width = cm->render_width; get_frame_new_buffer(cm)->render_height = cm->render_height; @@ -4145,7 +4923,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, read_tile_info(pbi, rb); setup_quantization(cm, rb); - xd->bd = (int)cm->bit_depth; + xd->bd = (int)seq_params->bit_depth; if (cm->num_allocated_above_context_planes < av1_num_planes(cm) || cm->num_allocated_above_context_mi_col < cm->mi_cols || @@ -4196,22 +4974,22 @@ static int read_uncompressed_header(AV1Decoder *pbi, cm->lf.filter_level[0] = 0; cm->lf.filter_level[1] = 0; } - if (cm->coded_lossless || !cm->seq_params.enable_cdef) { + if (cm->coded_lossless || !seq_params->enable_cdef) { cm->cdef_bits = 0; cm->cdef_strengths[0] = 0; cm->cdef_uv_strengths[0] = 0; } - if (cm->all_lossless || !cm->seq_params.enable_restoration) { + if (cm->all_lossless || !seq_params->enable_restoration) { cm->rst_info[0].frame_restoration_type = RESTORE_NONE; cm->rst_info[1].frame_restoration_type = RESTORE_NONE; cm->rst_info[2].frame_restoration_type = RESTORE_NONE; } setup_loopfilter(cm, rb); - if (!cm->coded_lossless && cm->seq_params.enable_cdef) { + if (!cm->coded_lossless && seq_params->enable_cdef) { setup_cdef(cm, rb); } - if (!cm->all_lossless && cm->seq_params.enable_restoration) { + if (!cm->all_lossless && seq_params->enable_restoration) { decode_restoration_mode(cm, rb); } @@ -4236,7 +5014,8 @@ static int read_uncompressed_header(AV1Decoder *pbi, if (!frame_is_intra_only(cm)) read_global_motion(cm, rb); - cm->cur_frame->film_grain_params_present = cm->film_grain_params_present; + cm->cur_frame->film_grain_params_present = + seq_params->film_grain_params_present; read_film_grain(cm, rb); #if EXT_TILE_DEBUG @@ -4282,11 +5061,11 @@ void superres_post_decode(AV1Decoder *pbi) { unlock_buffer_pool(pool); } -int av1_decode_frame_headers_and_setup(AV1Decoder *pbi, - struct aom_read_bit_buffer *rb, - const uint8_t *data, - const uint8_t **p_data_end, - int trailing_bits_present) { +uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi, + struct aom_read_bit_buffer *rb, + const uint8_t *data, + const uint8_t **p_data_end, + int trailing_bits_present) { AV1_COMMON *const cm = &pbi->common; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &pbi->mb; @@ -4316,7 +5095,8 @@ int av1_decode_frame_headers_and_setup(AV1Decoder *pbi, pbi->dec_tile_col = -1; } - pbi->uncomp_hdr_size = aom_rb_bytes_read(rb); + const uint32_t uncomp_hdr_size = + (uint32_t)aom_rb_bytes_read(rb); // Size of the uncompressed header YV12_BUFFER_CONFIG *new_fb = get_frame_new_buffer(cm); xd->cur_buf = new_fb; if (av1_allow_intrabc(cm)) { @@ -4327,7 +5107,7 @@ int av1_decode_frame_headers_and_setup(AV1Decoder *pbi, if (cm->show_existing_frame) { // showing a frame directly - *p_data_end = data + aom_rb_bytes_read(rb); + *p_data_end = data + uncomp_hdr_size; if (cm->reset_decoder_state) { // Use the default frame context values. *cm->fc = cm->frame_contexts[FRAME_CONTEXT_DEFAULTS]; @@ -4335,7 +5115,7 @@ int av1_decode_frame_headers_and_setup(AV1Decoder *pbi, aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Uninitialized entropy context."); } - return 0; + return uncomp_hdr_size; } cm->setup_mi(cm); @@ -4344,7 +5124,8 @@ int av1_decode_frame_headers_and_setup(AV1Decoder *pbi, av1_setup_motion_field(cm); - av1_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y, num_planes); + av1_setup_block_planes(xd, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y, num_planes); if (cm->primary_ref_frame == PRIMARY_REF_NONE) { // use the default frame context values *cm->fc = cm->frame_contexts[FRAME_CONTEXT_DEFAULTS]; @@ -4356,7 +5137,7 @@ int av1_decode_frame_headers_and_setup(AV1Decoder *pbi, "Uninitialized entropy context."); xd->corrupted = 0; - return 0; + return uncomp_hdr_size; } // Once-per-frame initialization @@ -4368,7 +5149,7 @@ static void setup_frame_info(AV1Decoder *pbi) { cm->rst_info[2].frame_restoration_type != RESTORE_NONE) { av1_alloc_restoration_buffers(cm); } - const int use_highbd = cm->use_highbitdepth ? 1 : 0; + const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0; const int buf_size = MC_TEMP_BUF_PELS << use_highbd; if (pbi->td.mc_buf_size != buf_size) { av1_free_mc_tmp_buf(&pbi->td, use_highbd); @@ -4386,14 +5167,21 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data, if (initialize_flag) setup_frame_info(pbi); - if (pbi->max_threads > 1 && tile_count_tg > 1 && !cm->large_scale_tile) + if (pbi->max_threads > 1 && !(cm->large_scale_tile && !pbi->ext_tile_debug) && + pbi->row_mt) + *p_data_end = + decode_tiles_row_mt(pbi, data, data_end, start_tile, end_tile); + else if (pbi->max_threads > 1 && tile_count_tg > 1 && + !(cm->large_scale_tile && !pbi->ext_tile_debug)) *p_data_end = decode_tiles_mt(pbi, data, data_end, start_tile, end_tile); else *p_data_end = decode_tiles(pbi, data, data_end, start_tile, end_tile); const int num_planes = av1_num_planes(cm); // If the bit stream is monochrome, set the U and V buffers to a constant. - if (num_planes < 3) set_planes_to_neutral_grey(cm, xd->cur_buf, 1); + if (num_planes < 3) { + set_planes_to_neutral_grey(&cm->seq_params, xd->cur_buf, 1); + } if (end_tile != cm->tile_rows * cm->tile_cols - 1) { return; diff --git a/third_party/aom/av1/decoder/decodeframe.h b/third_party/aom/av1/decoder/decodeframe.h index 330cedcdc..d289b31f2 100644 --- a/third_party/aom/av1/decoder/decodeframe.h +++ b/third_party/aom/av1/decoder/decodeframe.h @@ -18,12 +18,13 @@ extern "C" { struct AV1Decoder; struct aom_read_bit_buffer; +struct ThreadData; // Reads the middle part of the sequence header OBU (from -// frame_width_bits_minus_1 to enable_restoration) into cm->seq_params (a -// SequenceHeader). Reports errors by calling rb->error_handler() or -// aom_internal_error(). -void read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb); +// frame_width_bits_minus_1 to enable_restoration) into seq_params. +// Reports errors by calling rb->error_handler() or aom_internal_error(). +void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb, + SequenceHeader *seq_params); void av1_read_frame_size(struct aom_read_bit_buffer *rb, int num_bits_width, int num_bits_height, int *width, int *height); @@ -34,11 +35,14 @@ BITSTREAM_PROFILE av1_read_profile(struct aom_read_bit_buffer *rb); int av1_check_trailing_bits(struct AV1Decoder *pbi, struct aom_read_bit_buffer *rb); -int av1_decode_frame_headers_and_setup(struct AV1Decoder *pbi, - struct aom_read_bit_buffer *rb, - const uint8_t *data, - const uint8_t **p_data_end, - int trailing_bits_present); +// On success, returns the frame header size. On failure, calls +// aom_internal_error and does not return. +// TODO(wtc): Figure out and document the p_data_end parameter. +uint32_t av1_decode_frame_headers_and_setup(struct AV1Decoder *pbi, + struct aom_read_bit_buffer *rb, + const uint8_t *data, + const uint8_t **p_data_end, + int trailing_bits_present); void av1_decode_tg_tiles_and_wrapup(struct AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end, @@ -47,8 +51,9 @@ void av1_decode_tg_tiles_and_wrapup(struct AV1Decoder *pbi, const uint8_t *data, // Implements the color_config() function in the spec. Reports errors by // calling rb->error_handler() or aom_internal_error(). -void av1_read_color_config(AV1_COMMON *cm, struct aom_read_bit_buffer *rb, - int allow_lowbitdepth); +void av1_read_color_config(struct aom_read_bit_buffer *rb, + int allow_lowbitdepth, SequenceHeader *seq_params, + struct aom_internal_error_info *error_info); // Implements the timing_info() function in the spec. Reports errors by calling // rb->error_handler(). @@ -69,7 +74,7 @@ struct aom_read_bit_buffer *av1_init_read_bit_buffer( struct AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data, const uint8_t *data_end); -void av1_free_mc_tmp_buf(void *td, int use_highbd); +void av1_free_mc_tmp_buf(struct ThreadData *thread_data, int use_highbd); void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm); diff --git a/third_party/aom/av1/decoder/decodemv.c b/third_party/aom/av1/decoder/decodemv.c index cc8f4d29e..5e920b18d 100644 --- a/third_party/aom/av1/decoder/decodemv.c +++ b/third_party/aom/av1/decoder/decodemv.c @@ -290,7 +290,7 @@ static int read_segment_id(AV1_COMMON *const cm, const MACROBLOCKD *const xd, av1_neg_deinterleave(coded_id, pred, seg->last_active_segid + 1); if (segment_id < 0 || segment_id > seg->last_active_segid) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, "Corrupted segment_ids"); } return segment_id; @@ -573,7 +573,7 @@ static void read_palette_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_read_symbol(r, xd->tile_ctx->palette_y_size_cdf[bsize_ctx], PALETTE_SIZES, ACCT_STR) + 2; - read_palette_colors_y(xd, cm->bit_depth, pmi, r); + read_palette_colors_y(xd, cm->seq_params.bit_depth, pmi, r); } } if (num_planes > 1 && mbmi->uv_mode == UV_DC_PRED && @@ -587,7 +587,7 @@ static void read_palette_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_read_symbol(r, xd->tile_ctx->palette_uv_size_cdf[bsize_ctx], PALETTE_SIZES, ACCT_STR) + 2; - read_palette_colors_uv(xd, cm->bit_depth, pmi, r); + read_palette_colors_uv(xd, cm->seq_params.bit_depth, pmi, r); } } } @@ -1299,7 +1299,7 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi, } if (is_compound != is_inter_compound_mode(mbmi->mode)) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, "Prediction mode %d invalid with ref frame %d %d", mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]); } @@ -1480,8 +1480,9 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi, } } - xd->cfl.is_chroma_reference = is_chroma_reference( - mi_row, mi_col, bsize, cm->subsampling_x, cm->subsampling_y); + xd->cfl.is_chroma_reference = + is_chroma_reference(mi_row, mi_col, bsize, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y); xd->cfl.store_y = store_cfl_required(cm, xd); #if DEC_MISMATCH_DEBUG diff --git a/third_party/aom/av1/decoder/decoder.c b/third_party/aom/av1/decoder/decoder.c index 2e91d27d3..e978fad6c 100644 --- a/third_party/aom/av1/decoder/decoder.c +++ b/third_party/aom/av1/decoder/decoder.c @@ -71,6 +71,7 @@ static void dec_free_mi(AV1_COMMON *cm) { cm->mip = NULL; aom_free(cm->mi_grid_base); cm->mi_grid_base = NULL; + cm->mi_alloc_size = 0; } AV1Decoder *av1_decoder_create(BufferPool *const pool) { @@ -81,6 +82,9 @@ AV1Decoder *av1_decoder_create(BufferPool *const pool) { av1_zero(*pbi); + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. if (setjmp(cm->error.jmp)) { cm->error.setjmp = 0; av1_decoder_remove(pbi); @@ -98,7 +102,7 @@ AV1Decoder *av1_decoder_create(BufferPool *const pool) { memset(cm->frame_contexts, 0, FRAME_CONTEXTS * sizeof(*cm->frame_contexts)); pbi->need_resync = 1; - once(initialize_dec); + aom_once(initialize_dec); // Initialize the references to not point to any frame buffers. memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map)); @@ -108,7 +112,7 @@ AV1Decoder *av1_decoder_create(BufferPool *const pool) { pbi->decoding_first_frame = 1; pbi->common.buffer_pool = pool; - cm->bit_depth = AOM_BITS_8; + cm->seq_params.bit_depth = AOM_BITS_8; cm->dequant_bit_depth = AOM_BITS_8; cm->alloc_mi = av1_dec_alloc_mi; @@ -146,6 +150,12 @@ void av1_dealloc_dec_jobs(struct AV1DecTileMTData *tile_mt_info) { } } +void av1_dec_free_cb_buf(AV1Decoder *pbi) { + aom_free(pbi->cb_buffer_base); + pbi->cb_buffer_base = NULL; + pbi->cb_buffer_alloc_size = 0; +} + void av1_decoder_remove(AV1Decoder *pbi) { int i; @@ -161,7 +171,7 @@ void av1_decoder_remove(AV1Decoder *pbi) { if (pbi->thread_data) { for (int worker_idx = 0; worker_idx < pbi->max_threads - 1; worker_idx++) { DecWorkerData *const thread_data = pbi->thread_data + worker_idx; - const int use_highbd = pbi->common.use_highbitdepth ? 1 : 0; + const int use_highbd = pbi->common.seq_params.use_highbitdepth ? 1 : 0; av1_free_mc_tmp_buf(thread_data->td, use_highbd); aom_free(thread_data->td); } @@ -172,6 +182,20 @@ void av1_decoder_remove(AV1Decoder *pbi) { AVxWorker *const worker = &pbi->tile_workers[i]; aom_get_worker_interface()->end(worker); } +#if CONFIG_MULTITHREAD + if (pbi->row_mt_mutex_ != NULL) { + pthread_mutex_destroy(pbi->row_mt_mutex_); + aom_free(pbi->row_mt_mutex_); + } + if (pbi->row_mt_cond_ != NULL) { + pthread_cond_destroy(pbi->row_mt_cond_); + aom_free(pbi->row_mt_cond_); + } +#endif + for (i = 0; i < pbi->allocated_tiles; i++) { + TileDataDec *const tile_data = pbi->tile_data + i; + av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync); + } aom_free(pbi->tile_data); aom_free(pbi->tile_workers); @@ -181,10 +205,11 @@ void av1_decoder_remove(AV1Decoder *pbi) { av1_dealloc_dec_jobs(&pbi->tile_mt_info); } + av1_dec_free_cb_buf(pbi); #if CONFIG_ACCOUNTING aom_accounting_clear(&pbi->accounting); #endif - const int use_highbd = pbi->common.use_highbitdepth ? 1 : 0; + const int use_highbd = pbi->common.seq_params.use_highbitdepth ? 1 : 0; av1_free_mc_tmp_buf(&pbi->td, use_highbd); aom_free(pbi); @@ -279,7 +304,7 @@ aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx, ref_buf->y_buffer = sd->y_buffer; ref_buf->u_buffer = sd->u_buffer; ref_buf->v_buffer = sd->v_buffer; - ref_buf->use_external_refernce_buffers = 1; + ref_buf->use_external_reference_buffers = 1; } } @@ -414,7 +439,10 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, // Find a free frame buffer. Return error if can not find any. cm->new_fb_idx = get_free_fb(cm); - if (cm->new_fb_idx == INVALID_IDX) return AOM_CODEC_MEM_ERROR; + if (cm->new_fb_idx == INVALID_IDX) { + cm->error.error_code = AOM_CODEC_MEM_ERROR; + return 1; + } // Assign a MV array to the frame buffer. cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; @@ -423,6 +451,9 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. if (setjmp(cm->error.jmp)) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); int i; @@ -474,7 +505,13 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, int frame_decoded = aom_decode_frame_from_obus(pbi, source, source + size, psource); - if (cm->error.error_code != AOM_CODEC_OK) return 1; + if (cm->error.error_code != AOM_CODEC_OK) { + lock_buffer_pool(pool); + decrease_ref_count(cm->new_fb_idx, frame_bufs, pool); + unlock_buffer_pool(pool); + cm->error.setjmp = 0; + return 1; + } #if TXCOEFF_TIMER cm->cum_txcoeff_timer += cm->txcoeff_timer; @@ -493,7 +530,10 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, pbi->decoding_first_frame = 0; } - if (cm->error.error_code != AOM_CODEC_OK) return 1; + if (cm->error.error_code != AOM_CODEC_OK) { + cm->error.setjmp = 0; + return 1; + } aom_clear_system_state(); diff --git a/third_party/aom/av1/decoder/decoder.h b/third_party/aom/av1/decoder/decoder.h index 42fcc1256..610b98d95 100644 --- a/third_party/aom/av1/decoder/decoder.h +++ b/third_party/aom/av1/decoder/decoder.h @@ -33,6 +33,20 @@ extern "C" { #endif +typedef void (*decode_block_visitor_fn_t)(const AV1_COMMON *const cm, + MACROBLOCKD *const xd, + aom_reader *const r, const int plane, + const int row, const int col, + const TX_SIZE tx_size); + +typedef void (*predict_inter_block_visitor_fn_t)(AV1_COMMON *const cm, + MACROBLOCKD *const xd, + int mi_row, int mi_col, + BLOCK_SIZE bsize); + +typedef void (*cfl_store_inter_block_visitor_fn_t)(AV1_COMMON *const cm, + MACROBLOCKD *const xd); + typedef struct ThreadData { aom_reader *bit_reader; DECLARE_ALIGNED(32, MACROBLOCKD, xd); @@ -41,12 +55,54 @@ typedef struct ThreadData { CB_BUFFER cb_buffer_base; uint8_t *mc_buf[2]; int32_t mc_buf_size; + + decode_block_visitor_fn_t read_coeffs_tx_intra_block_visit; + decode_block_visitor_fn_t predict_and_recon_intra_block_visit; + decode_block_visitor_fn_t read_coeffs_tx_inter_block_visit; + decode_block_visitor_fn_t inverse_tx_inter_block_visit; + predict_inter_block_visitor_fn_t predict_inter_block_visit; + cfl_store_inter_block_visitor_fn_t cfl_store_inter_block_visit; } ThreadData; +typedef struct AV1DecRowMTJobInfo { + int tile_row; + int tile_col; + int mi_row; +} AV1DecRowMTJobInfo; + +typedef struct AV1DecRowMTSyncData { +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex_; + pthread_cond_t *cond_; +#endif + int allocated_sb_rows; + int *cur_sb_col; + int sync_range; + int mi_rows; + int mi_cols; + int mi_rows_parse_done; + int mi_rows_decode_started; + int num_threads_working; +} AV1DecRowMTSync; + +typedef struct AV1DecRowMTInfo { + int tile_rows_start; + int tile_rows_end; + int tile_cols_start; + int tile_cols_end; + int start_tile; + int end_tile; + int mi_rows_parse_done; + int mi_rows_decode_started; + int mi_rows_to_decode; + int row_mt_exit; +} AV1DecRowMTInfo; + typedef struct TileDataDec { TileInfo tile_info; aom_reader bit_reader; DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx); + AV1DecRowMTSync dec_row_mt_sync; } TileDataDec; typedef struct TileBufferDec { @@ -139,9 +195,8 @@ typedef struct AV1Decoder { int acct_enabled; Accounting accounting; #endif - size_t uncomp_hdr_size; // Size of the uncompressed header - int tg_size; // Number of tiles in the current tilegroup - int tg_start; // First tile in the current tilegroup + int tg_size; // Number of tiles in the current tilegroup + int tg_start; // First tile in the current tilegroup int tg_size_bit_offset; int sequence_header_ready; #if CONFIG_INSPECTION @@ -162,12 +217,27 @@ typedef struct AV1Decoder { int tile_count_minus_1; uint32_t coded_tile_data_size; unsigned int ext_tile_debug; // for ext-tile software debug & testing + unsigned int row_mt; EXTERNAL_REFERENCES ext_refs; size_t tile_list_size; uint8_t *tile_list_output; size_t buffer_sz; + + CB_BUFFER *cb_buffer_base; + int cb_buffer_alloc_size; + + int allocated_row_mt_sync_rows; + +#if CONFIG_MULTITHREAD + pthread_mutex_t *row_mt_mutex_; + pthread_cond_t *row_mt_cond_; +#endif + + AV1DecRowMTInfo frame_row_mt_info; } AV1Decoder; +// Returns 0 on success. Sets pbi->common.error.error_code to a nonzero error +// code and returns a nonzero value on failure. int av1_receive_compressed_data(struct AV1Decoder *pbi, size_t size, const uint8_t **dest); @@ -192,6 +262,10 @@ struct AV1Decoder *av1_decoder_create(BufferPool *const pool); void av1_decoder_remove(struct AV1Decoder *pbi); void av1_dealloc_dec_jobs(struct AV1DecTileMTData *tile_jobs_sync); +void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync); + +void av1_dec_free_cb_buf(AV1Decoder *pbi); + static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs, BufferPool *const pool) { if (idx >= 0) { @@ -207,18 +281,6 @@ static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs, } } -static INLINE int dec_is_ref_frame_buf(AV1Decoder *const pbi, - RefCntBuffer *frame_buf) { - AV1_COMMON *const cm = &pbi->common; - int i; - for (i = 0; i < INTER_REFS_PER_FRAME; ++i) { - RefBuffer *const ref_frame = &cm->frame_refs[i]; - if (ref_frame->idx == INVALID_IDX) continue; - if (frame_buf == &cm->buffer_pool->frame_bufs[ref_frame->idx]) break; - } - return (i < INTER_REFS_PER_FRAME); -} - #define ACCT_STR __func__ static INLINE int av1_read_uniform(aom_reader *r, int n) { const int l = get_unsigned_bits(n); @@ -238,6 +300,10 @@ void av1_visit_palette(AV1Decoder *const pbi, MACROBLOCKD *const xd, int mi_row, int mi_col, aom_reader *r, BLOCK_SIZE bsize, palette_visitor_fn_t visit); +typedef void (*block_visitor_fn_t)(AV1Decoder *const pbi, ThreadData *const td, + int mi_row, int mi_col, aom_reader *r, + PARTITION_TYPE partition, BLOCK_SIZE bsize); + #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/decoder/decodetxb.c b/third_party/aom/av1/decoder/decodetxb.c index f9a3e8578..f3ef2d55e 100644 --- a/third_party/aom/av1/decoder/decodetxb.c +++ b/third_party/aom/av1/decoder/decodetxb.c @@ -320,10 +320,14 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd, return cul_level; } -uint8_t av1_read_coeffs_txb_facade(const AV1_COMMON *const cm, - MACROBLOCKD *const xd, aom_reader *const r, - const int row, const int col, - const int plane, const TX_SIZE tx_size) { +void av1_read_coeffs_txb_facade(const AV1_COMMON *const cm, + MACROBLOCKD *const xd, aom_reader *const r, + const int plane, const int row, const int col, + const TX_SIZE tx_size) { +#if TXCOEFF_TIMER + struct aom_usec_timer timer; + aom_usec_timer_start(&timer); +#endif MB_MODE_INFO *const mbmi = xd->mi[0]; struct macroblockd_plane *const pd = &xd->plane[plane]; @@ -337,5 +341,22 @@ uint8_t av1_read_coeffs_txb_facade(const AV1_COMMON *const cm, const uint8_t cul_level = av1_read_coeffs_txb(cm, xd, r, row, col, plane, &txb_ctx, tx_size); av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, col, row); - return cul_level; + + if (is_inter_block(mbmi)) { + PLANE_TYPE plane_type = get_plane_type(plane); + // tx_type will be read out in av1_read_coeffs_txb_facade + const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, row, col, tx_size, + cm->reduced_tx_set_used); + + if (plane == 0) + update_txk_array(mbmi->txk_type, mbmi->sb_type, row, col, tx_size, + tx_type); + } + +#if TXCOEFF_TIMER + aom_usec_timer_mark(&timer); + const int64_t elapsed_time = aom_usec_timer_elapsed(&timer); + cm->txcoeff_timer += elapsed_time; + ++cm->txb_count; +#endif } diff --git a/third_party/aom/av1/decoder/decodetxb.h b/third_party/aom/av1/decoder/decodetxb.h index d0b3d8c7a..687bba958 100644 --- a/third_party/aom/av1/decoder/decodetxb.h +++ b/third_party/aom/av1/decoder/decodetxb.h @@ -25,8 +25,8 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd, const TXB_CTX *const txb_ctx, const TX_SIZE tx_size); -uint8_t av1_read_coeffs_txb_facade(const AV1_COMMON *const cm, - MACROBLOCKD *const xd, aom_reader *const r, - const int row, const int col, - const int plane, const TX_SIZE tx_size); +void av1_read_coeffs_txb_facade(const AV1_COMMON *const cm, + MACROBLOCKD *const xd, aom_reader *const r, + const int plane, const int row, const int col, + const TX_SIZE tx_size); #endif // DECODETXB_H_ diff --git a/third_party/aom/av1/decoder/dthread.c b/third_party/aom/av1/decoder/dthread.c index ff03502e6..3946c787a 100644 --- a/third_party/aom/av1/decoder/dthread.c +++ b/third_party/aom/av1/decoder/dthread.c @@ -157,8 +157,8 @@ void av1_frameworker_copy_context(AVxWorker *const dst_worker, dst_worker_data->pbi->need_resync = src_worker_data->pbi->need_resync; av1_frameworker_unlock_stats(src_worker); - dst_cm->bit_depth = src_cm->bit_depth; - dst_cm->use_highbitdepth = src_cm->use_highbitdepth; + dst_cm->seq_params.bit_depth = src_cm->seq_params.bit_depth; + dst_cm->seq_params.use_highbitdepth = src_cm->seq_params.use_highbitdepth; // TODO(zoeliu): To handle parallel decoding dst_cm->prev_frame = src_cm->show_existing_frame ? src_cm->prev_frame : src_cm->cur_frame; @@ -166,8 +166,8 @@ void av1_frameworker_copy_context(AVxWorker *const dst_worker, !src_cm->show_existing_frame ? src_cm->width : src_cm->last_width; dst_cm->last_height = !src_cm->show_existing_frame ? src_cm->height : src_cm->last_height; - dst_cm->subsampling_x = src_cm->subsampling_x; - dst_cm->subsampling_y = src_cm->subsampling_y; + dst_cm->seq_params.subsampling_x = src_cm->seq_params.subsampling_x; + dst_cm->seq_params.subsampling_y = src_cm->seq_params.subsampling_y; dst_cm->frame_type = src_cm->frame_type; dst_cm->last_show_frame = !src_cm->show_existing_frame ? src_cm->show_frame diff --git a/third_party/aom/av1/decoder/dthread.h b/third_party/aom/av1/decoder/dthread.h index 33d89006e..9f854e015 100644 --- a/third_party/aom/av1/decoder/dthread.h +++ b/third_party/aom/av1/decoder/dthread.h @@ -39,7 +39,6 @@ typedef struct FrameWorkerData { const uint8_t *data_end; size_t data_size; void *user_priv; - int result; int worker_id; int received_frame; diff --git a/third_party/aom/av1/decoder/obu.c b/third_party/aom/av1/decoder/obu.c index 482b6415e..715bc6837 100644 --- a/third_party/aom/av1/decoder/obu.c +++ b/third_party/aom/av1/decoder/obu.c @@ -161,6 +161,17 @@ static int is_obu_in_current_operating_point(AV1Decoder *pbi, return 0; } +static int byte_alignment(AV1_COMMON *const cm, + struct aom_read_bit_buffer *const rb) { + while (rb->bit_offset & 7) { + if (aom_rb_read_bit(rb)) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + } + return 0; +} + static uint32_t read_temporal_delimiter_obu() { return 0; } // Returns a boolean that indicates success. @@ -173,6 +184,13 @@ static int read_bitstream_level(BitstreamLevel *bl, return 1; } +// Returns whether two sequence headers are consistent with each other. +// TODO(huisu,wtc@google.com): make sure the code matches the spec exactly. +static int are_seq_headers_consistent(const SequenceHeader *seq_params_old, + const SequenceHeader *seq_params_new) { + return !memcmp(seq_params_old, seq_params_new, sizeof(SequenceHeader)); +} + // On success, sets pbi->sequence_header_ready to 1 and returns the number of // bytes read from 'rb'. // On failure, sets pbi->common.error.error_code and returns 0. @@ -184,14 +202,17 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, // Verify rb has been configured to report errors. assert(rb->error_handler); - cm->profile = av1_read_profile(rb); - if (cm->profile > PROFILE_2) { + // Use a local variable to store the information as we decode. At the end, + // if no errors have occurred, cm->seq_params is updated. + SequenceHeader sh = cm->seq_params; + SequenceHeader *const seq_params = &sh; + + seq_params->profile = av1_read_profile(rb); + if (seq_params->profile > PROFILE_2) { cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; return 0; } - SequenceHeader *const seq_params = &cm->seq_params; - // Still picture or not seq_params->still_picture = aom_rb_read_bit(rb); seq_params->reduced_still_picture_hdr = aom_rb_read_bit(rb); @@ -252,7 +273,8 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, (cm->timing_info.equal_picture_interval || cm->op_params[i].decoder_model_param_present_flag)) { cm->op_params[i].bitrate = max_level_bitrate( - cm->profile, major_minor_to_seq_level_idx(seq_params->level[i]), + seq_params->profile, + major_minor_to_seq_level_idx(seq_params->level[i]), seq_params->tier[i]); // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass // the check @@ -305,30 +327,49 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, return 0; } - read_sequence_header(cm, rb); + av1_read_sequence_header(cm, rb, seq_params); - av1_read_color_config(cm, rb, pbi->allow_lowbitdepth); + av1_read_color_config(rb, pbi->allow_lowbitdepth, seq_params, &cm->error); + if (!(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0) && + !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) && + !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 0)) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Only 4:4:4, 4:2:2 and 4:2:0 are currently supported, " + "%d %d subsampling is not supported.\n", + seq_params->subsampling_x, seq_params->subsampling_y); + } - cm->film_grain_params_present = aom_rb_read_bit(rb); + seq_params->film_grain_params_present = aom_rb_read_bit(rb); if (av1_check_trailing_bits(pbi, rb) != 0) { // cm->error.error_code is already set. return 0; } + // If a sequence header has been decoded before, we check if the new + // one is consistent with the old one. + if (pbi->sequence_header_ready) { + if (!are_seq_headers_consistent(&cm->seq_params, seq_params)) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Inconsistent sequence headers received."); + } + } + + cm->seq_params = *seq_params; pbi->sequence_header_ready = 1; return ((rb->bit_offset - saved_bit_offset + 7) >> 3); } +// On success, returns the frame header size. On failure, calls +// aom_internal_error and does not return. static uint32_t read_frame_header_obu(AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data, const uint8_t **p_data_end, int trailing_bits_present) { - av1_decode_frame_headers_and_setup(pbi, rb, data, p_data_end, - trailing_bits_present); - return (uint32_t)(pbi->uncomp_hdr_size); + return av1_decode_frame_headers_and_setup(pbi, rb, data, p_data_end, + trailing_bits_present); } static int32_t read_tile_group_header(AV1Decoder *pbi, @@ -353,7 +394,6 @@ static int32_t read_tile_group_header(AV1Decoder *pbi, aom_internal_error( &cm->error, AOM_CODEC_UNSUP_BITSTREAM, "For OBU_FRAME type obu tile_start_and_end_present_flag must be 0"); - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } *start_tile = @@ -371,9 +411,12 @@ static uint32_t read_one_tile_group_obu( int start_tile, end_tile; int32_t header_size, tg_payload_size; + assert((rb->bit_offset & 7) == 0); + assert(rb->bit_buffer + aom_rb_bytes_read(rb) == data); + header_size = read_tile_group_header(pbi, rb, &start_tile, &end_tile, tile_start_implicit); - if (header_size == -1) return 0; + if (header_size == -1 || byte_alignment(cm, rb)) return 0; if (start_tile > end_tile) return header_size; data += header_size; av1_decode_tg_tiles_and_wrapup(pbi, data, data_end, p_data_end, start_tile, @@ -386,44 +429,22 @@ static uint32_t read_one_tile_group_obu( return header_size + tg_payload_size; } -// Only called while large_scale_tile = 1. -static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi, - struct aom_read_bit_buffer *rb, - const uint8_t *data, - const uint8_t *data_end, - const uint8_t **p_data_end, - int *frame_decoding_finished) { - AV1_COMMON *const cm = &pbi->common; - uint32_t tile_list_payload_size = 0; - const int num_tiles = cm->tile_cols * cm->tile_rows; - const int start_tile = 0; - const int end_tile = num_tiles - 1; - int i = 0; - - // Process the tile list info. - pbi->output_frame_width_in_tiles_minus_1 = aom_rb_read_literal(rb, 8); - pbi->output_frame_height_in_tiles_minus_1 = aom_rb_read_literal(rb, 8); - pbi->tile_count_minus_1 = aom_rb_read_literal(rb, 16); - if (pbi->tile_count_minus_1 > 511) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; - return 0; - } - - // Allocate output frame buffer for the tile list. +static void alloc_tile_list_buffer(AV1Decoder *pbi) { // TODO(yunqing): for now, copy each tile's decoded YUV data directly to the // output buffer. This needs to be modified according to the application // requirement. + AV1_COMMON *const cm = &pbi->common; const int tile_width_in_pixels = cm->tile_width * MI_SIZE; const int tile_height_in_pixels = cm->tile_height * MI_SIZE; - const int ssy = cm->subsampling_y; - const int ssx = cm->subsampling_x; + const int ssy = cm->seq_params.subsampling_y; + const int ssx = cm->seq_params.subsampling_x; const int num_planes = av1_num_planes(cm); const size_t yplane_tile_size = tile_height_in_pixels * tile_width_in_pixels; const size_t uvplane_tile_size = (num_planes > 1) ? (tile_height_in_pixels >> ssy) * (tile_width_in_pixels >> ssx) : 0; - const size_t tile_size = (cm->use_highbitdepth ? 2 : 1) * + const size_t tile_size = (cm->seq_params.use_highbitdepth ? 2 : 1) * (yplane_tile_size + 2 * uvplane_tile_size); pbi->tile_list_size = tile_size * (pbi->tile_count_minus_1 + 1); @@ -437,6 +458,83 @@ static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi, "Failed to allocate the tile list output buffer"); pbi->buffer_sz = pbi->tile_list_size; } +} + +static void copy_decoded_tile_to_tile_list_buffer(AV1Decoder *pbi, + uint8_t **output) { + AV1_COMMON *const cm = &pbi->common; + const int tile_width_in_pixels = cm->tile_width * MI_SIZE; + const int tile_height_in_pixels = cm->tile_height * MI_SIZE; + const int ssy = cm->seq_params.subsampling_y; + const int ssx = cm->seq_params.subsampling_x; + const int num_planes = av1_num_planes(cm); + + // Copy decoded tile to the tile list output buffer. + YV12_BUFFER_CONFIG *cur_frame = get_frame_new_buffer(cm); + const int mi_row = pbi->dec_tile_row * cm->tile_height; + const int mi_col = pbi->dec_tile_col * cm->tile_width; + const int is_hbd = (cur_frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; + uint8_t *bufs[MAX_MB_PLANE] = { NULL, NULL, NULL }; + int strides[MAX_MB_PLANE] = { 0, 0, 0 }; + int plane; + + for (plane = 0; plane < num_planes; ++plane) { + int shift_x = plane > 0 ? ssx : 0; + int shift_y = plane > 0 ? ssy : 0; + + bufs[plane] = cur_frame->buffers[plane]; + strides[plane] = + (plane > 0) ? cur_frame->strides[1] : cur_frame->strides[0]; + + bufs[plane] += mi_row * (MI_SIZE >> shift_y) * strides[plane] + + mi_col * (MI_SIZE >> shift_x); + + if (is_hbd) { + bufs[plane] = (uint8_t *)CONVERT_TO_SHORTPTR(bufs[plane]); + strides[plane] *= 2; + } + + int w, h; + w = (plane > 0 && shift_x > 0) ? ((tile_width_in_pixels + 1) >> shift_x) + : tile_width_in_pixels; + w *= (1 + is_hbd); + h = (plane > 0 && shift_y > 0) ? ((tile_height_in_pixels + 1) >> shift_y) + : tile_height_in_pixels; + int j; + + for (j = 0; j < h; ++j) { + memcpy(*output, bufs[plane], w); + bufs[plane] += strides[plane]; + *output += w; + } + } +} + +// Only called while large_scale_tile = 1. +static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi, + struct aom_read_bit_buffer *rb, + const uint8_t *data, + const uint8_t *data_end, + const uint8_t **p_data_end, + int *frame_decoding_finished) { + AV1_COMMON *const cm = &pbi->common; + uint32_t tile_list_payload_size = 0; + const int num_tiles = cm->tile_cols * cm->tile_rows; + const int start_tile = 0; + const int end_tile = num_tiles - 1; + int i = 0; + + // Process the tile list info. + pbi->output_frame_width_in_tiles_minus_1 = aom_rb_read_literal(rb, 8); + pbi->output_frame_height_in_tiles_minus_1 = aom_rb_read_literal(rb, 8); + pbi->tile_count_minus_1 = aom_rb_read_literal(rb, 16); + if (pbi->tile_count_minus_1 > MAX_TILES - 1) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + + // Allocate output frame buffer for the tile list. + alloc_tile_list_buffer(pbi); uint32_t tile_list_info_bytes = 4; tile_list_payload_size += tile_list_info_bytes; @@ -485,45 +583,8 @@ static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi, data = *p_data_end; assert(data <= data_end); - // Copy decoded tile to the tile list output buffer. - YV12_BUFFER_CONFIG *cur_frame = get_frame_new_buffer(cm); - const int mi_row = pbi->dec_tile_row * cm->tile_height; - const int mi_col = pbi->dec_tile_col * cm->tile_width; - const int is_hbd = (cur_frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; - uint8_t *bufs[MAX_MB_PLANE] = { NULL, NULL, NULL }; - int strides[MAX_MB_PLANE] = { 0, 0, 0 }; - int plane; - - for (plane = 0; plane < num_planes; ++plane) { - int shift_x = plane > 0 ? ssx : 0; - int shift_y = plane > 0 ? ssy : 0; - - bufs[plane] = cur_frame->buffers[plane]; - strides[plane] = - (plane > 0) ? cur_frame->strides[1] : cur_frame->strides[0]; - if (is_hbd) { - bufs[plane] = (uint8_t *)CONVERT_TO_SHORTPTR(cur_frame->buffers[plane]); - strides[plane] = - (plane > 0) ? 2 * cur_frame->strides[1] : 2 * cur_frame->strides[0]; - } - - bufs[plane] += mi_row * (MI_SIZE >> shift_y) * strides[plane] + - mi_col * (MI_SIZE >> shift_x); - - int w, h; - w = (plane > 0 && shift_x > 0) ? ((tile_width_in_pixels + 1) >> shift_x) - : tile_width_in_pixels; - w *= (1 + is_hbd); - h = (plane > 0 && shift_y > 0) ? ((tile_height_in_pixels + 1) >> shift_y) - : tile_height_in_pixels; - int j; - - for (j = 0; j < h; ++j) { - memcpy(output, bufs[plane], w); - bufs[plane] += strides[plane]; - output += w; - } - } + // Copy the decoded tile to the tile list output buffer. + copy_decoded_tile_to_tile_list_buffer(pbi, &output); } *frame_decoding_finished = 1; @@ -710,7 +771,6 @@ aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data, return AOM_CODEC_OK; } -#define EXT_TILE_DEBUG 0 // On success, returns a boolean that indicates whether the decoding of the // current frame is finished. On failure, sets cm->error.error_code and // returns -1. @@ -720,7 +780,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, AV1_COMMON *const cm = &pbi->common; int frame_decoding_finished = 0; int is_first_tg_obu_received = 1; - int frame_header_size = 0; + uint32_t frame_header_size = 0; int seq_header_received = 0; size_t seq_header_size = 0; ObuHeader obu_header; @@ -785,7 +845,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, } } - av1_init_read_bit_buffer(pbi, &rb, data, data_end); + av1_init_read_bit_buffer(pbi, &rb, data, data + payload_size); switch (obu_header.type) { case OBU_TEMPORAL_DELIMITER: @@ -813,21 +873,35 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, // Only decode first frame header received if (!pbi->seen_frame_header || (cm->large_scale_tile && !pbi->camera_frame_header_ready)) { - pbi->seen_frame_header = 1; frame_header_size = read_frame_header_obu( pbi, &rb, data, p_data_end, obu_header.type != OBU_FRAME); - if (cm->large_scale_tile) pbi->camera_frame_header_ready = 1; + pbi->seen_frame_header = 1; + if (!pbi->ext_tile_debug && cm->large_scale_tile) + pbi->camera_frame_header_ready = 1; + } else { + // TODO(wtc): Verify that the frame_header_obu is identical to the + // original frame_header_obu. For now just skip frame_header_size + // bytes in the bit buffer. + if (frame_header_size > payload_size) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + assert(rb.bit_offset == 0); + rb.bit_offset = 8 * frame_header_size; } decoded_payload_size = frame_header_size; - pbi->frame_header_size = (size_t)frame_header_size; + pbi->frame_header_size = frame_header_size; if (cm->show_existing_frame) { + if (obu_header.type == OBU_FRAME) { + cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + return -1; + } frame_decoding_finished = 1; pbi->seen_frame_header = 0; break; } -#if !EXT_TILE_DEBUG // In large scale tile coding, decode the common camera frame header // before any tile list OBU. if (!pbi->ext_tile_debug && pbi->camera_frame_header_ready) { @@ -838,17 +912,18 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, *p_data_end = data_end; break; } -#endif // EXT_TILE_DEBUG if (obu_header.type != OBU_FRAME) break; obu_payload_offset = frame_header_size; + // Byte align the reader before reading the tile group. + if (byte_alignment(cm, &rb)) return -1; AOM_FALLTHROUGH_INTENDED; // fall through to read tile group. case OBU_TILE_GROUP: if (!pbi->seen_frame_header) { cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } - if ((size_t)(data_end - data) < obu_payload_offset) { + if (obu_payload_offset > payload_size) { cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } @@ -904,4 +979,3 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, return frame_decoding_finished; } -#undef EXT_TILE_DEBUG diff --git a/third_party/aom/av1/encoder/aq_complexity.c b/third_party/aom/av1/encoder/aq_complexity.c index c5a6bc831..b721b6d2b 100644 --- a/third_party/aom/av1/encoder/aq_complexity.c +++ b/third_party/aom/av1/encoder/aq_complexity.c @@ -66,7 +66,8 @@ void av1_setup_in_frame_q_adj(AV1_COMP *cpi) { cpi->refresh_alt_ref_frame || (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { int segment; - const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth); + const int aq_strength = + get_aq_c_strength(cm->base_qindex, cm->seq_params.bit_depth); // Clear down the segment map. memset(cpi->segmentation_map, DEFAULT_AQ2_SEG, cm->mi_rows * cm->mi_cols); @@ -93,7 +94,7 @@ void av1_setup_in_frame_q_adj(AV1_COMP *cpi) { qindex_delta = av1_compute_qdelta_by_rate( &cpi->rc, cm->frame_type, cm->base_qindex, - aq_c_q_adj_factor[aq_strength][segment], cm->bit_depth); + aq_c_q_adj_factor[aq_strength][segment], cm->seq_params.bit_depth); // For AQ complexity mode, we dont allow Q0 in a segment if the base // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment @@ -138,7 +139,8 @@ void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs, const int target_rate = (int)(num / denom); double logvar; double low_var_thresh; - const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth); + const int aq_strength = + get_aq_c_strength(cm->base_qindex, cm->seq_params.bit_depth); aom_clear_system_state(); low_var_thresh = (cpi->oxcf.pass == 2) ? AOMMAX(cpi->twopass.mb_av_energy, diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.c b/third_party/aom/av1/encoder/aq_cyclicrefresh.c index a1fe37d4a..dec2c730d 100644 --- a/third_party/aom/av1/encoder/aq_cyclicrefresh.c +++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.c @@ -137,8 +137,9 @@ static int candidate_refresh_aq(const CYCLIC_REFRESH *cr, static int compute_deltaq(const AV1_COMP *cpi, int q, double rate_factor) { const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; const RATE_CONTROL *const rc = &cpi->rc; - int deltaq = av1_compute_qdelta_by_rate(rc, cpi->common.frame_type, q, - rate_factor, cpi->common.bit_depth); + int deltaq = + av1_compute_qdelta_by_rate(rc, cpi->common.frame_type, q, rate_factor, + cpi->common.seq_params.bit_depth); if ((-deltaq) > cr->max_qdelta_perc * q / 100) { deltaq = -cr->max_qdelta_perc * q / 100; } @@ -164,15 +165,16 @@ int av1_cyclic_refresh_estimate_bits_at_q(const AV1_COMP *cpi, estimated_bits = (int)((1.0 - weight_segment1 - weight_segment2) * av1_estimate_bits_at_q(cm->frame_type, cm->base_qindex, mbs, - correction_factor, cm->bit_depth) + - weight_segment1 * - av1_estimate_bits_at_q(cm->frame_type, - cm->base_qindex + cr->qindex_delta[1], - mbs, correction_factor, cm->bit_depth) + - weight_segment2 * - av1_estimate_bits_at_q(cm->frame_type, - cm->base_qindex + cr->qindex_delta[2], - mbs, correction_factor, cm->bit_depth)); + correction_factor, + cm->seq_params.bit_depth) + + weight_segment1 * av1_estimate_bits_at_q( + cm->frame_type, + cm->base_qindex + cr->qindex_delta[1], mbs, + correction_factor, cm->seq_params.bit_depth) + + weight_segment2 * av1_estimate_bits_at_q( + cm->frame_type, + cm->base_qindex + cr->qindex_delta[2], mbs, + correction_factor, cm->seq_params.bit_depth)); return estimated_bits; } @@ -197,12 +199,13 @@ int av1_cyclic_refresh_rc_bits_per_mb(const AV1_COMP *cpi, int i, // Compute delta-q corresponding to qindex i. int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta); // Take segment weighted average for bits per mb. - bits_per_mb = (int)((1.0 - weight_segment) * - av1_rc_bits_per_mb(cm->frame_type, i, - correction_factor, cm->bit_depth) + - weight_segment * - av1_rc_bits_per_mb(cm->frame_type, i + deltaq, - correction_factor, cm->bit_depth)); + bits_per_mb = + (int)((1.0 - weight_segment) * + av1_rc_bits_per_mb(cm->frame_type, i, correction_factor, + cm->seq_params.bit_depth) + + weight_segment * av1_rc_bits_per_mb(cm->frame_type, i + deltaq, + correction_factor, + cm->seq_params.bit_depth)); return bits_per_mb; } @@ -507,7 +510,8 @@ void av1_cyclic_refresh_setup(AV1_COMP *const cpi) { } else { int qindex_delta = 0; int qindex2; - const double q = av1_convert_qindex_to_q(cm->base_qindex, cm->bit_depth); + const double q = + av1_convert_qindex_to_q(cm->base_qindex, cm->seq_params.bit_depth); aom_clear_system_state(); // Set rate threshold to some multiple (set to 2 for now) of the target // rate (target is given by sb64_target_rate and scaled by 256). diff --git a/third_party/aom/av1/encoder/aq_variance.c b/third_party/aom/av1/encoder/aq_variance.c index 29a311447..6cb6adc42 100644 --- a/third_party/aom/av1/encoder/aq_variance.c +++ b/third_party/aom/av1/encoder/aq_variance.c @@ -71,7 +71,7 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) { for (i = 0; i < MAX_SEGMENTS; ++i) { int qindex_delta = av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex, - rate_ratio[i], cm->bit_depth); + rate_ratio[i], cm->seq_params.bit_depth); // We don't allow qindex 0 in a segment if the base value is not 0. // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment @@ -235,9 +235,9 @@ int av1_compute_deltaq_from_energy_level(const AV1_COMP *const cpi, const int rate_level = SEGMENT_ID(block_var_level); const AV1_COMMON *const cm = &cpi->common; - int qindex_delta = - av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex, - rate_ratio[rate_level], cm->bit_depth); + int qindex_delta = av1_compute_qdelta_by_rate( + &cpi->rc, cm->frame_type, cm->base_qindex, rate_ratio[rate_level], + cm->seq_params.bit_depth); if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) { qindex_delta = -cm->base_qindex + 1; diff --git a/third_party/aom/av1/encoder/av1_quantize.c b/third_party/aom/av1/encoder/av1_quantize.c index 1c5bdeb25..d0477b35b 100644 --- a/third_party/aom/av1/encoder/av1_quantize.c +++ b/third_party/aom/av1/encoder/av1_quantize.c @@ -613,9 +613,9 @@ void av1_init_quantizer(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; QUANTS *const quants = &cpi->quants; Dequants *const dequants = &cpi->dequants; - av1_build_quantizer(cm->bit_depth, cm->y_dc_delta_q, cm->u_dc_delta_q, - cm->u_ac_delta_q, cm->v_dc_delta_q, cm->v_ac_delta_q, - quants, dequants); + av1_build_quantizer(cm->seq_params.bit_depth, cm->y_dc_delta_q, + cm->u_dc_delta_q, cm->u_ac_delta_q, cm->v_dc_delta_q, + cm->v_ac_delta_q, quants, dequants); } void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x, @@ -713,7 +713,7 @@ void av1_set_quantizer(AV1_COMMON *cm, int q) { cm->qm_u = aom_get_qmlevel(cm->base_qindex + cm->u_ac_delta_q, cm->min_qmlevel, cm->max_qmlevel); - if (!cm->separate_uv_delta_q) + if (!cm->seq_params.separate_uv_delta_q) cm->qm_v = cm->qm_u; else cm->qm_v = aom_get_qmlevel(cm->base_qindex + cm->v_ac_delta_q, diff --git a/third_party/aom/av1/encoder/bitstream.c b/third_party/aom/av1/encoder/bitstream.c index cdd7c2492..2070755cd 100644 --- a/third_party/aom/av1/encoder/bitstream.c +++ b/third_party/aom/av1/encoder/bitstream.c @@ -769,7 +769,7 @@ static void write_palette_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd, aom_write_symbol(w, n - PALETTE_MIN_SIZE, xd->tile_ctx->palette_y_size_cdf[bsize_ctx], PALETTE_SIZES); - write_palette_colors_y(xd, pmi, cm->bit_depth, w); + write_palette_colors_y(xd, pmi, cm->seq_params.bit_depth, w); } } @@ -786,7 +786,7 @@ static void write_palette_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd, aom_write_symbol(w, n - PALETTE_MIN_SIZE, xd->tile_ctx->palette_uv_size_cdf[bsize_ctx], PALETTE_SIZES); - write_palette_colors_uv(xd, pmi, cm->bit_depth, w); + write_palette_colors_uv(xd, pmi, cm->seq_params.bit_depth, w); } } } @@ -1421,8 +1421,8 @@ static void write_inter_txb_coeff(AV1_COMMON *const cm, MACROBLOCK *const x, for (blk_col = col >> pd->subsampling_x; blk_col < unit_width; blk_col += bkw) { pack_txb_tokens(w, cm, x, tok, tok_end, xd, mbmi, plane, plane_bsize, - cm->bit_depth, *block, blk_row, blk_col, max_tx_size, - token_stats); + cm->seq_params.bit_depth, *block, blk_row, blk_col, + max_tx_size, token_stats); *block += step; } } @@ -1612,14 +1612,13 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile, const int num_planes = av1_num_planes(cm); for (int plane = 0; plane < num_planes; ++plane) { - int rcol0, rcol1, rrow0, rrow1, tile_tl_idx; + int rcol0, rcol1, rrow0, rrow1; if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize, - &rcol0, &rcol1, &rrow0, &rrow1, - &tile_tl_idx)) { + &rcol0, &rcol1, &rrow0, &rrow1)) { const int rstride = cm->rst_info[plane].horz_units_per_tile; for (int rrow = rrow0; rrow < rrow1; ++rrow) { for (int rcol = rcol0; rcol < rcol1; ++rcol) { - const int runit_idx = tile_tl_idx + rcol + rrow * rstride; + const int runit_idx = rcol + rrow * rstride; const RestorationUnitInfo *rui = &cm->rst_info[plane].unit_info[runit_idx]; loop_restoration_write_sb_coeffs(cm, xd, rui, w, plane, @@ -1705,7 +1704,7 @@ static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile, const int mi_col_end = tile->mi_col_end; int mi_row, mi_col; - av1_zero_above_context(cm, mi_col_start, mi_col_end, tile->tile_row); + av1_zero_above_context(cm, xd, mi_col_start, mi_col_end, tile->tile_row); av1_init_above_context(cm, xd, tile->tile_row); if (cpi->common.delta_q_present_flag) { @@ -1779,7 +1778,7 @@ static void encode_restoration_mode(AV1_COMMON *cm, } if (num_planes > 1) { - int s = AOMMIN(cm->subsampling_x, cm->subsampling_y); + int s = AOMMIN(cm->seq_params.subsampling_x, cm->seq_params.subsampling_y); if (s && !chroma_none) { aom_wb_write_bit(wb, cm->rst_info[1].restoration_unit_size != cm->rst_info[0].restoration_unit_size); @@ -2020,7 +2019,7 @@ static void encode_quantization(const AV1_COMMON *const cm, if (num_planes > 1) { int diff_uv_delta = (cm->u_dc_delta_q != cm->v_dc_delta_q) || (cm->u_ac_delta_q != cm->v_ac_delta_q); - if (cm->separate_uv_delta_q) aom_wb_write_bit(wb, diff_uv_delta); + if (cm->seq_params.separate_uv_delta_q) aom_wb_write_bit(wb, diff_uv_delta); write_delta_q(wb, cm->u_dc_delta_q); write_delta_q(wb, cm->u_ac_delta_q); if (diff_uv_delta) { @@ -2032,7 +2031,7 @@ static void encode_quantization(const AV1_COMMON *const cm, if (cm->using_qmatrix) { aom_wb_write_literal(wb, cm->qm_y, QM_LEVEL_BITS); aom_wb_write_literal(wb, cm->qm_u, QM_LEVEL_BITS); - if (!cm->separate_uv_delta_q) + if (!cm->seq_params.separate_uv_delta_q) assert(cm->qm_u == cm->qm_v); else aom_wb_write_literal(wb, cm->qm_v, QM_LEVEL_BITS); @@ -2240,7 +2239,8 @@ static int get_refresh_mask_gf16(AV1_COMP *cpi) { #endif // USE_GF16_MULTI_LAYER static int get_refresh_mask(AV1_COMP *cpi) { - if (cpi->common.frame_type == KEY_FRAME || frame_is_sframe(&cpi->common)) + if ((cpi->common.frame_type == KEY_FRAME && cpi->common.show_frame) || + frame_is_sframe(&cpi->common)) return 0xFF; int refresh_mask = 0; @@ -2258,9 +2258,15 @@ static int get_refresh_mask(AV1_COMP *cpi) { // LAST3_FRAME. refresh_mask |= (cpi->refresh_last_frame << cpi->ref_fb_idx[LAST_REF_FRAMES - 1]); - +#if USE_SYMM_MULTI_LAYER + refresh_mask |= + (cpi->new_bwdref_update_rule == 1) + ? (cpi->refresh_bwd_ref_frame << cpi->ref_fb_idx[EXTREF_FRAME - 1]) + : (cpi->refresh_bwd_ref_frame << cpi->ref_fb_idx[BWDREF_FRAME - 1]); +#else refresh_mask |= (cpi->refresh_bwd_ref_frame << cpi->ref_fb_idx[BWDREF_FRAME - 1]); +#endif refresh_mask |= (cpi->refresh_alt2_ref_frame << cpi->ref_fb_idx[ALTREF2_FRAME - 1]); @@ -2419,80 +2425,82 @@ static void write_profile(BITSTREAM_PROFILE profile, aom_wb_write_literal(wb, profile, PROFILE_BITS); } -static void write_bitdepth(AV1_COMMON *const cm, +static void write_bitdepth(const SequenceHeader *const seq_params, struct aom_write_bit_buffer *wb) { // Profile 0/1: [0] for 8 bit, [1] 10-bit // Profile 2: [0] for 8 bit, [10] 10-bit, [11] - 12-bit - aom_wb_write_bit(wb, cm->bit_depth == AOM_BITS_8 ? 0 : 1); - if (cm->profile == PROFILE_2 && cm->bit_depth != AOM_BITS_8) { - aom_wb_write_bit(wb, cm->bit_depth == AOM_BITS_10 ? 0 : 1); + aom_wb_write_bit(wb, seq_params->bit_depth == AOM_BITS_8 ? 0 : 1); + if (seq_params->profile == PROFILE_2 && seq_params->bit_depth != AOM_BITS_8) { + aom_wb_write_bit(wb, seq_params->bit_depth == AOM_BITS_10 ? 0 : 1); } } -static void write_color_config(AV1_COMMON *const cm, +static void write_color_config(const SequenceHeader *const seq_params, struct aom_write_bit_buffer *wb) { - write_bitdepth(cm, wb); - const int is_monochrome = cm->seq_params.monochrome; + write_bitdepth(seq_params, wb); + const int is_monochrome = seq_params->monochrome; // monochrome bit - if (cm->profile != PROFILE_1) + if (seq_params->profile != PROFILE_1) aom_wb_write_bit(wb, is_monochrome); else assert(!is_monochrome); - if (cm->color_primaries == AOM_CICP_CP_UNSPECIFIED && - cm->transfer_characteristics == AOM_CICP_TC_UNSPECIFIED && - cm->matrix_coefficients == AOM_CICP_MC_UNSPECIFIED) { + if (seq_params->color_primaries == AOM_CICP_CP_UNSPECIFIED && + seq_params->transfer_characteristics == AOM_CICP_TC_UNSPECIFIED && + seq_params->matrix_coefficients == AOM_CICP_MC_UNSPECIFIED) { aom_wb_write_bit(wb, 0); // No color description present } else { aom_wb_write_bit(wb, 1); // Color description present - aom_wb_write_literal(wb, cm->color_primaries, 8); - aom_wb_write_literal(wb, cm->transfer_characteristics, 8); - aom_wb_write_literal(wb, cm->matrix_coefficients, 8); + aom_wb_write_literal(wb, seq_params->color_primaries, 8); + aom_wb_write_literal(wb, seq_params->transfer_characteristics, 8); + aom_wb_write_literal(wb, seq_params->matrix_coefficients, 8); } if (is_monochrome) { // 0: [16, 235] (i.e. xvYCC), 1: [0, 255] - aom_wb_write_bit(wb, cm->color_range); + aom_wb_write_bit(wb, seq_params->color_range); return; } - if (cm->color_primaries == AOM_CICP_CP_BT_709 && - cm->transfer_characteristics == AOM_CICP_TC_SRGB && - cm->matrix_coefficients == + if (seq_params->color_primaries == AOM_CICP_CP_BT_709 && + seq_params->transfer_characteristics == AOM_CICP_TC_SRGB && + seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) { // it would be better to remove this // dependency too - assert(cm->subsampling_x == 0 && cm->subsampling_y == 0); - assert(cm->profile == PROFILE_1 || - (cm->profile == PROFILE_2 && cm->bit_depth == AOM_BITS_12)); + assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0); + assert(seq_params->profile == PROFILE_1 || + (seq_params->profile == PROFILE_2 && + seq_params->bit_depth == AOM_BITS_12)); } else { // 0: [16, 235] (i.e. xvYCC), 1: [0, 255] - aom_wb_write_bit(wb, cm->color_range); - if (cm->profile == PROFILE_0) { + aom_wb_write_bit(wb, seq_params->color_range); + if (seq_params->profile == PROFILE_0) { // 420 only - assert(cm->subsampling_x == 1 && cm->subsampling_y == 1); - } else if (cm->profile == PROFILE_1) { + assert(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1); + } else if (seq_params->profile == PROFILE_1) { // 444 only - assert(cm->subsampling_x == 0 && cm->subsampling_y == 0); - } else if (cm->profile == PROFILE_2) { - if (cm->bit_depth == AOM_BITS_12) { + assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0); + } else if (seq_params->profile == PROFILE_2) { + if (seq_params->bit_depth == AOM_BITS_12) { // 420, 444 or 422 - aom_wb_write_bit(wb, cm->subsampling_x); - if (cm->subsampling_x == 0) { - assert(cm->subsampling_y == 0 && + aom_wb_write_bit(wb, seq_params->subsampling_x); + if (seq_params->subsampling_x == 0) { + assert(seq_params->subsampling_y == 0 && "4:4:0 subsampling not allowed in AV1"); } else { - aom_wb_write_bit(wb, cm->subsampling_y); + aom_wb_write_bit(wb, seq_params->subsampling_y); } } else { // 422 only - assert(cm->subsampling_x == 1 && cm->subsampling_y == 0); + assert(seq_params->subsampling_x == 1 && + seq_params->subsampling_y == 0); } } - if (cm->matrix_coefficients == AOM_CICP_MC_IDENTITY) { - assert(cm->subsampling_x == 0 && cm->subsampling_y == 0); + if (seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) { + assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0); } - if (cm->subsampling_x == 1 && cm->subsampling_y == 1) { - aom_wb_write_literal(wb, cm->chroma_sample_position, 2); + if (seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) { + aom_wb_write_literal(wb, seq_params->chroma_sample_position, 2); } } - aom_wb_write_bit(wb, cm->separate_uv_delta_q); + aom_wb_write_bit(wb, seq_params->separate_uv_delta_q); } static void write_timing_info_header(AV1_COMMON *const cm, @@ -2517,8 +2525,8 @@ static void write_decoder_model_info(AV1_COMMON *const cm, wb, cm->buffer_model.encoder_decoder_buffer_delay_length - 1, 5); aom_wb_write_unsigned_literal(wb, cm->buffer_model.num_units_in_decoding_tick, 32); // Number of units in decoding tick - aom_wb_write_literal(wb, cm->buffer_model.buffer_removal_delay_length - 1, 5); - aom_wb_write_literal(wb, cm->buffer_model.frame_presentation_delay_length - 1, + aom_wb_write_literal(wb, cm->buffer_model.buffer_removal_time_length - 1, 5); + aom_wb_write_literal(wb, cm->buffer_model.frame_presentation_time_length - 1, 5); } @@ -2533,23 +2541,25 @@ static void write_dec_model_op_parameters(AV1_COMMON *const cm, // aom_wb_write_bit(wb, cm->op_params[op_num].has_parameters); // if (!cm->op_params[op_num].has_parameters) return; - aom_wb_write_literal(wb, cm->op_params[op_num].decoder_buffer_delay, - cm->buffer_model.encoder_decoder_buffer_delay_length); + aom_wb_write_unsigned_literal( + wb, cm->op_params[op_num].decoder_buffer_delay, + cm->buffer_model.encoder_decoder_buffer_delay_length); - aom_wb_write_literal(wb, cm->op_params[op_num].encoder_buffer_delay, - cm->buffer_model.encoder_decoder_buffer_delay_length); + aom_wb_write_unsigned_literal( + wb, cm->op_params[op_num].encoder_buffer_delay, + cm->buffer_model.encoder_decoder_buffer_delay_length); aom_wb_write_bit(wb, cm->op_params[op_num].low_delay_mode_flag); - cm->op_frame_timing[op_num].buffer_removal_delay = + cm->op_frame_timing[op_num].buffer_removal_time = 0; // reset the decoded frame counter } static void write_tu_pts_info(AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) { aom_wb_write_unsigned_literal( - wb, (uint32_t)cm->tu_presentation_delay, - cm->buffer_model.frame_presentation_delay_length); + wb, cm->frame_presentation_time, + cm->buffer_model.frame_presentation_time_length); } static void write_film_grain_params(AV1_COMP *cpi, @@ -2601,8 +2611,8 @@ static void write_film_grain_params(AV1_COMP *cpi, pars->chroma_scaling_from_luma = 0; // for monochrome override to 0 if (cm->seq_params.monochrome || pars->chroma_scaling_from_luma || - ((cm->subsampling_x == 1) && (cm->subsampling_y == 1) && - (pars->num_y_points == 0))) { + ((cm->seq_params.subsampling_x == 1) && + (cm->seq_params.subsampling_y == 1) && (pars->num_y_points == 0))) { pars->num_cb_points = 0; pars->num_cr_points = 0; } else { @@ -2931,18 +2941,19 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, struct aom_write_bit_buffer *saved_wb, struct aom_write_bit_buffer *wb) { AV1_COMMON *const cm = &cpi->common; + const SequenceHeader *const seq_params = &cm->seq_params; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; // NOTE: By default all coded frames to be used as a reference cm->is_reference_frame = 1; cm->frame_type = cm->intra_only ? INTRA_ONLY_FRAME : cm->frame_type; - if (cm->seq_params.still_picture) { + if (seq_params->still_picture) { assert(cm->show_existing_frame == 0); assert(cm->show_frame == 1); assert(cm->frame_type == KEY_FRAME); } - if (!cm->seq_params.reduced_still_picture_hdr) { + if (!seq_params->reduced_still_picture_hdr) { if (cm->show_existing_frame) { RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show]; @@ -2957,12 +2968,12 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, aom_wb_write_bit(wb, 1); // show_existing_frame aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3); - if (cm->seq_params.decoder_model_info_present_flag && + if (seq_params->decoder_model_info_present_flag && cm->timing_info.equal_picture_interval == 0) { write_tu_pts_info(cm, wb); } - if (cm->seq_params.frame_id_numbers_present_flag) { - int frame_id_len = cm->seq_params.frame_id_length; + if (seq_params->frame_id_numbers_present_flag) { + int frame_id_len = seq_params->frame_id_length; int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show]; aom_wb_write_literal(wb, display_frame_id, frame_id_len); } @@ -2983,7 +2994,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, aom_wb_write_bit(wb, cm->show_frame); if (cm->show_frame) { - if (cm->seq_params.decoder_model_info_present_flag && + if (seq_params->decoder_model_info_present_flag && cm->timing_info.equal_picture_interval == 0) write_tu_pts_info(cm, wb); } else { @@ -2997,18 +3008,18 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, } aom_wb_write_bit(wb, cm->disable_cdf_update); - if (cm->seq_params.force_screen_content_tools == 2) { + if (seq_params->force_screen_content_tools == 2) { aom_wb_write_bit(wb, cm->allow_screen_content_tools); } else { assert(cm->allow_screen_content_tools == - cm->seq_params.force_screen_content_tools); + seq_params->force_screen_content_tools); } if (cm->allow_screen_content_tools) { - if (cm->seq_params.force_integer_mv == 2) { + if (seq_params->force_integer_mv == 2) { aom_wb_write_bit(wb, cm->cur_frame_force_integer_mv); } else { - assert(cm->cur_frame_force_integer_mv == cm->seq_params.force_integer_mv); + assert(cm->cur_frame_force_integer_mv == seq_params->force_integer_mv); } } else { assert(cm->cur_frame_force_integer_mv == 0); @@ -3018,53 +3029,57 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, int frame_size_override_flag = 0; cm->frame_refs_short_signaling = 0; - if (cm->seq_params.reduced_still_picture_hdr) { - assert(cm->width == cm->seq_params.max_frame_width && - cm->height == cm->seq_params.max_frame_height); + if (seq_params->reduced_still_picture_hdr) { + assert(cm->width == seq_params->max_frame_width && + cm->height == seq_params->max_frame_height); } else { - if (cm->seq_params.frame_id_numbers_present_flag) { - int frame_id_len = cm->seq_params.frame_id_length; + if (seq_params->frame_id_numbers_present_flag) { + int frame_id_len = seq_params->frame_id_length; aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len); } - if (cm->width > cm->seq_params.max_frame_width || - cm->height > cm->seq_params.max_frame_height) { + if (cm->width > seq_params->max_frame_width || + cm->height > seq_params->max_frame_height) { aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Frame dimensions are larger than the maximum values"); } frame_size_override_flag = frame_is_sframe(cm) ? 1 - : (cm->width != cm->seq_params.max_frame_width || - cm->height != cm->seq_params.max_frame_height); + : (cm->width != seq_params->max_frame_width || + cm->height != seq_params->max_frame_height); if (!frame_is_sframe(cm)) aom_wb_write_bit(wb, frame_size_override_flag); - if (cm->seq_params.enable_order_hint) + if (seq_params->enable_order_hint) aom_wb_write_literal(wb, cm->frame_offset, - cm->seq_params.order_hint_bits_minus_1 + 1); + seq_params->order_hint_bits_minus_1 + 1); if (!cm->error_resilient_mode && !frame_is_intra_only(cm)) { aom_wb_write_literal(wb, cm->primary_ref_frame, PRIMARY_REF_BITS); } } - if (cm->seq_params.decoder_model_info_present_flag) { - aom_wb_write_bit(wb, cm->buffer_removal_delay_present); - if (cm->buffer_removal_delay_present) { + if (seq_params->decoder_model_info_present_flag) { + aom_wb_write_bit(wb, cm->buffer_removal_time_present); + if (cm->buffer_removal_time_present) { for (int op_num = 0; - op_num < cm->seq_params.operating_points_cnt_minus_1 + 1; op_num++) { + op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) { if (cm->op_params[op_num].decoder_model_param_present_flag) { - if (((cm->seq_params.operating_point_idc[op_num] >> + if (((seq_params->operating_point_idc[op_num] >> cm->temporal_layer_id) & 0x1 && - (cm->seq_params.operating_point_idc[op_num] >> + (seq_params->operating_point_idc[op_num] >> (cm->spatial_layer_id + 8)) & 0x1) || - cm->seq_params.operating_point_idc[op_num] == 0) { - aom_wb_write_literal( - wb, (uint32_t)cm->op_frame_timing[op_num].buffer_removal_delay, - cm->buffer_model.buffer_removal_delay_length); - cm->op_frame_timing[op_num].buffer_removal_delay++; + seq_params->operating_point_idc[op_num] == 0) { + aom_wb_write_unsigned_literal( + wb, cm->op_frame_timing[op_num].buffer_removal_time, + cm->buffer_model.buffer_removal_time_length); + cm->op_frame_timing[op_num].buffer_removal_time++; + if (cm->op_frame_timing[op_num].buffer_removal_time == 0) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "buffer_removal_time overflowed"); + } } } } @@ -3122,7 +3137,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, if (!frame_is_intra_only(cm) || cpi->refresh_frame_mask != 0xFF) { // Write all ref frame order hints if error_resilient_mode == 1 - if (cm->error_resilient_mode && cm->seq_params.enable_order_hint) { + if (cm->error_resilient_mode && seq_params->enable_order_hint) { RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) { // Get buffer index @@ -3131,7 +3146,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, // Write order hint to bit stream aom_wb_write_literal(wb, frame_bufs[buf_idx].cur_frame_offset, - cm->seq_params.order_hint_bits_minus_1 + 1); + seq_params->order_hint_bits_minus_1 + 1); } } } @@ -3156,7 +3171,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, // automatically. #define FRAME_REFS_SHORT_SIGNALING 0 #if FRAME_REFS_SHORT_SIGNALING - cm->frame_refs_short_signaling = cm->seq_params.enable_order_hint; + cm->frame_refs_short_signaling = seq_params->enable_order_hint; #endif // FRAME_REFS_SHORT_SIGNALING if (cm->frame_refs_short_signaling) { @@ -3167,7 +3182,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, check_frame_refs_short_signaling(cpi); } - if (cm->seq_params.enable_order_hint) + if (seq_params->enable_order_hint) aom_wb_write_bit(wb, cm->frame_refs_short_signaling); if (cm->frame_refs_short_signaling) { @@ -3183,10 +3198,10 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, if (!cm->frame_refs_short_signaling) aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame), REF_FRAMES_LOG2); - if (cm->seq_params.frame_id_numbers_present_flag) { + if (seq_params->frame_id_numbers_present_flag) { int i = get_ref_frame_map_idx(cpi, ref_frame); - int frame_id_len = cm->seq_params.frame_id_length; - int diff_len = cm->seq_params.delta_frame_id_length; + int frame_id_len = seq_params->frame_id_length; + int diff_len = seq_params->delta_frame_id_length; int delta_frame_id_minus_1 = ((cm->current_frame_id - cm->ref_frame_id[i] + (1 << frame_id_len)) % @@ -3222,7 +3237,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, } const int might_bwd_adapt = - !(cm->seq_params.reduced_still_picture_hdr) && !(cm->disable_cdf_update); + !(seq_params->reduced_still_picture_hdr) && !(cm->disable_cdf_update); if (cm->large_scale_tile) cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; @@ -3282,7 +3297,8 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb); - if (cm->film_grain_params_present && (cm->show_frame || cm->showable_frame)) { + if (seq_params->film_grain_params_present && + (cm->show_frame || cm->showable_frame)) { int flip_back_update_parameters_flag = 0; if (cm->frame_type != INTER_FRAME && cm->film_grain_params.update_parameters == 0) { @@ -3497,7 +3513,7 @@ static uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) { struct aom_write_bit_buffer wb = { dst, 0 }; uint32_t size = 0; - write_profile(cm->profile, &wb); + write_profile(cm->seq_params.profile, &wb); // Still picture or not aom_wb_write_bit(&wb, cm->seq_params.still_picture); @@ -3551,9 +3567,9 @@ static uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) { } write_sequence_header(cpi, &wb); - write_color_config(cm, &wb); + write_color_config(&cm->seq_params, &wb); - aom_wb_write_bit(&wb, cm->film_grain_params_present); + aom_wb_write_bit(&wb, cm->seq_params.film_grain_params_present); add_trailing_bits(&wb); @@ -3960,7 +3976,7 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) { // The TD is now written outside the frame encode loop // write sequence header obu if KEY_FRAME, preceded by 4-byte size - if (cm->frame_type == KEY_FRAME) { + if (cm->frame_type == KEY_FRAME && cm->show_frame) { obu_header_size = write_obu_header(OBU_SEQUENCE_HEADER, 0, data); obu_payload_size = write_sequence_header_obu(cpi, data + obu_header_size); diff --git a/third_party/aom/av1/encoder/block.h b/third_party/aom/av1/encoder/block.h index 13fc11c31..003e59e39 100644 --- a/third_party/aom/av1/encoder/block.h +++ b/third_party/aom/av1/encoder/block.h @@ -224,6 +224,7 @@ struct macroblock { int sadperbit4; int rdmult; int mb_energy; + int sb_energy_level; int *m_search_count_ptr; int *ex_search_count_ptr; @@ -258,7 +259,6 @@ struct macroblock { MvLimits mv_limits; uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; - uint8_t blk_skip_drl[MAX_MIB_SIZE * MAX_MIB_SIZE]; int skip; int skip_chroma_rd; diff --git a/third_party/aom/av1/encoder/dwt.c b/third_party/aom/av1/encoder/dwt.c index 0a57ebcfb..04088b25f 100644 --- a/third_party/aom/av1/encoder/dwt.c +++ b/third_party/aom/av1/encoder/dwt.c @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + #include <assert.h> #include <stdlib.h> #include <math.h> diff --git a/third_party/aom/av1/encoder/dwt.h b/third_party/aom/av1/encoder/dwt.h index 9a86db2f1..03318e5b7 100644 --- a/third_party/aom/av1/encoder/dwt.h +++ b/third_party/aom/av1/encoder/dwt.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + #include "av1/common/common.h" #include "av1/common/enums.h" diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c index 027b80a16..27ca53761 100644 --- a/third_party/aom/av1/encoder/encodeframe.c +++ b/third_party/aom/av1/encoder/encodeframe.c @@ -41,7 +41,6 @@ #include "av1/common/seg_common.h" #include "av1/common/tile_common.h" -#include "av1/encoder/ab_partition_model_weights.h" #include "av1/encoder/aq_complexity.h" #include "av1/encoder/aq_cyclicrefresh.h" #include "av1/encoder/aq_variance.h" @@ -54,6 +53,7 @@ #include "av1/encoder/ethread.h" #include "av1/encoder/extend.h" #include "av1/encoder/ml.h" +#include "av1/encoder/partition_model_weights.h" #include "av1/encoder/rd.h" #include "av1/encoder/rdopt.h" #include "av1/encoder/segmentation.h" @@ -2099,7 +2099,7 @@ static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile, // When use_square_partition_only is true, make sure at least one square // partition is allowed by selecting the next smaller square size as // *min_block_size. - if (cpi->sf.use_square_partition_only) { + if (min_size >= cpi->sf.use_square_partition_only_threshold) { min_size = AOMMIN(min_size, next_square_size[max_size]); } @@ -2363,6 +2363,7 @@ static int64_t dist_8x8_yuv(const AV1_COMP *const cpi, MACROBLOCK *const x, static void reset_partition(PC_TREE *pc_tree, BLOCK_SIZE bsize) { pc_tree->partitioning = PARTITION_NONE; pc_tree->cb_search_range = SEARCH_FULL_PLANE; + pc_tree->none.skip = 0; if (bsize >= BLOCK_8X8) { BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); @@ -2876,6 +2877,168 @@ static void ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx, } } +#define FEATURES 18 +#define LABELS 4 +// Use a ML model to predict if horz4 and vert4 should be considered. +static void ml_prune_4_partition(const AV1_COMP *const cpi, + const MACROBLOCK *const x, BLOCK_SIZE bsize, + int part_ctx, int64_t best_rd, + int64_t horz_rd[2], int64_t vert_rd[2], + int64_t split_rd[4], + int *const partition_horz4_allowed, + int *const partition_vert4_allowed) { + if (best_rd >= 1000000000) return; + const NN_CONFIG *nn_config = NULL; + switch (bsize) { + case BLOCK_16X16: nn_config = &av1_4_partition_nnconfig_16; break; + case BLOCK_32X32: nn_config = &av1_4_partition_nnconfig_32; break; + case BLOCK_64X64: nn_config = &av1_4_partition_nnconfig_64; break; + default: assert(0 && "Unexpected bsize."); + } + if (!nn_config) return; + + aom_clear_system_state(); + + // Generate features. + float features[FEATURES]; + int feature_index = 0; + features[feature_index++] = (float)part_ctx; + features[feature_index++] = (float)get_unsigned_bits(x->source_variance); + + const int rdcost = (int)AOMMIN(INT_MAX, best_rd); + int sub_block_rdcost[8] = { 0 }; + int rd_index = 0; + for (int i = 0; i < 2; ++i) { + if (horz_rd[i] > 0 && horz_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)horz_rd[i]; + ++rd_index; + } + for (int i = 0; i < 2; ++i) { + if (vert_rd[i] > 0 && vert_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)vert_rd[i]; + ++rd_index; + } + for (int i = 0; i < 4; ++i) { + if (split_rd[i] > 0 && split_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)split_rd[i]; + ++rd_index; + } + for (int i = 0; i < 8; ++i) { + // Ratio between the sub-block RD and the whole-block RD. + float rd_ratio = 1.0f; + if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost) + rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost; + features[feature_index++] = rd_ratio; + } + + // Get variance of the 1:4 and 4:1 sub-blocks. + unsigned int horz_4_source_var[4] = { 0 }; + unsigned int vert_4_source_var[4] = { 0 }; + { + BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4); + BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4); + const int src_stride = x->plane[0].src.stride; + const uint8_t *src = x->plane[0].src.buf; + const MACROBLOCKD *const xd = &x->e_mbd; + for (int i = 0; i < 4; ++i) { + const uint8_t *horz_src = + src + i * block_size_high[horz_4_bs] * src_stride; + const uint8_t *vert_src = src + i * block_size_wide[vert_4_bs]; + unsigned int horz_var, vert_var, sse; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + switch (xd->bd) { + case 10: + horz_var = cpi->fn_ptr[horz_4_bs].vf( + horz_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_10), + 0, &sse); + vert_var = cpi->fn_ptr[vert_4_bs].vf( + vert_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_10), + 0, &sse); + break; + case 12: + horz_var = cpi->fn_ptr[horz_4_bs].vf( + horz_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_12), + 0, &sse); + vert_var = cpi->fn_ptr[vert_4_bs].vf( + vert_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_12), + 0, &sse); + break; + case 8: + default: + horz_var = cpi->fn_ptr[horz_4_bs].vf( + horz_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_8), + 0, &sse); + vert_var = cpi->fn_ptr[vert_4_bs].vf( + vert_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_8), + 0, &sse); + break; + } + horz_4_source_var[i] = + ROUND_POWER_OF_TWO(horz_var, num_pels_log2_lookup[horz_4_bs]); + vert_4_source_var[i] = + ROUND_POWER_OF_TWO(vert_var, num_pels_log2_lookup[vert_4_bs]); + } else { + horz_var = cpi->fn_ptr[horz_4_bs].vf(horz_src, src_stride, AV1_VAR_OFFS, + 0, &sse); + vert_var = cpi->fn_ptr[vert_4_bs].vf(vert_src, src_stride, AV1_VAR_OFFS, + 0, &sse); + horz_4_source_var[i] = + ROUND_POWER_OF_TWO(horz_var, num_pels_log2_lookup[horz_4_bs]); + vert_4_source_var[i] = + ROUND_POWER_OF_TWO(vert_var, num_pels_log2_lookup[vert_4_bs]); + } + } + } + + const float denom = (float)(x->source_variance + 1); + const float low_b = 0.1f; + const float high_b = 10.0f; + for (int i = 0; i < 4; ++i) { + // Ratio between the 4:1 sub-block variance and the whole-block variance. + float var_ratio = (float)(horz_4_source_var[i] + 1) / denom; + if (var_ratio < low_b) var_ratio = low_b; + if (var_ratio > high_b) var_ratio = high_b; + features[feature_index++] = var_ratio; + } + for (int i = 0; i < 4; ++i) { + // Ratio between the 1:4 sub-block RD and the whole-block RD. + float var_ratio = (float)(vert_4_source_var[i] + 1) / denom; + if (var_ratio < low_b) var_ratio = low_b; + if (var_ratio > high_b) var_ratio = high_b; + features[feature_index++] = var_ratio; + } + assert(feature_index == FEATURES); + + // Calculate scores using the NN model. + float score[LABELS] = { 0.0f }; + av1_nn_predict(features, nn_config, score); + int int_score[LABELS]; + int max_score = -1000; + for (int i = 0; i < LABELS; ++i) { + int_score[i] = (int)(100 * score[i]); + max_score = AOMMAX(int_score[i], max_score); + } + + // Make decisions based on the model scores. + int thresh = max_score; + switch (bsize) { + case BLOCK_16X16: thresh -= 400; break; + case BLOCK_32X32: thresh -= 400; break; + case BLOCK_64X64: thresh -= 100; break; + default: break; + } + *partition_horz4_allowed = 0; + *partition_vert4_allowed = 0; + for (int i = 0; i < LABELS; ++i) { + if (int_score[i] >= thresh) { + if ((i >> 0) & 1) *partition_horz4_allowed = 1; + if ((i >> 1) & 1) *partition_vert4_allowed = 1; + } + } +} +#undef FEATURES +#undef LABELS + // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previous rate-distortion optimization // results, for encoding speed-up. @@ -3003,7 +3166,8 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, partition_vert_allowed &= partition_allowed || !has_cols; do_square_split &= bsize > min_size; } - if (cpi->sf.use_square_partition_only) { + + if (bsize > cpi->sf.use_square_partition_only_threshold) { partition_horz_allowed &= !has_rows; partition_vert_allowed &= !has_cols; } @@ -3480,13 +3644,6 @@ BEGIN_PARTITION_SEARCH: const int ext_partition_allowed = do_rectangular_split && bsize > BLOCK_8X8 && partition_none_allowed; - // partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or - // PARTITION_VERT_4 for this block. This is almost the same as - // ext_partition_allowed, except that we don't allow 128x32 or 32x128 blocks, - // so we require that bsize is not BLOCK_128X128. - const int partition4_allowed = - ext_partition_allowed && bsize != BLOCK_128X128; - // The standard AB partitions are allowed whenever ext-partition-types are // allowed int horzab_partition_allowed = ext_partition_allowed; @@ -3642,15 +3799,34 @@ BEGIN_PARTITION_SEARCH: restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } - // PARTITION_HORZ_4 + // partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or + // PARTITION_VERT_4 for this block. This is almost the same as + // ext_partition_allowed, except that we don't allow 128x32 or 32x128 blocks, + // so we require that bsize is not BLOCK_128X128. + const int partition4_allowed = + ext_partition_allowed && bsize != BLOCK_128X128; int partition_horz4_allowed = partition4_allowed && partition_horz_allowed; + int partition_vert4_allowed = partition4_allowed && partition_vert_allowed; if (cpi->sf.prune_ext_partition_types_search_level == 2) { partition_horz4_allowed &= (pc_tree->partitioning == PARTITION_HORZ || pc_tree->partitioning == PARTITION_HORZ_A || pc_tree->partitioning == PARTITION_HORZ_B || pc_tree->partitioning == PARTITION_SPLIT || pc_tree->partitioning == PARTITION_NONE); + partition_vert4_allowed &= (pc_tree->partitioning == PARTITION_VERT || + pc_tree->partitioning == PARTITION_VERT_A || + pc_tree->partitioning == PARTITION_VERT_B || + pc_tree->partitioning == PARTITION_SPLIT || + pc_tree->partitioning == PARTITION_NONE); } + if (cpi->sf.ml_prune_4_partition && partition4_allowed && + partition_horz_allowed && partition_vert_allowed) { + ml_prune_4_partition(cpi, x, bsize, pc_tree->partitioning, best_rdc.rdcost, + horz_rd, vert_rd, split_rd, &partition_horz4_allowed, + &partition_vert4_allowed); + } + + // PARTITION_HORZ_4 if (partition_horz4_allowed && has_rows && (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) { av1_init_rd_stats(&sum_rdc); @@ -3687,14 +3863,6 @@ BEGIN_PARTITION_SEARCH: } // PARTITION_VERT_4 - int partition_vert4_allowed = partition4_allowed && partition_vert_allowed; - if (cpi->sf.prune_ext_partition_types_search_level == 2) { - partition_vert4_allowed &= (pc_tree->partitioning == PARTITION_VERT || - pc_tree->partitioning == PARTITION_VERT_A || - pc_tree->partitioning == PARTITION_VERT_B || - pc_tree->partitioning == PARTITION_SPLIT || - pc_tree->partitioning == PARTITION_NONE); - } if (partition_vert4_allowed && has_cols && (do_rectangular_split || active_v_edge(cpi, mi_row, mi_step))) { av1_init_rd_stats(&sum_rdc); @@ -3857,6 +4025,7 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, } xd->cur_frame_force_integer_mv = cm->cur_frame_force_integer_mv; + x->sb_energy_level = 0; if (cm->delta_q_present_flag) { // Delta-q modulation based on variance av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes); @@ -3865,11 +4034,13 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, if (DELTAQ_MODULATION == 1) { const int block_wavelet_energy_level = av1_block_wavelet_energy_level(cpi, x, cm->seq_params.sb_size); + x->sb_energy_level = block_wavelet_energy_level; offset_qindex = av1_compute_deltaq_from_energy_level( cpi, block_wavelet_energy_level); } else { const int block_var_level = av1_block_energy(cpi, x, cm->seq_params.sb_size); + x->sb_energy_level = block_var_level; offset_qindex = av1_compute_deltaq_from_energy_level(cpi, block_var_level); } @@ -3943,6 +4114,8 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, x->use_cb_search_range = 0; init_first_partition_pass_stats_tables(x->first_partition_pass_stats); if (cpi->sf.two_pass_partition_search && + cpi->sf.use_square_partition_only_threshold < + cm->seq_params.sb_size && mi_row + mi_size_high[cm->seq_params.sb_size] < cm->mi_rows && mi_col + mi_size_wide[cm->seq_params.sb_size] < cm->mi_cols && cm->frame_type != KEY_FRAME) { @@ -4030,7 +4203,8 @@ static void init_encode_frame_mb_context(AV1_COMP *cpi) { // Copy data over into macro block data structures. av1_setup_src_planes(x, cpi->source, 0, 0, num_planes); - av1_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y, num_planes); + av1_setup_block_planes(xd, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y, num_planes); } static MV_REFERENCE_FRAME get_frame_type(const AV1_COMP *cpi) { @@ -4116,8 +4290,8 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row, TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col]; int mi_row; - av1_zero_above_context(cm, tile_info->mi_col_start, tile_info->mi_col_end, - tile_row); + av1_zero_above_context(cm, &td->mb.e_mbd, tile_info->mi_col_start, + tile_info->mi_col_end, tile_row); av1_init_above_context(cm, &td->mb.e_mbd, tile_row); // Set up pointers to per thread motion search counters. @@ -4128,7 +4302,7 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row, this_tile->tctx = *cm->fc; td->mb.e_mbd.tile_ctx = &this_tile->tctx; - cfl_init(&td->mb.e_mbd.cfl, cm); + cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params); av1_crc32c_calculator_init(&td->mb.mb_rd_record.crc_calculator); @@ -4263,25 +4437,24 @@ static int is_screen_content(const uint8_t *src, int use_hbd, int bd, return counts * blk_h * blk_w * 10 > width * height; } +static const uint8_t ref_frame_flag_list[REF_FRAMES] = { 0, + AOM_LAST_FLAG, + AOM_LAST2_FLAG, + AOM_LAST3_FLAG, + AOM_GOLD_FLAG, + AOM_BWD_FLAG, + AOM_ALT2_FLAG, + AOM_ALT_FLAG }; + // Enforce the number of references for each arbitrary frame limited to // (INTER_REFS_PER_FRAME - 1) static void enforce_max_ref_frames(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; - static const int flag_list[REF_FRAMES] = { 0, - AOM_LAST_FLAG, - AOM_LAST2_FLAG, - AOM_LAST3_FLAG, - AOM_GOLD_FLAG, - AOM_BWD_FLAG, - AOM_ALT2_FLAG, - AOM_ALT_FLAG }; MV_REFERENCE_FRAME ref_frame; int total_valid_refs = 0; - - (void)flag_list; - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - if (cpi->ref_frame_flags & flag_list[ref_frame]) total_valid_refs++; + if (cpi->ref_frame_flags & ref_frame_flag_list[ref_frame]) + total_valid_refs++; } // NOTE(zoeliu): When all the possible reference frames are availble, we @@ -4617,7 +4790,6 @@ static void encode_frame_internal(AV1_COMP *cpi) { cm->prev_mi = cm->allow_ref_frame_mvs ? cm->prev_mip : NULL; x->txb_split_count = 0; - av1_zero(x->blk_skip_drl); av1_zero(rdc->global_motion_used); av1_zero(cpi->gmparams_cost); @@ -4672,8 +4844,9 @@ static void encode_frame_internal(AV1_COMP *cpi) { } compute_global_motion_feature_based( - model, cpi->source, ref_buf[frame], cpi->common.bit_depth, - inliers_by_motion, params_by_motion, RANSAC_NUM_MOTIONS); + model, cpi->source, ref_buf[frame], + cpi->common.seq_params.bit_depth, inliers_by_motion, + params_by_motion, RANSAC_NUM_MOTIONS); for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) { if (inliers_by_motion[i] == 0) continue; @@ -4734,6 +4907,15 @@ static void encode_frame_internal(AV1_COMP *cpi) { cpi->gmtype_cost[cm->global_motion[frame].wmtype] - cpi->gmtype_cost[IDENTITY]; } + // clear disabled ref_frames + for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) { + const int ref_disabled = + !(cpi->ref_frame_flags & ref_frame_flag_list[frame]); + if (ref_disabled && cpi->sf.recode_loop != DISALLOW_RECODE) { + cpi->gmparams_cost[frame] = 0; + cm->global_motion[frame] = default_warp_params; + } + } cpi->global_motion_search_done = 1; } memcpy(cm->cur_frame->global_motion, cm->global_motion, @@ -5082,8 +5264,9 @@ static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data, } if (!is_inter) { - xd->cfl.is_chroma_reference = is_chroma_reference( - mi_row, mi_col, bsize, cm->subsampling_x, cm->subsampling_y); + xd->cfl.is_chroma_reference = + is_chroma_reference(mi_row, mi_col, bsize, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y); xd->cfl.store_y = store_cfl_required(cm, xd); mbmi->skip = 1; for (int plane = 0; plane < num_planes; ++plane) { diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c index 196e18d8a..13ea32e38 100644 --- a/third_party/aom/av1/encoder/encoder.c +++ b/third_party/aom/av1/encoder/encoder.c @@ -56,6 +56,11 @@ #include "av1/encoder/grain_test_vectors.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" +#if CONFIG_DENOISE +#include "aom_dsp/grain_table.h" +#include "aom_dsp/noise_util.h" +#include "aom_dsp/noise_model.h" +#endif #include "aom_ports/aom_timer.h" #include "aom_ports/mem.h" #include "aom_ports/system_state.h" @@ -290,7 +295,8 @@ static void setup_frame(AV1_COMP *cpi) { cm->fb_of_context_type[i] = -1; } cm->fb_of_context_type[REGULAR_FRAME] = - get_ref_frame_map_idx(cpi, GOLDEN_FRAME); + cm->show_frame ? get_ref_frame_map_idx(cpi, GOLDEN_FRAME) + : get_ref_frame_map_idx(cpi, ALTREF_FRAME); cm->frame_context_idx = REGULAR_FRAME; } else { const GF_GROUP *gf_group = &cpi->twopass.gf_group; @@ -315,7 +321,7 @@ static void setup_frame(AV1_COMP *cpi) { } } - if (cm->frame_type == KEY_FRAME) { + if (cm->frame_type == KEY_FRAME && cm->show_frame) { cpi->refresh_golden_frame = 1; cpi->refresh_alt_ref_frame = 1; av1_zero(cpi->interp_filter_selected); @@ -344,19 +350,20 @@ static void setup_frame(AV1_COMP *cpi) { static void enc_setup_mi(AV1_COMMON *cm) { int i; + int mi_rows_sb_aligned = calc_mi_size(cm->mi_rows); cm->mi = cm->mip; - memset(cm->mip, 0, cm->mi_stride * cm->mi_rows * sizeof(*cm->mip)); + memset(cm->mip, 0, cm->mi_stride * mi_rows_sb_aligned * sizeof(*cm->mip)); cm->prev_mi = cm->prev_mip; // Clear top border row memset(cm->prev_mip, 0, sizeof(*cm->prev_mip) * cm->mi_stride); // Clear left border column - for (i = 0; i < cm->mi_rows; ++i) + for (i = 0; i < mi_rows_sb_aligned; ++i) memset(&cm->prev_mip[i * cm->mi_stride], 0, sizeof(*cm->prev_mip)); cm->mi_grid_visible = cm->mi_grid_base; cm->prev_mi_grid_visible = cm->prev_mi_grid_base; memset(cm->mi_grid_base, 0, - cm->mi_stride * cm->mi_rows * sizeof(*cm->mi_grid_base)); + cm->mi_stride * mi_rows_sb_aligned * sizeof(*cm->mi_grid_base)); } static int enc_alloc_mi(AV1_COMMON *cm, int mi_size) { @@ -441,32 +448,32 @@ static void update_film_grain_parameters(struct AV1_COMP *cpi, AV1_COMMON *const cm = &cpi->common; cpi->oxcf = *oxcf; - if (cm->film_grain_table) { - aom_film_grain_table_free(cm->film_grain_table); - aom_free(cm->film_grain_table); + if (cpi->film_grain_table) { + aom_film_grain_table_free(cpi->film_grain_table); + aom_free(cpi->film_grain_table); + cpi->film_grain_table = NULL; } - cm->film_grain_table = 0; if (oxcf->film_grain_test_vector) { - cm->film_grain_params_present = 1; + cm->seq_params.film_grain_params_present = 1; if (cm->frame_type == KEY_FRAME) { memcpy(&cm->film_grain_params, film_grain_test_vectors + oxcf->film_grain_test_vector - 1, sizeof(cm->film_grain_params)); - cm->film_grain_params.bit_depth = cm->bit_depth; - if (cm->color_range == AOM_CR_FULL_RANGE) { + cm->film_grain_params.bit_depth = cm->seq_params.bit_depth; + if (cm->seq_params.color_range == AOM_CR_FULL_RANGE) { cm->film_grain_params.clip_to_restricted_range = 0; } } } else if (oxcf->film_grain_table_filename) { - cm->film_grain_table = aom_malloc(sizeof(*cm->film_grain_table)); - memset(cm->film_grain_table, 0, sizeof(aom_film_grain_table_t)); + cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table)); + memset(cpi->film_grain_table, 0, sizeof(aom_film_grain_table_t)); - aom_film_grain_table_read(cm->film_grain_table, + aom_film_grain_table_read(cpi->film_grain_table, oxcf->film_grain_table_filename, &cm->error); } else { - cm->film_grain_params_present = 0; + cm->seq_params.film_grain_params_present = 0; memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params)); } } @@ -523,6 +530,17 @@ static void dealloc_compressor_data(AV1_COMP *cpi) { av1_free_pc_tree(&cpi->td, num_planes); aom_free(cpi->td.mb.palette_buffer); + +#if CONFIG_DENOISE + if (cpi->denoise_and_model) { + aom_denoise_and_model_free(cpi->denoise_and_model); + cpi->denoise_and_model = NULL; + } +#endif + if (cpi->film_grain_table) { + aom_film_grain_table_free(cpi->film_grain_table); + cpi->film_grain_table = NULL; + } } static void save_coding_context(AV1_COMP *cpi) { @@ -596,8 +614,8 @@ static void configure_static_seg_features(AV1_COMP *cpi) { seg->update_map = 1; seg->update_data = 1; - qi_delta = - av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875, cm->bit_depth); + qi_delta = av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875, + cm->seq_params.bit_depth); av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2); av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2); av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2); @@ -621,8 +639,8 @@ static void configure_static_seg_features(AV1_COMP *cpi) { seg->update_map = 0; seg->update_data = 1; - qi_delta = - av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125, cm->bit_depth); + qi_delta = av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125, + cm->seq_params.bit_depth); av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta + 2); av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); @@ -705,53 +723,58 @@ static void update_reference_segmentation_map(AV1_COMP *cpi) { static void alloc_raw_frame_buffers(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; + const SequenceHeader *const seq_params = &cm->seq_params; const AV1EncoderConfig *oxcf = &cpi->oxcf; if (!cpi->lookahead) - cpi->lookahead = av1_lookahead_init( - oxcf->width, oxcf->height, cm->subsampling_x, cm->subsampling_y, - cm->use_highbitdepth, oxcf->lag_in_frames); + cpi->lookahead = + av1_lookahead_init(oxcf->width, oxcf->height, seq_params->subsampling_x, + seq_params->subsampling_y, + seq_params->use_highbitdepth, oxcf->lag_in_frames); if (!cpi->lookahead) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate lag buffers"); // TODO(agrange) Check if ARF is enabled and skip allocation if not. - if (aom_realloc_frame_buffer(&cpi->alt_ref_buffer, oxcf->width, oxcf->height, - cm->subsampling_x, cm->subsampling_y, - cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, - cm->byte_alignment, NULL, NULL, NULL)) + if (aom_realloc_frame_buffer( + &cpi->alt_ref_buffer, oxcf->width, oxcf->height, + seq_params->subsampling_x, seq_params->subsampling_y, + seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate altref buffer"); } static void alloc_util_frame_buffers(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; - if (aom_realloc_frame_buffer(&cpi->last_frame_uf, cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, - cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, - cm->byte_alignment, NULL, NULL, NULL)) + const SequenceHeader *const seq_params = &cm->seq_params; + if (aom_realloc_frame_buffer( + &cpi->last_frame_uf, cm->width, cm->height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate last frame buffer"); if (aom_realloc_frame_buffer( &cpi->trial_frame_rst, cm->superres_upscaled_width, - cm->superres_upscaled_height, cm->subsampling_x, cm->subsampling_y, - cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, - NULL, NULL)) + cm->superres_upscaled_height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate trial restored frame buffer"); - if (aom_realloc_frame_buffer(&cpi->scaled_source, cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, - cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, - cm->byte_alignment, NULL, NULL, NULL)) + if (aom_realloc_frame_buffer( + &cpi->scaled_source, cm->width, cm->height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate scaled source buffer"); - if (aom_realloc_frame_buffer(&cpi->scaled_last_source, cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, - cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, - cm->byte_alignment, NULL, NULL, NULL)) + if (aom_realloc_frame_buffer( + &cpi->scaled_last_source, cm->width, cm->height, + seq_params->subsampling_x, seq_params->subsampling_y, + seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate scaled last source buffer"); } @@ -846,8 +869,6 @@ static void init_buffer_indices(AV1_COMP *cpi) { int fb_idx; for (fb_idx = 0; fb_idx < REF_FRAMES; ++fb_idx) cpi->ref_fb_idx[fb_idx] = fb_idx; - for (fb_idx = 0; fb_idx < MAX_EXT_ARFS + 1; ++fb_idx) - cpi->arf_map[fb_idx] = LAST_REF_FRAMES + 2 + fb_idx; cpi->rate_index = 0; cpi->rate_size = 0; cpi->cur_poc = -1; @@ -941,7 +962,8 @@ static void set_bitstream_level_tier(SequenceHeader *seq, AV1_COMMON *cm, // Set the maximum parameters for bitrate and buffer size for this profile, // level, and tier cm->op_params[i].bitrate = max_level_bitrate( - cm->profile, major_minor_to_seq_level_idx(seq->level[i]), seq->tier[i]); + cm->seq_params.profile, major_minor_to_seq_level_idx(seq->level[i]), + seq->tier[i]); // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass the // check if (cm->op_params[i].bitrate == 0) @@ -1006,15 +1028,15 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) { cpi->oxcf = *oxcf; cpi->framerate = oxcf->init_framerate; - cm->profile = oxcf->profile; - cm->bit_depth = oxcf->bit_depth; - cm->use_highbitdepth = oxcf->use_highbitdepth; - cm->color_primaries = oxcf->color_primaries; - cm->transfer_characteristics = oxcf->transfer_characteristics; - cm->matrix_coefficients = oxcf->matrix_coefficients; + cm->seq_params.profile = oxcf->profile; + cm->seq_params.bit_depth = oxcf->bit_depth; + cm->seq_params.use_highbitdepth = oxcf->use_highbitdepth; + cm->seq_params.color_primaries = oxcf->color_primaries; + cm->seq_params.transfer_characteristics = oxcf->transfer_characteristics; + cm->seq_params.matrix_coefficients = oxcf->matrix_coefficients; cm->seq_params.monochrome = oxcf->monochrome; - cm->chroma_sample_position = oxcf->chroma_sample_position; - cm->color_range = oxcf->color_range; + cm->seq_params.chroma_sample_position = oxcf->chroma_sample_position; + cm->seq_params.color_range = oxcf->color_range; cm->timing_info_present = oxcf->timing_info_present; cm->timing_info.num_units_in_display_tick = oxcf->timing_info.num_units_in_display_tick; @@ -1032,7 +1054,7 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) { // set the decoder model parameters in schedule mode cm->buffer_model.num_units_in_decoding_tick = oxcf->buffer_model.num_units_in_decoding_tick; - cm->buffer_removal_delay_present = 1; + cm->buffer_removal_time_present = 1; set_aom_dec_model_info(&cm->buffer_model); set_dec_model_op_parameters(&cm->op_params[0]); } else if (cm->timing_info_present && @@ -1365,8 +1387,8 @@ MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x16) static void highbd_set_var_fns(AV1_COMP *const cpi) { AV1_COMMON *const cm = &cpi->common; - if (cm->use_highbitdepth) { - switch (cm->bit_depth) { + if (cm->seq_params.use_highbitdepth) { + switch (cm->seq_params.bit_depth) { case AOM_BITS_8: HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits8, aom_highbd_sad64x16_avg_bits8, aom_highbd_8_variance64x16, @@ -2226,7 +2248,7 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { default: assert(0 && - "cm->bit_depth should be AOM_BITS_8, " + "cm->seq_params.bit_depth should be AOM_BITS_8, " "AOM_BITS_10 or AOM_BITS_12"); } } @@ -2253,20 +2275,22 @@ static void realloc_segmentation_maps(AV1_COMP *cpi) { void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { AV1_COMMON *const cm = &cpi->common; + SequenceHeader *const seq_params = &cm->seq_params; const int num_planes = av1_num_planes(cm); RATE_CONTROL *const rc = &cpi->rc; MACROBLOCK *const x = &cpi->td.mb; - if (cm->profile != oxcf->profile) cm->profile = oxcf->profile; - cm->bit_depth = oxcf->bit_depth; - cm->color_primaries = oxcf->color_primaries; - cm->transfer_characteristics = oxcf->transfer_characteristics; - cm->matrix_coefficients = oxcf->matrix_coefficients; - cm->seq_params.monochrome = oxcf->monochrome; - cm->chroma_sample_position = oxcf->chroma_sample_position; - cm->color_range = oxcf->color_range; + if (seq_params->profile != oxcf->profile) seq_params->profile = oxcf->profile; + seq_params->bit_depth = oxcf->bit_depth; + seq_params->color_primaries = oxcf->color_primaries; + seq_params->transfer_characteristics = oxcf->transfer_characteristics; + seq_params->matrix_coefficients = oxcf->matrix_coefficients; + seq_params->monochrome = oxcf->monochrome; + seq_params->chroma_sample_position = oxcf->chroma_sample_position; + seq_params->color_range = oxcf->color_range; - assert(IMPLIES(cm->profile <= PROFILE_1, cm->bit_depth <= AOM_BITS_10)); + assert(IMPLIES(seq_params->profile <= PROFILE_1, + seq_params->bit_depth <= AOM_BITS_10)); cm->timing_info_present = oxcf->timing_info_present; cm->timing_info.num_units_in_display_tick = @@ -2277,20 +2301,20 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { cm->timing_info.num_ticks_per_picture = oxcf->timing_info.num_ticks_per_picture; - cm->seq_params.display_model_info_present_flag = + seq_params->display_model_info_present_flag = oxcf->display_model_info_present_flag; - cm->seq_params.decoder_model_info_present_flag = + seq_params->decoder_model_info_present_flag = oxcf->decoder_model_info_present_flag; if (oxcf->decoder_model_info_present_flag) { // set the decoder model parameters in schedule mode cm->buffer_model.num_units_in_decoding_tick = oxcf->buffer_model.num_units_in_decoding_tick; - cm->buffer_removal_delay_present = 1; + cm->buffer_removal_time_present = 1; set_aom_dec_model_info(&cm->buffer_model); set_dec_model_op_parameters(&cm->op_params[0]); } else if (cm->timing_info_present && cm->timing_info.equal_picture_interval && - !cm->seq_params.decoder_model_info_present_flag) { + !seq_params->decoder_model_info_present_flag) { // set the decoder model parameters in resource availability mode set_resource_availability_parameters(&cm->op_params[0]); } else { @@ -2302,7 +2326,7 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { cpi->oxcf = *oxcf; cpi->common.options = oxcf->cfg; - x->e_mbd.bd = (int)cm->bit_depth; + x->e_mbd.bd = (int)seq_params->bit_depth; x->e_mbd.global_motion = cm->global_motion; if ((oxcf->pass == 0) && (oxcf->rc_mode == AOM_Q)) { @@ -2360,15 +2384,15 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { cm->width = cpi->oxcf.width; cm->height = cpi->oxcf.height; - int sb_size = cm->seq_params.sb_size; + int sb_size = seq_params->sb_size; // Superblock size should not be updated after the first key frame. if (!cpi->seq_params_locked) { set_sb_size(&cm->seq_params, select_sb_size(cpi)); } - if (cpi->initial_width || sb_size != cm->seq_params.sb_size) { + if (cpi->initial_width || sb_size != seq_params->sb_size) { if (cm->width > cpi->initial_width || cm->height > cpi->initial_height || - cm->seq_params.sb_size != sb_size) { + seq_params->sb_size != sb_size) { av1_free_context_buffers(cm); av1_free_pc_tree(&cpi->td, num_planes); alloc_compressor_data(cpi); @@ -2395,7 +2419,7 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { // Init sequence level coding tools // This should not be called after the first key frame. if (!cpi->seq_params_locked) { - cm->seq_params.operating_points_cnt_minus_1 = + seq_params->operating_points_cnt_minus_1 = cm->number_spatial_layers > 1 ? cm->number_spatial_layers - 1 : 0; init_seq_coding_tools(&cm->seq_params, cm, oxcf); } @@ -2411,6 +2435,9 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, av1_zero(*cpi); + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. if (setjmp(cm->error.jmp)) { cm->error.setjmp = 0; av1_remove_compressor(cpi); @@ -3082,28 +3109,52 @@ static void check_show_existing_frame(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; const FRAME_UPDATE_TYPE next_frame_update_type = gf_group->update_type[gf_group->index]; +#if USE_SYMM_MULTI_LAYER + const int which_arf = (cpi->new_bwdref_update_rule == 1) + ? gf_group->arf_update_idx[gf_group->index] > 0 + : gf_group->arf_update_idx[gf_group->index]; +#else const int which_arf = gf_group->arf_update_idx[gf_group->index]; +#endif if (cm->show_existing_frame == 1) { cm->show_existing_frame = 0; } else if (cpi->rc.is_last_bipred_frame) { - // NOTE: If the current frame is a last bi-predictive frame, it is - // needed next to show the BWDREF_FRAME, which is pointed by - // the last_fb_idxes[0] after reference frame buffer update - cpi->rc.is_last_bipred_frame = 0; - cm->show_existing_frame = 1; - cpi->existing_fb_idx_to_show = cpi->ref_fb_idx[0]; +#if USE_SYMM_MULTI_LAYER + // NOTE: When new structure is used, every bwdref will have one overlay + // frame. Therefore, there is no need to find out which frame to + // show in advance. + if (cpi->new_bwdref_update_rule == 0) { +#endif + // NOTE: If the current frame is a last bi-predictive frame, it is + // needed next to show the BWDREF_FRAME, which is pointed by + // the last_fb_idxes[0] after reference frame buffer update + cpi->rc.is_last_bipred_frame = 0; + cm->show_existing_frame = 1; + cpi->existing_fb_idx_to_show = cpi->ref_fb_idx[0]; +#if USE_SYMM_MULTI_LAYER + } +#endif } else if (cpi->is_arf_filter_off[which_arf] && (next_frame_update_type == OVERLAY_UPDATE || next_frame_update_type == INTNL_OVERLAY_UPDATE)) { +#if USE_SYMM_MULTI_LAYER + const int bwdref_to_show = + (cpi->new_bwdref_update_rule == 1) ? BWDREF_FRAME : ALTREF2_FRAME; +#else + const int bwdref_to_show = ALTREF2_FRAME; +#endif // Other parameters related to OVERLAY_UPDATE will be taken care of // in av1_rc_get_second_pass_params(cpi) cm->show_existing_frame = 1; cpi->rc.is_src_frame_alt_ref = 1; cpi->existing_fb_idx_to_show = (next_frame_update_type == OVERLAY_UPDATE) ? cpi->ref_fb_idx[ALTREF_FRAME - 1] - : cpi->ref_fb_idx[ALTREF2_FRAME - 1]; - cpi->is_arf_filter_off[which_arf] = 0; + : cpi->ref_fb_idx[bwdref_to_show - 1]; +#if USE_SYMM_MULTI_LAYER + if (cpi->new_bwdref_update_rule == 0) +#endif + cpi->is_arf_filter_off[which_arf] = 0; } cpi->rc.is_src_frame_ext_arf = 0; } @@ -3288,6 +3339,48 @@ static INLINE void shift_last_ref_frames(AV1_COMP *cpi) { } } +#if USE_SYMM_MULTI_LAYER +// This function is used to shift the virtual indices of bwd reference +// frames as follows: +// BWD_REF -> ALT2_REF -> EXT_REF +// to clear a space to store the closest bwdref +static INLINE void rshift_bwd_ref_frames(AV1_COMP *cpi) { + // TODO(isbs): shift the scaled indices as well + static const int ordered_bwd[3] = { BWDREF_FRAME - 1, ALTREF2_FRAME - 1, + EXTREF_FRAME - 1 }; + + for (int i = 2; i > 0; --i) { + cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i - 1]]; + + // [0] is allocated to the current coded frame, i.e. bwdref + memcpy( + cpi->interp_filter_selected[ordered_bwd[i] + LAST_FRAME], + cpi->interp_filter_selected[ordered_bwd[i - 1] + LAST_FRAME], + sizeof(cpi->interp_filter_selected[ordered_bwd[i - 1] + LAST_FRAME])); + } +} + +// This function is used to shift the virtual indices of bwd reference +// frames as follows: +// BWD_REF <- ALT2_REF <- EXT_REF +// to update the bwd reference frame for coding the next frame. +static INLINE void lshift_bwd_ref_frames(AV1_COMP *cpi) { + // TODO(isbs): shift the scaled indices as well + static const int ordered_bwd[3] = { BWDREF_FRAME - 1, ALTREF2_FRAME - 1, + EXTREF_FRAME - 1 }; + + for (int i = 0; i < 2; ++i) { + cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i + 1]]; + + // [0] is allocated to the current coded frame, i.e. bwdref + memcpy( + cpi->interp_filter_selected[ordered_bwd[i] + LAST_FRAME], + cpi->interp_filter_selected[ordered_bwd[i + 1] + LAST_FRAME], + sizeof(cpi->interp_filter_selected[ordered_bwd[i + 1] + LAST_FRAME])); + } +} +#endif // USE_SYMM_MULTI_LAYER + #if USE_GF16_MULTI_LAYER static void update_reference_frames_gf16(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; @@ -3343,7 +3436,9 @@ static void update_reference_frames(AV1_COMP *cpi) { // At this point the new frame has been encoded. // If any buffer copy / swapping is signaled it should be done here. - if (cm->frame_type == KEY_FRAME || frame_is_sframe(cm)) { + // Only update all of the reference buffers if a KEY_FRAME is also a + // show_frame. This ensures a fwd keyframe does not update all of the buffers + if ((cm->frame_type == KEY_FRAME && cm->show_frame) || frame_is_sframe(cm)) { for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->ref_fb_idx[ref_frame]], @@ -3370,37 +3465,49 @@ static void update_reference_frames(AV1_COMP *cpi) { cpi->ref_fb_idx[ALTREF_FRAME - 1] = cpi->ref_fb_idx[GOLDEN_FRAME - 1]; cpi->ref_fb_idx[GOLDEN_FRAME - 1] = tmp; - // We need to modify the mapping accordingly - cpi->arf_map[0] = cpi->ref_fb_idx[ALTREF_FRAME - 1]; // TODO(zoeliu): Do we need to copy cpi->interp_filter_selected[0] over to // cpi->interp_filter_selected[GOLDEN_FRAME]? } else if (cpi->rc.is_src_frame_ext_arf && cm->show_existing_frame) { +#if CONFIG_DEBUG + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + assert(gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE); +#endif +#if USE_SYMM_MULTI_LAYER + const int bwdref_to_show = + (cpi->new_bwdref_update_rule == 1) ? BWDREF_FRAME : ALTREF2_FRAME; +#else + const int bwdref_to_show = ALTREF2_FRAME; +#endif // Deal with the special case for showing existing internal ALTREF_FRAME // Refresh the LAST_FRAME with the ALTREF_FRAME and retire the LAST3_FRAME // by updating the virtual indices. - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - const int which_arf = gf_group->arf_ref_idx[gf_group->index]; - assert(gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE); - const int tmp = cpi->ref_fb_idx[LAST_REF_FRAMES - 1]; shift_last_ref_frames(cpi); - cpi->ref_fb_idx[LAST_FRAME - 1] = cpi->ref_fb_idx[ALTREF2_FRAME - 1]; - cpi->ref_fb_idx[ALTREF2_FRAME - 1] = tmp; - // We need to modify the mapping accordingly - cpi->arf_map[which_arf] = cpi->ref_fb_idx[ALTREF2_FRAME - 1]; + cpi->ref_fb_idx[LAST_FRAME - 1] = cpi->ref_fb_idx[bwdref_to_show - 1]; memcpy(cpi->interp_filter_selected[LAST_FRAME], - cpi->interp_filter_selected[ALTREF2_FRAME], - sizeof(cpi->interp_filter_selected[ALTREF2_FRAME])); + cpi->interp_filter_selected[bwdref_to_show], + sizeof(cpi->interp_filter_selected[bwdref_to_show])); +#if USE_SYMM_MULTI_LAYER + if (cpi->new_bwdref_update_rule == 1) { + lshift_bwd_ref_frames(cpi); + // pass outdated forward reference frame (previous LAST3) to the + // spared space + cpi->ref_fb_idx[EXTREF_FRAME - 1] = tmp; + } else { +#endif + cpi->ref_fb_idx[bwdref_to_show - 1] = tmp; +#if USE_SYMM_MULTI_LAYER + } +#endif } else { /* For non key/golden frames */ // === ALTREF_FRAME === if (cpi->refresh_alt_ref_frame) { int arf_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1]; - int which_arf = 0; ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx); - memcpy(cpi->interp_filter_selected[ALTREF_FRAME + which_arf], + memcpy(cpi->interp_filter_selected[ALTREF_FRAME], cpi->interp_filter_selected[0], sizeof(cpi->interp_filter_selected[0])); } @@ -3418,10 +3525,25 @@ static void update_reference_frames(AV1_COMP *cpi) { // === BWDREF_FRAME === if (cpi->refresh_bwd_ref_frame) { - ref_cnt_fb(pool->frame_bufs, - &cm->ref_frame_map[cpi->ref_fb_idx[BWDREF_FRAME - 1]], - cm->new_fb_idx); - +#if USE_SYMM_MULTI_LAYER + if (cpi->new_bwdref_update_rule) { + // We shift the backward reference frame as follows: + // BWDREF -> ALTREF2 -> EXTREF + // and assign the newly coded frame to BWDREF so that it always + // keeps the nearest future frame + int tmp = cpi->ref_fb_idx[EXTREF_FRAME - 1]; + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[tmp], cm->new_fb_idx); + + rshift_bwd_ref_frames(cpi); + cpi->ref_fb_idx[BWDREF_FRAME - 1] = tmp; + } else { +#endif // USE_SYMM_MULTI_LAYER + ref_cnt_fb(pool->frame_bufs, + &cm->ref_frame_map[cpi->ref_fb_idx[BWDREF_FRAME - 1]], + cm->new_fb_idx); +#if USE_SYMM_MULTI_LAYER + } +#endif memcpy(cpi->interp_filter_selected[BWDREF_FRAME], cpi->interp_filter_selected[0], sizeof(cpi->interp_filter_selected[0])); @@ -3486,7 +3608,14 @@ static void update_reference_frames(AV1_COMP *cpi) { cpi->interp_filter_selected[0], sizeof(cpi->interp_filter_selected[0])); + // If the new structure is used, we will always have overlay frames coupled + // with bwdref frames. Therefore, we won't have to perform this update + // in advance (we do this update when the overlay frame shows up). +#if USE_SYMM_MULTI_LAYER + if (cpi->new_bwdref_update_rule == 0 && cpi->rc.is_last_bipred_frame) { +#else if (cpi->rc.is_last_bipred_frame) { +#endif // Refresh the LAST_FRAME with the BWDREF_FRAME and retire the // LAST3_FRAME by updating the virtual indices. // @@ -3555,13 +3684,14 @@ static void scale_references(AV1_COMP *cpi) { if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width || new_fb_ptr->buf.y_crop_height != cm->height) { if (aom_realloc_frame_buffer( - &new_fb_ptr->buf, cm->width, cm->height, cm->subsampling_x, - cm->subsampling_y, cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, + &new_fb_ptr->buf, cm->width, cm->height, + cm->seq_params.subsampling_x, cm->seq_params.subsampling_y, + cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); - av1_resize_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth, - num_planes); + av1_resize_and_extend_frame( + ref, &new_fb_ptr->buf, (int)cm->seq_params.bit_depth, num_planes); cpi->scaled_ref_idx[ref_frame - 1] = new_fb; alloc_frame_mvs(cm, new_fb); } @@ -3706,13 +3836,14 @@ static void init_ref_frame_bufs(AV1_COMMON *cm) { static void check_initial_width(AV1_COMP *cpi, int use_highbitdepth, int subsampling_x, int subsampling_y) { AV1_COMMON *const cm = &cpi->common; + SequenceHeader *const seq_params = &cm->seq_params; - if (!cpi->initial_width || cm->use_highbitdepth != use_highbitdepth || - cm->subsampling_x != subsampling_x || - cm->subsampling_y != subsampling_y) { - cm->subsampling_x = subsampling_x; - cm->subsampling_y = subsampling_y; - cm->use_highbitdepth = use_highbitdepth; + if (!cpi->initial_width || seq_params->use_highbitdepth != use_highbitdepth || + seq_params->subsampling_x != subsampling_x || + seq_params->subsampling_y != subsampling_y) { + seq_params->subsampling_x = subsampling_x; + seq_params->subsampling_y = subsampling_y; + seq_params->use_highbitdepth = use_highbitdepth; alloc_raw_frame_buffers(cpi); init_ref_frame_bufs(cm); @@ -3730,8 +3861,9 @@ static void check_initial_width(AV1_COMP *cpi, int use_highbitdepth, static int set_size_literal(AV1_COMP *cpi, int width, int height) { AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); - check_initial_width(cpi, cm->use_highbitdepth, cm->subsampling_x, - cm->subsampling_y); + check_initial_width(cpi, cm->seq_params.use_highbitdepth, + cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y); if (width <= 0 || height <= 0) return 1; @@ -3753,6 +3885,7 @@ static int set_size_literal(AV1_COMP *cpi, int width, int height) { static void set_frame_size(AV1_COMP *cpi, int width, int height) { AV1_COMMON *const cm = &cpi->common; + const SequenceHeader *const seq_params = &cm->seq_params; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; int ref_frame; @@ -3782,17 +3915,19 @@ static void set_frame_size(AV1_COMP *cpi, int width, int height) { } // Reset the frame pointers to the current frame size. - if (aom_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, - cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, - cm->byte_alignment, NULL, NULL, NULL)) + if (aom_realloc_frame_buffer( + get_frame_new_buffer(cm), cm->width, cm->height, + seq_params->subsampling_x, seq_params->subsampling_y, + seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); const int frame_width = cm->superres_upscaled_width; const int frame_height = cm->superres_upscaled_height; - set_restoration_unit_size(frame_width, frame_height, cm->subsampling_x, - cm->subsampling_y, cm->rst_info); + set_restoration_unit_size(frame_width, frame_height, + seq_params->subsampling_x, + seq_params->subsampling_y, cm->rst_info); for (int i = 0; i < num_planes; ++i) cm->rst_info[i].frame_restoration_type = RESTORE_NONE; @@ -4038,16 +4173,16 @@ static void superres_post_encode(AV1_COMP *cpi) { // av1_superres_upscale if (aom_realloc_frame_buffer( &cpi->scaled_source, cm->superres_upscaled_width, - cm->superres_upscaled_height, cm->subsampling_x, cm->subsampling_y, - cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->byte_alignment, - NULL, NULL, NULL)) + cm->superres_upscaled_height, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y, cm->seq_params.use_highbitdepth, + AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error( &cm->error, AOM_CODEC_MEM_ERROR, "Failed to reallocate scaled source buffer for superres"); assert(cpi->scaled_source.y_crop_width == cm->superres_upscaled_width); assert(cpi->scaled_source.y_crop_height == cm->superres_upscaled_height); av1_resize_and_extend_frame(cpi->unscaled_source, &cpi->scaled_source, - (int)cm->bit_depth, num_planes); + (int)cm->seq_params.bit_depth, num_planes); cpi->source = &cpi->scaled_source; } } @@ -4331,7 +4466,7 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { int64_t high_err_target = cpi->ambient_err; int64_t low_err_target = cpi->ambient_err >> 1; - if (cm->use_highbitdepth) { + if (cm->seq_params.use_highbitdepth) { kf_err = aom_highbd_get_y_sse(cpi->source, get_frame_new_buffer(cm)); } else { kf_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm)); @@ -4574,7 +4709,11 @@ static void set_ext_overrides(AV1_COMP *cpi) { cpi->ext_refresh_frame_flags_pending = 0; } cpi->common.allow_ref_frame_mvs = cpi->ext_use_ref_frame_mvs; - cpi->common.error_resilient_mode = cpi->ext_use_error_resilient; + // A keyframe is already error resilient and keyframes with + // error_resilient_mode interferes with the use of show_existing_frame + // when forward reference keyframes are enabled. + cpi->common.error_resilient_mode = + cpi->ext_use_error_resilient && cpi->common.frame_type != KEY_FRAME; } static int setup_interp_filter_search_mask(AV1_COMP *cpi) { @@ -4725,10 +4864,17 @@ static void dump_filtered_recon_frames(AV1_COMP *cpi) { } #endif // DUMP_RECON_FRAMES +static INLINE int is_frame_droppable(AV1_COMP *cpi) { + return !(cpi->refresh_alt_ref_frame || cpi->refresh_alt2_ref_frame || + cpi->refresh_bwd_ref_frame || cpi->refresh_golden_frame || + cpi->refresh_last_frame); +} + static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, int skip_adapt, unsigned int *frame_flags) { AV1_COMMON *const cm = &cpi->common; + SequenceHeader *const seq_params = &cm->seq_params; const AV1EncoderConfig *const oxcf = &cpi->oxcf; struct segmentation *const seg = &cm->seg; @@ -4744,7 +4890,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, cm->large_scale_tile = cpi->oxcf.large_scale_tile; cm->single_tile_decoding = cpi->oxcf.single_tile_decoding; - if (cm->large_scale_tile) cm->seq_params.frame_id_numbers_present_flag = 0; + if (cm->large_scale_tile) seq_params->frame_id_numbers_present_flag = 0; cm->allow_ref_frame_mvs &= frame_might_allow_ref_frame_mvs(cm); // cm->allow_ref_frame_mvs needs to be written into the frame header while @@ -4756,7 +4902,8 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, cpi->oxcf.allow_warped_motion && frame_might_allow_warped_motion(cm); // Reset the frame packet stamp index. - if (cm->frame_type == KEY_FRAME) cm->current_video_frame = 0; + if (cm->frame_type == KEY_FRAME && cm->show_frame) + cm->current_video_frame = 0; // NOTE: // (1) Move the setup of the ref_frame_flags upfront as it would be @@ -4770,7 +4917,11 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, if (cm->show_existing_frame) { // NOTE(zoeliu): In BIDIR_PRED, the existing frame to show is the current // BWDREF_FRAME in the reference frame buffer. - cm->frame_type = INTER_FRAME; + if (cm->frame_type == KEY_FRAME) { + cm->reset_decoder_state = 1; + } else { + cm->frame_type = INTER_FRAME; + } cm->show_frame = 1; cpi->frame_flags = *frame_flags; @@ -4839,6 +4990,10 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, av1_rc_postencode_update(cpi, *size); } + // Decrement count down till next gf + if (cpi->rc.frames_till_gf_update_due > 0) + cpi->rc.frames_till_gf_update_due--; + ++cm->current_video_frame; return AOM_CODEC_OK; @@ -4889,7 +5044,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, MAX_MODES * sizeof(*cpi->mode_chosen_counts)); #endif - if (cm->seq_params.frame_id_numbers_present_flag) { + if (seq_params->frame_id_numbers_present_flag) { /* Non-normative definition of current_frame_id ("frame counter" with * wraparound) */ const int frame_id_length = FRAME_ID_LENGTH; @@ -4935,7 +5090,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, (frame_is_intra_only(cm) || !cm->show_frame) ? 0 : 1; break; } - cm->timing_info_present &= !cm->seq_params.reduced_still_picture_hdr; + cm->timing_info_present &= !seq_params->reduced_still_picture_hdr; if (cpi->sf.recode_loop == DISALLOW_RECODE) { if (encode_without_recode_loop(cpi) != AOM_CODEC_OK) return AOM_CODEC_ERROR; @@ -4957,7 +5112,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, // fixed interval. Note the reconstruction error if it is the frame before // the force key frame if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) { - if (cm->use_highbitdepth) { + if (seq_params->use_highbitdepth) { cpi->ambient_err = aom_highbd_get_y_sse(cpi->source, get_frame_new_buffer(cm)); } else { @@ -4966,17 +5121,19 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, } // If the encoder forced a KEY_FRAME decision or if frame is an S_FRAME - if (cm->frame_type == KEY_FRAME || frame_is_sframe(cm)) { + if ((cm->frame_type == KEY_FRAME && cm->show_frame) || frame_is_sframe(cm)) { cpi->refresh_last_frame = 1; } cm->frame_to_show = get_frame_new_buffer(cm); - cm->frame_to_show->color_primaries = cm->color_primaries; - cm->frame_to_show->transfer_characteristics = cm->transfer_characteristics; - cm->frame_to_show->matrix_coefficients = cm->matrix_coefficients; - cm->frame_to_show->monochrome = cm->seq_params.monochrome; - cm->frame_to_show->chroma_sample_position = cm->chroma_sample_position; - cm->frame_to_show->color_range = cm->color_range; + cm->frame_to_show->color_primaries = seq_params->color_primaries; + cm->frame_to_show->transfer_characteristics = + seq_params->transfer_characteristics; + cm->frame_to_show->matrix_coefficients = seq_params->matrix_coefficients; + cm->frame_to_show->monochrome = seq_params->monochrome; + cm->frame_to_show->chroma_sample_position = + seq_params->chroma_sample_position; + cm->frame_to_show->color_range = seq_params->color_range; cm->frame_to_show->render_width = cm->render_width; cm->frame_to_show->render_height = cm->render_height; @@ -5014,7 +5171,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, if (skip_adapt) return AOM_CODEC_OK; - if (cm->seq_params.frame_id_numbers_present_flag) { + if (seq_params->frame_id_numbers_present_flag) { int i; // Update reference frame id values based on the value of refresh_frame_mask for (i = 0; i < REF_FRAMES; i++) { @@ -5085,6 +5242,19 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, cm->seg.update_data = 0; cm->lf.mode_ref_delta_update = 0; + // A droppable frame might not be shown but it always + // takes a space in the gf group. Therefore, even when + // it is not shown, we still need update the count down. + + // TODO(weitinglin): This is a work-around to handle the condition + // when a frame is drop. We should fix the cm->show_frame flag + // instead of checking the other condition to update the counter properly. + if (cm->show_frame || is_frame_droppable(cpi)) { + // Decrement count down till next gf + if (cpi->rc.frames_till_gf_update_due > 0) + cpi->rc.frames_till_gf_update_due--; + } + if (cm->show_frame) { // TODO(zoeliu): We may only swamp mi and prev_mi for those frames that // are @@ -5092,6 +5262,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, swap_mi_and_prev_mi(cm); // Don't increment frame counters if this was an altref buffer // update not a real frame + ++cm->current_video_frame; } @@ -5160,10 +5331,45 @@ static int Pass2Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest, return AOM_CODEC_OK; } +#if CONFIG_DENOISE +static int apply_denoise_2d(AV1_COMP *cpi, YV12_BUFFER_CONFIG *sd, + int block_size, float noise_level, + int64_t time_stamp, int64_t end_time) { + AV1_COMMON *const cm = &cpi->common; + if (!cpi->denoise_and_model) { + cpi->denoise_and_model = aom_denoise_and_model_alloc( + cm->seq_params.bit_depth, block_size, noise_level); + if (!cpi->denoise_and_model) { + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Error allocating denoise and model"); + return -1; + } + } + if (!cpi->film_grain_table) { + cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table)); + if (!cpi->film_grain_table) { + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Error allocating grain table"); + return -1; + } + memset(cpi->film_grain_table, 0, sizeof(*cpi->film_grain_table)); + } + if (aom_denoise_and_model_run(cpi->denoise_and_model, sd, + &cm->film_grain_params)) { + if (cm->film_grain_params.apply_grain) { + aom_film_grain_table_append(cpi->film_grain_table, time_stamp, end_time, + &cm->film_grain_params); + } + } + return 0; +} +#endif + int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time) { AV1_COMMON *const cm = &cpi->common; + const SequenceHeader *const seq_params = &cm->seq_params; struct aom_usec_timer timer; int res = 0; const int subsampling_x = sd->subsampling_x; @@ -5174,25 +5380,33 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, aom_usec_timer_start(&timer); +#if CONFIG_DENOISE + if (cpi->oxcf.noise_level > 0) + if (apply_denoise_2d(cpi, sd, cpi->oxcf.noise_block_size, + cpi->oxcf.noise_level, time_stamp, end_time) < 0) + res = -1; +#endif // CONFIG_DENOISE + if (av1_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, use_highbitdepth, frame_flags)) res = -1; aom_usec_timer_mark(&timer); cpi->time_receive_data += aom_usec_timer_elapsed(&timer); - if ((cm->profile == PROFILE_0) && !cm->seq_params.monochrome && + if ((seq_params->profile == PROFILE_0) && !seq_params->monochrome && (subsampling_x != 1 || subsampling_y != 1)) { aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM, "Non-4:2:0 color format requires profile 1 or 2"); res = -1; } - if ((cm->profile == PROFILE_1) && + if ((seq_params->profile == PROFILE_1) && !(subsampling_x == 0 && subsampling_y == 0)) { aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM, "Profile 1 requires 4:4:4 color format"); res = -1; } - if ((cm->profile == PROFILE_2) && (cm->bit_depth <= AOM_BITS_10) && + if ((seq_params->profile == PROFILE_2) && + (seq_params->bit_depth <= AOM_BITS_10) && !(subsampling_x == 1 && subsampling_y == 0)) { aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM, "Profile 2 bit-depth < 10 requires 4:2:2 color format"); @@ -5364,9 +5578,9 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { #endif cpi->bytes += frame_bytes; - if (cm->use_highbitdepth) { + if (cm->seq_params.use_highbitdepth) { in_bit_depth = cpi->oxcf.input_bit_depth; - bit_depth = cm->bit_depth; + bit_depth = cm->seq_params.bit_depth; } if (cm->show_frame) { const YV12_BUFFER_CONFIG *orig = cpi->source; @@ -5387,7 +5601,7 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { cpi->total_samples += psnr.samples[0]; samples = psnr.samples[0]; // TODO(yaowu): unify these two versions into one. - if (cm->use_highbitdepth) + if (cm->seq_params.use_highbitdepth) frame_ssim2 = aom_highbd_calc_ssim(orig, recon, &weight, bit_depth, in_bit_depth); else @@ -5412,7 +5626,7 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { #endif } if (cpi->b_calculate_blockiness) { - if (!cm->use_highbitdepth) { + if (!cm->seq_params.use_highbitdepth) { const double frame_blockiness = av1_get_blockiness(orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height); @@ -5421,7 +5635,7 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { } if (cpi->b_calculate_consistency) { - if (!cm->use_highbitdepth) { + if (!cm->seq_params.use_highbitdepth) { const double this_inconsistency = aom_get_ssim_metrics( orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height, cpi->ssim_vars, &cpi->metrics, 1); @@ -5622,18 +5836,17 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, if (oxcf->large_scale_tile) cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; - cpi->refresh_last_frame = 1; - cpi->refresh_golden_frame = 0; - cpi->refresh_bwd_ref_frame = 0; - cpi->refresh_alt2_ref_frame = 0; - cpi->refresh_alt_ref_frame = 0; + // default reference buffers update config + av1_configure_buffer_updates_firstpass(cpi, LF_UPDATE); - // TODO(zoeliu@gmail.com): To support forward-KEY_FRAME and set up the - // following flag accordingly. + // Initialize fields related to forward keyframes + cpi->no_show_kf = 0; cm->reset_decoder_state = 0; // Don't allow a show_existing_frame to coincide with an error resilient or - // S-Frame + // S-Frame. An exception can be made in the case of a keyframe, since it + // does not depend on any previous frames. We must make this exception here + // because of the use of show_existing_frame with forward coded keyframes. struct lookahead_entry *lookahead_src = NULL; if (cm->current_video_frame > 0) lookahead_src = av1_lookahead_peek(cpi->lookahead, 0); @@ -5641,7 +5854,8 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, ((cpi->oxcf.error_resilient_mode | ((lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT) != 0)) || (cpi->oxcf.s_frame_mode | - ((lookahead_src->flags & AOM_EFLAG_SET_S_FRAME) != 0)))) { + ((lookahead_src->flags & AOM_EFLAG_SET_S_FRAME) != 0))) && + !(rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY))) { cm->show_existing_frame = 0; } @@ -5719,22 +5933,29 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, if ((source = av1_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) { cm->showable_frame = 1; cpi->alt_ref_source = source; - - if (oxcf->arnr_max_frames > 0) { - // Produce the filtered ARF frame. - av1_temporal_filter(cpi, arf_src_index); - aom_extend_frame_borders(&cpi->alt_ref_buffer, num_planes); - force_src_buffer = &cpi->alt_ref_buffer; + // When arf_src_index == rc->frames_to_key, it indicates a fwd_kf + if (arf_src_index == rc->frames_to_key) { + // Skip temporal filtering and mark as intra_only if we have a fwd_kf + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + int which_arf = gf_group->arf_update_idx[gf_group->index]; + cpi->is_arf_filter_off[which_arf] = 1; + cpi->no_show_kf = 1; + } else { + if (oxcf->arnr_max_frames > 0) { + // Produce the filtered ARF frame. + av1_temporal_filter(cpi, arf_src_index); + aom_extend_frame_borders(&cpi->alt_ref_buffer, num_planes); + force_src_buffer = &cpi->alt_ref_buffer; + } } - cm->show_frame = 0; cm->intra_only = 0; - cpi->refresh_alt_ref_frame = 1; - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 0; - cpi->refresh_bwd_ref_frame = 0; - cpi->refresh_alt2_ref_frame = 0; - rc->is_src_frame_alt_ref = 0; + + if (oxcf->pass < 2) { + // In second pass, the buffer updates configure will be set + // in the function av1_rc_get_second_pass_params + av1_configure_buffer_updates_firstpass(cpi, ARF_UPDATE); + } } rc->source_alt_ref_pending = 0; } @@ -5771,13 +5992,12 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, cm->show_frame = 0; cm->intra_only = 0; - cpi->refresh_alt2_ref_frame = 1; - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 0; - cpi->refresh_bwd_ref_frame = 0; - cpi->refresh_alt_ref_frame = 0; - rc->is_src_frame_alt_ref = 0; - rc->is_src_frame_ext_arf = 0; + + if (oxcf->pass < 2) { + // In second pass, the buffer updates configure will be set + // in the function av1_rc_get_second_pass_params + av1_configure_buffer_updates_firstpass(cpi, INTNL_ARF_UPDATE); + } } rc->source_alt_ref_pending = 0; } @@ -5791,13 +6011,11 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, cm->show_frame = 0; cm->intra_only = 0; - cpi->refresh_bwd_ref_frame = 1; - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 0; - cpi->refresh_alt2_ref_frame = 0; - cpi->refresh_alt_ref_frame = 0; - - rc->is_bwd_ref_frame = 1; + if (oxcf->pass < 2) { + // In second pass, the buffer updates configure will be set + // in the function av1_rc_get_second_pass_params + av1_configure_buffer_updates_firstpass(cpi, BIPRED_UPDATE); + } } } @@ -5865,16 +6083,18 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; cm->cur_frame->buf.buf_8bit_valid = 0; - if (cm->film_grain_table) { - cm->film_grain_params_present = aom_film_grain_table_lookup( - cm->film_grain_table, *time_stamp, *time_end, 0 /* erase */, + if (cpi->film_grain_table) { + cm->seq_params.film_grain_params_present = aom_film_grain_table_lookup( + cpi->film_grain_table, *time_stamp, *time_end, 0 /* =erase */, &cm->film_grain_params); } - cm->cur_frame->film_grain_params_present = cm->film_grain_params_present; + cm->cur_frame->film_grain_params_present = + cm->seq_params.film_grain_params_present; // only one operating point supported now - cpi->common.tu_presentation_delay = - ticks_to_timebase_units(timebase, *time_stamp); + const int64_t pts64 = ticks_to_timebase_units(timebase, *time_stamp); + if (pts64 < 0 || pts64 > UINT32_MAX) return AOM_CODEC_ERROR; + cpi->common.frame_presentation_time = (uint32_t)pts64; // Start with a 0 size frame. *size = 0; @@ -6004,8 +6224,8 @@ int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) { *dest = *cm->frame_to_show; dest->y_width = cm->width; dest->y_height = cm->height; - dest->uv_width = cm->width >> cm->subsampling_x; - dest->uv_height = cm->height >> cm->subsampling_y; + dest->uv_width = cm->width >> cm->seq_params.subsampling_x; + dest->uv_height = cm->height >> cm->seq_params.subsampling_y; ret = 0; } else { ret = -1; diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h index 5212db2b1..2b7ab711d 100644 --- a/third_party/aom/av1/encoder/encoder.h +++ b/third_party/aom/av1/encoder/encoder.h @@ -41,6 +41,9 @@ #include "aom_dsp/ssim.h" #endif #include "aom_dsp/variance.h" +#if CONFIG_DENOISE +#include "aom_dsp/noise_model.h" +#endif #include "aom/internal/aom_codec_internal.h" #include "aom_util/aom_thread.h" @@ -277,7 +280,7 @@ typedef struct AV1EncoderConfig { aom_timing_info_t timing_info; int decoder_model_info_present_flag; int display_model_info_present_flag; - int buffer_removal_delay_present; + int buffer_removal_time_present; aom_dec_model_info_t buffer_model; aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1]; aom_op_timing_info_t op_frame_timing[MAX_NUM_OPERATING_POINTS + 1]; @@ -301,6 +304,11 @@ typedef struct AV1EncoderConfig { int allow_warped_motion; int enable_superres; unsigned int save_as_annexb; + +#if CONFIG_DENOISE + float noise_level; + int noise_block_size; +#endif } AV1EncoderConfig; static INLINE int is_lossless_requested(const AV1EncoderConfig *cfg) { @@ -472,6 +480,7 @@ typedef struct AV1_COMP { AV1EncoderConfig oxcf; struct lookahead_ctx *lookahead; struct lookahead_entry *alt_ref_source; + int no_show_kf; int optimize_speed_feature; int optimize_seg_arr[MAX_SEGMENTS]; @@ -504,6 +513,9 @@ typedef struct AV1_COMP { int refresh_bwd_ref_frame; int refresh_alt2_ref_frame; int refresh_alt_ref_frame; +#if USE_SYMM_MULTI_LAYER + int new_bwdref_update_rule; +#endif int ext_refresh_frame_flags_pending; int ext_refresh_last_frame; @@ -666,7 +678,6 @@ typedef struct AV1_COMP { int existing_fb_idx_to_show; int is_arf_filter_off[MAX_EXT_ARFS + 1]; int num_extra_arfs; - int arf_map[MAX_EXT_ARFS + 1]; int arf_pos_in_gf[MAX_EXT_ARFS + 1]; int arf_pos_for_ovrly[MAX_EXT_ARFS + 1]; int global_motion_search_done; @@ -687,6 +698,11 @@ typedef struct AV1_COMP { AV1LfSync lf_row_sync; AV1LrSync lr_row_sync; AV1LrStruct lr_ctxt; + + aom_film_grain_table_t *film_grain_table; +#if CONFIG_DENOISE + struct aom_denoise_and_model_t *denoise_and_model; +#endif } AV1_COMP; void av1_initialize_enc(void); diff --git a/third_party/aom/av1/encoder/encodetxb.c b/third_party/aom/av1/encoder/encodetxb.c index 4d4802b46..81f360733 100644 --- a/third_party/aom/av1/encoder/encodetxb.c +++ b/third_party/aom/av1/encoder/encodetxb.c @@ -792,9 +792,8 @@ static AOM_FORCE_INLINE int warehouse_efficients_txb( } int av1_cost_coeffs_txb(const AV1_COMMON *const cm, const MACROBLOCK *x, - const int plane, const int blk_row, const int blk_col, - const int block, const TX_SIZE tx_size, - const TXB_CTX *const txb_ctx) { + const int plane, const int block, const TX_SIZE tx_size, + const TX_TYPE tx_type, const TXB_CTX *const txb_ctx) { const struct macroblock_plane *p = &x->plane[plane]; const int eob = p->eobs[block]; const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); @@ -806,8 +805,6 @@ int av1_cost_coeffs_txb(const AV1_COMMON *const cm, const MACROBLOCK *x, } const MACROBLOCKD *const xd = &x->e_mbd; - const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, - tx_size, cm->reduced_tx_set_used); const TX_CLASS tx_class = tx_type_to_class[tx_type]; #define WAREHOUSE_EFFICIENTS_TXB_CASE(tx_class_literal) \ @@ -1583,9 +1580,14 @@ int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, const int64_t rdmult = ((x->rdmult * plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8))) + 2) >> - (sharpness + (cpi->oxcf.aq_mode == VARIANCE_AQ && mbmi->segment_id < 4 - ? 7 - mbmi->segment_id - : 2)); + (sharpness + + (cpi->oxcf.aq_mode == VARIANCE_AQ && mbmi->segment_id < 4 + ? 7 - mbmi->segment_id + : 2) + + (cpi->oxcf.aq_mode != VARIANCE_AQ && + cpi->oxcf.deltaq_mode > NO_DELTA_Q && x->sb_energy_level < 0 + ? (3 - x->sb_energy_level) + : 0)); uint8_t levels_buf[TX_PAD_2D]; uint8_t *const levels = set_levels(levels_buf, width); diff --git a/third_party/aom/av1/encoder/encodetxb.h b/third_party/aom/av1/encoder/encodetxb.h index aa847ad62..0442cc613 100644 --- a/third_party/aom/av1/encoder/encodetxb.h +++ b/third_party/aom/av1/encoder/encodetxb.h @@ -50,9 +50,8 @@ typedef struct TxbInfo { void av1_alloc_txb_buf(AV1_COMP *cpi); void av1_free_txb_buf(AV1_COMP *cpi); int av1_cost_coeffs_txb(const AV1_COMMON *const cm, const MACROBLOCK *x, - const int plane, const int blk_row, const int blk_col, - const int block, const TX_SIZE tx_size, - const TXB_CTX *const txb_ctx); + const int plane, const int block, const TX_SIZE tx_size, + const TX_TYPE tx_type, const TXB_CTX *const txb_ctx); void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd, aom_writer *w, int blk_row, int blk_col, int plane, TX_SIZE tx_size, const tran_low_t *tcoeff, @@ -77,9 +76,10 @@ void av1_set_coeff_buffer(const AV1_COMP *const cpi, MACROBLOCK *const x, int mi_row, int mi_col); void hbt_destroy(); -int av1_optimize_txb_new(const AV1_COMP *cpi, MACROBLOCK *x, int plane, +int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, TX_TYPE tx_type, - const TXB_CTX *txb_ctx, int *rate_cost, int sharpness); + const TXB_CTX *const txb_ctx, int *rate_cost, + int sharpness); #ifdef __cplusplus } #endif diff --git a/third_party/aom/av1/encoder/ethread.c b/third_party/aom/av1/encoder/ethread.c index 404af2e7c..637d6824c 100644 --- a/third_party/aom/av1/encoder/ethread.c +++ b/third_party/aom/av1/encoder/ethread.c @@ -44,7 +44,7 @@ static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) { av1_encode_tile(cpi, thread_data->td, tile_row, tile_col); } - return 0; + return 1; } void av1_encode_tiles_mt(AV1_COMP *cpi) { @@ -126,12 +126,11 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) { for (i = 0; i < num_workers; i++) { AVxWorker *const worker = &cpi->workers[i]; - EncWorkerData *thread_data; + EncWorkerData *const thread_data = &cpi->tile_thr_data[i]; worker->hook = (AVxWorkerHook)enc_worker_hook; - worker->data1 = &cpi->tile_thr_data[i]; + worker->data1 = thread_data; worker->data2 = NULL; - thread_data = (EncWorkerData *)worker->data1; // Before encoding a frame, copy the thread data from cpi. if (thread_data->td != &cpi->td) { diff --git a/third_party/aom/av1/encoder/firstpass.c b/third_party/aom/av1/encoder/firstpass.c index 113c068c1..ef0800c79 100644 --- a/third_party/aom/av1/encoder/firstpass.c +++ b/third_party/aom/av1/encoder/firstpass.c @@ -486,6 +486,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { int mb_row, mb_col; MACROBLOCK *const x = &cpi->td.mb; AV1_COMMON *const cm = &cpi->common; + const SequenceHeader *const seq_params = &cm->seq_params; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; TileInfo tile; @@ -524,7 +525,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { double intra_factor; double brightness_factor; BufferPool *const pool = cm->buffer_pool; - const int qindex = find_fp_qindex(cm->bit_depth); + const int qindex = find_fp_qindex(seq_params->bit_depth); const int mb_scale = mi_size_wide[BLOCK_16X16]; int *raw_motion_err_list; @@ -555,11 +556,11 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { set_first_pass_params(cpi); av1_set_quantizer(cm, qindex); - av1_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y, - num_planes); + av1_setup_block_planes(&x->e_mbd, seq_params->subsampling_x, + seq_params->subsampling_y, num_planes); av1_setup_src_planes(x, cpi->source, 0, 0, num_planes); - av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, new_yv12, 0, 0, 0, + av1_setup_dst_planes(xd->plane, seq_params->sb_size, new_yv12, 0, 0, 0, num_planes); if (!frame_is_intra_only(cm)) { @@ -654,14 +655,14 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { image_data_start_row = mb_row; } - if (cm->use_highbitdepth) { - switch (cm->bit_depth) { + if (seq_params->use_highbitdepth) { + switch (seq_params->bit_depth) { case AOM_BITS_8: break; case AOM_BITS_10: this_error >>= 4; break; case AOM_BITS_12: this_error >>= 8; break; default: assert(0 && - "cm->bit_depth should be AOM_BITS_8, " + "seq_params->bit_depth should be AOM_BITS_8, " "AOM_BITS_10 or AOM_BITS_12"); return; } @@ -674,7 +675,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { else intra_factor += 1.0; - if (cm->use_highbitdepth) + if (seq_params->use_highbitdepth) level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0]; else level_sample = x->plane[0].src.buf[0]; @@ -1156,10 +1157,10 @@ static int get_twopass_worst_quality(const AV1_COMP *cpi, for (q = rc->best_quality; q < rc->worst_quality; ++q) { const double factor = calc_correction_factor( av_err_per_mb, ERR_DIVISOR - ediv_size_correction, FACTOR_PT_LOW, - FACTOR_PT_HIGH, q, cpi->common.bit_depth); + FACTOR_PT_HIGH, q, cpi->common.seq_params.bit_depth); const int bits_per_mb = av1_rc_bits_per_mb( INTER_FRAME, q, factor * speed_term * group_weight_factor, - cpi->common.bit_depth); + cpi->common.seq_params.bit_depth); if (bits_per_mb <= target_norm_bits_per_mb) break; } @@ -1377,7 +1378,7 @@ static double calc_frame_boost(AV1_COMP *cpi, const FIRSTPASS_STATS *this_frame, double this_frame_mv_in_out, double max_boost) { double frame_boost; const double lq = av1_convert_qindex_to_q( - cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth); + cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.seq_params.bit_depth); const double boost_q_correction = AOMMIN((0.5 + (lq * 0.015)), 1.5); int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs : cpi->common.MBs; @@ -2130,6 +2131,319 @@ static void define_gf_group_structure_16(AV1_COMP *cpi) { } #endif // USE_GF16_MULTI_LAYER +#if USE_SYMM_MULTI_LAYER +void check_frame_params(GF_GROUP *const gf_group, int gf_interval, + int frame_nums) { + static const char *update_type_strings[] = { + "KF_UPDATE", "LF_UPDATE", "GF_UPDATE", + "ARF_UPDATE", "OVERLAY_UPDATE", "BRF_UPDATE", + "LAST_BIPRED_UPDATE", "BIPRED_UPDATE", "INTNL_OVERLAY_UPDATE", + "INTNL_ARF_UPDATE" + }; + FILE *fid = fopen("GF_PARAMS.txt", "a"); + + fprintf(fid, "\n{%d}\n", gf_interval); + for (int i = 0; i <= frame_nums; ++i) { + fprintf(fid, "%s %d %d %d %d\n", + update_type_strings[gf_group->update_type[i]], + gf_group->arf_src_offset[i], gf_group->arf_pos_in_gf[i], + gf_group->arf_update_idx[i], gf_group->pyramid_level[i]); + } + fclose(fid); +} + +static int update_type_2_rf_level(FRAME_UPDATE_TYPE update_type) { + // Derive rf_level from update_type + switch (update_type) { + case LF_UPDATE: return INTER_NORMAL; + case ARF_UPDATE: return GF_ARF_STD; + case OVERLAY_UPDATE: return INTER_NORMAL; + case BRF_UPDATE: return GF_ARF_LOW; + case LAST_BIPRED_UPDATE: return INTER_NORMAL; + case BIPRED_UPDATE: return INTER_NORMAL; + case INTNL_ARF_UPDATE: return GF_ARF_LOW; + case INTNL_OVERLAY_UPDATE: return INTER_NORMAL; + default: return INTER_NORMAL; + } +} + +static void set_multi_layer_params(GF_GROUP *const gf_group, int l, int r, + int *frame_ind, int arf_ind, int level) { + if (r - l == 2) { + // leaf node, not a look-ahead frame + gf_group->update_type[*frame_ind] = LF_UPDATE; + gf_group->arf_src_offset[*frame_ind] = 0; + gf_group->arf_pos_in_gf[*frame_ind] = 0; + gf_group->arf_update_idx[*frame_ind] = arf_ind; + gf_group->pyramid_level[*frame_ind] = level; + ++(*frame_ind); + } else { + int m = (l + r) / 2; + int arf_pos_in_gf = *frame_ind; + + gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE; + gf_group->arf_src_offset[*frame_ind] = m - l - 1; + gf_group->arf_pos_in_gf[*frame_ind] = 0; + gf_group->arf_update_idx[*frame_ind] = 1; // mark all internal ARF 1 + gf_group->pyramid_level[*frame_ind] = level; + ++(*frame_ind); + + // set parameters for frames displayed before this frame + set_multi_layer_params(gf_group, l, m, frame_ind, 1, level - 1); + + // for overlay frames, we need to record the position of its corresponding + // arf frames for bit allocation + gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE; + gf_group->arf_src_offset[*frame_ind] = 0; + gf_group->arf_pos_in_gf[*frame_ind] = arf_pos_in_gf; + gf_group->arf_update_idx[*frame_ind] = 1; + gf_group->pyramid_level[*frame_ind] = 0; + ++(*frame_ind); + + // set parameters for frames displayed after this frame + set_multi_layer_params(gf_group, m, r, frame_ind, arf_ind, level - 1); + } +} + +static INLINE unsigned char get_pyramid_height(int pyramid_width) { + assert(pyramid_width <= 16 && pyramid_width >= 4 && + "invalid gf interval for pyramid structure"); + + return pyramid_width == 16 ? 4 : (pyramid_width >= 8 ? 3 : 2); +} + +static int construct_multi_layer_gf_structure(GF_GROUP *const gf_group, + const int gf_interval) { + int frame_index = 0; + gf_group->pyramid_height = get_pyramid_height(gf_interval); + + // At the beginning of each GF group it will be a key or overlay frame, + gf_group->update_type[frame_index] = OVERLAY_UPDATE; + gf_group->arf_src_offset[frame_index] = 0; + gf_group->arf_pos_in_gf[frame_index] = 0; + gf_group->arf_update_idx[frame_index] = 0; + gf_group->pyramid_level[frame_index] = 0; + ++frame_index; + + // ALT0 + gf_group->update_type[frame_index] = ARF_UPDATE; + gf_group->arf_src_offset[frame_index] = gf_interval - 1; + gf_group->arf_pos_in_gf[frame_index] = 0; + gf_group->arf_update_idx[frame_index] = 0; + gf_group->pyramid_level[frame_index] = gf_group->pyramid_height; + ++frame_index; + + // set parameters for the rest of the frames + set_multi_layer_params(gf_group, 0, gf_interval, &frame_index, 0, + gf_group->pyramid_height - 1); + + // check_frame_params(gf_group, gf_interval, frame_index); + + return frame_index; +} + +void define_customized_gf_group_structure(AV1_COMP *cpi) { + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->twopass; + GF_GROUP *const gf_group = &twopass->gf_group; + const int key_frame = cpi->common.frame_type == KEY_FRAME; + + assert(rc->baseline_gf_interval == 4 || rc->baseline_gf_interval == 8 || + rc->baseline_gf_interval == 16); + + const int gf_update_frames = + construct_multi_layer_gf_structure(gf_group, rc->baseline_gf_interval); + int frame_index; + + cpi->num_extra_arfs = 0; + + for (frame_index = 0; frame_index < gf_update_frames; ++frame_index) { + // Set unused variables to default values + gf_group->bidir_pred_enabled[frame_index] = 0; + gf_group->brf_src_offset[frame_index] = 0; + + // Special handle for the first frame for assigning update_type + if (frame_index == 0) { + // For key frames the frame target rate is already set and it + // is also the golden frame. + if (key_frame) { + gf_group->update_type[frame_index] = KF_UPDATE; + continue; + } + + if (rc->source_alt_ref_active) { + gf_group->update_type[frame_index] = OVERLAY_UPDATE; + } else { + gf_group->update_type[frame_index] = GF_UPDATE; + } + } else { + if (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE) + ++cpi->num_extra_arfs; + } + + // Assign rf level based on update type + gf_group->rf_level[frame_index] = + update_type_2_rf_level(gf_group->update_type[frame_index]); + } + + // NOTE: We need to configure the frame at the end of the sequence + 1 that + // will be the start frame for the next group. Otherwise prior to the + // call to av1_rc_get_second_pass_params() the data will be undefined. + if (rc->source_alt_ref_pending) { + gf_group->update_type[frame_index] = OVERLAY_UPDATE; + gf_group->rf_level[frame_index] = INTER_NORMAL; + } else { + gf_group->update_type[frame_index] = GF_UPDATE; + gf_group->rf_level[frame_index] = GF_ARF_STD; + } + + gf_group->bidir_pred_enabled[frame_index] = 0; + gf_group->brf_src_offset[frame_index] = 0; + gf_group->arf_update_idx[frame_index] = 0; + // This value is only used for INTNL_OVERLAY_UPDATE + gf_group->arf_pos_in_gf[frame_index] = 0; + + // This parameter is useless? + gf_group->arf_ref_idx[frame_index] = 0; + + check_frame_params(gf_group, rc->baseline_gf_interval, gf_update_frames); +} + +// It is an example of how to define a GF stucture manually. The function will +// result in exactly the same GF group structure as +// define_customized_gf_group_structure() when rc->baseline_gf_interval == 4 +#if USE_MANUAL_GF4_STRUCT +#define GF_INTERVAL_4 4 +static const unsigned char gf4_multi_layer_params[][GF_FRAME_PARAMS] = { + { + // gf_group->index == 0 (Frame 0) + // It can also be KEY frame. Will assign the proper value + // in define_gf_group_structure + OVERLAY_UPDATE, // update_type (default value) + 0, // arf_src_offset + 0, // arf_pos_in_gf + 0 // arf_update_idx + }, + { + // gf_group->index == 1 (Frame 4) + ARF_UPDATE, // update_type + GF_INTERVAL_4 - 1, // arf_src_offset + 0, // arf_pos_in_gf + 0 // arf_update_idx + }, + { + // gf_group->index == 2 (Frame 2) + INTNL_ARF_UPDATE, // update_type + (GF_INTERVAL_4 >> 1) - 1, // arf_src_offset + 0, // arf_pos_in_gf + 0 // arf_update_idx + }, + { + // gf_group->index == 3 (Frame 1) + LAST_BIPRED_UPDATE, // update_type + 0, // arf_src_offset + 0, // arf_pos_in_gf + 0 // arf_update_idx + }, + + { + // gf_group->index == 4 (Frame 2 - OVERLAY) + INTNL_OVERLAY_UPDATE, // update_type + 0, // arf_src_offset + 2, // arf_pos_in_gf + 0 // arf_update_idx + }, + { + // gf_group->index == 5 (Frame 3) + LF_UPDATE, // update_type + 0, // arf_src_offset + 0, // arf_pos_in_gf + 1 // arf_update_idx + } +}; + +static int define_gf_group_structure_4(AV1_COMP *cpi) { + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->twopass; + GF_GROUP *const gf_group = &twopass->gf_group; + const int key_frame = cpi->common.frame_type == KEY_FRAME; + + assert(rc->baseline_gf_interval == GF_INTERVAL_4); + + const int gf_update_frames = rc->baseline_gf_interval + 2; + int frame_index; + + for (frame_index = 0; frame_index < gf_update_frames; ++frame_index) { + int param_idx = 0; + + gf_group->bidir_pred_enabled[frame_index] = 0; + + if (frame_index == 0) { + // gf_group->arf_src_offset[frame_index] = 0; + gf_group->brf_src_offset[frame_index] = 0; + gf_group->bidir_pred_enabled[frame_index] = 0; + + // For key frames the frame target rate is already set and it + // is also the golden frame. + if (key_frame) continue; + + gf_group->update_type[frame_index] = + gf4_multi_layer_params[frame_index][param_idx++]; + + if (rc->source_alt_ref_active) { + gf_group->update_type[frame_index] = OVERLAY_UPDATE; + } else { + gf_group->update_type[frame_index] = GF_UPDATE; + } + param_idx++; + } else { + gf_group->update_type[frame_index] = + gf4_multi_layer_params[frame_index][param_idx++]; + } + + // setup other parameters + gf_group->rf_level[frame_index] = + update_type_2_rf_level(gf_group->update_type[frame_index]); + + // == arf_src_offset == + gf_group->arf_src_offset[frame_index] = + gf4_multi_layer_params[frame_index][param_idx++]; + + // == arf_pos_in_gf == + gf_group->arf_pos_in_gf[frame_index] = + gf4_multi_layer_params[frame_index][param_idx++]; + + // == arf_update_idx == + gf_group->brf_src_offset[frame_index] = + gf4_multi_layer_params[frame_index][param_idx]; + } + + // NOTE: We need to configure the frame at the end of the sequence + 1 that + // will be the start frame for the next group. Otherwise prior to the + // call to av1_rc_get_second_pass_params() the data will be undefined. + gf_group->arf_update_idx[frame_index] = 0; + gf_group->arf_ref_idx[frame_index] = 0; + + if (rc->source_alt_ref_pending) { + gf_group->update_type[frame_index] = OVERLAY_UPDATE; + gf_group->rf_level[frame_index] = INTER_NORMAL; + + } else { + gf_group->update_type[frame_index] = GF_UPDATE; + gf_group->rf_level[frame_index] = GF_ARF_STD; + } + + gf_group->bidir_pred_enabled[frame_index] = 0; + gf_group->brf_src_offset[frame_index] = 0; + + // This value is only used for INTNL_OVERLAY_UPDATE + gf_group->arf_pos_in_gf[frame_index] = 0; + + return gf_update_frames; +} +#endif // USE_MANUAL_GF4_STRUCT +#endif // USE_SYMM_MULTI_LAYER + static void define_gf_group_structure(AV1_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; @@ -2139,6 +2453,25 @@ static void define_gf_group_structure(AV1_COMP *cpi) { return; } #endif // USE_GF16_MULTI_LAYER +#if USE_SYMM_MULTI_LAYER + const int valid_customized_gf_length = rc->baseline_gf_interval == 4 || + rc->baseline_gf_interval == 8 || + rc->baseline_gf_interval == 16; + // used the new structure only if extra_arf is allowed + if (valid_customized_gf_length && rc->source_alt_ref_pending && + cpi->extra_arf_allowed > 0) { +#if USE_MANUAL_GF4_STRUCT + if (rc->baseline_gf_interval == 4) + define_gf_group_structure_4(cpi); + else +#endif + define_customized_gf_group_structure(cpi); + cpi->new_bwdref_update_rule = 1; + return; + } else { + cpi->new_bwdref_update_rule = 0; + } +#endif TWO_PASS *const twopass = &cpi->twopass; GF_GROUP *const gf_group = &twopass->gf_group; @@ -2322,9 +2655,8 @@ static void define_gf_group_structure(AV1_COMP *cpi) { } // NOTE: We need to configure the frame at the end of the sequence + 1 that - // will - // be the start frame for the next group. Otherwise prior to the call to - // av1_rc_get_second_pass_params() the data will be undefined. + // will be the start frame for the next group. Otherwise prior to the + // call to av1_rc_get_second_pass_params() the data will be undefined. gf_group->arf_update_idx[frame_index] = 0; gf_group->arf_ref_idx[frame_index] = 0; @@ -2438,6 +2770,17 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits, // TODO(zoeliu): To investigate whether the allocated bits on // BIPRED_UPDATE frames need to be further adjusted. gf_group->bit_allocation[frame_index] = target_frame_size; +#if USE_SYMM_MULTI_LAYER + } else if (cpi->new_bwdref_update_rule == 1 && + gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE) { + int arf_pos = gf_group->arf_pos_in_gf[frame_index]; + gf_group->bit_allocation[frame_index] = 0; + + // Tried boosting up the allocated bits on backward reference frame + // by (target_frame_size >> 2) as in the original setting. However it + // does not bring gains for pyramid structure with GF length = 16. + gf_group->bit_allocation[arf_pos] = target_frame_size; +#endif } else { assert(gf_group->update_type[frame_index] == LF_UPDATE || gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE); @@ -2453,10 +2796,11 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits, } } - // NOTE: We need to configure the frame at the end of the sequence + 1 that - // will be the start frame for the next group. Otherwise prior to the - // call to av1_rc_get_second_pass_params() the data will be undefined. +#if USE_SYMM_MULTI_LAYER + if (cpi->new_bwdref_update_rule == 0 && rc->source_alt_ref_pending) { +#else if (rc->source_alt_ref_pending) { +#endif if (cpi->num_extra_arfs) { // NOTE: For bit allocation, move the allocated bits associated with // INTNL_OVERLAY_UPDATE to the corresponding INTNL_ARF_UPDATE. @@ -2489,7 +2833,10 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { int i; double boost_score = 0.0; +#if !FIX_GF_INTERVAL_LENGTH double old_boost_score = 0.0; + double mv_ratio_accumulator_thresh; +#endif double gf_group_err = 0.0; #if GROUP_ADAPTIVE_MAXQ double gf_group_raw_error = 0.0; @@ -2509,7 +2856,7 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { double this_frame_mv_in_out = 0.0; double mv_in_out_accumulator = 0.0; double abs_mv_in_out_accumulator = 0.0; - double mv_ratio_accumulator_thresh; + unsigned int allow_alt_ref = is_altref_enabled(cpi); int f_boost = 0; @@ -2551,18 +2898,18 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { gf_group_skip_pct -= this_frame->intra_skip_pct; gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows; } - +#if !FIX_GF_INTERVAL_LENGTH // Motion breakout threshold for loop below depends on image size. mv_ratio_accumulator_thresh = (cpi->initial_height + cpi->initial_width) / 4.0; - +#endif // Set a maximum and minimum interval for the GF group. // If the image appears almost completely static we can extend beyond this. { - int int_max_q = (int)(av1_convert_qindex_to_q(twopass->active_worst_quality, - cpi->common.bit_depth)); - int int_lbq = (int)(av1_convert_qindex_to_q(rc->last_boosted_qindex, - cpi->common.bit_depth)); + int int_max_q = (int)(av1_convert_qindex_to_q( + twopass->active_worst_quality, cpi->common.seq_params.bit_depth)); + int int_lbq = (int)(av1_convert_qindex_to_q( + rc->last_boosted_qindex, cpi->common.seq_params.bit_depth)); active_min_gf_interval = rc->min_gf_interval + AOMMIN(2, int_max_q / 200); if (active_min_gf_interval > rc->max_gf_interval) @@ -2643,7 +2990,10 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { boost_score += decay_accumulator * calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST); - +#if FIX_GF_INTERVAL_LENGTH + if (i == (FIXED_GF_LENGTH + 1)) break; +#else + // Skip breaking condition for FIX_GF_INTERVAL_LENGTH // Break out conditions. if ( // Break at active_max_gf_interval unless almost totally static. @@ -2666,9 +3016,9 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { break; } } - - *this_frame = next_frame; old_boost_score = boost_score; +#endif // FIX_GF_INTERVAL_LENGTH + *this_frame = next_frame; } twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0); @@ -2693,7 +3043,18 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { } // Set the interval until the next gf. - rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending); + if (cpi->oxcf.fwd_kf_enabled) { + // Ensure the gf group before the next keyframe will contain an altref + if ((rc->frames_to_key - i < rc->min_gf_interval) && + (rc->frames_to_key != i)) { + rc->baseline_gf_interval = AOMMIN(rc->frames_to_key - rc->min_gf_interval, + rc->static_scene_max_gf_interval); + } else { + rc->baseline_gf_interval = i; + } + } else { + rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending); + } if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count; // Disable extra altrefs and backward refs for "still" gf group: @@ -2711,12 +3072,23 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (!cpi->extra_arf_allowed) { cpi->num_extra_arfs = 0; } else { +#if USE_SYMM_MULTI_LAYER + if (rc->baseline_gf_interval == 4 && rc->source_alt_ref_pending) + cpi->num_extra_arfs = 1; + else + cpi->num_extra_arfs = get_number_of_extra_arfs( + rc->baseline_gf_interval, rc->source_alt_ref_pending); +#else // Compute how many extra alt_refs we can have cpi->num_extra_arfs = get_number_of_extra_arfs(rc->baseline_gf_interval, rc->source_alt_ref_pending); +#endif // USE_SYMM_MULTI_LAYER } + +#if !USE_SYMM_MULTI_LAYER // Currently at maximum two extra ARFs' are allowed assert(cpi->num_extra_arfs <= MAX_EXT_ARFS); +#endif rc->frames_till_gf_update_due = rc->baseline_gf_interval; @@ -3393,12 +3765,66 @@ static void configure_buffer_updates(AV1_COMP *cpi) { case INTNL_ARF_UPDATE: cpi->refresh_last_frame = 0; cpi->refresh_golden_frame = 0; +#if USE_SYMM_MULTI_LAYER + if (cpi->new_bwdref_update_rule == 1) { + cpi->refresh_bwd_ref_frame = 1; + cpi->refresh_alt2_ref_frame = 0; + } else { +#endif + cpi->refresh_bwd_ref_frame = 0; + cpi->refresh_alt2_ref_frame = 1; +#if USE_SYMM_MULTI_LAYER + } +#endif + cpi->refresh_alt_ref_frame = 0; + break; + + default: assert(0); break; + } +} + +void av1_configure_buffer_updates_firstpass(AV1_COMP *cpi, + FRAME_UPDATE_TYPE update_type) { + RATE_CONTROL *rc = &cpi->rc; + + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_bwd_ref_frame = 0; + cpi->refresh_alt2_ref_frame = 0; + cpi->refresh_alt_ref_frame = 0; + + rc->is_bwd_ref_frame = 0; + + switch (update_type) { + case ARF_UPDATE: + cpi->refresh_alt_ref_frame = 1; + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 0; cpi->refresh_bwd_ref_frame = 0; + cpi->refresh_alt2_ref_frame = 0; + + rc->is_src_frame_alt_ref = 0; + break; + case INTNL_ARF_UPDATE: cpi->refresh_alt2_ref_frame = 1; + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 0; + cpi->refresh_bwd_ref_frame = 0; cpi->refresh_alt_ref_frame = 0; + rc->is_src_frame_alt_ref = 0; + rc->is_src_frame_ext_arf = 0; + break; + case BIPRED_UPDATE: + cpi->refresh_bwd_ref_frame = 1; + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt2_ref_frame = 0; + cpi->refresh_alt_ref_frame = 0; - default: assert(0); break; + rc->is_bwd_ref_frame = 1; + break; + default: break; } } @@ -3444,7 +3870,12 @@ void av1_rc_get_second_pass_params(AV1_COMP *cpi) { target_rate = av1_rc_clamp_pframe_target_size(cpi, target_rate); rc->base_frame_target = target_rate; - cm->frame_type = INTER_FRAME; + if (cpi->no_show_kf) { + assert(gf_group->update_type[gf_group->index] == ARF_UPDATE); + cm->frame_type = KEY_FRAME; + } else { + cm->frame_type = INTER_FRAME; + } // Do the firstpass stats indicate that this frame is skippable for the // partition search? @@ -3479,7 +3910,7 @@ void av1_rc_get_second_pass_params(AV1_COMP *cpi) { twopass->baseline_active_worst_quality = tmp_q; rc->ni_av_qi = tmp_q; rc->last_q[INTER_FRAME] = tmp_q; - rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->bit_depth); + rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params.bit_depth); rc->avg_frame_qindex[INTER_FRAME] = tmp_q; rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2; rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME]; diff --git a/third_party/aom/av1/encoder/firstpass.h b/third_party/aom/av1/encoder/firstpass.h index 4ff0f73b0..b0c1a21e4 100644 --- a/third_party/aom/av1/encoder/firstpass.h +++ b/third_party/aom/av1/encoder/firstpass.h @@ -122,6 +122,11 @@ typedef struct { unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1]; unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1]; unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1]; +#if USE_SYMM_MULTI_LAYER + unsigned char arf_pos_in_gf[(MAX_LAG_BUFFERS * 2) + 1]; + unsigned char pyramid_level[(MAX_LAG_BUFFERS * 2) + 1]; + unsigned char pyramid_height; +#endif unsigned char brf_src_offset[(MAX_LAG_BUFFERS * 2) + 1]; unsigned char bidir_pred_enabled[(MAX_LAG_BUFFERS * 2) + 1]; unsigned char ref_fb_idx_map[(MAX_LAG_BUFFERS * 2) + 1][REF_FRAMES]; @@ -186,6 +191,8 @@ void av1_end_first_pass(struct AV1_COMP *cpi); void av1_init_second_pass(struct AV1_COMP *cpi); void av1_rc_get_second_pass_params(struct AV1_COMP *cpi); +void av1_configure_buffer_updates_firstpass(struct AV1_COMP *cpi, + FRAME_UPDATE_TYPE update_type); // Post encode update of the rate control parameters for 2-pass void av1_twopass_postencode_update(struct AV1_COMP *cpi); diff --git a/third_party/aom/av1/encoder/hash_motion.c b/third_party/aom/av1/encoder/hash_motion.c index 5a8f8cbba..f2ff5b495 100644 --- a/third_party/aom/av1/encoder/hash_motion.c +++ b/third_party/aom/av1/encoder/hash_motion.c @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + #include <assert.h> #include "config/av1_rtcd.h" diff --git a/third_party/aom/av1/encoder/ab_partition_model_weights.h b/third_party/aom/av1/encoder/partition_model_weights.h index 5b918fae2..279d39495 100644 --- a/third_party/aom/av1/encoder/ab_partition_model_weights.h +++ b/third_party/aom/av1/encoder/partition_model_weights.h @@ -1311,6 +1311,481 @@ static const NN_CONFIG av1_ab_partition_nnconfig_16 = { #undef FEATURE_SIZE #undef LABEL_SIZE +#define FEATURE_SIZE 18 +#define LABEL_SIZE 4 + +static const float av1_4_partition_nn_weights_16_layer0[FEATURE_SIZE * 48] = { + 0.121894f, 0.058485f, 0.702226f, 0.015457f, -0.123380f, -0.573450f, + 0.319576f, 0.118808f, 0.166057f, 0.526984f, 0.015211f, -0.025050f, + 0.085717f, -0.028221f, -0.580062f, -0.270530f, -0.092371f, 0.037679f, + 0.083573f, 0.007112f, -0.358623f, -0.264443f, -0.064819f, 0.022013f, + -0.040077f, -0.291967f, -0.293100f, 0.072266f, -0.270572f, -0.292253f, + -0.260105f, -0.294472f, -0.275752f, 0.054315f, 0.000085f, 0.105115f, + -0.363572f, -0.016542f, 0.185943f, -0.359903f, 0.038765f, -0.377668f, + 0.172692f, 0.127749f, -0.031275f, -0.242528f, -0.145880f, -0.055247f, + -0.000265f, -0.355224f, 0.089917f, -0.377841f, -0.209766f, 0.030899f, + 0.039546f, -0.375030f, -0.041605f, 0.137677f, 0.021282f, -0.150442f, + -0.189445f, 0.009293f, -0.316033f, 0.038745f, -0.278761f, 0.005692f, + -0.071763f, -0.302936f, -0.224572f, -0.211841f, 0.057503f, 0.005435f, + -0.930979f, 0.115513f, 0.689958f, 0.221318f, 1.003891f, 0.359540f, + -0.640534f, -0.162373f, -0.118105f, 0.205587f, 0.019710f, 0.025067f, + -0.025344f, 0.002831f, 0.033078f, 0.040175f, -0.007502f, 0.026272f, + 0.083443f, -0.880884f, 0.436948f, 0.293297f, 0.051678f, -0.133328f, + -0.180323f, 0.667835f, 0.070733f, -0.003060f, -0.221804f, 0.146601f, + 0.064024f, 0.056758f, -0.077361f, 0.105587f, -0.185500f, -0.133552f, + 0.138269f, 0.165055f, 0.628284f, 0.846449f, 0.058825f, 0.223157f, + 0.277896f, -0.381303f, 0.408241f, 0.643301f, 0.067494f, 0.120822f, + -0.182491f, -0.111373f, -0.033374f, 0.131387f, -0.114654f, 0.114318f, + 0.094718f, -0.052232f, 0.385903f, 1.212304f, 0.425305f, -0.052993f, + 0.291474f, -0.319730f, 0.023090f, -0.317259f, 0.011181f, -0.034185f, + -0.100671f, 0.186185f, -0.432511f, -0.115957f, -0.067746f, -0.177810f, + -0.226700f, 0.004464f, 0.006809f, 0.171360f, -0.080723f, 0.099826f, + -0.062301f, -0.358755f, -0.202549f, -0.084616f, -0.042313f, -0.325560f, + 0.010452f, -0.341089f, -0.013566f, -0.340129f, 0.034675f, -0.036518f, + -0.036473f, -0.192892f, 0.650235f, 0.609437f, -0.160982f, 0.125535f, + -1.004575f, 0.521969f, 1.318091f, 0.614004f, -0.106622f, -0.077453f, + -0.037328f, -0.081940f, 0.007640f, 0.026654f, -0.080332f, -0.077356f, + -0.288170f, -0.319680f, -0.131712f, -0.150985f, 0.073218f, 0.089502f, + -0.280502f, 0.003941f, -0.249937f, 0.244263f, 0.023269f, 0.080263f, + 0.073172f, -0.200036f, 0.022381f, 0.008592f, -0.339517f, -0.135073f, + 0.177199f, 0.208363f, 0.652360f, 0.272990f, 0.609535f, 0.145805f, + 0.022527f, -0.088378f, 0.205008f, 0.101021f, -0.019673f, -0.252681f, + 0.116034f, -0.062052f, 0.009991f, 0.138933f, -0.182428f, 0.052542f, + -0.350825f, -0.122654f, -0.154687f, 0.066747f, 0.021541f, -0.212169f, + -0.087093f, -0.087488f, 0.178129f, -0.146544f, 0.013919f, -0.273899f, + 0.223753f, -0.187327f, -0.118795f, -0.191892f, -0.355979f, 0.023794f, + -0.135236f, 0.058918f, 0.069080f, 0.279287f, 0.369689f, 1.134526f, + 0.659511f, 0.250223f, 0.286040f, 0.515284f, 0.067791f, -0.156385f, + 0.143283f, 0.050884f, 0.089956f, -0.040850f, -0.003650f, -0.081162f, + 0.086004f, 0.116578f, 0.826254f, 0.504869f, -0.196022f, -0.207279f, + 0.200503f, -0.196801f, 0.008211f, 0.411158f, -0.075855f, -0.036690f, + 0.111519f, -0.057838f, -0.005846f, 0.111067f, 0.174712f, -0.078054f, + 0.765897f, 0.018670f, -0.306960f, -0.020034f, -0.332875f, 0.662707f, + -0.461233f, -1.007542f, -0.693995f, -1.243352f, -0.014745f, 0.004036f, + -0.009141f, 0.003325f, -0.011233f, -0.000819f, 0.006369f, 0.002418f, + -0.035906f, -0.005135f, 1.073830f, 1.020736f, -0.182611f, -1.038976f, + -0.226695f, -0.375663f, 0.364568f, 0.620995f, -0.018615f, 0.011347f, + 0.045786f, 0.041077f, 0.010886f, -0.148428f, 0.028007f, -0.022322f, + -0.165985f, 0.233315f, -0.277531f, -0.329683f, -0.516967f, -0.390750f, + 0.006948f, 0.133744f, -0.375681f, -0.116877f, -0.009441f, -0.008597f, + -0.160679f, 0.102150f, -0.142647f, -0.117501f, 0.035035f, 0.228687f, + -1.117397f, -0.005171f, -0.008708f, 0.413042f, -0.298532f, 0.614909f, + -0.181084f, -0.711770f, 0.344033f, 0.287220f, -0.112848f, -0.052866f, + -0.222466f, 0.025029f, -0.107558f, 0.137036f, -0.276661f, -0.038808f, + -0.057448f, 0.037563f, 0.526020f, 0.447997f, 0.288366f, 0.264815f, + 0.319974f, -0.193091f, 0.353830f, 0.412950f, -0.280454f, 0.092737f, + 0.070919f, 0.043336f, 0.041214f, -0.052147f, 0.010860f, 0.191325f, + 0.079783f, -0.425672f, -0.053469f, -0.005495f, 0.184526f, -0.166171f, + 0.084459f, -0.042165f, -0.261759f, -0.248723f, -0.073483f, -0.377884f, + -0.189614f, -0.054146f, -0.261279f, 0.196347f, -0.087568f, 0.070533f, + -0.145492f, -0.041500f, -0.465861f, 0.077369f, 0.020645f, -0.440232f, + -0.414585f, -0.168627f, -0.050011f, -0.336676f, -0.344943f, -0.288140f, + 0.085513f, -0.200425f, 0.218516f, 0.049604f, -0.280952f, -0.242674f, + -1.969931f, 0.013374f, -0.039643f, 1.113947f, 0.018568f, 0.916330f, + -0.302934f, -0.225816f, 0.189529f, -0.361971f, 0.021073f, -0.050143f, + -0.041415f, 0.015126f, 0.018091f, -0.082401f, 0.017152f, 0.064856f, + 0.156170f, 0.145323f, -0.281409f, 0.213357f, -0.058966f, 0.158668f, + 0.033742f, 0.378820f, -0.662875f, -0.455532f, -0.702928f, 0.234325f, + 0.139627f, -1.360650f, 0.040921f, -0.044373f, -0.059999f, -0.048565f, + 0.115339f, -0.105888f, -0.170567f, -0.206097f, -0.349537f, 0.107941f, + -0.356286f, -0.374928f, 0.143257f, -0.317790f, 0.079875f, -0.359345f, + 0.081321f, -0.219772f, -0.077213f, 0.110624f, -0.252329f, -0.266481f, + 0.190135f, 0.121214f, 0.661064f, -0.037820f, -0.373068f, -0.065209f, + -0.286154f, -0.120695f, -0.110670f, -0.193589f, -0.010867f, -0.048054f, + -0.032010f, 0.110627f, 0.054094f, -0.884309f, -1.171623f, -0.386911f, + -0.756058f, 0.030362f, 0.563628f, -0.334227f, -0.111213f, 1.143898f, + -0.940454f, 0.084510f, 0.671010f, 0.312244f, -0.052592f, -0.014376f, + 0.039965f, -0.010763f, -0.114936f, -0.146020f, 0.015874f, 0.027439f, + -1.702315f, 0.148702f, 0.153021f, 0.363147f, -0.488933f, 0.220772f, + 0.640310f, -0.173911f, -0.169523f, -0.082261f, -0.014854f, 0.024414f, + 0.061041f, -0.013998f, 0.086539f, 0.000466f, 0.037472f, -0.010665f, + -0.326646f, 0.106971f, 0.405589f, 0.555345f, -0.318315f, 0.526498f, + 0.119246f, 0.022213f, 0.171237f, 0.214651f, 0.062904f, -0.023764f, + 0.011831f, 0.079644f, -0.096530f, -0.054373f, -0.306309f, -0.203709f, + -0.353217f, -0.350005f, -0.329549f, 0.062679f, -0.387625f, -0.237111f, + -0.025050f, -0.193987f, 0.002235f, -0.380821f, -0.051036f, -0.136020f, + 0.077989f, -0.361691f, 0.120485f, 0.157746f, 0.073394f, -0.284401f, + 0.113221f, 0.109808f, 0.000197f, 0.122523f, 0.081411f, -0.048544f, + -0.136577f, -0.007158f, -0.208952f, -0.276831f, 0.260479f, -1.392915f, + -0.865248f, 0.114577f, -0.000749f, -0.060338f, -0.091176f, -0.108421f, + 0.221256f, 0.100176f, -0.877560f, -1.248838f, 0.643005f, 0.064580f, + -0.049878f, 0.267988f, -0.434340f, -0.299254f, -0.097572f, 0.009606f, + 0.063810f, -0.090525f, 0.027760f, 0.043484f, 0.041697f, 0.108024f, + -0.359586f, -0.197090f, 0.121397f, 0.152206f, -0.391126f, -0.283145f, + 0.008754f, -0.059022f, -0.218745f, 0.043042f, -0.056716f, 0.153051f, + -0.210372f, -0.029681f, -0.288354f, 0.065242f, -0.189376f, 0.115013f, + -0.251488f, -0.533091f, 0.037768f, -0.319107f, -0.161364f, -0.103967f, + 0.063271f, -0.313289f, -0.312093f, -0.045239f, 0.150607f, 0.001487f, + 0.019602f, -0.338031f, -0.036214f, 0.112736f, -0.367762f, 0.122367f, + 0.094670f, 0.175590f, 0.301041f, -0.135257f, 0.539620f, 0.328619f, + -0.163971f, 0.137256f, 0.238805f, 0.483722f, 0.121353f, 0.083630f, + -0.283568f, 0.291661f, -0.061122f, -0.195295f, 0.153459f, -0.153727f, + -0.238839f, -0.071736f, 0.601437f, -0.664072f, 0.230827f, 0.198753f, + -0.039196f, 0.206751f, 0.529020f, 0.904132f, -0.219471f, 0.186694f, + -0.208608f, -0.093385f, -0.161617f, 0.003930f, -0.429869f, -0.123563f, + 0.626098f, -0.002495f, -0.245511f, -1.069848f, 0.296115f, -0.940267f, + -1.649122f, -0.512937f, -0.802874f, -1.000239f, -0.027629f, 0.020434f, + -0.003030f, 0.035986f, -0.004812f, -0.009193f, -0.004644f, -0.024347f, + 0.068439f, -0.314339f, 0.095057f, -0.212372f, 0.197523f, -0.040878f, + -0.272164f, -0.243326f, -0.204955f, 0.157199f, -0.049964f, -0.091537f, + -0.058012f, -0.306650f, 0.098621f, -0.146778f, -0.154447f, -0.177889f, + -0.009698f, 0.025427f, 0.350576f, -0.448237f, -0.068823f, 1.224960f, + -0.776883f, -0.692167f, -0.948497f, -0.492598f, 0.029440f, -0.056460f, + 0.021654f, 0.004352f, 0.041508f, -0.027179f, 0.006789f, -0.023573f, + 0.207775f, -0.280273f, -0.347984f, -0.129935f, 0.151512f, -0.087294f, + -0.494352f, -0.341424f, 0.044084f, -0.064080f, 0.073091f, -0.145574f, + 0.094715f, -0.258786f, -0.020419f, -0.401823f, 0.009397f, -0.138642f, + -0.034953f, -0.077419f, 0.636610f, 0.314980f, 1.110610f, -0.343368f, + 0.696647f, -0.649667f, 0.653491f, -0.096006f, -0.090469f, -0.066975f, + -0.105864f, -0.015666f, 0.102056f, -0.105344f, -0.273495f, -0.014686f, + 0.122031f, 0.139524f, -1.042029f, -0.562510f, 0.885644f, 1.088059f, + 0.189223f, 0.049404f, -0.167371f, 0.018703f, -0.208390f, -0.159002f, + -0.377130f, -0.151118f, 0.117861f, 0.026986f, -0.032433f, 0.081603f, + -0.106729f, -0.040134f, 0.015161f, 0.290572f, 0.241446f, 1.390085f, + 0.438915f, -0.358097f, -0.171799f, 0.879758f, -0.014110f, 0.029562f, + -0.073583f, -0.125817f, -0.036512f, -0.040275f, 0.037997f, 0.120979f, + 0.064538f, -0.038841f, 0.034797f, 0.110229f, -0.239779f, -0.004558f, + 0.226534f, 0.111286f, -0.268198f, 0.237673f, -0.328237f, -0.090774f, + -0.269690f, -0.202147f, -0.181808f, -0.305238f, 0.110058f, -0.169217f, + -0.300125f, 0.069031f, -0.081358f, -0.376174f, -0.349980f, 0.071443f, + -0.396278f, -0.389503f, -0.190410f, -0.014767f, -0.265229f, -0.099787f, + 0.079847f, -0.214580f, -0.235661f, -0.184227f, 0.111099f, -0.083945f, + -0.153809f, -0.284092f, -0.132497f, -0.154841f, -0.517157f, -0.640603f, + -0.357036f, -0.486142f, -0.182819f, -0.475022f, 0.079282f, 0.081168f, + -0.120831f, -0.016048f, -0.232495f, 0.214329f, -0.055058f, 0.032856f, + 0.061753f, 0.003226f, 0.097028f, 0.084535f, -1.563199f, 0.434928f, + -0.403710f, 0.520696f, -0.401696f, 0.450568f, -0.074121f, 0.076622f, + -0.098421f, 0.167036f, -0.255250f, -0.526313f, -0.933693f, -0.558104f, + 0.194341f, 0.173326f, 0.071112f, -0.651961f, -1.327587f, -0.705289f, + -1.138889f, 0.197167f, -0.714654f, -0.113891f, 0.080158f, 0.000301f, + 0.057905f, 0.060718f, -0.635995f, 0.100026f, -0.038239f, -0.025530f, +}; + +static const float av1_4_partition_nn_bias_16_layer0[48] = { + -0.079252f, -0.083606f, -0.112759f, -0.071622f, 0.444562f, 0.215649f, + -0.337661f, -0.242379f, -0.053829f, 0.165168f, -0.076613f, -0.190579f, + -0.060175f, -0.571661f, -0.454075f, -1.462711f, -0.161563f, -0.088748f, + -0.030279f, -0.456293f, -0.134473f, -0.194976f, 0.044373f, -0.503954f, + -0.083563f, 0.123344f, 0.011821f, 0.085445f, -0.050294f, -0.135194f, + 0.057815f, 0.543558f, -0.090602f, -0.104671f, -0.285075f, 0.354335f, + 1.037007f, -0.023879f, -0.025025f, -0.094408f, -0.101200f, -0.142105f, + -0.380607f, -0.059067f, -0.113017f, -0.137448f, -0.177840f, 0.468505f, +}; + +static const float av1_4_partition_nn_weights_16_layer1[48 * LABEL_SIZE] = { + 0.174954f, -0.239117f, 0.073252f, 0.258881f, 0.579781f, 0.441827f, + 0.372037f, -0.062362f, 0.068477f, 0.376811f, -0.130520f, 0.214951f, + -0.200674f, 0.240347f, 0.152954f, 1.360264f, 0.334630f, -0.064789f, + -0.270826f, 0.212699f, 0.045669f, -0.150852f, -0.412603f, 0.122481f, + -0.230246f, 0.005004f, 0.321417f, -0.554083f, -0.186742f, -0.197687f, + -0.028669f, -0.138559f, -0.117773f, 0.024953f, 0.326367f, -0.109951f, + -1.098959f, -0.136134f, 0.563218f, 0.191799f, 0.126191f, -0.093113f, + 0.185371f, 0.058468f, 0.245247f, -0.138064f, -0.471573f, -0.209372f, + -0.111171f, 0.222275f, -0.350556f, -0.106336f, 0.268877f, 0.090639f, + -0.083008f, -0.190791f, -0.243922f, -0.121182f, -0.133733f, -0.078450f, + 0.099751f, 0.353020f, -0.199079f, -0.463492f, -0.647884f, 0.166611f, + -0.464034f, 0.045096f, -0.312178f, -0.190972f, -0.468297f, 0.662376f, + -0.197071f, -0.653123f, -0.354365f, -0.088501f, -0.302671f, 0.140713f, + 0.885444f, 0.350273f, -0.003345f, 0.217260f, 0.219156f, 0.240653f, + 0.347840f, 0.101849f, -0.244565f, -0.166971f, 0.091056f, 0.319912f, + 0.268459f, 0.250726f, -0.155819f, -0.087588f, 0.010749f, -0.192344f, + 0.344808f, 0.223482f, -0.189563f, -0.067317f, -0.348191f, -0.085265f, + 0.259318f, 0.102408f, 0.096675f, -0.255564f, -0.168480f, -0.068189f, + -0.457704f, 0.010565f, 0.228573f, -0.124421f, 0.202488f, 0.148519f, + 0.002180f, 0.099099f, -0.179019f, 0.245414f, -0.038307f, 0.116897f, + -0.031377f, 0.368533f, -0.793891f, 0.148614f, 0.075441f, 0.102465f, + -0.310002f, -0.355369f, -0.206713f, -0.262276f, 0.068578f, -0.044980f, + 0.092689f, -0.181058f, 0.016279f, 0.155965f, 0.545361f, -0.390699f, + -0.042457f, 0.110238f, 0.114640f, 0.112525f, 0.522221f, 0.533164f, + -0.331720f, -0.212966f, 0.140823f, 0.251311f, -0.006092f, -0.800438f, + 0.007981f, -0.585140f, -0.006526f, 0.541683f, -0.298498f, 0.084322f, + -0.056467f, -0.361806f, -0.256347f, -1.419173f, -0.159093f, 0.023017f, + 0.667915f, -0.176995f, 0.022307f, -0.169493f, 0.581377f, 0.044929f, + 0.044914f, -0.056290f, 0.324196f, 0.648043f, -0.089381f, -0.054971f, + 0.064782f, 0.629356f, -0.003760f, -0.123822f, 0.144133f, -0.378821f, + 1.116858f, 0.128552f, -0.668783f, 0.207194f, -0.437781f, -0.283321f, + -0.549404f, 0.010538f, 0.208997f, 0.231396f, -0.174347f, 0.161910f, +}; + +static const float av1_4_partition_nn_bias_16_layer1[LABEL_SIZE] = { + -0.197883f, + -0.136696f, + 0.094115f, + 0.612799f, +}; + +static const NN_CONFIG av1_4_partition_nnconfig_16 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 48, // num_hidden_nodes + }, + { + av1_4_partition_nn_weights_16_layer0, + av1_4_partition_nn_weights_16_layer1, + }, + { + av1_4_partition_nn_bias_16_layer0, + av1_4_partition_nn_bias_16_layer1, + }, +}; + +static const float av1_4_partition_nn_weights_32_layer0[FEATURE_SIZE * 32] = { + 0.114554f, 0.043669f, 0.313291f, 0.167688f, -0.413357f, 0.088232f, + 0.301915f, -0.358117f, 0.267711f, -0.252716f, -0.038531f, -0.032805f, + -0.025382f, 0.023624f, -0.949694f, -0.065480f, -0.375721f, -0.697319f, + -0.117387f, -0.204309f, -0.190797f, -0.223867f, -0.190248f, 0.026668f, + 0.199717f, 0.216902f, -0.239241f, -0.096894f, -0.225046f, 0.246523f, + 0.002333f, -0.254385f, -0.205815f, 0.123139f, -0.476923f, 0.137557f, + 0.059686f, -0.124013f, 0.974675f, 0.889753f, 0.378940f, 0.526413f, + -0.208747f, -0.001913f, 0.094081f, 0.848010f, 0.062042f, 0.159831f, + 0.071016f, 0.024437f, 0.212611f, 0.039501f, -0.149922f, -0.055229f, + -0.229270f, 0.129004f, -0.182803f, 0.291223f, -1.197804f, -0.916991f, + -0.024095f, 0.738729f, -0.300326f, 0.402480f, 0.023944f, -0.022613f, + -0.004554f, 0.001784f, 0.035143f, -0.202237f, 0.080252f, -0.003912f, + -0.040345f, -0.121881f, 0.126672f, 0.093507f, -0.081305f, -0.081099f, + -0.218824f, -0.459254f, -0.055250f, -0.095096f, 0.207278f, 0.245259f, + -0.380849f, -0.334458f, -0.351449f, -0.513045f, -0.407823f, -0.222423f, + 0.103205f, -0.299965f, -0.211472f, -0.348690f, -0.283688f, -0.152743f, + -0.204005f, -0.173636f, 0.020302f, -0.109112f, 0.081203f, -0.137344f, + -0.364582f, -0.343133f, -0.176167f, -0.446541f, 0.144844f, -0.268105f, + -0.003889f, -0.309560f, -0.236092f, -0.299450f, 0.248269f, 0.207510f, + -0.279023f, -0.272472f, -0.166427f, 0.205973f, -0.345692f, -0.238400f, + -0.319178f, -0.327246f, -0.321756f, 0.043191f, -0.027520f, -0.029310f, + 0.161379f, 0.031154f, -0.605365f, -0.230926f, 0.261142f, -0.262678f, + -0.373351f, -0.326245f, 0.279222f, 0.684357f, -0.864302f, 0.036132f, + 0.239307f, 0.136262f, 0.124002f, -0.410379f, -0.172722f, -0.376670f, + -0.195889f, 0.037292f, -0.055295f, 1.022308f, 0.237600f, -0.618435f, + 0.366154f, 0.168308f, -0.473467f, -0.756558f, -0.044830f, 0.019057f, + -0.084214f, -0.007789f, -0.066028f, -0.074562f, 0.002082f, 0.001007f, + -0.269676f, -0.164768f, -0.027271f, -0.098935f, 0.009431f, 0.254431f, + 0.124238f, -0.198181f, 0.142723f, -0.112997f, -0.164224f, -0.355160f, + 0.135330f, -0.379557f, 0.079392f, 0.210607f, -0.354927f, -0.277678f, + -0.931111f, 0.056208f, -0.347710f, -0.355415f, 0.826145f, 0.390625f, + 0.374414f, -0.205685f, 0.562485f, 0.152288f, 0.130635f, 0.056622f, + 0.057972f, 0.095526f, -0.082436f, -0.085938f, -0.070570f, -0.087634f, + 0.335934f, 0.084860f, 0.544424f, -0.278917f, 0.476740f, 0.050927f, + -1.288817f, -0.078320f, -0.553041f, -0.160538f, -0.109365f, -0.127146f, + -0.032524f, -0.105117f, -0.182965f, -0.024723f, 0.083317f, 0.060073f, + -0.042945f, 0.015249f, 1.241504f, 0.662613f, 0.530496f, -0.180519f, + -1.099086f, -0.825844f, 0.551856f, -0.025009f, -0.006619f, -0.001049f, + 0.014828f, -0.035166f, -0.241091f, -0.136364f, -0.003219f, -0.014581f, + -0.379945f, -0.226191f, -0.161241f, -0.496390f, -0.147175f, -0.118004f, + -0.128206f, -0.389770f, -0.184288f, -0.119076f, -0.379211f, 0.236180f, + -0.468730f, -0.175170f, 0.136433f, 0.167739f, -0.377602f, 0.135772f, + 0.040972f, -0.193974f, -0.319475f, -0.016469f, -0.412027f, -0.322605f, + 0.111125f, -0.078456f, -0.387234f, -0.401605f, -0.088717f, -0.340682f, + 0.010556f, 0.058256f, -0.127352f, 0.017665f, 0.072632f, -0.171966f, + -0.117342f, -0.166050f, -0.182689f, -0.073182f, 0.096279f, -0.260229f, + 0.025216f, -0.332236f, -0.218706f, -0.200153f, -0.110303f, 0.073499f, + -0.280123f, 0.132262f, -0.308330f, -0.119036f, -0.303874f, -0.065445f, + -0.412137f, 0.057167f, 0.044582f, -0.330952f, -0.232572f, 0.039732f, + -0.326877f, -0.300569f, -0.467164f, -0.371499f, 0.034430f, 0.058277f, + -0.042485f, -0.409028f, -0.110889f, -0.500758f, -0.343141f, 0.042023f, + -1.071050f, 0.086854f, -0.004932f, -0.259698f, 0.125301f, -0.742663f, + -0.370517f, -0.772840f, 0.193628f, 0.554676f, 0.051283f, -0.196639f, + 0.040344f, 0.027391f, -0.040501f, 0.038303f, 0.032972f, -0.014638f, + 0.097720f, -0.206897f, -0.015480f, 0.008543f, 0.034469f, 0.127234f, + -0.396463f, -0.390189f, 0.117538f, -0.435622f, 0.043420f, -0.241987f, + -0.118254f, -0.190349f, 0.190273f, -0.085625f, -0.141253f, -0.377438f, + -0.249211f, 0.214512f, -0.363191f, -0.754851f, 0.238045f, 1.127635f, + 0.173947f, -0.357620f, 0.073671f, 0.220617f, 0.072067f, -0.076214f, + -0.044583f, -0.018371f, 0.010952f, -0.135116f, 0.076597f, 0.034480f, + -0.070212f, -0.454429f, -0.135215f, 0.163851f, -0.625990f, -0.283991f, + 0.284051f, 0.182935f, -0.048717f, 0.002484f, -0.009086f, 0.321724f, + 0.125162f, -0.069624f, -0.430299f, -0.007224f, -0.284725f, -0.475662f, + 0.123807f, -0.313614f, -0.103142f, 0.072125f, 0.100320f, -0.185558f, + -0.481522f, -0.247311f, -0.386762f, -0.258850f, 0.178844f, -0.381231f, + -0.436001f, -0.374834f, 0.230104f, -0.500679f, 0.170880f, 0.029657f, + -0.105857f, -0.366671f, -0.268833f, 0.036885f, -0.026776f, 0.037837f, + -0.362095f, -0.254933f, 0.129650f, 0.007945f, -0.304715f, -0.100813f, + -0.342849f, -0.269223f, 0.178490f, 0.186735f, -0.353995f, 0.050381f, + -0.440186f, 0.025985f, 1.096969f, 1.132937f, 0.581545f, 0.271734f, + -0.109169f, -0.014239f, 0.688644f, 0.602702f, 0.048616f, 0.022335f, + 0.037545f, 0.081667f, -0.109038f, -0.088565f, -0.002506f, -0.041420f, + -0.132515f, 0.187312f, 0.677273f, 1.111182f, 0.199096f, -0.211551f, + -0.896508f, 0.257981f, 0.007803f, 0.160343f, -0.124864f, -0.097150f, + 0.225090f, 0.242900f, -0.195665f, 0.011310f, 0.160765f, 0.169195f, + -0.081994f, -0.017372f, -0.566190f, -0.902086f, 0.027768f, 0.511419f, + 0.076009f, -0.165861f, 0.240487f, 0.006298f, -0.153334f, 0.041249f, + 0.387092f, 0.313011f, -0.032269f, 0.019024f, 0.052568f, 0.124247f, + 0.197640f, 0.002537f, 0.651044f, 0.829828f, -0.446444f, -0.402042f, + -0.469399f, -0.019842f, 0.371960f, 0.140373f, -0.044808f, 0.008283f, + 0.093791f, 0.052149f, 0.143123f, -0.449571f, -0.868816f, -0.265661f, + -0.225232f, -0.014704f, 0.543836f, -0.374498f, 0.561647f, 1.309445f, + 0.056789f, -0.048447f, 0.255758f, 0.644553f, -0.124802f, 0.097419f, + -0.149336f, 0.021596f, -0.043699f, 0.057591f, -0.000077f, 0.034488f, + -0.049353f, -0.007799f, 0.437914f, 0.509369f, 0.674428f, 1.858949f, + -0.205964f, 0.060776f, 0.184213f, 0.037177f, -0.062535f, -0.115408f, + 0.076498f, 0.010235f, -0.142253f, 0.009983f, 0.073436f, 0.038716f, + -0.369983f, -0.185959f, -0.137867f, 0.032134f, 0.213814f, -0.125571f, + 0.247874f, -0.166871f, -0.160890f, 0.147029f, 0.267143f, -0.298488f, + -0.210203f, -0.188313f, -0.085024f, -0.244962f, -0.189833f, -0.261242f, + 0.399519f, 0.143200f, -0.776419f, -0.374639f, -0.022066f, 0.582904f, + 0.006430f, -0.139134f, -0.491894f, -0.430579f, -0.358221f, -0.231365f, + -0.398255f, -0.173231f, 0.211789f, -0.036121f, -0.266856f, 0.042956f, + -1.138513f, -0.070313f, 0.158803f, 0.406989f, -0.015974f, 0.651020f, + -0.468982f, -0.310019f, 0.416922f, 0.895162f, 0.019921f, 0.004023f, + 0.006962f, 0.000863f, -0.216395f, -0.074913f, -0.002613f, 0.026703f, +}; + +static const float av1_4_partition_nn_bias_32_layer0[32] = { + 0.133615f, -0.113389f, -0.575989f, 0.589389f, -0.193574f, -0.132463f, + 0.000000f, 0.060317f, 0.264577f, -0.060599f, 0.540147f, -0.127782f, + -0.548802f, -0.172235f, -0.193032f, -0.026301f, -0.177527f, 0.267821f, + -0.115455f, -0.137162f, -0.079595f, -0.041443f, -0.043856f, -0.657220f, + -0.448931f, 0.446300f, 0.250002f, 0.223559f, -0.647723f, -0.014369f, + 0.084333f, -0.056270f, +}; + +static const float av1_4_partition_nn_weights_32_layer1[32 * LABEL_SIZE] = { + -0.069633f, -0.087239f, 0.365816f, -0.068579f, 0.231198f, -0.067856f, + -0.139892f, -0.100235f, -0.488166f, -0.150112f, -0.005546f, 0.210832f, + 0.778888f, 0.169624f, 0.089968f, -0.243569f, 0.353483f, 0.032296f, + -0.157408f, 0.286885f, -0.063537f, -0.324055f, -0.161464f, 0.430600f, + 0.277707f, -0.196463f, 0.154647f, 0.059804f, 0.176408f, 0.303179f, + -0.040156f, 0.375810f, -0.363032f, -0.186808f, -0.264561f, -0.158937f, + -0.007949f, -0.076394f, 0.056475f, 0.308528f, 0.695387f, 0.051336f, + 0.433063f, -0.229948f, -1.210712f, 0.036286f, 0.183868f, -0.117660f, + 0.230134f, -0.093469f, 0.237918f, 0.625986f, -0.236671f, -0.377172f, + 0.331091f, -0.394004f, -0.214349f, 0.243940f, -0.600348f, 0.069843f, + 0.088325f, 0.225775f, 0.276884f, -0.604493f, 0.769812f, 0.259574f, + 0.086220f, 0.511515f, -0.282584f, -0.157719f, 0.278778f, -0.332732f, + 0.068985f, -0.237236f, -0.006102f, -0.154883f, 0.710288f, -0.245896f, + -0.255895f, -0.398038f, 0.304084f, -0.317065f, 0.192609f, -0.235613f, + 0.461340f, 0.117194f, 0.116817f, 0.196150f, 0.421622f, -0.264495f, + 0.617852f, -0.351756f, -0.310016f, 0.135932f, -0.242622f, -0.073094f, + 0.042077f, 0.039230f, -0.482715f, 0.553187f, 0.360637f, 0.313484f, + -0.131540f, -0.104731f, 0.374704f, 0.222173f, 0.437657f, 0.029827f, + -0.545156f, -0.203176f, 0.267824f, 0.169237f, -0.057871f, 0.552197f, + 0.272243f, 0.025681f, -0.262192f, 0.255934f, -0.202407f, -0.483317f, + -0.204721f, 0.288807f, -0.030735f, -0.047161f, -0.780724f, 0.381939f, + -0.295318f, 0.537378f, +}; + +static const float av1_4_partition_nn_bias_32_layer1[LABEL_SIZE] = { + -0.332518f, + 0.114452f, + 0.098949f, + 0.465896f, +}; + +static const NN_CONFIG av1_4_partition_nnconfig_32 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 32, // num_hidden_nodes + }, + { + av1_4_partition_nn_weights_32_layer0, + av1_4_partition_nn_weights_32_layer1, + }, + { + av1_4_partition_nn_bias_32_layer0, + av1_4_partition_nn_bias_32_layer1, + }, +}; + +static const float av1_4_partition_nn_weights_64_layer0[FEATURE_SIZE * 16] = { + 0.256343f, -0.021774f, -0.117102f, 0.416930f, 0.188160f, 0.148768f, + -0.611181f, -0.121607f, -0.394825f, -0.875025f, -0.167071f, 0.016408f, + 0.222769f, -0.199332f, 0.058667f, -0.679529f, 0.081744f, 0.044438f, + -0.182941f, -0.110339f, -0.137822f, -0.096164f, -0.132319f, 0.140036f, + -0.049503f, -0.309894f, -0.323991f, 0.166113f, 0.138104f, -0.263629f, + -0.368460f, -0.273989f, 0.147239f, 0.044566f, -0.363357f, -0.030792f, + 0.020734f, 0.068506f, -0.434214f, 0.581644f, -1.244146f, -0.569162f, + 0.179499f, -0.188900f, 0.078431f, -0.392126f, -0.006431f, 0.112146f, + -0.065892f, -0.051319f, 0.094607f, 0.251700f, -0.000650f, 0.011911f, + 0.080449f, 0.022816f, 0.322382f, 0.577070f, 0.927738f, 0.178707f, + -0.101237f, -0.212521f, 0.560261f, -0.206492f, -0.077591f, -0.069960f, + 0.025727f, 0.041122f, -0.735228f, -0.506091f, -0.600776f, -0.117829f, + 0.103556f, 0.141823f, 0.853448f, 0.339488f, 0.994022f, 0.121693f, + -2.065366f, -0.352510f, -0.174323f, -0.323400f, -0.002193f, 0.004161f, + 0.042469f, -0.005319f, -0.305784f, -0.371353f, 0.011194f, -0.018597f, + 0.209260f, 0.071577f, 0.242470f, -0.856593f, 0.288842f, 1.062608f, + -0.300472f, 0.221623f, -0.813563f, -0.250347f, -0.081455f, -0.092779f, + -0.168132f, -0.180640f, -0.075130f, -0.052906f, -0.015645f, 0.127158f, + -0.006546f, 0.051671f, 0.545608f, 1.101804f, 0.288086f, 1.107046f, + -0.200012f, 0.220182f, -0.189220f, -0.554973f, 0.040711f, -0.058029f, + 0.043737f, 0.016164f, -0.391790f, -0.287770f, -0.046545f, 0.045071f, + 0.190005f, -0.076963f, 0.836839f, 1.633266f, 0.902928f, 0.991972f, + -0.127932f, 0.293680f, -0.035984f, 0.476179f, -0.098024f, 0.068314f, + -0.058365f, 0.096221f, -0.000321f, -0.128840f, 0.136441f, -0.061853f, + 0.270367f, -0.184129f, -0.373670f, -0.177381f, 0.262109f, -0.378013f, + -0.053249f, -0.456389f, 0.222972f, -0.228067f, -0.115210f, -0.277797f, + 0.096913f, -0.014512f, -0.015533f, 0.026389f, -0.360536f, -0.078477f, + -0.203186f, 0.199574f, 0.770476f, 0.595592f, 0.360828f, 0.547721f, + -0.804787f, 0.389690f, -0.437645f, 0.576776f, 0.081903f, 0.082750f, + 0.007166f, -0.143755f, 0.114462f, 0.472432f, -0.058974f, 0.077761f, + -2.015181f, -0.054942f, -0.110894f, 0.529188f, -0.003300f, 0.913895f, + -0.324643f, 0.316135f, -0.291729f, 1.072647f, -0.029236f, 0.045592f, + -0.039399f, 0.043472f, -0.303244f, -0.108761f, -0.011154f, 0.009693f, + -0.374985f, 0.027758f, 0.302075f, -0.295758f, -0.165563f, -0.297259f, + -0.485624f, -0.469310f, -0.028247f, -0.124440f, -0.428082f, 0.096325f, + 0.089003f, -0.301585f, 0.022474f, 0.077477f, -0.032233f, -0.231036f, + 0.143206f, 0.169113f, -0.556486f, 0.346327f, -0.667790f, 0.126983f, + 0.179727f, 0.397307f, -0.490612f, -1.708789f, -0.040336f, -0.028547f, + -0.091319f, -0.119367f, -0.518796f, -0.543383f, 0.037162f, 0.031344f, + -0.131692f, 0.119353f, 0.799313f, 0.443848f, -0.499919f, -1.002983f, + 0.375477f, 0.221096f, -0.238033f, 0.284849f, 0.021897f, 0.023338f, + -0.059067f, 0.117276f, 0.039540f, 0.049630f, 0.175150f, 0.014166f, + -0.071486f, 0.091234f, -1.007432f, -1.417378f, 0.640528f, 1.442576f, + -0.257183f, -0.597016f, 0.861785f, 0.276121f, -0.098017f, 0.120514f, + -0.133184f, 0.106529f, 0.171644f, 0.059513f, 0.215952f, -0.009441f, + -0.505313f, 0.063174f, 0.229148f, -0.344213f, 0.862721f, 1.549941f, + -0.220129f, 0.493094f, 0.264095f, 0.143641f, 0.084968f, -0.078266f, + 0.032335f, -0.019006f, -0.098205f, 0.119213f, -0.103465f, 0.072811f, +}; + +static const float av1_4_partition_nn_bias_64_layer0[16] = { + 0.111611f, -0.067682f, 0.633594f, 0.143559f, -1.051284f, -0.266625f, + -0.829789f, -0.956123f, -0.153484f, -0.787741f, 0.004832f, -0.080769f, + 0.235166f, 0.449468f, 0.294689f, -0.395300f, +}; + +static const float av1_4_partition_nn_weights_64_layer1[16 * LABEL_SIZE] = { + -0.069999f, -0.093710f, -0.423714f, -0.028138f, 0.684415f, 0.141445f, + 0.507161f, 0.435533f, -0.263268f, 0.585105f, 0.235301f, 0.127536f, + -0.688639f, -0.217993f, -0.540066f, 0.406718f, 0.018210f, -0.077349f, + -0.124823f, -0.488220f, -0.957026f, 0.302632f, 0.285490f, -0.411356f, + 0.091089f, 0.103862f, -0.549291f, 0.148628f, 0.640603f, -0.601018f, + 0.178024f, 0.601370f, 0.313780f, 0.051938f, 0.524083f, 0.814631f, + -0.415522f, -0.738849f, 0.477881f, -0.342864f, 0.105181f, 0.040010f, + -0.177521f, 0.400646f, 0.167093f, 0.388279f, -0.898439f, -0.111936f, + 0.469875f, -0.099528f, -0.217370f, 0.283742f, -0.033798f, -0.142797f, + -0.174057f, -1.293311f, -0.038777f, -0.003846f, 0.093642f, -0.527150f, + -0.021259f, 0.194651f, -0.276294f, -0.109514f, +}; + +static const float av1_4_partition_nn_bias_64_layer1[LABEL_SIZE] = { + -0.688947f, + 0.121075f, + 0.289597f, + 0.948091f, +}; + +static const NN_CONFIG av1_4_partition_nnconfig_64 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 16, // num_hidden_nodes + }, + { + av1_4_partition_nn_weights_64_layer0, + av1_4_partition_nn_weights_64_layer1, + }, + { + av1_4_partition_nn_bias_64_layer0, + av1_4_partition_nn_bias_64_layer1, + }, +}; + +#undef FEATURE_SIZE +#undef LABEL_SIZE + #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/encoder/pickcdef.c b/third_party/aom/av1/encoder/pickcdef.c index 4f6265617..6d154a7d2 100644 --- a/third_party/aom/av1/encoder/pickcdef.c +++ b/third_party/aom/av1/encoder/pickcdef.c @@ -296,7 +296,7 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, int ydec[3]; int pli; int cdef_count; - int coeff_shift = AOMMAX(cm->bit_depth - 8, 0); + int coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0); uint64_t best_tot_mse = (uint64_t)1 << 63; uint64_t tot_mse; int sb_count; @@ -317,8 +317,8 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]); uint16_t *in; DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]); - quantizer = - av1_ac_quant_Q3(cm->base_qindex, 0, cm->bit_depth) >> (cm->bit_depth - 8); + quantizer = av1_ac_quant_Q3(cm->base_qindex, 0, cm->seq_params.bit_depth) >> + (cm->seq_params.bit_depth - 8); lambda = .12 * quantizer * quantizer / 256.; av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0, @@ -361,7 +361,7 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, for (r = 0; r < frame_height; ++r) { for (c = 0; c < frame_width; ++c) { - if (cm->use_highbitdepth) { + if (cm->seq_params.use_highbitdepth) { src[pli][r * stride[pli] + c] = CONVERT_TO_SHORTPTR( xd->plane[pli].dst.buf)[r * xd->plane[pli].dst.stride + c]; ref_coeff[pli][r * stride[pli] + c] = diff --git a/third_party/aom/av1/encoder/picklpf.c b/third_party/aom/av1/encoder/picklpf.c index 5f802a707..461c3af83 100644 --- a/third_party/aom/av1/encoder/picklpf.c +++ b/third_party/aom/av1/encoder/picklpf.c @@ -82,10 +82,8 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd, plane + 1, partial_frame); #endif - int highbd = 0; - highbd = cm->use_highbitdepth; - - filt_err = aom_get_sse_plane(sd, cm->frame_to_show, plane, highbd); + filt_err = aom_get_sse_plane(sd, cm->frame_to_show, plane, + cm->seq_params.use_highbitdepth); // Re-instate the unfiltered frame yv12_copy_plane(&cpi->last_frame_uf, cm->frame_to_show, plane); @@ -202,7 +200,7 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, } else if (method >= LPF_PICK_FROM_Q) { const int min_filter_level = 0; const int max_filter_level = av1_get_max_filter_level(cpi); - const int q = av1_ac_quant_Q3(cm->base_qindex, 0, cm->bit_depth); + const int q = av1_ac_quant_Q3(cm->base_qindex, 0, cm->seq_params.bit_depth); // These values were determined by linear fitting the result of the // searched level for 8 bit depth: // Keyframes: filt_guess = q * 0.06699 - 1.60817 @@ -211,7 +209,7 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, // And high bit depth separately: // filt_guess = q * 0.316206 + 3.87252 int filt_guess; - switch (cm->bit_depth) { + switch (cm->seq_params.bit_depth) { case AOM_BITS_8: filt_guess = (cm->frame_type == KEY_FRAME) ? ROUND_POWER_OF_TWO(q * 17563 - 421574, 18) @@ -229,7 +227,7 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, "or AOM_BITS_12"); return; } - if (cm->bit_depth != AOM_BITS_8 && cm->frame_type == KEY_FRAME) + if (cm->seq_params.bit_depth != AOM_BITS_8 && cm->frame_type == KEY_FRAME) filt_guess -= 4; // TODO(chengchen): retrain the model for Y, U, V filter levels lf->filter_level[0] = clamp(filt_guess, min_filter_level, max_filter_level); diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c index 93ea09690..28b693b08 100644 --- a/third_party/aom/av1/encoder/pickrst.c +++ b/third_party/aom/av1/encoder/pickrst.c @@ -163,8 +163,8 @@ static int64_t try_restoration_unit(const RestSearchCtxt *rsc, const int is_uv = plane > 0; const RestorationInfo *rsi = &cm->rst_info[plane]; RestorationLineBuffers rlbs; - const int bit_depth = cm->bit_depth; - const int highbd = cm->use_highbitdepth; + const int bit_depth = cm->seq_params.bit_depth; + const int highbd = cm->seq_params.use_highbitdepth; const YV12_BUFFER_CONFIG *fts = cm->frame_to_show; // TODO(yunqing): For now, only use optimized LR filter in decoder. Can be @@ -173,7 +173,8 @@ static int64_t try_restoration_unit(const RestSearchCtxt *rsc, av1_loop_restoration_filter_unit( limits, rui, &rsi->boundaries, &rlbs, tile_rect, rsc->tile_stripe0, - is_uv && cm->subsampling_x, is_uv && cm->subsampling_y, highbd, bit_depth, + is_uv && cm->seq_params.subsampling_x, + is_uv && cm->seq_params.subsampling_y, highbd, bit_depth, fts->buffers[plane], fts->strides[is_uv], rsc->dst->buffers[plane], rsc->dst->strides[is_uv], cm->rst_tmpbuf, optimized_lr); @@ -540,8 +541,8 @@ static void search_sgrproj(const RestorationTileLimits *limits, const MACROBLOCK *const x = rsc->x; const AV1_COMMON *const cm = rsc->cm; - const int highbd = cm->use_highbitdepth; - const int bit_depth = cm->bit_depth; + const int highbd = cm->seq_params.use_highbitdepth; + const int bit_depth = cm->seq_params.bit_depth; uint8_t *dgd_start = rsc->dgd_buffer + limits->v_start * rsc->dgd_stride + limits->h_start; @@ -549,8 +550,8 @@ static void search_sgrproj(const RestorationTileLimits *limits, rsc->src_buffer + limits->v_start * rsc->src_stride + limits->h_start; const int is_uv = rsc->plane > 0; - const int ss_x = is_uv && cm->subsampling_x; - const int ss_y = is_uv && cm->subsampling_y; + const int ss_x = is_uv && cm->seq_params.subsampling_x; + const int ss_y = is_uv && cm->seq_params.subsampling_y; const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x; const int procunit_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; @@ -1067,7 +1068,7 @@ static void search_wiener(const RestorationTileLimits *limits, double vfilterd[WIENER_WIN], hfilterd[WIENER_WIN]; const AV1_COMMON *const cm = rsc->cm; - if (cm->use_highbitdepth) { + if (cm->seq_params.use_highbitdepth) { compute_stats_highbd(wiener_win, rsc->dgd_buffer, rsc->src_buffer, limits->h_start, limits->h_end, limits->v_start, limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H); @@ -1149,7 +1150,7 @@ static void search_norestore(const RestorationTileLimits *limits, RestSearchCtxt *rsc = (RestSearchCtxt *)priv; RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx]; - const int highbd = rsc->cm->use_highbitdepth; + const int highbd = rsc->cm->seq_params.use_highbitdepth; rusi->sse[RESTORE_NONE] = sse_restoration_unit( limits, rsc->src, rsc->cm->frame_to_show, rsc->plane, highbd); @@ -1280,7 +1281,7 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) { double best_cost = 0; RestorationType best_rtype = RESTORE_NONE; - const int highbd = rsc.cm->use_highbitdepth; + const int highbd = rsc.cm->seq_params.use_highbitdepth; extend_frame(rsc.dgd_buffer, rsc.plane_width, rsc.plane_height, rsc.dgd_stride, RESTORATION_BORDER, RESTORATION_BORDER, highbd); diff --git a/third_party/aom/av1/encoder/pustats.h b/third_party/aom/av1/encoder/pustats.h index ef333b6d8..42a4c590b 100644 --- a/third_party/aom/av1/encoder/pustats.h +++ b/third_party/aom/av1/encoder/pustats.h @@ -18,91 +18,79 @@ extern "C" { #include "av1/encoder/ml.h" -#define NUM_FEATURES 20 +#define NUM_FEATURES 11 #define NUM_HIDDEN_LAYERS 2 -#define HIDDEN_LAYERS_0_NODES 10 +#define HIDDEN_LAYERS_0_NODES 12 #define HIDDEN_LAYERS_1_NODES 10 #define LOGITS_NODES 1 static const float av1_pustats_rate_hiddenlayer_0_kernel[NUM_FEATURES * HIDDEN_LAYERS_0_NODES] = { - 13.8498f, 19.6630f, 13.3036f, 5.2448f, -18.0270f, 21.6671f, - -0.2135f, -0.0060f, 0.1211f, -0.3549f, -0.3550f, 0.0190f, - 0.0167f, -0.1192f, 0.2003f, 8.6663f, 32.0264f, 9.9558f, - 9.0935f, -110.4994f, 51.8056f, 64.8041f, 58.5392f, 53.0189f, - -61.6300f, 4.7540f, -0.0140f, 0.0185f, -15.8050f, 0.0790f, - 0.0707f, 0.0784f, 0.0766f, -0.3030f, 0.0392f, 49.3312f, - 63.3326f, 61.4025f, 54.2723f, -62.2769f, -147.1736f, -84.9432f, - -82.5422f, -70.4857f, 46.7622f, -1.0285f, -0.4809f, 0.0068f, - 1.0888f, -0.0515f, -0.0384f, -0.0232f, -0.0396f, 0.2429f, - 0.2040f, -144.4016f, -88.0868f, -80.3134f, -70.6685f, 66.8528f, - -53.8097f, -45.4011f, -52.8680f, -58.7226f, 99.7830f, 2.3728f, - 0.0229f, 0.0002f, -0.3288f, -0.0563f, -0.0550f, -0.0552f, - -0.0563f, 0.2214f, 0.0139f, -60.8965f, -45.5251f, -50.4188f, - -51.5623f, 85.7369f, 77.3415f, 47.4930f, 53.8120f, 58.2311f, - -45.9650f, -2.4938f, 0.1639f, -0.5270f, -75.4622f, -0.0026f, - 0.0031f, 0.0047f, 0.0015f, 0.0092f, 0.0654f, 75.6402f, - 54.7447f, 54.8156f, 52.6834f, -9.1246f, -34.0108f, -35.6423f, - -34.2911f, -38.5444f, 72.1123f, 10.9750f, -0.1595f, 0.1983f, - 22.5724f, -0.0556f, -0.0618f, -0.0571f, -0.0608f, 0.2439f, - -0.0805f, -32.5107f, -28.9688f, -33.7284f, -48.1365f, 61.5297f, - 39.2492f, -35.1928f, -11.5000f, 7.7038f, -94.2469f, 13.5586f, - 0.7541f, 0.0105f, 4.4041f, 0.1799f, 0.1339f, 0.1567f, - -0.6668f, -0.7384f, 0.2185f, 17.1700f, -26.4601f, -1.8970f, - 38.9635f, -30.1916f, 31.8139f, 14.6157f, 10.0565f, 3.3340f, - -40.6985f, -2.1186f, 0.0116f, 0.0962f, 0.7115f, -1.4071f, - -1.3701f, -1.4728f, -1.3404f, -1.7286f, 5.5632f, 28.4998f, - 5.4087f, 16.2668f, 11.8693f, -39.4153f, 106.3281f, 38.3075f, - 39.4933f, 47.3805f, -15.0514f, -21.2421f, -0.2358f, -0.0024f, - 0.3505f, -0.0429f, -0.0377f, -0.0322f, -0.0344f, 0.2020f, - 0.1417f, 99.6711f, 35.3896f, 43.1117f, 59.8879f, -17.8250f, - -16.6976f, 18.5100f, 6.3383f, 25.3020f, -55.8824f, 25.1027f, - -0.9926f, -0.0738f, -1.4892f, 0.0269f, -0.0051f, -5.8168f, - -0.0579f, -0.1500f, 0.7224f, 8.3066f, -3.8805f, -12.1482f, - 14.3492f, -20.8118f, + 21.5067f, 22.6709f, 0.0049f, 0.9288f, -0.0100f, 0.0060f, -0.0071f, + -0.0085f, 0.0348f, -0.1273f, 10.1154f, 6.3405f, 7.8589f, -0.0652f, + -4.6352f, 0.0445f, -3.2748f, 0.1025f, -0.0385f, -0.4505f, 1.1320f, + 3.2634f, 23.2420f, -7.9056f, 0.0522f, -18.1555f, 0.0977f, 0.1155f, + -0.0138f, 0.0267f, -0.3992f, 0.2735f, 22.8063f, 35.1043f, 3.8140f, + -0.0295f, 0.0771f, -0.6938f, 0.0302f, -0.0266f, 0.0989f, -0.0794f, + 0.2981f, 33.3333f, -24.1150f, 1.4986f, -0.0975f, -15.3938f, -0.0858f, + -0.0845f, -0.0869f, -0.0858f, 0.3542f, 0.0155f, -18.2629f, 9.6688f, + -11.9643f, -0.2904f, -5.3026f, -0.1011f, -0.1202f, 0.0127f, -0.0269f, + 0.3434f, 0.0595f, 16.6800f, 41.4730f, 6.9269f, -0.0512f, -1.4540f, + 0.0468f, 0.0077f, 0.0983f, 0.1265f, -0.5234f, 0.9477f, 36.6470f, + -0.4838f, -0.2269f, -0.1143f, -0.3907f, -0.5005f, -0.0179f, -0.1057f, + 0.1233f, -0.4412f, -0.0474f, 0.1140f, -21.6813f, -0.9077f, -0.0078f, + -3.3306f, 0.0417f, 0.0412f, 0.0427f, 0.0418f, -0.1699f, 0.0072f, + -22.3335f, 16.1203f, -10.1220f, -0.0019f, 0.0005f, -0.0054f, -0.0155f, + -0.0302f, -0.0379f, 0.1276f, 0.1568f, 21.6175f, 12.2919f, 11.0327f, + -0.2000f, -8.6691f, -0.5593f, -0.5952f, -0.4203f, -0.4857f, -1.1239f, + 3.1404f, -13.1098f, -5.9165f, 22.2060f, -0.0312f, -3.9642f, -0.0344f, + -0.0656f, -0.0273f, -0.0465f, 0.1412f, -6.1974f, 9.3661f, }; static const float av1_pustats_rate_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = { - 17.6566f, 62.2217f, -107.2644f, -56.2255f, 68.2252f, - -37.5662f, 9.587f, 18.5206f, 69.6873f, 4.3903f, + -14.3065f, 2.059f, -62.9916f, -50.1209f, 57.643f, -59.3737f, + -30.4737f, -0.1112f, 72.5427f, 55.402f, 24.9523f, 18.5834f, }; static const float av1_pustats_rate_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES * HIDDEN_LAYERS_1_NODES] = { - -0.0494f, 0.3505f, -0.0461f, -1.3451f, 0.0198f, -0.0746f, -0.2217f, - -0.9525f, 0.0633f, -0.0737f, -0.3568f, 1.8569f, -0.0189f, -1.8269f, - 0.6281f, -1.3266f, -0.9202f, 2.8978f, -0.6437f, -0.8709f, -1.5066f, - -1.0582f, -1.9509f, -0.0417f, -0.1315f, -0.3368f, 0.0014f, -0.5734f, - -1.4640f, -1.6042f, 3.3911f, -1.6815f, -1.9026f, -4.8702f, -0.1012f, - -1.4517f, -3.2156f, 0.8448f, 0.2331f, -0.1593f, 2.6627f, -0.8451f, - -1.7382f, 0.9303f, 2.3003f, -0.0659f, 0.5772f, 0.4253f, 0.2083f, - 0.3649f, -0.9198f, -0.2183f, -0.5381f, -1.0831f, 2.0359f, 0.0040f, - -0.0871f, -0.1715f, 2.2453f, 0.5099f, -0.5900f, -0.6313f, -1.3028f, - -1.7257f, 1.4130f, -0.7189f, -0.4336f, 1.9266f, 1.7495f, -0.3321f, - 0.2827f, 0.4015f, -0.5044f, -1.0420f, -0.1258f, -0.0342f, -0.1190f, - -3.1263f, 0.7485f, -0.3161f, -0.2224f, 2.5533f, -0.2121f, -1.3389f, - 0.5556f, -0.9407f, -0.7456f, 1.4137f, -0.0353f, -0.0521f, 2.4382f, - 0.1493f, -11.5631f, -1.6178f, 3.5538f, -3.6538f, -0.5972f, -3.0038f, - -2.1640f, 0.5754f, + 0.3883f, -0.2784f, -0.2850f, 0.4894f, -2.2450f, 0.4511f, -0.1969f, + -0.0077f, -1.4924f, 0.1138f, -2.9848f, 1.0211f, -0.1712f, -0.1952f, + -0.4774f, 0.0761f, -0.3186f, -0.1002f, 0.8663f, 0.5026f, 1.1920f, + 0.9337f, 0.3911f, -0.3841f, -0.0037f, 0.7295f, -0.3183f, 0.1829f, + -1.3670f, -0.1046f, 0.6629f, 0.0619f, -0.1551f, 0.8174f, 2.1521f, + -1.3323f, -0.0527f, -0.5772f, 0.2001f, -0.6270f, -1.0625f, 0.3342f, + 0.6676f, 0.4605f, -2.0049f, 0.7781f, 0.0713f, -0.0824f, -0.4529f, + 0.1757f, -0.1338f, -0.2319f, -0.2864f, 0.1248f, 0.3887f, -0.1676f, + 1.8422f, 0.6435f, 1.2123f, -0.5667f, -0.2423f, -0.0314f, 0.2411f, + -0.5013f, 0.0422f, 0.2559f, 0.4435f, -0.1223f, 1.5167f, 0.3939f, + 1.0898f, 0.0795f, -0.9251f, -0.0813f, -0.5929f, -0.0741f, 4.0687f, + -0.4368f, -0.0984f, 0.0837f, 3.6169f, 0.0662f, -0.1679f, -0.8090f, + -0.2610f, -0.5791f, 0.0642f, -0.2979f, -0.9036f, 0.2898f, 0.3265f, + 0.4660f, -1.6358f, -0.0347f, 0.1087f, 0.0353f, 0.5687f, -0.5242f, + -0.4895f, 0.7693f, -1.3829f, -0.2244f, -0.2880f, 0.0575f, 2.0563f, + -0.2322f, -1.1597f, 1.6125f, -0.0925f, 1.3540f, 0.1432f, 0.3993f, + -0.0303f, -1.1438f, -1.7323f, -0.4329f, 2.9443f, -0.5724f, 0.0122f, + -1.0829f, }; static const float av1_pustats_rate_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = { - 69.1995f, 41.7369f, -1.4885f, -35.785f, 26.1678f, - 58.4472f, 36.2223f, 66.327f, 50.8867f, 2.8306f, + -10.3717f, 37.304f, -36.7221f, -52.7572f, 44.0877f, + 41.1631f, 36.3299f, -48.6087f, -4.5189f, 13.0611f, }; static const float av1_pustats_rate_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = { - 1.811f, 0.9009f, 0.0694f, -0.9985f, -0.039f, - 0.2076f, 0.5643f, 0.5408f, 0.6071f, 0.277f, + 0.8362f, 1.0615f, -1.5178f, -1.2959f, 1.3233f, + 1.4909f, 1.3554f, -0.8626f, -0.618f, -0.9458f, }; static const float av1_pustats_rate_logits_bias[LOGITS_NODES] = { - 39.5529f, + 30.6878f, }; static const NN_CONFIG av1_pustats_rate_nnconfig = { @@ -125,78 +113,70 @@ static const NN_CONFIG av1_pustats_rate_nnconfig = { static const float av1_pustats_dist_hiddenlayer_0_kernel[NUM_FEATURES * HIDDEN_LAYERS_0_NODES] = { - -39.0787f, -212.9998f, -174.2088f, -264.1454f, 292.7151f, -60.8750f, - -5.9915f, 0.0712f, -60.2312f, -0.2020f, -0.2135f, -0.1663f, - -0.0711f, 0.2267f, 0.9152f, -36.1294f, -159.9320f, -222.9809f, - -270.2556f, 300.7162f, 159.9224f, -172.5735f, -7.6852f, 54.3985f, - 110.6721f, 19.2907f, -15.1039f, -0.0457f, 0.3289f, 0.4529f, - -8.2222f, 1.3213f, -0.8378f, -0.2605f, 3.9600f, 17.3407f, - 113.1116f, 34.6326f, 11.6688f, 109.3541f, 240.8123f, 45.0615f, - 80.7443f, 39.2500f, -21.0931f, -27.1989f, -0.4264f, -0.1345f, - 1.6269f, -0.0716f, 0.0989f, -0.1382f, 0.0248f, 0.0913f, - 4.3903f, 244.1014f, 32.2567f, 58.6171f, 62.2273f, -2.8647f, - -227.5659f, 16.0031f, -70.5256f, 23.8071f, 290.7356f, 13.6094f, - -2.1842f, 0.0104f, -2.8760f, 0.3708f, 0.8501f, -3.2964f, - -0.2088f, -0.4474f, 1.2248f, 40.5180f, -130.7891f, -188.1583f, - -174.0906f, 205.9622f, 0.3425f, 0.2531f, 0.2822f, 0.0488f, - 0.1416f, -0.0433f, -0.1195f, -0.0413f, -0.0708f, -0.0787f, - -0.0889f, -0.4022f, -0.5055f, -0.4715f, 0.2315f, 0.1021f, - -0.3676f, -0.3499f, -0.0715f, 0.1913f, 205.7521f, 125.2265f, - 92.0640f, 77.5566f, -164.4280f, -19.3715f, -0.1346f, -0.4060f, - 0.5042f, -0.2395f, -0.1329f, -0.1397f, 0.2175f, 0.2895f, - 5.5019f, 198.9799f, 114.0018f, 94.9015f, 86.8434f, -183.4237f, - 121.5626f, 94.8945f, 65.0803f, 93.6487f, -346.5279f, -47.6168f, - 0.0633f, 0.0135f, -0.0692f, -0.1015f, -0.1146f, -0.1341f, - -0.1175f, 0.4186f, 0.1505f, 130.7402f, 107.8443f, 62.8497f, - 65.3501f, -312.7407f, 282.8321f, 98.1531f, 75.6648f, 25.8733f, - -176.9298f, -37.2695f, -0.3760f, 0.0017f, 0.1030f, -0.1483f, - 0.0787f, -0.0962f, 0.4109f, -0.2292f, 9.1681f, 274.3607f, - 60.9538f, 75.9405f, 68.3776f, -167.3098f, -335.1045f, -69.2583f, - -76.3441f, -16.5793f, 218.5244f, 28.2405f, 0.9169f, -0.0026f, - -0.8077f, -1.5756f, -0.0804f, 0.1404f, 1.2656f, 0.0272f, - -0.2529f, -340.8659f, -112.7778f, -58.3890f, -4.1224f, 108.1709f, - -180.7382f, -93.7114f, -77.8686f, -131.8134f, 353.3893f, 4.8233f, - 0.0205f, 0.0000f, -1.1654f, -0.0161f, -0.0255f, -0.0358f, - -0.0412f, 0.1103f, 0.1041f, -188.9934f, -110.1792f, -88.6301f, - -93.7226f, 336.9746f, + 0.7770f, 1.0881f, 0.0177f, 0.4939f, -0.2541f, -0.2672f, -0.1705f, + -0.1940f, -0.6395f, 1.2928f, 3.6240f, 2.4445f, 1.6790f, 0.0265f, + 0.1897f, 0.1776f, 0.0422f, 0.0197f, -0.0466f, 0.0462f, -1.0827f, + 2.0231f, 1.8044f, 2.7022f, 0.0064f, 0.2255f, -0.0552f, -0.1010f, + -0.0581f, -0.0781f, 0.2614f, -3.4085f, 1.7478f, 0.1155f, -0.1458f, + -0.0031f, -0.1797f, -0.4378f, -0.0539f, 0.0607f, -0.1347f, -0.3142f, + -0.2014f, -0.4484f, -0.2808f, 1.5913f, 0.0046f, -0.0610f, -0.6479f, + -0.7278f, -0.5592f, -0.6695f, -0.8120f, 2.9056f, -1.1501f, 9.3618f, + 4.2486f, 0.0011f, -0.1499f, -0.0834f, 0.1282f, 0.0409f, 0.1670f, + -0.1398f, -0.4661f, 13.7700f, 8.2061f, -0.0685f, 0.0061f, -0.2951f, + 0.0169f, 0.0520f, 0.0040f, 0.0374f, 0.0467f, -0.0107f, 14.2664f, + -2.2489f, -0.2516f, -0.0061f, -0.9921f, 0.1223f, 0.1212f, 0.1199f, + 0.1185f, -0.4867f, 0.0325f, -5.0757f, -8.7853f, 1.0450f, 0.0169f, + 0.5462f, 0.0051f, 0.1330f, 0.0143f, 0.1429f, -0.0258f, 0.2769f, + -12.8839f, 22.3093f, 1.2761f, 0.0037f, -1.2459f, -0.0466f, 0.0003f, + -0.0464f, -0.0067f, 0.2361f, 0.0355f, 23.3833f, 10.9218f, 2.6811f, + 0.0222f, -1.1055f, 0.1825f, 0.0575f, 0.0114f, -0.1259f, 0.3148f, + -2.0047f, 11.9559f, 5.7375f, 0.8802f, 0.0042f, -0.2469f, -0.1040f, + -1.5679f, 0.1969f, -0.0184f, 0.0157f, 0.6688f, 3.4492f, }; static const float av1_pustats_dist_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = - { -175.6918f, 43.4519f, 154.196f, -81.1015f, -0.0758f, - 136.5695f, 110.8713f, 142.029f, -153.0901f, -145.2688f }; + { + 4.5051f, -4.5858f, 1.4693f, 0.f, 3.7968f, -3.6292f, + -7.3112f, 10.9743f, 8.027f, -2.2692f, -8.748f, -1.3689f, + }; static const float av1_pustats_dist_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES * HIDDEN_LAYERS_1_NODES] = { - -0.1727f, -0.2859f, -0.3757f, -0.4260f, -0.5441f, -0.0666f, -0.3792f, - -0.1335f, -0.1521f, -0.0821f, -3.1590f, 0.2711f, 0.5889f, 0.0878f, - 0.4693f, 0.7773f, -9.2989f, 0.0414f, 0.4485f, 22.8958f, -3.7024f, - -2.4672f, -43.2908f, 0.0956f, 0.4431f, 2.3429f, 1.7183f, 0.3985f, - -0.2275f, -3.1583f, -0.3485f, 0.3280f, 0.3763f, 0.2069f, 0.4231f, - 0.7366f, -6.9527f, 0.0713f, 0.1359f, 16.6500f, -1.7655f, -0.1651f, - 0.1280f, -0.2678f, -0.2120f, 1.6243f, 1.8773f, -0.7543f, -0.3292f, - -0.7627f, -0.2001f, -0.1125f, -0.8100f, -0.1866f, 0.0567f, -0.4002f, - 3.2429f, 0.6427f, -0.3759f, -11.6518f, -2.2893f, 0.7708f, -1.8637f, - 1.7148f, 0.3124f, -0.7129f, -0.4927f, 0.1964f, -0.2570f, -25.0783f, - 2.5061f, 0.1457f, -1.1239f, 0.0570f, -0.2526f, -0.0669f, 0.6791f, - 1.1531f, -0.7246f, -0.3180f, -0.0015f, -0.0061f, -0.1626f, -0.0181f, - 0.1271f, -0.0140f, -0.6027f, 0.0736f, -0.0157f, 1.2420f, -6.4055f, - 0.2128f, -0.0386f, 0.3446f, 0.1840f, -0.7208f, -1.6979f, -0.0442f, - 0.3230f, -1.9745f, + -0.0182f, -0.0925f, -0.0311f, -0.2962f, 0.1177f, -0.0027f, -0.2136f, + -1.2094f, 0.0935f, -0.1403f, -0.1477f, -0.0752f, 0.1519f, -0.4726f, + -0.3521f, 0.4199f, -0.0168f, -0.2927f, -0.2510f, 0.0706f, -0.2920f, + 0.2046f, -0.0400f, -0.2114f, 0.4240f, -0.7070f, 0.4964f, 0.4471f, + 0.3841f, -0.0918f, -0.6140f, 0.6056f, -0.1123f, 0.3944f, -0.0178f, + -1.7702f, -0.4434f, 0.0560f, 0.1565f, -0.0793f, -0.0041f, 0.0052f, + -0.1843f, 0.2400f, -0.0605f, 0.3196f, -0.0286f, -0.0002f, -0.0595f, + -0.0493f, -0.2636f, -0.3994f, -0.1871f, -0.3298f, -0.0788f, -1.0685f, + 0.1900f, -0.5549f, -0.1350f, -0.0153f, -0.1195f, -0.5874f, 1.0468f, + 0.0212f, -0.2306f, -0.2677f, -0.3000f, -1.0702f, -0.1725f, -0.0656f, + -0.0226f, 0.0616f, -0.3453f, 0.0810f, 0.4838f, -0.3780f, -1.4486f, + 0.7777f, -0.0459f, -0.6568f, 0.0589f, -1.0286f, -0.6001f, 0.0826f, + 0.4794f, -0.0586f, -0.1759f, 0.3811f, -0.1313f, 0.3829f, -0.0968f, + -2.0445f, -0.3566f, -0.1491f, -0.0745f, -0.0202f, 0.0839f, 0.0470f, + -0.2432f, 0.3013f, -0.0743f, -0.3479f, 0.0749f, -5.2490f, 0.0209f, + -0.1653f, -0.0826f, -0.0535f, 0.3225f, -0.3786f, -0.0104f, 0.3091f, + 0.3652f, 0.1757f, -0.3252f, -1.1022f, -0.0574f, -0.4473f, 0.3469f, + -0.5539f, }; static const float av1_pustats_dist_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = - { 0.f, 70.3414f, 9.6036f, -118.1096f, 49.2507f, - 95.1849f, 81.8015f, 167.0967f, -337.7945f, 169.8344f }; + { + 11.9337f, -0.3681f, -6.1324f, 12.674f, 9.0956f, + 4.6069f, -4.4158f, -12.4848f, 10.8473f, 5.7633f, + }; static const float av1_pustats_dist_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = { - -0.3627f, 1.2272f, 0.2201f, -1.7406f, -0.6885f, - 0.8487f, -0.2761f, 0.7731f, -5.2096f, -0.7351f, + 0.3245f, 0.2979f, -0.157f, -0.1441f, 0.1413f, + -0.7496f, -0.1737f, -0.5322f, 0.0748f, 0.2518f, }; static const float av1_pustats_dist_logits_bias[LOGITS_NODES] = { - 48.2331f, + 4.6065f, }; static const NN_CONFIG av1_pustats_dist_nnconfig = { diff --git a/third_party/aom/av1/encoder/rate_distortion_model_params.h b/third_party/aom/av1/encoder/rate_distortion_model_params.h new file mode 100644 index 000000000..14d23f10f --- /dev/null +++ b/third_party/aom/av1/encoder/rate_distortion_model_params.h @@ -0,0 +1,591 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_ +#define AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/ml.h" + +// 22 float features + +// 2 categorical features with 4 possible values, converted to one-hot vectors. +// So, total 22 + 2 * 4 = 30 features. +#define NUM_FEATURES 30 +#define NUM_HIDDEN_LAYERS 1 +#define NUM_HIDDEN_NODES 96 +#define NUM_OUTPUTS 1 + +//------------------------------------------------------------------------------ +// RDCost model + +static const float + av1_rdcost_model_nn_weights_layer0[NUM_FEATURES * NUM_HIDDEN_NODES] = { + -0.0699f, 0.2790f, 0.1915f, 0.2669f, 0.4637f, 0.4095f, + 0.2129f, 0.0634f, 0.2306f, -0.2232f, -0.5711f, -0.6493f, + -0.7406f, -0.8440f, 0.4105f, 0.1392f, 0.5218f, -0.1618f, + -0.1719f, 0.3409f, 0.1111f, -0.3609f, -0.2929f, 0.3869f, + -0.5373f, 0.0700f, 0.2572f, 0.2483f, -0.0314f, 0.5228f, + 0.0169f, -0.1357f, 0.0419f, -0.1722f, 0.1303f, 0.1198f, + -0.0013f, 0.1309f, 0.0293f, -0.1941f, 0.0668f, -0.0643f, + -0.0381f, 0.1249f, -0.0731f, -0.1649f, 0.0964f, 0.0270f, + 0.1354f, 0.0538f, -0.2064f, -0.2067f, -0.0569f, 0.0449f, + 0.1680f, -0.0732f, -0.0785f, 0.1884f, -0.2137f, -0.0189f, + 0.2976f, 0.2818f, -0.0222f, 0.2658f, 0.0488f, 0.2778f, + -0.1110f, 0.2069f, -0.0072f, -0.0095f, -0.1105f, -0.1365f, + -0.4245f, -0.4751f, -0.0736f, 0.2333f, 0.0653f, -0.0249f, + 0.0055f, -0.0838f, -0.0489f, -0.2597f, 0.2621f, -0.0251f, + -0.0545f, 0.0816f, -0.0816f, 0.3396f, -0.1047f, 0.3678f, + 0.1487f, -0.0270f, 0.2574f, 0.1018f, 0.2560f, -0.0598f, + -0.0446f, -0.1792f, 0.5336f, -0.1590f, -0.9820f, -0.6514f, + -0.6304f, -0.8359f, -0.0699f, 0.0295f, -0.0057f, -0.3088f, + -0.1466f, 0.2220f, -0.1980f, -0.3400f, -0.1228f, 0.2667f, + -0.4816f, 0.0155f, -0.0194f, 0.2051f, 0.0513f, 0.1575f, + -121.4240f, -126.6840f, -124.1106f, -127.6184f, -85.0333f, -26.6396f, + 2.7020f, 102.0452f, -85.5128f, 0.0076f, 122.2206f, 107.5265f, + 108.3773f, 93.4847f, 20.3705f, -89.6993f, -176.9070f, -41.7543f, + -123.0293f, -91.6437f, -205.7099f, -62.5346f, -83.2987f, 21.3830f, + 56.6341f, -120.8647f, -127.7562f, -121.6688f, -127.4225f, -74.8045f, + -15.9247f, -14.6468f, -14.7788f, -15.4498f, -18.5514f, -11.1579f, + -5.8164f, -3.4318f, 0.8100f, 0.0642f, 203.5111f, 189.6872f, + 190.4776f, 176.4784f, -4.9427f, -12.5324f, -7.6861f, 21.9182f, + -6.7864f, -7.1906f, -8.1292f, 21.4780f, -7.8016f, -5.2653f, + 61.8526f, -15.5105f, -14.6900f, -14.1459f, -15.4350f, -19.1379f, + -0.7876f, -1.8558f, -4.6035f, -6.8405f, -0.2904f, 2.3202f, + 1.8127f, -2.9397f, -0.8187f, -0.6098f, 22.6173f, 10.3668f, + 12.9363f, 2.4541f, 6.6700f, 0.3804f, -3.3117f, 8.5464f, + -25.8348f, 1.8698f, -9.5753f, 8.5558f, -16.3576f, 7.2217f, + 35.3115f, -1.1447f, -2.6530f, -4.7027f, -5.7024f, -0.9513f, + 0.8393f, 0.7085f, 0.7879f, 0.3728f, 3.0574f, 1.1360f, + 26.0531f, 4.1084f, -1.7340f, 0.1683f, -450.7927f, -444.5818f, + -442.5239f, -438.1168f, 2.4924f, -0.0147f, -0.0797f, -47.5322f, + -1.7638f, -0.8608f, -0.6500f, -44.4326f, -0.9027f, 2.5560f, + -267.6517f, 0.2642f, 0.9457f, 0.7944f, 0.3609f, 3.2742f, + -74.3400f, -81.6894f, -76.2162f, -69.2979f, -90.2476f, -39.7389f, + 2.2545f, 36.5095f, -60.1129f, -1.0383f, 87.0348f, 83.9940f, + 83.7199f, 80.8609f, 14.9075f, -78.7405f, -74.3549f, -4.2382f, + -23.9739f, -91.8469f, -67.2654f, -21.5293f, -9.9857f, 11.8391f, + 35.8223f, -74.2551f, -81.0729f, -73.8347f, -70.3798f, -86.8052f, + 0.1701f, -0.1136f, 0.0060f, -0.0496f, -0.1727f, 0.0195f, + -0.1040f, 0.1027f, 0.0467f, -0.2538f, -0.1322f, 0.0860f, + 0.0093f, -0.2801f, -0.0958f, 0.0497f, -0.0582f, -0.0311f, + 0.1840f, 0.0752f, 0.0282f, 0.0297f, 0.0607f, 0.0650f, + 0.0893f, 0.1297f, 0.0373f, 0.0040f, -0.0973f, 0.0248f, + -0.1419f, 0.0322f, -0.0712f, 0.0860f, -0.0426f, -0.1989f, + 0.1393f, -0.1183f, 0.0735f, -0.1895f, 0.1447f, -0.0056f, + -0.1833f, 0.0884f, 0.0949f, 0.0476f, 0.0551f, 0.2125f, + -0.1537f, -0.0141f, -0.2182f, 0.1567f, 0.0457f, -0.1485f, + -0.1177f, 0.0391f, 0.1982f, -0.1288f, 0.1165f, -0.2019f, + 0.4550f, 0.5179f, 0.4311f, 0.1861f, 0.6199f, 0.4542f, + 0.2034f, 0.1128f, 1.3489f, -0.2525f, -2.1139f, -2.2444f, + -2.3679f, -2.3378f, 0.5682f, 0.1348f, 0.3032f, -1.5835f, + 0.2883f, 0.1693f, 0.0439f, -1.4556f, 0.3818f, 0.4875f, + -1.8899f, 0.2510f, 0.6450f, 0.6082f, 0.5962f, 0.8131f, + 12.0281f, 13.3899f, 13.6249f, 15.8068f, -1.5453f, 6.7456f, + -6.0877f, 26.2596f, 6.2223f, -0.5922f, 134.1428f, 128.8985f, + 128.7538f, 123.0920f, 1.3207f, 18.3069f, 15.7436f, 46.5230f, + 24.7455f, 15.0688f, 19.9965f, 34.7236f, 19.7171f, 1.2018f, + 49.7274f, 11.8957f, 13.1578f, 14.0451f, 15.3544f, -3.5601f, + 1.0048f, 0.9479f, 1.1832f, 2.0635f, -2.9808f, 2.0803f, + -7.5815f, 8.4733f, -4.2008f, 0.1217f, 226.5257f, 210.7018f, + 211.6235f, 195.2605f, 0.8283f, 1.0977f, 1.4858f, 41.1242f, + 1.5822f, 0.8742f, 2.0440f, 33.6213f, 1.6177f, 0.9661f, + 65.0014f, 1.4197f, 1.0109f, 1.3153f, 1.5470f, -3.2833f, + 2.0858f, 2.0012f, 2.1088f, 2.5593f, -0.9422f, 1.8554f, + -6.5378f, 0.6780f, 2.3186f, 0.0506f, 218.3285f, 203.4055f, + 204.0362f, 188.7854f, 0.3701f, 2.5257f, 3.5172f, 28.8144f, + 2.1511f, 3.4676f, 2.6337f, 28.5113f, 2.4254f, -0.0548f, + 59.4511f, 2.0757f, 2.1551f, 2.2271f, 2.5300f, -1.4173f, + 91.9240f, 88.2142f, 83.6155f, 82.2482f, -9.2566f, 10.9654f, + -2.6974f, 62.6750f, -3.6298f, -0.1245f, 69.6721f, 67.1340f, + 66.9162f, 64.1994f, -83.6778f, 76.8107f, 69.7832f, 64.9261f, + 68.4901f, 76.3615f, 70.8108f, 63.5435f, 69.1973f, -83.6034f, + 24.8275f, 90.1923f, 87.6831f, 82.9783f, 81.8558f, -7.1010f, + 95.1656f, 88.3853f, 80.5835f, 79.5990f, -3.0720f, 8.1290f, + -0.6151f, 63.6425f, -4.5833f, -0.0063f, 70.1861f, 66.6250f, + 66.6148f, 63.0886f, -89.2863f, 74.7684f, 64.8897f, 60.4134f, + 62.5241f, 78.7076f, 61.7234f, 60.1688f, 61.9509f, -89.4098f, + 30.3361f, 92.9144f, 88.5954f, 79.6336f, 79.2453f, -0.4101f, + 0.6287f, 0.8050f, 0.4417f, 0.5419f, 0.5972f, 1.3037f, + 0.4316f, -0.0013f, -0.3673f, -0.4952f, 6.1773f, 5.7825f, + 6.1705f, 5.3848f, 1.7607f, -0.0152f, -0.2924f, 0.8199f, + 1.3326f, 0.7197f, -0.6332f, 1.1127f, 1.0472f, 1.8468f, + 3.4419f, 0.8233f, 0.7175f, 0.8514f, 0.6372f, 0.9472f, + -0.0813f, -0.0197f, -0.0096f, -0.2015f, 0.1133f, -0.0305f, + 0.0578f, 0.1375f, -0.0750f, -0.1702f, 0.1246f, -0.1782f, + 0.2017f, 0.0425f, -0.0602f, 0.1837f, 0.1044f, -0.1273f, + -0.1431f, 0.0672f, -0.1807f, -0.1045f, -0.1355f, -0.0497f, + -0.0561f, -0.0633f, 0.1907f, -0.0777f, 0.1203f, 0.0754f, + 0.4079f, 0.2001f, 0.0558f, 0.0622f, 0.2951f, 0.6541f, + -0.0068f, 0.1070f, 0.4469f, -0.1266f, -1.3035f, -1.3324f, + -1.3612f, -0.9966f, 0.7986f, 0.3192f, -0.5028f, -0.3844f, + -0.4079f, 0.6690f, -0.5109f, -0.2719f, -0.4958f, 1.0310f, + -0.8044f, 0.1447f, 0.4221f, 0.3194f, 0.3063f, 0.5520f, + 0.4667f, -5.7238f, -0.5602f, 12.6339f, -15.1865f, -14.9035f, + -3.0726f, 9.5347f, -24.6225f, -2.7086f, 89.8557f, 95.0657f, + 93.8693f, 99.1085f, -35.9483f, -18.0363f, -1.6298f, 25.3484f, + 39.3975f, -15.3199f, 5.7664f, 17.2367f, 25.2788f, -36.5648f, + 29.1426f, 0.3857f, -5.2117f, 0.0533f, 12.1707f, -11.1735f, + 0.2673f, 0.0090f, 0.1574f, 0.0904f, 0.0281f, 0.1144f, + 0.1123f, -0.0061f, 0.0954f, -0.0094f, -0.4387f, -0.5006f, + -0.2560f, -0.2326f, -0.1769f, 0.0465f, 0.1273f, -0.1627f, + 0.2987f, -0.3041f, 0.1131f, -0.3620f, 0.0932f, -0.0649f, + -0.4597f, 0.2535f, -0.0994f, 0.1390f, 0.1279f, 0.4207f, + -39.1159f, -42.6382f, -38.4225f, -31.2301f, -28.2382f, -28.1176f, + -9.5822f, 1.1886f, -1.2964f, -0.7908f, 154.9819f, 147.1914f, + 147.0482f, 138.7535f, -21.7014f, -35.7117f, -28.8802f, -3.8968f, + -21.5007f, -28.2213f, -28.4878f, -3.7558f, -26.8317f, -22.8491f, + 50.9464f, -37.0918f, -42.8811f, -39.3079f, -32.1904f, -26.6354f, + -72.5346f, -75.5751f, -72.6896f, -71.3671f, -35.3279f, -21.6077f, + -5.8259f, 38.7516f, -6.8012f, 0.0172f, 170.0685f, 157.4452f, + 158.2334f, 145.0102f, 10.0653f, -45.1775f, -56.4571f, -5.1165f, + -75.8980f, -46.8672f, -55.3642f, -6.5631f, -81.0258f, 10.1348f, + 55.9786f, -70.8124f, -75.7040f, -73.9831f, -70.8786f, -34.9723f, + 88.6239f, 86.5330f, 80.9333f, 79.6833f, -10.0096f, 10.6312f, + -4.2350f, 62.6230f, -3.2991f, -0.0843f, 75.8659f, 72.7886f, + 72.5301f, 68.8265f, -81.8276f, 70.3025f, 62.9511f, 62.5706f, + 69.1842f, 69.3637f, 65.4820f, 65.4357f, 71.5347f, -82.1064f, + 24.1925f, 86.2418f, 85.4985f, 80.4091f, 79.5378f, -9.3877f, + -7.6594f, -4.9581f, -10.6385f, -20.2307f, -44.2261f, -13.7557f, + -4.5344f, 18.1793f, -10.5522f, -1.5878f, 110.3187f, 102.4945f, + 102.3305f, 94.1324f, -25.2665f, 9.8172f, -4.4791f, 69.4972f, + -6.7571f, 5.8378f, -11.6101f, 70.7066f, -4.9327f, -24.0513f, + 41.4598f, -7.0600f, -7.0940f, -10.2478f, -18.9616f, -46.7505f, + 90.9365f, 86.0260f, 73.2934f, 69.3406f, 3.3863f, 3.8524f, + 0.6536f, 63.2150f, -10.6304f, 0.0291f, 73.0071f, 69.7660f, + 69.0457f, 65.5611f, -92.3379f, 74.2756f, 54.5025f, 84.3183f, + 53.7481f, 73.5624f, 55.3827f, 82.3242f, 53.5432f, -92.5355f, + 25.3457f, 89.1858f, 84.4763f, 72.9840f, 69.1889f, 4.6719f, + -0.0129f, 0.1995f, 0.2069f, 0.0358f, 0.1209f, -0.1185f, + -0.1217f, -0.1456f, 0.0125f, -0.1354f, 0.0510f, -0.0572f, + 0.1397f, 0.1453f, -0.0086f, 0.0107f, 0.0232f, 0.1508f, + 0.0884f, -0.0967f, -0.1786f, 0.1361f, -0.1399f, -0.2021f, + -0.0242f, -0.2169f, 0.0133f, 0.0116f, -0.1489f, -0.0093f, + -0.0796f, 0.1507f, 0.0906f, 0.0228f, -0.0166f, -0.1875f, + 0.0471f, 0.1184f, -0.0007f, -0.2732f, -0.1386f, -0.2057f, + -0.0213f, -0.1699f, 0.0996f, 0.1562f, 0.1850f, -0.0362f, + -0.2059f, 0.0258f, -0.0135f, -0.1276f, 0.0034f, 0.2023f, + 0.0857f, -0.0085f, -0.1955f, -0.1666f, -0.0920f, 0.0971f, + -0.0292f, -0.0512f, -0.0753f, -0.0739f, -0.0873f, -0.1200f, + 0.0220f, -0.1359f, 0.2013f, -0.0445f, 0.1143f, -0.1484f, + -0.1556f, -0.0003f, 0.1711f, -0.0724f, -0.0531f, 0.1126f, + 0.0476f, -0.0057f, 0.0088f, 0.0792f, -0.0438f, -0.1118f, + -0.0244f, 0.0712f, 0.0930f, -0.0203f, 0.1662f, -0.0695f, + -12.3872f, -18.7022f, -13.4237f, -1.4731f, -18.6843f, -14.1515f, + -7.5057f, 40.2090f, -2.7774f, -1.8433f, 123.6006f, 119.0557f, + 118.2758f, 113.6423f, -32.6216f, -19.5865f, -16.2897f, 17.2068f, + 6.3559f, -17.8742f, 0.7098f, 11.5970f, -10.1104f, -33.1830f, + 39.5617f, -10.5499f, -17.8137f, -14.7185f, -2.6172f, -14.6004f, + 0.3893f, 0.4443f, 0.5305f, 0.3049f, 0.8316f, 0.8679f, + 0.2265f, 0.2393f, 1.1970f, -0.2891f, -1.8666f, -1.8266f, + -1.6984f, -1.8787f, 0.8706f, 0.4208f, 0.5076f, -0.8436f, + -0.1623f, 0.8008f, 0.1512f, -1.0839f, -0.3002f, 0.9263f, + -1.3031f, 0.5964f, 0.3413f, 0.5551f, 0.2618f, 0.7018f, + -0.1320f, -0.1944f, -0.0209f, -0.0877f, 0.0721f, -0.0840f, + 0.0589f, 0.1019f, 0.1927f, -0.2011f, -0.1117f, 0.1575f, + 0.1080f, -0.0516f, 0.2154f, -0.1231f, 0.0426f, -0.0522f, + -0.1824f, -0.1923f, -0.1206f, -0.1724f, -0.0798f, 0.0401f, + -0.2170f, 0.0293f, -0.0853f, 0.1517f, 0.2128f, -0.1934f, + 0.0406f, 0.0517f, 0.0822f, -0.0150f, 0.0943f, -0.0989f, + -0.1802f, -0.1453f, -0.1967f, -0.1797f, 0.1545f, -0.1217f, + 0.1755f, -0.1604f, -0.0515f, 0.0509f, 0.0310f, -0.1220f, + -0.1770f, -0.0157f, 0.1989f, -0.0069f, 0.1766f, 0.1267f, + -0.0517f, -0.0396f, 0.0346f, 0.1946f, 0.1162f, -0.1345f, + -106.6179f, -110.5917f, -107.5476f, -108.0601f, -61.1687f, -22.4247f, + 2.6632f, 109.5208f, -66.1177f, 0.0062f, 159.9339f, 144.7755f, + 145.5032f, 128.9872f, 18.9180f, -75.3569f, -105.0866f, -52.0704f, + -119.1299f, -74.7543f, -109.9468f, -59.0682f, -104.5754f, 19.2878f, + 67.2573f, -104.8061f, -111.8610f, -106.6751f, -107.3537f, -56.4758f, + -0.6967f, -0.8495f, -0.9586f, -1.0461f, 1.4522f, -0.2762f, + 28.2828f, 2.9157f, -2.1062f, 0.1566f, -467.2388f, -461.0685f, + -459.0092f, -453.8370f, 1.5422f, -0.8186f, -0.4884f, -53.0399f, + -2.0255f, -1.1348f, -1.1039f, -50.2489f, -1.4821f, 1.8021f, + -258.0319f, -1.0865f, -0.5542f, -1.0443f, -1.2732f, 1.8413f, + 0.2377f, 0.1937f, -0.0116f, 0.0935f, -0.0599f, 0.0118f, + -0.0875f, 0.0455f, -0.1301f, -0.1081f, -0.2622f, -0.1960f, + 0.0393f, -0.1490f, 0.1852f, -0.0964f, -0.0741f, 0.0419f, + 0.1162f, -0.0274f, 0.1200f, -0.0333f, -0.1337f, 0.2141f, + 0.0664f, 0.1044f, -0.1744f, 0.1060f, -0.1468f, 0.0679f, + 0.0218f, 0.0494f, 0.1064f, 0.1363f, 0.0013f, 0.1331f, + -0.2095f, 0.2088f, -0.0399f, -0.1811f, 0.0678f, -0.1974f, + 0.1855f, -0.0968f, -0.2008f, 0.0162f, -0.0096f, -0.1493f, + 0.2170f, -0.1248f, -0.2055f, 0.1276f, -0.0269f, -0.1697f, + -0.0662f, 0.1073f, -0.0029f, -0.1051f, -0.1573f, 0.2106f, + -0.2020f, -0.1565f, 0.0335f, -0.1818f, -0.1665f, 0.2169f, + 0.1974f, -0.1470f, -0.1738f, -0.2038f, 0.0558f, -0.0441f, + 0.0065f, -0.1485f, -0.1366f, -0.2131f, 0.1042f, 0.0349f, + -0.1804f, -0.1361f, -0.0116f, -0.1012f, -0.0860f, 0.0606f, + -0.2077f, 0.1826f, -0.1014f, -0.0721f, -0.1517f, 0.1022f, + -0.1110f, -0.0186f, 0.1505f, 0.1797f, 0.0911f, 0.0340f, + 0.1702f, -0.1404f, -0.0566f, -0.2744f, -0.1943f, -0.1871f, + 0.0046f, 0.0306f, -0.0436f, 0.1625f, -0.1302f, 0.0175f, + 0.1570f, -0.1425f, 0.0779f, 0.1398f, 0.0929f, 0.0897f, + 0.0458f, -0.0936f, 0.1321f, -0.1355f, 0.0974f, 0.0457f, + -73.3516f, -75.0655f, -72.1062f, -72.4624f, -34.8640f, -14.3727f, + -4.4720f, 66.4982f, -18.8358f, 0.0397f, 174.2172f, 160.4959f, + 161.1034f, 147.3250f, 9.5507f, -45.0180f, -73.1609f, -1.5230f, + -74.8677f, -43.8559f, -68.7622f, -4.8971f, -82.1922f, 9.6490f, + 64.7115f, -71.8566f, -75.3879f, -72.5479f, -71.7161f, -34.8056f, + 0.1442f, 0.1558f, 0.1267f, -0.1261f, -0.0506f, -0.0823f, + -0.1807f, -0.0889f, -0.2098f, -0.1295f, -0.2046f, -0.1749f, + -0.1197f, -0.1380f, 0.0799f, -0.0889f, -0.1209f, 0.1919f, + 0.1947f, -0.2086f, -0.1042f, -0.0468f, 0.0232f, 0.1052f, + -0.0535f, 0.1398f, 0.1713f, -0.1522f, 0.1453f, 0.0286f, + -64.8503f, -67.6746f, -63.6497f, -60.4614f, -35.6091f, -20.1605f, + -3.6082f, 84.2801f, -37.8552f, -2.2371f, 132.4947f, 123.5057f, + 123.5776f, 113.9060f, -14.8772f, -40.7130f, -79.1391f, -10.7024f, + -65.7831f, -43.6078f, -79.6847f, -13.0743f, -69.2533f, -16.0171f, + 50.4868f, -64.3678f, -68.7061f, -64.0823f, -59.3413f, -28.9405f, + 77.1601f, 75.4899f, 69.8696f, 67.8764f, -22.7548f, 5.9814f, + -3.2826f, 57.9754f, -5.9500f, -0.0014f, 77.2251f, 74.0737f, + 73.7004f, 70.5072f, -80.9661f, 69.3065f, 55.8337f, 76.8831f, + 57.9902f, 63.4765f, 56.4748f, 70.0282f, 61.0874f, -81.3960f, + 26.2594f, 76.0367f, 74.9115f, 69.2361f, 66.9262f, -20.1637f, + 0.1886f, -0.1108f, 0.1262f, 0.0189f, 0.1382f, 0.0859f, + -0.1874f, -0.1986f, -0.0171f, -0.1400f, -0.2944f, -0.0750f, + -0.0395f, -0.2092f, -0.0878f, 0.1216f, -0.0870f, -0.1613f, + 0.2495f, 0.0754f, 0.0244f, -0.1205f, -0.0196f, -0.1729f, + 0.1170f, 0.1585f, 0.1482f, -0.1705f, -0.1337f, 0.0199f, + 13.0897f, 9.1111f, 6.7413f, 6.3907f, -28.1187f, 0.4556f, + -5.3116f, 30.7293f, -16.3644f, -0.0365f, 118.9118f, 111.6125f, + 111.3227f, 103.4680f, -30.1883f, 8.9328f, -4.1876f, 79.3936f, + -9.0522f, 12.7861f, -1.2736f, 78.0446f, -5.9485f, -30.5716f, + 27.8951f, 13.9613f, 6.7173f, 5.2345f, 8.3271f, -27.3705f, + 1.0488f, 1.0864f, 1.0710f, 1.7332f, -3.0561f, 1.1622f, + -7.6688f, 3.0491f, -1.3865f, 0.0769f, 222.5451f, 207.8170f, + 208.1767f, 193.1396f, 0.4447f, 2.1654f, 1.8929f, 35.1469f, + 1.1783f, 2.6199f, 1.1611f, 26.2989f, 3.4446f, 0.1551f, + 65.6529f, 1.2229f, 0.9851f, 1.0241f, 1.4373f, -3.3421f, + 0.1388f, 0.0756f, 0.2047f, 0.1140f, 0.0945f, 0.2038f, + 0.1038f, -0.2068f, -0.0626f, -0.1937f, 0.1347f, -0.0464f, + -0.0866f, 0.0250f, 0.0264f, -0.1556f, -0.1625f, 0.1028f, + -0.1255f, -0.0854f, 0.1033f, 0.0008f, -0.2133f, -0.0317f, + 0.1725f, -0.1054f, -0.1900f, 0.0383f, 0.0440f, -0.1900f, + -30.0811f, -30.9929f, -29.3194f, -26.8347f, -20.5957f, -4.1595f, + -1.9066f, 42.4707f, -9.0435f, 0.0064f, 175.7328f, 163.1350f, + 163.5085f, 151.1648f, 4.4620f, -20.6011f, -19.3402f, 1.5468f, + -32.0920f, -25.4581f, -12.3706f, -2.1636f, -32.4569f, 3.9365f, + 61.0117f, -28.4195f, -31.0837f, -30.2749f, -27.5522f, -22.8688f, + -0.3000f, 0.0092f, -0.3675f, -0.4113f, 0.0033f, 0.1138f, + 0.2182f, -0.5803f, 0.7507f, -0.2529f, -1.7724f, -1.4702f, + -1.5805f, -1.4294f, 0.1435f, -0.0168f, 0.2356f, -0.4373f, + -0.4500f, -0.4803f, -0.0041f, -0.3878f, 0.1321f, 0.2761f, + -1.1975f, -0.3509f, -0.0465f, -0.4050f, -0.1110f, 0.2233f, + 0.0950f, 0.0974f, -0.1600f, -0.1753f, -0.0328f, 0.0741f, + -0.0706f, 0.1839f, -0.0833f, -0.1367f, -0.1094f, -0.1739f, + -0.1069f, 0.0370f, -0.1404f, 0.1631f, -0.1570f, 0.2117f, + -0.1891f, 0.0395f, 0.1081f, 0.1760f, 0.0997f, 0.0853f, + -0.1018f, 0.1306f, -0.0924f, -0.2078f, 0.0801f, -0.0949f, + 0.5803f, 0.5578f, 0.4089f, 0.1912f, 0.6774f, 0.3145f, + 0.3992f, -0.1316f, 1.3142f, -0.2457f, -2.3536f, -2.4939f, + -2.3165f, -2.4879f, 0.2321f, 0.1901f, 0.1789f, -1.5215f, + 0.2645f, 0.2231f, 0.2411f, -1.2361f, 0.2971f, 0.1421f, + -1.6715f, 0.3158f, 0.2476f, 0.3596f, 0.3029f, 0.9297f, + -88.8401f, -89.5209f, -86.1926f, -87.4196f, -39.6504f, -17.9684f, + -4.2702f, 80.2017f, -29.1676f, -0.4190f, 150.2820f, 138.4751f, + 139.1087f, 126.6569f, 13.7188f, -57.0739f, -80.3383f, -18.8351f, + -87.4103f, -56.0072f, -82.7707f, -23.1871f, -93.6787f, 13.9287f, + 59.6213f, -87.4843f, -90.4227f, -86.2635f, -86.6841f, -37.9086f, + 0.1184f, -0.2169f, -0.1915f, 0.0543f, 0.1253f, -0.1370f, + 0.0836f, -0.1198f, 0.1544f, -0.2004f, -0.1118f, -0.0786f, + 0.1517f, -0.1000f, -0.1055f, 0.0936f, -0.1579f, 0.1098f, + -0.0234f, -0.0499f, 0.0951f, -0.1711f, 0.0186f, -0.2008f, + 0.1777f, 0.1386f, -0.1495f, -0.0684f, -0.2149f, -0.1198f, + -0.6205f, -0.7209f, -0.5487f, -0.9080f, 1.3400f, 0.0085f, + 28.2837f, 3.2217f, -1.8463f, 0.1620f, -464.3599f, -458.4327f, + -455.9967f, -451.0393f, 1.6619f, -0.6944f, -0.3167f, -52.3630f, + -1.6971f, -0.7340f, -0.8923f, -49.2771f, -1.1177f, 1.8810f, + -258.9386f, -1.0765f, -0.7279f, -0.5208f, -0.8839f, 1.8175f, + -78.8510f, -80.5740f, -77.8843f, -77.9798f, -36.5560f, -16.0818f, + -5.5362f, 66.4228f, -16.8150f, 0.0036f, 181.8365f, 167.7181f, + 168.2344f, 153.9725f, 11.2659f, -47.5786f, -92.6978f, 6.7573f, + -68.7704f, -48.3850f, -95.3637f, 8.8888f, -76.9497f, 11.2243f, + 60.9020f, -77.6515f, -80.7610f, -78.4537f, -77.4659f, -36.2872f, + -0.0936f, 0.1966f, -0.2121f, 0.0193f, 0.0489f, -0.1445f, + 0.0060f, 0.0358f, -0.0783f, -0.0985f, -0.2072f, -0.0802f, + -0.0185f, 0.1868f, -0.0631f, 0.1260f, -0.0675f, 0.2167f, + -0.2174f, -0.1085f, 0.1483f, -0.1655f, -0.1040f, 0.1605f, + -0.1673f, -0.0148f, -0.1856f, -0.1454f, 0.1603f, -0.1620f, + -0.9205f, -1.2716f, -3.6561f, -5.0834f, -0.7934f, 1.8710f, + 2.2999f, -2.9516f, -1.7631f, -0.3804f, 41.2998f, 26.2358f, + 28.9763f, 15.7315f, 5.2164f, 3.2963f, -5.4457f, 18.6310f, + -25.0076f, 5.4368f, -12.0085f, 17.1462f, -14.6992f, 5.6365f, + 48.6207f, -1.0921f, -1.8723f, -3.5354f, -5.1774f, -1.0200f, + -0.1065f, -0.2021f, 0.0332f, 0.1692f, -0.1239f, 0.1325f, + -0.0660f, -0.0567f, 0.2107f, -0.2084f, -0.0263f, 0.1411f, + 0.0178f, 0.0451f, 0.2024f, -0.1756f, -0.0771f, -0.1690f, + -0.2097f, -0.2130f, 0.0714f, 0.0172f, -0.0310f, 0.0649f, + -0.1550f, 0.0701f, 0.0306f, -0.1750f, -0.1988f, -0.2060f, + 0.0005f, -0.1325f, -0.1823f, -0.0900f, -0.1291f, -0.1817f, + 0.0144f, 0.0951f, -0.1954f, -0.0171f, -0.1985f, 0.0875f, + 0.0901f, -0.0857f, 0.1681f, 0.0465f, 0.1023f, 0.0985f, + -0.2152f, -0.1723f, -0.0825f, 0.0203f, -0.1206f, -0.1431f, + -0.1552f, 0.1344f, 0.0398f, 0.0169f, 0.2180f, -0.1530f, + 2.7964f, 2.7312f, 2.8831f, 3.4729f, -3.1366f, 2.4043f, + -7.2004f, 1.4128f, 2.8648f, 0.0578f, 225.5640f, 210.3712f, + 210.6907f, 195.0339f, 0.3140f, 1.8060f, 2.7355f, 33.6917f, + 3.3542f, 3.3682f, 1.7371f, 31.2424f, 3.4094f, -0.1192f, + 63.0864f, 3.0562f, 2.8633f, 2.6777f, 3.5495f, -4.2616f, + -1.4034f, 0.3930f, -4.6756f, -9.9870f, -27.8511f, 5.6071f, + -1.0862f, 34.4907f, -10.4831f, -0.0281f, 117.2617f, 104.9590f, + 106.1515f, 93.9707f, -16.8801f, 5.3036f, -21.7458f, 98.5306f, + -20.7596f, 6.4733f, -17.6440f, 98.3097f, -31.9540f, -17.0600f, + 27.4543f, -0.6140f, -1.6182f, -4.9167f, -8.9017f, -26.2485f, + -0.1952f, -0.0462f, -0.1958f, 0.1679f, -0.1592f, -0.1634f, + -0.0507f, -0.0542f, 0.0038f, -0.0343f, 0.0567f, -0.1983f, + 0.0250f, -0.0762f, 0.0902f, -0.0343f, 0.1240f, 0.1161f, + 0.1237f, 0.1870f, 0.0346f, 0.0340f, 0.0625f, -0.0355f, + 0.0278f, -0.1043f, 0.1755f, 0.0253f, 0.1750f, -0.2070f, + -5.5531f, -5.3122f, -4.9348f, -4.4782f, -7.5686f, -1.5478f, + -5.4341f, 0.5087f, -2.1382f, 0.0798f, 208.3677f, 194.0083f, + 194.4168f, 179.3082f, 1.4443f, -1.5038f, -1.4021f, 25.9363f, + -4.0635f, -2.6785f, -1.6640f, 22.2589f, -1.4910f, 1.4715f, + 59.1972f, -4.9638f, -5.1920f, -4.9193f, -5.2649f, -8.0556f, + 20.1226f, 12.0195f, 9.7385f, 10.7058f, -27.4201f, 8.4869f, + -5.0826f, 32.9212f, -2.0674f, -0.0290f, 120.5002f, 112.3222f, + 112.3287f, 104.1107f, -20.6293f, 14.8534f, -0.8748f, 103.1141f, + -1.1368f, 15.3716f, 2.7653f, 91.7285f, -0.5991f, -20.7338f, + 35.9363f, 20.5104f, 11.1988f, 9.0368f, 10.6355f, -26.5309f, + -0.2058f, -0.2176f, 0.1331f, -0.1415f, -0.0825f, -0.0470f, + -0.0615f, 0.1274f, 0.0076f, -0.0575f, -0.2065f, 0.0866f, + 0.2166f, -0.1942f, -0.1952f, 0.1323f, -0.1016f, 0.1803f, + -0.0424f, 0.1555f, 0.1118f, 0.1559f, 0.0337f, -0.0341f, + -0.0430f, 0.1988f, -0.0553f, -0.0255f, 0.1817f, 0.0608f, + 0.1431f, 0.0686f, -0.0245f, -0.2107f, 0.2001f, -0.0964f, + -0.0090f, 0.1151f, -0.0365f, -0.1986f, 0.1740f, -0.2098f, + 0.0013f, 0.1369f, 0.1910f, 0.1801f, -0.2019f, 0.0348f, + -0.1175f, 0.0627f, -0.1929f, -0.0099f, 0.1349f, 0.1804f, + -0.1071f, -0.1651f, -0.1146f, -0.0259f, 0.1626f, -0.0271f, + 0.1393f, 0.1304f, -0.0200f, 0.0924f, -0.0839f, -0.0031f, + -0.1311f, 0.0350f, -0.1330f, -0.0911f, 0.1949f, -0.0209f, + -0.1883f, 0.0269f, 0.2040f, 0.1552f, 0.1532f, 0.1157f, + -0.1102f, -0.1220f, -0.0808f, -0.1050f, 0.1716f, 0.0846f, + -0.0180f, -0.1037f, 0.2063f, 0.1237f, 0.1253f, -0.0496f, + -0.0183f, 0.0491f, 0.1703f, -0.0824f, -0.0702f, -0.1100f, + -0.0965f, 0.0130f, -0.1222f, -0.1081f, 0.0329f, 0.2115f, + -0.1438f, 0.0799f, -0.1602f, -0.0330f, 0.0501f, 0.1072f, + -0.0744f, -0.1783f, -0.0240f, 0.0777f, -0.1944f, 0.0438f, + -0.0033f, -0.1873f, 0.0984f, -0.0318f, 0.0773f, 0.1489f, + 0.3966f, 0.4711f, 0.3972f, 0.0623f, 0.5970f, 0.1018f, + 0.1375f, -0.1881f, 0.8921f, -0.1854f, -2.1138f, -2.1178f, + -1.8295f, -2.1703f, 0.5784f, -0.1937f, -0.0728f, -0.9953f, + 0.2442f, -0.4074f, -0.1591f, -1.1660f, 0.4832f, 0.2203f, + -1.4957f, 0.1544f, 0.1810f, 0.2275f, 0.4075f, 0.8153f, + 0.0715f, 0.0222f, 0.0463f, -0.0201f, 0.0396f, 0.5951f, + -0.2779f, -0.0306f, 0.7532f, -0.1596f, -4.1080f, -3.7925f, + -3.8522f, -3.2468f, 0.7728f, 0.0188f, -0.1448f, 0.4084f, + -0.4666f, -0.1036f, -1.1469f, 0.4243f, 0.2778f, 0.9023f, + -3.0216f, 0.0384f, -0.3348f, -0.0314f, -0.2788f, 0.0479f, + 139.0773f, 131.6164f, 115.0392f, 111.1817f, 41.7596f, 9.5379f, + 1.8542f, 46.9890f, -12.8221f, 0.0241f, 52.9779f, 51.5268f, + 50.8060f, 48.7028f, -132.9665f, 118.3478f, 101.1239f, 81.4608f, + 75.4251f, 121.0643f, 97.8947f, 86.8911f, 74.5576f, -133.7606f, + 29.2657f, 135.8916f, 131.3661f, 114.1687f, 111.0784f, 31.3790f, + -0.0807f, -0.0657f, -0.0027f, 0.0410f, 0.0765f, 0.1194f, + 0.0953f, -0.0060f, 0.1531f, -0.2339f, 0.1488f, -0.0615f, + -0.0579f, 0.0761f, 0.1250f, -0.0469f, 0.1480f, 0.0683f, + -0.0049f, 0.1558f, 0.2168f, -0.0736f, 0.1135f, -0.1244f, + 0.0725f, -0.1297f, -0.0215f, -0.0412f, -0.1632f, -0.0200f, + -0.1346f, -0.1954f, 0.0053f, 0.0151f, 0.1379f, -0.1497f, + -0.0102f, -0.0336f, 0.0900f, -0.1706f, -0.0932f, -0.2084f, + 0.1242f, -0.2027f, 0.0849f, -0.2139f, -0.2015f, 0.0944f, + -0.0984f, 0.2082f, 0.1625f, -0.0227f, -0.1676f, 0.1021f, + 0.1516f, 0.0245f, 0.0955f, -0.1488f, -0.0057f, 0.1783f, + -0.8568f, -0.8175f, -0.6282f, -1.3107f, 1.5712f, 0.1044f, + 28.2289f, 3.0885f, -1.9829f, 0.1600f, -465.9583f, -459.5893f, + -457.5055f, -452.7600f, 1.7229f, -0.6620f, -0.1065f, -52.8017f, + -2.0293f, -0.8224f, -1.0389f, -49.9049f, -1.2250f, 1.7647f, + -259.2465f, -1.0978f, -0.5169f, -0.8721f, -0.8197f, 1.9158f, + 16.2234f, 15.8523f, 13.8343f, 9.8509f, -21.4326f, 15.7650f, + -6.4451f, 34.8575f, 1.1387f, -0.0223f, 117.7213f, 109.8494f, + 109.7624f, 101.8532f, -20.3275f, 16.0812f, 4.9165f, 92.4919f, + 4.1615f, 13.8451f, 9.2112f, 97.1580f, -8.7037f, -20.4420f, + 27.1105f, 17.4922f, 13.9998f, 12.3888f, 11.4705f, -20.9568f, + 0.5457f, 0.5322f, 0.2823f, 0.3581f, 0.5359f, 0.1576f, + 0.1969f, -0.0136f, -0.2748f, -0.3168f, -0.3918f, -0.2167f, + -0.1797f, -0.1869f, 0.2986f, -0.2116f, -0.4226f, -0.2022f, + 0.9452f, 0.5474f, -0.1218f, 0.2067f, -0.1600f, 0.1937f, + 0.0808f, 0.4877f, 0.5106f, 0.2626f, 0.5076f, 0.6228f, + 0.5124f, 0.4044f, 0.4023f, 0.1222f, 2.5446f, 0.9623f, + 24.9875f, 4.7442f, -2.0551f, 0.1642f, -449.9478f, -444.1841f, + -442.0153f, -437.1498f, 2.3209f, -0.6986f, -0.3456f, -47.4074f, + -1.2374f, -1.0939f, -0.9112f, -41.1851f, -0.5064f, 2.4209f, + -263.4446f, -0.0433f, 0.3460f, 0.1475f, 0.3770f, 2.9154f, + 0.2032f, 0.1527f, 0.2161f, -0.1981f, 0.1893f, -0.2003f, + 0.1734f, 0.1713f, 0.1207f, -0.2073f, -0.1018f, 0.0770f, + 0.0728f, 0.1665f, 0.0689f, 0.1884f, -0.1399f, -0.1326f, + -0.0518f, -0.1948f, 0.1576f, -0.1835f, 0.1436f, 0.0497f, + 0.0883f, -0.1253f, -0.0417f, -0.0507f, -0.1555f, 0.2076f, + -2.4080f, 6.1616f, -0.8564f, -13.6773f, -32.7238f, -16.3144f, + -1.9828f, 20.5110f, -17.0191f, -1.7154f, 103.6642f, 95.3675f, + 95.5662f, 86.9504f, -35.5340f, 19.6681f, -2.4900f, 65.0847f, + -15.8119f, 13.7256f, -4.6753f, 63.4713f, -6.5992f, -34.2369f, + 41.3959f, -1.5528f, 3.8106f, -0.7762f, -12.3204f, -35.1734f, + -83.9509f, -87.4861f, -83.5925f, -81.5047f, -54.1256f, -45.7506f, + -13.5325f, -6.0331f, -8.5062f, 0.0261f, 189.9450f, 177.7870f, + 178.6945f, 164.9762f, 9.8521f, -68.0619f, -68.6145f, 6.5056f, + -55.9651f, -66.9540f, -65.3349f, -2.1954f, -57.2408f, 8.6577f, + 60.6966f, -82.1056f, -88.5245f, -83.3057f, -80.7283f, -50.5285f, + -0.1397f, 0.1862f, -0.0691f, -0.0906f, 0.1560f, 0.1377f, + -0.0066f, -0.0213f, 0.0708f, -0.0386f, -0.0015f, -0.0020f, + -0.2122f, 0.0747f, 0.0795f, 0.0229f, 0.1923f, -0.1661f, + 0.0895f, 0.1176f, 0.1398f, -0.0443f, 0.0934f, 0.0638f, + -0.1924f, 0.0602f, 0.0404f, 0.1597f, 0.1387f, -0.0601f, + -28.3967f, -21.8483f, -25.5175f, -29.9252f, 2.0161f, -3.0092f, + 7.7435f, 28.2367f, -35.0188f, -0.1578f, 105.0164f, 93.4495f, + 94.9134f, 81.0315f, 4.3602f, 8.1303f, -37.7665f, -16.6986f, + -40.8902f, 8.2542f, -33.3215f, -2.0457f, -69.0245f, 4.1016f, + 47.2770f, -25.8268f, -23.6034f, -26.4339f, -27.8305f, 8.4468f, + 13.8742f, 8.3874f, 4.2044f, 1.4619f, -40.2909f, -0.6358f, + -0.7982f, 36.1931f, -17.3147f, -0.3348f, 106.8135f, 96.5298f, + 97.8829f, 86.9994f, -25.8170f, 15.0652f, -0.9181f, 85.8544f, + 2.5475f, 9.8009f, -3.5931f, 89.2017f, -3.7252f, -25.2986f, + 22.5505f, 14.0434f, 7.0708f, 4.6646f, 1.5807f, -39.4024f, + -0.1436f, 0.0256f, 0.0274f, -0.2126f, 0.0401f, 0.0745f, + -0.0379f, -0.0357f, 0.0777f, -0.0709f, -0.1093f, -0.2047f, + -0.0713f, -0.0478f, -0.0908f, 0.1963f, 0.1282f, 0.0977f, + 0.1304f, 0.2058f, 0.0700f, 0.0518f, 0.0239f, 0.0686f, + -0.1909f, 0.0828f, -0.1243f, -0.1920f, 0.1908f, -0.0808f, + 90.8028f, 89.2894f, 84.5339f, 83.3491f, -13.3838f, 12.0240f, + -3.9443f, 63.0867f, -2.5321f, -0.0099f, 68.9140f, 66.3206f, + 66.0278f, 63.1498f, -83.7261f, 74.3448f, 73.4998f, 64.8477f, + 69.7701f, 74.5878f, 71.0331f, 63.2116f, 74.3162f, -83.9282f, + 20.8163f, 89.6818f, 88.6452f, 83.7338f, 82.9360f, -13.2357f, + 0.1299f, -0.1765f, -0.0168f, -0.1372f, -0.1183f, 0.0472f, + 0.1312f, 0.0267f, 0.0194f, -0.1593f, 0.0059f, 0.1775f, + 0.0668f, -0.1239f, -0.1982f, -0.1415f, -0.1659f, -0.1148f, + 0.0136f, 0.0913f, -0.1254f, -0.0357f, 0.0892f, 0.0835f, + -0.0554f, 0.1969f, -0.0888f, -0.0623f, -0.0236f, -0.1492f, + 0.4196f, 0.3218f, 0.2287f, 0.5095f, 0.7210f, 0.2279f, + 0.4523f, -0.1832f, 1.3095f, -0.2041f, -2.1443f, -2.1947f, + -1.9292f, -2.1142f, 0.5840f, 0.1018f, 0.1011f, -1.6565f, + 0.4325f, 0.0424f, 0.2836f, -1.7183f, 0.2595f, 0.2686f, + -1.8784f, 0.3891f, 0.3050f, 0.6195f, 0.2896f, 0.5905f, + -5.3024f, -3.2518f, -12.5192f, -29.1732f, 1.6538f, -1.8315f, + 9.9788f, 10.5155f, 6.3234f, -0.3460f, 76.9925f, 51.3785f, + 55.7120f, 29.0432f, 5.5901f, 25.6578f, -3.9565f, 13.0509f, + -106.0371f, 23.2124f, -18.2004f, 8.4618f, -69.3585f, 5.5651f, + 80.0565f, -6.4941f, -5.3742f, -14.4209f, -24.1565f, 6.6801f, + -22.0585f, -20.9909f, -26.7939f, -29.6890f, -14.5085f, 2.1866f, + -4.2608f, 17.3977f, -30.8824f, -0.4017f, 135.6957f, 126.9320f, + 127.0044f, 118.1835f, -1.8768f, -0.8629f, -32.0882f, 44.7862f, + -23.9174f, 1.6485f, -27.9940f, 51.9078f, -48.5279f, -1.7550f, + 49.9230f, -19.9785f, -22.4647f, -27.6911f, -27.3197f, -10.6545f, + -0.1922f, -0.1999f, -0.1396f, 0.1065f, 0.0085f, -0.1940f, + 0.0351f, 0.1285f, -0.0292f, -0.1296f, 0.1543f, -0.2082f, + -0.1758f, 0.0719f, 0.0764f, 0.1394f, -0.0255f, -0.0370f, + 0.1615f, -0.0568f, 0.1920f, -0.1631f, 0.0199f, 0.1884f, + 0.0693f, 0.1074f, -0.0273f, 0.1540f, 0.0098f, 0.2111f, + 0.1805f, -0.0555f, 0.1159f, 0.0469f, 0.1789f, -0.1711f, + -0.1304f, 0.1912f, -0.0737f, -0.1408f, 0.1804f, -0.2023f, + -0.0467f, -0.1019f, -0.0136f, 0.0691f, 0.1454f, -0.0213f, + 0.0929f, -0.0958f, 0.1299f, 0.1137f, 0.1175f, 0.1042f, + -0.2081f, -0.0737f, 0.0582f, 0.1640f, 0.2120f, -0.0646f, + -0.0326f, 0.1976f, 0.1182f, -0.1365f, -0.1784f, 0.2113f, + 0.0469f, 0.0763f, -0.0197f, -0.1902f, 0.1259f, 0.1598f, + -0.0180f, -0.1339f, -0.1675f, -0.1884f, -0.1973f, 0.1529f, + 0.1160f, 0.2154f, -0.1446f, -0.1395f, 0.0355f, 0.1513f, + -0.2086f, -0.1135f, -0.1502f, -0.0018f, 0.0486f, -0.0110f, + -0.0843f, -0.0716f, -0.1367f, 0.0753f, 0.0114f, 0.0475f, + -0.0632f, 0.2045f, -0.0512f, -0.0906f, -0.1071f, -0.1957f, + 0.1361f, 0.1821f, -0.1684f, -0.1383f, 0.1059f, 0.1579f, + -0.0064f, -0.1205f, -0.0718f, -0.1323f, -0.0174f, -0.1092f, + -0.1915f, 0.1978f, -0.1245f, 0.1297f, -0.1542f, 0.1556f, + -0.1752f, 0.0718f, -0.1020f, -0.1970f, 0.0518f, -0.0888f, + 0.0541f, -0.1922f, -0.1467f, -0.0653f, -0.1940f, -0.0800f, + -0.1096f, -0.0796f, -0.1310f, 0.0191f, -0.1077f, -0.0973f, + 0.1566f, 0.0074f, 0.0500f, -0.0415f, -0.2116f, 0.0227f, + 0.0895f, 0.1528f, 0.1404f, 0.0467f, 0.0462f, -0.0973f, + -0.1669f, 0.0551f, 0.1167f, -0.1470f, -0.0542f, -0.1006f, + 0.2104f, 0.1039f, -0.0211f, -0.1726f, -0.0694f, -0.0270f, + 0.0277f, -0.0715f, -0.2055f, -0.1502f, -0.1718f, -0.0043f, + 0.0174f, 0.1019f, -0.0233f, -0.1518f, -0.1331f, -0.0001f, + -0.1483f, -0.2115f, 0.0666f, 0.0014f, 0.1601f, -0.0690f, + }; + +static const float av1_rdcost_model_nn_biases_layer0[NUM_HIDDEN_NODES] = { + 0.156824f, 0.f, 0.130013f, 0.084482f, -129.058197f, -15.090252f, + -3.859116f, 0.736356f, -81.361557f, -0.001922f, -0.000713f, 0.440181f, + 14.982646f, 1.282223f, 2.23122f, 94.26635f, 93.920929f, 0.614672f, + 0.f, 0.315858f, 4.746014f, 0.116901f, -35.661354f, -75.148285f, + 92.006989f, -14.112332f, 86.673157f, -0.000307f, -0.000544f, 0.f, + -7.851313f, 0.505186f, 0.f, 0.f, -111.681091f, -0.937782f, + 0.035789f, 0.f, 0.f, -0.00102f, -75.180527f, 0.f, + -63.821148f, 79.592392f, 0.085068f, 11.184906f, 1.25406f, 0.f, + -29.779242f, -0.181732f, 0.f, 0.425554f, -90.78405f, 0.f, + -0.828326f, -81.132179f, 0.f, -2.757063f, 0.f, 0.f, + 2.967951f, -4.440599f, 0.f, -5.105355f, 14.734543f, 0.f, + 0.f, 0.f, 0.f, 0.295342f, -0.026907f, 133.375412f, + -0.000855f, 0.f, -0.875029f, 15.665165f, 0.437296f, 0.321257f, + -0.001932f, -4.235782f, -87.187782f, 0.f, -28.84696f, 7.055514f, + 0.f, 95.548302f, -0.000425f, 0.38969f, -13.88008f, -27.347931f, + 0.f, 0.f, 0.f, -0.000026f, 0.f, 0.f, +}; + +static const float + av1_rdcost_model_nn_weights_layer1[NUM_HIDDEN_NODES * NUM_OUTPUTS] = { + -0.101706f, -0.14411f, -0.139118f, -0.132945f, 118.811302f, + 3.137232f, -32.969776f, -4.150725f, 26.263071f, 0.092841f, + 0.174125f, -0.028195f, 15.712872f, 17.722702f, 5.666006f, + -121.143929f, -131.933731f, -3.000318f, -0.032063f, -0.380065f, + -1.660653f, -0.164802f, 7.177527f, 87.759155f, -119.564224f, + -98.051651f, -110.581116f, -0.069982f, 0.023906f, 0.183792f, + 40.606274f, -0.080804f, -0.053744f, -0.187848f, 157.44313f, + -4.820149f, 0.089499f, 0.070232f, -0.043038f, 0.072996f, + 93.347313f, 0.225259f, 103.223228f, -110.682541f, 0.14314f, + -89.827538f, 6.505952f, -0.076949f, 73.816132f, -0.063416f, + -0.23736f, -0.066059f, 116.049599f, 0.120871f, -4.708246f, + 107.501671f, -0.206708f, -32.688675f, 0.047608f, -0.105907f, + 6.505825f, -75.461891f, -0.160341f, 6.532121f, -84.868111f, + -0.065622f, 0.044756f, 0.008672f, 0.017155f, 0.046108f, + -0.218818f, -126.507957f, 0.028271f, 0.180625f, -4.707376f, + -121.524307f, -0.03853f, -4.103166f, -0.018947f, -95.768463f, + 15.941695f, 0.147154f, -102.863029f, -72.521698f, -0.037133f, + -138.1492f, 0.210016f, -0.084692f, -68.693665f, -52.523472f, + -0.133385f, -0.17438f, 0.008654f, -0.035642f, -0.145202f, + 0.211135f, + }; + +static const float av1_rdcost_model_nn_biases_layer1[NUM_OUTPUTS] = { + 0.251909f +}; + +static const NN_CONFIG av1_rdcost_model_nnconfig = { + NUM_FEATURES, + NUM_OUTPUTS, + NUM_HIDDEN_LAYERS, + { + NUM_HIDDEN_NODES, + }, + { + av1_rdcost_model_nn_weights_layer0, + av1_rdcost_model_nn_weights_layer1, + }, + { + av1_rdcost_model_nn_biases_layer0, + av1_rdcost_model_nn_biases_layer1, + }, +}; + +//------------------------------------------------------------------------------ + +#undef NUM_FEATURES +#undef NUM_HIDDEN_LAYERS +#undef NUM_HIDDEN_NODES +#undef NUM_OUTPUTS + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_ diff --git a/third_party/aom/av1/encoder/ratectrl.c b/third_party/aom/av1/encoder/ratectrl.c index ac9392fa1..3aae0144e 100644 --- a/third_party/aom/av1/encoder/ratectrl.c +++ b/third_party/aom/av1/encoder/ratectrl.c @@ -421,9 +421,9 @@ void av1_rc_update_rate_correction_factors(AV1_COMP *cpi, int width, projected_size_based_on_q = av1_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor); } else { - projected_size_based_on_q = - av1_estimate_bits_at_q(cpi->common.frame_type, cm->base_qindex, MBs, - rate_correction_factor, cm->bit_depth); + projected_size_based_on_q = av1_estimate_bits_at_q( + cpi->common.frame_type, cm->base_qindex, MBs, rate_correction_factor, + cm->seq_params.bit_depth); } // Work out a size correction factor. if (projected_size_based_on_q > FRAME_OVERHEAD_BITS) @@ -495,7 +495,7 @@ int av1_rc_regulate_q(const AV1_COMP *cpi, int target_bits_per_frame, (int)av1_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor); } else { bits_per_mb_at_this_q = (int)av1_rc_bits_per_mb( - cm->frame_type, i, correction_factor, cm->bit_depth); + cm->frame_type, i, correction_factor, cm->seq_params.bit_depth); } if (bits_per_mb_at_this_q <= target_bits_per_mb) { @@ -643,7 +643,8 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width, int active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi); int q; int *rtc_minq; - ASSIGN_MINQ_TABLE(cm->bit_depth, rtc_minq); + const int bit_depth = cm->seq_params.bit_depth; + ASSIGN_MINQ_TABLE(bit_depth, rtc_minq); if (frame_is_intra_only(cm)) { active_best_quality = rc->best_quality; @@ -652,17 +653,17 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width, // based on the ambient Q to reduce the risk of popping. if (rc->this_key_frame_forced) { int qindex = rc->last_boosted_qindex; - double last_boosted_q = av1_convert_qindex_to_q(qindex, cm->bit_depth); - int delta_qindex = av1_compute_qdelta( - rc, last_boosted_q, (last_boosted_q * 0.75), cm->bit_depth); + double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); + int delta_qindex = av1_compute_qdelta(rc, last_boosted_q, + (last_boosted_q * 0.75), bit_depth); active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); } else if (cm->current_video_frame > 0) { // not first frame of one pass and kf_boost is set double q_adj_factor = 1.0; double q_val; - active_best_quality = get_kf_active_quality( - rc, rc->avg_frame_qindex[KEY_FRAME], cm->bit_depth); + active_best_quality = + get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME], bit_depth); // Allow somewhat lower kf minq with small image formats. if ((width * height) <= (352 * 288)) { @@ -671,9 +672,9 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width, // Convert the adjustment factor to a qindex delta // on active_best_quality. - q_val = av1_convert_qindex_to_q(active_best_quality, cm->bit_depth); + q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth); active_best_quality += - av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth); + av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth); } } else if (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { @@ -686,7 +687,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width, } else { q = active_worst_quality; } - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + active_best_quality = get_gf_active_quality(rc, q, bit_depth); } else { // Use the lower of active_worst_quality and recent/average Q. if (cm->current_video_frame > 1) { @@ -716,8 +717,8 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width, !(cm->current_video_frame == 0)) { int qdelta = 0; aom_clear_system_state(); - qdelta = av1_compute_qdelta_by_rate( - &cpi->rc, cm->frame_type, active_worst_quality, 2.0, cm->bit_depth); + qdelta = av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, + active_worst_quality, 2.0, bit_depth); *top_index = active_worst_quality + qdelta; *top_index = AOMMAX(*top_index, *bottom_index); } @@ -768,27 +769,27 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width, int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi); int q; int *inter_minq; - ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq); + const int bit_depth = cm->seq_params.bit_depth; + ASSIGN_MINQ_TABLE(bit_depth, inter_minq); if (frame_is_intra_only(cm)) { if (oxcf->rc_mode == AOM_Q) { const int qindex = cq_level; - const double q_val = av1_convert_qindex_to_q(qindex, cm->bit_depth); + const double q_val = av1_convert_qindex_to_q(qindex, bit_depth); const int delta_qindex = - av1_compute_qdelta(rc, q_val, q_val * 0.25, cm->bit_depth); + av1_compute_qdelta(rc, q_val, q_val * 0.25, bit_depth); active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); } else if (rc->this_key_frame_forced) { const int qindex = rc->last_boosted_qindex; - const double last_boosted_q = - av1_convert_qindex_to_q(qindex, cm->bit_depth); + const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); const int delta_qindex = av1_compute_qdelta( - rc, last_boosted_q, last_boosted_q * 0.75, cm->bit_depth); + rc, last_boosted_q, last_boosted_q * 0.75, bit_depth); active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); } else { // not first frame of one pass and kf_boost is set double q_adj_factor = 1.0; - active_best_quality = get_kf_active_quality( - rc, rc->avg_frame_qindex[KEY_FRAME], cm->bit_depth); + active_best_quality = + get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME], bit_depth); // Allow somewhat lower kf minq with small image formats. if ((width * height) <= (352 * 288)) { @@ -798,9 +799,9 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width, // Convert the adjustment factor to a qindex delta on active_best_quality. { const double q_val = - av1_convert_qindex_to_q(active_best_quality, cm->bit_depth); + av1_convert_qindex_to_q(active_best_quality, bit_depth); active_best_quality += - av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth); + av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth); } } } else if (!rc->is_src_frame_alt_ref && @@ -815,30 +816,30 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width, // For constrained quality dont allow Q less than the cq level if (oxcf->rc_mode == AOM_CQ) { if (q < cq_level) q = cq_level; - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + active_best_quality = get_gf_active_quality(rc, q, bit_depth); // Constrained quality use slightly lower active best. active_best_quality = active_best_quality * 15 / 16; } else if (oxcf->rc_mode == AOM_Q) { const int qindex = cq_level; - const double q_val = av1_convert_qindex_to_q(qindex, cm->bit_depth); + const double q_val = av1_convert_qindex_to_q(qindex, bit_depth); const int delta_qindex = (cpi->refresh_alt_ref_frame) - ? av1_compute_qdelta(rc, q_val, q_val * 0.40, cm->bit_depth) - : av1_compute_qdelta(rc, q_val, q_val * 0.50, cm->bit_depth); + ? av1_compute_qdelta(rc, q_val, q_val * 0.40, bit_depth) + : av1_compute_qdelta(rc, q_val, q_val * 0.50, bit_depth); active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); } else { - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + active_best_quality = get_gf_active_quality(rc, q, bit_depth); } } else { if (oxcf->rc_mode == AOM_Q) { const int qindex = cq_level; - const double q_val = av1_convert_qindex_to_q(qindex, cm->bit_depth); + const double q_val = av1_convert_qindex_to_q(qindex, bit_depth); const double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0, 0.70, 1.0, 0.85, 1.0 }; const int delta_qindex = av1_compute_qdelta( rc, q_val, q_val * delta_rate[cm->current_video_frame % FIXED_GF_INTERVAL], - cm->bit_depth); + bit_depth); active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); } else { // Use the lower of active_worst_quality and recent/average Q. @@ -868,12 +869,12 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width, aom_clear_system_state(); if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced && !(cm->current_video_frame == 0)) { - qdelta = av1_compute_qdelta_by_rate( - &cpi->rc, cm->frame_type, active_worst_quality, 2.0, cm->bit_depth); + qdelta = av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, + active_worst_quality, 2.0, bit_depth); } else if (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { qdelta = av1_compute_qdelta_by_rate( - &cpi->rc, cm->frame_type, active_worst_quality, 1.75, cm->bit_depth); + &cpi->rc, cm->frame_type, active_worst_quality, 1.75, bit_depth); } *top_index = active_worst_quality + qdelta; *top_index = AOMMAX(*top_index, *bottom_index); @@ -908,9 +909,9 @@ int av1_frame_type_qdelta(const AV1_COMP *cpi, int rf_level, int q) { INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME }; const AV1_COMMON *const cm = &cpi->common; - int qdelta = - av1_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level], q, - rate_factor_deltas[rf_level], cm->bit_depth); + int qdelta = av1_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level], q, + rate_factor_deltas[rf_level], + cm->seq_params.bit_depth); return qdelta; } @@ -927,7 +928,15 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, int active_worst_quality = cpi->twopass.active_worst_quality; int q; int *inter_minq; - ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq); + const int bit_depth = cm->seq_params.bit_depth; + ASSIGN_MINQ_TABLE(bit_depth, inter_minq); + +#if CUSTOMIZED_GF + const int is_intrl_arf_boost = + gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE; +#else + const int is_intrl_arf_boost = cpi->refresh_alt2_ref_frame; +#endif // CUSTOMIZED_GF if (frame_is_intra_only(cm)) { // Handle the special case for key frames forced when we have reached @@ -941,16 +950,16 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { qindex = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex); active_best_quality = qindex; - last_boosted_q = av1_convert_qindex_to_q(qindex, cm->bit_depth); + last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); delta_qindex = av1_compute_qdelta(rc, last_boosted_q, - last_boosted_q * 1.25, cm->bit_depth); + last_boosted_q * 1.25, bit_depth); active_worst_quality = AOMMIN(qindex + delta_qindex, active_worst_quality); } else { qindex = rc->last_boosted_qindex; - last_boosted_q = av1_convert_qindex_to_q(qindex, cm->bit_depth); + last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); delta_qindex = av1_compute_qdelta(rc, last_boosted_q, - last_boosted_q * 0.75, cm->bit_depth); + last_boosted_q * 0.75, bit_depth); active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); } } else { @@ -960,7 +969,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, // Baseline value derived from cpi->active_worst_quality and kf boost. active_best_quality = - get_kf_active_quality(rc, active_worst_quality, cm->bit_depth); + get_kf_active_quality(rc, active_worst_quality, bit_depth); // Allow somewhat lower kf minq with small image formats. if ((width * height) <= (352 * 288)) { @@ -972,12 +981,12 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, // Convert the adjustment factor to a qindex delta // on active_best_quality. - q_val = av1_convert_qindex_to_q(active_best_quality, cm->bit_depth); + q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth); active_best_quality += - av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth); + av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth); } } else if (!rc->is_src_frame_alt_ref && - (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame || + (cpi->refresh_golden_frame || is_intrl_arf_boost || cpi->refresh_alt_ref_frame)) { // Use the lower of active_worst_quality and recent // average Q as basis for GF/ARF best Q limit unless last frame was @@ -992,24 +1001,45 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, if (oxcf->rc_mode == AOM_CQ) { if (q < cq_level) q = cq_level; - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + active_best_quality = get_gf_active_quality(rc, q, bit_depth); // Constrained quality use slightly lower active best. active_best_quality = active_best_quality * 15 / 16; } else if (oxcf->rc_mode == AOM_Q) { - if (!cpi->refresh_alt_ref_frame && !cpi->refresh_alt2_ref_frame) { + if (!cpi->refresh_alt_ref_frame && !is_intrl_arf_boost) { active_best_quality = cq_level; } else { - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); - - // Modify best quality for second level arfs. For mode AOM_Q this - // becomes the baseline frame q. - if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW) - active_best_quality = (active_best_quality + cq_level + 1) / 2; + active_best_quality = get_gf_active_quality(rc, q, bit_depth); +#if USE_SYMM_MULTI_LAYER + if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) { + int this_height = gf_group->pyramid_level[gf_group->index]; + while (this_height < gf_group->pyramid_height) { + active_best_quality = (active_best_quality + cq_level + 1) / 2; + ++this_height; + } + } else { +#endif + // Modify best quality for second level arfs. For mode AOM_Q this + // becomes the baseline frame q. + if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW) + active_best_quality = (active_best_quality + cq_level + 1) / 2; +#if USE_SYMM_MULTI_LAYER + } +#endif } } else { - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + active_best_quality = get_gf_active_quality(rc, q, bit_depth); +#if USE_SYMM_MULTI_LAYER + if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) { + int this_height = gf_group->pyramid_level[gf_group->index]; + while (this_height < gf_group->pyramid_height) { + active_best_quality = + (active_best_quality + active_worst_quality + 1) / 2; + ++this_height; + } + } +#endif } } else { if (oxcf->rc_mode == AOM_Q) { @@ -1031,7 +1061,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) { if (frame_is_intra_only(cm) || (!rc->is_src_frame_alt_ref && - (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame || + (cpi->refresh_golden_frame || is_intrl_arf_boost || cpi->refresh_alt_ref_frame))) { active_best_quality -= (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast); @@ -1056,7 +1086,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, // Modify active_best_quality for downscaled normal frames. if (av1_frame_scaled(cm) && !frame_is_kf_gf_arf(cpi)) { int qdelta = av1_compute_qdelta_by_rate( - rc, cm->frame_type, active_best_quality, 2.0, cm->bit_depth); + rc, cm->frame_type, active_best_quality, 2.0, bit_depth); active_best_quality = AOMMAX(active_best_quality + qdelta, rc->best_quality); } @@ -1164,6 +1194,16 @@ static void update_alt_ref_frame_stats(AV1_COMP *cpi) { static void update_golden_frame_stats(AV1_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; +#if CUSTOMIZED_GF + const TWO_PASS *const twopass = &cpi->twopass; + const GF_GROUP *const gf_group = &twopass->gf_group; + const int is_intrnl_arf = + cpi->oxcf.pass == 2 + ? gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE + : cpi->refresh_alt2_ref_frame; +#else + const int is_intnl_arf = cpi->refresh_alt2_ref_frame; +#endif // Update the Golden frame usage counts. // NOTE(weitinglin): If we use show_existing_frame for an OVERLAY frame, @@ -1184,14 +1224,7 @@ static void update_golden_frame_stats(AV1_COMP *cpi) { } else if (!rc->source_alt_ref_pending) { rc->source_alt_ref_active = 0; } - - // Decrement count down till next gf - if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--; - - } else if (!cpi->refresh_alt_ref_frame && !cpi->refresh_alt2_ref_frame) { - // Decrement count down till next gf - if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--; - + } else if (!cpi->refresh_alt_ref_frame && !is_intrnl_arf) { rc->frames_since_golden++; } } @@ -1199,6 +1232,17 @@ static void update_golden_frame_stats(AV1_COMP *cpi) { void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { const AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; +#if CUSTOMIZED_GF + const TWO_PASS *const twopass = &cpi->twopass; + const GF_GROUP *const gf_group = &twopass->gf_group; + const int is_intrnl_arf = + cpi->oxcf.pass == 2 + ? gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE + : cpi->refresh_alt2_ref_frame; +#else + const int is_intrnl_arf = cpi->refresh_alt2_ref_frame; +#endif + const int qindex = cm->base_qindex; if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) { @@ -1218,13 +1262,13 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2); } else { if (!rc->is_src_frame_alt_ref && - !(cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame || + !(cpi->refresh_golden_frame || is_intrnl_arf || cpi->refresh_alt_ref_frame)) { rc->last_q[INTER_FRAME] = qindex; rc->avg_frame_qindex[INTER_FRAME] = ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2); rc->ni_frames++; - rc->tot_q += av1_convert_qindex_to_q(qindex, cm->bit_depth); + rc->tot_q += av1_convert_qindex_to_q(qindex, cm->seq_params.bit_depth); rc->avg_q = rc->tot_q / rc->ni_frames; // Calculate the average Q for normal inter frames (not key or GFU // frames). @@ -1240,7 +1284,7 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { // This is used to help set quality in forced key frames to reduce popping if ((qindex < rc->last_boosted_qindex) || (cm->frame_type == KEY_FRAME) || (!rc->constrained_gf_group && - (cpi->refresh_alt_ref_frame || cpi->refresh_alt2_ref_frame || + (cpi->refresh_alt_ref_frame || is_intrnl_arf || (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) { rc->last_boosted_qindex = qindex; } @@ -1591,6 +1635,10 @@ void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi, if (rc->max_gf_interval > rc->static_scene_max_gf_interval) rc->max_gf_interval = rc->static_scene_max_gf_interval; +#if FIX_GF_INTERVAL_LENGTH + rc->max_gf_interval = FIXED_GF_LENGTH + 1; +#endif + // Clamp min to max rc->min_gf_interval = AOMMIN(rc->min_gf_interval, rc->max_gf_interval); } diff --git a/third_party/aom/av1/encoder/ratectrl.h b/third_party/aom/av1/encoder/ratectrl.h index 81157ce72..f0508da9e 100644 --- a/third_party/aom/av1/encoder/ratectrl.h +++ b/third_party/aom/av1/encoder/ratectrl.h @@ -24,6 +24,20 @@ extern "C" { // Bits Per MB at different Q (Multiplied by 512) #define BPER_MB_NORMBITS 9 +#define CUSTOMIZED_GF 1 +#define FIX_GF_INTERVAL_LENGTH 0 + +#if FIX_GF_INTERVAL_LENGTH +#define FIXED_GF_LENGTH 16 +#define USE_SYMM_MULTI_LAYER 1 +#else +#define USE_SYMM_MULTI_LAYER 0 +#endif + +#if USE_SYMM_MULTI_LAYER +#define USE_MANUAL_GF4_STRUCT 0 +#endif + #define MIN_GF_INTERVAL 4 #define MAX_GF_INTERVAL 16 #define FIXED_GF_INTERVAL 8 // Used in some testing modes only diff --git a/third_party/aom/av1/encoder/rd.c b/third_party/aom/av1/encoder/rd.c index 17f23e5ec..c4d4777bf 100644 --- a/third_party/aom/av1/encoder/rd.c +++ b/third_party/aom/av1/encoder/rd.c @@ -44,9 +44,6 @@ #define RD_THRESH_POW 1.25 -// Factor to weigh the rate for switchable interp filters. -#define SWITCHABLE_INTERP_RATE_FACTOR 1 - // The baseline rd thresholds for breaking out of the rd loop for // certain modes are assumed to be based on 8x8 blocks. // This table is used to correct for block size. @@ -357,9 +354,10 @@ static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { }; int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) { - const int64_t q = av1_dc_quant_Q3(qindex, 0, cpi->common.bit_depth); + const int64_t q = + av1_dc_quant_Q3(qindex, 0, cpi->common.seq_params.bit_depth); int64_t rdmult = 0; - switch (cpi->common.bit_depth) { + switch (cpi->common.seq_params.bit_depth) { case AOM_BITS_8: rdmult = 88 * q * q / 24; break; case AOM_BITS_10: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 4); break; case AOM_BITS_12: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 8); break; @@ -394,7 +392,7 @@ static int compute_rd_thresh_factor(int qindex, aom_bit_depth_t bit_depth) { } void av1_initialize_me_consts(const AV1_COMP *cpi, MACROBLOCK *x, int qindex) { - switch (cpi->common.bit_depth) { + switch (cpi->common.seq_params.bit_depth) { case AOM_BITS_8: x->sadperbit16 = sad_per_bit16lut_8[qindex]; x->sadperbit4 = sad_per_bit4lut_8[qindex]; @@ -420,7 +418,7 @@ static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd) { clamp(av1_get_qindex(&cm->seg, segment_id, cm->base_qindex) + cm->y_dc_delta_q, 0, MAXQ); - const int q = compute_rd_thresh_factor(qindex, cm->bit_depth); + const int q = compute_rd_thresh_factor(qindex, cm->seq_params.bit_depth); for (bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { // Threshold here seems unnecessarily harsh but fine given actual diff --git a/third_party/aom/av1/encoder/rd.h b/third_party/aom/av1/encoder/rd.h index 281b676b0..692367d7a 100644 --- a/third_party/aom/av1/encoder/rd.h +++ b/third_party/aom/av1/encoder/rd.h @@ -43,6 +43,9 @@ extern "C" { #define RD_THRESH_MAX_FACT 64 #define RD_THRESH_INC 1 +// Factor to weigh the rate for switchable interp filters. +#define SWITCHABLE_INTERP_RATE_FACTOR 1 + // This enumerator type needs to be kept aligned with the mode order in // const MODE_DEFINITION av1_mode_order[MAX_MODES] used in the rd code. typedef enum { diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c index 6f4fced87..fef6d2875 100644 --- a/third_party/aom/av1/encoder/rdopt.c +++ b/third_party/aom/av1/encoder/rdopt.c @@ -58,8 +58,11 @@ #include "av1/encoder/tokenize.h" #include "av1/encoder/tx_prune_model_weights.h" +#define DNN_BASED_RD_INTERP_FILTER 0 + // Set this macro as 1 to collect data about tx size selection. #define COLLECT_TX_SIZE_DATA 0 + #if COLLECT_TX_SIZE_DATA static const char av1_tx_size_data_output_file[] = "tx_size_data.txt"; #endif @@ -916,9 +919,9 @@ static double od_compute_dist(uint16_t *x, uint16_t *y, int bsize_w, int activity_masking = 0; int i, j; - DECLARE_ALIGNED(16, od_coeff, e[MAX_TX_SQUARE]); - DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]); - DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]); + DECLARE_ALIGNED(16, od_coeff, e[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, od_coeff, tmp[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_SB_SQUARE]); for (i = 0; i < bsize_h; i++) { for (j = 0; j < bsize_w; j++) { e[i * bsize_w + j] = x[i * bsize_w + j] - y[i * bsize_w + j]; @@ -944,9 +947,9 @@ static double od_compute_dist_diff(uint16_t *x, int16_t *e, int bsize_w, int activity_masking = 0; - DECLARE_ALIGNED(16, uint16_t, y[MAX_TX_SQUARE]); - DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]); - DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]); + DECLARE_ALIGNED(16, uint16_t, y[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, od_coeff, tmp[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_SB_SQUARE]); int i, j; for (i = 0; i < bsize_h; i++) { for (j = 0; j < bsize_w; j++) { @@ -975,8 +978,8 @@ int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x, int i, j; const MACROBLOCKD *xd = &x->e_mbd; - DECLARE_ALIGNED(16, uint16_t, orig[MAX_TX_SQUARE]); - DECLARE_ALIGNED(16, uint16_t, rec[MAX_TX_SQUARE]); + DECLARE_ALIGNED(16, uint16_t, orig[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint16_t, rec[MAX_SB_SQUARE]); assert(bsw >= 8); assert(bsh >= 8); @@ -1068,8 +1071,8 @@ static int64_t dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src, int i, j; const MACROBLOCKD *xd = &x->e_mbd; - DECLARE_ALIGNED(16, uint16_t, orig[MAX_TX_SQUARE]); - DECLARE_ALIGNED(16, int16_t, diff16[MAX_TX_SQUARE]); + DECLARE_ALIGNED(16, uint16_t, orig[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, int16_t, diff16[MAX_SB_SQUARE]); assert(bsw >= 8); assert(bsh >= 8); @@ -1112,7 +1115,7 @@ static int64_t dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src, d = (int64_t)od_compute_dist_diff(orig, diff16, bsw, bsh, qindex); } else if (x->tune_metric == AOM_TUNE_CDEF_DIST) { int coeff_shift = AOMMAX(xd->bd - 8, 0); - DECLARE_ALIGNED(16, uint16_t, dst16[MAX_TX_SQUARE]); + DECLARE_ALIGNED(16, uint16_t, dst16[MAX_SB_SQUARE]); for (i = 0; i < bsh; i++) { for (j = 0; j < bsw; j++) { @@ -1146,11 +1149,15 @@ static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize, const int bh = block_size_high[bsize]; unsigned int esq[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - const int f_index = bsize - BLOCK_16X16; - if (f_index < 0) { - const int w_shift = bw == 8 ? 1 : 2; - const int h_shift = bh == 8 ? 1 : 2; - if (cpi->common.use_highbitdepth) { + if (bsize < BLOCK_16X16 || (bsize >= BLOCK_4X16 && bsize <= BLOCK_32X8)) { + // Special cases: calculate 'esq' values manually, as we don't have 'vf' + // functions for the 16 (very small) sub-blocks of this block. + const int w_shift = (bw == 4) ? 0 : (bw == 8) ? 1 : (bw == 16) ? 2 : 3; + const int h_shift = (bh == 4) ? 0 : (bh == 8) ? 1 : (bh == 16) ? 2 : 3; + assert(bw <= 32); + assert(bh <= 32); + assert(((bw - 1) >> w_shift) + (((bh - 1) >> h_shift) << 2) == 15); + if (cpi->common.seq_params.use_highbitdepth) { const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); for (int i = 0; i < bh; ++i) @@ -1168,43 +1175,49 @@ static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize, (src[j + i * src_stride] - dst[j + i * dst_stride]); } } - } else { - cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[0]); - cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, + } else { // Calculate 'esq' values using 'vf' functions on the 16 sub-blocks. + const int f_index = + (bsize < BLOCK_SIZES) ? bsize - BLOCK_16X16 : bsize - BLOCK_8X16; + assert(f_index >= 0 && f_index < BLOCK_SIZES_ALL); + const BLOCK_SIZE subsize = (BLOCK_SIZE)f_index; + assert(block_size_wide[bsize] == 4 * block_size_wide[subsize]); + assert(block_size_high[bsize] == 4 * block_size_high[subsize]); + cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[0]); + cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, &esq[1]); - cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, + cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, &esq[2]); - cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, dst_stride, &esq[3]); src += bh / 4 * src_stride; dst += bh / 4 * dst_stride; - cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[4]); - cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, + cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[4]); + cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, &esq[5]); - cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, + cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, &esq[6]); - cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, dst_stride, &esq[7]); src += bh / 4 * src_stride; dst += bh / 4 * dst_stride; - cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[8]); - cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, + cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[8]); + cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, &esq[9]); - cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, + cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, &esq[10]); - cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, dst_stride, &esq[11]); src += bh / 4 * src_stride; dst += bh / 4 * dst_stride; - cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[12]); - cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, + cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[12]); + cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, &esq[13]); - cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, + cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, &esq[14]); - cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, dst_stride, &esq[15]); } @@ -1371,16 +1384,27 @@ static void get_energy_distribution_finer(const int16_t *diff, int stride, unsigned int esq[256]; const int w_shift = bw <= 8 ? 0 : 1; const int h_shift = bh <= 8 ? 0 : 1; - const int esq_w = bw <= 8 ? bw : bw / 2; - const int esq_h = bh <= 8 ? bh : bh / 2; + const int esq_w = bw >> w_shift; + const int esq_h = bh >> h_shift; const int esq_sz = esq_w * esq_h; int i, j; memset(esq, 0, esq_sz * sizeof(esq[0])); - for (i = 0; i < bh; i++) { - unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w; - const int16_t *cur_diff_row = diff + i * stride; - for (j = 0; j < bw; j++) { - cur_esq_row[j >> w_shift] += cur_diff_row[j] * cur_diff_row[j]; + if (w_shift) { + for (i = 0; i < bh; i++) { + unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w; + const int16_t *cur_diff_row = diff + i * stride; + for (j = 0; j < bw; j += 2) { + cur_esq_row[j >> 1] += (cur_diff_row[j] * cur_diff_row[j] + + cur_diff_row[j + 1] * cur_diff_row[j + 1]); + } + } + } else { + for (i = 0; i < bh; i++) { + unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w; + const int16_t *cur_diff_row = diff + i * stride; + for (j = 0; j < bw; j++) { + cur_esq_row[j] += cur_diff_row[j] * cur_diff_row[j]; + } } } @@ -1558,9 +1582,9 @@ static const float *prune_2D_adaptive_thresholds[] = { NULL, }; -static int prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size, - int blk_row, int blk_col, TxSetType tx_set_type, - TX_TYPE_PRUNE_MODE prune_mode) { +static uint16_t prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size, + int blk_row, int blk_col, TxSetType tx_set_type, + TX_TYPE_PRUNE_MODE prune_mode) { static const int tx_type_table_2D[16] = { DCT_DCT, DCT_ADST, DCT_FLIPADST, V_DCT, ADST_DCT, ADST_ADST, ADST_FLIPADST, V_ADST, @@ -1636,7 +1660,7 @@ static int prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size, const float score_thresh = prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness - 1]; - int prune_bitmask = 0; + uint16_t prune_bitmask = 0; for (int i = 0; i < 16; i++) { if (scores_2D[i] < score_thresh && i != max_score_i) prune_bitmask |= (1 << tx_type_table_2D[i]); @@ -1644,9 +1668,27 @@ static int prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size, return prune_bitmask; } +// ((prune >> vtx_tab[tx_type]) & 1) +static const uint16_t prune_v_mask[] = { + 0x0000, 0x0425, 0x108a, 0x14af, 0x4150, 0x4575, 0x51da, 0x55ff, + 0xaa00, 0xae25, 0xba8a, 0xbeaf, 0xeb50, 0xef75, 0xfbda, 0xffff, +}; + +// ((prune >> (htx_tab[tx_type] + 8)) & 1) +static const uint16_t prune_h_mask[] = { + 0x0000, 0x0813, 0x210c, 0x291f, 0x80e0, 0x88f3, 0xa1ec, 0xa9ff, + 0x5600, 0x5e13, 0x770c, 0x7f1f, 0xd6e0, 0xdef3, 0xf7ec, 0xffff, +}; + +static INLINE uint16_t gen_tx_search_prune_mask(int tx_search_prune) { + uint8_t prune_v = tx_search_prune & 0x0F; + uint8_t prune_h = (tx_search_prune >> 8) & 0x0F; + return (prune_v_mask[prune_v] & prune_h_mask[prune_h]); +} + static void prune_tx(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, const MACROBLOCKD *const xd, int tx_set_type) { - av1_zero(x->tx_search_prune); + x->tx_search_prune[tx_set_type] = 0; x->tx_split_prune_flag = 0; const MB_MODE_INFO *mbmi = xd->mi[0]; if (!is_inter_block(mbmi) || cpi->sf.tx_type_search.prune_mode == NO_PRUNE || @@ -1656,24 +1698,24 @@ static void prune_tx(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, int tx_set = ext_tx_set_index[1][tx_set_type]; assert(tx_set >= 0); const int *tx_set_1D = ext_tx_used_inter_1D[tx_set]; + int prune = 0; switch (cpi->sf.tx_type_search.prune_mode) { case NO_PRUNE: return; case PRUNE_ONE: if (!(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) return; - x->tx_search_prune[tx_set_type] = prune_one_for_sby(cpi, bsize, x, xd); + prune = prune_one_for_sby(cpi, bsize, x, xd); + x->tx_search_prune[tx_set_type] = gen_tx_search_prune_mask(prune); break; case PRUNE_TWO: if (!(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) { if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) return; - x->tx_search_prune[tx_set_type] = - prune_two_for_sby(cpi, bsize, x, xd, 0, 1); - } - if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) { - x->tx_search_prune[tx_set_type] = - prune_two_for_sby(cpi, bsize, x, xd, 1, 0); + prune = prune_two_for_sby(cpi, bsize, x, xd, 0, 1); + } else if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) { + prune = prune_two_for_sby(cpi, bsize, x, xd, 1, 0); + } else { + prune = prune_two_for_sby(cpi, bsize, x, xd, 1, 1); } - x->tx_search_prune[tx_set_type] = - prune_two_for_sby(cpi, bsize, x, xd, 1, 1); + x->tx_search_prune[tx_set_type] = gen_tx_search_prune_mask(prune); break; case PRUNE_2D_ACCURATE: case PRUNE_2D_FAST: break; @@ -1681,17 +1723,6 @@ static void prune_tx(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, } } -static int do_tx_type_search(TX_TYPE tx_type, int prune, - TX_TYPE_PRUNE_MODE mode) { - // TODO(sarahparker) implement for non ext tx - if (mode >= PRUNE_2D_ACCURATE) { - return !((prune >> tx_type) & 1); - } else { - return !(((prune >> vtx_tab[tx_type]) & 1) | - ((prune >> (htx_tab[tx_type] + 8)) & 1)); - } -} - static void model_rd_from_sse(const AV1_COMP *const cpi, const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane, int64_t sse, int *rate, @@ -1764,9 +1795,11 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize, for (plane = plane_from; plane <= plane_to; ++plane) { struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; - const BLOCK_SIZE bs = + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); - unsigned int sse; + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + int64_t sse; int rate; int64_t dist; @@ -1774,14 +1807,14 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize, // TODO(geza): Write direct sse functions that do not compute // variance as well. - cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, - &sse); + sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh); + sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2); - if (plane == 0) x->pred_sse[ref] = sse; + if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); total_sse += sse; - model_rd_from_sse(cpi, xd, bs, plane, sse, &rate, &dist); + model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &rate, &dist); rate_sum += rate; dist_sum += dist; @@ -1934,7 +1967,8 @@ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x, static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane, int blk_row, int blk_col, const BLOCK_SIZE plane_bsize, - const BLOCK_SIZE tx_bsize) { + const BLOCK_SIZE tx_bsize, + int force_sse) { int visible_rows, visible_cols; const MACROBLOCKD *xd = &x->e_mbd; get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL, @@ -1944,13 +1978,17 @@ static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane, #if CONFIG_DIST_8X8 int txb_height = block_size_high[tx_bsize]; int txb_width = block_size_wide[tx_bsize]; - if (x->using_dist_8x8 && plane == 0 && txb_width >= 8 && txb_height >= 8) { + if (!force_sse && x->using_dist_8x8 && plane == 0 && txb_width >= 8 && + txb_height >= 8) { const int src_stride = x->plane[plane].src.stride; const int src_idx = (blk_row * src_stride + blk_col) << tx_size_wide_log2[0]; + const int diff_idx = (blk_row * diff_stride + blk_col) + << tx_size_wide_log2[0]; const uint8_t *src = &x->plane[plane].src.buf[src_idx]; - return dist_8x8_diff(x, src, src_stride, diff, diff_stride, txb_width, - txb_height, visible_cols, visible_rows, x->qindex); + return dist_8x8_diff(x, src, src_stride, diff + diff_idx, diff_stride, + txb_width, txb_height, visible_cols, visible_rows, + x->qindex); } #endif diff += ((blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]); @@ -2182,10 +2220,14 @@ static void get_2x2_normalized_sses_and_sads( for (int col = 0; col < 2; ++col) { const int16_t *const this_src_diff = src_diff + row * half_height * diff_stride + col * half_width; - sse_norm_arr[row * 2 + col] = - get_sse_norm(this_src_diff, diff_stride, half_width, half_height); - sad_norm_arr[row * 2 + col] = - get_sad_norm(this_src_diff, diff_stride, half_width, half_height); + if (sse_norm_arr) { + sse_norm_arr[row * 2 + col] = + get_sse_norm(this_src_diff, diff_stride, half_width, half_height); + } + if (sad_norm_arr) { + sad_norm_arr[row * 2 + col] = + get_sad_norm(this_src_diff, diff_stride, half_width, half_height); + } } } } else { // use function pointers to calculate stats @@ -2199,28 +2241,35 @@ static void get_2x2_normalized_sses_and_sads( const uint8_t *const this_dst = dst + row * half_height * dst_stride + col * half_width; - unsigned int this_sse; - cpi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst, - dst_stride, &this_sse); - sse_norm_arr[row * 2 + col] = (double)this_sse / num_samples_half; + if (sse_norm_arr) { + unsigned int this_sse; + cpi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst, + dst_stride, &this_sse); + sse_norm_arr[row * 2 + col] = (double)this_sse / num_samples_half; + } - const unsigned int this_sad = cpi->fn_ptr[tx_bsize_half].sdf( - this_src, src_stride, this_dst, dst_stride); - sad_norm_arr[row * 2 + col] = (double)this_sad / num_samples_half; + if (sad_norm_arr) { + const unsigned int this_sad = cpi->fn_ptr[tx_bsize_half].sdf( + this_src, src_stride, this_dst, dst_stride); + sad_norm_arr[row * 2 + col] = (double)this_sad / num_samples_half; + } } } } } #if CONFIG_COLLECT_RD_STATS -// NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values -// 0: Do not collect any RD stats -// 1: Collect RD stats for transform units -// 2: Collect RD stats for partition units + // NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values + // 0: Do not collect any RD stats + // 1: Collect RD stats for transform units + // 2: Collect RD stats for partition units + +#if CONFIG_COLLECT_RD_STATS == 1 static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, const RD_STATS *const rd_stats, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, TX_TYPE tx_type) { + TX_SIZE tx_size, TX_TYPE tx_type, + int64_t rd) { if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return; // Generate small sample to restrict output size. @@ -2304,9 +2353,12 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2], hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]); + fprintf(fout, " %d %" PRId64, x->rdmult, rd); + fprintf(fout, "\n"); fclose(fout); } +#endif // CONFIG_COLLECT_RD_STATS == 1 #if CONFIG_COLLECT_RD_STATS == 2 static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, @@ -2327,12 +2379,14 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, const int plane = 0; struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; - const int bw = block_size_wide[plane_bsize]; - const int bh = block_size_high[plane_bsize]; + const int diff_stride = block_size_wide[plane_bsize]; + int bw, bh; + get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw, + &bh); + const int num_samples = bw * bh; const int dequant_shift = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; const int q_step = pd->dequant_Q3[1] >> dequant_shift; - const double num_samples = bw * bh; const double rate_norm = (double)rd_stats->rate / num_samples; const double dist_norm = (double)rd_stats->dist / num_samples; @@ -2343,23 +2397,28 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, const uint8_t *const src = p->src.buf; const int dst_stride = pd->dst.stride; const uint8_t *const dst = pd->dst.buf; - unsigned int sse; - cpi->fn_ptr[plane_bsize].vf(src, src_stride, dst, dst_stride, &sse); + const int16_t *const src_diff = p->src_diff; + const int shift = (xd->bd - 8); + + int64_t sse = aom_sum_squares_2d_i16(src_diff, diff_stride, bw, bh); + sse = ROUND_POWER_OF_TWO(sse, shift * 2); const double sse_norm = (double)sse / num_samples; const unsigned int sad = cpi->fn_ptr[plane_bsize].sdf(src, src_stride, dst, dst_stride); - const double sad_norm = (double)sad / num_samples; + const double sad_norm = + (double)sad / (1 << num_pels_log2_lookup[plane_bsize]); fprintf(fout, " %g %g", sse_norm, sad_norm); - const int diff_stride = block_size_wide[plane_bsize]; - const int16_t *const src_diff = p->src_diff; - double sse_norm_arr[4], sad_norm_arr[4]; get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst, dst_stride, src_diff, diff_stride, sse_norm_arr, sad_norm_arr); + if (shift) { + for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift)); + for (int k = 0; k < 4; ++k) sad_norm_arr[k] /= (1 << shift); + } for (int i = 0; i < 4; ++i) { fprintf(fout, " %g", sse_norm_arr[i]); } @@ -2376,7 +2435,8 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, const double model_dist_norm = (double)model_dist / num_samples; fprintf(fout, " %g %g", model_rate_norm, model_dist_norm); - const double mean = get_mean(src_diff, diff_stride, bw, bh); + double mean = get_mean(src_diff, diff_stride, bw, bh); + mean /= (1 << shift); double hor_corr, vert_corr; get_horver_correlation(src_diff, diff_stride, bw, bh, &hor_corr, &vert_corr); fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr); @@ -2393,20 +2453,19 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, #endif // CONFIG_COLLECT_RD_STATS == 2 #endif // CONFIG_COLLECT_RD_STATS -static void model_rd_with_dnn(const AV1_COMP *const cpi, - const MACROBLOCK *const x, BLOCK_SIZE bsize, - int plane, unsigned int *rsse, int *rate, - int64_t *dist) { +static void model_rd_with_dnn(const AV1_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE plane_bsize, int plane, int64_t *rsse, + int *rate, int64_t *dist) { const MACROBLOCKD *const xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[plane]; - const BLOCK_SIZE plane_bsize = - get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); const int log_numpels = num_pels_log2_lookup[plane_bsize]; - const int num_samples = (1 << log_numpels); const struct macroblock_plane *const p = &x->plane[plane]; - const int bw = block_size_wide[plane_bsize]; - const int bh = block_size_high[plane_bsize]; + int bw, bh; + const int diff_stride = block_size_wide[plane_bsize]; + get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw, + &bh); + const int num_samples = bw * bh; const int dequant_shift = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; const int q_step = pd->dequant_Q3[1] >> dequant_shift; @@ -2415,55 +2474,73 @@ static void model_rd_with_dnn(const AV1_COMP *const cpi, const uint8_t *const src = p->src.buf; const int dst_stride = pd->dst.stride; const uint8_t *const dst = pd->dst.buf; - unsigned int sse; - cpi->fn_ptr[plane_bsize].vf(src, src_stride, dst, dst_stride, &sse); + const int16_t *const src_diff = p->src_diff; + const int shift = (xd->bd - 8); + int64_t sse = aom_sum_squares_2d_i16(p->src_diff, diff_stride, bw, bh); + sse = ROUND_POWER_OF_TWO(sse, shift * 2); const double sse_norm = (double)sse / num_samples; - const int diff_stride = block_size_wide[plane_bsize]; - const int16_t *const src_diff = p->src_diff; + if (sse == 0) { + if (rate) *rate = 0; + if (dist) *dist = 0; + if (rsse) *rsse = sse; + return; + } + if (plane) { + int model_rate; + int64_t model_dist; + model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &model_rate, + &model_dist); + if (rate) *rate = model_rate; + if (dist) *dist = model_dist; + if (rsse) *rsse = sse; + return; + } - double sse_norm_arr[4], sad_norm_arr[4]; + double sse_norm_arr[4]; get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst, dst_stride, src_diff, diff_stride, - sse_norm_arr, sad_norm_arr); - const double mean = get_mean(src_diff, diff_stride, bw, bh); + sse_norm_arr, NULL); + double mean = get_mean(src_diff, bw, bw, bh); + if (shift) { + for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift)); + mean /= (1 << shift); + } const double variance = sse_norm - mean * mean; + assert(variance >= 0.0); const double q_sqr = (double)(q_step * q_step); - const double q_sqr_by_variance = q_sqr / variance; + const double q_sqr_by_sse_norm = q_sqr / (sse_norm + 1.0); double hor_corr, vert_corr; get_horver_correlation(src_diff, diff_stride, bw, bh, &hor_corr, &vert_corr); - double hdist[4] = { 0 }, vdist[4] = { 0 }; - get_energy_distribution_fine(cpi, plane_bsize, src, src_stride, dst, - dst_stride, 1, hdist, vdist); - float features[20]; - features[0] = (float)hdist[0]; - features[1] = (float)hdist[1]; - features[2] = (float)hdist[2]; - features[3] = (float)hdist[3]; - features[4] = (float)hor_corr; - features[5] = (float)log_numpels; - features[6] = (float)mean; - features[7] = (float)q_sqr; - features[8] = (float)q_sqr_by_variance; - features[9] = (float)sse_norm_arr[0]; - features[10] = (float)sse_norm_arr[1]; - features[11] = (float)sse_norm_arr[2]; - features[12] = (float)sse_norm_arr[3]; - features[13] = (float)sse_norm_arr[3]; - features[14] = (float)variance; - features[15] = (float)vdist[0]; - features[16] = (float)vdist[1]; - features[17] = (float)vdist[2]; - features[18] = (float)vdist[3]; - features[19] = (float)vert_corr; - - float rate_f, dist_f; - av1_nn_predict(features, &av1_pustats_dist_nnconfig, &dist_f); + float features[11]; + features[0] = (float)hor_corr; + features[1] = (float)log_numpels; + features[2] = (float)q_sqr; + features[3] = (float)q_sqr_by_sse_norm; + features[4] = (float)sse_norm_arr[0]; + features[5] = (float)sse_norm_arr[1]; + features[6] = (float)sse_norm_arr[2]; + features[7] = (float)sse_norm_arr[3]; + features[8] = (float)sse_norm; + features[9] = (float)variance; + features[10] = (float)vert_corr; + + float rate_f, dist_by_sse_norm_f; + av1_nn_predict(features, &av1_pustats_dist_nnconfig, &dist_by_sse_norm_f); av1_nn_predict(features, &av1_pustats_rate_nnconfig, &rate_f); - const int rate_i = (int)(AOMMAX(0.0, rate_f * (1 << log_numpels)) + 0.5); - const int64_t dist_i = - (int64_t)(AOMMAX(0.0, dist_f * (1 << log_numpels)) + 0.5); + const float dist_f = (float)((double)dist_by_sse_norm_f * (1.0 + sse_norm)); + int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5); + int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5); + + // Check if skip is better + if (RDCOST(x->rdmult, rate_i, dist_i) >= RDCOST(x->rdmult, 0, (sse << 4))) { + dist_i = sse << 4; + rate_i = 0; + } else if (rate_i == 0) { + dist_i = sse << 4; + } + if (rate) *rate = rate_i; if (dist) *dist = dist_i; if (rsse) *rsse = sse; @@ -2488,15 +2565,18 @@ void model_rd_for_sb_with_dnn(const AV1_COMP *const cpi, BLOCK_SIZE bsize, x->pred_sse[ref] = 0; for (int plane = plane_from; plane <= plane_to; ++plane) { - unsigned int sse; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + int64_t sse; int rate; int64_t dist; if (x->skip_chroma_rd && plane) continue; - model_rd_with_dnn(cpi, x, bsize, plane, &sse, &rate, &dist); + model_rd_with_dnn(cpi, x, plane_bsize, plane, &sse, &rate, &dist); - if (plane == 0) x->pred_sse[ref] = sse; + if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); total_sse += sse; rate_sum += rate; @@ -2586,27 +2666,16 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int rate_cost = 0; TX_TYPE txk_start = DCT_DCT; TX_TYPE txk_end = TX_TYPES - 1; - if (!(!is_inter && x->use_default_intra_tx_type) && - !(is_inter && x->use_default_inter_tx_type)) - if (x->rd_model == LOW_TXFM_RD || x->cb_partition_scan) - if (plane == 0) txk_end = DCT_DCT; + if ((!is_inter && x->use_default_intra_tx_type) || + (is_inter && x->use_default_inter_tx_type)) { + txk_start = txk_end = get_default_tx_type(0, xd, tx_size); + } else if (x->rd_model == LOW_TXFM_RD || x->cb_partition_scan) { + if (plane == 0) txk_end = DCT_DCT; + } uint8_t best_txb_ctx = 0; const TxSetType tx_set_type = av1_get_ext_tx_set_type(tx_size, is_inter, cm->reduced_tx_set_used); - int prune = 0; - const int do_prune = plane == 0 && !fast_tx_search && txk_end != DCT_DCT && - !(!is_inter && x->use_default_intra_tx_type) && - !(is_inter && x->use_default_inter_tx_type) && - cpi->sf.tx_type_search.prune_mode > NO_PRUNE; - if (do_prune && is_inter) { - if (cpi->sf.tx_type_search.prune_mode >= PRUNE_2D_ACCURATE) { - prune = prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col, - tx_set_type, cpi->sf.tx_type_search.prune_mode); - } else { - prune = x->tx_search_prune[tx_set_type]; - } - } TX_TYPE uv_tx_type = DCT_DCT; if (plane) { @@ -2615,39 +2684,38 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, av1_get_tx_type(get_plane_type(plane), xd, blk_row, blk_col, tx_size, cm->reduced_tx_set_used); } - if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) { + const uint16_t ext_tx_used_flag = av1_ext_tx_used_flag[tx_set_type]; + if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32 || + ext_tx_used_flag == 0x0001) { txk_start = txk_end = DCT_DCT; } - - int8_t allowed_tx_mask[TX_TYPES] = { 0 }; // 1: allow; 0: skip. - int allowed_tx_num = 0; - if (fast_tx_search) { - allowed_tx_mask[DCT_DCT] = 1; - allowed_tx_mask[H_DCT] = 1; - allowed_tx_mask[V_DCT] = 1; + uint16_t allowed_tx_mask = 0; // 1: allow; 0: skip. + if (txk_start == txk_end) { + allowed_tx_mask = 1 << txk_start; + allowed_tx_mask &= ext_tx_used_flag; + } else if (fast_tx_search) { + allowed_tx_mask = 0x0c01; // V_DCT, H_DCT, DCT_DCT + allowed_tx_mask &= ext_tx_used_flag; } else { - memset(allowed_tx_mask + txk_start, 1, txk_end - txk_start + 1); - } - for (TX_TYPE tx_type = txk_start; tx_type <= txk_end; ++tx_type) { - if (do_prune) { - if (!do_tx_type_search(tx_type, prune, cpi->sf.tx_type_search.prune_mode)) - allowed_tx_mask[tx_type] = 0; - } - if (plane == 0 && allowed_tx_mask[tx_type]) { - if (!av1_ext_tx_used[tx_set_type][tx_type]) - allowed_tx_mask[tx_type] = 0; - else if (!is_inter && x->use_default_intra_tx_type && - tx_type != get_default_tx_type(0, xd, tx_size)) - allowed_tx_mask[tx_type] = 0; - else if (is_inter && x->use_default_inter_tx_type && - tx_type != get_default_tx_type(0, xd, tx_size)) - allowed_tx_mask[tx_type] = 0; - } - allowed_tx_num += allowed_tx_mask[tx_type]; + assert(plane == 0); + allowed_tx_mask = ext_tx_used_flag; + // !fast_tx_search && txk_end != txk_start && plane == 0 + const int do_prune = cpi->sf.tx_type_search.prune_mode > NO_PRUNE; + if (do_prune && is_inter) { + if (cpi->sf.tx_type_search.prune_mode >= PRUNE_2D_ACCURATE) { + const uint16_t prune = + prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col, tx_set_type, + cpi->sf.tx_type_search.prune_mode); + allowed_tx_mask &= (~prune); + } else { + allowed_tx_mask &= (~x->tx_search_prune[tx_set_type]); + } + } } // Need to have at least one transform type allowed. - if (allowed_tx_num == 0) { - allowed_tx_mask[plane ? uv_tx_type : DCT_DCT] = 1; + if (allowed_tx_mask == 0) { + txk_start = txk_end = (plane ? uv_tx_type : DCT_DCT); + allowed_tx_mask = (1 << txk_start); } int use_transform_domain_distortion = @@ -2664,20 +2732,21 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, cpi->sf.use_transform_domain_distortion == 1 && use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD && !x->cb_partition_scan; - if (calc_pixel_domain_distortion_final && allowed_tx_num <= 1) + if (calc_pixel_domain_distortion_final && + (txk_start == txk_end || allowed_tx_mask == 0x0001)) calc_pixel_domain_distortion_final = use_transform_domain_distortion = 0; const uint16_t *eobs_ptr = x->plane[plane].eobs; const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; int64_t block_sse = - pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize); + pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize, 1); if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2); block_sse *= 16; for (TX_TYPE tx_type = txk_start; tx_type <= txk_end; ++tx_type) { - if (!allowed_tx_mask[tx_type]) continue; + if (!(allowed_tx_mask & (1 << tx_type))) continue; if (plane == 0) mbmi->txk_type[txk_type_idx] = tx_type; RD_STATS this_rd_stats; av1_invalid_rd_stats(&this_rd_stats); @@ -2686,8 +2755,8 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, av1_xform_quant( cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type, USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP); - rate_cost = av1_cost_coeffs(cm, x, plane, blk_row, blk_col, block, - tx_size, txb_ctx, use_fast_coef_costing); + rate_cost = av1_cost_coeffs(cm, x, plane, block, tx_size, tx_type, + txb_ctx, use_fast_coef_costing); } else { av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type, AV1_XFORM_QUANT_FP); @@ -2696,13 +2765,18 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, // Calculate distortion quickly in transform domain. dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist, &this_rd_stats.sse); - rate_cost = av1_cost_coeffs(cm, x, plane, blk_row, blk_col, block, - tx_size, txb_ctx, use_fast_coef_costing); + + const int64_t best_rd_ = AOMMIN(best_rd, ref_best_rd); + const int64_t dist_cost_estimate = + RDCOST(x->rdmult, 0, AOMMIN(this_rd_stats.dist, this_rd_stats.sse)); + if (dist_cost_estimate - (dist_cost_estimate >> 3) > best_rd_) continue; + + rate_cost = av1_cost_coeffs(cm, x, plane, block, tx_size, tx_type, + txb_ctx, use_fast_coef_costing); const int64_t rd_estimate = AOMMIN(RDCOST(x->rdmult, rate_cost, this_rd_stats.dist), RDCOST(x->rdmult, 0, this_rd_stats.sse)); - if (rd_estimate - (rd_estimate >> 3) > AOMMIN(best_rd, ref_best_rd)) - continue; + if (rd_estimate - (rd_estimate >> 3) > best_rd_) continue; } av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx, 1, &rate_cost); @@ -2741,7 +2815,7 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, #if CONFIG_COLLECT_RD_STATS == 1 if (plane == 0) { PrintTransformUnitStats(cpi, x, &this_rd_stats, blk_row, blk_col, - plane_bsize, tx_size, tx_type); + plane_bsize, tx_size, tx_type, rd); } #endif // CONFIG_COLLECT_RD_STATS == 1 @@ -3097,6 +3171,7 @@ static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs, MACROBLOCK *x, int *r, int64_t *d, int *s, int64_t *sse, int64_t ref_best_rd) { RD_STATS rd_stats; + av1_subtract_plane(x, bs, 0); x->rd_model = LOW_TXFM_RD; int64_t rd = txfm_yrd(cpi, x, &rd_stats, ref_best_rd, bs, max_txsize_rect_lookup[bs], FTXS_NONE); @@ -3267,7 +3342,7 @@ static int intra_mode_info_cost_y(const AV1_COMP *cpi, const MACROBLOCK *x, const int n_cache = av1_get_palette_cache(xd, 0, color_cache); palette_mode_cost += av1_palette_color_cost_y(&mbmi->palette_mode_info, color_cache, - n_cache, cpi->common.bit_depth); + n_cache, cpi->common.seq_params.bit_depth); palette_mode_cost += av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP); total_rate += palette_mode_cost; @@ -3318,8 +3393,8 @@ static int intra_mode_info_cost_uv(const AV1_COMP *cpi, const MACROBLOCK *x, write_uniform_cost(plt_size, color_map[0]); uint16_t color_cache[2 * PALETTE_MAX_SIZE]; const int n_cache = av1_get_palette_cache(xd, 1, color_cache); - palette_mode_cost += av1_palette_color_cost_uv(pmi, color_cache, n_cache, - cpi->common.bit_depth); + palette_mode_cost += av1_palette_color_cost_uv( + pmi, color_cache, n_cache, cpi->common.seq_params.bit_depth); palette_mode_cost += av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP); total_rate += palette_mode_cost; @@ -3375,6 +3450,7 @@ static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x, } } // RD estimation. + av1_subtract_plane(x, bsize, 0); model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &this_rd_stats.rate, &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse, NULL, NULL, NULL); @@ -3458,10 +3534,10 @@ static void palette_rd_y( return; } PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; - if (cpi->common.use_highbitdepth) + if (cpi->common.seq_params.use_highbitdepth) for (int i = 0; i < k; ++i) - pmi->palette_colors[i] = - clip_pixel_highbd((int)centroids[i], cpi->common.bit_depth); + pmi->palette_colors[i] = clip_pixel_highbd( + (int)centroids[i], cpi->common.seq_params.bit_depth); else for (int i = 0; i < k; ++i) pmi->palette_colors[i] = clip_pixel(centroids[i]); @@ -3514,6 +3590,7 @@ static int rd_pick_palette_intra_sby( MB_MODE_INFO *const mbmi = xd->mi[0]; assert(!is_inter_block(mbmi)); assert(av1_allow_palette(cpi->common.allow_screen_content_tools, bsize)); + const SequenceHeader *const seq_params = &cpi->common.seq_params; int colors, n; const int src_stride = x->plane[0].src.stride; const uint8_t *const src = x->plane[0].src.buf; @@ -3523,9 +3600,9 @@ static int rd_pick_palette_intra_sby( &cols); int count_buf[1 << 12]; // Maximum (1 << 12) color levels. - if (cpi->common.use_highbitdepth) + if (seq_params->use_highbitdepth) colors = av1_count_colors_highbd(src, src_stride, rows, cols, - cpi->common.bit_depth, count_buf); + seq_params->bit_depth, count_buf); else colors = av1_count_colors(src, src_stride, rows, cols, count_buf); mbmi->filter_intra_mode_info.use_filter_intra = 0; @@ -3537,12 +3614,12 @@ static int rd_pick_palette_intra_sby( int centroids[PALETTE_MAX_SIZE]; int lb, ub, val; uint16_t *src16 = CONVERT_TO_SHORTPTR(src); - if (cpi->common.use_highbitdepth) + if (seq_params->use_highbitdepth) lb = ub = src16[0]; else lb = ub = src[0]; - if (cpi->common.use_highbitdepth) { + if (seq_params->use_highbitdepth) { for (r = 0; r < rows; ++r) { for (c = 0; c < cols; ++c) { val = src16[r * src_stride + c]; @@ -3576,7 +3653,7 @@ static int rd_pick_palette_intra_sby( int top_colors[PALETTE_MAX_SIZE] = { 0 }; for (i = 0; i < AOMMIN(colors, PALETTE_MAX_SIZE); ++i) { int max_count = 0; - for (int j = 0; j < (1 << cpi->common.bit_depth); ++j) { + for (int j = 0; j < (1 << seq_params->bit_depth); ++j) { if (count_buf[j] > max_count) { max_count = count_buf[j]; top_colors[i] = j; @@ -4316,6 +4393,244 @@ static int ml_predict_tx_split(MACROBLOCK *x, BLOCK_SIZE bsize, int blk_row, return (int)(score * 100); } +typedef struct { + int64_t rd; + int txb_entropy_ctx; + TX_TYPE tx_type; +} TxCandidateInfo; + +static void try_tx_block_no_split( + const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, + TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, + const ENTROPY_CONTEXT *ta, const ENTROPY_CONTEXT *tl, + int txfm_partition_ctx, RD_STATS *rd_stats, int64_t ref_best_rd, + FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node, + TxCandidateInfo *no_split) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct macroblock_plane *const p = &x->plane[0]; + const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; + + no_split->rd = INT64_MAX; + no_split->txb_entropy_ctx = 0; + no_split->tx_type = TX_TYPES; + + const ENTROPY_CONTEXT *const pta = ta + blk_col; + const ENTROPY_CONTEXT *const ptl = tl + blk_row; + + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, 0, pta, ptl, &txb_ctx); + const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_Y] + .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; + + rd_stats->ref_rdcost = ref_best_rd; + rd_stats->zero_rate = zero_blk_rate; + const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col); + mbmi->inter_tx_size[index] = tx_size; + tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, pta, + ptl, rd_stats, ftxs_mode, ref_best_rd, + rd_info_node != NULL ? rd_info_node->rd_info_array : NULL); + assert(rd_stats->rate < INT_MAX); + + if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= + RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) || + rd_stats->skip == 1) && + !xd->lossless[mbmi->segment_id]) { +#if CONFIG_RD_DEBUG + av1_update_txb_coeff_cost(rd_stats, plane, tx_size, blk_row, blk_col, + zero_blk_rate - rd_stats->rate); +#endif // CONFIG_RD_DEBUG + rd_stats->rate = zero_blk_rate; + rd_stats->dist = rd_stats->sse; + rd_stats->skip = 1; + x->blk_skip[blk_row * bw + blk_col] = 1; + p->eobs[block] = 0; + update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, + DCT_DCT); + } else { + x->blk_skip[blk_row * bw + blk_col] = 0; + rd_stats->skip = 0; + } + + if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) + rd_stats->rate += x->txfm_partition_cost[txfm_partition_ctx][0]; + + no_split->rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + no_split->txb_entropy_ctx = p->txb_entropy_ctx[block]; + const int txk_type_idx = + av1_get_txk_type_index(plane_bsize, blk_row, blk_col); + no_split->tx_type = mbmi->txk_type[txk_type_idx]; +} + +static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, + int blk_col, int block, TX_SIZE tx_size, int depth, + BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta, + ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, + TXFM_CONTEXT *tx_left, RD_STATS *rd_stats, + int64_t ref_best_rd, int *is_cost_valid, + FAST_TX_SEARCH_MODE ftxs_mode, + TXB_RD_INFO_NODE *rd_info_node); + +static void try_tx_block_split( + const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, + TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta, + ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, + int txfm_partition_ctx, int64_t no_split_rd, int64_t ref_best_rd, + FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node, + RD_STATS *split_rd_stats, int64_t *split_rd) { + MACROBLOCKD *const xd = &x->e_mbd; + const int max_blocks_high = max_block_high(xd, plane_bsize, 0); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0); + struct macroblock_plane *const p = &x->plane[0]; + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + const int sub_step = bsw * bsh; + RD_STATS this_rd_stats; + int this_cost_valid = 1; + int64_t tmp_rd = 0; +#if CONFIG_DIST_8X8 + int sub8x8_eob[4] = { 0, 0, 0, 0 }; + struct macroblockd_plane *const pd = &xd->plane[0]; +#endif + split_rd_stats->rate = x->txfm_partition_cost[txfm_partition_ctx][1]; + + assert(tx_size < TX_SIZES_ALL); + + int blk_idx = 0; + for (int r = 0; r < tx_size_high_unit[tx_size]; r += bsh) { + for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw, ++blk_idx) { + const int offsetr = blk_row + r; + const int offsetc = blk_col + c; + if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + assert(blk_idx < 4); + select_tx_block( + cpi, x, offsetr, offsetc, block, sub_txs, depth + 1, plane_bsize, ta, + tl, tx_above, tx_left, &this_rd_stats, ref_best_rd - tmp_rd, + &this_cost_valid, ftxs_mode, + (rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL); + +#if CONFIG_DIST_8X8 + if (!x->using_dist_8x8) +#endif + if (!this_cost_valid) goto LOOP_EXIT; +#if CONFIG_DIST_8X8 + if (x->using_dist_8x8 && tx_size == TX_8X8) { + sub8x8_eob[2 * (r / bsh) + (c / bsw)] = p->eobs[block]; + } +#endif // CONFIG_DIST_8X8 + av1_merge_rd_stats(split_rd_stats, &this_rd_stats); + + tmp_rd = RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist); +#if CONFIG_DIST_8X8 + if (!x->using_dist_8x8) +#endif + if (no_split_rd < tmp_rd) { + this_cost_valid = 0; + goto LOOP_EXIT; + } + block += sub_step; + } + } + +LOOP_EXIT : {} + +#if CONFIG_DIST_8X8 + if (x->using_dist_8x8 && this_cost_valid && tx_size == TX_8X8) { + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + + const uint8_t *src = + &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]]; + const uint8_t *dst = + &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; + + int64_t dist_8x8; + const int qindex = x->qindex; + const int pred_stride = block_size_wide[plane_bsize]; + const int pred_idx = (blk_row * pred_stride + blk_col) + << tx_size_wide_log2[0]; + const int16_t *pred = &x->pred_luma[pred_idx]; + int i, j; + int row, col; + + uint8_t *pred8; + DECLARE_ALIGNED(16, uint16_t, pred8_16[8 * 8]); + + dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, BLOCK_8X8, + 8, 8, 8, 8, qindex) * + 16; + +#ifdef DEBUG_DIST_8X8 + if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8) + assert(sum_rd_stats.sse == dist_8x8); +#endif // DEBUG_DIST_8X8 + + split_rd_stats->sse = dist_8x8; + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + pred8 = CONVERT_TO_BYTEPTR(pred8_16); + else + pred8 = (uint8_t *)pred8_16; + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + for (row = 0; row < 2; ++row) { + for (col = 0; col < 2; ++col) { + int idx = row * 2 + col; + int eob = sub8x8_eob[idx]; + + if (eob > 0) { + for (j = 0; j < 4; j++) + for (i = 0; i < 4; i++) + CONVERT_TO_SHORTPTR(pred8) + [(row * 4 + j) * 8 + 4 * col + i] = + pred[(row * 4 + j) * pred_stride + 4 * col + i]; + } else { + for (j = 0; j < 4; j++) + for (i = 0; i < 4; i++) + CONVERT_TO_SHORTPTR(pred8) + [(row * 4 + j) * 8 + 4 * col + i] = CONVERT_TO_SHORTPTR( + dst)[(row * 4 + j) * dst_stride + 4 * col + i]; + } + } + } + } else { + for (row = 0; row < 2; ++row) { + for (col = 0; col < 2; ++col) { + int idx = row * 2 + col; + int eob = sub8x8_eob[idx]; + + if (eob > 0) { + for (j = 0; j < 4; j++) + for (i = 0; i < 4; i++) + pred8[(row * 4 + j) * 8 + 4 * col + i] = + (uint8_t)pred[(row * 4 + j) * pred_stride + 4 * col + i]; + } else { + for (j = 0; j < 4; j++) + for (i = 0; i < 4; i++) + pred8[(row * 4 + j) * 8 + 4 * col + i] = + dst[(row * 4 + j) * dst_stride + 4 * col + i]; + } + } + } + } + dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, pred8, 8, BLOCK_8X8, 8, 8, + 8, 8, qindex) * + 16; + +#ifdef DEBUG_DIST_8X8 + if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8) + assert(sum_rd_stats.dist == dist_8x8); +#endif // DEBUG_DIST_8X8 + + split_rd_stats->dist = dist_8x8; + tmp_rd = RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist); + } +#endif // CONFIG_DIST_8X8 + if (this_cost_valid) *split_rd = tmp_rd; +} + // Search for the best tx partition/type for a given luma block. static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, TX_SIZE tx_size, int depth, @@ -4338,8 +4653,6 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; - ENTROPY_CONTEXT *pta = ta + blk_col; - ENTROPY_CONTEXT *ptl = tl + blk_row; MB_MODE_INFO *const mbmi = xd->mi[0]; const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row, mbmi->sb_type, tx_size); @@ -4348,64 +4661,25 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, const int try_no_split = 1; int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH; - int64_t no_split_rd = INT64_MAX; - int no_split_txb_entropy_ctx = 0; - TX_TYPE no_split_tx_type = TX_TYPES; + TxCandidateInfo no_split = { INT64_MAX, 0, TX_TYPES }; + // TX no split if (try_no_split) { - const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); - TXB_CTX txb_ctx; - get_txb_ctx(plane_bsize, tx_size, 0, pta, ptl, &txb_ctx); - const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_Y] - .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; + try_tx_block_no_split(cpi, x, blk_row, blk_col, block, tx_size, depth, + plane_bsize, ta, tl, ctx, rd_stats, ref_best_rd, + ftxs_mode, rd_info_node, &no_split); - rd_stats->ref_rdcost = ref_best_rd; - rd_stats->zero_rate = zero_blk_rate; - const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col); - mbmi->inter_tx_size[index] = tx_size; - tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, pta, - ptl, rd_stats, ftxs_mode, ref_best_rd, - rd_info_node != NULL ? rd_info_node->rd_info_array : NULL); - assert(rd_stats->rate < INT_MAX); - - if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= - RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) || - rd_stats->skip == 1) && - !xd->lossless[mbmi->segment_id]) { -#if CONFIG_RD_DEBUG - av1_update_txb_coeff_cost(rd_stats, plane, tx_size, blk_row, blk_col, - zero_blk_rate - rd_stats->rate); -#endif // CONFIG_RD_DEBUG - rd_stats->rate = zero_blk_rate; - rd_stats->dist = rd_stats->sse; - rd_stats->skip = 1; - x->blk_skip[blk_row * bw + blk_col] = 1; - p->eobs[block] = 0; - update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, - DCT_DCT); - } else { - x->blk_skip[blk_row * bw + blk_col] = 0; - rd_stats->skip = 0; - } - - if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) - rd_stats->rate += x->txfm_partition_cost[ctx][0]; - no_split_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); if (cpi->sf.adaptive_txb_search_level && - (no_split_rd - - (no_split_rd >> (1 + cpi->sf.adaptive_txb_search_level))) > + (no_split.rd - + (no_split.rd >> (1 + cpi->sf.adaptive_txb_search_level))) > ref_best_rd) { *is_cost_valid = 0; return; } - no_split_txb_entropy_ctx = p->txb_entropy_ctx[block]; - const int txk_type_idx = - av1_get_txk_type_index(plane_bsize, blk_row, blk_col); - no_split_tx_type = mbmi->txk_type[txk_type_idx]; - - if (cpi->sf.txb_split_cap) + if (cpi->sf.txb_split_cap) { if (p->eobs[block] == 0) try_split = 0; + } } if (x->e_mbd.bd == 8 && !x->cb_partition_scan && try_split) { @@ -4427,155 +4701,10 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, RD_STATS split_rd_stats; av1_init_rd_stats(&split_rd_stats); if (try_split) { - const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; - const int bsw = tx_size_wide_unit[sub_txs]; - const int bsh = tx_size_high_unit[sub_txs]; - const int sub_step = bsw * bsh; - RD_STATS this_rd_stats; - int this_cost_valid = 1; - int64_t tmp_rd = 0; -#if CONFIG_DIST_8X8 - int sub8x8_eob[4] = { 0, 0, 0, 0 }; - struct macroblockd_plane *const pd = &xd->plane[0]; -#endif - split_rd_stats.rate = x->txfm_partition_cost[ctx][1]; - - assert(tx_size < TX_SIZES_ALL); - - ref_best_rd = AOMMIN(no_split_rd, ref_best_rd); - - int blk_idx = 0; - for (int r = 0; r < tx_size_high_unit[tx_size]; r += bsh) { - for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw, ++blk_idx) { - const int offsetr = blk_row + r; - const int offsetc = blk_col + c; - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; - assert(blk_idx < 4); - select_tx_block( - cpi, x, offsetr, offsetc, block, sub_txs, depth + 1, plane_bsize, - ta, tl, tx_above, tx_left, &this_rd_stats, ref_best_rd - tmp_rd, - &this_cost_valid, ftxs_mode, - (rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL); - -#if CONFIG_DIST_8X8 - if (!x->using_dist_8x8) -#endif - if (!this_cost_valid) goto LOOP_EXIT; -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && tx_size == TX_8X8) { - sub8x8_eob[2 * (r / bsh) + (c / bsw)] = p->eobs[block]; - } -#endif // CONFIG_DIST_8X8 - av1_merge_rd_stats(&split_rd_stats, &this_rd_stats); - - tmp_rd = RDCOST(x->rdmult, split_rd_stats.rate, split_rd_stats.dist); -#if CONFIG_DIST_8X8 - if (!x->using_dist_8x8) -#endif - if (no_split_rd < tmp_rd) { - this_cost_valid = 0; - goto LOOP_EXIT; - } - block += sub_step; - } - } - - LOOP_EXIT : {} - -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && this_cost_valid && tx_size == TX_8X8) { - const int src_stride = p->src.stride; - const int dst_stride = pd->dst.stride; - - const uint8_t *src = - &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]]; - const uint8_t *dst = - &pd->dst - .buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; - - int64_t dist_8x8; - const int qindex = x->qindex; - const int pred_stride = block_size_wide[plane_bsize]; - const int pred_idx = (blk_row * pred_stride + blk_col) - << tx_size_wide_log2[0]; - const int16_t *pred = &x->pred_luma[pred_idx]; - int i, j; - int row, col; - - uint8_t *pred8; - DECLARE_ALIGNED(16, uint16_t, pred8_16[8 * 8]); - - dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, - BLOCK_8X8, 8, 8, 8, 8, qindex) * - 16; - -#ifdef DEBUG_DIST_8X8 - if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8) - assert(sum_rd_stats.sse == dist_8x8); -#endif // DEBUG_DIST_8X8 - - split_rd_stats.sse = dist_8x8; - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - pred8 = CONVERT_TO_BYTEPTR(pred8_16); - else - pred8 = (uint8_t *)pred8_16; - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - for (row = 0; row < 2; ++row) { - for (col = 0; col < 2; ++col) { - int idx = row * 2 + col; - int eob = sub8x8_eob[idx]; - - if (eob > 0) { - for (j = 0; j < 4; j++) - for (i = 0; i < 4; i++) - CONVERT_TO_SHORTPTR(pred8) - [(row * 4 + j) * 8 + 4 * col + i] = - pred[(row * 4 + j) * pred_stride + 4 * col + i]; - } else { - for (j = 0; j < 4; j++) - for (i = 0; i < 4; i++) - CONVERT_TO_SHORTPTR(pred8) - [(row * 4 + j) * 8 + 4 * col + i] = CONVERT_TO_SHORTPTR( - dst)[(row * 4 + j) * dst_stride + 4 * col + i]; - } - } - } - } else { - for (row = 0; row < 2; ++row) { - for (col = 0; col < 2; ++col) { - int idx = row * 2 + col; - int eob = sub8x8_eob[idx]; - - if (eob > 0) { - for (j = 0; j < 4; j++) - for (i = 0; i < 4; i++) - pred8[(row * 4 + j) * 8 + 4 * col + i] = - (uint8_t)pred[(row * 4 + j) * pred_stride + 4 * col + i]; - } else { - for (j = 0; j < 4; j++) - for (i = 0; i < 4; i++) - pred8[(row * 4 + j) * 8 + 4 * col + i] = - dst[(row * 4 + j) * dst_stride + 4 * col + i]; - } - } - } - } - dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, pred8, 8, BLOCK_8X8, 8, - 8, 8, 8, qindex) * - 16; - -#ifdef DEBUG_DIST_8X8 - if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8) - assert(sum_rd_stats.dist == dist_8x8); -#endif // DEBUG_DIST_8X8 - - split_rd_stats.dist = dist_8x8; - tmp_rd = RDCOST(x->rdmult, split_rd_stats.rate, split_rd_stats.dist); - } -#endif // CONFIG_DIST_8X8 - if (this_cost_valid) split_rd = tmp_rd; + try_tx_block_split(cpi, x, blk_row, blk_col, block, tx_size, depth, + plane_bsize, ta, tl, tx_above, tx_left, ctx, no_split.rd, + AOMMIN(no_split.rd, ref_best_rd), ftxs_mode, + rd_info_node, &split_rd_stats, &split_rd); } #if COLLECT_TX_SIZE_DATA @@ -4626,9 +4755,11 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, } while (0); #endif // COLLECT_TX_SIZE_DATA - if (no_split_rd < split_rd) { + if (no_split.rd < split_rd) { + ENTROPY_CONTEXT *pta = ta + blk_col; + ENTROPY_CONTEXT *ptl = tl + blk_row; const TX_SIZE tx_size_selected = tx_size; - p->txb_entropy_ctx[block] = no_split_txb_entropy_ctx; + p->txb_entropy_ctx[block] = no_split.txb_entropy_ctx; av1_set_txb_context(x, 0, block, tx_size_selected, pta, ptl); txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size, tx_size); @@ -4641,7 +4772,7 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, } mbmi->tx_size = tx_size_selected; update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, - no_split_tx_type); + no_split.tx_type); x->blk_skip[blk_row * bw + blk_col] = rd_stats->skip; } else { *rd_stats = split_rd_stats; @@ -4707,13 +4838,19 @@ static void select_inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, } } } - int64_t zero_rd = RDCOST(x->rdmult, rd_stats->zero_rate, rd_stats->sse); - this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); - if (zero_rd < this_rd) { - this_rd = zero_rd; - rd_stats->rate = rd_stats->zero_rate; + + const int skip_ctx = av1_get_skip_context(xd); + const int s0 = x->skip_cost[skip_ctx][0]; + const int s1 = x->skip_cost[skip_ctx][1]; + int64_t skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse); + this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist); + if (skip_rd <= this_rd) { + this_rd = skip_rd; + rd_stats->rate = 0; rd_stats->dist = rd_stats->sse; rd_stats->skip = 1; + } else { + rd_stats->skip = 0; } if (this_rd > ref_best_rd) is_cost_valid = 0; @@ -4921,11 +5058,15 @@ static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, } } } - int64_t zero_rd = RDCOST(x->rdmult, rd_stats->zero_rate, rd_stats->sse); - this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); - if (zero_rd < this_rd) { - this_rd = zero_rd; - rd_stats->rate = rd_stats->zero_rate; + + const int skip_ctx = av1_get_skip_context(xd); + const int s0 = x->skip_cost[skip_ctx][0]; + const int s1 = x->skip_cost[skip_ctx][1]; + int64_t skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse); + this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist); + if (skip_rd < this_rd) { + this_rd = skip_rd; + rd_stats->rate = 0; rd_stats->dist = rd_stats->sse; rd_stats->skip = 1; } @@ -5159,7 +5300,7 @@ static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist, const MACROBLOCKD *xd = &x->e_mbd; const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd); - *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize); + *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize, 1); const int64_t mse = *dist / bw / bh; // Normalized quantizer takes the transform upscaling factor (8 for tx size // smaller than 32) into account. @@ -5215,23 +5356,7 @@ static void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats, int bsize, mbmi->tx_size = tx_size; memset(x->blk_skip, 1, sizeof(x->blk_skip[0]) * n4); rd_stats->skip = 1; - - // Rate. - const int tx_size_ctx = get_txsize_entropy_ctx(tx_size); - ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE]; - ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE]; - av1_get_entropy_contexts(bsize, &xd->plane[0], ctxa, ctxl); - TXB_CTX txb_ctx; - // Because plane is 0, plane_bsize equal to bsize - get_txb_ctx(bsize, tx_size, 0, ctxa, ctxl, &txb_ctx); - int rate = x->coeff_costs[tx_size_ctx][PLANE_TYPE_Y] - .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; - if (tx_size > TX_4X4) { - int ctx = txfm_partition_context( - xd->above_txfm_context, xd->left_txfm_context, mbmi->sb_type, tx_size); - rate += x->txfm_partition_cost[ctx][0]; - } - rd_stats->rate = rate; + rd_stats->rate = 0; if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2); rd_stats->dist = rd_stats->sse = (dist << 4); @@ -5322,6 +5447,8 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd, found_rd_info ? matched_rd_info : NULL); + assert(IMPLIES(this_rd_stats.skip && !this_rd_stats.invalid_rate, + this_rd_stats.rate == 0)); ref_best_rd = AOMMIN(rd, ref_best_rd); if (rd < best_rd) { @@ -5455,6 +5582,7 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type)); PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; const BLOCK_SIZE bsize = mbmi->sb_type; + const SequenceHeader *const seq_params = &cpi->common.seq_params; int this_rate; int64_t this_rd; int colors_u, colors_v, colors; @@ -5470,11 +5598,11 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, mbmi->uv_mode = UV_DC_PRED; int count_buf[1 << 12]; // Maximum (1 << 12) color levels. - if (cpi->common.use_highbitdepth) { + if (seq_params->use_highbitdepth) { colors_u = av1_count_colors_highbd(src_u, src_stride, rows, cols, - cpi->common.bit_depth, count_buf); + seq_params->bit_depth, count_buf); colors_v = av1_count_colors_highbd(src_v, src_stride, rows, cols, - cpi->common.bit_depth, count_buf); + seq_params->bit_depth, count_buf); } else { colors_u = av1_count_colors(src_u, src_stride, rows, cols, count_buf); colors_v = av1_count_colors(src_v, src_stride, rows, cols, count_buf); @@ -5494,7 +5622,7 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u); uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v); - if (cpi->common.use_highbitdepth) { + if (seq_params->use_highbitdepth) { lb_u = src_u16[0]; ub_u = src_u16[0]; lb_v = src_v16[0]; @@ -5508,7 +5636,7 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, for (r = 0; r < rows; ++r) { for (c = 0; c < cols; ++c) { - if (cpi->common.use_highbitdepth) { + if (seq_params->use_highbitdepth) { val_u = src_u16[r * src_stride + c]; val_v = src_v16[r * src_stride + c]; data[(r * cols + c) * 2] = val_u; @@ -5557,9 +5685,9 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, pmi->palette_size[1] = n; for (i = 1; i < 3; ++i) { for (j = 0; j < n; ++j) { - if (cpi->common.use_highbitdepth) + if (seq_params->use_highbitdepth) pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd( - (int)centroids[j * 2 + i - 1], cpi->common.bit_depth); + (int)centroids[j * 2 + i - 1], seq_params->bit_depth); else pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel((int)centroids[j * 2 + i - 1]); @@ -5907,8 +6035,9 @@ static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, *mode_uv = UV_DC_PRED; return; } - xd->cfl.is_chroma_reference = is_chroma_reference( - mi_row, mi_col, bsize, cm->subsampling_x, cm->subsampling_y); + xd->cfl.is_chroma_reference = + is_chroma_reference(mi_row, mi_col, bsize, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y); bsize = scale_chroma_bsize(bsize, xd->plane[AOM_PLANE_U].subsampling_x, xd->plane[AOM_PLANE_U].subsampling_y); // Only store reconstructed luma when there's chroma RDO. When there's no @@ -7038,7 +7167,9 @@ static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x, // Choose the best wedge index and sign static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x, const BLOCK_SIZE bsize, const uint8_t *const p0, - const uint8_t *const p1, int *const best_wedge_sign, + const int16_t *const residual1, + const int16_t *const diff10, + int *const best_wedge_sign, int *const best_wedge_index) { const MACROBLOCKD *const xd = &x->e_mbd; const struct buf_2d *const src = &x->plane[0].src; @@ -7056,34 +7187,22 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x, const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; const int bd_round = hbd ? (xd->bd - 8) * 2 : 0; - DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]); - DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]); - DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]); - DECLARE_ALIGNED(32, int16_t, ds[MAX_SB_SQUARE]); - - int64_t sign_limit; - + DECLARE_ALIGNED(32, int16_t, residual0[MAX_SB_SQUARE]); // src - pred0 if (hbd) { - aom_highbd_subtract_block(bh, bw, r0, bw, src->buf, src->stride, - CONVERT_TO_BYTEPTR(p0), bw, xd->bd); - aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride, - CONVERT_TO_BYTEPTR(p1), bw, xd->bd); - aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw, + aom_highbd_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, CONVERT_TO_BYTEPTR(p0), bw, xd->bd); } else { - aom_subtract_block(bh, bw, r0, bw, src->buf, src->stride, p0, bw); - aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw); - aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw); + aom_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, p0, bw); } - sign_limit = ((int64_t)aom_sum_squares_i16(r0, N) - - (int64_t)aom_sum_squares_i16(r1, N)) * - (1 << WEDGE_WEIGHT_BITS) / 2; - + int64_t sign_limit = ((int64_t)aom_sum_squares_i16(residual0, N) - + (int64_t)aom_sum_squares_i16(residual1, N)) * + (1 << WEDGE_WEIGHT_BITS) / 2; + int16_t *ds = residual0; if (N < 64) - av1_wedge_compute_delta_squares_c(ds, r0, r1, N); + av1_wedge_compute_delta_squares_c(ds, residual0, residual1, N); else - av1_wedge_compute_delta_squares(ds, r0, r1, N); + av1_wedge_compute_delta_squares(ds, residual0, residual1, N); for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) { mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize); @@ -7096,9 +7215,9 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x, mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); if (N < 64) - sse = av1_wedge_sse_from_residuals_c(r1, d10, mask, N); + sse = av1_wedge_sse_from_residuals_c(residual1, diff10, mask, N); else - sse = av1_wedge_sse_from_residuals(r1, d10, mask, N); + sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N); sse = ROUND_POWER_OF_TWO(sse, bd_round); model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist); @@ -7117,12 +7236,15 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x, } // Choose the best wedge index the specified sign -static int64_t pick_wedge_fixed_sign( - const AV1_COMP *const cpi, const MACROBLOCK *const x, - const BLOCK_SIZE bsize, const uint8_t *const p0, const uint8_t *const p1, - const int wedge_sign, int *const best_wedge_index) { +static int64_t pick_wedge_fixed_sign(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + const BLOCK_SIZE bsize, + const int16_t *const residual1, + const int16_t *const diff10, + const int wedge_sign, + int *const best_wedge_index) { const MACROBLOCKD *const xd = &x->e_mbd; - const struct buf_2d *const src = &x->plane[0].src; + const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; const int N = bw * bh; @@ -7135,26 +7257,12 @@ static int64_t pick_wedge_fixed_sign( uint64_t sse; const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; const int bd_round = hbd ? (xd->bd - 8) * 2 : 0; - - DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]); - DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]); - - if (hbd) { - aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride, - CONVERT_TO_BYTEPTR(p1), bw, xd->bd); - aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw, - CONVERT_TO_BYTEPTR(p0), bw, xd->bd); - } else { - aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw); - aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw); - } - for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) { mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); if (N < 64) - sse = av1_wedge_sse_from_residuals_c(r1, d10, mask, N); + sse = av1_wedge_sse_from_residuals_c(residual1, diff10, mask, N); else - sse = av1_wedge_sse_from_residuals(r1, d10, mask, N); + sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N); sse = ROUND_POWER_OF_TWO(sse, bd_round); model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist); @@ -7166,16 +7274,14 @@ static int64_t pick_wedge_fixed_sign( best_rd = rd; } } - return best_rd - RDCOST(x->rdmult, x->wedge_idx_cost[bsize][*best_wedge_index], 0); } -static int64_t pick_interinter_wedge(const AV1_COMP *const cpi, - MACROBLOCK *const x, - const BLOCK_SIZE bsize, - const uint8_t *const p0, - const uint8_t *const p1) { +static int64_t pick_interinter_wedge( + const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize, + const uint8_t *const p0, const uint8_t *const p1, + const int16_t *const residual1, const int16_t *const diff10) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; const int bw = block_size_wide[bsize]; @@ -7189,9 +7295,11 @@ static int64_t pick_interinter_wedge(const AV1_COMP *const cpi, if (cpi->sf.fast_wedge_sign_estimate) { wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw); - rd = pick_wedge_fixed_sign(cpi, x, bsize, p0, p1, wedge_sign, &wedge_index); + rd = pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, wedge_sign, + &wedge_index); } else { - rd = pick_wedge(cpi, x, bsize, p0, p1, &wedge_sign, &wedge_index); + rd = pick_wedge(cpi, x, bsize, p0, residual1, diff10, &wedge_sign, + &wedge_index); } mbmi->interinter_comp.wedge_sign = wedge_sign; @@ -7202,10 +7310,11 @@ static int64_t pick_interinter_wedge(const AV1_COMP *const cpi, static int64_t pick_interinter_seg(const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize, const uint8_t *const p0, - const uint8_t *const p1) { + const uint8_t *const p1, + const int16_t *const residual1, + const int16_t *const diff10) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; - const struct buf_2d *const src = &x->plane[0].src; const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; const int N = bw * bh; @@ -7218,23 +7327,6 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi, DIFFWTD_MASK_TYPE best_mask_type = 0; const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; const int bd_round = hbd ? (xd->bd - 8) * 2 : 0; - DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]); - DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]); - DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]); - - if (hbd) { - aom_highbd_subtract_block(bh, bw, r0, bw, src->buf, src->stride, - CONVERT_TO_BYTEPTR(p0), bw, xd->bd); - aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride, - CONVERT_TO_BYTEPTR(p1), bw, xd->bd); - aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw, - CONVERT_TO_BYTEPTR(p0), bw, xd->bd); - } else { - aom_subtract_block(bh, bw, r0, bw, src->buf, src->stride, p0, bw); - aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw); - aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw); - } - // try each mask type and its inverse for (cur_mask_type = 0; cur_mask_type < DIFFWTD_MASK_TYPES; cur_mask_type++) { // build mask and inverse @@ -7247,7 +7339,7 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi, bw, bh, bw); // compute rd for mask - sse = av1_wedge_sse_from_residuals(r1, d10, xd->seg_mask, N); + sse = av1_wedge_sse_from_residuals(residual1, diff10, xd->seg_mask, N); sse = ROUND_POWER_OF_TWO(sse, bd_round); model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist); @@ -7279,14 +7371,26 @@ static int64_t pick_interintra_wedge(const AV1_COMP *const cpi, const uint8_t *const p1) { const MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; - - int64_t rd; - int wedge_index = -1; - assert(is_interintra_wedge_used(bsize)); assert(cpi->common.seq_params.enable_interintra_compound); - rd = pick_wedge_fixed_sign(cpi, x, bsize, p0, p1, 0, &wedge_index); + const struct buf_2d *const src = &x->plane[0].src; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]); // src - pred1 + DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]); // pred1 - pred0 + if (get_bitdepth_data_path_index(xd)) { + aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, + CONVERT_TO_BYTEPTR(p1), bw, xd->bd); + aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(p1), bw, + CONVERT_TO_BYTEPTR(p0), bw, xd->bd); + } else { + aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, p1, bw); + aom_subtract_block(bh, bw, diff10, bw, p1, bw, p0, bw); + } + int wedge_index = -1; + int64_t rd = + pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, 0, &wedge_index); mbmi->interintra_wedge_sign = 0; mbmi->interintra_wedge_index = wedge_index; @@ -7296,11 +7400,15 @@ static int64_t pick_interintra_wedge(const AV1_COMP *const cpi, static int64_t pick_interinter_mask(const AV1_COMP *const cpi, MACROBLOCK *x, const BLOCK_SIZE bsize, const uint8_t *const p0, - const uint8_t *const p1) { + const uint8_t *const p1, + const int16_t *const residual1, + const int16_t *const diff10) { const COMPOUND_TYPE compound_type = x->e_mbd.mi[0]->interinter_comp.type; switch (compound_type) { - case COMPOUND_WEDGE: return pick_interinter_wedge(cpi, x, bsize, p0, p1); - case COMPOUND_DIFFWTD: return pick_interinter_seg(cpi, x, bsize, p0, p1); + case COMPOUND_WEDGE: + return pick_interinter_wedge(cpi, x, bsize, p0, p1, residual1, diff10); + case COMPOUND_DIFFWTD: + return pick_interinter_seg(cpi, x, bsize, p0, p1, residual1, diff10); default: assert(0); return 0; } } @@ -7336,7 +7444,7 @@ static int64_t build_and_cost_compound_type( const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, const BLOCK_SIZE bsize, const int this_mode, int *rs2, int rate_mv, BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, uint8_t **preds1, - int *strides, int mi_row, int mi_col) { + int16_t *residual1, int16_t *diff10, int *strides, int mi_row, int mi_col) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; @@ -7348,7 +7456,8 @@ static int64_t build_and_cost_compound_type( int64_t tmp_skip_sse_sb; const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type; - best_rd_cur = pick_interinter_mask(cpi, x, bsize, *preds0, *preds1); + best_rd_cur = + pick_interinter_mask(cpi, x, bsize, *preds0, *preds1, residual1, diff10); *rs2 += get_interinter_compound_mask_rate(x, mbmi); best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0); @@ -7357,6 +7466,7 @@ static int64_t build_and_cost_compound_type( *out_rate_mv = interinter_compound_motion_search(cpi, x, cur_mv, bsize, this_mode, mi_row, mi_col); av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize); + av1_subtract_plane(x, bsize, 0); model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum); @@ -7367,7 +7477,6 @@ static int64_t build_and_cost_compound_type( av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides, preds1, strides); } - av1_subtract_plane(x, bsize, 0); rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); if (rd != INT64_MAX) @@ -7377,7 +7486,6 @@ static int64_t build_and_cost_compound_type( } else { av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides, preds1, strides); - av1_subtract_plane(x, bsize, 0); rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); if (rd != INT64_MAX) @@ -7393,11 +7501,11 @@ typedef struct { int above_pred_stride[MAX_MB_PLANE]; uint8_t *left_pred_buf[MAX_MB_PLANE]; int left_pred_stride[MAX_MB_PLANE]; - int_mv *single_newmv; + int_mv (*single_newmv)[REF_FRAMES]; // Pointer to array of motion vectors to use for each ref and their rates // Should point to first of 2 arrays in 2D array - int *single_newmv_rate; - int *single_newmv_valid; + int (*single_newmv_rate)[REF_FRAMES]; + int (*single_newmv_valid)[REF_FRAMES]; // Pointer to array of predicted rate-distortion // Should point to first of 2 arrays in 2D array int64_t (*modelled_rd)[REF_FRAMES]; @@ -7428,14 +7536,15 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, const PREDICTION_MODE this_mode = mbmi->mode; const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] }; + const int ref_mv_idx = mbmi->ref_mv_idx; int i; (void)args; if (is_comp_pred) { if (this_mode == NEW_NEWMV) { - cur_mv[0].as_int = args->single_newmv[refs[0]].as_int; - cur_mv[1].as_int = args->single_newmv[refs[1]].as_int; + cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int; + cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int; if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { joint_motion_search(cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, NULL, @@ -7451,7 +7560,7 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, } } } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) { - cur_mv[1].as_int = args->single_newmv[refs[1]].as_int; + cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int; if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { compound_single_motion_search_interinter( cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, 0, rate_mv, 0, 1); @@ -7464,7 +7573,7 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, } } else { assert(this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV); - cur_mv[0].as_int = args->single_newmv[refs[0]].as_int; + cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int; if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { compound_single_motion_search_interinter( cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, 0, rate_mv, 0, 0); @@ -7480,9 +7589,9 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, rate_mv); if (x->best_mv.as_int == INVALID_MV) return INT64_MAX; - args->single_newmv[refs[0]] = x->best_mv; - args->single_newmv_rate[refs[0]] = *rate_mv; - args->single_newmv_valid[refs[0]] = 1; + args->single_newmv[ref_mv_idx][refs[0]] = x->best_mv; + args->single_newmv_rate[ref_mv_idx][refs[0]] = *rate_mv; + args->single_newmv_valid[ref_mv_idx][refs[0]] = 1; cur_mv[0].as_int = x->best_mv.as_int; @@ -7508,12 +7617,25 @@ static INLINE void swap_dst_buf(MACROBLOCKD *xd, const BUFFER_SET *dst_bufs[2], restore_dst_buf(xd, *dst_bufs[0], num_planes); } +static INLINE int get_switchable_rate(MACROBLOCK *const x, + const InterpFilters filters, + const int ctx[2]) { + int inter_filter_cost; + const InterpFilter filter0 = av1_extract_interp_filter(filters, 0); + const InterpFilter filter1 = av1_extract_interp_filter(filters, 1); + inter_filter_cost = x->switchable_interp_costs[ctx[0]][filter0]; + inter_filter_cost += x->switchable_interp_costs[ctx[1]][filter1]; + return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost; +} + // calculate the rdcost of given interpolation_filter static INLINE int64_t interpolation_filter_rd( MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, BUFFER_SET *const orig_dst, int64_t *const rd, int *const switchable_rate, int *const skip_txfm_sb, - int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2], int filter_idx) { + int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2], int filter_idx, + const int switchable_ctx[2], const int skip_pred, int *rate, + int64_t *dist) { const AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; @@ -7523,23 +7645,136 @@ static INLINE int64_t interpolation_filter_rd( const InterpFilters last_best = mbmi->interp_filters; mbmi->interp_filters = filter_sets[filter_idx]; - const int tmp_rs = av1_get_switchable_rate(cm, x, xd); - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); - model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, &tmp_dist, - &tmp_skip_sb, &tmp_skip_sse, NULL, NULL, NULL); + const int tmp_rs = + get_switchable_rate(x, mbmi->interp_filters, switchable_ctx); + + if (!skip_pred) { + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize); + av1_subtract_plane(x, bsize, 0); +#if DNN_BASED_RD_INTERP_FILTER + model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 0, 0, &tmp_rate, &tmp_dist, + &tmp_skip_sb, &tmp_skip_sse, NULL, NULL, NULL); +#else + model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &tmp_rate, &tmp_dist, &tmp_skip_sb, + &tmp_skip_sse, NULL, NULL, NULL); +#endif + if (num_planes > 1) { + int64_t tmp_y_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist); + if (tmp_y_rd > *rd) { + mbmi->interp_filters = last_best; + return 0; + } + int tmp_rate_uv, tmp_skip_sb_uv; + int64_t tmp_dist_uv, tmp_skip_sse_uv; + av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, orig_dst, bsize); + for (int plane = 1; plane < num_planes; ++plane) + av1_subtract_plane(x, bsize, plane); +#if DNN_BASED_RD_INTERP_FILTER + model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 1, num_planes - 1, + &tmp_rate_uv, &tmp_dist_uv, &tmp_skip_sb_uv, + &tmp_skip_sse_uv, NULL, NULL, NULL); +#else + model_rd_for_sb(cpi, bsize, x, xd, 1, num_planes - 1, &tmp_rate_uv, + &tmp_dist_uv, &tmp_skip_sb_uv, &tmp_skip_sse_uv, NULL, + NULL, NULL); +#endif + tmp_rate += tmp_rate_uv; + tmp_skip_sb &= tmp_skip_sb_uv; + tmp_dist += tmp_dist_uv; + tmp_skip_sse += tmp_skip_sse_uv; + } + } else { + tmp_rate = *rate; + tmp_dist = *dist; + } int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist); if (tmp_rd < *rd) { *rd = tmp_rd; *switchable_rate = tmp_rs; *skip_txfm_sb = tmp_skip_sb; *skip_sse_sb = tmp_skip_sse; - swap_dst_buf(xd, dst_bufs, num_planes); + *rate = tmp_rate; + *dist = tmp_dist; + if (!skip_pred) { + swap_dst_buf(xd, dst_bufs, num_planes); + } return 1; } mbmi->interp_filters = last_best; return 0; } +// Find the best rd filter in horizontal direction +static INLINE int find_best_horiz_interp_filter_rd( + MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col, BUFFER_SET *const orig_dst, int64_t *const rd, + int *const switchable_rate, int *const skip_txfm_sb, + int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2], + const int switchable_ctx[2], const int skip_hor, int *rate, int64_t *dist, + int best_dual_mode) { + int i; + const int bw = block_size_wide[bsize]; + assert(best_dual_mode == 0); + if ((bw <= 4) && (!skip_hor)) { + int skip_pred = 1; + // Process the filters in reverse order to enable reusing rate and + // distortion (calcuated during EIGHTTAP_REGULAR) for MULTITAP_SHARP + for (i = (SWITCHABLE_FILTERS - 1); i >= 1; --i) { + if (interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd, + switchable_rate, skip_txfm_sb, skip_sse_sb, + dst_bufs, i, switchable_ctx, skip_pred, rate, + dist)) { + best_dual_mode = i; + } + skip_pred = 0; + } + } else { + for (i = 1; i < SWITCHABLE_FILTERS; ++i) { + if (interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd, + switchable_rate, skip_txfm_sb, skip_sse_sb, + dst_bufs, i, switchable_ctx, skip_hor, rate, + dist)) { + best_dual_mode = i; + } + } + } + return best_dual_mode; +} + +// Find the best rd filter in vertical direction +static INLINE void find_best_vert_interp_filter_rd( + MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col, BUFFER_SET *const orig_dst, int64_t *const rd, + int *const switchable_rate, int *const skip_txfm_sb, + int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2], + const int switchable_ctx[2], const int skip_ver, int *rate, int64_t *dist, + int best_dual_mode, int filter_set_size) { + int i; + const int bh = block_size_high[bsize]; + if ((bh <= 4) && (!skip_ver)) { + int skip_pred = 1; + // Process the filters in reverse order to enable reusing rate and + // distortion (calcuated during EIGHTTAP_REGULAR) for MULTITAP_SHARP + assert(filter_set_size == DUAL_FILTER_SET_SIZE); + for (i = (filter_set_size - SWITCHABLE_FILTERS + best_dual_mode); + i >= (best_dual_mode + SWITCHABLE_FILTERS); i -= SWITCHABLE_FILTERS) { + interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd, + switchable_rate, skip_txfm_sb, skip_sse_sb, + dst_bufs, i, switchable_ctx, skip_pred, rate, + dist); + skip_pred = 0; + } + } else { + for (i = best_dual_mode + SWITCHABLE_FILTERS; i < filter_set_size; + i += SWITCHABLE_FILTERS) { + interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd, + switchable_rate, skip_txfm_sb, skip_sse_sb, + dst_bufs, i, switchable_ctx, skip_ver, rate, + dist); + } + } +} + // check if there is saved result match with this search static INLINE int is_interp_filter_match(const INTERPOLATION_FILTER_STATS *st, MB_MODE_INFO *const mi) { @@ -7605,10 +7840,22 @@ static int64_t interpolation_filter_search( if (!need_search || match_found == -1) { set_default_interp_filters(mbmi, assign_filter); } - *switchable_rate = av1_get_switchable_rate(cm, x, xd); + int switchable_ctx[2]; + switchable_ctx[0] = av1_get_pred_context_switchable_interp(xd, 0); + switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1); + *switchable_rate = + get_switchable_rate(x, mbmi->interp_filters, switchable_ctx); av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); + for (int plane = 0; plane < num_planes; ++plane) + av1_subtract_plane(x, bsize, plane); +#if DNN_BASED_RD_INTERP_FILTER + model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, + &tmp_dist, skip_txfm_sb, skip_sse_sb, NULL, NULL, + NULL); +#else model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, &tmp_dist, skip_txfm_sb, skip_sse_sb, NULL, NULL, NULL); +#endif // DNN_BASED_RD_INTERP_FILTER *rd = RDCOST(x->rdmult, *switchable_rate + tmp_rate, tmp_dist); if (assign_filter != SWITCHABLE || match_found != -1) { @@ -7619,6 +7866,23 @@ static int64_t interpolation_filter_search( av1_broadcast_interp_filter(EIGHTTAP_REGULAR)); return 0; } + int skip_hor = 1; + int skip_ver = 1; + const int is_compound = has_second_ref(mbmi); + for (int k = 0; k < num_planes - 1; ++k) { + struct macroblockd_plane *const pd = &xd->plane[k]; + const int bw = pd->width; + const int bh = pd->height; + for (int j = 0; j < 1 + is_compound; ++j) { + const MV mv = mbmi->mv[j].as_mv; + const MV mv_q4 = clamp_mv_to_umv_border_sb( + xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y); + const int sub_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS; + const int sub_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS; + skip_hor &= (sub_x == 0); + skip_ver &= (sub_y == 0); + } + } // do interp_filter search const int filter_set_size = DUAL_FILTER_SET_SIZE; restore_dst_buf(xd, *tmp_dst, num_planes); @@ -7629,20 +7893,16 @@ static int64_t interpolation_filter_search( int best_dual_mode = 0; // Find best of {R}x{R,Sm,Sh} // EIGHTTAP_REGULAR mode is calculated beforehand - for (i = 1; i < SWITCHABLE_FILTERS; ++i) { - if (interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd, - switchable_rate, skip_txfm_sb, skip_sse_sb, - dst_bufs, i)) { - best_dual_mode = i; - } - } + best_dual_mode = find_best_horiz_interp_filter_rd( + x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate, + skip_txfm_sb, skip_sse_sb, dst_bufs, switchable_ctx, skip_hor, + &tmp_rate, &tmp_dist, best_dual_mode); + // From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes - for (i = best_dual_mode + SWITCHABLE_FILTERS; i < filter_set_size; - i += SWITCHABLE_FILTERS) { - interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd, - switchable_rate, skip_txfm_sb, skip_sse_sb, - dst_bufs, i); - } + find_best_vert_interp_filter_rd( + x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate, + skip_txfm_sb, skip_sse_sb, dst_bufs, switchable_ctx, skip_ver, + &tmp_rate, &tmp_dist, best_dual_mode, filter_set_size); } else { // EIGHTTAP_REGULAR mode is calculated beforehand for (i = 1; i < filter_set_size; ++i) { @@ -7653,7 +7913,8 @@ static int64_t interpolation_filter_search( } interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate, skip_txfm_sb, skip_sse_sb, - dst_bufs, i); + dst_bufs, i, switchable_ctx, 0, &tmp_rate, + &tmp_dist); } } swap_dst_buf(xd, dst_bufs, num_planes); @@ -7848,6 +8109,7 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, intrapred, bw); av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + av1_subtract_plane(x, bsize, 0); model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum); @@ -7861,7 +8123,6 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, intrapred, bw); av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); - av1_subtract_plane(x, bsize, 0); rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); if (rd != INT64_MAX) @@ -7908,6 +8169,7 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, mbmi->mv[0].as_int = tmp_mv.as_int; av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize); + av1_subtract_plane(x, bsize, 0); model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); @@ -7925,7 +8187,6 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); } // Evaluate closer to true rd - av1_subtract_plane(x, bsize, 0); rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); @@ -8323,6 +8584,148 @@ static INLINE int get_drl_cost(const MB_MODE_INFO *mbmi, return cost; } +static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_col, int mi_row, + int_mv *cur_mv, int masked_compound_used, + BUFFER_SET *orig_dst, BUFFER_SET *tmp_dst, + int *rate_mv, int64_t *rd, + RD_STATS *rd_stats, int64_t ref_best_rd) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const int this_mode = mbmi->mode; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + int rate_sum, rs2; + int64_t dist_sum; + + int_mv best_mv[2]; + int best_tmp_rate_mv = *rate_mv; + int tmp_skip_txfm_sb; + int64_t tmp_skip_sse_sb; + INTERINTER_COMPOUND_DATA best_compound_data; + best_compound_data.type = COMPOUND_AVERAGE; + DECLARE_ALIGNED(16, uint8_t, pred0[2 * MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, pred1[2 * MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]); // src - pred1 + DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]); // pred1 - pred0 + uint8_t tmp_best_mask_buf[2 * MAX_SB_SQUARE]; + uint8_t *preds0[1] = { pred0 }; + uint8_t *preds1[1] = { pred1 }; + int strides[1] = { bw }; + int tmp_rate_mv; + const int num_pix = 1 << num_pels_log2_lookup[bsize]; + const int mask_len = 2 * num_pix * sizeof(uint8_t); + COMPOUND_TYPE cur_type; + int best_compmode_interinter_cost = 0; + int can_use_previous = cm->allow_warped_motion; + + best_mv[0].as_int = cur_mv[0].as_int; + best_mv[1].as_int = cur_mv[1].as_int; + *rd = INT64_MAX; + if (masked_compound_used) { + // get inter predictors to use for masked compound modes + av1_build_inter_predictors_for_planes_single_buf( + xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides, can_use_previous); + av1_build_inter_predictors_for_planes_single_buf( + xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides, can_use_previous); + const struct buf_2d *const src = &x->plane[0].src; + if (get_bitdepth_data_path_index(xd)) { + aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, + CONVERT_TO_BYTEPTR(pred1), bw, xd->bd); + aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(pred1), + bw, CONVERT_TO_BYTEPTR(pred0), bw, xd->bd); + } else { + aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, pred1, + bw); + aom_subtract_block(bh, bw, diff10, bw, pred1, bw, pred0, bw); + } + } + const int orig_is_best = xd->plane[0].dst.buf == orig_dst->plane[0]; + const BUFFER_SET *backup_buf = orig_is_best ? tmp_dst : orig_dst; + const BUFFER_SET *best_buf = orig_is_best ? orig_dst : tmp_dst; + for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) { + if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break; + if (!is_interinter_compound_used(cur_type, bsize)) continue; + tmp_rate_mv = *rate_mv; + int64_t best_rd_cur = INT64_MAX; + mbmi->interinter_comp.type = cur_type; + int masked_type_cost = 0; + + const int comp_group_idx_ctx = get_comp_group_idx_context(xd); + const int comp_index_ctx = get_comp_index_context(cm, xd); + mbmi->compound_idx = 1; + if (cur_type == COMPOUND_AVERAGE) { + mbmi->comp_group_idx = 0; + if (masked_compound_used) { + masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][0]; + } + masked_type_cost += x->comp_idx_cost[comp_index_ctx][1]; + rs2 = masked_type_cost; + // No need to call av1_build_inter_predictors_sby here + // 1. COMPOUND_AVERAGE is always the first candidate + // 2. av1_build_inter_predictors_sby has been called by + // interpolation_filter_search + int64_t est_rd = + estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); + // use spare buffer for following compound type try + restore_dst_buf(xd, *backup_buf, 1); + if (est_rd != INT64_MAX) + best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum); + } else { + mbmi->comp_group_idx = 1; + masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][1]; + masked_type_cost += x->compound_type_cost[bsize][cur_type - 1]; + rs2 = masked_type_cost; + if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh && + *rd / 3 < ref_best_rd) { + best_rd_cur = build_and_cost_compound_type( + cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst, + &tmp_rate_mv, preds0, preds1, residual1, diff10, strides, mi_row, + mi_col); + } + } + if (best_rd_cur < *rd) { + *rd = best_rd_cur; + best_compound_data = mbmi->interinter_comp; + if (masked_compound_used && cur_type != COMPOUND_TYPES - 1) { + memcpy(tmp_best_mask_buf, xd->seg_mask, mask_len); + } + best_compmode_interinter_cost = rs2; + if (have_newmv_in_inter_mode(this_mode)) { + if (use_masked_motion_search(cur_type)) { + best_tmp_rate_mv = tmp_rate_mv; + best_mv[0].as_int = mbmi->mv[0].as_int; + best_mv[1].as_int = mbmi->mv[1].as_int; + } else { + best_mv[0].as_int = cur_mv[0].as_int; + best_mv[1].as_int = cur_mv[1].as_int; + } + } + } + // reset to original mvs for next iteration + mbmi->mv[0].as_int = cur_mv[0].as_int; + mbmi->mv[1].as_int = cur_mv[1].as_int; + } + if (mbmi->interinter_comp.type != best_compound_data.type) { + mbmi->comp_group_idx = + (best_compound_data.type == COMPOUND_AVERAGE) ? 0 : 1; + mbmi->interinter_comp = best_compound_data; + memcpy(xd->seg_mask, tmp_best_mask_buf, mask_len); + } + if (have_newmv_in_inter_mode(this_mode)) { + mbmi->mv[0].as_int = best_mv[0].as_int; + mbmi->mv[1].as_int = best_mv[1].as_int; + if (use_masked_motion_search(mbmi->interinter_comp.type)) { + rd_stats->rate += best_tmp_rate_mv - *rate_mv; + *rate_mv = best_tmp_rate_mv; + } + } + restore_dst_buf(xd, *best_buf, 1); + return best_compmode_interinter_cost; +} + static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, @@ -8344,63 +8747,24 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, int refs[2] = { mbmi->ref_frame[0], (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) }; int rate_mv = 0; - const int bw = block_size_wide[bsize]; DECLARE_ALIGNED(32, uint8_t, tmp_buf_[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); - uint8_t *tmp_buf; + uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_); int64_t rd = INT64_MAX; BUFFER_SET orig_dst, tmp_dst; int skip_txfm_sb = 0; int64_t skip_sse_sb = INT64_MAX; int16_t mode_ctx; - - mbmi->interinter_comp.type = COMPOUND_AVERAGE; - mbmi->comp_group_idx = 0; - mbmi->compound_idx = 1; - if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME; - - mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf_); - else - tmp_buf = tmp_buf_; - // Make sure that we didn't leave the plane destination buffers set - // to tmp_buf at the end of the last iteration - assert(xd->plane[0].dst.buf != tmp_buf); - - mbmi->num_proj_ref[0] = 0; - mbmi->num_proj_ref[1] = 0; - - if (is_comp_pred) { - for (int ref_idx = 0; ref_idx < is_comp_pred + 1; ++ref_idx) { - const int single_mode = get_single_mode(this_mode, ref_idx, is_comp_pred); - if (single_mode == NEWMV && - args->single_newmv[mbmi->ref_frame[ref_idx]].as_int == INVALID_MV) - return INT64_MAX; - } - } - - mbmi->motion_mode = SIMPLE_TRANSLATION; const int masked_compound_used = is_any_masked_compound_used(bsize) && cm->seq_params.enable_masked_compound; int64_t ret_val = INT64_MAX; const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); - rd_stats->rate += args->ref_frame_cost + args->single_comp_cost; - rd_stats->rate += - get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type); - const RD_STATS backup_rd_stats = *rd_stats; - const RD_STATS backup_rd_stats_y = *rd_stats_y; - const RD_STATS backup_rd_stats_uv = *rd_stats_uv; - const MB_MODE_INFO backup_mbmi = *mbmi; - INTERINTER_COMPOUND_DATA best_compound_data; - uint8_t tmp_best_mask_buf[2 * MAX_SB_SQUARE]; RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv; int64_t best_rd = INT64_MAX; - int64_t best_ret_val = INT64_MAX; uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; MB_MODE_INFO best_mbmi = *mbmi; - int64_t early_terminate = 0; + int best_disable_skip; + int best_xskip; int plane_rate[MAX_MB_PLANE] = { 0 }; int64_t plane_sse[MAX_MB_PLANE] = { 0 }; int64_t plane_dist[MAX_MB_PLANE] = { 0 }; @@ -8411,387 +8775,311 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, int comp_idx; const int search_jnt_comp = is_comp_pred & cm->seq_params.enable_jnt_comp & (mbmi->mode != GLOBAL_GLOBALMV); - // If !search_jnt_comp, we need to force mbmi->compound_idx = 1. - for (comp_idx = 1; comp_idx >= !search_jnt_comp; --comp_idx) { - int rs = 0; - int compmode_interinter_cost = 0; - early_terminate = 0; - *rd_stats = backup_rd_stats; - *rd_stats_y = backup_rd_stats_y; - *rd_stats_uv = backup_rd_stats_uv; - *mbmi = backup_mbmi; - mbmi->compound_idx = comp_idx; - - if (is_comp_pred && comp_idx == 0) { - mbmi->comp_group_idx = 0; - mbmi->compound_idx = 0; - const int comp_group_idx_ctx = get_comp_group_idx_context(xd); - const int comp_index_ctx = get_comp_index_context(cm, xd); - if (masked_compound_used) { - compmode_interinter_cost += - x->comp_group_idx_cost[comp_group_idx_ctx][0]; + const int has_drl = (have_nearmv_in_inter_mode(mbmi->mode) && + mbmi_ext->ref_mv_count[ref_frame_type] > 2) || + ((mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) && + mbmi_ext->ref_mv_count[ref_frame_type] > 1); + + // TODO(jingning): This should be deprecated shortly. + const int idx_offset = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0; + const int ref_set = + has_drl ? AOMMIN(MAX_REF_MV_SERCH, + mbmi_ext->ref_mv_count[ref_frame_type] - idx_offset) + : 1; + + for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) { + if (cpi->sf.reduce_inter_modes && ref_mv_idx > 0) { + if (mbmi->ref_frame[0] == LAST2_FRAME || + mbmi->ref_frame[0] == LAST3_FRAME || + mbmi->ref_frame[1] == LAST2_FRAME || + mbmi->ref_frame[1] == LAST3_FRAME) { + if (mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx + idx_offset] + .weight < REF_CAT_LEVEL) { + continue; + } } - compmode_interinter_cost += x->comp_idx_cost[comp_index_ctx][0]; } - int_mv cur_mv[2]; - if (!build_cur_mv(cur_mv, this_mode, cm, x)) { - early_terminate = INT64_MAX; - continue; - } - if (have_newmv_in_inter_mode(this_mode)) { - if (comp_idx == 0) { - cur_mv[0] = backup_mv[0]; - cur_mv[1] = backup_mv[1]; - rate_mv = backup_rate_mv; - } + av1_init_rd_stats(rd_stats); - // when jnt_comp_skip_mv_search flag is on, new mv will be searched once - if (!(search_jnt_comp && cpi->sf.jnt_comp_skip_mv_search && - comp_idx == 0)) { - newmv_ret_val = - handle_newmv(cpi, x, bsize, cur_mv, mi_row, mi_col, &rate_mv, args); - - // Store cur_mv and rate_mv so that they can be restored in the next - // iteration of the loop - backup_mv[0] = cur_mv[0]; - backup_mv[1] = cur_mv[1]; - backup_rate_mv = rate_mv; - } - - if (newmv_ret_val != 0) { - early_terminate = INT64_MAX; - continue; - } else { - rd_stats->rate += rate_mv; - } - } - for (i = 0; i < is_comp_pred + 1; ++i) { - mbmi->mv[i].as_int = cur_mv[i].as_int; - } + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + mbmi->comp_group_idx = 0; + mbmi->compound_idx = 1; + if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME; - // Initialise tmp_dst and orig_dst buffers to prevent "may be used - // uninitialized" warnings in GCC when the stream is monochrome. - memset(tmp_dst.plane, 0, sizeof(tmp_dst.plane)); - memset(tmp_dst.stride, 0, sizeof(tmp_dst.stride)); - memset(orig_dst.plane, 0, sizeof(tmp_dst.plane)); - memset(orig_dst.stride, 0, sizeof(tmp_dst.stride)); + mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); - // do first prediction into the destination buffer. Do the next - // prediction into a temporary buffer. Then keep track of which one - // of these currently holds the best predictor, and use the other - // one for future predictions. In the end, copy from tmp_buf to - // dst if necessary. - for (i = 0; i < num_planes; i++) { - tmp_dst.plane[i] = tmp_buf + i * MAX_SB_SQUARE; - tmp_dst.stride[i] = MAX_SB_SIZE; - } - for (i = 0; i < num_planes; i++) { - orig_dst.plane[i] = xd->plane[i].dst.buf; - orig_dst.stride[i] = xd->plane[i].dst.stride; - } + mbmi->num_proj_ref[0] = 0; + mbmi->num_proj_ref[1] = 0; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->ref_mv_idx = ref_mv_idx; - const int ref_mv_cost = cost_mv_ref(x, this_mode, mode_ctx); -#if USE_DISCOUNT_NEWMV_TEST - // We don't include the cost of the second reference here, because there - // are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other - // words if you present them in that order, the second one is always known - // if the first is known. - // - // Under some circumstances we discount the cost of new mv mode to encourage - // initiation of a motion field. - if (discount_newmv_test(cpi, x, this_mode, mbmi->mv[0])) { - // discount_newmv_test only applies discount on NEWMV mode. - assert(this_mode == NEWMV); - rd_stats->rate += AOMMIN(cost_mv_ref(x, this_mode, mode_ctx), - cost_mv_ref(x, NEARESTMV, mode_ctx)); - } else { - rd_stats->rate += ref_mv_cost; + if (is_comp_pred) { + for (int ref_idx = 0; ref_idx < is_comp_pred + 1; ++ref_idx) { + const int single_mode = + get_single_mode(this_mode, ref_idx, is_comp_pred); + if (single_mode == NEWMV && + args->single_newmv[mbmi->ref_mv_idx][mbmi->ref_frame[ref_idx]] + .as_int == INVALID_MV) + continue; + } } -#else - rd_stats->rate += ref_mv_cost; -#endif - if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd && - mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) { - early_terminate = INT64_MAX; - continue; - } + rd_stats->rate += args->ref_frame_cost + args->single_comp_cost; + rd_stats->rate += + get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type); - ret_val = interpolation_filter_search( - x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst, args->single_filter, - &rd, &rs, &skip_txfm_sb, &skip_sse_sb); - if (ret_val != 0) { - early_terminate = INT64_MAX; - restore_dst_buf(xd, orig_dst, num_planes); - continue; - } else if (cpi->sf.model_based_post_interp_filter_breakout && - ref_best_rd != INT64_MAX && (rd / 6) > ref_best_rd) { - early_terminate = INT64_MAX; - restore_dst_buf(xd, orig_dst, num_planes); - if ((rd >> 4) > ref_best_rd) break; - continue; - } + const RD_STATS backup_rd_stats = *rd_stats; + const MB_MODE_INFO backup_mbmi = *mbmi; + int64_t best_rd2 = INT64_MAX; - if (is_comp_pred && comp_idx) { - int rate_sum, rs2; - int64_t dist_sum; - int64_t best_rd_compound = INT64_MAX, best_rd_cur = INT64_MAX; - int_mv best_mv[2]; - int best_tmp_rate_mv = rate_mv; - int tmp_skip_txfm_sb; - int64_t tmp_skip_sse_sb; - DECLARE_ALIGNED(16, uint8_t, pred0[2 * MAX_SB_SQUARE]); - DECLARE_ALIGNED(16, uint8_t, pred1[2 * MAX_SB_SQUARE]); - uint8_t *preds0[1] = { pred0 }; - uint8_t *preds1[1] = { pred1 }; - int strides[1] = { bw }; - int tmp_rate_mv; - const int num_pix = 1 << num_pels_log2_lookup[bsize]; - COMPOUND_TYPE cur_type; - int best_compmode_interinter_cost = 0; - int can_use_previous = cm->allow_warped_motion; - - best_mv[0].as_int = cur_mv[0].as_int; - best_mv[1].as_int = cur_mv[1].as_int; + // If !search_jnt_comp, we need to force mbmi->compound_idx = 1. + for (comp_idx = 1; comp_idx >= !search_jnt_comp; --comp_idx) { + int rs = 0; + int compmode_interinter_cost = 0; + *rd_stats = backup_rd_stats; + *mbmi = backup_mbmi; + mbmi->compound_idx = comp_idx; - if (masked_compound_used) { - // get inter predictors to use for masked compound modes - av1_build_inter_predictors_for_planes_single_buf( - xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides, - can_use_previous); - av1_build_inter_predictors_for_planes_single_buf( - xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides, - can_use_previous); - } - - int best_comp_group_idx = 0; - int best_compound_idx = 1; - for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) { - if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break; - if (!is_interinter_compound_used(cur_type, bsize)) continue; - tmp_rate_mv = rate_mv; - best_rd_cur = INT64_MAX; - mbmi->interinter_comp.type = cur_type; - int masked_type_cost = 0; + if (is_comp_pred && comp_idx == 0) { + mbmi->comp_group_idx = 0; + mbmi->compound_idx = 0; const int comp_group_idx_ctx = get_comp_group_idx_context(xd); const int comp_index_ctx = get_comp_index_context(cm, xd); if (masked_compound_used) { - if (cur_type == COMPOUND_AVERAGE) { - mbmi->comp_group_idx = 0; - mbmi->compound_idx = 1; - - masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][0]; - masked_type_cost += x->comp_idx_cost[comp_index_ctx][1]; - } else { - mbmi->comp_group_idx = 1; - mbmi->compound_idx = 1; - - masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][1]; - masked_type_cost += - x->compound_type_cost[bsize][mbmi->interinter_comp.type - 1]; - } - } else { - mbmi->comp_group_idx = 0; - mbmi->compound_idx = 1; - - masked_type_cost += x->comp_idx_cost[comp_index_ctx][1]; + compmode_interinter_cost += + x->comp_group_idx_cost[comp_group_idx_ctx][0]; } - rs2 = masked_type_cost; + compmode_interinter_cost += x->comp_idx_cost[comp_index_ctx][0]; + } - switch (cur_type) { - case COMPOUND_AVERAGE: - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst, - bsize); - av1_subtract_plane(x, bsize, 0); - rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, - INT64_MAX); - if (rd != INT64_MAX) - best_rd_cur = - RDCOST(x->rdmult, rs2 + rate_mv + rate_sum, dist_sum); - break; - case COMPOUND_WEDGE: - if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh && - best_rd_compound / 3 < ref_best_rd) { - best_rd_cur = build_and_cost_compound_type( - cpi, x, cur_mv, bsize, this_mode, &rs2, rate_mv, &orig_dst, - &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col); - } - break; - case COMPOUND_DIFFWTD: - if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh && - best_rd_compound / 3 < ref_best_rd) { - best_rd_cur = build_and_cost_compound_type( - cpi, x, cur_mv, bsize, this_mode, &rs2, rate_mv, &orig_dst, - &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col); - } - break; - default: assert(0); return INT64_MAX; + int_mv cur_mv[2]; + if (!build_cur_mv(cur_mv, this_mode, cm, x)) { + continue; + } + if (have_newmv_in_inter_mode(this_mode)) { + if (comp_idx == 0) { + cur_mv[0] = backup_mv[0]; + cur_mv[1] = backup_mv[1]; + rate_mv = backup_rate_mv; } - if (best_rd_cur < best_rd_compound) { - best_comp_group_idx = mbmi->comp_group_idx; - best_compound_idx = mbmi->compound_idx; - best_rd_compound = best_rd_cur; - best_compound_data = mbmi->interinter_comp; - memcpy(tmp_best_mask_buf, xd->seg_mask, - 2 * num_pix * sizeof(uint8_t)); - best_compmode_interinter_cost = rs2; - if (have_newmv_in_inter_mode(this_mode)) { - if (use_masked_motion_search(cur_type)) { - best_tmp_rate_mv = tmp_rate_mv; - best_mv[0].as_int = mbmi->mv[0].as_int; - best_mv[1].as_int = mbmi->mv[1].as_int; - } else { - best_mv[0].as_int = cur_mv[0].as_int; - best_mv[1].as_int = cur_mv[1].as_int; - } - } + // when jnt_comp_skip_mv_search flag is on, new mv will be searched once + if (!(search_jnt_comp && cpi->sf.jnt_comp_skip_mv_search && + comp_idx == 0)) { + newmv_ret_val = handle_newmv(cpi, x, bsize, cur_mv, mi_row, mi_col, + &rate_mv, args); + + // Store cur_mv and rate_mv so that they can be restored in the next + // iteration of the loop + backup_mv[0] = cur_mv[0]; + backup_mv[1] = cur_mv[1]; + backup_rate_mv = rate_mv; } - // reset to original mvs for next iteration - mbmi->mv[0].as_int = cur_mv[0].as_int; - mbmi->mv[1].as_int = cur_mv[1].as_int; - } - mbmi->comp_group_idx = best_comp_group_idx; - mbmi->compound_idx = best_compound_idx; - mbmi->interinter_comp = best_compound_data; - assert(IMPLIES(mbmi->comp_group_idx == 1, - mbmi->interinter_comp.type != COMPOUND_AVERAGE)); - memcpy(xd->seg_mask, tmp_best_mask_buf, 2 * num_pix * sizeof(uint8_t)); - if (have_newmv_in_inter_mode(this_mode)) { - mbmi->mv[0].as_int = best_mv[0].as_int; - mbmi->mv[1].as_int = best_mv[1].as_int; - if (use_masked_motion_search(mbmi->interinter_comp.type)) { - rd_stats->rate += best_tmp_rate_mv - rate_mv; - rate_mv = best_tmp_rate_mv; + + if (newmv_ret_val != 0) { + continue; + } else { + rd_stats->rate += rate_mv; } } + for (i = 0; i < is_comp_pred + 1; ++i) { + mbmi->mv[i].as_int = cur_mv[i].as_int; + } - if (ref_best_rd < INT64_MAX && best_rd_compound / 3 > ref_best_rd) { - restore_dst_buf(xd, orig_dst, num_planes); - early_terminate = INT64_MAX; + // Initialise tmp_dst and orig_dst buffers to prevent "may be used + // uninitialized" warnings in GCC when the stream is monochrome. + memset(tmp_dst.plane, 0, sizeof(tmp_dst.plane)); + memset(tmp_dst.stride, 0, sizeof(tmp_dst.stride)); + memset(orig_dst.plane, 0, sizeof(tmp_dst.plane)); + memset(orig_dst.stride, 0, sizeof(tmp_dst.stride)); + + // do first prediction into the destination buffer. Do the next + // prediction into a temporary buffer. Then keep track of which one + // of these currently holds the best predictor, and use the other + // one for future predictions. In the end, copy from tmp_buf to + // dst if necessary. + for (i = 0; i < num_planes; i++) { + tmp_dst.plane[i] = tmp_buf + i * MAX_SB_SQUARE; + tmp_dst.stride[i] = MAX_SB_SIZE; + } + for (i = 0; i < num_planes; i++) { + orig_dst.plane[i] = xd->plane[i].dst.buf; + orig_dst.stride[i] = xd->plane[i].dst.stride; + } + + const int ref_mv_cost = cost_mv_ref(x, this_mode, mode_ctx); +#if USE_DISCOUNT_NEWMV_TEST + // We don't include the cost of the second reference here, because there + // are only three options: Last/Golden, ARF/Last or Golden/ARF, or in + // other words if you present them in that order, the second one is always + // known if the first is known. + // + // Under some circumstances we discount the cost of new mv mode to + // encourage initiation of a motion field. + if (discount_newmv_test(cpi, x, this_mode, mbmi->mv[0])) { + // discount_newmv_test only applies discount on NEWMV mode. + assert(this_mode == NEWMV); + rd_stats->rate += AOMMIN(cost_mv_ref(x, this_mode, mode_ctx), + cost_mv_ref(x, NEARESTMV, mode_ctx)); + } else { + rd_stats->rate += ref_mv_cost; + } +#else + rd_stats->rate += ref_mv_cost; +#endif + + if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd && + mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) { continue; } - compmode_interinter_cost = best_compmode_interinter_cost; - } - if (is_comp_pred) { - int tmp_rate; - int64_t tmp_dist; - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, &orig_dst, bsize); - model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, - &tmp_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate, - plane_sse, plane_dist); - rd = RDCOST(x->rdmult, rs + tmp_rate, tmp_dist); - } - - if (search_jnt_comp) { - // if 1/2 model rd is larger than best_rd in jnt_comp mode, - // use jnt_comp mode, save additional search - if ((rd >> 1) > best_rd) { + ret_val = interpolation_filter_search( + x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst, + args->single_filter, &rd, &rs, &skip_txfm_sb, &skip_sse_sb); + if (ret_val != 0) { + restore_dst_buf(xd, orig_dst, num_planes); + continue; + } else if (cpi->sf.model_based_post_interp_filter_breakout && + ref_best_rd != INT64_MAX && (rd / 6 > ref_best_rd)) { restore_dst_buf(xd, orig_dst, num_planes); + if ((rd >> 4) > ref_best_rd) break; continue; } - } - if (!is_comp_pred) - args->single_filter[this_mode][refs[0]] = - av1_extract_interp_filter(mbmi->interp_filters, 0); + if (is_comp_pred && comp_idx) { + int64_t best_rd_compound; + compmode_interinter_cost = compound_type_rd( + cpi, x, bsize, mi_col, mi_row, cur_mv, masked_compound_used, + &orig_dst, &tmp_dst, &rate_mv, &best_rd_compound, rd_stats, + ref_best_rd); + if (ref_best_rd < INT64_MAX && best_rd_compound / 3 > ref_best_rd) { + restore_dst_buf(xd, orig_dst, num_planes); + continue; + } + if (mbmi->interinter_comp.type != COMPOUND_AVERAGE) { + int tmp_rate; + int64_t tmp_dist; + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, &orig_dst, + bsize); + for (int plane = 0; plane < num_planes; ++plane) + av1_subtract_plane(x, bsize, plane); + model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, + &tmp_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate, + plane_sse, plane_dist); + rd = RDCOST(x->rdmult, rs + tmp_rate, tmp_dist); + } + } - if (args->modelled_rd != NULL) { - if (is_comp_pred) { - const int mode0 = compound_ref0_mode(this_mode); - const int mode1 = compound_ref1_mode(this_mode); - const int64_t mrd = AOMMIN(args->modelled_rd[mode0][refs[0]], - args->modelled_rd[mode1][refs[1]]); - if (rd / 4 * 3 > mrd && ref_best_rd < INT64_MAX) { + if (search_jnt_comp) { + // if 1/2 model rd is larger than best_rd in jnt_comp mode, + // use jnt_comp mode, save additional search + if ((rd >> 1) > best_rd) { restore_dst_buf(xd, orig_dst, num_planes); - early_terminate = INT64_MAX; continue; } - } else { - args->modelled_rd[this_mode][refs[0]] = rd; } - } - if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) { - // if current pred_error modeled rd is substantially more than the best - // so far, do not bother doing full rd - if (rd / 2 > ref_best_rd) { - restore_dst_buf(xd, orig_dst, num_planes); - early_terminate = INT64_MAX; - continue; + if (!is_comp_pred) + args->single_filter[this_mode][refs[0]] = + av1_extract_interp_filter(mbmi->interp_filters, 0); + + if (args->modelled_rd != NULL) { + if (is_comp_pred) { + const int mode0 = compound_ref0_mode(this_mode); + const int mode1 = compound_ref1_mode(this_mode); + const int64_t mrd = AOMMIN(args->modelled_rd[mode0][refs[0]], + args->modelled_rd[mode1][refs[1]]); + if (rd / 4 * 3 > mrd && ref_best_rd < INT64_MAX) { + restore_dst_buf(xd, orig_dst, num_planes); + continue; + } + } else { + args->modelled_rd[this_mode][refs[0]] = rd; + } } - } - rd_stats->rate += compmode_interinter_cost; + if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) { + // if current pred_error modeled rd is substantially more than the best + // so far, do not bother doing full rd + if (rd / 2 > ref_best_rd) { + restore_dst_buf(xd, orig_dst, num_planes); + continue; + } + } - if (search_jnt_comp && cpi->sf.jnt_comp_fast_tx_search && comp_idx == 0) { - // TODO(chengchen): this speed feature introduces big loss. - // Need better estimation of rate distortion. - rd_stats->rate += rs; - rd_stats->rate += plane_rate[0] + plane_rate[1] + plane_rate[2]; - rd_stats_y->rate = plane_rate[0]; - rd_stats_uv->rate = plane_rate[1] + plane_rate[2]; - rd_stats->sse = plane_sse[0] + plane_sse[1] + plane_sse[2]; - rd_stats_y->sse = plane_sse[0]; - rd_stats_uv->sse = plane_sse[1] + plane_sse[2]; - rd_stats->dist = plane_dist[0] + plane_dist[1] + plane_dist[2]; - rd_stats_y->dist = plane_dist[0]; - rd_stats_uv->dist = plane_dist[1] + plane_dist[2]; - } else { + rd_stats->rate += compmode_interinter_cost; + + if (search_jnt_comp && cpi->sf.jnt_comp_fast_tx_search && comp_idx == 0) { + // TODO(chengchen): this speed feature introduces big loss. + // Need better estimation of rate distortion. + rd_stats->rate += rs; + rd_stats->rate += plane_rate[0] + plane_rate[1] + plane_rate[2]; + rd_stats_y->rate = plane_rate[0]; + rd_stats_uv->rate = plane_rate[1] + plane_rate[2]; + rd_stats->sse = plane_sse[0] + plane_sse[1] + plane_sse[2]; + rd_stats_y->sse = plane_sse[0]; + rd_stats_uv->sse = plane_sse[1] + plane_sse[2]; + rd_stats->dist = plane_dist[0] + plane_dist[1] + plane_dist[2]; + rd_stats_y->dist = plane_dist[0]; + rd_stats_uv->dist = plane_dist[1] + plane_dist[2]; + } else { #if CONFIG_COLLECT_INTER_MODE_RD_STATS - ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, - disable_skip, mi_row, mi_col, args, ref_best_rd, - refs, rate_mv, &orig_dst, best_est_rd); + ret_val = + motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, + disable_skip, mi_row, mi_col, args, ref_best_rd, + refs, rate_mv, &orig_dst, best_est_rd); #else - ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, - disable_skip, mi_row, mi_col, args, ref_best_rd, - refs, rate_mv, &orig_dst); + ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, + rd_stats_uv, disable_skip, mi_row, mi_col, + args, ref_best_rd, refs, rate_mv, &orig_dst); #endif - } - if (ret_val != INT64_MAX) { - if (search_jnt_comp) { + } + if (ret_val != INT64_MAX) { int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); if (tmp_rd < best_rd) { best_rd_stats = *rd_stats; best_rd_stats_y = *rd_stats_y; best_rd_stats_uv = *rd_stats_uv; - best_ret_val = ret_val; best_rd = tmp_rd; best_mbmi = *mbmi; + best_disable_skip = *disable_skip; + best_xskip = x->skip; memcpy(best_blk_skip, x->blk_skip, sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w); } + + if (tmp_rd < best_rd2) { + best_rd2 = tmp_rd; + } + if (tmp_rd < ref_best_rd) { ref_best_rd = tmp_rd; } } - } - if (!search_jnt_comp && ret_val != 0) { restore_dst_buf(xd, orig_dst, num_planes); - return ret_val; } - restore_dst_buf(xd, orig_dst, num_planes); + + args->modelled_rd = NULL; } + if (best_rd == INT64_MAX) return INT64_MAX; + // re-instate status of the best choice - if (is_comp_pred && best_ret_val != INT64_MAX) { - *rd_stats = best_rd_stats; - *rd_stats_y = best_rd_stats_y; - *rd_stats_uv = best_rd_stats_uv; - ret_val = best_ret_val; - *mbmi = best_mbmi; - assert(IMPLIES(mbmi->comp_group_idx == 1, - mbmi->interinter_comp.type != COMPOUND_AVERAGE)); - memcpy(x->blk_skip, best_blk_skip, - sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w); - } - if (early_terminate == INT64_MAX) return INT64_MAX; - if (ret_val != 0) return ret_val; + *rd_stats = best_rd_stats; + *rd_stats_y = best_rd_stats_y; + *rd_stats_uv = best_rd_stats_uv; + *mbmi = best_mbmi; + *disable_skip = best_disable_skip; + x->skip = best_xskip; + assert(IMPLIES(mbmi->comp_group_idx == 1, + mbmi->interinter_comp.type != COMPOUND_AVERAGE)); + memcpy(x->blk_skip, best_blk_skip, + sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w); + return RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); } @@ -8822,6 +9110,13 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, av1_find_best_ref_mvs_from_stack(0, mbmi_ext, ref_frame, &nearestmv, &nearmv, 0); + if (nearestmv.as_int == INVALID_MV) { + nearestmv.as_int = 0; + } + if (nearmv.as_int == INVALID_MV) { + nearmv.as_int = 0; + } + int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv; if (dv_ref.as_int == 0) av1_find_ref_dv(&dv_ref, tile, cm->seq_params.mib_size, mi_row, mi_col); @@ -9013,8 +9308,9 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row, if (intra_yrd < best_rd) { // Only store reconstructed luma when there's chroma RDO. When there's no // chroma RDO, the reconstructed luma will be stored in encode_superblock(). - xd->cfl.is_chroma_reference = is_chroma_reference( - mi_row, mi_col, bsize, cm->subsampling_x, cm->subsampling_y); + xd->cfl.is_chroma_reference = + is_chroma_reference(mi_row, mi_col, bsize, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y); xd->cfl.store_y = store_cfl_required_rdo(cm, x); if (xd->cfl.store_y) { // Restore reconstructed luma values. @@ -9081,7 +9377,7 @@ static void restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) { for (r = 0; r < rows; ++r) { for (c = 0; c < cols; ++c) { - if (cpi->common.use_highbitdepth) { + if (cpi->common.seq_params.use_highbitdepth) { data[(r * cols + c) * 2] = src_u16[r * src_stride + c]; data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c]; } else { @@ -9760,6 +10056,8 @@ static int inter_mode_search_order_independent_skip( if (comp_pred) { if (!cpi->allow_comp_inter_inter) return 1; + if (cm->reference_mode == SINGLE_REFERENCE) return 1; + // Skip compound inter modes if ARF is not available. if (!(cpi->ref_frame_flags & ref_frame_flag_list[ref_frame[1]])) return 1; @@ -9857,7 +10155,7 @@ static int handle_intra_mode(InterModeSearchState *search_state, av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type); const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]]; const int intra_cost_penalty = av1_get_intra_cost_penalty( - cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth); + cm->base_qindex, cm->y_dc_delta_q, cm->seq_params.bit_depth); const int rows = block_size_high[bsize]; const int cols = block_size_wide[bsize]; const int num_planes = av1_num_planes(cm); @@ -10050,7 +10348,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, const int try_palette = av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type); PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; - MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; const struct segmentation *const seg = &cm->seg; PREDICTION_MODE this_mode; MV_REFERENCE_FRAME ref_frame, second_ref_frame; @@ -10097,7 +10394,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, int64_t distortion2 = 0; int skippable = 0; int this_skip2 = 0; - uint8_t ref_frame_type; this_mode = av1_mode_order[mode_index].mode; ref_frame = av1_mode_order[mode_index].ref_frame[0]; @@ -10195,7 +10491,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, mbmi->angle_delta[PLANE_TYPE_UV] = 0; mbmi->filter_intra_mode_info.use_filter_intra = 0; mbmi->ref_mv_idx = 0; - ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); int64_t ref_best_rd = search_state.best_rd; { RD_STATS rd_stats, rd_stats_y, rd_stats_uv; @@ -10203,9 +10498,9 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, rd_stats.rate = rate2; // Point to variables that are maintained between loop iterations - args.single_newmv = search_state.single_newmv[0]; - args.single_newmv_rate = search_state.single_newmv_rate[0]; - args.single_newmv_valid = search_state.single_newmv_valid[0]; + args.single_newmv = search_state.single_newmv; + args.single_newmv_rate = search_state.single_newmv_rate; + args.single_newmv_valid = search_state.single_newmv_valid; args.modelled_rd = search_state.modelled_rd; args.single_comp_cost = real_compmode_cost; args.ref_frame_cost = ref_frame_cost; @@ -10218,10 +10513,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, &rd_stats_uv, &disable_skip, mi_row, mi_col, &args, ref_best_rd); #endif - if (this_rd < ref_best_rd) { - ref_best_rd = this_rd; - } - rate2 = rd_stats.rate; skippable = rd_stats.skip; distortion2 = rd_stats.dist; @@ -10229,108 +10520,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, rate_uv = rd_stats_uv.rate; } - // TODO(jingning): This needs some refactoring to improve code quality - // and reduce redundant steps. - if ((have_nearmv_in_inter_mode(mbmi->mode) && - mbmi_ext->ref_mv_count[ref_frame_type] > 2) || - ((mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) && - mbmi_ext->ref_mv_count[ref_frame_type] > 1)) { - MB_MODE_INFO backup_mbmi = *mbmi; - int backup_skip = x->skip; - int64_t tmp_ref_rd = this_rd; - int ref_idx; - - // TODO(jingning): This should be deprecated shortly. - int idx_offset = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0; - int ref_set = - AOMMIN(MAX_REF_MV_SERCH - 1, - mbmi_ext->ref_mv_count[ref_frame_type] - 1 - idx_offset); - memcpy(x->blk_skip_drl, x->blk_skip, - sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); - - for (ref_idx = 0; ref_idx < ref_set; ++ref_idx) { - int64_t tmp_alt_rd = INT64_MAX; - int dummy_disable_skip = 0; - int_mv cur_mv; - RD_STATS tmp_rd_stats, tmp_rd_stats_y, tmp_rd_stats_uv; - - av1_invalid_rd_stats(&tmp_rd_stats); - - x->skip = 0; - - mbmi->ref_mv_idx = 1 + ref_idx; - - if (cpi->sf.reduce_inter_modes) { - if (mbmi->ref_frame[0] == LAST2_FRAME || - mbmi->ref_frame[0] == LAST3_FRAME || - mbmi->ref_frame[1] == LAST2_FRAME || - mbmi->ref_frame[1] == LAST3_FRAME) { - if (mbmi_ext - ->ref_mv_stack[ref_frame_type] - [mbmi->ref_mv_idx + idx_offset] - .weight < REF_CAT_LEVEL) { - *mbmi = backup_mbmi; - x->skip = backup_skip; - continue; - } - } - } - - cur_mv = - mbmi_ext->ref_mv_stack[ref_frame][mbmi->ref_mv_idx + idx_offset] - .this_mv; - clamp_mv2(&cur_mv.as_mv, xd); - - if (!mv_check_bounds(&x->mv_limits, &cur_mv.as_mv)) { - av1_init_rd_stats(&tmp_rd_stats); - - args.modelled_rd = NULL; - args.single_newmv = search_state.single_newmv[mbmi->ref_mv_idx]; - args.single_newmv_rate = - search_state.single_newmv_rate[mbmi->ref_mv_idx]; - args.single_newmv_valid = - search_state.single_newmv_valid[mbmi->ref_mv_idx]; - args.single_comp_cost = real_compmode_cost; - args.ref_frame_cost = ref_frame_cost; -#if CONFIG_COLLECT_INTER_MODE_RD_STATS - tmp_alt_rd = - handle_inter_mode(cpi, x, bsize, &tmp_rd_stats, &tmp_rd_stats_y, - &tmp_rd_stats_uv, &dummy_disable_skip, mi_row, - mi_col, &args, ref_best_rd, &best_est_rd); -#else - tmp_alt_rd = handle_inter_mode( - cpi, x, bsize, &tmp_rd_stats, &tmp_rd_stats_y, &tmp_rd_stats_uv, - &dummy_disable_skip, mi_row, mi_col, &args, ref_best_rd); -#endif - - // Prevent pointers from escaping local scope - args.single_newmv = search_state.single_newmv[0]; - args.single_newmv_rate = search_state.single_newmv_rate[0]; - args.single_newmv_valid = search_state.single_newmv_valid[0]; - } - - if (tmp_ref_rd > tmp_alt_rd) { - rate2 = tmp_rd_stats.rate; - disable_skip = dummy_disable_skip; - distortion2 = tmp_rd_stats.dist; - skippable = tmp_rd_stats.skip; - rate_y = tmp_rd_stats_y.rate; - rate_uv = tmp_rd_stats_uv.rate; - this_rd = tmp_alt_rd; - tmp_ref_rd = tmp_alt_rd; - backup_mbmi = *mbmi; - backup_skip = x->skip; - memcpy(x->blk_skip_drl, x->blk_skip, - sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); - } else { - *mbmi = backup_mbmi; - x->skip = backup_skip; - } - } - - memcpy(x->blk_skip, x->blk_skip_drl, - sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); - } if (this_rd == INT64_MAX) continue; this_skip2 = mbmi->skip; diff --git a/third_party/aom/av1/encoder/rdopt.h b/third_party/aom/av1/encoder/rdopt.h index 1fa3d68ce..12df472c1 100644 --- a/third_party/aom/av1/encoder/rdopt.h +++ b/third_party/aom/av1/encoder/rdopt.h @@ -78,8 +78,8 @@ static INLINE int av1_cost_skip_txb(MACROBLOCK *x, const TXB_CTX *const txb_ctx, } static INLINE int av1_cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, - int plane, int blk_row, int blk_col, - int block, TX_SIZE tx_size, + int plane, int block, TX_SIZE tx_size, + const TX_TYPE tx_type, const TXB_CTX *const txb_ctx, int use_fast_coef_costing) { #if TXCOEFF_COST_TIMER @@ -87,8 +87,8 @@ static INLINE int av1_cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, aom_usec_timer_start(&timer); #endif (void)use_fast_coef_costing; - const int cost = av1_cost_coeffs_txb(cm, x, plane, blk_row, blk_col, block, - tx_size, txb_ctx); + const int cost = + av1_cost_coeffs_txb(cm, x, plane, block, tx_size, tx_type, txb_ctx); #if TXCOEFF_COST_TIMER AV1_COMMON *tmp_cm = (AV1_COMMON *)&cpi->common; aom_usec_timer_mark(&timer); diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c index 49740817c..d4b4b19c4 100644 --- a/third_party/aom/av1/encoder/speed_features.c +++ b/third_party/aom/av1/encoder/speed_features.c @@ -89,9 +89,27 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi, SPEED_FEATURES *sf, int speed) { AV1_COMMON *const cm = &cpi->common; + const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; + const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; + + if (is_480p_or_larger) { + sf->use_square_partition_only_threshold = BLOCK_128X128; + } else { + sf->use_square_partition_only_threshold = BLOCK_64X64; + } + + if (speed >= 1) { + if (is_720p_or_larger) { + sf->use_square_partition_only_threshold = BLOCK_128X128; + } else if (is_480p_or_larger) { + sf->use_square_partition_only_threshold = BLOCK_64X64; + } else { + sf->use_square_partition_only_threshold = BLOCK_32X32; + } + } if (speed >= 2) { - if (AOMMIN(cm->width, cm->height) >= 720) { + if (is_720p_or_larger) { sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; sf->adaptive_pred_interp_filter = 0; @@ -106,7 +124,7 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi, } if (speed >= 3) { - if (AOMMIN(cm->width, cm->height) >= 720) { + if (is_720p_or_larger) { sf->disable_split_mask = DISABLE_ALL_SPLIT; sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0; sf->partition_search_breakout_dist_thr = (1 << 25); @@ -130,7 +148,7 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi, } if (speed >= 4) { - if (AOMMIN(cm->width, cm->height) >= 720) { + if (is_720p_or_larger) { sf->partition_search_breakout_dist_thr = (1 << 26); } else { sf->partition_search_breakout_dist_thr = (1 << 24); @@ -149,6 +167,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->reduce_inter_modes = 1; sf->prune_ext_partition_types_search_level = 1; sf->ml_prune_ab_partition = 1; + sf->ml_prune_4_partition = 1; sf->adaptive_txb_search_level = 1; sf->jnt_comp_skip_mv_search = 1; sf->model_based_prune_tx_search_level = 1; @@ -195,7 +214,9 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->comp_inter_joint_search_thresh = BLOCK_SIZES_ALL; sf->partition_search_breakout_rate_thr = 80; - sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; + // Note: This speed feature is disable as it seems to be worse in + // compression/quality and is also slower. + // sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; sf->allow_partition_search_skip = 1; sf->disable_wedge_search_var_thresh = 100; sf->fast_wedge_sign_estimate = 1; @@ -221,7 +242,8 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, if (speed >= 4) { sf->tx_type_search.fast_intra_tx_type_search = 1; sf->tx_type_search.fast_inter_tx_type_search = 1; - sf->use_square_partition_only = !boosted; + sf->use_square_partition_only_threshold = + boosted ? BLOCK_128X128 : BLOCK_4X4; sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL; sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED; @@ -242,7 +264,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL; sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL; - sf->use_square_partition_only = 1; + sf->use_square_partition_only_threshold = BLOCK_4X4; sf->tx_size_search_method = USE_LARGESTALL; sf->mv.search_method = BIGDIA; sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; @@ -363,9 +385,11 @@ static void set_dev_sf(AV1_COMP *cpi, SPEED_FEATURES *sf, int speed) { if (speed & PARTITION_SF) { if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) || has_internal_image_edge(cpi)) { - sf->use_square_partition_only = !frame_is_boosted(cpi); + sf->use_square_partition_only_threshold = + frame_is_boosted(cpi) ? BLOCK_128X128 : BLOCK_4X4; } else { - sf->use_square_partition_only = !frame_is_intra_only(cm); + sf->use_square_partition_only_threshold = + frame_is_intra_only(cm) ? BLOCK_128X128 : BLOCK_4X4; } sf->less_rectangular_check = 1; sf->prune_ext_partition_types_search_level = 2; @@ -438,7 +462,7 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { sf->tx_type_search.skip_tx_search = 0; sf->selective_ref_frame = 0; sf->less_rectangular_check = 0; - sf->use_square_partition_only = 0; + sf->use_square_partition_only_threshold = BLOCK_128X128; sf->auto_min_max_partition_size = NOT_IN_USE; sf->rd_auto_partition_min_limit = BLOCK_4X4; sf->default_max_partition_size = BLOCK_LARGEST; @@ -493,6 +517,7 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { sf->simple_model_rd_from_var = 0; sf->prune_ext_partition_types_search_level = 0; sf->ml_prune_ab_partition = 0; + sf->ml_prune_4_partition = 0; sf->fast_cdef_search = 0; // Set this at the appropriate speed levels diff --git a/third_party/aom/av1/encoder/speed_features.h b/third_party/aom/av1/encoder/speed_features.h index 59cb6be58..d0408ba2f 100644 --- a/third_party/aom/av1/encoder/speed_features.h +++ b/third_party/aom/av1/encoder/speed_features.h @@ -400,6 +400,9 @@ typedef struct SPEED_FEATURES { // Use a ML model to prune horz_a, horz_b, vert_a and vert_b partitions. int ml_prune_ab_partition; + // Use a ML model to prune horz4 and vert4 partitions. + int ml_prune_4_partition; + int fast_cdef_search; // 2-pass coding block partition search @@ -413,8 +416,8 @@ typedef struct SPEED_FEATURES { // rd than partition type split. int less_rectangular_check; - // Disable testing non square partitions. (eg 16x32) - int use_square_partition_only; + // Use square partition only beyond this block size. + BLOCK_SIZE use_square_partition_only_threshold; // Sets min and max partition sizes for this superblock based on the // same superblock in last encoded frame, and the left and above neighbor. diff --git a/third_party/aom/av1/encoder/temporal_filter.c b/third_party/aom/av1/encoder/temporal_filter.c index 250feab81..d7e4f4eb3 100644 --- a/third_party/aom/av1/encoder/temporal_filter.c +++ b/third_party/aom/av1/encoder/temporal_filter.c @@ -535,10 +535,10 @@ static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost, // Adjust the strength based on active max q. if (cpi->common.current_video_frame > 1) q = ((int)av1_convert_qindex_to_q(cpi->rc.avg_frame_qindex[INTER_FRAME], - cpi->common.bit_depth)); + cpi->common.seq_params.bit_depth)); else q = ((int)av1_convert_qindex_to_q(cpi->rc.avg_frame_qindex[KEY_FRAME], - cpi->common.bit_depth)); + cpi->common.seq_params.bit_depth)); if (q > 16) { strength = oxcf->arnr_strength; } else { diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c index 84065d6de..c71f2e74c 100644 --- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + #include "av1/encoder/x86/av1_txfm1d_sse4.h" void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c new file mode 100644 index 000000000..592462e20 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c @@ -0,0 +1,2068 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/av1_rtcd.h" + +#include "av1/common/enums.h" +#include "av1/common/av1_txfm.h" +#include "av1/encoder/x86/av1_fwd_txfm_avx2.h" +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "av1/encoder/x86/av1_txfm1d_sse4.h" +#include "av1/encoder/x86/av1_fwd_txfm_sse2.h" +#include "aom_dsp/x86/txfm_common_avx2.h" + +static INLINE void fdct16x16_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); + __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); + __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + + // stage 1 + __m256i x1[16]; + btf_16_adds_subs_out_avx2(&x1[0], &x1[15], input[0], input[15]); + btf_16_adds_subs_out_avx2(&x1[1], &x1[14], input[1], input[14]); + btf_16_adds_subs_out_avx2(&x1[2], &x1[13], input[2], input[13]); + btf_16_adds_subs_out_avx2(&x1[3], &x1[12], input[3], input[12]); + btf_16_adds_subs_out_avx2(&x1[4], &x1[11], input[4], input[11]); + btf_16_adds_subs_out_avx2(&x1[5], &x1[10], input[5], input[10]); + btf_16_adds_subs_out_avx2(&x1[6], &x1[9], input[6], input[9]); + btf_16_adds_subs_out_avx2(&x1[7], &x1[8], input[7], input[8]); + + // stage 2 + btf_16_adds_subs_avx2(&x1[0], &x1[7]); + btf_16_adds_subs_avx2(&x1[1], &x1[6]); + btf_16_adds_subs_avx2(&x1[2], &x1[5]); + btf_16_adds_subs_avx2(&x1[3], &x1[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit); + + // stage 3 + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[11]); + btf_16_adds_subs_avx2(&x1[9], &x1[10]); + btf_16_adds_subs_avx2(&x1[15], &x1[12]); + btf_16_adds_subs_avx2(&x1[14], &x1[13]); + + // stage 4 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); + + // stage 5 + btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + + // stage 6 + btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit); + btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit); + + // stage 7 + output[0] = x1[0]; + output[1] = x1[8]; + output[2] = x1[4]; + output[3] = x1[12]; + output[4] = x1[2]; + output[5] = x1[10]; + output[6] = x1[6]; + output[7] = x1[14]; + output[8] = x1[1]; + output[9] = x1[9]; + output[10] = x1[5]; + output[11] = x1[13]; + output[12] = x1[3]; + output[13] = x1[11]; + output[14] = x1[7]; + output[15] = x1[15]; +} + +static INLINE void fdct16x32_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); + __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); + __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]); + __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]); + __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]); + __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]); + __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]); + __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]); + __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]); + __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]); + __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]); + __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]); + __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]); + __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]); + __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]); + __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]); + __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]); + __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]); + + // stage 1 + __m256i x1[32]; + btf_16_adds_subs_out_avx2(&x1[0], &x1[31], input[0], input[31]); + btf_16_adds_subs_out_avx2(&x1[1], &x1[30], input[1], input[30]); + btf_16_adds_subs_out_avx2(&x1[2], &x1[29], input[2], input[29]); + btf_16_adds_subs_out_avx2(&x1[3], &x1[28], input[3], input[28]); + btf_16_adds_subs_out_avx2(&x1[4], &x1[27], input[4], input[27]); + btf_16_adds_subs_out_avx2(&x1[5], &x1[26], input[5], input[26]); + btf_16_adds_subs_out_avx2(&x1[6], &x1[25], input[6], input[25]); + btf_16_adds_subs_out_avx2(&x1[7], &x1[24], input[7], input[24]); + btf_16_adds_subs_out_avx2(&x1[8], &x1[23], input[8], input[23]); + btf_16_adds_subs_out_avx2(&x1[9], &x1[22], input[9], input[22]); + btf_16_adds_subs_out_avx2(&x1[10], &x1[21], input[10], input[21]); + btf_16_adds_subs_out_avx2(&x1[11], &x1[20], input[11], input[20]); + btf_16_adds_subs_out_avx2(&x1[12], &x1[19], input[12], input[19]); + btf_16_adds_subs_out_avx2(&x1[13], &x1[18], input[13], input[18]); + btf_16_adds_subs_out_avx2(&x1[14], &x1[17], input[14], input[17]); + btf_16_adds_subs_out_avx2(&x1[15], &x1[16], input[15], input[16]); + + // stage 2 + btf_16_adds_subs_avx2(&x1[0], &x1[15]); + btf_16_adds_subs_avx2(&x1[1], &x1[14]); + btf_16_adds_subs_avx2(&x1[2], &x1[13]); + btf_16_adds_subs_avx2(&x1[3], &x1[12]); + btf_16_adds_subs_avx2(&x1[4], &x1[11]); + btf_16_adds_subs_avx2(&x1[5], &x1[10]); + btf_16_adds_subs_avx2(&x1[6], &x1[9]); + btf_16_adds_subs_avx2(&x1[7], &x1[8]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit); + + // stage 3 + btf_16_adds_subs_avx2(&x1[0], &x1[7]); + btf_16_adds_subs_avx2(&x1[1], &x1[6]); + btf_16_adds_subs_avx2(&x1[2], &x1[5]); + btf_16_adds_subs_avx2(&x1[3], &x1[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[23]); + btf_16_adds_subs_avx2(&x1[17], &x1[22]); + btf_16_adds_subs_avx2(&x1[18], &x1[21]); + btf_16_adds_subs_avx2(&x1[19], &x1[20]); + btf_16_adds_subs_avx2(&x1[31], &x1[24]); + btf_16_adds_subs_avx2(&x1[30], &x1[25]); + btf_16_adds_subs_avx2(&x1[29], &x1[26]); + btf_16_adds_subs_avx2(&x1[28], &x1[27]); + + // stage 4 + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[11]); + btf_16_adds_subs_avx2(&x1[9], &x1[10]); + btf_16_adds_subs_avx2(&x1[15], &x1[12]); + btf_16_adds_subs_avx2(&x1[14], &x1[13]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit); + + // stage 5 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[19]); + btf_16_adds_subs_avx2(&x1[17], &x1[18]); + btf_16_adds_subs_avx2(&x1[23], &x1[20]); + btf_16_adds_subs_avx2(&x1[22], &x1[21]); + btf_16_adds_subs_avx2(&x1[24], &x1[27]); + btf_16_adds_subs_avx2(&x1[25], &x1[26]); + btf_16_adds_subs_avx2(&x1[31], &x1[28]); + btf_16_adds_subs_avx2(&x1[30], &x1[29]); + + // stage 6 + btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit); + + // stage 7 + btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit); + btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[17]); + btf_16_adds_subs_avx2(&x1[19], &x1[18]); + btf_16_adds_subs_avx2(&x1[20], &x1[21]); + btf_16_adds_subs_avx2(&x1[23], &x1[22]); + btf_16_adds_subs_avx2(&x1[24], &x1[25]); + btf_16_adds_subs_avx2(&x1[27], &x1[26]); + btf_16_adds_subs_avx2(&x1[28], &x1[29]); + btf_16_adds_subs_avx2(&x1[31], &x1[30]); + + // stage 8 + btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit); + btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit); + btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit); + + // stage 9 + output[0] = x1[0]; + output[1] = x1[16]; + output[2] = x1[8]; + output[3] = x1[24]; + output[4] = x1[4]; + output[5] = x1[20]; + output[6] = x1[12]; + output[7] = x1[28]; + output[8] = x1[2]; + output[9] = x1[18]; + output[10] = x1[10]; + output[11] = x1[26]; + output[12] = x1[6]; + output[13] = x1[22]; + output[14] = x1[14]; + output[15] = x1[30]; + output[16] = x1[1]; + output[17] = x1[17]; + output[18] = x1[9]; + output[19] = x1[25]; + output[20] = x1[5]; + output[21] = x1[21]; + output[22] = x1[13]; + output[23] = x1[29]; + output[24] = x1[3]; + output[25] = x1[19]; + output[26] = x1[11]; + output[27] = x1[27]; + output[28] = x1[7]; + output[29] = x1[23]; + output[30] = x1[15]; + output[31] = x1[31]; +} + +static INLINE void fdct16x64_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); + __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); + __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]); + __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]); + __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]); + __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]); + __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]); + __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]); + __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]); + __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]); + __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]); + __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]); + __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]); + __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]); + __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]); + __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]); + __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]); + __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]); + __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]); + __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]); + __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]); + __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]); + __m256i cospi_p63_p01 = pair_set_w16_epi16(cospi[63], cospi[1]); + __m256i cospi_m01_p63 = pair_set_w16_epi16(-cospi[1], cospi[63]); + __m256i cospi_p31_p33 = pair_set_w16_epi16(cospi[31], cospi[33]); + __m256i cospi_m33_p31 = pair_set_w16_epi16(-cospi[33], cospi[31]); + __m256i cospi_p47_p17 = pair_set_w16_epi16(cospi[47], cospi[17]); + __m256i cospi_m17_p47 = pair_set_w16_epi16(-cospi[17], cospi[47]); + __m256i cospi_p15_p49 = pair_set_w16_epi16(cospi[15], cospi[49]); + __m256i cospi_m49_p15 = pair_set_w16_epi16(-cospi[49], cospi[15]); + __m256i cospi_p55_p09 = pair_set_w16_epi16(cospi[55], cospi[9]); + __m256i cospi_m09_p55 = pair_set_w16_epi16(-cospi[9], cospi[55]); + __m256i cospi_p23_p41 = pair_set_w16_epi16(cospi[23], cospi[41]); + __m256i cospi_m41_p23 = pair_set_w16_epi16(-cospi[41], cospi[23]); + __m256i cospi_p39_p25 = pair_set_w16_epi16(cospi[39], cospi[25]); + __m256i cospi_m25_p39 = pair_set_w16_epi16(-cospi[25], cospi[39]); + __m256i cospi_p07_p57 = pair_set_w16_epi16(cospi[7], cospi[57]); + __m256i cospi_m57_p07 = pair_set_w16_epi16(-cospi[57], cospi[7]); + __m256i cospi_p59_p05 = pair_set_w16_epi16(cospi[59], cospi[5]); + __m256i cospi_m05_p59 = pair_set_w16_epi16(-cospi[5], cospi[59]); + __m256i cospi_p27_p37 = pair_set_w16_epi16(cospi[27], cospi[37]); + __m256i cospi_m37_p27 = pair_set_w16_epi16(-cospi[37], cospi[27]); + __m256i cospi_p43_p21 = pair_set_w16_epi16(cospi[43], cospi[21]); + __m256i cospi_m21_p43 = pair_set_w16_epi16(-cospi[21], cospi[43]); + __m256i cospi_p11_p53 = pair_set_w16_epi16(cospi[11], cospi[53]); + __m256i cospi_m53_p11 = pair_set_w16_epi16(-cospi[53], cospi[11]); + __m256i cospi_p51_p13 = pair_set_w16_epi16(cospi[51], cospi[13]); + __m256i cospi_m13_p51 = pair_set_w16_epi16(-cospi[13], cospi[51]); + __m256i cospi_p19_p45 = pair_set_w16_epi16(cospi[19], cospi[45]); + __m256i cospi_m45_p19 = pair_set_w16_epi16(-cospi[45], cospi[19]); + __m256i cospi_p35_p29 = pair_set_w16_epi16(cospi[35], cospi[29]); + __m256i cospi_m29_p35 = pair_set_w16_epi16(-cospi[29], cospi[35]); + __m256i cospi_p03_p61 = pair_set_w16_epi16(cospi[3], cospi[61]); + __m256i cospi_m61_p03 = pair_set_w16_epi16(-cospi[61], cospi[3]); + + // stage 1 + __m256i x1[64]; + btf_16_adds_subs_out_avx2(&x1[0], &x1[63], input[0], input[63]); + btf_16_adds_subs_out_avx2(&x1[1], &x1[62], input[1], input[62]); + btf_16_adds_subs_out_avx2(&x1[2], &x1[61], input[2], input[61]); + btf_16_adds_subs_out_avx2(&x1[3], &x1[60], input[3], input[60]); + btf_16_adds_subs_out_avx2(&x1[4], &x1[59], input[4], input[59]); + btf_16_adds_subs_out_avx2(&x1[5], &x1[58], input[5], input[58]); + btf_16_adds_subs_out_avx2(&x1[6], &x1[57], input[6], input[57]); + btf_16_adds_subs_out_avx2(&x1[7], &x1[56], input[7], input[56]); + btf_16_adds_subs_out_avx2(&x1[8], &x1[55], input[8], input[55]); + btf_16_adds_subs_out_avx2(&x1[9], &x1[54], input[9], input[54]); + btf_16_adds_subs_out_avx2(&x1[10], &x1[53], input[10], input[53]); + btf_16_adds_subs_out_avx2(&x1[11], &x1[52], input[11], input[52]); + btf_16_adds_subs_out_avx2(&x1[12], &x1[51], input[12], input[51]); + btf_16_adds_subs_out_avx2(&x1[13], &x1[50], input[13], input[50]); + btf_16_adds_subs_out_avx2(&x1[14], &x1[49], input[14], input[49]); + btf_16_adds_subs_out_avx2(&x1[15], &x1[48], input[15], input[48]); + btf_16_adds_subs_out_avx2(&x1[16], &x1[47], input[16], input[47]); + btf_16_adds_subs_out_avx2(&x1[17], &x1[46], input[17], input[46]); + btf_16_adds_subs_out_avx2(&x1[18], &x1[45], input[18], input[45]); + btf_16_adds_subs_out_avx2(&x1[19], &x1[44], input[19], input[44]); + btf_16_adds_subs_out_avx2(&x1[20], &x1[43], input[20], input[43]); + btf_16_adds_subs_out_avx2(&x1[21], &x1[42], input[21], input[42]); + btf_16_adds_subs_out_avx2(&x1[22], &x1[41], input[22], input[41]); + btf_16_adds_subs_out_avx2(&x1[23], &x1[40], input[23], input[40]); + btf_16_adds_subs_out_avx2(&x1[24], &x1[39], input[24], input[39]); + btf_16_adds_subs_out_avx2(&x1[25], &x1[38], input[25], input[38]); + btf_16_adds_subs_out_avx2(&x1[26], &x1[37], input[26], input[37]); + btf_16_adds_subs_out_avx2(&x1[27], &x1[36], input[27], input[36]); + btf_16_adds_subs_out_avx2(&x1[28], &x1[35], input[28], input[35]); + btf_16_adds_subs_out_avx2(&x1[29], &x1[34], input[29], input[34]); + btf_16_adds_subs_out_avx2(&x1[30], &x1[33], input[30], input[33]); + btf_16_adds_subs_out_avx2(&x1[31], &x1[32], input[31], input[32]); + + // stage 2 + btf_16_adds_subs_avx2(&x1[0], &x1[31]); + btf_16_adds_subs_avx2(&x1[1], &x1[30]); + btf_16_adds_subs_avx2(&x1[2], &x1[29]); + btf_16_adds_subs_avx2(&x1[3], &x1[28]); + btf_16_adds_subs_avx2(&x1[4], &x1[27]); + btf_16_adds_subs_avx2(&x1[5], &x1[26]); + btf_16_adds_subs_avx2(&x1[6], &x1[25]); + btf_16_adds_subs_avx2(&x1[7], &x1[24]); + btf_16_adds_subs_avx2(&x1[8], &x1[23]); + btf_16_adds_subs_avx2(&x1[9], &x1[22]); + btf_16_adds_subs_avx2(&x1[10], &x1[21]); + btf_16_adds_subs_avx2(&x1[11], &x1[20]); + btf_16_adds_subs_avx2(&x1[12], &x1[19]); + btf_16_adds_subs_avx2(&x1[13], &x1[18]); + btf_16_adds_subs_avx2(&x1[14], &x1[17]); + btf_16_adds_subs_avx2(&x1[15], &x1[16]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[40], &x1[55], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[41], &x1[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[43], &x1[52], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[44], &x1[51], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[45], &x1[50], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[46], &x1[49], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[47], &x1[48], _r, cos_bit); + + // stage 3 + btf_16_adds_subs_avx2(&x1[0], &x1[15]); + btf_16_adds_subs_avx2(&x1[1], &x1[14]); + btf_16_adds_subs_avx2(&x1[2], &x1[13]); + btf_16_adds_subs_avx2(&x1[3], &x1[12]); + btf_16_adds_subs_avx2(&x1[4], &x1[11]); + btf_16_adds_subs_avx2(&x1[5], &x1[10]); + btf_16_adds_subs_avx2(&x1[6], &x1[9]); + btf_16_adds_subs_avx2(&x1[7], &x1[8]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[32], &x1[47]); + btf_16_adds_subs_avx2(&x1[33], &x1[46]); + btf_16_adds_subs_avx2(&x1[34], &x1[45]); + btf_16_adds_subs_avx2(&x1[35], &x1[44]); + btf_16_adds_subs_avx2(&x1[36], &x1[43]); + btf_16_adds_subs_avx2(&x1[37], &x1[42]); + btf_16_adds_subs_avx2(&x1[38], &x1[41]); + btf_16_adds_subs_avx2(&x1[39], &x1[40]); + btf_16_adds_subs_avx2(&x1[63], &x1[48]); + btf_16_adds_subs_avx2(&x1[62], &x1[49]); + btf_16_adds_subs_avx2(&x1[61], &x1[50]); + btf_16_adds_subs_avx2(&x1[60], &x1[51]); + btf_16_adds_subs_avx2(&x1[59], &x1[52]); + btf_16_adds_subs_avx2(&x1[58], &x1[53]); + btf_16_adds_subs_avx2(&x1[57], &x1[54]); + btf_16_adds_subs_avx2(&x1[56], &x1[55]); + + // stage 4 + btf_16_adds_subs_avx2(&x1[0], &x1[7]); + btf_16_adds_subs_avx2(&x1[1], &x1[6]); + btf_16_adds_subs_avx2(&x1[2], &x1[5]); + btf_16_adds_subs_avx2(&x1[3], &x1[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[23]); + btf_16_adds_subs_avx2(&x1[17], &x1[22]); + btf_16_adds_subs_avx2(&x1[18], &x1[21]); + btf_16_adds_subs_avx2(&x1[19], &x1[20]); + btf_16_adds_subs_avx2(&x1[31], &x1[24]); + btf_16_adds_subs_avx2(&x1[30], &x1[25]); + btf_16_adds_subs_avx2(&x1[29], &x1[26]); + btf_16_adds_subs_avx2(&x1[28], &x1[27]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[36], &x1[59], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[37], &x1[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[38], &x1[57], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[39], &x1[56], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[40], &x1[55], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[41], &x1[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[43], &x1[52], _r, cos_bit); + + // stage 5 + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[11]); + btf_16_adds_subs_avx2(&x1[9], &x1[10]); + btf_16_adds_subs_avx2(&x1[15], &x1[12]); + btf_16_adds_subs_avx2(&x1[14], &x1[13]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[32], &x1[39]); + btf_16_adds_subs_avx2(&x1[33], &x1[38]); + btf_16_adds_subs_avx2(&x1[34], &x1[37]); + btf_16_adds_subs_avx2(&x1[35], &x1[36]); + btf_16_adds_subs_avx2(&x1[47], &x1[40]); + btf_16_adds_subs_avx2(&x1[46], &x1[41]); + btf_16_adds_subs_avx2(&x1[45], &x1[42]); + btf_16_adds_subs_avx2(&x1[44], &x1[43]); + btf_16_adds_subs_avx2(&x1[48], &x1[55]); + btf_16_adds_subs_avx2(&x1[49], &x1[54]); + btf_16_adds_subs_avx2(&x1[50], &x1[53]); + btf_16_adds_subs_avx2(&x1[51], &x1[52]); + btf_16_adds_subs_avx2(&x1[63], &x1[56]); + btf_16_adds_subs_avx2(&x1[62], &x1[57]); + btf_16_adds_subs_avx2(&x1[61], &x1[58]); + btf_16_adds_subs_avx2(&x1[60], &x1[59]); + + // stage 6 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[19]); + btf_16_adds_subs_avx2(&x1[17], &x1[18]); + btf_16_adds_subs_avx2(&x1[23], &x1[20]); + btf_16_adds_subs_avx2(&x1[22], &x1[21]); + btf_16_adds_subs_avx2(&x1[24], &x1[27]); + btf_16_adds_subs_avx2(&x1[25], &x1[26]); + btf_16_adds_subs_avx2(&x1[31], &x1[28]); + btf_16_adds_subs_avx2(&x1[30], &x1[29]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[34], &x1[61], _r, cos_bit); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[35], &x1[60], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[36], &x1[59], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[37], &x1[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[43], &x1[52], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[44], &x1[51], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[45], &x1[50], _r, cos_bit); + + // stage 7 + btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[32], &x1[35]); + btf_16_adds_subs_avx2(&x1[33], &x1[34]); + btf_16_adds_subs_avx2(&x1[39], &x1[36]); + btf_16_adds_subs_avx2(&x1[38], &x1[37]); + btf_16_adds_subs_avx2(&x1[40], &x1[43]); + btf_16_adds_subs_avx2(&x1[41], &x1[42]); + btf_16_adds_subs_avx2(&x1[47], &x1[44]); + btf_16_adds_subs_avx2(&x1[46], &x1[45]); + btf_16_adds_subs_avx2(&x1[48], &x1[51]); + btf_16_adds_subs_avx2(&x1[49], &x1[50]); + btf_16_adds_subs_avx2(&x1[55], &x1[52]); + btf_16_adds_subs_avx2(&x1[54], &x1[53]); + btf_16_adds_subs_avx2(&x1[56], &x1[59]); + btf_16_adds_subs_avx2(&x1[57], &x1[58]); + btf_16_adds_subs_avx2(&x1[63], &x1[60]); + btf_16_adds_subs_avx2(&x1[62], &x1[61]); + + // stage 8 + btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit); + btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[17]); + btf_16_adds_subs_avx2(&x1[19], &x1[18]); + btf_16_adds_subs_avx2(&x1[20], &x1[21]); + btf_16_adds_subs_avx2(&x1[23], &x1[22]); + btf_16_adds_subs_avx2(&x1[24], &x1[25]); + btf_16_adds_subs_avx2(&x1[27], &x1[26]); + btf_16_adds_subs_avx2(&x1[28], &x1[29]); + btf_16_adds_subs_avx2(&x1[31], &x1[30]); + btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x1[33], &x1[62], _r, cos_bit); + btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x1[34], &x1[61], _r, cos_bit); + btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x1[37], &x1[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x1[38], &x1[57], _r, cos_bit); + btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x1[41], &x1[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x1[45], &x1[50], _r, cos_bit); + btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x1[46], &x1[49], _r, cos_bit); + + // stage 9 + btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit); + btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit); + btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[32], &x1[33]); + btf_16_adds_subs_avx2(&x1[35], &x1[34]); + btf_16_adds_subs_avx2(&x1[36], &x1[37]); + btf_16_adds_subs_avx2(&x1[39], &x1[38]); + btf_16_adds_subs_avx2(&x1[40], &x1[41]); + btf_16_adds_subs_avx2(&x1[43], &x1[42]); + btf_16_adds_subs_avx2(&x1[44], &x1[45]); + btf_16_adds_subs_avx2(&x1[47], &x1[46]); + btf_16_adds_subs_avx2(&x1[48], &x1[49]); + btf_16_adds_subs_avx2(&x1[51], &x1[50]); + btf_16_adds_subs_avx2(&x1[52], &x1[53]); + btf_16_adds_subs_avx2(&x1[55], &x1[54]); + btf_16_adds_subs_avx2(&x1[56], &x1[57]); + btf_16_adds_subs_avx2(&x1[59], &x1[58]); + btf_16_adds_subs_avx2(&x1[60], &x1[61]); + btf_16_adds_subs_avx2(&x1[63], &x1[62]); + + // stage 10 + btf_16_w16_avx2(cospi_p63_p01, cospi_m01_p63, &x1[32], &x1[63], _r, cos_bit); + btf_16_w16_avx2(cospi_p31_p33, cospi_m33_p31, &x1[33], &x1[62], _r, cos_bit); + btf_16_w16_avx2(cospi_p47_p17, cospi_m17_p47, &x1[34], &x1[61], _r, cos_bit); + btf_16_w16_avx2(cospi_p15_p49, cospi_m49_p15, &x1[35], &x1[60], _r, cos_bit); + btf_16_w16_avx2(cospi_p55_p09, cospi_m09_p55, &x1[36], &x1[59], _r, cos_bit); + btf_16_w16_avx2(cospi_p23_p41, cospi_m41_p23, &x1[37], &x1[58], _r, cos_bit); + btf_16_w16_avx2(cospi_p39_p25, cospi_m25_p39, &x1[38], &x1[57], _r, cos_bit); + btf_16_w16_avx2(cospi_p07_p57, cospi_m57_p07, &x1[39], &x1[56], _r, cos_bit); + btf_16_w16_avx2(cospi_p59_p05, cospi_m05_p59, &x1[40], &x1[55], _r, cos_bit); + btf_16_w16_avx2(cospi_p27_p37, cospi_m37_p27, &x1[41], &x1[54], _r, cos_bit); + btf_16_w16_avx2(cospi_p43_p21, cospi_m21_p43, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_p11_p53, cospi_m53_p11, &x1[43], &x1[52], _r, cos_bit); + btf_16_w16_avx2(cospi_p51_p13, cospi_m13_p51, &x1[44], &x1[51], _r, cos_bit); + btf_16_w16_avx2(cospi_p19_p45, cospi_m45_p19, &x1[45], &x1[50], _r, cos_bit); + btf_16_w16_avx2(cospi_p35_p29, cospi_m29_p35, &x1[46], &x1[49], _r, cos_bit); + btf_16_w16_avx2(cospi_p03_p61, cospi_m61_p03, &x1[47], &x1[48], _r, cos_bit); + + // stage 11 + output[0] = x1[0]; + output[1] = x1[32]; + output[2] = x1[16]; + output[3] = x1[48]; + output[4] = x1[8]; + output[5] = x1[40]; + output[6] = x1[24]; + output[7] = x1[56]; + output[8] = x1[4]; + output[9] = x1[36]; + output[10] = x1[20]; + output[11] = x1[52]; + output[12] = x1[12]; + output[13] = x1[44]; + output[14] = x1[28]; + output[15] = x1[60]; + output[16] = x1[2]; + output[17] = x1[34]; + output[18] = x1[18]; + output[19] = x1[50]; + output[20] = x1[10]; + output[21] = x1[42]; + output[22] = x1[26]; + output[23] = x1[58]; + output[24] = x1[6]; + output[25] = x1[38]; + output[26] = x1[22]; + output[27] = x1[54]; + output[28] = x1[14]; + output[29] = x1[46]; + output[30] = x1[30]; + output[31] = x1[62]; + output[32] = x1[1]; + output[33] = x1[33]; + output[34] = x1[17]; + output[35] = x1[49]; + output[36] = x1[9]; + output[37] = x1[41]; + output[38] = x1[25]; + output[39] = x1[57]; + output[40] = x1[5]; + output[41] = x1[37]; + output[42] = x1[21]; + output[43] = x1[53]; + output[44] = x1[13]; + output[45] = x1[45]; + output[46] = x1[29]; + output[47] = x1[61]; + output[48] = x1[3]; + output[49] = x1[35]; + output[50] = x1[19]; + output[51] = x1[51]; + output[52] = x1[11]; + output[53] = x1[43]; + output[54] = x1[27]; + output[55] = x1[59]; + output[56] = x1[7]; + output[57] = x1[39]; + output[58] = x1[23]; + output[59] = x1[55]; + output[60] = x1[15]; + output[61] = x1[47]; + output[62] = x1[31]; + output[63] = x1[63]; +} + +static INLINE void av1_fdct32_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + __m256i x1[32]; + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + // stage 0 + // stage 1 + btf_32_add_sub_out_avx2(&x1[0], &x1[31], input[0], input[31]); + btf_32_add_sub_out_avx2(&x1[1], &x1[30], input[1], input[30]); + btf_32_add_sub_out_avx2(&x1[2], &x1[29], input[2], input[29]); + btf_32_add_sub_out_avx2(&x1[3], &x1[28], input[3], input[28]); + btf_32_add_sub_out_avx2(&x1[4], &x1[27], input[4], input[27]); + btf_32_add_sub_out_avx2(&x1[5], &x1[26], input[5], input[26]); + btf_32_add_sub_out_avx2(&x1[6], &x1[25], input[6], input[25]); + btf_32_add_sub_out_avx2(&x1[7], &x1[24], input[7], input[24]); + btf_32_add_sub_out_avx2(&x1[8], &x1[23], input[8], input[23]); + btf_32_add_sub_out_avx2(&x1[9], &x1[22], input[9], input[22]); + btf_32_add_sub_out_avx2(&x1[10], &x1[21], input[10], input[21]); + btf_32_add_sub_out_avx2(&x1[11], &x1[20], input[11], input[20]); + btf_32_add_sub_out_avx2(&x1[12], &x1[19], input[12], input[19]); + btf_32_add_sub_out_avx2(&x1[13], &x1[18], input[13], input[18]); + btf_32_add_sub_out_avx2(&x1[14], &x1[17], input[14], input[17]); + btf_32_add_sub_out_avx2(&x1[15], &x1[16], input[15], input[16]); + + // stage 2 + btf_32_add_sub_avx2(&x1[0], &x1[15]); + btf_32_add_sub_avx2(&x1[1], &x1[14]); + btf_32_add_sub_avx2(&x1[2], &x1[13]); + btf_32_add_sub_avx2(&x1[3], &x1[12]); + btf_32_add_sub_avx2(&x1[4], &x1[11]); + btf_32_add_sub_avx2(&x1[5], &x1[10]); + btf_32_add_sub_avx2(&x1[6], &x1[9]); + btf_32_add_sub_avx2(&x1[7], &x1[8]); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[22], &x1[25], _r, cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[23], &x1[24], _r, cos_bit); + + // stage 3 + btf_32_add_sub_avx2(&x1[0], &x1[7]); + btf_32_add_sub_avx2(&x1[1], &x1[6]); + btf_32_add_sub_avx2(&x1[2], &x1[5]); + btf_32_add_sub_avx2(&x1[3], &x1[4]); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[10], &x1[13], _r, cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[11], &x1[12], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[23]); + btf_32_add_sub_avx2(&x1[17], &x1[22]); + btf_32_add_sub_avx2(&x1[18], &x1[21]); + btf_32_add_sub_avx2(&x1[19], &x1[20]); + btf_32_add_sub_avx2(&x1[31], &x1[24]); + btf_32_add_sub_avx2(&x1[30], &x1[25]); + btf_32_add_sub_avx2(&x1[29], &x1[26]); + btf_32_add_sub_avx2(&x1[28], &x1[27]); + + // stage 4 + btf_32_add_sub_avx2(&x1[0], &x1[3]); + btf_32_add_sub_avx2(&x1[1], &x1[2]); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[5], &x1[6], _r, cos_bit); + btf_32_add_sub_avx2(&x1[8], &x1[11]); + btf_32_add_sub_avx2(&x1[9], &x1[10]); + btf_32_add_sub_avx2(&x1[15], &x1[12]); + btf_32_add_sub_avx2(&x1[14], &x1[13]); + btf_32_avx2_type0(-cospi[16], cospi[48], &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type0(-cospi[16], cospi[48], &x1[19], &x1[28], _r, cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[21], &x1[26], _r, cos_bit); + + // stage 5 + btf_32_avx2_type0(cospi[32], cospi[32], &x1[0], &x1[1], _r, cos_bit); + btf_32_avx2_type1(cospi[48], cospi[16], &x1[2], &x1[3], _r, cos_bit); + btf_32_add_sub_avx2(&x1[4], &x1[5]); + btf_32_add_sub_avx2(&x1[7], &x1[6]); + btf_32_avx2_type0(-cospi[16], cospi[48], &x1[9], &x1[14], _r, cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[10], &x1[13], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[19]); + btf_32_add_sub_avx2(&x1[17], &x1[18]); + btf_32_add_sub_avx2(&x1[23], &x1[20]); + btf_32_add_sub_avx2(&x1[22], &x1[21]); + btf_32_add_sub_avx2(&x1[24], &x1[27]); + btf_32_add_sub_avx2(&x1[25], &x1[26]); + btf_32_add_sub_avx2(&x1[31], &x1[28]); + btf_32_add_sub_avx2(&x1[30], &x1[29]); + + // stage 6 + btf_32_avx2_type1(cospi[56], cospi[8], &x1[4], &x1[7], _r, cos_bit); + btf_32_avx2_type1(cospi[24], cospi[40], &x1[5], &x1[6], _r, cos_bit); + btf_32_add_sub_avx2(&x1[8], &x1[9]); + btf_32_add_sub_avx2(&x1[11], &x1[10]); + btf_32_add_sub_avx2(&x1[12], &x1[13]); + btf_32_add_sub_avx2(&x1[15], &x1[14]); + btf_32_avx2_type0(-cospi[8], cospi[56], &x1[17], &x1[30], _r, cos_bit); + btf_32_avx2_type0(-cospi[56], -cospi[8], &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type0(-cospi[40], cospi[24], &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type0(-cospi[24], -cospi[40], &x1[22], &x1[25], _r, cos_bit); + + // stage 7 + btf_32_avx2_type1(cospi[60], cospi[4], &x1[8], &x1[15], _r, cos_bit); + btf_32_avx2_type1(cospi[28], cospi[36], &x1[9], &x1[14], _r, cos_bit); + btf_32_avx2_type1(cospi[44], cospi[20], &x1[10], &x1[13], _r, cos_bit); + btf_32_avx2_type1(cospi[12], cospi[52], &x1[11], &x1[12], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[17]); + btf_32_add_sub_avx2(&x1[19], &x1[18]); + btf_32_add_sub_avx2(&x1[20], &x1[21]); + btf_32_add_sub_avx2(&x1[23], &x1[22]); + btf_32_add_sub_avx2(&x1[24], &x1[25]); + btf_32_add_sub_avx2(&x1[27], &x1[26]); + btf_32_add_sub_avx2(&x1[28], &x1[29]); + btf_32_add_sub_avx2(&x1[31], &x1[30]); + + // stage 8 + btf_32_avx2_type1(cospi[62], cospi[2], &x1[16], &x1[31], _r, cos_bit); + btf_32_avx2_type1(cospi[30], cospi[34], &x1[17], &x1[30], _r, cos_bit); + btf_32_avx2_type1(cospi[46], cospi[18], &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type1(cospi[14], cospi[50], &x1[19], &x1[28], _r, cos_bit); + btf_32_avx2_type1(cospi[54], cospi[10], &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type1(cospi[22], cospi[42], &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type1(cospi[38], cospi[26], &x1[22], &x1[25], _r, cos_bit); + btf_32_avx2_type1(cospi[6], cospi[58], &x1[23], &x1[24], _r, cos_bit); + + // stage 9 + output[0] = x1[0]; + output[1] = x1[16]; + output[2] = x1[8]; + output[3] = x1[24]; + output[4] = x1[4]; + output[5] = x1[20]; + output[6] = x1[12]; + output[7] = x1[28]; + output[8] = x1[2]; + output[9] = x1[18]; + output[10] = x1[10]; + output[11] = x1[26]; + output[12] = x1[6]; + output[13] = x1[22]; + output[14] = x1[14]; + output[15] = x1[30]; + output[16] = x1[1]; + output[17] = x1[17]; + output[18] = x1[9]; + output[19] = x1[25]; + output[20] = x1[5]; + output[21] = x1[21]; + output[22] = x1[13]; + output[23] = x1[29]; + output[24] = x1[3]; + output[25] = x1[19]; + output[26] = x1[11]; + output[27] = x1[27]; + output[28] = x1[7]; + output[29] = x1[23]; + output[30] = x1[15]; + output[31] = x1[31]; +} + +static INLINE void av1_fdct64_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]); + __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]); + __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]); + __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]); + __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]); + __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]); + __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]); + __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]); + __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]); + __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]); + __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]); + __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]); + __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]); + __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]); + __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]); + __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]); + __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]); + __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]); + __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]); + __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]); + __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]); + __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]); + __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]); + __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]); + __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]); + __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]); + __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]); + __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]); + __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]); + __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]); + __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]); + __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]); + __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]); + __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]); + __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]); + __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]); + __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]); + __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]); + __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]); + __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]); + __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]); + __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]); + __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]); + __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]); + __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]); + __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]); + __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]); + __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]); + __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]); + __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]); + __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]); + __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]); + __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]); + __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]); + __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]); + __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]); + __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]); + __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]); + __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]); + __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]); + __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]); + __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]); + __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]); + __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]); + __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]); + __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]); + __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]); + __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]); + __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]); + __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]); + __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]); + __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]); + __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]); + __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]); + __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]); + __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]); + __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]); + __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]); + + // stage 1 + __m256i x1[64]; + btf_32_add_sub_out_avx2(&x1[0], &x1[63], input[0], input[63]); + btf_32_add_sub_out_avx2(&x1[1], &x1[62], input[1], input[62]); + btf_32_add_sub_out_avx2(&x1[2], &x1[61], input[2], input[61]); + btf_32_add_sub_out_avx2(&x1[3], &x1[60], input[3], input[60]); + btf_32_add_sub_out_avx2(&x1[4], &x1[59], input[4], input[59]); + btf_32_add_sub_out_avx2(&x1[5], &x1[58], input[5], input[58]); + btf_32_add_sub_out_avx2(&x1[6], &x1[57], input[6], input[57]); + btf_32_add_sub_out_avx2(&x1[7], &x1[56], input[7], input[56]); + btf_32_add_sub_out_avx2(&x1[8], &x1[55], input[8], input[55]); + btf_32_add_sub_out_avx2(&x1[9], &x1[54], input[9], input[54]); + btf_32_add_sub_out_avx2(&x1[10], &x1[53], input[10], input[53]); + btf_32_add_sub_out_avx2(&x1[11], &x1[52], input[11], input[52]); + btf_32_add_sub_out_avx2(&x1[12], &x1[51], input[12], input[51]); + btf_32_add_sub_out_avx2(&x1[13], &x1[50], input[13], input[50]); + btf_32_add_sub_out_avx2(&x1[14], &x1[49], input[14], input[49]); + btf_32_add_sub_out_avx2(&x1[15], &x1[48], input[15], input[48]); + btf_32_add_sub_out_avx2(&x1[16], &x1[47], input[16], input[47]); + btf_32_add_sub_out_avx2(&x1[17], &x1[46], input[17], input[46]); + btf_32_add_sub_out_avx2(&x1[18], &x1[45], input[18], input[45]); + btf_32_add_sub_out_avx2(&x1[19], &x1[44], input[19], input[44]); + btf_32_add_sub_out_avx2(&x1[20], &x1[43], input[20], input[43]); + btf_32_add_sub_out_avx2(&x1[21], &x1[42], input[21], input[42]); + btf_32_add_sub_out_avx2(&x1[22], &x1[41], input[22], input[41]); + btf_32_add_sub_out_avx2(&x1[23], &x1[40], input[23], input[40]); + btf_32_add_sub_out_avx2(&x1[24], &x1[39], input[24], input[39]); + btf_32_add_sub_out_avx2(&x1[25], &x1[38], input[25], input[38]); + btf_32_add_sub_out_avx2(&x1[26], &x1[37], input[26], input[37]); + btf_32_add_sub_out_avx2(&x1[27], &x1[36], input[27], input[36]); + btf_32_add_sub_out_avx2(&x1[28], &x1[35], input[28], input[35]); + btf_32_add_sub_out_avx2(&x1[29], &x1[34], input[29], input[34]); + btf_32_add_sub_out_avx2(&x1[30], &x1[33], input[30], input[33]); + btf_32_add_sub_out_avx2(&x1[31], &x1[32], input[31], input[32]); + + // stage 2 + btf_32_add_sub_avx2(&x1[0], &x1[31]); + btf_32_add_sub_avx2(&x1[1], &x1[30]); + btf_32_add_sub_avx2(&x1[2], &x1[29]); + btf_32_add_sub_avx2(&x1[3], &x1[28]); + btf_32_add_sub_avx2(&x1[4], &x1[27]); + btf_32_add_sub_avx2(&x1[5], &x1[26]); + btf_32_add_sub_avx2(&x1[6], &x1[25]); + btf_32_add_sub_avx2(&x1[7], &x1[24]); + btf_32_add_sub_avx2(&x1[8], &x1[23]); + btf_32_add_sub_avx2(&x1[9], &x1[22]); + btf_32_add_sub_avx2(&x1[10], &x1[21]); + btf_32_add_sub_avx2(&x1[11], &x1[20]); + btf_32_add_sub_avx2(&x1[12], &x1[19]); + btf_32_add_sub_avx2(&x1[13], &x1[18]); + btf_32_add_sub_avx2(&x1[14], &x1[17]); + btf_32_add_sub_avx2(&x1[15], &x1[16]); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[40], &x1[55], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[41], &x1[54], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[43], &x1[52], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[44], &x1[51], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[45], &x1[50], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[46], &x1[49], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[47], &x1[48], _r, cos_bit); + + // stage 3 + btf_32_add_sub_avx2(&x1[0], &x1[15]); + btf_32_add_sub_avx2(&x1[1], &x1[14]); + btf_32_add_sub_avx2(&x1[2], &x1[13]); + btf_32_add_sub_avx2(&x1[3], &x1[12]); + btf_32_add_sub_avx2(&x1[4], &x1[11]); + btf_32_add_sub_avx2(&x1[5], &x1[10]); + btf_32_add_sub_avx2(&x1[6], &x1[9]); + btf_32_add_sub_avx2(&x1[7], &x1[8]); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[22], &x1[25], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[23], &x1[24], _r, cos_bit); + btf_32_add_sub_avx2(&x1[32], &x1[47]); + btf_32_add_sub_avx2(&x1[33], &x1[46]); + btf_32_add_sub_avx2(&x1[34], &x1[45]); + btf_32_add_sub_avx2(&x1[35], &x1[44]); + btf_32_add_sub_avx2(&x1[36], &x1[43]); + btf_32_add_sub_avx2(&x1[37], &x1[42]); + btf_32_add_sub_avx2(&x1[38], &x1[41]); + btf_32_add_sub_avx2(&x1[39], &x1[40]); + btf_32_add_sub_avx2(&x1[63], &x1[48]); + btf_32_add_sub_avx2(&x1[62], &x1[49]); + btf_32_add_sub_avx2(&x1[61], &x1[50]); + btf_32_add_sub_avx2(&x1[60], &x1[51]); + btf_32_add_sub_avx2(&x1[59], &x1[52]); + btf_32_add_sub_avx2(&x1[58], &x1[53]); + btf_32_add_sub_avx2(&x1[57], &x1[54]); + btf_32_add_sub_avx2(&x1[56], &x1[55]); + + // stage 4 + btf_32_add_sub_avx2(&x1[0], &x1[7]); + btf_32_add_sub_avx2(&x1[1], &x1[6]); + btf_32_add_sub_avx2(&x1[2], &x1[5]); + btf_32_add_sub_avx2(&x1[3], &x1[4]); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[10], &x1[13], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[11], &x1[12], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[23]); + btf_32_add_sub_avx2(&x1[17], &x1[22]); + btf_32_add_sub_avx2(&x1[18], &x1[21]); + btf_32_add_sub_avx2(&x1[19], &x1[20]); + btf_32_add_sub_avx2(&x1[31], &x1[24]); + btf_32_add_sub_avx2(&x1[30], &x1[25]); + btf_32_add_sub_avx2(&x1[29], &x1[26]); + btf_32_add_sub_avx2(&x1[28], &x1[27]); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[36], &x1[59], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[37], &x1[58], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[38], &x1[57], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[39], &x1[56], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[40], &x1[55], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[41], &x1[54], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[43], &x1[52], _r, cos_bit); + + // stage 5 + btf_32_add_sub_avx2(&x1[0], &x1[3]); + btf_32_add_sub_avx2(&x1[1], &x1[2]); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[5], &x1[6], _r, cos_bit); + btf_32_add_sub_avx2(&x1[8], &x1[11]); + btf_32_add_sub_avx2(&x1[9], &x1[10]); + btf_32_add_sub_avx2(&x1[15], &x1[12]); + btf_32_add_sub_avx2(&x1[14], &x1[13]); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[19], &x1[28], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[21], &x1[26], _r, cos_bit); + btf_32_add_sub_avx2(&x1[32], &x1[39]); + btf_32_add_sub_avx2(&x1[33], &x1[38]); + btf_32_add_sub_avx2(&x1[34], &x1[37]); + btf_32_add_sub_avx2(&x1[35], &x1[36]); + btf_32_add_sub_avx2(&x1[47], &x1[40]); + btf_32_add_sub_avx2(&x1[46], &x1[41]); + btf_32_add_sub_avx2(&x1[45], &x1[42]); + btf_32_add_sub_avx2(&x1[44], &x1[43]); + btf_32_add_sub_avx2(&x1[48], &x1[55]); + btf_32_add_sub_avx2(&x1[49], &x1[54]); + btf_32_add_sub_avx2(&x1[50], &x1[53]); + btf_32_add_sub_avx2(&x1[51], &x1[52]); + btf_32_add_sub_avx2(&x1[63], &x1[56]); + btf_32_add_sub_avx2(&x1[62], &x1[57]); + btf_32_add_sub_avx2(&x1[61], &x1[58]); + btf_32_add_sub_avx2(&x1[60], &x1[59]); + + // stage 6 + btf_32_avx2_type0_new(cospi_p32, cospi_p32, &x1[0], &x1[1], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p48, cospi_p16, &x1[2], &x1[3], _r, cos_bit); + btf_32_add_sub_avx2(&x1[4], &x1[5]); + btf_32_add_sub_avx2(&x1[7], &x1[6]); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[9], &x1[14], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[10], &x1[13], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[19]); + btf_32_add_sub_avx2(&x1[17], &x1[18]); + btf_32_add_sub_avx2(&x1[23], &x1[20]); + btf_32_add_sub_avx2(&x1[22], &x1[21]); + btf_32_add_sub_avx2(&x1[24], &x1[27]); + btf_32_add_sub_avx2(&x1[25], &x1[26]); + btf_32_add_sub_avx2(&x1[31], &x1[28]); + btf_32_add_sub_avx2(&x1[30], &x1[29]); + btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[34], &x1[61], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[35], &x1[60], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[36], &x1[59], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[37], &x1[58], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[43], &x1[52], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[44], &x1[51], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[45], &x1[50], _r, cos_bit); + + // stage 7 + btf_32_avx2_type1_new(cospi_p56, cospi_p08, &x1[4], &x1[7], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p24, cospi_p40, &x1[5], &x1[6], _r, cos_bit); + btf_32_add_sub_avx2(&x1[8], &x1[9]); + btf_32_add_sub_avx2(&x1[11], &x1[10]); + btf_32_add_sub_avx2(&x1[12], &x1[13]); + btf_32_add_sub_avx2(&x1[15], &x1[14]); + btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[17], &x1[30], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[22], &x1[25], _r, cos_bit); + btf_32_add_sub_avx2(&x1[32], &x1[35]); + btf_32_add_sub_avx2(&x1[33], &x1[34]); + btf_32_add_sub_avx2(&x1[39], &x1[36]); + btf_32_add_sub_avx2(&x1[38], &x1[37]); + btf_32_add_sub_avx2(&x1[40], &x1[43]); + btf_32_add_sub_avx2(&x1[41], &x1[42]); + btf_32_add_sub_avx2(&x1[47], &x1[44]); + btf_32_add_sub_avx2(&x1[46], &x1[45]); + btf_32_add_sub_avx2(&x1[48], &x1[51]); + btf_32_add_sub_avx2(&x1[49], &x1[50]); + btf_32_add_sub_avx2(&x1[55], &x1[52]); + btf_32_add_sub_avx2(&x1[54], &x1[53]); + btf_32_add_sub_avx2(&x1[56], &x1[59]); + btf_32_add_sub_avx2(&x1[57], &x1[58]); + btf_32_add_sub_avx2(&x1[63], &x1[60]); + btf_32_add_sub_avx2(&x1[62], &x1[61]); + + // stage 8 + btf_32_avx2_type1_new(cospi_p60, cospi_p04, &x1[8], &x1[15], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p28, cospi_p36, &x1[9], &x1[14], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p44, cospi_p20, &x1[10], &x1[13], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p12, cospi_p52, &x1[11], &x1[12], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[17]); + btf_32_add_sub_avx2(&x1[19], &x1[18]); + btf_32_add_sub_avx2(&x1[20], &x1[21]); + btf_32_add_sub_avx2(&x1[23], &x1[22]); + btf_32_add_sub_avx2(&x1[24], &x1[25]); + btf_32_add_sub_avx2(&x1[27], &x1[26]); + btf_32_add_sub_avx2(&x1[28], &x1[29]); + btf_32_add_sub_avx2(&x1[31], &x1[30]); + btf_32_avx2_type0_new(cospi_m04, cospi_p60, &x1[33], &x1[62], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m60, cospi_m04, &x1[34], &x1[61], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m36, cospi_p28, &x1[37], &x1[58], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m28, cospi_m36, &x1[38], &x1[57], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m20, cospi_p44, &x1[41], &x1[54], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m44, cospi_m20, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m52, cospi_p12, &x1[45], &x1[50], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m12, cospi_m52, &x1[46], &x1[49], _r, cos_bit); + + // stage 9 + btf_32_avx2_type1_new(cospi_p62, cospi_p02, &x1[16], &x1[31], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p30, cospi_p34, &x1[17], &x1[30], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p46, cospi_p18, &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p14, cospi_p50, &x1[19], &x1[28], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p54, cospi_p10, &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p22, cospi_p42, &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p38, cospi_p26, &x1[22], &x1[25], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p06, cospi_p58, &x1[23], &x1[24], _r, cos_bit); + btf_32_add_sub_avx2(&x1[32], &x1[33]); + btf_32_add_sub_avx2(&x1[35], &x1[34]); + btf_32_add_sub_avx2(&x1[36], &x1[37]); + btf_32_add_sub_avx2(&x1[39], &x1[38]); + btf_32_add_sub_avx2(&x1[40], &x1[41]); + btf_32_add_sub_avx2(&x1[43], &x1[42]); + btf_32_add_sub_avx2(&x1[44], &x1[45]); + btf_32_add_sub_avx2(&x1[47], &x1[46]); + btf_32_add_sub_avx2(&x1[48], &x1[49]); + btf_32_add_sub_avx2(&x1[51], &x1[50]); + btf_32_add_sub_avx2(&x1[52], &x1[53]); + btf_32_add_sub_avx2(&x1[55], &x1[54]); + btf_32_add_sub_avx2(&x1[56], &x1[57]); + btf_32_add_sub_avx2(&x1[59], &x1[58]); + btf_32_add_sub_avx2(&x1[60], &x1[61]); + btf_32_add_sub_avx2(&x1[63], &x1[62]); + + // stage 10 + btf_32_avx2_type1_new(cospi_p63, cospi_p01, &x1[32], &x1[63], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p31, cospi_p33, &x1[33], &x1[62], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p47, cospi_p17, &x1[34], &x1[61], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p15, cospi_p49, &x1[35], &x1[60], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p55, cospi_p09, &x1[36], &x1[59], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p23, cospi_p41, &x1[37], &x1[58], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p39, cospi_p25, &x1[38], &x1[57], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p07, cospi_p57, &x1[39], &x1[56], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p59, cospi_p05, &x1[40], &x1[55], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p27, cospi_p37, &x1[41], &x1[54], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p43, cospi_p21, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p11, cospi_p53, &x1[43], &x1[52], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p51, cospi_p13, &x1[44], &x1[51], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p19, cospi_p45, &x1[45], &x1[50], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p35, cospi_p29, &x1[46], &x1[49], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p03, cospi_p61, &x1[47], &x1[48], _r, cos_bit); + + // stage 11 + output[0] = x1[0]; + output[1] = x1[32]; + output[2] = x1[16]; + output[3] = x1[48]; + output[4] = x1[8]; + output[5] = x1[40]; + output[6] = x1[24]; + output[7] = x1[56]; + output[8] = x1[4]; + output[9] = x1[36]; + output[10] = x1[20]; + output[11] = x1[52]; + output[12] = x1[12]; + output[13] = x1[44]; + output[14] = x1[28]; + output[15] = x1[60]; + output[16] = x1[2]; + output[17] = x1[34]; + output[18] = x1[18]; + output[19] = x1[50]; + output[20] = x1[10]; + output[21] = x1[42]; + output[22] = x1[26]; + output[23] = x1[58]; + output[24] = x1[6]; + output[25] = x1[38]; + output[26] = x1[22]; + output[27] = x1[54]; + output[28] = x1[14]; + output[29] = x1[46]; + output[30] = x1[30]; + output[31] = x1[62]; + output[32] = x1[1]; + output[33] = x1[33]; + output[34] = x1[17]; + output[35] = x1[49]; + output[36] = x1[9]; + output[37] = x1[41]; + output[38] = x1[25]; + output[39] = x1[57]; + output[40] = x1[5]; + output[41] = x1[37]; + output[42] = x1[21]; + output[43] = x1[53]; + output[44] = x1[13]; + output[45] = x1[45]; + output[46] = x1[29]; + output[47] = x1[61]; + output[48] = x1[3]; + output[49] = x1[35]; + output[50] = x1[19]; + output[51] = x1[51]; + output[52] = x1[11]; + output[53] = x1[43]; + output[54] = x1[27]; + output[55] = x1[59]; + output[56] = x1[7]; + output[57] = x1[39]; + output[58] = x1[23]; + output[59] = x1[55]; + output[60] = x1[15]; + output[61] = x1[47]; + output[62] = x1[31]; + output[63] = x1[63]; +} + +static INLINE void fadst16x16_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __zero = _mm256_setzero_si256(); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]); + __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); + __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); + __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); + __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); + __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]); + __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]); + __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); + __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); + __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]); + __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]); + __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]); + __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]); + __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]); + __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]); + __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]); + __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]); + __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]); + __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]); + __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]); + __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]); + __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]); + __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]); + + // stage 1 + __m256i x1[16]; + x1[0] = input[0]; + x1[1] = _mm256_subs_epi16(__zero, input[15]); + x1[2] = _mm256_subs_epi16(__zero, input[7]); + x1[3] = input[8]; + x1[4] = _mm256_subs_epi16(__zero, input[3]); + x1[5] = input[12]; + x1[6] = input[4]; + x1[7] = _mm256_subs_epi16(__zero, input[11]); + x1[8] = _mm256_subs_epi16(__zero, input[1]); + x1[9] = input[14]; + x1[10] = input[6]; + x1[11] = _mm256_subs_epi16(__zero, input[9]); + x1[12] = input[2]; + x1[13] = _mm256_subs_epi16(__zero, input[13]); + x1[14] = _mm256_subs_epi16(__zero, input[5]); + x1[15] = input[10]; + + // stage 2 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit); + + // stage 3 + btf_16_adds_subs_avx2(&x1[0], &x1[2]); + btf_16_adds_subs_avx2(&x1[1], &x1[3]); + btf_16_adds_subs_avx2(&x1[4], &x1[6]); + btf_16_adds_subs_avx2(&x1[5], &x1[7]); + btf_16_adds_subs_avx2(&x1[8], &x1[10]); + btf_16_adds_subs_avx2(&x1[9], &x1[11]); + btf_16_adds_subs_avx2(&x1[12], &x1[14]); + btf_16_adds_subs_avx2(&x1[13], &x1[15]); + + // stage 4 + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[14], &x1[15], _r, cos_bit); + + // stage 5 + btf_16_adds_subs_avx2(&x1[0], &x1[4]); + btf_16_adds_subs_avx2(&x1[1], &x1[5]); + btf_16_adds_subs_avx2(&x1[2], &x1[6]); + btf_16_adds_subs_avx2(&x1[3], &x1[7]); + btf_16_adds_subs_avx2(&x1[8], &x1[12]); + btf_16_adds_subs_avx2(&x1[9], &x1[13]); + btf_16_adds_subs_avx2(&x1[10], &x1[14]); + btf_16_adds_subs_avx2(&x1[11], &x1[15]); + + // stage 6 + btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit); + btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x1[12], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x1[14], &x1[15], _r, cos_bit); + + // stage 7 + btf_16_adds_subs_avx2(&x1[0], &x1[8]); + btf_16_adds_subs_avx2(&x1[1], &x1[9]); + btf_16_adds_subs_avx2(&x1[2], &x1[10]); + btf_16_adds_subs_avx2(&x1[3], &x1[11]); + btf_16_adds_subs_avx2(&x1[4], &x1[12]); + btf_16_adds_subs_avx2(&x1[5], &x1[13]); + btf_16_adds_subs_avx2(&x1[6], &x1[14]); + btf_16_adds_subs_avx2(&x1[7], &x1[15]); + + // stage 8 + btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit); + btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit); + btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit); + btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit); + + // stage 9 + output[0] = x1[1]; + output[1] = x1[14]; + output[2] = x1[3]; + output[3] = x1[12]; + output[4] = x1[5]; + output[5] = x1[10]; + output[6] = x1[7]; + output[7] = x1[8]; + output[8] = x1[9]; + output[9] = x1[6]; + output[10] = x1[11]; + output[11] = x1[4]; + output[12] = x1[13]; + output[13] = x1[2]; + output[14] = x1[15]; + output[15] = x1[0]; +} + +static INLINE __m256i scale_round_avx2(const __m256i a, const int scale) { + const __m256i scale__r = pair_set_w16_epi16(scale, 1 << (NewSqrt2Bits - 1)); + const __m256i b = _mm256_madd_epi16(a, scale__r); + return _mm256_srai_epi32(b, NewSqrt2Bits); +} + +static INLINE void fidentity16x16_new_avx2(const __m256i *input, + __m256i *output, int8_t cos_bit) { + (void)cos_bit; + const __m256i one = _mm256_set1_epi16(1); + + for (int i = 0; i < 16; ++i) { + const __m256i a_lo = _mm256_unpacklo_epi16(input[i], one); + const __m256i a_hi = _mm256_unpackhi_epi16(input[i], one); + const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2); + const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2); + output[i] = _mm256_packs_epi32(b_lo, b_hi); + } +} + +static INLINE void fidentity16x32_new_avx2(const __m256i *input, + __m256i *output, int8_t cos_bit) { + (void)cos_bit; + for (int i = 0; i < 32; ++i) { + output[i] = _mm256_slli_epi16(input[i], 2); + } +} + +static INLINE void av1_round_shift_array_32_avx2(__m256i *input, + __m256i *output, + const int size, + const int bit) { + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + output[i] = av1_round_shift_32_avx2(input[i], bit); + } + } else { + int i; + for (i = 0; i < size; i++) { + output[i] = _mm256_slli_epi32(input[i], -bit); + } + } +} + +static INLINE void av1_round_shift_rect_array_32_avx2(__m256i *input, + __m256i *output, + const int size, + const int bit) { + const __m256i sqrt2 = _mm256_set1_epi32(NewSqrt2); + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + const __m256i r0 = av1_round_shift_32_avx2(input[i], bit); + const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0); + output[i] = av1_round_shift_32_avx2(r1, NewSqrt2Bits); + } + } else { + int i; + for (i = 0; i < size; i++) { + const __m256i r0 = _mm256_slli_epi32(input[i], -bit); + const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0); + output[i] = av1_round_shift_32_avx2(r1, NewSqrt2Bits); + } + } +} + +static INLINE void transpose_32_8x8_avx2(int stride, const __m256i *inputA, + __m256i *output) { + __m256i temp0 = _mm256_unpacklo_epi32(inputA[0], inputA[2]); + __m256i temp1 = _mm256_unpackhi_epi32(inputA[0], inputA[2]); + __m256i temp2 = _mm256_unpacklo_epi32(inputA[1], inputA[3]); + __m256i temp3 = _mm256_unpackhi_epi32(inputA[1], inputA[3]); + __m256i temp4 = _mm256_unpacklo_epi32(inputA[4], inputA[6]); + __m256i temp5 = _mm256_unpackhi_epi32(inputA[4], inputA[6]); + __m256i temp6 = _mm256_unpacklo_epi32(inputA[5], inputA[7]); + __m256i temp7 = _mm256_unpackhi_epi32(inputA[5], inputA[7]); + + __m256i t0 = _mm256_unpacklo_epi32(temp0, temp2); + __m256i t1 = _mm256_unpackhi_epi32(temp0, temp2); + __m256i t2 = _mm256_unpacklo_epi32(temp1, temp3); + __m256i t3 = _mm256_unpackhi_epi32(temp1, temp3); + __m256i t4 = _mm256_unpacklo_epi32(temp4, temp6); + __m256i t5 = _mm256_unpackhi_epi32(temp4, temp6); + __m256i t6 = _mm256_unpacklo_epi32(temp5, temp7); + __m256i t7 = _mm256_unpackhi_epi32(temp5, temp7); + + output[0 * stride] = _mm256_permute2x128_si256(t0, t4, 0x20); + output[1 * stride] = _mm256_permute2x128_si256(t1, t5, 0x20); + output[2 * stride] = _mm256_permute2x128_si256(t2, t6, 0x20); + output[3 * stride] = _mm256_permute2x128_si256(t3, t7, 0x20); + output[4 * stride] = _mm256_permute2x128_si256(t0, t4, 0x31); + output[5 * stride] = _mm256_permute2x128_si256(t1, t5, 0x31); + output[6 * stride] = _mm256_permute2x128_si256(t2, t6, 0x31); + output[7 * stride] = _mm256_permute2x128_si256(t3, t7, 0x31); +} + +// Store 8 16 bit values. Sign extend the values. +static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in, + int32_t *out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + _mm256_store_si256((__m256i *)(out), + _mm256_cvtepi16_epi32(_mm256_castsi256_si128(in[i]))); + _mm256_store_si256( + (__m256i *)(out + 8), + _mm256_cvtepi16_epi32(_mm256_extracti128_si256(in[i], 1))); + out += stride; + } +} + +static INLINE void store_rect_16bit_to_32bit_avx2(const __m256i a, + int32_t *const b) { + const __m256i one = _mm256_set1_epi16(1); + const __m256i a_reoder = _mm256_permute4x64_epi64(a, 0xd8); + const __m256i a_lo = _mm256_unpacklo_epi16(a_reoder, one); + const __m256i a_hi = _mm256_unpackhi_epi16(a_reoder, one); + const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2); + const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2); + _mm256_store_si256((__m256i *)b, b_lo); + _mm256_store_si256((__m256i *)(b + 8), b_hi); +} + +static INLINE void store_rect_buffer_16bit_to_32bit_w16_avx2( + const __m256i *const in, int32_t *const out, const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_rect_16bit_to_32bit_avx2(in[i], out + i * stride); + } +} + +static const transform_1d_avx2 col_txfm16x32_arr[TX_TYPES] = { + fdct16x32_new_avx2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity16x32_new_avx2, // IDTX + fdct16x32_new_avx2, // V_DCT + fidentity16x32_new_avx2, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +static const transform_1d_avx2 row_txfm16x32_arr[TX_TYPES] = { + fdct16x32_new_avx2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity16x32_new_avx2, // IDTX + fidentity16x32_new_avx2, // V_DCT + fdct16x32_new_avx2, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +static const transform_1d_avx2 col_txfm16x16_arr[TX_TYPES] = { + fdct16x16_new_avx2, // DCT_DCT + fadst16x16_new_avx2, // ADST_DCT + fdct16x16_new_avx2, // DCT_ADST + fadst16x16_new_avx2, // ADST_ADST + fadst16x16_new_avx2, // FLIPADST_DCT + fdct16x16_new_avx2, // DCT_FLIPADST + fadst16x16_new_avx2, // FLIPADST_FLIPADST + fadst16x16_new_avx2, // ADST_FLIPADST + fadst16x16_new_avx2, // FLIPADST_ADST + fidentity16x16_new_avx2, // IDTX + fdct16x16_new_avx2, // V_DCT + fidentity16x16_new_avx2, // H_DCT + fadst16x16_new_avx2, // V_ADST + fidentity16x16_new_avx2, // H_ADST + fadst16x16_new_avx2, // V_FLIPADST + fidentity16x16_new_avx2 // H_FLIPADST +}; + +static const transform_1d_avx2 row_txfm16x16_arr[TX_TYPES] = { + fdct16x16_new_avx2, // DCT_DCT + fdct16x16_new_avx2, // ADST_DCT + fadst16x16_new_avx2, // DCT_ADST + fadst16x16_new_avx2, // ADST_ADST + fdct16x16_new_avx2, // FLIPADST_DCT + fadst16x16_new_avx2, // DCT_FLIPADST + fadst16x16_new_avx2, // FLIPADST_FLIPADST + fadst16x16_new_avx2, // ADST_FLIPADST + fadst16x16_new_avx2, // FLIPADST_ADST + fidentity16x16_new_avx2, // IDTX + fidentity16x16_new_avx2, // V_DCT + fdct16x16_new_avx2, // H_DCT + fidentity16x16_new_avx2, // V_ADST + fadst16x16_new_avx2, // H_ADST + fidentity16x16_new_avx2, // V_FLIPADST + fadst16x16_new_avx2 // H_FLIPADST +}; + +static void lowbd_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_16X16; + __m256i buf0[16], buf1[16]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + const int32_t i = 0; + if (ud_flip) { + load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + } + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i); + + __m256i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_avx2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + transpose_16bit_16x16_avx2(buf, buf); + store_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * width * i, width, 16); +} + +static void lowbd_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_32X32; + __m256i buf0[32], buf1[128]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, + height); + } else { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + } + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + transpose_16bit_16x16_avx2(buf0 + 0 * 16, buf1 + 0 * width + 16 * i); + transpose_16bit_16x16_avx2(buf0 + 1 * 16, buf1 + 1 * width + 16 * i); + } + + for (int i = 0; i < 2; i++) { + __m256i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_avx2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + transpose_16bit_16x16_avx2(buf, buf); + store_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * width * i, width, + 16); + transpose_16bit_16x16_avx2(buf + 16, buf + 16); + store_buffer_16bit_to_32bit_w16_avx2(buf + 16, output + 16 * width * i + 16, + width, 16); + } +} + +static void lowbd_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X64; + __m256i buf0[64], buf1[256]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct16x64_new_avx2; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(2, height_div16); ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + + for (int i = 0; i < AOMMIN(2, height_div16); i++) { + __m256i bufA[64]; + __m256i bufB[64]; + __m128i *buf = (__m128i *)(buf1 + width * i); + for (int j = 0; j < width; ++j) { + bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]); + bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]); + } + av1_fdct64_new_avx2(bufA, bufA, cos_bit_row); + av1_fdct64_new_avx2(bufB, bufB, cos_bit_row); + av1_round_shift_array_32_avx2(bufA, bufA, 32, -shift[2]); + av1_round_shift_array_32_avx2(bufB, bufB, 32, -shift[2]); + + int32_t *output8 = output + 16 * 32 * i; + for (int j = 0; j < 4; ++j) { + __m256i *out = (__m256i *)(output8 + 8 * j); + transpose_32_8x8_avx2(4, bufA + 8 * j, out); + transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4); + } + } +} + +static void lowbd_fwd_txfm2d_16x32_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_16X32; + __m256i buf0[32], buf1[32]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + if (ud_flip) { + load_buffer_16bit_to_16bit_flip_avx2(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_avx2(input, stride, buf0, height); + } + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + transpose_16bit_16x16_avx2(buf0, buf1); + transpose_16bit_16x16_avx2(buf0 + 16, buf1 + 16); + + for (int i = 0; i < 2; i++) { + __m256i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_avx2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + transpose_16bit_16x16_avx2(buf, buf); + store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * width * i, + width, 16); + } +} + +static void lowbd_fwd_txfm2d_32x16_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m256i buf0[32], buf1[64]; + const int8_t *shift = fwd_txfm_shift_ls[TX_32X16]; + const int txw_idx = get_txw_idx(TX_32X16); + const int txh_idx = get_txh_idx(TX_32X16); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 32; + const int height = 16; + const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, + height); + } else { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + } + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i); + } + + __m256i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_avx2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + transpose_16bit_16x16_avx2(buf, buf); + store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output, width, 16); + + transpose_16bit_16x16_avx2(buf + 16, buf + 16); + store_rect_buffer_16bit_to_32bit_w16_avx2(buf + 16, output + 16, width, 16); +} + +static void lowbd_fwd_txfm2d_64x32_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_64X32; + __m256i buf0[64], buf1[256]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type]; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(4, height_div16); ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + assert(tx_type == DCT_DCT); + for (int i = 0; i < AOMMIN(2, height_div16); i++) { + __m256i bufA[64]; + __m256i bufB[64]; + __m128i *buf = (__m128i *)(buf1 + width * i); + for (int j = 0; j < width; ++j) { + bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]); + bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]); + } + av1_fdct64_new_avx2(bufA, bufA, cos_bit_row); + av1_fdct64_new_avx2(bufB, bufB, cos_bit_row); + av1_round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2]); + av1_round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2]); + + int32_t *output8 = output + 16 * 32 * i; + for (int j = 0; j < 4; ++j) { + __m256i *out = (__m256i *)(output8 + 8 * j); + transpose_32_8x8_avx2(4, bufA + 8 * j, out); + transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4); + } + } +} + +static void lowbd_fwd_txfm2d_32x64_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_32X64; + __m256i buf0[64], buf1[256]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct16x64_new_avx2; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(2, height_div16); ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + + for (int i = 0; i < AOMMIN(2, height_div16); i++) { + __m256i bufA[32]; + __m256i bufB[32]; + __m128i *buf = (__m128i *)(buf1 + width * i); + for (int j = 0; j < width; ++j) { + bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]); + bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]); + } + av1_fdct32_new_avx2(bufA, bufA, cos_bit_row); + av1_fdct32_new_avx2(bufB, bufB, cos_bit_row); + av1_round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2]); + av1_round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2]); + + int32_t *output8 = output + 16 * 32 * i; + for (int j = 0; j < 4; ++j) { + __m256i *out = (__m256i *)(output8 + 8 * j); + transpose_32_8x8_avx2(4, bufA + 8 * j, out); + transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4); + } + } +} + +static void lowbd_fwd_txfm2d_16x64_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_16X64; + __m256i buf0[64], buf1[64]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct16x64_new_avx2; + const transform_1d_avx2 row_txfm = fdct16x16_new_avx2; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < height_div16; ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + + for (int i = 0; i < AOMMIN(4, height_div16); i++) { + __m256i *buf = buf1 + width * i; + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + int32_t *output16 = output + 16 * width * i; + for (int j = 0; j < width_div16; ++j) { + __m256i *buf16 = buf + 16 * j; + transpose_16bit_16x16_avx2(buf16, buf16); + store_buffer_16bit_to_32bit_w16_avx2(buf16, output16 + 16 * j, width, 16); + } + } + // Zero out the bottom 16x32 area. + memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output)); +} + +static void lowbd_fwd_txfm2d_64x16_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X16; + __m256i buf0[64], buf1[64]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct16x16_new_avx2; + const transform_1d_avx2 row_txfm = fdct16x64_new_avx2; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < height_div16; ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + + for (int i = 0; i < height_div16; i++) { + __m256i *buf = buf1 + width * i; + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + int32_t *output16 = output + 16 * 32 * i; + for (int j = 0; j < 2; ++j) { + __m256i *buf16 = buf + 16 * j; + transpose_16bit_16x16_avx2(buf16, buf16); + store_buffer_16bit_to_32bit_w16_avx2(buf16, output16 + 16 * j, 32, 16); + } + } +} + +static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { + av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform + av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform + lowbd_fwd_txfm2d_16x16_avx2, // 16x16 transform + lowbd_fwd_txfm2d_32x32_avx2, // 32x32 transform + lowbd_fwd_txfm2d_64x64_avx2, // 64x64 transform + av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform + av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform + av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform + av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform + lowbd_fwd_txfm2d_16x32_avx2, // 16x32 transform + lowbd_fwd_txfm2d_32x16_avx2, // 32x16 transform + lowbd_fwd_txfm2d_32x64_avx2, // 32x64 transform + lowbd_fwd_txfm2d_64x32_avx2, // 64x32 transform + av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform + av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform + av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform + av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform + lowbd_fwd_txfm2d_16x64_avx2, // 16x64 transform + lowbd_fwd_txfm2d_64x16_avx2, // 64x16 transform +}; + +void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size]; + if ((fwd_txfm2d_func == NULL) || + (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) { + av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); + } else { + fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); + } +} diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h new file mode 100644 index 000000000..c582ca0e3 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_FWD_TXFM_AVX2_H_ +#define AV1_FWD_TXFM_AVX2_H_ +#include <immintrin.h> + +static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) { + __m256i tmp, round; + round = _mm256_set1_epi32(1 << (bit - 1)); + tmp = _mm256_add_epi32(vec, round); + return _mm256_srai_epi32(tmp, bit); +} + +// out0 = in0*w0 + in1*w1 +// out1 = -in1*w0 + in0*w1 +static INLINE void btf_32_avx2_type0(const int32_t w0, const int32_t w1, + __m256i *in0, __m256i *in1, + const __m256i _r, const int32_t cos_bit) { + __m256i _in0 = *in0; + __m256i _in1 = *in1; + const __m256i ww0 = _mm256_set1_epi32(w0); + const __m256i ww1 = _mm256_set1_epi32(w1); + const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0); + const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1); + __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1); + temp0 = _mm256_add_epi32(temp0, _r); + *in0 = _mm256_srai_epi32(temp0, cos_bit); + const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1); + const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0); + __m256i temp1 = _mm256_sub_epi32(in0_w1, in1_w0); + temp1 = _mm256_add_epi32(temp1, _r); + *in1 = _mm256_srai_epi32(temp1, cos_bit); +} + +static INLINE void btf_32_avx2_type1(const int32_t w0, const int32_t w1, + __m256i *in0, __m256i *in1, + const __m256i _r, const int32_t cos_bit) { + __m256i _in0 = *in0; + __m256i _in1 = *in1; + const __m256i ww0 = _mm256_set1_epi32(w0); + const __m256i ww1 = _mm256_set1_epi32(w1); + const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0); + const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1); + __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1); + temp0 = _mm256_add_epi32(temp0, _r); + *in0 = _mm256_srai_epi32(temp0, cos_bit); + const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1); + const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0); + __m256i temp1 = _mm256_sub_epi32(in1_w0, in0_w1); + temp1 = _mm256_add_epi32(temp1, _r); + *in1 = _mm256_srai_epi32(temp1, cos_bit); +} + +// out0 = in0*w0 + in1*w1 +// out1 = -in1*w0 + in0*w1 +static INLINE void btf_32_avx2_type0_new(const __m256i ww0, const __m256i ww1, + __m256i *in0, __m256i *in1, + const __m256i _r, + const int32_t cos_bit) { + __m256i _in0 = *in0; + __m256i _in1 = *in1; + const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0); + const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1); + __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1); + temp0 = _mm256_add_epi32(temp0, _r); + *in0 = _mm256_srai_epi32(temp0, cos_bit); + const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1); + const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0); + __m256i temp1 = _mm256_sub_epi32(in0_w1, in1_w0); + temp1 = _mm256_add_epi32(temp1, _r); + *in1 = _mm256_srai_epi32(temp1, cos_bit); +} + +// out0 = in0*w0 + in1*w1 +// out1 = in1*w0 - in0*w1 +static INLINE void btf_32_avx2_type1_new(const __m256i ww0, const __m256i ww1, + __m256i *in0, __m256i *in1, + const __m256i _r, + const int32_t cos_bit) { + __m256i _in0 = *in0; + __m256i _in1 = *in1; + const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0); + const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1); + __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1); + temp0 = _mm256_add_epi32(temp0, _r); + *in0 = _mm256_srai_epi32(temp0, cos_bit); + const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1); + const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0); + __m256i temp1 = _mm256_sub_epi32(in1_w0, in0_w1); + temp1 = _mm256_add_epi32(temp1, _r); + *in1 = _mm256_srai_epi32(temp1, cos_bit); +} + +#endif // AV1_FWD_TXFM_AVX2_H_ diff --git a/third_party/aom/av1/encoder/x86/corner_match_sse4.c b/third_party/aom/av1/encoder/x86/corner_match_sse4.c index 381f757da..93f37b71d 100644 --- a/third_party/aom/av1/encoder/x86/corner_match_sse4.c +++ b/third_party/aom/av1/encoder/x86/corner_match_sse4.c @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + #include <stdlib.h> #include <memory.h> #include <math.h> diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c new file mode 100644 index 000000000..f776e84c7 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <immintrin.h> +#include <smmintrin.h> + +#include "aom_dsp/x86/synonyms.h" + +#include "aom/aom_integer.h" + +#include "av1/common/reconinter.h" + +#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS) + +/** + * See av1_wedge_sse_from_residuals_c + */ +uint64_t av1_wedge_sse_from_residuals_avx2(const int16_t *r1, const int16_t *d, + const uint8_t *m, int N) { + int n = -N; + + uint64_t csse; + + const __m256i v_mask_max_w = _mm256_set1_epi16(MAX_MASK_VALUE); + const __m256i v_zext_q = _mm256_set1_epi64x(0xffffffff); + + __m256i v_acc0_q = _mm256_setzero_si256(); + + assert(N % 64 == 0); + + r1 += N; + d += N; + m += N; + + do { + const __m256i v_r0_w = _mm256_lddqu_si256((__m256i *)(r1 + n)); + const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(d + n)); + const __m128i v_m01_b = _mm_lddqu_si128((__m128i *)(m + n)); + + const __m256i v_rd0l_w = _mm256_unpacklo_epi16(v_d0_w, v_r0_w); + const __m256i v_rd0h_w = _mm256_unpackhi_epi16(v_d0_w, v_r0_w); + const __m256i v_m0_w = _mm256_cvtepu8_epi16(v_m01_b); + + const __m256i v_m0l_w = _mm256_unpacklo_epi16(v_m0_w, v_mask_max_w); + const __m256i v_m0h_w = _mm256_unpackhi_epi16(v_m0_w, v_mask_max_w); + + const __m256i v_t0l_d = _mm256_madd_epi16(v_rd0l_w, v_m0l_w); + const __m256i v_t0h_d = _mm256_madd_epi16(v_rd0h_w, v_m0h_w); + + const __m256i v_t0_w = _mm256_packs_epi32(v_t0l_d, v_t0h_d); + + const __m256i v_sq0_d = _mm256_madd_epi16(v_t0_w, v_t0_w); + + const __m256i v_sum0_q = _mm256_add_epi64( + _mm256_and_si256(v_sq0_d, v_zext_q), _mm256_srli_epi64(v_sq0_d, 32)); + + v_acc0_q = _mm256_add_epi64(v_acc0_q, v_sum0_q); + + n += 16; + } while (n); + + v_acc0_q = _mm256_add_epi64(v_acc0_q, _mm256_srli_si256(v_acc0_q, 8)); + __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc0_q); + __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc0_q, 1); + v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1); +#if ARCH_X86_64 + csse = (uint64_t)_mm_extract_epi64(v_acc_q_0, 0); +#else + xx_storel_64(&csse, v_acc_q_0); +#endif + + return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS); +} + +/** + * See av1_wedge_sign_from_residuals_c + */ +int av1_wedge_sign_from_residuals_avx2(const int16_t *ds, const uint8_t *m, + int N, int64_t limit) { + int64_t acc; + __m256i v_acc0_d = _mm256_setzero_si256(); + + // Input size limited to 8192 by the use of 32 bit accumulators and m + // being between [0, 64]. Overflow might happen at larger sizes, + // though it is practically impossible on real video input. + assert(N < 8192); + assert(N % 64 == 0); + + do { + const __m256i v_m01_b = _mm256_lddqu_si256((__m256i *)(m)); + const __m256i v_m23_b = _mm256_lddqu_si256((__m256i *)(m + 32)); + + const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(ds)); + const __m256i v_d1_w = _mm256_lddqu_si256((__m256i *)(ds + 16)); + const __m256i v_d2_w = _mm256_lddqu_si256((__m256i *)(ds + 32)); + const __m256i v_d3_w = _mm256_lddqu_si256((__m256i *)(ds + 48)); + + const __m256i v_m0_w = + _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m01_b)); + const __m256i v_m1_w = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m01_b, 1)); + const __m256i v_m2_w = + _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m23_b)); + const __m256i v_m3_w = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m23_b, 1)); + + const __m256i v_p0_d = _mm256_madd_epi16(v_d0_w, v_m0_w); + const __m256i v_p1_d = _mm256_madd_epi16(v_d1_w, v_m1_w); + const __m256i v_p2_d = _mm256_madd_epi16(v_d2_w, v_m2_w); + const __m256i v_p3_d = _mm256_madd_epi16(v_d3_w, v_m3_w); + + const __m256i v_p01_d = _mm256_add_epi32(v_p0_d, v_p1_d); + const __m256i v_p23_d = _mm256_add_epi32(v_p2_d, v_p3_d); + + const __m256i v_p0123_d = _mm256_add_epi32(v_p01_d, v_p23_d); + + v_acc0_d = _mm256_add_epi32(v_acc0_d, v_p0123_d); + + ds += 64; + m += 64; + + N -= 64; + } while (N); + + __m256i v_sign_d = _mm256_srai_epi32(v_acc0_d, 31); + v_acc0_d = _mm256_add_epi64(_mm256_unpacklo_epi32(v_acc0_d, v_sign_d), + _mm256_unpackhi_epi32(v_acc0_d, v_sign_d)); + + __m256i v_acc_q = _mm256_add_epi64(v_acc0_d, _mm256_srli_si256(v_acc0_d, 8)); + + __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc_q); + __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc_q, 1); + v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1); + +#if ARCH_X86_64 + acc = (uint64_t)_mm_extract_epi64(v_acc_q_0, 0); +#else + xx_storel_64(&acc, v_acc_q_0); +#endif + + return acc > limit; +} + +/** + * av1_wedge_compute_delta_squares_c + */ +void av1_wedge_compute_delta_squares_avx2(int16_t *d, const int16_t *a, + const int16_t *b, int N) { + const __m256i v_neg_w = _mm256_set1_epi32(0xffff0001); + + assert(N % 64 == 0); + + do { + const __m256i v_a0_w = _mm256_lddqu_si256((__m256i *)(a)); + const __m256i v_b0_w = _mm256_lddqu_si256((__m256i *)(b)); + const __m256i v_a1_w = _mm256_lddqu_si256((__m256i *)(a + 16)); + const __m256i v_b1_w = _mm256_lddqu_si256((__m256i *)(b + 16)); + const __m256i v_a2_w = _mm256_lddqu_si256((__m256i *)(a + 32)); + const __m256i v_b2_w = _mm256_lddqu_si256((__m256i *)(b + 32)); + const __m256i v_a3_w = _mm256_lddqu_si256((__m256i *)(a + 48)); + const __m256i v_b3_w = _mm256_lddqu_si256((__m256i *)(b + 48)); + + const __m256i v_ab0l_w = _mm256_unpacklo_epi16(v_a0_w, v_b0_w); + const __m256i v_ab0h_w = _mm256_unpackhi_epi16(v_a0_w, v_b0_w); + const __m256i v_ab1l_w = _mm256_unpacklo_epi16(v_a1_w, v_b1_w); + const __m256i v_ab1h_w = _mm256_unpackhi_epi16(v_a1_w, v_b1_w); + const __m256i v_ab2l_w = _mm256_unpacklo_epi16(v_a2_w, v_b2_w); + const __m256i v_ab2h_w = _mm256_unpackhi_epi16(v_a2_w, v_b2_w); + const __m256i v_ab3l_w = _mm256_unpacklo_epi16(v_a3_w, v_b3_w); + const __m256i v_ab3h_w = _mm256_unpackhi_epi16(v_a3_w, v_b3_w); + + // Negate top word of pairs + const __m256i v_abl0n_w = _mm256_sign_epi16(v_ab0l_w, v_neg_w); + const __m256i v_abh0n_w = _mm256_sign_epi16(v_ab0h_w, v_neg_w); + const __m256i v_abl1n_w = _mm256_sign_epi16(v_ab1l_w, v_neg_w); + const __m256i v_abh1n_w = _mm256_sign_epi16(v_ab1h_w, v_neg_w); + const __m256i v_abl2n_w = _mm256_sign_epi16(v_ab2l_w, v_neg_w); + const __m256i v_abh2n_w = _mm256_sign_epi16(v_ab2h_w, v_neg_w); + const __m256i v_abl3n_w = _mm256_sign_epi16(v_ab3l_w, v_neg_w); + const __m256i v_abh3n_w = _mm256_sign_epi16(v_ab3h_w, v_neg_w); + + const __m256i v_r0l_w = _mm256_madd_epi16(v_ab0l_w, v_abl0n_w); + const __m256i v_r0h_w = _mm256_madd_epi16(v_ab0h_w, v_abh0n_w); + const __m256i v_r1l_w = _mm256_madd_epi16(v_ab1l_w, v_abl1n_w); + const __m256i v_r1h_w = _mm256_madd_epi16(v_ab1h_w, v_abh1n_w); + const __m256i v_r2l_w = _mm256_madd_epi16(v_ab2l_w, v_abl2n_w); + const __m256i v_r2h_w = _mm256_madd_epi16(v_ab2h_w, v_abh2n_w); + const __m256i v_r3l_w = _mm256_madd_epi16(v_ab3l_w, v_abl3n_w); + const __m256i v_r3h_w = _mm256_madd_epi16(v_ab3h_w, v_abh3n_w); + + const __m256i v_r0_w = _mm256_packs_epi32(v_r0l_w, v_r0h_w); + const __m256i v_r1_w = _mm256_packs_epi32(v_r1l_w, v_r1h_w); + const __m256i v_r2_w = _mm256_packs_epi32(v_r2l_w, v_r2h_w); + const __m256i v_r3_w = _mm256_packs_epi32(v_r3l_w, v_r3h_w); + + _mm256_store_si256((__m256i *)(d), v_r0_w); + _mm256_store_si256((__m256i *)(d + 16), v_r1_w); + _mm256_store_si256((__m256i *)(d + 32), v_r2_w); + _mm256_store_si256((__m256i *)(d + 48), v_r3_w); + + a += 64; + b += 64; + d += 64; + N -= 64; + } while (N); +} diff --git a/third_party/aom/build/cmake/aom_config_defaults.cmake b/third_party/aom/build/cmake/aom_config_defaults.cmake index c7252f064..19af5c43b 100644 --- a/third_party/aom/build/cmake/aom_config_defaults.cmake +++ b/third_party/aom/build/cmake/aom_config_defaults.cmake @@ -76,6 +76,7 @@ set(CONFIG_MISMATCH_DEBUG 0 CACHE NUMBER "Mismatch debugging flag.") set(CONFIG_ACCOUNTING 0 CACHE NUMBER "Enables bit accounting.") set(CONFIG_ANALYZER 0 CACHE NUMBER "Enables bit stream analyzer.") set(CONFIG_COEFFICIENT_RANGE_CHECKING 0 CACHE NUMBER "Coefficient range check.") +set(CONFIG_DENOISE 0 CACHE NUMBER "Denoise/noise modeling support in encoder.") set(CONFIG_FILEOPTIONS 1 CACHE NUMBER "Enables encoder config file support.") set(CONFIG_INSPECTION 0 CACHE NUMBER "Enables bitstream inspection.") set(CONFIG_INTERNAL_STATS 0 CACHE NUMBER "Enables internal encoder stats.") diff --git a/third_party/aom/build/cmake/aom_configure.cmake b/third_party/aom/build/cmake/aom_configure.cmake index 5d782aaf9..a12389778 100644 --- a/third_party/aom/build/cmake/aom_configure.cmake +++ b/third_party/aom/build/cmake/aom_configure.cmake @@ -40,14 +40,6 @@ include("${AOM_ROOT}/build/cmake/compiler_flags.cmake") include("${AOM_ROOT}/build/cmake/compiler_tests.cmake") include("${AOM_ROOT}/build/cmake/util.cmake") -# Build a list of all configurable variables. -get_cmake_property(cmake_cache_vars CACHE_VARIABLES) -foreach(var ${cmake_cache_vars}) - if("${var}" MATCHES "^CONFIG_") - list(APPEND AOM_CONFIG_VARS ${var}) - endif() -endforeach() - # Detect target CPU. if(NOT AOM_TARGET_CPU) if("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "AMD64" OR @@ -269,7 +261,7 @@ else() add_compiler_flag_if_supported("-Wlogical-op") add_compiler_flag_if_supported("-Wpointer-arith") add_compiler_flag_if_supported("-Wsign-compare") - add_compiler_flag_if_supported("-Wstack-usage=320000") + add_compiler_flag_if_supported("-Wstack-usage=360000") add_compiler_flag_if_supported("-Wstring-conversion") add_compiler_flag_if_supported("-Wtype-limits") add_compiler_flag_if_supported("-Wuninitialized") @@ -334,9 +326,6 @@ if(NOT PERL_FOUND) message(FATAL_ERROR "Perl is required to build libaom.") endif() -configure_file("${AOM_CONFIG_DIR}/rtcd_config.cmake" - "${AOM_CONFIG_DIR}/${AOM_TARGET_CPU}_rtcd_config.rtcd") - set(AOM_RTCD_CONFIG_FILE_LIST "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl" "${AOM_ROOT}/aom_scale/aom_scale_rtcd.pl" "${AOM_ROOT}/av1/common/av1_rtcd_defs.pl") @@ -355,13 +344,12 @@ foreach(NUM RANGE ${AOM_RTCD_CUSTOM_COMMAND_COUNT}) list(GET AOM_RTCD_HEADER_FILE_LIST ${NUM} AOM_RTCD_HEADER_FILE) list(GET AOM_RTCD_SOURCE_FILE_LIST ${NUM} AOM_RTCD_SOURCE_FILE) list(GET AOM_RTCD_SYMBOL_LIST ${NUM} AOM_RTCD_SYMBOL) - execute_process( - COMMAND ${PERL_EXECUTABLE} "${AOM_ROOT}/build/make/rtcd.pl" - --arch=${AOM_TARGET_CPU} - --sym=${AOM_RTCD_SYMBOL} ${AOM_RTCD_FLAGS} - --config=${AOM_CONFIG_DIR}/${AOM_TARGET_CPU}_rtcd_config.rtcd - ${AOM_RTCD_CONFIG_FILE} - OUTPUT_FILE ${AOM_RTCD_HEADER_FILE}) + execute_process(COMMAND ${PERL_EXECUTABLE} "${AOM_ROOT}/build/make/rtcd.pl" + --arch=${AOM_TARGET_CPU} + --sym=${AOM_RTCD_SYMBOL} ${AOM_RTCD_FLAGS} + --config=${AOM_CONFIG_DIR}/config/aom_config.h + ${AOM_RTCD_CONFIG_FILE} + OUTPUT_FILE ${AOM_RTCD_HEADER_FILE}) endforeach() # Generate aom_version.h. diff --git a/third_party/aom/build/cmake/aom_optimization.cmake b/third_party/aom/build/cmake/aom_optimization.cmake index 069ea1bb9..ce3dc0340 100644 --- a/third_party/aom/build/cmake/aom_optimization.cmake +++ b/third_party/aom/build/cmake/aom_optimization.cmake @@ -197,16 +197,16 @@ endfunction() # include file, $source is the C source file, and $symbol is used for the symbol # argument passed to rtcd.pl. function(add_rtcd_build_step config output source symbol) - add_custom_command( - OUTPUT ${output} - COMMAND ${PERL_EXECUTABLE} ARGS "${AOM_ROOT}/build/make/rtcd.pl" - --arch=${AOM_TARGET_CPU} - --sym=${symbol} ${AOM_RTCD_FLAGS} - --config=${AOM_CONFIG_DIR}/${AOM_TARGET_CPU}_rtcd_config.rtcd - ${config} > ${output} - DEPENDS ${config} - COMMENT "Generating ${output}" - WORKING_DIRECTORY ${AOM_CONFIG_DIR} VERBATIM) + add_custom_command(OUTPUT ${output} + COMMAND ${PERL_EXECUTABLE} ARGS + "${AOM_ROOT}/build/make/rtcd.pl" + --arch=${AOM_TARGET_CPU} + --sym=${symbol} ${AOM_RTCD_FLAGS} + --config=${AOM_CONFIG_DIR}/config/aom_config.h + ${config} > ${output} + DEPENDS ${config} + COMMENT "Generating ${output}" + WORKING_DIRECTORY ${AOM_CONFIG_DIR} VERBATIM) set_property(SOURCE ${source} PROPERTY OBJECT_DEPENDS ${output}) set_property(SOURCE ${output} PROPERTY GENERATED) endfunction() diff --git a/third_party/aom/build/cmake/cpu.cmake b/third_party/aom/build/cmake/cpu.cmake index 6f866d04d..6e8089e63 100644 --- a/third_party/aom/build/cmake/cpu.cmake +++ b/third_party/aom/build/cmake/cpu.cmake @@ -91,9 +91,3 @@ elseif("${AOM_TARGET_CPU}" MATCHES "^x86") endif() endforeach() endif() - -foreach(config_var ${AOM_CONFIG_VARS}) - if(${${config_var}}) - set(RTCD_${config_var} yes) - endif() -endforeach() diff --git a/third_party/aom/build/cmake/generate_aom_config_templates.cmake b/third_party/aom/build/cmake/generate_aom_config_templates.cmake index 6ea02295c..8fbb4737b 100644 --- a/third_party/aom/build/cmake/generate_aom_config_templates.cmake +++ b/third_party/aom/build/cmake/generate_aom_config_templates.cmake @@ -98,13 +98,3 @@ foreach(cache_var ${cmake_cache_vars}) "${cache_var} equ \${${cache_var}}\n") endif() endforeach() - -set(aom_rtcd_config_template "${AOM_CONFIG_DIR}/rtcd_config.cmake") -file(WRITE "${aom_rtcd_config_template}" ${cmake_file_header_block}) -foreach(cache_var ${cmake_cache_vars}) - if(NOT "${cache_var}" MATCHES "AOM_CONFIG_DIR\|AOM_ROOT\|^CMAKE_\|INLINE") - file(APPEND "${aom_rtcd_config_template}" - "${cache_var}=\${RTCD_${cache_var}}\n") - endif() -endforeach() - diff --git a/third_party/aom/build/make/iosbuild.sh b/third_party/aom/build/make/iosbuild.sh index 75f0b1b08..167ece200 100755 --- a/third_party/aom/build/make/iosbuild.sh +++ b/third_party/aom/build/make/iosbuild.sh @@ -245,7 +245,7 @@ build_framework() { # Trap function. Cleans up the subtree used to build all targets contained in # $TARGETS. cleanup() { - local readonly res=$? + local res=$? cd "${ORIG_PWD}" if [ $res -ne 0 ]; then diff --git a/third_party/aom/build/make/rtcd.pl b/third_party/aom/build/make/rtcd.pl index 8d8be25c0..b849a1eba 100755 --- a/third_party/aom/build/make/rtcd.pl +++ b/third_party/aom/build/make/rtcd.pl @@ -58,11 +58,15 @@ open CONFIG_FILE, $opts{config} or my %config = (); while (<CONFIG_FILE>) { - next if !/^(?:CONFIG_|HAVE_)/; + next if !/^#define\s+(?:CONFIG_|HAVE_)/; chomp; - s/\r$//; - my @pair = split /=/; - $config{$pair[0]} = $pair[1]; + my @line_components = split /\s/; + scalar @line_components > 2 or + die "Invalid input passed to rtcd.pl via $opts{config}."; + # $line_components[0] = #define + # $line_components[1] = flag name (CONFIG_SOMETHING or HAVE_SOMETHING) + # $line_components[2] = flag value (0 or 1) + $config{$line_components[1]} = "$line_components[2]" eq "1" ? "yes" : ""; } close CONFIG_FILE; @@ -415,19 +419,11 @@ if ($opts{arch} eq 'x86') { x86; } elsif ($opts{arch} eq 'mips32' || $opts{arch} eq 'mips64') { @ALL_ARCHS = filter("$opts{arch}"); - open CONFIG_FILE, $opts{config} or - die "Error opening config file '$opts{config}': $!\n"; - while (<CONFIG_FILE>) { - if (/HAVE_DSPR2=yes/) { - @ALL_ARCHS = filter("$opts{arch}", qw/dspr2/); - last; - } - if (/HAVE_MSA=yes/) { - @ALL_ARCHS = filter("$opts{arch}", qw/msa/); - last; - } + if (aom_config("HAVE_DSPR2") eq "yes") { + @ALL_ARCHS = filter("$opts{arch}", qw/dspr2/); + } elsif (aom_config("HAVE_MSA") eq "yes") { + @ALL_ARCHS = filter("$opts{arch}", qw/msa/); } - close CONFIG_FILE; mips; } elsif ($opts{arch} =~ /armv7\w?/) { @ALL_ARCHS = filter(qw/neon/); @@ -466,4 +462,4 @@ Options: --disable-EXT Disable support for EXT extensions --require-EXT Require support for EXT extensions --sym=SYMBOL Unique symbol to use for RTCD initialization function - --config=FILE File with CONFIG_FOO=yes lines to parse + --config=FILE Path to file containing C preprocessor directives to parse diff --git a/third_party/aom/common/tools_common.c b/third_party/aom/common/tools_common.c index 359ec7341..21cd80026 100644 --- a/third_party/aom/common/tools_common.c +++ b/third_party/aom/common/tools_common.c @@ -236,7 +236,7 @@ double sse_to_psnr(double samples, double peak, double sse) { } // TODO(debargha): Consolidate the functions below into a separate file. -static void highbd_img_upshift(aom_image_t *dst, aom_image_t *src, +static void highbd_img_upshift(aom_image_t *dst, const aom_image_t *src, int input_shift) { // Note the offset is 1 less than half. const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0; @@ -262,8 +262,8 @@ static void highbd_img_upshift(aom_image_t *dst, aom_image_t *src, h = (h + src->y_chroma_shift) >> src->y_chroma_shift; } for (y = 0; y < h; y++) { - uint16_t *p_src = - (uint16_t *)(src->planes[plane] + y * src->stride[plane]); + const uint16_t *p_src = + (const uint16_t *)(src->planes[plane] + y * src->stride[plane]); uint16_t *p_dst = (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]); for (x = 0; x < w; x++) *p_dst++ = (*p_src++ << input_shift) + offset; @@ -271,7 +271,7 @@ static void highbd_img_upshift(aom_image_t *dst, aom_image_t *src, } } -static void lowbd_img_upshift(aom_image_t *dst, aom_image_t *src, +static void lowbd_img_upshift(aom_image_t *dst, const aom_image_t *src, int input_shift) { // Note the offset is 1 less than half. const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0; @@ -297,7 +297,7 @@ static void lowbd_img_upshift(aom_image_t *dst, aom_image_t *src, h = (h + src->y_chroma_shift) >> src->y_chroma_shift; } for (y = 0; y < h; y++) { - uint8_t *p_src = src->planes[plane] + y * src->stride[plane]; + const uint8_t *p_src = src->planes[plane] + y * src->stride[plane]; uint16_t *p_dst = (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]); for (x = 0; x < w; x++) { @@ -307,7 +307,8 @@ static void lowbd_img_upshift(aom_image_t *dst, aom_image_t *src, } } -void aom_img_upshift(aom_image_t *dst, aom_image_t *src, int input_shift) { +void aom_img_upshift(aom_image_t *dst, const aom_image_t *src, + int input_shift) { if (src->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { highbd_img_upshift(dst, src, input_shift); } else { @@ -315,7 +316,7 @@ void aom_img_upshift(aom_image_t *dst, aom_image_t *src, int input_shift) { } } -void aom_img_truncate_16_to_8(aom_image_t *dst, aom_image_t *src) { +void aom_img_truncate_16_to_8(aom_image_t *dst, const aom_image_t *src) { int plane; if (dst->fmt + AOM_IMG_FMT_HIGHBITDEPTH != src->fmt || dst->d_w != src->d_w || dst->d_h != src->d_h || dst->x_chroma_shift != src->x_chroma_shift || @@ -337,8 +338,8 @@ void aom_img_truncate_16_to_8(aom_image_t *dst, aom_image_t *src) { h = (h + src->y_chroma_shift) >> src->y_chroma_shift; } for (y = 0; y < h; y++) { - uint16_t *p_src = - (uint16_t *)(src->planes[plane] + y * src->stride[plane]); + const uint16_t *p_src = + (const uint16_t *)(src->planes[plane] + y * src->stride[plane]); uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane]; for (x = 0; x < w; x++) { *p_dst++ = (uint8_t)(*p_src++); @@ -347,7 +348,7 @@ void aom_img_truncate_16_to_8(aom_image_t *dst, aom_image_t *src) { } } -static void highbd_img_downshift(aom_image_t *dst, aom_image_t *src, +static void highbd_img_downshift(aom_image_t *dst, const aom_image_t *src, int down_shift) { int plane; if (dst->d_w != src->d_w || dst->d_h != src->d_h || @@ -371,8 +372,8 @@ static void highbd_img_downshift(aom_image_t *dst, aom_image_t *src, h = (h + src->y_chroma_shift) >> src->y_chroma_shift; } for (y = 0; y < h; y++) { - uint16_t *p_src = - (uint16_t *)(src->planes[plane] + y * src->stride[plane]); + const uint16_t *p_src = + (const uint16_t *)(src->planes[plane] + y * src->stride[plane]); uint16_t *p_dst = (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]); for (x = 0; x < w; x++) *p_dst++ = *p_src++ >> down_shift; @@ -380,7 +381,7 @@ static void highbd_img_downshift(aom_image_t *dst, aom_image_t *src, } } -static void lowbd_img_downshift(aom_image_t *dst, aom_image_t *src, +static void lowbd_img_downshift(aom_image_t *dst, const aom_image_t *src, int down_shift) { int plane; if (dst->d_w != src->d_w || dst->d_h != src->d_h || @@ -404,8 +405,8 @@ static void lowbd_img_downshift(aom_image_t *dst, aom_image_t *src, h = (h + src->y_chroma_shift) >> src->y_chroma_shift; } for (y = 0; y < h; y++) { - uint16_t *p_src = - (uint16_t *)(src->planes[plane] + y * src->stride[plane]); + const uint16_t *p_src = + (const uint16_t *)(src->planes[plane] + y * src->stride[plane]); uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane]; for (x = 0; x < w; x++) { *p_dst++ = *p_src++ >> down_shift; @@ -414,7 +415,8 @@ static void lowbd_img_downshift(aom_image_t *dst, aom_image_t *src, } } -void aom_img_downshift(aom_image_t *dst, aom_image_t *src, int down_shift) { +void aom_img_downshift(aom_image_t *dst, const aom_image_t *src, + int down_shift) { if (dst->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { highbd_img_downshift(dst, src, down_shift); } else { diff --git a/third_party/aom/common/tools_common.h b/third_party/aom/common/tools_common.h index abee4ea63..587903650 100644 --- a/third_party/aom/common/tools_common.h +++ b/third_party/aom/common/tools_common.h @@ -152,9 +152,10 @@ void aom_img_write(const aom_image_t *img, FILE *file); int aom_img_read(aom_image_t *img, FILE *file); double sse_to_psnr(double samples, double peak, double mse); -void aom_img_upshift(aom_image_t *dst, aom_image_t *src, int input_shift); -void aom_img_downshift(aom_image_t *dst, aom_image_t *src, int down_shift); -void aom_img_truncate_16_to_8(aom_image_t *dst, aom_image_t *src); +void aom_img_upshift(aom_image_t *dst, const aom_image_t *src, int input_shift); +void aom_img_downshift(aom_image_t *dst, const aom_image_t *src, + int down_shift); +void aom_img_truncate_16_to_8(aom_image_t *dst, const aom_image_t *src); #ifdef __cplusplus } /* extern "C" */ diff --git a/third_party/aom/common/video_common.h b/third_party/aom/common/video_common.h index f96af4b7e..965038d39 100644 --- a/third_party/aom/common/video_common.h +++ b/third_party/aom/common/video_common.h @@ -19,6 +19,7 @@ typedef struct { int frame_width; int frame_height; struct AvxRational time_base; + unsigned int is_annexb; } AvxVideoInfo; #endif // VIDEO_COMMON_H_ diff --git a/third_party/aom/common/video_reader.c b/third_party/aom/common/video_reader.c index f5327c928..b54c250c2 100644 --- a/third_party/aom/common/video_reader.c +++ b/third_party/aom/common/video_reader.c @@ -8,19 +8,20 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "common/video_reader.h" - #include <stdlib.h> #include <string.h> +#include <assert.h> #include "aom_ports/mem_ops.h" #include "common/ivfdec.h" - -static const char *const kIVFSignature = "DKIF"; +#include "common/obudec.h" +#include "common/tools_common.h" +#include "common/video_reader.h" struct AvxVideoReaderStruct { AvxVideoInfo info; - FILE *file; + struct AvxInputContext input_ctx; + struct ObuDecInputContext obu_ctx; uint8_t *buffer; size_t buffer_size; size_t frame_size; @@ -28,42 +29,64 @@ struct AvxVideoReaderStruct { }; AvxVideoReader *aom_video_reader_open(const char *filename) { - char header[32]; AvxVideoReader *reader = NULL; FILE *const file = fopen(filename, "rb"); if (!file) return NULL; // Can't open file - if (fread(header, 1, 32, file) != 32) return NULL; // Can't read file header - - if (memcmp(kIVFSignature, header, 4) != 0) - return NULL; // Wrong IVF signature - - if (mem_get_le16(header + 4) != 0) return NULL; // Wrong IVF version - reader = (AvxVideoReader *)calloc(1, sizeof(*reader)); - if (!reader) return NULL; // Can't allocate AvxVideoReader + if (!reader) { + fclose(file); + return NULL; // Can't allocate AvxVideoReader + } - reader->file = file; - reader->info.codec_fourcc = mem_get_le32(header + 8); - reader->info.frame_width = mem_get_le16(header + 12); - reader->info.frame_height = mem_get_le16(header + 14); - reader->info.time_base.numerator = mem_get_le32(header + 16); - reader->info.time_base.denominator = mem_get_le32(header + 20); + reader->input_ctx.filename = filename; + reader->input_ctx.file = file; + reader->obu_ctx.avx_ctx = &reader->input_ctx; + reader->obu_ctx.is_annexb = 1; + + if (file_is_ivf(&reader->input_ctx)) { + reader->input_ctx.file_type = FILE_TYPE_IVF; + reader->info.codec_fourcc = reader->input_ctx.fourcc; + reader->info.frame_width = reader->input_ctx.width; + reader->info.frame_height = reader->input_ctx.height; + } else if (file_is_obu(&reader->obu_ctx)) { + reader->input_ctx.file_type = FILE_TYPE_OBU; + // assume AV1 + reader->info.codec_fourcc = AV1_FOURCC; + reader->info.is_annexb = reader->obu_ctx.is_annexb; + } else { + fclose(file); + free(reader); + return NULL; // Unknown file type + } return reader; } void aom_video_reader_close(AvxVideoReader *reader) { if (reader) { - fclose(reader->file); + fclose(reader->input_ctx.file); + if (reader->input_ctx.file_type == FILE_TYPE_OBU) { + obudec_free(&reader->obu_ctx); + } free(reader->buffer); free(reader); } } int aom_video_reader_read_frame(AvxVideoReader *reader) { - return !ivf_read_frame(reader->file, &reader->buffer, &reader->frame_size, - &reader->buffer_size, &reader->pts); + if (reader->input_ctx.file_type == FILE_TYPE_IVF) { + return !ivf_read_frame(reader->input_ctx.file, &reader->buffer, + &reader->frame_size, &reader->buffer_size, + &reader->pts); + } else if (reader->input_ctx.file_type == FILE_TYPE_OBU) { + return !obudec_read_temporal_unit(&reader->obu_ctx, &reader->buffer, + &reader->frame_size, + &reader->buffer_size); + } else { + assert(0); + return 0; + } } const uint8_t *aom_video_reader_get_frame(AvxVideoReader *reader, @@ -77,7 +100,9 @@ int64_t aom_video_reader_get_frame_pts(AvxVideoReader *reader) { return (int64_t)reader->pts; } -FILE *aom_video_reader_get_file(AvxVideoReader *reader) { return reader->file; } +FILE *aom_video_reader_get_file(AvxVideoReader *reader) { + return reader->input_ctx.file; +} const AvxVideoInfo *aom_video_reader_get_info(AvxVideoReader *reader) { return &reader->info; diff --git a/third_party/aom/examples/aom_cx_set_ref.c b/third_party/aom/examples/aom_cx_set_ref.c index e02e94c07..fc037d484 100644 --- a/third_party/aom/examples/aom_cx_set_ref.c +++ b/third_party/aom/examples/aom_cx_set_ref.c @@ -163,7 +163,7 @@ static int encode_frame(aom_codec_ctx_t *ecodec, aom_image_t *img, // Copy out first decoded frame, and use it as reference later. if (*frame_out == 1 && ext_ref != NULL) - if (aom_codec_control(dcodec, AV1_GET_NEW_FRAME_IMAGE, ext_ref)) + if (aom_codec_control(dcodec, AV1_COPY_NEW_FRAME_IMAGE, ext_ref)) die_codec(dcodec, "Failed to get decoder new frame"); } } diff --git a/third_party/aom/examples/inspect.c b/third_party/aom/examples/inspect.c index 4887fc4a3..9d5f0dcfc 100644 --- a/third_party/aom/examples/inspect.c +++ b/third_party/aom/examples/inspect.c @@ -630,7 +630,9 @@ int read_frame() { die_codec(&codec, "Failed to decode frame."); } int got_any_frames = 0; - while ((img = aom_codec_get_frame(&codec, &iter))) { + aom_image_t *frame_img; + while ((frame_img = aom_codec_get_frame(&codec, &iter))) { + img = frame_img; ++frame_count; got_any_frames = 1; } diff --git a/third_party/aom/examples/lightfield_bitstream_parsing.c b/third_party/aom/examples/lightfield_bitstream_parsing.c index d13f3f172..71d4dec77 100644 --- a/third_party/aom/examples/lightfield_bitstream_parsing.c +++ b/third_party/aom/examples/lightfield_bitstream_parsing.c @@ -12,15 +12,14 @@ // Lightfield Bitstream Parsing // ============================ // -// This is an lightfield bitstream parsing example. It takes an input file +// This is a lightfield bitstream parsing example. It takes an input file // containing the whole compressed lightfield bitstream(ivf file), and parses it // and constructs and outputs a new bitstream that can be decoded by an AV1 -// decoder. The output bitstream contains tile list OBUs. The lf_width and -// lf_height arguments are the number of lightfield images in each dimension. -// The lf_blocksize determines the number of reference images used. +// decoder. The output bitstream contains reference frames(i.e. anchor frames), +// camera frame header, and tile list OBUs. num_references is the number of +// anchor frames coded at the beginning of the light field file. // After running the lightfield encoder, run lightfield bitstream parsing: -// examples/lightfield_bitstream_parsing vase10x10.ivf vase_tile_list.ivf 10 10 -// 5 +// examples/lightfield_bitstream_parsing vase10x10.ivf vase_tile_list.ivf 4 #include <stdio.h> #include <stdlib.h> @@ -35,13 +34,13 @@ #include "common/video_reader.h" #include "common/video_writer.h" +#define MAX_TILES 512 + static const char *exec_name; void usage_exit(void) { - fprintf( - stderr, - "Usage: %s <infile> <outfile> <lf_width> <lf_height> <lf_blocksize> \n", - exec_name); + fprintf(stderr, "Usage: %s <infile> <outfile> <num_references> \n", + exec_name); exit(EXIT_FAILURE); } @@ -95,39 +94,36 @@ const TILE_LIST_INFO tile_list[2][9] = { { 50, 2, 5, 4 } }, }; +static int get_image_bps(aom_img_fmt_t fmt) { + switch (fmt) { + case AOM_IMG_FMT_I420: return 12; + case AOM_IMG_FMT_I422: return 16; + case AOM_IMG_FMT_I444: return 24; + case AOM_IMG_FMT_I42016: return 24; + case AOM_IMG_FMT_I42216: return 32; + case AOM_IMG_FMT_I44416: return 48; + default: die("Invalid image format"); + } +} + int main(int argc, char **argv) { aom_codec_ctx_t codec; AvxVideoReader *reader = NULL; AvxVideoWriter *writer = NULL; const AvxInterface *decoder = NULL; const AvxVideoInfo *info = NULL; - const char *lf_width_arg; - const char *lf_height_arg; - const char *lf_blocksize_arg; - int width, height; - int lf_width, lf_height; - int lf_blocksize; - int u_blocks, v_blocks; + int num_references; int n, i; aom_codec_pts_t pts; exec_name = argv[0]; - if (argc != 6) die("Invalid number of arguments."); + if (argc != 4) die("Invalid number of arguments."); reader = aom_video_reader_open(argv[1]); if (!reader) die("Failed to open %s for reading.", argv[1]); - lf_width_arg = argv[3]; - lf_height_arg = argv[4]; - lf_blocksize_arg = argv[5]; - - lf_width = (int)strtol(lf_width_arg, NULL, 0); - lf_height = (int)strtol(lf_height_arg, NULL, 0); - lf_blocksize = (int)strtol(lf_blocksize_arg, NULL, 0); - + num_references = (int)strtol(argv[3], NULL, 0); info = aom_video_reader_get_info(reader); - width = info->frame_width; - height = info->frame_height; // The writer to write out ivf file in tile list OBU, which can be decoded by // AV1 decoder. @@ -144,11 +140,6 @@ int main(int argc, char **argv) { // Decode anchor frames. aom_codec_control_(&codec, AV1_SET_TILE_MODE, 0); - // How many anchor frames we have. - u_blocks = (lf_width + lf_blocksize - 1) / lf_blocksize; - v_blocks = (lf_height + lf_blocksize - 1) / lf_blocksize; - - int num_references = v_blocks * u_blocks; for (i = 0; i < num_references; ++i) { aom_video_reader_read_frame(reader); @@ -223,10 +214,20 @@ int main(int argc, char **argv) { die_codec(&codec, "Failed to copy compressed camera frame header."); } - // Allocate a buffer to store tile list bitstream. Image format - // AOM_IMG_FMT_I420. - size_t data_sz = - ALIGN_POWER_OF_TWO(width, 5) * ALIGN_POWER_OF_TWO(height, 5) * 12 / 8; + // Read out the image format. + aom_img_fmt_t ref_fmt = 0; + if (aom_codec_control(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt)) + die_codec(&codec, "Failed to get the image format"); + const int bps = get_image_bps(ref_fmt); + // read out the tile size. + unsigned int tile_size = 0; + if (aom_codec_control(&codec, AV1D_GET_TILE_SIZE, &tile_size)) + die_codec(&codec, "Failed to get the tile size"); + const unsigned int tile_width = tile_size >> 16; + const unsigned int tile_height = tile_size & 65535; + // Allocate a buffer to store tile list bitstream. + const size_t data_sz = MAX_TILES * ALIGN_POWER_OF_TWO(tile_width, 5) * + ALIGN_POWER_OF_TWO(tile_height, 5) * bps / 8; unsigned char *tl_buf = (unsigned char *)malloc(data_sz); if (tl_buf == NULL) die_codec(&codec, "Failed to allocate tile list buffer."); @@ -251,7 +252,8 @@ int main(int argc, char **argv) { // Write the OBU size using a fixed length_field_size of 4 bytes. saved_obu_size_loc = tl; - aom_wb_write_literal(&wb, 0, 32); + // aom_wb_write_unsigned_literal(&wb, data, bits) requires that bits <= 32. + aom_wb_write_unsigned_literal(&wb, 0, 32); tl += 4; tile_list_obu_header_size += 4; diff --git a/third_party/aom/examples/lightfield_decoder.c b/third_party/aom/examples/lightfield_decoder.c index 625cddcac..5da468413 100644 --- a/third_party/aom/examples/lightfield_decoder.c +++ b/third_party/aom/examples/lightfield_decoder.c @@ -14,14 +14,10 @@ // // This is an example of a simple lightfield decoder. It builds upon the // simple_decoder.c example. It takes an input file containing the compressed -// data (in ivf format), treating it as a lightfield instead of a video and -// will decode a single lightfield tile. The lf_width and lf_height arguments -// are the number of lightfield images in each dimension. The tile to decode -// is specified by the tile_u, tile_v, tile_s, tile_t arguments. The tile_u, -// tile_v specify the image and tile_s, tile_t specify the tile in the image. +// data (in ivf format), treating it as a lightfield instead of a video. // After running the lightfield encoder, run lightfield decoder to decode a -// single tile: -// examples/lightfield_decoder vase10x10.ivf vase_reference.yuv 10 10 5 +// batch of tiles: +// examples/lightfield_decoder vase10x10.ivf vase_reference.yuv 4 #include <stdio.h> #include <stdlib.h> @@ -38,10 +34,7 @@ static const char *exec_name; void usage_exit(void) { - fprintf( - stderr, - "Usage: %s <infile> <outfile> <lf_width> <lf_height> <lf_blocksize>\n", - exec_name); + fprintf(stderr, "Usage: %s <infile> <outfile> <num_references>\n", exec_name); exit(EXIT_FAILURE); } @@ -85,22 +78,14 @@ int main(int argc, char **argv) { AvxVideoReader *reader = NULL; const AvxInterface *decoder = NULL; const AvxVideoInfo *info = NULL; - const char *lf_width_arg; - const char *lf_height_arg; - const char *lf_blocksize_arg; - int width, height; - int lf_width, lf_height; - int lf_blocksize; - int u_blocks; - int v_blocks; + int num_references; aom_image_t reference_images[MAX_EXTERNAL_REFERENCES]; size_t frame_size = 0; const unsigned char *frame = NULL; - int n, i; - + int n, i, j; exec_name = argv[0]; - if (argc != 6) die("Invalid number of arguments."); + if (argc != 4) die("Invalid number of arguments."); reader = aom_video_reader_open(argv[1]); if (!reader) die("Failed to open %s for reading.", argv[1]); @@ -108,16 +93,9 @@ int main(int argc, char **argv) { if (!(outfile = fopen(argv[2], "wb"))) die("Failed to open %s for writing.", argv[2]); - lf_width_arg = argv[3]; - lf_height_arg = argv[4]; - lf_blocksize_arg = argv[5]; - lf_width = (int)strtol(lf_width_arg, NULL, 0); - lf_height = (int)strtol(lf_height_arg, NULL, 0); - lf_blocksize = (int)strtol(lf_blocksize_arg, NULL, 0); + num_references = (int)strtol(argv[3], NULL, 0); info = aom_video_reader_get_info(reader); - width = info->frame_width; - height = info->frame_height; decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); if (!decoder) die("Unknown input codec."); @@ -126,33 +104,39 @@ int main(int argc, char **argv) { if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) die_codec(&codec, "Failed to initialize decoder."); - // How many anchor frames we have. - u_blocks = (lf_width + lf_blocksize - 1) / lf_blocksize; - v_blocks = (lf_height + lf_blocksize - 1) / lf_blocksize; - - int num_references = v_blocks * u_blocks; - - // Allocate memory to store decoded references. - aom_img_fmt_t ref_fmt = AOM_IMG_FMT_I420; - if (!CONFIG_LOWBITDEPTH) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH; - // Allocate memory with the border so that it can be used as a reference. - for (i = 0; i < num_references; i++) { - unsigned int border = AOM_BORDER_IN_PIXELS; - if (!aom_img_alloc_with_border(&reference_images[i], ref_fmt, width, height, - 32, 8, border)) { - die("Failed to allocate references."); - } + if (aom_codec_control(&codec, AV1D_SET_IS_ANNEXB, info->is_annexb)) { + die("Failed to set annex b status"); } // Decode anchor frames. aom_codec_control_(&codec, AV1_SET_TILE_MODE, 0); - for (i = 0; i < num_references; ++i) { aom_video_reader_read_frame(reader); frame = aom_video_reader_get_frame(reader, &frame_size); if (aom_codec_decode(&codec, frame, frame_size, NULL)) die_codec(&codec, "Failed to decode frame."); + if (i == 0) { + aom_img_fmt_t ref_fmt = 0; + if (aom_codec_control(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt)) + die_codec(&codec, "Failed to get the image format"); + + int frame_res[2]; + if (aom_codec_control(&codec, AV1D_GET_FRAME_SIZE, frame_res)) + die_codec(&codec, "Failed to get the image frame size"); + + // Allocate memory to store decoded references. Allocate memory with the + // border so that it can be used as a reference. + for (j = 0; j < num_references; j++) { + unsigned int border = AOM_BORDER_IN_PIXELS; + if (!aom_img_alloc_with_border(&reference_images[j], ref_fmt, + frame_res[0], frame_res[1], 32, 8, + border)) { + die("Failed to allocate references."); + } + } + } + if (aom_codec_control(&codec, AV1_COPY_NEW_FRAME_IMAGE, &reference_images[i])) die_codec(&codec, "Failed to copy decoded reference frame"); diff --git a/third_party/aom/examples/lightfield_encoder.c b/third_party/aom/examples/lightfield_encoder.c index 22daf622c..f8c37fbb0 100644 --- a/third_party/aom/examples/lightfield_encoder.c +++ b/third_party/aom/examples/lightfield_encoder.c @@ -240,7 +240,8 @@ static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name, AvxVideoInfo info = { encoder->fourcc, cfg->g_w, cfg->g_h, - { cfg->g_timebase.num, cfg->g_timebase.den } }; + { cfg->g_timebase.num, cfg->g_timebase.den }, + 0 }; AvxVideoWriter *writer = NULL; aom_codec_ctx_t codec; int frame_count = 0; diff --git a/third_party/aom/examples/lightfield_tile_list_decoder.c b/third_party/aom/examples/lightfield_tile_list_decoder.c index cec6baa2c..2e4f3898d 100644 --- a/third_party/aom/examples/lightfield_tile_list_decoder.c +++ b/third_party/aom/examples/lightfield_tile_list_decoder.c @@ -16,12 +16,12 @@ // contains the anchor frames that are references of the coded tiles, the camera // frame header, and tile list OBUs that include the tile information and the // compressed tile data. This input file is reconstructed from the encoded -// lightfield ivf file, and is decodable by AV1 decoder. The lf_width and -// lf_height arguments are the number of lightfield images in each dimension. -// The lf_blocksize determines the number of reference images used. +// lightfield ivf file, and is decodable by AV1 decoder. num_references is +// the number of anchor frames coded at the beginning of the light field file. +// num_tile_lists is the number of tile lists need to be decoded. // Run lightfield tile list decoder to decode an AV1 tile list file: // examples/lightfield_tile_list_decoder vase_tile_list.ivf vase_tile_list.yuv -// 10 10 5 2 +// 4 2 #include <stdio.h> #include <stdlib.h> @@ -40,8 +40,7 @@ static const char *exec_name; void usage_exit(void) { fprintf(stderr, - "Usage: %s <infile> <outfile> <lf_width> <lf_height> <lf_blocksize> " - "<num_tile_lists>\n", + "Usage: %s <infile> <outfile> <num_references> <num_tile_lists>\n", exec_name); exit(EXIT_FAILURE); } @@ -52,21 +51,16 @@ int main(int argc, char **argv) { AvxVideoReader *reader = NULL; const AvxInterface *decoder = NULL; const AvxVideoInfo *info = NULL; - const char *lf_width_arg; - const char *lf_height_arg; - const char *lf_blocksize_arg; - int width, height; - int lf_width, lf_height, lf_blocksize; - int u_blocks, v_blocks; + int num_references; int num_tile_lists; aom_image_t reference_images[MAX_EXTERNAL_REFERENCES]; size_t frame_size = 0; const unsigned char *frame = NULL; - int i, n; + int i, j, n; exec_name = argv[0]; - if (argc != 7) die("Invalid number of arguments."); + if (argc != 5) die("Invalid number of arguments."); reader = aom_video_reader_open(argv[1]); if (!reader) die("Failed to open %s for reading.", argv[1]); @@ -74,17 +68,10 @@ int main(int argc, char **argv) { if (!(outfile = fopen(argv[2], "wb"))) die("Failed to open %s for writing.", argv[2]); - lf_width_arg = argv[3]; - lf_height_arg = argv[4]; - lf_blocksize_arg = argv[5]; - lf_width = (int)strtol(lf_width_arg, NULL, 0); - lf_height = (int)strtol(lf_height_arg, NULL, 0); - lf_blocksize = (int)strtol(lf_blocksize_arg, NULL, 0); - num_tile_lists = (int)strtol(argv[6], NULL, 0); + num_references = (int)strtol(argv[3], NULL, 0); + num_tile_lists = (int)strtol(argv[4], NULL, 0); info = aom_video_reader_get_info(reader); - width = info->frame_width; - height = info->frame_height; decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); if (!decoder) die("Unknown input codec."); @@ -93,33 +80,39 @@ int main(int argc, char **argv) { if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) die_codec(&codec, "Failed to initialize decoder."); - // How many anchor frames we have. - u_blocks = (lf_width + lf_blocksize - 1) / lf_blocksize; - v_blocks = (lf_height + lf_blocksize - 1) / lf_blocksize; - - int num_references = v_blocks * u_blocks; - - // Allocate memory to store decoded references. - aom_img_fmt_t ref_fmt = AOM_IMG_FMT_I420; - if (!CONFIG_LOWBITDEPTH) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH; - // Allocate memory with the border so that it can be used as a reference. - for (i = 0; i < num_references; i++) { - unsigned int border = AOM_BORDER_IN_PIXELS; - if (!aom_img_alloc_with_border(&reference_images[i], ref_fmt, width, height, - 32, 8, border)) { - die("Failed to allocate references."); - } + if (aom_codec_control(&codec, AV1D_SET_IS_ANNEXB, info->is_annexb)) { + die("Failed to set annex b status"); } // Decode anchor frames. aom_codec_control_(&codec, AV1_SET_TILE_MODE, 0); - for (i = 0; i < num_references; ++i) { aom_video_reader_read_frame(reader); frame = aom_video_reader_get_frame(reader, &frame_size); if (aom_codec_decode(&codec, frame, frame_size, NULL)) die_codec(&codec, "Failed to decode frame."); + if (i == 0) { + aom_img_fmt_t ref_fmt = 0; + if (aom_codec_control(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt)) + die_codec(&codec, "Failed to get the image format"); + + int frame_res[2]; + if (aom_codec_control(&codec, AV1D_GET_FRAME_SIZE, frame_res)) + die_codec(&codec, "Failed to get the image frame size"); + + // Allocate memory to store decoded references. Allocate memory with the + // border so that it can be used as a reference. + for (j = 0; j < num_references; j++) { + unsigned int border = AOM_BORDER_IN_PIXELS; + if (!aom_img_alloc_with_border(&reference_images[j], ref_fmt, + frame_res[0], frame_res[1], 32, 8, + border)) { + die("Failed to allocate references."); + } + } + } + if (aom_codec_control(&codec, AV1_COPY_NEW_FRAME_IMAGE, &reference_images[i])) die_codec(&codec, "Failed to copy decoded reference frame"); @@ -142,13 +135,11 @@ int main(int argc, char **argv) { // Set external references. av1_ext_ref_frame_t set_ext_ref = { &reference_images[0], num_references }; aom_codec_control_(&codec, AV1D_SET_EXT_REF_PTR, &set_ext_ref); - // Must decode the camera frame header first. aom_video_reader_read_frame(reader); frame = aom_video_reader_get_frame(reader, &frame_size); if (aom_codec_decode(&codec, frame, frame_size, NULL)) die_codec(&codec, "Failed to decode the frame."); - // Decode tile lists one by one. for (n = 0; n < num_tile_lists; n++) { aom_video_reader_read_frame(reader); @@ -156,7 +147,6 @@ int main(int argc, char **argv) { if (aom_codec_decode(&codec, frame, frame_size, NULL)) die_codec(&codec, "Failed to decode the tile list."); - aom_codec_iter_t iter = NULL; aom_image_t *img; while ((img = aom_codec_get_frame(&codec, &iter))) diff --git a/third_party/aom/examples/twopass_encoder.c b/third_party/aom/examples/twopass_encoder.c index 1b134cce0..a03bc6cc2 100644 --- a/third_party/aom/examples/twopass_encoder.c +++ b/third_party/aom/examples/twopass_encoder.c @@ -148,7 +148,8 @@ static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name, AvxVideoInfo info = { encoder->fourcc, cfg->g_w, cfg->g_h, - { cfg->g_timebase.num, cfg->g_timebase.den } }; + { cfg->g_timebase.num, cfg->g_timebase.den }, + 0 }; AvxVideoWriter *writer = NULL; aom_codec_ctx_t codec; int frame_count = 0; diff --git a/third_party/aom/test/aomdec.sh b/third_party/aom/test/aomdec.sh index 5f54ae0af..927142287 100755 --- a/third_party/aom/test/aomdec.sh +++ b/third_party/aom/test/aomdec.sh @@ -37,7 +37,7 @@ aomdec_verify_environment() { # input file path and shifted away. All remaining parameters are passed through # to aomdec. aomdec_pipe() { - local readonly input="$1" + local input="$1" shift if [ ! -e "${input}" ]; then elog "Input file ($input) missing in aomdec_pipe()" @@ -51,8 +51,8 @@ aomdec_pipe() { # the directory containing aomdec. $1 one is used as the input file path and # shifted away. All remaining parameters are passed through to aomdec. aomdec() { - local readonly decoder="$(aom_tool_path aomdec)" - local readonly input="$1" + local decoder="$(aom_tool_path aomdec)" + local input="$1" shift eval "${AOM_TEST_PREFIX}" "${decoder}" "$input" "$@" ${devnull} } @@ -65,7 +65,7 @@ aomdec_can_decode_av1() { aomdec_av1_ivf() { if [ "$(aomdec_can_decode_av1)" = "yes" ]; then - local readonly file="${AV1_IVF_FILE}" + local file="${AV1_IVF_FILE}" if [ ! -e "${file}" ]; then encode_yuv_raw_input_av1 "${file}" --ivf fi @@ -75,7 +75,7 @@ aomdec_av1_ivf() { aomdec_av1_ivf_error_resilient() { if [ "$(aomdec_can_decode_av1)" = "yes" ]; then - local readonly file="av1.error-resilient.ivf" + local file="av1.error-resilient.ivf" if [ ! -e "${file}" ]; then encode_yuv_raw_input_av1 "${file}" --ivf --error-resilient=1 fi @@ -85,7 +85,7 @@ aomdec_av1_ivf_error_resilient() { aomdec_av1_ivf_multithread() { if [ "$(aomdec_can_decode_av1)" = "yes" ]; then - local readonly file="${AV1_IVF_FILE}" + local file="${AV1_IVF_FILE}" if [ ! -e "${file}" ]; then encode_yuv_raw_input_av1 "${file}" --ivf fi @@ -97,7 +97,7 @@ aomdec_av1_ivf_multithread() { aomdec_aom_ivf_pipe_input() { if [ "$(aomdec_can_decode_av1)" = "yes" ]; then - local readonly file="${AV1_IVF_FILE}" + local file="${AV1_IVF_FILE}" if [ ! -e "${file}" ]; then encode_yuv_raw_input_av1 "${file}" --ivf fi @@ -107,7 +107,7 @@ aomdec_aom_ivf_pipe_input() { aomdec_av1_obu_annexb() { if [ "$(aomdec_can_decode_av1)" = "yes" ]; then - local readonly file="${AV1_OBU_ANNEXB_FILE}" + local file="${AV1_OBU_ANNEXB_FILE}" if [ ! -e "${file}" ]; then encode_yuv_raw_input_av1 "${file}" --obu --annexb=1 fi @@ -117,7 +117,7 @@ aomdec_av1_obu_annexb() { aomdec_av1_obu_section5() { if [ "$(aomdec_can_decode_av1)" = "yes" ]; then - local readonly file="${AV1_OBU_SEC5_FILE}" + local file="${AV1_OBU_SEC5_FILE}" if [ ! -e "${file}" ]; then encode_yuv_raw_input_av1 "${file}" --obu fi @@ -128,7 +128,7 @@ aomdec_av1_obu_section5() { aomdec_av1_webm() { if [ "$(aomdec_can_decode_av1)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly file="${AV1_WEBM_FILE}" + local file="${AV1_WEBM_FILE}" if [ ! -e "${file}" ]; then encode_yuv_raw_input_av1 "${file}" fi diff --git a/third_party/aom/test/aomenc.sh b/third_party/aom/test/aomenc.sh index a0ab8c8aa..b030397a3 100755 --- a/third_party/aom/test/aomenc.sh +++ b/third_party/aom/test/aomenc.sh @@ -60,8 +60,8 @@ y4m_input_720p() { # input file path and shifted away. All remaining parameters are passed through # to aomenc. aomenc_pipe() { - local readonly encoder="$(aom_tool_path aomenc)" - local readonly input="$1" + local encoder="$(aom_tool_path aomenc)" + local input="$1" shift cat "${input}" | eval "${AOM_TEST_PREFIX}" "${encoder}" - \ --test-decode=fatal \ @@ -72,8 +72,8 @@ aomenc_pipe() { # the directory containing aomenc. $1 one is used as the input file path and # shifted away. All remaining parameters are passed through to aomenc. aomenc() { - local readonly encoder="$(aom_tool_path aomenc)" - local readonly input="$1" + local encoder="$(aom_tool_path aomenc)" + local input="$1" shift eval "${AOM_TEST_PREFIX}" "${encoder}" "${input}" \ --test-decode=fatal \ @@ -156,7 +156,7 @@ aomenc_av1_webm() { aomenc_av1_webm_1pass() { if [ "$(aomenc_can_encode_av1)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${AOM_TEST_OUTPUT_DIR}/av1_test.webm" + local output="${AOM_TEST_OUTPUT_DIR}/av1_test.webm" aomenc $(yuv_raw_input) \ $(aomenc_encode_test_fast_params) \ --passes=1 \ @@ -171,7 +171,7 @@ aomenc_av1_webm_1pass() { aomenc_av1_ivf_lossless() { if [ "$(aomenc_can_encode_av1)" = "yes" ]; then - local readonly output="${AOM_TEST_OUTPUT_DIR}/av1_lossless.ivf" + local output="${AOM_TEST_OUTPUT_DIR}/av1_lossless.ivf" aomenc $(yuv_raw_input) \ $(aomenc_encode_test_fast_params) \ --ivf \ @@ -187,7 +187,7 @@ aomenc_av1_ivf_lossless() { aomenc_av1_ivf_minq0_maxq0() { if [ "$(aomenc_can_encode_av1)" = "yes" ]; then - local readonly output="${AOM_TEST_OUTPUT_DIR}/av1_lossless_minq0_maxq0.ivf" + local output="${AOM_TEST_OUTPUT_DIR}/av1_lossless_minq0_maxq0.ivf" aomenc $(yuv_raw_input) \ $(aomenc_encode_test_fast_params) \ --ivf \ @@ -205,9 +205,9 @@ aomenc_av1_ivf_minq0_maxq0() { aomenc_av1_webm_lag5_frames10() { if [ "$(aomenc_can_encode_av1)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly lag_total_frames=10 - local readonly lag_frames=5 - local readonly output="${AOM_TEST_OUTPUT_DIR}/av1_lag5_frames10.webm" + local lag_total_frames=10 + local lag_frames=5 + local output="${AOM_TEST_OUTPUT_DIR}/av1_lag5_frames10.webm" aomenc $(yuv_raw_input) \ $(aomenc_encode_test_fast_params) \ --limit=${lag_total_frames} \ @@ -225,7 +225,7 @@ aomenc_av1_webm_lag5_frames10() { aomenc_av1_webm_non_square_par() { if [ "$(aomenc_can_encode_av1)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${AOM_TEST_OUTPUT_DIR}/av1_non_square_par.webm" + local output="${AOM_TEST_OUTPUT_DIR}/av1_non_square_par.webm" aomenc $(y4m_input_non_square_par) \ $(aomenc_encode_test_fast_params) \ --output="${output}" @@ -241,7 +241,7 @@ aomenc_av1_webm_cdf_update_mode() { if [ "$(aomenc_can_encode_av1)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then for mode in 0 1 2; do - local readonly output="${AOM_TEST_OUTPUT_DIR}/cdf_mode_${mode}.webm" + local output="${AOM_TEST_OUTPUT_DIR}/cdf_mode_${mode}.webm" aomenc $(yuv_raw_input) \ $(aomenc_encode_test_fast_params) \ --cdf-update-mode=${mode} \ diff --git a/third_party/aom/test/av1_convolve_2d_test_util.cc b/third_party/aom/test/av1_convolve_2d_test_util.cc index cbe3f8c9f..1aa08044e 100644 --- a/third_party/aom/test/av1_convolve_2d_test_util.cc +++ b/third_party/aom/test/av1_convolve_2d_test_util.cc @@ -63,10 +63,10 @@ void AV1Convolve2DSrTest::RunCheckOutput(convolve_2d_func test_impl) { for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) { for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL; ++vfilter) { - InterpFilterParams filter_params_x = + const InterpFilterParams *filter_params_x = av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter, out_w); - InterpFilterParams filter_params_y = + const InterpFilterParams *filter_params_y = av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter, out_h); for (int do_average = 0; do_average < 1; ++do_average) { @@ -83,11 +83,11 @@ void AV1Convolve2DSrTest::RunCheckOutput(convolve_2d_func test_impl) { const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7); const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7); av1_convolve_2d_sr_c(input + offset_r * w + offset_c, w, output, - MAX_SB_SIZE, out_w, out_h, &filter_params_x, - &filter_params_y, subx, suby, &conv_params1); + MAX_SB_SIZE, out_w, out_h, filter_params_x, + filter_params_y, subx, suby, &conv_params1); test_impl(input + offset_r * w + offset_c, w, output2, - MAX_SB_SIZE, out_w, out_h, &filter_params_x, - &filter_params_y, subx, suby, &conv_params2); + MAX_SB_SIZE, out_w, out_h, filter_params_x, + filter_params_y, subx, suby, &conv_params2); if (memcmp(output, output2, sizeof(output))) { for (int i = 0; i < MAX_SB_SIZE; ++i) { @@ -137,10 +137,10 @@ void AV1Convolve2DSrTest::RunSpeedTest(convolve_2d_func test_impl) { const int out_h = block_size_high[block_idx] >> shift; const int num_loops = 1000000000 / (out_w + out_h); - InterpFilterParams filter_params_x = + const InterpFilterParams *filter_params_x = av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter, out_w); - InterpFilterParams filter_params_y = + const InterpFilterParams *filter_params_y = av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter, out_h); @@ -148,8 +148,8 @@ void AV1Convolve2DSrTest::RunSpeedTest(convolve_2d_func test_impl) { aom_usec_timer_start(&timer); for (int i = 0; i < num_loops; ++i) - test_impl(input, w, output, MAX_SB_SIZE, out_w, out_h, &filter_params_x, - &filter_params_y, subx, suby, &conv_params2); + test_impl(input, w, output, MAX_SB_SIZE, out_w, out_h, filter_params_x, + filter_params_y, subx, suby, &conv_params2); aom_usec_timer_mark(&timer); const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); @@ -188,10 +188,10 @@ void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) { const int out_h = block_size_high[block_idx]; for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) { for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL; ++vfilter) { - InterpFilterParams filter_params_x = + const InterpFilterParams *filter_params_x = av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter, out_w); - InterpFilterParams filter_params_y = + const InterpFilterParams *filter_params_y = av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter, out_h); for (int do_average = 0; do_average <= 1; ++do_average) { @@ -212,11 +212,11 @@ void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) { const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7); const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7); av1_jnt_convolve_2d_c(input + offset_r * w + offset_c, w, output8_1, - MAX_SB_SIZE, out_w, out_h, &filter_params_x, - &filter_params_y, subx, suby, &conv_params1); + MAX_SB_SIZE, out_w, out_h, filter_params_x, + filter_params_y, subx, suby, &conv_params1); test_impl(input + offset_r * w + offset_c, w, output8_2, - MAX_SB_SIZE, out_w, out_h, &filter_params_x, - &filter_params_y, subx, suby, &conv_params2); + MAX_SB_SIZE, out_w, out_h, filter_params_x, + filter_params_y, subx, suby, &conv_params2); for (int i = 0; i < out_h; ++i) { for (int j = 0; j < out_w; ++j) { @@ -261,11 +261,11 @@ void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) { const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7); av1_jnt_convolve_2d_c(input + offset_r * w + offset_c, w, output8_1, MAX_SB_SIZE, out_w, out_h, - &filter_params_x, &filter_params_y, subx, + filter_params_x, filter_params_y, subx, suby, &conv_params1); test_impl(input + offset_r * w + offset_c, w, output8_2, - MAX_SB_SIZE, out_w, out_h, &filter_params_x, - &filter_params_y, subx, suby, &conv_params2); + MAX_SB_SIZE, out_w, out_h, filter_params_x, + filter_params_y, subx, suby, &conv_params2); for (int i = 0; i < out_h; ++i) { for (int j = 0; j < out_w; ++j) { @@ -323,10 +323,10 @@ void AV1JntConvolve2DTest::RunSpeedTest(convolve_2d_func test_impl) { const int num_loops = 1000000000 / (out_w + out_h); const int do_average = 0; - InterpFilterParams filter_params_x = + const InterpFilterParams *filter_params_x = av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter, out_w); - InterpFilterParams filter_params_y = + const InterpFilterParams *filter_params_y = av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter, out_h); @@ -344,7 +344,7 @@ void AV1JntConvolve2DTest::RunSpeedTest(convolve_2d_func test_impl) { for (int i = 0; i < num_loops; ++i) test_impl(input + offset_r * w + offset_c, w, output8, MAX_SB_SIZE, out_w, - out_h, &filter_params_x, &filter_params_y, subx, suby, + out_h, filter_params_x, filter_params_y, subx, suby, &conv_params); aom_usec_timer_mark(&timer); @@ -407,10 +407,10 @@ void AV1HighbdConvolve2DSrTest::RunSpeedTest( const int out_h = block_size_high[block_idx] >> shift; const int num_loops = 1000000000 / (out_w + out_h); - InterpFilterParams filter_params_x = + const InterpFilterParams *filter_params_x = av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter, out_w); - InterpFilterParams filter_params_y = + const InterpFilterParams *filter_params_y = av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter, out_h); @@ -418,7 +418,7 @@ void AV1HighbdConvolve2DSrTest::RunSpeedTest( aom_usec_timer_start(&timer); for (int i = 0; i < num_loops; ++i) test_impl(input + offset_r * w + offset_c, w, output, MAX_SB_SIZE, out_w, - out_h, &filter_params_x, &filter_params_y, subx, suby, + out_h, filter_params_x, filter_params_y, subx, suby, &conv_params, bd); aom_usec_timer_mark(&timer); @@ -456,10 +456,10 @@ void AV1HighbdConvolve2DSrTest::RunCheckOutput( for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) { for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL; ++vfilter) { - InterpFilterParams filter_params_x = + const InterpFilterParams *filter_params_x = av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter, out_w); - InterpFilterParams filter_params_y = + const InterpFilterParams *filter_params_y = av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter, out_h); for (int do_average = 0; do_average < 1; ++do_average) { @@ -477,11 +477,11 @@ void AV1HighbdConvolve2DSrTest::RunCheckOutput( const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7); av1_highbd_convolve_2d_sr_c(input + offset_r * w + offset_c, w, output, MAX_SB_SIZE, out_w, out_h, - &filter_params_x, &filter_params_y, + filter_params_x, filter_params_y, subx, suby, &conv_params1, bd); test_impl(input + offset_r * w + offset_c, w, output2, - MAX_SB_SIZE, out_w, out_h, &filter_params_x, - &filter_params_y, subx, suby, &conv_params2, bd); + MAX_SB_SIZE, out_w, out_h, filter_params_x, + filter_params_y, subx, suby, &conv_params2, bd); if (memcmp(output, output2, sizeof(output))) { for (int i = 0; i < MAX_SB_SIZE; ++i) { @@ -530,10 +530,10 @@ void AV1HighbdJntConvolve2DTest::RunSpeedTest( const int out_w = block_size_wide[block_idx]; const int out_h = block_size_high[block_idx]; - InterpFilterParams filter_params_x = + const InterpFilterParams *filter_params_x = av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter, out_w); - InterpFilterParams filter_params_y = + const InterpFilterParams *filter_params_y = av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter, out_h); @@ -554,8 +554,8 @@ void AV1HighbdJntConvolve2DTest::RunSpeedTest( aom_usec_timer_start(&timer); for (int i = 0; i < num_loops; ++i) test_impl(input + offset_r * w + offset_c, w, output16, MAX_SB_SIZE, out_w, - out_h, &filter_params_x, &filter_params_y, subx, suby, - &conv_params, bd); + out_h, filter_params_x, filter_params_y, subx, suby, &conv_params, + bd); aom_usec_timer_mark(&timer); const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); @@ -589,10 +589,10 @@ void AV1HighbdJntConvolve2DTest::RunCheckOutput( const int out_h = block_size_high[block_idx]; for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) { for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL; ++vfilter) { - InterpFilterParams filter_params_x = + const InterpFilterParams *filter_params_x = av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter, out_w); - InterpFilterParams filter_params_y = + const InterpFilterParams *filter_params_y = av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter, out_h); for (int do_average = 0; do_average <= 1; ++do_average) { @@ -614,11 +614,11 @@ void AV1HighbdJntConvolve2DTest::RunCheckOutput( const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7); av1_highbd_jnt_convolve_2d_c(input + offset_r * w + offset_c, w, output16_1, MAX_SB_SIZE, out_w, out_h, - &filter_params_x, &filter_params_y, - subx, suby, &conv_params1, bd); + filter_params_x, filter_params_y, subx, + suby, &conv_params1, bd); test_impl(input + offset_r * w + offset_c, w, output16_2, - MAX_SB_SIZE, out_w, out_h, &filter_params_x, - &filter_params_y, subx, suby, &conv_params2, bd); + MAX_SB_SIZE, out_w, out_h, filter_params_x, + filter_params_y, subx, suby, &conv_params2, bd); for (int i = 0; i < out_h; ++i) { for (int j = 0; j < out_w; ++j) { @@ -664,11 +664,11 @@ void AV1HighbdJntConvolve2DTest::RunCheckOutput( const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7); av1_highbd_jnt_convolve_2d_c( input + offset_r * w + offset_c, w, output16_1, MAX_SB_SIZE, - out_w, out_h, &filter_params_x, &filter_params_y, subx, - suby, &conv_params1, bd); + out_w, out_h, filter_params_x, filter_params_y, subx, suby, + &conv_params1, bd); test_impl(input + offset_r * w + offset_c, w, output16_2, - MAX_SB_SIZE, out_w, out_h, &filter_params_x, - &filter_params_y, subx, suby, &conv_params2, bd); + MAX_SB_SIZE, out_w, out_h, filter_params_x, + filter_params_y, subx, suby, &conv_params2, bd); for (int i = 0; i < out_h; ++i) { for (int j = 0; j < out_w; ++j) { diff --git a/third_party/aom/test/av1_convolve_2d_test_util.h b/third_party/aom/test/av1_convolve_2d_test_util.h index 3a53dbdfe..cd4607d68 100644 --- a/third_party/aom/test/av1_convolve_2d_test_util.h +++ b/third_party/aom/test/av1_convolve_2d_test_util.h @@ -28,8 +28,8 @@ namespace AV1Convolve2D { typedef void (*convolve_2d_func)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); @@ -71,8 +71,8 @@ class AV1JntConvolve2DTest : public ::testing::TestWithParam<Convolve2DParam> { namespace AV1HighbdConvolve2D { typedef void (*highbd_convolve_2d_func)( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, const int subpel_x_q4, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); typedef ::testing::tuple<int, highbd_convolve_2d_func, int, int, BLOCK_SIZE> diff --git a/third_party/aom/test/av1_convolve_scale_test.cc b/third_party/aom/test/av1_convolve_scale_test.cc index e0571423c..b99caaeeb 100644 --- a/third_party/aom/test/av1_convolve_scale_test.cc +++ b/third_party/aom/test/av1_convolve_scale_test.cc @@ -390,8 +390,8 @@ typedef tuple<int, int> BlockDimension; typedef void (*LowbdConvolveFunc)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params); @@ -463,8 +463,8 @@ INSTANTIATE_TEST_CASE_P( typedef void (*HighbdConvolveFunc)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params, int bd); diff --git a/third_party/aom/test/av1_ext_tile_test.cc b/third_party/aom/test/av1_ext_tile_test.cc index d2abbab7f..424d2f065 100644 --- a/third_party/aom/test/av1_ext_tile_test.cc +++ b/third_party/aom/test/av1_ext_tile_test.cc @@ -47,6 +47,7 @@ class AV1ExtTileTest decoder_ = codec_->CreateDecoder(cfg, 0); decoder_->Control(AV1_SET_TILE_MODE, 1); + decoder_->Control(AV1D_EXT_TILE_DEBUG, 1); decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1); decoder_->Control(AV1_SET_DECODE_TILE_COL, -1); @@ -82,13 +83,14 @@ class AV1ExtTileTest encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0); encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); - // The tile size is 64x64. - encoder->Control(AV1E_SET_TILE_COLUMNS, kTileSize); - encoder->Control(AV1E_SET_TILE_ROWS, kTileSize); // TODO(yunqingwang): test single_tile_decoding = 0. encoder->Control(AV1E_SET_SINGLE_TILE_DECODING, 1); // Always use 64x64 max partition. encoder->Control(AV1E_SET_SUPERBLOCK_SIZE, AOM_SUPERBLOCK_SIZE_64X64); + // Set tile_columns and tile_rows to MAX values, which guarantees the tile + // size of 64 x 64 pixels(i.e. 1 SB) for <= 4k resolution. + encoder->Control(AV1E_SET_TILE_COLUMNS, 6); + encoder->Control(AV1E_SET_TILE_ROWS, 6); } if (video->frame() == 1) { @@ -195,7 +197,7 @@ class AV1ExtTileTest std::vector<std::string> tile_md5_; }; -TEST_P(AV1ExtTileTest, DISABLED_DecoderResultTest) { TestRoundTrip(); } +TEST_P(AV1ExtTileTest, DecoderResultTest) { TestRoundTrip(); } AV1_INSTANTIATE_TEST_CASE( // Now only test 2-pass mode. @@ -204,7 +206,7 @@ AV1_INSTANTIATE_TEST_CASE( class AV1ExtTileTestLarge : public AV1ExtTileTest {}; -TEST_P(AV1ExtTileTestLarge, DISABLED_DecoderResultTest) { TestRoundTrip(); } +TEST_P(AV1ExtTileTestLarge, DecoderResultTest) { TestRoundTrip(); } AV1_INSTANTIATE_TEST_CASE( // Now only test 2-pass mode. diff --git a/third_party/aom/test/av1_fwd_txfm2d_test.cc b/third_party/aom/test/av1_fwd_txfm2d_test.cc index e0294be4e..6577e33b8 100644 --- a/third_party/aom/test/av1_fwd_txfm2d_test.cc +++ b/third_party/aom/test/av1_fwd_txfm2d_test.cc @@ -247,9 +247,9 @@ void AV1FwdTxfm2dMatchTest(TX_SIZE tx_size, lowbd_fwd_txfm_func target_func) { FwdTxfm2dFunc ref_func = libaom_test::fwd_txfm_func_ls[tx_size]; if (ref_func != NULL) { - DECLARE_ALIGNED(16, int16_t, input[64 * 64]) = { 0 }; - DECLARE_ALIGNED(16, int32_t, output[64 * 64]); - DECLARE_ALIGNED(16, int32_t, ref_output[64 * 64]); + DECLARE_ALIGNED(32, int16_t, input[64 * 64]) = { 0 }; + DECLARE_ALIGNED(32, int32_t, output[64 * 64]); + DECLARE_ALIGNED(32, int32_t, ref_output[64 * 64]); int input_stride = 64; ACMRandom rnd(ACMRandom::DeterministicSeed()); for (int cnt = 0; cnt < 500; ++cnt) { @@ -339,4 +339,16 @@ INSTANTIATE_TEST_CASE_P(SSE4_1, AV1FwdTxfm2dTest, Combine(ValuesIn(fwd_txfm_for_sse41), Values(av1_lowbd_fwd_txfm_sse4_1))); #endif // HAVE_SSE4_1 + +#if HAVE_AVX2 +static TX_SIZE fwd_txfm_for_avx2[] = { + TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_64X64, TX_4X8, TX_8X4, + TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32, TX_4X16, + TX_16X4, TX_8X32, TX_32X8, TX_16X64, TX_64X16, +}; + +INSTANTIATE_TEST_CASE_P(AVX2, AV1FwdTxfm2dTest, + Combine(ValuesIn(fwd_txfm_for_avx2), + Values(av1_lowbd_fwd_txfm_avx2))); +#endif // HAVE_AVX2 } // namespace diff --git a/third_party/aom/test/av1_inv_txfm2d_test.cc b/third_party/aom/test/av1_inv_txfm2d_test.cc index 461e7ebcd..11e231ba6 100644 --- a/third_party/aom/test/av1_inv_txfm2d_test.cc +++ b/third_party/aom/test/av1_inv_txfm2d_test.cc @@ -364,4 +364,15 @@ INSTANTIATE_TEST_CASE_P(AVX2, AV1LbdInvTxfm2d, ::testing::Values(av1_lowbd_inv_txfm2d_add_avx2)); #endif // HAVE_AVX2 +#if HAVE_NEON + +extern "C" void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob); + +INSTANTIATE_TEST_CASE_P(NEON, AV1LbdInvTxfm2d, + ::testing::Values(av1_lowbd_inv_txfm2d_add_neon)); +#endif // HAVE_NEON + } // namespace diff --git a/third_party/aom/test/av1_wedge_utils_test.cc b/third_party/aom/test/av1_wedge_utils_test.cc index cfdf2d36c..e8fbe69a4 100644 --- a/third_party/aom/test/av1_wedge_utils_test.cc +++ b/third_party/aom/test/av1_wedge_utils_test.cc @@ -217,14 +217,6 @@ TEST_P(WedgeUtilsSSEOptTest, ExtremeValues) { } } -#if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( - SSE2, WedgeUtilsSSEOptTest, - ::testing::Values(TestFuncsFSSE(av1_wedge_sse_from_residuals_c, - av1_wedge_sse_from_residuals_sse2))); - -#endif // HAVE_SSE2 - ////////////////////////////////////////////////////////////////////////////// // av1_wedge_sign_from_residuals ////////////////////////////////////////////////////////////////////////////// @@ -325,15 +317,6 @@ TEST_P(WedgeUtilsSignOptTest, ExtremeValues) { } } -#if HAVE_SSE2 - -INSTANTIATE_TEST_CASE_P( - SSE2, WedgeUtilsSignOptTest, - ::testing::Values(TestFuncsFSign(av1_wedge_sign_from_residuals_c, - av1_wedge_sign_from_residuals_sse2))); - -#endif // HAVE_SSE2 - ////////////////////////////////////////////////////////////////////////////// // av1_wedge_compute_delta_squares ////////////////////////////////////////////////////////////////////////////// @@ -371,12 +354,37 @@ TEST_P(WedgeUtilsDeltaSquaresOptTest, RandomValues) { } #if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + SSE2, WedgeUtilsSSEOptTest, + ::testing::Values(TestFuncsFSSE(av1_wedge_sse_from_residuals_c, + av1_wedge_sse_from_residuals_sse2))); + +INSTANTIATE_TEST_CASE_P( + SSE2, WedgeUtilsSignOptTest, + ::testing::Values(TestFuncsFSign(av1_wedge_sign_from_residuals_c, + av1_wedge_sign_from_residuals_sse2))); INSTANTIATE_TEST_CASE_P( SSE2, WedgeUtilsDeltaSquaresOptTest, ::testing::Values(TestFuncsFDS(av1_wedge_compute_delta_squares_c, av1_wedge_compute_delta_squares_sse2))); - #endif // HAVE_SSE2 +#if HAVE_AVX2 +INSTANTIATE_TEST_CASE_P( + AVX2, WedgeUtilsSSEOptTest, + ::testing::Values(TestFuncsFSSE(av1_wedge_sse_from_residuals_sse2, + av1_wedge_sse_from_residuals_avx2))); + +INSTANTIATE_TEST_CASE_P( + AVX2, WedgeUtilsSignOptTest, + ::testing::Values(TestFuncsFSign(av1_wedge_sign_from_residuals_sse2, + av1_wedge_sign_from_residuals_avx2))); + +INSTANTIATE_TEST_CASE_P( + AVX2, WedgeUtilsDeltaSquaresOptTest, + ::testing::Values(TestFuncsFDS(av1_wedge_compute_delta_squares_sse2, + av1_wedge_compute_delta_squares_avx2))); +#endif // HAVE_AVX2 + } // namespace diff --git a/third_party/aom/test/codec_factory.h b/third_party/aom/test/codec_factory.h index 65b76094c..e6ae7f8c3 100644 --- a/third_party/aom/test/codec_factory.h +++ b/third_party/aom/test/codec_factory.h @@ -71,6 +71,11 @@ class CodecTestWith4Params : public ::testing::TestWithParam< ::testing::tuple< const libaom_test::CodecFactory *, T1, T2, T3, T4> > {}; +template <class T1, class T2, class T3, class T4, class T5> +class CodecTestWith5Params + : public ::testing::TestWithParam< ::testing::tuple< + const libaom_test::CodecFactory *, T1, T2, T3, T4, T5> > {}; + /* * AV1 Codec Definitions */ diff --git a/third_party/aom/test/comp_mask_variance_test.cc b/third_party/aom/test/comp_mask_variance_test.cc index a5e3f3411..0016ddd59 100644 --- a/third_party/aom/test/comp_mask_variance_test.cc +++ b/third_party/aom/test/comp_mask_variance_test.cc @@ -33,6 +33,7 @@ typedef void (*comp_mask_pred_func)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask); + #if HAVE_SSSE3 || HAVE_AV2 const BLOCK_SIZE kValidBlockSize[] = { BLOCK_8X8, BLOCK_8X16, BLOCK_8X32, BLOCK_16X8, BLOCK_16X16, @@ -270,4 +271,274 @@ INSTANTIATE_TEST_CASE_P( #endif #endif // ifndef aom_comp_mask_pred + +typedef void (*highbd_comp_mask_pred_func)(uint16_t *comp_pred, + const uint8_t *pred8, int width, + int height, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask); + +typedef ::testing::tuple<highbd_comp_mask_pred_func, BLOCK_SIZE, int> + HighbdCompMaskPredParam; + +class AV1HighbdCompMaskVarianceTest + : public ::testing::TestWithParam<HighbdCompMaskPredParam> { + public: + ~AV1HighbdCompMaskVarianceTest(); + void SetUp(); + + void TearDown(); + + protected: + void RunCheckOutput(highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, + int inv); + void RunSpeedTest(highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize); + bool CheckResult(int width, int height) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + const int idx = y * width + x; + if (comp_pred1_[idx] != comp_pred2_[idx]) { + printf("%dx%d mismatch @%d(%d,%d) ", width, height, idx, y, x); + printf("%d != %d ", comp_pred1_[idx], comp_pred2_[idx]); + return false; + } + } + } + return true; + } + + libaom_test::ACMRandom rnd_; + uint16_t *comp_pred1_; + uint16_t *comp_pred2_; + uint16_t *pred_; + uint16_t *ref_buffer_; + uint16_t *ref_; +}; + +AV1HighbdCompMaskVarianceTest::~AV1HighbdCompMaskVarianceTest() { ; } + +void AV1HighbdCompMaskVarianceTest::SetUp() { + rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed()); + av1_init_wedge_masks(); + + comp_pred1_ = + (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*comp_pred1_)); + comp_pred2_ = + (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*comp_pred2_)); + pred_ = (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*pred_)); + ref_buffer_ = (uint16_t *)aom_memalign( + 16, (MAX_SB_SQUARE + (8 * MAX_SB_SIZE)) * sizeof(*ref_buffer_)); + ref_ = ref_buffer_ + (8 * MAX_SB_SIZE); +} + +void AV1HighbdCompMaskVarianceTest::TearDown() { + aom_free(comp_pred1_); + aom_free(comp_pred2_); + aom_free(pred_); + aom_free(ref_buffer_); + libaom_test::ClearSystemState(); +} + +void AV1HighbdCompMaskVarianceTest::RunCheckOutput( + highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv) { + int bd_ = GET_PARAM(2); + + const int w = block_size_wide[bsize]; + const int h = block_size_high[bsize]; + + int wedge_types = (1 << get_wedge_bits_lookup(bsize)); + + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1); + } + for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) { + ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1); + } + + for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) { + const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize); + + aom_highbd_comp_mask_pred_c(comp_pred1_, CONVERT_TO_BYTEPTR(pred_), w, h, + CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, + inv); + + test_impl(comp_pred2_, CONVERT_TO_BYTEPTR(pred_), w, h, + CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv); + + ASSERT_EQ(CheckResult(w, h), true) + << " wedge " << wedge_index << " inv " << inv; + } +} + +void AV1HighbdCompMaskVarianceTest::RunSpeedTest( + highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize) { + int bd_ = GET_PARAM(2); + + const int w = block_size_wide[bsize]; + const int h = block_size_high[bsize]; + + int wedge_types = (1 << get_wedge_bits_lookup(bsize)); + int wedge_index = wedge_types / 2; + + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1); + } + for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) { + ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1); + } + + const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize); + const int num_loops = 1000000000 / (w + h); + + highbd_comp_mask_pred_func funcs[2] = { aom_highbd_comp_mask_pred_c, + test_impl }; + double elapsed_time[2] = { 0 }; + for (int i = 0; i < 2; ++i) { + aom_usec_timer timer; + aom_usec_timer_start(&timer); + highbd_comp_mask_pred_func func = funcs[i]; + for (int j = 0; j < num_loops; ++j) { + func(comp_pred1_, CONVERT_TO_BYTEPTR(pred_), w, h, + CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, 0); + } + aom_usec_timer_mark(&timer); + double time = static_cast<double>(aom_usec_timer_elapsed(&timer)); + elapsed_time[i] = 1000.0 * time / num_loops; + } + printf("compMask %3dx%-3d: %7.2f/%7.2fns", w, h, elapsed_time[0], + elapsed_time[1]); + printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]); +} + +TEST_P(AV1HighbdCompMaskVarianceTest, CheckOutput) { + // inv = 0, 1 + RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 0); + RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 1); +} + +TEST_P(AV1HighbdCompMaskVarianceTest, DISABLED_Speed) { + RunSpeedTest(GET_PARAM(0), GET_PARAM(1)); +} + +#if HAVE_AVX2 +INSTANTIATE_TEST_CASE_P( + AVX2, AV1HighbdCompMaskVarianceTest, + ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_avx2), + ::testing::ValuesIn(kValidBlockSize), + ::testing::Range(8, 13, 2))); +#endif + +#ifndef aom_highbd_comp_mask_pred +// can't run this test if aom_highbd_comp_mask_pred is defined to +// aom_highbd_comp_mask_pred_c +class AV1HighbdCompMaskUpVarianceTest : public AV1HighbdCompMaskVarianceTest { + public: + ~AV1HighbdCompMaskUpVarianceTest(); + + protected: + void RunCheckOutput(highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, + int inv); + void RunSpeedTest(highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, + int havSub); +}; + +AV1HighbdCompMaskUpVarianceTest::~AV1HighbdCompMaskUpVarianceTest() { ; } + +void AV1HighbdCompMaskUpVarianceTest::RunCheckOutput( + highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv) { + int bd_ = GET_PARAM(2); + const int w = block_size_wide[bsize]; + const int h = block_size_high[bsize]; + int wedge_types = (1 << get_wedge_bits_lookup(bsize)); + + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1); + } + for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) { + ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1); + } + + // loop through subx and suby + for (int sub = 0; sub < 8 * 8; ++sub) { + int subx = sub & 0x7; + int suby = (sub >> 3); + for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) { + const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize); + + aom_highbd_comp_mask_pred = aom_highbd_comp_mask_pred_c; // ref + aom_highbd_comp_mask_upsampled_pred( + NULL, NULL, 0, 0, NULL, comp_pred1_, CONVERT_TO_BYTEPTR(pred_), w, h, + subx, suby, CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv, bd_); + + aom_highbd_comp_mask_pred = test_impl; // test + aom_highbd_comp_mask_upsampled_pred( + NULL, NULL, 0, 0, NULL, comp_pred2_, CONVERT_TO_BYTEPTR(pred_), w, h, + subx, suby, CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv, bd_); + ASSERT_EQ(CheckResult(w, h), true) + << " wedge " << wedge_index << " inv " << inv << "sub (" << subx + << "," << suby << ")"; + } + } +} + +void AV1HighbdCompMaskUpVarianceTest::RunSpeedTest( + highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int havSub) { + int bd_ = GET_PARAM(2); + const int w = block_size_wide[bsize]; + const int h = block_size_high[bsize]; + const int subx = havSub ? 3 : 0; + const int suby = havSub ? 4 : 0; + + int wedge_types = (1 << get_wedge_bits_lookup(bsize)); + int wedge_index = wedge_types / 2; + const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize); + + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1); + } + for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) { + ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1); + } + + const int num_loops = 1000000000 / (w + h); + highbd_comp_mask_pred_func funcs[2] = { &aom_highbd_comp_mask_pred_c, + test_impl }; + double elapsed_time[2] = { 0 }; + for (int i = 0; i < 2; ++i) { + aom_usec_timer timer; + aom_usec_timer_start(&timer); + aom_highbd_comp_mask_pred = funcs[i]; + for (int j = 0; j < num_loops; ++j) { + aom_highbd_comp_mask_upsampled_pred( + NULL, NULL, 0, 0, NULL, comp_pred1_, CONVERT_TO_BYTEPTR(pred_), w, h, + subx, suby, CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, 0, bd_); + } + aom_usec_timer_mark(&timer); + double time = static_cast<double>(aom_usec_timer_elapsed(&timer)); + elapsed_time[i] = 1000.0 * time / num_loops; + } + printf("CompMaskUp[%d] %3dx%-3d:%7.2f/%7.2fns", havSub, w, h, elapsed_time[0], + elapsed_time[1]); + printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]); +} + +TEST_P(AV1HighbdCompMaskUpVarianceTest, CheckOutput) { + // inv mask = 0, 1 + RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 0); + RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 1); +} + +TEST_P(AV1HighbdCompMaskUpVarianceTest, DISABLED_Speed) { + RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 1); +} + +#if HAVE_AVX2 +INSTANTIATE_TEST_CASE_P( + AVX2, AV1HighbdCompMaskUpVarianceTest, + ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_avx2), + ::testing::ValuesIn(kValidBlockSize), + ::testing::Range(8, 13, 2))); +#endif + +#endif // ifndef aom_highbd_comp_mask_pred } // namespace AV1CompMaskVariance diff --git a/third_party/aom/test/convolve_test.cc b/third_party/aom/test/convolve_test.cc index 7098e8af6..de3f47628 100644 --- a/third_party/aom/test/convolve_test.cc +++ b/third_party/aom/test/convolve_test.cc @@ -490,9 +490,9 @@ TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) { const InterpFilter filter = (InterpFilter)filter_bank; const InterpKernel *filters = (const InterpKernel *)av1_get_interp_filter_kernel(filter); - const InterpFilterParams filter_params = + const InterpFilterParams *filter_params = av1_get_interp_filter_params_with_block_size(filter, 8); - if (filter_params.taps != SUBPEL_TAPS) continue; + if (filter_params->taps != SUBPEL_TAPS) continue; for (int i = 0; i < kNumFilters; i++) { const int p0 = filters[i][0] + filters[i][1]; const int p1 = filters[i][2] + filters[i][3]; @@ -528,9 +528,9 @@ TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) { const InterpFilter filter = (InterpFilter)filter_bank; const InterpKernel *filters = (const InterpKernel *)av1_get_interp_filter_kernel(filter); - const InterpFilterParams filter_params = + const InterpFilterParams *filter_params = av1_get_interp_filter_params_with_block_size(filter, 8); - if (filter_params.taps != SUBPEL_TAPS) continue; + if (filter_params->taps != SUBPEL_TAPS) continue; for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) { for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) { @@ -614,9 +614,9 @@ TEST_P(ConvolveTest, FilterExtremes) { const InterpFilter filter = (InterpFilter)filter_bank; const InterpKernel *filters = (const InterpKernel *)av1_get_interp_filter_kernel(filter); - const InterpFilterParams filter_params = + const InterpFilterParams *filter_params = av1_get_interp_filter_params_with_block_size(filter, 8); - if (filter_params.taps != SUBPEL_TAPS) continue; + if (filter_params->taps != SUBPEL_TAPS) continue; for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) { for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) { wrapper_filter_block2d_8_c(in, kInputStride, filters[filter_x], @@ -713,9 +713,9 @@ TEST_P(ConvolveTest, DISABLED_Speed) { const InterpFilter filter = (InterpFilter)filter_bank; const InterpKernel *filters = (const InterpKernel *)av1_get_interp_filter_kernel(filter); - const InterpFilterParams filter_params = + const InterpFilterParams *filter_params = av1_get_interp_filter_params_with_block_size(filter, 8); - if (filter_params.taps != SUBPEL_TAPS) continue; + if (filter_params->taps != SUBPEL_TAPS) continue; for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) { for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) { @@ -832,20 +832,25 @@ INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, #endif #if HAVE_AVX2 -const ConvolveFunctions convolve8_avx2(wrap_convolve_copy_avx2_8, - wrap_convolve8_horiz_avx2_8, - wrap_convolve8_vert_avx2_8, 8); -const ConvolveFunctions convolve10_avx2(wrap_convolve_copy_avx2_10, - wrap_convolve8_horiz_avx2_10, - wrap_convolve8_vert_avx2_10, 10); -const ConvolveFunctions convolve12_avx2(wrap_convolve_copy_avx2_12, - wrap_convolve8_horiz_avx2_12, - wrap_convolve8_vert_avx2_12, 12); -const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES_64(convolve8_avx2), - ALL_SIZES_64(convolve10_avx2), - ALL_SIZES_64(convolve12_avx2) }; +const ConvolveFunctions convolve8_avx2(aom_convolve_copy_c, + aom_convolve8_horiz_avx2, + aom_convolve8_vert_avx2, 0); + +const ConvolveFunctions wrap_convolve8_avx2(wrap_convolve_copy_avx2_8, + wrap_convolve8_horiz_avx2_8, + wrap_convolve8_vert_avx2_8, 8); +const ConvolveFunctions wrap_convolve10_avx2(wrap_convolve_copy_avx2_10, + wrap_convolve8_horiz_avx2_10, + wrap_convolve8_vert_avx2_10, 10); +const ConvolveFunctions wrap_convolve12_avx2(wrap_convolve_copy_avx2_12, + wrap_convolve8_horiz_avx2_12, + wrap_convolve8_vert_avx2_12, 12); +const ConvolveParam kArray_Convolve8_avx2[] = { + ALL_SIZES_64(wrap_convolve8_avx2), ALL_SIZES_64(wrap_convolve10_avx2), + ALL_SIZES_64(wrap_convolve12_avx2), ALL_SIZES(convolve8_avx2) +}; INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, - ::testing::ValuesIn(kArrayConvolve8_avx2)); + ::testing::ValuesIn(kArray_Convolve8_avx2)); #endif // HAVE_AVX2 } // namespace diff --git a/third_party/aom/test/decode_multithreaded_test.cc b/third_party/aom/test/decode_multithreaded_test.cc index ed9a9ceef..cea1d144f 100644 --- a/third_party/aom/test/decode_multithreaded_test.cc +++ b/third_party/aom/test/decode_multithreaded_test.cc @@ -26,13 +26,14 @@ namespace { static const int kNumMultiThreadDecoders = 3; class AV1DecodeMultiThreadedTest - : public ::libaom_test::CodecTestWith4Params<int, int, int, int>, + : public ::libaom_test::CodecTestWith5Params<int, int, int, int, int>, public ::libaom_test::EncoderTest { protected: AV1DecodeMultiThreadedTest() : EncoderTest(GET_PARAM(0)), md5_single_thread_(), md5_multi_thread_(), n_tile_cols_(GET_PARAM(1)), n_tile_rows_(GET_PARAM(2)), - n_tile_groups_(GET_PARAM(3)), set_cpu_used_(GET_PARAM(4)) { + n_tile_groups_(GET_PARAM(3)), set_cpu_used_(GET_PARAM(4)), + row_mt_(GET_PARAM(5)) { init_flags_ = AOM_CODEC_USE_PSNR; aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t(); cfg.w = 704; @@ -45,14 +46,17 @@ class AV1DecodeMultiThreadedTest for (int i = 0; i < kNumMultiThreadDecoders; ++i) { cfg.threads <<= 1; multi_thread_dec_[i] = codec_->CreateDecoder(cfg, 0); + multi_thread_dec_[i]->Control(AV1D_SET_ROW_MT, row_mt_); } if (single_thread_dec_->IsAV1()) { + single_thread_dec_->Control(AV1D_EXT_TILE_DEBUG, 1); single_thread_dec_->Control(AV1_SET_DECODE_TILE_ROW, -1); single_thread_dec_->Control(AV1_SET_DECODE_TILE_COL, -1); } for (int i = 0; i < kNumMultiThreadDecoders; ++i) { if (multi_thread_dec_[i]->IsAV1()) { + multi_thread_dec_[i]->Control(AV1D_EXT_TILE_DEBUG, 1); multi_thread_dec_[i]->Control(AV1_SET_DECODE_TILE_ROW, -1); multi_thread_dec_[i]->Control(AV1_SET_DECODE_TILE_COL, -1); } @@ -128,6 +132,7 @@ class AV1DecodeMultiThreadedTest int n_tile_rows_; int n_tile_groups_; int set_cpu_used_; + int row_mt_; }; // run an encode and do the decode both in single thread @@ -154,16 +159,17 @@ TEST_P(AV1DecodeMultiThreadedTestLarge, MD5Match) { // TODO(ranjit): More tests have to be added using pre-generated MD5. AV1_INSTANTIATE_TEST_CASE(AV1DecodeMultiThreadedTest, ::testing::Values(1, 2), ::testing::Values(1, 2), ::testing::Values(1), - ::testing::Values(3)); + ::testing::Values(3), ::testing::Values(0, 1)); AV1_INSTANTIATE_TEST_CASE(AV1DecodeMultiThreadedTestLarge, ::testing::Values(0, 1, 2, 6), ::testing::Values(0, 1, 2, 6), - ::testing::Values(1, 4), ::testing::Values(0)); + ::testing::Values(1, 4), ::testing::Values(0), + ::testing::Values(0, 1)); class AV1DecodeMultiThreadedLSTestLarge : public AV1DecodeMultiThreadedTestLarge {}; -TEST_P(AV1DecodeMultiThreadedLSTestLarge, DISABLED_MD5Match) { +TEST_P(AV1DecodeMultiThreadedLSTestLarge, MD5Match) { cfg_.large_scale_tile = 1; single_thread_dec_->Control(AV1_SET_TILE_MODE, 1); for (int i = 0; i < kNumMultiThreadDecoders; ++i) @@ -172,8 +178,8 @@ TEST_P(AV1DecodeMultiThreadedLSTestLarge, DISABLED_MD5Match) { } AV1_INSTANTIATE_TEST_CASE(AV1DecodeMultiThreadedLSTestLarge, - ::testing::Values(1, 2, 32), - ::testing::Values(1, 2, 32), ::testing::Values(1), - ::testing::Values(0, 3)); + ::testing::Values(6), ::testing::Values(6), + ::testing::Values(1), ::testing::Values(0, 3), + ::testing::Values(0, 1)); } // namespace diff --git a/third_party/aom/test/decode_test_driver.cc b/third_party/aom/test/decode_test_driver.cc index ed261b527..70de0cff6 100644 --- a/third_party/aom/test/decode_test_driver.cc +++ b/third_party/aom/test/decode_test_driver.cc @@ -94,7 +94,7 @@ void DecoderTest::RunLoop(CompressedVideoSource *video, const aom_image_t *img = NULL; // Get decompressed data - while ((img = dec_iter.Next())) + while (!::testing::Test::HasFailure() && (img = dec_iter.Next())) DecompressedFrameHook(*img, video->frame_number()); } delete decoder; diff --git a/third_party/aom/test/dr_prediction_test.cc b/third_party/aom/test/dr_prediction_test.cc index 22b9832a1..ff2c1de4e 100644 --- a/third_party/aom/test/dr_prediction_test.cc +++ b/third_party/aom/test/dr_prediction_test.cc @@ -143,8 +143,8 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > { static const int kBufSize = ((2 * MAX_TX_SIZE) << 1) + 16; DrPredTest() - : upsample_above_(0), upsample_left_(0), bw_(0), bh_(0), dx_(1), dy_(1), - bd_(8), txsize_(TX_4X4) { + : enable_upsample_(0), upsample_above_(0), upsample_left_(0), bw_(0), + bh_(0), dx_(1), dy_(1), bd_(8), txsize_(TX_4X4) { params_ = this->GetParam(); start_angle_ = params_.start_angle; stop_angle_ = start_angle_ + 90; @@ -193,7 +193,7 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > { OutputTimes(kNumTests, ref_time, tst_time, tx); } - void RunTest(bool speedtest) { + void RunTest(bool speedtest, int p_angle) { for (int i = 0; i < kBufSize; ++i) { above_data_[i] = left_data_[i] = (1 << bd_) - 1; } @@ -212,6 +212,15 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > { bw_ = tx_size_wide[kTxSize[tx]]; bh_ = tx_size_high[kTxSize[tx]]; + if (enable_upsample_) { + upsample_above_ = + av1_use_intra_edge_upsample(bw_, bh_, p_angle - 90, 0); + upsample_left_ = + av1_use_intra_edge_upsample(bw_, bh_, p_angle - 180, 0); + } else { + upsample_above_ = upsample_left_ = 0; + } + Predict(speedtest, tx); for (int r = 0; r < bh_; ++r) { @@ -252,6 +261,7 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > { Pixel *left_; int dst_stride_; + int enable_upsample_; int upsample_above_; int upsample_left_; int bw_; @@ -273,25 +283,25 @@ class LowbdDrPredTest : public DrPredTest<uint8_t, DrPred> {}; TEST_P(LowbdDrPredTest, SaturatedValues) { for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { - upsample_above_ = iter & 1; + enable_upsample_ = iter & 1; for (int angle = start_angle_; angle < stop_angle_; ++angle) { dx_ = av1_get_dx(angle); dy_ = av1_get_dy(angle); - if (dx_ && dy_) RunTest(false); + if (dx_ && dy_) RunTest(false, angle); } } } TEST_P(LowbdDrPredTest, DISABLED_Speed) { const int angles[] = { 3, 45, 87 }; - for (upsample_above_ = 0; upsample_above_ < 2; ++upsample_above_) { - upsample_left_ = upsample_above_; + for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) { for (int i = 0; i < 3; ++i) { - dx_ = av1_get_dx(angles[i] + start_angle_); - dy_ = av1_get_dy(angles[i] + start_angle_); - printf("upsample_above: %d upsample_left: %d angle: %d ~~~~~~~~~~~~~~~\n", - upsample_above_, upsample_left_, angles[i] + start_angle_); - if (dx_ && dy_) RunTest(true); + const int angle = angles[i] + start_angle_; + dx_ = av1_get_dx(angle); + dy_ = av1_get_dy(angle); + printf("enable_upsample: %d angle: %d ~~~~~~~~~~~~~~~\n", + enable_upsample_, angle); + if (dx_ && dy_) RunTest(true, angle); } } } @@ -311,25 +321,25 @@ class HighbdDrPredTest : public DrPredTest<uint16_t, DrPred_Hbd> {}; TEST_P(HighbdDrPredTest, SaturatedValues) { for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { - upsample_above_ = iter & 1; + enable_upsample_ = iter & 1; for (int angle = start_angle_; angle < stop_angle_; ++angle) { dx_ = av1_get_dx(angle); dy_ = av1_get_dy(angle); - if (dx_ && dy_) RunTest(false); + if (dx_ && dy_) RunTest(false, angle); } } } TEST_P(HighbdDrPredTest, DISABLED_Speed) { const int angles[] = { 3, 45, 87 }; - for (upsample_above_ = 0; upsample_above_ < 2; ++upsample_above_) { - upsample_left_ = upsample_above_; + for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) { for (int i = 0; i < 3; ++i) { - dx_ = av1_get_dx(angles[i] + start_angle_); - dy_ = av1_get_dy(angles[i] + start_angle_); - printf("upsample_above: %d upsample_left: %d angle: %d ~~~~~~~~~~~~~~~\n", - upsample_above_, upsample_left_, angles[i] + start_angle_); - if (dx_ && dy_) RunTest(true); + const int angle = angles[i] + start_angle_; + dx_ = av1_get_dx(angle); + dy_ = av1_get_dy(angle); + printf("enable_upsample: %d angle: %d ~~~~~~~~~~~~~~~\n", + enable_upsample_, angle); + if (dx_ && dy_) RunTest(true, angle); } } } diff --git a/third_party/aom/test/dump_obu.sh b/third_party/aom/test/dump_obu.sh index 182e894f5..da44dd7e6 100755 --- a/third_party/aom/test/dump_obu.sh +++ b/third_party/aom/test/dump_obu.sh @@ -44,7 +44,7 @@ aomenc_available() { encode_test_file() { if [ "$(aomenc_available)" = "yes" ]; then - local readonly encoder="$(aom_tool_path aomenc)" + local encoder="$(aom_tool_path aomenc)" eval "${encoder}" \ $(aomenc_encode_test_fast_params) \ diff --git a/third_party/aom/test/encode_test_driver.cc b/third_party/aom/test/encode_test_driver.cc index b75d7be16..35908430d 100644 --- a/third_party/aom/test/encode_test_driver.cc +++ b/third_party/aom/test/encode_test_driver.cc @@ -217,6 +217,7 @@ void EncoderTest::RunLoop(VideoSource *video) { // Set dec_cfg.tile_row = -1 and dec_cfg.tile_col = -1 so that the whole // frame is decoded. decoder->Control(AV1_SET_TILE_MODE, cfg_.large_scale_tile); + decoder->Control(AV1D_EXT_TILE_DEBUG, 1); decoder->Control(AV1_SET_DECODE_TILE_ROW, -1); decoder->Control(AV1_SET_DECODE_TILE_COL, -1); } diff --git a/third_party/aom/test/ethread_test.cc b/third_party/aom/test/ethread_test.cc index 3dcc2a707..dd9fc2f8d 100644 --- a/third_party/aom/test/ethread_test.cc +++ b/third_party/aom/test/ethread_test.cc @@ -20,12 +20,14 @@ namespace { class AVxEncoderThreadTest - : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>, + : public ::libaom_test::CodecTestWith4Params<libaom_test::TestMode, int, + int, int>, public ::libaom_test::EncoderTest { protected: AVxEncoderThreadTest() : EncoderTest(GET_PARAM(0)), encoder_initialized_(false), - encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)) { + encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)), + tile_cols_(GET_PARAM(3)), tile_rows_(GET_PARAM(4)) { init_flags_ = AOM_CODEC_USE_PSNR; aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t(); cfg.w = 1280; @@ -84,9 +86,8 @@ class AVxEncoderThreadTest } virtual void SetTileSize(libaom_test::Encoder *encoder) { - // Encode 4 tile columns. - encoder->Control(AV1E_SET_TILE_COLUMNS, 2); - encoder->Control(AV1E_SET_TILE_ROWS, 0); + encoder->Control(AV1E_SET_TILE_COLUMNS, tile_cols_); + encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_); } virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) { @@ -153,6 +154,8 @@ class AVxEncoderThreadTest bool encoder_initialized_; ::libaom_test::TestMode encoding_mode_; int set_cpu_used_; + int tile_cols_; + int tile_rows_; ::libaom_test::Decoder *decoder_; std::vector<size_t> size_enc_; std::vector<std::string> md5_enc_; @@ -177,42 +180,46 @@ TEST_P(AVxEncoderThreadTestLarge, EncoderResultTest) { AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadTest, ::testing::Values(::libaom_test::kTwoPassGood, ::libaom_test::kOnePassGood), - ::testing::Range(2, 4)); + ::testing::Range(2, 4), ::testing::Values(1, 2), + ::testing::Values(0, 1)); AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadTestLarge, ::testing::Values(::libaom_test::kTwoPassGood, ::libaom_test::kOnePassGood), - ::testing::Range(0, 2)); + ::testing::Range(0, 2), ::testing::Values(0, 1, 2, 6), + ::testing::Values(0, 1, 2, 6)); class AVxEncoderThreadLSTest : public AVxEncoderThreadTest { virtual void SetTileSize(libaom_test::Encoder *encoder) { - encoder->Control(AV1E_SET_TILE_COLUMNS, 1); - // TODO(geza): Start using multiple tile rows when the multi-threaded - // encoder can handle them - encoder->Control(AV1E_SET_TILE_ROWS, 32); + encoder->Control(AV1E_SET_TILE_COLUMNS, tile_cols_); + encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_); } }; -TEST_P(AVxEncoderThreadLSTest, DISABLED_EncoderResultTest) { +TEST_P(AVxEncoderThreadLSTest, EncoderResultTest) { cfg_.large_scale_tile = 1; decoder_->Control(AV1_SET_TILE_MODE, 1); + decoder_->Control(AV1D_EXT_TILE_DEBUG, 1); DoTest(); } class AVxEncoderThreadLSTestLarge : public AVxEncoderThreadLSTest {}; -TEST_P(AVxEncoderThreadLSTestLarge, DISABLED_EncoderResultTest) { +TEST_P(AVxEncoderThreadLSTestLarge, EncoderResultTest) { cfg_.large_scale_tile = 1; decoder_->Control(AV1_SET_TILE_MODE, 1); + decoder_->Control(AV1D_EXT_TILE_DEBUG, 1); DoTest(); } AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadLSTest, ::testing::Values(::libaom_test::kTwoPassGood, ::libaom_test::kOnePassGood), - ::testing::Range(2, 4)); + ::testing::Range(2, 4), ::testing::Values(6), + ::testing::Values(0, 6)); AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadLSTestLarge, ::testing::Values(::libaom_test::kTwoPassGood, ::libaom_test::kOnePassGood), - ::testing::Range(0, 2)); + ::testing::Range(0, 2), ::testing::Values(6), + ::testing::Values(0, 6)); } // namespace diff --git a/third_party/aom/test/fft_test.cc b/third_party/aom/test/fft_test.cc index 56187cdbb..5c8ec069c 100644 --- a/third_party/aom/test/fft_test.cc +++ b/third_party/aom/test/fft_test.cc @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + #include <math.h> #include <algorithm> diff --git a/third_party/aom/test/film_grain_table_test.cc b/third_party/aom/test/film_grain_table_test.cc index 068814635..524d67d7b 100644 --- a/third_party/aom/test/film_grain_table_test.cc +++ b/third_party/aom/test/film_grain_table_test.cc @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + #include <string> #include "third_party/googletest/src/googletest/include/gtest/gtest.h" #include "aom_dsp/grain_table.h" diff --git a/third_party/aom/test/intrapred_test.cc b/third_party/aom/test/intrapred_test.cc index 82f191449..1a1c0fc42 100644 --- a/third_party/aom/test/intrapred_test.cc +++ b/third_party/aom/test/intrapred_test.cc @@ -37,6 +37,15 @@ typedef void (*HighbdIntraPred)(uint16_t *dst, ptrdiff_t stride, typedef void (*IntraPred)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +} // namespace + +// NOTE: Under gcc version 7.3.0 (Debian 7.3.0-5), if this template is in the +// anonymous namespace, then we get a strange compiler warning in +// the begin() and end() methods of the ParamGenerator template class in +// gtest/internal/gtest-param-util.h: +// warning: ‘<anonymous>’ is used uninitialized in this function +// As a workaround, put this template outside the anonymous namespace. +// See bug aomedia:2003. template <typename FuncType> struct IntraPredFunc { IntraPredFunc(FuncType pred = NULL, FuncType ref = NULL, @@ -52,6 +61,8 @@ struct IntraPredFunc { int bit_depth; }; +namespace { + template <typename FuncType, typename Pixel> class AV1IntraPredTest : public ::testing::TestWithParam<IntraPredFunc<FuncType> > { diff --git a/third_party/aom/test/lightfield_test.sh b/third_party/aom/test/lightfield_test.sh new file mode 100755 index 000000000..b957a6b79 --- /dev/null +++ b/third_party/aom/test/lightfield_test.sh @@ -0,0 +1,98 @@ +#!/bin/sh +## Copyright (c) 2018, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +## This file tests the lightfield example. +## +. $(dirname $0)/tools_common.sh + +# Environment check: $infile is required. +lightfield_test_verify_environment() { + local infile="${LIBAOM_TEST_DATA_PATH}/vase10x10.yuv" + if [ ! -e "${infile}" ]; then + echo "Libaom test data must exist in LIBAOM_TEST_DATA_PATH." + return 1 + fi +} + +# Run the lightfield example +lightfield_test() { + local img_width=1024 + local img_height=1024 + local lf_width=10 + local lf_height=10 + local lf_blocksize=5 + local num_references=4 + local num_tile_lists=2 + + # Encode the lightfield. + local encoder="${LIBAOM_BIN_PATH}/lightfield_encoder${AOM_TEST_EXE_SUFFIX}" + local yuv_file="${LIBAOM_TEST_DATA_PATH}/vase10x10.yuv" + local lf_file="${AOM_TEST_OUTPUT_DIR}/vase10x10.ivf" + if [ ! -x "${encoder}" ]; then + elog "${encoder} does not exist or is not executable." + return 1 + fi + + eval "${AOM_TEST_PREFIX}" "${encoder}" "${img_width}" "${img_height}" \ + "${yuv_file}" "${lf_file}" "${lf_width}" \ + "${lf_height}" "${lf_blocksize}" ${devnull} + + [ -e "${lf_file}" ] || return 1 + + # Parse lightfield bitstream to construct and output a new bitstream that can + # be decoded by an AV1 decoder. + local bs_decoder="${LIBAOM_BIN_PATH}/lightfield_bitstream_parsing${AOM_TEST_EXE_SUFFIX}" + local tl_file="${AOM_TEST_OUTPUT_DIR}/vase_tile_list.ivf" + if [ ! -x "${bs_decoder}" ]; then + elog "${bs_decoder} does not exist or is not executable." + return 1 + fi + + eval "${AOM_TEST_PREFIX}" "${bs_decoder}" "${lf_file}" "${tl_file}" \ + "${num_references}" ${devnull} + + [ -e "${tl_file}" ] || return 1 + + # Run lightfield tile list decoder + local tl_decoder="${LIBAOM_BIN_PATH}/lightfield_tile_list_decoder${AOM_TEST_EXE_SUFFIX}" + local tl_outfile="${AOM_TEST_OUTPUT_DIR}/vase_tile_list.yuv" + if [ ! -x "${tl_decoder}" ]; then + elog "${tl_decoder} does not exist or is not executable." + return 1 + fi + + eval "${AOM_TEST_PREFIX}" "${tl_decoder}" "${tl_file}" "${tl_outfile}" \ + "${num_references}" "${num_tile_lists}" ${devnull} + + [ -e "${tl_outfile}" ] || return 1 + + # Run reference lightfield decoder + local ref_decoder="${LIBAOM_BIN_PATH}/lightfield_decoder${AOM_TEST_EXE_SUFFIX}" + local tl_reffile="${AOM_TEST_OUTPUT_DIR}/vase_reference.yuv" + if [ ! -x "${ref_decoder}" ]; then + elog "${ref_decoder} does not exist or is not executable." + return 1 + fi + + eval "${AOM_TEST_PREFIX}" "${ref_decoder}" "${lf_file}" "${tl_reffile}" \ + "${num_references}" ${devnull} + + [ -e "${tl_reffile}" ] || return 1 + + # Check if tl_outfile and tl_reffile are identical. If not identical, this test fails. + diff ${tl_outfile} ${tl_reffile} > /dev/null + if [ $? -eq 1 ]; then + return 1 + fi +} + +lightfield_test_tests="lightfield_test" + +run_tests lightfield_test_verify_environment "${lightfield_test_tests}" diff --git a/third_party/aom/test/lpf_test.cc b/third_party/aom/test/lpf_test.cc index 1e2862ac8..451bffd2a 100644 --- a/third_party/aom/test/lpf_test.cc +++ b/third_party/aom/test/lpf_test.cc @@ -581,8 +581,12 @@ INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test9Param_hbd, const loop_param_t kLoop8Test6[] = { make_tuple(&aom_lpf_vertical_14_neon, &aom_lpf_vertical_14_c, 8), make_tuple(&aom_lpf_vertical_8_neon, &aom_lpf_vertical_8_c, 8), + make_tuple(&aom_lpf_vertical_6_neon, &aom_lpf_vertical_6_c, 8), + make_tuple(&aom_lpf_vertical_4_neon, &aom_lpf_vertical_4_c, 8), + make_tuple(&aom_lpf_horizontal_14_neon, &aom_lpf_horizontal_14_c, 8), make_tuple(&aom_lpf_horizontal_8_neon, &aom_lpf_horizontal_8_c, 8), - make_tuple(&aom_lpf_horizontal_6_neon, &aom_lpf_horizontal_6_c, 8) + make_tuple(&aom_lpf_horizontal_6_neon, &aom_lpf_horizontal_6_c, 8), + make_tuple(&aom_lpf_horizontal_4_neon, &aom_lpf_horizontal_4_c, 8) }; INSTANTIATE_TEST_CASE_P(NEON, Loop8Test6Param_lbd, diff --git a/third_party/aom/test/masked_sad_test.cc b/third_party/aom/test/masked_sad_test.cc index 1a393a001..311f1877d 100644 --- a/third_party/aom/test/masked_sad_test.cc +++ b/third_party/aom/test/masked_sad_test.cc @@ -44,14 +44,14 @@ class MaskedSADTest : public ::testing::TestWithParam<MaskedSADParam> { } virtual void TearDown() { libaom_test::ClearSystemState(); } + void runMaskedSADTest(int run_times); protected: MaskedSADFunc maskedSAD_op_; MaskedSADFunc ref_maskedSAD_op_; }; - -TEST_P(MaskedSADTest, OperationCheck) { - unsigned int ref_ret, ret; +void MaskedSADTest::runMaskedSADTest(int run_times) { + unsigned int ref_ret = 0, ret = 1; ACMRandom rnd(ACMRandom::DeterministicSeed()); DECLARE_ALIGNED(16, uint8_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]); DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]); @@ -62,7 +62,8 @@ TEST_P(MaskedSADTest, OperationCheck) { int src_stride = MAX_SB_SIZE; int ref_stride = MAX_SB_SIZE; int msk_stride = MAX_SB_SIZE; - for (int i = 0; i < number_of_iterations; ++i) { + const int iters = run_times == 1 ? number_of_iterations : 1; + for (int i = 0; i < iters; ++i) { for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) { src_ptr[j] = rnd.Rand8(); ref_ptr[j] = rnd.Rand8(); @@ -72,24 +73,48 @@ TEST_P(MaskedSADTest, OperationCheck) { } for (int invert_mask = 0; invert_mask < 2; ++invert_mask) { - ref_ret = - ref_maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride, + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int repeat = 0; repeat < run_times; ++repeat) { + ref_ret = ref_maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride, + second_pred_ptr, msk_ptr, msk_stride, + invert_mask); + } + aom_usec_timer_mark(&timer); + const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer)); + aom_usec_timer_start(&timer); + if (run_times == 1) { + ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src_ptr, src_stride, + ref_ptr, ref_stride, + second_pred_ptr, msk_ptr, + msk_stride, invert_mask)); + } else { + for (int repeat = 0; repeat < run_times; ++repeat) { + ret = + maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride, second_pred_ptr, msk_ptr, msk_stride, invert_mask); - ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src_ptr, src_stride, ref_ptr, - ref_stride, second_pred_ptr, - msk_ptr, msk_stride, - invert_mask)); + } + } + aom_usec_timer_mark(&timer); + const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer)); + if (run_times > 10) { + printf("%7.2f/%7.2fns", time1, time2); + printf("(%3.2f)\n", time1 / time2); + } if (ret != ref_ret) { err_count++; if (first_failure == -1) first_failure = i; } } } - EXPECT_EQ(0, err_count) - << "Error: Masked SAD Test, C output doesn't match SSSE3 output. " - << "First failed at test case " << first_failure; + EXPECT_EQ(0, err_count) << "Error: Masked SAD Test, output doesn't match. " + << "First failed at test case " << first_failure; } +TEST_P(MaskedSADTest, OperationCheck) { runMaskedSADTest(1); } + +TEST_P(MaskedSADTest, DISABLED_Speed) { runMaskedSADTest(2000000); } + typedef unsigned int (*HighbdMaskedSADFunc)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, @@ -108,14 +133,14 @@ class HighbdMaskedSADTest } virtual void TearDown() { libaom_test::ClearSystemState(); } + void runHighbdMaskedSADTest(int run_times); protected: HighbdMaskedSADFunc maskedSAD_op_; HighbdMaskedSADFunc ref_maskedSAD_op_; }; - -TEST_P(HighbdMaskedSADTest, OperationCheck) { - unsigned int ref_ret, ret; +void HighbdMaskedSADTest::runHighbdMaskedSADTest(int run_times) { + unsigned int ref_ret = 0, ret = 1; ACMRandom rnd(ACMRandom::DeterministicSeed()); DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]); DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]); @@ -129,7 +154,8 @@ TEST_P(HighbdMaskedSADTest, OperationCheck) { int src_stride = MAX_SB_SIZE; int ref_stride = MAX_SB_SIZE; int msk_stride = MAX_SB_SIZE; - for (int i = 0; i < number_of_iterations; ++i) { + const int iters = run_times == 1 ? number_of_iterations : 1; + for (int i = 0; i < iters; ++i) { for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) { src_ptr[j] = rnd.Rand16() & 0xfff; ref_ptr[j] = rnd.Rand16() & 0xfff; @@ -138,13 +164,34 @@ TEST_P(HighbdMaskedSADTest, OperationCheck) { } for (int invert_mask = 0; invert_mask < 2; ++invert_mask) { - ref_ret = - ref_maskedSAD_op_(src8_ptr, src_stride, ref8_ptr, ref_stride, + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int repeat = 0; repeat < run_times; ++repeat) { + ref_ret = ref_maskedSAD_op_(src8_ptr, src_stride, ref8_ptr, ref_stride, + second_pred8_ptr, msk_ptr, msk_stride, + invert_mask); + } + aom_usec_timer_mark(&timer); + const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer)); + aom_usec_timer_start(&timer); + if (run_times == 1) { + ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src8_ptr, src_stride, + ref8_ptr, ref_stride, + second_pred8_ptr, msk_ptr, + msk_stride, invert_mask)); + } else { + for (int repeat = 0; repeat < run_times; ++repeat) { + ret = + maskedSAD_op_(src8_ptr, src_stride, ref8_ptr, ref_stride, second_pred8_ptr, msk_ptr, msk_stride, invert_mask); - ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src8_ptr, src_stride, - ref8_ptr, ref_stride, - second_pred8_ptr, msk_ptr, - msk_stride, invert_mask)); + } + } + aom_usec_timer_mark(&timer); + const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer)); + if (run_times > 10) { + printf("%7.2f/%7.2fns", time1, time2); + printf("(%3.2f)\n", time1 / time2); + } if (ret != ref_ret) { err_count++; if (first_failure == -1) first_failure = i; @@ -152,57 +199,144 @@ TEST_P(HighbdMaskedSADTest, OperationCheck) { } } EXPECT_EQ(0, err_count) - << "Error: High BD Masked SAD Test, C output doesn't match SSSE3 output. " + << "Error: High BD Masked SAD Test, output doesn't match. " << "First failed at test case " << first_failure; } +TEST_P(HighbdMaskedSADTest, OperationCheck) { runHighbdMaskedSADTest(1); } + +TEST_P(HighbdMaskedSADTest, DISABLED_Speed) { runHighbdMaskedSADTest(1000000); } + using ::testing::make_tuple; #if HAVE_SSSE3 const MaskedSADParam msad_test[] = { - make_tuple(&aom_masked_sad128x128_ssse3, &aom_masked_sad128x128_c), - make_tuple(&aom_masked_sad128x64_ssse3, &aom_masked_sad128x64_c), - make_tuple(&aom_masked_sad64x128_ssse3, &aom_masked_sad64x128_c), - make_tuple(&aom_masked_sad64x64_ssse3, &aom_masked_sad64x64_c), - make_tuple(&aom_masked_sad64x32_ssse3, &aom_masked_sad64x32_c), - make_tuple(&aom_masked_sad32x64_ssse3, &aom_masked_sad32x64_c), - make_tuple(&aom_masked_sad32x32_ssse3, &aom_masked_sad32x32_c), - make_tuple(&aom_masked_sad32x16_ssse3, &aom_masked_sad32x16_c), - make_tuple(&aom_masked_sad16x32_ssse3, &aom_masked_sad16x32_c), - make_tuple(&aom_masked_sad16x16_ssse3, &aom_masked_sad16x16_c), - make_tuple(&aom_masked_sad16x8_ssse3, &aom_masked_sad16x8_c), - make_tuple(&aom_masked_sad8x16_ssse3, &aom_masked_sad8x16_c), - make_tuple(&aom_masked_sad8x8_ssse3, &aom_masked_sad8x8_c), - make_tuple(&aom_masked_sad8x4_ssse3, &aom_masked_sad8x4_c), + make_tuple(&aom_masked_sad4x4_ssse3, &aom_masked_sad4x4_c), make_tuple(&aom_masked_sad4x8_ssse3, &aom_masked_sad4x8_c), - make_tuple(&aom_masked_sad4x4_ssse3, &aom_masked_sad4x4_c) + make_tuple(&aom_masked_sad8x4_ssse3, &aom_masked_sad8x4_c), + make_tuple(&aom_masked_sad8x8_ssse3, &aom_masked_sad8x8_c), + make_tuple(&aom_masked_sad8x16_ssse3, &aom_masked_sad8x16_c), + make_tuple(&aom_masked_sad16x8_ssse3, &aom_masked_sad16x8_c), + make_tuple(&aom_masked_sad16x16_ssse3, &aom_masked_sad16x16_c), + make_tuple(&aom_masked_sad16x32_ssse3, &aom_masked_sad16x32_c), + make_tuple(&aom_masked_sad32x16_ssse3, &aom_masked_sad32x16_c), + make_tuple(&aom_masked_sad32x32_ssse3, &aom_masked_sad32x32_c), + make_tuple(&aom_masked_sad32x64_ssse3, &aom_masked_sad32x64_c), + make_tuple(&aom_masked_sad64x32_ssse3, &aom_masked_sad64x32_c), + make_tuple(&aom_masked_sad64x64_ssse3, &aom_masked_sad64x64_c), + make_tuple(&aom_masked_sad64x128_ssse3, &aom_masked_sad64x128_c), + make_tuple(&aom_masked_sad128x64_ssse3, &aom_masked_sad128x64_c), + make_tuple(&aom_masked_sad128x128_ssse3, &aom_masked_sad128x128_c), + make_tuple(&aom_masked_sad4x16_ssse3, &aom_masked_sad4x16_c), + make_tuple(&aom_masked_sad16x4_ssse3, &aom_masked_sad16x4_c), + make_tuple(&aom_masked_sad8x32_ssse3, &aom_masked_sad8x32_c), + make_tuple(&aom_masked_sad32x8_ssse3, &aom_masked_sad32x8_c), + make_tuple(&aom_masked_sad16x64_ssse3, &aom_masked_sad16x64_c), + make_tuple(&aom_masked_sad64x16_ssse3, &aom_masked_sad64x16_c), }; -INSTANTIATE_TEST_CASE_P(SSSE3_C_COMPARE, MaskedSADTest, - ::testing::ValuesIn(msad_test)); +INSTANTIATE_TEST_CASE_P(SSSE3, MaskedSADTest, ::testing::ValuesIn(msad_test)); + const HighbdMaskedSADParam hbd_msad_test[] = { - make_tuple(&aom_highbd_masked_sad128x128_ssse3, - &aom_highbd_masked_sad128x128_c), - make_tuple(&aom_highbd_masked_sad128x64_ssse3, - &aom_highbd_masked_sad128x64_c), + make_tuple(&aom_highbd_masked_sad4x4_ssse3, &aom_highbd_masked_sad4x4_c), + make_tuple(&aom_highbd_masked_sad4x8_ssse3, &aom_highbd_masked_sad4x8_c), + make_tuple(&aom_highbd_masked_sad8x4_ssse3, &aom_highbd_masked_sad8x4_c), + make_tuple(&aom_highbd_masked_sad8x8_ssse3, &aom_highbd_masked_sad8x8_c), + make_tuple(&aom_highbd_masked_sad8x16_ssse3, &aom_highbd_masked_sad8x16_c), + make_tuple(&aom_highbd_masked_sad16x8_ssse3, &aom_highbd_masked_sad16x8_c), + make_tuple(&aom_highbd_masked_sad16x16_ssse3, &aom_highbd_masked_sad16x16_c), + make_tuple(&aom_highbd_masked_sad16x32_ssse3, &aom_highbd_masked_sad16x32_c), + make_tuple(&aom_highbd_masked_sad32x16_ssse3, &aom_highbd_masked_sad32x16_c), + make_tuple(&aom_highbd_masked_sad32x32_ssse3, &aom_highbd_masked_sad32x32_c), + make_tuple(&aom_highbd_masked_sad32x64_ssse3, &aom_highbd_masked_sad32x64_c), + make_tuple(&aom_highbd_masked_sad64x32_ssse3, &aom_highbd_masked_sad64x32_c), + make_tuple(&aom_highbd_masked_sad64x64_ssse3, &aom_highbd_masked_sad64x64_c), make_tuple(&aom_highbd_masked_sad64x128_ssse3, &aom_highbd_masked_sad64x128_c), - make_tuple(&aom_highbd_masked_sad64x64_ssse3, &aom_highbd_masked_sad64x64_c), - make_tuple(&aom_highbd_masked_sad64x32_ssse3, &aom_highbd_masked_sad64x32_c), - make_tuple(&aom_highbd_masked_sad32x64_ssse3, &aom_highbd_masked_sad32x64_c), - make_tuple(&aom_highbd_masked_sad32x32_ssse3, &aom_highbd_masked_sad32x32_c), - make_tuple(&aom_highbd_masked_sad32x16_ssse3, &aom_highbd_masked_sad32x16_c), - make_tuple(&aom_highbd_masked_sad16x32_ssse3, &aom_highbd_masked_sad16x32_c), - make_tuple(&aom_highbd_masked_sad16x16_ssse3, &aom_highbd_masked_sad16x16_c), - make_tuple(&aom_highbd_masked_sad16x8_ssse3, &aom_highbd_masked_sad16x8_c), - make_tuple(&aom_highbd_masked_sad8x16_ssse3, &aom_highbd_masked_sad8x16_c), - make_tuple(&aom_highbd_masked_sad8x8_ssse3, &aom_highbd_masked_sad8x8_c), - make_tuple(&aom_highbd_masked_sad8x4_ssse3, &aom_highbd_masked_sad8x4_c), - make_tuple(&aom_highbd_masked_sad4x8_ssse3, &aom_highbd_masked_sad4x8_c), - make_tuple(&aom_highbd_masked_sad4x4_ssse3, &aom_highbd_masked_sad4x4_c) + make_tuple(&aom_highbd_masked_sad128x64_ssse3, + &aom_highbd_masked_sad128x64_c), + make_tuple(&aom_highbd_masked_sad128x128_ssse3, + &aom_highbd_masked_sad128x128_c), + make_tuple(&aom_highbd_masked_sad4x16_ssse3, &aom_highbd_masked_sad4x16_c), + make_tuple(&aom_highbd_masked_sad16x4_ssse3, &aom_highbd_masked_sad16x4_c), + make_tuple(&aom_highbd_masked_sad8x32_ssse3, &aom_highbd_masked_sad8x32_c), + make_tuple(&aom_highbd_masked_sad32x8_ssse3, &aom_highbd_masked_sad32x8_c), + make_tuple(&aom_highbd_masked_sad16x64_ssse3, &aom_highbd_masked_sad16x64_c), + make_tuple(&aom_highbd_masked_sad64x16_ssse3, &aom_highbd_masked_sad64x16_c), }; -INSTANTIATE_TEST_CASE_P(SSSE3_C_COMPARE, HighbdMaskedSADTest, +INSTANTIATE_TEST_CASE_P(SSSE3, HighbdMaskedSADTest, ::testing::ValuesIn(hbd_msad_test)); #endif // HAVE_SSSE3 + +#if HAVE_AVX2 +const MaskedSADParam msad_avx2_test[] = { + make_tuple(&aom_masked_sad4x4_avx2, &aom_masked_sad4x4_ssse3), + make_tuple(&aom_masked_sad4x8_avx2, &aom_masked_sad4x8_ssse3), + make_tuple(&aom_masked_sad8x4_avx2, &aom_masked_sad8x4_ssse3), + make_tuple(&aom_masked_sad8x8_avx2, &aom_masked_sad8x8_ssse3), + make_tuple(&aom_masked_sad8x16_avx2, &aom_masked_sad8x16_ssse3), + make_tuple(&aom_masked_sad16x8_avx2, &aom_masked_sad16x8_ssse3), + make_tuple(&aom_masked_sad16x16_avx2, &aom_masked_sad16x16_ssse3), + make_tuple(&aom_masked_sad16x32_avx2, &aom_masked_sad16x32_ssse3), + make_tuple(&aom_masked_sad32x16_avx2, &aom_masked_sad32x16_ssse3), + make_tuple(&aom_masked_sad32x32_avx2, &aom_masked_sad32x32_ssse3), + make_tuple(&aom_masked_sad32x64_avx2, &aom_masked_sad32x64_ssse3), + make_tuple(&aom_masked_sad64x32_avx2, &aom_masked_sad64x32_ssse3), + make_tuple(&aom_masked_sad64x64_avx2, &aom_masked_sad64x64_ssse3), + make_tuple(&aom_masked_sad64x128_avx2, &aom_masked_sad64x128_ssse3), + make_tuple(&aom_masked_sad128x64_avx2, &aom_masked_sad128x64_ssse3), + make_tuple(&aom_masked_sad128x128_avx2, &aom_masked_sad128x128_ssse3), + make_tuple(&aom_masked_sad4x16_avx2, &aom_masked_sad4x16_ssse3), + make_tuple(&aom_masked_sad16x4_avx2, &aom_masked_sad16x4_ssse3), + make_tuple(&aom_masked_sad8x32_avx2, &aom_masked_sad8x32_ssse3), + make_tuple(&aom_masked_sad32x8_avx2, &aom_masked_sad32x8_ssse3), + make_tuple(&aom_masked_sad16x64_avx2, &aom_masked_sad16x64_ssse3), + make_tuple(&aom_masked_sad64x16_avx2, &aom_masked_sad64x16_ssse3) +}; + +INSTANTIATE_TEST_CASE_P(AVX2, MaskedSADTest, + ::testing::ValuesIn(msad_avx2_test)); + +const HighbdMaskedSADParam hbd_msad_avx2_test[] = { + make_tuple(&aom_highbd_masked_sad4x4_avx2, &aom_highbd_masked_sad4x4_ssse3), + make_tuple(&aom_highbd_masked_sad4x8_avx2, &aom_highbd_masked_sad4x8_ssse3), + make_tuple(&aom_highbd_masked_sad8x4_avx2, &aom_highbd_masked_sad8x4_ssse3), + make_tuple(&aom_highbd_masked_sad8x8_avx2, &aom_highbd_masked_sad8x8_ssse3), + make_tuple(&aom_highbd_masked_sad8x16_avx2, &aom_highbd_masked_sad8x16_ssse3), + make_tuple(&aom_highbd_masked_sad16x8_avx2, &aom_highbd_masked_sad16x8_ssse3), + make_tuple(&aom_highbd_masked_sad16x16_avx2, + &aom_highbd_masked_sad16x16_ssse3), + make_tuple(&aom_highbd_masked_sad16x32_avx2, + &aom_highbd_masked_sad16x32_ssse3), + make_tuple(&aom_highbd_masked_sad32x16_avx2, + &aom_highbd_masked_sad32x16_ssse3), + make_tuple(&aom_highbd_masked_sad32x32_avx2, + &aom_highbd_masked_sad32x32_ssse3), + make_tuple(&aom_highbd_masked_sad32x64_avx2, + &aom_highbd_masked_sad32x64_ssse3), + make_tuple(&aom_highbd_masked_sad64x32_avx2, + &aom_highbd_masked_sad64x32_ssse3), + make_tuple(&aom_highbd_masked_sad64x64_avx2, + &aom_highbd_masked_sad64x64_ssse3), + make_tuple(&aom_highbd_masked_sad64x128_avx2, + &aom_highbd_masked_sad64x128_ssse3), + make_tuple(&aom_highbd_masked_sad128x64_avx2, + &aom_highbd_masked_sad128x64_ssse3), + make_tuple(&aom_highbd_masked_sad128x128_avx2, + &aom_highbd_masked_sad128x128_ssse3), + make_tuple(&aom_highbd_masked_sad4x16_avx2, &aom_highbd_masked_sad4x16_ssse3), + make_tuple(&aom_highbd_masked_sad16x4_avx2, &aom_highbd_masked_sad16x4_ssse3), + make_tuple(&aom_highbd_masked_sad8x32_avx2, &aom_highbd_masked_sad8x32_ssse3), + make_tuple(&aom_highbd_masked_sad32x8_avx2, &aom_highbd_masked_sad32x8_ssse3), + make_tuple(&aom_highbd_masked_sad16x64_avx2, + &aom_highbd_masked_sad16x64_ssse3), + make_tuple(&aom_highbd_masked_sad64x16_avx2, + &aom_highbd_masked_sad64x16_ssse3) +}; + +INSTANTIATE_TEST_CASE_P(AVX2, HighbdMaskedSADTest, + ::testing::ValuesIn(hbd_msad_avx2_test)); +#endif // HAVE_AVX2 + } // namespace diff --git a/third_party/aom/test/noise_model_test.cc b/third_party/aom/test/noise_model_test.cc index 9b7fff8a2..b5b387e31 100644 --- a/third_party/aom/test/noise_model_test.cc +++ b/third_party/aom/test/noise_model_test.cc @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + #include <math.h> #include <algorithm> #include <vector> diff --git a/third_party/aom/test/obmc_sad_test.cc b/third_party/aom/test/obmc_sad_test.cc index 1820da266..6cef86961 100644 --- a/third_party/aom/test/obmc_sad_test.cc +++ b/third_party/aom/test/obmc_sad_test.cc @@ -108,6 +108,29 @@ INSTANTIATE_TEST_CASE_P(SSE4_1, ObmcSadTest, ::testing::ValuesIn(sse4_functions)); #endif // HAVE_SSE4_1 +#if HAVE_AVX2 +const ObmcSadTest::ParamType avx2_functions[] = { + TestFuncs(aom_obmc_sad128x128_c, aom_obmc_sad128x128_avx2), + TestFuncs(aom_obmc_sad128x64_c, aom_obmc_sad128x64_avx2), + TestFuncs(aom_obmc_sad64x128_c, aom_obmc_sad64x128_avx2), + TestFuncs(aom_obmc_sad64x64_c, aom_obmc_sad64x64_avx2), + TestFuncs(aom_obmc_sad64x32_c, aom_obmc_sad64x32_avx2), + TestFuncs(aom_obmc_sad32x64_c, aom_obmc_sad32x64_avx2), + TestFuncs(aom_obmc_sad32x32_c, aom_obmc_sad32x32_avx2), + TestFuncs(aom_obmc_sad32x16_c, aom_obmc_sad32x16_avx2), + TestFuncs(aom_obmc_sad16x32_c, aom_obmc_sad16x32_avx2), + TestFuncs(aom_obmc_sad16x16_c, aom_obmc_sad16x16_avx2), + TestFuncs(aom_obmc_sad16x8_c, aom_obmc_sad16x8_avx2), + TestFuncs(aom_obmc_sad8x16_c, aom_obmc_sad8x16_avx2), + TestFuncs(aom_obmc_sad8x8_c, aom_obmc_sad8x8_avx2), + TestFuncs(aom_obmc_sad8x4_c, aom_obmc_sad8x4_avx2), + TestFuncs(aom_obmc_sad4x8_c, aom_obmc_sad4x8_avx2), + TestFuncs(aom_obmc_sad4x4_c, aom_obmc_sad4x4_avx2) +}; + +INSTANTIATE_TEST_CASE_P(AVX2, ObmcSadTest, ::testing::ValuesIn(avx2_functions)); +#endif // HAVE_AVX2 + //////////////////////////////////////////////////////////////////////////////// // High bit-depth //////////////////////////////////////////////////////////////////////////////// @@ -187,4 +210,28 @@ ObmcSadHBDTest::ParamType sse4_functions_hbd[] = { INSTANTIATE_TEST_CASE_P(SSE4_1, ObmcSadHBDTest, ::testing::ValuesIn(sse4_functions_hbd)); #endif // HAVE_SSE4_1 + +#if HAVE_AVX2 +ObmcSadHBDTest::ParamType avx2_functions_hbd[] = { + TestFuncs(aom_highbd_obmc_sad128x128_c, aom_highbd_obmc_sad128x128_avx2), + TestFuncs(aom_highbd_obmc_sad128x64_c, aom_highbd_obmc_sad128x64_avx2), + TestFuncs(aom_highbd_obmc_sad64x128_c, aom_highbd_obmc_sad64x128_avx2), + TestFuncs(aom_highbd_obmc_sad64x64_c, aom_highbd_obmc_sad64x64_avx2), + TestFuncs(aom_highbd_obmc_sad64x32_c, aom_highbd_obmc_sad64x32_avx2), + TestFuncs(aom_highbd_obmc_sad32x64_c, aom_highbd_obmc_sad32x64_avx2), + TestFuncs(aom_highbd_obmc_sad32x32_c, aom_highbd_obmc_sad32x32_avx2), + TestFuncs(aom_highbd_obmc_sad32x16_c, aom_highbd_obmc_sad32x16_avx2), + TestFuncs(aom_highbd_obmc_sad16x32_c, aom_highbd_obmc_sad16x32_avx2), + TestFuncs(aom_highbd_obmc_sad16x16_c, aom_highbd_obmc_sad16x16_avx2), + TestFuncs(aom_highbd_obmc_sad16x8_c, aom_highbd_obmc_sad16x8_avx2), + TestFuncs(aom_highbd_obmc_sad8x16_c, aom_highbd_obmc_sad8x16_avx2), + TestFuncs(aom_highbd_obmc_sad8x8_c, aom_highbd_obmc_sad8x8_avx2), + TestFuncs(aom_highbd_obmc_sad8x4_c, aom_highbd_obmc_sad8x4_avx2), + TestFuncs(aom_highbd_obmc_sad4x8_c, aom_highbd_obmc_sad4x8_avx2), + TestFuncs(aom_highbd_obmc_sad4x4_c, aom_highbd_obmc_sad4x4_avx2) +}; + +INSTANTIATE_TEST_CASE_P(AVX2, ObmcSadHBDTest, + ::testing::ValuesIn(avx2_functions_hbd)); +#endif // HAVE_AVX2 } // namespace diff --git a/third_party/aom/test/reconinter_test.cc b/third_party/aom/test/reconinter_test.cc index 4f74c817e..9b849404c 100644 --- a/third_party/aom/test/reconinter_test.cc +++ b/third_party/aom/test/reconinter_test.cc @@ -28,12 +28,30 @@ namespace { using libaom_test::ACMRandom; -class BuildCompDiffwtdMaskTest : public ::testing::TestWithParam<int> { +typedef void (*buildcompdiffwtdmaskd_func)(uint8_t *mask, + DIFFWTD_MASK_TYPE mask_type, + const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, + int h, int w); + +typedef ::testing::tuple<BLOCK_SIZE, buildcompdiffwtdmaskd_func> + BuildCompDiffwtdMaskDParam; + +#if HAVE_SSE4_1 +::testing::internal::ParamGenerator<BuildCompDiffwtdMaskDParam> BuildParams( + buildcompdiffwtdmaskd_func filter) { + return ::testing::Combine(::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL), + ::testing::Values(filter)); +} +#endif + +class BuildCompDiffwtdMaskTest + : public ::testing::TestWithParam<BuildCompDiffwtdMaskDParam> { public: virtual ~BuildCompDiffwtdMaskTest() {} virtual void TearDown() { libaom_test::ClearSystemState(); } - void RunTest(const int sb_type, const int is_speed, + void RunTest(buildcompdiffwtdmaskd_func test_impl, const int is_speed, const DIFFWTD_MASK_TYPE type); private: @@ -159,8 +177,10 @@ void BuildCompDiffwtdMaskD16Test::RunSpeedTest( width, height, 1000.0 * elapsed_time1 / num_loops); } #if HAVE_SSE4_1 -void BuildCompDiffwtdMaskTest::RunTest(const int sb_type, const int is_speed, +void BuildCompDiffwtdMaskTest::RunTest(buildcompdiffwtdmaskd_func test_impl, + const int is_speed, const DIFFWTD_MASK_TYPE type) { + const int sb_type = GET_PARAM(0); const int width = block_size_wide[sb_type]; const int height = block_size_high[sb_type]; DECLARE_ALIGNED(16, uint8_t, mask_ref[MAX_SB_SQUARE]); @@ -182,8 +202,7 @@ void BuildCompDiffwtdMaskTest::RunTest(const int sb_type, const int is_speed, const double t1 = get_time_mark(&timer); aom_usec_timer_start(&timer); for (int i = 0; i < run_times; ++i) { - av1_build_compound_diffwtd_mask_sse4_1(mask_test, type, src0, width, src1, - width, height, width); + test_impl(mask_test, type, src0, width, src1, width, height, width); } const double t2 = get_time_mark(&timer); if (is_speed) { @@ -200,12 +219,12 @@ void BuildCompDiffwtdMaskTest::RunTest(const int sb_type, const int is_speed, } TEST_P(BuildCompDiffwtdMaskTest, match) { - RunTest(GetParam(), 0, DIFFWTD_38); - RunTest(GetParam(), 0, DIFFWTD_38_INV); + RunTest(GET_PARAM(1), 0, DIFFWTD_38); + RunTest(GET_PARAM(1), 0, DIFFWTD_38_INV); } TEST_P(BuildCompDiffwtdMaskTest, DISABLED_Speed) { - RunTest(GetParam(), 1, DIFFWTD_38); - RunTest(GetParam(), 1, DIFFWTD_38_INV); + RunTest(GET_PARAM(1), 1, DIFFWTD_38); + RunTest(GET_PARAM(1), 1, DIFFWTD_38_INV); } #endif TEST_P(BuildCompDiffwtdMaskD16Test, CheckOutput) { @@ -218,8 +237,7 @@ TEST_P(BuildCompDiffwtdMaskD16Test, DISABLED_Speed) { #if HAVE_SSE4_1 INSTANTIATE_TEST_CASE_P(SSE4_1, BuildCompDiffwtdMaskTest, - ::testing::Range(0, static_cast<int>(BLOCK_SIZES_ALL), - 1)); + BuildParams(av1_build_compound_diffwtd_mask_sse4_1)); INSTANTIATE_TEST_CASE_P( SSE4_1, BuildCompDiffwtdMaskD16Test, diff --git a/third_party/aom/test/resize_test.cc b/third_party/aom/test/resize_test.cc index e1c4e9fa5..b270b8362 100644 --- a/third_party/aom/test/resize_test.cc +++ b/third_party/aom/test/resize_test.cc @@ -546,12 +546,6 @@ TEST_P(ResizeRealtimeTest, DISABLED_TestInternalResizeDownUpChangeBitRate) { #endif } -aom_img_fmt_t CspForFrameNumber(int frame) { - if (frame < 10) return AOM_IMG_FMT_I420; - if (frame < 20) return AOM_IMG_FMT_I444; - return AOM_IMG_FMT_I420; -} - class ResizeCspTest : public ResizeTest { protected: #if WRITE_COMPRESSED_STREAM @@ -580,20 +574,6 @@ class ResizeCspTest : public ResizeTest { #endif } - virtual void PreEncodeFrameHook(libaom_test::VideoSource *video, - libaom_test::Encoder *encoder) { - if (CspForFrameNumber(video->frame()) != AOM_IMG_FMT_I420 && - cfg_.g_profile != 1) { - cfg_.g_profile = 1; - encoder->Config(&cfg_); - } - if (CspForFrameNumber(video->frame()) == AOM_IMG_FMT_I420 && - cfg_.g_profile != 0) { - cfg_.g_profile = 0; - encoder->Config(&cfg_); - } - } - virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0]; EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0); @@ -621,19 +601,13 @@ class ResizeCspTest : public ResizeTest { class ResizingCspVideoSource : public ::libaom_test::DummyVideoSource { public: - ResizingCspVideoSource() { + explicit ResizingCspVideoSource(aom_img_fmt_t image_format) { SetSize(kInitialWidth, kInitialHeight); + SetImageFormat(image_format); limit_ = 30; } virtual ~ResizingCspVideoSource() {} - - protected: - virtual void Next() { - ++frame_; - SetImageFormat(CspForFrameNumber(frame_)); - FillFrame(); - } }; #if (defined(DISABLE_TRELLISQ_SEARCH) && DISABLE_TRELLISQ_SEARCH) @@ -641,14 +615,19 @@ TEST_P(ResizeCspTest, DISABLED_TestResizeCspWorks) { #else TEST_P(ResizeCspTest, TestResizeCspWorks) { #endif - ResizingCspVideoSource video; - init_flags_ = AOM_CODEC_USE_PSNR; - cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48; - cfg_.g_lag_in_frames = 0; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const aom_img_fmt_t image_formats[] = { AOM_IMG_FMT_I420, AOM_IMG_FMT_I444 }; + for (size_t i = 0; i < GTEST_ARRAY_SIZE_(image_formats); ++i) { + ResizingCspVideoSource video(image_formats[i]); + init_flags_ = AOM_CODEC_USE_PSNR; + cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48; + cfg_.g_lag_in_frames = 0; + cfg_.g_profile = (image_formats[i] == AOM_IMG_FMT_I420) ? 0 : 1; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - // Check we decoded the same number of frames as we attempted to encode - ASSERT_EQ(frame_info_list_.size(), video.limit()); + // Check we decoded the same number of frames as we attempted to encode + ASSERT_EQ(frame_info_list_.size(), video.limit()); + frame_info_list_.clear(); + } } AV1_INSTANTIATE_TEST_CASE(ResizeTest, diff --git a/third_party/aom/test/selfguided_filter_test.cc b/third_party/aom/test/selfguided_filter_test.cc index 4506a90db..d2d5c6105 100644 --- a/third_party/aom/test/selfguided_filter_test.cc +++ b/third_party/aom/test/selfguided_filter_test.cc @@ -208,6 +208,11 @@ INSTANTIATE_TEST_CASE_P(AVX2, AV1SelfguidedFilterTest, ::testing::Values(apply_selfguided_restoration_avx2)); #endif +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P(NEON, AV1SelfguidedFilterTest, + ::testing::Values(apply_selfguided_restoration_neon)); +#endif + // Test parameter list: // <tst_fun_, bit_depth> typedef tuple<SgrFunc, int> HighbdFilterTestParam; @@ -395,5 +400,11 @@ INSTANTIATE_TEST_CASE_P( ::testing::Combine(::testing::Values(apply_selfguided_restoration_avx2), ::testing::ValuesIn(highbd_params_avx2))); #endif - +#if HAVE_NEON +const int highbd_params_neon[] = { 8, 10, 12 }; +INSTANTIATE_TEST_CASE_P( + NEON, AV1HighbdSelfguidedFilterTest, + ::testing::Combine(::testing::Values(apply_selfguided_restoration_neon), + ::testing::ValuesIn(highbd_params_neon))); +#endif } // namespace diff --git a/third_party/aom/test/test-data.sha1 b/third_party/aom/test/test-data.sha1 index 67aeb5208..723e06104 100644 --- a/third_party/aom/test/test-data.sha1 +++ b/third_party/aom/test/test-data.sha1 @@ -356,4 +356,134 @@ c8baedb48fd5d4c956aa8d73fd957370f718f047 *av1-1-b8-01-size-208x226.ivf.md5 25a4940922761239809d82c45c2be1c5e4f48785 *av1-1-b8-01-size-224x226.ivf 87ae7e7558241bf3575a333f56fbad4dfdade8ff *av1-1-b8-01-size-224x226.ivf.md5 40dd208eb525cd90d7c0674cf787097fb909afae *av1-1-b8-01-size-226x226.ivf -34bdef682a4eae0e0a05e4486a968af1df8b220a *av1-1-b8-01-size-226x226.ivf.md5
\ No newline at end of file +34bdef682a4eae0e0a05e4486a968af1df8b220a *av1-1-b8-01-size-226x226.ivf.md5 +9bbe8499796aa588ff02e313fb0d4349940d2fea *av1-1-b10-00-quantizer-00.ivf +36b402eedad2bacee8ac09acce44e2fc356dd80b *av1-1-b10-00-quantizer-00.ivf.md5 +1d5e1d2827624f328020bf123df213bb175577e0 *av1-1-b10-00-quantizer-01.ivf +16c529be5502369e43ce9c6fe99a9709968e3daf *av1-1-b10-00-quantizer-01.ivf.md5 +39abc20739242a8f05efd4b35d7603c8ad7ff45d *av1-1-b10-00-quantizer-02.ivf +81faa72c3d43b003966fe09ffaae51b07b1059be *av1-1-b10-00-quantizer-02.ivf.md5 +92ebf349b803333a43824a83d997b8cf76f656f9 *av1-1-b10-00-quantizer-03.ivf +5e7556dc998cb8b506a43cc078e30802d7e600e6 *av1-1-b10-00-quantizer-03.ivf.md5 +1c496177c66e49f2e3556af87ec67afb5060170b *av1-1-b10-00-quantizer-04.ivf +560fea4800a44fe19ed8d3e74f425bdbf1fb8abd *av1-1-b10-00-quantizer-04.ivf.md5 +7de864b8475ce0acd0ecb01827f2c9add815352b *av1-1-b10-00-quantizer-05.ivf +1c1aea3db3f54a91866d89fd3b1a0d285ca10310 *av1-1-b10-00-quantizer-05.ivf.md5 +b6501c165619b036d0f7864fd4739973d2d18970 *av1-1-b10-00-quantizer-06.ivf +d758c8eff275651006c41e7dd447cac13b489ad7 *av1-1-b10-00-quantizer-06.ivf.md5 +e4df6f588f156dffaafd9517b64f753cfc9ccf05 *av1-1-b10-00-quantizer-07.ivf +3c577f67dade4537de642fd457ea2b367424f336 *av1-1-b10-00-quantizer-07.ivf.md5 +07e9c4c18abb36c8699c1c12bebcc727f090b525 *av1-1-b10-00-quantizer-08.ivf +4981568ade3170f311cb114fa2689edc4bc35e67 *av1-1-b10-00-quantizer-08.ivf.md5 +2268ecd2899f1b41ae9898925b1d62cfefa30282 *av1-1-b10-00-quantizer-09.ivf +029b03029b65b7c4c208961f0820467ad42fd3d6 *av1-1-b10-00-quantizer-09.ivf.md5 +3d2adaf6441cfa9585dcbf7d19d65bf6992a29a3 *av1-1-b10-00-quantizer-10.ivf +017b7fb4c3ba0747c2d5688d493da33ef993d110 *av1-1-b10-00-quantizer-10.ivf.md5 +006535760bd7dc1cfc95e648b05215954a2e76c2 *av1-1-b10-00-quantizer-11.ivf +c0ae083deb8e820aa49034af4d100944dd977018 *av1-1-b10-00-quantizer-11.ivf.md5 +840e0cbfe1acc8a7a45c823dc55ab44a0b6b553e *av1-1-b10-00-quantizer-12.ivf +49232ea38bdef650c94808f53834f1137cd4bf39 *av1-1-b10-00-quantizer-12.ivf.md5 +04b0e5a7387e07474f51be4b2c3e05211b40f0d0 *av1-1-b10-00-quantizer-13.ivf +a51b5ec4b890df3a64f9f0d866b8c41296c9e081 *av1-1-b10-00-quantizer-13.ivf.md5 +5dc47a140fbcbf08bf91481ee3585e9e067561ab *av1-1-b10-00-quantizer-14.ivf +2625319eef69d6225e6ab6e5ce7790491406cb5d *av1-1-b10-00-quantizer-14.ivf.md5 +f866be86d8d8aa08ded30e42988b0936c1a16064 *av1-1-b10-00-quantizer-15.ivf +03b7c1eefb54d99e30051c7123c0453f04a6579d *av1-1-b10-00-quantizer-15.ivf.md5 +548df2371dfb485419ed9baf28e3f495c64f364a *av1-1-b10-00-quantizer-16.ivf +8a0d6bf1626b05b65c77331305414fe9be54e8c6 *av1-1-b10-00-quantizer-16.ivf.md5 +0077c82f96a2e095a3cb8de9bfa63715e3c9f438 *av1-1-b10-00-quantizer-17.ivf +5d85f77f3087f4b206930722a945c60039262be4 *av1-1-b10-00-quantizer-17.ivf.md5 +1e0f1245ecb4c903b5dc7072d959fc43a7bba381 *av1-1-b10-00-quantizer-18.ivf +06316ae2b45f2359a70cc3855ffd6ab81048b41a *av1-1-b10-00-quantizer-18.ivf.md5 +f197198f7ec058110185fda5297a1a43993654df *av1-1-b10-00-quantizer-19.ivf +bac522c7f234d506c75b5495d74b3fa57c83a4df *av1-1-b10-00-quantizer-19.ivf.md5 +c2f57324d000b349323f37d5ebebde8c2b861f30 *av1-1-b10-00-quantizer-20.ivf +999c6110786cbc25e67792234a5a02f2cb4553c7 *av1-1-b10-00-quantizer-20.ivf.md5 +2ffad9adfd19286fe2166ba877289d201c9a634f *av1-1-b10-00-quantizer-21.ivf +d55713eaa791cfd7bf69b6c26d5032029d9a0f06 *av1-1-b10-00-quantizer-21.ivf.md5 +382528db53328c1a38976f5d9b579eef35d839f4 *av1-1-b10-00-quantizer-22.ivf +cb5bd459e1a90126da9264cff4281515f95755b2 *av1-1-b10-00-quantizer-22.ivf.md5 +b52cc6160fc66f72ad66c198d275a1c73f925022 *av1-1-b10-00-quantizer-23.ivf +c0f9d6659e1f283e9356fd7b4ac9f7cc5544cdc2 *av1-1-b10-00-quantizer-23.ivf.md5 +e11f15e3b63e7606b1122bb3670ee77c09c04840 *av1-1-b10-00-quantizer-24.ivf +e9f141b924440e044270c81a68458fe498599a8e *av1-1-b10-00-quantizer-24.ivf.md5 +fb91793b69824c99b0218788dcea0a74ebd7e84e *av1-1-b10-00-quantizer-25.ivf +434e33d609b2683c3cfbcc3a2cdfc26339590fb6 *av1-1-b10-00-quantizer-25.ivf.md5 +d82e38f31cdcf8b43479e6ddaa83373de38f70a2 *av1-1-b10-00-quantizer-26.ivf +183943b851ba383a536f13c83b93f61ac8961ad5 *av1-1-b10-00-quantizer-26.ivf.md5 +6bf5e4e8e0aca699e493b9eb3672d2117494d74d *av1-1-b10-00-quantizer-27.ivf +f0fb7e0a99180828b0e38b2cfe0622eecc2d26b8 *av1-1-b10-00-quantizer-27.ivf.md5 +d5adee2567544c3ae4223b3f3528a770377878d2 *av1-1-b10-00-quantizer-28.ivf +14edf588efc67570e529b0ff8aeb8e7a0c69238b *av1-1-b10-00-quantizer-28.ivf.md5 +e6dcdc106847956035e3f00aabf4470f97e1887e *av1-1-b10-00-quantizer-29.ivf +413c5cb778611c7c1a810b53861b9ab1fb391f17 *av1-1-b10-00-quantizer-29.ivf.md5 +b5e98b3f6b1db04d46bf43064c6ac64f797aff00 *av1-1-b10-00-quantizer-30.ivf +d1a603661d76c28658c7cd2892b408e91d77893e *av1-1-b10-00-quantizer-30.ivf.md5 +80168371d1150e82e3f46bcbbcabba458b835b19 *av1-1-b10-00-quantizer-31.ivf +904ecd033d4af5239c4d5b3f86e51ed5c3c2e3fb *av1-1-b10-00-quantizer-31.ivf.md5 +96291f6ace85980892d135a5b74188cd629c325f *av1-1-b10-00-quantizer-32.ivf +a5ceace390d4a75d48281fe29060c21557e4f5ae *av1-1-b10-00-quantizer-32.ivf.md5 +0f80495de34eae07c4905b72573a315a879390ec *av1-1-b10-00-quantizer-33.ivf +72b8f662973a660412946687dff878b276ae518e *av1-1-b10-00-quantizer-33.ivf.md5 +24905e3be7db320994b7fb8311dfd50a7c9e54da *av1-1-b10-00-quantizer-34.ivf +cea514bb1b7b064c4d31914a2cb266611c278577 *av1-1-b10-00-quantizer-34.ivf.md5 +083012960dd7c17d3b00fa0e807759c98faded8f *av1-1-b10-00-quantizer-35.ivf +de5fdb9e1e581484af1cc7d2dd3c3e84c90cebb2 *av1-1-b10-00-quantizer-35.ivf.md5 +f725f179aeee5b413620c0dd81b007b245c2a7ed *av1-1-b10-00-quantizer-36.ivf +246b1931c04c02df1f168090e2650827cd5dbabd *av1-1-b10-00-quantizer-36.ivf.md5 +f6aa824156e9848f237481889a8103eb6130f31d *av1-1-b10-00-quantizer-37.ivf +a8f78dd15fc2994369a08c2ddddcd0760c62ea5b *av1-1-b10-00-quantizer-37.ivf.md5 +a8dd662338c493aea266b99203e70af25982633f *av1-1-b10-00-quantizer-38.ivf +09f36d998e85d0450060f540e50b075ae1432fc6 *av1-1-b10-00-quantizer-38.ivf.md5 +d97428871720ed658da6ed0e3f7c15da83387e4c *av1-1-b10-00-quantizer-39.ivf +8c5230048909ee8f86f87c116f153cd910d0141f *av1-1-b10-00-quantizer-39.ivf.md5 +86e754e55e9b63c6e0a4fef01761414f8a6b61ca *av1-1-b10-00-quantizer-40.ivf +99a71accf6457264e45ca80d3b1f082ee5acdecc *av1-1-b10-00-quantizer-40.ivf.md5 +9d18b7236506ab7e107c062620b64096ec0cf423 *av1-1-b10-00-quantizer-41.ivf +5771159a9a7c7b66c9e13bb13ec3d53b37860208 *av1-1-b10-00-quantizer-41.ivf.md5 +54b72bc879a80e66613f421e67db62bba1c0041b *av1-1-b10-00-quantizer-42.ivf +bf958236883ee7209ef4cb0b7503b430634a291e *av1-1-b10-00-quantizer-42.ivf.md5 +a06d5321a51d90404dd7085ae511d7df5d5e1e05 *av1-1-b10-00-quantizer-43.ivf +ddb25723d976043d863634b9dc3b5fb84a245803 *av1-1-b10-00-quantizer-43.ivf.md5 +2ea0b64c170d7299dae1c14a8a49349aee8e0d08 *av1-1-b10-00-quantizer-44.ivf +d18bde1b4893792173fa2014665e9364395ad5e9 *av1-1-b10-00-quantizer-44.ivf.md5 +73e506a32d3518e23424f231c7b5323d7a34a3d6 *av1-1-b10-00-quantizer-45.ivf +be6224ebc77a3e5fb9c1645b876007e584a09d89 *av1-1-b10-00-quantizer-45.ivf.md5 +841223871374464194edc739c48dc7cefd1ff255 *av1-1-b10-00-quantizer-46.ivf +4766d616f923496a8dc113c9b7f875f0c0735f9a *av1-1-b10-00-quantizer-46.ivf.md5 +8bbbbea130aaea453f7b826956a5520d10a0eccf *av1-1-b10-00-quantizer-47.ivf +3ea21fac0c492b03d8ec25e4ee0971cd57e5f71a *av1-1-b10-00-quantizer-47.ivf.md5 +3ce83e0f1e1835b9a6c10fe502a16fd3650839e0 *av1-1-b10-00-quantizer-48.ivf +b468de2c09fca5a6b2bb7a20bab4afd8d192c31d *av1-1-b10-00-quantizer-48.ivf.md5 +f3a757c678aa00f9a9c4c4658d37733fd935925a *av1-1-b10-00-quantizer-49.ivf +f888dc88db576122695d4eb41c486aacd28a2d1d *av1-1-b10-00-quantizer-49.ivf.md5 +a9d78aaef105cc5a95b7ebb54783f37e75673123 *av1-1-b10-00-quantizer-50.ivf +06d0c5e79cc794030c4be022089b1d12c1383f71 *av1-1-b10-00-quantizer-50.ivf.md5 +165c20ee372f83682d094541097e375227353239 *av1-1-b10-00-quantizer-51.ivf +b3d90214b8c6e6f6d9357bb5784d10081325c356 *av1-1-b10-00-quantizer-51.ivf.md5 +5b3ea7a18654d943065f5c176974c3960b56664e *av1-1-b10-00-quantizer-52.ivf +dc61a6e4e2549074130023b14b137fb4fe442ce3 *av1-1-b10-00-quantizer-52.ivf.md5 +74c3b5851b6a94d33b575a689eb8d34592e95d5f *av1-1-b10-00-quantizer-53.ivf +a80e43a0fb2b852426bd941b8d4b8f56690e9bc9 *av1-1-b10-00-quantizer-53.ivf.md5 +d05b8dea2cddd4f0d9e792f42f71afbd29f7811c *av1-1-b10-00-quantizer-54.ivf +432937893321f4bd25fa400b8988c5788cb06ecf *av1-1-b10-00-quantizer-54.ivf.md5 +4eaee0f1970426be0bbeb7d4fccdc7e804e9bea4 *av1-1-b10-00-quantizer-55.ivf +710ab95ce1dcd2540db4477ff4ee6ab771fe0759 *av1-1-b10-00-quantizer-55.ivf.md5 +fe637930c9faa8744cba37effc4cb5510315d1c0 *av1-1-b10-00-quantizer-56.ivf +2f9431b30523fb6a3e4122f22c6c3ff7b96a7987 *av1-1-b10-00-quantizer-56.ivf.md5 +ed54fc7fcec194eef1f50adbbe12a6a36ab6836b *av1-1-b10-00-quantizer-57.ivf +43bccac7800b399210cf15520a83739c23a5d9c7 *av1-1-b10-00-quantizer-57.ivf.md5 +a7b8d628ba3e4c5f37aa6a3d7b82afda73ac89dc *av1-1-b10-00-quantizer-58.ivf +b26638272b787df54f45a46629b852acbcb73e3d *av1-1-b10-00-quantizer-58.ivf.md5 +c077f22ff547fb5ffd020e8dac91d05942fb52df *av1-1-b10-00-quantizer-59.ivf +4efd99cc0891bf345b8cd2ae8e21709d61be497b *av1-1-b10-00-quantizer-59.ivf.md5 +301ab53039d75e1ffa8cc6a0874d9ea94e4a6a0d *av1-1-b10-00-quantizer-60.ivf +4729bd734a6edd2d8d0432a3f66b3d91d565050e *av1-1-b10-00-quantizer-60.ivf.md5 +c78640d3211034df9fcb273bdfc18625819652f2 *av1-1-b10-00-quantizer-61.ivf +3d823eb2b33ccfea68db506626bcbecf49b0f167 *av1-1-b10-00-quantizer-61.ivf.md5 +bf241a449a28773b93e6e529a06dfc28109577e4 *av1-1-b10-00-quantizer-62.ivf +75457d8476f1927f737d089dcf3d0f7f99f3c4fb *av1-1-b10-00-quantizer-62.ivf.md5 +8b6eb3fff2e0db7eac775b08c745250ca591e2d9 *av1-1-b10-00-quantizer-63.ivf +63ea689d025593e5d91760785b8e446d04d4671e *av1-1-b10-00-quantizer-63.ivf.md5 +a9f7ea6312a533cc6426a6145edd190d45813c37 *av1-1-b8-02-allintra.ivf +8fd8f789cfee1069d20f3e2c241f5cad7292239e *av1-1-b8-02-allintra.ivf.md5
\ No newline at end of file diff --git a/third_party/aom/test/test.cmake b/third_party/aom/test/test.cmake index 8594d059c..7b584880f 100644 --- a/third_party/aom/test/test.cmake +++ b/third_party/aom/test/test.cmake @@ -405,7 +405,7 @@ function(setup_aom_test_targets) OR (CONFIG_AV1_DECODER AND "${var}" MATCHES "_TEST_DECODER_")) list(APPEND aom_test_source_vars ${var}) endif() - # cmake-format:on + # cmake-format: on endforeach() # Libaom_test_srcs.txt generation. diff --git a/third_party/aom/test/test_data_util.cmake b/third_party/aom/test/test_data_util.cmake index bbdd5f4a2..9fe00a07d 100644 --- a/third_party/aom/test/test_data_util.cmake +++ b/third_party/aom/test/test_data_util.cmake @@ -165,6 +165,134 @@ if(CONFIG_AV1_DECODER) "av1-1-b8-00-quantizer-62.ivf.md5" "av1-1-b8-00-quantizer-63.ivf" "av1-1-b8-00-quantizer-63.ivf.md5" + "av1-1-b10-00-quantizer-00.ivf" + "av1-1-b10-00-quantizer-00.ivf.md5" + "av1-1-b10-00-quantizer-01.ivf" + "av1-1-b10-00-quantizer-01.ivf.md5" + "av1-1-b10-00-quantizer-02.ivf" + "av1-1-b10-00-quantizer-02.ivf.md5" + "av1-1-b10-00-quantizer-03.ivf" + "av1-1-b10-00-quantizer-03.ivf.md5" + "av1-1-b10-00-quantizer-04.ivf" + "av1-1-b10-00-quantizer-04.ivf.md5" + "av1-1-b10-00-quantizer-05.ivf" + "av1-1-b10-00-quantizer-05.ivf.md5" + "av1-1-b10-00-quantizer-06.ivf" + "av1-1-b10-00-quantizer-06.ivf.md5" + "av1-1-b10-00-quantizer-07.ivf" + "av1-1-b10-00-quantizer-07.ivf.md5" + "av1-1-b10-00-quantizer-08.ivf" + "av1-1-b10-00-quantizer-08.ivf.md5" + "av1-1-b10-00-quantizer-09.ivf" + "av1-1-b10-00-quantizer-09.ivf.md5" + "av1-1-b10-00-quantizer-10.ivf" + "av1-1-b10-00-quantizer-10.ivf.md5" + "av1-1-b10-00-quantizer-11.ivf" + "av1-1-b10-00-quantizer-11.ivf.md5" + "av1-1-b10-00-quantizer-12.ivf" + "av1-1-b10-00-quantizer-12.ivf.md5" + "av1-1-b10-00-quantizer-13.ivf" + "av1-1-b10-00-quantizer-13.ivf.md5" + "av1-1-b10-00-quantizer-14.ivf" + "av1-1-b10-00-quantizer-14.ivf.md5" + "av1-1-b10-00-quantizer-15.ivf" + "av1-1-b10-00-quantizer-15.ivf.md5" + "av1-1-b10-00-quantizer-16.ivf" + "av1-1-b10-00-quantizer-16.ivf.md5" + "av1-1-b10-00-quantizer-17.ivf" + "av1-1-b10-00-quantizer-17.ivf.md5" + "av1-1-b10-00-quantizer-18.ivf" + "av1-1-b10-00-quantizer-18.ivf.md5" + "av1-1-b10-00-quantizer-19.ivf" + "av1-1-b10-00-quantizer-19.ivf.md5" + "av1-1-b10-00-quantizer-20.ivf" + "av1-1-b10-00-quantizer-20.ivf.md5" + "av1-1-b10-00-quantizer-21.ivf" + "av1-1-b10-00-quantizer-21.ivf.md5" + "av1-1-b10-00-quantizer-22.ivf" + "av1-1-b10-00-quantizer-22.ivf.md5" + "av1-1-b10-00-quantizer-23.ivf" + "av1-1-b10-00-quantizer-23.ivf.md5" + "av1-1-b10-00-quantizer-24.ivf" + "av1-1-b10-00-quantizer-24.ivf.md5" + "av1-1-b10-00-quantizer-25.ivf" + "av1-1-b10-00-quantizer-25.ivf.md5" + "av1-1-b10-00-quantizer-26.ivf" + "av1-1-b10-00-quantizer-26.ivf.md5" + "av1-1-b10-00-quantizer-27.ivf" + "av1-1-b10-00-quantizer-27.ivf.md5" + "av1-1-b10-00-quantizer-28.ivf" + "av1-1-b10-00-quantizer-28.ivf.md5" + "av1-1-b10-00-quantizer-29.ivf" + "av1-1-b10-00-quantizer-29.ivf.md5" + "av1-1-b10-00-quantizer-30.ivf" + "av1-1-b10-00-quantizer-30.ivf.md5" + "av1-1-b10-00-quantizer-31.ivf" + "av1-1-b10-00-quantizer-31.ivf.md5" + "av1-1-b10-00-quantizer-32.ivf" + "av1-1-b10-00-quantizer-32.ivf.md5" + "av1-1-b10-00-quantizer-33.ivf" + "av1-1-b10-00-quantizer-33.ivf.md5" + "av1-1-b10-00-quantizer-34.ivf" + "av1-1-b10-00-quantizer-34.ivf.md5" + "av1-1-b10-00-quantizer-35.ivf" + "av1-1-b10-00-quantizer-35.ivf.md5" + "av1-1-b10-00-quantizer-36.ivf" + "av1-1-b10-00-quantizer-36.ivf.md5" + "av1-1-b10-00-quantizer-37.ivf" + "av1-1-b10-00-quantizer-37.ivf.md5" + "av1-1-b10-00-quantizer-38.ivf" + "av1-1-b10-00-quantizer-38.ivf.md5" + "av1-1-b10-00-quantizer-39.ivf" + "av1-1-b10-00-quantizer-39.ivf.md5" + "av1-1-b10-00-quantizer-40.ivf" + "av1-1-b10-00-quantizer-40.ivf.md5" + "av1-1-b10-00-quantizer-41.ivf" + "av1-1-b10-00-quantizer-41.ivf.md5" + "av1-1-b10-00-quantizer-42.ivf" + "av1-1-b10-00-quantizer-42.ivf.md5" + "av1-1-b10-00-quantizer-43.ivf" + "av1-1-b10-00-quantizer-43.ivf.md5" + "av1-1-b10-00-quantizer-44.ivf" + "av1-1-b10-00-quantizer-44.ivf.md5" + "av1-1-b10-00-quantizer-45.ivf" + "av1-1-b10-00-quantizer-45.ivf.md5" + "av1-1-b10-00-quantizer-46.ivf" + "av1-1-b10-00-quantizer-46.ivf.md5" + "av1-1-b10-00-quantizer-47.ivf" + "av1-1-b10-00-quantizer-47.ivf.md5" + "av1-1-b10-00-quantizer-48.ivf" + "av1-1-b10-00-quantizer-48.ivf.md5" + "av1-1-b10-00-quantizer-49.ivf" + "av1-1-b10-00-quantizer-49.ivf.md5" + "av1-1-b10-00-quantizer-50.ivf" + "av1-1-b10-00-quantizer-50.ivf.md5" + "av1-1-b10-00-quantizer-51.ivf" + "av1-1-b10-00-quantizer-51.ivf.md5" + "av1-1-b10-00-quantizer-52.ivf" + "av1-1-b10-00-quantizer-52.ivf.md5" + "av1-1-b10-00-quantizer-53.ivf" + "av1-1-b10-00-quantizer-53.ivf.md5" + "av1-1-b10-00-quantizer-54.ivf" + "av1-1-b10-00-quantizer-54.ivf.md5" + "av1-1-b10-00-quantizer-55.ivf" + "av1-1-b10-00-quantizer-55.ivf.md5" + "av1-1-b10-00-quantizer-56.ivf" + "av1-1-b10-00-quantizer-56.ivf.md5" + "av1-1-b10-00-quantizer-57.ivf" + "av1-1-b10-00-quantizer-57.ivf.md5" + "av1-1-b10-00-quantizer-58.ivf" + "av1-1-b10-00-quantizer-58.ivf.md5" + "av1-1-b10-00-quantizer-59.ivf" + "av1-1-b10-00-quantizer-59.ivf.md5" + "av1-1-b10-00-quantizer-60.ivf" + "av1-1-b10-00-quantizer-60.ivf.md5" + "av1-1-b10-00-quantizer-61.ivf" + "av1-1-b10-00-quantizer-61.ivf.md5" + "av1-1-b10-00-quantizer-62.ivf" + "av1-1-b10-00-quantizer-62.ivf.md5" + "av1-1-b10-00-quantizer-63.ivf" + "av1-1-b10-00-quantizer-63.ivf.md5" "av1-1-b8-01-size-16x16.ivf" "av1-1-b8-01-size-16x16.ivf.md5" "av1-1-b8-01-size-16x18.ivf" @@ -364,7 +492,9 @@ if(CONFIG_AV1_DECODER) "av1-1-b8-01-size-66x64.ivf" "av1-1-b8-01-size-66x64.ivf.md5" "av1-1-b8-01-size-66x66.ivf" - "av1-1-b8-01-size-66x66.ivf.md5") + "av1-1-b8-01-size-66x66.ivf.md5" + "av1-1-b8-02-allintra.ivf" + "av1-1-b8-02-allintra.ivf.md5") endif() if(ENABLE_ENCODE_PERF_TESTS AND CONFIG_AV1_ENCODER) diff --git a/third_party/aom/test/test_vector_test.cc b/third_party/aom/test/test_vector_test.cc index 85223177c..1bfeacba1 100644 --- a/third_party/aom/test/test_vector_test.cc +++ b/third_party/aom/test/test_vector_test.cc @@ -30,8 +30,9 @@ namespace { const int kThreads = 0; const int kFileName = 1; +const int kRowMT = 2; -typedef ::testing::tuple<int, const char *> DecodeParam; +typedef ::testing::tuple<int, const char *, int> DecodeParam; class TestVectorTest : public ::libaom_test::DecoderTest, public ::libaom_test::CodecTestWithParam<DecodeParam> { @@ -48,6 +49,12 @@ class TestVectorTest : public ::libaom_test::DecoderTest, << "Md5 file open failed. Filename: " << md5_file_name_; } + virtual void PreDecodeFrameHook( + const libaom_test::CompressedVideoSource &video, + libaom_test::Decoder *decoder) { + if (video.frame_number() == 0) decoder->Control(AV1D_SET_ROW_MT, row_mt_); + } + virtual void DecompressedFrameHook(const aom_image_t &img, const unsigned int frame_number) { ASSERT_TRUE(md5_file_ != NULL); @@ -60,14 +67,32 @@ class TestVectorTest : public ::libaom_test::DecoderTest, expected_md5[32] = '\0'; ::libaom_test::MD5 md5_res; - md5_res.Add(&img); - const char *actual_md5 = md5_res.Get(); +#if !CONFIG_LOWBITDEPTH + const aom_img_fmt_t shifted_fmt = + (aom_img_fmt)(img.fmt & ~AOM_IMG_FMT_HIGHBITDEPTH); + if (img.bit_depth == 8 && shifted_fmt != img.fmt) { + aom_image_t *img_shifted = + aom_img_alloc(NULL, shifted_fmt, img.d_w, img.d_h, 16); + img_shifted->bit_depth = img.bit_depth; + img_shifted->monochrome = img.monochrome; + aom_img_downshift(img_shifted, &img, 0); + md5_res.Add(img_shifted); + aom_img_free(img_shifted); + } else { +#endif + md5_res.Add(&img); +#if !CONFIG_LOWBITDEPTH + } +#endif + const char *actual_md5 = md5_res.Get(); // Check md5 match. ASSERT_STREQ(expected_md5, actual_md5) << "Md5 checksums don't match: frame number = " << frame_number; } + unsigned int row_mt_; + private: FILE *md5_file_; }; @@ -84,6 +109,7 @@ TEST_P(TestVectorTest, MD5Match) { char str[256]; cfg.threads = ::testing::get<kThreads>(input); + row_mt_ = ::testing::get<kRowMT>(input); snprintf(str, sizeof(str) / sizeof(str[0]) - 1, "file: %s threads: %d", filename.c_str(), cfg.threads); @@ -118,17 +144,14 @@ TEST_P(TestVectorTest, MD5Match) { ASSERT_NO_FATAL_FAILURE(RunLoop(video.get(), cfg)); } -// TODO(yaowu): Current md5 check works only when CONFIG_LOWBITDEPTH is enabled, -// remove CONFIG_LOWBITDEPTH when md5 check is reworked to be compatible with -// CONFIG_LOWBITDEPTH = 0 -#if CONFIG_AV1_DECODER && CONFIG_LOWBITDEPTH +#if CONFIG_AV1_DECODER AV1_INSTANTIATE_TEST_CASE( TestVectorTest, - ::testing::Combine( - ::testing::Values(1), // Single thread. - ::testing::ValuesIn(libaom_test::kAV1TestVectors, - libaom_test::kAV1TestVectors + - libaom_test::kNumAV1TestVectors))); + ::testing::Combine(::testing::Values(1), // Single thread. + ::testing::ValuesIn(libaom_test::kAV1TestVectors, + libaom_test::kAV1TestVectors + + libaom_test::kNumAV1TestVectors), + ::testing::Values(0))); // Test AV1 decode in with different numbers of threads. INSTANTIATE_TEST_CASE_P( @@ -140,7 +163,8 @@ INSTANTIATE_TEST_CASE_P( ::testing::Range(2, 9), // With 2 ~ 8 threads. ::testing::ValuesIn(libaom_test::kAV1TestVectors, libaom_test::kAV1TestVectors + - libaom_test::kNumAV1TestVectors)))); + libaom_test::kNumAV1TestVectors), + ::testing::Range(0, 2)))); #endif // CONFIG_AV1_DECODER diff --git a/third_party/aom/test/test_vectors.cc b/third_party/aom/test/test_vectors.cc index a9edf7520..f478c0183 100644 --- a/third_party/aom/test/test_vectors.cc +++ b/third_party/aom/test/test_vectors.cc @@ -17,88 +17,121 @@ namespace libaom_test { #if CONFIG_AV1_DECODER const char *const kAV1TestVectors[] = { - "av1-1-b8-00-quantizer-00.ivf", "av1-1-b8-00-quantizer-01.ivf", - "av1-1-b8-00-quantizer-02.ivf", "av1-1-b8-00-quantizer-03.ivf", - "av1-1-b8-00-quantizer-04.ivf", "av1-1-b8-00-quantizer-05.ivf", - "av1-1-b8-00-quantizer-06.ivf", "av1-1-b8-00-quantizer-07.ivf", - "av1-1-b8-00-quantizer-08.ivf", "av1-1-b8-00-quantizer-09.ivf", - "av1-1-b8-00-quantizer-10.ivf", "av1-1-b8-00-quantizer-11.ivf", - "av1-1-b8-00-quantizer-12.ivf", "av1-1-b8-00-quantizer-13.ivf", - "av1-1-b8-00-quantizer-14.ivf", "av1-1-b8-00-quantizer-15.ivf", - "av1-1-b8-00-quantizer-16.ivf", "av1-1-b8-00-quantizer-17.ivf", - "av1-1-b8-00-quantizer-18.ivf", "av1-1-b8-00-quantizer-19.ivf", - "av1-1-b8-00-quantizer-20.ivf", "av1-1-b8-00-quantizer-21.ivf", - "av1-1-b8-00-quantizer-22.ivf", "av1-1-b8-00-quantizer-23.ivf", - "av1-1-b8-00-quantizer-24.ivf", "av1-1-b8-00-quantizer-25.ivf", - "av1-1-b8-00-quantizer-26.ivf", "av1-1-b8-00-quantizer-27.ivf", - "av1-1-b8-00-quantizer-28.ivf", "av1-1-b8-00-quantizer-29.ivf", - "av1-1-b8-00-quantizer-30.ivf", "av1-1-b8-00-quantizer-31.ivf", - "av1-1-b8-00-quantizer-32.ivf", "av1-1-b8-00-quantizer-33.ivf", - "av1-1-b8-00-quantizer-34.ivf", "av1-1-b8-00-quantizer-35.ivf", - "av1-1-b8-00-quantizer-36.ivf", "av1-1-b8-00-quantizer-37.ivf", - "av1-1-b8-00-quantizer-38.ivf", "av1-1-b8-00-quantizer-39.ivf", - "av1-1-b8-00-quantizer-40.ivf", "av1-1-b8-00-quantizer-41.ivf", - "av1-1-b8-00-quantizer-42.ivf", "av1-1-b8-00-quantizer-43.ivf", - "av1-1-b8-00-quantizer-44.ivf", "av1-1-b8-00-quantizer-45.ivf", - "av1-1-b8-00-quantizer-46.ivf", "av1-1-b8-00-quantizer-47.ivf", - "av1-1-b8-00-quantizer-48.ivf", "av1-1-b8-00-quantizer-49.ivf", - "av1-1-b8-00-quantizer-50.ivf", "av1-1-b8-00-quantizer-51.ivf", - "av1-1-b8-00-quantizer-52.ivf", "av1-1-b8-00-quantizer-53.ivf", - "av1-1-b8-00-quantizer-54.ivf", "av1-1-b8-00-quantizer-55.ivf", - "av1-1-b8-00-quantizer-56.ivf", "av1-1-b8-00-quantizer-57.ivf", - "av1-1-b8-00-quantizer-58.ivf", "av1-1-b8-00-quantizer-59.ivf", - "av1-1-b8-00-quantizer-60.ivf", "av1-1-b8-00-quantizer-61.ivf", - "av1-1-b8-00-quantizer-62.ivf", "av1-1-b8-00-quantizer-63.ivf", - "av1-1-b8-01-size-16x16.ivf", "av1-1-b8-01-size-16x18.ivf", - "av1-1-b8-01-size-16x32.ivf", "av1-1-b8-01-size-16x34.ivf", - "av1-1-b8-01-size-16x64.ivf", "av1-1-b8-01-size-16x66.ivf", - "av1-1-b8-01-size-18x16.ivf", "av1-1-b8-01-size-18x18.ivf", - "av1-1-b8-01-size-18x32.ivf", "av1-1-b8-01-size-18x34.ivf", - "av1-1-b8-01-size-18x64.ivf", "av1-1-b8-01-size-18x66.ivf", - "av1-1-b8-01-size-196x196.ivf", "av1-1-b8-01-size-196x198.ivf", - "av1-1-b8-01-size-196x200.ivf", "av1-1-b8-01-size-196x202.ivf", - "av1-1-b8-01-size-196x208.ivf", "av1-1-b8-01-size-196x210.ivf", - "av1-1-b8-01-size-196x224.ivf", "av1-1-b8-01-size-196x226.ivf", - "av1-1-b8-01-size-198x196.ivf", "av1-1-b8-01-size-198x198.ivf", - "av1-1-b8-01-size-198x200.ivf", "av1-1-b8-01-size-198x202.ivf", - "av1-1-b8-01-size-198x208.ivf", "av1-1-b8-01-size-198x210.ivf", - "av1-1-b8-01-size-198x224.ivf", "av1-1-b8-01-size-198x226.ivf", - "av1-1-b8-01-size-200x196.ivf", "av1-1-b8-01-size-200x198.ivf", - "av1-1-b8-01-size-200x200.ivf", "av1-1-b8-01-size-200x202.ivf", - "av1-1-b8-01-size-200x208.ivf", "av1-1-b8-01-size-200x210.ivf", - "av1-1-b8-01-size-200x224.ivf", "av1-1-b8-01-size-200x226.ivf", - "av1-1-b8-01-size-202x196.ivf", "av1-1-b8-01-size-202x198.ivf", - "av1-1-b8-01-size-202x200.ivf", "av1-1-b8-01-size-202x202.ivf", - "av1-1-b8-01-size-202x208.ivf", "av1-1-b8-01-size-202x210.ivf", - "av1-1-b8-01-size-202x224.ivf", "av1-1-b8-01-size-202x226.ivf", - "av1-1-b8-01-size-208x196.ivf", "av1-1-b8-01-size-208x198.ivf", - "av1-1-b8-01-size-208x200.ivf", "av1-1-b8-01-size-208x202.ivf", - "av1-1-b8-01-size-208x208.ivf", "av1-1-b8-01-size-208x210.ivf", - "av1-1-b8-01-size-208x224.ivf", "av1-1-b8-01-size-208x226.ivf", - "av1-1-b8-01-size-210x196.ivf", "av1-1-b8-01-size-210x198.ivf", - "av1-1-b8-01-size-210x200.ivf", "av1-1-b8-01-size-210x202.ivf", - "av1-1-b8-01-size-210x208.ivf", "av1-1-b8-01-size-210x210.ivf", - "av1-1-b8-01-size-210x224.ivf", "av1-1-b8-01-size-210x226.ivf", - "av1-1-b8-01-size-224x196.ivf", "av1-1-b8-01-size-224x198.ivf", - "av1-1-b8-01-size-224x200.ivf", "av1-1-b8-01-size-224x202.ivf", - "av1-1-b8-01-size-224x208.ivf", "av1-1-b8-01-size-224x210.ivf", - "av1-1-b8-01-size-224x224.ivf", "av1-1-b8-01-size-224x226.ivf", - "av1-1-b8-01-size-226x196.ivf", "av1-1-b8-01-size-226x198.ivf", - "av1-1-b8-01-size-226x200.ivf", "av1-1-b8-01-size-226x202.ivf", - "av1-1-b8-01-size-226x208.ivf", "av1-1-b8-01-size-226x210.ivf", - "av1-1-b8-01-size-226x224.ivf", "av1-1-b8-01-size-226x226.ivf", - "av1-1-b8-01-size-32x16.ivf", "av1-1-b8-01-size-32x18.ivf", - "av1-1-b8-01-size-32x32.ivf", "av1-1-b8-01-size-32x34.ivf", - "av1-1-b8-01-size-32x64.ivf", "av1-1-b8-01-size-32x66.ivf", - "av1-1-b8-01-size-34x16.ivf", "av1-1-b8-01-size-34x18.ivf", - "av1-1-b8-01-size-34x32.ivf", "av1-1-b8-01-size-34x34.ivf", - "av1-1-b8-01-size-34x64.ivf", "av1-1-b8-01-size-34x66.ivf", - "av1-1-b8-01-size-64x16.ivf", "av1-1-b8-01-size-64x18.ivf", - "av1-1-b8-01-size-64x32.ivf", "av1-1-b8-01-size-64x34.ivf", - "av1-1-b8-01-size-64x64.ivf", "av1-1-b8-01-size-64x66.ivf", - "av1-1-b8-01-size-66x16.ivf", "av1-1-b8-01-size-66x18.ivf", - "av1-1-b8-01-size-66x32.ivf", "av1-1-b8-01-size-66x34.ivf", - "av1-1-b8-01-size-66x64.ivf", "av1-1-b8-01-size-66x66.ivf", + "av1-1-b8-00-quantizer-00.ivf", "av1-1-b8-00-quantizer-01.ivf", + "av1-1-b8-00-quantizer-02.ivf", "av1-1-b8-00-quantizer-03.ivf", + "av1-1-b8-00-quantizer-04.ivf", "av1-1-b8-00-quantizer-05.ivf", + "av1-1-b8-00-quantizer-06.ivf", "av1-1-b8-00-quantizer-07.ivf", + "av1-1-b8-00-quantizer-08.ivf", "av1-1-b8-00-quantizer-09.ivf", + "av1-1-b8-00-quantizer-10.ivf", "av1-1-b8-00-quantizer-11.ivf", + "av1-1-b8-00-quantizer-12.ivf", "av1-1-b8-00-quantizer-13.ivf", + "av1-1-b8-00-quantizer-14.ivf", "av1-1-b8-00-quantizer-15.ivf", + "av1-1-b8-00-quantizer-16.ivf", "av1-1-b8-00-quantizer-17.ivf", + "av1-1-b8-00-quantizer-18.ivf", "av1-1-b8-00-quantizer-19.ivf", + "av1-1-b8-00-quantizer-20.ivf", "av1-1-b8-00-quantizer-21.ivf", + "av1-1-b8-00-quantizer-22.ivf", "av1-1-b8-00-quantizer-23.ivf", + "av1-1-b8-00-quantizer-24.ivf", "av1-1-b8-00-quantizer-25.ivf", + "av1-1-b8-00-quantizer-26.ivf", "av1-1-b8-00-quantizer-27.ivf", + "av1-1-b8-00-quantizer-28.ivf", "av1-1-b8-00-quantizer-29.ivf", + "av1-1-b8-00-quantizer-30.ivf", "av1-1-b8-00-quantizer-31.ivf", + "av1-1-b8-00-quantizer-32.ivf", "av1-1-b8-00-quantizer-33.ivf", + "av1-1-b8-00-quantizer-34.ivf", "av1-1-b8-00-quantizer-35.ivf", + "av1-1-b8-00-quantizer-36.ivf", "av1-1-b8-00-quantizer-37.ivf", + "av1-1-b8-00-quantizer-38.ivf", "av1-1-b8-00-quantizer-39.ivf", + "av1-1-b8-00-quantizer-40.ivf", "av1-1-b8-00-quantizer-41.ivf", + "av1-1-b8-00-quantizer-42.ivf", "av1-1-b8-00-quantizer-43.ivf", + "av1-1-b8-00-quantizer-44.ivf", "av1-1-b8-00-quantizer-45.ivf", + "av1-1-b8-00-quantizer-46.ivf", "av1-1-b8-00-quantizer-47.ivf", + "av1-1-b8-00-quantizer-48.ivf", "av1-1-b8-00-quantizer-49.ivf", + "av1-1-b8-00-quantizer-50.ivf", "av1-1-b8-00-quantizer-51.ivf", + "av1-1-b8-00-quantizer-52.ivf", "av1-1-b8-00-quantizer-53.ivf", + "av1-1-b8-00-quantizer-54.ivf", "av1-1-b8-00-quantizer-55.ivf", + "av1-1-b8-00-quantizer-56.ivf", "av1-1-b8-00-quantizer-57.ivf", + "av1-1-b8-00-quantizer-58.ivf", "av1-1-b8-00-quantizer-59.ivf", + "av1-1-b8-00-quantizer-60.ivf", "av1-1-b8-00-quantizer-61.ivf", + "av1-1-b8-00-quantizer-62.ivf", "av1-1-b8-00-quantizer-63.ivf", + "av1-1-b10-00-quantizer-00.ivf", "av1-1-b10-00-quantizer-01.ivf", + "av1-1-b10-00-quantizer-02.ivf", "av1-1-b10-00-quantizer-03.ivf", + "av1-1-b10-00-quantizer-04.ivf", "av1-1-b10-00-quantizer-05.ivf", + "av1-1-b10-00-quantizer-06.ivf", "av1-1-b10-00-quantizer-07.ivf", + "av1-1-b10-00-quantizer-08.ivf", "av1-1-b10-00-quantizer-09.ivf", + "av1-1-b10-00-quantizer-10.ivf", "av1-1-b10-00-quantizer-11.ivf", + "av1-1-b10-00-quantizer-12.ivf", "av1-1-b10-00-quantizer-13.ivf", + "av1-1-b10-00-quantizer-14.ivf", "av1-1-b10-00-quantizer-15.ivf", + "av1-1-b10-00-quantizer-16.ivf", "av1-1-b10-00-quantizer-17.ivf", + "av1-1-b10-00-quantizer-18.ivf", "av1-1-b10-00-quantizer-19.ivf", + "av1-1-b10-00-quantizer-20.ivf", "av1-1-b10-00-quantizer-21.ivf", + "av1-1-b10-00-quantizer-22.ivf", "av1-1-b10-00-quantizer-23.ivf", + "av1-1-b10-00-quantizer-24.ivf", "av1-1-b10-00-quantizer-25.ivf", + "av1-1-b10-00-quantizer-26.ivf", "av1-1-b10-00-quantizer-27.ivf", + "av1-1-b10-00-quantizer-28.ivf", "av1-1-b10-00-quantizer-29.ivf", + "av1-1-b10-00-quantizer-30.ivf", "av1-1-b10-00-quantizer-31.ivf", + "av1-1-b10-00-quantizer-32.ivf", "av1-1-b10-00-quantizer-33.ivf", + "av1-1-b10-00-quantizer-34.ivf", "av1-1-b10-00-quantizer-35.ivf", + "av1-1-b10-00-quantizer-36.ivf", "av1-1-b10-00-quantizer-37.ivf", + "av1-1-b10-00-quantizer-38.ivf", "av1-1-b10-00-quantizer-39.ivf", + "av1-1-b10-00-quantizer-40.ivf", "av1-1-b10-00-quantizer-41.ivf", + "av1-1-b10-00-quantizer-42.ivf", "av1-1-b10-00-quantizer-43.ivf", + "av1-1-b10-00-quantizer-44.ivf", "av1-1-b10-00-quantizer-45.ivf", + "av1-1-b10-00-quantizer-46.ivf", "av1-1-b10-00-quantizer-47.ivf", + "av1-1-b10-00-quantizer-48.ivf", "av1-1-b10-00-quantizer-49.ivf", + "av1-1-b10-00-quantizer-50.ivf", "av1-1-b10-00-quantizer-51.ivf", + "av1-1-b10-00-quantizer-52.ivf", "av1-1-b10-00-quantizer-53.ivf", + "av1-1-b10-00-quantizer-54.ivf", "av1-1-b10-00-quantizer-55.ivf", + "av1-1-b10-00-quantizer-56.ivf", "av1-1-b10-00-quantizer-57.ivf", + "av1-1-b10-00-quantizer-58.ivf", "av1-1-b10-00-quantizer-59.ivf", + "av1-1-b10-00-quantizer-60.ivf", "av1-1-b10-00-quantizer-61.ivf", + "av1-1-b10-00-quantizer-62.ivf", "av1-1-b10-00-quantizer-63.ivf", + "av1-1-b8-01-size-16x16.ivf", "av1-1-b8-01-size-16x18.ivf", + "av1-1-b8-01-size-16x32.ivf", "av1-1-b8-01-size-16x34.ivf", + "av1-1-b8-01-size-16x64.ivf", "av1-1-b8-01-size-16x66.ivf", + "av1-1-b8-01-size-18x16.ivf", "av1-1-b8-01-size-18x18.ivf", + "av1-1-b8-01-size-18x32.ivf", "av1-1-b8-01-size-18x34.ivf", + "av1-1-b8-01-size-18x64.ivf", "av1-1-b8-01-size-18x66.ivf", + "av1-1-b8-01-size-196x196.ivf", "av1-1-b8-01-size-196x198.ivf", + "av1-1-b8-01-size-196x200.ivf", "av1-1-b8-01-size-196x202.ivf", + "av1-1-b8-01-size-196x208.ivf", "av1-1-b8-01-size-196x210.ivf", + "av1-1-b8-01-size-196x224.ivf", "av1-1-b8-01-size-196x226.ivf", + "av1-1-b8-01-size-198x196.ivf", "av1-1-b8-01-size-198x198.ivf", + "av1-1-b8-01-size-198x200.ivf", "av1-1-b8-01-size-198x202.ivf", + "av1-1-b8-01-size-198x208.ivf", "av1-1-b8-01-size-198x210.ivf", + "av1-1-b8-01-size-198x224.ivf", "av1-1-b8-01-size-198x226.ivf", + "av1-1-b8-01-size-200x196.ivf", "av1-1-b8-01-size-200x198.ivf", + "av1-1-b8-01-size-200x200.ivf", "av1-1-b8-01-size-200x202.ivf", + "av1-1-b8-01-size-200x208.ivf", "av1-1-b8-01-size-200x210.ivf", + "av1-1-b8-01-size-200x224.ivf", "av1-1-b8-01-size-200x226.ivf", + "av1-1-b8-01-size-202x196.ivf", "av1-1-b8-01-size-202x198.ivf", + "av1-1-b8-01-size-202x200.ivf", "av1-1-b8-01-size-202x202.ivf", + "av1-1-b8-01-size-202x208.ivf", "av1-1-b8-01-size-202x210.ivf", + "av1-1-b8-01-size-202x224.ivf", "av1-1-b8-01-size-202x226.ivf", + "av1-1-b8-01-size-208x196.ivf", "av1-1-b8-01-size-208x198.ivf", + "av1-1-b8-01-size-208x200.ivf", "av1-1-b8-01-size-208x202.ivf", + "av1-1-b8-01-size-208x208.ivf", "av1-1-b8-01-size-208x210.ivf", + "av1-1-b8-01-size-208x224.ivf", "av1-1-b8-01-size-208x226.ivf", + "av1-1-b8-01-size-210x196.ivf", "av1-1-b8-01-size-210x198.ivf", + "av1-1-b8-01-size-210x200.ivf", "av1-1-b8-01-size-210x202.ivf", + "av1-1-b8-01-size-210x208.ivf", "av1-1-b8-01-size-210x210.ivf", + "av1-1-b8-01-size-210x224.ivf", "av1-1-b8-01-size-210x226.ivf", + "av1-1-b8-01-size-224x196.ivf", "av1-1-b8-01-size-224x198.ivf", + "av1-1-b8-01-size-224x200.ivf", "av1-1-b8-01-size-224x202.ivf", + "av1-1-b8-01-size-224x208.ivf", "av1-1-b8-01-size-224x210.ivf", + "av1-1-b8-01-size-224x224.ivf", "av1-1-b8-01-size-224x226.ivf", + "av1-1-b8-01-size-226x196.ivf", "av1-1-b8-01-size-226x198.ivf", + "av1-1-b8-01-size-226x200.ivf", "av1-1-b8-01-size-226x202.ivf", + "av1-1-b8-01-size-226x208.ivf", "av1-1-b8-01-size-226x210.ivf", + "av1-1-b8-01-size-226x224.ivf", "av1-1-b8-01-size-226x226.ivf", + "av1-1-b8-01-size-32x16.ivf", "av1-1-b8-01-size-32x18.ivf", + "av1-1-b8-01-size-32x32.ivf", "av1-1-b8-01-size-32x34.ivf", + "av1-1-b8-01-size-32x64.ivf", "av1-1-b8-01-size-32x66.ivf", + "av1-1-b8-01-size-34x16.ivf", "av1-1-b8-01-size-34x18.ivf", + "av1-1-b8-01-size-34x32.ivf", "av1-1-b8-01-size-34x34.ivf", + "av1-1-b8-01-size-34x64.ivf", "av1-1-b8-01-size-34x66.ivf", + "av1-1-b8-01-size-64x16.ivf", "av1-1-b8-01-size-64x18.ivf", + "av1-1-b8-01-size-64x32.ivf", "av1-1-b8-01-size-64x34.ivf", + "av1-1-b8-01-size-64x64.ivf", "av1-1-b8-01-size-64x66.ivf", + "av1-1-b8-01-size-66x16.ivf", "av1-1-b8-01-size-66x18.ivf", + "av1-1-b8-01-size-66x32.ivf", "av1-1-b8-01-size-66x34.ivf", + "av1-1-b8-01-size-66x64.ivf", "av1-1-b8-01-size-66x66.ivf", + "av1-1-b8-02-allintra.ivf", }; const int kNumAV1TestVectors = NELEMENTS(kAV1TestVectors); #endif // CONFIG_AV1_DECODER diff --git a/third_party/aom/test/tile_independence_test.cc b/third_party/aom/test/tile_independence_test.cc index e8b2e1fe4..cf534c0c5 100644 --- a/third_party/aom/test/tile_independence_test.cc +++ b/third_party/aom/test/tile_independence_test.cc @@ -146,25 +146,28 @@ AV1_INSTANTIATE_TEST_CASE(TileIndependenceTestLarge, ::testing::Values(0, 1), class TileIndependenceLSTest : public TileIndependenceTest {}; -TEST_P(TileIndependenceLSTest, DISABLED_MD5Match) { +TEST_P(TileIndependenceLSTest, MD5Match) { cfg_.large_scale_tile = 1; fw_dec_->Control(AV1_SET_TILE_MODE, 1); + fw_dec_->Control(AV1D_EXT_TILE_DEBUG, 1); inv_dec_->Control(AV1_SET_TILE_MODE, 1); + inv_dec_->Control(AV1D_EXT_TILE_DEBUG, 1); DoTest(); } class TileIndependenceLSTestLarge : public TileIndependenceTestLarge {}; -TEST_P(TileIndependenceLSTestLarge, DISABLED_MD5Match) { +TEST_P(TileIndependenceLSTestLarge, MD5Match) { cfg_.large_scale_tile = 1; fw_dec_->Control(AV1_SET_TILE_MODE, 1); + fw_dec_->Control(AV1D_EXT_TILE_DEBUG, 1); inv_dec_->Control(AV1_SET_TILE_MODE, 1); + inv_dec_->Control(AV1D_EXT_TILE_DEBUG, 1); DoTest(); } -AV1_INSTANTIATE_TEST_CASE(TileIndependenceLSTest, ::testing::Values(1, 2, 32), - ::testing::Values(1, 2, 32), ::testing::Values(1)); -AV1_INSTANTIATE_TEST_CASE(TileIndependenceLSTestLarge, - ::testing::Values(1, 2, 32), - ::testing::Values(1, 2, 32), ::testing::Values(1)); +AV1_INSTANTIATE_TEST_CASE(TileIndependenceLSTest, ::testing::Values(6), + ::testing::Values(6), ::testing::Values(1)); +AV1_INSTANTIATE_TEST_CASE(TileIndependenceLSTestLarge, ::testing::Values(6), + ::testing::Values(6), ::testing::Values(1)); } // namespace diff --git a/third_party/aom/test/tools_common.sh b/third_party/aom/test/tools_common.sh index 21a6b9b8e..c08710606 100755 --- a/third_party/aom/test/tools_common.sh +++ b/third_party/aom/test/tools_common.sh @@ -47,26 +47,16 @@ test_end() { # Echoes the target configuration being tested. test_configuration_target() { - aom_config_mk="${LIBAOM_CONFIG_PATH}/config.mk" - # TODO(tomfinegan): Remove the parts requiring config.mk when the configure - # script is removed from the repository. - if [ ! -f "${aom_config_mk}" ]; then - aom_config_c="${LIBAOM_CONFIG_PATH}/aom_config.c" - # Clean up the cfg pointer line from aom_config.c for easier re-use by - # someone examining a failure in the example tests. - # 1. Run grep on aom_config.c for cfg and limit the results to 1. - # 2. Split the line using ' = ' as separator. - # 3. Abuse sed to consume the leading " and trailing "; from the assignment - # to the cfg pointer. - cmake_config=$(awk -F ' = ' '/cfg/ { print $NF; exit }' "${aom_config_c}" \ - | sed -e s/\"// -e s/\"\;//) - echo cmake generated via command: cmake path/to/aom ${cmake_config} - return - fi - # Find the TOOLCHAIN line, split it using ':=' as the field separator, and - # print the last field to get the value. Then pipe the value to tr to consume - # any leading/trailing spaces while allowing tr to echo the output to stdout. - awk -F ':=' '/TOOLCHAIN/ { print $NF }' "${aom_config_mk}" | tr -d ' ' + aom_config_c="${LIBAOM_CONFIG_PATH}/config/aom_config.c" + # Clean up the cfg pointer line from aom_config.c for easier re-use by + # someone examining a failure in the example tests. + # 1. Run grep on aom_config.c for cfg and limit the results to 1. + # 2. Split the line using ' = ' as separator. + # 3. Abuse sed to consume the leading " and trailing "; from the assignment + # to the cfg pointer. + cmake_config=$(awk -F ' = ' '/cfg/ { print $NF; exit }' "${aom_config_c}" \ + | sed -e s/\"// -e s/\"\;//) + echo cmake generated via command: cmake path/to/aom ${cmake_config} } # Trap function used for failure reports and tool output directory removal. @@ -163,10 +153,10 @@ is_windows_target() { # included in $tool_paths, or an empty string. Caller is responsible for testing # the string once the function returns. aom_tool_path() { - local readonly tool_name="$1" - local readonly root_path="${LIBAOM_BIN_PATH}" - local readonly suffix="${AOM_TEST_EXE_SUFFIX}" - local readonly tool_paths="\ + local tool_name="$1" + local root_path="${LIBAOM_BIN_PATH}" + local suffix="${AOM_TEST_EXE_SUFFIX}" + local tool_paths="\ ${root_path}/${tool_name}${suffix} \ ${root_path}/../${tool_name}${suffix} \ ${root_path}/tools/${tool_name}${suffix} \ @@ -348,8 +338,8 @@ yuv_raw_input() { # Do a small encode for testing decoders. encode_yuv_raw_input_av1() { if [ "$(av1_encode_available)" = "yes" ]; then - local readonly output="$1" - local readonly encoder="$(aom_tool_path aomenc)" + local output="$1" + local encoder="$(aom_tool_path aomenc)" shift eval "${encoder}" $(yuv_raw_input) \ $(aomenc_encode_test_fast_params) \ diff --git a/third_party/aom/test/variance_test.cc b/third_party/aom/test/variance_test.cc index eb801b442..2f1b1fc5a 100644 --- a/third_party/aom/test/variance_test.cc +++ b/third_party/aom/test/variance_test.cc @@ -23,6 +23,7 @@ #include "aom/aom_codec.h" #include "aom/aom_integer.h" #include "aom_mem/aom_mem.h" +#include "aom_ports/aom_timer.h" #include "aom_ports/mem.h" namespace { @@ -46,6 +47,10 @@ typedef unsigned int (*JntSubpixAvgVarMxNFunc)( const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b, int b_stride, uint32_t *sse, const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param); +typedef uint32_t (*ObmcSubpelVarFunc)(const uint8_t *pre, int pre_stride, + int xoffset, int yoffset, + const int32_t *wsrc, const int32_t *mask, + unsigned int *sse); using libaom_test::ACMRandom; @@ -269,6 +274,56 @@ static uint32_t jnt_subpel_avg_variance_ref( return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h))); } +static uint32_t obmc_subpel_variance_ref(const uint8_t *pre, int l2w, int l2h, + int xoff, int yoff, + const int32_t *wsrc, + const int32_t *mask, uint32_t *sse_ptr, + bool use_high_bit_depth_, + aom_bit_depth_t bit_depth) { + int64_t se = 0; + uint64_t sse = 0; + const int w = 1 << l2w; + const int h = 1 << l2h; + + xoff <<= 1; + yoff <<= 1; + + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + // Bilinear interpolation at a 16th pel step. + if (!use_high_bit_depth_) { + const int a1 = pre[(w + 1) * (y + 0) + x + 0]; + const int a2 = pre[(w + 1) * (y + 0) + x + 1]; + const int b1 = pre[(w + 1) * (y + 1) + x + 0]; + const int b2 = pre[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + const int diff = ROUND_POWER_OF_TWO_SIGNED( + wsrc[w * y + x] - r * mask[w * y + x], 12); + se += diff; + sse += diff * diff; + } else { + uint16_t *pre16 = CONVERT_TO_SHORTPTR(pre); + const int a1 = pre16[(w + 1) * (y + 0) + x + 0]; + const int a2 = pre16[(w + 1) * (y + 0) + x + 1]; + const int b1 = pre16[(w + 1) * (y + 1) + x + 0]; + const int b2 = pre16[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + const int diff = ROUND_POWER_OF_TWO_SIGNED( + wsrc[w * y + x] - r * mask[w * y + x], 12); + se += diff; + sse += diff * diff; + } + } + } + RoundHighBitDepth(bit_depth, &se, &sse); + *sse_ptr = static_cast<uint32_t>(sse); + return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h))); +} + //////////////////////////////////////////////////////////////////////////////// class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> { @@ -803,12 +858,170 @@ void SubpelVarianceTest<JntSubpixAvgVarMxNFunc>::RefTest() { } } +//////////////////////////////////////////////////////////////////////////////// + +static const int kMaskMax = 64; + +typedef TestParams<ObmcSubpelVarFunc> ObmcSubpelVarianceParams; + +template <typename FunctionType> +class ObmcVarianceTest + : public ::testing::TestWithParam<TestParams<FunctionType> > { + public: + virtual void SetUp() { + params_ = this->GetParam(); + + rnd_.Reset(ACMRandom::DeterministicSeed()); + if (!use_high_bit_depth()) { + pre_ = reinterpret_cast<uint8_t *>( + aom_memalign(32, block_size() + width() + height() + 1)); + } else { + pre_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(aom_memalign( + 32, block_size() + width() + height() + 1 * sizeof(uint16_t)))); + } + wsrc_ = reinterpret_cast<int32_t *>( + aom_memalign(32, block_size() * sizeof(uint32_t))); + mask_ = reinterpret_cast<int32_t *>( + aom_memalign(32, block_size() * sizeof(uint32_t))); + ASSERT_TRUE(pre_ != NULL); + ASSERT_TRUE(wsrc_ != NULL); + ASSERT_TRUE(mask_ != NULL); + } + + virtual void TearDown() { + if (!use_high_bit_depth()) { + aom_free(pre_); + } else { + aom_free(CONVERT_TO_SHORTPTR(pre_)); + } + aom_free(wsrc_); + aom_free(mask_); + libaom_test::ClearSystemState(); + } + + protected: + void RefTest(); + void ExtremeRefTest(); + void SpeedTest(); + + ACMRandom rnd_; + uint8_t *pre_; + int32_t *wsrc_; + int32_t *mask_; + TestParams<FunctionType> params_; + + // some relay helpers + bool use_high_bit_depth() const { return params_.use_high_bit_depth; } + int byte_shift() const { return params_.bit_depth - 8; } + int block_size() const { return params_.block_size; } + int width() const { return params_.width; } + int height() const { return params_.height; } + uint32_t bd_mask() const { return params_.mask; } +}; + +template <> +void ObmcVarianceTest<ObmcSubpelVarFunc>::RefTest() { + for (int x = 0; x < 8; ++x) { + for (int y = 0; y < 8; ++y) { + if (!use_high_bit_depth()) + for (int j = 0; j < block_size() + width() + height() + 1; j++) + pre_[j] = rnd_.Rand8(); + else + for (int j = 0; j < block_size() + width() + height() + 1; j++) + CONVERT_TO_SHORTPTR(pre_)[j] = rnd_.Rand16() & bd_mask(); + for (int j = 0; j < block_size(); j++) { + wsrc_[j] = (rnd_.Rand16() & bd_mask()) * rnd_(kMaskMax * kMaskMax + 1); + mask_[j] = rnd_(kMaskMax * kMaskMax + 1); + } + + uint32_t sse1, sse2; + uint32_t var1, var2; + ASM_REGISTER_STATE_CHECK( + var1 = params_.func(pre_, width() + 1, x, y, wsrc_, mask_, &sse1)); + var2 = obmc_subpel_variance_ref( + pre_, params_.log2width, params_.log2height, x, y, wsrc_, mask_, + &sse2, use_high_bit_depth(), params_.bit_depth); + EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y; + EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y; + } + } +} + +template <> +void ObmcVarianceTest<ObmcSubpelVarFunc>::ExtremeRefTest() { + // Pre: Set the first half of values to the maximum, the second half to 0. + // Mask: same as above + // WSrc: Set the first half of values to 0, the second half to the maximum. + for (int x = 0; x < 8; ++x) { + for (int y = 0; y < 8; ++y) { + const int half = block_size() / 2; + if (!use_high_bit_depth()) { + memset(pre_, 255, half); + memset(pre_ + half, 0, half + width() + height() + 1); + } else { + aom_memset16(CONVERT_TO_SHORTPTR(pre_), bd_mask(), half); + aom_memset16(CONVERT_TO_SHORTPTR(pre_) + half, 0, half); + } + for (int j = 0; j < half; j++) { + wsrc_[j] = bd_mask() * kMaskMax * kMaskMax; + mask_[j] = 0; + } + for (int j = half; j < block_size(); j++) { + wsrc_[j] = 0; + mask_[j] = kMaskMax * kMaskMax; + } + + uint32_t sse1, sse2; + uint32_t var1, var2; + ASM_REGISTER_STATE_CHECK( + var1 = params_.func(pre_, width() + 1, x, y, wsrc_, mask_, &sse1)); + var2 = obmc_subpel_variance_ref( + pre_, params_.log2width, params_.log2height, x, y, wsrc_, mask_, + &sse2, use_high_bit_depth(), params_.bit_depth); + EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y; + EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y; + } + } +} + +template <> +void ObmcVarianceTest<ObmcSubpelVarFunc>::SpeedTest() { + if (!use_high_bit_depth()) + for (int j = 0; j < block_size() + width() + height() + 1; j++) + pre_[j] = rnd_.Rand8(); + else + for (int j = 0; j < block_size() + width() + height() + 1; j++) + CONVERT_TO_SHORTPTR(pre_)[j] = rnd_.Rand16() & bd_mask(); + for (int j = 0; j < block_size(); j++) { + wsrc_[j] = (rnd_.Rand16() & bd_mask()) * rnd_(kMaskMax * kMaskMax + 1); + mask_[j] = rnd_(kMaskMax * kMaskMax + 1); + } + unsigned int sse1; + const int stride = width() + 1; + int run_time = 1000000000 / block_size(); + aom_usec_timer timer; + + aom_usec_timer_start(&timer); + for (int i = 0; i < run_time; ++i) { + int x = rnd_(8); + int y = rnd_(8); + ASM_REGISTER_STATE_CHECK( + params_.func(pre_, stride, x, y, wsrc_, mask_, &sse1)); + } + aom_usec_timer_mark(&timer); + + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + printf("obmc_sub_pixel_variance_%dx%d_%d: %d us\n", width(), height(), + params_.bit_depth, elapsed_time); +} + typedef MainTestClass<Get4x4SseFunc> AvxSseTest; typedef MainTestClass<VarianceMxNFunc> AvxMseTest; typedef MainTestClass<VarianceMxNFunc> AvxVarianceTest; typedef SubpelVarianceTest<SubpixVarMxNFunc> AvxSubpelVarianceTest; typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> AvxSubpelAvgVarianceTest; typedef SubpelVarianceTest<JntSubpixAvgVarMxNFunc> AvxJntSubpelAvgVarianceTest; +typedef ObmcVarianceTest<ObmcSubpelVarFunc> AvxObmcSubpelVarianceTest; TEST_P(AvxSseTest, RefSse) { RefTestSse(); } TEST_P(AvxSseTest, MaxSse) { MaxTestSse(); } @@ -825,6 +1038,9 @@ TEST_P(AvxSubpelVarianceTest, Ref) { RefTest(); } TEST_P(AvxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } TEST_P(AvxSubpelAvgVarianceTest, Ref) { RefTest(); } TEST_P(AvxJntSubpelAvgVarianceTest, Ref) { RefTest(); } +TEST_P(AvxObmcSubpelVarianceTest, Ref) { RefTest(); } +TEST_P(AvxObmcSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } +TEST_P(AvxObmcSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); } INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest, ::testing::Values(aom_get_mb_ss_c)); @@ -934,10 +1150,32 @@ INSTANTIATE_TEST_CASE_P( JntSubpelAvgVarianceParams(2, 2, &aom_jnt_sub_pixel_avg_variance4x4_c, 0))); +INSTANTIATE_TEST_CASE_P( + C, AvxObmcSubpelVarianceTest, + ::testing::Values( + ObmcSubpelVarianceParams(7, 7, &aom_obmc_sub_pixel_variance128x128_c, + 0), + ObmcSubpelVarianceParams(7, 6, &aom_obmc_sub_pixel_variance128x64_c, 0), + ObmcSubpelVarianceParams(6, 7, &aom_obmc_sub_pixel_variance64x128_c, 0), + ObmcSubpelVarianceParams(6, 6, &aom_obmc_sub_pixel_variance64x64_c, 0), + ObmcSubpelVarianceParams(6, 5, &aom_obmc_sub_pixel_variance64x32_c, 0), + ObmcSubpelVarianceParams(5, 6, &aom_obmc_sub_pixel_variance32x64_c, 0), + ObmcSubpelVarianceParams(5, 5, &aom_obmc_sub_pixel_variance32x32_c, 0), + ObmcSubpelVarianceParams(5, 4, &aom_obmc_sub_pixel_variance32x16_c, 0), + ObmcSubpelVarianceParams(4, 5, &aom_obmc_sub_pixel_variance16x32_c, 0), + ObmcSubpelVarianceParams(4, 4, &aom_obmc_sub_pixel_variance16x16_c, 0), + ObmcSubpelVarianceParams(4, 3, &aom_obmc_sub_pixel_variance16x8_c, 0), + ObmcSubpelVarianceParams(3, 4, &aom_obmc_sub_pixel_variance8x16_c, 0), + ObmcSubpelVarianceParams(3, 3, &aom_obmc_sub_pixel_variance8x8_c, 0), + ObmcSubpelVarianceParams(3, 2, &aom_obmc_sub_pixel_variance8x4_c, 0), + ObmcSubpelVarianceParams(2, 3, &aom_obmc_sub_pixel_variance4x8_c, 0), + ObmcSubpelVarianceParams(2, 2, &aom_obmc_sub_pixel_variance4x4_c, 0))); + typedef MainTestClass<VarianceMxNFunc> AvxHBDMseTest; typedef MainTestClass<VarianceMxNFunc> AvxHBDVarianceTest; typedef SubpelVarianceTest<SubpixVarMxNFunc> AvxHBDSubpelVarianceTest; typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> AvxHBDSubpelAvgVarianceTest; +typedef ObmcVarianceTest<ObmcSubpelVarFunc> AvxHBDObmcSubpelVarianceTest; TEST_P(AvxHBDMseTest, RefMse) { RefTestMse(); } TEST_P(AvxHBDMseTest, MaxMse) { MaxTestMse(); } @@ -1161,6 +1399,94 @@ const SubpelAvgVarianceParams kArrayHBDSubpelAvgVariance_c[] = { INSTANTIATE_TEST_CASE_P(C, AvxHBDSubpelAvgVarianceTest, ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_c)); +const ObmcSubpelVarianceParams kArrayHBDObmcSubpelVariance_c[] = { + ObmcSubpelVarianceParams(7, 7, &aom_highbd_obmc_sub_pixel_variance128x128_c, + 8), + ObmcSubpelVarianceParams(7, 6, &aom_highbd_obmc_sub_pixel_variance128x64_c, + 8), + ObmcSubpelVarianceParams(6, 7, &aom_highbd_obmc_sub_pixel_variance64x128_c, + 8), + ObmcSubpelVarianceParams(6, 6, &aom_highbd_obmc_sub_pixel_variance64x64_c, 8), + ObmcSubpelVarianceParams(6, 5, &aom_highbd_obmc_sub_pixel_variance64x32_c, 8), + ObmcSubpelVarianceParams(5, 6, &aom_highbd_obmc_sub_pixel_variance32x64_c, 8), + ObmcSubpelVarianceParams(5, 5, &aom_highbd_obmc_sub_pixel_variance32x32_c, 8), + ObmcSubpelVarianceParams(5, 4, &aom_highbd_obmc_sub_pixel_variance32x16_c, 8), + ObmcSubpelVarianceParams(4, 5, &aom_highbd_obmc_sub_pixel_variance16x32_c, 8), + ObmcSubpelVarianceParams(4, 4, &aom_highbd_obmc_sub_pixel_variance16x16_c, 8), + ObmcSubpelVarianceParams(4, 3, &aom_highbd_obmc_sub_pixel_variance16x8_c, 8), + ObmcSubpelVarianceParams(3, 4, &aom_highbd_obmc_sub_pixel_variance8x16_c, 8), + ObmcSubpelVarianceParams(3, 3, &aom_highbd_obmc_sub_pixel_variance8x8_c, 8), + ObmcSubpelVarianceParams(3, 2, &aom_highbd_obmc_sub_pixel_variance8x4_c, 8), + ObmcSubpelVarianceParams(2, 3, &aom_highbd_obmc_sub_pixel_variance4x8_c, 8), + ObmcSubpelVarianceParams(2, 2, &aom_highbd_obmc_sub_pixel_variance4x4_c, 8), + ObmcSubpelVarianceParams(7, 7, + &aom_highbd_10_obmc_sub_pixel_variance128x128_c, 10), + ObmcSubpelVarianceParams(7, 6, &aom_highbd_10_obmc_sub_pixel_variance128x64_c, + 10), + ObmcSubpelVarianceParams(6, 7, &aom_highbd_10_obmc_sub_pixel_variance64x128_c, + 10), + ObmcSubpelVarianceParams(6, 6, &aom_highbd_10_obmc_sub_pixel_variance64x64_c, + 10), + ObmcSubpelVarianceParams(6, 5, &aom_highbd_10_obmc_sub_pixel_variance64x32_c, + 10), + ObmcSubpelVarianceParams(5, 6, &aom_highbd_10_obmc_sub_pixel_variance32x64_c, + 10), + ObmcSubpelVarianceParams(5, 5, &aom_highbd_10_obmc_sub_pixel_variance32x32_c, + 10), + ObmcSubpelVarianceParams(5, 4, &aom_highbd_10_obmc_sub_pixel_variance32x16_c, + 10), + ObmcSubpelVarianceParams(4, 5, &aom_highbd_10_obmc_sub_pixel_variance16x32_c, + 10), + ObmcSubpelVarianceParams(4, 4, &aom_highbd_10_obmc_sub_pixel_variance16x16_c, + 10), + ObmcSubpelVarianceParams(4, 3, &aom_highbd_10_obmc_sub_pixel_variance16x8_c, + 10), + ObmcSubpelVarianceParams(3, 4, &aom_highbd_10_obmc_sub_pixel_variance8x16_c, + 10), + ObmcSubpelVarianceParams(3, 3, &aom_highbd_10_obmc_sub_pixel_variance8x8_c, + 10), + ObmcSubpelVarianceParams(3, 2, &aom_highbd_10_obmc_sub_pixel_variance8x4_c, + 10), + ObmcSubpelVarianceParams(2, 3, &aom_highbd_10_obmc_sub_pixel_variance4x8_c, + 10), + ObmcSubpelVarianceParams(2, 2, &aom_highbd_10_obmc_sub_pixel_variance4x4_c, + 10), + ObmcSubpelVarianceParams(7, 7, + &aom_highbd_12_obmc_sub_pixel_variance128x128_c, 12), + ObmcSubpelVarianceParams(7, 6, &aom_highbd_12_obmc_sub_pixel_variance128x64_c, + 12), + ObmcSubpelVarianceParams(6, 7, &aom_highbd_12_obmc_sub_pixel_variance64x128_c, + 12), + ObmcSubpelVarianceParams(6, 6, &aom_highbd_12_obmc_sub_pixel_variance64x64_c, + 12), + ObmcSubpelVarianceParams(6, 5, &aom_highbd_12_obmc_sub_pixel_variance64x32_c, + 12), + ObmcSubpelVarianceParams(5, 6, &aom_highbd_12_obmc_sub_pixel_variance32x64_c, + 12), + ObmcSubpelVarianceParams(5, 5, &aom_highbd_12_obmc_sub_pixel_variance32x32_c, + 12), + ObmcSubpelVarianceParams(5, 4, &aom_highbd_12_obmc_sub_pixel_variance32x16_c, + 12), + ObmcSubpelVarianceParams(4, 5, &aom_highbd_12_obmc_sub_pixel_variance16x32_c, + 12), + ObmcSubpelVarianceParams(4, 4, &aom_highbd_12_obmc_sub_pixel_variance16x16_c, + 12), + ObmcSubpelVarianceParams(4, 3, &aom_highbd_12_obmc_sub_pixel_variance16x8_c, + 12), + ObmcSubpelVarianceParams(3, 4, &aom_highbd_12_obmc_sub_pixel_variance8x16_c, + 12), + ObmcSubpelVarianceParams(3, 3, &aom_highbd_12_obmc_sub_pixel_variance8x8_c, + 12), + ObmcSubpelVarianceParams(3, 2, &aom_highbd_12_obmc_sub_pixel_variance8x4_c, + 12), + ObmcSubpelVarianceParams(2, 3, &aom_highbd_12_obmc_sub_pixel_variance4x8_c, + 12), + ObmcSubpelVarianceParams(2, 2, &aom_highbd_12_obmc_sub_pixel_variance4x4_c, + 12) +}; +INSTANTIATE_TEST_CASE_P(C, AvxHBDObmcSubpelVarianceTest, + ::testing::ValuesIn(kArrayHBDObmcSubpelVariance_c)); + #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest, ::testing::Values(aom_get_mb_ss_sse2)); @@ -1519,6 +1845,44 @@ INSTANTIATE_TEST_CASE_P( 0))); #endif // HAVE_SSSE3 +#if HAVE_SSE4_1 +INSTANTIATE_TEST_CASE_P( + SSE4_1, AvxObmcSubpelVarianceTest, + ::testing::Values( + ObmcSubpelVarianceParams(7, 7, + &aom_obmc_sub_pixel_variance128x128_sse4_1, 0), + ObmcSubpelVarianceParams(7, 6, + &aom_obmc_sub_pixel_variance128x64_sse4_1, 0), + ObmcSubpelVarianceParams(6, 7, + &aom_obmc_sub_pixel_variance64x128_sse4_1, 0), + ObmcSubpelVarianceParams(6, 6, &aom_obmc_sub_pixel_variance64x64_sse4_1, + 0), + ObmcSubpelVarianceParams(6, 5, &aom_obmc_sub_pixel_variance64x32_sse4_1, + 0), + ObmcSubpelVarianceParams(5, 6, &aom_obmc_sub_pixel_variance32x64_sse4_1, + 0), + ObmcSubpelVarianceParams(5, 5, &aom_obmc_sub_pixel_variance32x32_sse4_1, + 0), + ObmcSubpelVarianceParams(5, 4, &aom_obmc_sub_pixel_variance32x16_sse4_1, + 0), + ObmcSubpelVarianceParams(4, 5, &aom_obmc_sub_pixel_variance16x32_sse4_1, + 0), + ObmcSubpelVarianceParams(4, 4, &aom_obmc_sub_pixel_variance16x16_sse4_1, + 0), + ObmcSubpelVarianceParams(4, 3, &aom_obmc_sub_pixel_variance16x8_sse4_1, + 0), + ObmcSubpelVarianceParams(3, 4, &aom_obmc_sub_pixel_variance8x16_sse4_1, + 0), + ObmcSubpelVarianceParams(3, 3, &aom_obmc_sub_pixel_variance8x8_sse4_1, + 0), + ObmcSubpelVarianceParams(3, 2, &aom_obmc_sub_pixel_variance8x4_sse4_1, + 0), + ObmcSubpelVarianceParams(2, 3, &aom_obmc_sub_pixel_variance4x8_sse4_1, + 0), + ObmcSubpelVarianceParams(2, 2, &aom_obmc_sub_pixel_variance4x4_sse4_1, + 0))); +#endif // HAVE_SSE4_1 + #if HAVE_AVX2 INSTANTIATE_TEST_CASE_P(AVX2, AvxMseTest, ::testing::Values(MseParams(4, 4, &aom_mse16x16_avx2))); |