author    | trav90 <travawine@palemoon.org> | 2018-10-19 21:52:15 -0500
committer | trav90 <travawine@palemoon.org> | 2018-10-19 21:52:20 -0500
commit    | bbcc64772580c8a979288791afa02d30bc476d2e (patch)
tree      | 437ce94c3fdd7497508e5b55de06c6d011678597 /third_party/aom/av1/encoder
parent    | 14805f6ddbfb173c327768fff9f81f40ce5e81b0 (diff)
Update aom to v1.0.0
Update aom to commit id d14c5bb4f336ef1842046089849dee4a301fbbf0.
Diffstat (limited to 'third_party/aom/av1/encoder')
123 files changed, 32091 insertions, 46326 deletions
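The new header shown below is pure data: weight and bias tables for small fully-connected networks (FEATURE_SIZE = 10 input features, one 64-node hidden layer, LABEL_SIZE = 16 output labels) that the encoder queries to prune AB partition candidates at 128x128, 64x64, and 32x32. Each block of tables is tied together by an NN_CONFIG struct and consumed through the machinery in av1/encoder/ml.h as a single-hidden-layer forward pass. As a reading aid, here is a minimal sketch of that pass; nn_forward_1hidden is a hypothetical name, and the node-major weight layout and ReLU hidden activation are assumptions based on how av1_nn_predict() in av1/encoder/ml.c walks such tables, not something this patch itself spells out:

/* Illustrative sketch only -- not part of the patch. One hidden layer
 * with ReLU, matching the NN_CONFIG shapes declared in the header:
 * num_in = FEATURE_SIZE (10), num_hid = 64, num_out = LABEL_SIZE (16).
 * Assumes hidden node j reads its weights from
 * w0[j * num_in] .. w0[j * num_in + num_in - 1] (node-major). */
static void nn_forward_1hidden(const float *x, int num_in,
                               const float *w0, const float *b0, int num_hid,
                               const float *w1, const float *b1, int num_out,
                               float *out) {
  float hidden[64]; /* num_hid is 64 for every table in this file */
  for (int j = 0; j < num_hid; ++j) {
    float acc = b0[j];
    for (int i = 0; i < num_in; ++i) acc += w0[j * num_in + i] * x[i];
    hidden[j] = acc > 0.0f ? acc : 0.0f; /* ReLU on the hidden layer */
  }
  for (int k = 0; k < num_out; ++k) {
    float acc = b1[k];
    for (int j = 0; j < num_hid; ++j) acc += w1[k * num_hid + j] * hidden[j];
    out[k] = acc; /* raw per-label scores; the caller ranks or thresholds
                     them to skip unlikely AB partition shapes */
  }
}

Called with av1_ab_partition_nn_weights_128_layer0/1 and the matching bias tables, this would yield one score per AB partition candidate of a 128x128 block; the 64x64 and 32x32 configs below work the same way.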
diff --git a/third_party/aom/av1/encoder/ab_partition_model_weights.h b/third_party/aom/av1/encoder/ab_partition_model_weights.h
new file mode 100644
index 000000000..5b918fae2
--- /dev/null
+++ b/third_party/aom/av1/encoder/ab_partition_model_weights.h
@@ -0,0 +1,1318 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_AB_PARTITION_MODEL_WEIGHTS_H_
+#define AV1_ENCODER_AB_PARTITION_MODEL_WEIGHTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+#define FEATURE_SIZE 10
+#define LABEL_SIZE 16
+// nn model for ab partition pruning, 128x128.
+static const float av1_ab_partition_nn_weights_128_layer0[FEATURE_SIZE * 64] = {
+  -0.715251f, -0.015767f, -0.667353f, -0.345255f, 0.177887f, -0.469759f,
+  0.426152f, 0.489798f, 0.469865f, 0.773821f, 0.088517f, 0.074585f,
+  0.838754f, 0.048449f, -0.007584f, 0.638968f, 0.233305f, -0.319236f,
+  -0.257124f, -0.170869f, 0.137180f, 0.114852f, -0.721241f, -0.947962f,
+  -0.411298f, 0.494306f, -0.060435f, -0.648421f, -0.126624f, 0.072686f,
+  -0.143904f, -0.115839f, -0.175527f, -0.117728f, 0.040686f, -0.189925f,
+  0.134361f, -0.258070f, -0.177558f, 0.158049f, 0.168668f, -0.062919f,
+  0.341986f, 0.038100f, -0.435577f, -0.321255f, 0.203213f, 0.213061f,
+  0.533304f, 0.359296f, -0.079558f, 0.004637f, 0.663904f, 0.043779f,
+  0.383018f, 1.136559f, -0.084155f, 0.333057f, -0.199011f, 0.152059f,
+  -0.078419f, -0.167752f, -0.093651f, 0.083171f, -0.190143f, 0.086195f,
+  -0.280632f, -0.160663f, -0.017298f, 0.122628f, -0.138116f, 0.062927f,
+  0.222462f, 0.626979f, 0.426928f, 0.117170f, -0.240457f, 0.053750f,
+  0.038017f, 0.007359f, -0.017595f, 0.101407f, 0.332891f, 0.074933f,
+  0.306498f, 0.219380f, -0.151638f, -0.247976f, 0.343405f, 0.121256f,
+  0.049173f, 0.171474f, -0.139608f, -1.016599f, -0.345553f, -0.901138f,
+  0.243401f, 0.059928f, -0.089396f, -0.195565f, 0.364705f, -0.020400f,
+  -1.383672f, 0.413018f, 0.536950f, -0.020904f, -1.335306f, -0.732290f,
+  0.102885f, 0.315290f, -0.208521f, -0.081811f, 0.182300f, 0.125712f,
+  -0.593833f, -0.220639f, -0.314155f, 0.188327f, 0.118503f, 0.524427f,
+  -1.083859f, -1.130640f, 0.390352f, -0.045591f, 0.113160f, -0.009149f,
+  -0.096183f, 0.115829f, 0.377752f, 0.318396f, -0.591983f, 0.004797f,
+  -0.497377f, -0.342248f, 0.079546f, -0.025249f, -0.295972f, 0.615501f,
+  -0.464372f, 0.418315f, -0.173556f, 0.105217f, 0.298073f, 0.082478f,
+  0.033223f, 0.977341f, -0.372982f, -0.052337f, 0.154124f, 0.396787f,
+  0.536654f, -0.139061f, -0.223702f, 0.229666f, -0.846766f, 0.107723f,
+  0.563839f, -0.483141f, 0.304813f, -0.765283f, 0.070964f, 0.151101f,
+  0.275188f, 0.490303f, 1.175892f, 0.085377f, -0.191200f, 0.544532f,
+  -0.365075f, 0.167546f, 0.052183f, -0.220529f, -0.212227f, -0.144988f,
+  -0.273356f, -0.062023f, 0.103993f, -0.238493f, -0.161204f, -0.054611f,
+  -0.166672f, 0.128327f, 0.461751f, -0.545822f, 0.739798f, 0.594386f,
+  -0.163192f, -0.332501f, 0.363834f, -0.065043f, 0.474812f, -0.138811f,
+  0.170924f, -0.778142f, -0.316474f, -0.508065f, -0.039986f, -0.478001f,
+  0.340591f, 0.041783f, 0.055419f,
0.015155f, -0.981830f, -1.355237f, + 0.347516f, 1.155327f, 0.081319f, 0.274163f, -0.327230f, -0.113478f, + 0.556552f, -0.055986f, 0.217318f, -0.445351f, 0.325759f, 0.526547f, + -0.657434f, -0.572214f, -0.037087f, 0.081384f, 0.064518f, 0.014892f, + 0.215279f, 1.834504f, -0.242107f, 0.079810f, 0.129558f, 0.079588f, + -0.035189f, -0.221745f, -0.163414f, 0.043978f, -1.028662f, -0.623609f, + 1.130336f, 0.664661f, -0.063975f, -0.415863f, 0.018581f, 0.157758f, + 0.200570f, 0.063420f, 0.901039f, -0.746286f, 0.196230f, -0.290592f, + 0.042373f, -0.502500f, 0.183638f, 0.103394f, -0.298858f, 0.145436f, + 0.196916f, 0.108319f, -0.448572f, -0.881385f, 0.302497f, 0.121679f, + -0.021327f, 0.025150f, 0.481306f, -0.359634f, 0.350257f, -0.228647f, + -0.669860f, 0.260025f, -0.034182f, 0.619247f, -0.158826f, -0.405864f, + 0.674112f, -0.027885f, -0.325274f, -0.241492f, 0.036024f, -0.437685f, + -0.091458f, -0.109295f, -0.350676f, 0.044706f, 0.297059f, 0.016290f, + 1.121203f, 1.289062f, -1.299476f, -1.129221f, 0.103752f, 0.131302f, + -0.263265f, 0.222155f, -0.229908f, 0.013922f, -0.226001f, -0.248383f, + -0.004415f, -0.020958f, 0.055634f, 0.086200f, 0.114556f, -0.184061f, + -0.096210f, -0.146466f, -0.249618f, -0.195998f, 0.088758f, 0.023781f, + -0.264460f, 0.157026f, -0.235228f, -0.102564f, 0.043463f, -0.187823f, + -0.257500f, -0.199049f, -0.242210f, 0.030448f, 0.221604f, 0.151804f, + -0.100404f, -0.073931f, 0.144749f, -0.001572f, -1.438079f, -0.233716f, + 0.733422f, 1.727080f, -0.036397f, 0.027551f, 0.425321f, 0.085703f, + 0.031186f, 0.032333f, -0.675130f, 1.437733f, -0.202392f, -0.525003f, + 0.087048f, 0.328194f, -0.079989f, -0.391088f, -0.238732f, -0.120660f, + -0.139600f, 0.154665f, 0.026202f, -0.233501f, -0.009046f, -0.149187f, + -0.199646f, 0.115375f, 0.209762f, -0.014875f, 0.124038f, -0.119985f, + 1.079625f, -0.461513f, 0.614114f, 0.021003f, 0.439449f, -0.824834f, + -0.299701f, 0.193817f, -0.870551f, -1.262313f, -0.079517f, 0.341570f, + 0.305310f, -0.089721f, -0.317314f, -0.075631f, 0.127172f, -0.208635f, + 1.191922f, 0.163141f, 0.564285f, 0.286352f, 0.480865f, 0.173094f, + -0.094034f, -0.071339f, -0.328992f, -0.006382f, 0.314705f, 0.090258f, + -0.016099f, 0.193230f, 0.188061f, 0.398144f, 0.722781f, 0.769949f, + 0.025442f, -0.162016f, 0.070192f, -0.056946f, -0.100957f, -0.219934f, + -0.203492f, -0.015454f, -0.013272f, -0.098008f, 0.051707f, -0.017493f, + 0.527446f, 0.083605f, 0.588318f, 0.878215f, 0.028747f, -0.146479f, + -0.345170f, -0.136059f, -0.152005f, -0.203634f, 0.232702f, -0.101340f, + -0.027733f, -0.282611f, 0.265366f, 0.082362f, -0.265420f, -0.131124f, + 0.166303f, 0.040194f, -0.100710f, 0.579151f, -0.530136f, 0.163422f, + -0.998821f, -1.565311f, -1.774785f, -2.493372f, 0.116970f, -0.090302f, + 1.723272f, 0.552370f, -0.295954f, -0.439095f, -0.266730f, 0.027936f, + 0.539616f, -0.234902f, -0.167601f, -0.149877f, -0.242983f, 0.122353f, + -0.121620f, -0.205517f, -0.180144f, -0.264208f, 0.151500f, -0.159378f, + 0.029145f, -0.050892f, -0.223407f, -0.246239f, 0.043152f, -0.018460f, + 0.169972f, -0.187769f, -0.034670f, -0.238330f, 0.288070f, -0.093243f, + -0.437105f, -0.573376f, 0.660073f, 0.285727f, 0.408470f, 0.158475f, + 0.032699f, 0.056280f, -0.237176f, -0.083003f, 0.105598f, -0.169522f, + -0.260420f, -0.121100f, -0.173983f, -0.195693f, -0.232028f, 0.224940f, + 0.029124f, 0.009580f, -0.252034f, 0.103087f, 1.156561f, 0.603848f, + -0.562805f, -1.652742f, -0.568288f, -1.829395f, 0.046169f, 0.076095f, + 1.490819f, 0.415893f, -0.277788f, -0.115787f, 0.093750f, 0.270726f, + -0.395983f, -0.353742f, 
0.034605f, 0.005342f, 0.184537f, 0.086445f, + 0.156417f, 1.476367f, 0.122587f, 0.002145f, 0.431057f, -0.381184f, + -1.646457f, -0.014009f, -0.671224f, 0.193726f, -0.019247f, -0.031267f, + -0.046208f, 0.298733f, 0.064734f, 0.616984f, 0.039381f, 0.182722f, + -0.116670f, 0.233093f, -1.214374f, -0.817970f, -0.064394f, -0.584783f, + 0.077697f, -0.266720f, 0.130875f, -0.235295f, -0.265754f, -0.159999f, + -0.250114f, -0.183017f, 0.194403f, -0.105808f, -0.169215f, -0.240866f, + -0.026662f, -0.045123f, -0.036175f, -0.167471f, -0.192908f, -0.232602f, + -0.267036f, -0.112500f, -0.257944f, -0.111909f, -0.802226f, -0.008800f, + 0.881460f, -0.678603f, 0.008666f, -0.252053f, -0.341035f, -0.175290f, + 0.183012f, 0.385991f, 0.079888f, -0.014039f, -0.148653f, 0.671778f, + -0.130219f, 1.086467f, 0.129267f, -0.040400f, -0.201221f, -0.077005f, + 0.015890f, 0.000781f, 0.137764f, 1.389546f, 0.172152f, 0.047279f, + -0.042783f, 0.127740f, 0.141467f, -0.335738f, -1.396392f, 0.031496f, + 0.357385f, 0.343602f, -0.714553f, 0.311014f, 0.132845f, 0.061149f, + 0.006796f, 0.568106f, -0.255949f, 0.104134f, -0.993447f, 0.298135f, + -0.406590f, -0.049228f, -0.578570f, -0.188561f, -0.107046f, 0.374095f, + 0.068481f, 0.036240f, -0.495801f, 0.180574f, -0.766129f, 0.886967f, + -0.568868f, -0.936062f, -0.418886f, -0.058735f, -0.511964f, -0.438596f, + 0.019016f, -0.015837f, 0.600197f, 0.429773f, 0.315026f, 0.319667f, + 0.214617f, -0.017316f, 0.270257f, -0.040524f, 0.695803f, -0.015223f, + -1.554965f, 0.356997f, -1.472428f, 0.024637f, -0.562958f, 0.870351f, + 0.193635f, 0.036063f, 0.328638f, 0.200274f, -1.634707f, 0.110534f, + 0.420104f, -0.072042f, -0.006404f, 0.171680f, +}; + +static const float av1_ab_partition_nn_bias_128_layer0[64] = { + 0.643147f, -1.348826f, 0.431627f, 0.000000f, 0.102717f, -0.772628f, + -0.034351f, -0.761977f, -0.638397f, 0.541969f, -0.391311f, 0.563076f, + 0.148553f, 0.267217f, -0.788092f, 0.544573f, -0.546280f, 0.000000f, + -0.446945f, 0.127732f, 0.270624f, -0.219435f, -1.220203f, 0.324584f, + 0.110885f, 0.276547f, 0.179726f, -0.375160f, 0.026401f, -0.032595f, + 0.000000f, -0.047932f, -0.648602f, -0.512637f, -0.031661f, -0.236761f, + 0.476453f, -0.028021f, -0.013673f, -0.015578f, -0.920077f, 0.000000f, + 0.915351f, -0.209962f, 0.000000f, -0.025731f, 0.218288f, 0.000000f, + 0.047726f, -0.813077f, -1.263281f, 0.239087f, 0.278614f, -0.030753f, + 0.000000f, 0.346744f, -0.948543f, -1.174211f, 0.216377f, 0.498913f, + 0.853918f, 0.002504f, -0.190403f, 0.452050f, +}; + +static const float av1_ab_partition_nn_weights_128_layer1[64 * LABEL_SIZE] = { + 0.179769f, 1.499417f, -0.445135f, -0.142278f, -0.337661f, 0.682064f, + -0.203213f, 0.302171f, 0.226877f, -0.422169f, 1.687586f, 0.783773f, + 0.220995f, 0.253482f, 0.370435f, -1.342775f, 0.337229f, -0.271473f, + 0.291796f, 1.362227f, -1.751397f, -0.086178f, 0.725496f, -0.118597f, + 0.227963f, -0.501577f, 0.223849f, -0.122421f, -0.123437f, -0.051045f, + -0.020115f, 0.212711f, 0.246025f, 0.088120f, -0.168995f, 1.740190f, + -0.195098f, 0.680339f, -0.589572f, -0.075244f, 0.878766f, 0.064092f, + -3.548527f, 0.001660f, 0.107926f, -0.169501f, -0.455212f, 0.123045f, + -1.836998f, 0.330365f, 1.301475f, 0.454761f, -0.576552f, -0.190761f, + 0.208459f, 0.618483f, 1.383364f, 0.970718f, 0.390174f, 0.406252f, + -0.564519f, -0.312062f, 1.345712f, -0.151873f, 0.109290f, 0.408847f, + 0.391243f, 0.152024f, 0.181764f, -0.036263f, -0.160466f, 0.153595f, + 0.049163f, -0.753012f, -1.804062f, 0.347475f, -2.746580f, 0.575618f, + 0.261799f, 0.210505f, -0.302054f, -0.109872f, 0.199506f, 
-1.182971f, + 0.723668f, 0.177758f, -0.338202f, 0.254396f, -0.220023f, 0.043504f, + 0.669866f, -0.040816f, -0.402730f, 0.017990f, 0.215523f, -0.216816f, + 0.454826f, -0.726067f, -0.018750f, -0.928679f, 0.154315f, -0.465641f, + 0.144566f, -0.030064f, -0.054667f, -0.154055f, 0.625384f, 1.323795f, + -0.159496f, 0.097072f, -0.463197f, -0.057938f, 0.750290f, -0.233061f, + 0.412631f, -0.535223f, -0.151423f, -0.154583f, 0.024721f, -0.494448f, + 0.230594f, -0.980138f, -0.653968f, 0.126079f, 0.051814f, -0.053219f, + -0.421708f, -0.228853f, 0.237885f, 0.888157f, 0.059655f, 0.241295f, + 0.210443f, 0.228238f, 0.119127f, -0.051989f, -0.355408f, 0.182215f, + 0.244277f, -0.104577f, -0.558035f, -0.023270f, 0.054571f, 0.700646f, + -0.223006f, 0.115523f, 0.023391f, 0.437264f, 0.709477f, -0.531212f, + -0.094731f, 0.328161f, -0.105418f, -0.133511f, 0.497168f, -0.030948f, + -0.407132f, -0.043943f, 0.155505f, 0.251945f, 0.205010f, 0.167160f, + 0.083654f, -0.636810f, 0.401315f, -0.398414f, 0.290046f, 0.206846f, + 0.042218f, 0.168150f, 0.843181f, -0.671242f, -0.202392f, -0.073301f, + 0.142895f, 0.237466f, 0.212145f, -0.091828f, 0.187038f, -0.720841f, + -0.616069f, -0.238021f, 0.065365f, 0.434119f, 0.179023f, -0.040107f, + -0.430734f, -0.297368f, 0.575954f, 0.382619f, -0.709787f, -0.320810f, + 0.242342f, -0.047614f, 0.705216f, 0.098077f, 0.357179f, 0.046017f, + 0.115074f, -0.412305f, -0.272304f, 0.048096f, -0.803811f, 0.275000f, + 0.642198f, 0.180286f, -0.087178f, -0.112707f, -0.394443f, 0.201989f, + 0.241759f, -1.038870f, 0.728124f, 0.800559f, -1.296268f, 0.198612f, + -0.053478f, 0.414344f, -0.510529f, 0.124179f, -2.219115f, -0.074583f, + -0.143055f, 0.001697f, 0.810811f, -0.657140f, 0.186818f, -0.936414f, + 0.539578f, -0.308244f, -0.126624f, -0.204767f, 0.091145f, -0.049340f, + 0.252014f, 0.394582f, 0.018764f, -0.060377f, -0.019133f, 0.064083f, + 0.069211f, -0.526693f, 0.209850f, -0.481466f, -0.468302f, -0.100407f, + 0.241018f, -1.037781f, 0.038539f, -2.113840f, -0.974895f, 0.163187f, + 0.425132f, -0.772546f, -1.261254f, -0.217488f, -0.971748f, -0.805640f, + -0.745175f, -0.177077f, 0.217658f, 0.381431f, -0.052338f, 0.087176f, + -0.165972f, 0.085937f, 0.472564f, -0.796627f, -2.453307f, 0.569664f, + -0.233010f, -0.192134f, 0.064339f, -0.111411f, -0.262469f, -0.410022f, + 0.519993f, -0.684620f, 0.393460f, -0.277753f, -0.153624f, 0.528984f, + -0.415558f, -0.445863f, 0.588512f, -0.142439f, -0.132127f, 0.199776f, + -0.579284f, 0.119488f, -0.033590f, -0.503846f, -0.674979f, 0.335125f, + 0.020519f, 0.233973f, -0.297998f, -0.051511f, 0.518626f, -0.412782f, + -0.074045f, 0.130523f, 0.465751f, -0.117795f, 2.535813f, 0.352108f, + -0.499228f, 0.379784f, 0.056699f, 0.173142f, -0.076519f, -0.026666f, + 0.017834f, 0.492333f, 0.093364f, 0.037867f, -0.165420f, -0.356429f, + -0.562334f, 0.057656f, -0.307544f, 0.085857f, -0.559851f, 0.107230f, + -0.398633f, 0.152618f, -0.216835f, -0.024539f, 0.026044f, -0.249519f, + -0.563594f, -0.746025f, 0.025265f, -0.298888f, -0.185243f, 0.058794f, + 0.233696f, -0.115223f, 0.144617f, -0.864390f, 0.619944f, -0.023980f, + 0.019481f, 0.225252f, 0.416552f, -0.115993f, 0.935387f, 0.744386f, + 0.053353f, -0.052582f, -0.065650f, 0.228488f, -0.032042f, -0.371252f, + -0.003638f, -0.736984f, -0.203776f, 0.030922f, -0.065577f, -0.031643f, + -0.049253f, -0.054640f, 0.787134f, 0.545414f, -0.140297f, -0.124274f, + -0.110011f, -0.029552f, 0.657005f, 0.214973f, -0.374300f, 0.251642f, + 0.276591f, 0.030566f, -0.145470f, 0.350579f, -0.356436f, -0.052694f, + -0.063966f, -0.751008f, -1.042392f, 0.328892f, 
-0.425058f, -0.421571f, + -0.571889f, -1.141472f, -0.125216f, 0.212713f, -0.485170f, -0.088791f, + 0.124589f, 0.023237f, 0.077635f, 0.020901f, -0.271402f, -0.321424f, + -0.513946f, -0.867872f, -0.284593f, 0.106276f, 0.220192f, -0.143532f, + -0.014648f, 0.073402f, 0.327256f, -0.139803f, 0.168763f, 0.048199f, + -0.122526f, 0.111713f, -0.134257f, 0.810364f, -0.085222f, -0.259221f, + -0.239349f, 0.044448f, 0.205031f, 0.413113f, -0.107720f, -0.018816f, + -0.247741f, -0.004963f, 0.041170f, -0.158019f, 0.134839f, 0.129502f, + 0.800488f, -1.041584f, -0.129336f, 0.170834f, 0.566586f, -0.230443f, + 0.437937f, -0.149922f, -0.046665f, -0.094646f, 0.200070f, 0.072943f, + -0.076943f, -0.084971f, -0.515843f, -0.146720f, 0.472869f, -0.444731f, + -0.100877f, 0.545196f, -1.786626f, -0.482946f, 0.500509f, -0.843257f, + 0.200374f, 0.045103f, -0.575718f, -0.164335f, -0.232522f, -0.021825f, + -0.139490f, 0.356058f, -0.352075f, 0.061751f, -0.200616f, -1.180921f, + -0.181355f, -0.137459f, 0.247574f, 0.181541f, 0.184314f, -0.961482f, + 0.493615f, 0.910261f, -2.279238f, 0.648631f, -0.055526f, -0.037137f, + 0.038643f, 0.136609f, -0.819373f, -0.040840f, -0.265989f, 0.006877f, + 0.454651f, -0.595323f, -0.099500f, -0.263717f, 0.150456f, 0.245077f, + -0.268666f, 0.162232f, -0.516451f, -0.024501f, 0.188046f, -0.002262f, + 0.261319f, 0.004173f, 0.746982f, 0.174761f, 0.470447f, -0.159558f, + -0.385240f, 0.023084f, -0.133520f, -0.220607f, -0.018731f, -0.373558f, + -0.707763f, -1.850150f, -0.807404f, -0.168063f, -0.071435f, -0.160740f, + -0.478789f, -1.070674f, -0.489740f, -0.255796f, 0.100486f, -0.153361f, + 0.334394f, -0.569472f, -0.198118f, 0.255922f, 0.104717f, -0.065179f, + 0.111879f, -0.447237f, 1.373623f, -0.190191f, -0.063311f, 0.337529f, + -0.138800f, 0.057009f, -0.137006f, 0.641378f, 0.883147f, -0.679655f, + 0.267717f, -0.351602f, -0.135225f, 0.229398f, -0.513225f, -1.120345f, + 0.528786f, -0.051081f, 0.086653f, 0.140141f, -0.563969f, 0.333402f, + -0.174745f, 0.321093f, -0.438641f, -0.005131f, 0.247415f, 0.110120f, + -0.076308f, -0.083244f, 0.838944f, -0.113043f, -0.013258f, -0.175028f, + -0.179941f, 0.272676f, -0.047946f, -0.088076f, -0.450031f, 0.053929f, + -0.083549f, -0.089952f, -0.186253f, 0.257483f, 0.011019f, 0.586435f, + 0.060580f, -0.052078f, 0.090277f, -0.780869f, 0.969811f, -0.025349f, + -0.281917f, 0.014857f, 0.231863f, -0.228601f, -0.003861f, 0.226550f, + 0.141825f, -0.102171f, -0.010387f, 0.220378f, -2.561975f, -0.497071f, + -0.315117f, 0.371981f, 0.138247f, 0.625031f, -0.308133f, -0.217876f, + 0.005615f, -0.860179f, 0.747491f, 0.006356f, -0.057024f, -0.483189f, + 0.055592f, -0.316834f, 0.069858f, 0.218788f, -0.200044f, 0.227588f, + 0.215496f, -0.055324f, -0.393147f, -0.394062f, -0.253264f, -0.075619f, + -0.152512f, -0.332995f, 0.129053f, 0.178668f, -0.302694f, 0.030678f, + 0.925896f, 0.964375f, 0.169021f, -0.218657f, -0.627204f, 0.206437f, + -0.521336f, 0.176206f, 0.142733f, 0.139248f, 0.411682f, 0.181544f, + 0.224850f, -0.935547f, -0.558208f, 0.348096f, 0.342129f, -0.389340f, + -0.236308f, -0.132099f, 0.073642f, 0.089391f, -0.306901f, -0.397842f, + 0.444282f, 0.074623f, -0.051075f, -0.106617f, -0.184037f, -0.239046f, + -0.138761f, 0.120794f, -0.647577f, -0.336471f, 0.527899f, -0.164234f, + -0.028354f, 1.083678f, -0.251534f, -0.145903f, -0.182783f, 0.070976f, + -0.199590f, -0.400306f, -0.029763f, -0.548042f, -0.266270f, -0.118084f, + -1.152632f, 0.383685f, -0.105895f, -0.096829f, 0.118382f, 0.047447f, + -0.019051f, 0.310180f, -0.162793f, -0.029574f, 0.058054f, -0.636017f, + 0.490639f, 0.158347f, 
-0.385701f, -0.147057f, 1.285825f, -1.276083f, + -0.021795f, -0.101600f, 0.163254f, 0.267160f, -2.317864f, -0.098598f, + -0.296337f, -0.309017f, 0.164127f, -0.270012f, -0.071187f, -0.262270f, + 0.075415f, -0.368328f, 0.186728f, -0.158031f, 0.481663f, 0.515950f, + -0.162551f, 0.497981f, 0.262196f, 0.168479f, 0.726066f, -0.243856f, + -0.058998f, 0.140168f, 0.053242f, -0.624623f, -0.249480f, 0.055197f, + -1.376804f, 0.417571f, 0.203784f, 0.174370f, -0.155531f, -0.029400f, + -0.491473f, 0.079811f, -0.080123f, 1.345900f, 0.637077f, 0.434862f, + -1.787438f, 0.005756f, -0.362706f, 0.179458f, -0.288263f, 0.516788f, + -0.921248f, 0.043794f, -0.137729f, -0.196171f, -0.046295f, -0.793781f, + -0.156532f, -0.132566f, 0.517989f, -0.154321f, -0.054174f, -0.077900f, + -0.373316f, -0.117718f, 0.188986f, -0.476188f, -0.245312f, 0.181439f, + -0.161024f, -0.229059f, -3.079907f, -0.225452f, -0.594355f, -0.558027f, + -0.135429f, 0.125766f, -0.081314f, -0.350894f, -0.163165f, -1.936507f, + -0.205966f, 0.031472f, 0.744446f, -0.006680f, -0.837551f, 0.605862f, + -0.854929f, -1.543750f, -0.307704f, -0.240517f, 0.178240f, -0.183586f, + -0.010307f, 0.099373f, -0.228278f, 0.175236f, -0.000133f, 0.104491f, + -1.540545f, -0.570971f, -0.252885f, 0.483036f, 0.052531f, 0.260214f, + -0.515016f, -0.602081f, -0.485690f, -0.730710f, 0.163719f, -1.775975f, + -0.298634f, 0.323626f, -0.373579f, -0.872977f, 0.619574f, 0.026862f, + -0.122531f, -0.084698f, -2.436297f, 0.483996f, -0.203640f, -0.302157f, + -0.150666f, -0.238320f, 0.089250f, 0.236485f, -0.668654f, -0.122863f, + 0.491152f, -0.226444f, -0.181248f, 0.120158f, 0.294027f, 0.250056f, + 0.307601f, 0.357875f, -1.746455f, -0.175670f, 0.385447f, -0.108808f, + -0.090235f, -0.642504f, -0.486004f, -0.055160f, -0.068692f, 0.009736f, + 0.607555f, -0.489426f, 0.150624f, 0.598114f, -0.128816f, -0.445793f, + -0.066524f, -0.254380f, 0.227106f, -0.406495f, -0.121632f, -0.275960f, + -0.136494f, 0.339457f, -1.318132f, -0.417572f, -2.614077f, 0.324603f, + -0.001211f, 0.375192f, -0.473448f, -0.162510f, 0.099329f, -0.277965f, + 0.101221f, -0.060263f, 0.121867f, -1.042140f, 0.440851f, 0.078898f, + -0.209007f, -0.243699f, 0.715197f, -0.093997f, 0.086022f, -0.178203f, + -2.275496f, -0.098413f, 0.199352f, -0.526791f, -0.162086f, -0.197806f, + -0.231657f, -0.269202f, -0.794294f, -0.223461f, 0.503584f, 0.416236f, + 0.064082f, 0.197655f, 0.340871f, -0.186645f, -0.291498f, 0.433938f, + -1.110063f, 0.003751f, 0.392738f, 0.069360f, 0.102088f, -0.302128f, + -1.518457f, 0.106939f, 0.404527f, -0.306868f, -0.286928f, 0.729276f, + -0.531710f, 0.745048f, -0.168837f, -1.953886f, -0.258828f, -0.190252f, + 0.241877f, -0.916744f, -0.030326f, -0.070541f, -0.271037f, 0.211303f, + -0.489957f, 0.100850f, 0.323999f, -0.802837f, -0.462408f, -0.079350f, + -0.029374f, 0.131213f, -0.825032f, 0.040202f, 0.351821f, 0.002869f, + -0.132516f, -0.471264f, -0.297002f, 0.263913f, 0.033478f, 0.146161f, + 0.533229f, -0.228608f, -0.200639f, -0.170955f, -0.915037f, 0.724491f, + 0.005151f, 0.018584f, -0.029771f, -0.396038f, -0.159236f, 0.038691f, + -1.197056f, 0.146302f, 0.226840f, -0.852126f, 0.031214f, 0.108880f, + 0.562000f, -0.134633f, -0.713343f, -0.342252f, -1.764521f, -0.114653f, + 0.515073f, -0.080515f, -0.121155f, -0.865139f, -0.833694f, -0.368553f, + 0.347673f, 0.623379f, 0.722067f, -0.492458f, -0.513263f, 0.585167f, + 0.721518f, -0.693499f, 0.343725f, -0.273861f, -0.040230f, -0.785664f, + -0.157500f, -0.308445f, 0.054062f, 0.600131f, -0.860887f, 0.434470f, + -0.191382f, -0.306150f, -0.243965f, 0.705444f, 0.007789f, 
-0.146154f, + -0.054499f, -0.073500f, -1.067364f, 0.404936f, -2.864590f, 0.182323f, + 0.326126f, 0.102405f, -0.135800f, 1.128095f, -0.012267f, -0.023996f, + -0.264834f, -0.108967f, -1.176746f, -0.926666f, 0.082999f, -0.498361f, + 0.083560f, -0.210074f, 0.019225f, -0.201614f, -0.904760f, 0.181421f, + 0.586384f, -0.177706f, 0.065471f, 0.168552f, 0.054705f, 0.045241f, + 0.048057f, -0.410957f, -2.188854f, -0.169812f, 0.015521f, 0.176856f, + -0.179331f, -0.352640f, -0.491735f, -1.743206f, 0.044227f, 0.010454f, + 0.823643f, -0.119781f, -0.098359f, 0.093119f, +}; + +static const float av1_ab_partition_nn_bias_128_layer1[LABEL_SIZE] = { + -0.433195f, -0.120488f, -0.116721f, 0.112134f, 0.118170f, -0.259769f, + -0.077530f, 0.394044f, 0.279167f, -0.317988f, 0.189538f, 0.314776f, + 0.325655f, -0.107123f, 0.591049f, 0.358744f, +}; + +static const NN_CONFIG av1_ab_partition_nnconfig_128 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 64, // num_hidden_nodes + }, + { + av1_ab_partition_nn_weights_128_layer0, + av1_ab_partition_nn_weights_128_layer1, + }, + { + av1_ab_partition_nn_bias_128_layer0, + av1_ab_partition_nn_bias_128_layer1, + }, +}; + +// nn model for ab partition pruning, 64x64. +static const float av1_ab_partition_nn_weights_64_layer0[FEATURE_SIZE * 64] = { + -0.495347f, -0.049498f, -0.026804f, 0.030474f, -0.289308f, -0.264193f, + -0.141121f, -0.072562f, -0.391665f, -0.051491f, -0.234761f, 0.027155f, + -0.038217f, 0.014872f, -0.289728f, -0.233577f, -0.415875f, -0.343615f, + -0.442543f, -0.482492f, 0.073510f, 0.007503f, 2.162329f, -0.362849f, + 2.145915f, -0.883135f, 0.185636f, -0.062859f, -0.465574f, -0.486205f, + -0.056710f, -0.330642f, -0.321860f, 0.042321f, -0.348965f, 0.003542f, + -0.291365f, -0.078164f, -0.345093f, -0.220272f, -0.471270f, -0.763853f, + 0.246622f, 0.199651f, -0.663420f, -0.154152f, -1.220383f, 0.047138f, + 0.816811f, 0.083247f, -0.218839f, 0.038143f, -0.063436f, 0.015517f, + -0.307320f, -0.166956f, -0.169499f, -0.399005f, -0.234638f, -0.162266f, + 0.050425f, -0.221723f, -0.256942f, -0.287285f, 0.144011f, -0.033245f, + 0.083649f, 0.119428f, -0.056706f, -0.117805f, 0.021866f, -0.257300f, + -0.201378f, -0.217484f, -0.413780f, -0.145793f, 0.082792f, -0.347247f, + 0.042539f, -0.302697f, 1.652316f, 0.000701f, -0.482843f, -0.160332f, + -0.450099f, 0.212399f, -4.715360f, -5.336774f, -5.375758f, -6.048339f, + 0.085956f, -0.037767f, 1.052409f, -0.931924f, -2.221907f, 0.268946f, + 0.015512f, 1.237094f, -1.092185f, 0.418247f, -0.082143f, -0.076914f, + -0.060749f, -0.325440f, -0.296960f, -0.066815f, -0.158477f, -0.373945f, + -0.122322f, -0.113495f, -0.097978f, -0.192816f, -0.270418f, 0.035840f, + -0.015458f, -0.121071f, -0.279582f, -0.067683f, 0.097855f, 0.019839f, + 0.451127f, 0.004376f, 1.410392f, 3.255835f, -0.344815f, 0.145202f, + 0.204132f, 0.171948f, -0.527736f, -0.110353f, 0.901448f, 0.003238f, + -3.822090f, 0.235462f, 1.024823f, -0.821244f, 0.876056f, 2.553762f, + -3.478597f, -2.076582f, -0.265515f, -0.055923f, -0.156980f, -0.164097f, + -0.246040f, 0.039430f, -0.071769f, -0.118847f, -0.304053f, -0.281541f, + -0.226021f, -0.263091f, -0.127359f, -0.249410f, -0.051023f, 0.083911f, + 0.084721f, 0.168089f, -0.272169f, -0.204998f, -0.008303f, -0.173998f, + 0.079376f, -0.197426f, -0.199052f, -0.118794f, -0.063753f, -0.094769f, + 0.066176f, -0.175832f, -0.238752f, -0.287960f, -0.134307f, -0.185953f, + -0.385845f, 0.119769f, -0.006567f, -0.382126f, -0.214221f, 0.038449f, + -0.253484f, -0.282766f, -0.020249f, -0.193929f, 0.016281f, 
-0.114423f, + -0.145940f, -0.281621f, -0.007588f, -0.131470f, -0.189012f, -0.185699f, + -0.279011f, -0.008132f, 0.208463f, 0.020569f, -0.206803f, -0.213408f, + -0.206131f, -0.290245f, 0.069701f, -0.000371f, -0.307572f, -0.451785f, + -0.300838f, -0.453186f, -0.301691f, 0.046327f, -0.312668f, 0.058272f, + -0.303131f, -0.376252f, 0.108384f, -0.086623f, -0.100630f, -0.027330f, + -0.003969f, 0.089502f, -0.200722f, -0.107889f, 0.061843f, -0.008478f, + -0.265057f, -0.271132f, -0.073562f, 0.129337f, -0.283698f, -0.353414f, + 0.076420f, -0.244280f, -0.119537f, -0.105366f, -0.184692f, -0.038817f, + -0.478507f, -0.118808f, -0.472979f, -0.305884f, -0.462813f, -0.189581f, + -0.011932f, -0.585700f, 0.253212f, -1.061900f, -0.205116f, -0.336407f, + -0.762199f, 0.577737f, 0.230832f, 0.434440f, -0.096713f, 0.038552f, + -0.147800f, -0.213553f, 0.041740f, -0.281907f, -0.026154f, -0.082356f, + -0.331871f, -0.408247f, -0.129022f, -0.037550f, -0.310233f, -0.320883f, + -0.391963f, -0.467392f, 0.027453f, -0.394761f, -0.045544f, 0.076052f, + 0.483985f, 0.067093f, 0.141361f, 0.576772f, 0.859718f, 2.566515f, + -0.025476f, 0.769738f, -0.680235f, -1.683309f, -2.394131f, -0.000714f, + -0.615021f, -0.195856f, -0.434035f, -0.295010f, -0.668659f, -0.245959f, + 0.551148f, 1.777227f, -0.461630f, 0.043093f, 0.012293f, -0.255841f, + -0.097070f, -0.371156f, -0.146323f, -0.015508f, -0.103873f, -0.087476f, + -0.297266f, -0.128699f, -0.149555f, 0.016534f, -0.375498f, -0.346759f, + -0.455156f, -0.147509f, -0.427076f, -0.354431f, -0.158025f, -0.164604f, + -0.237038f, -0.010314f, -0.092884f, -0.397084f, -0.217980f, -0.127184f, + -0.048421f, -0.144133f, 0.889073f, 0.012606f, 3.007608f, -0.602584f, + -1.849480f, -0.373159f, -1.890695f, -3.609938f, 0.811923f, -1.867208f, + -0.244326f, -0.018012f, -0.211192f, -0.220196f, 0.169363f, 0.119141f, + -0.230715f, 0.083247f, 0.020367f, -0.128629f, -0.217455f, -0.159640f, + 1.815952f, -0.369238f, -1.186447f, -0.658753f, -0.511026f, -0.096934f, + 0.662971f, 0.486475f, 0.159746f, -0.018932f, 3.692397f, 1.384353f, + -0.401984f, -0.248380f, -0.140861f, 0.215248f, -0.023711f, 0.059679f, + -0.072260f, 0.004271f, 0.039545f, -0.347971f, -0.081851f, -0.474896f, + -0.181572f, 0.066736f, -0.157822f, -0.163760f, -0.171113f, -0.089935f, + -0.338281f, -0.421444f, -0.306687f, -0.085283f, -0.377953f, -0.138750f, + -0.102701f, -0.312336f, 0.149831f, 0.007229f, -0.155700f, -0.173611f, + 4.074261f, 1.342306f, -1.272712f, 1.570899f, -0.545093f, -0.317605f, + -0.189440f, -0.133910f, -0.273190f, -0.108020f, -0.166107f, 0.021413f, + -0.239130f, -0.067211f, 0.041957f, -0.039234f, -1.003587f, -0.094412f, + 0.532512f, -0.870538f, -1.118023f, -1.160983f, -0.736307f, -0.418752f, + 0.419466f, 0.492122f, -0.004368f, -0.022096f, -1.115132f, 0.150886f, + 2.396852f, 2.660000f, -0.376537f, 0.468628f, 0.149413f, -0.074898f, + -0.067154f, 0.021245f, 0.127857f, 0.294189f, 0.508056f, 0.390232f, + -3.899177f, -3.414681f, -3.929195f, -4.160545f, -0.274323f, -0.052583f, + -0.003545f, -0.433084f, -0.404891f, -0.145051f, -0.312367f, 0.004579f, + -0.398724f, -0.372068f, -0.234279f, 0.017799f, -0.424760f, -0.646717f, + -0.047568f, 2.924664f, -0.644165f, 0.359349f, -0.294800f, 0.591746f, + -0.404710f, -0.092358f, -0.250729f, 0.030829f, -0.147149f, -0.476023f, + -0.071803f, -0.482516f, -0.293117f, -0.215923f, -0.373122f, -0.085315f, + -0.377052f, -0.449899f, -0.056452f, 0.138081f, -0.085350f, -0.308391f, + 0.106661f, 0.176234f, 0.258869f, -0.230172f, -0.233029f, -0.241208f, + -0.067509f, -0.223172f, -0.118353f, -0.302478f, -0.579632f, 
-0.561326f, + -0.158114f, -0.223167f, -0.026689f, 0.051863f, 0.212834f, -0.304714f, + -0.169071f, -0.193695f, -0.075682f, -0.170860f, -0.241008f, -0.044648f, + 0.280815f, -0.002585f, -0.283552f, -0.037701f, -0.681169f, -0.274535f, + -0.380595f, 0.109504f, -0.111141f, -0.437685f, -0.094459f, 0.144206f, + -0.106139f, -0.211832f, -0.054742f, -0.172813f, -0.295905f, -0.071907f, + -0.418429f, -0.183240f, 0.031319f, -0.095785f, -0.315447f, 0.069404f, + -0.422910f, -0.029867f, -0.357321f, -0.199976f, -0.337707f, -0.070188f, + -0.178198f, 0.177208f, 0.134688f, -0.081933f, -0.229452f, -0.208872f, + 0.026287f, -0.364040f, -0.063696f, -0.227443f, -0.234401f, -0.205699f, + -0.267238f, -0.494125f, -0.056255f, 0.053715f, -0.487754f, 0.014818f, + 0.087383f, -0.077556f, -0.168085f, -0.436851f, -0.276286f, -0.137845f, + -0.107606f, -0.103653f, -0.233766f, -0.419083f, 0.169185f, 0.010186f, + -0.001587f, 0.086735f, -2.465718f, 1.482185f, 1.621193f, -2.081680f, + 1.386553f, -3.204335f, -0.267111f, -0.004508f, 0.164712f, 0.274147f, + 1.724306f, -2.273659f, 0.749574f, -0.891905f, 0.105965f, -0.030428f, + -0.416018f, -0.300762f, 0.122911f, -0.316908f, -0.292504f, 0.138666f, + -0.161327f, -0.042143f, -0.249128f, 0.149210f, -0.088987f, -0.654101f, + -1.501843f, 0.216777f, 0.955914f, 0.524158f, -1.642561f, -1.643626f, + 0.864797f, -0.425451f, -2.115764f, -0.012502f, 0.065172f, 1.297270f, + 0.018845f, 1.167276f, -0.470970f, -0.244995f, 0.374782f, -1.811056f, + -0.055430f, -0.024102f, -0.376519f, -0.339640f, -0.119177f, -0.277995f, + -0.290095f, -0.081362f, -0.144139f, -0.118037f, -0.180357f, -0.217559f, + -0.370683f, 0.172816f, -0.265069f, 0.194321f, -0.273478f, 0.037442f, + -0.235552f, -0.078625f, -0.447541f, 0.016836f, -0.271123f, -0.171481f, + -0.321477f, -0.184826f, -0.442981f, -0.227273f, -0.370666f, -0.237232f, + -0.257493f, -0.225714f, -0.153716f, -0.283487f, -0.155399f, 0.067697f, + 0.230343f, -0.034318f, -0.022687f, -0.047090f, +}; + +static const float av1_ab_partition_nn_bias_64_layer0[64] = { + -0.212182f, -0.233725f, -0.758846f, -0.158162f, 0.614743f, -0.150944f, + -0.075727f, -0.208414f, 1.054996f, 0.713758f, -0.300051f, -0.151482f, + -2.443570f, 0.430590f, -0.129001f, -0.160733f, -0.230547f, -0.143228f, + -0.140577f, -0.086812f, -0.212298f, -0.159557f, -0.055647f, -0.211423f, + 0.578161f, -0.220318f, -0.210107f, -3.111584f, 0.604419f, -0.232622f, + -0.209924f, -0.130794f, -0.084097f, -0.036005f, 0.294594f, -2.535531f, + -0.209783f, -0.211189f, -2.766337f, 0.000000f, 0.450177f, -1.754884f, + 3.262664f, -0.209691f, -0.614886f, -0.211257f, -0.109096f, -0.190492f, + -0.109007f, -0.026910f, -0.136035f, -0.212321f, -0.139320f, -0.212233f, + -0.305430f, 0.739171f, 0.991277f, -0.088150f, 0.086313f, -0.023379f, + -0.125366f, -0.063576f, -0.212169f, -0.047463f, +}; + +static const float av1_ab_partition_nn_weights_64_layer1[64 * LABEL_SIZE] = { + -0.036800f, 0.528721f, 0.490767f, 0.144409f, 1.103640f, 0.361910f, + -0.180069f, 0.068033f, -14.868382f, 0.359013f, 0.322567f, -0.199212f, + 0.906164f, -0.488254f, 0.149653f, -0.216394f, -0.099347f, 0.004936f, + -0.111391f, 0.074848f, -0.041709f, 0.147627f, -0.018905f, 0.096116f, + 0.184817f, -0.016241f, 0.115739f, 2.376754f, 0.637097f, 0.052954f, + 0.136428f, 0.225267f, -0.181873f, -0.142876f, 0.684048f, 0.658791f, + 0.105795f, 0.241705f, 1.381114f, -0.209379f, 1.145949f, 0.795293f, + -9.361877f, 0.198302f, 0.539600f, 0.092317f, -0.081695f, 0.200777f, + 0.102334f, 0.081583f, 0.060948f, -0.025110f, 0.160951f, -0.020170f, + 0.234006f, -0.029369f, 0.375036f, 0.270209f, 
-0.556529f, 1.402949f, + 0.101777f, -0.027331f, 0.004502f, -0.153166f, -0.116651f, 0.151573f, + -0.022187f, 0.144044f, -0.108719f, -0.129942f, -0.270321f, 0.227363f, + 1.892330f, -0.661052f, -0.219398f, -0.229417f, -0.856438f, -1.196988f, + -0.081774f, 0.078847f, -0.207057f, -0.048947f, 0.152073f, -0.243056f, + -0.233329f, -0.288689f, -0.158333f, -0.141177f, -0.715436f, 0.016947f, + -0.093752f, 0.204984f, -1.209782f, 0.155683f, 0.092239f, 0.146495f, + 0.813146f, -0.027757f, 0.330982f, 2.173948f, -0.028867f, -0.141815f, + 0.292708f, -0.204794f, 0.014496f, 1.032799f, 1.312155f, 0.107020f, + 0.824752f, -0.013945f, 0.184829f, -0.041633f, 0.215300f, -0.476088f, + -0.053213f, 0.126862f, -0.020777f, 0.082893f, -0.223727f, -0.923063f, + 0.466529f, 0.082140f, -0.845758f, -1.140791f, -0.262033f, 0.138491f, + 0.151717f, -0.182479f, -0.131128f, 0.055411f, 0.106771f, 0.125552f, + 0.297184f, -0.257403f, -0.059884f, -0.274903f, 2.694357f, -0.108244f, + 0.025377f, 0.043092f, -0.558317f, 3.517159f, -0.270833f, -0.240676f, + 0.205100f, -0.057068f, -0.140445f, -0.193449f, -0.030061f, -0.286762f, + -0.467523f, -0.012647f, 0.190564f, 0.022394f, -0.101479f, 0.339684f, + -0.902743f, -0.169578f, -0.178029f, -0.041836f, -3.952108f, -0.028298f, + -0.221137f, -0.733895f, -0.223895f, 0.039012f, 0.687867f, 0.021423f, + 0.113063f, 0.676087f, -0.961000f, -0.064847f, 0.712856f, -0.192765f, + -0.001132f, 0.016689f, -0.236020f, -0.766186f, -0.175729f, 0.012879f, + -0.251064f, -0.105523f, -0.039212f, -0.347584f, 0.304352f, -0.034174f, + -0.364258f, -0.685252f, -0.266115f, -0.247345f, -0.155905f, 0.152283f, + -0.156315f, 0.174082f, -0.757654f, 0.102303f, -2.192316f, -0.245815f, + 0.119882f, -0.086542f, 1.987246f, -1.353163f, -0.374813f, -0.233504f, + -1.980895f, 0.692093f, -0.168351f, 0.172700f, -0.009052f, -0.015734f, + 0.106679f, -0.060472f, -0.256813f, -0.074874f, -0.207488f, -0.329515f, + -0.418268f, -0.017940f, -0.036081f, 0.064719f, -1.488016f, 0.020591f, + -0.176325f, -0.141074f, 0.944494f, 0.150237f, -0.249805f, -0.277280f, + 0.012686f, 0.132483f, 0.116123f, 0.013737f, -0.116091f, 0.750340f, + 3.251343f, -0.188864f, 1.096992f, 0.058467f, -0.041433f, -0.037937f, + -0.133294f, -0.137908f, -0.171132f, 0.106362f, 0.069383f, -0.052662f, + -0.177883f, -0.408049f, 0.680221f, -0.117035f, -0.904240f, -1.395228f, + 0.154527f, 0.134427f, 0.022767f, -0.158886f, -0.230316f, 0.161096f, + 0.362213f, -0.235060f, -0.941620f, 0.055912f, -0.049458f, -0.166632f, + 0.481418f, 0.930146f, 0.041108f, 0.033674f, 1.372066f, -1.847709f, + 0.003324f, 0.259534f, 0.177014f, -0.202761f, -0.262017f, -0.190852f, + -0.102839f, 0.028338f, 0.187193f, -0.041684f, 0.123973f, -0.198576f, + -0.110369f, -1.431400f, 0.208369f, -0.302370f, -0.248549f, 0.062985f, + 0.673409f, 0.036662f, -0.711340f, -0.120584f, -0.189789f, 0.098812f, + 2.947819f, 0.216567f, -0.414472f, -0.181742f, 1.873779f, -0.222726f, + -0.782870f, 0.007889f, 0.015062f, -0.554328f, 0.182928f, -0.191430f, + 0.123636f, -0.215460f, -0.225245f, 0.251516f, -0.013025f, -1.359595f, + -0.750602f, 0.342667f, -0.141899f, -0.687493f, -0.072639f, 0.048018f, + -0.242107f, -0.031917f, -0.287472f, -0.046088f, 0.832197f, -0.016576f, + -1.553349f, -0.216341f, 0.023077f, -0.410867f, 4.243743f, -0.514878f, + -0.066007f, -0.160696f, -0.262678f, -0.648790f, -0.430586f, 0.199940f, + -0.202496f, -0.222241f, -0.016406f, -0.121473f, 0.000828f, -0.081584f, + -0.152641f, -0.190166f, 0.644400f, 0.040196f, -0.302104f, -1.143654f, + -0.160327f, -0.320780f, -0.187006f, 0.037311f, 0.440618f, -0.070733f, + -0.117785f, 
1.527539f, -0.419310f, 0.001300f, 1.389956f, -0.036366f, + -0.269203f, 0.612265f, 2.721897f, -0.086836f, -0.446999f, 0.012525f, + -0.078317f, -0.287052f, -0.111188f, -0.085181f, -0.164667f, -0.010466f, + -0.569722f, -0.018888f, -0.101663f, -1.147130f, -0.465204f, 0.114524f, + -2.192402f, -0.221325f, 0.375748f, 0.206284f, -0.261548f, -0.246257f, + -0.143004f, -0.069981f, -0.057306f, -0.116481f, -0.435903f, -0.314970f, + 0.013210f, -0.010175f, 4.630571f, -0.473226f, -0.197199f, -0.028204f, + 0.122907f, 2.475548f, 0.025011f, -0.092603f, -0.127561f, -0.151330f, + -0.077295f, 0.245016f, -0.045005f, 0.183396f, -0.330556f, -0.384887f, + 0.356374f, -0.016618f, -0.463353f, -1.291546f, -0.071986f, -0.311599f, + 0.072385f, -0.430786f, -2.094788f, 0.202733f, -0.910109f, -1.336543f, + -0.086800f, -0.096413f, 1.544383f, 0.031860f, -0.796211f, 0.762786f, + 3.250022f, -0.441798f, -0.698537f, 0.062839f, 0.033525f, -0.362996f, + 0.027022f, -1.131264f, -0.228926f, 0.053885f, -0.338628f, 0.155037f, + -0.046844f, -0.888172f, -0.241767f, 0.084965f, -0.617743f, -0.049896f, + -0.036894f, -0.304783f, -0.002639f, 0.137957f, 0.052121f, -0.131161f, + -0.117200f, -0.253380f, -0.205561f, -0.302450f, -0.047397f, -0.330518f, + 3.613420f, -1.525951f, -0.026738f, 0.209150f, -2.103534f, 2.019689f, + -0.366199f, -0.095260f, 0.027417f, -0.242512f, 0.162579f, 0.052113f, + -0.293851f, -0.068138f, -0.005799f, -0.344696f, -0.114824f, -0.431107f, + -0.120058f, -1.139926f, -1.048379f, 0.036446f, -0.323020f, -0.432945f, + 0.454151f, -0.140058f, 0.050649f, -0.094900f, -0.017278f, -0.238719f, + 1.193153f, 0.120447f, -0.496061f, 0.917431f, 2.936126f, -0.115521f, + -0.347397f, -0.435325f, -0.004383f, -0.211864f, 0.162383f, -1.040726f, + 0.089537f, -0.128579f, -0.133505f, 0.107129f, -0.435657f, -0.180388f, + 0.043650f, 0.018709f, -0.773242f, -0.687192f, -0.120633f, -0.063626f, + 0.029912f, 0.113972f, -0.403502f, -0.127640f, -0.269625f, 0.129794f, + -0.188539f, 0.041641f, 0.029769f, -0.198374f, 1.401407f, 0.353887f, + -0.219925f, 0.260515f, 1.157034f, -2.992044f, -0.097618f, -0.064417f, + -0.203626f, -0.008217f, -0.112339f, -0.227407f, -0.155118f, 0.247705f, + -0.012304f, -0.248447f, -0.913463f, -0.064788f, -0.214619f, -0.251761f, + -0.386861f, -0.040574f, -0.163219f, -0.100700f, 1.488274f, -0.071684f, + -0.033626f, -0.006497f, -0.246945f, -0.145221f, -3.747390f, 0.149609f, + -0.263326f, -0.297385f, -1.039896f, -0.083174f, -0.025473f, -0.235586f, + -0.001087f, 0.254286f, 0.265106f, 0.007325f, 0.199239f, 0.134103f, + -0.578211f, -0.259801f, -0.062373f, 2.368348f, 0.560556f, -0.252260f, + 0.889997f, -0.447872f, -0.059218f, -0.095315f, -0.061667f, 0.183580f, + -0.157479f, 0.055387f, -0.831734f, 0.007606f, -1.104906f, 0.301180f, + -0.117115f, 0.212959f, 4.727223f, -0.243833f, -0.397495f, -0.025021f, + -0.367587f, -2.082058f, -0.217699f, 0.148111f, 0.252430f, 0.111088f, + -0.260692f, 0.095124f, -0.407774f, -0.322169f, 0.002927f, 0.126169f, + -1.272325f, -0.279772f, -0.373680f, -0.485177f, -0.605458f, 0.021225f, + -0.092031f, -0.226585f, 1.895162f, 0.037866f, -0.275475f, 1.614360f, + -0.014972f, -0.277679f, -3.449082f, -0.092060f, -0.747873f, 0.020716f, + 2.776178f, -0.049963f, 0.183999f, -0.295259f, -0.028868f, 0.221895f, + 0.001265f, 0.336823f, 0.219372f, 0.112824f, 0.408132f, -0.017940f, + -0.311666f, 1.489606f, -0.058093f, -0.305659f, -0.491933f, -0.143847f, + 0.166115f, 0.042867f, -0.123447f, -0.087099f, -0.305395f, -0.365079f, + -0.755801f, -0.160649f, 0.736260f, -0.008611f, 0.095836f, -0.017345f, + 5.697515f, -0.498971f, -0.125280f, 
0.199907f, 0.300053f, 0.605026f, + -0.228225f, -0.259523f, 0.016384f, 0.146973f, 0.210258f, 0.226766f, + -0.075178f, -0.050924f, 0.188496f, -0.415266f, -0.484880f, -0.236384f, + 0.071931f, -0.331863f, -0.601243f, -0.232479f, -0.285272f, 0.123789f, + -1.341333f, 0.037082f, -0.315202f, -1.587215f, -0.271576f, 0.003216f, + -4.437186f, -0.256205f, -0.576589f, -0.114147f, 2.153916f, -0.369618f, + 0.271415f, 0.145036f, -0.158731f, -0.240938f, -0.187369f, 0.036325f, + 0.254771f, 0.211488f, -0.240297f, 0.098417f, -0.415011f, 2.334793f, + -0.127252f, 0.020069f, -0.168755f, -0.448922f, -0.219207f, 0.016232f, + -0.221935f, -0.269500f, -0.100636f, 0.102545f, -0.809376f, -0.054979f, + 0.360713f, -0.326541f, 0.112933f, 0.138073f, 4.229404f, -0.763801f, + -0.305429f, 0.199955f, -1.787713f, 0.272866f, 0.109895f, 0.138466f, + -0.250259f, -0.167162f, -0.212588f, -0.217589f, -0.067125f, -0.077490f, + -0.208970f, -0.006863f, -0.671146f, -0.298320f, -0.165509f, 0.044597f, + -1.408624f, -0.213957f, -0.220947f, 0.129718f, 1.316777f, -0.098928f, + -0.008121f, -0.558293f, -0.297290f, -0.218873f, -4.346638f, -0.228174f, + -0.204710f, -0.388864f, 2.697919f, 0.025260f, 0.857020f, 0.009921f, + 0.036915f, -0.320275f, -0.087937f, 0.022636f, 0.236667f, 0.135496f, + -0.059616f, -0.192955f, 0.009470f, 2.139589f, -0.200449f, 0.129818f, + 1.017444f, -0.608299f, 0.257914f, -0.134306f, -0.033327f, 0.002855f, + -0.338598f, 0.015559f, 0.117362f, -0.166760f, 0.086903f, -0.167666f, + 0.193523f, 0.033852f, -1.147686f, 0.489468f, -0.006969f, 0.125630f, + 1.557907f, -1.604449f, -0.071114f, 0.096178f, 0.007065f, 0.200013f, + 0.213393f, 0.168466f, -0.100568f, -0.117861f, -0.161542f, -0.072561f, + -1.069871f, -0.470138f, -0.352578f, -1.503513f, -0.001394f, -0.380109f, + 0.065089f, -0.281668f, 0.988953f, -0.002778f, -0.659026f, -0.470692f, + -0.407292f, 0.011710f, -1.362085f, 0.184738f, -0.135786f, -1.374241f, + 4.487930f, -0.067274f, -0.956404f, -0.233995f, 0.224527f, -0.454556f, + 0.037900f, -0.281658f, 0.208224f, -0.254753f, 0.045740f, 0.051444f, + -0.388281f, 0.257112f, -0.485030f, -0.082659f, 0.148103f, -1.007456f, + -0.022295f, 0.036984f, -0.369401f, -0.076943f, -0.007636f, -0.293022f, + 0.470466f, 0.199012f, -2.158182f, 0.036577f, -0.014725f, -0.229516f, + 2.236929f, 0.030945f, -0.400045f, 0.109348f, 0.214691f, -0.891516f, + -0.251379f, -0.217358f, 0.013733f, 0.205573f, -0.151725f, -0.191782f, + -0.339630f, -0.163905f, -0.119191f, -0.032516f, 0.503015f, 0.025772f, + 0.029094f, -1.146153f, 0.216723f, -0.330023f, 0.064695f, -0.262521f, + 0.425612f, -0.093080f, -0.489648f, 1.051293f, -0.092332f, 0.095557f, + -0.874132f, 0.218483f, -0.127648f, -1.605802f, 2.763617f, -0.186734f, + -1.243166f, -0.193514f, -0.173748f, 0.337822f, 0.183873f, -0.251594f, + -0.211582f, 0.144081f, 0.029620f, -0.024853f, -0.385140f, 0.467341f, + -0.928316f, -0.195442f, 0.917783f, 0.357084f, 0.174445f, -0.073659f, + -0.012811f, -0.115420f, -0.181147f, -0.364449f, -0.567395f, -0.012969f, + -1.680714f, 0.065323f, 0.198063f, -0.244201f, 1.428545f, -0.432539f, + -0.208931f, -0.091205f, 0.957125f, 0.813519f, -0.262677f, 0.246852f, + 0.015536f, 0.055026f, 0.067054f, 0.262103f, -0.358115f, -0.095206f, + -0.267522f, -0.402710f, -0.680397f, -0.123627f, -0.385590f, -1.504680f, + -0.169513f, -0.215338f, 0.043633f, -0.079052f, -0.464410f, 0.122894f, + -0.278231f, -2.456445f, -0.159917f, -0.015597f, -0.735449f, -0.078854f, + -0.400290f, -1.153870f, 3.657228f, -0.287093f, -1.174355f, -0.102001f, + -0.288281f, 0.185209f, -0.145228f, -0.200449f, -0.099914f, -0.138354f, + 
0.254428f, -0.161751f, -0.118206f, 0.296043f, -0.482613f, 0.080932f, + 1.097605f, -0.010190f, 0.232439f, 0.447617f, -0.133508f, 0.115763f, + -0.388589f, 0.174695f, -0.236014f, 0.006284f, -1.374129f, 0.092015f, + -0.241419f, -0.231667f, 2.763950f, -0.922932f, -0.061605f, 0.208740f, + -1.597190f, 1.353325f, -0.198528f, 0.250498f, -0.013950f, -0.203861f, + -0.254563f, 0.081931f, -0.413369f, 0.011844f, 0.080961f, -0.231161f, + -1.234909f, -0.440843f, -0.174980f, -0.315283f, -0.337474f, -0.123243f, + -0.310001f, -0.271028f, 0.364179f, 0.022845f, -0.535517f, -0.772936f, + -0.188435f, 0.039667f, -0.807463f, 0.266550f, -0.288857f, -1.630789f, + 1.280155f, 0.065712f, -0.279960f, -0.300056f, 0.258440f, -0.073781f, + 0.213878f, 0.042196f, 0.021360f, 0.211698f, -0.003751f, -0.192673f, + -0.137008f, 0.247878f, -0.470604f, 0.073164f, 1.523241f, 0.734755f, + -0.114126f, -0.193834f, -0.025759f, 0.263183f, +}; + +static const float av1_ab_partition_nn_bias_64_layer1[LABEL_SIZE] = { + -0.343508f, -0.706936f, -0.160676f, -0.877101f, -0.517567f, -0.253254f, + -0.148074f, 0.923430f, -0.364770f, 0.203550f, 0.401216f, 0.938246f, + -0.872737f, 0.718723f, 0.703398f, 2.560015f, +}; + +static const NN_CONFIG av1_ab_partition_nnconfig_64 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 64, // num_hidden_nodes + }, + { + av1_ab_partition_nn_weights_64_layer0, + av1_ab_partition_nn_weights_64_layer1, + }, + { + av1_ab_partition_nn_bias_64_layer0, + av1_ab_partition_nn_bias_64_layer1, + }, +}; + +// nn model for ab partition pruning, 32x32. +static const float av1_ab_partition_nn_weights_32_layer0[FEATURE_SIZE * 64] = { + -0.323723f, -0.214013f, -0.007772f, -0.458851f, -0.125542f, -0.123860f, + -0.410973f, -0.209389f, -0.087580f, -0.272881f, -0.168500f, -1.130845f, + 0.344916f, -0.475017f, -0.362262f, -0.195662f, -0.566124f, 0.782163f, + 0.411575f, -0.013378f, -0.318650f, -0.124678f, -0.612909f, -0.315788f, + -0.263990f, -0.508783f, -0.048938f, -0.416407f, -0.402648f, -0.156644f, + 0.225887f, -0.000493f, 2.682241f, 0.871204f, 0.059014f, 0.803542f, + -1.407028f, -1.154669f, 1.388148f, -0.293348f, -0.003669f, -0.009607f, + 1.330030f, -0.337841f, 2.118617f, 1.033059f, -0.084788f, 0.212904f, + 0.082405f, -0.070579f, -0.494005f, -0.173392f, 0.039546f, -0.463865f, + 0.077163f, -0.434066f, 0.030835f, -0.427139f, -0.560520f, -0.031606f, + -0.368541f, -0.027458f, 0.370574f, 0.461418f, 1.087682f, -0.572137f, + -1.509596f, -0.765697f, -0.499383f, -0.277998f, -0.106492f, -0.129564f, + -0.169133f, -0.269834f, -0.114270f, -0.275431f, 0.016339f, -0.156744f, + -0.267922f, 0.171216f, 0.110556f, 0.002954f, -0.200327f, -0.187663f, + 3.691601f, 1.234152f, 0.186315f, -0.125370f, -0.211235f, -0.554432f, + -0.131072f, -0.124982f, -0.130339f, -0.235350f, 0.018903f, 0.012896f, + -0.159372f, -0.269571f, -0.025709f, -0.221251f, 0.061919f, 0.016307f, + 0.384673f, -0.134525f, -1.599126f, -0.416459f, -0.743052f, 0.670249f, + -0.169709f, 0.421681f, -0.033360f, -0.072817f, 0.003647f, -0.110632f, + -0.158651f, -0.095136f, 0.223759f, 0.165767f, -0.269129f, -0.196075f, + -0.023183f, -0.293420f, 0.014875f, 0.018688f, -0.153407f, -0.172009f, + -0.259947f, -0.124015f, 0.173653f, -0.089103f, -0.021001f, -0.334230f, + 0.027177f, 0.103371f, -0.183860f, -0.204051f, -0.023721f, -0.192297f, + -0.143771f, -0.247106f, 0.218116f, -0.013240f, 2.831783f, 1.483928f, + -0.877025f, -0.313462f, -0.411320f, -0.447825f, 0.605977f, 0.234684f, + -0.119150f, -0.075182f, -0.330463f, 0.071503f, -0.254924f, -0.360071f, + -0.037022f, 
0.063261f, -0.148759f, -0.238254f, -0.462018f, -0.027166f, + 0.065318f, -0.235743f, -0.257194f, -0.094784f, 0.022423f, 0.055925f, + 0.086672f, -0.021010f, 0.009965f, -0.001648f, -0.104917f, -0.387443f, + -0.102673f, -0.281706f, 0.145923f, -0.233391f, -0.378365f, -0.145584f, + -0.077751f, -0.121166f, 1.134565f, -0.097500f, -0.749202f, -0.544566f, + -1.361374f, -0.102494f, 1.089275f, 0.375299f, -0.105091f, 0.037641f, + -0.054248f, -0.282691f, -0.377797f, -0.066427f, -0.253815f, -0.329677f, + -0.339326f, -0.128217f, -0.282905f, 0.014937f, 1.067185f, -0.171764f, + 0.484458f, 0.396706f, -0.557055f, -0.891596f, -0.257839f, -0.720879f, + -0.218449f, -0.004755f, 1.572857f, 0.006229f, 1.962895f, -0.029746f, + -4.137691f, -2.185991f, -2.763477f, -0.520437f, -0.208708f, 0.006444f, + -1.263078f, -0.304560f, 1.072374f, 2.556429f, 0.312850f, 0.257488f, + -0.634264f, 0.156769f, -0.188943f, 0.040295f, -0.389915f, 0.085250f, + -0.248525f, 0.045667f, -0.776115f, -0.274680f, -0.448145f, -0.566161f, + -1.285316f, 0.079060f, 0.389124f, -0.510401f, -0.015299f, -0.664661f, + 0.099901f, -0.470694f, -0.051593f, -1.076381f, -0.442104f, -0.197867f, + -0.330011f, -0.448523f, -0.301018f, -0.442093f, -0.491953f, -0.582091f, + -0.064569f, -0.156516f, 0.543522f, -0.005924f, 0.161432f, 0.974793f, + 0.273712f, 1.104850f, -0.290312f, 0.313417f, -0.125370f, 0.136234f, + -0.191227f, -0.165054f, 0.011872f, -0.298871f, 0.095740f, 0.142760f, + -0.215771f, -0.031437f, 0.101041f, -0.085620f, 0.435387f, 0.002786f, + 1.971375f, 0.018392f, -1.771940f, -0.401433f, 0.808263f, -3.350013f, + 2.296952f, -1.024403f, -0.041645f, -0.034799f, -0.024078f, -0.347301f, + -0.276088f, -0.455907f, 0.266021f, 0.087348f, -0.146566f, 0.040492f, + -0.539866f, -0.206851f, -0.387874f, -0.125508f, -0.496676f, -0.373845f, + -0.472356f, -0.357082f, -0.081254f, -0.456466f, 0.554713f, 0.002185f, + -4.225019f, 0.344025f, 0.728796f, -0.262936f, 1.383924f, 1.577300f, + -2.653320f, -2.516156f, -0.301604f, -0.204105f, -0.138252f, -0.587536f, + -0.097889f, -0.352414f, -0.288276f, -0.184340f, -0.122741f, -0.243376f, + 0.031970f, -0.373402f, -0.396079f, 0.045566f, 0.072595f, -0.222681f, + -0.243802f, -0.340129f, -0.258494f, -0.192041f, -0.386112f, -0.240940f, + -0.047268f, -0.555802f, -0.032514f, -0.241341f, -0.167463f, -0.478308f, + -0.205936f, -0.316275f, 0.103729f, -0.197893f, -0.128029f, -0.218796f, + -0.167362f, -0.111814f, -0.126062f, -0.394260f, -0.025357f, -0.402697f, + -0.587395f, -0.400385f, -0.259664f, -0.415588f, -0.338503f, -0.399166f, + -0.270504f, 0.234505f, 0.272144f, 0.266938f, -0.392395f, -0.011717f, + -0.384221f, -0.473446f, -0.038420f, -0.241101f, -0.234402f, -0.275567f, + -0.410454f, -0.377599f, -0.179099f, -0.138432f, -0.248083f, -0.543026f, + -0.428043f, -0.239895f, -0.333193f, -0.103346f, -0.039038f, -0.171109f, + -0.119432f, -0.222351f, 0.000450f, 0.208724f, -0.510526f, -0.144656f, + -0.316721f, -0.344846f, -0.244794f, -0.129134f, -0.045634f, -0.400183f, + 0.043714f, -0.235414f, 0.115594f, -0.195616f, -0.106693f, -0.124242f, + 0.083990f, 0.049110f, -0.196130f, -0.059860f, -0.464235f, -0.516443f, + -0.101521f, -0.422379f, -0.413955f, -0.042991f, -0.345263f, -0.129264f, + -0.106911f, -0.140156f, -0.457841f, -0.199848f, -0.218954f, -0.329850f, + -0.364097f, -0.335262f, -0.312254f, -0.299331f, -0.052710f, -0.251019f, + -0.023459f, -0.222538f, 0.028849f, -0.088038f, -0.301550f, -0.273566f, + 0.067295f, -0.174608f, -0.445784f, -0.158366f, -0.567275f, -0.557652f, + -0.353503f, -0.302092f, -0.302049f, -0.551793f, -0.034535f, -0.225190f, + -0.210733f, 
-0.219377f, -0.057197f, -0.430933f, -0.025185f, -0.388150f, + -0.086147f, -0.430088f, 0.058466f, -0.152129f, -0.058411f, -0.236392f, + -0.547669f, -0.613849f, -0.893774f, -0.351715f, -0.399227f, -0.454909f, + -0.324501f, 0.000490f, -0.282167f, -0.073163f, -0.281452f, 0.047932f, + -0.175500f, 0.165220f, -0.276212f, 0.062153f, -0.217054f, -0.255487f, + -0.146416f, -0.097718f, -0.173809f, -0.559328f, -0.055695f, -0.391193f, + -0.132020f, -0.561184f, -0.308666f, -0.474053f, -0.219149f, -0.246558f, + -0.158325f, 0.151907f, -0.266835f, -0.144697f, -0.193960f, -0.046587f, + -0.220028f, -0.247355f, 0.135584f, 0.016511f, 0.367705f, -1.855877f, + 0.435622f, 0.444710f, -3.372301f, -3.030489f, 1.013267f, 0.380951f, + -0.170011f, -0.111415f, -0.456146f, -0.107254f, -0.095220f, -0.053078f, + -0.135864f, -0.591949f, -0.252810f, -0.324799f, -0.094796f, -0.260969f, + -0.391981f, -0.063170f, -0.336130f, -0.470127f, -0.405168f, -0.433219f, + -0.309563f, -0.295462f, -0.552270f, -0.012300f, -0.057793f, -0.034494f, + -0.446843f, -0.640160f, -1.188681f, -0.791361f, 0.543271f, 1.189112f, + 1.458468f, -0.005876f, -0.927475f, 0.062038f, -1.170818f, 0.338227f, + -3.007096f, -4.559296f, -4.045457f, -5.953635f, -0.228386f, -0.266890f, + -0.092595f, -0.377440f, -0.044534f, -0.053565f, -0.349268f, -0.415030f, + -0.310094f, 0.062721f, 0.251422f, -0.014350f, -1.282910f, 1.619560f, + 1.180566f, -0.032163f, -1.322951f, -0.603601f, 1.443710f, 0.654650f, + -0.393227f, 0.003536f, 0.029725f, -0.108925f, -0.053911f, 0.133977f, + -0.036145f, -0.168438f, 0.046989f, -0.331463f, -0.176983f, -0.311922f, + -0.272389f, -0.379592f, -0.399993f, -0.297873f, -0.193425f, -0.177524f, + -0.258309f, -0.567312f, -0.260217f, -0.241869f, 0.024010f, -0.032867f, + -0.039424f, -0.063670f, 0.193808f, -0.303514f, -0.013376f, -0.057761f, + 0.187922f, 0.006938f, 0.031810f, 0.180594f, -1.198427f, 2.820662f, + 0.154986f, -0.375518f, 0.116925f, -0.795782f, -0.085139f, -0.079365f, + -0.197936f, -0.321468f, -0.205271f, -0.558203f, -0.296235f, -0.151193f, + -0.158282f, -0.245402f, -0.208504f, -0.042335f, -0.087426f, -0.557129f, + -0.381427f, -0.441551f, -0.541011f, -0.060567f, -0.469305f, -0.032326f, + -2.453587f, -0.045568f, -0.296932f, 0.613061f, -0.320284f, 0.191620f, + -0.827145f, -0.225277f, 0.275800f, 1.696635f, +}; + +static const float av1_ab_partition_nn_bias_32_layer0[64] = { + -0.176206f, 0.660189f, -0.186156f, -2.481963f, -1.564218f, -0.280424f, + 0.732684f, -0.135581f, -2.193132f, -0.172771f, 0.605001f, -0.060392f, + -0.067190f, -0.132969f, -1.410812f, -0.298701f, -0.105963f, -0.086173f, + 0.632779f, 0.005585f, 1.310169f, 1.392136f, -0.563860f, -0.051053f, + 0.660998f, -0.214726f, -1.894342f, -0.128288f, -0.330721f, -0.053988f, + -0.177726f, 1.200859f, -0.178902f, -0.172620f, -0.184476f, -0.175559f, + 0.538503f, -0.322158f, -0.219080f, -0.058208f, -0.171347f, -0.216060f, + -0.174950f, -0.295740f, -0.184820f, -0.213896f, 1.317728f, -0.020116f, + -0.208096f, 0.000000f, 1.246166f, -0.225421f, -0.181555f, 0.861761f, + 1.172429f, -0.172892f, -0.737092f, -0.189904f, -0.179385f, -0.114618f, + -1.384604f, -0.201713f, -0.271948f, 0.372351f, +}; + +static const float av1_ab_partition_nn_weights_32_layer1[64 * 16] = { + -0.037828f, 1.529029f, 0.004927f, 1.475763f, 0.627172f, 0.325872f, + -0.990757f, 0.129476f, 0.889958f, -0.082031f, 0.332133f, 0.074422f, + -0.176212f, -0.074355f, 0.774378f, 0.110987f, -0.155469f, 0.253310f, + 0.882538f, 0.253605f, 0.332436f, -5.389474f, 0.278470f, 0.168644f, + 0.914611f, 0.154165f, 0.809262f, -0.174734f, 0.923673f, 
0.064716f, + -0.070228f, -0.228735f, 0.002312f, 0.112222f, -0.045502f, -0.046004f, + 0.514101f, 0.306480f, 0.021232f, -0.015955f, -0.288260f, 0.189177f, + -0.104158f, 0.103273f, 0.096910f, -0.086328f, 1.327289f, -0.154247f, + 0.056676f, -0.243327f, -0.646676f, 0.177221f, -0.086761f, 0.729729f, + -14.710893f, -0.044881f, 0.339003f, -0.134737f, 0.073621f, -0.162913f, + 1.215237f, 0.140723f, 0.138630f, 1.241719f, 0.204092f, -0.463080f, + -0.176086f, 1.125868f, 1.034814f, 0.225455f, -0.203421f, -0.078787f, + -0.527498f, 0.012491f, -0.563307f, -0.170792f, 0.002679f, 0.116153f, + 0.211348f, -0.191900f, -0.212505f, 0.263445f, -0.074679f, -0.081441f, + -0.815405f, 2.448215f, 0.781299f, 0.149542f, -1.045162f, 0.043014f, + 0.217381f, -0.094500f, -0.090427f, 0.025784f, -0.228906f, -2.741798f, + 0.230475f, -0.256112f, -0.103297f, 0.159121f, -0.229793f, -0.014883f, + -0.104131f, -0.123816f, 0.164148f, -0.052279f, -0.071845f, -0.041197f, + 0.208527f, -0.234197f, -0.542336f, 0.020053f, 0.088870f, 0.014346f, + 2.502164f, -0.010244f, -0.267792f, 0.844394f, 2.711486f, -0.015262f, + -0.868053f, -0.295704f, 0.222289f, -0.000286f, -0.352098f, -0.079000f, + 0.021267f, -0.721739f, -0.240558f, -0.384775f, 0.065974f, -2.161058f, + 0.195889f, 0.268966f, -0.009329f, 0.014949f, 0.314943f, 0.235885f, + 0.072591f, -0.127120f, 0.150784f, 0.105697f, -1.297403f, -0.207509f, + -0.217688f, -0.076752f, 0.170952f, -0.294235f, 0.449973f, -1.712690f, + 0.860989f, 0.054757f, -0.812627f, -0.105316f, -0.736230f, -0.133192f, + -3.741608f, 0.495660f, -0.288936f, 4.654852f, -0.021305f, -0.308916f, + 0.049205f, -0.259996f, 0.114248f, -0.252647f, -0.253180f, -0.449314f, + 0.022979f, 0.063281f, -0.196154f, 0.078295f, -0.322317f, -0.145142f, + 0.300573f, 0.048385f, -0.254787f, 0.123939f, -1.263088f, -0.228565f, + -0.389061f, 0.391084f, 2.322438f, 0.075009f, 0.225743f, -0.198808f, + -0.280538f, -0.173939f, -0.120543f, -0.070792f, -0.417187f, -0.781056f, + -0.102756f, -1.760965f, 0.019149f, -0.867342f, 0.347141f, 0.031588f, + 0.302572f, -0.203573f, -0.357320f, -0.096078f, -0.527528f, 0.046699f, + -0.108561f, -0.167077f, -2.851509f, -0.307116f, 0.202720f, -0.160280f, + -0.215525f, 0.064355f, -0.427220f, 1.516230f, 0.634453f, 0.099400f, + -1.013887f, -0.029740f, -0.093426f, -0.044272f, -1.297636f, -0.237614f, + -0.160953f, 0.399036f, -0.030685f, -0.113619f, -0.184704f, 0.040519f, + -0.588252f, -0.210235f, -0.067623f, -0.031841f, -0.107261f, -0.192582f, + -0.253959f, -0.430821f, -0.103184f, -0.280185f, -0.357723f, 0.197761f, + -0.175087f, -0.055171f, 1.642014f, -0.192559f, -0.288147f, 0.610311f, + 4.688195f, -0.128728f, -0.914869f, -0.108286f, 0.013789f, 0.092125f, + 0.019770f, -0.178386f, 0.074164f, -1.152658f, -0.216738f, -0.277286f, + 0.012381f, 0.418259f, -0.680727f, -0.221917f, -0.485946f, 0.101672f, + 2.009457f, 0.054302f, 1.019838f, -0.116170f, 0.165134f, -0.112567f, + 0.852632f, -0.385796f, -0.108666f, 0.053181f, -0.311797f, -0.372875f, + -0.675717f, 2.409268f, -0.514720f, -0.214245f, -0.646596f, 0.009756f, + 0.203993f, 0.093617f, -0.301290f, 0.253551f, -0.128909f, -1.448442f, + -0.186823f, -0.278001f, -0.294993f, -0.176928f, -0.473605f, 0.062049f, + -0.212084f, -0.137326f, 0.012505f, 0.087850f, -0.200413f, -0.394119f, + -0.132224f, 0.146917f, 0.155746f, 0.198725f, -0.322541f, 0.196391f, + -0.945500f, 0.036736f, -0.155646f, -0.677341f, 1.130545f, -0.339554f, + 0.411628f, -0.355813f, -0.249843f, 0.213694f, -2.035607f, 0.055694f, + -0.111669f, 0.408696f, -0.067043f, -0.048182f, 0.398110f, -0.067542f, + 1.459801f, 0.236833f, -0.178806f, 
0.168758f, 0.492387f, 0.099691f, + -0.776680f, -0.172865f, 0.204225f, 0.193982f, 0.575685f, -0.062248f, + 0.011486f, 0.058571f, -0.493391f, 0.026893f, -0.900467f, 3.793129f, + -0.634613f, -0.064660f, -0.048262f, 0.361905f, 0.033641f, 0.245171f, + -0.064671f, 0.034954f, 0.204358f, -0.904023f, -0.052714f, -0.250134f, + 0.136700f, 0.000734f, -0.371720f, 0.226483f, 0.217958f, 0.060559f, + 0.180111f, 0.000970f, 0.079556f, -0.096775f, 0.093855f, -0.026224f, + -0.243664f, 0.004290f, 0.123281f, -0.239476f, 1.230374f, -0.107826f, + -0.101982f, -0.153917f, 5.464427f, 0.304375f, -0.809957f, 0.090564f, + -0.278416f, -0.245555f, -2.078421f, 0.243093f, -0.127666f, 0.052451f, + -0.126662f, -0.783505f, 0.025149f, -1.422675f, -0.207769f, -0.362547f, + 0.115310f, 0.133390f, 1.264754f, -0.027055f, -0.485312f, -0.240717f, + -0.239722f, 0.146818f, -1.265043f, -0.235553f, 0.267104f, -0.021357f, + -0.435949f, -0.309371f, 0.049920f, 1.302721f, -0.233978f, -0.097551f, + -0.240631f, -0.287821f, -0.378380f, -0.273131f, -3.075169f, 0.226404f, + -0.029361f, 2.703590f, -0.430659f, 0.067927f, -0.387520f, -0.370630f, + -0.229236f, 0.085653f, -0.370956f, -0.065556f, -0.187859f, 0.068309f, + -0.109299f, -0.259898f, -0.103644f, -0.271199f, -0.209350f, 0.140993f, + -0.196713f, -0.135508f, -1.423209f, -0.406385f, -0.019956f, -0.864694f, + 5.963707f, -0.201157f, 0.726377f, -0.011076f, 0.010553f, -0.102918f, + -2.230088f, -0.258098f, -0.039547f, -0.029262f, -0.082324f, -0.860222f, + -0.094735f, -1.381839f, 0.587298f, -0.173048f, 0.721360f, 0.241900f, + 0.764302f, -0.023609f, -1.173755f, 0.103912f, -0.185363f, 0.078435f, + -2.245062f, -0.127269f, 0.202234f, 0.158975f, -0.260909f, 0.098608f, + -0.348247f, 1.732502f, -0.412298f, -0.269602f, -0.425771f, -0.146243f, + -0.530730f, 0.125716f, -1.004419f, 0.145109f, -0.059289f, 1.096304f, + 0.012891f, 0.045033f, -0.306875f, 0.003514f, -0.176110f, 0.037544f, + -0.441537f, -0.518921f, -0.262149f, -0.060407f, -0.379419f, -0.141245f, + -0.128894f, -0.176537f, -1.161318f, -0.249100f, -0.118330f, 0.042816f, + 1.173404f, 0.088312f, -0.393568f, -0.175134f, 6.529819f, -0.326652f, + -0.631917f, -0.393476f, 0.057781f, -0.217748f, -1.781139f, -0.012614f, + -0.212621f, -0.720322f, -0.218498f, -0.388556f, -0.254796f, -0.248399f, + -0.608744f, -0.265146f, 0.238517f, 0.066882f, -2.916806f, 0.054642f, + 0.282590f, 0.075248f, 0.010188f, -0.133486f, 0.985945f, -0.045849f, + -0.347564f, 0.057320f, -0.417920f, 0.063664f, 0.387062f, -2.692059f, + -0.535549f, 0.263736f, 0.327889f, -0.070273f, -0.775254f, 0.147250f, + 3.309425f, -0.212191f, -0.067204f, -2.912663f, -0.061496f, 0.084233f, + 0.022907f, 0.138421f, -0.112159f, -0.288447f, -0.010799f, 0.056049f, + -0.036527f, 0.021525f, 0.106649f, -0.291883f, 0.088424f, -0.057773f, + -0.086031f, 0.015277f, -0.318505f, -0.269049f, -1.008913f, -0.224785f, + -0.025820f, -0.649037f, 0.706381f, 0.096410f, 0.643776f, -0.046743f, + -0.009654f, -0.024246f, 1.469255f, -0.183536f, -0.370046f, -0.048442f, + -0.376527f, -0.431264f, -0.245109f, -0.093951f, 0.203683f, -0.099872f, + 0.087210f, 0.160692f, -3.527694f, -0.068891f, -0.228994f, -0.231817f, + -0.241949f, 0.193613f, 0.979597f, -0.091259f, 0.414424f, -0.047341f, + -0.209582f, -0.295134f, -0.016824f, 0.460327f, -0.072671f, 0.246234f, + 0.235896f, 0.127238f, -1.068683f, 0.035648f, 2.254888f, 0.180105f, + -0.260098f, -2.322120f, -0.184249f, -0.314801f, -0.099969f, -0.272117f, + -0.237916f, 0.031103f, -0.274063f, -0.049384f, -0.044917f, 0.102477f, + -0.342148f, -0.257558f, -0.346300f, 0.115333f, -0.115456f, 0.208354f, + 
-0.359301f, -0.167395f, 1.146514f, -0.177861f, -0.098658f, -0.444570f, + 6.759993f, -0.369772f, -0.831118f, 0.001866f, -0.073298f, -0.072095f, + 0.811902f, -0.431997f, -0.286587f, -0.269500f, 0.111492f, -0.525364f, + -0.351785f, -2.463474f, -1.852659f, 0.135325f, 0.138267f, 0.100643f, + -2.373278f, -0.285514f, -0.395388f, -0.185016f, -0.030249f, -0.005767f, + -0.716424f, -0.031674f, 0.011147f, 0.057405f, -0.215873f, -0.094401f, + 0.573528f, -1.223820f, 0.414852f, -0.059053f, -0.076488f, -0.287168f, + -0.842640f, 0.174084f, -0.567186f, 0.336629f, -0.062514f, 2.075448f, + -0.061680f, -0.131529f, -0.098994f, -0.204111f, -0.347865f, 0.108516f, + -0.049616f, -0.069212f, -0.273935f, -0.096545f, -0.210784f, -0.284698f, + 0.141501f, -0.176924f, -0.361341f, -0.251197f, -0.286694f, 0.245569f, + -1.521661f, -0.122639f, -0.015760f, -0.718912f, 5.877828f, 0.146916f, + 0.151767f, 0.220785f, -0.032298f, 0.230902f, 0.663943f, -0.252613f, + 0.057718f, -0.436038f, -0.323994f, -1.139787f, -0.042489f, -1.326298f, + -1.031206f, -0.104136f, 0.389897f, 0.127602f, -2.667789f, -0.212366f, + -0.506262f, -0.009115f, -0.213202f, 0.076167f, -1.629405f, 0.055129f, + 0.375393f, -0.150272f, -0.241515f, -0.326497f, 0.100069f, 0.410703f, + 0.340622f, 0.042437f, -0.349945f, 0.041176f, -1.178950f, 0.030992f, + 0.933908f, -0.035844f, -0.098660f, 1.030584f, -0.092043f, -0.355739f, + -0.305562f, 0.036161f, -0.049558f, -0.033225f, -0.403856f, -0.088276f, + 0.215493f, -0.149105f, -0.013363f, 0.025886f, -0.101306f, -0.205781f, + -1.072487f, -0.076019f, 0.077555f, 0.131003f, 1.267763f, -0.008954f, + -0.327617f, -0.246539f, 6.664081f, -0.404403f, -1.442489f, 0.191301f, + -0.336361f, 0.181156f, 0.833108f, 0.007879f, -0.194464f, -1.029408f, + -0.036268f, -0.927110f, -0.379190f, -0.293443f, -1.848579f, -0.242548f, + -0.065990f, 0.203160f, -0.291788f, 0.000680f, 0.587011f, -0.241289f, + 0.037034f, 0.000552f, 1.072308f, -0.387230f, -0.230050f, 0.292322f, + -0.720001f, 0.034109f, -0.467260f, 2.211644f, -1.839191f, -0.048797f, + -0.083469f, -0.334686f, -0.269056f, 0.051295f, 1.319904f, -0.035603f, + -0.018457f, -0.824915f, -0.212285f, -0.230516f, -0.035093f, -0.400843f, + -0.305469f, -0.099011f, 0.014225f, -0.452772f, 0.170331f, -0.389312f, + -0.115084f, -0.014770f, -0.429387f, -0.155961f, -0.568200f, -0.037853f, + -0.125137f, 0.067228f, -1.329271f, -0.117874f, -0.132499f, -0.218376f, + -0.588325f, -0.320024f, 0.085695f, -0.235047f, -0.217790f, 0.103015f, + -0.698644f, 0.017766f, -0.058299f, 0.199411f, -0.122485f, -0.563949f, + -0.349011f, -0.557045f, -0.131165f, 0.002281f, 0.118559f, -0.210302f, + -1.153815f, 0.116738f, -0.236007f, -0.003487f, -0.006885f, -0.244816f, + 0.953222f, 0.093748f, 0.266869f, 0.241869f, -0.860832f, -0.387012f, + -0.338986f, 2.097515f, -1.942512f, -0.298021f, 0.543911f, -0.043214f, + 0.082125f, -0.120242f, 0.712231f, 0.213327f, -0.301687f, -0.544011f, + -0.392131f, 0.004302f, 0.004825f, -0.317440f, -0.107518f, -0.293407f, + -0.159111f, -0.080367f, 0.132663f, -0.017726f, -0.237521f, -0.190297f, + -0.361633f, 0.200518f, -0.538296f, -0.027975f, -0.381704f, -0.016963f, + 0.630105f, -0.190997f, -0.287840f, -0.603488f, 3.605598f, -0.276614f, + -1.346383f, 0.186912f, -0.047575f, -0.189232f, -1.519072f, 0.097816f, + -0.223722f, 0.304924f, -0.213022f, -1.052433f, -0.322283f, -1.706734f, + -2.458027f, 0.237976f, 0.171050f, -0.103139f, -0.278689f, 0.329824f, + -0.262448f, -0.122916f, -0.236398f, -0.013848f, -0.969160f, -0.374907f, + 0.091018f, -0.386471f, -0.723940f, 0.064956f, -0.057652f, 1.321024f, + -1.397418f, 
-0.143136f, 0.272468f, -0.030749f, 0.037324f, 0.069316f, + -0.904925f, -0.333693f, -0.117709f, 2.279598f, -0.428065f, -0.131157f, + -0.014288f, -0.402862f, -0.666090f, 0.017070f, -0.028333f, 0.002481f, + 0.197156f, -0.038120f, -0.271062f, -0.188275f, -0.021370f, -0.070849f, + -0.905007f, -0.095886f, -0.093055f, -0.121821f, -1.239812f, -0.411799f, + -0.089948f, -0.936827f, 1.437569f, -0.388908f, 0.126170f, 0.186162f, + -0.018819f, -0.138364f, -1.066412f, -0.138222f, -0.022186f, 0.107331f, + -0.230436f, -1.352605f, -0.161323f, -1.081810f, -0.933825f, -0.136675f, + 0.378157f, 0.113377f, -0.850610f, 0.080245f, -0.087305f, -0.002852f, + 0.044408f, -0.188172f, -1.891998f, 0.092189f, 0.125325f, -0.105090f, + -0.848510f, -0.396308f, -0.384130f, 2.007509f, -1.480787f, -0.126946f, + 0.314767f, 0.000195f, -0.285628f, -0.110442f, -0.293948f, 0.258559f, + -0.417603f, 1.570705f, 0.092459f, -0.340974f, -0.284754f, -0.007801f, + -0.324610f, -0.004734f, -0.207716f, -0.057175f, 0.055467f, -0.210830f, + -0.113005f, -0.299177f, 0.068074f, 0.017929f, -2.897598f, -0.260074f, + -0.014422f, -0.206467f, 1.246997f, -0.372863f, -0.214160f, -0.114035f, + 5.805862f, 0.003611f, -1.340990f, -0.021085f, -0.260431f, -0.002720f, + -1.251640f, -0.353531f, -0.304009f, -0.153376f, +}; + +static const float av1_ab_partition_nn_bias_32_layer1[LABEL_SIZE] = { + -0.521497f, -1.061572f, -0.078756f, -0.660662f, -0.403741f, -0.960163f, + 0.001427f, 0.523607f, 0.225068f, -0.055273f, 1.019519f, 1.181880f, + -0.010198f, 0.130597f, 1.276752f, 2.028188f, +}; + +static const NN_CONFIG av1_ab_partition_nnconfig_32 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 64, // num_hidden_nodes + }, + { + av1_ab_partition_nn_weights_32_layer0, + av1_ab_partition_nn_weights_32_layer1, + }, + { + av1_ab_partition_nn_bias_32_layer0, + av1_ab_partition_nn_bias_32_layer1, + }, +}; + +// nn model for ab partition pruning, 16x16. 
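+// As with the 128x128 and 32x32 models above, FEATURE_SIZE (10) inputs feed
+// one 64-node hidden layer that produces LABEL_SIZE (16) partition labels.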
+static const float av1_ab_partition_nn_weights_16_layer0[FEATURE_SIZE * 64] = { + 0.151902f, 0.007947f, -1.788454f, 0.431869f, -2.971387f, 0.923566f, + 1.632542f, -1.665136f, -0.338632f, -5.075884f, 0.398267f, 0.030467f, + 2.263534f, -0.045532f, -1.066128f, 0.915139f, -0.560500f, -3.293125f, + 2.072793f, -1.011414f, 0.122716f, -0.060169f, -0.388860f, 0.031019f, + -0.381861f, 0.001551f, -0.328472f, 0.038296f, -0.060398f, -0.375556f, + 0.209226f, 0.014764f, -1.443469f, -0.345486f, 2.409269f, 1.524846f, + -0.640666f, 1.322139f, -2.074771f, -0.580944f, -0.203960f, -0.072893f, + 0.329701f, 0.115339f, -1.339542f, 0.249024f, -0.421545f, -0.409151f, + -0.258293f, 0.836288f, -0.073685f, -0.009624f, 0.895712f, 0.320639f, + 0.451002f, -1.544558f, 0.193709f, -1.389012f, 1.305451f, 0.089795f, + 0.050338f, -0.017433f, -0.304667f, 0.500729f, 0.504346f, 0.073757f, + 0.582649f, -0.993623f, 1.766766f, -3.067265f, -0.415774f, -0.006036f, + -1.245281f, 0.253205f, -0.591245f, -0.626238f, 0.551852f, 0.593755f, + 0.491023f, 1.099384f, -0.348448f, 0.054564f, -0.451422f, -0.375781f, + -0.248390f, -0.052548f, -0.380069f, -0.165391f, -0.297968f, -0.052142f, + -0.316381f, -0.045246f, -0.243905f, -0.034169f, -0.247523f, -0.180773f, + 0.068066f, -0.374920f, 0.057536f, -0.189748f, 0.058375f, -0.267749f, + -0.147286f, -0.246153f, 0.006183f, -0.202029f, -0.059128f, 0.116852f, + 0.134719f, -0.126900f, -0.064646f, -0.196458f, -0.182331f, 0.108029f, + -0.264499f, 0.155816f, -0.107255f, -0.056983f, -0.209771f, -0.099070f, + 0.007313f, -0.254124f, -0.231964f, -0.275972f, 0.032098f, -0.264564f, + -0.208743f, 0.155599f, -0.121511f, -0.156145f, -0.162315f, -0.059788f, + -0.257073f, -0.076654f, -0.110616f, -0.321675f, -0.051952f, 0.006301f, + -0.154114f, 0.017032f, -0.017364f, -0.233247f, 0.009918f, -0.179289f, + -0.190722f, 0.147106f, -0.063910f, -0.396872f, -0.263123f, -0.003850f, + -0.040718f, -0.324699f, 0.118660f, -0.170727f, -0.316788f, 0.100886f, + -0.202842f, 0.045371f, 0.150561f, -0.057054f, -0.308150f, 0.028346f, + -0.381473f, -0.195365f, 0.026221f, -0.281795f, 0.087204f, 0.047689f, + -0.027643f, -0.104724f, -0.089030f, -0.117661f, -0.349160f, 0.056982f, + -0.340273f, 0.048086f, 0.046103f, -0.121527f, 0.021697f, 0.054109f, + -0.002768f, -0.008461f, -2.297240f, 0.124651f, 3.621661f, -0.057120f, + -1.151656f, 2.296894f, -3.678720f, -0.290240f, 0.087683f, -0.186389f, + 0.007656f, -0.090236f, -0.245217f, 0.110389f, -0.251719f, -0.029084f, + -0.128203f, -0.100005f, -0.032779f, 0.007281f, -0.366596f, -0.267870f, + -0.215620f, 0.047687f, 0.010303f, 0.097980f, -0.191569f, -0.341162f, + 0.119249f, 0.026279f, -2.161546f, 0.459591f, 1.290566f, 1.791797f, + -0.409835f, 0.127081f, -1.156367f, 0.198286f, 0.099561f, -0.067445f, + -0.034352f, 0.017966f, -0.277380f, -0.057220f, -0.174198f, -0.014164f, + 0.146090f, -0.357530f, 0.097644f, -0.000932f, 0.446603f, -0.066793f, + 2.448620f, 0.937617f, -1.232922f, 0.313183f, 0.816827f, -0.275115f, + -0.245205f, -0.126895f, 0.156668f, -0.186977f, -0.273505f, 0.013315f, + 0.168629f, -0.089084f, 0.006166f, -0.116107f, -0.199316f, -0.024010f, + -0.242303f, 0.011612f, -0.218485f, -0.229661f, -0.123922f, 0.136699f, + 0.006732f, -0.148718f, -0.164225f, 0.116063f, 1.587898f, 0.690519f, + 0.360566f, 0.009739f, -0.678702f, -0.046003f, 0.126984f, 0.605212f, + 1.240663f, -0.000228f, -1.119369f, -0.415589f, -0.721003f, 0.097936f, + -1.410586f, -2.358833f, -2.773129f, -3.983361f, -0.087144f, -0.050029f, + -0.242255f, 0.137424f, -0.307490f, -0.084637f, -0.023812f, -0.196582f, + -0.078695f, 0.038257f, 
-0.012110f, -0.263521f, 0.009839f, -0.109125f, + -0.226036f, 0.060712f, 0.093671f, 0.153143f, 0.039116f, -0.290891f, + 0.227057f, -0.204633f, -0.207539f, -0.148242f, 0.046204f, -0.231268f, + -0.209315f, -0.307579f, -0.436556f, 0.023475f, 0.131793f, -0.038301f, + 1.650584f, 0.392570f, 1.446576f, 1.254380f, -0.516867f, -0.057116f, + 0.149320f, 0.414424f, -0.246309f, 0.003877f, -0.480238f, -1.037035f, + -0.830779f, -1.122244f, -0.408267f, -0.253956f, 0.382005f, 0.940609f, + -1.113370f, -0.018554f, 0.141064f, -0.182504f, 1.270707f, 0.414904f, + -0.216036f, 0.203831f, 0.450716f, -0.452909f, 0.139358f, -0.027143f, + 1.956892f, 1.643732f, -0.867839f, -0.620520f, -0.334607f, -0.519982f, + 0.205023f, 0.661159f, -0.000809f, 0.049033f, -0.348579f, -0.200338f, + -0.362144f, -0.346590f, -0.230096f, 0.180746f, -0.149954f, -0.253429f, + -0.378170f, -0.040724f, -0.041597f, 0.243659f, -0.472181f, 0.015401f, + -0.180376f, 0.153139f, -0.247738f, -0.010485f, -0.157158f, 0.016825f, + -0.238925f, -0.265798f, -0.318374f, 0.142352f, -0.210520f, 0.051928f, + -0.352190f, -0.179052f, -0.185498f, 0.025540f, -0.111667f, -0.235187f, + -0.215454f, 0.010931f, -0.238372f, -0.126659f, 0.075691f, -0.091167f, + -2.462379f, -0.007950f, -0.637990f, 0.285554f, -0.051275f, 0.282279f, + -0.744083f, -0.570646f, 0.592198f, 1.421332f, -0.256027f, -0.140315f, + 0.160247f, -0.063185f, -0.055895f, -0.199864f, -0.287353f, -0.074561f, + -0.071228f, 0.055864f, -1.084764f, -0.263409f, 0.779266f, 0.228187f, + 0.375013f, 0.121204f, -0.656948f, 0.533561f, 0.272671f, -0.015423f, + -0.124180f, -0.009127f, 2.934838f, -0.150998f, 1.163152f, 0.081997f, + -4.715939f, -3.676595f, -1.524886f, -0.167593f, 0.281186f, 0.024046f, + -1.451709f, 0.332558f, 0.990504f, 0.376290f, -1.466773f, -0.448439f, + -2.929108f, -4.255188f, 0.065238f, 0.019950f, 1.372393f, 0.444052f, + -2.538772f, 1.579767f, -0.464911f, -1.866114f, 1.053958f, 0.434467f, + -0.125964f, 0.034671f, 0.077116f, -0.138466f, -0.413395f, -0.223453f, + -0.172127f, -0.251265f, -0.048239f, -0.395519f, 0.023141f, 0.037459f, + -0.249593f, -0.062215f, -0.047209f, -0.435189f, -0.164155f, -0.077590f, + -0.241164f, -0.126128f, -0.038243f, -0.180888f, 0.198840f, -0.328036f, + -0.169790f, 0.036506f, 0.052572f, -0.183570f, -0.073617f, -0.244959f, + 0.266498f, 0.032846f, -1.902106f, 0.486078f, 2.414993f, 0.975182f, + -0.382875f, 1.647810f, -2.197017f, -0.890107f, 0.221287f, 0.010889f, + 3.817042f, 0.572728f, 0.092466f, 0.473337f, -1.634659f, -1.069455f, + 1.486776f, -1.023850f, 0.088184f, 0.008842f, 0.518202f, 0.270259f, + 1.757191f, -0.121839f, -2.912229f, -1.250866f, -2.381808f, 0.335309f, + -0.120079f, -0.061294f, -0.058725f, -0.315169f, -0.262443f, 0.072434f, + -0.267836f, -0.319354f, -0.274975f, 0.068970f, -0.406467f, 0.044074f, + -0.152311f, -0.333656f, -0.228355f, -0.185613f, 0.017346f, -0.177674f, + -0.090675f, -0.102047f, -0.011768f, -0.025280f, -0.271661f, 0.098099f, + -0.312272f, -0.222217f, -0.100548f, 0.106260f, -0.034655f, 0.135109f, + -0.021276f, 0.018177f, -0.353097f, -0.011128f, 0.061136f, -0.511662f, + -0.223236f, -0.308841f, 0.118789f, -0.154628f, -0.053178f, -0.055973f, + 0.013175f, -0.368337f, -0.090863f, -0.116920f, 0.178990f, -0.025278f, + -0.190553f, -0.238092f, 0.303943f, -0.024944f, 0.719373f, 0.384332f, + -0.378480f, -0.423316f, 0.709922f, 0.758514f, -1.559023f, -2.503173f, + 0.068652f, -0.234741f, -0.182932f, 0.037878f, 0.020684f, -0.174142f, + -0.182300f, -0.052796f, -0.219145f, 0.113028f, -1.041826f, 0.035317f, + 0.919904f, -0.676011f, 0.652297f, 1.456447f, -0.166904f, 
-0.861823f, + 0.895827f, 0.429821f, -0.180376f, -0.076587f, -0.273945f, -0.288990f, + -0.206692f, -0.080745f, -0.085444f, 0.186953f, -0.050135f, 0.044243f, + -0.391706f, -0.160498f, -0.292268f, 0.164060f, 0.412649f, 0.211611f, + -0.327294f, -0.919399f, 0.320297f, 0.385284f, -0.088848f, -0.072556f, + -0.384813f, -0.176267f, -0.065918f, 0.134724f, -0.231104f, -0.337707f, + -0.195442f, -0.263569f, 0.098090f, -0.341411f, -0.189211f, -0.439276f, + -0.404046f, 0.262491f, -0.311093f, -0.086454f, -0.013400f, -0.061447f, + -0.026945f, -0.112036f, -0.322985f, 0.078500f, -0.230205f, -0.344535f, + -0.021087f, 0.110220f, -0.128671f, 0.044219f, +}; + +static const float av1_ab_partition_nn_bias_16_layer0[64] = { + 2.936406f, -0.396539f, -0.110456f, -1.254954f, 0.785350f, 0.516290f, + -0.172341f, 0.254386f, -0.192465f, -0.106751f, -0.055518f, -0.094994f, + 0.000000f, -0.065018f, -0.004908f, -0.130483f, -0.119580f, -0.142072f, + 0.457446f, -0.125051f, -0.107712f, 0.714607f, -0.140809f, -1.788650f, + -0.087199f, 0.000000f, -1.290050f, 0.443930f, -0.110634f, -0.109380f, + -0.188213f, -1.414179f, 1.193579f, 0.388775f, -0.873193f, -0.110050f, + -0.072565f, -0.117050f, -0.119132f, 0.456959f, -0.132069f, 0.131974f, + 1.160474f, 1.746465f, 0.442628f, -0.188849f, -0.207794f, -0.108364f, + -0.856655f, -2.141620f, 0.335476f, -0.105508f, -0.212162f, -0.109319f, + -0.237213f, -0.109980f, -0.291044f, -0.137877f, 0.470191f, -0.023908f, + 0.123809f, -0.109797f, 0.200510f, -0.147542f, +}; + +static const float av1_ab_partition_nn_weights_16_layer1[64 * LABEL_SIZE] = { + -6.823716f, 1.406568f, -0.144009f, 2.228765f, 0.838336f, 0.738107f, + -0.319014f, -0.148756f, 0.240862f, -0.111089f, -0.004241f, 0.025758f, + -0.193820f, -0.246362f, -0.181363f, -0.201556f, 0.024268f, 0.252994f, + -0.289443f, 0.194932f, 0.057467f, 0.724735f, 0.014063f, 1.361352f, + 0.025191f, 0.024274f, 0.231462f, -7.227959f, -0.094515f, 0.039946f, + 0.412719f, 0.812318f, 3.038903f, -0.286289f, 0.647482f, -0.115114f, + 0.053590f, 0.066069f, 0.153134f, 0.996250f, -0.125700f, 0.951365f, + -6.243494f, -4.827697f, 0.566320f, 0.239515f, -0.099702f, 0.054546f, + 1.847330f, 3.680076f, -3.049829f, -0.127709f, 0.068469f, -0.017794f, + 0.223864f, -0.106778f, -0.020425f, -0.040226f, -0.251890f, -0.168673f, + -0.552073f, 0.043311f, 0.218668f, 0.033209f, -3.199210f, 0.193079f, + 0.321406f, 0.718307f, -0.181418f, -0.459612f, -1.981170f, 0.968496f, + -0.029757f, -0.130065f, 0.043782f, 0.072394f, -0.088686f, 0.025322f, + 0.129882f, 0.101324f, 0.335707f, 0.072714f, -2.079774f, 0.203997f, + 0.239321f, -0.301757f, 0.257845f, 1.288382f, -0.031275f, -0.234194f, + 0.310722f, 2.045469f, 0.034716f, 0.135638f, -0.251388f, 0.320071f, + -1.065301f, -0.322731f, -0.545028f, 0.226276f, 0.090799f, 0.019289f, + 0.048950f, -1.079300f, 0.231938f, 0.083683f, 4.762127f, 0.145037f, + -0.145549f, 0.075592f, 0.172336f, 0.108175f, 0.333751f, 1.090501f, + 1.056114f, 0.047073f, 0.182052f, -0.081587f, 0.089900f, 0.339286f, + 2.049988f, 0.073585f, 0.537355f, -0.243322f, -0.010179f, -0.052601f, + -0.174915f, 0.117793f, 2.222990f, -2.520837f, -0.092699f, 1.199887f, + 0.138720f, 0.679918f, -0.463155f, -0.659496f, -0.109913f, -0.003398f, + 0.114633f, -0.128377f, 0.092970f, -0.107489f, -0.191078f, 0.185182f, + 0.216980f, -0.019343f, 3.443133f, 0.287953f, 0.099314f, 0.985958f, + 0.157268f, -0.606516f, 0.049418f, -0.221809f, -0.453081f, -0.344796f, + -0.003735f, -0.107269f, -0.128541f, -0.259543f, -0.934806f, -0.542456f, + -1.011192f, 0.022795f, 0.186363f, -0.076356f, -0.050932f, -0.165098f, + 0.168177f, 
-0.101596f, -5.270886f, 2.553943f, -0.440870f, -0.017494f, + 0.215208f, -0.017032f, 1.495915f, -4.304677f, 0.762211f, 0.182937f, + 0.254406f, -0.029433f, -0.088364f, -0.110160f, -0.108257f, -0.036538f, + 0.737697f, -0.234989f, 0.168095f, 0.245118f, -0.077262f, 0.195718f, + 0.753302f, -1.637869f, 0.126227f, 0.982129f, -0.121444f, -0.295570f, + -1.215799f, 0.147867f, -0.068496f, 0.132726f, -0.005772f, -0.181774f, + 0.126513f, 0.204723f, -0.366123f, 0.103906f, -0.148053f, -0.075272f, + 0.243884f, -0.104828f, 0.198988f, 0.501034f, -0.112671f, 0.111421f, + 0.167508f, -0.117803f, -0.738624f, 2.046292f, 0.124011f, 0.057983f, + -0.359154f, -0.648883f, -0.259462f, -0.459041f, -2.501223f, -0.065138f, + 0.122417f, 0.060291f, -0.129033f, -0.843086f, 0.268241f, -0.399927f, + 1.585888f, 1.816393f, -0.631427f, 0.127826f, 0.088105f, 0.073488f, + 0.717694f, -1.497362f, 2.608528f, 0.066896f, -0.079230f, 0.223436f, + -0.010530f, 0.175310f, 1.120365f, 0.034391f, 0.835312f, 0.071652f, + -0.080615f, 0.111395f, 0.162742f, 0.079927f, -3.859582f, -0.638431f, + -0.167880f, -0.992659f, -0.885355f, -1.276197f, 1.334344f, 0.931940f, + -0.078244f, -0.149030f, -0.070974f, -0.133566f, 0.200034f, 0.102793f, + -0.048546f, 0.063545f, 0.023864f, -0.190863f, 1.934257f, -0.136286f, + -0.107916f, -0.637468f, 0.066449f, 1.089693f, -0.214047f, -0.265780f, + 0.899660f, -0.130333f, 0.288311f, -0.049024f, 0.090202f, 0.487969f, + 0.339704f, 0.858479f, 0.841253f, -0.184100f, -0.637070f, -0.125071f, + -0.077650f, -0.087877f, 0.202268f, -0.027300f, 2.842862f, -0.100698f, + -0.259080f, 0.260556f, 0.157912f, -0.070364f, 0.467190f, 1.200037f, + 1.419317f, -0.033588f, -0.227824f, 0.292617f, 0.228574f, 0.213839f, + -1.091099f, -0.022258f, -1.294681f, 0.136118f, 0.081652f, -0.185359f, + -0.039706f, 0.191407f, -2.053219f, -0.261934f, 0.047812f, -0.029536f, + -0.823869f, -1.090534f, -0.755890f, 0.441035f, -0.167945f, 0.231441f, + -0.135013f, -0.260762f, 0.256872f, 0.130339f, -0.243751f, 0.189760f, + -0.288454f, 0.145363f, 0.338490f, 0.403898f, -0.022814f, -1.263598f, + -0.101315f, 0.860135f, 0.136511f, 0.028942f, 0.574047f, 2.656370f, + 0.037587f, -0.188690f, -0.125312f, 1.100435f, -1.080402f, 0.380905f, + 0.004635f, 0.097144f, -0.214309f, 0.085552f, -0.285066f, -0.705134f, + -0.054704f, -0.319951f, 5.486626f, 0.958158f, -1.380585f, 0.223340f, + -0.169167f, -0.170697f, -0.216748f, 0.324232f, 2.684204f, -0.008490f, + -0.211052f, -0.201190f, 0.123466f, -0.000234f, 0.579907f, 0.096938f, + -0.042745f, 0.201855f, 0.157195f, -0.261440f, 0.029699f, -0.046599f, + 1.618216f, -2.596280f, -0.377420f, -0.526725f, -0.493592f, -0.579615f, + 0.579699f, -0.100392f, 0.150694f, 0.061794f, 0.200425f, -0.062515f, + -0.179122f, 0.250112f, -0.344675f, -0.118359f, -0.095670f, 0.152311f, + 3.662276f, -0.154921f, -0.312991f, 0.972008f, -0.308596f, -0.190426f, + 0.133889f, -0.238673f, -0.094726f, 1.683835f, -0.215629f, -0.198890f, + -0.035278f, -0.367973f, -0.822435f, 0.240848f, -0.194656f, 0.034655f, + -0.079424f, 0.146670f, 0.026646f, -0.034507f, 0.059467f, -0.153109f, + -0.431033f, 2.552991f, -1.894091f, -0.180462f, -0.306839f, -0.025648f, + 1.026326f, -3.096230f, 1.346935f, 0.033633f, -0.181827f, 0.094376f, + 0.001696f, -0.379264f, -1.069503f, -0.140972f, -0.208769f, -0.195239f, + 0.281795f, -0.127251f, 0.180776f, 0.067763f, 0.697124f, -1.040779f, + 0.111280f, 0.188351f, -0.340234f, -0.207790f, -0.720075f, -0.137409f, + -0.070310f, -0.032918f, -0.060787f, 0.131484f, -0.077845f, -0.258652f, + 0.056911f, -0.062034f, 0.007663f, -0.185100f, 1.340361f, 0.014096f, + 
-0.124602f, 0.194241f, 0.128383f, 0.360465f, 0.082979f, -0.050475f, + -0.519294f, 3.323262f, 0.067014f, 0.221203f, -0.085082f, -0.228606f, + -0.916668f, -0.022643f, -1.386737f, -0.131902f, -0.349952f, -0.032874f, + -0.189190f, -0.898790f, -0.102394f, -1.017387f, 2.214050f, 1.790253f, + -1.913561f, -0.043716f, -0.214924f, -0.194598f, -0.064723f, -1.671793f, + 2.251166f, -0.146007f, 0.138527f, -0.003134f, 0.103665f, 0.006928f, + -0.240253f, -0.227464f, 0.578437f, -0.214724f, 0.503085f, 0.158093f, + 0.033091f, 0.008061f, 4.815371f, 2.132264f, 0.281850f, -2.288560f, + -0.145012f, 1.296832f, -0.362401f, -0.403252f, 0.109873f, 0.185746f, + 0.244764f, 0.172367f, -0.185588f, 0.139801f, -0.178254f, 0.068629f, + 0.358488f, -0.153969f, -6.433524f, 0.225983f, -0.138123f, -0.095971f, + -0.036089f, -1.400083f, 0.265908f, 0.257787f, 0.181144f, -1.647228f, + -0.136289f, -0.074206f, 0.122988f, -0.088895f, -1.266717f, 0.006010f, + 0.536681f, 0.263061f, -0.032207f, -0.155136f, 0.086431f, 0.441950f, + -0.060755f, -0.280683f, -0.783475f, -2.567033f, 1.093221f, 0.117667f, + -0.000408f, 0.225719f, -2.199698f, 0.141447f, -1.459051f, 0.051315f, + 0.203228f, 0.354432f, -0.005775f, -0.028073f, -0.965817f, 0.231083f, + -0.666884f, 0.026283f, -0.317486f, 0.210754f, 0.123897f, 0.223827f, + 4.214405f, 1.457334f, -0.253945f, -1.306733f, -0.391235f, 0.451154f, + -1.553888f, -0.353429f, 0.069533f, 0.159278f, -0.173836f, -0.004952f, + -0.137033f, 0.127012f, 0.143600f, 0.051587f, -0.070549f, 0.066509f, + -5.776547f, 0.180021f, -0.189183f, -1.288504f, -0.233575f, -1.473873f, + 0.140940f, 0.144451f, -0.104534f, 2.089873f, -0.168168f, 0.110726f, + 0.132134f, -0.215223f, -1.682754f, 0.157757f, -0.146163f, 0.064882f, + 0.117313f, -0.038780f, -0.124720f, -0.501697f, 0.092047f, -0.233992f, + 3.324976f, 0.516601f, 1.294202f, 0.119989f, 0.061055f, 0.043420f, + -2.750727f, -0.382812f, -0.648496f, -0.115353f, -0.334205f, 0.024354f, + -0.282998f, -0.282705f, 0.073798f, 0.169851f, 0.135651f, 0.182677f, + -0.040220f, 0.132462f, -0.303120f, -0.230113f, 6.165739f, -0.258596f, + 0.024127f, -1.388283f, -0.006042f, 0.572600f, 0.348411f, -0.387376f, + -0.075845f, 0.122319f, -0.029616f, 0.077873f, 0.154763f, 0.049073f, + 0.018597f, 0.102688f, -0.204165f, 0.020734f, -1.389133f, -0.032854f, + -0.147561f, 0.853944f, 0.132100f, -3.259659f, 0.243745f, 0.181529f, + -0.738414f, 1.509994f, 0.023470f, -0.005329f, 0.066115f, -1.345081f, + -1.455402f, -0.172023f, -0.194625f, 0.071885f, -0.201742f, -0.262402f, + 0.077601f, -0.048938f, 0.257993f, -0.504029f, -2.032415f, 1.158880f, + 0.448647f, -0.025633f, 0.117586f, -0.072275f, -0.673744f, -3.854342f, + -0.983843f, 0.047766f, -0.017193f, -0.215775f, -0.158743f, -0.232042f, + -0.509112f, 0.148812f, 0.130122f, 0.006486f, -0.099016f, 0.022514f, + -0.486850f, -0.059623f, 4.012731f, 0.025454f, 0.029059f, -0.783546f, + -0.295260f, 0.322521f, -0.473201f, -0.172100f, -0.100087f, -0.076516f, + -0.258367f, -0.112897f, 0.269364f, -0.065912f, 0.169022f, -0.178783f, + -0.095114f, 0.122089f, -2.790099f, -0.100431f, -0.087963f, -0.009431f, + -0.087819f, -2.774399f, -0.100757f, 0.013005f, -0.964533f, 3.236665f, + -0.354903f, -0.144169f, -0.166869f, -1.396513f, -0.931271f, -0.046261f, + -1.799262f, -0.365269f, 0.108611f, 0.037994f, 0.024747f, -1.073639f, + -0.203158f, -0.935006f, 1.880891f, 1.578385f, 0.726272f, -0.024546f, + -0.011626f, -0.151363f, -1.121716f, -1.787484f, 0.232806f, 0.075451f, + 0.182899f, 0.092215f, -0.207347f, -0.030111f, 0.054316f, 0.192481f, + 0.594639f, -0.247694f, 0.547471f, -0.032094f, -0.065000f, 
0.007198f, + 1.605377f, -0.155945f, -0.066200f, -2.343716f, -1.016283f, -0.079321f, + 0.919365f, 0.599980f, 0.125545f, 0.265813f, 0.246884f, 0.095385f, + -0.260374f, -0.202916f, -0.042770f, 0.234967f, -0.233139f, -0.326994f, + -1.375256f, 0.121766f, 0.077433f, -1.103569f, 0.019497f, -1.029185f, + 0.253905f, 0.206569f, 0.187334f, -0.237089f, -0.294351f, 0.164137f, + 0.149696f, -0.749787f, -0.413433f, 0.976587f, 1.027976f, -0.285264f, + 0.209273f, -0.124762f, 0.050884f, 0.250764f, -0.082031f, -0.646520f, + 4.116680f, 0.437336f, 0.671684f, 0.129509f, -0.078462f, 0.014072f, + -0.678232f, 0.094831f, 1.125624f, 0.207070f, -0.154750f, -0.025780f, + -0.103030f, 0.118019f, -0.908186f, -0.263546f, -1.555324f, -0.236887f, + -0.217854f, -0.051790f, 0.017915f, 0.171001f, 1.355562f, 0.094603f, + -0.233929f, -1.282169f, -0.773183f, -0.161682f, -0.834565f, -0.286776f, + -0.298901f, 0.038162f, 0.251899f, 0.039612f, -0.022935f, -0.232308f, + -0.043855f, -0.192892f, -0.279009f, -0.182234f, -1.272808f, -0.070344f, + -0.092432f, -1.915946f, -0.134373f, -1.405496f, -0.067071f, -0.131922f, + 0.185269f, 1.465082f, 0.040240f, 0.112665f, 0.144329f, -0.286112f, + -0.617649f, 0.916177f, 0.221044f, -0.079867f, 0.170251f, -0.093638f, + -0.212620f, -0.305945f, -0.234356f, -0.482501f, 3.928472f, 1.241179f, + 0.355922f, -0.170848f, -0.189168f, 0.080225f, -1.357793f, 0.190890f, + 0.976800f, -0.068070f, -0.016295f, -0.088623f, -0.129560f, -0.212267f, + -0.071537f, -0.219501f, -0.655198f, -0.225188f, -0.116024f, 0.224174f, + -0.049715f, -0.178005f, 3.029985f, -1.141546f, 0.080066f, -1.932316f, + -0.641137f, -0.189564f, 0.935080f, 0.136119f, 0.015558f, -0.179331f, + 0.204571f, 0.020350f, 0.009362f, 0.108478f, 0.037076f, -0.049009f, + 0.081090f, -0.180202f, 1.455561f, -0.081559f, 0.059361f, 0.484971f, + 0.160923f, -2.170744f, -0.013204f, 0.126561f, -0.407122f, 1.223661f, + 0.044262f, 0.118044f, 0.058274f, -1.747100f, -0.171318f, 0.971374f, + 0.306995f, -0.103268f, -0.319443f, -0.333176f, -0.038608f, 0.119674f, + -0.106479f, -0.907933f, 1.121231f, 1.673840f, -0.421458f, -0.021146f, + -0.254838f, 0.097632f, 0.235109f, -2.901782f, 0.289518f, -0.355459f, + -0.068264f, -0.179121f, 0.068560f, -0.047570f, -0.522523f, -0.228963f, + -1.037158f, -0.163723f, 0.280563f, -0.000868f, -0.197220f, -0.239329f, + 1.985274f, -0.256181f, -0.064341f, -0.822417f, -0.465140f, -0.010942f, + -0.792024f, -0.114290f, 0.060969f, 0.104106f, -0.252123f, -0.150400f, + -0.133277f, 0.267147f, 0.274413f, 0.223744f, -0.180223f, -0.345415f, + -0.104883f, 0.119210f, -0.095041f, -0.301635f, 0.013175f, -2.128121f, + -0.147208f, -0.151509f, -0.692013f, 3.418555f, -0.016541f, 0.171511f, + 0.107159f, -1.516672f, 0.127408f, 0.687035f, -0.906486f, -0.145463f, + -0.169382f, -0.143906f, 0.125091f, -0.960645f, -0.180869f, -0.716908f, + 2.840951f, 1.904919f, -0.416268f, -0.425181f, -0.194697f, -0.075932f, + -0.950604f, -1.599800f, 0.943671f, -0.022744f, -0.270492f, 0.080843f, + -0.372916f, 0.047838f, -0.100300f, -0.026600f, 0.011733f, -0.226051f, + 0.172790f, -0.172982f, 0.041258f, -0.299379f, +}; + +static const float av1_ab_partition_nn_bias_16_layer1[LABEL_SIZE] = { + -0.053805f, -1.248639f, 0.520965f, -0.904962f, -0.126425f, -0.118798f, + 0.748430f, 0.203096f, 0.059317f, 0.418219f, 0.841294f, 0.402693f, + -0.658522f, 0.723479f, 0.544264f, 1.035225f, +}; + +static const NN_CONFIG av1_ab_partition_nnconfig_16 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 64, // num_hidden_nodes + }, + { + 
av1_ab_partition_nn_weights_16_layer0, + av1_ab_partition_nn_weights_16_layer1, + }, + { + av1_ab_partition_nn_bias_16_layer0, + av1_ab_partition_nn_bias_16_layer1, + }, +}; + +#undef FEATURE_SIZE +#undef LABEL_SIZE + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_AB_PARTITION_MODEL_WEIGHTS_H_ diff --git a/third_party/aom/av1/encoder/aq_complexity.c b/third_party/aom/av1/encoder/aq_complexity.c index 054b0e062..c5a6bc831 100644 --- a/third_party/aom/av1/encoder/aq_complexity.c +++ b/third_party/aom/av1/encoder/aq_complexity.c @@ -39,21 +39,29 @@ static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = { { -3.0, -2.0, -1.0, 100.00, 100.0 } }; -#define DEFAULT_COMPLEXITY 64 - static int get_aq_c_strength(int q_index, aom_bit_depth_t bit_depth) { // Approximate base quantizer (truncated to int) - const int base_quant = av1_ac_quant(q_index, 0, bit_depth) / 4; + const int base_quant = av1_ac_quant_Q3(q_index, 0, bit_depth) / 4; return (base_quant > 10) + (base_quant > 25); } void av1_setup_in_frame_q_adj(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; struct segmentation *const seg = &cm->seg; + int resolution_change = + cm->prev_frame && (cm->width != cm->prev_frame->width || + cm->height != cm->prev_frame->height); // Make SURE use of floating point in this function is safe. aom_clear_system_state(); + if (resolution_change) { + memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); + av1_clearall_segfeatures(seg); + av1_disable_segmentation(seg); + return; + } + if (frame_is_intra_only(cm) || cm->error_resilient_mode || cpi->refresh_alt_ref_frame || (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { @@ -74,9 +82,6 @@ void av1_setup_in_frame_q_adj(AV1_COMP *cpi) { av1_enable_segmentation(seg); - // Select delta coding method. - seg->abs_delta = SEGMENT_DELTADATA; - // Default segment "Q" feature is disabled so it defaults to the baseline Q. av1_disable_segfeature(seg, DEFAULT_AQ2_SEG, SEG_LVL_ALT_Q); @@ -107,13 +112,13 @@ void av1_setup_in_frame_q_adj(AV1_COMP *cpi) { #define DEFAULT_LV_THRESH 10.0 #define MIN_DEFAULT_LV_THRESH 8.0 -#define VAR_STRENGTH_STEP 0.25 // Select a segment for the current block. // The choice of segment for a block depends on the ratio of the projected // bits for the block vs a target average and its spatial complexity. void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs, int mi_row, int mi_col, int projected_rate) { const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); const int mi_offset = mi_row * cm->mi_cols + mi_col; const int xmis = AOMMIN(cm->mi_cols - mi_col, mi_size_wide[bs]); @@ -126,9 +131,10 @@ void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs, segment = DEFAULT_AQ2_SEG; } else { // Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh). - // It is converted to bits * 256 units. - const int64_t num = (int64_t)cpi->rc.sb64_target_rate * xmis * ymis * 256; - const int denom = cm->mib_size * cm->mib_size; + // It is converted to bits << AV1_PROB_COST_SHIFT units.
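+      // For example, a block spanning the whole superblock (xmis * ymis equal
+      // to seq_params.mib_size squared) yields target_rate equal to
+      // sb64_target_rate << AV1_PROB_COST_SHIFT; smaller blocks get a
+      // proportional share.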
+ const int64_t num = (int64_t)(cpi->rc.sb64_target_rate * xmis * ymis) + << AV1_PROB_COST_SHIFT; + const int denom = cm->seq_params.mib_size * cm->seq_params.mib_size; const int target_rate = (int)(num / denom); double logvar; double low_var_thresh; @@ -139,7 +145,7 @@ void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs, MIN_DEFAULT_LV_THRESH) : DEFAULT_LV_THRESH; - av1_setup_src_planes(mb, cpi->source, mi_row, mi_col); + av1_setup_src_planes(mb, cpi->source, mi_row, mi_col, num_planes); logvar = av1_log_block_var(cpi, mb, bs); segment = AQ_C_SEGMENTS - 1; // Just in case no break out below. diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.c b/third_party/aom/av1/encoder/aq_cyclicrefresh.c index 8f61c7eb8..a1fe37d4a 100644 --- a/third_party/aom/av1/encoder/aq_cyclicrefresh.c +++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.c @@ -320,7 +320,7 @@ void av1_cyclic_refresh_check_golden_update(AV1_COMP *const cpi) { double fraction_low = 0.0; int low_content_frame = 0; - MODE_INFO **mi; + MB_MODE_INFO **mi; RATE_CONTROL *const rc = &cpi->rc; const int rows = cm->mi_rows, cols = cm->mi_cols; int cnt1 = 0, cnt2 = 0; @@ -330,12 +330,12 @@ void av1_cyclic_refresh_check_golden_update(AV1_COMP *const cpi) { mi = cm->mi_grid_visible + mi_row * cm->mi_stride; for (mi_col = 0; mi_col < cols; mi_col++) { - int16_t abs_mvr = mi[0]->mbmi.mv[0].as_mv.row >= 0 - ? mi[0]->mbmi.mv[0].as_mv.row - : -1 * mi[0]->mbmi.mv[0].as_mv.row; - int16_t abs_mvc = mi[0]->mbmi.mv[0].as_mv.col >= 0 - ? mi[0]->mbmi.mv[0].as_mv.col - : -1 * mi[0]->mbmi.mv[0].as_mv.col; + int16_t abs_mvr = mi[0]->mv[0].as_mv.row >= 0 + ? mi[0]->mv[0].as_mv.row + : -1 * mi[0]->mv[0].as_mv.row; + int16_t abs_mvc = mi[0]->mv[0].as_mv.col >= 0 + ? mi[0]->mv[0].as_mv.col + : -1 * mi[0]->mv[0].as_mv.col; // Calculate the motion of the background. if (abs_mvr <= 16 && abs_mvc <= 16) { @@ -389,8 +389,10 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) { int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame; int xmis, ymis, x, y; memset(seg_map, CR_SEGMENT_ID_BASE, cm->mi_rows * cm->mi_cols); - sb_cols = (cm->mi_cols + cm->mib_size - 1) / cm->mib_size; - sb_rows = (cm->mi_rows + cm->mib_size - 1) / cm->mib_size; + sb_cols = + (cm->mi_cols + cm->seq_params.mib_size - 1) / cm->seq_params.mib_size; + sb_rows = + (cm->mi_rows + cm->seq_params.mib_size - 1) / cm->seq_params.mib_size; sbs_in_frame = sb_cols * sb_rows; // Number of target blocks to get the q delta (segment 1). block_count = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100; @@ -406,8 +408,8 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) { // Get the mi_row/mi_col corresponding to superblock index i. int sb_row_index = (i / sb_cols); int sb_col_index = i - sb_row_index * sb_cols; - int mi_row = sb_row_index * cm->mib_size; - int mi_col = sb_col_index * cm->mib_size; + int mi_row = sb_row_index * cm->seq_params.mib_size; + int mi_col = sb_col_index * cm->seq_params.mib_size; int qindex_thresh = cpi->oxcf.content == AOM_CONTENT_SCREEN ? av1_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex) @@ -416,14 +418,14 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) { assert(mi_col >= 0 && mi_col < cm->mi_cols); bl_index = mi_row * cm->mi_cols + mi_col; // Loop through all MI blocks in superblock and update map. 
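+    // A block with map == 0 is counted as a refresh candidate when its last
+    // coded Q is above qindex_thresh; negative map entries are incremented
+    // toward 0 and only become candidates again once they reach 0.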
- xmis = AOMMIN(cm->mi_cols - mi_col, cm->mib_size); - ymis = AOMMIN(cm->mi_rows - mi_row, cm->mib_size); + xmis = AOMMIN(cm->mi_cols - mi_col, cm->seq_params.mib_size); + ymis = AOMMIN(cm->mi_rows - mi_row, cm->seq_params.mib_size); for (y = 0; y < ymis; y++) { for (x = 0; x < xmis; x++) { const int bl_index2 = bl_index + y * cm->mi_cols + x; // If the block is a candidate for clean up then mark it // for possible boost/refresh (segment 1). The segment id may get - // reset to 0 later if block gets coded anything other than ZEROMV. + // reset to 0 later if block gets coded anything other than GLOBALMV. if (cr->map[bl_index2] == 0) { if (cr->last_coded_q_map[bl_index2] > qindex_thresh) sum_map++; } else if (cr->map[bl_index2] < 0) { @@ -479,6 +481,16 @@ void av1_cyclic_refresh_setup(AV1_COMP *const cpi) { CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; struct segmentation *const seg = &cm->seg; const int apply_cyclic_refresh = apply_cyclic_refresh_bitrate(cm, rc); + int resolution_change = + cm->prev_frame && (cm->width != cm->prev_frame->width || + cm->height != cm->prev_frame->height); + if (resolution_change) { + memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); + av1_clearall_segfeatures(seg); + aom_clear_system_state(); + av1_disable_segmentation(seg); + return; + } if (cm->current_video_frame == 0) cr->low_content_avg = 0.0; // Don't apply refresh on key frame or enhancement layer frames. if (!apply_cyclic_refresh || cm->frame_type == KEY_FRAME) { @@ -509,8 +521,6 @@ void av1_cyclic_refresh_setup(AV1_COMP *const cpi) { // Clear down the segment map. av1_enable_segmentation(&cm->seg); av1_clearall_segfeatures(seg); - // Select delta coding method. - seg->abs_delta = SEGMENT_DELTADATA; - // Note: setting temporal_update has no effect, as the seg-map coding method // (temporal or spatial) is determined in diff --git a/third_party/aom/av1/encoder/aq_variance.c b/third_party/aom/av1/encoder/aq_variance.c index 84d967215..29a311447 100644 --- a/third_party/aom/av1/encoder/aq_variance.c +++ b/third_party/aom/av1/encoder/aq_variance.c @@ -19,6 +19,7 @@ #include "av1/encoder/ratectrl.h" #include "av1/encoder/rd.h" #include "av1/encoder/segmentation.h" +#include "av1/encoder/dwt.h" #include "aom_ports/system_state.h" #define ENERGY_MIN (-4) @@ -34,10 +35,8 @@ static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 }; #define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN] DECLARE_ALIGNED(16, static const uint8_t, av1_all_zeros[MAX_SB_SIZE]) = { 0 }; -#if CONFIG_HIGHBITDEPTH DECLARE_ALIGNED(16, static const uint16_t, av1_highbd_all_zeros[MAX_SB_SIZE]) = { 0 }; -#endif unsigned int av1_vaq_segment_id(int energy) { ENERGY_IN_BOUNDS(energy); @@ -49,6 +48,16 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) { struct segmentation *seg = &cm->seg; int i; + int resolution_change = + cm->prev_frame && (cm->width != cm->prev_frame->width || + cm->height != cm->prev_frame->height); + if (resolution_change) { + memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); + av1_clearall_segfeatures(seg); + aom_clear_system_state(); + av1_disable_segmentation(seg); + return; + } if (frame_is_intra_only(cm) || cm->error_resilient_mode || cpi->refresh_alt_ref_frame || (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { @@ -57,8 +66,6 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) { av1_enable_segmentation(seg); av1_clearall_segfeatures(seg); - seg->abs_delta = SEGMENT_DELTADATA; - aom_clear_system_state(); for (i = 0; i < MAX_SEGMENTS; ++i) { @@ -74,11 +81,6 @@ void av1_vaq_frame_setup(AV1_COMP
*cpi) { qindex_delta = -cm->base_qindex + 1; } - // No need to enable SEG_LVL_ALT_Q for this segment. - if (rate_ratio[i] == 1.0) { - continue; - } - av1_set_segdata(seg, i, SEG_LVL_ALT_Q, qindex_delta); av1_enable_segfeature(seg, i, SEG_LVL_ALT_Q); } @@ -108,7 +110,6 @@ static void aq_variance(const uint8_t *a, int a_stride, const uint8_t *b, } } -#if CONFIG_HIGHBITDEPTH static void aq_highbd_variance64(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int w, int h, uint64_t *sse, uint64_t *sum) { @@ -139,7 +140,6 @@ static void aq_highbd_8_variance(const uint8_t *a8, int a_stride, *sse = (unsigned int)sse_long; *sum = (int)sum_long; } -#endif // CONFIG_HIGHBITDEPTH static unsigned int block_variance(const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bs) { @@ -154,7 +154,6 @@ static unsigned int block_variance(const AV1_COMP *const cpi, MACROBLOCK *x, const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow; const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow; int avg; -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { aq_highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride, CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, bw, bh, @@ -165,14 +164,9 @@ static unsigned int block_variance(const AV1_COMP *const cpi, MACROBLOCK *x, aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, av1_all_zeros, 0, bw, bh, &sse, &avg); } -#else - aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, av1_all_zeros, 0, - bw, bh, &sse, &avg); -#endif // CONFIG_HIGHBITDEPTH var = sse - (unsigned int)(((int64_t)avg * avg) / (bw * bh)); return (unsigned int)((uint64_t)var * 256) / (bw * bh); } else { -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride, @@ -181,10 +175,6 @@ static unsigned int block_variance(const AV1_COMP *const cpi, MACROBLOCK *x, var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride, av1_all_zeros, 0, &sse); } -#else - var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride, - av1_all_zeros, 0, &sse); -#endif // CONFIG_HIGHBITDEPTH return (unsigned int)((uint64_t)var * 256) >> num_pels_log2_lookup[bs]; } } @@ -205,3 +195,53 @@ int av1_block_energy(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { energy = av1_log_block_var(cpi, x, bs) - energy_midpoint; return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX); } + +unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) { + MACROBLOCKD *xd = &x->e_mbd; + int stride = x->plane[0].src.stride; + uint8_t *buf = x->plane[0].src.buf; + const int bw = MI_SIZE * mi_size_wide[bs]; + const int bh = MI_SIZE * mi_size_high[bs]; + int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; + + int var = 0; + for (int r = 0; r < bh; r += 8) + for (int c = 0; c < bw; c += 8) { + var += av1_haar_ac_sad_8x8_uint8_input(buf + c + r * stride, stride, hbd); + } + + return (unsigned int)((uint64_t)var * 256) >> num_pels_log2_lookup[bs]; +} + +double av1_log_block_wavelet_energy(MACROBLOCK *x, BLOCK_SIZE bs) { + unsigned int haar_sad = haar_ac_energy(x, bs); + aom_clear_system_state(); + return log(haar_sad + 1.0); +} + +int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bs) { + double energy, energy_midpoint; + aom_clear_system_state(); + energy_midpoint = (cpi->oxcf.pass == 2) ? 
cpi->twopass.frame_avg_haar_energy + : DEFAULT_E_MIDPOINT; + energy = av1_log_block_wavelet_energy(x, bs) - energy_midpoint; + return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX); +} + +int av1_compute_deltaq_from_energy_level(const AV1_COMP *const cpi, + int block_var_level) { + ENERGY_IN_BOUNDS(block_var_level); + + const int rate_level = SEGMENT_ID(block_var_level); + const AV1_COMMON *const cm = &cpi->common; + int qindex_delta = + av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex, + rate_ratio[rate_level], cm->bit_depth); + + if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) { + qindex_delta = -cm->base_qindex + 1; + } + + return qindex_delta; +} diff --git a/third_party/aom/av1/encoder/aq_variance.h b/third_party/aom/av1/encoder/aq_variance.h index 05725c5de..b1a8bc38a 100644 --- a/third_party/aom/av1/encoder/aq_variance.h +++ b/third_party/aom/av1/encoder/aq_variance.h @@ -23,6 +23,10 @@ void av1_vaq_frame_setup(AV1_COMP *cpi); int av1_block_energy(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs); double av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs); +int av1_compute_deltaq_from_energy_level(const AV1_COMP *const cpi, + int block_var_level); +int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bs); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/encoder/arm/neon/error_neon.c b/third_party/aom/av1/encoder/arm/neon/error_neon.c deleted file mode 100644 index fe5233f89..000000000 --- a/third_party/aom/av1/encoder/arm/neon/error_neon.c +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> -#include <assert.h> - -#include "./av1_rtcd.h" - -int64_t av1_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, - int block_size) { - int64x2_t error = vdupq_n_s64(0); - - assert(block_size >= 8); - assert((block_size % 8) == 0); - - do { - const int16x8_t c = vld1q_s16(coeff); - const int16x8_t d = vld1q_s16(dqcoeff); - const int16x8_t diff = vsubq_s16(c, d); - const int16x4_t diff_lo = vget_low_s16(diff); - const int16x4_t diff_hi = vget_high_s16(diff); - // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before - // accumulating them in 64-bits. - const int32x4_t err0 = vmull_s16(diff_lo, diff_lo); - const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi); - const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1)); - error = vaddq_s64(error, err2); - coeff += 8; - dqcoeff += 8; - block_size -= 8; - } while (block_size != 0); - - return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1); -} diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.c b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c new file mode 100644 index 000000000..b92b3469f --- /dev/null +++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c @@ -0,0 +1,1902 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stdlib.h> +#include "av1/encoder/av1_fwd_txfm1d.h" + +#if CONFIG_COEFFICIENT_RANGE_CHECKING +void range_check_func(int32_t stage, const int32_t *input, const int32_t *buf, + int32_t size, int8_t bit); + +#define range_check(stage, input, buf, size, bit) \ + range_check_func(stage, input, buf, size, bit) +#else // CONFIG_COEFFICIENT_RANGE_CHECKING + +#define range_check(stage, input, buf, size, bit) \ + { \ + (void)stage; \ + (void)input; \ + (void)buf; \ + (void)size; \ + (void)bit; \ + } +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + +void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 4; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[4]; + + // stage 0; + range_check(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0] + input[3]; + bf1[1] = input[1] + input[2]; + bf1[2] = -input[2] + input[1]; + bf1[3] = -input[3] + input[0]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[2]; + bf1[2] = bf0[1]; + bf1[3] = bf0[3]; + range_check(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 8; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[8]; + + // stage 0; + range_check(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0] + input[7]; + bf1[1] = input[1] + input[6]; + bf1[2] = input[2] + input[5]; + bf1[3] = input[3] + input[4]; + bf1[4] = -input[4] + input[3]; + bf1[5] = -input[5] + input[2]; + bf1[6] = -input[6] + input[1]; + bf1[7] = -input[7] + input[0]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[3]; + bf1[1] = bf0[1] + bf0[2]; + bf1[2] = -bf0[2] + bf0[1]; + bf1[3] = -bf0[3] + bf0[0]; + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); + bf1[7] = bf0[7]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], 
cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + bf1[4] = bf0[4] + bf0[5]; + bf1[5] = -bf0[5] + bf0[4]; + bf1[6] = -bf0[6] + bf0[7]; + bf1[7] = bf0[7] + bf0[6]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); + bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[4]; + bf1[2] = bf0[2]; + bf1[3] = bf0[6]; + bf1[4] = bf0[1]; + bf1[5] = bf0[5]; + bf1[6] = bf0[3]; + bf1[7] = bf0[7]; + range_check(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 16; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[16]; + + // stage 0; + range_check(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0] + input[15]; + bf1[1] = input[1] + input[14]; + bf1[2] = input[2] + input[13]; + bf1[3] = input[3] + input[12]; + bf1[4] = input[4] + input[11]; + bf1[5] = input[5] + input[10]; + bf1[6] = input[6] + input[9]; + bf1[7] = input[7] + input[8]; + bf1[8] = -input[8] + input[7]; + bf1[9] = -input[9] + input[6]; + bf1[10] = -input[10] + input[5]; + bf1[11] = -input[11] + input[4]; + bf1[12] = -input[12] + input[3]; + bf1[13] = -input[13] + input[2]; + bf1[14] = -input[14] + input[1]; + bf1[15] = -input[15] + input[0]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[7]; + bf1[1] = bf0[1] + bf0[6]; + bf1[2] = bf0[2] + bf0[5]; + bf1[3] = bf0[3] + bf0[4]; + bf1[4] = -bf0[4] + bf0[3]; + bf1[5] = -bf0[5] + bf0[2]; + bf1[6] = -bf0[6] + bf0[1]; + bf1[7] = -bf0[7] + bf0[0]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[3]; + bf1[1] = bf0[1] + bf0[2]; + bf1[2] = -bf0[2] + bf0[1]; + bf1[3] = -bf0[3] + bf0[0]; + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = bf0[8] + bf0[11]; + bf1[9] = bf0[9] + bf0[10]; + bf1[10] = -bf0[10] + bf0[9]; + bf1[11] = -bf0[11] + bf0[8]; + bf1[12] = -bf0[12] + bf0[15]; + bf1[13] = -bf0[13] + bf0[14]; + bf1[14] = bf0[14] + bf0[13]; + bf1[15] = bf0[15] + bf0[12]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], 
cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + bf1[4] = bf0[4] + bf0[5]; + bf1[5] = -bf0[5] + bf0[4]; + bf1[6] = -bf0[6] + bf0[7]; + bf1[7] = bf0[7] + bf0[6]; + bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); + bf1[15] = bf0[15]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); + bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); + bf1[8] = bf0[8] + bf0[9]; + bf1[9] = -bf0[9] + bf0[8]; + bf1[10] = -bf0[10] + bf0[11]; + bf1[11] = bf0[11] + bf0[10]; + bf1[12] = bf0[12] + bf0[13]; + bf1[13] = -bf0[13] + bf0[12]; + bf1[14] = -bf0[14] + bf0[15]; + bf1[15] = bf0[15] + bf0[14]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); + bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit); + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[8]; + bf1[2] = bf0[4]; + bf1[3] = bf0[12]; + bf1[4] = bf0[2]; + bf1[5] = bf0[10]; + bf1[6] = bf0[6]; + bf1[7] = bf0[14]; + bf1[8] = bf0[1]; + bf1[9] = bf0[9]; + bf1[10] = bf0[5]; + bf1[11] = bf0[13]; + bf1[12] = bf0[3]; + bf1[13] = bf0[11]; + bf1[14] = bf0[7]; + bf1[15] = bf0[15]; + range_check(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 32; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[32]; + + // stage 0; + range_check(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0] + input[31]; + bf1[1] = input[1] + input[30]; + bf1[2] = input[2] + input[29]; + bf1[3] = input[3] + input[28]; + bf1[4] = input[4] + input[27]; + bf1[5] = input[5] + input[26]; + bf1[6] = input[6] + input[25]; + bf1[7] = input[7] + input[24]; + bf1[8] = input[8] + input[23]; + bf1[9] = input[9] + input[22]; + bf1[10] = input[10] + input[21]; + bf1[11] = input[11] + input[20]; + bf1[12] = 
input[12] + input[19]; + bf1[13] = input[13] + input[18]; + bf1[14] = input[14] + input[17]; + bf1[15] = input[15] + input[16]; + bf1[16] = -input[16] + input[15]; + bf1[17] = -input[17] + input[14]; + bf1[18] = -input[18] + input[13]; + bf1[19] = -input[19] + input[12]; + bf1[20] = -input[20] + input[11]; + bf1[21] = -input[21] + input[10]; + bf1[22] = -input[22] + input[9]; + bf1[23] = -input[23] + input[8]; + bf1[24] = -input[24] + input[7]; + bf1[25] = -input[25] + input[6]; + bf1[26] = -input[26] + input[5]; + bf1[27] = -input[27] + input[4]; + bf1[28] = -input[28] + input[3]; + bf1[29] = -input[29] + input[2]; + bf1[30] = -input[30] + input[1]; + bf1[31] = -input[31] + input[0]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[15]; + bf1[1] = bf0[1] + bf0[14]; + bf1[2] = bf0[2] + bf0[13]; + bf1[3] = bf0[3] + bf0[12]; + bf1[4] = bf0[4] + bf0[11]; + bf1[5] = bf0[5] + bf0[10]; + bf1[6] = bf0[6] + bf0[9]; + bf1[7] = bf0[7] + bf0[8]; + bf1[8] = -bf0[8] + bf0[7]; + bf1[9] = -bf0[9] + bf0[6]; + bf1[10] = -bf0[10] + bf0[5]; + bf1[11] = -bf0[11] + bf0[4]; + bf1[12] = -bf0[12] + bf0[3]; + bf1[13] = -bf0[13] + bf0[2]; + bf1[14] = -bf0[14] + bf0[1]; + bf1[15] = -bf0[15] + bf0[0]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit); + bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit); + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[7]; + bf1[1] = bf0[1] + bf0[6]; + bf1[2] = bf0[2] + bf0[5]; + bf1[3] = bf0[3] + bf0[4]; + bf1[4] = -bf0[4] + bf0[3]; + bf1[5] = -bf0[5] + bf0[2]; + bf1[6] = -bf0[6] + bf0[1]; + bf1[7] = -bf0[7] + bf0[0]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = bf0[16] + bf0[23]; + bf1[17] = bf0[17] + bf0[22]; + bf1[18] = bf0[18] + bf0[21]; + bf1[19] = bf0[19] + bf0[20]; + bf1[20] = -bf0[20] + bf0[19]; + bf1[21] = -bf0[21] + bf0[18]; + bf1[22] = -bf0[22] + bf0[17]; + bf1[23] = -bf0[23] + bf0[16]; + bf1[24] = -bf0[24] + bf0[31]; + bf1[25] = -bf0[25] + bf0[30]; + bf1[26] = -bf0[26] + bf0[29]; + bf1[27] = -bf0[27] + bf0[28]; + bf1[28] = bf0[28] + bf0[27]; + bf1[29] = bf0[29] + bf0[26]; + bf1[30] = bf0[30] + bf0[25]; + bf1[31] = bf0[31] + bf0[24]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[3]; + bf1[1] = bf0[1] + bf0[2]; + bf1[2] = -bf0[2] + bf0[1]; + bf1[3] = -bf0[3] + 
bf0[0]; + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = bf0[8] + bf0[11]; + bf1[9] = bf0[9] + bf0[10]; + bf1[10] = -bf0[10] + bf0[9]; + bf1[11] = -bf0[11] + bf0[8]; + bf1[12] = -bf0[12] + bf0[15]; + bf1[13] = -bf0[13] + bf0[14]; + bf1[14] = bf0[14] + bf0[13]; + bf1[15] = bf0[15] + bf0[12]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); + bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); + bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit); + bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit); + bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit); + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + bf1[4] = bf0[4] + bf0[5]; + bf1[5] = -bf0[5] + bf0[4]; + bf1[6] = -bf0[6] + bf0[7]; + bf1[7] = bf0[7] + bf0[6]; + bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); + bf1[15] = bf0[15]; + bf1[16] = bf0[16] + bf0[19]; + bf1[17] = bf0[17] + bf0[18]; + bf1[18] = -bf0[18] + bf0[17]; + bf1[19] = -bf0[19] + bf0[16]; + bf1[20] = -bf0[20] + bf0[23]; + bf1[21] = -bf0[21] + bf0[22]; + bf1[22] = bf0[22] + bf0[21]; + bf1[23] = bf0[23] + bf0[20]; + bf1[24] = bf0[24] + bf0[27]; + bf1[25] = bf0[25] + bf0[26]; + bf1[26] = -bf0[26] + bf0[25]; + bf1[27] = -bf0[27] + bf0[24]; + bf1[28] = -bf0[28] + bf0[31]; + bf1[29] = -bf0[29] + bf0[30]; + bf1[30] = bf0[30] + bf0[29]; + bf1[31] = bf0[31] + bf0[28]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); + bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); + bf1[8] = bf0[8] + bf0[9]; + bf1[9] = -bf0[9] + bf0[8]; + bf1[10] = -bf0[10] + bf0[11]; + bf1[11] = bf0[11] + bf0[10]; + bf1[12] = bf0[12] + bf0[13]; + bf1[13] = -bf0[13] + bf0[12]; + bf1[14] = -bf0[14] + bf0[15]; + bf1[15] = bf0[15] + bf0[14]; + bf1[16] = bf0[16]; + bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); + bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = 
half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit); + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit); + bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit); + bf1[31] = bf0[31]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); + bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit); + bf1[16] = bf0[16] + bf0[17]; + bf1[17] = -bf0[17] + bf0[16]; + bf1[18] = -bf0[18] + bf0[19]; + bf1[19] = bf0[19] + bf0[18]; + bf1[20] = bf0[20] + bf0[21]; + bf1[21] = -bf0[21] + bf0[20]; + bf1[22] = -bf0[22] + bf0[23]; + bf1[23] = bf0[23] + bf0[22]; + bf1[24] = bf0[24] + bf0[25]; + bf1[25] = -bf0[25] + bf0[24]; + bf1[26] = -bf0[26] + bf0[27]; + bf1[27] = bf0[27] + bf0[26]; + bf1[28] = bf0[28] + bf0[29]; + bf1[29] = -bf0[29] + bf0[28]; + bf1[30] = -bf0[30] + bf0[31]; + bf1[31] = bf0[31] + bf0[30]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit); + bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit); + bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit); + bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit); + bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit); + bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit); + bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit); + bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit); + bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit); + bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit); + bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit); + bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit); + bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit); + range_check(stage, input, bf1, size, stage_range[stage]); + + 
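+  // The final stage below is a pure permutation: output[i] takes the
+  // butterfly result at the bit-reversed 5-bit index of i (0, 16, 8, 24,
+  // ...), which returns the coefficients to natural frequency order.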
// stage 9 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[16]; + bf1[2] = bf0[8]; + bf1[3] = bf0[24]; + bf1[4] = bf0[4]; + bf1[5] = bf0[20]; + bf1[6] = bf0[12]; + bf1[7] = bf0[28]; + bf1[8] = bf0[2]; + bf1[9] = bf0[18]; + bf1[10] = bf0[10]; + bf1[11] = bf0[26]; + bf1[12] = bf0[6]; + bf1[13] = bf0[22]; + bf1[14] = bf0[14]; + bf1[15] = bf0[30]; + bf1[16] = bf0[1]; + bf1[17] = bf0[17]; + bf1[18] = bf0[9]; + bf1[19] = bf0[25]; + bf1[20] = bf0[5]; + bf1[21] = bf0[21]; + bf1[22] = bf0[13]; + bf1[23] = bf0[29]; + bf1[24] = bf0[3]; + bf1[25] = bf0[19]; + bf1[26] = bf0[11]; + bf1[27] = bf0[27]; + bf1[28] = bf0[7]; + bf1[29] = bf0[23]; + bf1[30] = bf0[15]; + bf1[31] = bf0[31]; + range_check(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + int bit = cos_bit; + const int32_t *sinpi = sinpi_arr(bit); + int32_t x0, x1, x2, x3; + int32_t s0, s1, s2, s3, s4, s5, s6, s7; + + // stage 0 + range_check(0, input, input, 4, stage_range[0]); + x0 = input[0]; + x1 = input[1]; + x2 = input[2]; + x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + // stage 1 + s0 = range_check_value(sinpi[1] * x0, bit + stage_range[1]); + s1 = range_check_value(sinpi[4] * x0, bit + stage_range[1]); + s2 = range_check_value(sinpi[2] * x1, bit + stage_range[1]); + s3 = range_check_value(sinpi[1] * x1, bit + stage_range[1]); + s4 = range_check_value(sinpi[3] * x2, bit + stage_range[1]); + s5 = range_check_value(sinpi[4] * x3, bit + stage_range[1]); + s6 = range_check_value(sinpi[2] * x3, bit + stage_range[1]); + s7 = range_check_value(x0 + x1, stage_range[1]); + + // stage 2 + s7 = range_check_value(s7 - x3, stage_range[2]); + + // stage 3 + x0 = range_check_value(s0 + s2, bit + stage_range[3]); + x1 = range_check_value(sinpi[3] * s7, bit + stage_range[3]); + x2 = range_check_value(s1 - s3, bit + stage_range[3]); + x3 = range_check_value(s4, bit + stage_range[3]); + + // stage 4 + x0 = range_check_value(x0 + s5, bit + stage_range[4]); + x2 = range_check_value(x2 + s6, bit + stage_range[4]); + + // stage 5 + s0 = range_check_value(x0 + x3, bit + stage_range[5]); + s1 = range_check_value(x1, bit + stage_range[5]); + s2 = range_check_value(x2 - x3, bit + stage_range[5]); + s3 = range_check_value(x2 - x0, bit + stage_range[5]); + + // stage 6 + s3 = range_check_value(s3 + x3, bit + stage_range[6]); + + // 1-D transform scaling factor is sqrt(2). 
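+  // The sinpi constants are fixed-point values scaled by 2^bit, so the
+  // final round_shift by `bit` divides that scale factor back out.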
+ output[0] = round_shift(s0, bit); + output[1] = round_shift(s1, bit); + output[2] = round_shift(s2, bit); + output[3] = round_shift(s3, bit); + range_check(6, input, output, 4, stage_range[6]); +} + +void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 8; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[8]; + + // stage 0; + range_check(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + assert(output != input); + bf1 = output; + bf1[0] = input[0]; + bf1[1] = -input[7]; + bf1[2] = -input[3]; + bf1[3] = input[4]; + bf1[4] = -input[1]; + bf1[5] = input[6]; + bf1[6] = input[2]; + bf1[7] = -input[5]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[2]; + bf1[1] = bf0[1] + bf0[3]; + bf1[2] = bf0[0] - bf0[2]; + bf1[3] = bf0[1] - bf0[3]; + bf1[4] = bf0[4] + bf0[6]; + bf1[5] = bf0[5] + bf0[7]; + bf1[6] = bf0[4] - bf0[6]; + bf1[7] = bf0[5] - bf0[7]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); + bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[4]; + bf1[1] = bf0[1] + bf0[5]; + bf1[2] = bf0[2] + bf0[6]; + bf1[3] = bf0[3] + bf0[7]; + bf1[4] = bf0[0] - bf0[4]; + bf1[5] = bf0[1] - bf0[5]; + bf1[6] = bf0[2] - bf0[6]; + bf1[7] = bf0[3] - bf0[7]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit); + bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit); + bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit); + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[1]; + bf1[1] = bf0[6]; + bf1[2] = bf0[3]; + bf1[3] = bf0[4]; + bf1[4] = bf0[5]; + bf1[5] = bf0[2]; + bf1[6] = bf0[7]; + bf1[7] = bf0[0]; + range_check(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const 
int32_t size = 16; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[16]; + + // stage 0; + range_check(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + assert(output != input); + bf1 = output; + bf1[0] = input[0]; + bf1[1] = -input[15]; + bf1[2] = -input[7]; + bf1[3] = input[8]; + bf1[4] = -input[3]; + bf1[5] = input[12]; + bf1[6] = input[4]; + bf1[7] = -input[11]; + bf1[8] = -input[1]; + bf1[9] = input[14]; + bf1[10] = input[6]; + bf1[11] = -input[9]; + bf1[12] = input[2]; + bf1[13] = -input[13]; + bf1[14] = -input[5]; + bf1[15] = input[10]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit); + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit); + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[2]; + bf1[1] = bf0[1] + bf0[3]; + bf1[2] = bf0[0] - bf0[2]; + bf1[3] = bf0[1] - bf0[3]; + bf1[4] = bf0[4] + bf0[6]; + bf1[5] = bf0[5] + bf0[7]; + bf1[6] = bf0[4] - bf0[6]; + bf1[7] = bf0[5] - bf0[7]; + bf1[8] = bf0[8] + bf0[10]; + bf1[9] = bf0[9] + bf0[11]; + bf1[10] = bf0[8] - bf0[10]; + bf1[11] = bf0[9] - bf0[11]; + bf1[12] = bf0[12] + bf0[14]; + bf1[13] = bf0[13] + bf0[15]; + bf1[14] = bf0[12] - bf0[14]; + bf1[15] = bf0[13] - bf0[15]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); + bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit); + bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit); + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[4]; + bf1[1] = bf0[1] + bf0[5]; + bf1[2] = bf0[2] + bf0[6]; + bf1[3] = bf0[3] + bf0[7]; + bf1[4] = bf0[0] - bf0[4]; + bf1[5] = bf0[1] - bf0[5]; + bf1[6] = bf0[2] - bf0[6]; + bf1[7] = bf0[3] - bf0[7]; + bf1[8] = bf0[8] + bf0[12]; + bf1[9] = bf0[9] + bf0[13]; + bf1[10] = bf0[10] + bf0[14]; + bf1[11] = bf0[11] + bf0[15]; + bf1[12] = bf0[8] - bf0[12]; + bf1[13] = bf0[9] - bf0[13]; + bf1[14] = bf0[10] - bf0[14]; + bf1[15] = bf0[11] - bf0[15]; + range_check(stage, input, bf1, size, 
stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit); + bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit); + bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit); + bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit); + bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit); + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[8]; + bf1[1] = bf0[1] + bf0[9]; + bf1[2] = bf0[2] + bf0[10]; + bf1[3] = bf0[3] + bf0[11]; + bf1[4] = bf0[4] + bf0[12]; + bf1[5] = bf0[5] + bf0[13]; + bf1[6] = bf0[6] + bf0[14]; + bf1[7] = bf0[7] + bf0[15]; + bf1[8] = bf0[0] - bf0[8]; + bf1[9] = bf0[1] - bf0[9]; + bf1[10] = bf0[2] - bf0[10]; + bf1[11] = bf0[3] - bf0[11]; + bf1[12] = bf0[4] - bf0[12]; + bf1[13] = bf0[5] - bf0[13]; + bf1[14] = bf0[6] - bf0[14]; + bf1[15] = bf0[7] - bf0[15]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit); + bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit); + bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit); + bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit); + bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit); + bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit); + bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit); + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[1]; + bf1[1] = bf0[14]; + bf1[2] = bf0[3]; + bf1[3] = bf0[12]; + bf1[4] = bf0[5]; + bf1[5] = bf0[10]; + bf1[6] = bf0[7]; + bf1[7] = bf0[8]; + bf1[8] = bf0[9]; + bf1[9] = bf0[6]; + bf1[10] = bf0[11]; + bf1[11] = bf0[4]; + bf1[12] = bf0[13]; + bf1[13] = bf0[2]; + bf1[14] = bf0[15]; + bf1[15] = bf0[0]; + range_check(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + for (int i = 0; i < 4; ++i) + output[i] = round_shift((int64_t)input[i] * NewSqrt2, NewSqrt2Bits); + assert(stage_range[0] + NewSqrt2Bits <= 32); + range_check(0, input, output, 4, stage_range[0]); +} + +void av1_fidentity8_c(const int32_t *input, int32_t *output, 
int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + for (int i = 0; i < 8; ++i) output[i] = input[i] * 2; + range_check(0, input, output, 8, stage_range[0]); +} + +void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + for (int i = 0; i < 16; ++i) + output[i] = round_shift((int64_t)input[i] * 2 * NewSqrt2, NewSqrt2Bits); + assert(stage_range[0] + NewSqrt2Bits <= 32); + range_check(0, input, output, 16, stage_range[0]); +} + +void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + for (int i = 0; i < 32; ++i) output[i] = input[i] * 4; + range_check(0, input, output, 32, stage_range[0]); +} + +void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 64; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[64]; + + // stage 0; + range_check(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0] + input[63]; + bf1[1] = input[1] + input[62]; + bf1[2] = input[2] + input[61]; + bf1[3] = input[3] + input[60]; + bf1[4] = input[4] + input[59]; + bf1[5] = input[5] + input[58]; + bf1[6] = input[6] + input[57]; + bf1[7] = input[7] + input[56]; + bf1[8] = input[8] + input[55]; + bf1[9] = input[9] + input[54]; + bf1[10] = input[10] + input[53]; + bf1[11] = input[11] + input[52]; + bf1[12] = input[12] + input[51]; + bf1[13] = input[13] + input[50]; + bf1[14] = input[14] + input[49]; + bf1[15] = input[15] + input[48]; + bf1[16] = input[16] + input[47]; + bf1[17] = input[17] + input[46]; + bf1[18] = input[18] + input[45]; + bf1[19] = input[19] + input[44]; + bf1[20] = input[20] + input[43]; + bf1[21] = input[21] + input[42]; + bf1[22] = input[22] + input[41]; + bf1[23] = input[23] + input[40]; + bf1[24] = input[24] + input[39]; + bf1[25] = input[25] + input[38]; + bf1[26] = input[26] + input[37]; + bf1[27] = input[27] + input[36]; + bf1[28] = input[28] + input[35]; + bf1[29] = input[29] + input[34]; + bf1[30] = input[30] + input[33]; + bf1[31] = input[31] + input[32]; + bf1[32] = -input[32] + input[31]; + bf1[33] = -input[33] + input[30]; + bf1[34] = -input[34] + input[29]; + bf1[35] = -input[35] + input[28]; + bf1[36] = -input[36] + input[27]; + bf1[37] = -input[37] + input[26]; + bf1[38] = -input[38] + input[25]; + bf1[39] = -input[39] + input[24]; + bf1[40] = -input[40] + input[23]; + bf1[41] = -input[41] + input[22]; + bf1[42] = -input[42] + input[21]; + bf1[43] = -input[43] + input[20]; + bf1[44] = -input[44] + input[19]; + bf1[45] = -input[45] + input[18]; + bf1[46] = -input[46] + input[17]; + bf1[47] = -input[47] + input[16]; + bf1[48] = -input[48] + input[15]; + bf1[49] = -input[49] + input[14]; + bf1[50] = -input[50] + input[13]; + bf1[51] = -input[51] + input[12]; + bf1[52] = -input[52] + input[11]; + bf1[53] = -input[53] + input[10]; + bf1[54] = -input[54] + input[9]; + bf1[55] = -input[55] + input[8]; + bf1[56] = -input[56] + input[7]; + bf1[57] = -input[57] + input[6]; + bf1[58] = -input[58] + input[5]; + bf1[59] = -input[59] + input[4]; + bf1[60] = -input[60] + input[3]; + bf1[61] = -input[61] + input[2]; + bf1[62] = -input[62] + input[1]; + bf1[63] = -input[63] + input[0]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[31]; + bf1[1] = bf0[1] + bf0[30]; + 
bf1[2] = bf0[2] + bf0[29]; + bf1[3] = bf0[3] + bf0[28]; + bf1[4] = bf0[4] + bf0[27]; + bf1[5] = bf0[5] + bf0[26]; + bf1[6] = bf0[6] + bf0[25]; + bf1[7] = bf0[7] + bf0[24]; + bf1[8] = bf0[8] + bf0[23]; + bf1[9] = bf0[9] + bf0[22]; + bf1[10] = bf0[10] + bf0[21]; + bf1[11] = bf0[11] + bf0[20]; + bf1[12] = bf0[12] + bf0[19]; + bf1[13] = bf0[13] + bf0[18]; + bf1[14] = bf0[14] + bf0[17]; + bf1[15] = bf0[15] + bf0[16]; + bf1[16] = -bf0[16] + bf0[15]; + bf1[17] = -bf0[17] + bf0[14]; + bf1[18] = -bf0[18] + bf0[13]; + bf1[19] = -bf0[19] + bf0[12]; + bf1[20] = -bf0[20] + bf0[11]; + bf1[21] = -bf0[21] + bf0[10]; + bf1[22] = -bf0[22] + bf0[9]; + bf1[23] = -bf0[23] + bf0[8]; + bf1[24] = -bf0[24] + bf0[7]; + bf1[25] = -bf0[25] + bf0[6]; + bf1[26] = -bf0[26] + bf0[5]; + bf1[27] = -bf0[27] + bf0[4]; + bf1[28] = -bf0[28] + bf0[3]; + bf1[29] = -bf0[29] + bf0[2]; + bf1[30] = -bf0[30] + bf0[1]; + bf1[31] = -bf0[31] + bf0[0]; + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = bf0[34]; + bf1[35] = bf0[35]; + bf1[36] = bf0[36]; + bf1[37] = bf0[37]; + bf1[38] = bf0[38]; + bf1[39] = bf0[39]; + bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit); + bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit); + bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit); + bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit); + bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit); + bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit); + bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit); + bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit); + bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit); + bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit); + bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit); + bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit); + bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit); + bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit); + bf1[56] = bf0[56]; + bf1[57] = bf0[57]; + bf1[58] = bf0[58]; + bf1[59] = bf0[59]; + bf1[60] = bf0[60]; + bf1[61] = bf0[61]; + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[15]; + bf1[1] = bf0[1] + bf0[14]; + bf1[2] = bf0[2] + bf0[13]; + bf1[3] = bf0[3] + bf0[12]; + bf1[4] = bf0[4] + bf0[11]; + bf1[5] = bf0[5] + bf0[10]; + bf1[6] = bf0[6] + bf0[9]; + bf1[7] = bf0[7] + bf0[8]; + bf1[8] = -bf0[8] + bf0[7]; + bf1[9] = -bf0[9] + bf0[6]; + bf1[10] = -bf0[10] + bf0[5]; + bf1[11] = -bf0[11] + bf0[4]; + bf1[12] = -bf0[12] + bf0[3]; + bf1[13] = -bf0[13] + bf0[2]; + bf1[14] = -bf0[14] + bf0[1]; + bf1[15] = -bf0[15] + bf0[0]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit); + bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit); + bf1[26] = 
half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit); + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = bf0[32] + bf0[47]; + bf1[33] = bf0[33] + bf0[46]; + bf1[34] = bf0[34] + bf0[45]; + bf1[35] = bf0[35] + bf0[44]; + bf1[36] = bf0[36] + bf0[43]; + bf1[37] = bf0[37] + bf0[42]; + bf1[38] = bf0[38] + bf0[41]; + bf1[39] = bf0[39] + bf0[40]; + bf1[40] = -bf0[40] + bf0[39]; + bf1[41] = -bf0[41] + bf0[38]; + bf1[42] = -bf0[42] + bf0[37]; + bf1[43] = -bf0[43] + bf0[36]; + bf1[44] = -bf0[44] + bf0[35]; + bf1[45] = -bf0[45] + bf0[34]; + bf1[46] = -bf0[46] + bf0[33]; + bf1[47] = -bf0[47] + bf0[32]; + bf1[48] = -bf0[48] + bf0[63]; + bf1[49] = -bf0[49] + bf0[62]; + bf1[50] = -bf0[50] + bf0[61]; + bf1[51] = -bf0[51] + bf0[60]; + bf1[52] = -bf0[52] + bf0[59]; + bf1[53] = -bf0[53] + bf0[58]; + bf1[54] = -bf0[54] + bf0[57]; + bf1[55] = -bf0[55] + bf0[56]; + bf1[56] = bf0[56] + bf0[55]; + bf1[57] = bf0[57] + bf0[54]; + bf1[58] = bf0[58] + bf0[53]; + bf1[59] = bf0[59] + bf0[52]; + bf1[60] = bf0[60] + bf0[51]; + bf1[61] = bf0[61] + bf0[50]; + bf1[62] = bf0[62] + bf0[49]; + bf1[63] = bf0[63] + bf0[48]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[7]; + bf1[1] = bf0[1] + bf0[6]; + bf1[2] = bf0[2] + bf0[5]; + bf1[3] = bf0[3] + bf0[4]; + bf1[4] = -bf0[4] + bf0[3]; + bf1[5] = -bf0[5] + bf0[2]; + bf1[6] = -bf0[6] + bf0[1]; + bf1[7] = -bf0[7] + bf0[0]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = bf0[16] + bf0[23]; + bf1[17] = bf0[17] + bf0[22]; + bf1[18] = bf0[18] + bf0[21]; + bf1[19] = bf0[19] + bf0[20]; + bf1[20] = -bf0[20] + bf0[19]; + bf1[21] = -bf0[21] + bf0[18]; + bf1[22] = -bf0[22] + bf0[17]; + bf1[23] = -bf0[23] + bf0[16]; + bf1[24] = -bf0[24] + bf0[31]; + bf1[25] = -bf0[25] + bf0[30]; + bf1[26] = -bf0[26] + bf0[29]; + bf1[27] = -bf0[27] + bf0[28]; + bf1[28] = bf0[28] + bf0[27]; + bf1[29] = bf0[29] + bf0[26]; + bf1[30] = bf0[30] + bf0[25]; + bf1[31] = bf0[31] + bf0[24]; + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = bf0[34]; + bf1[35] = bf0[35]; + bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit); + bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit); + bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit); + bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit); + bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit); + bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit); + bf1[44] = bf0[44]; + bf1[45] = bf0[45]; + bf1[46] = bf0[46]; + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = bf0[49]; + bf1[50] = bf0[50]; + bf1[51] = bf0[51]; + bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit); + bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit); + bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit); + bf1[55] = half_btf(cospi[48], bf0[55], 
-cospi[16], bf0[40], cos_bit); + bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit); + bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit); + bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit); + bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit); + bf1[60] = bf0[60]; + bf1[61] = bf0[61]; + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[3]; + bf1[1] = bf0[1] + bf0[2]; + bf1[2] = -bf0[2] + bf0[1]; + bf1[3] = -bf0[3] + bf0[0]; + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = bf0[8] + bf0[11]; + bf1[9] = bf0[9] + bf0[10]; + bf1[10] = -bf0[10] + bf0[9]; + bf1[11] = -bf0[11] + bf0[8]; + bf1[12] = -bf0[12] + bf0[15]; + bf1[13] = -bf0[13] + bf0[14]; + bf1[14] = bf0[14] + bf0[13]; + bf1[15] = bf0[15] + bf0[12]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); + bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); + bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit); + bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit); + bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit); + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = bf0[32] + bf0[39]; + bf1[33] = bf0[33] + bf0[38]; + bf1[34] = bf0[34] + bf0[37]; + bf1[35] = bf0[35] + bf0[36]; + bf1[36] = -bf0[36] + bf0[35]; + bf1[37] = -bf0[37] + bf0[34]; + bf1[38] = -bf0[38] + bf0[33]; + bf1[39] = -bf0[39] + bf0[32]; + bf1[40] = -bf0[40] + bf0[47]; + bf1[41] = -bf0[41] + bf0[46]; + bf1[42] = -bf0[42] + bf0[45]; + bf1[43] = -bf0[43] + bf0[44]; + bf1[44] = bf0[44] + bf0[43]; + bf1[45] = bf0[45] + bf0[42]; + bf1[46] = bf0[46] + bf0[41]; + bf1[47] = bf0[47] + bf0[40]; + bf1[48] = bf0[48] + bf0[55]; + bf1[49] = bf0[49] + bf0[54]; + bf1[50] = bf0[50] + bf0[53]; + bf1[51] = bf0[51] + bf0[52]; + bf1[52] = -bf0[52] + bf0[51]; + bf1[53] = -bf0[53] + bf0[50]; + bf1[54] = -bf0[54] + bf0[49]; + bf1[55] = -bf0[55] + bf0[48]; + bf1[56] = -bf0[56] + bf0[63]; + bf1[57] = -bf0[57] + bf0[62]; + bf1[58] = -bf0[58] + bf0[61]; + bf1[59] = -bf0[59] + bf0[60]; + bf1[60] = bf0[60] + bf0[59]; + bf1[61] = bf0[61] + bf0[58]; + bf1[62] = bf0[62] + bf0[57]; + bf1[63] = bf0[63] + bf0[56]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + bf1[4] = bf0[4] + bf0[5]; + bf1[5] = -bf0[5] + bf0[4]; + bf1[6] = -bf0[6] + bf0[7]; + bf1[7] = bf0[7] + bf0[6]; + bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + 
bf1[12] = bf0[12]; + bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); + bf1[15] = bf0[15]; + bf1[16] = bf0[16] + bf0[19]; + bf1[17] = bf0[17] + bf0[18]; + bf1[18] = -bf0[18] + bf0[17]; + bf1[19] = -bf0[19] + bf0[16]; + bf1[20] = -bf0[20] + bf0[23]; + bf1[21] = -bf0[21] + bf0[22]; + bf1[22] = bf0[22] + bf0[21]; + bf1[23] = bf0[23] + bf0[20]; + bf1[24] = bf0[24] + bf0[27]; + bf1[25] = bf0[25] + bf0[26]; + bf1[26] = -bf0[26] + bf0[25]; + bf1[27] = -bf0[27] + bf0[24]; + bf1[28] = -bf0[28] + bf0[31]; + bf1[29] = -bf0[29] + bf0[30]; + bf1[30] = bf0[30] + bf0[29]; + bf1[31] = bf0[31] + bf0[28]; + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit); + bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit); + bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit); + bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit); + bf1[38] = bf0[38]; + bf1[39] = bf0[39]; + bf1[40] = bf0[40]; + bf1[41] = bf0[41]; + bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit); + bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit); + bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit); + bf1[46] = bf0[46]; + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = bf0[49]; + bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit); + bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit); + bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit); + bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit); + bf1[54] = bf0[54]; + bf1[55] = bf0[55]; + bf1[56] = bf0[56]; + bf1[57] = bf0[57]; + bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit); + bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit); + bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit); + bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit); + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); + bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); + bf1[8] = bf0[8] + bf0[9]; + bf1[9] = -bf0[9] + bf0[8]; + bf1[10] = -bf0[10] + bf0[11]; + bf1[11] = bf0[11] + bf0[10]; + bf1[12] = bf0[12] + bf0[13]; + bf1[13] = -bf0[13] + bf0[12]; + bf1[14] = -bf0[14] + bf0[15]; + bf1[15] = bf0[15] + bf0[14]; + bf1[16] = bf0[16]; + bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); + bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit); + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit); 
+ bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit); + bf1[31] = bf0[31]; + bf1[32] = bf0[32] + bf0[35]; + bf1[33] = bf0[33] + bf0[34]; + bf1[34] = -bf0[34] + bf0[33]; + bf1[35] = -bf0[35] + bf0[32]; + bf1[36] = -bf0[36] + bf0[39]; + bf1[37] = -bf0[37] + bf0[38]; + bf1[38] = bf0[38] + bf0[37]; + bf1[39] = bf0[39] + bf0[36]; + bf1[40] = bf0[40] + bf0[43]; + bf1[41] = bf0[41] + bf0[42]; + bf1[42] = -bf0[42] + bf0[41]; + bf1[43] = -bf0[43] + bf0[40]; + bf1[44] = -bf0[44] + bf0[47]; + bf1[45] = -bf0[45] + bf0[46]; + bf1[46] = bf0[46] + bf0[45]; + bf1[47] = bf0[47] + bf0[44]; + bf1[48] = bf0[48] + bf0[51]; + bf1[49] = bf0[49] + bf0[50]; + bf1[50] = -bf0[50] + bf0[49]; + bf1[51] = -bf0[51] + bf0[48]; + bf1[52] = -bf0[52] + bf0[55]; + bf1[53] = -bf0[53] + bf0[54]; + bf1[54] = bf0[54] + bf0[53]; + bf1[55] = bf0[55] + bf0[52]; + bf1[56] = bf0[56] + bf0[59]; + bf1[57] = bf0[57] + bf0[58]; + bf1[58] = -bf0[58] + bf0[57]; + bf1[59] = -bf0[59] + bf0[56]; + bf1[60] = -bf0[60] + bf0[63]; + bf1[61] = -bf0[61] + bf0[62]; + bf1[62] = bf0[62] + bf0[61]; + bf1[63] = bf0[63] + bf0[60]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); + bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit); + bf1[16] = bf0[16] + bf0[17]; + bf1[17] = -bf0[17] + bf0[16]; + bf1[18] = -bf0[18] + bf0[19]; + bf1[19] = bf0[19] + bf0[18]; + bf1[20] = bf0[20] + bf0[21]; + bf1[21] = -bf0[21] + bf0[20]; + bf1[22] = -bf0[22] + bf0[23]; + bf1[23] = bf0[23] + bf0[22]; + bf1[24] = bf0[24] + bf0[25]; + bf1[25] = -bf0[25] + bf0[24]; + bf1[26] = -bf0[26] + bf0[27]; + bf1[27] = bf0[27] + bf0[26]; + bf1[28] = bf0[28] + bf0[29]; + bf1[29] = -bf0[29] + bf0[28]; + bf1[30] = -bf0[30] + bf0[31]; + bf1[31] = bf0[31] + bf0[30]; + bf1[32] = bf0[32]; + bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit); + bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit); + bf1[35] = bf0[35]; + bf1[36] = bf0[36]; + bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit); + bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit); + bf1[39] = bf0[39]; + bf1[40] = bf0[40]; + bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit); + bf1[43] = bf0[43]; + bf1[44] = bf0[44]; + bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit); + bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit); + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit); + bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit); + bf1[51] = bf0[51]; + bf1[52] = bf0[52]; + bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit); + bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], 
cos_bit); + bf1[55] = bf0[55]; + bf1[56] = bf0[56]; + bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit); + bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit); + bf1[59] = bf0[59]; + bf1[60] = bf0[60]; + bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit); + bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit); + bf1[63] = bf0[63]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit); + bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit); + bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit); + bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit); + bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit); + bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit); + bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit); + bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit); + bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit); + bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit); + bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit); + bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit); + bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit); + bf1[32] = bf0[32] + bf0[33]; + bf1[33] = -bf0[33] + bf0[32]; + bf1[34] = -bf0[34] + bf0[35]; + bf1[35] = bf0[35] + bf0[34]; + bf1[36] = bf0[36] + bf0[37]; + bf1[37] = -bf0[37] + bf0[36]; + bf1[38] = -bf0[38] + bf0[39]; + bf1[39] = bf0[39] + bf0[38]; + bf1[40] = bf0[40] + bf0[41]; + bf1[41] = -bf0[41] + bf0[40]; + bf1[42] = -bf0[42] + bf0[43]; + bf1[43] = bf0[43] + bf0[42]; + bf1[44] = bf0[44] + bf0[45]; + bf1[45] = -bf0[45] + bf0[44]; + bf1[46] = -bf0[46] + bf0[47]; + bf1[47] = bf0[47] + bf0[46]; + bf1[48] = bf0[48] + bf0[49]; + bf1[49] = -bf0[49] + bf0[48]; + bf1[50] = -bf0[50] + bf0[51]; + bf1[51] = bf0[51] + bf0[50]; + bf1[52] = bf0[52] + bf0[53]; + bf1[53] = -bf0[53] + bf0[52]; + bf1[54] = -bf0[54] + bf0[55]; + bf1[55] = bf0[55] + bf0[54]; + bf1[56] = bf0[56] + bf0[57]; + bf1[57] = -bf0[57] + bf0[56]; + bf1[58] = -bf0[58] + bf0[59]; + bf1[59] = bf0[59] + bf0[58]; + bf1[60] = bf0[60] + bf0[61]; + bf1[61] = -bf0[61] + bf0[60]; + bf1[62] = -bf0[62] + bf0[63]; + bf1[63] = bf0[63] + bf0[62]; + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 10 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = 
bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = bf0[21]; + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = bf0[26]; + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit); + bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit); + bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit); + bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit); + bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit); + bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit); + bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit); + bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit); + bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit); + bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit); + bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit); + bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit); + bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit); + bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit); + bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit); + bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit); + bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit); + bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit); + bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit); + bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit); + bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit); + bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit); + bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit); + bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit); + bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit); + bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit); + bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit); + bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit); + bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit); + bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit); + bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit); + bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit); + range_check(stage, input, bf1, size, stage_range[stage]); + + // stage 11 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[32]; + bf1[2] = bf0[16]; + bf1[3] = bf0[48]; + bf1[4] = bf0[8]; + bf1[5] = bf0[40]; + bf1[6] = bf0[24]; + bf1[7] = bf0[56]; + bf1[8] = bf0[4]; + bf1[9] = bf0[36]; + bf1[10] = bf0[20]; + bf1[11] = bf0[52]; + bf1[12] = bf0[12]; + bf1[13] = bf0[44]; + bf1[14] = bf0[28]; + bf1[15] = bf0[60]; + bf1[16] = bf0[2]; + bf1[17] = bf0[34]; + bf1[18] = bf0[18]; + bf1[19] = bf0[50]; + bf1[20] = bf0[10]; + bf1[21] = bf0[42]; + bf1[22] = bf0[26]; + bf1[23] = bf0[58]; + bf1[24] = bf0[6]; + bf1[25] = bf0[38]; + bf1[26] = bf0[22]; + bf1[27] = bf0[54]; + bf1[28] = bf0[14]; + bf1[29] = bf0[46]; + bf1[30] = bf0[30]; + bf1[31] = bf0[62]; + bf1[32] = bf0[1]; + bf1[33] = bf0[33]; + bf1[34] = bf0[17]; + bf1[35] = bf0[49]; + bf1[36] = bf0[9]; + bf1[37] = bf0[41]; + bf1[38] = bf0[25]; + bf1[39] = bf0[57]; + bf1[40] = bf0[5]; + bf1[41] = bf0[37]; + bf1[42] = bf0[21]; + bf1[43] = 
bf0[53]; + bf1[44] = bf0[13]; + bf1[45] = bf0[45]; + bf1[46] = bf0[29]; + bf1[47] = bf0[61]; + bf1[48] = bf0[3]; + bf1[49] = bf0[35]; + bf1[50] = bf0[19]; + bf1[51] = bf0[51]; + bf1[52] = bf0[11]; + bf1[53] = bf0[43]; + bf1[54] = bf0[27]; + bf1[55] = bf0[59]; + bf1[56] = bf0[7]; + bf1[57] = bf0[39]; + bf1[58] = bf0[23]; + bf1[59] = bf0[55]; + bf1[60] = bf0[15]; + bf1[61] = bf0[47]; + bf1[62] = bf0[31]; + bf1[63] = bf0[63]; + range_check(stage, input, bf1, size, stage_range[stage]); +} diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h new file mode 100644 index 000000000..9472af8e6 --- /dev/null +++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_FWD_TXFM1D_H_ +#define AV1_FWD_TXFM1D_H_ + +#include "av1/common/av1_txfm.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +#ifdef __cplusplus +} +#endif + +#endif // AV1_FWD_TXFM1D_H_ diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h new file mode 100644 index 000000000..174689a14 --- /dev/null +++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AV1_FWD_TXFM2D_CFG_H_
+#define AV1_FWD_TXFM2D_CFG_H_
+#include "av1/common/enums.h"
+#include "av1/encoder/av1_fwd_txfm1d.h"
+extern const int8_t *fwd_txfm_shift_ls[TX_SIZES_ALL];
+extern const int8_t fwd_cos_bit_col[5][5];
+extern const int8_t fwd_cos_bit_row[5][5];
+#endif  // AV1_FWD_TXFM2D_CFG_H_
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm2d.c b/third_party/aom/av1/encoder/av1_fwd_txfm2d.c
new file mode 100644
index 000000000..f25a667cf
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm2d.c
@@ -0,0 +1,431 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/txfm_common.h"
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+
+static INLINE TxfmFunc fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
+  switch (txfm_type) {
+    case TXFM_TYPE_DCT4: return av1_fdct4_new;
+    case TXFM_TYPE_DCT8: return av1_fdct8_new;
+    case TXFM_TYPE_DCT16: return av1_fdct16_new;
+    case TXFM_TYPE_DCT32: return av1_fdct32_new;
+    case TXFM_TYPE_DCT64: return av1_fdct64_new;
+    case TXFM_TYPE_ADST4: return av1_fadst4_new;
+    case TXFM_TYPE_ADST8: return av1_fadst8_new;
+    case TXFM_TYPE_ADST16: return av1_fadst16_new;
+    case TXFM_TYPE_IDENTITY4: return av1_fidentity4_c;
+    case TXFM_TYPE_IDENTITY8: return av1_fidentity8_c;
+    case TXFM_TYPE_IDENTITY16: return av1_fidentity16_c;
+    case TXFM_TYPE_IDENTITY32: return av1_fidentity32_c;
+    default: assert(0); return NULL;
+  }
+}
+
+void av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
+                             const TXFM_2D_FLIP_CFG *cfg, int bd) {
+  // Take the shift from the larger dimension in the rectangular case.
+  const int8_t *shift = cfg->shift;
+  // Bounding i by MAX_TXFM_STAGE_NUM mutes an array-bounds warning.
+  for (int i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) {
+    stage_range_col[i] = cfg->stage_range_col[i] + shift[0] + bd + 1;
+  }
+
+  // Bounding i by MAX_TXFM_STAGE_NUM mutes an array-bounds warning.
+  for (int i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) {
+    stage_range_row[i] = cfg->stage_range_row[i] + shift[0] + shift[1] + bd + 1;
+  }
+}
+
+static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output,
+                                const int stride, const TXFM_2D_FLIP_CFG *cfg,
+                                int32_t *buf, int bd) {
+  int c, r;
+  // Note when assigning txfm_size_col, we use the txfm_size from the
+  // row configuration and vice versa. This is intentionally done to
+  // accurately perform rectangular transforms. When the transform is
+  // rectangular, the number of columns will be the same as the
+  // txfm_size stored in the row cfg struct. It will make no difference
+  // for square transforms.
+  const int txfm_size_col = tx_size_wide[cfg->tx_size];
+  const int txfm_size_row = tx_size_high[cfg->tx_size];
+  // Take the shift from the larger dimension in the rectangular case.
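+  // shift[0] is applied to the residue before the column transform,
+  // shift[1] between the column and row passes, and shift[2] to the
+  // final row-transform output (see the av1_round_shift_array calls
+  // below).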
+ const int8_t *shift = cfg->shift; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + int8_t stage_range_col[MAX_TXFM_STAGE_NUM]; + int8_t stage_range_row[MAX_TXFM_STAGE_NUM]; + assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM); + assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM); + av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bd); + + const int8_t cos_bit_col = cfg->cos_bit_col; + const int8_t cos_bit_row = cfg->cos_bit_row; + const TxfmFunc txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); + const TxfmFunc txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row); + + // use output buffer as temp buffer + int32_t *temp_in = output; + int32_t *temp_out = output + txfm_size_row; + + // Columns + for (c = 0; c < txfm_size_col; ++c) { + if (cfg->ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) temp_in[r] = input[r * stride + c]; + } else { + for (r = 0; r < txfm_size_row; ++r) + // flip upside down + temp_in[r] = input[(txfm_size_row - r - 1) * stride + c]; + } + av1_round_shift_array(temp_in, txfm_size_row, -shift[0]); + txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + if (cfg->lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + buf[r * txfm_size_col + c] = temp_out[r]; + } else { + for (r = 0; r < txfm_size_row; ++r) + // flip from left to right + buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r]; + } + } + + // Rows + for (r = 0; r < txfm_size_row; ++r) { + txfm_func_row(buf + r * txfm_size_col, output + r * txfm_size_col, + cos_bit_row, stage_range_row); + av1_round_shift_array(output + r * txfm_size_col, txfm_size_col, -shift[2]); + if (abs(rect_type) == 1) { + // Multiply everything by Sqrt2 if the transform is rectangular and the + // size difference is a factor of 2. 
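+      // Worked example of the fixed-point scaling below (NewSqrt2 == 5793
+      // and NewSqrt2Bits == 12 in this tree's av1_txfm.h, so NewSqrt2 is
+      // roughly sqrt(2) * 4096):
+      //   round_shift(100 * 5793, 12) == (579300 + 2048) >> 12 == 141,
+      // i.e. approximately 100 * sqrt(2).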
+ for (c = 0; c < txfm_size_col; ++c) { + output[r * txfm_size_col + c] = round_shift( + (int64_t)output[r * txfm_size_col + c] * NewSqrt2, NewSqrt2Bits); + } + } + } +} + +void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 8]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_4X8, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[8 * 4]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_8X4, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[8 * 16]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_8X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[16 * 8]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X8, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[16 * 32]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X32, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[32 * 16]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_4x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 16]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_4X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_16x4_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[16 * 4]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X4, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_8x32_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 8]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_8X32, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_32x8_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[32 * 8]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X8, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[4 * 4]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_4X4, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[8 * 8]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_8X8, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void 
av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[16 * 16]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[32 * 32]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[64 * 64]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + + // Zero out top-right 32x32 area. + for (int row = 0; row < 32; ++row) { + memset(output + row * 64 + 32, 0, 32 * sizeof(*output)); + } + // Zero out the bottom 64x32 area. + memset(output + 32 * 64, 0, 32 * 64 * sizeof(*output)); + // Re-pack non-zero coeffs in the first 32x32 indices. + for (int row = 1; row < 32; ++row) { + memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output)); + } +} + +void av1_fwd_txfm2d_32x64_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 64]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X64, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + // Zero out the bottom 32x32 area. + memset(output + 32 * 32, 0, 32 * 32 * sizeof(*output)); + // Note: no repacking needed here. +} + +void av1_fwd_txfm2d_64x32_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[64 * 32]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_64X32, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + + // Zero out right 32x32 area. + for (int row = 0; row < 32; ++row) { + memset(output + row * 64 + 32, 0, 32 * sizeof(*output)); + } + // Re-pack non-zero coeffs in the first 32x32 indices. + for (int row = 1; row < 32; ++row) { + memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output)); + } +} + +void av1_fwd_txfm2d_16x64_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[64 * 16]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X64, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + // Zero out the bottom 16x32 area. + memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output)); + // Note: no repacking needed here. +} + +void av1_fwd_txfm2d_64x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[64 * 16]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_64X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + // Zero out right 32x16 area. + for (int row = 0; row < 16; ++row) { + memset(output + row * 64 + 32, 0, 32 * sizeof(*output)); + } + // Re-pack non-zero coeffs in the first 32x16 indices. 
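+  // Row 0 is already in place (output + 0 * 32 == output + 0 * 64), so the
+  // copy starts at row 1; for every row >= 1, row * 32 + 32 <= row * 64, so
+  // the memcpy source and destination regions never overlap.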
+ for (int row = 1; row < 16; ++row) { + memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output)); + } +} + +static const int8_t fwd_shift_4x4[3] = { 2, 0, 0 }; +static const int8_t fwd_shift_8x8[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_16x16[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_32x32[3] = { 2, -4, 0 }; +static const int8_t fwd_shift_64x64[3] = { 0, -2, -2 }; +static const int8_t fwd_shift_4x8[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_8x4[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_8x16[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_16x8[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_16x32[3] = { 2, -4, 0 }; +static const int8_t fwd_shift_32x16[3] = { 2, -4, 0 }; +static const int8_t fwd_shift_32x64[3] = { 0, -2, -2 }; +static const int8_t fwd_shift_64x32[3] = { 2, -4, -2 }; +static const int8_t fwd_shift_4x16[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_16x4[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_8x32[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_32x8[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_16x64[3] = { 0, -2, 0 }; +static const int8_t fwd_shift_64x16[3] = { 2, -4, 0 }; + +const int8_t *fwd_txfm_shift_ls[TX_SIZES_ALL] = { + fwd_shift_4x4, fwd_shift_8x8, fwd_shift_16x16, fwd_shift_32x32, + fwd_shift_64x64, fwd_shift_4x8, fwd_shift_8x4, fwd_shift_8x16, + fwd_shift_16x8, fwd_shift_16x32, fwd_shift_32x16, fwd_shift_32x64, + fwd_shift_64x32, fwd_shift_4x16, fwd_shift_16x4, fwd_shift_8x32, + fwd_shift_32x8, fwd_shift_16x64, fwd_shift_64x16, +}; + +const int8_t fwd_cos_bit_col[MAX_TXWH_IDX /*txw_idx*/] + [MAX_TXWH_IDX /*txh_idx*/] = { + { 13, 13, 13, 0, 0 }, + { 13, 13, 13, 12, 0 }, + { 13, 13, 13, 12, 13 }, + { 0, 13, 13, 12, 13 }, + { 0, 0, 13, 12, 13 } + }; + +const int8_t fwd_cos_bit_row[MAX_TXWH_IDX /*txw_idx*/] + [MAX_TXWH_IDX /*txh_idx*/] = { + { 13, 13, 12, 0, 0 }, + { 13, 13, 13, 12, 0 }, + { 13, 13, 12, 13, 12 }, + { 0, 12, 13, 12, 11 }, + { 0, 0, 12, 11, 10 } + }; + +static const int8_t fdct4_range_mult2[4] = { 0, 2, 3, 3 }; +static const int8_t fdct8_range_mult2[6] = { 0, 2, 4, 5, 5, 5 }; +static const int8_t fdct16_range_mult2[8] = { 0, 2, 4, 6, 7, 7, 7, 7 }; +static const int8_t fdct32_range_mult2[10] = { 0, 2, 4, 6, 8, 9, 9, 9, 9, 9 }; +static const int8_t fdct64_range_mult2[12] = { 0, 2, 4, 6, 8, 10, + 11, 11, 11, 11, 11, 11 }; + +static const int8_t fadst4_range_mult2[7] = { 0, 2, 4, 3, 3, 3, 3 }; +static const int8_t fadst8_range_mult2[8] = { 0, 0, 1, 3, 3, 5, 5, 5 }; +static const int8_t fadst16_range_mult2[10] = { 0, 0, 1, 3, 3, 5, 5, 7, 7, 7 }; + +static const int8_t max_fwd_range_mult2_col[5] = { 3, 5, 7, 9, 11 }; + +static const int8_t fidtx4_range_mult2[1] = { 1 }; +static const int8_t fidtx8_range_mult2[1] = { 2 }; +static const int8_t fidtx16_range_mult2[1] = { 3 }; +static const int8_t fidtx32_range_mult2[1] = { 4 }; + +#if 0 +const int8_t fwd_idtx_range_row[MAX_TXWH_IDX /*txw_idx*/] + [MAX_TXWH_IDX /*txh_idx*/] = { { 2, 4, 5, 0, 0 }, + { 3, 4, 5, 6, 0 }, + { 4, 5, 6, 7, 8 }, + { 0, 5, 6, 7, 8 }, + { 0, 0, 7, 8, + 9 } }; +#endif + +const int8_t *fwd_txfm_range_mult2_list[TXFM_TYPES] = { + fdct4_range_mult2, fdct8_range_mult2, fdct16_range_mult2, + fdct32_range_mult2, fdct64_range_mult2, fadst4_range_mult2, + fadst8_range_mult2, fadst16_range_mult2, fidtx4_range_mult2, + fidtx8_range_mult2, fidtx16_range_mult2, fidtx32_range_mult2 +}; + +static INLINE void set_fwd_txfm_non_scale_range(TXFM_2D_FLIP_CFG *cfg) { + const int txh_idx = get_txh_idx(cfg->tx_size); + 
av1_zero(cfg->stage_range_col); + av1_zero(cfg->stage_range_row); + + if (cfg->txfm_type_col != TXFM_TYPE_INVALID) { + int stage_num_col = cfg->stage_num_col; + const int8_t *range_mult2_col = + fwd_txfm_range_mult2_list[cfg->txfm_type_col]; + for (int i = 0; i < stage_num_col; ++i) + cfg->stage_range_col[i] = (range_mult2_col[i] + 1) >> 1; + } + + if (cfg->txfm_type_row != TXFM_TYPE_INVALID) { + int stage_num_row = cfg->stage_num_row; + const int8_t *range_mult2_row = + fwd_txfm_range_mult2_list[cfg->txfm_type_row]; + for (int i = 0; i < stage_num_row; ++i) + cfg->stage_range_row[i] = + (max_fwd_range_mult2_col[txh_idx] + range_mult2_row[i] + 1) >> 1; + } +} + +void av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, + TXFM_2D_FLIP_CFG *cfg) { + assert(cfg != NULL); + cfg->tx_size = tx_size; + set_flip_cfg(tx_type, cfg); + const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type]; + const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type]; + const int txw_idx = tx_size_wide_log2[tx_size] - tx_size_wide_log2[0]; + const int txh_idx = tx_size_high_log2[tx_size] - tx_size_high_log2[0]; + cfg->shift = fwd_txfm_shift_ls[tx_size]; + cfg->cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + cfg->cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col]; + cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row]; + cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col]; + cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row]; + set_fwd_txfm_non_scale_range(cfg); +} diff --git a/third_party/aom/av1/encoder/av1_quantize.c b/third_party/aom/av1/encoder/av1_quantize.c index 033b4ba1a..1c5bdeb25 100644 --- a/third_party/aom/av1/encoder/av1_quantize.c +++ b/third_party/aom/av1/encoder/av1_quantize.c @@ -10,7 +10,9 @@ */ #include <math.h> -#include "./aom_dsp_rtcd.h" + +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/quantize.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" @@ -24,413 +26,6 @@ #include "av1/encoder/encoder.h" #include "av1/encoder/rd.h" -#if CONFIG_NEW_QUANT -static INLINE int quantize_coeff_nuq( - const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift, - const int16_t dequant, const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr) { - const int coeff = coeffv; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int i, q; - int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX); - for (i = 0; i < NUQ_KNOTS; i++) { - if (tmp < cuml_bins_ptr[i]) { - q = i; - break; - } - } - if (i == NUQ_KNOTS) { - tmp -= cuml_bins_ptr[NUQ_KNOTS - 1]; - q = NUQ_KNOTS + (((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16); - } - if (q) { - *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val); - *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign; - *dqcoeff_ptr = *qcoeff_ptr < 0 ? 
-*dqcoeff_ptr : *dqcoeff_ptr; - } else { - *qcoeff_ptr = 0; - *dqcoeff_ptr = 0; - } - return (q != 0); -} - -static INLINE int quantize_coeff_bigtx_nuq( - const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift, - const int16_t dequant, const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, int logsizeby16) { - const int coeff = coeffv; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int i, q; - int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX); - for (i = 0; i < NUQ_KNOTS; i++) { - if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], logsizeby16)) { - q = i; - break; - } - } - if (i == NUQ_KNOTS) { - tmp -= ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16); - q = NUQ_KNOTS + - (((((tmp * quant) >> 16) + tmp) * quant_shift) >> (16 - logsizeby16)); - } - if (q) { - *dqcoeff_ptr = ROUND_POWER_OF_TWO( - av1_dequant_abscoeff_nuq(q, dequant, dequant_val), logsizeby16); - // *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val) >> - // (logsizeby16); - *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign; - *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr; - } else { - *qcoeff_ptr = 0; - *dqcoeff_ptr = 0; - } - return (q != 0); -} - -static INLINE int quantize_coeff_fp_nuq( - const tran_low_t coeffv, const int16_t quant, const int16_t dequant, - const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr) { - const int coeff = coeffv; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int i, q; - int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX); - for (i = 0; i < NUQ_KNOTS; i++) { - if (tmp < cuml_bins_ptr[i]) { - q = i; - break; - } - } - if (i == NUQ_KNOTS) { - q = NUQ_KNOTS + - ((((int64_t)tmp - cuml_bins_ptr[NUQ_KNOTS - 1]) * quant) >> 16); - } - if (q) { - *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val); - *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign; - *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr; - } else { - *qcoeff_ptr = 0; - *dqcoeff_ptr = 0; - } - return (q != 0); -} - -static INLINE int quantize_coeff_bigtx_fp_nuq( - const tran_low_t coeffv, const int16_t quant, const int16_t dequant, - const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int logsizeby16) { - const int coeff = coeffv; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int i, q; - int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX); - for (i = 0; i < NUQ_KNOTS; i++) { - if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], logsizeby16)) { - q = i; - break; - } - } - if (i == NUQ_KNOTS) { - q = NUQ_KNOTS + - ((((int64_t)tmp - - ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16)) * - quant) >> - (16 - logsizeby16)); - } - if (q) { - *dqcoeff_ptr = ROUND_POWER_OF_TWO( - av1_dequant_abscoeff_nuq(q, dequant, dequant_val), logsizeby16); - // *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val) >> - // (logsizeby16); - *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign; - *dqcoeff_ptr = *qcoeff_ptr < 0 ? 
-*dqcoeff_ptr : *dqcoeff_ptr; - } else { - *qcoeff_ptr = 0; - *dqcoeff_ptr = 0; - } - return (q != 0); -} - -void quantize_dc_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t quant, - const int16_t quant_shift, const int16_t dequant, - const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - const int rc = 0; - if (quantize_coeff_nuq(coeff_ptr[rc], quant, quant_shift, dequant, - cuml_bins_ptr, dequant_val, qcoeff_ptr, dqcoeff_ptr)) - eob = 0; - } - *eob_ptr = eob + 1; -} - -void quantize_dc_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t quant, - const int16_t dequant, const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - const int rc = 0; - if (quantize_coeff_fp_nuq(coeff_ptr[rc], quant, dequant, cuml_bins_ptr, - dequant_val, qcoeff_ptr, dqcoeff_ptr)) - eob = 0; - } - *eob_ptr = eob + 1; -} - -void quantize_dc_32x32_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t quant, - const int16_t quant_shift, const int16_t dequant, - const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - const int rc = 0; - if (quantize_coeff_bigtx_nuq(coeff_ptr[rc], quant, quant_shift, dequant, - cuml_bins_ptr, dequant_val, qcoeff_ptr, - dqcoeff_ptr, av1_get_tx_scale(TX_32X32))) - eob = 0; - } - *eob_ptr = eob + 1; -} - -void quantize_dc_32x32_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t quant, - const int16_t dequant, - const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - const int rc = 0; - if (quantize_coeff_bigtx_fp_nuq(coeff_ptr[rc], quant, dequant, - cuml_bins_ptr, dequant_val, qcoeff_ptr, - dqcoeff_ptr, av1_get_tx_scale(TX_32X32))) - eob = 0; - } - *eob_ptr = eob + 1; -} - -#if CONFIG_TX64X64 -void quantize_dc_64x64_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t quant, - const int16_t quant_shift, const int16_t dequant, - const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - const int rc = 0; - if (quantize_coeff_bigtx_nuq(coeff_ptr[rc], quant, quant_shift, dequant, - cuml_bins_ptr, dequant_val, qcoeff_ptr, - dqcoeff_ptr, av1_get_tx_scale(TX_64X64))) - eob = 0; - } - *eob_ptr = eob + 1; -} - -void quantize_dc_64x64_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t quant, - const int16_t dequant, - const 
tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - const int rc = 0; - if (quantize_coeff_bigtx_fp_nuq(coeff_ptr[rc], quant, dequant, - cuml_bins_ptr, dequant_val, qcoeff_ptr, - dqcoeff_ptr, av1_get_tx_scale(TX_64X64))) - eob = 0; - } - *eob_ptr = eob + 1; -} -#endif // CONFIG_TX64X64 - -void quantize_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, - const cuml_bins_type_nuq *cuml_bins_ptr, - const dequant_val_type_nuq *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const uint8_t *band) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - int i; - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - if (quantize_coeff_nuq(coeff_ptr[rc], quant_ptr[rc != 0], - quant_shift_ptr[rc != 0], dequant_ptr[rc != 0], - cuml_bins_ptr[band[i]], dequant_val[band[i]], - &qcoeff_ptr[rc], &dqcoeff_ptr[rc])) - eob = i; - } - } - *eob_ptr = eob + 1; -} - -void quantize_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *quant_ptr, - const int16_t *dequant_ptr, - const cuml_bins_type_nuq *cuml_bins_ptr, - const dequant_val_type_nuq *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const uint8_t *band) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - int i; - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - if (quantize_coeff_fp_nuq(coeff_ptr[rc], quant_ptr[rc != 0], - dequant_ptr[rc != 0], cuml_bins_ptr[band[i]], - dequant_val[band[i]], &qcoeff_ptr[rc], - &dqcoeff_ptr[rc])) - eob = i; - } - } - *eob_ptr = eob + 1; -} - -void quantize_32x32_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - const int16_t *dequant_ptr, - const cuml_bins_type_nuq *cuml_bins_ptr, - const dequant_val_type_nuq *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const uint8_t *band) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - int i; - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - if (quantize_coeff_bigtx_nuq( - coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0], - dequant_ptr[rc != 0], cuml_bins_ptr[band[i]], - dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc], - av1_get_tx_scale(TX_32X32))) - eob = i; - } - } - *eob_ptr = eob + 1; -} - -void quantize_32x32_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *quant_ptr, - const int16_t *dequant_ptr, - const cuml_bins_type_nuq *cuml_bins_ptr, - const dequant_val_type_nuq *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const uint8_t *band) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if 
(!skip_block) { - int i; - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - if (quantize_coeff_bigtx_fp_nuq( - coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0], - cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc], - &dqcoeff_ptr[rc], av1_get_tx_scale(TX_32X32))) - eob = i; - } - } - *eob_ptr = eob + 1; -} - -#if CONFIG_TX64X64 -void quantize_64x64_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - const int16_t *dequant_ptr, - const cuml_bins_type_nuq *cuml_bins_ptr, - const dequant_val_type_nuq *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const uint8_t *band) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - int i; - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - if (quantize_coeff_bigtx_nuq( - coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0], - dequant_ptr[rc != 0], cuml_bins_ptr[band[i]], - dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc], - av1_get_tx_scale(TX_64X64))) - eob = i; - } - } - *eob_ptr = eob + 1; -} - -void quantize_64x64_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *quant_ptr, - const int16_t *dequant_ptr, - const cuml_bins_type_nuq *cuml_bins_ptr, - const dequant_val_type_nuq *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const uint8_t *band) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - int i; - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - if (quantize_coeff_bigtx_fp_nuq( - coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0], - cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc], - &dqcoeff_ptr[rc], av1_get_tx_scale(TX_64X64))) - eob = i; - } - } - *eob_ptr = eob + 1; -} -#endif // CONFIG_TX64X64 -#endif // CONFIG_NEW_QUANT - void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) { memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); @@ -439,8 +34,8 @@ void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr, } static void quantize_fp_helper_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, @@ -450,12 +45,45 @@ static void quantize_fp_helper_c( // quantization process is completed. 
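+  // In the no-quant-matrix fast path added below, the early test
+  //   (abs_coeff << (1 + log_scale)) >= dequant
+  // is a dead-zone check: it skips any coefficient whose magnitude is below
+  // roughly half the effective step size (dequant >> log_scale), so the
+  // clamp, multiply, and rounding run only for coefficients that can
+  // quantize to a nonzero value.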
(void)zbin_ptr; (void)quant_shift_ptr; - (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { + if (qm_ptr == NULL && iqm_ptr == NULL) { + const int rounding0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale); + { // rc == 0 + const int coeff = coeff_ptr[0]; + const int coeff_sign = (coeff >> 31); + int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + if ((abs_coeff << (1 + log_scale)) >= (int32_t)(dequant_ptr[0])) { + abs_coeff = clamp64(abs_coeff + rounding0, INT16_MIN, INT16_MAX); + const int tmp32 = (int)((abs_coeff * quant_ptr[0]) >> (16 - log_scale)); + if (tmp32) { + qcoeff_ptr[0] = (tmp32 ^ coeff_sign) - coeff_sign; + const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[0]) >> log_scale; + dqcoeff_ptr[0] = (abs_dqcoeff ^ coeff_sign) - coeff_sign; + eob = 0; + } + } + } + const int rounding1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale); + const int32_t thresh1 = (int32_t)(dequant_ptr[1]); + for (i = 1; i < n_coeffs; i++) { + const int coeff = coeff_ptr[i]; + const int coeff_sign = (coeff >> 31); + int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + if ((abs_coeff << (1 + log_scale)) >= thresh1) { + abs_coeff = clamp64(abs_coeff + rounding1, INT16_MIN, INT16_MAX); + const int tmp32 = (int)((abs_coeff * quant_ptr[1]) >> (16 - log_scale)); + if (tmp32) { + qcoeff_ptr[i] = (tmp32 ^ coeff_sign) - coeff_sign; + const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[1]) >> log_scale; + dqcoeff_ptr[i] = (abs_dqcoeff ^ coeff_sign) - coeff_sign; + eob = AOMMAX(iscan[i], eob); + } + } + } + } else { // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. for (i = 0; i < n_coeffs; i++) { @@ -476,7 +104,8 @@ static void quantize_fp_helper_c( tmp32 = (int)((abs_coeff * wt * quant_ptr[rc != 0]) >> (16 - log_scale + AOM_QM_BITS)); qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / (1 << log_scale); + const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; + dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign; } if (tmp32) eob = i; @@ -486,15 +115,14 @@ static void quantize_fp_helper_c( } static void highbd_quantize_fp_helper_c( - const tran_low_t *coeff_ptr, intptr_t count, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, int log_scale) { int i; int eob = -1; - const int scale = 1 << log_scale; const int shift = 16 - log_scale; // TODO(jingning) Decide the need of these arguments after the // quantization process is completed. @@ -502,10 +130,7 @@ static void highbd_quantize_fp_helper_c( (void)quant_shift_ptr; (void)iscan; - memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { + if (qm_ptr || iqm_ptr) { // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. 
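+    // wt and iwt are quant-matrix weights in AOM_QM_BITS (5 in this tree)
+    // fixed point, so the loop below applies an effective quantizer of
+    // roughly quant * wt >> AOM_QM_BITS and an effective dequantizer of
+    // dequant * iwt >> AOM_QM_BITS, with (1 << (AOM_QM_BITS - 1)) added for
+    // rounding.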
for (i = 0; i < count; i++) { @@ -517,150 +142,170 @@ static void highbd_quantize_fp_helper_c( (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; const int coeff_sign = (coeff >> 31); + const int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int abs_qcoeff = 0; + if (abs_coeff * wt >= + (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) { + const int64_t tmp = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale); + abs_qcoeff = + (int)((tmp * quant_ptr[rc != 0] * wt) >> (shift + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + if (abs_qcoeff) eob = i; + } else { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + } + } else { + const int log_scaled_round_arr[2] = { + ROUND_POWER_OF_TWO(round_ptr[0], log_scale), + ROUND_POWER_OF_TWO(round_ptr[1], log_scale), + }; + for (i = 0; i < count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int rc01 = (rc != 0); + const int coeff_sign = (coeff >> 31); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp = abs_coeff + (round_ptr[rc != 0] >> log_scale); - const int abs_qcoeff = - (int)((tmp * quant_ptr[rc != 0] * wt) >> (shift + AOM_QM_BITS)); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / scale; - if (abs_qcoeff) eob = i; + const int log_scaled_round = log_scaled_round_arr[rc01]; + if ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01]) { + const int quant = quant_ptr[rc01]; + const int dequant = dequant_ptr[rc01]; + const int64_t tmp = (int64_t)abs_coeff + log_scaled_round; + const int abs_qcoeff = (int)((tmp * quant) >> shift); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + if (abs_qcoeff) eob = i; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + } else { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } } } *eob_ptr = eob + 1; } void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { - quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, - dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 0); + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 0); } void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t 
*scan, const int16_t *iscan) { - quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, - dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 1); + quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 1); } -#if CONFIG_TX64X64 void av1_quantize_fp_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, - dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 2); + quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 2); } -#endif // CONFIG_TX64X64 void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { - // obsolete skip_block - const int skip_block = 0; -#if CONFIG_AOM_QM const qm_val_t *qm_ptr = qparam->qmatrix; const qm_val_t *iqm_ptr = qparam->iqmatrix; if (qm_ptr != NULL && iqm_ptr != NULL) { - quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr, - pd->dequant, eob_ptr, sc->scan, sc->iscan, qm_ptr, - iqm_ptr, qparam->log_scale); + quantize_fp_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); } else { -#endif switch (qparam->log_scale) { case 0: if (n_coeffs < 16) { // TODO(jingning): Need SIMD implementation for smaller block size // quantization. 
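+        // log_scale selects the transform-size class here: 0 for the
+        // 16x16-and-smaller kernels, 1 for the 32x32 class, 2 for the
+        // 64x64 class. The n_coeffs < 16 case stays on the plain C helper
+        // because, per the TODO above, SIMD coverage for smaller blocks is
+        // still missing.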
quantize_fp_helper_c( - coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr, pd->dequant, - eob_ptr, sc->scan, sc->iscan, NULL, NULL, qparam->log_scale); + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, + p->dequant_QTX, eob_ptr, sc->scan, sc->iscan, NULL, NULL, 0); } else { - av1_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr, - pd->dequant, eob_ptr, sc->scan, sc->iscan); + av1_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); } break; case 1: - av1_quantize_fp_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, - p->round_fp, p->quant_fp, p->quant_shift, - qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr, - sc->scan, sc->iscan); + av1_quantize_fp_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); break; -#if CONFIG_TX64X64 case 2: - av1_quantize_fp_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin, - p->round_fp, p->quant_fp, p->quant_shift, - qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr, - sc->scan, sc->iscan); + av1_quantize_fp_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); break; -#endif // CONFIG_TX64X64 default: assert(0); } -#if CONFIG_AOM_QM } -#endif } void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr, const SCAN_ORDER *sc, - const QUANT_PARAM *qparam) { + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { // obsolete skip_block const int skip_block = 0; -#if CONFIG_AOM_QM const qm_val_t *qm_ptr = qparam->qmatrix; const qm_val_t *iqm_ptr = qparam->iqmatrix; if (qm_ptr != NULL && iqm_ptr != NULL) { - quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr, - pd->dequant, eob_ptr, sc->scan, sc->iscan, qm_ptr, - iqm_ptr, qparam->log_scale); + quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, + p->round_QTX, p->quant_QTX, p->quant_shift_QTX, + qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan, qm_ptr, iqm_ptr, + qparam->log_scale); } else { -#endif // CONFIG_AOM_QM - switch (qparam->log_scale) { case 0: - aom_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr, - pd->dequant, eob_ptr, sc->scan, sc->iscan); + aom_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, + p->round_QTX, p->quant_QTX, p->quant_shift_QTX, + qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan); break; case 1: - aom_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr, - pd->dequant, eob_ptr, sc->scan, sc->iscan); + aom_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, + p->round_QTX, p->quant_QTX, p->quant_shift_QTX, + qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan); break; -#if CONFIG_TX64X64 case 2: - aom_quantize_b_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin, 
p->round, - p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr, - pd->dequant, eob_ptr, sc->scan, sc->iscan); + aom_quantize_b_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, + p->round_QTX, p->quant_QTX, p->quant_shift_QTX, + qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan); break; -#endif // CONFIG_TX64X64 default: assert(0); } -#if CONFIG_AOM_QM } -#endif } static void quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, @@ -689,7 +334,8 @@ static void quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, tmp32 = (int32_t)((tmp * wt * quant) >> (16 - log_scale + AOM_QM_BITS)); qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; dequant = (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; - dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / (1 << log_scale); + const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); if (tmp32) eob = 0; } *eob_ptr = eob + 1; @@ -697,237 +343,97 @@ static void quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { // obsolete skip_block const int skip_block = 0; (void)sc; - assert(qparam->log_scale >= 0 && qparam->log_scale < (2 + CONFIG_TX64X64)); -#if CONFIG_AOM_QM + assert(qparam->log_scale >= 0 && qparam->log_scale < (3)); const qm_val_t *qm_ptr = qparam->qmatrix; const qm_val_t *iqm_ptr = qparam->iqmatrix; -#else - const qm_val_t *qm_ptr = NULL; - const qm_val_t *iqm_ptr = NULL; -#endif - quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round, p->quant_fp[0], - qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], eob_ptr, qm_ptr, iqm_ptr, - qparam->log_scale); -} - -#if CONFIG_NEW_QUANT -void av1_quantize_b_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const MACROBLOCK_PLANE *p, - tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, - const SCAN_ORDER *sc, - const QUANT_PARAM *qparam) { - // obsolete skip_block - const int skip_block = 0; - const uint8_t *band = get_band_translate(qparam->tx_size); - int dq = qparam->dq; - - switch (qparam->log_scale) { - case 0: - quantize_nuq(coeff_ptr, n_coeffs, skip_block, p->quant, p->quant_shift, - pd->dequant, - (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], - qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band); - break; - case 1: - quantize_32x32_nuq(coeff_ptr, n_coeffs, skip_block, p->quant, - p->quant_shift, pd->dequant, - (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], - qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band); - break; -#if CONFIG_TX64X64 - case 2: - quantize_64x64_nuq(coeff_ptr, n_coeffs, skip_block, p->quant, - p->quant_shift, pd->dequant, - (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], - qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band); - break; -#endif // CONFIG_TX64X64 - default: assert(0); - } -} - -void av1_quantize_fp_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const MACROBLOCK_PLANE *p, - tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, - const SCAN_ORDER *sc, - const QUANT_PARAM *qparam) { - // obsolete skip_block - const 
int skip_block = 0; - const uint8_t *band = get_band_translate(qparam->tx_size); - int dq = qparam->dq; - - switch (qparam->log_scale) { - case 0: - quantize_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant, - (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], - qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band); - break; - case 1: - quantize_32x32_fp_nuq( - coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant, - (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr, - dqcoeff_ptr, eob_ptr, sc->scan, band); - break; -#if CONFIG_TX64X64 - case 2: - quantize_64x64_fp_nuq( - coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant, - (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr, - dqcoeff_ptr, eob_ptr, sc->scan, band); - break; -#endif // CONFIG_TX64X64 - default: assert(0); - } + quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round_QTX, + p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX[0], + eob_ptr, qm_ptr, iqm_ptr, qparam->log_scale); } -void av1_quantize_dc_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const MACROBLOCK_PLANE *p, - tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, - const SCAN_ORDER *sc, - const QUANT_PARAM *qparam) { - // obsolete skip_block - const int skip_block = 0; - int dq = qparam->dq; - (void)sc; - - switch (qparam->log_scale) { - case 0: - quantize_dc_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp[0], - pd->dequant[0], p->cuml_bins_nuq[dq][0], - pd->dequant_val_nuq[dq][0], qcoeff_ptr, dqcoeff_ptr, - eob_ptr); - break; - case 1: - quantize_dc_32x32_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp[0], - pd->dequant[0], p->cuml_bins_nuq[dq][0], - pd->dequant_val_nuq[dq][0], qcoeff_ptr, - dqcoeff_ptr, eob_ptr); - break; -#if CONFIG_TX64X64 - case 2: - quantize_dc_64x64_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp[0], - pd->dequant[0], p->cuml_bins_nuq[dq][0], - pd->dequant_val_nuq[dq][0], qcoeff_ptr, - dqcoeff_ptr, eob_ptr); - break; -#endif // CONFIG_TX64X64 - default: assert(0); - } -} -#endif // CONFIG_NEW_QUANT - void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { - // obsolete skip_block - const int skip_block = 0; -#if CONFIG_AOM_QM const qm_val_t *qm_ptr = qparam->qmatrix; const qm_val_t *iqm_ptr = qparam->iqmatrix; if (qm_ptr != NULL && iqm_ptr != NULL) { highbd_quantize_fp_helper_c( - coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp, p->quant_fp, - p->quant_shift, qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr, sc->scan, - sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); } else { -#endif // CONFIG_AOM_QM - if (n_coeffs < 16) { // TODO(jingning): Need SIMD implementation for smaller block size // quantization. 
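+    // This mirrors the small-block fallback in av1_quantize_fp_facade():
+    // n_coeffs < 16 uses the C helper directly, while larger blocks go
+    // through the av1_highbd_quantize_fp() dispatch with the same _QTX
+    // parameters and log_scale.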
- av1_highbd_quantize_fp_c(coeff_ptr, n_coeffs, skip_block, p->zbin, - p->round_fp, p->quant_fp, p->quant_shift, - qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr, - sc->scan, sc->iscan, qparam->log_scale); + av1_highbd_quantize_fp_c( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan, qparam->log_scale); return; } - - av1_highbd_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin, - p->round_fp, p->quant_fp, p->quant_shift, qcoeff_ptr, - dqcoeff_ptr, pd->dequant, eob_ptr, sc->scan, + av1_highbd_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, sc->iscan, qparam->log_scale); -#if CONFIG_AOM_QM } -#endif } void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { // obsolete skip_block const int skip_block = 0; -#if CONFIG_AOM_QM const qm_val_t *qm_ptr = qparam->qmatrix; const qm_val_t *iqm_ptr = qparam->iqmatrix; if (qm_ptr != NULL && iqm_ptr != NULL) { - highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin, - p->round, p->quant, p->quant_shift, qcoeff_ptr, - dqcoeff_ptr, pd->dequant, eob_ptr, sc->scan, - sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, + p->round_QTX, p->quant_QTX, p->quant_shift_QTX, + qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan, qm_ptr, iqm_ptr, + qparam->log_scale); } else { -#endif // CONFIG_AOM_QM - switch (qparam->log_scale) { case 0: if (LIKELY(n_coeffs >= 8)) { - aom_highbd_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, - p->round, p->quant, p->quant_shift, qcoeff_ptr, - dqcoeff_ptr, pd->dequant, eob_ptr, sc->scan, - sc->iscan); + aom_highbd_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, + p->round_QTX, p->quant_QTX, p->quant_shift_QTX, + qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); } else { // TODO(luoyi): Need SIMD (e.g. 
sse2) for smaller block size // quantization - aom_highbd_quantize_b_c(coeff_ptr, n_coeffs, skip_block, p->zbin, - p->round, p->quant, p->quant_shift, - qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr, - sc->scan, sc->iscan); + aom_highbd_quantize_b_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, + p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, + p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); } break; case 1: - aom_highbd_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, - p->round, p->quant, p->quant_shift, - qcoeff_ptr, dqcoeff_ptr, pd->dequant, - eob_ptr, sc->scan, sc->iscan); + aom_highbd_quantize_b_32x32( + coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, + p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); break; -#if CONFIG_TX64X64 case 2: - aom_highbd_quantize_b_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin, - p->round, p->quant, p->quant_shift, - qcoeff_ptr, dqcoeff_ptr, pd->dequant, - eob_ptr, sc->scan, sc->iscan); + aom_highbd_quantize_b_64x64( + coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, + p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); break; -#endif // CONFIG_TX64X64 default: assert(0); } -#if CONFIG_AOM_QM } -#endif } static INLINE void highbd_quantize_dc( @@ -954,7 +460,8 @@ static INLINE void highbd_quantize_dc( const int dequant = (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; - dqcoeff_ptr[0] = (qcoeff_ptr[0] * dequant) / (1 << log_scale); + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + dqcoeff_ptr[0] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); if (abs_qcoeff) eob = 0; } *eob_ptr = eob + 1; @@ -963,550 +470,33 @@ static INLINE void highbd_quantize_dc( void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { // obsolete skip_block const int skip_block = 0; -#if CONFIG_AOM_QM const qm_val_t *qm_ptr = qparam->qmatrix; const qm_val_t *iqm_ptr = qparam->iqmatrix; -#else - const qm_val_t *qm_ptr = NULL; - const qm_val_t *iqm_ptr = NULL; -#endif // CONFIG_AOM_QM - (void)sc; - highbd_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round, - p->quant_fp[0], qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], - eob_ptr, qm_ptr, iqm_ptr, qparam->log_scale); -} - -#if CONFIG_NEW_QUANT -static INLINE int highbd_quantize_coeff_nuq( - const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift, - const int16_t dequant, const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr) { - const int coeff = coeffv; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int i, q; - int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX); - for (i = 0; i < NUQ_KNOTS; i++) { - if (tmp < cuml_bins_ptr[i]) { - q = i; - break; - } - } - if (i == NUQ_KNOTS) { - tmp -= cuml_bins_ptr[NUQ_KNOTS - 1]; - q = NUQ_KNOTS + (int)(((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16); - } - if (q) { - *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val); - *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign; - *dqcoeff_ptr = *qcoeff_ptr < 0 ? 
-*dqcoeff_ptr : *dqcoeff_ptr; - } else { - *qcoeff_ptr = 0; - *dqcoeff_ptr = 0; - } - return (q != 0); -} - -static INLINE int highbd_quantize_coeff_fp_nuq( - const tran_low_t coeffv, const int16_t quant, const int16_t dequant, - const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr) { - const int coeff = coeffv; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int i, q; - int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX); - for (i = 0; i < NUQ_KNOTS; i++) { - if (tmp < cuml_bins_ptr[i]) { - q = i; - break; - } - } - if (i == NUQ_KNOTS) { - q = NUQ_KNOTS + (int)(((tmp - cuml_bins_ptr[NUQ_KNOTS - 1]) * quant) >> 16); - } - if (q) { - *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val); - *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign; - *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr; - } else { - *qcoeff_ptr = 0; - *dqcoeff_ptr = 0; - } - return (q != 0); -} - -static INLINE int highbd_quantize_coeff_bigtx_fp_nuq( - const tran_low_t coeffv, const int16_t quant, const int16_t dequant, - const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int logsizeby16) { - const int coeff = coeffv; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int i, q; - int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX); - for (i = 0; i < NUQ_KNOTS; i++) { - if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], logsizeby16)) { - q = i; - break; - } - } - if (i == NUQ_KNOTS) { - q = NUQ_KNOTS + - (int)(((tmp - - ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16)) * - quant) >> - (16 - logsizeby16)); - } - if (q) { - *dqcoeff_ptr = ROUND_POWER_OF_TWO( - av1_dequant_abscoeff_nuq(q, dequant, dequant_val), logsizeby16); - *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign; - *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr; - } else { - *qcoeff_ptr = 0; - *dqcoeff_ptr = 0; - } - return (q != 0); -} - -static INLINE int highbd_quantize_coeff_bigtx_nuq( - const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift, - const int16_t dequant, const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, int logsizeby16) { - const int coeff = coeffv; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int i, q; - int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX); - for (i = 0; i < NUQ_KNOTS; i++) { - if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], logsizeby16)) { - q = i; - break; - } - } - if (i == NUQ_KNOTS) { - tmp -= ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16); - q = NUQ_KNOTS + (int)(((((tmp * quant) >> 16) + tmp) * quant_shift) >> - (16 - logsizeby16)); - } - if (q) { - *dqcoeff_ptr = ROUND_POWER_OF_TWO( - av1_dequant_abscoeff_nuq(q, dequant, dequant_val), logsizeby16); - *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign; - *dqcoeff_ptr = *qcoeff_ptr < 0 ? 
-*dqcoeff_ptr : *dqcoeff_ptr; - } else { - *qcoeff_ptr = 0; - *dqcoeff_ptr = 0; - } - return (q != 0); -} - -void highbd_quantize_dc_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t quant, - const int16_t quant_shift, const int16_t dequant, - const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - const int rc = 0; - if (highbd_quantize_coeff_nuq(coeff_ptr[rc], quant, quant_shift, dequant, - cuml_bins_ptr, dequant_val, qcoeff_ptr, - dqcoeff_ptr)) - eob = 0; - } - *eob_ptr = eob + 1; -} - -void highbd_quantize_dc_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t quant, - const int16_t dequant, - const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - const int rc = 0; - if (highbd_quantize_coeff_fp_nuq(coeff_ptr[rc], quant, dequant, - cuml_bins_ptr, dequant_val, qcoeff_ptr, - dqcoeff_ptr)) - eob = 0; - } - *eob_ptr = eob + 1; -} - -void highbd_quantize_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - const int16_t *dequant_ptr, - const cuml_bins_type_nuq *cuml_bins_ptr, - const dequant_val_type_nuq *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const uint8_t *band) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - int i; - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - if (highbd_quantize_coeff_nuq( - coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0], - dequant_ptr[rc != 0], cuml_bins_ptr[band[i]], - dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc])) - eob = i; - } - } - *eob_ptr = eob + 1; -} - -void highbd_quantize_32x32_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - const int16_t *dequant_ptr, - const cuml_bins_type_nuq *cuml_bins_ptr, - const dequant_val_type_nuq *dequant_val, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, - const int16_t *scan, const uint8_t *band) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - int i; - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - if (highbd_quantize_coeff_bigtx_nuq( - coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0], - dequant_ptr[rc != 0], cuml_bins_ptr[band[i]], - dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc], - av1_get_tx_scale(TX_32X32))) - eob = i; - } - } - *eob_ptr = eob + 1; -} - -void highbd_quantize_32x32_fp_nuq_c(const tran_low_t *coeff_ptr, - intptr_t n_coeffs, int skip_block, - const int16_t *quant_ptr, - const int16_t *dequant_ptr, - const cuml_bins_type_nuq *cuml_bins_ptr, - const dequant_val_type_nuq *dequant_val, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, - const int16_t *scan, const uint8_t *band) { - int eob = -1; - 
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - int i; - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - if (highbd_quantize_coeff_bigtx_fp_nuq( - coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0], - cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc], - &dqcoeff_ptr[rc], av1_get_tx_scale(TX_32X32))) - eob = i; - } - } - *eob_ptr = eob + 1; -} - -#if CONFIG_TX64X64 -void highbd_quantize_64x64_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - const int16_t *dequant_ptr, - const cuml_bins_type_nuq *cuml_bins_ptr, - const dequant_val_type_nuq *dequant_val, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, - const int16_t *scan, const uint8_t *band) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - int i; - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - if (highbd_quantize_coeff_bigtx_nuq( - coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0], - dequant_ptr[rc != 0], cuml_bins_ptr[band[i]], - dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc], - av1_get_tx_scale(TX_64X64))) - eob = i; - } - } - *eob_ptr = eob + 1; + highbd_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round_QTX, + p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr, + p->dequant_QTX[0], eob_ptr, qm_ptr, iqm_ptr, + qparam->log_scale); } -void highbd_quantize_64x64_fp_nuq_c(const tran_low_t *coeff_ptr, - intptr_t n_coeffs, int skip_block, - const int16_t *quant_ptr, - const int16_t *dequant_ptr, - const cuml_bins_type_nuq *cuml_bins_ptr, - const dequant_val_type_nuq *dequant_val, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, - const int16_t *scan, const uint8_t *band) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - int i; - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - if (highbd_quantize_coeff_bigtx_fp_nuq( - coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0], - cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc], - &dqcoeff_ptr[rc], av1_get_tx_scale(TX_64X64))) - eob = i; - } - } - *eob_ptr = eob + 1; -} -#endif // CONFIG_TX64X64 - -void highbd_quantize_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *quant_ptr, - const int16_t *dequant_ptr, - const cuml_bins_type_nuq *cuml_bins_ptr, - const dequant_val_type_nuq *dequant_val, +void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const uint8_t *band) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - int i; - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - if (highbd_quantize_coeff_fp_nuq( - coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0], - cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc], - &dqcoeff_ptr[rc])) - eob = i; - } - } - *eob_ptr = eob + 1; -} - -void highbd_quantize_dc_32x32_nuq( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - 
const int16_t quant, const int16_t quant_shift, const int16_t dequant, - const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - const int rc = 0; - if (highbd_quantize_coeff_bigtx_nuq( - coeff_ptr[rc], quant, quant_shift, dequant, cuml_bins_ptr, - dequant_val, qcoeff_ptr, dqcoeff_ptr, av1_get_tx_scale(TX_32X32))) - eob = 0; - } - *eob_ptr = eob + 1; -} - -void highbd_quantize_dc_32x32_fp_nuq( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t quant, const int16_t dequant, const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - const int rc = 0; - if (highbd_quantize_coeff_bigtx_fp_nuq( - coeff_ptr[rc], quant, dequant, cuml_bins_ptr, dequant_val, - qcoeff_ptr, dqcoeff_ptr, av1_get_tx_scale(TX_32X32))) - eob = 0; - } - *eob_ptr = eob + 1; -} - -#if CONFIG_TX64X64 -void highbd_quantize_dc_64x64_nuq( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t quant, const int16_t quant_shift, const int16_t dequant, - const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - const int rc = 0; - if (highbd_quantize_coeff_bigtx_nuq( - coeff_ptr[rc], quant, quant_shift, dequant, cuml_bins_ptr, - dequant_val, qcoeff_ptr, dqcoeff_ptr, av1_get_tx_scale(TX_64X64))) - eob = 0; - } - *eob_ptr = eob + 1; -} - -void highbd_quantize_dc_64x64_fp_nuq( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t quant, const int16_t dequant, const tran_low_t *cuml_bins_ptr, - const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) { - int eob = -1; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - const int rc = 0; - if (highbd_quantize_coeff_bigtx_fp_nuq( - coeff_ptr[rc], quant, dequant, cuml_bins_ptr, dequant_val, - qcoeff_ptr, dqcoeff_ptr, av1_get_tx_scale(TX_64X64))) - eob = 0; - } - *eob_ptr = eob + 1; -} -#endif // CONFIG_TX64X64 - -void av1_highbd_quantize_b_nuq_facade( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, - tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, - const QUANT_PARAM *qparam) { - // obsolete skip_block - const int skip_block = 0; - const uint8_t *band = get_band_translate(qparam->tx_size); - const int dq = qparam->dq; - - switch (qparam->log_scale) { - case 0: - highbd_quantize_nuq(coeff_ptr, n_coeffs, skip_block, p->quant, - p->quant_shift, pd->dequant, - (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], - qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band); - break; - case 1: - highbd_quantize_32x32_nuq( - coeff_ptr, n_coeffs, skip_block, p->quant, p->quant_shift, - pd->dequant, (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - 
(const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr, - dqcoeff_ptr, eob_ptr, sc->scan, band); - break; -#if CONFIG_TX64X64 - case 2: - highbd_quantize_64x64_nuq( - coeff_ptr, n_coeffs, skip_block, p->quant, p->quant_shift, - pd->dequant, (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr, - dqcoeff_ptr, eob_ptr, sc->scan, band); - break; -#endif // CONFIG_TX64X64 - default: assert(0); - } -} - -void av1_highbd_quantize_fp_nuq_facade( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, - tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, - const QUANT_PARAM *qparam) { - // obsolete skip_block - const int skip_block = 0; - const uint8_t *band = get_band_translate(qparam->tx_size); - const int dq = qparam->dq; - - switch (qparam->log_scale) { - case 0: - highbd_quantize_fp_nuq( - coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant, - (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr, - dqcoeff_ptr, eob_ptr, sc->scan, band); - break; - case 1: - highbd_quantize_32x32_fp_nuq( - coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant, - (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr, - dqcoeff_ptr, eob_ptr, sc->scan, band); - break; -#if CONFIG_TX64X64 - case 2: - highbd_quantize_64x64_fp_nuq( - coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant, - (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr, - dqcoeff_ptr, eob_ptr, sc->scan, band); - break; -#endif // CONFIG_TX64X64 - default: assert(0); - } -} - -void av1_highbd_quantize_dc_nuq_facade( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, - tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, - const QUANT_PARAM *qparam) { - // obsolete skip_block - const int skip_block = 0; - const int dq = qparam->dq; - (void)sc; - - switch (qparam->log_scale) { - case 0: - highbd_quantize_dc_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp[0], - pd->dequant[0], p->cuml_bins_nuq[dq][0], - pd->dequant_val_nuq[dq][0], qcoeff_ptr, - dqcoeff_ptr, eob_ptr); - break; - case 1: - highbd_quantize_dc_32x32_fp_nuq( - coeff_ptr, n_coeffs, skip_block, p->quant_fp[0], pd->dequant[0], - p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0], qcoeff_ptr, - dqcoeff_ptr, eob_ptr); - break; -#if CONFIG_TX64X64 - case 2: - highbd_quantize_dc_64x64_fp_nuq( - coeff_ptr, n_coeffs, skip_block, p->quant_fp[0], pd->dequant[0], - p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0], qcoeff_ptr, - dqcoeff_ptr, eob_ptr); - break; -#endif // CONFIG_TX64X64 - default: assert(0); - } -} -#endif // CONFIG_NEW_QUANT - -void av1_highbd_quantize_fp_c( - const tran_low_t *coeff_ptr, intptr_t count, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan, int log_scale) { - highbd_quantize_fp_helper_c(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, qcoeff_ptr, - dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, - NULL, NULL, log_scale); + const int16_t *dequant_ptr, 
uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, + int log_scale) { + highbd_quantize_fp_helper_c(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, + log_scale); } static void invert_quant(int16_t *quant, int16_t *shift, int d) { @@ -1520,8 +510,7 @@ static void invert_quant(int16_t *quant, int16_t *shift, int d) { } static int get_qzbin_factor(int q, aom_bit_depth_t bit_depth) { - const int quant = av1_dc_quant(q, 0, bit_depth); -#if CONFIG_HIGHBITDEPTH + const int quant = av1_dc_quant_Q3(q, 0, bit_depth); switch (bit_depth) { case AOM_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80); case AOM_BITS_10: return q == 0 ? 64 : (quant < 592 ? 84 : 80); @@ -1530,16 +519,13 @@ static int get_qzbin_factor(int q, aom_bit_depth_t bit_depth) { assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); return -1; } -#else - (void)bit_depth; - return q == 0 ? 64 : (quant < 148 ? 84 : 80); -#endif } void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q, - int uv_dc_delta_q, int uv_ac_delta_q, - QUANTS *const quants, Dequants *const deq) { - int i, q, quant; + int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q, + int v_ac_delta_q, QUANTS *const quants, + Dequants *const deq) { + int i, q, quant_Q3, quant_QTX; for (q = 0; q < QINDEX_RANGE; q++) { const int qzbin_factor = get_qzbin_factor(q, bit_depth); @@ -1547,41 +533,51 @@ void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q, for (i = 0; i < 2; ++i) { int qrounding_factor_fp = 64; - // y - quant = i == 0 ? av1_dc_quant(q, y_dc_delta_q, bit_depth) - : av1_ac_quant(q, 0, bit_depth); - invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant); - quants->y_quant_fp[q][i] = (1 << 16) / quant; - quants->y_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7; - quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); - quants->y_round[q][i] = (qrounding_factor * quant) >> 7; - deq->y_dequant[q][i] = quant; - - // uv - quant = i == 0 ? av1_dc_quant(q, uv_dc_delta_q, bit_depth) - : av1_ac_quant(q, uv_ac_delta_q, bit_depth); - invert_quant(&quants->uv_quant[q][i], &quants->uv_quant_shift[q][i], - quant); - quants->uv_quant_fp[q][i] = (1 << 16) / quant; - quants->uv_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7; - quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); - quants->uv_round[q][i] = (qrounding_factor * quant) >> 7; - deq->uv_dequant[q][i] = quant; - } - -#if CONFIG_NEW_QUANT - int dq; - for (dq = 0; dq < QUANT_PROFILES; dq++) { - for (i = 0; i < COEF_BANDS; i++) { - const int y_quant = deq->y_dequant[q][i != 0]; - const int uvquant = deq->uv_dequant[q][i != 0]; - av1_get_dequant_val_nuq(y_quant, i, deq->y_dequant_val_nuq[dq][q][i], - quants->y_cuml_bins_nuq[dq][q][i], dq); - av1_get_dequant_val_nuq(uvquant, i, deq->uv_dequant_val_nuq[dq][q][i], - quants->uv_cuml_bins_nuq[dq][q][i], dq); - } + // y quantizer setup with original coeff shift of Q3 + quant_Q3 = i == 0 ? av1_dc_quant_Q3(q, y_dc_delta_q, bit_depth) + : av1_ac_quant_Q3(q, 0, bit_depth); + // y quantizer with TX scale + quant_QTX = i == 0 ? 
av1_dc_quant_QTX(q, y_dc_delta_q, bit_depth) + : av1_ac_quant_QTX(q, 0, bit_depth); + invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], + quant_QTX); + quants->y_quant_fp[q][i] = (1 << 16) / quant_QTX; + quants->y_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7; + quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7); + quants->y_round[q][i] = (qrounding_factor * quant_QTX) >> 7; + deq->y_dequant_QTX[q][i] = quant_QTX; + deq->y_dequant_Q3[q][i] = quant_Q3; + + // u quantizer setup with original coeff shift of Q3 + quant_Q3 = i == 0 ? av1_dc_quant_Q3(q, u_dc_delta_q, bit_depth) + : av1_ac_quant_Q3(q, u_ac_delta_q, bit_depth); + // u quantizer with TX scale + quant_QTX = i == 0 ? av1_dc_quant_QTX(q, u_dc_delta_q, bit_depth) + : av1_ac_quant_QTX(q, u_ac_delta_q, bit_depth); + invert_quant(&quants->u_quant[q][i], &quants->u_quant_shift[q][i], + quant_QTX); + quants->u_quant_fp[q][i] = (1 << 16) / quant_QTX; + quants->u_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7; + quants->u_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7); + quants->u_round[q][i] = (qrounding_factor * quant_QTX) >> 7; + deq->u_dequant_QTX[q][i] = quant_QTX; + deq->u_dequant_Q3[q][i] = quant_Q3; + + // v quantizer setup with original coeff shift of Q3 + quant_Q3 = i == 0 ? av1_dc_quant_Q3(q, v_dc_delta_q, bit_depth) + : av1_ac_quant_Q3(q, v_ac_delta_q, bit_depth); + // v quantizer with TX scale + quant_QTX = i == 0 ? av1_dc_quant_QTX(q, v_dc_delta_q, bit_depth) + : av1_ac_quant_QTX(q, v_ac_delta_q, bit_depth); + invert_quant(&quants->v_quant[q][i], &quants->v_quant_shift[q][i], + quant_QTX); + quants->v_quant_fp[q][i] = (1 << 16) / quant_QTX; + quants->v_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7; + quants->v_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7); + quants->v_round[q][i] = (qrounding_factor * quant_QTX) >> 7; + deq->v_dequant_QTX[q][i] = quant_QTX; + deq->v_dequant_Q3[q][i] = quant_Q3; } -#endif // CONFIG_NEW_QUANT for (i = 2; i < 8; i++) { // 8: SIMD width quants->y_quant[q][i] = quants->y_quant[q][1]; @@ -1590,15 +586,25 @@ void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q, quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1]; quants->y_zbin[q][i] = quants->y_zbin[q][1]; quants->y_round[q][i] = quants->y_round[q][1]; - deq->y_dequant[q][i] = deq->y_dequant[q][1]; - - quants->uv_quant[q][i] = quants->uv_quant[q][1]; - quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1]; - quants->uv_round_fp[q][i] = quants->uv_round_fp[q][1]; - quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1]; - quants->uv_zbin[q][i] = quants->uv_zbin[q][1]; - quants->uv_round[q][i] = quants->uv_round[q][1]; - deq->uv_dequant[q][i] = deq->uv_dequant[q][1]; + deq->y_dequant_QTX[q][i] = deq->y_dequant_QTX[q][1]; + deq->y_dequant_Q3[q][i] = deq->y_dequant_Q3[q][1]; + + quants->u_quant[q][i] = quants->u_quant[q][1]; + quants->u_quant_fp[q][i] = quants->u_quant_fp[q][1]; + quants->u_round_fp[q][i] = quants->u_round_fp[q][1]; + quants->u_quant_shift[q][i] = quants->u_quant_shift[q][1]; + quants->u_zbin[q][i] = quants->u_zbin[q][1]; + quants->u_round[q][i] = quants->u_round[q][1]; + deq->u_dequant_QTX[q][i] = deq->u_dequant_QTX[q][1]; + deq->u_dequant_Q3[q][i] = deq->u_dequant_Q3[q][1]; + quants->v_quant[q][i] = quants->u_quant[q][1]; + quants->v_quant_fp[q][i] = quants->v_quant_fp[q][1]; + quants->v_round_fp[q][i] = quants->v_round_fp[q][1]; + quants->v_quant_shift[q][i] = quants->v_quant_shift[q][1]; + quants->v_zbin[q][i] = 
quants->v_zbin[q][1]; + quants->v_round[q][i] = quants->v_round[q][1]; + deq->v_dequant_QTX[q][i] = deq->v_dequant_QTX[q][1]; + deq->v_dequant_Q3[q][i] = deq->v_dequant_Q3[q][1]; } } } @@ -1607,8 +613,9 @@ void av1_init_quantizer(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; QUANTS *const quants = &cpi->quants; Dequants *const dequants = &cpi->dequants; - av1_build_quantizer(cm->bit_depth, cm->y_dc_delta_q, cm->uv_dc_delta_q, - cm->uv_ac_delta_q, quants, dequants); + av1_build_quantizer(cm->bit_depth, cm->y_dc_delta_q, cm->u_dc_delta_q, + cm->u_ac_delta_q, cm->v_dc_delta_q, cm->v_ac_delta_q, + quants, dequants); } void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x, @@ -1617,79 +624,68 @@ void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *const xd = &x->e_mbd; const QUANTS *const quants = &cpi->quants; -#if CONFIG_EXT_DELTA_Q - int current_q_index = - AOMMAX(0, AOMMIN(QINDEX_RANGE - 1, - cpi->oxcf.deltaq_mode != NO_DELTA_Q - ? cm->base_qindex + xd->delta_qindex - : cm->base_qindex)); -#else - int current_q_index = AOMMAX( - 0, AOMMIN(QINDEX_RANGE - 1, - cm->delta_q_present_flag ? cm->base_qindex + xd->delta_qindex - : cm->base_qindex)); -#endif - const int qindex = av1_get_qindex(&cm->seg, segment_id, current_q_index); + int current_qindex = AOMMAX( + 0, AOMMIN(QINDEX_RANGE - 1, cpi->oxcf.deltaq_mode != NO_DELTA_Q + ? cm->base_qindex + xd->delta_qindex + : cm->base_qindex)); + const int qindex = av1_get_qindex(&cm->seg, segment_id, current_qindex); const int rdmult = av1_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q); - int i; -#if CONFIG_AOM_QM - int minqm = cm->min_qmlevel; - int maxqm = cm->max_qmlevel; - // Quant matrix only depends on the base QP so there is only one set per frame int qmlevel = (xd->lossless[segment_id] || cm->using_qmatrix == 0) ? 
NUM_QM_LEVELS - 1 - : aom_get_qmlevel(cm->base_qindex, minqm, maxqm); -#endif -#if CONFIG_NEW_QUANT - int dq; -#endif + : cm->qm_y; // Y - x->plane[0].quant = quants->y_quant[qindex]; - x->plane[0].quant_fp = quants->y_quant_fp[qindex]; - x->plane[0].round_fp = quants->y_round_fp[qindex]; - x->plane[0].quant_shift = quants->y_quant_shift[qindex]; - x->plane[0].zbin = quants->y_zbin[qindex]; - x->plane[0].round = quants->y_round[qindex]; -#if CONFIG_AOM_QM + x->plane[0].quant_QTX = quants->y_quant[qindex]; + x->plane[0].quant_fp_QTX = quants->y_quant_fp[qindex]; + x->plane[0].round_fp_QTX = quants->y_round_fp[qindex]; + x->plane[0].quant_shift_QTX = quants->y_quant_shift[qindex]; + x->plane[0].zbin_QTX = quants->y_zbin[qindex]; + x->plane[0].round_QTX = quants->y_round[qindex]; + x->plane[0].dequant_QTX = cpi->dequants.y_dequant_QTX[qindex]; memcpy(&xd->plane[0].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][0], sizeof(cm->gqmatrix[qmlevel][0])); memcpy(&xd->plane[0].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][0], sizeof(cm->giqmatrix[qmlevel][0])); -#endif - xd->plane[0].dequant = cpi->dequants.y_dequant[qindex]; -#if CONFIG_NEW_QUANT - for (dq = 0; dq < QUANT_PROFILES; dq++) { - x->plane[0].cuml_bins_nuq[dq] = quants->y_cuml_bins_nuq[dq][qindex]; - xd->plane[0].dequant_val_nuq[dq] = - cpi->dequants.y_dequant_val_nuq[dq][qindex]; - } -#endif // CONFIG_NEW_QUANT - - // UV - for (i = 1; i < 3; i++) { - x->plane[i].quant = quants->uv_quant[qindex]; - x->plane[i].quant_fp = quants->uv_quant_fp[qindex]; - x->plane[i].round_fp = quants->uv_round_fp[qindex]; - x->plane[i].quant_shift = quants->uv_quant_shift[qindex]; - x->plane[i].zbin = quants->uv_zbin[qindex]; - x->plane[i].round = quants->uv_round[qindex]; -#if CONFIG_AOM_QM - memcpy(&xd->plane[i].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][1], + xd->plane[0].dequant_Q3 = cpi->dequants.y_dequant_Q3[qindex]; + + // U + qmlevel = (xd->lossless[segment_id] || cm->using_qmatrix == 0) + ? NUM_QM_LEVELS - 1 + : cm->qm_u; + { + x->plane[1].quant_QTX = quants->u_quant[qindex]; + x->plane[1].quant_fp_QTX = quants->u_quant_fp[qindex]; + x->plane[1].round_fp_QTX = quants->u_round_fp[qindex]; + x->plane[1].quant_shift_QTX = quants->u_quant_shift[qindex]; + x->plane[1].zbin_QTX = quants->u_zbin[qindex]; + x->plane[1].round_QTX = quants->u_round[qindex]; + x->plane[1].dequant_QTX = cpi->dequants.u_dequant_QTX[qindex]; + memcpy(&xd->plane[1].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][1], sizeof(cm->gqmatrix[qmlevel][1])); - memcpy(&xd->plane[i].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][1], + memcpy(&xd->plane[1].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][1], sizeof(cm->giqmatrix[qmlevel][1])); -#endif - xd->plane[i].dequant = cpi->dequants.uv_dequant[qindex]; -#if CONFIG_NEW_QUANT - for (dq = 0; dq < QUANT_PROFILES; dq++) { - x->plane[i].cuml_bins_nuq[dq] = quants->uv_cuml_bins_nuq[dq][qindex]; - xd->plane[i].dequant_val_nuq[dq] = - cpi->dequants.uv_dequant_val_nuq[dq][qindex]; - } -#endif // CONFIG_NEW_QUANT + x->plane[1].dequant_QTX = cpi->dequants.u_dequant_QTX[qindex]; + xd->plane[1].dequant_Q3 = cpi->dequants.u_dequant_Q3[qindex]; + } + // V + qmlevel = (xd->lossless[segment_id] || cm->using_qmatrix == 0) + ? 
NUM_QM_LEVELS - 1 + : cm->qm_v; + { + x->plane[2].quant_QTX = quants->v_quant[qindex]; + x->plane[2].quant_fp_QTX = quants->v_quant_fp[qindex]; + x->plane[2].round_fp_QTX = quants->v_round_fp[qindex]; + x->plane[2].quant_shift_QTX = quants->v_quant_shift[qindex]; + x->plane[2].zbin_QTX = quants->v_zbin[qindex]; + x->plane[2].round_QTX = quants->v_round[qindex]; + x->plane[2].dequant_QTX = cpi->dequants.v_dequant_QTX[qindex]; + memcpy(&xd->plane[2].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][2], + sizeof(cm->gqmatrix[qmlevel][2])); + memcpy(&xd->plane[2].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][2], + sizeof(cm->giqmatrix[qmlevel][2])); + x->plane[2].dequant_QTX = cpi->dequants.v_dequant_QTX[qindex]; + xd->plane[2].dequant_Q3 = cpi->dequants.v_dequant_Q3[qindex]; } - x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); x->qindex = qindex; @@ -1701,16 +697,27 @@ void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x, void av1_frame_init_quantizer(AV1_COMP *cpi) { MACROBLOCK *const x = &cpi->td.mb; MACROBLOCKD *const xd = &x->e_mbd; - av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id); + av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id); } void av1_set_quantizer(AV1_COMMON *cm, int q) { // quantizer has to be reinitialized with av1_init_quantizer() if any // delta_q changes. - cm->base_qindex = q; + cm->base_qindex = AOMMAX(cm->delta_q_present_flag, q); cm->y_dc_delta_q = 0; - cm->uv_dc_delta_q = 0; - cm->uv_ac_delta_q = 0; + cm->u_dc_delta_q = 0; + cm->u_ac_delta_q = 0; + cm->v_dc_delta_q = 0; + cm->v_ac_delta_q = 0; + cm->qm_y = aom_get_qmlevel(cm->base_qindex, cm->min_qmlevel, cm->max_qmlevel); + cm->qm_u = aom_get_qmlevel(cm->base_qindex + cm->u_ac_delta_q, + cm->min_qmlevel, cm->max_qmlevel); + + if (!cm->separate_uv_delta_q) + cm->qm_v = cm->qm_u; + else + cm->qm_v = aom_get_qmlevel(cm->base_qindex + cm->v_ac_delta_q, + cm->min_qmlevel, cm->max_qmlevel); } // Table that converts 0-63 Q-range values passed in outside to the Qindex diff --git a/third_party/aom/av1/encoder/av1_quantize.h b/third_party/aom/av1/encoder/av1_quantize.h index e5fc8b528..eaf8374de 100644 --- a/third_party/aom/av1/encoder/av1_quantize.h +++ b/third_party/aom/av1/encoder/av1_quantize.h @@ -12,7 +12,8 @@ #ifndef AV1_ENCODER_QUANTIZE_H_ #define AV1_ENCODER_QUANTIZE_H_ -#include "./aom_config.h" +#include "config/aom_config.h" + #include "av1/common/quant_common.h" #include "av1/common/scan.h" #include "av1/encoder/block.h" @@ -23,33 +24,22 @@ extern "C" { typedef struct QUANT_PARAM { int log_scale; -#if CONFIG_NEW_QUANT TX_SIZE tx_size; - int dq; -#endif // CONFIG_NEW_QUANT -#if CONFIG_AOM_QM const qm_val_t *qmatrix; const qm_val_t *iqmatrix; -#endif // CONFIG_AOM_QM } QUANT_PARAM; typedef void (*AV1_QUANT_FACADE)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam); +// The QUANTS structure is used only for internal quantizer setup in +// av1_quantize.c. +// All of its fields use the same coefficient shift/scaling at TX. 
typedef struct { -#if CONFIG_NEW_QUANT - DECLARE_ALIGNED( - 16, tran_low_t, - y_cuml_bins_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS][NUQ_KNOTS]); - DECLARE_ALIGNED( - 16, tran_low_t, - uv_cuml_bins_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS][NUQ_KNOTS]); -#endif // CONFIG_NEW_QUANT // 0: dc 1: ac 2-8: ac repeated to SIMD width DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]); @@ -59,25 +49,36 @@ typedef struct { // TODO(jingning): in progress of re-working the quantization. will decide // if we want to deprecate the current use of y_quant. DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, uv_quant_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_quant_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_quant_fp[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, uv_round_fp[QINDEX_RANGE][8]); - - DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, uv_zbin[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_round_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_round_fp[QINDEX_RANGE][8]); + + DECLARE_ALIGNED(16, int16_t, u_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_round[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_round[QINDEX_RANGE][8]); } QUANTS; +// The Dequants structure is used only for internal quantizer setup in +// av1_quantize.c. +// Fields are sufffixed according to whether or not they're expressed in +// the same coefficient shift/precision as TX or a fixed Q3 format. 
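// A minimal sketch of the reconstruction these step sizes imply, assuming
// the scalar-quantizer convention used by the fp paths above: a quantized
// level q times its *_dequant_QTX step gives back a transform-domain
// coefficient, with the big-transform headroom (log_scale 1 for 32x32,
// 2 for 64x64) rounded back off. The exact rounding is an assumption here.
#include <stdint.h>
static int32_t dequant_qtx_sketch(int32_t q, int16_t dequant_qtx,
                                  int log_scale) {
  const int64_t dq = (int64_t)q * dequant_qtx;  // assumes a non-negative level
  return (int32_t)((dq + ((1 << log_scale) >> 1)) >> log_scale);
}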
typedef struct { - DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]); // 8: SIMD width - DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]); // 8: SIMD width -#if CONFIG_NEW_QUANT - DECLARE_ALIGNED(16, dequant_val_type_nuq, - y_dequant_val_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS]); - DECLARE_ALIGNED(16, dequant_val_type_nuq, - uv_dequant_val_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS]); -#endif // CONFIG_NEW_QUANT + DECLARE_ALIGNED(16, int16_t, + y_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width + DECLARE_ALIGNED(16, int16_t, + u_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width + DECLARE_ALIGNED(16, int16_t, + v_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width + DECLARE_ALIGNED(16, int16_t, y_dequant_Q3[QINDEX_RANGE][8]); // 8: SIMD width + DECLARE_ALIGNED(16, int16_t, u_dequant_Q3[QINDEX_RANGE][8]); // 8: SIMD width + DECLARE_ALIGNED(16, int16_t, v_dequant_Q3[QINDEX_RANGE][8]); // 8: SIMD width } Dequants; struct AV1_COMP; @@ -89,8 +90,9 @@ void av1_init_plane_quantizers(const struct AV1_COMP *cpi, MACROBLOCK *x, int segment_id); void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q, - int uv_dc_delta_q, int uv_ac_delta_q, - QUANTS *const quants, Dequants *const deq); + int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q, + int v_ac_delta_q, QUANTS *const quants, + Dequants *const deq); void av1_init_quantizer(struct AV1_COMP *cpi); @@ -105,51 +107,22 @@ void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr, void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam); void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, - uint16_t *eob_ptr, const SCAN_ORDER *sc, - const QUANT_PARAM *qparam); + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam); void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam); -#if CONFIG_NEW_QUANT -void av1_quantize_fp_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const MACROBLOCK_PLANE *p, - tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, - const SCAN_ORDER *sc, - const QUANT_PARAM *qparam); - -void av1_quantize_b_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const MACROBLOCK_PLANE *p, - tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, - const SCAN_ORDER *sc, const QUANT_PARAM *qparam); - -void av1_quantize_dc_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const MACROBLOCK_PLANE *p, - tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, - const SCAN_ORDER *sc, - const QUANT_PARAM *qparam); -#endif // CONFIG_NEW_QUANT - void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam); @@ -157,7 +130,6 @@ void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr, void 
av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam); @@ -165,31 +137,10 @@ void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr, void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, - const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam); -#if CONFIG_NEW_QUANT -void av1_highbd_quantize_fp_nuq_facade( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, - tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, - const QUANT_PARAM *qparam); - -void av1_highbd_quantize_b_nuq_facade( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, - tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, - const QUANT_PARAM *qparam); - -void av1_highbd_quantize_dc_nuq_facade( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, - tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd, - tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, - const QUANT_PARAM *qparam); -#endif // CONFIG_NEW_QUANT - #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/encoder/bgsprite.c b/third_party/aom/av1/encoder/bgsprite.c deleted file mode 100644 index ae2cb1d40..000000000 --- a/third_party/aom/av1/encoder/bgsprite.c +++ /dev/null @@ -1,1257 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#define _POSIX_C_SOURCE 200112L // rand_r() -#include <assert.h> -#include <float.h> -#include <limits.h> -#include <math.h> -#include <stdlib.h> -#include <time.h> - -#include "av1/encoder/bgsprite.h" - -#include "aom_mem/aom_mem.h" -#include "./aom_scale_rtcd.h" -#include "av1/common/mv.h" -#include "av1/common/warped_motion.h" -#include "av1/encoder/encoder.h" -#include "av1/encoder/global_motion.h" -#include "av1/encoder/mathutils.h" -#include "av1/encoder/temporal_filter.h" - -/* Blending Modes: - * 0 = Median - * 1 = Mean - */ -#define BGSPRITE_BLENDING_MODE 1 - -// Enable removal of outliers from mean blending mode. -#if BGSPRITE_BLENDING_MODE == 1 -#define BGSPRITE_MEAN_REMOVE_OUTLIERS 0 -#endif // BGSPRITE_BLENDING_MODE == 1 - -/* Interpolation for panorama alignment sampling: - * 0 = Nearest neighbor - * 1 = Bilinear - */ -#define BGSPRITE_INTERPOLATION 0 - -// Enable turning off bgsprite from firstpass metrics in define_gf_group. -#define BGSPRITE_ENABLE_METRICS 1 - -// Enable foreground/backgrond segmentation and combine with temporal filter. -#define BGSPRITE_ENABLE_SEGMENTATION 1 - -// Enable alignment using global motion. -#define BGSPRITE_ENABLE_GME 0 - -// Block size for foreground mask. 
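// With the 4-pixel mask block size defined just below, a W x H frame needs
// ceil(W/4) x ceil(H/4) mask cells (e.g. 1920x1080 -> 480x270). A minimal
// ceiling division, equivalent to the open-coded
// (n / size) + (n % size != 0 ? 1 : 0) pattern used later in this file:
static int ceil_div(int n, int size) { return (n + size - 1) / size; }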
-#define BGSPRITE_MASK_BLOCK_SIZE 4 - -typedef struct { -#if CONFIG_HIGHBITDEPTH - uint16_t y; - uint16_t u; - uint16_t v; -#else - uint8_t y; - uint8_t u; - uint8_t v; -#endif // CONFIG_HIGHBITDEPTH - uint8_t exists; -} YuvPixel; - -typedef struct { - int curr_model; - double mean[2]; - double var[2]; - int age[2]; - double u_mean[2]; - double v_mean[2]; - -#if CONFIG_HIGHBITDEPTH - uint16_t y; - uint16_t u; - uint16_t v; -#else - uint8_t y; - uint8_t u; - uint8_t v; -#endif // CONFIG_HIGHBITDEPTH - double final_var; -} YuvPixelGaussian; - -// Maps to convert from matrix form to param vector form. -static const int params_to_matrix_map[] = { 2, 3, 0, 4, 5, 1, 6, 7 }; -static const int matrix_to_params_map[] = { 2, 5, 0, 1, 3, 4, 6, 7 }; - -// Convert the parameter array to a 3x3 matrix form. -static void params_to_matrix(const double *const params, double *target) { - for (int i = 0; i < MAX_PARAMDIM - 1; i++) { - assert(params_to_matrix_map[i] < MAX_PARAMDIM - 1); - target[i] = params[params_to_matrix_map[i]]; - } - target[8] = 1; -} - -// Convert a 3x3 matrix to a parameter array form. -static void matrix_to_params(const double *const matrix, double *target) { - for (int i = 0; i < MAX_PARAMDIM - 1; i++) { - assert(matrix_to_params_map[i] < MAX_PARAMDIM - 1); - target[i] = matrix[matrix_to_params_map[i]]; - } -} - -#define TRANSFORM_MAT_DIM 3 - -// Do matrix multiplication on params. -static void multiply_params(double *const m1, double *const m2, - double *target) { - double m1_matrix[MAX_PARAMDIM]; - double m2_matrix[MAX_PARAMDIM]; - double result[MAX_PARAMDIM]; - - params_to_matrix(m1, m1_matrix); - params_to_matrix(m2, m2_matrix); - multiply_mat(m2_matrix, m1_matrix, result, TRANSFORM_MAT_DIM, - TRANSFORM_MAT_DIM, TRANSFORM_MAT_DIM); - matrix_to_params(result, target); -} - -// Finds x and y limits of a single transformed image. -// Width and height are the size of the input video. -static void find_frame_limit(int width, int height, - const double *const transform, int *x_min, - int *x_max, int *y_min, int *y_max) { - double transform_matrix[MAX_PARAMDIM]; - double xy_matrix[3] = { 0, 0, 1 }; - double uv_matrix[3] = { 0 }; -// Macro used to update frame limits based on transformed coordinates. 
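// A projective map sends straight lines to straight lines, so the warped
// frame is a quadrilateral whose x/y extremes lie at its corners; it is
// enough to transform the four corners (0,0), (w-1,0), (w-1,h-1), (0,h-1)
// and fold each result in with floor/ceil, as the code below does.
// A minimal function equivalent of the macro:
#include <math.h>
static void update_limits(double u, double v, int *x_min, int *x_max,
                          int *y_min, int *y_max) {
  if ((int)ceil(u) > *x_max) *x_max = (int)ceil(u);
  if ((int)floor(u) < *x_min) *x_min = (int)floor(u);
  if ((int)ceil(v) > *y_max) *y_max = (int)ceil(v);
  if ((int)floor(v) < *y_min) *y_min = (int)floor(v);
}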
-#define UPDATELIMITS(u, v, x_min, x_max, y_min, y_max) \ - { \ - if ((int)ceil(u) > *x_max) { \ - *x_max = (int)ceil(u); \ - } \ - if ((int)floor(u) < *x_min) { \ - *x_min = (int)floor(u); \ - } \ - if ((int)ceil(v) > *y_max) { \ - *y_max = (int)ceil(v); \ - } \ - if ((int)floor(v) < *y_min) { \ - *y_min = (int)floor(v); \ - } \ - } - - params_to_matrix(transform, transform_matrix); - xy_matrix[0] = 0; - xy_matrix[1] = 0; - multiply_mat(transform_matrix, xy_matrix, uv_matrix, TRANSFORM_MAT_DIM, - TRANSFORM_MAT_DIM, 1); - *x_max = (int)ceil(uv_matrix[0]); - *x_min = (int)floor(uv_matrix[0]); - *y_max = (int)ceil(uv_matrix[1]); - *y_min = (int)floor(uv_matrix[1]); - - xy_matrix[0] = width - 1; - xy_matrix[1] = 0; - multiply_mat(transform_matrix, xy_matrix, uv_matrix, TRANSFORM_MAT_DIM, - TRANSFORM_MAT_DIM, 1); - UPDATELIMITS(uv_matrix[0], uv_matrix[1], x_min, x_max, y_min, y_max); - - xy_matrix[0] = width - 1; - xy_matrix[1] = height - 1; - multiply_mat(transform_matrix, xy_matrix, uv_matrix, TRANSFORM_MAT_DIM, - TRANSFORM_MAT_DIM, 1); - UPDATELIMITS(uv_matrix[0], uv_matrix[1], x_min, x_max, y_min, y_max); - - xy_matrix[0] = 0; - xy_matrix[1] = height - 1; - multiply_mat(transform_matrix, xy_matrix, uv_matrix, TRANSFORM_MAT_DIM, - TRANSFORM_MAT_DIM, 1); - UPDATELIMITS(uv_matrix[0], uv_matrix[1], x_min, x_max, y_min, y_max); - -#undef UPDATELIMITS -} - -// Finds x and y limits for arrays. Also finds the overall max and minimums -static void find_limits(int width, int height, const double **const params, - int num_frames, int *x_min, int *x_max, int *y_min, - int *y_max, int *pano_x_min, int *pano_x_max, - int *pano_y_min, int *pano_y_max) { - *pano_x_max = INT_MIN; - *pano_x_min = INT_MAX; - *pano_y_max = INT_MIN; - *pano_y_min = INT_MAX; - for (int i = 0; i < num_frames; ++i) { - find_frame_limit(width, height, (const double *const)params[i], &x_min[i], - &x_max[i], &y_min[i], &y_max[i]); - if (x_max[i] > *pano_x_max) { - *pano_x_max = x_max[i]; - } - if (x_min[i] < *pano_x_min) { - *pano_x_min = x_min[i]; - } - if (y_max[i] > *pano_y_max) { - *pano_y_max = y_max[i]; - } - if (y_min[i] < *pano_y_min) { - *pano_y_min = y_min[i]; - } - } -} - -// Inverts a 3x3 matrix that is in the parameter form. -static void invert_params(const double *const params, double *target) { - double temp[MAX_PARAMDIM] = { 0 }; - params_to_matrix(params, temp); - - // Find determinant of matrix (expansion by minors). - const double det = temp[0] * ((temp[4] * temp[8]) - (temp[5] * temp[7])) - - temp[1] * ((temp[3] * temp[8]) - (temp[5] * temp[6])) + - temp[2] * ((temp[3] * temp[7]) - (temp[4] * temp[6])); - assert(det != 0); - - // inverse is transpose of cofactor * 1/det. 
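// Spelled out for A = [a b c; d e f; g h i]:
//   det(A) = a(ei - fh) - b(di - fg) + c(dh - eg)
//   A^(-1) = adj(A) / det(A), adj(A) being the transposed cofactor matrix,
// so e.g. inverse[0] = (ei - fh) / det and inverse[1] = (ch - bi) / det,
// matching the expressions below.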
- double inverse[MAX_PARAMDIM] = { 0 }; - inverse[0] = (temp[4] * temp[8] - temp[7] * temp[5]) / det; - inverse[1] = (temp[2] * temp[7] - temp[1] * temp[8]) / det; - inverse[2] = (temp[1] * temp[5] - temp[2] * temp[4]) / det; - inverse[3] = (temp[5] * temp[6] - temp[3] * temp[8]) / det; - inverse[4] = (temp[0] * temp[8] - temp[2] * temp[6]) / det; - inverse[5] = (temp[3] * temp[2] - temp[0] * temp[5]) / det; - inverse[6] = (temp[3] * temp[7] - temp[6] * temp[4]) / det; - inverse[7] = (temp[6] * temp[1] - temp[0] * temp[7]) / det; - inverse[8] = (temp[0] * temp[4] - temp[3] * temp[1]) / det; - - matrix_to_params(inverse, target); -} - -static void build_image_stack(YV12_BUFFER_CONFIG **const frames, - const int num_frames, const double **const params, - const int *const x_min, const int *const x_max, - const int *const y_min, const int *const y_max, - int pano_x_min, int pano_y_min, - YuvPixel ***img_stack) { - // Re-sample images onto panorama (pre-filtering). - const int x_offset = -pano_x_min; - const int y_offset = -pano_y_min; - const int frame_width = frames[0]->y_width; - const int frame_height = frames[0]->y_height; - for (int i = 0; i < num_frames; ++i) { - // Find transforms from panorama coordinate system back to single image - // coordinate system for sampling. - int transformed_width = x_max[i] - x_min[i] + 1; - int transformed_height = y_max[i] - y_min[i] + 1; - - double transform_matrix[MAX_PARAMDIM]; - double transform_params[MAX_PARAMDIM - 1]; - invert_params(params[i], transform_params); - params_to_matrix(transform_params, transform_matrix); - -#if CONFIG_HIGHBITDEPTH - const uint16_t *y_buffer16 = CONVERT_TO_SHORTPTR(frames[i]->y_buffer); - const uint16_t *u_buffer16 = CONVERT_TO_SHORTPTR(frames[i]->u_buffer); - const uint16_t *v_buffer16 = CONVERT_TO_SHORTPTR(frames[i]->v_buffer); -#endif // CONFIG_HIGHBITDEPTH - - for (int y = 0; y < transformed_height; ++y) { - for (int x = 0; x < transformed_width; ++x) { - // Do transform. - double xy_matrix[3] = { x + x_min[i], y + y_min[i], 1 }; - double uv_matrix[3] = { 0 }; - multiply_mat(transform_matrix, xy_matrix, uv_matrix, TRANSFORM_MAT_DIM, - TRANSFORM_MAT_DIM, 1); - - // Coordinates used for nearest neighbor interpolation. - int image_x = (int)round(uv_matrix[0]); - int image_y = (int)round(uv_matrix[1]); - - // Temporary values for bilinear interpolation - double interpolated_yvalue = 0.0; - double interpolated_uvalue = 0.0; - double interpolated_vvalue = 0.0; - double interpolated_fraction = 0.0; - int interpolation_count = 0; - -#if BGSPRITE_INTERPOLATION == 1 - // Coordintes used for bilinear interpolation. 
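// These are the standard bilinear weights: with dx, dy the fractional parts
// of the source position, neighbour (u, v) in {0,1}x{0,1} contributes
// (1 - |u - dx|) * (1 - |v - dy|), and the four weights sum to 1 when all
// samples are in-bounds. A minimal sketch over a plain 8-bit plane
// (hypothetical helper; callers must keep x0+1, y0+1 inside the plane):
#include <math.h>
#include <stdint.h>
static double bilinear_sketch(const uint8_t *plane, int stride, double x,
                              double y) {
  double xb, yb;
  const double dx = modf(x, &xb), dy = modf(y, &yb);
  const int x0 = (int)xb, y0 = (int)yb;
  double acc = 0.0;
  for (int v = 0; v < 2; ++v)
    for (int u = 0; u < 2; ++u)
      acc += (1 - fabs(u - dx)) * (1 - fabs(v - dy)) *
             plane[(y0 + v) * stride + (x0 + u)];
  return acc;
}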
- double x_base; - double y_base; - double x_decimal = modf(uv_matrix[0], &x_base); - double y_decimal = modf(uv_matrix[1], &y_base); - - if ((x_decimal > 0.2 && x_decimal < 0.8) || - (y_decimal > 0.2 && y_decimal < 0.8)) { - for (int u = 0; u < 2; ++u) { - for (int v = 0; v < 2; ++v) { - int interp_x = (int)x_base + u; - int interp_y = (int)y_base + v; - if (interp_x >= 0 && interp_x < frame_width && interp_y >= 0 && - interp_y < frame_height) { - interpolation_count++; - - interpolated_fraction += - fabs(u - x_decimal) * fabs(v - y_decimal); - int ychannel_idx = interp_y * frames[i]->y_stride + interp_x; - int uvchannel_idx = (interp_y >> frames[i]->subsampling_y) * - frames[i]->uv_stride + - (interp_x >> frames[i]->subsampling_x); -#if CONFIG_HIGHBITDEPTH - if (frames[i]->flags & YV12_FLAG_HIGHBITDEPTH) { - interpolated_yvalue += (1 - fabs(u - x_decimal)) * - (1 - fabs(v - y_decimal)) * - y_buffer16[ychannel_idx]; - interpolated_uvalue += (1 - fabs(u - x_decimal)) * - (1 - fabs(v - y_decimal)) * - u_buffer16[uvchannel_idx]; - interpolated_vvalue += (1 - fabs(u - x_decimal)) * - (1 - fabs(v - y_decimal)) * - v_buffer16[uvchannel_idx]; - } else { -#endif // CONFIG_HIGHBITDEPTH - interpolated_yvalue += (1 - fabs(u - x_decimal)) * - (1 - fabs(v - y_decimal)) * - frames[i]->y_buffer[ychannel_idx]; - interpolated_uvalue += (1 - fabs(u - x_decimal)) * - (1 - fabs(v - y_decimal)) * - frames[i]->u_buffer[uvchannel_idx]; - interpolated_vvalue += (1 - fabs(u - x_decimal)) * - (1 - fabs(v - y_decimal)) * - frames[i]->v_buffer[uvchannel_idx]; -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - } - } - } - } -#endif // BGSPRITE_INTERPOLATION == 1 - - if (BGSPRITE_INTERPOLATION && interpolation_count > 2) { - if (interpolation_count != 4) { - interpolated_yvalue /= interpolated_fraction; - interpolated_uvalue /= interpolated_fraction; - interpolated_vvalue /= interpolated_fraction; - } - int pano_x = x + x_min[i] + x_offset; - int pano_y = y + y_min[i] + y_offset; - -#if CONFIG_HIGHBITDEPTH - if (frames[i]->flags & YV12_FLAG_HIGHBITDEPTH) { - img_stack[pano_y][pano_x][i].y = (uint16_t)interpolated_yvalue; - img_stack[pano_y][pano_x][i].u = (uint16_t)interpolated_uvalue; - img_stack[pano_y][pano_x][i].v = (uint16_t)interpolated_vvalue; - img_stack[pano_y][pano_x][i].exists = 1; - } else { -#endif // CONFIG_HIGHBITDEPTH - img_stack[pano_y][pano_x][i].y = (uint8_t)interpolated_yvalue; - img_stack[pano_y][pano_x][i].u = (uint8_t)interpolated_uvalue; - img_stack[pano_y][pano_x][i].v = (uint8_t)interpolated_vvalue; - img_stack[pano_y][pano_x][i].exists = 1; -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - } else if (image_x >= 0 && image_x < frame_width && image_y >= 0 && - image_y < frame_height) { - // Place in panorama stack. 
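// Index arithmetic used here and below: luma sample (x, y) lives at
// y * y_stride + x, and its chroma sample at
// (y >> ss_y) * uv_stride + (x >> ss_x), each chroma sample covering a
// (1 << ss_x) x (1 << ss_y) patch of luma positions (ss_x = ss_y = 1 for
// 4:2:0).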
- int pano_x = x + x_min[i] + x_offset; - int pano_y = y + y_min[i] + y_offset; - - int ychannel_idx = image_y * frames[i]->y_stride + image_x; - int uvchannel_idx = - (image_y >> frames[i]->subsampling_y) * frames[i]->uv_stride + - (image_x >> frames[i]->subsampling_x); -#if CONFIG_HIGHBITDEPTH - if (frames[i]->flags & YV12_FLAG_HIGHBITDEPTH) { - img_stack[pano_y][pano_x][i].y = y_buffer16[ychannel_idx]; - img_stack[pano_y][pano_x][i].u = u_buffer16[uvchannel_idx]; - img_stack[pano_y][pano_x][i].v = v_buffer16[uvchannel_idx]; - img_stack[pano_y][pano_x][i].exists = 1; - } else { -#endif // CONFIG_HIGHBITDEPTH - img_stack[pano_y][pano_x][i].y = frames[i]->y_buffer[ychannel_idx]; - img_stack[pano_y][pano_x][i].u = frames[i]->u_buffer[uvchannel_idx]; - img_stack[pano_y][pano_x][i].v = frames[i]->v_buffer[uvchannel_idx]; - img_stack[pano_y][pano_x][i].exists = 1; -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - } - } - } - } -} - -#if BGSPRITE_BLENDING_MODE == 0 -// swaps two YuvPixels. -static void swap_yuv(YuvPixel *a, YuvPixel *b) { - const YuvPixel temp = *b; - *b = *a; - *a = temp; -} - -// Partitions array to find pivot index in qselect. -static int partition(YuvPixel arr[], int left, int right, int pivot_idx) { - YuvPixel pivot = arr[pivot_idx]; - - // Move pivot to the end. - swap_yuv(&arr[pivot_idx], &arr[right]); - - int p_idx = left; - for (int i = left; i < right; ++i) { - if (arr[i].y <= pivot.y) { - swap_yuv(&arr[i], &arr[p_idx]); - p_idx++; - } - } - - swap_yuv(&arr[p_idx], &arr[right]); - - return p_idx; -} - -// Returns the kth element in array, partially sorted in place (quickselect). -static YuvPixel qselect(YuvPixel arr[], int left, int right, int k) { - if (left >= right) { - return arr[left]; - } - unsigned int seed = (int)time(NULL); - int pivot_idx = left + rand_r(&seed) % (right - left + 1); - pivot_idx = partition(arr, left, right, pivot_idx); - - if (k == pivot_idx) { - return arr[k]; - } else if (k < pivot_idx) { - return qselect(arr, left, pivot_idx - 1, k); - } else { - return qselect(arr, pivot_idx + 1, right, k); - } -} - -// Blends image stack together using a temporal median. -static void blend_median(const int width, const int height, - const int num_frames, const YuvPixel ***image_stack, - YuvPixel **blended_img) { - // Allocate stack of pixels - YuvPixel *pixel_stack = aom_calloc(num_frames, sizeof(*pixel_stack)); - - // Apply median filtering using quickselect. - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - int count = 0; - for (int i = 0; i < num_frames; ++i) { - if (image_stack[y][x][i].exists) { - pixel_stack[count] = image_stack[y][x][i]; - ++count; - } - } - if (count == 0) { - // Just make the pixel black. - // TODO(toddnguyen): Color the pixel with nearest neighbor - blended_img[y][x].exists = 0; - } else { - const int median_idx = (int)floor(count / 2); - YuvPixel median = qselect(pixel_stack, 0, count - 1, median_idx); - - // Make the median value the 0th index for UV subsampling later - blended_img[y][x] = median; - blended_img[y][x].exists = 1; - } - } - } - - aom_free(pixel_stack); -} -#endif // BGSPRITE_BLENDING_MODE == 0 - -#if BGSPRITE_BLENDING_MODE == 1 -// Blends image stack together using a temporal mean. 
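// A minimal sketch of the per-pixel rule implemented below, for one channel
// of hypothetical sample/exists arrays: average the samples that exist,
// report a hole otherwise. The real code does this per Y/U/V channel and
// divides with OD_DIVU, a fast unsigned divide.
#include <stdint.h>
static int mean_sketch(const uint8_t *samples, const uint8_t *exists, int n,
                       uint8_t *out) {
  uint32_t sum = 0, count = 0;
  for (int i = 0; i < n; ++i) {
    if (exists[i]) {
      sum += samples[i];
      ++count;
    }
  }
  if (count == 0) return 0;  // no frame covered this pixel
  *out = (uint8_t)(sum / count);
  return 1;
}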
-static void blend_mean(const int width, const int height, const int num_frames, - const YuvPixel ***image_stack, YuvPixel **blended_img, - int highbitdepth) { - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - // Find - uint32_t y_sum = 0; - uint32_t u_sum = 0; - uint32_t v_sum = 0; - uint32_t count = 0; - for (int i = 0; i < num_frames; ++i) { - if (image_stack[y][x][i].exists) { - y_sum += image_stack[y][x][i].y; - u_sum += image_stack[y][x][i].u; - v_sum += image_stack[y][x][i].v; - ++count; - } - } - -#if BGSPRITE_MEAN_REMOVE_OUTLIERS - if (count > 1) { - double stdev = 0; - double y_mean = (double)y_sum / count; - for (int i = 0; i < num_frames; ++i) { - if (image_stack[y][x][i].exists) { - stdev += pow(y_mean - image_stack[y][x][i].y, 2); - } - } - stdev = sqrt(stdev / count); - - uint32_t inlier_y_sum = 0; - uint32_t inlier_u_sum = 0; - uint32_t inlier_v_sum = 0; - uint32_t inlier_count = 0; - for (int i = 0; i < num_frames; ++i) { - if (image_stack[y][x][i].exists && - fabs(image_stack[y][x][i].y - y_mean) <= 1.5 * stdev) { - inlier_y_sum += image_stack[y][x][i].y; - inlier_u_sum += image_stack[y][x][i].u; - inlier_v_sum += image_stack[y][x][i].v; - ++inlier_count; - } - } - count = inlier_count; - y_sum = inlier_y_sum; - u_sum = inlier_u_sum; - v_sum = inlier_v_sum; - } -#endif // BGSPRITE_MEAN_REMOVE_OUTLIERS - - if (count != 0) { - blended_img[y][x].exists = 1; -#if CONFIG_HIGHBITDEPTH - if (highbitdepth) { - blended_img[y][x].y = (uint16_t)OD_DIVU(y_sum, count); - blended_img[y][x].u = (uint16_t)OD_DIVU(u_sum, count); - blended_img[y][x].v = (uint16_t)OD_DIVU(v_sum, count); - } else { -#endif // CONFIG_HIGHBITDEPTH - (void)highbitdepth; - blended_img[y][x].y = (uint8_t)OD_DIVU(y_sum, count); - blended_img[y][x].u = (uint8_t)OD_DIVU(u_sum, count); - blended_img[y][x].v = (uint8_t)OD_DIVU(v_sum, count); -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - } else { - blended_img[y][x].exists = 0; - } - } - } -} -#endif // BGSPRITE_BLENDING_MODE == 1 - -#if BGSPRITE_ENABLE_SEGMENTATION -// Builds dual-mode single gaussian model from image stack. -static void build_gaussian(const YuvPixel ***image_stack, const int num_frames, - const int width, const int height, - const int x_block_width, const int y_block_height, - const int block_size, YuvPixelGaussian **gauss) { - const double initial_variance = 10.0; - const double s_theta = 2.0; - - // Add images to dual-mode single gaussian model - for (int y_block = 0; y_block < y_block_height; ++y_block) { - for (int x_block = 0; x_block < x_block_width; ++x_block) { - // Process all blocks. - YuvPixelGaussian *model = &gauss[y_block][x_block]; - - // Process all frames. - for (int i = 0; i < num_frames; ++i) { - // Add block to the Gaussian model. - double max_variance[2] = { 0.0, 0.0 }; - double temp_y_mean = 0.0; - double temp_u_mean = 0.0; - double temp_v_mean = 0.0; - - // Find mean/variance of a block of pixels. 
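// Once the loop below has produced the block mean m, the model update keys
// off a squared-distance test against each (mean, var) candidate:
//   matches  <=>  (m - mean)^2 < s_theta * var      (s_theta == 2.0 here)
// A minimal version of that test:
static int matches_model(double m, double mean, double var, double s_theta) {
  const double d = m - mean;
  return d * d < s_theta * var;
}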
- int temp_count = 0; - for (int sub_y = 0; sub_y < block_size; ++sub_y) { - for (int sub_x = 0; sub_x < block_size; ++sub_x) { - const int y = y_block * block_size + sub_y; - const int x = x_block * block_size + sub_x; - if (y < height && x < width && image_stack[y][x][i].exists) { - ++temp_count; - temp_y_mean += (double)image_stack[y][x][i].y; - temp_u_mean += (double)image_stack[y][x][i].u; - temp_v_mean += (double)image_stack[y][x][i].v; - - const double variance_0 = - pow((double)image_stack[y][x][i].y - model->mean[0], 2); - const double variance_1 = - pow((double)image_stack[y][x][i].y - model->mean[1], 2); - - if (variance_0 > max_variance[0]) { - max_variance[0] = variance_0; - } - if (variance_1 > max_variance[1]) { - max_variance[1] = variance_1; - } - } - } - } - - // If pixels exist in the block, add to the model. - if (temp_count > 0) { - assert(temp_count <= block_size * block_size); - temp_y_mean /= temp_count; - temp_u_mean /= temp_count; - temp_v_mean /= temp_count; - - // Switch the background model to the oldest model. - if (model->age[0] > model->age[1]) { - model->curr_model = 0; - } else if (model->age[1] > model->age[0]) { - model->curr_model = 1; - } - - // If model is empty, initialize model. - if (model->age[model->curr_model] == 0) { - model->mean[model->curr_model] = temp_y_mean; - model->u_mean[model->curr_model] = temp_u_mean; - model->v_mean[model->curr_model] = temp_v_mean; - model->var[model->curr_model] = initial_variance; - model->age[model->curr_model] = 1; - } else { - // Constants for current model and foreground model (0 or 1). - const int opposite = 1 - model->curr_model; - const int current = model->curr_model; - const double j = i; - - // Put block into the appropriate model. - if (pow(temp_y_mean - model->mean[current], 2) < - s_theta * model->var[current]) { - // Add block to the current background model - model->age[current] += 1; - const double prev_weight = 1 / j; - const double curr_weight = (j - 1) / j; - model->mean[current] = prev_weight * model->mean[current] + - curr_weight * temp_y_mean; - model->u_mean[current] = prev_weight * model->u_mean[current] + - curr_weight * temp_u_mean; - model->v_mean[current] = prev_weight * model->v_mean[current] + - curr_weight * temp_v_mean; - model->var[current] = prev_weight * model->var[current] + - curr_weight * max_variance[current]; - } else { - // Block does not fit into current background candidate. Add to - // foreground candidate and reinitialize if necessary. - const double var_fg = pow(temp_y_mean - model->mean[opposite], 2); - - if (var_fg <= s_theta * model->var[opposite]) { - model->age[opposite] += 1; - const double prev_weight = 1 / j; - const double curr_weight = (j - 1) / j; - model->mean[opposite] = prev_weight * model->mean[opposite] + - curr_weight * temp_y_mean; - model->u_mean[opposite] = - prev_weight * model->u_mean[opposite] + - curr_weight * temp_u_mean; - model->v_mean[opposite] = - prev_weight * model->v_mean[opposite] + - curr_weight * temp_v_mean; - model->var[opposite] = prev_weight * model->var[opposite] + - curr_weight * max_variance[opposite]; - } else if (model->age[opposite] == 0 || - var_fg > s_theta * model->var[opposite]) { - model->mean[opposite] = temp_y_mean; - model->u_mean[opposite] = temp_u_mean; - model->v_mean[opposite] = temp_v_mean; - model->var[opposite] = initial_variance; - model->age[opposite] = 1; - } else { - // This case should never happen. - assert(0); - } - } - } - } - } - - // Select the oldest candidate as the background model. 
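// i.e. whichever candidate has explained the block for longer wins: with
// ages {12, 3} the first candidate's means and variance become the
// background; if neither was ever initialised (both ages 0) the block is
// left black with zero variance.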
- if (model->age[0] == 0 && model->age[1] == 0) { - model->y = 0; - model->u = 0; - model->v = 0; - model->final_var = 0; - } else if (model->age[0] > model->age[1]) { - model->y = (uint8_t)model->mean[0]; - model->u = (uint8_t)model->u_mean[0]; - model->v = (uint8_t)model->v_mean[0]; - model->final_var = model->var[0]; - } else { - model->y = (uint8_t)model->mean[1]; - model->u = (uint8_t)model->u_mean[1]; - model->v = (uint8_t)model->v_mean[1]; - model->final_var = model->var[1]; - } - } - } -} - -// Builds foreground mask based on reference image and gaussian model. -// In mask[][], 1 is foreground and 0 is background. -static void build_mask(const int x_min, const int y_min, const int x_offset, - const int y_offset, const int x_block_width, - const int y_block_height, const int block_size, - const YuvPixelGaussian **gauss, - YV12_BUFFER_CONFIG *const reference, - YV12_BUFFER_CONFIG *const panorama, uint8_t **mask) { - const int crop_x_offset = x_min + x_offset; - const int crop_y_offset = y_min + y_offset; - const double d_theta = 4.0; - - for (int y_block = 0; y_block < y_block_height; ++y_block) { - for (int x_block = 0; x_block < x_block_width; ++x_block) { - // Create mask to determine if ARF is background for foreground. - const YuvPixelGaussian *model = &gauss[y_block][x_block]; - double temp_y_mean = 0.0; - int temp_count = 0; - - for (int sub_y = 0; sub_y < block_size; ++sub_y) { - for (int sub_x = 0; sub_x < block_size; ++sub_x) { - // x and y are panorama coordinates. - const int y = y_block * block_size + sub_y; - const int x = x_block * block_size + sub_x; - - const int arf_y = y - crop_y_offset; - const int arf_x = x - crop_x_offset; - - if (arf_y >= 0 && arf_y < panorama->y_height && arf_x >= 0 && - arf_x < panorama->y_width) { - ++temp_count; - const int ychannel_idx = arf_y * panorama->y_stride + arf_x; - temp_y_mean += (double)reference->y_buffer[ychannel_idx]; - } - } - } - if (temp_count > 0) { - assert(temp_count <= block_size * block_size); - temp_y_mean /= temp_count; - - if (pow(temp_y_mean - model->y, 2) > model->final_var * d_theta) { - // Mark block as foreground. - mask[y_block][x_block] = 1; - } - } - } - } -} -#endif // BGSPRITE_ENABLE_SEGMENTATION - -// Resamples blended_img into panorama, including UV subsampling. -static void resample_panorama(YuvPixel **blended_img, const int center_idx, - const int *const x_min, const int *const y_min, - int pano_x_min, int pano_x_max, int pano_y_min, - int pano_y_max, YV12_BUFFER_CONFIG *panorama) { - const int width = pano_x_max - pano_x_min + 1; - const int height = pano_y_max - pano_y_min + 1; - const int x_offset = -pano_x_min; - const int y_offset = -pano_y_min; - const int crop_x_offset = x_min[center_idx] + x_offset; - const int crop_y_offset = y_min[center_idx] + y_offset; -#if CONFIG_HIGHBITDEPTH - if (panorama->flags & YV12_FLAG_HIGHBITDEPTH) { - // Use median Y value. 
- uint16_t *pano_y_buffer16 = CONVERT_TO_SHORTPTR(panorama->y_buffer); - uint16_t *pano_u_buffer16 = CONVERT_TO_SHORTPTR(panorama->u_buffer); - uint16_t *pano_v_buffer16 = CONVERT_TO_SHORTPTR(panorama->v_buffer); - - for (int y = 0; y < panorama->y_height; ++y) { - for (int x = 0; x < panorama->y_width; ++x) { - const int ychannel_idx = y * panorama->y_stride + x; - if (blended_img[y + crop_y_offset][x + crop_x_offset].exists) { - pano_y_buffer16[ychannel_idx] = - blended_img[y + crop_y_offset][x + crop_x_offset].y; - } else { - pano_y_buffer16[ychannel_idx] = 0; - } - } - } - - // UV subsampling with median UV values - for (int y = 0; y < panorama->uv_height; ++y) { - for (int x = 0; x < panorama->uv_width; ++x) { - uint32_t avg_count = 0; - uint32_t u_sum = 0; - uint32_t v_sum = 0; - - // Look at surrounding pixels for subsampling - for (int s_x = 0; s_x < panorama->subsampling_x + 1; ++s_x) { - for (int s_y = 0; s_y < panorama->subsampling_y + 1; ++s_y) { - int y_sample = crop_y_offset + (y << panorama->subsampling_y) + s_y; - int x_sample = crop_x_offset + (x << panorama->subsampling_x) + s_x; - if (y_sample > 0 && y_sample < height && x_sample > 0 && - x_sample < width && blended_img[y_sample][x_sample].exists) { - u_sum += blended_img[y_sample][x_sample].u; - v_sum += blended_img[y_sample][x_sample].v; - avg_count++; - } - } - } - - const int uvchannel_idx = y * panorama->uv_stride + x; - if (avg_count != 0) { - pano_u_buffer16[uvchannel_idx] = (uint16_t)OD_DIVU(u_sum, avg_count); - pano_v_buffer16[uvchannel_idx] = (uint16_t)OD_DIVU(v_sum, avg_count); - } else { - pano_u_buffer16[uvchannel_idx] = 0; - pano_v_buffer16[uvchannel_idx] = 0; - } - } - } - } else { -#endif // CONFIG_HIGHBITDEPTH - // Use blended Y value. - for (int y = 0; y < panorama->y_height; ++y) { - for (int x = 0; x < panorama->y_width; ++x) { - const int ychannel_idx = y * panorama->y_stride + x; - // Use filtered background. - if (blended_img[y + crop_y_offset][x + crop_x_offset].exists) { - panorama->y_buffer[ychannel_idx] = - blended_img[y + crop_y_offset][x + crop_x_offset].y; - } else { - panorama->y_buffer[ychannel_idx] = 0; - } - } - } - - // UV subsampling with blended UV values. - for (int y = 0; y < panorama->uv_height; ++y) { - for (int x = 0; x < panorama->uv_width; ++x) { - uint16_t avg_count = 0; - uint16_t u_sum = 0; - uint16_t v_sum = 0; - - // Look at surrounding pixels for subsampling. 
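The 8-bit loop that follows repeats the same footprint average as the high-bit-depth pass above: each chroma output averages the existing stacked samples inside its (subsampling_x + 1) x (subsampling_y + 1) window. (Note that the bounds test uses `> 0` rather than `>= 0`, so row and column zero are skipped; the sketch reproduces that as written.) A stripped-down version, with plain division standing in for OD_DIVU and illustrative types:

#include <stdint.h>

typedef struct {
  uint8_t exists;
  uint16_t u, v;
} ToyPixel;

/* Average the valid U/V samples in one chroma pixel's footprint at
 * chroma coordinates (cx, cy); bounds reproduce the `> 0` test above. */
static void toy_subsample_uv(ToyPixel **img, int width, int height,
                             int sub_x, int sub_y, int cx, int cy,
                             int *out_u, int *out_v) {
  uint32_t u_sum = 0, v_sum = 0, count = 0;
  for (int dy = 0; dy < sub_y + 1; ++dy) {
    for (int dx = 0; dx < sub_x + 1; ++dx) {
      const int y = (cy << sub_y) + dy;
      const int x = (cx << sub_x) + dx;
      if (y > 0 && y < height && x > 0 && x < width && img[y][x].exists) {
        u_sum += img[y][x].u;
        v_sum += img[y][x].v;
        ++count;
      }
    }
  }
  *out_u = count ? (int)(u_sum / count) : 0;
  *out_v = count ? (int)(v_sum / count) : 0;
}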
- for (int s_x = 0; s_x < panorama->subsampling_x + 1; ++s_x) { - for (int s_y = 0; s_y < panorama->subsampling_y + 1; ++s_y) { - int y_sample = crop_y_offset + (y << panorama->subsampling_y) + s_y; - int x_sample = crop_x_offset + (x << panorama->subsampling_x) + s_x; - if (y_sample > 0 && y_sample < height && x_sample > 0 && - x_sample < width && blended_img[y_sample][x_sample].exists) { - u_sum += blended_img[y_sample][x_sample].u; - v_sum += blended_img[y_sample][x_sample].v; - avg_count++; - } - } - } - - const int uvchannel_idx = y * panorama->uv_stride + x; - if (avg_count != 0) { - panorama->u_buffer[uvchannel_idx] = - (uint8_t)OD_DIVU(u_sum, avg_count); - panorama->v_buffer[uvchannel_idx] = - (uint8_t)OD_DIVU(v_sum, avg_count); - } else { - panorama->u_buffer[uvchannel_idx] = 0; - panorama->v_buffer[uvchannel_idx] = 0; - } - } - } -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH -} - -#if BGSPRITE_ENABLE_SEGMENTATION -// Combines temporal filter output and bgsprite output to make final ARF output -static void combine_arf(YV12_BUFFER_CONFIG *const temporal_arf, - YV12_BUFFER_CONFIG *const bgsprite, - uint8_t **const mask, const int block_size, - const int x_offset, const int y_offset, - YV12_BUFFER_CONFIG *target) { - const int height = temporal_arf->y_height; - const int width = temporal_arf->y_width; - - YuvPixel **blended_img = aom_malloc(height * sizeof(*blended_img)); - for (int i = 0; i < height; ++i) { - blended_img[i] = aom_malloc(width * sizeof(**blended_img)); - } - - const int block_2_height = (height / BGSPRITE_MASK_BLOCK_SIZE) + - (height % BGSPRITE_MASK_BLOCK_SIZE != 0 ? 1 : 0); - const int block_2_width = (width / BGSPRITE_MASK_BLOCK_SIZE) + - (width % BGSPRITE_MASK_BLOCK_SIZE != 0 ? 1 : 0); - - for (int block_y = 0; block_y < block_2_height; ++block_y) { - for (int block_x = 0; block_x < block_2_width; ++block_x) { - int count = 0; - int total = 0; - for (int sub_y = 0; sub_y < BGSPRITE_MASK_BLOCK_SIZE; ++sub_y) { - for (int sub_x = 0; sub_x < BGSPRITE_MASK_BLOCK_SIZE; ++sub_x) { - const int img_y = block_y * BGSPRITE_MASK_BLOCK_SIZE + sub_y; - const int img_x = block_x * BGSPRITE_MASK_BLOCK_SIZE + sub_x; - const int mask_y = (y_offset + img_y) / block_size; - const int mask_x = (x_offset + img_x) / block_size; - - if (img_y < height && img_x < width) { - if (mask[mask_y][mask_x]) { - ++count; - } - ++total; - } - } - } - - const double threshold = 0.30; - const int amount = (int)(threshold * total); - for (int sub_y = 0; sub_y < BGSPRITE_MASK_BLOCK_SIZE; ++sub_y) { - for (int sub_x = 0; sub_x < BGSPRITE_MASK_BLOCK_SIZE; ++sub_x) { - const int y = block_y * BGSPRITE_MASK_BLOCK_SIZE + sub_y; - const int x = block_x * BGSPRITE_MASK_BLOCK_SIZE + sub_x; - if (y < height && x < width) { - blended_img[y][x].exists = 1; - const int ychannel_idx = y * temporal_arf->y_stride + x; - const int uvchannel_idx = - (y >> temporal_arf->subsampling_y) * temporal_arf->uv_stride + - (x >> temporal_arf->subsampling_x); - - if (count > amount) { -// Foreground; use temporal arf. 
-#if CONFIG_HIGHBITDEPTH - if (temporal_arf->flags & YV12_FLAG_HIGHBITDEPTH) { - uint16_t *pano_y_buffer16 = - CONVERT_TO_SHORTPTR(temporal_arf->y_buffer); - uint16_t *pano_u_buffer16 = - CONVERT_TO_SHORTPTR(temporal_arf->u_buffer); - uint16_t *pano_v_buffer16 = - CONVERT_TO_SHORTPTR(temporal_arf->v_buffer); - blended_img[y][x].y = pano_y_buffer16[ychannel_idx]; - blended_img[y][x].u = pano_u_buffer16[uvchannel_idx]; - blended_img[y][x].v = pano_v_buffer16[uvchannel_idx]; - } else { -#endif // CONFIG_HIGHBITDEPTH - blended_img[y][x].y = temporal_arf->y_buffer[ychannel_idx]; - blended_img[y][x].u = temporal_arf->u_buffer[uvchannel_idx]; - blended_img[y][x].v = temporal_arf->v_buffer[uvchannel_idx]; -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - } else { -// Background; use bgsprite arf. -#if CONFIG_HIGHBITDEPTH - if (bgsprite->flags & YV12_FLAG_HIGHBITDEPTH) { - uint16_t *pano_y_buffer16 = - CONVERT_TO_SHORTPTR(bgsprite->y_buffer); - uint16_t *pano_u_buffer16 = - CONVERT_TO_SHORTPTR(bgsprite->u_buffer); - uint16_t *pano_v_buffer16 = - CONVERT_TO_SHORTPTR(bgsprite->v_buffer); - blended_img[y][x].y = pano_y_buffer16[ychannel_idx]; - blended_img[y][x].u = pano_u_buffer16[uvchannel_idx]; - blended_img[y][x].v = pano_v_buffer16[uvchannel_idx]; - } else { -#endif // CONFIG_HIGHBITDEPTH - blended_img[y][x].y = bgsprite->y_buffer[ychannel_idx]; - blended_img[y][x].u = bgsprite->u_buffer[uvchannel_idx]; - blended_img[y][x].v = bgsprite->v_buffer[uvchannel_idx]; -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - } - } - } - } - } - } - - const int x_min = 0; - const int y_min = 0; - resample_panorama(blended_img, 0, &x_min, &y_min, 0, width - 1, 0, height - 1, - target); - - for (int i = 0; i < height; ++i) { - aom_free(blended_img[i]); - } - aom_free(blended_img); -} -#endif // BGSPRITE_ENABLE_SEGMENTATION - -// Stitches images together to create ARF and stores it in 'panorama'. -static void stitch_images(AV1_COMP *cpi, YV12_BUFFER_CONFIG **const frames, - const int num_frames, const int distance, - const int center_idx, const double **const params, - const int *const x_min, const int *const x_max, - const int *const y_min, const int *const y_max, - int pano_x_min, int pano_x_max, int pano_y_min, - int pano_y_max, YV12_BUFFER_CONFIG *panorama) { - const int width = pano_x_max - pano_x_min + 1; - const int height = pano_y_max - pano_y_min + 1; - - // Create pano_stack[y][x][num_frames] stack of pixel values - YuvPixel ***pano_stack = aom_malloc(height * sizeof(*pano_stack)); - for (int i = 0; i < height; ++i) { - pano_stack[i] = aom_malloc(width * sizeof(**pano_stack)); - for (int j = 0; j < width; ++j) { - pano_stack[i][j] = aom_calloc(num_frames, sizeof(***pano_stack)); - } - } - - build_image_stack(frames, num_frames, params, x_min, x_max, y_min, y_max, - pano_x_min, pano_y_min, pano_stack); - - // Create blended_img[y][x] of combined panorama pixel values. - YuvPixel **blended_img = aom_malloc(height * sizeof(*blended_img)); - for (int i = 0; i < height; ++i) { - blended_img[i] = aom_malloc(width * sizeof(**blended_img)); - } - -// Blending and saving result in blended_img. 
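The #if that follows selects the blending strategy: BGSPRITE_BLENDING_MODE == 1 averages each pixel's stack, anything else takes the per-channel median. blend_mean and blend_median themselves are defined earlier in this file, outside this hunk; as a reference point, a toy per-pixel median in their spirit might look like the following (the real code medians Y, U and V across the whole frame stack):

#include <stdint.h>
#include <stdlib.h>

/* qsort comparator for 16-bit samples. */
static int toy_cmp_u16(const void *a, const void *b) {
  return (int)*(const uint16_t *)a - (int)*(const uint16_t *)b;
}

/* Sort the samples that exist for this pixel and take the middle one. */
static uint16_t toy_median(uint16_t *samples, int count) {
  if (count == 0) return 0;
  qsort(samples, count, sizeof(*samples), toy_cmp_u16);
  return samples[count / 2];
}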
-#if BGSPRITE_BLENDING_MODE == 1 - blend_mean(width, height, num_frames, (const YuvPixel ***)pano_stack, - blended_img, panorama->flags & YV12_FLAG_HIGHBITDEPTH); -#else // BGSPRITE_BLENDING_MODE != 1 - blend_median(width, height, num_frames, (const YuvPixel ***)pano_stack, - blended_img); -#endif // BGSPRITE_BLENDING_MODE == 1 - - // NOTE(toddnguyen): Right now the ARF in the cpi struct is fixed at - // the same size as the frames. For now, we crop the generated panorama. - assert(panorama->y_width <= width && panorama->y_height <= height); - - // Resamples the blended_img into the panorama buffer. - YV12_BUFFER_CONFIG bgsprite; - memset(&bgsprite, 0, sizeof(bgsprite)); - aom_alloc_frame_buffer(&bgsprite, frames[0]->y_width, frames[0]->y_height, - frames[0]->subsampling_x, frames[0]->subsampling_y, -#if CONFIG_HIGHBITDEPTH - frames[0]->flags & YV12_FLAG_HIGHBITDEPTH, -#endif - frames[0]->border, 0); - aom_yv12_copy_frame(frames[0], &bgsprite); - bgsprite.bit_depth = frames[0]->bit_depth; - resample_panorama(blended_img, center_idx, x_min, y_min, pano_x_min, - pano_x_max, pano_y_min, pano_y_max, &bgsprite); - -#if BGSPRITE_ENABLE_SEGMENTATION - YV12_BUFFER_CONFIG temporal_bgsprite; - memset(&temporal_bgsprite, 0, sizeof(temporal_bgsprite)); - aom_alloc_frame_buffer(&temporal_bgsprite, frames[0]->y_width, - frames[0]->y_height, frames[0]->subsampling_x, - frames[0]->subsampling_y, -#if CONFIG_HIGHBITDEPTH - frames[0]->flags & YV12_FLAG_HIGHBITDEPTH, -#endif - frames[0]->border, 0); - aom_yv12_copy_frame(frames[0], &temporal_bgsprite); - temporal_bgsprite.bit_depth = frames[0]->bit_depth; - - av1_temporal_filter(cpi, &bgsprite, &temporal_bgsprite, distance); - - // Block size constants for Gaussian model. - const int N_1 = 2; - const int y_block_height = (height / N_1) + (height % N_1 != 0 ? 1 : 0); - const int x_block_width = (width / N_1) + (width % N_1 != 0 ? 1 : 0); - YuvPixelGaussian **gauss = aom_malloc(y_block_height * sizeof(*gauss)); - for (int i = 0; i < y_block_height; ++i) { - gauss[i] = aom_calloc(x_block_width, sizeof(**gauss)); - } - - // Build Gaussian model. - build_gaussian((const YuvPixel ***)pano_stack, num_frames, width, height, - x_block_width, y_block_height, N_1, gauss); - - // Select background model and build foreground mask.
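The y_block_height and x_block_width expressions above, which also size the mask arrays allocated below, are ceiling divisions covering the panorama with an N_1 x N_1 block grid. Compactly:

/* Equivalent ceiling division for the block-grid dimensions above:
 * (len / n) + (len % n != 0) == (len + n - 1) / n for positive n.
 * e.g. toy_ceil_div(1080, 2) == 540, toy_ceil_div(1081, 2) == 541. */
static int toy_ceil_div(int len, int n) { return (len + n - 1) / n; }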
- uint8_t **mask = aom_malloc(y_block_height * sizeof(*mask)); - for (int i = 0; i < y_block_height; ++i) { - mask[i] = aom_calloc(x_block_width, sizeof(**mask)); - } - - const int x_offset = -pano_x_min; - const int y_offset = -pano_y_min; - build_mask(x_min[center_idx], y_min[center_idx], x_offset, y_offset, - x_block_width, y_block_height, N_1, - (const YuvPixelGaussian **)gauss, - (YV12_BUFFER_CONFIG * const) frames[center_idx], panorama, mask); - - YV12_BUFFER_CONFIG temporal_arf; - memset(&temporal_arf, 0, sizeof(temporal_arf)); - aom_alloc_frame_buffer(&temporal_arf, frames[0]->y_width, frames[0]->y_height, - frames[0]->subsampling_x, frames[0]->subsampling_y, -#if CONFIG_HIGHBITDEPTH - frames[0]->flags & YV12_FLAG_HIGHBITDEPTH, -#endif - frames[0]->border, 0); - aom_yv12_copy_frame(frames[0], &temporal_arf); - temporal_arf.bit_depth = frames[0]->bit_depth; - av1_temporal_filter(cpi, NULL, &temporal_arf, distance); - - combine_arf(&temporal_arf, &temporal_bgsprite, mask, N_1, x_offset, y_offset, - panorama); - - aom_free_frame_buffer(&temporal_arf); - aom_free_frame_buffer(&temporal_bgsprite); - for (int i = 0; i < y_block_height; ++i) { - aom_free(gauss[i]); - aom_free(mask[i]); - } - aom_free(gauss); - aom_free(mask); -#else // !BGSPRITE_ENABLE_SEGMENTATION - av1_temporal_filter(cpi, &bgsprite, panorama, distance); -#endif // BGSPRITE_ENABLE_SEGMENTATION - - aom_free_frame_buffer(&bgsprite); - for (int i = 0; i < height; ++i) { - for (int j = 0; j < width; ++j) { - aom_free(pano_stack[i][j]); - } - aom_free(pano_stack[i]); - aom_free(blended_img[i]); - } - aom_free(pano_stack); - aom_free(blended_img); -} - -int av1_background_sprite(AV1_COMP *cpi, int distance) { -#if BGSPRITE_ENABLE_METRICS - // Do temporal filter if firstpass stats disable bgsprite. - if (!cpi->bgsprite_allowed) { - return 1; - } -#endif // BGSPRITE_ENABLE_METRICS - - YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL }; - static const double identity_params[MAX_PARAMDIM - 1] = { - 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0 - }; - - const int frames_after_arf = - av1_lookahead_depth(cpi->lookahead) - distance - 1; - int frames_fwd = (cpi->oxcf.arnr_max_frames - 1) >> 1; - int frames_bwd; - - // Define the forward and backwards filter limits for this arnr group. - if (frames_fwd > frames_after_arf) frames_fwd = frames_after_arf; - if (frames_fwd > distance) frames_fwd = distance; - frames_bwd = frames_fwd; - -#if CONFIG_EXT_REFS - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW) { - cpi->is_arf_filter_off[gf_group->arf_update_idx[gf_group->index]] = 1; - frames_fwd = 0; - frames_bwd = 0; - } else { - cpi->is_arf_filter_off[gf_group->arf_update_idx[gf_group->index]] = 0; - } -#endif // CONFIG_EXT_REFS - - const int start_frame = distance + frames_fwd; - const int frames_to_stitch = frames_bwd + 1 + frames_fwd; - - // Get frames to be included in background sprite. - for (int frame = 0; frame < frames_to_stitch; ++frame) { - const int which_buffer = start_frame - frame; - struct lookahead_entry *buf = - av1_lookahead_peek(cpi->lookahead, which_buffer); - frames[frames_to_stitch - 1 - frame] = &buf->img; - } - - // Allocate empty arrays for parameters between frames. 
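One step back before the parameter arrays are allocated below: the filter-window arithmetic earlier in av1_background_sprite starts frames_fwd at half of arnr_max_frames, clamps it by both the frames remaining in the lookahead past the ARF and by distance, and mirrors it backwards. As a standalone restatement (names mirror the locals, but this is not the encoder function):

static int toy_frames_to_stitch(int arnr_max_frames, int distance,
                                int lookahead_depth) {
  const int frames_after_arf = lookahead_depth - distance - 1;
  int frames_fwd = (arnr_max_frames - 1) >> 1;
  if (frames_fwd > frames_after_arf) frames_fwd = frames_after_arf;
  if (frames_fwd > distance) frames_fwd = distance;
  const int frames_bwd = frames_fwd;   /* symmetric window */
  return frames_bwd + 1 + frames_fwd;  /* +1 for the ARF source frame */
}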
- double **params = aom_malloc(frames_to_stitch * sizeof(*params)); - for (int i = 0; i < frames_to_stitch; ++i) { - params[i] = aom_malloc(sizeof(identity_params)); - memcpy(params[i], identity_params, sizeof(identity_params)); - } - -// Use global motion to find affine transformations between frames. -// params[i] will have the transform from frame[i] to frame[i-1]. -// params[0] will have the identity matrix (has no previous frame). -#if BGSPRITE_ENABLE_GME - TransformationType model = AFFINE; - int inliers_by_motion[RANSAC_NUM_MOTIONS]; - for (int frame = 0; frame < frames_to_stitch - 1; ++frame) { - const int global_motion_ret = compute_global_motion_feature_based( - model, frames[frame + 1], frames[frame], -#if CONFIG_HIGHBITDEPTH - cpi->common.bit_depth, -#endif // CONFIG_HIGHBITDEPTH - inliers_by_motion, params[frame + 1], RANSAC_NUM_MOTIONS); - - // Quit if global motion had an error. - if (global_motion_ret == 0) { - for (int i = 0; i < frames_to_stitch; ++i) { - aom_free(params[i]); - } - aom_free(params); - return 1; - } - } -#endif // BGSPRITE_ENABLE_GME - - // Compound the transformation parameters. - for (int i = 1; i < frames_to_stitch; ++i) { - multiply_params(params[i - 1], params[i], params[i]); - } - - // Compute frame limits for final stitched images. - int pano_x_max = INT_MIN; - int pano_x_min = INT_MAX; - int pano_y_max = INT_MIN; - int pano_y_min = INT_MAX; - int *x_max = aom_malloc(frames_to_stitch * sizeof(*x_max)); - int *x_min = aom_malloc(frames_to_stitch * sizeof(*x_min)); - int *y_max = aom_malloc(frames_to_stitch * sizeof(*y_max)); - int *y_min = aom_malloc(frames_to_stitch * sizeof(*y_min)); - - find_limits(frames[0]->y_width, frames[0]->y_height, - (const double **const)params, frames_to_stitch, x_min, x_max, - y_min, y_max, &pano_x_min, &pano_x_max, &pano_y_min, &pano_y_max); - - // Center panorama on the ARF. - const int center_idx = frames_bwd; - assert(center_idx >= 0 && center_idx < frames_to_stitch); - - // Recompute transformations to adjust to center image. - // Invert center image's transform. - double inverse[MAX_PARAMDIM - 1] = { 0 }; - invert_params(params[center_idx], inverse); - - // Multiply the inverse to all transformation parameters. - for (int i = 0; i < frames_to_stitch; ++i) { - multiply_params(inverse, params[i], params[i]); - } - - // Recompute frame limits for new adjusted center. - find_limits(frames[0]->y_width, frames[0]->y_height, - (const double **const)params, frames_to_stitch, x_min, x_max, - y_min, y_max, &pano_x_min, &pano_x_max, &pano_y_min, &pano_y_max); - - // Stitch Images and apply bgsprite filter. - stitch_images(cpi, frames, frames_to_stitch, distance, center_idx, - (const double **const)params, x_min, x_max, y_min, y_max, - pano_x_min, pano_x_max, pano_y_min, pano_y_max, - &cpi->alt_ref_buffer); - - // Free memory. - for (int i = 0; i < frames_to_stitch; ++i) { - aom_free(params[i]); - } - aom_free(params); - aom_free(x_max); - aom_free(x_min); - aom_free(y_max); - aom_free(y_min); - - return 0; -} - -#undef _POSIX_C_SOURCE -#undef BGSPRITE_BLENDING_MODE -#undef BGSPRITE_INTERPOLATION -#undef BGSPRITE_ENABLE_METRICS -#undef BGSPRITE_ENABLE_SEGMENTATION -#undef BGSPRITE_ENABLE_GME -#undef BGSPRITE_MASK_BLOCK_SIZE -#undef TRANSFORM_MAT_DIM diff --git a/third_party/aom/av1/encoder/bgsprite.h b/third_party/aom/av1/encoder/bgsprite.h deleted file mode 100644 index 711b00e40..000000000 --- a/third_party/aom/av1/encoder/bgsprite.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. 
All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AV1_ENCODER_BGSPRITE_H_ -#define AV1_ENCODER_BGSPRITE_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include "av1/encoder/encoder.h" - -// Creates alternate reference frame starting from source image + frames up to -// 'distance' past source frame. -// Returns 0 on success and 1 on failure. -int av1_background_sprite(AV1_COMP *cpi, int distance); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AV1_ENCODER_BGSPRITE_H_ diff --git a/third_party/aom/av1/encoder/bitstream.c b/third_party/aom/av1/encoder/bitstream.c index 08f605f10..cdd7c2492 100644 --- a/third_party/aom/av1/encoder/bitstream.c +++ b/third_party/aom/av1/encoder/bitstream.c @@ -24,9 +24,8 @@ #include "aom_util/debug_util.h" #endif // CONFIG_BITSTREAM_DEBUG -#if CONFIG_CDEF #include "av1/common/cdef.h" -#endif // CONFIG_CDEF +#include "av1/common/cfl.h" #include "av1/common/entropy.h" #include "av1/common/entropymode.h" #include "av1/common/entropymv.h" @@ -34,38 +33,21 @@ #include "av1/common/odintrin.h" #include "av1/common/pred_common.h" #include "av1/common/reconinter.h" -#if CONFIG_EXT_INTRA #include "av1/common/reconintra.h" -#endif // CONFIG_EXT_INTRA #include "av1/common/seg_common.h" #include "av1/common/tile_common.h" -#if CONFIG_LV_MAP -#include "av1/encoder/encodetxb.h" -#endif // CONFIG_LV_MAP #include "av1/encoder/bitstream.h" #include "av1/encoder/cost.h" #include "av1/encoder/encodemv.h" +#include "av1/encoder/encodetxb.h" #include "av1/encoder/mcomp.h" -#if CONFIG_PALETTE_DELTA_ENCODING #include "av1/encoder/palette.h" -#endif // CONFIG_PALETTE_DELTA_ENCODING #include "av1/encoder/segmentation.h" -#include "av1/encoder/subexp.h" #include "av1/encoder/tokenize.h" -#if CONFIG_PVQ -#include "av1/encoder/pvq_encoder.h" -#endif #define ENC_MISMATCH_DEBUG 0 -#if CONFIG_COMPOUND_SINGLEREF -static struct av1_token - inter_singleref_comp_mode_encodings[INTER_SINGLEREF_COMP_MODES]; -#endif // CONFIG_COMPOUND_SINGLEREF - -// TODO(anybody) : remove this flag when PVQ supports palette coding tool -#if !CONFIG_PVQ || CONFIG_EXT_INTRA static INLINE void write_uniform(aom_writer *w, int n, int v) { const int l = get_unsigned_bits(n); const int m = (1 << l) - n; @@ -77,110 +59,38 @@ static INLINE void write_uniform(aom_writer *w, int n, int v) { aom_write_literal(w, (v - m) & 1, 1); } } -#endif // !CONFIG_PVQ || CONFIG_EXT_INTRA - -#if CONFIG_EXT_INTRA -#if CONFIG_INTRA_INTERP -static struct av1_token intra_filter_encodings[INTRA_FILTERS]; -#endif // CONFIG_INTRA_INTERP -#endif // CONFIG_EXT_INTRA -#if CONFIG_INTERINTRA -static struct av1_token interintra_mode_encodings[INTERINTRA_MODES]; -#endif -#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE -static struct av1_token compound_type_encodings[COMPOUND_TYPES]; -#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE -#if CONFIG_LOOP_RESTORATION -static struct av1_token switchable_restore_encodings[RESTORE_SWITCHABLE_TYPES]; + static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm, MACROBLOCKD *xd, + const RestorationUnitInfo *rui, aom_writer *const w, int plane, - int
rtile_idx); -#endif // CONFIG_LOOP_RESTORATION -#if CONFIG_OBU -static void write_uncompressed_header_obu(AV1_COMP *cpi, - struct aom_write_bit_buffer *wb); -#else -static void write_uncompressed_header_frame(AV1_COMP *cpi, - struct aom_write_bit_buffer *wb); -#endif - -static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data); - -#if !CONFIG_OBU || CONFIG_EXT_TILE -static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst, - const uint32_t data_size, const uint32_t max_tile_size, - const uint32_t max_tile_col_size, - int *const tile_size_bytes, - int *const tile_col_size_bytes); -#endif -void av1_encode_token_init(void) { -#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP - av1_tokens_from_tree(intra_filter_encodings, av1_intra_filter_tree); -#endif // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP -#if CONFIG_INTERINTRA - av1_tokens_from_tree(interintra_mode_encodings, av1_interintra_mode_tree); -#endif // CONFIG_INTERINTRA -#if CONFIG_COMPOUND_SINGLEREF - av1_tokens_from_tree(inter_singleref_comp_mode_encodings, - av1_inter_singleref_comp_mode_tree); -#endif // CONFIG_COMPOUND_SINGLEREF -#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE - av1_tokens_from_tree(compound_type_encodings, av1_compound_type_tree); -#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE -#if CONFIG_LOOP_RESTORATION - av1_tokens_from_tree(switchable_restore_encodings, - av1_switchable_restore_tree); -#endif // CONFIG_LOOP_RESTORATION -} + FRAME_COUNTS *counts); -static void write_intra_mode_kf(const AV1_COMMON *cm, FRAME_CONTEXT *frame_ctx, - const MODE_INFO *mi, const MODE_INFO *above_mi, - const MODE_INFO *left_mi, int block, +static void write_intra_mode_kf(FRAME_CONTEXT *frame_ctx, + const MB_MODE_INFO *mi, + const MB_MODE_INFO *above_mi, + const MB_MODE_INFO *left_mi, PREDICTION_MODE mode, aom_writer *w) { -#if CONFIG_INTRABC - assert(!is_intrabc_block(&mi->mbmi)); -#endif // CONFIG_INTRABC - aom_write_symbol(w, mode, - get_y_mode_cdf(frame_ctx, mi, above_mi, left_mi, block), + assert(!is_intrabc_block(mi)); + (void)mi; + aom_write_symbol(w, mode, get_y_mode_cdf(frame_ctx, above_mi, left_mi), INTRA_MODES); - (void)cm; } static void write_inter_mode(aom_writer *w, PREDICTION_MODE mode, FRAME_CONTEXT *ec_ctx, const int16_t mode_ctx) { const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK; -#if CONFIG_NEW_MULTISYMBOL aom_write_symbol(w, mode != NEWMV, ec_ctx->newmv_cdf[newmv_ctx], 2); -#else - aom_write(w, mode != NEWMV, ec_ctx->newmv_prob[newmv_ctx]); -#endif if (mode != NEWMV) { - if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET)) { - assert(mode == ZEROMV); - return; - } + const int16_t zeromv_ctx = + (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + aom_write_symbol(w, mode != GLOBALMV, ec_ctx->zeromv_cdf[zeromv_ctx], 2); - const int16_t zeromv_ctx = (mode_ctx >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK; -#if CONFIG_NEW_MULTISYMBOL - aom_write_symbol(w, mode != ZEROMV, ec_ctx->zeromv_cdf[zeromv_ctx], 2); -#else - aom_write(w, mode != ZEROMV, ec_ctx->zeromv_prob[zeromv_ctx]); -#endif - - if (mode != ZEROMV) { + if (mode != GLOBALMV) { int16_t refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; - - if (mode_ctx & (1 << SKIP_NEARESTMV_OFFSET)) refmv_ctx = 6; - if (mode_ctx & (1 << SKIP_NEARMV_OFFSET)) refmv_ctx = 7; - if (mode_ctx & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) refmv_ctx = 8; -#if CONFIG_NEW_MULTISYMBOL aom_write_symbol(w, mode != NEARESTMV, ec_ctx->refmv_cdf[refmv_ctx], 2); -#else - aom_write(w, mode != NEARESTMV, ec_ctx->refmv_prob[refmv_ctx]); -#endif } } } @@ -191,24 +101,16 @@ static void 
write_drl_idx(FRAME_CONTEXT *ec_ctx, const MB_MODE_INFO *mbmi, assert(mbmi->ref_mv_idx < 3); -#if CONFIG_COMPOUND_SINGLEREF - if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV || - mbmi->mode == SR_NEW_NEWMV) { -#else // !CONFIG_COMPOUND_SINGLEREF - if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) { -#endif // CONFIG_COMPOUND_SINGLEREF + const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV; + if (new_mv) { int idx; for (idx = 0; idx < 2; ++idx) { if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx); -#if CONFIG_NEW_MULTISYMBOL aom_write_symbol(w, mbmi->ref_mv_idx != idx, ec_ctx->drl_cdf[drl_ctx], 2); -#else - aom_write(w, mbmi->ref_mv_idx != idx, ec_ctx->drl_prob[drl_ctx]); -#endif if (mbmi->ref_mv_idx == idx) return; } } @@ -222,12 +124,8 @@ static void write_drl_idx(FRAME_CONTEXT *ec_ctx, const MB_MODE_INFO *mbmi, if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx); -#if CONFIG_NEW_MULTISYMBOL aom_write_symbol(w, mbmi->ref_mv_idx != (idx - 1), ec_ctx->drl_cdf[drl_ctx], 2); -#else - aom_write(w, mbmi->ref_mv_idx != (idx - 1), ec_ctx->drl_prob[drl_ctx]); -#endif if (mbmi->ref_mv_idx == (idx - 1)) return; } } @@ -235,52 +133,22 @@ static void write_drl_idx(FRAME_CONTEXT *ec_ctx, const MB_MODE_INFO *mbmi, } } -static void write_inter_compound_mode(AV1_COMMON *cm, MACROBLOCKD *xd, - aom_writer *w, PREDICTION_MODE mode, +static void write_inter_compound_mode(MACROBLOCKD *xd, aom_writer *w, + PREDICTION_MODE mode, const int16_t mode_ctx) { assert(is_inter_compound_mode(mode)); - (void)cm; aom_write_symbol(w, INTER_COMPOUND_OFFSET(mode), xd->tile_ctx->inter_compound_mode_cdf[mode_ctx], INTER_COMPOUND_MODES); } -#if CONFIG_COMPOUND_SINGLEREF -static void write_inter_singleref_comp_mode(MACROBLOCKD *xd, aom_writer *w, - PREDICTION_MODE mode, - const int16_t mode_ctx) { - assert(is_inter_singleref_comp_mode(mode)); - aom_cdf_prob *const inter_singleref_comp_cdf = - xd->tile_ctx->inter_singleref_comp_mode_cdf[mode_ctx]; - - aom_write_symbol(w, INTER_SINGLEREF_COMP_OFFSET(mode), - inter_singleref_comp_cdf, INTER_SINGLEREF_COMP_MODES); -} -#endif // CONFIG_COMPOUND_SINGLEREF - -static void encode_unsigned_max(struct aom_write_bit_buffer *wb, int data, - int max) { - aom_wb_write_literal(wb, data, get_unsigned_bits(max)); -} - -#if CONFIG_VAR_TX -static void write_tx_size_vartx(const AV1_COMMON *cm, MACROBLOCKD *xd, - const MB_MODE_INFO *mbmi, TX_SIZE tx_size, - int depth, int blk_row, int blk_col, - aom_writer *w) { -#if CONFIG_NEW_MULTISYMBOL +static void write_tx_size_vartx(MACROBLOCKD *xd, const MB_MODE_INFO *mbmi, + TX_SIZE tx_size, int depth, int blk_row, + int blk_col, aom_writer *w) { FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - (void)cm; -#endif - const int tx_row = blk_row >> 1; - const int tx_col = blk_col >> 1; const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0); const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0); - int ctx = txfm_partition_context(xd->above_txfm_context + blk_col, - xd->left_txfm_context + blk_row, - mbmi->sb_type, tx_size); - if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; if (depth == MAX_VARTX_DEPTH) { @@ -289,31 +157,25 @@ static void write_tx_size_vartx(const AV1_COMMON *cm, MACROBLOCKD *xd, return; } -#if CONFIG_RECT_TX_EXT - if (tx_size == mbmi->inter_tx_size[tx_row][tx_col] || - mbmi->tx_size == quarter_txsize_lookup[mbmi->sb_type]) { -#else - if 
(tx_size == mbmi->inter_tx_size[tx_row][tx_col]) { -#endif -#if CONFIG_NEW_MULTISYMBOL + const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, + mbmi->sb_type, tx_size); + const int txb_size_index = + av1_get_txb_size_index(mbmi->sb_type, blk_row, blk_col); + const int write_txfm_partition = + tx_size == mbmi->inter_tx_size[txb_size_index]; + if (write_txfm_partition) { aom_write_symbol(w, 0, ec_ctx->txfm_partition_cdf[ctx], 2); -#else - aom_write(w, 0, cm->fc->txfm_partition_prob[ctx]); -#endif txfm_partition_update(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, tx_size, tx_size); // TODO(yuec): set correct txfm partition update for qttx } else { const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; - const int bsl = tx_size_wide_unit[sub_txs]; - int i; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; -#if CONFIG_NEW_MULTISYMBOL aom_write_symbol(w, 1, ec_ctx->txfm_partition_cdf[ctx], 2); -#else - aom_write(w, 1, cm->fc->txfm_partition_prob[ctx]); -#endif if (sub_txs == TX_4X4) { txfm_partition_update(xd->above_txfm_context + blk_col, @@ -321,185 +183,115 @@ static void write_tx_size_vartx(const AV1_COMMON *cm, MACROBLOCKD *xd, return; } - assert(bsl > 0); - for (i = 0; i < 4; ++i) { - int offsetr = blk_row + (i >> 1) * bsl; - int offsetc = blk_col + (i & 0x01) * bsl; - write_tx_size_vartx(cm, xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, - w); - } + assert(bsw > 0 && bsh > 0); + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) + for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { + int offsetr = blk_row + row; + int offsetc = blk_col + col; + write_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, w); + } } } -#if !CONFIG_NEW_MULTISYMBOL -static void update_txfm_partition_probs(AV1_COMMON *cm, aom_writer *w, - FRAME_COUNTS *counts, int probwt) { - int k; - for (k = 0; k < TXFM_PARTITION_CONTEXTS; ++k) - av1_cond_prob_diff_update(w, &cm->fc->txfm_partition_prob[k], - counts->txfm_partition[k], probwt); -} -#endif // CONFIG_NEW_MULTISYMBOL -#endif // CONFIG_VAR_TX - -static void write_selected_tx_size(const AV1_COMMON *cm, const MACROBLOCKD *xd, - aom_writer *w) { - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; +static void write_selected_tx_size(const MACROBLOCKD *xd, aom_writer *w) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; const BLOCK_SIZE bsize = mbmi->sb_type; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - (void)cm; if (block_signals_txsize(bsize)) { const TX_SIZE tx_size = mbmi->tx_size; - const int is_inter = is_inter_block(mbmi); const int tx_size_ctx = get_tx_size_context(xd); - const int32_t tx_size_cat = is_inter ? 
inter_tx_size_cat_lookup[bsize] - : intra_tx_size_cat_lookup[bsize]; - const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size]; - const int depth = tx_size_to_depth(coded_tx_size); -#if CONFIG_EXT_TX && CONFIG_RECT_TX + const int depth = tx_size_to_depth(tx_size, bsize); + const int max_depths = bsize_to_max_depth(bsize); + const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); + + assert(depth >= 0 && depth <= max_depths); + assert(!is_inter_block(mbmi)); assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi))); -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX aom_write_symbol(w, depth, ec_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx], - tx_size_cat + 2); -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) - if (is_quarter_tx_allowed(xd, mbmi, is_inter) && tx_size != coded_tx_size) -#if CONFIG_NEW_MULTISYMBOL - aom_write_symbol(w, tx_size == quarter_txsize_lookup[bsize], - cm->fc->quarter_tx_size_cdf, 2); -#else - aom_write(w, tx_size == quarter_txsize_lookup[bsize], - cm->fc->quarter_tx_size_prob); -#endif -#endif + max_depths + 1); } } -#if !CONFIG_NEW_MULTISYMBOL -static void update_inter_mode_probs(AV1_COMMON *cm, aom_writer *w, - FRAME_COUNTS *counts) { - int i; - const int probwt = cm->num_tg; - for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) - av1_cond_prob_diff_update(w, &cm->fc->newmv_prob[i], counts->newmv_mode[i], - probwt); - for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i) - av1_cond_prob_diff_update(w, &cm->fc->zeromv_prob[i], - counts->zeromv_mode[i], probwt); - for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) - av1_cond_prob_diff_update(w, &cm->fc->refmv_prob[i], counts->refmv_mode[i], - probwt); - for (i = 0; i < DRL_MODE_CONTEXTS; ++i) - av1_cond_prob_diff_update(w, &cm->fc->drl_prob[i], counts->drl_mode[i], - probwt); -} -#endif - static int write_skip(const AV1_COMMON *cm, const MACROBLOCKD *xd, - int segment_id, const MODE_INFO *mi, aom_writer *w) { + int segment_id, const MB_MODE_INFO *mi, aom_writer *w) { if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { return 1; } else { - const int skip = mi->mbmi.skip; -#if CONFIG_NEW_MULTISYMBOL - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const int skip = mi->skip; const int ctx = av1_get_skip_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; aom_write_symbol(w, skip, ec_ctx->skip_cdfs[ctx], 2); -#else - aom_write(w, skip, av1_get_skip_prob(cm, xd)); -#endif return skip; } } +static int write_skip_mode(const AV1_COMMON *cm, const MACROBLOCKD *xd, + int segment_id, const MB_MODE_INFO *mi, + aom_writer *w) { + if (!cm->skip_mode_flag) return 0; + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + return 0; + } + const int skip_mode = mi->skip_mode; + if (!is_comp_ref_allowed(mi->sb_type)) { + assert(!skip_mode); + return 0; + } + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME) || + segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + // These features imply single-reference mode, while skip mode implies + // compound reference. Hence, the two are mutually exclusive. + // In other words, skip_mode is implicitly 0 here. 
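That comment covers the last of write_skip_mode's early-outs, and the assert just below closes out the case. Gathered together, the gating reads as four early returns before a single binary symbol is ever coded; the following is a standalone restatement with illustrative boolean parameters, not the encoder's real signature:

/* skip_mode is only signaled when all four conditions hold. */
static int toy_skip_mode_is_signaled(int frame_skip_mode_flag,
                                     int seg_lvl_skip, int comp_ref_allowed,
                                     int seg_forces_single_ref) {
  if (!frame_skip_mode_flag) return 0;  /* disabled for this frame */
  if (seg_lvl_skip) return 0;           /* segment already implies skip */
  if (!comp_ref_allowed) return 0;      /* skip mode needs compound refs */
  if (seg_forces_single_ref) return 0;  /* SEG_LVL_REF_FRAME / _GLOBALMV */
  return 1;  /* otherwise one symbol is coded with skip_mode_cdfs */
}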
+ assert(!skip_mode); + return 0; + } + const int ctx = av1_get_skip_mode_context(xd); + aom_write_symbol(w, skip_mode, xd->tile_ctx->skip_mode_cdfs[ctx], 2); + return skip_mode; +} + static void write_is_inter(const AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id, aom_writer *w, const int is_inter) { if (!segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { -#if CONFIG_NEW_MULTISYMBOL - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + assert(is_inter); + return; + } const int ctx = av1_get_intra_inter_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; aom_write_symbol(w, is_inter, ec_ctx->intra_inter_cdf[ctx], 2); -#else - aom_write(w, is_inter, av1_get_intra_inter_prob(cm, xd)); -#endif } } -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION static void write_motion_mode(const AV1_COMMON *cm, MACROBLOCKD *xd, - const MODE_INFO *mi, aom_writer *w) { - const MB_MODE_INFO *mbmi = &mi->mbmi; - -#if !CONFIG_GLOBAL_MOTION - // The cm parameter is only used with global_motion or with - // motion_var and warped_motion. In other cases, explicitly ignore - // it to avoid a compiler warning. - (void)cm; -#endif - MOTION_MODE last_motion_mode_allowed = motion_mode_allowed( -#if CONFIG_GLOBAL_MOTION - 0, cm->global_motion, -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_WARPED_MOTION - xd, -#endif - mi); - if (last_motion_mode_allowed == SIMPLE_TRANSLATION) return; -#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION -#if CONFIG_NCOBMC_ADAPT_WEIGHT - if (last_motion_mode_allowed == NCOBMC_ADAPT_WEIGHT) { - aom_write_symbol(w, mbmi->motion_mode, - xd->tile_ctx->ncobmc_cdf[mbmi->sb_type], - OBMC_FAMILY_MODES); - } else if (last_motion_mode_allowed == OBMC_CAUSAL) { - aom_write_symbol(w, mbmi->motion_mode == OBMC_CAUSAL, - xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2); - } else { -#else - if (last_motion_mode_allowed == OBMC_CAUSAL) { -#if CONFIG_NEW_MULTISYMBOL - aom_write_symbol(w, mbmi->motion_mode == OBMC_CAUSAL, - xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2); -#else - aom_write(w, mbmi->motion_mode == OBMC_CAUSAL, - cm->fc->obmc_prob[mbmi->sb_type]); -#endif - } else { -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT -#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION - aom_write_symbol(w, mbmi->motion_mode, - xd->tile_ctx->motion_mode_cdf[mbmi->sb_type], - MOTION_MODES); -#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION - } -#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION -} - -#if CONFIG_NCOBMC_ADAPT_WEIGHT -static void write_ncobmc_mode(MACROBLOCKD *xd, const MODE_INFO *mi, - aom_writer *w) { - const MB_MODE_INFO *mbmi = &mi->mbmi; - ADAPT_OVERLAP_BLOCK ao_block = adapt_overlap_block_lookup[mbmi->sb_type]; - if (mbmi->motion_mode != NCOBMC_ADAPT_WEIGHT) return; - - aom_write_symbol(w, mbmi->ncobmc_mode[0], - xd->tile_ctx->ncobmc_mode_cdf[ao_block], MAX_NCOBMC_MODES); - if (mi_size_wide[mbmi->sb_type] != mi_size_high[mbmi->sb_type]) { - aom_write_symbol(w, mbmi->ncobmc_mode[1], - xd->tile_ctx->ncobmc_mode_cdf[ao_block], MAX_NCOBMC_MODES); + const MB_MODE_INFO *mbmi, aom_writer *w) { + MOTION_MODE last_motion_mode_allowed = + cm->switchable_motion_mode + ? 
motion_mode_allowed(cm->global_motion, xd, mbmi, + cm->allow_warped_motion) + : SIMPLE_TRANSLATION; + assert(mbmi->motion_mode <= last_motion_mode_allowed); + switch (last_motion_mode_allowed) { + case SIMPLE_TRANSLATION: break; + case OBMC_CAUSAL: + aom_write_symbol(w, mbmi->motion_mode == OBMC_CAUSAL, + xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2); + break; + default: + aom_write_symbol(w, mbmi->motion_mode, + xd->tile_ctx->motion_mode_cdf[mbmi->sb_type], + MOTION_MODES); } } -#endif -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION -static void write_delta_qindex(const AV1_COMMON *cm, const MACROBLOCKD *xd, - int delta_qindex, aom_writer *w) { +static void write_delta_qindex(const MACROBLOCKD *xd, int delta_qindex, + aom_writer *w) { int sign = delta_qindex < 0; int abs = sign ? -delta_qindex : delta_qindex; int rem_bits, thr; int smallval = abs < DELTA_Q_SMALL ? 1 : 0; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - (void)cm; aom_write_symbol(w, AOMMIN(abs, DELTA_Q_SMALL), ec_ctx->delta_q_cdf, DELTA_Q_PROBS + 1); @@ -515,32 +307,23 @@ static void write_delta_qindex(const AV1_COMMON *cm, const MACROBLOCKD *xd, } } -#if CONFIG_EXT_DELTA_Q static void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd, -#if CONFIG_LOOPFILTER_LEVEL - int lf_id, -#endif - int delta_lflevel, aom_writer *w) { + int lf_id, int delta_lflevel, aom_writer *w) { int sign = delta_lflevel < 0; int abs = sign ? -delta_lflevel : delta_lflevel; int rem_bits, thr; int smallval = abs < DELTA_LF_SMALL ? 1 : 0; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - (void)cm; -#if CONFIG_LOOPFILTER_LEVEL if (cm->delta_lf_multi) { - assert(lf_id >= 0 && lf_id < FRAME_LF_COUNT); + assert(lf_id >= 0 && lf_id < (av1_num_planes(cm) > 1 ? FRAME_LF_COUNT + : FRAME_LF_COUNT - 2)); aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), ec_ctx->delta_lf_multi_cdf[lf_id], DELTA_LF_PROBS + 1); } else { aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), ec_ctx->delta_lf_cdf, DELTA_LF_PROBS + 1); } -#else - aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), ec_ctx->delta_lf_cdf, - DELTA_LF_PROBS + 1); -#endif // CONFIG_LOOPFILTER_LEVEL if (!smallval) { rem_bits = OD_ILOG_NZ(abs - 1) - 1; @@ -552,22 +335,7 @@ static void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd, aom_write_bit(w, sign); } } -#endif // CONFIG_EXT_DELTA_Q - -#if !CONFIG_NEW_MULTISYMBOL -static void update_skip_probs(AV1_COMMON *cm, aom_writer *w, - FRAME_COUNTS *counts) { - int k; - const int probwt = cm->num_tg; - for (k = 0; k < SKIP_CONTEXTS; ++k) { - av1_cond_prob_diff_update(w, &cm->fc->skip_probs[k], counts->skip[k], - probwt); - } -} -#endif -// TODO(anybody) : remove this flag when PVQ supports pallete coding tool -#if !CONFIG_PVQ static void pack_map_tokens(aom_writer *w, const TOKENEXTRA **tp, int n, int num) { const TOKENEXTRA *p = *tp; @@ -580,423 +348,142 @@ static void pack_map_tokens(aom_writer *w, const TOKENEXTRA **tp, int n, } *tp = p; } -#endif // !CONFIG_PVQ -#if !CONFIG_PVQ -#if CONFIG_SUPERTX -static void update_supertx_probs(AV1_COMMON *cm, int probwt, aom_writer *w) { - const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) - - av1_cost_zero(GROUP_DIFF_UPDATE_PROB); - int i, j; - int savings = 0; - int do_update = 0; - for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) { - for (j = TX_8X8; j < TX_SIZES; ++j) { - savings += av1_cond_prob_diff_update_savings( - &cm->fc->supertx_prob[i][j], cm->counts.supertx[i][j], probwt); - } - } - do_update = savings > savings_thresh; - aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB); - if (do_update) { - 
for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) { - for (j = TX_8X8; j < TX_SIZES; ++j) { - av1_cond_prob_diff_update(w, &cm->fc->supertx_prob[i][j], - cm->counts.supertx[i][j], probwt); - } - } - } -} -#endif // CONFIG_SUPERTX - -#if !CONFIG_LV_MAP -#if CONFIG_NEW_MULTISYMBOL -static INLINE void write_coeff_extra(const aom_cdf_prob *const *cdf, int val, - int n, aom_writer *w) { - // Code the extra bits from LSB to MSB in groups of 4 - int i = 0; - int count = 0; - while (count < n) { - const int size = AOMMIN(n - count, 4); - const int mask = (1 << size) - 1; - aom_write_cdf(w, val & mask, cdf[i++], 1 << size); - val >>= size; - count += size; - } -} -#else -static INLINE void write_coeff_extra(const aom_prob *pb, int value, - int num_bits, int skip_bits, aom_writer *w, - TOKEN_STATS *token_stats) { - // Code the extra bits from MSB to LSB 1 bit at a time - int index; - for (index = skip_bits; index < num_bits; ++index) { - const int shift = num_bits - index - 1; - const int bb = (value >> shift) & 1; - aom_write_record(w, bb, pb[index], token_stats); - } -} -#endif // CONFIG_NEW_MULTISYMBOL - -static void pack_mb_tokens(aom_writer *w, const TOKENEXTRA **tp, - const TOKENEXTRA *const stop, - aom_bit_depth_t bit_depth, const TX_SIZE tx_size, -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - TX_TYPE tx_type, int is_inter, -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - TOKEN_STATS *token_stats) { - const TOKENEXTRA *p = *tp; -#if CONFIG_VAR_TX - int count = 0; - const int seg_eob = tx_size_2d[tx_size]; -#endif - -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - if (tx_type == MRC_DCT && ((is_inter && SIGNAL_MRC_MASK_INTER) || - (!is_inter && SIGNAL_MRC_MASK_INTRA))) { - int rows = tx_size_high[tx_size]; - int cols = tx_size_wide[tx_size]; - assert(tx_size == TX_32X32); - assert(p < stop); - pack_map_tokens(w, &p, 2, rows * cols); - } -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - - while (p < stop && p->token != EOSB_TOKEN) { - const int token = p->token; - const int eob_val = p->eob_val; - if (token == BLOCK_Z_TOKEN) { - aom_write_symbol(w, 0, *p->head_cdf, HEAD_TOKENS + 1); - p++; -#if CONFIG_VAR_TX - break; -#endif - continue; - } - - const av1_extra_bit *const extra_bits = &av1_extra_bits[token]; - if (eob_val == LAST_EOB) { - // Just code a flag indicating whether the value is >1 or 1. - aom_write_bit(w, token != ONE_TOKEN); - } else { - int comb_symb = 2 * AOMMIN(token, TWO_TOKEN) - eob_val + p->first_val; - aom_write_symbol(w, comb_symb, *p->head_cdf, HEAD_TOKENS + p->first_val); - } - if (token > ONE_TOKEN) { - aom_write_symbol(w, token - TWO_TOKEN, *p->tail_cdf, TAIL_TOKENS); - } - - if (extra_bits->base_val) { - const int bit_string = p->extra; - const int bit_string_length = extra_bits->len; // Length of extra bits to - const int is_cat6 = (extra_bits->base_val == CAT6_MIN_VAL); - // be written excluding - // the sign bit. - int skip_bits = is_cat6 - ? 
(int)sizeof(av1_cat6_prob) - - av1_get_cat6_extrabits_size(tx_size, bit_depth) - : 0; - - assert(!(bit_string >> (bit_string_length - skip_bits + 1))); - if (bit_string_length > 0) -#if CONFIG_NEW_MULTISYMBOL - write_coeff_extra(extra_bits->cdf, bit_string >> 1, - bit_string_length - skip_bits, w); -#else - write_coeff_extra(extra_bits->prob, bit_string >> 1, bit_string_length, - skip_bits, w, token_stats); -#endif - - aom_write_bit_record(w, bit_string & 1, token_stats); - } - ++p; - -#if CONFIG_VAR_TX - ++count; - if (eob_val == EARLY_EOB || count == seg_eob) break; -#endif - } - - *tp = p; -} -#endif // !CONFIG_LV_MAP -#else // !CONFIG_PVQ -static PVQ_INFO *get_pvq_block(PVQ_QUEUE *pvq_q) { - PVQ_INFO *pvq; - - assert(pvq_q->curr_pos <= pvq_q->last_pos); - assert(pvq_q->curr_pos < pvq_q->buf_len); - - pvq = pvq_q->buf + pvq_q->curr_pos; - ++pvq_q->curr_pos; - - return pvq; -} - -static void pack_pvq_tokens(aom_writer *w, MACROBLOCK *const x, - MACROBLOCKD *const xd, int plane, BLOCK_SIZE bsize, - const TX_SIZE tx_size) { - PVQ_INFO *pvq; - int idx, idy; - const struct macroblockd_plane *const pd = &xd->plane[plane]; - od_adapt_ctx *adapt; - int max_blocks_wide; - int max_blocks_high; - int step = (1 << tx_size); - -#if CONFIG_CHROMA_SUB8X8 - const BLOCK_SIZE plane_bsize = - AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); -#elif CONFIG_CB4X4 - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); -#else - const BLOCK_SIZE plane_bsize = - get_plane_block_size(AOMMAX(BLOCK_8X8, bsize), pd); -#endif - - adapt = x->daala_enc.state.adapt; - - max_blocks_wide = max_block_wide(xd, plane_bsize, plane); - max_blocks_high = max_block_high(xd, plane_bsize, plane); - - for (idy = 0; idy < max_blocks_high; idy += step) { - for (idx = 0; idx < max_blocks_wide; idx += step) { - const int is_keyframe = 0; - const int encode_flip = 0; - const int flip = 0; - int i; - const int has_dc_skip = 1; - int *exg = &adapt->pvq.pvq_exg[plane][tx_size][0]; - int *ext = adapt->pvq.pvq_ext + tx_size * PVQ_MAX_PARTITIONS; - generic_encoder *model = adapt->pvq.pvq_param_model; - - pvq = get_pvq_block(x->pvq_q); - - // encode block skip info - aom_write_symbol(w, pvq->ac_dc_coded, - adapt->skip_cdf[2 * tx_size + (plane != 0)], 4); - - // AC coeffs coded? - if (pvq->ac_dc_coded & AC_CODED) { - assert(pvq->bs == tx_size); - for (i = 0; i < pvq->nb_bands; i++) { - if (i == 0 || - (!pvq->skip_rest && !(pvq->skip_dir & (1 << ((i - 1) % 3))))) { - pvq_encode_partition( - w, pvq->qg[i], pvq->theta[i], pvq->y + pvq->off[i], - pvq->size[i], pvq->k[i], model, adapt, exg + i, ext + i, - (plane != 0) * OD_TXSIZES * PVQ_MAX_PARTITIONS + - pvq->bs * PVQ_MAX_PARTITIONS + i, - is_keyframe, i == 0 && (i < pvq->nb_bands - 1), pvq->skip_rest, - encode_flip, flip); - } - if (i == 0 && !pvq->skip_rest && pvq->bs > 0) { - aom_write_symbol( - w, pvq->skip_dir, - &adapt->pvq - .pvq_skip_dir_cdf[(plane != 0) + 2 * (pvq->bs - 1)][0], - 7); - } - } - } - // Encode residue of DC coeff, if exist. 
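Before the DC-residue write continues below, a note on write_coeff_extra above (the multisymbol path): it emits a value's extra bits LSB-first in chunks of at most four, spending one CDF-coded symbol per chunk. A minimal model of just the chunking, where emit(sym, nsyms) stands in for aom_write_cdf with the per-group CDF:

static void toy_write_extra_bits(int val, int nbits,
                                 void (*emit)(int sym, int nsyms)) {
  int done = 0;
  while (done < nbits) {
    const int size = (nbits - done < 4) ? (nbits - done) : 4;
    emit(val & ((1 << size) - 1), 1 << size);  /* one symbol per chunk */
    val >>= size;
    done += size;
  }
}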
- if (!has_dc_skip || (pvq->ac_dc_coded & DC_CODED)) { - generic_encode(w, &adapt->model_dc[plane], - abs(pvq->dq_dc_residue) - has_dc_skip, - &adapt->ex_dc[plane][pvq->bs][0], 2); - } - if ((pvq->ac_dc_coded & DC_CODED)) { - aom_write_bit(w, pvq->dq_dc_residue < 0); - } - } - } // for (idy = 0; -} -#endif // !CONFIG_PVQ - -#if CONFIG_VAR_TX && !CONFIG_COEF_INTERLEAVE -#if CONFIG_LV_MAP -static void pack_txb_tokens(aom_writer *w, -#if CONFIG_LV_MAP - AV1_COMMON *cm, -#endif // CONFIG_LV_MAP +static void pack_txb_tokens(aom_writer *w, AV1_COMMON *cm, MACROBLOCK *const x, const TOKENEXTRA **tp, - const TOKENEXTRA *const tok_end, -#if CONFIG_PVQ || CONFIG_LV_MAP - MACROBLOCK *const x, -#endif - MACROBLOCKD *xd, MB_MODE_INFO *mbmi, int plane, + const TOKENEXTRA *const tok_end, MACROBLOCKD *xd, + MB_MODE_INFO *mbmi, int plane, BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth, int block, int blk_row, int blk_col, TX_SIZE tx_size, TOKEN_STATS *token_stats) { - const struct macroblockd_plane *const pd = &xd->plane[plane]; - const BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; - const int tx_row = blk_row >> (1 - pd->subsampling_y); - const int tx_col = blk_col >> (1 - pd->subsampling_x); - TX_SIZE plane_tx_size; const int max_blocks_high = max_block_high(xd, plane_bsize, plane); const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; - plane_tx_size = - plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0] - : mbmi->inter_tx_size[tx_row][tx_col]; - - if (tx_size == plane_tx_size) { - TOKEN_STATS tmp_token_stats; - init_token_stats(&tmp_token_stats); + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE plane_tx_size = + plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x, + pd->subsampling_y) + : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row, + blk_col)]; -#if !CONFIG_PVQ + if (tx_size == plane_tx_size || plane) { tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block); - uint16_t eob = x->mbmi_ext->eobs[plane][block]; + const uint16_t eob = x->mbmi_ext->eobs[plane][block]; TXB_CTX txb_ctx = { x->mbmi_ext->txb_skip_ctx[plane][block], x->mbmi_ext->dc_sign_ctx[plane][block] }; - av1_write_coeffs_txb(cm, xd, w, blk_row, blk_col, block, plane, tx_size, - tcoeff, eob, &txb_ctx); -#else - pack_pvq_tokens(w, x, xd, plane, bsize, tx_size); -#endif + av1_write_coeffs_txb(cm, xd, w, blk_row, blk_col, plane, tx_size, tcoeff, + eob, &txb_ctx); #if CONFIG_RD_DEBUG - token_stats->txb_coeff_cost_map[blk_row][blk_col] = tmp_token_stats.cost; - token_stats->cost += tmp_token_stats.cost; -#endif - } else { - const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; - const int bsl = tx_size_wide_unit[sub_txs]; - int i; - - assert(bsl > 0); - - for (i = 0; i < 4; ++i) { - const int offsetr = blk_row + (i >> 1) * bsl; - const int offsetc = blk_col + (i & 0x01) * bsl; - const int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs]; - - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; - - pack_txb_tokens(w, -#if CONFIG_LV_MAP - cm, -#endif - tp, tok_end, -#if CONFIG_PVQ || CONFIG_LV_MAP - x, -#endif - xd, mbmi, plane, plane_bsize, bit_depth, block, offsetr, - offsetc, sub_txs, token_stats); - block += step; - } - } -} -#else // CONFIG_LV_MAP -static void pack_txb_tokens(aom_writer *w, const TOKENEXTRA **tp, - const TOKENEXTRA *const tok_end, -#if CONFIG_PVQ - MACROBLOCK *const x, -#endif - MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
int plane, - BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth, - int block, int blk_row, int blk_col, - TX_SIZE tx_size, TOKEN_STATS *token_stats) { - const struct macroblockd_plane *const pd = &xd->plane[plane]; - const BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; - const int tx_row = blk_row >> (1 - pd->subsampling_y); - const int tx_col = blk_col >> (1 - pd->subsampling_x); - TX_SIZE plane_tx_size; - const int max_blocks_high = max_block_high(xd, plane_bsize, plane); - const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - TX_TYPE tx_type = av1_get_tx_type(plane ? PLANE_TYPE_UV : PLANE_TYPE_Y, xd, - blk_row, blk_col, block, tx_size); -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - - if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; - - plane_tx_size = - plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0] - : mbmi->inter_tx_size[tx_row][tx_col]; - - if (tx_size == plane_tx_size) { TOKEN_STATS tmp_token_stats; init_token_stats(&tmp_token_stats); -#if !CONFIG_PVQ - pack_mb_tokens(w, tp, tok_end, bit_depth, tx_size, -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - tx_type, is_inter_block(mbmi), -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - &tmp_token_stats); -#else - pack_pvq_tokens(w, x, xd, plane, bsize, tx_size); -#endif -#if CONFIG_RD_DEBUG token_stats->txb_coeff_cost_map[blk_row][blk_col] = tmp_token_stats.cost; token_stats->cost += tmp_token_stats.cost; #endif } else { -#if CONFIG_RECT_TX_EXT - int is_qttx = plane_tx_size == quarter_txsize_lookup[plane_bsize]; - const TX_SIZE sub_txs = is_qttx ? plane_tx_size : sub_tx_size_map[tx_size]; -#else const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; -#endif - const int bsl = tx_size_wide_unit[sub_txs]; - int i; - - assert(bsl > 0); - - for (i = 0; i < 4; ++i) { -#if CONFIG_RECT_TX_EXT - int is_wide_tx = tx_size_wide_unit[sub_txs] > tx_size_high_unit[sub_txs]; - const int offsetr = - is_qttx ? (is_wide_tx ? i * tx_size_high_unit[sub_txs] : 0) - : blk_row + (i >> 1) * bsl; - const int offsetc = - is_qttx ? (is_wide_tx ? 
0 : i * tx_size_wide_unit[sub_txs]) - : blk_col + (i & 0x01) * bsl; -#else - const int offsetr = blk_row + (i >> 1) * bsl; - const int offsetc = blk_col + (i & 0x01) * bsl; -#endif - const int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs]; - - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; - - pack_txb_tokens(w, tp, tok_end, -#if CONFIG_PVQ - x, -#endif - xd, mbmi, plane, plane_bsize, bit_depth, block, offsetr, - offsetc, sub_txs, token_stats); - block += step; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + const int step = bsh * bsw; + + assert(bsw > 0 && bsh > 0); + + for (int r = 0; r < tx_size_high_unit[tx_size]; r += bsh) { + for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw) { + const int offsetr = blk_row + r; + const int offsetc = blk_col + c; + if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + pack_txb_tokens(w, cm, x, tp, tok_end, xd, mbmi, plane, plane_bsize, + bit_depth, block, offsetr, offsetc, sub_txs, + token_stats); + block += step; + } + } + } +} + +static INLINE void set_spatial_segment_id(const AV1_COMMON *const cm, + uint8_t *segment_ids, + BLOCK_SIZE bsize, int mi_row, + int mi_col, int segment_id) { + const int mi_offset = mi_row * cm->mi_cols + mi_col; + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + const int xmis = AOMMIN(cm->mi_cols - mi_col, bw); + const int ymis = AOMMIN(cm->mi_rows - mi_row, bh); + int x, y; + + for (y = 0; y < ymis; ++y) + for (x = 0; x < xmis; ++x) + segment_ids[mi_offset + y * cm->mi_cols + x] = segment_id; +} + +int av1_neg_interleave(int x, int ref, int max) { + assert(x < max); + const int diff = x - ref; + if (!ref) return x; + if (ref >= (max - 1)) return -x + max - 1; + if (2 * ref < max) { + if (abs(diff) <= ref) { + if (diff > 0) + return (diff << 1) - 1; + else + return ((-diff) << 1); + } + return x; + } else { + if (abs(diff) < (max - ref)) { + if (diff > 0) + return (diff << 1) - 1; + else + return ((-diff) << 1); } + return (max - x) - 1; } } -#endif // CONFIG_LV_MAP -#endif // CONFIG_VAR_TX -static void write_segment_id(aom_writer *w, const struct segmentation *seg, - struct segmentation_probs *segp, int segment_id) { - if (seg->enabled && seg->update_map) { - aom_write_symbol(w, segment_id, segp->tree_cdf, MAX_SEGMENTS); +static void write_segment_id(AV1_COMP *cpi, const MB_MODE_INFO *const mbmi, + aom_writer *w, const struct segmentation *seg, + struct segmentation_probs *segp, int mi_row, + int mi_col, int skip) { + if (!seg->enabled || !seg->update_map) return; + + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + int cdf_num; + const int pred = av1_get_spatial_seg_pred(cm, xd, mi_row, mi_col, &cdf_num); + + if (skip) { + // Still need to transmit tx size for intra blocks even if skip is + // true. Changing segment_id may make the tx size become invalid, e.g + // changing from lossless to lossy. 
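The assert below enforces exactly that. Separately, the new av1_neg_interleave helper above deserves a worked example, since it is what makes the spatial prediction pay off: it re-indexes segment ids around the predicted id so that ids close to the prediction get small coded symbols. A standalone copy of the mapping, with the resulting table for ref = 3, max = 8:

#include <stdlib.h>

/* With ref = 3, max = 8:
 *   x:    0  1  2  3  4  5  6  7
 *   code: 6  4  2  0  1  3  5  7
 * so the coded symbol grows with distance from the predicted id. */
static int toy_neg_interleave(int x, int ref, int max) {
  const int diff = x - ref;
  if (!ref) return x;
  if (ref >= max - 1) return max - 1 - x;
  if (2 * ref < max) {
    if (abs(diff) <= ref) return diff > 0 ? 2 * diff - 1 : -2 * diff;
    return x;
  } else {
    if (abs(diff) < max - ref) return diff > 0 ? 2 * diff - 1 : -2 * diff;
    return max - 1 - x;
  }
}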
+ assert(is_inter_block(mbmi) || !cpi->has_lossless_segment); + + set_spatial_segment_id(cm, cm->current_frame_seg_map, mbmi->sb_type, mi_row, + mi_col, pred); + set_spatial_segment_id(cm, cpi->segmentation_map, mbmi->sb_type, mi_row, + mi_col, pred); + /* mbmi is read only but we need to update segment_id */ + ((MB_MODE_INFO *)mbmi)->segment_id = pred; + return; } + + const int coded_id = + av1_neg_interleave(mbmi->segment_id, pred, seg->last_active_segid + 1); + aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num]; + aom_write_symbol(w, coded_id, pred_cdf, MAX_SEGMENTS); + set_spatial_segment_id(cm, cm->current_frame_seg_map, mbmi->sb_type, mi_row, + mi_col, mbmi->segment_id); } -#if CONFIG_NEW_MULTISYMBOL #define WRITE_REF_BIT(bname, pname) \ - aom_write_symbol(w, bname, av1_get_pred_cdf_##pname(cm, xd), 2) -#define WRITE_REF_BIT2(bname, pname) \ aom_write_symbol(w, bname, av1_get_pred_cdf_##pname(xd), 2) -#else -#define WRITE_REF_BIT(bname, pname) \ - aom_write(w, bname, av1_get_pred_prob_##pname(cm, xd)) -#define WRITE_REF_BIT2(bname, pname) \ - aom_write(w, bname, av1_get_pred_prob_##pname(cm, xd)) -#endif // This function encodes the reference frame static void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd, aom_writer *w) { - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const MB_MODE_INFO *const mbmi = xd->mi[0]; const int is_compound = has_second_ref(mbmi); const int segment_id = mbmi->segment_id; @@ -1006,75 +493,40 @@ static void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd, assert(!is_compound); assert(mbmi->ref_frame[0] == get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME)); + } else if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) || + segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + assert(!is_compound); + assert(mbmi->ref_frame[0] == LAST_FRAME); } else { // does the feature use compound prediction or not // (if not specified at the frame/segment level) if (cm->reference_mode == REFERENCE_MODE_SELECT) { if (is_comp_ref_allowed(mbmi->sb_type)) -#if CONFIG_NEW_MULTISYMBOL - aom_write_symbol(w, is_compound, av1_get_reference_mode_cdf(cm, xd), 2); -#else - aom_write(w, is_compound, av1_get_reference_mode_prob(cm, xd)); -#endif // CONFIG_NEW_MULTISYMBOL + aom_write_symbol(w, is_compound, av1_get_reference_mode_cdf(xd), 2); } else { assert((!is_compound) == (cm->reference_mode == SINGLE_REFERENCE)); } if (is_compound) { -#if CONFIG_EXT_COMP_REFS const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi) ? 
UNIDIR_COMP_REFERENCE : BIDIR_COMP_REFERENCE; -#if USE_UNI_COMP_REFS -#if CONFIG_VAR_REFS - if ((L_OR_L2(cm) || L3_OR_G(cm)) && BWD_OR_ALT(cm)) - if (L_AND_L2(cm) || L_AND_L3(cm) || L_AND_G(cm) || BWD_AND_ALT(cm)) -#endif // CONFIG_VAR_REFS -#if CONFIG_NEW_MULTISYMBOL - aom_write_symbol(w, comp_ref_type, - av1_get_comp_reference_type_cdf(xd), 2); -#else - aom_write(w, comp_ref_type, av1_get_comp_reference_type_prob(cm, xd)); -#endif -#if CONFIG_VAR_REFS - else - assert(comp_ref_type == BIDIR_COMP_REFERENCE); - else - assert(comp_ref_type == UNIDIR_COMP_REFERENCE); -#endif // CONFIG_VAR_REFS -#else // !USE_UNI_COMP_REFS - // NOTE: uni-directional comp refs disabled - assert(comp_ref_type == BIDIR_COMP_REFERENCE); -#endif // USE_UNI_COMP_REFS + aom_write_symbol(w, comp_ref_type, av1_get_comp_reference_type_cdf(xd), + 2); if (comp_ref_type == UNIDIR_COMP_REFERENCE) { const int bit = mbmi->ref_frame[0] == BWDREF_FRAME; -#if CONFIG_VAR_REFS - if ((L_AND_L2(cm) || L_AND_L3(cm) || L_AND_G(cm)) && BWD_AND_ALT(cm)) -#endif // CONFIG_VAR_REFS - WRITE_REF_BIT2(bit, uni_comp_ref_p); + WRITE_REF_BIT(bit, uni_comp_ref_p); if (!bit) { assert(mbmi->ref_frame[0] == LAST_FRAME); -#if CONFIG_VAR_REFS - if (L_AND_L2(cm) && (L_AND_L3(cm) || L_AND_G(cm))) { -#endif // CONFIG_VAR_REFS - const int bit1 = mbmi->ref_frame[1] == LAST3_FRAME || - mbmi->ref_frame[1] == GOLDEN_FRAME; - WRITE_REF_BIT2(bit1, uni_comp_ref_p1); - if (bit1) { -#if CONFIG_VAR_REFS - if (L_AND_L3(cm) && L_AND_G(cm)) { -#endif // CONFIG_VAR_REFS - const int bit2 = mbmi->ref_frame[1] == GOLDEN_FRAME; - WRITE_REF_BIT2(bit2, uni_comp_ref_p2); -#if CONFIG_VAR_REFS - } -#endif // CONFIG_VAR_REFS - } -#if CONFIG_VAR_REFS + const int bit1 = mbmi->ref_frame[1] == LAST3_FRAME || + mbmi->ref_frame[1] == GOLDEN_FRAME; + WRITE_REF_BIT(bit1, uni_comp_ref_p1); + if (bit1) { + const int bit2 = mbmi->ref_frame[1] == GOLDEN_FRAME; + WRITE_REF_BIT(bit2, uni_comp_ref_p2); } -#endif // CONFIG_VAR_REFS } else { assert(mbmi->ref_frame[1] == ALTREF_FRAME); } @@ -1083,213 +535,81 @@ static void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd, } assert(comp_ref_type == BIDIR_COMP_REFERENCE); -#endif // CONFIG_EXT_COMP_REFS -#if CONFIG_EXT_REFS const int bit = (mbmi->ref_frame[0] == GOLDEN_FRAME || mbmi->ref_frame[0] == LAST3_FRAME); -#if CONFIG_VAR_REFS - // Test need to explicitly code (L,L2) vs (L3,G) branch node in tree - if (L_OR_L2(cm) && L3_OR_G(cm)) -#endif // CONFIG_VAR_REFS - WRITE_REF_BIT(bit, comp_ref_p); + WRITE_REF_BIT(bit, comp_ref_p); if (!bit) { -#if CONFIG_VAR_REFS - // Test need to explicitly code (L) vs (L2) branch node in tree - if (L_AND_L2(cm)) { -#endif // CONFIG_VAR_REFS - const int bit1 = mbmi->ref_frame[0] == LAST_FRAME; - WRITE_REF_BIT(bit1, comp_ref_p1); -#if CONFIG_VAR_REFS - } -#endif // CONFIG_VAR_REFS + const int bit1 = mbmi->ref_frame[0] == LAST2_FRAME; + WRITE_REF_BIT(bit1, comp_ref_p1); } else { -#if CONFIG_VAR_REFS - // Test need to explicitly code (L3) vs (G) branch node in tree - if (L3_AND_G(cm)) { -#endif // CONFIG_VAR_REFS - const int bit2 = mbmi->ref_frame[0] == GOLDEN_FRAME; - WRITE_REF_BIT(bit2, comp_ref_p2); -#if CONFIG_VAR_REFS - } -#endif // CONFIG_VAR_REFS + const int bit2 = mbmi->ref_frame[0] == GOLDEN_FRAME; + WRITE_REF_BIT(bit2, comp_ref_p2); } -#if CONFIG_VAR_REFS - // Test need to explicitly code (BWD,ALT2) vs (ALT) branch node in tree - if (BWD_OR_ALT2(cm) && ALTREF_IS_VALID(cm)) { -#endif // CONFIG_VAR_REFS - const int bit_bwd = mbmi->ref_frame[1] == ALTREF_FRAME; - WRITE_REF_BIT(bit_bwd, 
comp_bwdref_p); - - if (!bit_bwd) { -#if CONFIG_VAR_REFS - // Test need to explicitly code (BWD,ALT2) vs (ALT) branch node in - // tree - if (BWD_AND_ALT2(cm)) -#endif // CONFIG_VAR_REFS - WRITE_REF_BIT(mbmi->ref_frame[1] == ALTREF2_FRAME, comp_bwdref_p1); - } -#if CONFIG_VAR_REFS + const int bit_bwd = mbmi->ref_frame[1] == ALTREF_FRAME; + WRITE_REF_BIT(bit_bwd, comp_bwdref_p); + + if (!bit_bwd) { + WRITE_REF_BIT(mbmi->ref_frame[1] == ALTREF2_FRAME, comp_bwdref_p1); } -#endif // CONFIG_VAR_REFS -#else // !CONFIG_EXT_REFS - const int bit = mbmi->ref_frame[0] == GOLDEN_FRAME; - WRITE_REF_BIT(bit, comp_ref_p); -#endif // CONFIG_EXT_REFS } else { -#if CONFIG_EXT_REFS const int bit0 = (mbmi->ref_frame[0] <= ALTREF_FRAME && mbmi->ref_frame[0] >= BWDREF_FRAME); -#if CONFIG_VAR_REFS - // Test need to explicitly code (L,L2,L3,G) vs (BWD,ALT2,ALT) branch node - // in tree - if ((L_OR_L2(cm) || L3_OR_G(cm)) && - (BWD_OR_ALT2(cm) || ALTREF_IS_VALID(cm))) -#endif // CONFIG_VAR_REFS - WRITE_REF_BIT(bit0, single_ref_p1); + WRITE_REF_BIT(bit0, single_ref_p1); if (bit0) { -#if CONFIG_VAR_REFS - // Test need to explicitly code (BWD,ALT2) vs (ALT) branch node in tree - if (BWD_OR_ALT2(cm) && ALTREF_IS_VALID(cm)) { -#endif // CONFIG_VAR_REFS - const int bit1 = mbmi->ref_frame[0] == ALTREF_FRAME; - WRITE_REF_BIT(bit1, single_ref_p2); - - if (!bit1) { -#if CONFIG_VAR_REFS - // Test need to explicitly code (BWD) vs (ALT2) branch node in tree - if (BWD_AND_ALT2(cm)) -#endif // CONFIG_VAR_REFS - WRITE_REF_BIT(mbmi->ref_frame[0] == ALTREF2_FRAME, single_ref_p6); - } -#if CONFIG_VAR_REFS + const int bit1 = mbmi->ref_frame[0] == ALTREF_FRAME; + WRITE_REF_BIT(bit1, single_ref_p2); + + if (!bit1) { + WRITE_REF_BIT(mbmi->ref_frame[0] == ALTREF2_FRAME, single_ref_p6); } -#endif // CONFIG_VAR_REFS } else { const int bit2 = (mbmi->ref_frame[0] == LAST3_FRAME || mbmi->ref_frame[0] == GOLDEN_FRAME); -#if CONFIG_VAR_REFS - // Test need to explicitly code (L,L2) vs (L3,G) branch node in tree - if (L_OR_L2(cm) && L3_OR_G(cm)) -#endif // CONFIG_VAR_REFS - WRITE_REF_BIT(bit2, single_ref_p3); + WRITE_REF_BIT(bit2, single_ref_p3); if (!bit2) { -#if CONFIG_VAR_REFS - // Test need to explicitly code (L) vs (L2) branch node in tree - if (L_AND_L2(cm)) { -#endif // CONFIG_VAR_REFS - const int bit3 = mbmi->ref_frame[0] != LAST_FRAME; - WRITE_REF_BIT(bit3, single_ref_p4); -#if CONFIG_VAR_REFS - } -#endif // CONFIG_VAR_REFS + const int bit3 = mbmi->ref_frame[0] != LAST_FRAME; + WRITE_REF_BIT(bit3, single_ref_p4); } else { -#if CONFIG_VAR_REFS - // Test need to explicitly code (L3) vs (G) branch node in tree - if (L3_AND_G(cm)) { -#endif // CONFIG_VAR_REFS - const int bit4 = mbmi->ref_frame[0] != LAST3_FRAME; - WRITE_REF_BIT(bit4, single_ref_p5); -#if CONFIG_VAR_REFS - } -#endif // CONFIG_VAR_REFS + const int bit4 = mbmi->ref_frame[0] != LAST3_FRAME; + WRITE_REF_BIT(bit4, single_ref_p5); } } -#else // !CONFIG_EXT_REFS - const int bit0 = mbmi->ref_frame[0] != LAST_FRAME; - WRITE_REF_BIT(bit0, single_ref_p1); - - if (bit0) { - const int bit1 = mbmi->ref_frame[0] != GOLDEN_FRAME; - WRITE_REF_BIT(bit1, single_ref_p2); - } -#endif // CONFIG_EXT_REFS } } } -#if CONFIG_FILTER_INTRA -static void write_filter_intra_mode_info(const AV1_COMMON *const cm, +static void write_filter_intra_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi, - int mi_row, int mi_col, aom_writer *w) { - if (mbmi->mode == DC_PRED && mbmi->palette_mode_info.palette_size[0] == 0) { - aom_write(w, 
mbmi->filter_intra_mode_info.use_filter_intra_mode[0], - cm->fc->filter_intra_probs[0]); - if (mbmi->filter_intra_mode_info.use_filter_intra_mode[0]) { - const FILTER_INTRA_MODE mode = - mbmi->filter_intra_mode_info.filter_intra_mode[0]; - write_uniform(w, FILTER_INTRA_MODES, mode); - } - } - -#if CONFIG_CB4X4 - if (!is_chroma_reference(mi_row, mi_col, mbmi->sb_type, - xd->plane[1].subsampling_x, - xd->plane[1].subsampling_y)) - return; -#else - (void)xd; - (void)mi_row; - (void)mi_col; -#endif // CONFIG_CB4X4 - - if (mbmi->uv_mode == UV_DC_PRED && - mbmi->palette_mode_info.palette_size[1] == 0) { - aom_write(w, mbmi->filter_intra_mode_info.use_filter_intra_mode[1], - cm->fc->filter_intra_probs[1]); - if (mbmi->filter_intra_mode_info.use_filter_intra_mode[1]) { + if (av1_filter_intra_allowed(cm, mbmi)) { + aom_write_symbol(w, mbmi->filter_intra_mode_info.use_filter_intra, + xd->tile_ctx->filter_intra_cdfs[mbmi->sb_type], 2); + if (mbmi->filter_intra_mode_info.use_filter_intra) { const FILTER_INTRA_MODE mode = - mbmi->filter_intra_mode_info.filter_intra_mode[1]; - write_uniform(w, FILTER_INTRA_MODES, mode); + mbmi->filter_intra_mode_info.filter_intra_mode; + aom_write_symbol(w, mode, xd->tile_ctx->filter_intra_mode_cdf, + FILTER_INTRA_MODES); } } } -#endif // CONFIG_FILTER_INTRA -#if CONFIG_EXT_INTRA -static void write_intra_angle_info(const MACROBLOCKD *xd, - FRAME_CONTEXT *const ec_ctx, aom_writer *w) { - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const BLOCK_SIZE bsize = mbmi->sb_type; -#if CONFIG_INTRA_INTERP - const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd); - int p_angle; -#endif // CONFIG_INTRA_INTERP - - (void)ec_ctx; - if (!av1_use_angle_delta(bsize)) return; - - if (av1_is_directional_mode(mbmi->mode, bsize)) { - write_uniform(w, 2 * MAX_ANGLE_DELTA + 1, - MAX_ANGLE_DELTA + mbmi->angle_delta[0]); -#if CONFIG_INTRA_INTERP - p_angle = mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP; - if (av1_is_intra_filter_switchable(p_angle)) { - aom_write_symbol(w, mbmi->intra_filter, - ec_ctx->intra_filter_cdf[intra_filter_ctx], - INTRA_FILTERS); - } -#endif // CONFIG_INTRA_INTERP - } - - if (av1_is_directional_mode(get_uv_mode(mbmi->uv_mode), bsize)) { - write_uniform(w, 2 * MAX_ANGLE_DELTA + 1, - MAX_ANGLE_DELTA + mbmi->angle_delta[1]); - } +static void write_angle_delta(aom_writer *w, int angle_delta, + aom_cdf_prob *cdf) { + aom_write_symbol(w, angle_delta + MAX_ANGLE_DELTA, cdf, + 2 * MAX_ANGLE_DELTA + 1); } -#endif // CONFIG_EXT_INTRA static void write_mb_interp_filter(AV1_COMP *cpi, const MACROBLOCKD *xd, aom_writer *w) { AV1_COMMON *const cm = &cpi->common; - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const MB_MODE_INFO *const mbmi = xd->mi[0]; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; if (!av1_is_interp_needed(xd)) { @@ -1299,36 +619,19 @@ static void write_mb_interp_filter(AV1_COMP *cpi, const MACROBLOCKD *xd, return; } if (cm->interp_filter == SWITCHABLE) { -#if CONFIG_DUAL_FILTER int dir; for (dir = 0; dir < 2; ++dir) { - if (has_subpel_mv_component(xd->mi[0], xd, dir) || - (mbmi->ref_frame[1] > INTRA_FRAME && - has_subpel_mv_component(xd->mi[0], xd, dir + 2))) { - const int ctx = av1_get_pred_context_switchable_interp(xd, dir); - InterpFilter filter = - av1_extract_interp_filter(mbmi->interp_filters, dir); - aom_write_symbol(w, filter, ec_ctx->switchable_interp_cdf[ctx], - SWITCHABLE_FILTERS); - ++cpi->interp_filter_selected[0][filter]; - } else { - assert(av1_extract_interp_filter(mbmi->interp_filters, dir) == - 
EIGHTTAP_REGULAR); - } - } -#else - { - const int ctx = av1_get_pred_context_switchable_interp(xd); - InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, 0); + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + InterpFilter filter = + av1_extract_interp_filter(mbmi->interp_filters, dir); aom_write_symbol(w, filter, ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS); ++cpi->interp_filter_selected[0][filter]; + if (cm->seq_params.enable_dual_filter == 0) return; } -#endif // CONFIG_DUAL_FILTER } } -#if CONFIG_PALETTE_DELTA_ENCODING // Transmit color values with delta encoding. Write the first value as // literal, and the deltas between each value and the previous one. "min_val" is // the smallest possible value of the deltas. @@ -1446,207 +749,90 @@ static void write_palette_colors_uv(const MACROBLOCKD *const xd, } } } -#endif // CONFIG_PALETTE_DELTA_ENCODING static void write_palette_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd, - const MODE_INFO *const mi, aom_writer *w) { - const MB_MODE_INFO *const mbmi = &mi->mbmi; - const MODE_INFO *const above_mi = xd->above_mi; - const MODE_INFO *const left_mi = xd->left_mi; + const MB_MODE_INFO *const mbmi, int mi_row, + int mi_col, aom_writer *w) { + const int num_planes = av1_num_planes(cm); const BLOCK_SIZE bsize = mbmi->sb_type; + assert(av1_allow_palette(cm->allow_screen_content_tools, bsize)); const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; - - assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_LARGEST); - const int block_palette_idx = bsize - BLOCK_8X8; + const int bsize_ctx = av1_get_palette_bsize_ctx(bsize); if (mbmi->mode == DC_PRED) { const int n = pmi->palette_size[0]; - int palette_y_mode_ctx = 0; - if (above_mi) { - palette_y_mode_ctx += - (above_mi->mbmi.palette_mode_info.palette_size[0] > 0); - } - if (left_mi) { - palette_y_mode_ctx += - (left_mi->mbmi.palette_mode_info.palette_size[0] > 0); - } -#if CONFIG_NEW_MULTISYMBOL + const int palette_y_mode_ctx = av1_get_palette_mode_ctx(xd); aom_write_symbol( w, n > 0, - xd->tile_ctx->palette_y_mode_cdf[block_palette_idx][palette_y_mode_ctx], - 2); -#else - aom_write( - w, n > 0, - av1_default_palette_y_mode_prob[block_palette_idx][palette_y_mode_ctx]); -#endif + xd->tile_ctx->palette_y_mode_cdf[bsize_ctx][palette_y_mode_ctx], 2); if (n > 0) { aom_write_symbol(w, n - PALETTE_MIN_SIZE, - xd->tile_ctx->palette_y_size_cdf[block_palette_idx], + xd->tile_ctx->palette_y_size_cdf[bsize_ctx], PALETTE_SIZES); -#if CONFIG_PALETTE_DELTA_ENCODING write_palette_colors_y(xd, pmi, cm->bit_depth, w); -#else - for (int i = 0; i < n; ++i) { - assert(pmi->palette_colors[i] < (1 << cm->bit_depth)); - aom_write_literal(w, pmi->palette_colors[i], cm->bit_depth); - } -#endif // CONFIG_PALETTE_DELTA_ENCODING } } - if (mbmi->uv_mode == UV_DC_PRED) { + const int uv_dc_pred = + num_planes > 1 && mbmi->uv_mode == UV_DC_PRED && + is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, + xd->plane[1].subsampling_y); + if (uv_dc_pred) { const int n = pmi->palette_size[1]; const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0); -#if CONFIG_NEW_MULTISYMBOL aom_write_symbol(w, n > 0, xd->tile_ctx->palette_uv_mode_cdf[palette_uv_mode_ctx], 2); -#else - aom_write(w, n > 0, av1_default_palette_uv_mode_prob[palette_uv_mode_ctx]); -#endif if (n > 0) { aom_write_symbol(w, n - PALETTE_MIN_SIZE, - xd->tile_ctx->palette_uv_size_cdf[block_palette_idx], + xd->tile_ctx->palette_uv_size_cdf[bsize_ctx], PALETTE_SIZES); -#if CONFIG_PALETTE_DELTA_ENCODING 
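/* Illustrative aside, not part of the change under review: the palette
 * writers above send the first (sorted) color as a raw literal and every
 * later color as a positive delta from its predecessor. A simplified,
 * standalone sketch of that layout -- the real write_palette_colors_y() /
 * write_palette_colors_uv() additionally signal the delta bit-width and a
 * minimum delta ("min_val"), which this sketch omits:
 */
#include <stdio.h>
static void sketch_palette_colors(const int *colors, int n, int bit_depth) {
  printf("literal (%d bits): %d\n", bit_depth, colors[0]);
  for (int i = 1; i < n; ++i)
    printf("delta: %d\n", colors[i] - colors[i - 1]);  // > 0, colors sorted
}
int main(void) {
  const int palette[4] = { 10, 14, 29, 35 };  // a sorted 8-bit luma palette
  sketch_palette_colors(palette, 4, /*bit_depth=*/8);
  return 0;
}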
write_palette_colors_uv(xd, pmi, cm->bit_depth, w); -#else - for (int i = 0; i < n; ++i) { - assert(pmi->palette_colors[PALETTE_MAX_SIZE + i] < - (1 << cm->bit_depth)); - assert(pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] < - (1 << cm->bit_depth)); - aom_write_literal(w, pmi->palette_colors[PALETTE_MAX_SIZE + i], - cm->bit_depth); - aom_write_literal(w, pmi->palette_colors[2 * PALETTE_MAX_SIZE + i], - cm->bit_depth); - } -#endif // CONFIG_PALETTE_DELTA_ENCODING } } } void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, -#if CONFIG_SUPERTX - const int supertx_enabled, -#endif -#if CONFIG_TXK_SEL - int blk_row, int blk_col, int block, int plane, - TX_SIZE tx_size, -#endif + int blk_row, int blk_col, int plane, TX_SIZE tx_size, aom_writer *w) { - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *mbmi = xd->mi[0]; const int is_inter = is_inter_block(mbmi); -#if !CONFIG_TXK_SEL -#if CONFIG_VAR_TX - const TX_SIZE tx_size = is_inter ? mbmi->min_tx_size : mbmi->tx_size; -#else - const TX_SIZE tx_size = mbmi->tx_size; -#endif // CONFIG_VAR_TX -#endif // !CONFIG_TXK_SEL FRAME_CONTEXT *ec_ctx = xd->tile_ctx; -#if !CONFIG_TXK_SEL - TX_TYPE tx_type = mbmi->tx_type; -#else // Only y plane's tx_type is transmitted if (plane > 0) return; PLANE_TYPE plane_type = get_plane_type(plane); - TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); -#endif - - if (!FIXED_TX_TYPE) { -#if CONFIG_EXT_TX - const TX_SIZE square_tx_size = txsize_sqr_map[tx_size]; - const BLOCK_SIZE bsize = mbmi->sb_type; - if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) > - 1 && - ((!cm->seg.enabled && cm->base_qindex > 0) || - (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) && - !mbmi->skip && -#if CONFIG_SUPERTX - !supertx_enabled && -#endif // CONFIG_SUPERTX - !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { -#if CONFIG_MRC_TX - if (tx_type == MRC_DCT) - assert(mbmi->valid_mrc_mask && "Invalid MRC mask"); -#endif // CONFIG_MRC_TX - const TxSetType tx_set_type = get_ext_tx_set_type( - tx_size, bsize, is_inter, cm->reduced_tx_set_used); - const int eset = - get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used); - // eset == 0 should correspond to a set with only DCT_DCT and there - // is no need to send the tx_type - assert(eset > 0); - assert(av1_ext_tx_used[tx_set_type][tx_type]); -#if !CONFIG_LGT_FROM_PRED - if (is_inter) { - aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][tx_type], - ec_ctx->inter_ext_tx_cdf[eset][square_tx_size], - av1_num_ext_tx_set[tx_set_type]); - } else if (ALLOW_INTRA_EXT_TX) { - aom_write_symbol( - w, av1_ext_tx_ind[tx_set_type][tx_type], - ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][mbmi->mode], - av1_num_ext_tx_set[tx_set_type]); - } -#else - // only signal tx_type when lgt is not allowed or not selected - if (is_inter) { - if (LGT_FROM_PRED_INTER) { - if (is_lgt_allowed(mbmi->mode, tx_size) && !cm->reduced_tx_set_used) - aom_write(w, mbmi->use_lgt, ec_ctx->inter_lgt_prob[square_tx_size]); - if (!mbmi->use_lgt) - aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][tx_type], - ec_ctx->inter_ext_tx_cdf[eset][square_tx_size], - av1_num_ext_tx_set[tx_set_type]); - } else { - aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][tx_type], - ec_ctx->inter_ext_tx_cdf[eset][square_tx_size], - av1_num_ext_tx_set[tx_set_type]); - } - } else if (ALLOW_INTRA_EXT_TX) { - if (LGT_FROM_PRED_INTRA) { - if (is_lgt_allowed(mbmi->mode, tx_size) && !cm->reduced_tx_set_used) - aom_write(w, mbmi->use_lgt, - 
ec_ctx->intra_lgt_prob[square_tx_size][mbmi->mode]); - if (!mbmi->use_lgt) - aom_write_symbol( - w, av1_ext_tx_ind[tx_set_type][tx_type], - ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][mbmi->mode], - av1_num_ext_tx_set[tx_set_type]); - } else { - aom_write_symbol( - w, av1_ext_tx_ind[tx_set_type][tx_type], - ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][mbmi->mode], - av1_num_ext_tx_set[tx_set_type]); - } - } -#endif // CONFIG_LGT_FROM_PRED - } -#else // CONFIG_EXT_TX - if (tx_size < TX_32X32 && - ((!cm->seg.enabled && cm->base_qindex > 0) || - (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) && - !mbmi->skip && -#if CONFIG_SUPERTX - !supertx_enabled && -#endif // CONFIG_SUPERTX - !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { - if (is_inter) { - aom_write_symbol(w, av1_ext_tx_ind[tx_type], - ec_ctx->inter_ext_tx_cdf[tx_size], TX_TYPES); - } else { - aom_write_symbol( - w, av1_ext_tx_ind[tx_type], - ec_ctx->intra_ext_tx_cdf[tx_size] - [intra_mode_to_tx_type_context[mbmi->mode]], - TX_TYPES); - } + TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, tx_size, + cm->reduced_tx_set_used); + + const TX_SIZE square_tx_size = txsize_sqr_map[tx_size]; + if (get_ext_tx_types(tx_size, is_inter, cm->reduced_tx_set_used) > 1 && + ((!cm->seg.enabled && cm->base_qindex > 0) || + (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) && + !mbmi->skip && + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + const TxSetType tx_set_type = + av1_get_ext_tx_set_type(tx_size, is_inter, cm->reduced_tx_set_used); + const int eset = get_ext_tx_set(tx_size, is_inter, cm->reduced_tx_set_used); + // eset == 0 should correspond to a set with only DCT_DCT and there + // is no need to send the tx_type + assert(eset > 0); + assert(av1_ext_tx_used[tx_set_type][tx_type]); + if (is_inter) { + aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][tx_type], + ec_ctx->inter_ext_tx_cdf[eset][square_tx_size], + av1_num_ext_tx_set[tx_set_type]); + } else { + PREDICTION_MODE intra_dir; + if (mbmi->filter_intra_mode_info.use_filter_intra) + intra_dir = + fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode]; + else + intra_dir = mbmi->mode; + aom_write_symbol( + w, av1_ext_tx_ind[tx_set_type][tx_type], + ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_dir], + av1_num_ext_tx_set[tx_set_type]); } -#endif // CONFIG_EXT_TX } } @@ -1658,14 +844,12 @@ static void write_intra_mode(FRAME_CONTEXT *frame_ctx, BLOCK_SIZE bsize, static void write_intra_uv_mode(FRAME_CONTEXT *frame_ctx, UV_PREDICTION_MODE uv_mode, - PREDICTION_MODE y_mode, aom_writer *w) { -#if !CONFIG_CFL - uv_mode = get_uv_mode(uv_mode); -#endif - aom_write_symbol(w, uv_mode, frame_ctx->uv_mode_cdf[y_mode], UV_INTRA_MODES); + PREDICTION_MODE y_mode, + CFL_ALLOWED_TYPE cfl_allowed, aom_writer *w) { + aom_write_symbol(w, uv_mode, frame_ctx->uv_mode_cdf[cfl_allowed][y_mode], + UV_INTRA_MODES - !cfl_allowed); } -#if CONFIG_CFL static void write_cfl_alphas(FRAME_CONTEXT *const ec_ctx, int idx, int joint_sign, aom_writer *w) { aom_write_symbol(w, joint_sign, ec_ctx->cfl_sign_cdf, CFL_JOINT_SIGNS); @@ -1679,23 +863,85 @@ static void write_cfl_alphas(FRAME_CONTEXT *const ec_ctx, int idx, aom_write_symbol(w, CFL_IDX_V(idx), cdf_v, CFL_ALPHABET_SIZE); } } -#endif + +static void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd, aom_writer *w, + int skip, int mi_col, int mi_row) { + if (cm->coded_lossless || cm->allow_intrabc) { + // Initialize to indicate no CDEF for safety. 
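/* Illustrative aside, not part of the change under review: write_cdef()
 * here signals one CDEF strength per 64x64 filter unit, at the first
 * non-skip block visited inside that unit; xd->cdef_preset[] below caches
 * which of the (up to four) units of the superblock are already signalled.
 * The quadrant arithmetic, standalone, assuming MI_SIZE_LOG2 == 2 as in
 * this tree (one mi unit == 4 luma pixels):
 */
#include <stdio.h>
#define MI_SIZE_LOG2 2
static int cdef_unit_index(int mi_row, int mi_col, int sb_is_128x128) {
  const int mask = 1 << (6 - MI_SIZE_LOG2);  // 64 pixels == 16 mi units
  // A 128x128 superblock holds four 64x64 CDEF units, a 64x64 one holds one.
  return sb_is_128x128 ? !!(mi_col & mask) + 2 * !!(mi_row & mask) : 0;
}
int main(void) {
  // The four 64x64 quadrants of the 128x128 superblock at mi (0, 0):
  printf("%d %d %d %d\n", cdef_unit_index(0, 0, 1), cdef_unit_index(0, 16, 1),
         cdef_unit_index(16, 0, 1), cdef_unit_index(16, 16, 1));  // 0 1 2 3
  return 0;
}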
+    cm->cdef_bits = 0;
+    cm->cdef_strengths[0] = 0;
+    cm->nb_cdef_strengths = 1;
+    cm->cdef_uv_strengths[0] = 0;
+    return;
+  }
+
+  const int m = ~((1 << (6 - MI_SIZE_LOG2)) - 1);
+  const MB_MODE_INFO *mbmi =
+      cm->mi_grid_visible[(mi_row & m) * cm->mi_stride + (mi_col & m)];
+  // Initialise when at top left part of the superblock
+  if (!(mi_row & (cm->seq_params.mib_size - 1)) &&
+      !(mi_col & (cm->seq_params.mib_size - 1))) {  // Top left?
+    xd->cdef_preset[0] = xd->cdef_preset[1] = xd->cdef_preset[2] =
+        xd->cdef_preset[3] = -1;
+  }
+
+  // Emit CDEF param at first non-skip coding block
+  const int mask = 1 << (6 - MI_SIZE_LOG2);
+  const int index = cm->seq_params.sb_size == BLOCK_128X128
+                        ? !!(mi_col & mask) + 2 * !!(mi_row & mask)
+                        : 0;
+  if (xd->cdef_preset[index] == -1 && !skip) {
+    aom_write_literal(w, mbmi->cdef_strength, cm->cdef_bits);
+    xd->cdef_preset[index] = mbmi->cdef_strength;
+  }
+}
+
+static void write_inter_segment_id(AV1_COMP *cpi, aom_writer *w,
+                                   const struct segmentation *const seg,
+                                   struct segmentation_probs *const segp,
+                                   int mi_row, int mi_col, int skip,
+                                   int preskip) {
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  AV1_COMMON *const cm = &cpi->common;
+
+  if (seg->update_map) {
+    if (preskip) {
+      if (!seg->segid_preskip) return;
+    } else {
+      if (seg->segid_preskip) return;
+      if (skip) {
+        write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 1);
+        if (seg->temporal_update) ((MB_MODE_INFO *)mbmi)->seg_id_predicted = 0;
+        return;
+      }
+    }
+    if (seg->temporal_update) {
+      const int pred_flag = mbmi->seg_id_predicted;
+      aom_cdf_prob *pred_cdf = av1_get_pred_cdf_seg_id(segp, xd);
+      aom_write_symbol(w, pred_flag, pred_cdf, 2);
+      if (!pred_flag) {
+        write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 0);
+      }
+      if (pred_flag) {
+        set_spatial_segment_id(cm, cm->current_frame_seg_map, mbmi->sb_type,
+                               mi_row, mi_col, mbmi->segment_id);
+      }
+    } else {
+      write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 0);
+    }
+  }
+}

 static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
-                                const int mi_col,
-#if CONFIG_SUPERTX
-                                int supertx_enabled,
-#endif
-                                aom_writer *w) {
+                                const int mi_col, aom_writer *w) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-  const MODE_INFO *mi = xd->mi[0];
-
   const struct segmentation *const seg = &cm->seg;
   struct segmentation_probs *const segp = &ec_ctx->seg;
-  const MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
   const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   const PREDICTION_MODE mode = mbmi->mode;
   const int segment_id = mbmi->segment_id;
@@ -1704,595 +950,323 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
   const int is_inter = is_inter_block(mbmi);
   const int is_compound = has_second_ref(mbmi);
   int skip, ref;
-#if CONFIG_CB4X4
-  const int unify_bsize = 1;
-#else
-  const int unify_bsize = 0;
-#endif
   (void)mi_row;
   (void)mi_col;

-  if (seg->update_map) {
-    if (seg->temporal_update) {
-      const int pred_flag = mbmi->seg_id_predicted;
-#if CONFIG_NEW_MULTISYMBOL
-      aom_cdf_prob *pred_cdf = av1_get_pred_cdf_seg_id(segp, xd);
-      aom_write_symbol(w, pred_flag, pred_cdf, 2);
-#else
-      aom_prob pred_prob = av1_get_pred_prob_seg_id(segp, xd);
-      aom_write(w, pred_flag, pred_prob);
-#endif
-      if (!pred_flag) write_segment_id(w, seg, segp, segment_id);
-    } else {
-      write_segment_id(w, seg, segp, segment_id);
-    }
-  }
+  write_inter_segment_id(cpi,
w, seg, segp, mi_row, mi_col, 0, 1); + + write_skip_mode(cm, xd, segment_id, mbmi, w); + + assert(IMPLIES(mbmi->skip_mode, mbmi->skip)); + skip = mbmi->skip_mode ? 1 : write_skip(cm, xd, segment_id, mbmi, w); + + write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, skip, 0); + + write_cdef(cm, xd, w, skip, mi_col, mi_row); -#if CONFIG_SUPERTX - if (supertx_enabled) - skip = mbmi->skip; - else - skip = write_skip(cm, xd, segment_id, mi, w); -#else - skip = write_skip(cm, xd, segment_id, mi, w); -#endif // CONFIG_SUPERTX if (cm->delta_q_present_flag) { int super_block_upper_left = - ((mi_row & MAX_MIB_MASK) == 0) && ((mi_col & MAX_MIB_MASK) == 0); - if ((bsize != BLOCK_LARGEST || skip == 0) && super_block_upper_left) { - assert(mbmi->current_q_index > 0); + ((mi_row & (cm->seq_params.mib_size - 1)) == 0) && + ((mi_col & (cm->seq_params.mib_size - 1)) == 0); + if ((bsize != cm->seq_params.sb_size || skip == 0) && + super_block_upper_left) { + assert(mbmi->current_qindex > 0); int reduced_delta_qindex = - (mbmi->current_q_index - xd->prev_qindex) / cm->delta_q_res; - write_delta_qindex(cm, xd, reduced_delta_qindex, w); - xd->prev_qindex = mbmi->current_q_index; -#if CONFIG_EXT_DELTA_Q -#if CONFIG_LOOPFILTER_LEVEL + (mbmi->current_qindex - xd->current_qindex) / cm->delta_q_res; + write_delta_qindex(xd, reduced_delta_qindex, w); + xd->current_qindex = mbmi->current_qindex; if (cm->delta_lf_present_flag) { if (cm->delta_lf_multi) { - for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) { + const int frame_lf_count = + av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { int reduced_delta_lflevel = - (mbmi->curr_delta_lf[lf_id] - xd->prev_delta_lf[lf_id]) / + (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) / cm->delta_lf_res; write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, w); - xd->prev_delta_lf[lf_id] = mbmi->curr_delta_lf[lf_id]; + xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id]; } } else { int reduced_delta_lflevel = - (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) / + (mbmi->delta_lf_from_base - xd->delta_lf_from_base) / cm->delta_lf_res; write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, w); - xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base; + xd->delta_lf_from_base = mbmi->delta_lf_from_base; } } -#else - if (cm->delta_lf_present_flag) { - int reduced_delta_lflevel = - (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) / - cm->delta_lf_res; - write_delta_lflevel(cm, xd, reduced_delta_lflevel, w); - xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base; - } -#endif // CONFIG_LOOPFILTER_LEVEL -#endif // CONFIG_EXT_DELTA_Q } } -#if CONFIG_SUPERTX - if (!supertx_enabled) -#endif // CONFIG_SUPERTX - write_is_inter(cm, xd, mbmi->segment_id, w, is_inter); + if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter); - if (cm->tx_mode == TX_MODE_SELECT && -#if CONFIG_CB4X4 && CONFIG_VAR_TX && !CONFIG_RECT_TX - (bsize >= BLOCK_8X8 || (bsize > BLOCK_4X4 && is_inter)) && -#else - block_signals_txsize(bsize) && -#endif -#if CONFIG_SUPERTX - !supertx_enabled && -#endif // CONFIG_SUPERTX - !(is_inter && skip) && !xd->lossless[segment_id]) { -#if CONFIG_VAR_TX - if (is_inter) { // This implies skip flag is 0. 
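/* Illustrative aside, not part of the change under review: the delta-q
 * block above codes qindex once per superblock (unless the whole
 * superblock is skipped) as a difference from the running value, scaled
 * down by cm->delta_q_res. A standalone round trip of that scaling; the
 * rate control guarantees upstream that delta_q_res divides the change:
 */
#include <assert.h>
static int code_delta_q(int cur_qindex, int *running_qindex, int res) {
  const int reduced = (cur_qindex - *running_qindex) / res;
  *running_qindex = cur_qindex;  // mirrors the xd->current_qindex update
  return reduced;                // what write_delta_qindex() entropy-codes
}
int main(void) {
  int running = 96;  // value left by the previous superblock
  const int reduced = code_delta_q(104, &running, /*res=*/4);
  assert(reduced == 2 && running == 104);  // decoder side: 96 + 2 * 4 == 104
  return 0;
}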
- const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, bsize, 0); - const int bh = tx_size_high_unit[max_tx_size]; - const int bw = tx_size_wide_unit[max_tx_size]; - const int width = block_size_wide[bsize] >> tx_size_wide_log2[0]; - const int height = block_size_high[bsize] >> tx_size_wide_log2[0]; - int init_depth = - (height != width) ? RECT_VARTX_DEPTH_INIT : SQR_VARTX_DEPTH_INIT; - int idx, idy; - for (idy = 0; idy < height; idy += bh) - for (idx = 0; idx < width; idx += bw) - write_tx_size_vartx(cm, xd, mbmi, max_tx_size, init_depth, idy, idx, - w); -#if CONFIG_RECT_TX_EXT - if (is_quarter_tx_allowed(xd, mbmi, is_inter_block(mbmi)) && - quarter_txsize_lookup[bsize] != max_tx_size && - (mbmi->tx_size == quarter_txsize_lookup[bsize] || - mbmi->tx_size == max_tx_size)) { -#if CONFIG_NEW_MULTISYMBOL - aom_write_symbol(w, mbmi->tx_size != max_tx_size, - cm->fc->quarter_tx_size_cdf, 2); -#else - aom_write(w, mbmi->tx_size != max_tx_size, - cm->fc->quarter_tx_size_prob); -#endif - } -#endif - } else { - set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, skip, xd); - write_selected_tx_size(cm, xd, w); - } - } else { - set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, skip, xd); -#else - write_selected_tx_size(cm, xd, w); -#endif - } + if (mbmi->skip_mode) return; if (!is_inter) { - if (bsize >= BLOCK_8X8 || unify_bsize) { - write_intra_mode(ec_ctx, bsize, mode, w); - } else { - int idx, idy; - const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; - const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; - for (idy = 0; idy < 2; idy += num_4x4_h) { - for (idx = 0; idx < 2; idx += num_4x4_w) { - const PREDICTION_MODE b_mode = mi->bmi[idy * 2 + idx].as_mode; - write_intra_mode(ec_ctx, bsize, b_mode, w); - } - } + write_intra_mode(ec_ctx, bsize, mode, w); + const int use_angle_delta = av1_use_angle_delta(bsize); + + if (use_angle_delta && av1_is_directional_mode(mode)) { + write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y], + ec_ctx->angle_delta_cdf[mode - V_PRED]); } -#if CONFIG_CB4X4 - if (is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, - xd->plane[1].subsampling_y)) { - write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mode, w); -#else // !CONFIG_CB4X4 - write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mode, w); -#endif // CONFIG_CB4X4 -#if CONFIG_CFL - if (mbmi->uv_mode == UV_CFL_PRED) { + if (!cm->seq_params.monochrome && + is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, + xd->plane[1].subsampling_y)) { + const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; + write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w); + if (uv_mode == UV_CFL_PRED) write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w); + if (use_angle_delta && av1_is_directional_mode(get_uv_mode(uv_mode))) { + write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV], + ec_ctx->angle_delta_cdf[uv_mode - V_PRED]); } -#endif - -#if CONFIG_CB4X4 } -#endif -#if CONFIG_EXT_INTRA - write_intra_angle_info(xd, ec_ctx, w); -#endif // CONFIG_EXT_INTRA if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) - write_palette_mode_info(cm, xd, mi, w); -#if CONFIG_FILTER_INTRA - if (bsize >= BLOCK_8X8 || unify_bsize) - write_filter_intra_mode_info(cm, xd, mbmi, mi_row, mi_col, w); -#endif // CONFIG_FILTER_INTRA + write_palette_mode_info(cm, xd, mbmi, mi_row, mi_col, w); + + write_filter_intra_mode_info(cm, xd, mbmi, w); } else { int16_t mode_ctx; - write_ref_frames(cm, xd, w); -#if CONFIG_COMPOUND_SINGLEREF - if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { - // 
NOTE: Handle single ref comp mode - if (!is_compound) - aom_write(w, is_inter_singleref_comp_mode(mode), - av1_get_inter_mode_prob(cm, xd)); - } -#endif // CONFIG_COMPOUND_SINGLEREF - -#if CONFIG_COMPOUND_SINGLEREF - if (is_compound || is_inter_singleref_comp_mode(mode)) -#else // !CONFIG_COMPOUND_SINGLEREF - if (is_compound) -#endif // CONFIG_COMPOUND_SINGLEREF - mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]]; - else + av1_collect_neighbors_ref_counts(xd); - mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, - mbmi->ref_frame, bsize, -1); + write_ref_frames(cm, xd, w); + + mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); // If segment skip is not enabled code the mode. if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { - if (bsize >= BLOCK_8X8 || unify_bsize) { - if (is_inter_compound_mode(mode)) - write_inter_compound_mode(cm, xd, w, mode, mode_ctx); -#if CONFIG_COMPOUND_SINGLEREF - else if (is_inter_singleref_comp_mode(mode)) - write_inter_singleref_comp_mode(xd, w, mode, mode_ctx); -#endif // CONFIG_COMPOUND_SINGLEREF - else if (is_inter_singleref_mode(mode)) - write_inter_mode(w, mode, ec_ctx, mode_ctx); - - if (mode == NEWMV || mode == NEW_NEWMV || -#if CONFIG_COMPOUND_SINGLEREF - mbmi->mode == SR_NEW_NEWMV || -#endif // CONFIG_COMPOUND_SINGLEREF - have_nearmv_in_inter_mode(mode)) - write_drl_idx(ec_ctx, mbmi, mbmi_ext, w); - else - assert(mbmi->ref_mv_idx == 0); - } + if (is_inter_compound_mode(mode)) + write_inter_compound_mode(xd, w, mode, mode_ctx); + else if (is_inter_singleref_mode(mode)) + write_inter_mode(w, mode, ec_ctx, mode_ctx); + + if (mode == NEWMV || mode == NEW_NEWMV || have_nearmv_in_inter_mode(mode)) + write_drl_idx(ec_ctx, mbmi, mbmi_ext, w); + else + assert(mbmi->ref_mv_idx == 0); } -#if !CONFIG_DUAL_FILTER && !CONFIG_WARPED_MOTION && !CONFIG_GLOBAL_MOTION - write_mb_interp_filter(cpi, xd, w); -#endif // !CONFIG_DUAL_FILTER && !CONFIG_WARPED_MOTION - - if (bsize < BLOCK_8X8 && !unify_bsize) { -#if CONFIG_COMPOUND_SINGLEREF - /// NOTE: Single ref comp mode does not support sub8x8. 
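/* Illustrative aside, not part of the change under review: in the
 * rewritten mode signalling above, only modes that take a motion vector
 * from the reference-MV stack carry a ref_mv_idx (written by
 * write_drl_idx()); every other mode asserts ref_mv_idx == 0. A standalone
 * restatement of that rule, with a local stand-in for PREDICTION_MODE and
 * for the set matched by have_nearmv_in_inter_mode():
 */
#include <stdbool.h>
#include <stdio.h>
typedef enum {
  NEARESTMV, NEARMV, GLOBALMV, NEWMV, NEAREST_NEARESTMV, NEAR_NEARMV,
  NEAREST_NEWMV, NEW_NEARESTMV, NEAR_NEWMV, NEW_NEARMV, GLOBAL_GLOBALMV,
  NEW_NEWMV
} InterModeSketch;
static bool sends_drl_idx(InterModeSketch m) {
  const bool newmv_class = m == NEWMV || m == NEW_NEWMV;
  const bool nearmv_class =
      m == NEARMV || m == NEAR_NEARMV || m == NEAR_NEWMV || m == NEW_NEARMV;
  return newmv_class || nearmv_class;
}
int main(void) {
  printf("NEWMV:%d NEARMV:%d GLOBALMV:%d NEAREST_NEWMV:%d\n",
         sends_drl_idx(NEWMV), sends_drl_idx(NEARMV), sends_drl_idx(GLOBALMV),
         sends_drl_idx(NEAREST_NEWMV));  // 1 1 0 0
  return 0;
}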
- assert(is_compound || !is_inter_singleref_comp_mode(mbmi->mode)); -#endif // CONFIG_COMPOUND_SINGLEREF - const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; - const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; - int idx, idy; - for (idy = 0; idy < 2; idy += num_4x4_h) { - for (idx = 0; idx < 2; idx += num_4x4_w) { - const int j = idy * 2 + idx; - const PREDICTION_MODE b_mode = mi->bmi[j].as_mode; - if (!is_compound) - mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, - mbmi->ref_frame, bsize, j); - if (is_inter_compound_mode(b_mode)) - write_inter_compound_mode(cm, xd, w, b_mode, mode_ctx); - else if (is_inter_singleref_mode(b_mode)) - write_inter_mode(w, b_mode, ec_ctx, mode_ctx); - - if (b_mode == NEWMV || b_mode == NEW_NEWMV) { - for (ref = 0; ref < 1 + is_compound; ++ref) { - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], ref, - mbmi->ref_mv_idx); - nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; - av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv, - &mi->bmi[j].ref_mv[ref].as_mv, nmvc, allow_hp); - } - } else if (b_mode == NEAREST_NEWMV || b_mode == NEAR_NEWMV) { - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], 1, - mbmi->ref_mv_idx); - nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; - av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[1].as_mv, - &mi->bmi[j].ref_mv[1].as_mv, nmvc, allow_hp); - } else if (b_mode == NEW_NEARESTMV || b_mode == NEW_NEARMV) { - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], 0, - mbmi->ref_mv_idx); - nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; - av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[0].as_mv, - &mi->bmi[j].ref_mv[0].as_mv, nmvc, allow_hp); - } - } - } - } else { - if (mode == NEWMV || mode == NEW_NEWMV) { - int_mv ref_mv; - for (ref = 0; ref < 1 + is_compound; ++ref) { - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], ref, - mbmi->ref_mv_idx); - nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; - ref_mv = mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0]; - av1_encode_mv(cpi, w, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc, - allow_hp); - } - } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) { - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = - av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx); - nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; - av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv, - &mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_mv, nmvc, - allow_hp); - } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) { - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = - av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx); - nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; - av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv, - &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv, nmvc, + if (mode == NEWMV || mode == NEW_NEWMV) { + for (ref = 0; ref < 1 + is_compound; ++ref) { + nmv_context *nmvc = &ec_ctx->nmvc; + const int_mv ref_mv = av1_get_ref_mv(x, ref); + av1_encode_mv(cpi, w, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc, allow_hp); -#if CONFIG_COMPOUND_SINGLEREF - } else if ( // mode == SR_NEAREST_NEWMV || - mode == SR_NEAR_NEWMV || mode 
== SR_ZERO_NEWMV || - mode == SR_NEW_NEWMV) { - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = - av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx); - nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; - int_mv ref_mv = mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0]; - if (mode == SR_NEW_NEWMV) - av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv, &ref_mv.as_mv, nmvc, - allow_hp); - av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc, - allow_hp); -#endif // CONFIG_COMPOUND_SINGLEREF } + } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) { + nmv_context *nmvc = &ec_ctx->nmvc; + const int_mv ref_mv = av1_get_ref_mv(x, 1); + av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc, allow_hp); + } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) { + nmv_context *nmvc = &ec_ctx->nmvc; + const int_mv ref_mv = av1_get_ref_mv(x, 0); + av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv, &ref_mv.as_mv, nmvc, allow_hp); } -#if CONFIG_INTERINTRA if (cpi->common.reference_mode != COMPOUND_REFERENCE && -#if CONFIG_SUPERTX - !supertx_enabled && -#endif // CONFIG_SUPERTX - cpi->common.allow_interintra_compound && is_interintra_allowed(mbmi)) { + cpi->common.seq_params.enable_interintra_compound && + is_interintra_allowed(mbmi)) { const int interintra = mbmi->ref_frame[1] == INTRA_FRAME; const int bsize_group = size_group_lookup[bsize]; -#if CONFIG_NEW_MULTISYMBOL aom_write_symbol(w, interintra, ec_ctx->interintra_cdf[bsize_group], 2); -#else - aom_write(w, interintra, cm->fc->interintra_prob[bsize_group]); -#endif if (interintra) { aom_write_symbol(w, mbmi->interintra_mode, ec_ctx->interintra_mode_cdf[bsize_group], INTERINTRA_MODES); if (is_interintra_wedge_used(bsize)) { -#if CONFIG_NEW_MULTISYMBOL aom_write_symbol(w, mbmi->use_wedge_interintra, ec_ctx->wedge_interintra_cdf[bsize], 2); -#else - aom_write(w, mbmi->use_wedge_interintra, - cm->fc->wedge_interintra_prob[bsize]); -#endif if (mbmi->use_wedge_interintra) { - aom_write_literal(w, mbmi->interintra_wedge_index, - get_wedge_bits_lookup(bsize)); + aom_write_symbol(w, mbmi->interintra_wedge_index, + ec_ctx->wedge_idx_cdf[bsize], 16); assert(mbmi->interintra_wedge_sign == 0); } } } } -#endif // CONFIG_INTERINTRA - -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION -#if CONFIG_SUPERTX - if (!supertx_enabled) -#endif // CONFIG_SUPERTX - if (mbmi->ref_frame[1] != INTRA_FRAME) write_motion_mode(cm, xd, mi, w); -#if CONFIG_NCOBMC_ADAPT_WEIGHT - write_ncobmc_mode(xd, mi, w); -#endif -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - - if ( -#if CONFIG_COMPOUND_SINGLEREF - is_inter_anyref_comp_mode(mbmi->mode) && -#else // !CONFIG_COMPOUND_SINGLEREF - cpi->common.reference_mode != SINGLE_REFERENCE && - is_inter_compound_mode(mbmi->mode) && -#endif // CONFIG_COMPOUND_SINGLEREF -#if CONFIG_MOTION_VAR - mbmi->motion_mode == SIMPLE_TRANSLATION && -#endif // CONFIG_MOTION_VAR - is_any_masked_compound_used(bsize)) { -#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE - if (cm->allow_masked_compound) { -#if CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT - if (!is_interinter_compound_used(COMPOUND_WEDGE, bsize)) - aom_write_bit(w, mbmi->interinter_compound_type == COMPOUND_AVERAGE); - else -#endif // CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT - aom_write_symbol(w, mbmi->interinter_compound_type, - ec_ctx->compound_type_cdf[bsize], COMPOUND_TYPES); -#if CONFIG_WEDGE - if (is_interinter_compound_used(COMPOUND_WEDGE, bsize) && - mbmi->interinter_compound_type == COMPOUND_WEDGE) { - aom_write_literal(w, mbmi->wedge_index, 
get_wedge_bits_lookup(bsize)); - aom_write_bit(w, mbmi->wedge_sign); + + if (mbmi->ref_frame[1] != INTRA_FRAME) write_motion_mode(cm, xd, mbmi, w); + + // First write idx to indicate current compound inter prediction mode group + // Group A (0): jnt_comp, compound_average + // Group B (1): interintra, compound_diffwtd, wedge + if (has_second_ref(mbmi)) { + const int masked_compound_used = is_any_masked_compound_used(bsize) && + cm->seq_params.enable_masked_compound; + + if (masked_compound_used) { + const int ctx_comp_group_idx = get_comp_group_idx_context(xd); + aom_write_symbol(w, mbmi->comp_group_idx, + ec_ctx->comp_group_idx_cdf[ctx_comp_group_idx], 2); + } else { + assert(mbmi->comp_group_idx == 0); + } + + if (mbmi->comp_group_idx == 0) { + if (mbmi->compound_idx) + assert(mbmi->interinter_comp.type == COMPOUND_AVERAGE); + + if (cm->seq_params.enable_jnt_comp) { + const int comp_index_ctx = get_comp_index_context(cm, xd); + aom_write_symbol(w, mbmi->compound_idx, + ec_ctx->compound_index_cdf[comp_index_ctx], 2); + } else { + assert(mbmi->compound_idx == 1); } -#endif // CONFIG_WEDGE -#if CONFIG_COMPOUND_SEGMENT - if (mbmi->interinter_compound_type == COMPOUND_SEG) { - aom_write_literal(w, mbmi->mask_type, MAX_SEG_MASK_BITS); + } else { + assert(cpi->common.reference_mode != SINGLE_REFERENCE && + is_inter_compound_mode(mbmi->mode) && + mbmi->motion_mode == SIMPLE_TRANSLATION); + assert(masked_compound_used); + // compound_diffwtd, wedge + assert(mbmi->interinter_comp.type == COMPOUND_WEDGE || + mbmi->interinter_comp.type == COMPOUND_DIFFWTD); + + if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) + aom_write_symbol(w, mbmi->interinter_comp.type - 1, + ec_ctx->compound_type_cdf[bsize], + COMPOUND_TYPES - 1); + + if (mbmi->interinter_comp.type == COMPOUND_WEDGE) { + assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize)); + aom_write_symbol(w, mbmi->interinter_comp.wedge_index, + ec_ctx->wedge_idx_cdf[bsize], 16); + aom_write_bit(w, mbmi->interinter_comp.wedge_sign); + } else { + assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD); + aom_write_literal(w, mbmi->interinter_comp.mask_type, + MAX_DIFFWTD_MASK_BITS); } -#endif // CONFIG_COMPOUND_SEGMENT } -#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE } -#if CONFIG_DUAL_FILTER || CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION write_mb_interp_filter(cpi, xd, w); -#endif // CONFIG_DUAL_FILTE || CONFIG_WARPED_MOTION } +} -#if !CONFIG_TXK_SEL - av1_write_tx_type(cm, xd, -#if CONFIG_SUPERTX - supertx_enabled, -#endif - w); -#endif // !CONFIG_TXK_SEL +static void write_intrabc_info(MACROBLOCKD *xd, + const MB_MODE_INFO_EXT *mbmi_ext, + aom_writer *w) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + int use_intrabc = is_intrabc_block(mbmi); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + aom_write_symbol(w, use_intrabc, ec_ctx->intrabc_cdf, 2); + if (use_intrabc) { + assert(mbmi->mode == DC_PRED); + assert(mbmi->uv_mode == UV_DC_PRED); + assert(mbmi->motion_mode == SIMPLE_TRANSLATION); + int_mv dv_ref = mbmi_ext->ref_mv_stack[INTRA_FRAME][0].this_mv; + av1_encode_dv(w, &mbmi->mv[0].as_mv, &dv_ref.as_mv, &ec_ctx->ndvc); + } } -static void write_mb_modes_kf(AV1_COMMON *cm, MACROBLOCKD *xd, -#if CONFIG_INTRABC +static void write_mb_modes_kf(AV1_COMP *cpi, MACROBLOCKD *xd, const MB_MODE_INFO_EXT *mbmi_ext, -#endif // CONFIG_INTRABC const int mi_row, const int mi_col, aom_writer *w) { + AV1_COMMON *const cm = &cpi->common; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; const struct segmentation *const seg = &cm->seg; struct segmentation_probs *const segp 
= &ec_ctx->seg; - const MODE_INFO *const mi = xd->mi[0]; - const MODE_INFO *const above_mi = xd->above_mi; - const MODE_INFO *const left_mi = xd->left_mi; - const MB_MODE_INFO *const mbmi = &mi->mbmi; + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + const MB_MODE_INFO *const mbmi = xd->mi[0]; const BLOCK_SIZE bsize = mbmi->sb_type; -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif - (void)mi_row; - (void)mi_col; + const PREDICTION_MODE mode = mbmi->mode; + + if (seg->segid_preskip && seg->update_map) + write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 0); - if (seg->update_map) write_segment_id(w, seg, segp, mbmi->segment_id); + const int skip = write_skip(cm, xd, mbmi->segment_id, mbmi, w); + + if (!seg->segid_preskip && seg->update_map) + write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, skip); + + write_cdef(cm, xd, w, skip, mi_col, mi_row); - const int skip = write_skip(cm, xd, mbmi->segment_id, mi, w); if (cm->delta_q_present_flag) { int super_block_upper_left = - ((mi_row & MAX_MIB_MASK) == 0) && ((mi_col & MAX_MIB_MASK) == 0); - if ((bsize != BLOCK_LARGEST || skip == 0) && super_block_upper_left) { - assert(mbmi->current_q_index > 0); + ((mi_row & (cm->seq_params.mib_size - 1)) == 0) && + ((mi_col & (cm->seq_params.mib_size - 1)) == 0); + if ((bsize != cm->seq_params.sb_size || skip == 0) && + super_block_upper_left) { + assert(mbmi->current_qindex > 0); int reduced_delta_qindex = - (mbmi->current_q_index - xd->prev_qindex) / cm->delta_q_res; - write_delta_qindex(cm, xd, reduced_delta_qindex, w); - xd->prev_qindex = mbmi->current_q_index; -#if CONFIG_EXT_DELTA_Q -#if CONFIG_LOOPFILTER_LEVEL + (mbmi->current_qindex - xd->current_qindex) / cm->delta_q_res; + write_delta_qindex(xd, reduced_delta_qindex, w); + xd->current_qindex = mbmi->current_qindex; if (cm->delta_lf_present_flag) { if (cm->delta_lf_multi) { - for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) { + const int frame_lf_count = + av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { int reduced_delta_lflevel = - (mbmi->curr_delta_lf[lf_id] - xd->prev_delta_lf[lf_id]) / + (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) / cm->delta_lf_res; write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, w); - xd->prev_delta_lf[lf_id] = mbmi->curr_delta_lf[lf_id]; + xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id]; } } else { int reduced_delta_lflevel = - (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) / + (mbmi->delta_lf_from_base - xd->delta_lf_from_base) / cm->delta_lf_res; write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, w); - xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base; + xd->delta_lf_from_base = mbmi->delta_lf_from_base; } } -#else - if (cm->delta_lf_present_flag) { - int reduced_delta_lflevel = - (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) / - cm->delta_lf_res; - write_delta_lflevel(cm, xd, reduced_delta_lflevel, w); - xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base; - } -#endif // CONFIG_LOOPFILTER_LEVEL -#endif // CONFIG_EXT_DELTA_Q } } - int enable_tx_size = cm->tx_mode == TX_MODE_SELECT && - block_signals_txsize(bsize) && - !xd->lossless[mbmi->segment_id]; - -#if CONFIG_INTRABC - if (av1_allow_intrabc(bsize, cm)) { - int use_intrabc = is_intrabc_block(mbmi); - aom_write_symbol(w, use_intrabc, ec_ctx->intrabc_cdf, 2); - if (use_intrabc) { - assert(mbmi->mode == DC_PRED); - assert(mbmi->uv_mode == UV_DC_PRED); - if (enable_tx_size && !mbmi->skip) write_selected_tx_size(cm, xd, w); - int_mv dv_ref = mbmi_ext->ref_mvs[INTRA_FRAME][0]; - av1_encode_dv(w, &mbmi->mv[0].as_mv, &dv_ref.as_mv, &ec_ctx->ndvc); -#if CONFIG_EXT_TX && !CONFIG_TXK_SEL - av1_write_tx_type(cm, xd, -#if CONFIG_SUPERTX - 0, -#endif - w); -#endif // CONFIG_EXT_TX && !CONFIG_TXK_SEL - return; - } + if (av1_allow_intrabc(cm)) { + write_intrabc_info(xd, mbmi_ext, w); + if (is_intrabc_block(mbmi)) return; } -#endif // CONFIG_INTRABC - if (enable_tx_size) write_selected_tx_size(cm, xd, w); - if (bsize >= BLOCK_8X8 || unify_bsize) { - write_intra_mode_kf(cm, ec_ctx, mi, above_mi, left_mi, 0, mbmi->mode, w); - } else { - const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; - const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; - int idx, idy; - - for (idy = 0; idy < 2; idy += num_4x4_h) { - for (idx = 0; idx < 2; idx += num_4x4_w) { - const int block = idy * 2 + idx; - write_intra_mode_kf(cm, ec_ctx, mi, above_mi, left_mi, block, - mi->bmi[block].as_mode, w); - } - } + write_intra_mode_kf(ec_ctx, mbmi, above_mi, left_mi, mode, w); + + const int use_angle_delta = av1_use_angle_delta(bsize); + if (use_angle_delta && av1_is_directional_mode(mode)) { + write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y], + ec_ctx->angle_delta_cdf[mode - V_PRED]); } -#if CONFIG_CB4X4 - if (is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, + if (!cm->seq_params.monochrome && + is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y)) { - write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mbmi->mode, w); -#else // !CONFIG_CB4X4 - write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mbmi->mode, w); -#endif // CONFIG_CB4X4 - -#if CONFIG_CFL - if (mbmi->uv_mode == UV_CFL_PRED) { + const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; + write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w); + if (uv_mode == UV_CFL_PRED) write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w); + if (use_angle_delta 
&& av1_is_directional_mode(get_uv_mode(uv_mode))) { + write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV], + ec_ctx->angle_delta_cdf[uv_mode - V_PRED]); } -#endif - -#if CONFIG_CB4X4 } -#endif -#if CONFIG_EXT_INTRA - write_intra_angle_info(xd, ec_ctx, w); -#endif // CONFIG_EXT_INTRA + if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) - write_palette_mode_info(cm, xd, mi, w); -#if CONFIG_FILTER_INTRA - if (bsize >= BLOCK_8X8 || unify_bsize) - write_filter_intra_mode_info(cm, xd, mbmi, mi_row, mi_col, w); -#endif // CONFIG_FILTER_INTRA - -#if !CONFIG_TXK_SEL - av1_write_tx_type(cm, xd, -#if CONFIG_SUPERTX - 0, -#endif - w); -#endif // !CONFIG_TXK_SEL -} + write_palette_mode_info(cm, xd, mbmi, mi_row, mi_col, w); -#if CONFIG_SUPERTX -#define write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \ - mi_row, mi_col) \ - write_modes_b(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row, mi_col) -#else -#define write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \ - mi_row, mi_col) \ - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col) -#endif // CONFIG_SUPERTX + write_filter_intra_mode_info(cm, xd, mbmi, w); +} #if CONFIG_RD_DEBUG static void dump_mode_info(MODE_INFO *mi) { - printf("\nmi->mbmi.mi_row == %d\n", mi->mbmi.mi_row); - printf("&& mi->mbmi.mi_col == %d\n", mi->mbmi.mi_col); - printf("&& mi->mbmi.sb_type == %d\n", mi->mbmi.sb_type); - printf("&& mi->mbmi.tx_size == %d\n", mi->mbmi.tx_size); - if (mi->mbmi.sb_type >= BLOCK_8X8) { - printf("&& mi->mbmi.mode == %d\n", mi->mbmi.mode); - } else { - printf("&& mi->bmi[0].as_mode == %d\n", mi->bmi[0].as_mode); - } + printf("\nmi->mi_row == %d\n", mi->mi_row); + printf("&& mi->mi_col == %d\n", mi->mi_col); + printf("&& mi->sb_type == %d\n", mi->sb_type); + printf("&& mi->tx_size == %d\n", mi->tx_size); + printf("&& mi->mode == %d\n", mi->mode); } static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats, int plane) { if (rd_stats->txb_coeff_cost[plane] != token_stats->cost) { -#if CONFIG_VAR_TX int r, c; -#endif printf("\nplane %d rd_stats->txb_coeff_cost %d token_stats->cost %d\n", plane, rd_stats->txb_coeff_cost[plane], token_stats->cost); -#if CONFIG_VAR_TX printf("rd txb_coeff_cost_map\n"); for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) { for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) { @@ -2308,7 +1282,6 @@ static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats, } printf("\n"); } -#endif return 1; } return 0; @@ -2319,128 +1292,139 @@ static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats, static void enc_dump_logs(AV1_COMP *cpi, int mi_row, int mi_col) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; - MODE_INFO *m; xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col); - m = xd->mi[0]; - if (is_inter_block(&m->mbmi)) { -#define FRAME_TO_CHECK 1 + const MB_MODE_INFO *const *mbmi = xd->mi[0]; + if (is_inter_block(mbmi)) { +#define FRAME_TO_CHECK 11 if (cm->current_video_frame == FRAME_TO_CHECK && cm->show_frame == 1) { - const MB_MODE_INFO *const mbmi = &m->mbmi; const BLOCK_SIZE bsize = mbmi->sb_type; int_mv mv[2]; - int is_comp_ref = has_second_ref(&m->mbmi); + int is_comp_ref = has_second_ref(mbmi); int ref; for (ref = 0; ref < 1 + is_comp_ref; ++ref) - mv[ref].as_mv = m->mbmi.mv[ref].as_mv; + mv[ref].as_mv = mbmi->mv[ref].as_mv; if (!is_comp_ref) { -#if CONFIG_COMPOUND_SINGLEREF - if (is_inter_singleref_comp_mode(m->mbmi.mode)) - mv[1].as_mv = m->mbmi.mv[1].as_mv; - else 
-#endif // CONFIG_COMPOUND_SINGLEREF - mv[1].as_int = 0; + mv[1].as_int = 0; } MACROBLOCK *const x = &cpi->td.mb; const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; - const int16_t mode_ctx = av1_mode_context_analyzer( - mbmi_ext->mode_context, mbmi->ref_frame, bsize, -1); + const int16_t mode_ctx = + is_comp_ref ? mbmi_ext->compound_mode_context[mbmi->ref_frame[0]] + : av1_mode_context_analyzer(mbmi_ext->mode_context, + mbmi->ref_frame); + const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK; int16_t zeromv_ctx = -1; int16_t refmv_ctx = -1; + if (mbmi->mode != NEWMV) { - zeromv_ctx = (mode_ctx >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK; - if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET)) { - assert(mbmi->mode == ZEROMV); - } - if (mbmi->mode != ZEROMV) { + zeromv_ctx = (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + if (mbmi->mode != GLOBALMV) refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; - if (mode_ctx & (1 << SKIP_NEARESTMV_OFFSET)) refmv_ctx = 6; - if (mode_ctx & (1 << SKIP_NEARMV_OFFSET)) refmv_ctx = 7; - if (mode_ctx & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) refmv_ctx = 8; - } } - int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); printf( "=== ENCODER ===: " - "Frame=%d, (mi_row,mi_col)=(%d,%d), mode=%d, bsize=%d, " + "Frame=%d, (mi_row,mi_col)=(%d,%d), skip_mode=%d, mode=%d, bsize=%d, " "show_frame=%d, mv[0]=(%d,%d), mv[1]=(%d,%d), ref[0]=%d, " - "ref[1]=%d, motion_mode=%d, inter_mode_ctx=%d, mode_ctx=%d, " - "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d\n", - cm->current_video_frame, mi_row, mi_col, mbmi->mode, bsize, - cm->show_frame, mv[0].as_mv.row, mv[0].as_mv.col, mv[1].as_mv.row, - mv[1].as_mv.col, mbmi->ref_frame[0], mbmi->ref_frame[1], - mbmi->motion_mode, mbmi_ext->mode_context[ref_frame_type], mode_ctx, - newmv_ctx, zeromv_ctx, refmv_ctx); + "ref[1]=%d, motion_mode=%d, mode_ctx=%d, " + "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d, tx_size=%d\n", + cm->current_video_frame, mi_row, mi_col, mbmi->skip_mode, mbmi->mode, + bsize, cm->show_frame, mv[0].as_mv.row, mv[0].as_mv.col, + mv[1].as_mv.row, mv[1].as_mv.col, mbmi->ref_frame[0], + mbmi->ref_frame[1], mbmi->motion_mode, mode_ctx, newmv_ctx, + zeromv_ctx, refmv_ctx, mbmi->tx_size); } } } #endif // ENC_MISMATCH_DEBUG static void write_mbmi_b(AV1_COMP *cpi, const TileInfo *const tile, - aom_writer *w, -#if CONFIG_SUPERTX - int supertx_enabled, -#endif - int mi_row, int mi_col) { + aom_writer *w, int mi_row, int mi_col) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; - MODE_INFO *m; int bh, bw; xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col); - m = xd->mi[0]; + MB_MODE_INFO *m = xd->mi[0]; - assert(m->mbmi.sb_type <= cm->sb_size || - (m->mbmi.sb_type >= BLOCK_SIZES && m->mbmi.sb_type < BLOCK_SIZES_ALL)); + assert(m->sb_type <= cm->seq_params.sb_size || + (m->sb_type >= BLOCK_SIZES && m->sb_type < BLOCK_SIZES_ALL)); - bh = mi_size_high[m->mbmi.sb_type]; - bw = mi_size_wide[m->mbmi.sb_type]; + bh = mi_size_high[m->sb_type]; + bw = mi_size_wide[m->sb_type]; cpi->td.mb.mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col); - set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, -#if CONFIG_DEPENDENT_HORZTILES - cm->dependent_horz_tiles, -#endif // CONFIG_DEPENDENT_HORZTILES - cm->mi_rows, cm->mi_cols); + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); + + xd->above_txfm_context = cm->above_txfm_context[tile->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); if (frame_is_intra_only(cm)) { 
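/* Illustrative aside, not part of the change under review: the debug dump
 * above unpacks three independent entropy contexts from the single packed
 * mode_ctx word. A standalone sketch of that unpacking; the field offsets
 * and widths here are stand-ins, the real ones are defined alongside
 * NEWMV_CTX_MASK / GLOBALMV_OFFSET / REFMV_OFFSET in the common code:
 */
#include <stdio.h>
#define SK_GLOBALMV_OFFSET 3  // stand-in field layout
#define SK_REFMV_OFFSET 4
#define SK_NEWMV_CTX_MASK ((1 << SK_GLOBALMV_OFFSET) - 1)
#define SK_GLOBALMV_CTX_MASK ((1 << (SK_REFMV_OFFSET - SK_GLOBALMV_OFFSET)) - 1)
#define SK_REFMV_CTX_MASK ((1 << 4) - 1)
int main(void) {
  const int mode_ctx = (5 << SK_REFMV_OFFSET) | (1 << SK_GLOBALMV_OFFSET) | 2;
  printf("newmv_ctx=%d zeromv_ctx=%d refmv_ctx=%d\n",
         mode_ctx & SK_NEWMV_CTX_MASK,
         (mode_ctx >> SK_GLOBALMV_OFFSET) & SK_GLOBALMV_CTX_MASK,
         (mode_ctx >> SK_REFMV_OFFSET) & SK_REFMV_CTX_MASK);  // 2 1 5
  return 0;
}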
- write_mb_modes_kf(cm, xd, -#if CONFIG_INTRABC - cpi->td.mb.mbmi_ext, -#endif // CONFIG_INTRABC - mi_row, mi_col, w); + write_mb_modes_kf(cpi, xd, cpi->td.mb.mbmi_ext, mi_row, mi_col, w); } else { -#if CONFIG_VAR_TX - xd->above_txfm_context = - cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2); - xd->left_txfm_context = xd->left_txfm_context_buffer + - ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2); -#endif -#if CONFIG_DUAL_FILTER || CONFIG_WARPED_MOTION // has_subpel_mv_component needs the ref frame buffers set up to look // up if they are scaled. has_subpel_mv_component is in turn needed by // write_switchable_interp_filter, which is called by pack_inter_mode_mvs. - set_ref_ptrs(cm, xd, m->mbmi.ref_frame[0], m->mbmi.ref_frame[1]); -#if CONFIG_COMPOUND_SINGLEREF - if (!has_second_ref(&m->mbmi) && is_inter_singleref_comp_mode(m->mbmi.mode)) - xd->block_refs[1] = xd->block_refs[0]; -#endif // CONFIG_COMPOUND_SINGLEREF -#endif // CONFIG_DUAL_FILTER || CONFIG_WARPED_MOTION + set_ref_ptrs(cm, xd, m->ref_frame[0], m->ref_frame[1]); #if ENC_MISMATCH_DEBUG enc_dump_logs(cpi, mi_row, mi_col); #endif // ENC_MISMATCH_DEBUG - pack_inter_mode_mvs(cpi, mi_row, mi_col, -#if CONFIG_SUPERTX - supertx_enabled, -#endif - w); + pack_inter_mode_mvs(cpi, mi_row, mi_col, w); + } +} + +static void write_inter_txb_coeff(AV1_COMMON *const cm, MACROBLOCK *const x, + MB_MODE_INFO *const mbmi, aom_writer *w, + const TOKENEXTRA **tok, + const TOKENEXTRA *const tok_end, + TOKEN_STATS *token_stats, const int row, + const int col, int *block, const int plane) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bsize = mbmi->sb_type; + const BLOCK_SIZE bsizec = + scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y); + + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsizec, pd->subsampling_x, pd->subsampling_y); + + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane); + const int step = + tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; + const int bkw = tx_size_wide_unit[max_tx_size]; + const int bkh = tx_size_high_unit[max_tx_size]; + + const BLOCK_SIZE max_unit_bsize = + get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y); + int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0]; + int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0]; + + int blk_row, blk_col; + + const int num_4x4_w = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; + const int num_4x4_h = block_size_high[plane_bsize] >> tx_size_high_log2[0]; + + const int unit_height = + AOMMIN(mu_blocks_high + (row >> pd->subsampling_y), num_4x4_h); + const int unit_width = + AOMMIN(mu_blocks_wide + (col >> pd->subsampling_x), num_4x4_w); + for (blk_row = row >> pd->subsampling_y; blk_row < unit_height; + blk_row += bkh) { + for (blk_col = col >> pd->subsampling_x; blk_col < unit_width; + blk_col += bkw) { + pack_txb_tokens(w, cm, x, tok, tok_end, xd, mbmi, plane, plane_bsize, + cm->bit_depth, *block, blk_row, blk_col, max_tx_size, + token_stats); + *block += step; + } } } @@ -2449,167 +1433,48 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile, const TOKENEXTRA *const tok_end, int mi_row, int mi_col) { AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; const int mi_offset = mi_row * cm->mi_stride + mi_col; - MODE_INFO *const m = *(cm->mi_grid_visible + mi_offset); - 
MB_MODE_INFO *const mbmi = &m->mbmi; + MB_MODE_INFO *const mbmi = *(cm->mi_grid_visible + mi_offset); int plane; int bh, bw; -#if CONFIG_PVQ || CONFIG_LV_MAP MACROBLOCK *const x = &cpi->td.mb; (void)tok; (void)tok_end; -#endif xd->mi = cm->mi_grid_visible + mi_offset; - assert(mbmi->sb_type <= cm->sb_size || + assert(mbmi->sb_type <= cm->seq_params.sb_size || (mbmi->sb_type >= BLOCK_SIZES && mbmi->sb_type < BLOCK_SIZES_ALL)); bh = mi_size_high[mbmi->sb_type]; bw = mi_size_wide[mbmi->sb_type]; cpi->td.mb.mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col); - set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, -#if CONFIG_DEPENDENT_HORZTILES - cm->dependent_horz_tiles, -#endif // CONFIG_DEPENDENT_HORZTILES - cm->mi_rows, cm->mi_cols); - -// TODO(anybody) : remove this flag when PVQ supports pallete coding tool -#if !CONFIG_PVQ - for (plane = 0; plane <= 1; ++plane) { - const uint8_t palette_size_plane = - mbmi->palette_mode_info.palette_size[plane]; - if (palette_size_plane > 0) { -#if CONFIG_INTRABC - assert(mbmi->use_intrabc == 0); -#endif - int rows, cols; - assert(mbmi->sb_type >= BLOCK_8X8); - av1_get_block_dimensions(mbmi->sb_type, plane, xd, NULL, NULL, &rows, - &cols); - assert(*tok < tok_end); - pack_map_tokens(w, tok, palette_size_plane, rows * cols); -#if !CONFIG_LV_MAP - assert(*tok < tok_end + mbmi->skip); -#endif // !CONFIG_LV_MAP - } - } -#endif // !CONFIG_PVQ + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); -#if CONFIG_COEF_INTERLEAVE if (!mbmi->skip) { - const struct macroblockd_plane *const pd_y = &xd->plane[0]; - const struct macroblockd_plane *const pd_c = &xd->plane[1]; - const TX_SIZE tx_log2_y = mbmi->tx_size; - const TX_SIZE tx_log2_c = av1_get_uv_tx_size(mbmi, pd_c); - const int tx_sz_y = (1 << tx_log2_y); - const int tx_sz_c = (1 << tx_log2_c); - - const BLOCK_SIZE plane_bsize_y = - get_plane_block_size(AOMMAX(mbmi->sb_type, 3), pd_y); - const BLOCK_SIZE plane_bsize_c = - get_plane_block_size(AOMMAX(mbmi->sb_type, 3), pd_c); - - const int num_4x4_w_y = num_4x4_blocks_wide_lookup[plane_bsize_y]; - const int num_4x4_w_c = num_4x4_blocks_wide_lookup[plane_bsize_c]; - const int num_4x4_h_y = num_4x4_blocks_high_lookup[plane_bsize_y]; - const int num_4x4_h_c = num_4x4_blocks_high_lookup[plane_bsize_c]; - - const int max_4x4_w_y = get_max_4x4_size(num_4x4_w_y, xd->mb_to_right_edge, - pd_y->subsampling_x); - const int max_4x4_h_y = get_max_4x4_size(num_4x4_h_y, xd->mb_to_bottom_edge, - pd_y->subsampling_y); - const int max_4x4_w_c = get_max_4x4_size(num_4x4_w_c, xd->mb_to_right_edge, - pd_c->subsampling_x); - const int max_4x4_h_c = get_max_4x4_size(num_4x4_h_c, xd->mb_to_bottom_edge, - pd_c->subsampling_y); - - // The max_4x4_w/h may be smaller than tx_sz under some corner cases, - // i.e. when the SB is splitted by tile boundaries. 
- const int tu_num_w_y = (max_4x4_w_y + tx_sz_y - 1) / tx_sz_y; - const int tu_num_h_y = (max_4x4_h_y + tx_sz_y - 1) / tx_sz_y; - const int tu_num_w_c = (max_4x4_w_c + tx_sz_c - 1) / tx_sz_c; - const int tu_num_h_c = (max_4x4_h_c + tx_sz_c - 1) / tx_sz_c; - const int tu_num_y = tu_num_w_y * tu_num_h_y; - const int tu_num_c = tu_num_w_c * tu_num_h_c; - - int tu_idx_y = 0, tu_idx_c = 0; - TOKEN_STATS token_stats; - init_token_stats(&token_stats); - - assert(*tok < tok_end); - - while (tu_idx_y < tu_num_y) { - pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_y, &token_stats); - assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN); - (*tok)++; - tu_idx_y++; - - if (tu_idx_c < tu_num_c) { - pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats); - assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN); - (*tok)++; - - pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats); - assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN); - (*tok)++; - - tu_idx_c++; - } - } - - // In 422 case, it's possilbe that Chroma has more TUs than Luma - while (tu_idx_c < tu_num_c) { - pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats); - assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN); - (*tok)++; - - pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats); - assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN); - (*tok)++; - - tu_idx_c++; - } - } -#else // CONFIG_COEF_INTERLEAVE - if (!mbmi->skip) { -#if !CONFIG_PVQ && !CONFIG_LV_MAP - assert(*tok < tok_end); -#endif - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { -#if CONFIG_CB4X4 - if (!is_chroma_reference(mi_row, mi_col, mbmi->sb_type, - xd->plane[plane].subsampling_x, - xd->plane[plane].subsampling_y)) { -#if !CONFIG_LV_MAP - (*tok)++; -#endif // !CONFIG_LV_MAP - continue; - } -#endif -#if CONFIG_VAR_TX - const struct macroblockd_plane *const pd = &xd->plane[plane]; - BLOCK_SIZE bsize = mbmi->sb_type; -#if CONFIG_CHROMA_SUB8X8 - const BLOCK_SIZE plane_bsize = - AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); -#elif CONFIG_CB4X4 - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); -#else - const BLOCK_SIZE plane_bsize = - get_plane_block_size(AOMMAX(bsize, BLOCK_8X8), pd); -#endif - + if (!is_inter_block(mbmi)) + av1_write_coeffs_mb(cm, x, mi_row, mi_col, w, mbmi->sb_type); + + if (is_inter_block(mbmi)) { + int block[MAX_MB_PLANE] = { 0 }; + const BLOCK_SIZE plane_bsize = mbmi->sb_type; + assert(plane_bsize == get_plane_block_size(mbmi->sb_type, + xd->plane[0].subsampling_x, + xd->plane[0].subsampling_y)); const int num_4x4_w = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; const int num_4x4_h = - block_size_high[plane_bsize] >> tx_size_wide_log2[0]; + block_size_high[plane_bsize] >> tx_size_high_log2[0]; int row, col; TOKEN_STATS token_stats; init_token_stats(&token_stats); - const BLOCK_SIZE max_unit_bsize = get_plane_block_size(BLOCK_64X64, pd); + const BLOCK_SIZE max_unit_bsize = BLOCK_64X64; + assert(max_unit_bsize == + get_plane_block_size(BLOCK_64X64, xd->plane[0].subsampling_x, + xd->plane[0].subsampling_y)); int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0]; int mu_blocks_high = @@ -2618,37 +1483,16 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile, mu_blocks_wide = AOMMIN(num_4x4_w, mu_blocks_wide); mu_blocks_high = AOMMIN(num_4x4_h, mu_blocks_high); - if (is_inter_block(mbmi)) { - const TX_SIZE max_tx_size = get_vartx_max_txsize( - mbmi, plane_bsize, pd->subsampling_x || pd->subsampling_y); - 
int block = 0; - const int step = - tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; - const int bkw = tx_size_wide_unit[max_tx_size]; - const int bkh = tx_size_high_unit[max_tx_size]; - assert(bkw <= mu_blocks_wide); - assert(bkh <= mu_blocks_high); - for (row = 0; row < num_4x4_h; row += mu_blocks_high) { - const int unit_height = AOMMIN(mu_blocks_high + row, num_4x4_h); - for (col = 0; col < num_4x4_w; col += mu_blocks_wide) { - int blk_row, blk_col; - const int unit_width = AOMMIN(mu_blocks_wide + col, num_4x4_w); - for (blk_row = row; blk_row < unit_height; blk_row += bkh) { - for (blk_col = col; blk_col < unit_width; blk_col += bkw) { - pack_txb_tokens(w, -#if CONFIG_LV_MAP - cm, -#endif - tok, tok_end, -#if CONFIG_PVQ || CONFIG_LV_MAP - x, -#endif - xd, mbmi, plane, plane_bsize, cm->bit_depth, - block, blk_row, blk_col, max_tx_size, - &token_stats); - block += step; - } + for (row = 0; row < num_4x4_h; row += mu_blocks_high) { + for (col = 0; col < num_4x4_w; col += mu_blocks_wide) { + for (plane = 0; plane < num_planes && is_inter_block(mbmi); ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + if (!is_chroma_reference(mi_row, mi_col, mbmi->sb_type, + pd->subsampling_x, pd->subsampling_y)) { + continue; } + write_inter_txb_coeff(cm, x, mbmi, w, tok, tok_end, &token_stats, + row, col, &block[plane], plane); } } #if CONFIG_RD_DEBUG @@ -2658,607 +1502,196 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile, assert(0); } #endif // CONFIG_RD_DEBUG - } else { -#if CONFIG_LV_MAP - av1_write_coeffs_mb(cm, x, w, plane); -#else - const TX_SIZE tx = av1_get_tx_size(plane, xd); - const int bkw = tx_size_wide_unit[tx]; - const int bkh = tx_size_high_unit[tx]; - int blk_row, blk_col; - - for (row = 0; row < num_4x4_h; row += mu_blocks_high) { - for (col = 0; col < num_4x4_w; col += mu_blocks_wide) { - const int unit_height = AOMMIN(mu_blocks_high + row, num_4x4_h); - const int unit_width = AOMMIN(mu_blocks_wide + col, num_4x4_w); - - for (blk_row = row; blk_row < unit_height; blk_row += bkh) { - for (blk_col = col; blk_col < unit_width; blk_col += bkw) { -#if !CONFIG_PVQ -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - TX_TYPE tx_type = - av1_get_tx_type(plane ? PLANE_TYPE_UV : PLANE_TYPE_Y, xd, - blk_row, blk_col, 0, tx); -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - tx_type, is_inter_block(mbmi), -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - &token_stats); -#else - pack_pvq_tokens(w, x, xd, plane, bsize, tx); -#endif - } - } - } - } -#endif // CONFIG_LV_MAP - } -#else - const TX_SIZE tx = av1_get_tx_size(plane, xd); - TOKEN_STATS token_stats; -#if !CONFIG_PVQ - init_token_stats(&token_stats); -#if CONFIG_LV_MAP - (void)tx; - av1_write_coeffs_mb(cm, x, w, plane); -#else // CONFIG_LV_MAP -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - TX_TYPE tx_type = av1_get_tx_type(plane ? 
PLANE_TYPE_UV : PLANE_TYPE_Y, - xd, blk_row, blk_col, 0, tx); -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - tx_type, is_inter_block(mbmi), -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - &token_stats); -#endif // CONFIG_LV_MAP - -#else - (void)token_stats; - pack_pvq_tokens(w, x, xd, plane, mbmi->sb_type, tx); -#endif -#if CONFIG_RD_DEBUG - if (is_inter_block(mbmi) && mbmi->sb_type >= BLOCK_8X8 && - rd_token_stats_mismatch(&mbmi->rd_stats, &token_stats, plane)) { - dump_mode_info(m); - assert(0); } -#endif // CONFIG_RD_DEBUG -#endif // CONFIG_VAR_TX - -#if !CONFIG_PVQ && !CONFIG_LV_MAP - assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN); - (*tok)++; -#endif } } -#endif // CONFIG_COEF_INTERLEAVE } -#if CONFIG_MOTION_VAR && NC_MODE_INFO -static void write_tokens_sb(AV1_COMP *cpi, const TileInfo *const tile, - aom_writer *w, const TOKENEXTRA **tok, - const TOKENEXTRA *const tok_end, int mi_row, - int mi_col, BLOCK_SIZE bsize) { - const AV1_COMMON *const cm = &cpi->common; - const int hbs = mi_size_wide[bsize] / 2; - PARTITION_TYPE partition; - BLOCK_SIZE subsize; -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif - - if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - - partition = get_partition(cm, mi_row, mi_col, bsize); - subsize = get_subsize(bsize, partition); - - if (subsize < BLOCK_8X8 && !unify_bsize) { - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); - } else { - switch (partition) { - case PARTITION_NONE: - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); - break; - case PARTITION_HORZ: - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); - if (mi_row + hbs < cm->mi_rows) - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); - break; - case PARTITION_VERT: - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); - if (mi_col + hbs < cm->mi_cols) - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); - break; - case PARTITION_SPLIT: - write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize); - write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs, - subsize); - write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col, - subsize); - write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs, - subsize); - break; -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION_TYPES_AB -#error NC_MODE_INFO+MOTION_VAR not yet supported for new HORZ/VERT_AB partitions -#endif - case PARTITION_HORZ_A: - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); - break; - case PARTITION_HORZ_B: - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); - break; - case PARTITION_VERT_A: - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); - break; - case PARTITION_VERT_B: - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); - break; -#endif // CONFIG_EXT_PARTITION_TYPES - 
default: assert(0); +static void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile, + aom_writer *w, const TOKENEXTRA **tok, + const TOKENEXTRA *const tok_end, int mi_row, + int mi_col) { + write_mbmi_b(cpi, tile, w, mi_row, mi_col); + + AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &cpi->td.mb.e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + for (int plane = 0; plane < AOMMIN(2, av1_num_planes(cm)); ++plane) { + const uint8_t palette_size_plane = + mbmi->palette_mode_info.palette_size[plane]; + assert(!mbmi->skip_mode || !palette_size_plane); + if (palette_size_plane > 0) { + assert(mbmi->use_intrabc == 0); + assert(av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type)); + int rows, cols; + av1_get_block_dimensions(mbmi->sb_type, plane, xd, NULL, NULL, &rows, + &cols); + assert(*tok < tok_end); + pack_map_tokens(w, tok, palette_size_plane, rows * cols); } } -} -#endif -static void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile, - aom_writer *w, const TOKENEXTRA **tok, - const TOKENEXTRA *const tok_end, -#if CONFIG_SUPERTX - int supertx_enabled, -#endif - int mi_row, int mi_col) { - write_mbmi_b(cpi, tile, w, -#if CONFIG_SUPERTX - supertx_enabled, -#endif - mi_row, mi_col); + BLOCK_SIZE bsize = mbmi->sb_type; + int is_inter_tx = is_inter_block(mbmi) || is_intrabc_block(mbmi); + int skip = mbmi->skip; + int segment_id = mbmi->segment_id; + if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) && + !(is_inter_tx && skip) && !xd->lossless[segment_id]) { + if (is_inter_tx) { // This implies skip flag is 0. + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0); + const int txbh = tx_size_high_unit[max_tx_size]; + const int txbw = tx_size_wide_unit[max_tx_size]; + const int width = block_size_wide[bsize] >> tx_size_wide_log2[0]; + const int height = block_size_high[bsize] >> tx_size_high_log2[0]; + int idx, idy; + for (idy = 0; idy < height; idy += txbh) + for (idx = 0; idx < width; idx += txbw) + write_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, w); + } else { + write_selected_tx_size(xd, w); + set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, 0, xd); + } + } else { + set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, + skip && is_inter_block(mbmi), xd); + } -#if CONFIG_MOTION_VAR && NC_MODE_INFO - (void)tok; - (void)tok_end; -#else -#if !CONFIG_PVQ && CONFIG_SUPERTX - if (!supertx_enabled) -#endif - write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); -#endif + write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); } static void write_partition(const AV1_COMMON *const cm, const MACROBLOCKD *const xd, int hbs, int mi_row, int mi_col, PARTITION_TYPE p, BLOCK_SIZE bsize, aom_writer *w) { + const int is_partition_point = bsize >= BLOCK_8X8; + + if (!is_partition_point) return; + const int has_rows = (mi_row + hbs) < cm->mi_rows; const int has_cols = (mi_col + hbs) < cm->mi_cols; - const int is_partition_point = bsize >= BLOCK_8X8; - const int ctx = is_partition_point - ? partition_plane_context(xd, mi_row, mi_col, -#if CONFIG_UNPOISON_PARTITION_CTX - has_rows, has_cols, -#endif - bsize) - : 0; + const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize); FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - (void)cm; - if (!is_partition_point) return; + if (!has_rows && !has_cols) { + assert(p == PARTITION_SPLIT); + return; + } if (has_rows && has_cols) { -#if CONFIG_EXT_PARTITION_TYPES - const int num_partition_types = - (mi_width_log2_lookup[bsize] > mi_width_log2_lookup[BLOCK_8X8]) - ? 
EXT_PARTITION_TYPES - : PARTITION_TYPES; -#else - const int num_partition_types = PARTITION_TYPES; -#endif - aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx], num_partition_types); + aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx], + partition_cdf_length(bsize)); } else if (!has_rows && has_cols) { assert(p == PARTITION_SPLIT || p == PARTITION_HORZ); assert(bsize > BLOCK_8X8); aom_cdf_prob cdf[2]; - partition_gather_vert_alike(cdf, ec_ctx->partition_cdf[ctx]); + partition_gather_vert_alike(cdf, ec_ctx->partition_cdf[ctx], bsize); aom_write_cdf(w, p == PARTITION_SPLIT, cdf, 2); - } else if (has_rows && !has_cols) { + } else { + assert(has_rows && !has_cols); assert(p == PARTITION_SPLIT || p == PARTITION_VERT); assert(bsize > BLOCK_8X8); aom_cdf_prob cdf[2]; - partition_gather_horz_alike(cdf, ec_ctx->partition_cdf[ctx]); + partition_gather_horz_alike(cdf, ec_ctx->partition_cdf[ctx], bsize); aom_write_cdf(w, p == PARTITION_SPLIT, cdf, 2); - } else { - assert(p == PARTITION_SPLIT); } } -#if CONFIG_SUPERTX -#define write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \ - mi_row, mi_col, bsize) \ - write_modes_sb(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row, mi_col, \ - bsize) -#else -#define write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \ - mi_row, mi_col, bsize) \ - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, bsize) -#endif // CONFIG_SUPERTX - static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile, aom_writer *const w, const TOKENEXTRA **tok, - const TOKENEXTRA *const tok_end, -#if CONFIG_SUPERTX - int supertx_enabled, -#endif - int mi_row, int mi_col, BLOCK_SIZE bsize) { + const TOKENEXTRA *const tok_end, int mi_row, + int mi_col, BLOCK_SIZE bsize) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; const int hbs = mi_size_wide[bsize] / 2; -#if CONFIG_EXT_PARTITION_TYPES const int quarter_step = mi_size_wide[bsize] / 4; int i; -#if CONFIG_EXT_PARTITION_TYPES_AB - const int qbs = mi_size_wide[bsize] / 4; -#endif // CONFIG_EXT_PARTITION_TYPES_AB -#endif // CONFIG_EXT_PARTITION_TYPES const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize); - const BLOCK_SIZE subsize = get_subsize(bsize, partition); -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif - -#if CONFIG_SUPERTX - const int mi_offset = mi_row * cm->mi_stride + mi_col; - MB_MODE_INFO *mbmi; - const int pack_token = !supertx_enabled; - TX_SIZE supertx_size; -#endif + const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w); -#if CONFIG_SUPERTX - mbmi = &cm->mi_grid_visible[mi_offset]->mbmi; - xd->mi = cm->mi_grid_visible + mi_offset; - set_mi_row_col(xd, tile, mi_row, mi_size_high[bsize], mi_col, - mi_size_wide[bsize], -#if CONFIG_DEPENDENT_HORZTILES - cm->dependent_horz_tiles, -#endif // CONFIG_DEPENDENT_HORZTILES - cm->mi_rows, cm->mi_cols); - if (!supertx_enabled && !frame_is_intra_only(cm) && - partition != PARTITION_NONE && bsize <= MAX_SUPERTX_BLOCK_SIZE && - !xd->lossless[0]) { - aom_prob prob; - supertx_size = max_txsize_lookup[bsize]; - prob = cm->fc->supertx_prob[partition_supertx_context_lookup[partition]] - [supertx_size]; - supertx_enabled = (xd->mi[0]->mbmi.tx_size == supertx_size); - aom_write(w, supertx_enabled, prob); - } -#endif // CONFIG_SUPERTX - if (subsize < BLOCK_8X8 && !unify_bsize) { - 
write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row, - mi_col); - } else { - switch (partition) { - case PARTITION_NONE: - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col); - break; - case PARTITION_HORZ: - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col); - if (mi_row + hbs < cm->mi_rows) - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row + hbs, mi_col); - break; - case PARTITION_VERT: - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col); - if (mi_col + hbs < cm->mi_cols) - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col + hbs); - break; - case PARTITION_SPLIT: - write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col, subsize); - write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col + hbs, subsize); - write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row + hbs, mi_col, subsize); - write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row + hbs, mi_col + hbs, subsize); - break; -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION_TYPES_AB - case PARTITION_HORZ_A: - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row + qbs, mi_col); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row + hbs, mi_col); - break; - case PARTITION_HORZ_B: - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row + hbs, mi_col); - if (mi_row + 3 * qbs < cm->mi_rows) - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row + 3 * qbs, mi_col); - break; - case PARTITION_VERT_A: - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col + qbs); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col + hbs); - break; - case PARTITION_VERT_B: - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col + hbs); - if (mi_col + 3 * qbs < cm->mi_cols) - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col + 3 * qbs); - break; -#else - case PARTITION_HORZ_A: - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col + hbs); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row + hbs, mi_col); - break; - case PARTITION_HORZ_B: - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row + hbs, mi_col); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row + hbs, mi_col + hbs); - break; - case PARTITION_VERT_A: - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row + hbs, mi_col); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col + hbs); - break; - case PARTITION_VERT_B: - 
write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, mi_col + hbs); - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row + hbs, mi_col + hbs); - break; -#endif - case PARTITION_HORZ_4: - for (i = 0; i < 4; ++i) { - int this_mi_row = mi_row + i * quarter_step; - if (i > 0 && this_mi_row >= cm->mi_rows) break; - - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - this_mi_row, mi_col); - } - break; - case PARTITION_VERT_4: - for (i = 0; i < 4; ++i) { - int this_mi_col = mi_col + i * quarter_step; - if (i > 0 && this_mi_col >= cm->mi_cols) break; - - write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, - mi_row, this_mi_col); + const int num_planes = av1_num_planes(cm); + for (int plane = 0; plane < num_planes; ++plane) { + int rcol0, rcol1, rrow0, rrow1, tile_tl_idx; + if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize, + &rcol0, &rcol1, &rrow0, &rrow1, + &tile_tl_idx)) { + const int rstride = cm->rst_info[plane].horz_units_per_tile; + for (int rrow = rrow0; rrow < rrow1; ++rrow) { + for (int rcol = rcol0; rcol < rcol1; ++rcol) { + const int runit_idx = tile_tl_idx + rcol + rrow * rstride; + const RestorationUnitInfo *rui = + &cm->rst_info[plane].unit_info[runit_idx]; + loop_restoration_write_sb_coeffs(cm, xd, rui, w, plane, + cpi->td.counts); } - break; -#endif // CONFIG_EXT_PARTITION_TYPES - default: assert(0); + } } } -#if CONFIG_SUPERTX - if (partition != PARTITION_NONE && supertx_enabled && pack_token) { - int skip; - const int bsw = mi_size_wide[bsize]; - const int bsh = mi_size_high[bsize]; - - xd->mi = cm->mi_grid_visible + mi_offset; - supertx_size = mbmi->tx_size; - set_mi_row_col(xd, tile, mi_row, bsh, mi_col, bsw, -#if CONFIG_DEPENDENT_HORZTILES - cm->dependent_horz_tiles, -#endif // CONFIG_DEPENDENT_HORZTILES - cm->mi_rows, cm->mi_cols); - assert(IMPLIES(!cm->seg.enabled, mbmi->segment_id_supertx == 0)); - assert(mbmi->segment_id_supertx < MAX_SEGMENTS); - - skip = write_skip(cm, xd, mbmi->segment_id_supertx, xd->mi[0], w); - - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w); + switch (partition) { + case PARTITION_NONE: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + break; + case PARTITION_HORZ: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + if (mi_row + hbs < cm->mi_rows) + write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); + break; + case PARTITION_VERT: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + if (mi_col + hbs < cm->mi_cols) + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); + break; + case PARTITION_SPLIT: + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize); + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs, subsize); + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col, subsize); + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs, + subsize); + break; + case PARTITION_HORZ_A: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); + break; + case PARTITION_HORZ_B: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row + 
hbs, mi_col + hbs); + break; + case PARTITION_VERT_A: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); + break; + case PARTITION_VERT_B: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); + break; + case PARTITION_HORZ_4: + for (i = 0; i < 4; ++i) { + int this_mi_row = mi_row + i * quarter_step; + if (i > 0 && this_mi_row >= cm->mi_rows) break; -#if CONFIG_EXT_TX - if (get_ext_tx_types(supertx_size, bsize, 1, cm->reduced_tx_set_used) > 1 && - !skip) { - const int eset = - get_ext_tx_set(supertx_size, bsize, 1, cm->reduced_tx_set_used); - const int tx_set_type = - get_ext_tx_set_type(supertx_size, bsize, 1, cm->reduced_tx_set_used); - if (eset > 0) { - aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][mbmi->tx_type], - ec_ctx->inter_ext_tx_cdf[eset][supertx_size], - av1_num_ext_tx_set[tx_set_type]); + write_modes_b(cpi, tile, w, tok, tok_end, this_mi_row, mi_col); } - } -#else - if (supertx_size < TX_32X32 && !skip) { - aom_write_symbol(w, mbmi->tx_type, ec_ctx->inter_ext_tx_cdf[supertx_size], - TX_TYPES); - } -#endif // CONFIG_EXT_TX + break; + case PARTITION_VERT_4: + for (i = 0; i < 4; ++i) { + int this_mi_col = mi_col + i * quarter_step; + if (i > 0 && this_mi_col >= cm->mi_cols) break; - if (!skip) { - assert(*tok < tok_end); - for (int plane = 0; plane < MAX_MB_PLANE; ++plane) { -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - TX_TYPE tx_type = av1_get_tx_type(plane ? PLANE_TYPE_UV : PLANE_TYPE_Y, - xd, blk_row, blk_col, block, tx_size); -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - const struct macroblockd_plane *const pd = &xd->plane[plane]; - const int mbmi_txb_size = txsize_to_bsize[mbmi->tx_size]; - const BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi_txb_size, pd); - - const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); - const int max_blocks_high = max_block_high(xd, plane_bsize, plane); - - int row, col; - const TX_SIZE tx = av1_get_tx_size(plane, xd); - BLOCK_SIZE txb_size = txsize_to_bsize[tx]; - - const int stepr = tx_size_high_unit[txb_size]; - const int stepc = tx_size_wide_unit[txb_size]; - - TOKEN_STATS token_stats; - token_stats.cost = 0; - for (row = 0; row < max_blocks_high; row += stepr) - for (col = 0; col < max_blocks_wide; col += stepc) - pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - tx_type, is_inter_block(mbmi), -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - &token_stats); - assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN); - (*tok)++; + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, this_mi_col); } - } -#if CONFIG_VAR_TX - xd->above_txfm_context = cm->above_txfm_context + mi_col; - xd->left_txfm_context = - xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); - set_txfm_ctxs(xd->mi[0]->mbmi.tx_size, bsw, bsh, skip, xd); -#endif + break; + default: assert(0); } -#endif // CONFIG_SUPERTX -// update partition context -#if CONFIG_EXT_PARTITION_TYPES + // update partition context update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); -#else - if (bsize >= BLOCK_8X8 && - (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) - update_partition_context(xd, mi_row, mi_col, subsize, bsize); -#endif // CONFIG_EXT_PARTITION_TYPES - -#if CONFIG_LPF_SB - // send filter level for each 
superblock (64x64) - if (bsize == cm->sb_size) { - if (mi_row == 0 && mi_col == 0) { - aom_write_literal(w, cm->mi_grid_visible[0]->mbmi.filt_lvl, 6); - cm->mi_grid_visible[0]->mbmi.reuse_sb_lvl = 0; - cm->mi_grid_visible[0]->mbmi.delta = 0; - cm->mi_grid_visible[0]->mbmi.sign = 0; - } else { - int prev_mi_row, prev_mi_col; - if (mi_col - MAX_MIB_SIZE < 0) { - prev_mi_row = mi_row - MAX_MIB_SIZE; - prev_mi_col = mi_col; - } else { - prev_mi_row = mi_row; - prev_mi_col = mi_col - MAX_MIB_SIZE; - } - MB_MODE_INFO *curr_mbmi = - &cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi; - MB_MODE_INFO *prev_mbmi = - &cm->mi_grid_visible[prev_mi_row * cm->mi_stride + prev_mi_col]->mbmi; - - const uint8_t curr_lvl = curr_mbmi->filt_lvl; - const uint8_t prev_lvl = prev_mbmi->filt_lvl; - - const int reuse_prev_lvl = curr_lvl == prev_lvl; - const int reuse_ctx = prev_mbmi->reuse_sb_lvl; - curr_mbmi->reuse_sb_lvl = reuse_prev_lvl; - aom_write_symbol(w, reuse_prev_lvl, - xd->tile_ctx->lpf_reuse_cdf[reuse_ctx], 2); - - if (reuse_prev_lvl) { - curr_mbmi->delta = 0; - curr_mbmi->sign = 0; - } else { - const unsigned int delta = abs(curr_lvl - prev_lvl) / LPF_STEP; - const int delta_ctx = prev_mbmi->delta; - curr_mbmi->delta = delta; - aom_write_symbol(w, delta, xd->tile_ctx->lpf_delta_cdf[delta_ctx], - DELTA_RANGE); - - if (delta) { - const int sign = curr_lvl > prev_lvl; - const int sign_ctx = prev_mbmi->sign; - curr_mbmi->sign = sign; - aom_write_symbol(w, sign, - xd->tile_ctx->lpf_sign_cdf[reuse_ctx][sign_ctx], 2); - } else { - curr_mbmi->sign = 0; - } - } - } - } -#endif - -#if CONFIG_CDEF - if (bsize == cm->sb_size && cm->cdef_bits != 0 && !cm->all_lossless) { - int width_step = mi_size_wide[BLOCK_64X64]; - int height_step = mi_size_high[BLOCK_64X64]; - int width, height; - for (height = 0; (height < mi_size_high[cm->sb_size]) && - (mi_row + height < cm->mi_rows); - height += height_step) { - for (width = 0; (width < mi_size_wide[cm->sb_size]) && - (mi_col + width < cm->mi_cols); - width += width_step) { - if (!sb_all_skip(cm, mi_row + height, mi_col + width)) - aom_write_literal( - w, - cm->mi_grid_visible[(mi_row + height) * cm->mi_stride + - (mi_col + width)] - ->mbmi.cdef_strength, - cm->cdef_bits); - } - } - } -#endif -#if CONFIG_LOOP_RESTORATION - for (int plane = 0; plane < MAX_MB_PLANE; ++plane) { - int rcol0, rcol1, rrow0, rrow1, nhtiles; - if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize, - &rcol0, &rcol1, &rrow0, &rrow1, - &nhtiles)) { - for (int rrow = rrow0; rrow < rrow1; ++rrow) { - for (int rcol = rcol0; rcol < rcol1; ++rcol) { - int rtile_idx = rcol + rrow * nhtiles; - loop_restoration_write_sb_coeffs(cm, xd, w, plane, rtile_idx); - } - } - } - } -#endif } static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile, @@ -3272,78 +1705,46 @@ static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile, const int mi_col_end = tile->mi_col_end; int mi_row, mi_col; -#if CONFIG_DEPENDENT_HORZTILES - if (!cm->dependent_horz_tiles || mi_row_start == 0 || - tile->tg_horz_boundary) { - av1_zero_above_context(cm, mi_col_start, mi_col_end); - } -#else - av1_zero_above_context(cm, mi_col_start, mi_col_end); -#endif -#if CONFIG_PVQ - assert(cpi->td.mb.pvq_q->curr_pos == 0); -#endif + av1_zero_above_context(cm, mi_col_start, mi_col_end, tile->tile_row); + av1_init_above_context(cm, xd, tile->tile_row); + if (cpi->common.delta_q_present_flag) { - xd->prev_qindex = cpi->common.base_qindex; -#if CONFIG_EXT_DELTA_Q + xd->current_qindex = 
cpi->common.base_qindex; if (cpi->common.delta_lf_present_flag) { -#if CONFIG_LOOPFILTER_LEVEL - for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) - xd->prev_delta_lf[lf_id] = 0; -#endif // CONFIG_LOOPFILTER_LEVEL - xd->prev_delta_lf_from_base = 0; + av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); } -#endif // CONFIG_EXT_DELTA_Q } - for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += cm->mib_size) { + for (mi_row = mi_row_start; mi_row < mi_row_end; + mi_row += cm->seq_params.mib_size) { av1_zero_left_context(xd); - for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += cm->mib_size) { - write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, 0, mi_row, mi_col, - cm->sb_size); -#if CONFIG_MOTION_VAR && NC_MODE_INFO - write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, cm->sb_size); -#endif + for (mi_col = mi_col_start; mi_col < mi_col_end; + mi_col += cm->seq_params.mib_size) { + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, + cm->seq_params.sb_size); } } -#if CONFIG_PVQ - // Check that the number of PVQ blocks encoded and written to the bitstream - // are the same - assert(cpi->td.mb.pvq_q->curr_pos == cpi->td.mb.pvq_q->last_pos); - // Reset curr_pos in case we repack the bitstream - cpi->td.mb.pvq_q->curr_pos = 0; -#endif } -#if CONFIG_LOOP_RESTORATION static void encode_restoration_mode(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { - int p; - RestorationInfo *rsi = &cm->rst_info[0]; - switch (rsi->frame_restoration_type) { - case RESTORE_NONE: - aom_wb_write_bit(wb, 0); - aom_wb_write_bit(wb, 0); - break; - case RESTORE_WIENER: - aom_wb_write_bit(wb, 1); - aom_wb_write_bit(wb, 0); - break; - case RESTORE_SGRPROJ: - aom_wb_write_bit(wb, 1); - aom_wb_write_bit(wb, 1); - break; - case RESTORE_SWITCHABLE: - aom_wb_write_bit(wb, 0); - aom_wb_write_bit(wb, 1); - break; - default: assert(0); - } - for (p = 1; p < MAX_MB_PLANE; ++p) { - rsi = &cm->rst_info[p]; + assert(!cm->all_lossless); + if (!cm->seq_params.enable_restoration) return; + if (cm->allow_intrabc) return; + const int num_planes = av1_num_planes(cm); + int all_none = 1, chroma_none = 1; + for (int p = 0; p < num_planes; ++p) { + RestorationInfo *rsi = &cm->rst_info[p]; + if (rsi->frame_restoration_type != RESTORE_NONE) { + all_none = 0; + chroma_none &= p == 0; + } switch (rsi->frame_restoration_type) { - case RESTORE_NONE: aom_wb_write_bit(wb, 0); break; + case RESTORE_NONE: + aom_wb_write_bit(wb, 0); + aom_wb_write_bit(wb, 0); + break; case RESTORE_WIENER: aom_wb_write_bit(wb, 1); aom_wb_write_bit(wb, 0); @@ -3352,40 +1753,52 @@ static void encode_restoration_mode(AV1_COMMON *cm, aom_wb_write_bit(wb, 1); aom_wb_write_bit(wb, 1); break; + case RESTORE_SWITCHABLE: + aom_wb_write_bit(wb, 0); + aom_wb_write_bit(wb, 1); + break; default: assert(0); } } - if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE || - cm->rst_info[1].frame_restoration_type != RESTORE_NONE || - cm->rst_info[2].frame_restoration_type != RESTORE_NONE) { - rsi = &cm->rst_info[0]; - aom_wb_write_bit(wb, rsi->restoration_tilesize != RESTORATION_TILESIZE_MAX); - if (rsi->restoration_tilesize != RESTORATION_TILESIZE_MAX) { - aom_wb_write_bit( - wb, rsi->restoration_tilesize != (RESTORATION_TILESIZE_MAX >> 1)); + if (!all_none) { + assert(cm->seq_params.sb_size == BLOCK_64X64 || + cm->seq_params.sb_size == BLOCK_128X128); + const int sb_size = cm->seq_params.sb_size == BLOCK_128X128 ? 
128 : 64; + + RestorationInfo *rsi = &cm->rst_info[0]; + + assert(rsi->restoration_unit_size >= sb_size); + assert(RESTORATION_UNITSIZE_MAX == 256); + + if (sb_size == 64) { + aom_wb_write_bit(wb, rsi->restoration_unit_size > 64); + } + if (rsi->restoration_unit_size > 64) { + aom_wb_write_bit(wb, rsi->restoration_unit_size > 128); } } - int s = AOMMIN(cm->subsampling_x, cm->subsampling_y); - if (s && (cm->rst_info[1].frame_restoration_type != RESTORE_NONE || - cm->rst_info[2].frame_restoration_type != RESTORE_NONE)) { - aom_wb_write_bit(wb, - cm->rst_info[1].restoration_tilesize != - cm->rst_info[0].restoration_tilesize); - assert(cm->rst_info[1].restoration_tilesize == - cm->rst_info[0].restoration_tilesize || - cm->rst_info[1].restoration_tilesize == - (cm->rst_info[0].restoration_tilesize >> s)); - assert(cm->rst_info[2].restoration_tilesize == - cm->rst_info[1].restoration_tilesize); - } else if (!s) { - assert(cm->rst_info[1].restoration_tilesize == - cm->rst_info[0].restoration_tilesize); - assert(cm->rst_info[2].restoration_tilesize == - cm->rst_info[1].restoration_tilesize); + + if (num_planes > 1) { + int s = AOMMIN(cm->subsampling_x, cm->subsampling_y); + if (s && !chroma_none) { + aom_wb_write_bit(wb, cm->rst_info[1].restoration_unit_size != + cm->rst_info[0].restoration_unit_size); + assert(cm->rst_info[1].restoration_unit_size == + cm->rst_info[0].restoration_unit_size || + cm->rst_info[1].restoration_unit_size == + (cm->rst_info[0].restoration_unit_size >> s)); + assert(cm->rst_info[2].restoration_unit_size == + cm->rst_info[1].restoration_unit_size); + } else if (!s) { + assert(cm->rst_info[1].restoration_unit_size == + cm->rst_info[0].restoration_unit_size); + assert(cm->rst_info[2].restoration_unit_size == + cm->rst_info[1].restoration_unit_size); + } } } -static void write_wiener_filter(int wiener_win, WienerInfo *wiener_info, +static void write_wiener_filter(int wiener_win, const WienerInfo *wiener_info, WienerInfo *ref_wiener_info, aom_writer *wb) { if (wiener_win == WIENER_WIN) aom_write_primitive_refsubexpfin( @@ -3428,78 +1841,106 @@ static void write_wiener_filter(int wiener_win, WienerInfo *wiener_info, memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info)); } -static void write_sgrproj_filter(SgrprojInfo *sgrproj_info, +static void write_sgrproj_filter(const SgrprojInfo *sgrproj_info, SgrprojInfo *ref_sgrproj_info, aom_writer *wb) { aom_write_literal(wb, sgrproj_info->ep, SGRPROJ_PARAMS_BITS); - aom_write_primitive_refsubexpfin(wb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, - SGRPROJ_PRJ_SUBEXP_K, - ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, - sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0); - aom_write_primitive_refsubexpfin(wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, - SGRPROJ_PRJ_SUBEXP_K, - ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, - sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1); + const sgr_params_type *params = &sgr_params[sgrproj_info->ep]; + + if (params->r[0] == 0) { + assert(sgrproj_info->xqd[0] == 0); + aom_write_primitive_refsubexpfin( + wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, + sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1); + } else if (params->r[1] == 0) { + aom_write_primitive_refsubexpfin( + wb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, + sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0); + } else { + aom_write_primitive_refsubexpfin( + wb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, + 
ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, + sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0); + aom_write_primitive_refsubexpfin( + wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, + sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1); + } + memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info)); } static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm, MACROBLOCKD *xd, + const RestorationUnitInfo *rui, aom_writer *const w, int plane, - int rtile_idx) { + FRAME_COUNTS *counts) { const RestorationInfo *rsi = cm->rst_info + plane; - if (rsi->frame_restoration_type == RESTORE_NONE) return; + RestorationType frame_rtype = rsi->frame_restoration_type; + if (frame_rtype == RESTORE_NONE) return; + + (void)counts; + assert(!cm->all_lossless); const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN; WienerInfo *wiener_info = xd->wiener_info + plane; SgrprojInfo *sgrproj_info = xd->sgrproj_info + plane; + RestorationType unit_rtype = rui->restoration_type; - if (rsi->frame_restoration_type == RESTORE_SWITCHABLE) { - assert(plane == 0); - av1_write_token( - w, av1_switchable_restore_tree, cm->fc->switchable_restore_prob, - &switchable_restore_encodings[rsi->restoration_type[rtile_idx]]); - if (rsi->restoration_type[rtile_idx] == RESTORE_WIENER) { - write_wiener_filter(wiener_win, &rsi->wiener_info[rtile_idx], wiener_info, - w); - } else if (rsi->restoration_type[rtile_idx] == RESTORE_SGRPROJ) { - write_sgrproj_filter(&rsi->sgrproj_info[rtile_idx], sgrproj_info, w); + if (frame_rtype == RESTORE_SWITCHABLE) { + aom_write_symbol(w, unit_rtype, xd->tile_ctx->switchable_restore_cdf, + RESTORE_SWITCHABLE_TYPES); +#if CONFIG_ENTROPY_STATS + ++counts->switchable_restore[unit_rtype]; +#endif + switch (unit_rtype) { + case RESTORE_WIENER: + write_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, w); + break; + case RESTORE_SGRPROJ: + write_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, w); + break; + default: assert(unit_rtype == RESTORE_NONE); break; } - } else if (rsi->frame_restoration_type == RESTORE_WIENER) { - aom_write(w, rsi->restoration_type[rtile_idx] != RESTORE_NONE, - RESTORE_NONE_WIENER_PROB); - if (rsi->restoration_type[rtile_idx] != RESTORE_NONE) { - write_wiener_filter(wiener_win, &rsi->wiener_info[rtile_idx], wiener_info, - w); + } else if (frame_rtype == RESTORE_WIENER) { + aom_write_symbol(w, unit_rtype != RESTORE_NONE, + xd->tile_ctx->wiener_restore_cdf, 2); +#if CONFIG_ENTROPY_STATS + ++counts->wiener_restore[unit_rtype != RESTORE_NONE]; +#endif + if (unit_rtype != RESTORE_NONE) { + write_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, w); } - } else if (rsi->frame_restoration_type == RESTORE_SGRPROJ) { - aom_write(w, rsi->restoration_type[rtile_idx] != RESTORE_NONE, - RESTORE_NONE_SGRPROJ_PROB); - if (rsi->restoration_type[rtile_idx] != RESTORE_NONE) { - write_sgrproj_filter(&rsi->sgrproj_info[rtile_idx], sgrproj_info, w); + } else if (frame_rtype == RESTORE_SGRPROJ) { + aom_write_symbol(w, unit_rtype != RESTORE_NONE, + xd->tile_ctx->sgrproj_restore_cdf, 2); +#if CONFIG_ENTROPY_STATS + ++counts->sgrproj_restore[unit_rtype != RESTORE_NONE]; +#endif + if (unit_rtype != RESTORE_NONE) { + write_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, w); } } } -#endif // CONFIG_LOOP_RESTORATION - static void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { + assert(!cm->coded_lossless); + if (cm->allow_intrabc) return; + const int num_planes = av1_num_planes(cm); int i; struct 
loopfilter *lf = &cm->lf; -// Encode the loop filter level and type -#if !CONFIG_LPF_SB -#if CONFIG_LOOPFILTER_LEVEL + // Encode the loop filter level and type aom_wb_write_literal(wb, lf->filter_level[0], 6); aom_wb_write_literal(wb, lf->filter_level[1], 6); - if (lf->filter_level[0] || lf->filter_level[1]) { - aom_wb_write_literal(wb, lf->filter_level_u, 6); - aom_wb_write_literal(wb, lf->filter_level_v, 6); - } -#else - aom_wb_write_literal(wb, lf->filter_level, 6); -#endif // CONFIG_LOOPFILTER_LEVEL -#endif // CONFIG_LPF_SB + if (num_planes > 1) { + if (lf->filter_level[0] || lf->filter_level[1]) { + aom_wb_write_literal(wb, lf->filter_level_u, 6); + aom_wb_write_literal(wb, lf->filter_level_v, 6); + } + } aom_wb_write_literal(wb, lf->sharpness_level, 3); // Write out loop filter deltas applied at the MB level based on mode or @@ -3508,48 +1949,58 @@ static void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { if (lf->mode_ref_delta_enabled) { aom_wb_write_bit(wb, lf->mode_ref_delta_update); + if (lf->mode_ref_delta_update) { - for (i = 0; i < TOTAL_REFS_PER_FRAME; i++) { + const int prime_idx = cm->primary_ref_frame; + const int buf_idx = + prime_idx == PRIMARY_REF_NONE ? -1 : cm->frame_refs[prime_idx].idx; + int8_t last_ref_deltas[REF_FRAMES]; + if (prime_idx == PRIMARY_REF_NONE || buf_idx < 0) { + av1_set_default_ref_deltas(last_ref_deltas); + } else { + memcpy(last_ref_deltas, cm->buffer_pool->frame_bufs[buf_idx].ref_deltas, + REF_FRAMES); + } + for (i = 0; i < REF_FRAMES; i++) { const int delta = lf->ref_deltas[i]; - const int changed = delta != lf->last_ref_deltas[i]; + const int changed = delta != last_ref_deltas[i]; aom_wb_write_bit(wb, changed); - if (changed) { - lf->last_ref_deltas[i] = delta; - aom_wb_write_inv_signed_literal(wb, delta, 6); - } + if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6); } + int8_t last_mode_deltas[MAX_MODE_LF_DELTAS]; + if (prime_idx == PRIMARY_REF_NONE || buf_idx < 0) { + av1_set_default_mode_deltas(last_mode_deltas); + } else { + memcpy(last_mode_deltas, + cm->buffer_pool->frame_bufs[buf_idx].mode_deltas, + MAX_MODE_LF_DELTAS); + } for (i = 0; i < MAX_MODE_LF_DELTAS; i++) { const int delta = lf->mode_deltas[i]; - const int changed = delta != lf->last_mode_deltas[i]; + const int changed = delta != last_mode_deltas[i]; aom_wb_write_bit(wb, changed); - if (changed) { - lf->last_mode_deltas[i] = delta; - aom_wb_write_inv_signed_literal(wb, delta, 6); - } + if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6); } } } } -#if CONFIG_CDEF static void encode_cdef(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { + assert(!cm->coded_lossless); + if (!cm->seq_params.enable_cdef) return; + if (cm->allow_intrabc) return; + const int num_planes = av1_num_planes(cm); int i; -#if CONFIG_CDEF_SINGLEPASS aom_wb_write_literal(wb, cm->cdef_pri_damping - 3, 2); assert(cm->cdef_pri_damping == cm->cdef_sec_damping); -#else - aom_wb_write_literal(wb, cm->cdef_pri_damping - 5, 1); - aom_wb_write_literal(wb, cm->cdef_sec_damping - 3, 2); -#endif aom_wb_write_literal(wb, cm->cdef_bits, 2); for (i = 0; i < cm->nb_cdef_strengths; i++) { aom_wb_write_literal(wb, cm->cdef_strengths[i], CDEF_STRENGTH_BITS); - if (cm->subsampling_x == cm->subsampling_y) + if (num_planes > 1) aom_wb_write_literal(wb, cm->cdef_uv_strengths[i], CDEF_STRENGTH_BITS); } } -#endif static void write_delta_q(struct aom_write_bit_buffer *wb, int delta_q) { if (delta_q != 0) { @@ -3562,63 +2013,71 @@ static void write_delta_q(struct aom_write_bit_buffer *wb, int 
delta_q) { static void encode_quantization(const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) { + const int num_planes = av1_num_planes(cm); + aom_wb_write_literal(wb, cm->base_qindex, QINDEX_BITS); write_delta_q(wb, cm->y_dc_delta_q); - write_delta_q(wb, cm->uv_dc_delta_q); - write_delta_q(wb, cm->uv_ac_delta_q); -#if CONFIG_AOM_QM + if (num_planes > 1) { + int diff_uv_delta = (cm->u_dc_delta_q != cm->v_dc_delta_q) || + (cm->u_ac_delta_q != cm->v_ac_delta_q); + if (cm->separate_uv_delta_q) aom_wb_write_bit(wb, diff_uv_delta); + write_delta_q(wb, cm->u_dc_delta_q); + write_delta_q(wb, cm->u_ac_delta_q); + if (diff_uv_delta) { + write_delta_q(wb, cm->v_dc_delta_q); + write_delta_q(wb, cm->v_ac_delta_q); + } + } aom_wb_write_bit(wb, cm->using_qmatrix); if (cm->using_qmatrix) { - aom_wb_write_literal(wb, cm->min_qmlevel, QM_LEVEL_BITS); - aom_wb_write_literal(wb, cm->max_qmlevel, QM_LEVEL_BITS); + aom_wb_write_literal(wb, cm->qm_y, QM_LEVEL_BITS); + aom_wb_write_literal(wb, cm->qm_u, QM_LEVEL_BITS); + if (!cm->separate_uv_delta_q) + assert(cm->qm_u == cm->qm_v); + else + aom_wb_write_literal(wb, cm->qm_v, QM_LEVEL_BITS); } -#endif } static void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd, struct aom_write_bit_buffer *wb) { int i, j; - const struct segmentation *seg = &cm->seg; + struct segmentation *seg = &cm->seg; aom_wb_write_bit(wb, seg->enabled); if (!seg->enabled) return; - // Segmentation map - if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) { - aom_wb_write_bit(wb, seg->update_map); - } else { + // Write update flags + if (cm->primary_ref_frame == PRIMARY_REF_NONE) { assert(seg->update_map == 1); - } - if (seg->update_map) { - // Select the coding strategy (temporal or spatial) - av1_choose_segmap_coding_method(cm, xd); - - // Write out the chosen coding method. 
- if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) { + seg->temporal_update = 0; + assert(seg->update_data == 1); + } else { + aom_wb_write_bit(wb, seg->update_map); + if (seg->update_map) { + // Select the coding strategy (temporal or spatial) + av1_choose_segmap_coding_method(cm, xd); aom_wb_write_bit(wb, seg->temporal_update); - } else { - assert(seg->temporal_update == 0); } + aom_wb_write_bit(wb, seg->update_data); } // Segmentation data - aom_wb_write_bit(wb, seg->update_data); if (seg->update_data) { - aom_wb_write_bit(wb, seg->abs_delta); - for (i = 0; i < MAX_SEGMENTS; i++) { for (j = 0; j < SEG_LVL_MAX; j++) { const int active = segfeature_active(seg, i, j); aom_wb_write_bit(wb, active); if (active) { - const int data = get_segdata(seg, i, j); const int data_max = av1_seg_feature_data_max(j); + const int data_min = -data_max; + const int ubits = get_unsigned_bits(data_max); + const int data = clamp(get_segdata(seg, i, j), data_min, data_max); if (av1_is_segfeature_signed(j)) { - encode_unsigned_max(wb, abs(data), data_max); - aom_wb_write_bit(wb, data < 0); + aom_wb_write_inv_signed_literal(wb, data, ubits); } else { - encode_unsigned_max(wb, data, data_max); + aom_wb_write_literal(wb, data, ubits); } } } @@ -3628,26 +2087,11 @@ static void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd, static void write_tx_mode(AV1_COMMON *cm, TX_MODE *mode, struct aom_write_bit_buffer *wb) { - if (cm->all_lossless) { + if (cm->coded_lossless) { *mode = ONLY_4X4; return; } -#if CONFIG_VAR_TX_NO_TX_MODE - (void)wb; - *mode = TX_MODE_SELECT; - return; -#else -#if CONFIG_TX64X64 - aom_wb_write_bit(wb, *mode == TX_MODE_SELECT); - if (*mode != TX_MODE_SELECT) { - aom_wb_write_literal(wb, AOMMIN(*mode, ALLOW_32X32), 2); - if (*mode >= ALLOW_32X32) aom_wb_write_bit(wb, *mode == ALLOW_64X64); - } -#else aom_wb_write_bit(wb, *mode == TX_MODE_SELECT); - if (*mode != TX_MODE_SELECT) aom_wb_write_literal(wb, *mode, 2); -#endif // CONFIG_TX64X64 -#endif // CONFIG_VAR_TX_NO_TX_MODE } static void write_frame_interp_filter(InterpFilter filter, @@ -3672,14 +2116,7 @@ static void fix_interp_filter(AV1_COMMON *cm, FRAME_COUNTS *counts) { // Only one filter is used. 
So set the filter at frame level for (i = 0; i < SWITCHABLE_FILTERS; ++i) { if (count[i]) { -#if CONFIG_MOTION_VAR && (CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION) -#if CONFIG_WARPED_MOTION - if (i == EIGHTTAP_REGULAR || WARP_WM_NEIGHBORS_WITH_OBMC) -#else - if (i == EIGHTTAP_REGULAR || WARP_GM_NEIGHBORS_WITH_OBMC) -#endif // CONFIG_WARPED_MOTION -#endif // CONFIG_MOTION_VAR && (CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION) - cm->interp_filter = i; + if (i == EIGHTTAP_REGULAR) cm->interp_filter = i; break; } } @@ -3687,8 +2124,6 @@ static void fix_interp_filter(AV1_COMMON *cm, FRAME_COUNTS *counts) { } } -#if CONFIG_MAX_TILE - // Same function as write_uniform but writing to uncompresses header wb static void wb_write_uniform(struct aom_write_bit_buffer *wb, int n, int v) { const int l = get_unsigned_bits(n); @@ -3704,10 +2139,10 @@ static void wb_write_uniform(struct aom_write_bit_buffer *wb, int n, int v) { static void write_tile_info_max_tile(const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) { - int width_mi = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2); - int height_mi = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2); - int width_sb = width_mi >> MAX_MIB_SIZE_LOG2; - int height_sb = height_mi >> MAX_MIB_SIZE_LOG2; + int width_mi = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2); + int height_mi = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2); + int width_sb = width_mi >> cm->seq_params.mib_size_log2; + int height_sb = height_mi >> cm->seq_params.mib_size_log2; int size_sb, i; aom_wb_write_bit(wb, cm->uniform_tile_spacing_flag); @@ -3736,7 +2171,8 @@ static void write_tile_info_max_tile(const AV1_COMMON *const cm, // columns for (i = 0; i < cm->tile_cols; i++) { size_sb = cm->tile_col_start_sb[i + 1] - cm->tile_col_start_sb[i]; - wb_write_uniform(wb, AOMMIN(width_sb, MAX_TILE_WIDTH_SB), size_sb - 1); + wb_write_uniform(wb, AOMMIN(width_sb, cm->max_tile_width_sb), + size_sb - 1); width_sb -= size_sb; } assert(width_sb == 0); @@ -3751,72 +2187,45 @@ static void write_tile_info_max_tile(const AV1_COMMON *const cm, assert(height_sb == 0); } } -#endif static void write_tile_info(const AV1_COMMON *const cm, + struct aom_write_bit_buffer *saved_wb, struct aom_write_bit_buffer *wb) { -#if CONFIG_EXT_TILE - if (cm->large_scale_tile) { - const int tile_width = - ALIGN_POWER_OF_TWO(cm->tile_width, cm->mib_size_log2) >> - cm->mib_size_log2; - const int tile_height = - ALIGN_POWER_OF_TWO(cm->tile_height, cm->mib_size_log2) >> - cm->mib_size_log2; - - assert(tile_width > 0); - assert(tile_height > 0); - -// Write the tile sizes -#if CONFIG_EXT_PARTITION - if (cm->sb_size == BLOCK_128X128) { - assert(tile_width <= 32); - assert(tile_height <= 32); - aom_wb_write_literal(wb, tile_width - 1, 5); - aom_wb_write_literal(wb, tile_height - 1, 5); - } else { -#endif // CONFIG_EXT_PARTITION - assert(tile_width <= 64); - assert(tile_height <= 64); - aom_wb_write_literal(wb, tile_width - 1, 6); - aom_wb_write_literal(wb, tile_height - 1, 6); -#if CONFIG_EXT_PARTITION - } -#endif // CONFIG_EXT_PARTITION - } else { -#endif // CONFIG_EXT_TILE - -#if CONFIG_MAX_TILE - write_tile_info_max_tile(cm, wb); -#else - int min_log2_tile_cols, max_log2_tile_cols, ones; - av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); + write_tile_info_max_tile(cm, wb); - // columns - ones = cm->log2_tile_cols - min_log2_tile_cols; - while (ones--) aom_wb_write_bit(wb, 1); + *saved_wb = *wb; + if (cm->tile_rows * cm->tile_cols > 1) { + // tile id used for cdf 
update + aom_wb_write_literal(wb, 0, cm->log2_tile_cols + cm->log2_tile_rows); + // Number of bytes in tile size - 1 + aom_wb_write_literal(wb, 3, 2); + } +} - if (cm->log2_tile_cols < max_log2_tile_cols) aom_wb_write_bit(wb, 0); +static void write_ext_tile_info(const AV1_COMMON *const cm, + struct aom_write_bit_buffer *saved_wb, + struct aom_write_bit_buffer *wb) { + // This information is stored as a separate byte. + int mod = wb->bit_offset % CHAR_BIT; + if (mod > 0) aom_wb_write_literal(wb, 0, CHAR_BIT - mod); + assert(aom_wb_is_byte_aligned(wb)); - // rows - aom_wb_write_bit(wb, cm->log2_tile_rows != 0); - if (cm->log2_tile_rows != 0) aom_wb_write_bit(wb, cm->log2_tile_rows != 1); -#endif -#if CONFIG_DEPENDENT_HORZTILES - if (cm->tile_rows > 1) aom_wb_write_bit(wb, cm->dependent_horz_tiles); -#endif -#if CONFIG_EXT_TILE + *saved_wb = *wb; + if (cm->tile_rows * cm->tile_cols > 1) { + // Note that the last item in the uncompressed header is the data + // describing tile configuration. + // Number of bytes in tile column size - 1 + aom_wb_write_literal(wb, 0, 2); + // Number of bytes in tile size - 1 + aom_wb_write_literal(wb, 0, 2); } -#endif // CONFIG_EXT_TILE - -#if CONFIG_LOOPFILTERING_ACROSS_TILES - aom_wb_write_bit(wb, cm->loop_filter_across_tiles_enabled); -#endif // CONFIG_LOOPFILTERING_ACROSS_TILES } -#if CONFIG_EXT_REFS #if USE_GF16_MULTI_LAYER static int get_refresh_mask_gf16(AV1_COMP *cpi) { + if (cpi->common.frame_type == KEY_FRAME || frame_is_sframe(&cpi->common)) + return 0xFF; + int refresh_mask = 0; if (cpi->refresh_last_frame || cpi->refresh_golden_frame || @@ -3829,11 +2238,12 @@ static int get_refresh_mask_gf16(AV1_COMP *cpi) { return refresh_mask; } #endif // USE_GF16_MULTI_LAYER -#endif // CONFIG_EXT_REFS static int get_refresh_mask(AV1_COMP *cpi) { + if (cpi->common.frame_type == KEY_FRAME || frame_is_sframe(&cpi->common)) + return 0xFF; + int refresh_mask = 0; -#if CONFIG_EXT_REFS #if USE_GF16_MULTI_LAYER if (cpi->rc.baseline_gf_interval == 16) return get_refresh_mask_gf16(cpi); #endif // USE_GF16_MULTI_LAYER @@ -3847,13 +2257,12 @@ static int get_refresh_mask(AV1_COMP *cpi) { // shifted and become the new virtual indexes for LAST2_FRAME and // LAST3_FRAME. refresh_mask |= - (cpi->refresh_last_frame << cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]); + (cpi->refresh_last_frame << cpi->ref_fb_idx[LAST_REF_FRAMES - 1]); - refresh_mask |= (cpi->refresh_bwd_ref_frame << cpi->bwd_fb_idx); - refresh_mask |= (cpi->refresh_alt2_ref_frame << cpi->alt2_fb_idx); -#else // !CONFIG_EXT_REFS - refresh_mask |= (cpi->refresh_last_frame << cpi->lst_fb_idx); -#endif // CONFIG_EXT_REFS + refresh_mask |= + (cpi->refresh_bwd_ref_frame << cpi->ref_fb_idx[BWDREF_FRAME - 1]); + refresh_mask |= + (cpi->refresh_alt2_ref_frame << cpi->ref_fb_idx[ALTREF2_FRAME - 1]); if (av1_preserve_existing_gf(cpi)) { // We have decided to preserve the previously existing golden frame as our @@ -3866,26 +2275,19 @@ static int get_refresh_mask(AV1_COMP *cpi) { // Note: This is highly specific to the use of ARF as a forward reference, // and this needs to be generalized as other uses are implemented // (like RTC/temporal scalability). 
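[Editor's note] The get_refresh_mask() changes above pack one "refresh this slot" bit per reference buffer, with key frames and S-frames now short-circuiting to 0xFF. A minimal sketch of that packing follows; the slot indexes are hypothetical stand-ins for the encoder's cpi->ref_fb_idx[] mapping, which changes as references are juggled.

#include <stdio.h>

/* Illustrative virtual slot indexes only; the real values come from
 * cpi->ref_fb_idx[] and are not fixed. */
enum { LAST_SLOT = 0, BWD_SLOT = 4, ALT2_SLOT = 5, GOLDEN_SLOT = 6, ALT_SLOT = 7 };

static int build_refresh_mask(int key_or_sframe, int refresh_last,
                              int refresh_bwd, int refresh_alt2,
                              int refresh_golden, int refresh_alt) {
  if (key_or_sframe) return 0xFF; /* key frames and S-frames refresh all 8 slots */
  int mask = 0;
  mask |= refresh_last << LAST_SLOT;
  mask |= refresh_bwd << BWD_SLOT;
  mask |= refresh_alt2 << ALT2_SLOT;
  mask |= refresh_golden << GOLDEN_SLOT;
  mask |= refresh_alt << ALT_SLOT;
  return mask;
}

int main(void) {
  /* Refresh LAST and GOLDEN only: bits 0 and 6 -> 0x41. */
  printf("0x%02x\n", build_refresh_mask(0, 1, 0, 0, 1, 0));
  return 0;
}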
- return refresh_mask | (cpi->refresh_golden_frame << cpi->alt_fb_idx); + return refresh_mask | + (cpi->refresh_golden_frame << cpi->ref_fb_idx[ALTREF_FRAME - 1]); } else { -#if CONFIG_EXT_REFS - const int arf_idx = cpi->alt_fb_idx; -#else // !CONFIG_EXT_REFS - int arf_idx = cpi->alt_fb_idx; - if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - arf_idx = gf_group->arf_update_idx[gf_group->index]; - } -#endif // CONFIG_EXT_REFS - return refresh_mask | (cpi->refresh_golden_frame << cpi->gld_fb_idx) | + const int arf_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1]; + return refresh_mask | + (cpi->refresh_golden_frame << cpi->ref_fb_idx[GOLDEN_FRAME - 1]) | (cpi->refresh_alt_ref_frame << arf_idx); } } -#if CONFIG_EXT_TILE static INLINE int find_identical_tile( const int tile_row, const int tile_col, - TileBufferEnc (*const tile_buffers)[1024]) { + TileBufferEnc (*const tile_buffers)[MAX_TILE_COLS]) { const MV32 candidate_offset[1] = { { 1, 0 } }; const uint8_t *const cur_tile_data = tile_buffers[tile_row][tile_col].data + 4; @@ -3933,329 +2335,10 @@ static INLINE int find_identical_tile( // No identical tile found return 0; } -#endif // CONFIG_EXT_TILE - -#if !CONFIG_OBU || CONFIG_EXT_TILE -static uint32_t write_tiles(AV1_COMP *const cpi, uint8_t *const dst, - unsigned int *max_tile_size, - unsigned int *max_tile_col_size) { - const AV1_COMMON *const cm = &cpi->common; - aom_writer mode_bc; - int tile_row, tile_col; - TOKENEXTRA *(*const tok_buffers)[MAX_TILE_COLS] = cpi->tile_tok; - TileBufferEnc(*const tile_buffers)[MAX_TILE_COLS] = cpi->tile_buffers; - uint32_t total_size = 0; - const int tile_cols = cm->tile_cols; - const int tile_rows = cm->tile_rows; - unsigned int tile_size = 0; - const int have_tiles = tile_cols * tile_rows > 1; - struct aom_write_bit_buffer wb = { dst, 0 }; - const int n_log2_tiles = cm->log2_tile_rows + cm->log2_tile_cols; - uint32_t compressed_hdr_size; - // Fixed size tile groups for the moment - const int num_tg_hdrs = cm->num_tg; - const int tg_size = -#if CONFIG_EXT_TILE - (cm->large_scale_tile) - ? 1 - : -#endif // CONFIG_EXT_TILE - (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs; - int tile_count = 0; - int tg_count = 1; - int tile_size_bytes = 4; - int tile_col_size_bytes; - uint32_t uncompressed_hdr_size = 0; - struct aom_write_bit_buffer tg_params_wb; - struct aom_write_bit_buffer tile_size_bytes_wb; - uint32_t saved_offset; - int mtu_size = cpi->oxcf.mtu; - int curr_tg_data_size = 0; - int hdr_size; - - *max_tile_size = 0; - *max_tile_col_size = 0; - -// All tile size fields are output on 4 bytes. A call to remux_tiles will -// later compact the data if smaller headers are adequate. - -#if CONFIG_EXT_TILE - if (cm->large_scale_tile) { - for (tile_col = 0; tile_col < tile_cols; tile_col++) { - TileInfo tile_info; - const int is_last_col = (tile_col == tile_cols - 1); - const uint32_t col_offset = total_size; - - av1_tile_set_col(&tile_info, cm, tile_col); - - // The last column does not have a column header - if (!is_last_col) total_size += 4; - - for (tile_row = 0; tile_row < tile_rows; tile_row++) { - TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col]; - const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col]; - const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col]; - const int data_offset = have_tiles ? 
4 : 0; - const int tile_idx = tile_row * tile_cols + tile_col; - TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; - av1_tile_set_row(&tile_info, cm, tile_row); - - buf->data = dst + total_size; - - // Is CONFIG_EXT_TILE = 1, every tile in the row has a header, - // even for the last one, unless no tiling is used at all. - total_size += data_offset; - // Initialise tile context from the frame context - this_tile->tctx = *cm->fc; - cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; -#if CONFIG_PVQ - cpi->td.mb.pvq_q = &this_tile->pvq_q; - cpi->td.mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context; -#endif // CONFIG_PVQ -#if CONFIG_ANS - mode_bc.size = 1 << cpi->common.ans_window_size_log2; -#endif - aom_start_encode(&mode_bc, buf->data + data_offset); - write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end); - assert(tok == tok_end); - aom_stop_encode(&mode_bc); - tile_size = mode_bc.pos; -#if CONFIG_PVQ - cpi->td.mb.pvq_q = NULL; -#endif - buf->size = tile_size; - - // Record the maximum tile size we see, so we can compact headers later. - *max_tile_size = AOMMAX(*max_tile_size, tile_size); - - if (have_tiles) { - // tile header: size of this tile, or copy offset - uint32_t tile_header = tile_size; - const int tile_copy_mode = - ((AOMMAX(cm->tile_width, cm->tile_height) << MI_SIZE_LOG2) <= 256) - ? 1 - : 0; - - // If tile_copy_mode = 1, check if this tile is a copy tile. - // Very low chances to have copy tiles on the key frames, so don't - // search on key frames to reduce unnecessary search. - if (cm->frame_type != KEY_FRAME && tile_copy_mode) { - const int idendical_tile_offset = - find_identical_tile(tile_row, tile_col, tile_buffers); - - if (idendical_tile_offset > 0) { - tile_size = 0; - tile_header = idendical_tile_offset | 0x80; - tile_header <<= 24; - } - } - - mem_put_le32(buf->data, tile_header); - } - - total_size += tile_size; - } - - if (!is_last_col) { - uint32_t col_size = total_size - col_offset - 4; - mem_put_le32(dst + col_offset, col_size); - - // If it is not final packing, record the maximum tile column size we - // see, otherwise, check if the tile size is out of the range. 
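[Editor's note] The removed large-scale-tile packer above marks a duplicate tile by storing its backward copy offset, OR'd with 0x80, in the top byte of the 4-byte tile header instead of a real size. A small self-contained sketch of that header convention, with helper names invented for illustration:

#include <stdint.h>
#include <stdio.h>

static uint32_t make_tile_header(uint32_t tile_size, int copy_offset) {
  /* A copy tile stores (offset | 0x80) in the high byte; a normal tile
   * stores its coded size directly. Offsets here fit in 7 bits. */
  if (copy_offset > 0) return ((uint32_t)copy_offset | 0x80) << 24;
  return tile_size;
}

static int is_copy_tile(uint32_t header) { return (header >> 31) & 1; }

int main(void) {
  const uint32_t h = make_tile_header(0, 1); /* copy of the tile one row up */
  printf("copy=%d offset=%u\n", is_copy_tile(h), (h >> 24) & 0x7Fu);
  return 0;
}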
- *max_tile_col_size = AOMMAX(*max_tile_col_size, col_size); - } - } - } else { -#endif // CONFIG_EXT_TILE - write_uncompressed_header_frame(cpi, &wb); - -#if CONFIG_EXT_REFS - if (cm->show_existing_frame) { - total_size = aom_wb_bytes_written(&wb); - return (uint32_t)total_size; - } -#endif // CONFIG_EXT_REFS - - // Write the tile length code - tile_size_bytes_wb = wb; - aom_wb_write_literal(&wb, 3, 2); - - /* Write a placeholder for the number of tiles in each tile group */ - tg_params_wb = wb; - saved_offset = wb.bit_offset; - if (have_tiles) { - aom_wb_overwrite_literal(&wb, 3, n_log2_tiles); - aom_wb_overwrite_literal(&wb, (1 << n_log2_tiles) - 1, n_log2_tiles); - } - - if (!use_compressed_header(cm)) { - uncompressed_hdr_size = aom_wb_bytes_written(&wb); - compressed_hdr_size = 0; - } else { - /* Write a placeholder for the compressed header length */ - struct aom_write_bit_buffer comp_hdr_len_wb = wb; - aom_wb_write_literal(&wb, 0, 16); - - uncompressed_hdr_size = aom_wb_bytes_written(&wb); - compressed_hdr_size = - write_compressed_header(cpi, dst + uncompressed_hdr_size); - aom_wb_overwrite_literal(&comp_hdr_len_wb, (int)(compressed_hdr_size), - 16); - } - - hdr_size = uncompressed_hdr_size + compressed_hdr_size; - total_size += hdr_size; - - for (tile_row = 0; tile_row < tile_rows; tile_row++) { - TileInfo tile_info; - const int is_last_row = (tile_row == tile_rows - 1); - av1_tile_set_row(&tile_info, cm, tile_row); - - for (tile_col = 0; tile_col < tile_cols; tile_col++) { - const int tile_idx = tile_row * tile_cols + tile_col; - TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col]; - TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; - const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col]; - const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col]; - const int is_last_col = (tile_col == tile_cols - 1); - const int is_last_tile = is_last_col && is_last_row; - - if ((!mtu_size && tile_count > tg_size) || - (mtu_size && tile_count && curr_tg_data_size >= mtu_size)) { - // New tile group - tg_count++; - // We've exceeded the packet size - if (tile_count > 1) { - /* The last tile exceeded the packet size. The tile group size - should therefore be tile_count-1. 
- Move the last tile and insert headers before it - */ - uint32_t old_total_size = total_size - tile_size - 4; - memmove(dst + old_total_size + hdr_size, dst + old_total_size, - (tile_size + 4) * sizeof(uint8_t)); - // Copy uncompressed header - memmove(dst + old_total_size, dst, - uncompressed_hdr_size * sizeof(uint8_t)); - // Write the number of tiles in the group into the last uncompressed - // header before the one we've just inserted - aom_wb_overwrite_literal(&tg_params_wb, tile_idx - tile_count, - n_log2_tiles); - aom_wb_overwrite_literal(&tg_params_wb, tile_count - 2, - n_log2_tiles); - // Update the pointer to the last TG params - tg_params_wb.bit_offset = saved_offset + 8 * old_total_size; - // Copy compressed header - memmove(dst + old_total_size + uncompressed_hdr_size, - dst + uncompressed_hdr_size, - compressed_hdr_size * sizeof(uint8_t)); - total_size += hdr_size; - tile_count = 1; - curr_tg_data_size = hdr_size + tile_size + 4; - } else { - // We exceeded the packet size in just one tile - // Copy uncompressed header - memmove(dst + total_size, dst, - uncompressed_hdr_size * sizeof(uint8_t)); - // Write the number of tiles in the group into the last uncompressed - // header - aom_wb_overwrite_literal(&tg_params_wb, tile_idx - tile_count, - n_log2_tiles); - aom_wb_overwrite_literal(&tg_params_wb, tile_count - 1, - n_log2_tiles); - tg_params_wb.bit_offset = saved_offset + 8 * total_size; - // Copy compressed header - memmove(dst + total_size + uncompressed_hdr_size, - dst + uncompressed_hdr_size, - compressed_hdr_size * sizeof(uint8_t)); - total_size += hdr_size; - tile_count = 0; - curr_tg_data_size = hdr_size; - } - } - tile_count++; - av1_tile_set_col(&tile_info, cm, tile_col); - -#if CONFIG_DEPENDENT_HORZTILES - av1_tile_set_tg_boundary(&tile_info, cm, tile_row, tile_col); -#endif - buf->data = dst + total_size; - - // The last tile does not have a header. - if (!is_last_tile) total_size += 4; - - // Initialise tile context from the frame context - this_tile->tctx = *cm->fc; - cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; -#if CONFIG_PVQ - cpi->td.mb.pvq_q = &this_tile->pvq_q; - cpi->td.mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context; -#endif // CONFIG_PVQ -#if CONFIG_ANS - mode_bc.size = 1 << cpi->common.ans_window_size_log2; -#endif // CONFIG_ANS -#if CONFIG_LOOP_RESTORATION - for (int p = 0; p < MAX_MB_PLANE; ++p) { - set_default_wiener(cpi->td.mb.e_mbd.wiener_info + p); - set_default_sgrproj(cpi->td.mb.e_mbd.sgrproj_info + p); - } -#endif // CONFIG_LOOP_RESTORATION - - aom_start_encode(&mode_bc, dst + total_size); - write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end); -#if !CONFIG_LV_MAP -#if !CONFIG_PVQ - assert(tok == tok_end); -#endif // !CONFIG_PVQ -#endif // !CONFIG_LV_MAP - aom_stop_encode(&mode_bc); - tile_size = mode_bc.pos; -#if CONFIG_PVQ - cpi->td.mb.pvq_q = NULL; -#endif - - assert(tile_size > 0); - - curr_tg_data_size += tile_size + 4; - buf->size = tile_size; - - if (!is_last_tile) { - *max_tile_size = AOMMAX(*max_tile_size, tile_size); - // size of this tile - mem_put_le32(buf->data, tile_size); - } - - total_size += tile_size; - } - } - // Write the final tile group size - if (n_log2_tiles) { - aom_wb_overwrite_literal( - &tg_params_wb, (tile_cols * tile_rows) - tile_count, n_log2_tiles); - aom_wb_overwrite_literal(&tg_params_wb, tile_count - 1, n_log2_tiles); - } - // Remux if possible. 
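[Editor's note] The tile-group handling removed above split groups either after a fixed number of tiles or once the accumulated payload reached the configured MTU, then memmoved headers into place. The split rule alone, restated as a standalone sketch (the function name is hypothetical):

#include <stdio.h>

/* Mirrors the removed condition:
 *   (!mtu_size && tile_count > tg_size) ||
 *   (mtu_size && tile_count && curr_tg_data_size >= mtu_size) */
static int need_new_tile_group(int mtu_size, int tile_count,
                               int curr_tg_data_size, int tg_size) {
  if (!mtu_size) return tile_count > tg_size;      /* fixed-size groups */
  return tile_count > 0 && curr_tg_data_size >= mtu_size; /* MTU-bounded */
}

int main(void) {
  printf("%d\n", need_new_tile_group(1500, 3, 1600, 4)); /* 1: over the MTU */
  printf("%d\n", need_new_tile_group(0, 5, 0, 4));       /* 1: count-based */
  return 0;
}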
TODO (Thomas Davies): do this for more than one tile - // group - if (have_tiles && tg_count == 1) { - int data_size = - total_size - (uncompressed_hdr_size + compressed_hdr_size); - data_size = - remux_tiles(cm, dst + uncompressed_hdr_size + compressed_hdr_size, - data_size, *max_tile_size, *max_tile_col_size, - &tile_size_bytes, &tile_col_size_bytes); - total_size = data_size + uncompressed_hdr_size + compressed_hdr_size; - aom_wb_overwrite_literal(&tile_size_bytes_wb, tile_size_bytes - 1, 2); - } - -#if CONFIG_EXT_TILE - } -#endif // CONFIG_EXT_TILE - return (uint32_t)total_size; -} -#endif static void write_render_size(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { - const int scaling_active = !av1_resize_unscaled(cm); + const int scaling_active = av1_resize_scaled(cm); aom_wb_write_bit(wb, scaling_active); if (scaling_active) { aom_wb_write_literal(wb, cm->render_width - 1, 16); @@ -4263,31 +2346,42 @@ static void write_render_size(const AV1_COMMON *cm, } } -#if CONFIG_FRAME_SUPERRES static void write_superres_scale(const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) { + const SequenceHeader *const seq_params = &cm->seq_params; + if (!seq_params->enable_superres) { + assert(cm->superres_scale_denominator == SCALE_NUMERATOR); + return; + } + // First bit is whether to to scale or not if (cm->superres_scale_denominator == SCALE_NUMERATOR) { aom_wb_write_bit(wb, 0); // no scaling } else { aom_wb_write_bit(wb, 1); // scaling, write scale factor + assert(cm->superres_scale_denominator >= SUPERRES_SCALE_DENOMINATOR_MIN); + assert(cm->superres_scale_denominator < + SUPERRES_SCALE_DENOMINATOR_MIN + (1 << SUPERRES_SCALE_BITS)); aom_wb_write_literal( wb, cm->superres_scale_denominator - SUPERRES_SCALE_DENOMINATOR_MIN, SUPERRES_SCALE_BITS); } } -#endif // CONFIG_FRAME_SUPERRES -static void write_frame_size(const AV1_COMMON *cm, +static void write_frame_size(const AV1_COMMON *cm, int frame_size_override, struct aom_write_bit_buffer *wb) { -#if CONFIG_FRAME_SUPERRES - aom_wb_write_literal(wb, cm->superres_upscaled_width - 1, 16); - aom_wb_write_literal(wb, cm->superres_upscaled_height - 1, 16); + const int coded_width = cm->superres_upscaled_width - 1; + const int coded_height = cm->superres_upscaled_height - 1; + + if (frame_size_override) { + const SequenceHeader *seq_params = &cm->seq_params; + int num_bits_width = seq_params->num_bits_width; + int num_bits_height = seq_params->num_bits_height; + aom_wb_write_literal(wb, coded_width, num_bits_width); + aom_wb_write_literal(wb, coded_height, num_bits_height); + } + write_superres_scale(cm, wb); -#else - aom_wb_write_literal(wb, cm->width - 1, 16); - aom_wb_write_literal(wb, cm->height - 1, 16); -#endif // CONFIG_FRAME_SUPERRES write_render_size(cm, wb); } @@ -4301,209 +2395,426 @@ static void write_frame_size_with_refs(AV1_COMP *cpi, YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, ref_frame); if (cfg != NULL) { -#if CONFIG_FRAME_SUPERRES found = cm->superres_upscaled_width == cfg->y_crop_width && cm->superres_upscaled_height == cfg->y_crop_height; -#else - found = - cm->width == cfg->y_crop_width && cm->height == cfg->y_crop_height; -#endif found &= cm->render_width == cfg->render_width && cm->render_height == cfg->render_height; } aom_wb_write_bit(wb, found); if (found) { -#if CONFIG_FRAME_SUPERRES write_superres_scale(cm, wb); -#endif // CONFIG_FRAME_SUPERRES break; } } - if (!found) write_frame_size(cm, wb); + if (!found) { + int frame_size_override = 1; // Always equal to 1 in this function + write_frame_size(cm, 
frame_size_override, wb); + } } static void write_profile(BITSTREAM_PROFILE profile, struct aom_write_bit_buffer *wb) { - switch (profile) { - case PROFILE_0: aom_wb_write_literal(wb, 0, 2); break; - case PROFILE_1: aom_wb_write_literal(wb, 2, 2); break; - case PROFILE_2: aom_wb_write_literal(wb, 1, 2); break; - case PROFILE_3: aom_wb_write_literal(wb, 6, 3); break; - default: assert(0); - } + assert(profile >= PROFILE_0 && profile < MAX_PROFILES); + aom_wb_write_literal(wb, profile, PROFILE_BITS); } -static void write_bitdepth_colorspace_sampling( - AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) { - if (cm->profile >= PROFILE_2) { - assert(cm->bit_depth > AOM_BITS_8); +static void write_bitdepth(AV1_COMMON *const cm, + struct aom_write_bit_buffer *wb) { + // Profile 0/1: [0] for 8 bit, [1] 10-bit + // Profile 2: [0] for 8 bit, [10] 10-bit, [11] - 12-bit + aom_wb_write_bit(wb, cm->bit_depth == AOM_BITS_8 ? 0 : 1); + if (cm->profile == PROFILE_2 && cm->bit_depth != AOM_BITS_8) { aom_wb_write_bit(wb, cm->bit_depth == AOM_BITS_10 ? 0 : 1); } -#if CONFIG_COLORSPACE_HEADERS - aom_wb_write_literal(wb, cm->color_space, 5); - aom_wb_write_literal(wb, cm->transfer_function, 5); -#else - aom_wb_write_literal(wb, cm->color_space, 3); -#endif - if (cm->color_space != AOM_CS_SRGB) { +} + +static void write_color_config(AV1_COMMON *const cm, + struct aom_write_bit_buffer *wb) { + write_bitdepth(cm, wb); + const int is_monochrome = cm->seq_params.monochrome; + // monochrome bit + if (cm->profile != PROFILE_1) + aom_wb_write_bit(wb, is_monochrome); + else + assert(!is_monochrome); + if (cm->color_primaries == AOM_CICP_CP_UNSPECIFIED && + cm->transfer_characteristics == AOM_CICP_TC_UNSPECIFIED && + cm->matrix_coefficients == AOM_CICP_MC_UNSPECIFIED) { + aom_wb_write_bit(wb, 0); // No color description present + } else { + aom_wb_write_bit(wb, 1); // Color description present + aom_wb_write_literal(wb, cm->color_primaries, 8); + aom_wb_write_literal(wb, cm->transfer_characteristics, 8); + aom_wb_write_literal(wb, cm->matrix_coefficients, 8); + } + if (is_monochrome) { // 0: [16, 235] (i.e. xvYCC), 1: [0, 255] aom_wb_write_bit(wb, cm->color_range); - if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) { - assert(cm->subsampling_x != 1 || cm->subsampling_y != 1); - aom_wb_write_bit(wb, cm->subsampling_x); - aom_wb_write_bit(wb, cm->subsampling_y); - aom_wb_write_bit(wb, 0); // unused - } else { + return; + } + if (cm->color_primaries == AOM_CICP_CP_BT_709 && + cm->transfer_characteristics == AOM_CICP_TC_SRGB && + cm->matrix_coefficients == + AOM_CICP_MC_IDENTITY) { // it would be better to remove this + // dependency too + assert(cm->subsampling_x == 0 && cm->subsampling_y == 0); + assert(cm->profile == PROFILE_1 || + (cm->profile == PROFILE_2 && cm->bit_depth == AOM_BITS_12)); + } else { + // 0: [16, 235] (i.e. 
xvYCC), 1: [0, 255] + aom_wb_write_bit(wb, cm->color_range); + if (cm->profile == PROFILE_0) { + // 420 only assert(cm->subsampling_x == 1 && cm->subsampling_y == 1); + } else if (cm->profile == PROFILE_1) { + // 444 only + assert(cm->subsampling_x == 0 && cm->subsampling_y == 0); + } else if (cm->profile == PROFILE_2) { + if (cm->bit_depth == AOM_BITS_12) { + // 420, 444 or 422 + aom_wb_write_bit(wb, cm->subsampling_x); + if (cm->subsampling_x == 0) { + assert(cm->subsampling_y == 0 && + "4:4:0 subsampling not allowed in AV1"); + } else { + aom_wb_write_bit(wb, cm->subsampling_y); + } + } else { + // 422 only + assert(cm->subsampling_x == 1 && cm->subsampling_y == 0); + } + } + if (cm->matrix_coefficients == AOM_CICP_MC_IDENTITY) { + assert(cm->subsampling_x == 0 && cm->subsampling_y == 0); } -#if CONFIG_COLORSPACE_HEADERS if (cm->subsampling_x == 1 && cm->subsampling_y == 1) { aom_wb_write_literal(wb, cm->chroma_sample_position, 2); } -#endif - } else { - assert(cm->profile == PROFILE_1 || cm->profile == PROFILE_3); - aom_wb_write_bit(wb, 0); // unused } + aom_wb_write_bit(wb, cm->separate_uv_delta_q); } -#if CONFIG_REFERENCE_BUFFER -void write_sequence_header(AV1_COMMON *const cm, - struct aom_write_bit_buffer *wb) { - SequenceHeader *seq_params = &cm->seq_params; - /* Placeholder for actually writing to the bitstream */ - seq_params->frame_id_numbers_present_flag = -#if CONFIG_EXT_TILE - cm->large_scale_tile ? 0 : -#endif // CONFIG_EXT_TILE - FRAME_ID_NUMBERS_PRESENT_FLAG; - seq_params->frame_id_length_minus7 = FRAME_ID_LENGTH_MINUS7; - seq_params->delta_frame_id_length_minus2 = DELTA_FRAME_ID_LENGTH_MINUS2; - - aom_wb_write_bit(wb, seq_params->frame_id_numbers_present_flag); - if (seq_params->frame_id_numbers_present_flag) { - aom_wb_write_literal(wb, seq_params->frame_id_length_minus7, 4); - aom_wb_write_literal(wb, seq_params->delta_frame_id_length_minus2, 4); +static void write_timing_info_header(AV1_COMMON *const cm, + struct aom_write_bit_buffer *wb) { + aom_wb_write_unsigned_literal(wb, cm->timing_info.num_units_in_display_tick, + 32); // Number of units in tick + aom_wb_write_unsigned_literal(wb, cm->timing_info.time_scale, + 32); // Time scale + aom_wb_write_bit( + wb, + cm->timing_info.equal_picture_interval); // Equal picture interval bit + if (cm->timing_info.equal_picture_interval) { + aom_wb_write_uvlc( + wb, + cm->timing_info.num_ticks_per_picture - 1); // ticks per picture + } +} + +static void write_decoder_model_info(AV1_COMMON *const cm, + struct aom_write_bit_buffer *wb) { + aom_wb_write_literal( + wb, cm->buffer_model.encoder_decoder_buffer_delay_length - 1, 5); + aom_wb_write_unsigned_literal(wb, cm->buffer_model.num_units_in_decoding_tick, + 32); // Number of units in decoding tick + aom_wb_write_literal(wb, cm->buffer_model.buffer_removal_delay_length - 1, 5); + aom_wb_write_literal(wb, cm->buffer_model.frame_presentation_delay_length - 1, + 5); +} + +static void write_dec_model_op_parameters(AV1_COMMON *const cm, + struct aom_write_bit_buffer *wb, + int op_num) { + if (op_num > MAX_NUM_OPERATING_POINTS) + aom_internal_error( + &cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Encoder does not support %d decoder model operating points", op_num); + + // aom_wb_write_bit(wb, cm->op_params[op_num].has_parameters); + // if (!cm->op_params[op_num].has_parameters) return; + + aom_wb_write_literal(wb, cm->op_params[op_num].decoder_buffer_delay, + cm->buffer_model.encoder_decoder_buffer_delay_length); + + aom_wb_write_literal(wb, cm->op_params[op_num].encoder_buffer_delay, + 
cm->buffer_model.encoder_decoder_buffer_delay_length); + + aom_wb_write_bit(wb, cm->op_params[op_num].low_delay_mode_flag); + + cm->op_frame_timing[op_num].buffer_removal_delay = + 0; // reset the decoded frame counter +} + +static void write_tu_pts_info(AV1_COMMON *const cm, + struct aom_write_bit_buffer *wb) { + aom_wb_write_unsigned_literal( + wb, (uint32_t)cm->tu_presentation_delay, + cm->buffer_model.frame_presentation_delay_length); +} + +static void write_film_grain_params(AV1_COMP *cpi, + struct aom_write_bit_buffer *wb) { + AV1_COMMON *const cm = &cpi->common; + aom_film_grain_t *pars = &cm->film_grain_params; + + cm->cur_frame->film_grain_params = *pars; + + aom_wb_write_bit(wb, pars->apply_grain); + if (!pars->apply_grain) return; + + aom_wb_write_literal(wb, pars->random_seed, 16); + + pars->random_seed += 3245; // For film grain test vectors purposes + if (!pars->random_seed) // Random seed should not be zero + pars->random_seed += 1735; + if (cm->frame_type == INTER_FRAME) + aom_wb_write_bit(wb, pars->update_parameters); + else + pars->update_parameters = 1; + if (!pars->update_parameters) { + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + int ref_frame, ref_idx, buf_idx; + for (ref_frame = LAST_FRAME; ref_frame < REF_FRAMES; ref_frame++) { + ref_idx = get_ref_frame_map_idx(cpi, ref_frame); + assert(ref_idx != INVALID_IDX); + buf_idx = cm->ref_frame_map[ref_idx]; + if (frame_bufs[buf_idx].film_grain_params_present && + memcmp(pars, &frame_bufs[buf_idx].film_grain_params, sizeof(*pars))) { + break; + } + } + assert(ref_frame < REF_FRAMES); + aom_wb_write_literal(wb, ref_idx, 3); + return; + } + + // Scaling functions parameters + aom_wb_write_literal(wb, pars->num_y_points, 4); // max 14 + for (int i = 0; i < pars->num_y_points; i++) { + aom_wb_write_literal(wb, pars->scaling_points_y[i][0], 8); + aom_wb_write_literal(wb, pars->scaling_points_y[i][1], 8); + } + + if (!cm->seq_params.monochrome) + aom_wb_write_bit(wb, pars->chroma_scaling_from_luma); + else + pars->chroma_scaling_from_luma = 0; // for monochrome override to 0 + + if (cm->seq_params.monochrome || pars->chroma_scaling_from_luma || + ((cm->subsampling_x == 1) && (cm->subsampling_y == 1) && + (pars->num_y_points == 0))) { + pars->num_cb_points = 0; + pars->num_cr_points = 0; + } else { + aom_wb_write_literal(wb, pars->num_cb_points, 4); // max 10 + for (int i = 0; i < pars->num_cb_points; i++) { + aom_wb_write_literal(wb, pars->scaling_points_cb[i][0], 8); + aom_wb_write_literal(wb, pars->scaling_points_cb[i][1], 8); + } + + aom_wb_write_literal(wb, pars->num_cr_points, 4); // max 10 + for (int i = 0; i < pars->num_cr_points; i++) { + aom_wb_write_literal(wb, pars->scaling_points_cr[i][0], 8); + aom_wb_write_literal(wb, pars->scaling_points_cr[i][1], 8); + } } + + aom_wb_write_literal(wb, pars->scaling_shift - 8, 2); // 8 + value + + // AR coefficients + // Only sent if the corresponsing scaling function has + // more than 0 points + + aom_wb_write_literal(wb, pars->ar_coeff_lag, 2); + + int num_pos_luma = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1); + int num_pos_chroma = num_pos_luma; + if (pars->num_y_points > 0) ++num_pos_chroma; + + if (pars->num_y_points) + for (int i = 0; i < num_pos_luma; i++) + aom_wb_write_literal(wb, pars->ar_coeffs_y[i] + 128, 8); + + if (pars->num_cb_points || pars->chroma_scaling_from_luma) + for (int i = 0; i < num_pos_chroma; i++) + aom_wb_write_literal(wb, pars->ar_coeffs_cb[i] + 128, 8); + + if (pars->num_cr_points || pars->chroma_scaling_from_luma) + for 
(int i = 0; i < num_pos_chroma; i++) + aom_wb_write_literal(wb, pars->ar_coeffs_cr[i] + 128, 8); + + aom_wb_write_literal(wb, pars->ar_coeff_shift - 6, 2); // 8 + value + + aom_wb_write_literal(wb, pars->grain_scale_shift, 2); + + if (pars->num_cb_points) { + aom_wb_write_literal(wb, pars->cb_mult, 8); + aom_wb_write_literal(wb, pars->cb_luma_mult, 8); + aom_wb_write_literal(wb, pars->cb_offset, 9); + } + + if (pars->num_cr_points) { + aom_wb_write_literal(wb, pars->cr_mult, 8); + aom_wb_write_literal(wb, pars->cr_luma_mult, 8); + aom_wb_write_literal(wb, pars->cr_offset, 9); + } + + aom_wb_write_bit(wb, pars->overlap_flag); + + aom_wb_write_bit(wb, pars->clip_to_restricted_range); } -#endif // CONFIG_REFERENCE_BUFFER -static void write_sb_size(const AV1_COMMON *cm, +static void write_sb_size(SequenceHeader *seq_params, struct aom_write_bit_buffer *wb) { - (void)cm; + (void)seq_params; (void)wb; - assert(cm->mib_size == mi_size_wide[cm->sb_size]); - assert(cm->mib_size == 1 << cm->mib_size_log2); -#if CONFIG_EXT_PARTITION - assert(cm->sb_size == BLOCK_128X128 || cm->sb_size == BLOCK_64X64); - aom_wb_write_bit(wb, cm->sb_size == BLOCK_128X128 ? 1 : 0); -#else - assert(cm->sb_size == BLOCK_64X64); -#endif // CONFIG_EXT_PARTITION + assert(seq_params->mib_size == mi_size_wide[seq_params->sb_size]); + assert(seq_params->mib_size == 1 << seq_params->mib_size_log2); + assert(seq_params->sb_size == BLOCK_128X128 || + seq_params->sb_size == BLOCK_64X64); + aom_wb_write_bit(wb, seq_params->sb_size == BLOCK_128X128 ? 1 : 0); } -static void write_compound_tools(const AV1_COMMON *cm, - struct aom_write_bit_buffer *wb) { - (void)cm; - (void)wb; -#if CONFIG_INTERINTRA - if (!frame_is_intra_only(cm) && cm->reference_mode != COMPOUND_REFERENCE) { - aom_wb_write_bit(wb, cm->allow_interintra_compound); - } else { - assert(cm->allow_interintra_compound == 0); - } -#endif // CONFIG_INTERINTRA -#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT -#if CONFIG_COMPOUND_SINGLEREF - if (!frame_is_intra_only(cm)) { -#else // !CONFIG_COMPOUND_SINGLEREF - if (!frame_is_intra_only(cm) && cm->reference_mode != SINGLE_REFERENCE) { -#endif // CONFIG_COMPOUND_SINGLEREF - aom_wb_write_bit(wb, cm->allow_masked_compound); - } else { - assert(cm->allow_masked_compound == 0); +void write_sequence_header(AV1_COMP *cpi, struct aom_write_bit_buffer *wb) { + AV1_COMMON *const cm = &cpi->common; + SequenceHeader *seq_params = &cm->seq_params; + + int max_frame_width = cpi->oxcf.forced_max_frame_width + ? cpi->oxcf.forced_max_frame_width + : cpi->oxcf.width; + int max_frame_height = cpi->oxcf.forced_max_frame_height + ? cpi->oxcf.forced_max_frame_height + : cpi->oxcf.height; + const int num_bits_width = + (max_frame_width > 1) ? get_msb(max_frame_width - 1) + 1 : 1; + const int num_bits_height = + (max_frame_height > 1) ? get_msb(max_frame_height - 1) + 1 : 1; + assert(num_bits_width <= 16); + assert(num_bits_height <= 16); + + seq_params->num_bits_width = num_bits_width; + seq_params->num_bits_height = num_bits_height; + seq_params->max_frame_width = max_frame_width; + seq_params->max_frame_height = max_frame_height; + + aom_wb_write_literal(wb, num_bits_width - 1, 4); + aom_wb_write_literal(wb, num_bits_height - 1, 4); + aom_wb_write_literal(wb, max_frame_width - 1, num_bits_width); + aom_wb_write_literal(wb, max_frame_height - 1, num_bits_height); + + /* Placeholder for actually writing to the bitstream */ + if (!seq_params->reduced_still_picture_hdr) { + seq_params->frame_id_numbers_present_flag = + cm->large_scale_tile ? 
0 : cm->error_resilient_mode; + seq_params->frame_id_length = FRAME_ID_LENGTH; + seq_params->delta_frame_id_length = DELTA_FRAME_ID_LENGTH; + + aom_wb_write_bit(wb, seq_params->frame_id_numbers_present_flag); + if (seq_params->frame_id_numbers_present_flag) { + // We must always have delta_frame_id_length < frame_id_length, + // in order for a frame to be referenced with a unique delta. + // Avoid wasting bits by using a coding that enforces this restriction. + aom_wb_write_literal(wb, seq_params->delta_frame_id_length - 2, 4); + aom_wb_write_literal( + wb, + seq_params->frame_id_length - seq_params->delta_frame_id_length - 1, + 3); + } + } + + write_sb_size(seq_params, wb); + + aom_wb_write_bit(wb, seq_params->enable_filter_intra); + aom_wb_write_bit(wb, seq_params->enable_intra_edge_filter); + + if (!seq_params->reduced_still_picture_hdr) { + aom_wb_write_bit(wb, seq_params->enable_interintra_compound); + aom_wb_write_bit(wb, seq_params->enable_masked_compound); + aom_wb_write_bit(wb, seq_params->enable_warped_motion); + aom_wb_write_bit(wb, seq_params->enable_dual_filter); + + aom_wb_write_bit(wb, seq_params->enable_order_hint); + + if (seq_params->enable_order_hint) { + aom_wb_write_bit(wb, seq_params->enable_jnt_comp); + aom_wb_write_bit(wb, seq_params->enable_ref_frame_mvs); + } + if (seq_params->force_screen_content_tools == 2) { + aom_wb_write_bit(wb, 1); + } else { + aom_wb_write_bit(wb, 0); + aom_wb_write_bit(wb, seq_params->force_screen_content_tools); + } + if (seq_params->force_screen_content_tools > 0) { + if (seq_params->force_integer_mv == 2) { + aom_wb_write_bit(wb, 1); + } else { + aom_wb_write_bit(wb, 0); + aom_wb_write_bit(wb, seq_params->force_integer_mv); + } + } else { + assert(seq_params->force_integer_mv == 2); + } + if (seq_params->enable_order_hint) + aom_wb_write_literal(wb, seq_params->order_hint_bits_minus_1, 3); } -#endif // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT + + aom_wb_write_bit(wb, seq_params->enable_superres); + aom_wb_write_bit(wb, seq_params->enable_cdef); + aom_wb_write_bit(wb, seq_params->enable_restoration); } -#if CONFIG_GLOBAL_MOTION static void write_global_motion_params(const WarpedMotionParams *params, const WarpedMotionParams *ref_params, struct aom_write_bit_buffer *wb, int allow_hp) { - TransformationType type = params->wmtype; - int trans_bits; - int trans_prec_diff; + const TransformationType type = params->wmtype; aom_wb_write_bit(wb, type != IDENTITY); if (type != IDENTITY) { -#if GLOBAL_TRANS_TYPES > 4 - aom_wb_write_literal(wb, type - 1, GLOBAL_TYPE_BITS); -#else aom_wb_write_bit(wb, type == ROTZOOM); if (type != ROTZOOM) aom_wb_write_bit(wb, type == TRANSLATION); -#endif // GLOBAL_TRANS_TYPES > 4 - } - - switch (type) { - case HOMOGRAPHY: - case HORTRAPEZOID: - case VERTRAPEZOID: - if (type != HORTRAPEZOID) - aom_wb_write_signed_primitive_refsubexpfin( - wb, GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K, - (ref_params->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF), - (params->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF)); - if (type != VERTRAPEZOID) - aom_wb_write_signed_primitive_refsubexpfin( - wb, GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K, - (ref_params->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF), - (params->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF)); - // fallthrough intended - case AFFINE: - case ROTZOOM: - aom_wb_write_signed_primitive_refsubexpfin( - wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, - (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - - (1 << GM_ALPHA_PREC_BITS), - (params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); - if (type != VERTRAPEZOID) - 
aom_wb_write_signed_primitive_refsubexpfin( - wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, - (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF), - (params->wmmat[3] >> GM_ALPHA_PREC_DIFF)); - if (type >= AFFINE) { - if (type != HORTRAPEZOID) - aom_wb_write_signed_primitive_refsubexpfin( - wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, - (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF), - (params->wmmat[4] >> GM_ALPHA_PREC_DIFF)); - aom_wb_write_signed_primitive_refsubexpfin( - wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, - (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - - (1 << GM_ALPHA_PREC_BITS), - (params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - - (1 << GM_ALPHA_PREC_BITS)); - } - // fallthrough intended - case TRANSLATION: - trans_bits = (type == TRANSLATION) ? GM_ABS_TRANS_ONLY_BITS - !allow_hp - : GM_ABS_TRANS_BITS; - trans_prec_diff = (type == TRANSLATION) - ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp - : GM_TRANS_PREC_DIFF; - aom_wb_write_signed_primitive_refsubexpfin( - wb, (1 << trans_bits) + 1, SUBEXPFIN_K, - (ref_params->wmmat[0] >> trans_prec_diff), - (params->wmmat[0] >> trans_prec_diff)); - aom_wb_write_signed_primitive_refsubexpfin( - wb, (1 << trans_bits) + 1, SUBEXPFIN_K, - (ref_params->wmmat[1] >> trans_prec_diff), - (params->wmmat[1] >> trans_prec_diff)); - break; - case IDENTITY: break; - default: assert(0); + } + + if (type >= ROTZOOM) { + aom_wb_write_signed_primitive_refsubexpfin( + wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS), + (params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); + aom_wb_write_signed_primitive_refsubexpfin( + wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF), + (params->wmmat[3] >> GM_ALPHA_PREC_DIFF)); + } + + if (type >= AFFINE) { + aom_wb_write_signed_primitive_refsubexpfin( + wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF), + (params->wmmat[4] >> GM_ALPHA_PREC_DIFF)); + aom_wb_write_signed_primitive_refsubexpfin( + wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS), + (params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); + } + + if (type >= TRANSLATION) { + const int trans_bits = (type == TRANSLATION) + ? GM_ABS_TRANS_ONLY_BITS - !allow_hp + : GM_ABS_TRANS_BITS; + const int trans_prec_diff = (type == TRANSLATION) + ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp + : GM_TRANS_PREC_DIFF; + aom_wb_write_signed_primitive_refsubexpfin( + wb, (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_params->wmmat[0] >> trans_prec_diff), + (params->wmmat[0] >> trans_prec_diff)); + aom_wb_write_signed_primitive_refsubexpfin( + wb, (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_params->wmmat[1] >> trans_prec_diff), + (params->wmmat[1] >> trans_prec_diff)); } } @@ -4513,8 +2824,8 @@ static void write_global_motion(AV1_COMP *cpi, int frame; for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) { const WarpedMotionParams *ref_params = - cm->error_resilient_mode ? &default_warp_params - : &cm->prev_frame->global_motion[frame]; + cm->prev_frame ? 
&cm->prev_frame->global_motion[frame] + : &default_warp_params; write_global_motion_params(&cm->global_motion[frame], ref_params, wb, cm->allow_high_precision_mv); // TODO(sarahparker, debargha): The logic in the commented out code below @@ -4541,820 +2852,452 @@ static void write_global_motion(AV1_COMP *cpi, */ } } -#endif -#if !CONFIG_OBU -static void write_uncompressed_header_frame(AV1_COMP *cpi, - struct aom_write_bit_buffer *wb) { +static void check_frame_refs_short_signaling(AV1_COMP *const cpi) { AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; - - aom_wb_write_literal(wb, AOM_FRAME_MARKER, 2); - - write_profile(cm->profile, wb); + if (!cm->frame_refs_short_signaling) return; -#if CONFIG_EXT_TILE - aom_wb_write_literal(wb, cm->large_scale_tile, 1); -#endif // CONFIG_EXT_TILE - -#if CONFIG_EXT_REFS - // NOTE: By default all coded frames to be used as a reference - cm->is_reference_frame = 1; - - if (cm->show_existing_frame) { - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show]; - - if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, - "Buffer %d does not contain a reconstructed frame", - frame_to_show); - } - ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show); - - aom_wb_write_bit(wb, 1); // show_existing_frame - aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3); - -#if CONFIG_REFERENCE_BUFFER - if (cm->seq_params.frame_id_numbers_present_flag) { - int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7; - int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show]; - aom_wb_write_literal(wb, display_frame_id, frame_id_len); - /* Add a zero byte to prevent emulation of superframe marker */ - /* Same logic as when when terminating the entropy coder */ - /* Consider to have this logic only one place */ - aom_wb_write_literal(wb, 0, 8); + // Check whether all references are distinct frames. 
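[Editor's note] The rewritten write_global_motion_params() above replaces the old fall-through switch with ordered guards, so each transformation type codes a superset of the parameters of the simpler types. A toy sketch of that cascade, assuming the usual enum ordering IDENTITY < TRANSLATION < ROTZOOM < AFFINE:

#include <stdio.h>

typedef enum { IDENTITY, TRANSLATION, ROTZOOM, AFFINE } XformType;

static void describe_coded_params(XformType type) {
  /* Each guard adds the parameters that the simpler types lack. */
  if (type >= ROTZOOM) printf("code wmmat[2], wmmat[3] (alpha terms)\n");
  if (type >= AFFINE) printf("code wmmat[4], wmmat[5] (full affine)\n");
  if (type >= TRANSLATION) printf("code wmmat[0], wmmat[1] (translation)\n");
}

int main(void) {
  describe_coded_params(ROTZOOM); /* alphas + translation, no affine terms */
  return 0;
}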
+ int buf_markers[FRAME_BUFFERS] = { 0 }; + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); + if (buf_idx != INVALID_IDX) { + assert(buf_idx >= 0 && buf_idx < FRAME_BUFFERS); + buf_markers[buf_idx] = 1; } -#endif // CONFIG_REFERENCE_BUFFER - - return; - } else { -#endif // CONFIG_EXT_REFS - aom_wb_write_bit(wb, 0); // show_existing_frame -#if CONFIG_EXT_REFS } -#endif // CONFIG_EXT_REFS - aom_wb_write_bit(wb, cm->frame_type); - aom_wb_write_bit(wb, cm->show_frame); - if (cm->frame_type != KEY_FRAME) - if (!cm->show_frame) aom_wb_write_bit(wb, cm->intra_only); - aom_wb_write_bit(wb, cm->error_resilient_mode); - - if (frame_is_intra_only(cm)) { -#if CONFIG_REFERENCE_BUFFER - write_sequence_header(cm, wb); -#endif // CONFIG_REFERENCE_BUFFER - } -#if CONFIG_REFERENCE_BUFFER - cm->invalid_delta_frame_id_minus1 = 0; - if (cm->seq_params.frame_id_numbers_present_flag) { - int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7; - aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len); + int num_refs = 0; + for (int buf_idx = 0; buf_idx < FRAME_BUFFERS; ++buf_idx) { + num_refs += buf_markers[buf_idx]; } -#endif // CONFIG_REFERENCE_BUFFER - if (cm->frame_type == KEY_FRAME) { - write_bitdepth_colorspace_sampling(cm, wb); - write_frame_size(cm, wb); - write_sb_size(cm, wb); - -#if CONFIG_ANS && ANS_MAX_SYMBOLS - assert(cpi->common.ans_window_size_log2 >= 8); - assert(cpi->common.ans_window_size_log2 < 24); - aom_wb_write_literal(wb, cpi->common.ans_window_size_log2 - 8, 4); -#endif // CONFIG_ANS && ANS_MAX_SYMBOLS - aom_wb_write_bit(wb, cm->allow_screen_content_tools); -#if CONFIG_AMVR - if (cm->allow_screen_content_tools) { - if (cm->seq_mv_precision_level == 2) { - aom_wb_write_bit(wb, 1); - } else { - aom_wb_write_bit(wb, 0); - aom_wb_write_bit(wb, cm->seq_mv_precision_level == 0); - } - } -#endif - } else { - if (cm->intra_only) aom_wb_write_bit(wb, cm->allow_screen_content_tools); -#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING - if (!cm->error_resilient_mode) { - if (cm->intra_only) { - aom_wb_write_bit(wb, - cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL); - } else { - aom_wb_write_bit(wb, - cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE); - if (cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE) - aom_wb_write_bit(wb, - cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL); - } - } -#endif -#if CONFIG_EXT_REFS - cpi->refresh_frame_mask = get_refresh_mask(cpi); -#endif // CONFIG_EXT_REFS - - if (cm->intra_only) { - write_bitdepth_colorspace_sampling(cm, wb); - -#if CONFIG_EXT_REFS - aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES); -#else - aom_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES); -#endif // CONFIG_EXT_REFS - write_frame_size(cm, wb); - -#if CONFIG_ANS && ANS_MAX_SYMBOLS - assert(cpi->common.ans_window_size_log2 >= 8); - assert(cpi->common.ans_window_size_log2 < 24); - aom_wb_write_literal(wb, cpi->common.ans_window_size_log2 - 8, 4); -#endif // CONFIG_ANS && ANS_MAX_SYMBOLS - } else { - MV_REFERENCE_FRAME ref_frame; - -#if CONFIG_EXT_REFS - aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES); -#else - aom_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES); -#endif // CONFIG_EXT_REFS - -#if CONFIG_EXT_REFS - if (!cpi->refresh_frame_mask) { - // NOTE: "cpi->refresh_frame_mask == 0" indicates that the coded frame - // will not be used as a reference - cm->is_reference_frame = 0; - } -#endif // CONFIG_EXT_REFS - - for (ref_frame = LAST_FRAME; ref_frame 
<= ALTREF_FRAME; ++ref_frame) { - assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX); - aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame), - REF_FRAMES_LOG2); -#if !CONFIG_FRAME_SIGN_BIAS - aom_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]); -#endif // !CONFIG_FRAME_SIGN_BIAS -#if CONFIG_REFERENCE_BUFFER - if (cm->seq_params.frame_id_numbers_present_flag) { - int i = get_ref_frame_map_idx(cpi, ref_frame); - int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7; - int diff_len = cm->seq_params.delta_frame_id_length_minus2 + 2; - int delta_frame_id_minus1 = - ((cm->current_frame_id - cm->ref_frame_id[i] + - (1 << frame_id_len)) % - (1 << frame_id_len)) - - 1; - if (delta_frame_id_minus1 < 0 || - delta_frame_id_minus1 >= (1 << diff_len)) - cm->invalid_delta_frame_id_minus1 = 1; - aom_wb_write_literal(wb, delta_frame_id_minus1, diff_len); - } -#endif // CONFIG_REFERENCE_BUFFER - } -#if CONFIG_FRAME_SIGN_BIAS -#define FRAME_SIGN_BIAS_DEBUG 0 -#if FRAME_SIGN_BIAS_DEBUG - { - printf("\n\nENCODER: Frame=%d, show_frame=%d:", cm->current_video_frame, - cm->show_frame); - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - printf(" sign_bias[%d]=%d", ref_frame, - cm->ref_frame_sign_bias[ref_frame]); - } - printf("\n"); - } -#endif // FRAME_SIGN_BIAS_DEBUG -#undef FRAME_SIGN_BIAS_DEBUG -#endif // CONFIG_FRAME_SIGN_BIAS - -#if CONFIG_FRAME_SIZE - if (cm->error_resilient_mode == 0) { - write_frame_size_with_refs(cpi, wb); - } else { - write_frame_size(cm, wb); - } -#else - write_frame_size_with_refs(cpi, wb); -#endif - -#if CONFIG_AMVR - if (cm->seq_mv_precision_level == 2) { - aom_wb_write_bit(wb, cm->cur_frame_mv_precision_level == 0); - } -#endif - aom_wb_write_bit(wb, cm->allow_high_precision_mv); - - fix_interp_filter(cm, cpi->td.counts); - write_frame_interp_filter(cm->interp_filter, wb); -#if CONFIG_TEMPMV_SIGNALING - if (frame_might_use_prev_frame_mvs(cm)) { - aom_wb_write_bit(wb, cm->use_prev_frame_mvs); - } -#endif - } + // We only turn on frame_refs_short_signaling when all references are + // distinct. + if (num_refs < INTER_REFS_PER_FRAME) { + // It indicates that there exist more than one reference frame pointing to + // the same reference buffer, i.e. two or more references are duplicate. + cm->frame_refs_short_signaling = 0; + return; } -#if CONFIG_FRAME_MARKER - if (cm->show_frame == 0) { - int arf_offset = AOMMIN( - (MAX_GF_INTERVAL - 1), - cpi->twopass.gf_group.arf_src_offset[cpi->twopass.gf_group.index]); -#if CONFIG_EXT_REFS - int brf_offset = - cpi->twopass.gf_group.brf_src_offset[cpi->twopass.gf_group.index]; + // Check whether the encoder side ref frame choices are aligned with that to + // be derived at the decoder side. + RefBuffer frame_refs_copy[INTER_REFS_PER_FRAME]; - arf_offset = AOMMIN((MAX_GF_INTERVAL - 1), arf_offset + brf_offset); -#endif - aom_wb_write_literal(wb, arf_offset, 4); - } -#endif + // Backup the frame refs info + memcpy(frame_refs_copy, cm->frame_refs, + INTER_REFS_PER_FRAME * sizeof(RefBuffer)); -#if CONFIG_REFERENCE_BUFFER - if (cm->seq_params.frame_id_numbers_present_flag) { - cm->refresh_mask = - cm->frame_type == KEY_FRAME ? 
0xFF : get_refresh_mask(cpi); - } -#endif // CONFIG_REFERENCE_BUFFER + const int lst_map_idx = get_ref_frame_map_idx(cpi, LAST_FRAME); + const int gld_map_idx = get_ref_frame_map_idx(cpi, GOLDEN_FRAME); - if (!cm->error_resilient_mode) { - aom_wb_write_bit( - wb, cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_FORWARD); - } -#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING - aom_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2); -#endif - encode_loopfilter(cm, wb); - encode_quantization(cm, wb); - encode_segmentation(cm, xd, wb); - { - int i; - struct segmentation *const seg = &cm->seg; - int segment_quantizer_active = 0; - for (i = 0; i < MAX_SEGMENTS; i++) { - if (segfeature_active(seg, i, SEG_LVL_ALT_Q)) { - segment_quantizer_active = 1; - } - } + // Set up the frame refs mapping indexes according to the + // frame_refs_short_signaling policy. + av1_set_frame_refs(cm, lst_map_idx, gld_map_idx); - if (cm->delta_q_present_flag) - assert(segment_quantizer_active == 0 && cm->base_qindex > 0); - if (segment_quantizer_active == 0 && cm->base_qindex > 0) { - aom_wb_write_bit(wb, cm->delta_q_present_flag); - if (cm->delta_q_present_flag) { - aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_q_res) - 1, 2); - xd->prev_qindex = cm->base_qindex; -#if CONFIG_EXT_DELTA_Q - assert(seg->abs_delta == SEGMENT_DELTADATA); - aom_wb_write_bit(wb, cm->delta_lf_present_flag); - if (cm->delta_lf_present_flag) { - aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_lf_res) - 1, 2); - xd->prev_delta_lf_from_base = 0; -#if CONFIG_LOOPFILTER_LEVEL - aom_wb_write_bit(wb, cm->delta_lf_multi); - for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) - xd->prev_delta_lf[lf_id] = 0; -#endif // CONFIG_LOOPFILTER_LEVEL - } -#endif // CONFIG_EXT_DELTA_Q - } + // We only turn on frame_refs_short_signaling when the encoder side decision + // on ref frames is identical to that at the decoder side. + for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ++ref_idx) { + // Compare the buffer index between two reference frames indexed + // respectively by the encoder and the decoder side decisions. + if (cm->frame_refs[ref_idx].idx != frame_refs_copy[ref_idx].idx) { + cm->frame_refs_short_signaling = 0; + break; } } -#if CONFIG_CDEF - if (!cm->all_lossless) { - encode_cdef(cm, wb); - } -#endif -#if CONFIG_LOOP_RESTORATION - encode_restoration_mode(cm, wb); -#endif // CONFIG_LOOP_RESTORATION - write_tx_mode(cm, &cm->tx_mode, wb); - - if (cpi->allow_comp_inter_inter) { - const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT; -#if !CONFIG_REF_ADAPT - const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE; -#endif // !CONFIG_REF_ADAPT - aom_wb_write_bit(wb, use_hybrid_pred); -#if !CONFIG_REF_ADAPT - if (!use_hybrid_pred) aom_wb_write_bit(wb, use_compound_pred); -#endif // !CONFIG_REF_ADAPT +#if 0 // For debug + printf("\nFrame=%d: \n", cm->current_video_frame); + printf("***frame_refs_short_signaling=%d\n", cm->frame_refs_short_signaling); + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + printf("enc_ref(map_idx=%d, buf_idx=%d)=%d, vs. 
" + "dec_ref(map_idx=%d, buf_idx=%d)=%d\n", + get_ref_frame_map_idx(cpi, ref_frame), + get_ref_frame_buf_idx(cpi, ref_frame), ref_frame, + cm->frame_refs[ref_frame - LAST_FRAME].map_idx, + cm->frame_refs[ref_frame - LAST_FRAME].idx, ref_frame); } - write_compound_tools(cm, wb); - -#if CONFIG_EXT_TX - aom_wb_write_bit(wb, cm->reduced_tx_set_used); -#endif // CONFIG_EXT_TX - -#if CONFIG_ADAPT_SCAN - aom_wb_write_bit(wb, cm->use_adapt_scan); -#endif - -#if CONFIG_GLOBAL_MOTION - if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb); -#endif // CONFIG_GLOBAL_MOTION +#endif // 0 - write_tile_info(cm, wb); + // Restore the frame refs info if frame_refs_short_signaling is off. + if (!cm->frame_refs_short_signaling) + memcpy(cm->frame_refs, frame_refs_copy, + INTER_REFS_PER_FRAME * sizeof(RefBuffer)); } -#else // New function based on HLS R18 static void write_uncompressed_header_obu(AV1_COMP *cpi, + struct aom_write_bit_buffer *saved_wb, struct aom_write_bit_buffer *wb) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; -#if CONFIG_EXT_TILE - aom_wb_write_literal(wb, cm->large_scale_tile, 1); -#endif // CONFIG_EXT_TILE - -#if CONFIG_EXT_REFS // NOTE: By default all coded frames to be used as a reference cm->is_reference_frame = 1; + cm->frame_type = cm->intra_only ? INTRA_ONLY_FRAME : cm->frame_type; - if (cm->show_existing_frame) { - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show]; + if (cm->seq_params.still_picture) { + assert(cm->show_existing_frame == 0); + assert(cm->show_frame == 1); + assert(cm->frame_type == KEY_FRAME); + } + if (!cm->seq_params.reduced_still_picture_hdr) { + if (cm->show_existing_frame) { + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show]; - if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, - "Buffer %d does not contain a reconstructed frame", - frame_to_show); + if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Buffer %d does not contain a reconstructed frame", + frame_to_show); + } + ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show); + + aom_wb_write_bit(wb, 1); // show_existing_frame + aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3); + + if (cm->seq_params.decoder_model_info_present_flag && + cm->timing_info.equal_picture_interval == 0) { + write_tu_pts_info(cm, wb); + } + if (cm->seq_params.frame_id_numbers_present_flag) { + int frame_id_len = cm->seq_params.frame_id_length; + int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show]; + aom_wb_write_literal(wb, display_frame_id, frame_id_len); + } + + if (cm->reset_decoder_state && + frame_bufs[frame_to_show].frame_type != KEY_FRAME) { + aom_internal_error( + &cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "show_existing_frame to reset state on KEY_FRAME only"); + } + + return; + } else { + aom_wb_write_bit(wb, 0); // show_existing_frame } - ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show); - aom_wb_write_bit(wb, 1); // show_existing_frame - aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3); + aom_wb_write_literal(wb, cm->frame_type, 2); -#if CONFIG_REFERENCE_BUFFER - if (cm->seq_params.frame_id_numbers_present_flag) { - int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7; - int display_frame_id = 
cm->ref_frame_id[cpi->existing_fb_idx_to_show]; - aom_wb_write_literal(wb, display_frame_id, frame_id_len); - /* Add a zero byte to prevent emulation of superframe marker */ - /* Same logic as when when terminating the entropy coder */ - /* Consider to have this logic only one place */ - aom_wb_write_literal(wb, 0, 8); + aom_wb_write_bit(wb, cm->show_frame); + if (cm->show_frame) { + if (cm->seq_params.decoder_model_info_present_flag && + cm->timing_info.equal_picture_interval == 0) + write_tu_pts_info(cm, wb); + } else { + aom_wb_write_bit(wb, cm->showable_frame); + } + if (frame_is_sframe(cm)) { + assert(cm->error_resilient_mode); + } else if (!(cm->frame_type == KEY_FRAME && cm->show_frame)) { + aom_wb_write_bit(wb, cm->error_resilient_mode); } -#endif // CONFIG_REFERENCE_BUFFER + } + aom_wb_write_bit(wb, cm->disable_cdf_update); - return; + if (cm->seq_params.force_screen_content_tools == 2) { + aom_wb_write_bit(wb, cm->allow_screen_content_tools); } else { -#endif // CONFIG_EXT_REFS - aom_wb_write_bit(wb, 0); // show_existing_frame -#if CONFIG_EXT_REFS + assert(cm->allow_screen_content_tools == + cm->seq_params.force_screen_content_tools); } -#endif // CONFIG_EXT_REFS - cm->frame_type = cm->intra_only ? INTRA_ONLY_FRAME : cm->frame_type; - aom_wb_write_literal(wb, cm->frame_type, 2); + if (cm->allow_screen_content_tools) { + if (cm->seq_params.force_integer_mv == 2) { + aom_wb_write_bit(wb, cm->cur_frame_force_integer_mv); + } else { + assert(cm->cur_frame_force_integer_mv == cm->seq_params.force_integer_mv); + } + } else { + assert(cm->cur_frame_force_integer_mv == 0); + } - if (cm->intra_only) cm->frame_type = INTRA_ONLY_FRAME; + cm->invalid_delta_frame_id_minus_1 = 0; + int frame_size_override_flag = 0; + cm->frame_refs_short_signaling = 0; - aom_wb_write_bit(wb, cm->show_frame); - aom_wb_write_bit(wb, cm->error_resilient_mode); + if (cm->seq_params.reduced_still_picture_hdr) { + assert(cm->width == cm->seq_params.max_frame_width && + cm->height == cm->seq_params.max_frame_height); + } else { + if (cm->seq_params.frame_id_numbers_present_flag) { + int frame_id_len = cm->seq_params.frame_id_length; + aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len); + } -#if CONFIG_REFERENCE_BUFFER - cm->invalid_delta_frame_id_minus1 = 0; - if (cm->seq_params.frame_id_numbers_present_flag) { - int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7; - aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len); + if (cm->width > cm->seq_params.max_frame_width || + cm->height > cm->seq_params.max_frame_height) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Frame dimensions are larger than the maximum values"); + } + + frame_size_override_flag = + frame_is_sframe(cm) ? 
1 + : (cm->width != cm->seq_params.max_frame_width || + cm->height != cm->seq_params.max_frame_height); + if (!frame_is_sframe(cm)) aom_wb_write_bit(wb, frame_size_override_flag); + + if (cm->seq_params.enable_order_hint) + aom_wb_write_literal(wb, cm->frame_offset, + cm->seq_params.order_hint_bits_minus_1 + 1); + + if (!cm->error_resilient_mode && !frame_is_intra_only(cm)) { + aom_wb_write_literal(wb, cm->primary_ref_frame, PRIMARY_REF_BITS); + } + } + + if (cm->seq_params.decoder_model_info_present_flag) { + aom_wb_write_bit(wb, cm->buffer_removal_delay_present); + if (cm->buffer_removal_delay_present) { + for (int op_num = 0; + op_num < cm->seq_params.operating_points_cnt_minus_1 + 1; op_num++) { + if (cm->op_params[op_num].decoder_model_param_present_flag) { + if (((cm->seq_params.operating_point_idc[op_num] >> + cm->temporal_layer_id) & + 0x1 && + (cm->seq_params.operating_point_idc[op_num] >> + (cm->spatial_layer_id + 8)) & + 0x1) || + cm->seq_params.operating_point_idc[op_num] == 0) { + aom_wb_write_literal( + wb, (uint32_t)cm->op_frame_timing[op_num].buffer_removal_delay, + cm->buffer_model.buffer_removal_delay_length); + cm->op_frame_timing[op_num].buffer_removal_delay++; + } + } + } + } } -#endif // CONFIG_REFERENCE_BUFFER + cpi->refresh_frame_mask = get_refresh_mask(cpi); if (cm->frame_type == KEY_FRAME) { - write_frame_size(cm, wb); - write_sb_size(cm, wb); - -#if CONFIG_ANS && ANS_MAX_SYMBOLS - assert(cpi->common.ans_window_size_log2 >= 8); - assert(cpi->common.ans_window_size_log2 < 24); - aom_wb_write_literal(wb, cpi->common.ans_window_size_log2 - 8, 4); -#endif // CONFIG_ANS && ANS_MAX_SYMBOLS - aom_wb_write_bit(wb, cm->allow_screen_content_tools); -#if CONFIG_AMVR - if (cm->allow_screen_content_tools) { - if (cm->seq_mv_precision_level == 2) { - aom_wb_write_bit(wb, 1); - } else { - aom_wb_write_bit(wb, 0); - aom_wb_write_bit(wb, cm->seq_mv_precision_level == 0); - } + if (!cm->show_frame) { // unshown keyframe (forward keyframe) + aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES); + } else { + assert(cpi->refresh_frame_mask == 0xFF); } -#endif - } else if (cm->frame_type == INTRA_ONLY_FRAME) { - if (cm->intra_only) aom_wb_write_bit(wb, cm->allow_screen_content_tools); -#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING - if (!cm->error_resilient_mode) { - if (cm->intra_only) { - aom_wb_write_bit(wb, - cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL); + } else { + if (cm->frame_type == INTRA_ONLY_FRAME) { + assert(cpi->refresh_frame_mask != 0xFF); + int updated_fb = -1; + for (int i = 0; i < REF_FRAMES; i++) { + // If more than one frame is refreshed, it doesn't matter which one + // we pick, so pick the first. 
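[Editor's note] The "pick the first refreshed slot" scan above records which frame buffer will carry the current frame-context type. As a standalone sketch, with the surrounding state reduced to the essentials:

#include <stdio.h>

#define REF_FRAMES 8

static int first_refreshed_fb(int refresh_frame_mask) {
  for (int i = 0; i < REF_FRAMES; i++)
    if (refresh_frame_mask & (1 << i)) return i;
  return -1; /* large-scale-tile frames may refresh no buffer at all */
}

int main(void) {
  printf("%d\n", first_refreshed_fb(0x28)); /* bits 3 and 5 set -> returns 3 */
  return 0;
}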
+ if (cpi->refresh_frame_mask & (1 << i)) { + updated_fb = i; + break; + } } - } -#endif -#if CONFIG_EXT_REFS - cpi->refresh_frame_mask = get_refresh_mask(cpi); -#endif // CONFIG_EXT_REFS - - if (cm->intra_only) { -#if CONFIG_EXT_REFS + assert(updated_fb >= 0); + cm->fb_of_context_type[cm->frame_context_idx] = updated_fb; aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES); -#else - aom_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES); -#endif // CONFIG_EXT_REFS - write_frame_size(cm, wb); - -#if CONFIG_ANS && ANS_MAX_SYMBOLS - assert(cpi->common.ans_window_size_log2 >= 8); - assert(cpi->common.ans_window_size_log2 < 24); - aom_wb_write_literal(wb, cpi->common.ans_window_size_log2 - 8, 4); -#endif // CONFIG_ANS && ANS_MAX_SYMBOLS - } - } else if (cm->frame_type == INTER_FRAME) { - MV_REFERENCE_FRAME ref_frame; -#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING - if (!cm->error_resilient_mode) { - aom_wb_write_bit(wb, cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE); - if (cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE) - aom_wb_write_bit(wb, - cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL); - } -#endif + } else if (cm->frame_type == INTER_FRAME || frame_is_sframe(cm)) { + if (cm->frame_type == INTER_FRAME) { + aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES); + } else { + assert(frame_is_sframe(cm) && cpi->refresh_frame_mask == 0xFF); + } + int updated_fb = -1; + for (int i = 0; i < REF_FRAMES; i++) { + // If more than one frame is refreshed, it doesn't matter which one + // we pick, so pick the first. + if (cpi->refresh_frame_mask & (1 << i)) { + updated_fb = i; + break; + } + } + // large scale tile sometimes won't refresh any fbs + if (updated_fb >= 0) { + cm->fb_of_context_type[cm->frame_context_idx] = updated_fb; + } -#if CONFIG_EXT_REFS - cpi->refresh_frame_mask = get_refresh_mask(cpi); - aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES); -#else - aom_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES); -#endif // CONFIG_EXT_REFS - -#if CONFIG_EXT_REFS - if (!cpi->refresh_frame_mask) { - // NOTE: "cpi->refresh_frame_mask == 0" indicates that the coded frame - // will not be used as a reference - cm->is_reference_frame = 0; - } -#endif // CONFIG_EXT_REFS - - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX); - aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame), - REF_FRAMES_LOG2); -#if !CONFIG_FRAME_SIGN_BIAS - aom_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]); -#endif // !CONFIG_FRAME_SIGN_BIAS -#if CONFIG_REFERENCE_BUFFER - if (cm->seq_params.frame_id_numbers_present_flag) { - int i = get_ref_frame_map_idx(cpi, ref_frame); - int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7; - int diff_len = cm->seq_params.delta_frame_id_length_minus2 + 2; - int delta_frame_id_minus1 = - ((cm->current_frame_id - cm->ref_frame_id[i] + - (1 << frame_id_len)) % - (1 << frame_id_len)) - - 1; - if (delta_frame_id_minus1 < 0 || - delta_frame_id_minus1 >= (1 << diff_len)) - cm->invalid_delta_frame_id_minus1 = 1; - aom_wb_write_literal(wb, delta_frame_id_minus1, diff_len); + if (!cpi->refresh_frame_mask) { + // NOTE: "cpi->refresh_frame_mask == 0" indicates that the coded frame + // will not be used as a reference + cm->is_reference_frame = 0; } -#endif // CONFIG_REFERENCE_BUFFER } + } -#if CONFIG_FRAME_SIZE - if (cm->error_resilient_mode == 0) { - write_frame_size_with_refs(cpi, wb); - } else { - write_frame_size(cm, wb); - } -#else - 
write_frame_size_with_refs(cpi, wb); -#endif + if (!frame_is_intra_only(cm) || cpi->refresh_frame_mask != 0xFF) { + // Write all ref frame order hints if error_resilient_mode == 1 + if (cm->error_resilient_mode && cm->seq_params.enable_order_hint) { + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) { + // Get buffer index + const int buf_idx = cm->ref_frame_map[ref_idx]; + assert(buf_idx >= 0 && buf_idx < FRAME_BUFFERS); -#if CONFIG_AMVR - if (cm->seq_mv_precision_level == 2) { - aom_wb_write_bit(wb, cm->cur_frame_mv_precision_level == 0); + // Write order hint to bit stream + aom_wb_write_literal(wb, frame_bufs[buf_idx].cur_frame_offset, + cm->seq_params.order_hint_bits_minus_1 + 1); + } } -#endif - aom_wb_write_bit(wb, cm->allow_high_precision_mv); + } - fix_interp_filter(cm, cpi->td.counts); - write_frame_interp_filter(cm->interp_filter, wb); -#if CONFIG_TEMPMV_SIGNALING - if (frame_might_use_prev_frame_mvs(cm)) { - aom_wb_write_bit(wb, cm->use_prev_frame_mvs); - } -#endif - } else if (cm->frame_type == S_FRAME) { - MV_REFERENCE_FRAME ref_frame; - -#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING - if (!cm->error_resilient_mode) { - aom_wb_write_bit(wb, cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE); - if (cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE) - aom_wb_write_bit(wb, - cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL); - } -#endif + if (cm->frame_type == KEY_FRAME) { + write_frame_size(cm, frame_size_override_flag, wb); + assert(!av1_superres_scaled(cm) || !cm->allow_intrabc); + if (cm->allow_screen_content_tools && !av1_superres_scaled(cm)) + aom_wb_write_bit(wb, cm->allow_intrabc); + // all eight fbs are refreshed, pick one that will live long enough + cm->fb_of_context_type[REGULAR_FRAME] = 0; + } else { + if (cm->frame_type == INTRA_ONLY_FRAME) { + write_frame_size(cm, frame_size_override_flag, wb); + assert(!av1_superres_scaled(cm) || !cm->allow_intrabc); + if (cm->allow_screen_content_tools && !av1_superres_scaled(cm)) + aom_wb_write_bit(wb, cm->allow_intrabc); + } else if (cm->frame_type == INTER_FRAME || frame_is_sframe(cm)) { + MV_REFERENCE_FRAME ref_frame; -#if CONFIG_EXT_REFS - if (!cpi->refresh_frame_mask) { - // NOTE: "cpi->refresh_frame_mask == 0" indicates that the coded frame - // will not be used as a reference - cm->is_reference_frame = 0; - } -#endif // CONFIG_EXT_REFS - - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX); - aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame), - REF_FRAMES_LOG2); - assert(cm->ref_frame_sign_bias[ref_frame] == 0); -#if CONFIG_REFERENCE_BUFFER - if (cm->seq_params.frame_id_numbers_present_flag) { - int i = get_ref_frame_map_idx(cpi, ref_frame); - int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7; - int diff_len = cm->seq_params.delta_frame_id_length_minus2 + 2; - int delta_frame_id_minus1 = - ((cm->current_frame_id - cm->ref_frame_id[i] + - (1 << frame_id_len)) % - (1 << frame_id_len)) - - 1; - if (delta_frame_id_minus1 < 0 || - delta_frame_id_minus1 >= (1 << diff_len)) - cm->invalid_delta_frame_id_minus1 = 1; - aom_wb_write_literal(wb, delta_frame_id_minus1, diff_len); + // NOTE: Error resilient mode turns off frame_refs_short_signaling + // automatically. 
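+ // With frame refs short signaling, only the LAST_FRAME and GOLDEN_FRAME
+ // map indices are coded explicitly below; the decoder is expected to derive
+ // the remaining references from order hints, which is why the feature is
+ // tied to seq_params.enable_order_hint.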
+#define FRAME_REFS_SHORT_SIGNALING 0 +#if FRAME_REFS_SHORT_SIGNALING + cm->frame_refs_short_signaling = cm->seq_params.enable_order_hint; +#endif // FRAME_REFS_SHORT_SIGNALING + + if (cm->frame_refs_short_signaling) { + // NOTE(zoeliu@google.com): + // An example encoder-side implementation of frame refs short signaling, + // which is only turned on when the encoder-side decision on ref frames + // is identical to the decoder-side decision. + check_frame_refs_short_signaling(cpi); } -#endif // CONFIG_REFERENCE_BUFFER - } -#if CONFIG_FRAME_SIZE - if (cm->error_resilient_mode == 0) { - write_frame_size_with_refs(cpi, wb); - } else { - write_frame_size(cm, wb); - } -#else - write_frame_size_with_refs(cpi, wb); -#endif + if (cm->seq_params.enable_order_hint) + aom_wb_write_bit(wb, cm->frame_refs_short_signaling); - aom_wb_write_bit(wb, cm->allow_high_precision_mv); + if (cm->frame_refs_short_signaling) { + const int lst_ref = get_ref_frame_map_idx(cpi, LAST_FRAME); + aom_wb_write_literal(wb, lst_ref, REF_FRAMES_LOG2); - fix_interp_filter(cm, cpi->td.counts); - write_frame_interp_filter(cm->interp_filter, wb); -#if CONFIG_TEMPMV_SIGNALING - if (frame_might_use_prev_frame_mvs(cm)) { - aom_wb_write_bit(wb, cm->use_prev_frame_mvs); - } -#endif - } + const int gld_ref = get_ref_frame_map_idx(cpi, GOLDEN_FRAME); + aom_wb_write_literal(wb, gld_ref, REF_FRAMES_LOG2); + } -#if CONFIG_MFMV - if (cm->show_frame == 0) { - int arf_offset = AOMMIN( - (MAX_GF_INTERVAL - 1), - cpi->twopass.gf_group.arf_src_offset[cpi->twopass.gf_group.index]); -#if CONFIG_EXT_REFS - int brf_offset = - cpi->twopass.gf_group.brf_src_offset[cpi->twopass.gf_group.index]; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX); + if (!cm->frame_refs_short_signaling) + aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame), + REF_FRAMES_LOG2); + if (cm->seq_params.frame_id_numbers_present_flag) { + int i = get_ref_frame_map_idx(cpi, ref_frame); + int frame_id_len = cm->seq_params.frame_id_length; + int diff_len = cm->seq_params.delta_frame_id_length; + int delta_frame_id_minus_1 = + ((cm->current_frame_id - cm->ref_frame_id[i] + + (1 << frame_id_len)) % + (1 << frame_id_len)) - + 1; + if (delta_frame_id_minus_1 < 0 || + delta_frame_id_minus_1 >= (1 << diff_len)) + cm->invalid_delta_frame_id_minus_1 = 1; + aom_wb_write_literal(wb, delta_frame_id_minus_1, diff_len); + } + } - arf_offset = AOMMIN((MAX_GF_INTERVAL - 1), arf_offset + brf_offset); -#endif - aom_wb_write_literal(wb, arf_offset, 4); - } -#endif + if (!cm->error_resilient_mode && frame_size_override_flag) { + write_frame_size_with_refs(cpi, wb); + } else { + write_frame_size(cm, frame_size_override_flag, wb); + } -#if CONFIG_REFERENCE_BUFFER - if (cm->seq_params.frame_id_numbers_present_flag) { - cm->refresh_mask = - cm->frame_type == KEY_FRAME ? 
0xFF : get_refresh_mask(cpi); + if (cm->cur_frame_force_integer_mv) { + cm->allow_high_precision_mv = 0; + } else { + aom_wb_write_bit(wb, cm->allow_high_precision_mv); + } + fix_interp_filter(cm, cpi->td.counts); + write_frame_interp_filter(cm->interp_filter, wb); + aom_wb_write_bit(wb, cm->switchable_motion_mode); + if (frame_might_allow_ref_frame_mvs(cm)) { + aom_wb_write_bit(wb, cm->allow_ref_frame_mvs); + } else { + assert(cm->allow_ref_frame_mvs == 0); + } + } } -#endif // CONFIG_REFERENCE_BUFFER - if (!cm->error_resilient_mode) { + const int might_bwd_adapt = + !(cm->seq_params.reduced_still_picture_hdr) && !(cm->disable_cdf_update); + if (cm->large_scale_tile) + cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; + + if (might_bwd_adapt) { aom_wb_write_bit( - wb, cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_FORWARD); + wb, cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED); } -#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING - aom_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2); -#endif - encode_loopfilter(cm, wb); + + write_tile_info(cm, saved_wb, wb); encode_quantization(cm, wb); encode_segmentation(cm, xd, wb); - { - int i; - struct segmentation *const seg = &cm->seg; - int segment_quantizer_active = 0; - for (i = 0; i < MAX_SEGMENTS; i++) { - if (segfeature_active(seg, i, SEG_LVL_ALT_Q)) { - segment_quantizer_active = 1; - } - } - if (cm->delta_q_present_flag) - assert(segment_quantizer_active == 0 && cm->base_qindex > 0); - if (segment_quantizer_active == 0 && cm->base_qindex > 0) { - aom_wb_write_bit(wb, cm->delta_q_present_flag); - if (cm->delta_q_present_flag) { - aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_q_res) - 1, 2); - xd->prev_qindex = cm->base_qindex; -#if CONFIG_EXT_DELTA_Q - assert(seg->abs_delta == SEGMENT_DELTADATA); + if (cm->delta_q_present_flag) assert(cm->base_qindex > 0); + if (cm->base_qindex > 0) { + aom_wb_write_bit(wb, cm->delta_q_present_flag); + if (cm->delta_q_present_flag) { + aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_q_res) - 1, 2); + xd->current_qindex = cm->base_qindex; + if (cm->allow_intrabc) + assert(cm->delta_lf_present_flag == 0); + else aom_wb_write_bit(wb, cm->delta_lf_present_flag); - if (cm->delta_lf_present_flag) { - aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_lf_res) - 1, 2); -#if CONFIG_LOOPFILTER_LEVEL - for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) - xd->prev_delta_lf[lf_id] = 0; -#endif // CONFIG_LOOPFILTER_LEVEL - xd->prev_delta_lf_from_base = 0; - } -#endif // CONFIG_EXT_DELTA_Q + if (cm->delta_lf_present_flag) { + aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_lf_res) - 1, 2); + aom_wb_write_bit(wb, cm->delta_lf_multi); + av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); } } } -#if CONFIG_CDEF - if (!cm->all_lossless) { - encode_cdef(cm, wb); + + if (cm->all_lossless) { + assert(!av1_superres_scaled(cm)); + } else { + if (!cm->coded_lossless) { + encode_loopfilter(cm, wb); + encode_cdef(cm, wb); + } + encode_restoration_mode(cm, wb); } -#endif -#if CONFIG_LOOP_RESTORATION - encode_restoration_mode(cm, wb); -#endif // CONFIG_LOOP_RESTORATION + write_tx_mode(cm, &cm->tx_mode, wb); if (cpi->allow_comp_inter_inter) { const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT; -#if !CONFIG_REF_ADAPT - const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE; -#endif // !CONFIG_REF_ADAPT aom_wb_write_bit(wb, use_hybrid_pred); -#if !CONFIG_REF_ADAPT - if (!use_hybrid_pred) aom_wb_write_bit(wb, use_compound_pred); -#endif // !CONFIG_REF_ADAPT } - 
write_compound_tools(cm, wb); - -#if CONFIG_EXT_TX - aom_wb_write_bit(wb, cm->reduced_tx_set_used); -#endif // CONFIG_EXT_TX - -#if CONFIG_GLOBAL_MOTION - if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb); -#endif // CONFIG_GLOBAL_MOTION - - write_tile_info(cm, wb); -} -#endif // CONFIG_OBU - -static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data) { - AV1_COMMON *const cm = &cpi->common; -#if CONFIG_SUPERTX - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; -#endif // CONFIG_SUPERTX - FRAME_CONTEXT *const fc = cm->fc; - aom_writer *header_bc; - int i; -#if !CONFIG_NEW_MULTISYMBOL - FRAME_COUNTS *counts = cpi->td.counts; - int j; -#endif - - const int probwt = cm->num_tg; - (void)probwt; - (void)i; - (void)fc; - - aom_writer real_header_bc; - header_bc = &real_header_bc; -#if CONFIG_ANS - header_bc->size = 1 << cpi->common.ans_window_size_log2; -#endif - aom_start_encode(header_bc, data); -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) - if (cm->tx_mode == TX_MODE_SELECT) - av1_cond_prob_diff_update(header_bc, &cm->fc->quarter_tx_size_prob, - cm->counts.quarter_tx_size, probwt); -#endif -#if CONFIG_LV_MAP - av1_write_txb_probs(cpi, header_bc); -#endif // CONFIG_LV_MAP + if (cm->is_skip_mode_allowed) aom_wb_write_bit(wb, cm->skip_mode_flag); -#if CONFIG_VAR_TX && !CONFIG_NEW_MULTISYMBOL - if (cm->tx_mode == TX_MODE_SELECT) - update_txfm_partition_probs(cm, header_bc, counts, probwt); -#endif - -#if !CONFIG_NEW_MULTISYMBOL - update_skip_probs(cm, header_bc, counts); -#endif - - if (!frame_is_intra_only(cm)) { -#if !CONFIG_NEW_MULTISYMBOL - update_inter_mode_probs(cm, header_bc, counts); -#endif -#if CONFIG_INTERINTRA - if (cm->reference_mode != COMPOUND_REFERENCE && - cm->allow_interintra_compound) { -#if !CONFIG_NEW_MULTISYMBOL - for (i = 0; i < BLOCK_SIZE_GROUPS; i++) { - if (is_interintra_allowed_bsize_group(i)) { - av1_cond_prob_diff_update(header_bc, &fc->interintra_prob[i], - cm->counts.interintra[i], probwt); - } - } -#endif -#if CONFIG_WEDGE && !CONFIG_NEW_MULTISYMBOL -#if CONFIG_EXT_PARTITION_TYPES - int block_sizes_to_update = BLOCK_SIZES_ALL; -#else - int block_sizes_to_update = BLOCK_SIZES; -#endif - for (i = 0; i < block_sizes_to_update; i++) { - if (is_interintra_allowed_bsize(i) && is_interintra_wedge_used(i)) - av1_cond_prob_diff_update(header_bc, &fc->wedge_interintra_prob[i], - cm->counts.wedge_interintra[i], probwt); - } -#endif // CONFIG_WEDGE && CONFIG_NEW_MULTISYMBOL - } -#endif // CONFIG_INTERINTRA - -#if !CONFIG_NEW_MULTISYMBOL - for (i = 0; i < INTRA_INTER_CONTEXTS; i++) - av1_cond_prob_diff_update(header_bc, &fc->intra_inter_prob[i], - counts->intra_inter[i], probwt); -#endif + if (frame_might_allow_warped_motion(cm)) + aom_wb_write_bit(wb, cm->allow_warped_motion); + else + assert(!cm->allow_warped_motion); -#if !CONFIG_NEW_MULTISYMBOL - if (cpi->allow_comp_inter_inter) { - const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT; - if (use_hybrid_pred) - for (i = 0; i < COMP_INTER_CONTEXTS; i++) - av1_cond_prob_diff_update(header_bc, &fc->comp_inter_prob[i], - counts->comp_inter[i], probwt); - } + aom_wb_write_bit(wb, cm->reduced_tx_set_used); - if (cm->reference_mode != COMPOUND_REFERENCE) { - for (i = 0; i < REF_CONTEXTS; i++) { - for (j = 0; j < (SINGLE_REFS - 1); j++) { - av1_cond_prob_diff_update(header_bc, &fc->single_ref_prob[i][j], - counts->single_ref[i][j], probwt); - } - } - } + if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb); - if (cm->reference_mode != SINGLE_REFERENCE) { -#if CONFIG_EXT_COMP_REFS 
- for (i = 0; i < COMP_REF_TYPE_CONTEXTS; i++) - av1_cond_prob_diff_update(header_bc, &fc->comp_ref_type_prob[i], - counts->comp_ref_type[i], probwt); - - for (i = 0; i < UNI_COMP_REF_CONTEXTS; i++) - for (j = 0; j < (UNIDIR_COMP_REFS - 1); j++) - av1_cond_prob_diff_update(header_bc, &fc->uni_comp_ref_prob[i][j], - counts->uni_comp_ref[i][j], probwt); -#endif // CONFIG_EXT_COMP_REFS - - for (i = 0; i < REF_CONTEXTS; i++) { -#if CONFIG_EXT_REFS - for (j = 0; j < (FWD_REFS - 1); j++) { - av1_cond_prob_diff_update(header_bc, &fc->comp_ref_prob[i][j], - counts->comp_ref[i][j], probwt); - } - for (j = 0; j < (BWD_REFS - 1); j++) { - av1_cond_prob_diff_update(header_bc, &fc->comp_bwdref_prob[i][j], - counts->comp_bwdref[i][j], probwt); - } -#else - for (j = 0; j < (COMP_REFS - 1); j++) { - av1_cond_prob_diff_update(header_bc, &fc->comp_ref_prob[i][j], - counts->comp_ref[i][j], probwt); - } -#endif // CONFIG_EXT_REFS - } + if (cm->film_grain_params_present && (cm->show_frame || cm->showable_frame)) { + int flip_back_update_parameters_flag = 0; + if (cm->frame_type != INTER_FRAME && + cm->film_grain_params.update_parameters == 0) { + cm->film_grain_params.update_parameters = 1; + flip_back_update_parameters_flag = 1; } -#endif // CONFIG_NEW_MULTISYMBOL - -#if CONFIG_COMPOUND_SINGLEREF - for (i = 0; i < COMP_INTER_MODE_CONTEXTS; i++) - av1_cond_prob_diff_update(header_bc, &fc->comp_inter_mode_prob[i], - counts->comp_inter_mode[i], probwt); -#endif // CONFIG_COMPOUND_SINGLEREF + write_film_grain_params(cpi, wb); -#if !CONFIG_NEW_MULTISYMBOL - av1_write_nmv_probs(cm, cm->allow_high_precision_mv, header_bc, counts->mv); -#endif -#if CONFIG_SUPERTX - if (!xd->lossless[0]) update_supertx_probs(cm, probwt, header_bc); -#endif // CONFIG_SUPERTX + if (flip_back_update_parameters_flag) + cm->film_grain_params.update_parameters = 0; } - aom_stop_encode(header_bc); - assert(header_bc->pos <= 0xffff); - return header_bc->pos; + + if (cm->large_scale_tile) write_ext_tile_info(cm, saved_wb, wb); } -#if !CONFIG_OBU || CONFIG_EXT_TILE static int choose_size_bytes(uint32_t size, int spare_msbs) { // Choose the number of bytes required to represent size, without // using the 'spare_msbs' number of most significant bits. @@ -5394,116 +3337,112 @@ static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst, int tsb; int tcsb; -#if CONFIG_EXT_TILE if (cm->large_scale_tile) { // The top bit in the tile size field indicates tile copy mode, so we // have 1 less bit to code the tile size tsb = choose_size_bytes(max_tile_size, 1); tcsb = choose_size_bytes(max_tile_col_size, 0); } else { -#endif // CONFIG_EXT_TILE tsb = choose_size_bytes(max_tile_size, 0); tcsb = 4; // This is ignored (void)max_tile_col_size; -#if CONFIG_EXT_TILE } -#endif // CONFIG_EXT_TILE assert(tsb > 0); assert(tcsb > 0); *tile_size_bytes = tsb; *tile_col_size_bytes = tcsb; + if (tsb == 4 && tcsb == 4) return data_size; - if (tsb == 4 && tcsb == 4) { - return data_size; - } else { - uint32_t wpos = 0; - uint32_t rpos = 0; - -#if CONFIG_EXT_TILE - if (cm->large_scale_tile) { - int tile_row; - int tile_col; - - for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) { - // All but the last column has a column header - if (tile_col < cm->tile_cols - 1) { - uint32_t tile_col_size = mem_get_le32(dst + rpos); - rpos += 4; - - // Adjust the tile column size by the number of bytes removed - // from the tile size fields. 
- tile_col_size -= (4 - tsb) * cm->tile_rows; - - mem_put_varsize(dst + wpos, tcsb, tile_col_size); - wpos += tcsb; - } + uint32_t wpos = 0; + uint32_t rpos = 0; - for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) { - // All rows, including the last, have a header - uint32_t tile_header = mem_get_le32(dst + rpos); - rpos += 4; - - // If this is a copy tile, we need to shift the MSB to the - // top bit of the new width, and there is no data to copy. - if (tile_header >> 31 != 0) { - if (tsb < 4) tile_header >>= 32 - 8 * tsb; - mem_put_varsize(dst + wpos, tsb, tile_header); - wpos += tsb; - } else { - mem_put_varsize(dst + wpos, tsb, tile_header); - wpos += tsb; + if (cm->large_scale_tile) { + int tile_row; + int tile_col; - memmove(dst + wpos, dst + rpos, tile_header); - rpos += tile_header; - wpos += tile_header; - } - } + for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) { + // All but the last column has a column header + if (tile_col < cm->tile_cols - 1) { + uint32_t tile_col_size = mem_get_le32(dst + rpos); + rpos += 4; + + // Adjust the tile column size by the number of bytes removed + // from the tile size fields. + tile_col_size -= (4 - tsb) * cm->tile_rows; + + mem_put_varsize(dst + wpos, tcsb, tile_col_size); + wpos += tcsb; } - } else { -#endif // CONFIG_EXT_TILE - const int n_tiles = cm->tile_cols * cm->tile_rows; - int n; + for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) { + // All rows, including the last, have a header + uint32_t tile_header = mem_get_le32(dst + rpos); + rpos += 4; - if (n == n_tiles - 1) { - tile_size = data_size - rpos; + // If this is a copy tile, we need to shift the MSB to the + // top bit of the new width, and there is no data to copy. + if (tile_header >> 31 != 0) { + if (tsb < 4) tile_header >>= 32 - 8 * tsb; + mem_put_varsize(dst + wpos, tsb, tile_header); + wpos += tsb; } else { - tile_size = mem_get_le32(dst + rpos); - rpos += 4; - mem_put_varsize(dst + wpos, tsb, tile_size); + mem_put_varsize(dst + wpos, tsb, tile_header); wpos += tsb; - } - memmove(dst + wpos, dst + rpos, tile_size); - - rpos += tile_size; - wpos += tile_size; + tile_header += AV1_MIN_TILE_SIZE_BYTES; + memmove(dst + wpos, dst + rpos, tile_header); + rpos += tile_header; + wpos += tile_header; + } } -#if CONFIG_EXT_TILE } -#endif // CONFIG_EXT_TILE assert(rpos > wpos); assert(rpos == data_size); return wpos; } + const int n_tiles = cm->tile_cols * cm->tile_rows; + int n; + + for (n = 0; n < n_tiles; n++) { + int tile_size; + + if (n == n_tiles - 1) { + tile_size = data_size - rpos; + } else { + tile_size = mem_get_le32(dst + rpos); + rpos += 4; + mem_put_varsize(dst + wpos, tsb, tile_size); + tile_size += AV1_MIN_TILE_SIZE_BYTES; + wpos += tsb; + } + + memmove(dst + wpos, dst + rpos, tile_size); + + rpos += tile_size; + wpos += tile_size; + } + + assert(rpos > wpos); + assert(rpos == data_size); + + return wpos; } -#endif -#if CONFIG_OBU -static uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension, - uint8_t *const dst) { +uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension, + uint8_t *const dst) { struct aom_write_bit_buffer wb = { dst, 0 }; uint32_t size = 0; - aom_wb_write_literal(&wb, (int)obu_type, 5); - aom_wb_write_literal(&wb, 0, 2); + aom_wb_write_literal(&wb, 0, 1); // forbidden bit. + aom_wb_write_literal(&wb, (int)obu_type, 4); aom_wb_write_literal(&wb, obu_extension ? 
1 : 0, 1); + aom_wb_write_literal(&wb, 1, 1); // obu_has_payload_length_field + aom_wb_write_literal(&wb, 0, 1); // reserved + if (obu_extension) { aom_wb_write_literal(&wb, obu_extension & 0xFF, 8); } @@ -5512,87 +3451,156 @@ static uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension, return size; } -static uint32_t write_temporal_delimiter_obu() { return 0; } +int write_uleb_obu_size(uint32_t obu_header_size, uint32_t obu_payload_size, + uint8_t *dest) { + const uint32_t obu_size = obu_payload_size; + const uint32_t offset = obu_header_size; + size_t coded_obu_size = 0; -static uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) { - AV1_COMMON *const cm = &cpi->common; - SequenceHeader *const seq_params = &cm->seq_params; - struct aom_write_bit_buffer wb = { dst, 0 }; - uint32_t size = 0; + if (aom_uleb_encode(obu_size, sizeof(obu_size), dest + offset, + &coded_obu_size) != 0) { + return AOM_CODEC_ERROR; + } - write_profile(cm->profile, &wb); + return AOM_CODEC_OK; +} - aom_wb_write_literal(&wb, 0, 4); +static size_t obu_memmove(uint32_t obu_header_size, uint32_t obu_payload_size, + uint8_t *data) { + const size_t length_field_size = aom_uleb_size_in_bytes(obu_payload_size); + const uint32_t move_dst_offset = + (uint32_t)length_field_size + obu_header_size; + const uint32_t move_src_offset = obu_header_size; + const uint32_t move_size = obu_payload_size; + memmove(data + move_dst_offset, data + move_src_offset, move_size); + return length_field_size; +} - seq_params->frame_id_numbers_present_flag = FRAME_ID_NUMBERS_PRESENT_FLAG; - aom_wb_write_literal(&wb, seq_params->frame_id_numbers_present_flag, 1); - if (seq_params->frame_id_numbers_present_flag) { - seq_params->frame_id_length_minus7 = FRAME_ID_LENGTH_MINUS7; - seq_params->delta_frame_id_length_minus2 = DELTA_FRAME_ID_LENGTH_MINUS2; - aom_wb_write_literal(&wb, seq_params->frame_id_length_minus7, 4); - aom_wb_write_literal(&wb, seq_params->delta_frame_id_length_minus2, 4); +static void add_trailing_bits(struct aom_write_bit_buffer *wb) { + if (aom_wb_is_byte_aligned(wb)) { + aom_wb_write_literal(wb, 0x80, 8); + } else { + // assumes that the other bits are already 0s + aom_wb_write_bit(wb, 1); } +} - // color_config - write_bitdepth_colorspace_sampling(cm, &wb); - - size = aom_wb_bytes_written(&wb); - return size; +static void write_bitstream_level(BitstreamLevel bl, + struct aom_write_bit_buffer *wb) { + uint8_t seq_level_idx = major_minor_to_seq_level_idx(bl); + assert(is_valid_seq_level_idx(seq_level_idx)); + aom_wb_write_literal(wb, seq_level_idx, LEVEL_BITS); } -static uint32_t write_frame_header_obu(AV1_COMP *cpi, uint8_t *const dst) { +static uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) { AV1_COMMON *const cm = &cpi->common; struct aom_write_bit_buffer wb = { dst, 0 }; - uint32_t total_size = 0; - uint32_t compressed_hdr_size, uncompressed_hdr_size; + uint32_t size = 0; - write_uncompressed_header_obu(cpi, &wb); + write_profile(cm->profile, &wb); - if (cm->show_existing_frame) { - total_size = aom_wb_bytes_written(&wb); - return total_size; + // Still picture or not + aom_wb_write_bit(&wb, cm->seq_params.still_picture); + assert(IMPLIES(!cm->seq_params.still_picture, + !cm->seq_params.reduced_still_picture_hdr)); + // whether to use reduced still picture header + aom_wb_write_bit(&wb, cm->seq_params.reduced_still_picture_hdr); + + if (cm->seq_params.reduced_still_picture_hdr) { + assert(cm->timing_info_present == 0); + assert(cm->seq_params.decoder_model_info_present_flag 
== 0); + assert(cm->seq_params.display_model_info_present_flag == 0); + write_bitstream_level(cm->seq_params.level[0], &wb); + } else { + aom_wb_write_bit(&wb, cm->timing_info_present); // timing info present flag + + if (cm->timing_info_present) { + // timing_info + write_timing_info_header(cm, &wb); + aom_wb_write_bit(&wb, cm->seq_params.decoder_model_info_present_flag); + if (cm->seq_params.decoder_model_info_present_flag) { + write_decoder_model_info(cm, &wb); + } + } + aom_wb_write_bit(&wb, cm->seq_params.display_model_info_present_flag); + aom_wb_write_literal(&wb, cm->seq_params.operating_points_cnt_minus_1, + OP_POINTS_CNT_MINUS_1_BITS); + int i; + for (i = 0; i < cm->seq_params.operating_points_cnt_minus_1 + 1; i++) { + aom_wb_write_literal(&wb, cm->seq_params.operating_point_idc[i], + OP_POINTS_IDC_BITS); + write_bitstream_level(cm->seq_params.level[i], &wb); + if (cm->seq_params.level[i].major > 3) + aom_wb_write_bit(&wb, cm->seq_params.tier[i]); + if (cm->seq_params.decoder_model_info_present_flag) { + aom_wb_write_bit(&wb, + cm->op_params[i].decoder_model_param_present_flag); + if (cm->op_params[i].decoder_model_param_present_flag) + write_dec_model_op_parameters(cm, &wb, i); + } + if (cm->seq_params.display_model_info_present_flag) { + aom_wb_write_bit(&wb, + cm->op_params[i].display_model_param_present_flag); + if (cm->op_params[i].display_model_param_present_flag) { + assert(cm->op_params[i].initial_display_delay <= 10); + aom_wb_write_literal(&wb, cm->op_params[i].initial_display_delay - 1, + 4); + } + } + } } + write_sequence_header(cpi, &wb); - // write the tile length code (Always 4 bytes for now) - aom_wb_write_literal(&wb, 3, 2); + write_color_config(cm, &wb); - if (!use_compressed_header(cm)) { - uncompressed_hdr_size = aom_wb_bytes_written(&wb); - compressed_hdr_size = 0; - } else { - // placeholder for the compressed header length - struct aom_write_bit_buffer compr_hdr_len_wb = wb; - aom_wb_write_literal(&wb, 0, 16); + aom_wb_write_bit(&wb, cm->film_grain_params_present); - uncompressed_hdr_size = aom_wb_bytes_written(&wb); - compressed_hdr_size = - write_compressed_header(cpi, dst + uncompressed_hdr_size); - aom_wb_overwrite_literal(&compr_hdr_len_wb, (int)(compressed_hdr_size), 16); - } + add_trailing_bits(&wb); - total_size = uncompressed_hdr_size + compressed_hdr_size; - return total_size; + size = aom_wb_bytes_written(&wb); + return size; +} + +static uint32_t write_frame_header_obu(AV1_COMP *cpi, + struct aom_write_bit_buffer *saved_wb, + uint8_t *const dst, + int append_trailing_bits) { + struct aom_write_bit_buffer wb = { dst, 0 }; + write_uncompressed_header_obu(cpi, saved_wb, &wb); + if (append_trailing_bits) add_trailing_bits(&wb); + return aom_wb_bytes_written(&wb); } static uint32_t write_tile_group_header(uint8_t *const dst, int startTile, - int endTile, int tiles_log2) { + int endTile, int tiles_log2, + int tile_start_and_end_present_flag) { struct aom_write_bit_buffer wb = { dst, 0 }; uint32_t size = 0; - aom_wb_write_literal(&wb, startTile, tiles_log2); - aom_wb_write_literal(&wb, endTile, tiles_log2); + if (!tiles_log2) return size; + + aom_wb_write_bit(&wb, tile_start_and_end_present_flag); + + if (tile_start_and_end_present_flag) { + aom_wb_write_literal(&wb, startTile, tiles_log2); + aom_wb_write_literal(&wb, endTile, tiles_log2); + } size = aom_wb_bytes_written(&wb); return size; } +typedef struct { + uint8_t *frame_header; + size_t obu_header_byte_offset; + size_t total_length; +} FrameHeaderInfo; + static uint32_t 
write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, - unsigned int *max_tile_size, - unsigned int *max_tile_col_size, - uint8_t *const frame_header_obu_location, - uint32_t frame_header_obu_size, - int insert_frame_header_obu_flag) { - const AV1_COMMON *const cm = &cpi->common; + struct aom_write_bit_buffer *saved_wb, + uint8_t obu_extension_header, + const FrameHeaderInfo *fh_info) { + AV1_COMMON *const cm = &cpi->common; aom_writer mode_bc; int tile_row, tile_col; TOKENEXTRA *(*const tok_buffers)[MAX_TILE_COLS] = cpi->tile_tok; @@ -5601,29 +3609,53 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, const int tile_cols = cm->tile_cols; const int tile_rows = cm->tile_rows; unsigned int tile_size = 0; + unsigned int max_tile_size = 0; + unsigned int max_tile_col_size = 0; const int n_log2_tiles = cm->log2_tile_rows + cm->log2_tile_cols; // Fixed size tile groups for the moment const int num_tg_hdrs = cm->num_tg; const int tg_size = -#if CONFIG_EXT_TILE (cm->large_scale_tile) ? 1 - : -#endif // CONFIG_EXT_TILE - (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs; + : (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs; int tile_count = 0; int curr_tg_data_size = 0; uint8_t *data = dst; int new_tg = 1; -#if CONFIG_EXT_TILE const int have_tiles = tile_cols * tile_rows > 1; -#endif + int first_tg = 1; - *max_tile_size = 0; - *max_tile_col_size = 0; + cm->largest_tile_id = 0; -#if CONFIG_EXT_TILE if (cm->large_scale_tile) { + // For large_scale_tile case, we always have only one tile group, so it can + // be written as an OBU_FRAME. + const OBU_TYPE obu_type = OBU_FRAME; + const uint32_t tg_hdr_size = write_obu_header(obu_type, 0, data); + data += tg_hdr_size; + + const uint32_t frame_header_size = + write_frame_header_obu(cpi, saved_wb, data, 0); + data += frame_header_size; + total_size += frame_header_size; + +#define EXT_TILE_DEBUG 0 +#if EXT_TILE_DEBUG + { + char fn[20] = "./fh"; + fn[4] = cm->current_video_frame / 100 + '0'; + fn[5] = (cm->current_video_frame % 100) / 10 + '0'; + fn[6] = (cm->current_video_frame % 10) + '0'; + fn[7] = '\0'; + av1_print_uncompressed_frame_header(data - frame_header_size, + frame_header_size, fn); + } +#endif // EXT_TILE_DEBUG +#undef EXT_TILE_DEBUG + + int tile_size_bytes = 0; + int tile_col_size_bytes = 0; + for (tile_col = 0; tile_col < tile_cols; tile_col++) { TileInfo tile_info; const int is_last_col = (tile_col == tile_cols - 1); @@ -5643,7 +3675,7 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; av1_tile_set_row(&tile_info, cm, tile_row); - buf->data = dst + total_size; + buf->data = dst + total_size + tg_hdr_size; // If CONFIG_EXT_TILE = 1, every tile in the row has a header, // even for the last one, unless no tiling is used at all. 
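The OBU packing in this function follows one pattern: write_obu_header() emits a one-byte header (forbidden bit, 4-bit OBU type, extension flag, obu_has_payload_length_field, reserved bit), the payload is written immediately after it, and obu_memmove() plus write_uleb_obu_size() then shift the payload to splice in the LEB128-coded size once its length is known. A minimal self-contained C sketch of the same idea, using hypothetical helper names rather than the libaom API:

#include <stdint.h>
#include <string.h>

/* Header byte layout, mirroring write_obu_header() above: bit 7 forbidden
 * (0), bits 6-3 obu_type, bit 2 extension flag, bit 1 has_size_field
 * (always 1 in the low-overhead format), bit 0 reserved (0). */
static uint8_t obu_header_byte(int obu_type, int has_extension) {
  return (uint8_t)(((obu_type & 0xF) << 3) | ((has_extension ? 1 : 0) << 2) |
                   (1 << 1));
}

/* LEB128-encode value into out; returns the number of bytes used. */
static size_t uleb_encode_sketch(uint64_t value, uint8_t *out) {
  size_t n = 0;
  do {
    uint8_t byte = value & 0x7F;
    value >>= 7;
    out[n++] = (uint8_t)(byte | (value ? 0x80 : 0));
  } while (value);
  return n;
}

/* Pack one extension-less OBU into dst (which must have room for the header,
 * the size field and the payload); returns the total bytes written. The
 * payload is laid down right after the header and then shifted up, the same
 * move that obu_memmove() + write_uleb_obu_size() perform in this file. */
static size_t pack_obu_sketch(int obu_type, const uint8_t *payload,
                              size_t payload_size, uint8_t *dst) {
  uint8_t size_field[8];
  const size_t size_len = uleb_encode_sketch(payload_size, size_field);
  dst[0] = obu_header_byte(obu_type, 0);
  memcpy(dst + 1, payload, payload_size);             /* payload after header */
  memmove(dst + 1 + size_len, dst + 1, payload_size); /* open the gap */
  memcpy(dst + 1, size_field, size_len);              /* splice in the size */
  return 1 + size_len + payload_size;
}

This shift is also why saved_wb->bit_buffer is advanced by the length-field size at several points below: saved_wb points into the frame header payload, and the pointer has to follow the payload whenever it moves.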
@@ -5651,29 +3683,25 @@ // Initialise tile context from the frame context this_tile->tctx = *cm->fc; cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; -#if CONFIG_PVQ - cpi->td.mb.pvq_q = &this_tile->pvq_q; - cpi->td.mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context; -#endif // CONFIG_PVQ -#if CONFIG_ANS - mode_bc.size = 1 << cpi->common.ans_window_size_log2; -#endif + mode_bc.allow_update_cdf = !cm->large_scale_tile; + mode_bc.allow_update_cdf = + mode_bc.allow_update_cdf && !cm->disable_cdf_update; aom_start_encode(&mode_bc, buf->data + data_offset); write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end); assert(tok == tok_end); aom_stop_encode(&mode_bc); tile_size = mode_bc.pos; -#if CONFIG_PVQ - cpi->td.mb.pvq_q = NULL; -#endif buf->size = tile_size; // Record the maximum tile size we see, so we can compact headers later. - *max_tile_size = AOMMAX(*max_tile_size, tile_size); + if (tile_size > max_tile_size) { + max_tile_size = tile_size; + cm->largest_tile_id = tile_cols * tile_row + tile_col; + } if (have_tiles) { // tile header: size of this tile, or copy offset - uint32_t tile_header = tile_size; + uint32_t tile_header = tile_size - AV1_MIN_TILE_SIZE_BYTES; const int tile_copy_mode = ((AOMMAX(cm->tile_width, cm->tile_height) << MI_SIZE_LOG2) <= 256) ? 1 @@ -5683,12 +3711,12 @@ // Very low chances to have copy tiles on the key frames, so don't // search on key frames to reduce unnecessary search. if (cm->frame_type != KEY_FRAME && tile_copy_mode) { - const int idendical_tile_offset = + const int identical_tile_offset = find_identical_tile(tile_row, tile_col, tile_buffers); - if (idendical_tile_offset > 0) { + if (identical_tile_offset > 0) { tile_size = 0; - tile_header = idendical_tile_offset | 0x80; + tile_header = identical_tile_offset | 0x80; tile_header <<= 24; } } @@ -5701,263 +3729,287 @@ if (!is_last_col) { uint32_t col_size = total_size - col_offset - 4; - mem_put_le32(dst + col_offset, col_size); + mem_put_le32(dst + col_offset + tg_hdr_size, col_size); - // If it is not final packing, record the maximum tile column size we - // see, otherwise, check if the tile size is out of the range. - *max_tile_col_size = AOMMAX(*max_tile_col_size, col_size); + // Record the maximum tile column size we see. + max_tile_col_size = AOMMAX(max_tile_col_size, col_size); } } - } else { -#endif // CONFIG_EXT_TILE - for (tile_row = 0; tile_row < tile_rows; tile_row++) { - TileInfo tile_info; - const int is_last_row = (tile_row == tile_rows - 1); - av1_tile_set_row(&tile_info, cm, tile_row); + if (have_tiles) { + total_size = remux_tiles(cm, data, total_size - frame_header_size, + max_tile_size, max_tile_col_size, + &tile_size_bytes, &tile_col_size_bytes); + total_size += frame_header_size; + } + + // In the EXT_TILE case only one tile group is used. Following the OBU + // syntax, the current tile group size is written before the tile data + // (including the tile column headers); it does not include the bytes + // that store the tg size itself. 
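+ // obu_memmove() opens a gap of length_field_size bytes between the OBU
+ // header and the payload already written after it, write_uleb_obu_size()
+ // fills the gap with the LEB128-coded payload size, and saved_wb (which
+ // points into the moved frame header) is shifted by the same amount.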
+ total_size += tg_hdr_size; + const uint32_t obu_payload_size = total_size - tg_hdr_size; + const size_t length_field_size = + obu_memmove(tg_hdr_size, obu_payload_size, dst); + if (write_uleb_obu_size(tg_hdr_size, obu_payload_size, dst) != + AOM_CODEC_OK) { + assert(0); + } + total_size += (uint32_t)length_field_size; + saved_wb->bit_buffer += length_field_size; - for (tile_col = 0; tile_col < tile_cols; tile_col++) { - const int tile_idx = tile_row * tile_cols + tile_col; - TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col]; - TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; - const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col]; - const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col]; - const int is_last_col = (tile_col == tile_cols - 1); - const int is_last_tile = is_last_col && is_last_row; - int is_last_tile_in_tg = 0; - - if (new_tg) { - if (insert_frame_header_obu_flag && tile_idx) { - // insert a copy of frame header OBU (including 4-byte size), - // except before the first tile group - data = dst + total_size; - memmove(data, frame_header_obu_location, frame_header_obu_size); - total_size += frame_header_obu_size; - } - data = dst + total_size; - // A new tile group begins at this tile. Write the obu header and - // tile group header - curr_tg_data_size = write_obu_header(OBU_TILE_GROUP, 0, data + 4); - if (n_log2_tiles) - curr_tg_data_size += write_tile_group_header( - data + curr_tg_data_size + 4, tile_idx, - AOMMIN(tile_idx + tg_size - 1, tile_cols * tile_rows - 1), - n_log2_tiles); - total_size += curr_tg_data_size + 4; - new_tg = 0; - tile_count = 0; - } - tile_count++; - av1_tile_set_col(&tile_info, cm, tile_col); + // Now fill in the gaps in the uncompressed header. + if (have_tiles) { + assert(tile_col_size_bytes >= 1 && tile_col_size_bytes <= 4); + aom_wb_overwrite_literal(saved_wb, tile_col_size_bytes - 1, 2); - if (tile_count == tg_size || tile_idx == (tile_cols * tile_rows - 1)) { - is_last_tile_in_tg = 1; - new_tg = 1; - } else { - is_last_tile_in_tg = 0; + assert(tile_size_bytes >= 1 && tile_size_bytes <= 4); + aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2); + } + return total_size; + } + + uint32_t obu_header_size = 0; + uint8_t *tile_data_start = dst + total_size; + for (tile_row = 0; tile_row < tile_rows; tile_row++) { + TileInfo tile_info; + av1_tile_set_row(&tile_info, cm, tile_row); + + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + const int tile_idx = tile_row * tile_cols + tile_col; + TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col]; + TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; + const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col]; + const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col]; + int is_last_tile_in_tg = 0; + + if (new_tg) { + data = dst + total_size; + + // A new tile group begins at this tile. Write the obu header and + // tile group header + const OBU_TYPE obu_type = + (num_tg_hdrs == 1) ? 
OBU_FRAME : OBU_TILE_GROUP; + curr_tg_data_size = + write_obu_header(obu_type, obu_extension_header, data); + obu_header_size = curr_tg_data_size; + + if (num_tg_hdrs == 1) { + curr_tg_data_size += write_frame_header_obu( + cpi, saved_wb, data + curr_tg_data_size, 0); } + curr_tg_data_size += write_tile_group_header( + data + curr_tg_data_size, tile_idx, + AOMMIN(tile_idx + tg_size - 1, tile_cols * tile_rows - 1), + n_log2_tiles, cm->num_tg > 1); + total_size += curr_tg_data_size; + tile_data_start += curr_tg_data_size; + new_tg = 0; + tile_count = 0; + } + tile_count++; + av1_tile_set_col(&tile_info, cm, tile_col); -#if CONFIG_DEPENDENT_HORZTILES - av1_tile_set_tg_boundary(&tile_info, cm, tile_row, tile_col); -#endif - buf->data = dst + total_size; + if (tile_count == tg_size || tile_idx == (tile_cols * tile_rows - 1)) { + is_last_tile_in_tg = 1; + new_tg = 1; + } else { + is_last_tile_in_tg = 0; + } - // The last tile of the tile group does not have a header. - if (!is_last_tile_in_tg) total_size += 4; + buf->data = dst + total_size; - // Initialise tile context from the frame context - this_tile->tctx = *cm->fc; - cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; -#if CONFIG_PVQ - cpi->td.mb.pvq_q = &this_tile->pvq_q; - cpi->td.mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context; -#endif // CONFIG_PVQ -#if CONFIG_ANS - mode_bc.size = 1 << cpi->common.ans_window_size_log2; -#endif // CONFIG_ANS - aom_start_encode(&mode_bc, dst + total_size); - write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end); -#if !CONFIG_LV_MAP -#if !CONFIG_PVQ - assert(tok == tok_end); -#endif // !CONFIG_PVQ -#endif // !CONFIG_LV_MAP - aom_stop_encode(&mode_bc); - tile_size = mode_bc.pos; -#if CONFIG_PVQ - cpi->td.mb.pvq_q = NULL; -#endif - assert(tile_size > 0); + // The last tile of the tile group does not have a header. + if (!is_last_tile_in_tg) total_size += 4; - curr_tg_data_size += (tile_size + (is_last_tile_in_tg ? 0 : 4)); - buf->size = tile_size; + // Initialise tile context from the frame context + this_tile->tctx = *cm->fc; + cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; + mode_bc.allow_update_cdf = 1; + mode_bc.allow_update_cdf = + mode_bc.allow_update_cdf && !cm->disable_cdf_update; + const int num_planes = av1_num_planes(cm); + av1_reset_loop_restoration(&cpi->td.mb.e_mbd, num_planes); + + aom_start_encode(&mode_bc, dst + total_size); + write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end); + aom_stop_encode(&mode_bc); + tile_size = mode_bc.pos; + assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES); - if (!is_last_tile) { - *max_tile_size = AOMMAX(*max_tile_size, tile_size); + curr_tg_data_size += (tile_size + (is_last_tile_in_tg ? 
0 : 4)); + buf->size = tile_size; + if (tile_size > max_tile_size) { + cm->largest_tile_id = tile_cols * tile_row + tile_col; + max_tile_size = tile_size; + } + + if (!is_last_tile_in_tg) { + // size of this tile + mem_put_le32(buf->data, tile_size - AV1_MIN_TILE_SIZE_BYTES); + } else { + // write current tile group size + const uint32_t obu_payload_size = curr_tg_data_size - obu_header_size; + const size_t length_field_size = + obu_memmove(obu_header_size, obu_payload_size, data); + if (write_uleb_obu_size(obu_header_size, obu_payload_size, data) != + AOM_CODEC_OK) { + assert(0); } - if (!is_last_tile_in_tg) { - // size of this tile - mem_put_le32(buf->data, tile_size); - } else { - // write current tile group size - mem_put_le32(data, curr_tg_data_size); + curr_tg_data_size += (int)length_field_size; + total_size += (uint32_t)length_field_size; + tile_data_start += length_field_size; + if (num_tg_hdrs == 1) { + // If this tg is combined with the frame header, update the saved + // frame header base offset according to the length field size + saved_wb->bit_buffer += length_field_size; } - total_size += tile_size; + if (!first_tg && cm->error_resilient_mode) { + // Make room for a duplicate Frame Header OBU. + memmove(data + fh_info->total_length, data, curr_tg_data_size); + + // Insert a copy of the Frame Header OBU. + memcpy(data, fh_info->frame_header, fh_info->total_length); + + // Force the context update tile to be the first tile in error + // resilient mode as the duplicate frame headers will have + // context_update_tile_id set to 0 + cm->largest_tile_id = 0; + + // Rewrite the OBU header to change the OBU type to Redundant Frame + // Header. + write_obu_header(OBU_REDUNDANT_FRAME_HEADER, obu_extension_header, + &data[fh_info->obu_header_byte_offset]); + + data += fh_info->total_length; + + curr_tg_data_size += (int)(fh_info->total_length); + total_size += (uint32_t)(fh_info->total_length); + } + first_tg = 0; } + + total_size += tile_size; } -#if CONFIG_EXT_TILE } -#endif // CONFIG_EXT_TILE - return (uint32_t)total_size; -} -#endif + if (have_tiles) { + // Fill in context_update_tile_id indicating the tile to use for the + // cdf update. The encoder currently sets it to the largest tile + // (but this is up to the encoder) + aom_wb_overwrite_literal(saved_wb, cm->largest_tile_id, + cm->log2_tile_cols + cm->log2_tile_rows); + // With more than one tile group, tile_size_bytes takes the default value 4 + // and does not need to be set. For a single tile group it is set in the + // section below. + if (num_tg_hdrs == 1) { + int tile_size_bytes = 4, unused; + const uint32_t tile_data_offset = (uint32_t)(tile_data_start - dst); + const uint32_t tile_data_size = total_size - tile_data_offset; + + total_size = + remux_tiles(cm, tile_data_start, tile_data_size, max_tile_size, + max_tile_col_size, &tile_size_bytes, &unused); + total_size += tile_data_offset; + assert(tile_size_bytes >= 1 && tile_size_bytes <= 4); - aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2); + + // Update the OBU length if remux_tiles() reduced the size. 
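+ // A smaller payload may also need fewer uleb bytes for the length field
+ // itself; the re-encode below detects that case and slides the tile data
+ // down over the spare bytes.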
+ uint64_t payload_size; + size_t length_field_size; + int res = + aom_uleb_decode(dst + obu_header_size, total_size - obu_header_size, + &payload_size, &length_field_size); + assert(res == 0); + (void)res; + + const uint64_t new_payload_size = + total_size - obu_header_size - length_field_size; + if (new_payload_size != payload_size) { + size_t new_length_field_size; + res = aom_uleb_encode(new_payload_size, length_field_size, + dst + obu_header_size, &new_length_field_size); + assert(res == 0); + if (new_length_field_size < length_field_size) { + const size_t src_offset = obu_header_size + length_field_size; + const size_t dst_offset = obu_header_size + new_length_field_size; + memmove(dst + dst_offset, dst + src_offset, (size_t)payload_size); + total_size -= (int)(length_field_size - new_length_field_size); + } + } + } + } + return total_size; +} + +int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) { uint8_t *data = dst; uint32_t data_size; -#if CONFIG_EXT_TILE - AV1_COMMON *const cm = &cpi->common; - uint32_t compressed_hdr_size = 0; - uint32_t uncompressed_hdr_size; - struct aom_write_bit_buffer saved_wb; - struct aom_write_bit_buffer wb = { data, 0 }; - const int have_tiles = cm->tile_cols * cm->tile_rows > 1; - int tile_size_bytes; - int tile_col_size_bytes; -#endif // CONFIG_EXT_TILE - unsigned int max_tile_size; - unsigned int max_tile_col_size; -#if CONFIG_OBU -#if !CONFIG_EXT_TILE AV1_COMMON *const cm = &cpi->common; -#endif - uint32_t obu_size; - uint8_t *frame_header_location; - uint32_t frame_header_size; -#endif + uint32_t obu_header_size = 0; + uint32_t obu_payload_size = 0; + FrameHeaderInfo fh_info = { NULL, 0, 0 }; + const uint8_t obu_extension_header = + cm->temporal_layer_id << 5 | cm->spatial_layer_id << 3 | 0; #if CONFIG_BITSTREAM_DEBUG bitstream_queue_reset_write(); #endif -#if CONFIG_OBU - // write temporal delimiter obu, preceded by 4-byte size - obu_size = write_obu_header(OBU_TD, 0, data + 4); - obu_size += write_temporal_delimiter_obu(/*data + 4 + obu_size*/); - mem_put_le32(data, obu_size); - data += obu_size + 4; + // The TD is now written outside the frame encode loop // write sequence header obu if KEY_FRAME, preceded by 4-byte size if (cm->frame_type == KEY_FRAME) { - obu_size = write_obu_header(OBU_SEQUENCE_HEADER, 0, data + 4); - obu_size += write_sequence_header_obu(cpi, data + 4 + obu_size); - mem_put_le32(data, obu_size); - data += obu_size + 4; - } + obu_header_size = write_obu_header(OBU_SEQUENCE_HEADER, 0, data); - // write frame header obu, preceded by 4-byte size - frame_header_location = data + 4; - obu_size = write_obu_header(OBU_FRAME_HEADER, 0, frame_header_location); - frame_header_size = write_frame_header_obu(cpi, data + 4 + obu_size); - obu_size += frame_header_size; - mem_put_le32(data, obu_size); - data += obu_size + 4; + obu_payload_size = write_sequence_header_obu(cpi, data + obu_header_size); + const size_t length_field_size = + obu_memmove(obu_header_size, obu_payload_size, data); + if (write_uleb_obu_size(obu_header_size, obu_payload_size, data) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } - if (cm->show_existing_frame) { - data_size = 0; - } else { - // Each tile group obu will be preceded by 4-byte size of the tile group - // obu - data_size = - write_tiles_in_tg_obus(cpi, data, &max_tile_size, &max_tile_col_size, - frame_header_location - 4, obu_size + 4, - 1 /* cm->error_resilient_mode */); + data += obu_header_size + obu_payload_size + length_field_size; } -#endif - -#if CONFIG_EXT_TILE - if 
(cm->large_scale_tile) { - // Write the uncompressed header - write_uncompressed_header_frame(cpi, &wb); - -#if CONFIG_EXT_REFS - if (cm->show_existing_frame) { - *size = aom_wb_bytes_written(&wb); - return; - } -#endif // CONFIG_EXT_REFS - - // We do not know these in advance. Output placeholder bit. - saved_wb = wb; - // Write tile size magnitudes - if (have_tiles) { - // Note that the last item in the uncompressed header is the data - // describing tile configuration. - // Number of bytes in tile column size - 1 - aom_wb_write_literal(&wb, 0, 2); + const int write_frame_header = (cm->num_tg > 1 || cm->show_existing_frame); + struct aom_write_bit_buffer saved_wb; + if (write_frame_header) { + // Write Frame Header OBU. + fh_info.frame_header = data; + obu_header_size = + write_obu_header(OBU_FRAME_HEADER, obu_extension_header, data); + obu_payload_size = + write_frame_header_obu(cpi, &saved_wb, data + obu_header_size, 1); - // Number of bytes in tile size - 1 - aom_wb_write_literal(&wb, 0, 2); + const size_t length_field_size = + obu_memmove(obu_header_size, obu_payload_size, data); + if (write_uleb_obu_size(obu_header_size, obu_payload_size, data) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; } - if (!use_compressed_header(cm)) { - uncompressed_hdr_size = (uint32_t)aom_wb_bytes_written(&wb); - aom_clear_system_state(); - compressed_hdr_size = 0; - } else { - // Size of compressed header - aom_wb_write_literal(&wb, 0, 16); - uncompressed_hdr_size = (uint32_t)aom_wb_bytes_written(&wb); - aom_clear_system_state(); - // Write the compressed header - compressed_hdr_size = - write_compressed_header(cpi, data + uncompressed_hdr_size); - } - data += uncompressed_hdr_size + compressed_hdr_size; + fh_info.obu_header_byte_offset = 0; + fh_info.total_length = + obu_header_size + obu_payload_size + length_field_size; + data += fh_info.total_length; - // Write the encoded tile data - data_size = write_tiles(cpi, data, &max_tile_size, &max_tile_col_size); - } else { -#endif // CONFIG_EXT_TILE -#if !CONFIG_OBU - data_size = write_tiles(cpi, data, &max_tile_size, &max_tile_col_size); -#endif -#if CONFIG_EXT_TILE + // Since length_field_size is determined adaptively after frame header + // encoding, saved_wb must be adjusted accordingly. + saved_wb.bit_buffer += length_field_size; } -#endif // CONFIG_EXT_TILE -#if CONFIG_EXT_TILE - if (cm->large_scale_tile) { - if (have_tiles) { - data_size = - remux_tiles(cm, data, data_size, max_tile_size, max_tile_col_size, - &tile_size_bytes, &tile_col_size_bytes); - } - - data += data_size; - // Now fill in the gaps in the uncompressed header. - if (have_tiles) { - assert(tile_col_size_bytes >= 1 && tile_col_size_bytes <= 4); - aom_wb_write_literal(&saved_wb, tile_col_size_bytes - 1, 2); - - assert(tile_size_bytes >= 1 && tile_size_bytes <= 4); - aom_wb_write_literal(&saved_wb, tile_size_bytes - 1, 2); - } - // TODO(jbb): Figure out what to do if compressed_hdr_size > 16 bits. 
- assert(compressed_hdr_size <= 0xffff); - aom_wb_write_literal(&saved_wb, compressed_hdr_size, 16); + if (cm->show_existing_frame) { + data_size = 0; } else { -#endif // CONFIG_EXT_TILE - data += data_size; -#if CONFIG_EXT_TILE - } -#endif // CONFIG_EXT_TILE -#if CONFIG_ANS && ANS_REVERSE - // Avoid aliasing the superframe index - *data++ = 0; -#endif + // Each tile group obu will be preceded by 4-byte size of the tile group + // obu + data_size = write_tiles_in_tg_obus(cpi, data, &saved_wb, + obu_extension_header, &fh_info); + } + data += data_size; *size = data - dst; + return AOM_CODEC_OK; } diff --git a/third_party/aom/av1/encoder/bitstream.h b/third_party/aom/av1/encoder/bitstream.h index 76eb85116..2047b6833 100644 --- a/third_party/aom/av1/encoder/bitstream.h +++ b/third_party/aom/av1/encoder/bitstream.h @@ -20,34 +20,24 @@ extern "C" { struct aom_write_bit_buffer; -#if CONFIG_REFERENCE_BUFFER -void write_sequence_header(AV1_COMMON *const cm, - struct aom_write_bit_buffer *wb); -#endif +void write_sequence_header(AV1_COMP *cpi, struct aom_write_bit_buffer *wb); + +uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension, + uint8_t *const dst); -void av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dest, size_t *size); +int write_uleb_obu_size(uint32_t obu_header_size, uint32_t obu_payload_size, + uint8_t *dest); -void av1_encode_token_init(void); +int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dest, size_t *size); static INLINE int av1_preserve_existing_gf(AV1_COMP *cpi) { -#if CONFIG_EXT_REFS // Do not swap gf and arf indices for internal overlay frames return !cpi->multi_arf_allowed && cpi->rc.is_src_frame_alt_ref && !cpi->rc.is_src_frame_ext_arf; -#else - return !cpi->multi_arf_allowed && cpi->refresh_golden_frame && - cpi->rc.is_src_frame_alt_ref; -#endif // CONFIG_EXT_REFS } void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, -#if CONFIG_SUPERTX - const int supertx_enabled, -#endif -#if CONFIG_TXK_SEL - int blk_row, int blk_col, int block, int plane, - TX_SIZE tx_size, -#endif + int blk_row, int blk_col, int plane, TX_SIZE tx_size, aom_writer *w); #ifdef __cplusplus diff --git a/third_party/aom/av1/encoder/block.h b/third_party/aom/av1/encoder/block.h index 8b6627825..13fc11c31 100644 --- a/third_party/aom/av1/encoder/block.h +++ b/third_party/aom/av1/encoder/block.h @@ -14,9 +14,6 @@ #include "av1/common/entropymv.h" #include "av1/common/entropy.h" -#if CONFIG_PVQ -#include "av1/encoder/encint.h" -#endif #include "av1/common/mvref_common.h" #include "av1/encoder/hash.h" #if CONFIG_DIST_8X8 @@ -27,12 +24,6 @@ extern "C" { #endif -#if CONFIG_PVQ -// Maximum possible # of tx blocks in luma plane, which is currently 256, -// since there can be 16x16 of 4x4 tx. 
-#define MAX_PVQ_BLOCKS_IN_SB (MAX_SB_SQUARE >> 2 * OD_LOG_BSIZE0) -#endif - typedef struct { unsigned int sse; int sum; @@ -41,53 +32,39 @@ typedef struct macroblock_plane { DECLARE_ALIGNED(16, int16_t, src_diff[MAX_SB_SQUARE]); -#if CONFIG_PVQ - DECLARE_ALIGNED(16, int16_t, src_int16[MAX_SB_SQUARE]); -#endif tran_low_t *qcoeff; tran_low_t *coeff; uint16_t *eobs; -#if CONFIG_LV_MAP uint8_t *txb_entropy_ctx; -#endif struct buf_2d src; // Quantizer settings - const int16_t *quant_fp; - const int16_t *round_fp; - const int16_t *quant; - const int16_t *quant_shift; - const int16_t *zbin; - const int16_t *round; -#if CONFIG_NEW_QUANT - const cuml_bins_type_nuq *cuml_bins_nuq[QUANT_PROFILES]; -#endif // CONFIG_NEW_QUANT + // These are used/accessed only in the quantization process + // RDO does not / must not depend on any of these values + // All values below share the coefficient scale/shift used in TX + const int16_t *quant_fp_QTX; + const int16_t *round_fp_QTX; + const int16_t *quant_QTX; + const int16_t *quant_shift_QTX; + const int16_t *zbin_QTX; + const int16_t *round_QTX; + const int16_t *dequant_QTX; } MACROBLOCK_PLANE; -typedef int av1_coeff_cost[PLANE_TYPES][REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] - [TAIL_TOKENS]; - -#if CONFIG_LV_MAP typedef struct { int txb_skip_cost[TXB_SKIP_CONTEXTS][2]; - int nz_map_cost[SIG_COEF_CONTEXTS][2]; - int eob_cost[EOB_COEF_CONTEXTS][2]; + int base_eob_cost[SIG_COEF_CONTEXTS_EOB][3]; + int base_cost[SIG_COEF_CONTEXTS][4]; + int eob_extra_cost[EOB_COEF_CONTEXTS][2]; int dc_sign_cost[DC_SIGN_CONTEXTS][2]; - int base_cost[NUM_BASE_LEVELS][COEFF_BASE_CONTEXTS][2]; -#if BR_NODE int lps_cost[LEVEL_CONTEXTS][COEFF_BASE_RANGE + 1]; - int br_cost[BASE_RANGE_SETS][LEVEL_CONTEXTS][2]; -#else // BR_NODE - int lps_cost[LEVEL_CONTEXTS][2]; -#endif // BR_NODE -#if CONFIG_CTX1D - int eob_mode_cost[TX_CLASSES][2]; - int empty_line_cost[TX_CLASSES][EMPTY_LINE_CONTEXTS][2]; - int hv_eob_cost[TX_CLASSES][HV_EOB_CONTEXTS][2]; -#endif } LV_MAP_COEFF_COST; typedef struct { + int eob_cost[2][11]; +} LV_MAP_EOB_COST; + +typedef struct { tran_low_t tcoeff[MAX_MB_PLANE][MAX_SB_SQUARE]; uint16_t eobs[MAX_MB_PLANE][MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; uint8_t txb_skip_ctx[MAX_MB_PLANE] @@ -95,20 +72,17 @@ typedef struct { int dc_sign_ctx[MAX_MB_PLANE] [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; } CB_COEFF_BUFFER; -#endif typedef struct { - int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; int16_t mode_context[MODE_CTX_REF_FRAMES]; -#if CONFIG_LV_MAP // TODO(angiebird): Reduce the buffer size according to sb_type tran_low_t *tcoeff[MAX_MB_PLANE]; uint16_t *eobs[MAX_MB_PLANE]; uint8_t *txb_skip_ctx[MAX_MB_PLANE]; int *dc_sign_ctx[MAX_MB_PLANE]; -#endif uint8_t ref_mv_count[MODE_CTX_REF_FRAMES]; CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE]; + int_mv global_mvs[REF_FRAMES]; int16_t compound_mode_context[MODE_CTX_REF_FRAMES]; } MB_MODE_INFO_EXT; @@ -120,39 +94,119 @@ typedef struct { } MvLimits; typedef struct { - uint8_t best_palette_color_map[MAX_SB_SQUARE]; - float kmeans_data_buf[2 * MAX_SB_SQUARE]; + uint8_t best_palette_color_map[MAX_PALETTE_SQUARE]; + int kmeans_data_buf[2 * MAX_PALETTE_SQUARE]; } PALETTE_BUFFER; typedef struct { - TX_TYPE tx_type; TX_SIZE tx_size; -#if CONFIG_VAR_TX - TX_SIZE min_tx_size; - TX_SIZE inter_tx_size[MAX_MIB_SIZE][MAX_MIB_SIZE]; - uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8]; -#endif // CONFIG_VAR_TX -#if CONFIG_TXK_SEL - TX_TYPE txk_type[MAX_SB_SQUARE / (TX_SIZE_W_MIN * 
TX_SIZE_H_MIN)]; -#endif // CONFIG_TXK_SEL + TX_SIZE inter_tx_size[INTER_TX_SIZE_BUF_LEN]; + uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + TX_TYPE txk_type[TXK_TYPE_BUF_LEN]; RD_STATS rd_stats; uint32_t hash_value; -} TX_RD_INFO; +} MB_RD_INFO; #define RD_RECORD_BUFFER_LEN 8 typedef struct { - TX_RD_INFO tx_rd_info[RD_RECORD_BUFFER_LEN]; // Circular buffer. + MB_RD_INFO tx_rd_info[RD_RECORD_BUFFER_LEN]; // Circular buffer. int index_start; int num; - CRC_CALCULATOR crc_calculator; // Hash function. -} TX_RD_RECORD; + CRC32C crc_calculator; // Hash function. +} MB_RD_RECORD; + +typedef struct { + int64_t dist; + int64_t sse; + int rate; + uint16_t eob; + TX_TYPE tx_type; + uint16_t entropy_context; + uint8_t txb_entropy_ctx; + uint8_t valid; + uint8_t fast; // This is not being used now. +} TXB_RD_INFO; + +#define TX_SIZE_RD_RECORD_BUFFER_LEN 256 +typedef struct { + uint32_t hash_vals[TX_SIZE_RD_RECORD_BUFFER_LEN]; + TXB_RD_INFO tx_rd_info[TX_SIZE_RD_RECORD_BUFFER_LEN]; + int index_start; + int num; +} TXB_RD_RECORD; + +typedef struct tx_size_rd_info_node { + TXB_RD_INFO *rd_info_array; // Points to array of size TX_TYPES. + struct tx_size_rd_info_node *children[4]; +} TXB_RD_INFO_NODE; + +// Region size for mode decision sampling in the first pass of partition +// search(two_pass_partition_search speed feature), in units of mi size(4). +// Used by the mode_pruning_based_on_two_pass_partition_search speed feature. +#define FIRST_PARTITION_PASS_SAMPLE_REGION 8 +#define FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2 3 +#define FIRST_PARTITION_PASS_STATS_TABLES \ + (MAX_MIB_SIZE >> FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2) * \ + (MAX_MIB_SIZE >> FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2) +#define FIRST_PARTITION_PASS_STATS_STRIDE \ + (MAX_MIB_SIZE_LOG2 - FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2) + +static INLINE int av1_first_partition_pass_stats_index(int mi_row, int mi_col) { + const int row = + (mi_row & MAX_MIB_MASK) >> FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2; + const int col = + (mi_col & MAX_MIB_MASK) >> FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2; + return (row << FIRST_PARTITION_PASS_STATS_STRIDE) + col; +} + +typedef struct { + uint8_t ref0_counts[REF_FRAMES]; // Counters for ref_frame[0]. + uint8_t ref1_counts[REF_FRAMES]; // Counters for ref_frame[1]. + int sample_counts; // Number of samples collected. +} FIRST_PARTITION_PASS_STATS; + +#define MAX_INTERP_FILTER_STATS 64 +typedef struct { + InterpFilters filters; + int_mv mv[2]; + int8_t ref_frames[2]; +} INTERPOLATION_FILTER_STATS; typedef struct macroblock MACROBLOCK; struct macroblock { struct macroblock_plane plane[MAX_MB_PLANE]; - // Save the transform RD search info. - TX_RD_RECORD tx_rd_record; + // Determine if one would go with reduced complexity transform block + // search model to select prediction modes, or full complexity model + // to select transform kernel. + int rd_model; + + // Indicate if the encoder is running in the first pass partition search. + // In that case, apply certain speed features therein to reduce the overhead + // cost in the first pass search. + int cb_partition_scan; + + FIRST_PARTITION_PASS_STATS + first_partition_pass_stats[FIRST_PARTITION_PASS_STATS_TABLES]; + + // [comp_idx][saved stat_idx] + INTERPOLATION_FILTER_STATS interp_filter_stats[2][MAX_INTERP_FILTER_STATS]; + int interp_filter_stats_idx[2]; + + // Activate constrained coding block partition search range. + int use_cb_search_range; + + // Inter macroblock RD search info. + MB_RD_RECORD mb_rd_record; + + // Inter transform block RD search info. 
for square TX sizes. + TXB_RD_RECORD txb_rd_record_8X8[(MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1)]; + TXB_RD_RECORD txb_rd_record_16X16[(MAX_MIB_SIZE >> 2) * (MAX_MIB_SIZE >> 2)]; + TXB_RD_RECORD txb_rd_record_32X32[(MAX_MIB_SIZE >> 3) * (MAX_MIB_SIZE >> 3)]; + TXB_RD_RECORD txb_rd_record_64X64[(MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4)]; + + // Intra transform block RD search info. for square TX sizes. + TXB_RD_RECORD txb_rd_record_intra; MACROBLOCKD e_mbd; MB_MODE_INFO_EXT *mbmi_ext; @@ -173,34 +227,29 @@ struct macroblock { int *m_search_count_ptr; int *ex_search_count_ptr; -#if CONFIG_VAR_TX unsigned int txb_split_count; -#endif // These are set to their default values at the beginning, and then adjusted // further in the encoding process. BLOCK_SIZE min_partition_size; BLOCK_SIZE max_partition_size; - int mv_best_ref_index[TOTAL_REFS_PER_FRAME]; - unsigned int max_mv_context[TOTAL_REFS_PER_FRAME]; + unsigned int max_mv_context[REF_FRAMES]; unsigned int source_variance; - unsigned int pred_sse[TOTAL_REFS_PER_FRAME]; - int pred_mv_sad[TOTAL_REFS_PER_FRAME]; + unsigned int pred_sse[REF_FRAMES]; + int pred_mv_sad[REF_FRAMES]; int *nmvjointcost; - int nmv_vec_cost[NMV_CONTEXTS][MV_JOINTS]; - int *nmvcost[NMV_CONTEXTS][2]; - int *nmvcost_hp[NMV_CONTEXTS][2]; - int **mv_cost_stack[NMV_CONTEXTS]; + int nmv_vec_cost[MV_JOINTS]; + int *nmvcost[2]; + int *nmvcost_hp[2]; + int **mv_cost_stack; int **mvcost; -#if CONFIG_MOTION_VAR int32_t *wsrc_buf; int32_t *mask_buf; uint8_t *above_pred_buf; uint8_t *left_pred_buf; -#endif // CONFIG_MOTION_VAR PALETTE_BUFFER *palette_buffer; @@ -208,108 +257,80 @@ struct macroblock { // from extending outside the UMV borders MvLimits mv_limits; -#if CONFIG_VAR_TX - uint8_t blk_skip[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 8]; - uint8_t blk_skip_drl[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 8]; -#endif + uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t blk_skip_drl[MAX_MIB_SIZE * MAX_MIB_SIZE]; int skip; - -#if CONFIG_CB4X4 int skip_chroma_rd; -#endif + int skip_cost[SKIP_CONTEXTS][2]; + + int skip_mode; // 0: off; 1: on + int skip_mode_cost[SKIP_CONTEXTS][2]; + + int compound_idx; -#if CONFIG_LV_MAP LV_MAP_COEFF_COST coeff_costs[TX_SIZES][PLANE_TYPES]; + LV_MAP_EOB_COST eob_costs[7][2]; uint16_t cb_offset; -#endif - - av1_coeff_cost token_head_costs[TX_SIZES]; - av1_coeff_cost token_tail_costs[TX_SIZES]; // mode costs + int intra_inter_cost[INTRA_INTER_CONTEXTS][2]; + int mbmode_cost[BLOCK_SIZE_GROUPS][INTRA_MODES]; int newmv_mode_cost[NEWMV_MODE_CONTEXTS][2]; - int zeromv_mode_cost[ZEROMV_MODE_CONTEXTS][2]; + int zeromv_mode_cost[GLOBALMV_MODE_CONTEXTS][2]; int refmv_mode_cost[REFMV_MODE_CONTEXTS][2]; int drl_mode_cost0[DRL_MODE_CONTEXTS][2]; + int comp_inter_cost[COMP_INTER_CONTEXTS][2]; + int single_ref_cost[REF_CONTEXTS][SINGLE_REFS - 1][2]; + int comp_ref_type_cost[COMP_REF_TYPE_CONTEXTS] + [CDF_SIZE(COMP_REFERENCE_TYPES)]; + int uni_comp_ref_cost[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1] + [CDF_SIZE(2)]; + // Cost for signaling ref_frame[0] (LAST_FRAME, LAST2_FRAME, LAST3_FRAME or + // GOLDEN_FRAME) in bidir-comp mode. + int comp_ref_cost[REF_CONTEXTS][FWD_REFS - 1][2]; + // Cost for signaling ref_frame[1] (ALTREF_FRAME, ALTREF2_FRAME, or + // BWDREF_FRAME) in bidir-comp mode. 
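Earlier in this hunk, struct macroblock gains per-TX-size TXB_RD_RECORD ring buffers keyed by a hash of the residual block: on a hit the encoder can reuse a cached transform RD result instead of re-running the search. A sketch of how such a record might be probed (illustrative only; it assumes nothing beyond the struct fields shown above):

// Probe a TXB_RD_RECORD-style ring buffer for a residual hash.
// Returns the slot index on a hit, or -1 so the caller runs the full search.
static int txb_record_find(const TXB_RD_RECORD *rec, uint32_t hash) {
  for (int i = 0; i < rec->num; ++i) {
    const int slot = (rec->index_start + i) % TX_SIZE_RD_RECORD_BUFFER_LEN;
    if (rec->hash_vals[slot] == hash) return slot;  // cached TXB_RD_INFO
  }
  return -1;
}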
+ int comp_bwdref_cost[REF_CONTEXTS][BWD_REFS - 1][2]; int inter_compound_mode_cost[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES]; - int compound_type_cost[BLOCK_SIZES_ALL][COMPOUND_TYPES]; -#if CONFIG_COMPOUND_SINGLEREF - int inter_singleref_comp_mode_cost[INTER_MODE_CONTEXTS] - [INTER_SINGLEREF_COMP_MODES]; -#endif // CONFIG_COMPOUND_SINGLEREF -#if CONFIG_INTERINTRA + int compound_type_cost[BLOCK_SIZES_ALL][COMPOUND_TYPES - 1]; + int wedge_idx_cost[BLOCK_SIZES_ALL][16]; + int interintra_cost[BLOCK_SIZE_GROUPS][2]; + int wedge_interintra_cost[BLOCK_SIZES_ALL][2]; int interintra_mode_cost[BLOCK_SIZE_GROUPS][INTERINTRA_MODES]; -#endif // CONFIG_INTERINTRA -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION int motion_mode_cost[BLOCK_SIZES_ALL][MOTION_MODES]; -#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION int motion_mode_cost1[BLOCK_SIZES_ALL][2]; -#if CONFIG_NCOBMC_ADAPT_WEIGHT - int motion_mode_cost2[BLOCK_SIZES_ALL][OBMC_FAMILY_MODES]; -#endif -#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION -#if CONFIG_MOTION_VAR && CONFIG_NCOBMC_ADAPT_WEIGHT - int ncobmc_mode_cost[ADAPT_OVERLAP_BLOCKS][MAX_NCOBMC_MODES]; -#endif // CONFIG_MOTION_VAR && CONFIG_NCOBMC_ADAPT_WEIGHT -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - int intra_uv_mode_cost[INTRA_MODES][UV_INTRA_MODES]; + int intra_uv_mode_cost[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES]; int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES]; + int filter_intra_cost[BLOCK_SIZES_ALL][2]; + int filter_intra_mode_cost[FILTER_INTRA_MODES]; int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; -#if CONFIG_EXT_PARTITION_TYPES - int partition_cost[PARTITION_CONTEXTS + CONFIG_UNPOISON_PARTITION_CTX] - [EXT_PARTITION_TYPES]; -#else - int partition_cost[PARTITION_CONTEXTS + CONFIG_UNPOISON_PARTITION_CTX] - [PARTITION_TYPES]; -#endif // CONFIG_EXT_PARTITION_TYPES -#if CONFIG_MRC_TX - int mrc_mask_inter_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS] - [PALETTE_COLORS]; - int mrc_mask_intra_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS] - [PALETTE_COLORS]; -#endif // CONFIG_MRC_TX - int palette_y_size_cost[PALETTE_BLOCK_SIZES][PALETTE_SIZES]; - int palette_uv_size_cost[PALETTE_BLOCK_SIZES][PALETTE_SIZES]; + int partition_cost[PARTITION_CONTEXTS][EXT_PARTITION_TYPES]; + int palette_y_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES]; + int palette_uv_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES]; int palette_y_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS] [PALETTE_COLORS]; int palette_uv_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS] [PALETTE_COLORS]; -#if CONFIG_CFL + int palette_y_mode_cost[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2]; + int palette_uv_mode_cost[PALETTE_UV_MODE_CONTEXTS][2]; // The rate associated with each alpha codeword int cfl_cost[CFL_JOINT_SIGNS][CFL_PRED_PLANES][CFL_ALPHABET_SIZE]; -#endif // CONFIG_CFL int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES]; -#if CONFIG_EXT_TX -#if CONFIG_LGT_FROM_PRED - int intra_lgt_cost[LGT_SIZES][INTRA_MODES][2]; - int inter_lgt_cost[LGT_SIZES][2]; -#endif + int txfm_partition_cost[TXFM_PARTITION_CONTEXTS][2]; int inter_tx_type_costs[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES]; int intra_tx_type_costs[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES] [TX_TYPES]; -#else - int intra_tx_type_costs[EXT_TX_SIZES][TX_TYPES][TX_TYPES]; - int inter_tx_type_costs[EXT_TX_SIZES][TX_TYPES]; -#endif // CONFIG_EXT_TX -#if CONFIG_EXT_INTRA -#if CONFIG_INTRA_INTERP - int intra_filter_cost[INTRA_FILTERS + 1][INTRA_FILTERS]; -#endif // CONFIG_INTRA_INTERP -#endif 
// CONFIG_EXT_INTRA -#if CONFIG_LOOP_RESTORATION + int angle_delta_cost[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1]; int switchable_restore_cost[RESTORE_SWITCHABLE_TYPES]; -#endif // CONFIG_LOOP_RESTORATION -#if CONFIG_INTRABC + int wiener_restore_cost[2]; + int sgrproj_restore_cost[2]; int intrabc_cost[2]; -#endif // CONFIG_INTRABC - - int optimize; // Used to store sub partition's choices. - MV pred_mv[TOTAL_REFS_PER_FRAME]; + MV pred_mv[REF_FRAMES]; // Store the best motion vector during motion search int_mv best_mv; @@ -320,38 +341,65 @@ struct macroblock { int use_default_intra_tx_type; // use default transform and skip transform type search for inter modes int use_default_inter_tx_type; -#if CONFIG_PVQ - int rate; - // 1 if neither AC nor DC is coded. Only used during RDO. - int pvq_skip[MAX_MB_PLANE]; - PVQ_QUEUE *pvq_q; - - // Storage for PVQ tx block encodings in a superblock. - // There can be max 16x16 of 4x4 blocks (and YUV) encode by PVQ - // 256 is the max # of 4x4 blocks in a SB (64x64), which comes from: - // 1) Since PVQ is applied to each trasnform-ed block - // 2) 4x4 is the smallest tx size in AV1 - // 3) AV1 allows using smaller tx size than block (i.e. partition) size - // TODO(yushin) : The memory usage could be improved a lot, since this has - // storage for 10 bands and 128 coefficients for every 4x4 block, - PVQ_INFO pvq[MAX_PVQ_BLOCKS_IN_SB][MAX_MB_PLANE]; - daala_enc_ctx daala_enc; - int pvq_speed; - int pvq_coded; // Indicates whether pvq_info needs be stored to tokenize -#endif #if CONFIG_DIST_8X8 int using_dist_8x8; aom_tune_metric tune_metric; -#if CONFIG_CB4X4 -#if CONFIG_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, decoded_8x8[8 * 8]); -#else - DECLARE_ALIGNED(16, uint8_t, decoded_8x8[8 * 8]); -#endif -#endif // CONFIG_CB4X4 + DECLARE_ALIGNED(16, int16_t, pred_luma[MAX_SB_SQUARE]); #endif // CONFIG_DIST_8X8 + int comp_idx_cost[COMP_INDEX_CONTEXTS][2]; + int comp_group_idx_cost[COMP_GROUP_IDX_CONTEXTS][2]; + // Bit flags for pruning tx type search, tx split, etc. + int tx_search_prune[EXT_TX_SET_TYPES]; + int must_find_valid_partition; + int tx_split_prune_flag; // Flag to skip tx split RD search. 
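The tx_search_prune[] member above carries "bit flags for pruning tx type search" per transform-set type. One plausible reading (an assumption; this diff does not show the consumer) is one bit per transform type within the set:

// Hypothetical consumer of the prune mask declared above: bit i of
// tx_search_prune[set_type] is assumed to mark tx type i as skippable.
static int tx_type_is_pruned(const MACROBLOCK *x, int set_type, int tx_type) {
  return (x->tx_search_prune[set_type] >> tx_type) & 1;
}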
}; +static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) { + static const char LUT[BLOCK_SIZES_ALL] = { + 0, // BLOCK_4X4 + 1, // BLOCK_4X8 + 1, // BLOCK_8X4 + 0, // BLOCK_8X8 + 1, // BLOCK_8X16 + 1, // BLOCK_16X8 + 0, // BLOCK_16X16 + 1, // BLOCK_16X32 + 1, // BLOCK_32X16 + 0, // BLOCK_32X32 + 1, // BLOCK_32X64 + 1, // BLOCK_64X32 + 0, // BLOCK_64X64 + 0, // BLOCK_64X128 + 0, // BLOCK_128X64 + 0, // BLOCK_128X128 + 1, // BLOCK_4X16 + 1, // BLOCK_16X4 + 1, // BLOCK_8X32 + 1, // BLOCK_32X8 + 1, // BLOCK_16X64 + 1, // BLOCK_64X16 + }; + + return LUT[bsize]; +} + +static INLINE int is_rect_tx_allowed(const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi) { + return is_rect_tx_allowed_bsize(mbmi->sb_type) && + !xd->lossless[mbmi->segment_id]; +} + +static INLINE int tx_size_to_depth(TX_SIZE tx_size, BLOCK_SIZE bsize) { + TX_SIZE ctx_size = max_txsize_rect_lookup[bsize]; + int depth = 0; + while (tx_size != ctx_size) { + depth++; + ctx_size = sub_tx_size_map[ctx_size]; + assert(depth <= MAX_TX_DEPTH); + } + return depth; +} + #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/encoder/blockiness.c b/third_party/aom/av1/encoder/blockiness.c index 113ceb29d..66dedd9ed 100644 --- a/third_party/aom/av1/encoder/blockiness.c +++ b/third_party/aom/av1/encoder/blockiness.c @@ -9,9 +9,10 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "./av1_rtcd.h" -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + #include "av1/common/common.h" #include "av1/common/filter.h" #include "aom/aom_integer.h" diff --git a/third_party/aom/av1/encoder/context_tree.c b/third_party/aom/av1/encoder/context_tree.c index 4bbf0e5fb..d6e556b93 100644 --- a/third_party/aom/av1/encoder/context_tree.c +++ b/third_party/aom/av1/encoder/context_tree.c @@ -13,32 +13,18 @@ #include "av1/encoder/encoder.h" static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 1] = { -#if CONFIG_CB4X4 - BLOCK_4X4, -#endif - BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, -#if CONFIG_EXT_PARTITION - BLOCK_128X128, -#endif // CONFIG_EXT_PARTITION + BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, BLOCK_128X128, }; static void alloc_mode_context(AV1_COMMON *cm, int num_pix, -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_TYPE partition, -#endif PICK_MODE_CONTEXT *ctx) { + const int num_planes = av1_num_planes(cm); int i; const int num_blk = num_pix / 16; ctx->num_4x4_blk = num_blk; -#if CONFIG_EXT_PARTITION_TYPES - ctx->partition = partition; -#endif - - for (i = 0; i < MAX_MB_PLANE; ++i) { -#if CONFIG_VAR_TX - CHECK_MEM_ERROR(cm, ctx->blk_skip[i], aom_calloc(num_blk, sizeof(uint8_t))); -#endif + CHECK_MEM_ERROR(cm, ctx->blk_skip, aom_calloc(num_blk, sizeof(uint8_t))); + for (i = 0; i < num_planes; ++i) { CHECK_MEM_ERROR(cm, ctx->coeff[i], aom_memalign(32, num_pix * sizeof(*ctx->coeff[i]))); CHECK_MEM_ERROR(cm, ctx->qcoeff[i], @@ -47,148 +33,94 @@ static void alloc_mode_context(AV1_COMMON *cm, int num_pix, aom_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i]))); CHECK_MEM_ERROR(cm, ctx->eobs[i], aom_memalign(32, num_blk * sizeof(*ctx->eobs[i]))); -#if CONFIG_LV_MAP CHECK_MEM_ERROR( cm, ctx->txb_entropy_ctx[i], aom_memalign(32, num_blk * sizeof(*ctx->txb_entropy_ctx[i]))); -#endif - -#if CONFIG_PVQ - CHECK_MEM_ERROR(cm, ctx->pvq_ref_coeff[i], - aom_memalign(32, num_pix * sizeof(*ctx->pvq_ref_coeff[i]))); -#endif } - for (i = 0; i < 2; ++i) { - CHECK_MEM_ERROR( - cm, ctx->color_index_map[i], - 
aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i]))); + if (num_pix <= MAX_PALETTE_SQUARE) { + for (i = 0; i < 2; ++i) { + CHECK_MEM_ERROR( + cm, ctx->color_index_map[i], + aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i]))); + } } -#if CONFIG_MRC_TX - CHECK_MEM_ERROR(cm, ctx->mrc_mask, - aom_memalign(32, num_pix * sizeof(*ctx->mrc_mask))); -#endif // CONFIG_MRC_TX } -static void free_mode_context(PICK_MODE_CONTEXT *ctx) { +static void free_mode_context(PICK_MODE_CONTEXT *ctx, const int num_planes) { int i; - for (i = 0; i < MAX_MB_PLANE; ++i) { -#if CONFIG_VAR_TX - aom_free(ctx->blk_skip[i]); - ctx->blk_skip[i] = 0; -#endif + aom_free(ctx->blk_skip); + ctx->blk_skip = 0; + for (i = 0; i < num_planes; ++i) { aom_free(ctx->coeff[i]); ctx->coeff[i] = 0; aom_free(ctx->qcoeff[i]); ctx->qcoeff[i] = 0; aom_free(ctx->dqcoeff[i]); ctx->dqcoeff[i] = 0; -#if CONFIG_PVQ - aom_free(ctx->pvq_ref_coeff[i]); - ctx->pvq_ref_coeff[i] = 0; -#endif aom_free(ctx->eobs[i]); ctx->eobs[i] = 0; -#if CONFIG_LV_MAP aom_free(ctx->txb_entropy_ctx[i]); ctx->txb_entropy_ctx[i] = 0; -#endif } for (i = 0; i < 2; ++i) { aom_free(ctx->color_index_map[i]); ctx->color_index_map[i] = 0; } -#if CONFIG_MRC_TX - aom_free(ctx->mrc_mask); - ctx->mrc_mask = 0; -#endif // CONFIG_MRC_TX } -static void alloc_tree_contexts(AV1_COMMON *cm, PC_TREE *tree, int num_pix) { -#if CONFIG_EXT_PARTITION_TYPES - alloc_mode_context(cm, num_pix, PARTITION_NONE, &tree->none); - alloc_mode_context(cm, num_pix / 2, PARTITION_HORZ, &tree->horizontal[0]); - alloc_mode_context(cm, num_pix / 2, PARTITION_VERT, &tree->vertical[0]); - alloc_mode_context(cm, num_pix / 2, PARTITION_VERT, &tree->horizontal[1]); - alloc_mode_context(cm, num_pix / 2, PARTITION_VERT, &tree->vertical[1]); - - alloc_mode_context(cm, num_pix / 4, PARTITION_HORZ_A, &tree->horizontala[0]); - alloc_mode_context(cm, num_pix / 4, PARTITION_HORZ_A, &tree->horizontala[1]); - alloc_mode_context(cm, num_pix / 2, PARTITION_HORZ_A, &tree->horizontala[2]); - alloc_mode_context(cm, num_pix / 2, PARTITION_HORZ_B, &tree->horizontalb[0]); - alloc_mode_context(cm, num_pix / 4, PARTITION_HORZ_B, &tree->horizontalb[1]); - alloc_mode_context(cm, num_pix / 4, PARTITION_HORZ_B, &tree->horizontalb[2]); - alloc_mode_context(cm, num_pix / 4, PARTITION_VERT_A, &tree->verticala[0]); - alloc_mode_context(cm, num_pix / 4, PARTITION_VERT_A, &tree->verticala[1]); - alloc_mode_context(cm, num_pix / 2, PARTITION_VERT_A, &tree->verticala[2]); - alloc_mode_context(cm, num_pix / 2, PARTITION_VERT_B, &tree->verticalb[0]); - alloc_mode_context(cm, num_pix / 4, PARTITION_VERT_B, &tree->verticalb[1]); - alloc_mode_context(cm, num_pix / 4, PARTITION_VERT_B, &tree->verticalb[2]); - for (int i = 0; i < 4; ++i) { - alloc_mode_context(cm, num_pix / 4, PARTITION_HORZ_4, - &tree->horizontal4[i]); - alloc_mode_context(cm, num_pix / 4, PARTITION_HORZ_4, &tree->vertical4[i]); - } -#if CONFIG_SUPERTX - alloc_mode_context(cm, num_pix, PARTITION_HORZ, &tree->horizontal_supertx); - alloc_mode_context(cm, num_pix, PARTITION_VERT, &tree->vertical_supertx); - alloc_mode_context(cm, num_pix, PARTITION_SPLIT, &tree->split_supertx); - alloc_mode_context(cm, num_pix, PARTITION_HORZ_A, &tree->horizontala_supertx); - alloc_mode_context(cm, num_pix, PARTITION_HORZ_B, &tree->horizontalb_supertx); - alloc_mode_context(cm, num_pix, PARTITION_VERT_A, &tree->verticala_supertx); - alloc_mode_context(cm, num_pix, PARTITION_VERT_B, &tree->verticalb_supertx); -#endif // CONFIG_SUPERTX -#else +static void alloc_tree_contexts(AV1_COMMON 
*cm, PC_TREE *tree, int num_pix, + int is_leaf) { alloc_mode_context(cm, num_pix, &tree->none); + + if (is_leaf) return; + alloc_mode_context(cm, num_pix / 2, &tree->horizontal[0]); alloc_mode_context(cm, num_pix / 2, &tree->vertical[0]); -#if CONFIG_SUPERTX - alloc_mode_context(cm, num_pix, &tree->horizontal_supertx); - alloc_mode_context(cm, num_pix, &tree->vertical_supertx); - alloc_mode_context(cm, num_pix, &tree->split_supertx); -#endif - if (num_pix > 16) { - alloc_mode_context(cm, num_pix / 2, &tree->horizontal[1]); - alloc_mode_context(cm, num_pix / 2, &tree->vertical[1]); - } else { - memset(&tree->horizontal[1], 0, sizeof(tree->horizontal[1])); - memset(&tree->vertical[1], 0, sizeof(tree->vertical[1])); + alloc_mode_context(cm, num_pix / 2, &tree->horizontal[1]); + alloc_mode_context(cm, num_pix / 2, &tree->vertical[1]); + + alloc_mode_context(cm, num_pix / 4, &tree->horizontala[0]); + alloc_mode_context(cm, num_pix / 4, &tree->horizontala[1]); + alloc_mode_context(cm, num_pix / 2, &tree->horizontala[2]); + + alloc_mode_context(cm, num_pix / 2, &tree->horizontalb[0]); + alloc_mode_context(cm, num_pix / 4, &tree->horizontalb[1]); + alloc_mode_context(cm, num_pix / 4, &tree->horizontalb[2]); + + alloc_mode_context(cm, num_pix / 4, &tree->verticala[0]); + alloc_mode_context(cm, num_pix / 4, &tree->verticala[1]); + alloc_mode_context(cm, num_pix / 2, &tree->verticala[2]); + + alloc_mode_context(cm, num_pix / 2, &tree->verticalb[0]); + alloc_mode_context(cm, num_pix / 4, &tree->verticalb[1]); + alloc_mode_context(cm, num_pix / 4, &tree->verticalb[2]); + + for (int i = 0; i < 4; ++i) { + alloc_mode_context(cm, num_pix / 4, &tree->horizontal4[i]); + alloc_mode_context(cm, num_pix / 4, &tree->vertical4[i]); } -#endif // CONFIG_EXT_PARTITION_TYPES } -static void free_tree_contexts(PC_TREE *tree) { -#if CONFIG_EXT_PARTITION_TYPES +static void free_tree_contexts(PC_TREE *tree, const int num_planes) { int i; for (i = 0; i < 3; i++) { - free_mode_context(&tree->horizontala[i]); - free_mode_context(&tree->horizontalb[i]); - free_mode_context(&tree->verticala[i]); - free_mode_context(&tree->verticalb[i]); + free_mode_context(&tree->horizontala[i], num_planes); + free_mode_context(&tree->horizontalb[i], num_planes); + free_mode_context(&tree->verticala[i], num_planes); + free_mode_context(&tree->verticalb[i], num_planes); } for (i = 0; i < 4; ++i) { - free_mode_context(&tree->horizontal4[i]); - free_mode_context(&tree->vertical4[i]); + free_mode_context(&tree->horizontal4[i], num_planes); + free_mode_context(&tree->vertical4[i], num_planes); } -#endif // CONFIG_EXT_PARTITION_TYPES - free_mode_context(&tree->none); - free_mode_context(&tree->horizontal[0]); - free_mode_context(&tree->horizontal[1]); - free_mode_context(&tree->vertical[0]); - free_mode_context(&tree->vertical[1]); -#if CONFIG_SUPERTX - free_mode_context(&tree->horizontal_supertx); - free_mode_context(&tree->vertical_supertx); - free_mode_context(&tree->split_supertx); -#if CONFIG_EXT_PARTITION_TYPES - free_mode_context(&tree->horizontala_supertx); - free_mode_context(&tree->horizontalb_supertx); - free_mode_context(&tree->verticala_supertx); - free_mode_context(&tree->verticalb_supertx); -#endif // CONFIG_EXT_PARTITION_TYPES -#endif // CONFIG_SUPERTX + free_mode_context(&tree->none, num_planes); + free_mode_context(&tree->horizontal[0], num_planes); + free_mode_context(&tree->horizontal[1], num_planes); + free_mode_context(&tree->vertical[0], num_planes); + free_mode_context(&tree->vertical[1], num_planes); } // This function sets 
up a tree of contexts such that at each square @@ -197,65 +129,25 @@ static void free_tree_contexts(PC_TREE *tree) { // represents the state of our search. void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) { int i, j; -#if CONFIG_CB4X4 -#if CONFIG_EXT_PARTITION const int tree_nodes_inc = 1024; -#else - const int tree_nodes_inc = 256; -#endif // CONFIG_EXT_PARTITION const int leaf_factor = 4; -#else - const int tree_nodes_inc = 0; - const int leaf_factor = 1; -#endif -#if CONFIG_EXT_PARTITION const int leaf_nodes = 256 * leaf_factor; const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1; -#else - const int leaf_nodes = 64 * leaf_factor; - const int tree_nodes = tree_nodes_inc + 64 + 16 + 4 + 1; -#endif // CONFIG_EXT_PARTITION int pc_tree_index = 0; PC_TREE *this_pc; int square_index = 1; int nodes; -#if !CONFIG_CB4X4 - aom_free(td->leaf_tree); - CHECK_MEM_ERROR(cm, td->leaf_tree, - aom_calloc(leaf_nodes, sizeof(*td->leaf_tree))); - PICK_MODE_CONTEXT *this_leaf = &td->leaf_tree[0]; -#endif aom_free(td->pc_tree); CHECK_MEM_ERROR(cm, td->pc_tree, aom_calloc(tree_nodes, sizeof(*td->pc_tree))); this_pc = &td->pc_tree[0]; -#if !CONFIG_CB4X4 - // 4x4 blocks smaller than 8x8 but in the same 8x8 block share the same - // context so we only need to allocate 1 for each 8x8 block. - for (i = 0; i < leaf_nodes; ++i) { -#if CONFIG_EXT_PARTITION_TYPES - alloc_mode_context(cm, 4, PARTITION_NONE, &td->leaf_tree[i]); -#else - alloc_mode_context(cm, 16, &td->leaf_tree[i]); -#endif - } -#endif - // Sets up all the leaf nodes in the tree. for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) { PC_TREE *const tree = &td->pc_tree[pc_tree_index]; tree->block_size = square[0]; -#if CONFIG_CB4X4 - alloc_tree_contexts(cm, tree, 16); -#else - alloc_tree_contexts(cm, tree, 4); -#endif -#if !CONFIG_CB4X4 - tree->leaf_split[0] = this_leaf++; - for (j = 1; j < 4; j++) tree->leaf_split[j] = tree->leaf_split[0]; -#endif + alloc_tree_contexts(cm, tree, 16, 1); } // Each node has 4 leaf nodes, fill each block_size level of the tree @@ -263,11 +155,7 @@ void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) { for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) { for (i = 0; i < nodes; ++i) { PC_TREE *const tree = &td->pc_tree[pc_tree_index]; -#if CONFIG_CB4X4 - alloc_tree_contexts(cm, tree, 16 << (2 * square_index)); -#else - alloc_tree_contexts(cm, tree, 4 << (2 * square_index)); -#endif + alloc_tree_contexts(cm, tree, 16 << (2 * square_index), 0); tree->block_size = square[square_index]; for (j = 0; j < 4; j++) tree->split[j] = this_pc++; ++pc_tree_index; @@ -286,35 +174,41 @@ void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) { } } -void av1_free_pc_tree(ThreadData *td) { -#if CONFIG_CB4X4 -#if CONFIG_EXT_PARTITION +void av1_free_pc_tree(ThreadData *td, const int num_planes) { const int tree_nodes_inc = 1024; -#else - const int tree_nodes_inc = 256; -#endif // CONFIG_EXT_PARTITION -#else - const int tree_nodes_inc = 0; -#endif -#if CONFIG_EXT_PARTITION const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1; -#else - const int tree_nodes = tree_nodes_inc + 64 + 16 + 4 + 1; -#endif // CONFIG_EXT_PARTITION int i; - for (i = 0; i < tree_nodes; ++i) free_tree_contexts(&td->pc_tree[i]); + for (i = 0; i < tree_nodes; ++i) + free_tree_contexts(&td->pc_tree[i], num_planes); aom_free(td->pc_tree); td->pc_tree = NULL; -#if !CONFIG_CB4X4 - const int leaf_factor = 1; -#if CONFIG_EXT_PARTITION - const int leaf_nodes = 256 * leaf_factor; -#else - const int leaf_nodes = 64 * leaf_factor; -#endif // 
CONFIG_EXT_PARTITION - for (i = 0; i < leaf_nodes; ++i) free_mode_context(&td->leaf_tree[i]); - aom_free(td->leaf_tree); - td->leaf_tree = NULL; -#endif +} + +void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx, + PICK_MODE_CONTEXT *src_ctx) { + dst_ctx->mic = src_ctx->mic; + dst_ctx->mbmi_ext = src_ctx->mbmi_ext; + + dst_ctx->num_4x4_blk = src_ctx->num_4x4_blk; + dst_ctx->skip = src_ctx->skip; + dst_ctx->skippable = src_ctx->skippable; + dst_ctx->best_mode_index = src_ctx->best_mode_index; + + memcpy(dst_ctx->blk_skip, src_ctx->blk_skip, + sizeof(uint8_t) * src_ctx->num_4x4_blk); + + dst_ctx->hybrid_pred_diff = src_ctx->hybrid_pred_diff; + dst_ctx->comp_pred_diff = src_ctx->comp_pred_diff; + dst_ctx->single_pred_diff = src_ctx->single_pred_diff; + + dst_ctx->rate = src_ctx->rate; + dst_ctx->dist = src_ctx->dist; + dst_ctx->rdcost = src_ctx->rdcost; + dst_ctx->rd_mode_is_ready = src_ctx->rd_mode_is_ready; + + memcpy(dst_ctx->pred_mv, src_ctx->pred_mv, sizeof(MV) * REF_FRAMES); + dst_ctx->pred_interp_filter = src_ctx->pred_interp_filter; + + dst_ctx->partition = src_ctx->partition; } diff --git a/third_party/aom/av1/encoder/context_tree.h b/third_party/aom/av1/encoder/context_tree.h index 38052ba27..c05f48a7a 100644 --- a/third_party/aom/av1/encoder/context_tree.h +++ b/third_party/aom/av1/encoder/context_tree.h @@ -23,28 +23,29 @@ struct AV1_COMP; struct AV1Common; struct ThreadData; +typedef enum { + // Search all the partition types in this plane. + SEARCH_FULL_PLANE = 0, + // Only search none_partition coding block. + NONE_PARTITION_PLANE = 1, + // Search all the partition types in this plane except split. + SEARCH_SAME_PLANE = 2, + // Skip search partition on this plane. Go split directly. + SPLIT_PLANE = 3, +} CB_TREE_SEARCH; + // Structure to hold snapshot of coding context during the mode picking process typedef struct { - MODE_INFO mic; + MB_MODE_INFO mic; MB_MODE_INFO_EXT mbmi_ext; uint8_t *color_index_map[2]; -#if CONFIG_MRC_TX - uint8_t *mrc_mask; -#endif // CONFIG_MRC_TX -#if CONFIG_VAR_TX - uint8_t *blk_skip[MAX_MB_PLANE]; -#endif + uint8_t *blk_skip; tran_low_t *coeff[MAX_MB_PLANE]; tran_low_t *qcoeff[MAX_MB_PLANE]; tran_low_t *dqcoeff[MAX_MB_PLANE]; -#if CONFIG_PVQ - tran_low_t *pvq_ref_coeff[MAX_MB_PLANE]; -#endif uint16_t *eobs[MAX_MB_PLANE]; -#if CONFIG_LV_MAP uint8_t *txb_entropy_ctx[MAX_MB_PLANE]; -#endif int num_4x4_blk; int skip; @@ -60,16 +61,27 @@ typedef struct { // scope of refactoring. int rate; int64_t dist; + int64_t rdcost; + int rd_mode_is_ready; // Flag to indicate whether rd pick mode decision has + // been made. // motion vector cache for adaptive motion search control in partition // search loop - MV pred_mv[TOTAL_REFS_PER_FRAME]; + MV pred_mv[REF_FRAMES]; InterpFilter pred_interp_filter; -#if CONFIG_EXT_PARTITION_TYPES PARTITION_TYPE partition; -#endif } PICK_MODE_CONTEXT; +typedef struct { + int valid; + int split; + int skip; + int64_t rdcost; + int sub_block_split[4]; + int sub_block_skip[4]; + int64_t sub_block_rdcost[4]; +} PC_TREE_STATS; + typedef struct PC_TREE { int index; PARTITION_TYPE partitioning; @@ -77,34 +89,21 @@ typedef struct PC_TREE { PICK_MODE_CONTEXT none; PICK_MODE_CONTEXT horizontal[2]; PICK_MODE_CONTEXT vertical[2]; -#if CONFIG_EXT_PARTITION_TYPES PICK_MODE_CONTEXT horizontala[3]; PICK_MODE_CONTEXT horizontalb[3]; PICK_MODE_CONTEXT verticala[3]; PICK_MODE_CONTEXT verticalb[3]; PICK_MODE_CONTEXT horizontal4[4]; PICK_MODE_CONTEXT vertical4[4]; -#endif - // TODO(jingning): remove leaf_split[] when cb4x4 experiment flag is removed. 
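With the config forks gone, av1_setup_pc_tree() above always builds the 128x128 quadtree (1024 leaves plus 256 + 64 + 16 + 4 + 1 interior nodes), and the new av1_copy_tree_context() copies a winning PICK_MODE_CONTEXT, including its per-block skip flags, into another node. For orientation, a sketch of visiting every context reachable from a PC_TREE root (a hypothetical helper; visit() is a placeholder callback):

// Visit every PICK_MODE_CONTEXT in a PC_TREE quadtree.
static void pc_tree_walk(PC_TREE *pc, void (*visit)(PICK_MODE_CONTEXT *)) {
  if (pc == NULL) return;  // leaves keep split[] zeroed by aom_calloc()
  visit(&pc->none);
  for (int i = 0; i < 2; ++i) {
    visit(&pc->horizontal[i]);
    visit(&pc->vertical[i]);
  }
  for (int i = 0; i < 3; ++i) {
    visit(&pc->horizontala[i]);
    visit(&pc->horizontalb[i]);
    visit(&pc->verticala[i]);
    visit(&pc->verticalb[i]);
  }
  for (int i = 0; i < 4; ++i) {
    visit(&pc->horizontal4[i]);
    visit(&pc->vertical4[i]);
    pc_tree_walk(pc->split[i], visit);
  }
}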
- union { - struct PC_TREE *split[4]; - PICK_MODE_CONTEXT *leaf_split[4]; - }; -#if CONFIG_SUPERTX - PICK_MODE_CONTEXT horizontal_supertx; - PICK_MODE_CONTEXT vertical_supertx; - PICK_MODE_CONTEXT split_supertx; -#if CONFIG_EXT_PARTITION_TYPES - PICK_MODE_CONTEXT horizontala_supertx; - PICK_MODE_CONTEXT horizontalb_supertx; - PICK_MODE_CONTEXT verticala_supertx; - PICK_MODE_CONTEXT verticalb_supertx; -#endif -#endif + CB_TREE_SEARCH cb_search_range; + struct PC_TREE *split[4]; + PC_TREE_STATS pc_tree_stats; } PC_TREE; void av1_setup_pc_tree(struct AV1Common *cm, struct ThreadData *td); -void av1_free_pc_tree(struct ThreadData *td); +void av1_free_pc_tree(struct ThreadData *td, const int num_planes); +void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx, + PICK_MODE_CONTEXT *src_ctx); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/encoder/corner_match.c b/third_party/aom/av1/encoder/corner_match.c index 3827b65fa..29e934deb 100644 --- a/third_party/aom/av1/encoder/corner_match.c +++ b/third_party/aom/av1/encoder/corner_match.c @@ -13,7 +13,8 @@ #include <memory.h> #include <math.h> -#include "./av1_rtcd.h" +#include "config/av1_rtcd.h" + #include "av1/encoder/corner_match.h" #define SEARCH_SZ 9 diff --git a/third_party/aom/av1/encoder/cost.c b/third_party/aom/av1/encoder/cost.c index e33df53e4..323e2aed5 100644 --- a/third_party/aom/av1/encoder/cost.c +++ b/third_party/aom/av1/encoder/cost.c @@ -13,65 +13,26 @@ #include "av1/encoder/cost.h" #include "av1/common/entropy.h" -/* round(-log2(i/256.) * (1 << AV1_PROB_COST_SHIFT)) - Begins with a bogus entry for simpler addressing. */ -const uint16_t av1_prob_cost[256] = { - 4096, 4096, 3584, 3284, 3072, 2907, 2772, 2659, 2560, 2473, 2395, 2325, 2260, - 2201, 2147, 2096, 2048, 2003, 1961, 1921, 1883, 1847, 1813, 1780, 1748, 1718, - 1689, 1661, 1635, 1609, 1584, 1559, 1536, 1513, 1491, 1470, 1449, 1429, 1409, - 1390, 1371, 1353, 1335, 1318, 1301, 1284, 1268, 1252, 1236, 1221, 1206, 1192, - 1177, 1163, 1149, 1136, 1123, 1110, 1097, 1084, 1072, 1059, 1047, 1036, 1024, - 1013, 1001, 990, 979, 968, 958, 947, 937, 927, 917, 907, 897, 887, - 878, 868, 859, 850, 841, 832, 823, 814, 806, 797, 789, 780, 772, - 764, 756, 748, 740, 732, 724, 717, 709, 702, 694, 687, 680, 673, - 665, 658, 651, 644, 637, 631, 624, 617, 611, 604, 598, 591, 585, - 578, 572, 566, 560, 554, 547, 541, 535, 530, 524, 518, 512, 506, - 501, 495, 489, 484, 478, 473, 467, 462, 456, 451, 446, 441, 435, - 430, 425, 420, 415, 410, 405, 400, 395, 390, 385, 380, 375, 371, - 366, 361, 356, 352, 347, 343, 338, 333, 329, 324, 320, 316, 311, - 307, 302, 298, 294, 289, 285, 281, 277, 273, 268, 264, 260, 256, - 252, 248, 244, 240, 236, 232, 228, 224, 220, 216, 212, 209, 205, - 201, 197, 194, 190, 186, 182, 179, 175, 171, 168, 164, 161, 157, - 153, 150, 146, 143, 139, 136, 132, 129, 125, 122, 119, 115, 112, - 109, 105, 102, 99, 95, 92, 89, 86, 82, 79, 76, 73, 70, - 66, 63, 60, 57, 54, 51, 48, 45, 42, 38, 35, 32, 29, - 26, 23, 20, 18, 15, 12, 9, 6, 3 +// round(-log2(i/256.) * (1 << AV1_PROB_COST_SHIFT)); i = 128~255. 
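The rewritten cost.c drops the probability-tree API entirely and keeps only the upper half of the cost table, indexed by prob - 128. The 128 entries that follow can be reproduced from the formula in the comment; a throwaway generator (standalone sketch, assuming AV1_PROB_COST_SHIFT == 9, so 1 << 9 == 512):

#include <math.h>
#include <stdio.h>

// Regenerate av1_prob_cost[]: round(-log2(i/256) * 512) for i = 128..255.
// The first entry (i = 128, P = 1/2) is exactly 512, i.e. one bit in
// 1/512-bit units; the last (i = 255) rounds to 3.
int main(void) {
  for (int i = 128; i < 256; ++i)
    printf("%d, ", (int)lround(-log2(i / 256.0) * 512));
  return 0;
}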
+const uint16_t av1_prob_cost[128] = { + 512, 506, 501, 495, 489, 484, 478, 473, 467, 462, 456, 451, 446, 441, 435, + 430, 425, 420, 415, 410, 405, 400, 395, 390, 385, 380, 375, 371, 366, 361, + 356, 352, 347, 343, 338, 333, 329, 324, 320, 316, 311, 307, 302, 298, 294, + 289, 285, 281, 277, 273, 268, 264, 260, 256, 252, 248, 244, 240, 236, 232, + 228, 224, 220, 216, 212, 209, 205, 201, 197, 194, 190, 186, 182, 179, 175, + 171, 168, 164, 161, 157, 153, 150, 146, 143, 139, 136, 132, 129, 125, 122, + 119, 115, 112, 109, 105, 102, 99, 95, 92, 89, 86, 82, 79, 76, 73, + 70, 66, 63, 60, 57, 54, 51, 48, 45, 42, 38, 35, 32, 29, 26, + 23, 20, 18, 15, 12, 9, 6, 3, }; -static void cost(int *costs, aom_tree tree, const aom_prob *probs, int i, - int c) { - const aom_prob prob = probs[i / 2]; - int b; - - assert(prob != 0); - for (b = 0; b <= 1; ++b) { - const int cc = c + av1_cost_bit(prob, b); - const aom_tree_index ii = tree[i + b]; - - if (ii <= 0) - costs[-ii] = cc; - else - cost(costs, tree, probs, ii, cc); - } -} - -void av1_cost_tokens(int *costs, const aom_prob *probs, aom_tree tree) { - cost(costs, tree, probs, 0, 0); -} - -void av1_cost_tokens_skip(int *costs, const aom_prob *probs, aom_tree tree) { - assert(tree[0] <= 0 && tree[1] > 0); - - costs[-tree[0]] = av1_cost_bit(probs[0], 0); - cost(costs, tree, probs, 2, 0); -} - void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf, const int *inv_map) { int i; aom_cdf_prob prev_cdf = 0; for (i = 0;; ++i) { - const aom_cdf_prob p15 = AOM_ICDF(cdf[i]) - prev_cdf; + aom_cdf_prob p15 = AOM_ICDF(cdf[i]) - prev_cdf; + p15 = (p15 < EC_MIN_PROB) ? EC_MIN_PROB : p15; prev_cdf = AOM_ICDF(cdf[i]); if (inv_map) diff --git a/third_party/aom/av1/encoder/cost.h b/third_party/aom/av1/encoder/cost.h index e60632005..5de7765c5 100644 --- a/third_party/aom/av1/encoder/cost.h +++ b/third_party/aom/av1/encoder/cost.h @@ -19,17 +19,11 @@ extern "C" { #endif -extern const uint16_t av1_prob_cost[256]; +extern const uint16_t av1_prob_cost[128]; // The factor to scale from cost in bits to cost in av1_prob_cost units. #define AV1_PROB_COST_SHIFT 9 -#define av1_cost_zero(prob) (av1_prob_cost[prob]) - -#define av1_cost_one(prob) av1_cost_zero(256 - (prob)) - -#define av1_cost_bit(prob, bit) av1_cost_zero((bit) ? 256 - (prob) : (prob)) - // Cost of coding an n bit literal, using 128 (i.e. 50%) probability // for each bit. 
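The new av1_cost_symbol() a little further below normalizes a 15-bit CDF probability into [1/2, 1) and charges the normalizing shift as whole literal bits, so power-of-two probabilities land exactly on multiples of 512. A quick sanity harness, assuming this header and its usual constants (CDF_PROB_TOP == 32768, AV1_PROB_COST_SHIFT == 9):

#include <assert.h>
#include "av1/encoder/cost.h"

// Check the fixed-point symbol costs at exact powers of two.
static void check_symbol_costs(void) {
  assert(av1_cost_symbol(16384) == 1 * 512);  // P = 1/2 -> 1.0 bit
  assert(av1_cost_symbol(8192) == 2 * 512);   // P = 1/4 -> 2.0 bits
  assert(av1_cost_symbol(4096) == 3 * 512);   // P = 1/8 -> 3.0 bits
}

In each case the shifted probability normalizes to 128, whose table cost is 512, and the remaining bits come from av1_cost_literal(shift).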
#define av1_cost_literal(n) ((n) * (1 << AV1_PROB_COST_SHIFT)) @@ -38,31 +32,11 @@ extern const uint16_t av1_prob_cost[256]; static INLINE int av1_cost_symbol(aom_cdf_prob p15) { assert(0 < p15 && p15 < CDF_PROB_TOP); const int shift = CDF_PROB_BITS - 1 - get_msb(p15); - return av1_cost_zero(get_prob(p15 << shift, CDF_PROB_TOP)) + - av1_cost_literal(shift); -} - -static INLINE unsigned int cost_branch256(const unsigned int ct[2], - aom_prob p) { - return ct[0] * av1_cost_zero(p) + ct[1] * av1_cost_one(p); -} - -static INLINE int treed_cost(aom_tree tree, const aom_prob *probs, int bits, - int len) { - int cost = 0; - aom_tree_index i = 0; - - do { - const int bit = (bits >> --len) & 1; - cost += av1_cost_bit(probs[i >> 1], bit); - i = tree[i + bit]; - } while (len); - - return cost; + const int prob = get_prob(p15 << shift, CDF_PROB_TOP); + assert(prob >= 128); + return av1_prob_cost[prob - 128] + av1_cost_literal(shift); } -void av1_cost_tokens(int *costs, const aom_prob *probs, aom_tree tree); -void av1_cost_tokens_skip(int *costs, const aom_prob *probs, aom_tree tree); void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf, const int *inv_map); diff --git a/third_party/aom/av1/encoder/daala_compat_enc.c b/third_party/aom/av1/encoder/daala_compat_enc.c deleted file mode 100644 index c60e2d3d7..000000000 --- a/third_party/aom/av1/encoder/daala_compat_enc.c +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "encint.h" - -void od_encode_checkpoint(const daala_enc_ctx *enc, od_rollback_buffer *rbuf) { -#if !CONFIG_ANS - od_ec_enc_checkpoint(&rbuf->ec, &enc->w.ec); -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - OD_COPY(&rbuf->adapt, enc->state.adapt, 1); -} - -void od_encode_rollback(daala_enc_ctx *enc, const od_rollback_buffer *rbuf) { -#if !CONFIG_ANS - od_ec_enc_rollback(&enc->w.ec, &rbuf->ec); -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - OD_COPY(enc->state.adapt, &rbuf->adapt, 1); -} diff --git a/third_party/aom/av1/encoder/dct.c b/third_party/aom/av1/encoder/dct.c deleted file mode 100644 index a04d46b72..000000000 --- a/third_party/aom/av1/encoder/dct.c +++ /dev/null @@ -1,2797 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <assert.h> -#include <math.h> - -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" -#include "./av1_rtcd.h" -#include "aom_dsp/fwd_txfm.h" -#include "aom_ports/mem.h" -#include "av1/common/blockd.h" -#include "av1/common/av1_fwd_txfm1d.h" -#include "av1/common/av1_fwd_txfm1d_cfg.h" -#include "av1/common/idct.h" -#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16 || \ - CONFIG_DAALA_DCT32 || CONFIG_DAALA_DCT64 -#include "av1/common/daala_tx.h" -#endif - -static INLINE void range_check(const tran_low_t *input, const int size, - const int bit) { -#if 0 // CONFIG_COEFFICIENT_RANGE_CHECKING -// TODO(angiebird): the range_check is not used because the bit range -// in fdct# is not correct. Since we are going to merge in a new version -// of fdct# from nextgenv2, we won't fix the incorrect bit range now. - int i; - for (i = 0; i < size; ++i) { - assert(abs(input[i]) < (1 << bit)); - } -#else - (void)input; - (void)size; - (void)bit; -#endif -} - -static void fdct4(const tran_low_t *input, tran_low_t *output) { - tran_high_t temp; - tran_low_t step[4]; - - // stage 0 - range_check(input, 4, 14); - - // stage 1 - output[0] = input[0] + input[3]; - output[1] = input[1] + input[2]; - output[2] = input[1] - input[2]; - output[3] = input[0] - input[3]; - - range_check(output, 4, 15); - - // stage 2 - temp = output[0] * cospi_16_64 + output[1] * cospi_16_64; - step[0] = (tran_low_t)fdct_round_shift(temp); - temp = output[1] * -cospi_16_64 + output[0] * cospi_16_64; - step[1] = (tran_low_t)fdct_round_shift(temp); - temp = output[2] * cospi_24_64 + output[3] * cospi_8_64; - step[2] = (tran_low_t)fdct_round_shift(temp); - temp = output[3] * cospi_24_64 + output[2] * -cospi_8_64; - step[3] = (tran_low_t)fdct_round_shift(temp); - - range_check(step, 4, 16); - - // stage 3 - output[0] = step[0]; - output[1] = step[2]; - output[2] = step[1]; - output[3] = step[3]; - - range_check(output, 4, 16); -} - -static void fdct8(const tran_low_t *input, tran_low_t *output) { - tran_high_t temp; - tran_low_t step[8]; - - // stage 0 - range_check(input, 8, 13); - - // stage 1 - output[0] = input[0] + input[7]; - output[1] = input[1] + input[6]; - output[2] = input[2] + input[5]; - output[3] = input[3] + input[4]; - output[4] = input[3] - input[4]; - output[5] = input[2] - input[5]; - output[6] = input[1] - input[6]; - output[7] = input[0] - input[7]; - - range_check(output, 8, 14); - - // stage 2 - step[0] = output[0] + output[3]; - step[1] = output[1] + output[2]; - step[2] = output[1] - output[2]; - step[3] = output[0] - output[3]; - step[4] = output[4]; - temp = output[5] * -cospi_16_64 + output[6] * cospi_16_64; - step[5] = (tran_low_t)fdct_round_shift(temp); - temp = output[6] * cospi_16_64 + output[5] * cospi_16_64; - step[6] = (tran_low_t)fdct_round_shift(temp); - step[7] = output[7]; - - range_check(step, 8, 15); - - // stage 3 - temp = step[0] * cospi_16_64 + step[1] * cospi_16_64; - output[0] = (tran_low_t)fdct_round_shift(temp); - temp = step[1] * -cospi_16_64 + step[0] * cospi_16_64; - output[1] = (tran_low_t)fdct_round_shift(temp); - temp = step[2] * cospi_24_64 + step[3] * cospi_8_64; - output[2] = (tran_low_t)fdct_round_shift(temp); - temp = step[3] * cospi_24_64 + step[2] * -cospi_8_64; - output[3] = (tran_low_t)fdct_round_shift(temp); - output[4] = step[4] + step[5]; - output[5] = step[4] - step[5]; - output[6] = step[7] - step[6]; - output[7] = step[7] + step[6]; - - range_check(output, 8, 16); - - // stage 4 - step[0] = output[0]; - step[1] = output[1]; - step[2] = 
output[2]; - step[3] = output[3]; - temp = output[4] * cospi_28_64 + output[7] * cospi_4_64; - step[4] = (tran_low_t)fdct_round_shift(temp); - temp = output[5] * cospi_12_64 + output[6] * cospi_20_64; - step[5] = (tran_low_t)fdct_round_shift(temp); - temp = output[6] * cospi_12_64 + output[5] * -cospi_20_64; - step[6] = (tran_low_t)fdct_round_shift(temp); - temp = output[7] * cospi_28_64 + output[4] * -cospi_4_64; - step[7] = (tran_low_t)fdct_round_shift(temp); - - range_check(step, 8, 16); - - // stage 5 - output[0] = step[0]; - output[1] = step[4]; - output[2] = step[2]; - output[3] = step[6]; - output[4] = step[1]; - output[5] = step[5]; - output[6] = step[3]; - output[7] = step[7]; - - range_check(output, 8, 16); -} - -static void fdct16(const tran_low_t *input, tran_low_t *output) { - tran_high_t temp; - tran_low_t step[16]; - - // stage 0 - range_check(input, 16, 13); - - // stage 1 - output[0] = input[0] + input[15]; - output[1] = input[1] + input[14]; - output[2] = input[2] + input[13]; - output[3] = input[3] + input[12]; - output[4] = input[4] + input[11]; - output[5] = input[5] + input[10]; - output[6] = input[6] + input[9]; - output[7] = input[7] + input[8]; - output[8] = input[7] - input[8]; - output[9] = input[6] - input[9]; - output[10] = input[5] - input[10]; - output[11] = input[4] - input[11]; - output[12] = input[3] - input[12]; - output[13] = input[2] - input[13]; - output[14] = input[1] - input[14]; - output[15] = input[0] - input[15]; - - range_check(output, 16, 14); - - // stage 2 - step[0] = output[0] + output[7]; - step[1] = output[1] + output[6]; - step[2] = output[2] + output[5]; - step[3] = output[3] + output[4]; - step[4] = output[3] - output[4]; - step[5] = output[2] - output[5]; - step[6] = output[1] - output[6]; - step[7] = output[0] - output[7]; - step[8] = output[8]; - step[9] = output[9]; - temp = output[10] * -cospi_16_64 + output[13] * cospi_16_64; - step[10] = (tran_low_t)fdct_round_shift(temp); - temp = output[11] * -cospi_16_64 + output[12] * cospi_16_64; - step[11] = (tran_low_t)fdct_round_shift(temp); - temp = output[12] * cospi_16_64 + output[11] * cospi_16_64; - step[12] = (tran_low_t)fdct_round_shift(temp); - temp = output[13] * cospi_16_64 + output[10] * cospi_16_64; - step[13] = (tran_low_t)fdct_round_shift(temp); - step[14] = output[14]; - step[15] = output[15]; - - range_check(step, 16, 15); - - // stage 3 - output[0] = step[0] + step[3]; - output[1] = step[1] + step[2]; - output[2] = step[1] - step[2]; - output[3] = step[0] - step[3]; - output[4] = step[4]; - temp = step[5] * -cospi_16_64 + step[6] * cospi_16_64; - output[5] = (tran_low_t)fdct_round_shift(temp); - temp = step[6] * cospi_16_64 + step[5] * cospi_16_64; - output[6] = (tran_low_t)fdct_round_shift(temp); - output[7] = step[7]; - output[8] = step[8] + step[11]; - output[9] = step[9] + step[10]; - output[10] = step[9] - step[10]; - output[11] = step[8] - step[11]; - output[12] = step[15] - step[12]; - output[13] = step[14] - step[13]; - output[14] = step[14] + step[13]; - output[15] = step[15] + step[12]; - - range_check(output, 16, 16); - - // stage 4 - temp = output[0] * cospi_16_64 + output[1] * cospi_16_64; - step[0] = (tran_low_t)fdct_round_shift(temp); - temp = output[1] * -cospi_16_64 + output[0] * cospi_16_64; - step[1] = (tran_low_t)fdct_round_shift(temp); - temp = output[2] * cospi_24_64 + output[3] * cospi_8_64; - step[2] = (tran_low_t)fdct_round_shift(temp); - temp = output[3] * cospi_24_64 + output[2] * -cospi_8_64; - step[3] = (tran_low_t)fdct_round_shift(temp); - 
step[4] = output[4] + output[5]; - step[5] = output[4] - output[5]; - step[6] = output[7] - output[6]; - step[7] = output[7] + output[6]; - step[8] = output[8]; - temp = output[9] * -cospi_8_64 + output[14] * cospi_24_64; - step[9] = (tran_low_t)fdct_round_shift(temp); - temp = output[10] * -cospi_24_64 + output[13] * -cospi_8_64; - step[10] = (tran_low_t)fdct_round_shift(temp); - step[11] = output[11]; - step[12] = output[12]; - temp = output[13] * cospi_24_64 + output[10] * -cospi_8_64; - step[13] = (tran_low_t)fdct_round_shift(temp); - temp = output[14] * cospi_8_64 + output[9] * cospi_24_64; - step[14] = (tran_low_t)fdct_round_shift(temp); - step[15] = output[15]; - - range_check(step, 16, 16); - - // stage 5 - output[0] = step[0]; - output[1] = step[1]; - output[2] = step[2]; - output[3] = step[3]; - temp = step[4] * cospi_28_64 + step[7] * cospi_4_64; - output[4] = (tran_low_t)fdct_round_shift(temp); - temp = step[5] * cospi_12_64 + step[6] * cospi_20_64; - output[5] = (tran_low_t)fdct_round_shift(temp); - temp = step[6] * cospi_12_64 + step[5] * -cospi_20_64; - output[6] = (tran_low_t)fdct_round_shift(temp); - temp = step[7] * cospi_28_64 + step[4] * -cospi_4_64; - output[7] = (tran_low_t)fdct_round_shift(temp); - output[8] = step[8] + step[9]; - output[9] = step[8] - step[9]; - output[10] = step[11] - step[10]; - output[11] = step[11] + step[10]; - output[12] = step[12] + step[13]; - output[13] = step[12] - step[13]; - output[14] = step[15] - step[14]; - output[15] = step[15] + step[14]; - - range_check(output, 16, 16); - - // stage 6 - step[0] = output[0]; - step[1] = output[1]; - step[2] = output[2]; - step[3] = output[3]; - step[4] = output[4]; - step[5] = output[5]; - step[6] = output[6]; - step[7] = output[7]; - temp = output[8] * cospi_30_64 + output[15] * cospi_2_64; - step[8] = (tran_low_t)fdct_round_shift(temp); - temp = output[9] * cospi_14_64 + output[14] * cospi_18_64; - step[9] = (tran_low_t)fdct_round_shift(temp); - temp = output[10] * cospi_22_64 + output[13] * cospi_10_64; - step[10] = (tran_low_t)fdct_round_shift(temp); - temp = output[11] * cospi_6_64 + output[12] * cospi_26_64; - step[11] = (tran_low_t)fdct_round_shift(temp); - temp = output[12] * cospi_6_64 + output[11] * -cospi_26_64; - step[12] = (tran_low_t)fdct_round_shift(temp); - temp = output[13] * cospi_22_64 + output[10] * -cospi_10_64; - step[13] = (tran_low_t)fdct_round_shift(temp); - temp = output[14] * cospi_14_64 + output[9] * -cospi_18_64; - step[14] = (tran_low_t)fdct_round_shift(temp); - temp = output[15] * cospi_30_64 + output[8] * -cospi_2_64; - step[15] = (tran_low_t)fdct_round_shift(temp); - - range_check(step, 16, 16); - - // stage 7 - output[0] = step[0]; - output[1] = step[8]; - output[2] = step[4]; - output[3] = step[12]; - output[4] = step[2]; - output[5] = step[10]; - output[6] = step[6]; - output[7] = step[14]; - output[8] = step[1]; - output[9] = step[9]; - output[10] = step[5]; - output[11] = step[13]; - output[12] = step[3]; - output[13] = step[11]; - output[14] = step[7]; - output[15] = step[15]; - - range_check(output, 16, 16); -} - -static void fdct32(const tran_low_t *input, tran_low_t *output) { - tran_high_t temp; - tran_low_t step[32]; - - // stage 0 - range_check(input, 32, 14); - - // stage 1 - output[0] = input[0] + input[31]; - output[1] = input[1] + input[30]; - output[2] = input[2] + input[29]; - output[3] = input[3] + input[28]; - output[4] = input[4] + input[27]; - output[5] = input[5] + input[26]; - output[6] = input[6] + input[25]; - output[7] = input[7] + 
input[24]; - output[8] = input[8] + input[23]; - output[9] = input[9] + input[22]; - output[10] = input[10] + input[21]; - output[11] = input[11] + input[20]; - output[12] = input[12] + input[19]; - output[13] = input[13] + input[18]; - output[14] = input[14] + input[17]; - output[15] = input[15] + input[16]; - output[16] = input[15] - input[16]; - output[17] = input[14] - input[17]; - output[18] = input[13] - input[18]; - output[19] = input[12] - input[19]; - output[20] = input[11] - input[20]; - output[21] = input[10] - input[21]; - output[22] = input[9] - input[22]; - output[23] = input[8] - input[23]; - output[24] = input[7] - input[24]; - output[25] = input[6] - input[25]; - output[26] = input[5] - input[26]; - output[27] = input[4] - input[27]; - output[28] = input[3] - input[28]; - output[29] = input[2] - input[29]; - output[30] = input[1] - input[30]; - output[31] = input[0] - input[31]; - - range_check(output, 32, 15); - - // stage 2 - step[0] = output[0] + output[15]; - step[1] = output[1] + output[14]; - step[2] = output[2] + output[13]; - step[3] = output[3] + output[12]; - step[4] = output[4] + output[11]; - step[5] = output[5] + output[10]; - step[6] = output[6] + output[9]; - step[7] = output[7] + output[8]; - step[8] = output[7] - output[8]; - step[9] = output[6] - output[9]; - step[10] = output[5] - output[10]; - step[11] = output[4] - output[11]; - step[12] = output[3] - output[12]; - step[13] = output[2] - output[13]; - step[14] = output[1] - output[14]; - step[15] = output[0] - output[15]; - step[16] = output[16]; - step[17] = output[17]; - step[18] = output[18]; - step[19] = output[19]; - temp = output[20] * -cospi_16_64 + output[27] * cospi_16_64; - step[20] = (tran_low_t)fdct_round_shift(temp); - temp = output[21] * -cospi_16_64 + output[26] * cospi_16_64; - step[21] = (tran_low_t)fdct_round_shift(temp); - temp = output[22] * -cospi_16_64 + output[25] * cospi_16_64; - step[22] = (tran_low_t)fdct_round_shift(temp); - temp = output[23] * -cospi_16_64 + output[24] * cospi_16_64; - step[23] = (tran_low_t)fdct_round_shift(temp); - temp = output[24] * cospi_16_64 + output[23] * cospi_16_64; - step[24] = (tran_low_t)fdct_round_shift(temp); - temp = output[25] * cospi_16_64 + output[22] * cospi_16_64; - step[25] = (tran_low_t)fdct_round_shift(temp); - temp = output[26] * cospi_16_64 + output[21] * cospi_16_64; - step[26] = (tran_low_t)fdct_round_shift(temp); - temp = output[27] * cospi_16_64 + output[20] * cospi_16_64; - step[27] = (tran_low_t)fdct_round_shift(temp); - step[28] = output[28]; - step[29] = output[29]; - step[30] = output[30]; - step[31] = output[31]; - - range_check(step, 32, 16); - - // stage 3 - output[0] = step[0] + step[7]; - output[1] = step[1] + step[6]; - output[2] = step[2] + step[5]; - output[3] = step[3] + step[4]; - output[4] = step[3] - step[4]; - output[5] = step[2] - step[5]; - output[6] = step[1] - step[6]; - output[7] = step[0] - step[7]; - output[8] = step[8]; - output[9] = step[9]; - temp = step[10] * -cospi_16_64 + step[13] * cospi_16_64; - output[10] = (tran_low_t)fdct_round_shift(temp); - temp = step[11] * -cospi_16_64 + step[12] * cospi_16_64; - output[11] = (tran_low_t)fdct_round_shift(temp); - temp = step[12] * cospi_16_64 + step[11] * cospi_16_64; - output[12] = (tran_low_t)fdct_round_shift(temp); - temp = step[13] * cospi_16_64 + step[10] * cospi_16_64; - output[13] = (tran_low_t)fdct_round_shift(temp); - output[14] = step[14]; - output[15] = step[15]; - output[16] = step[16] + step[23]; - output[17] = step[17] + step[22]; - 
-  output[18] = step[18] + step[21];
-  output[19] = step[19] + step[20];
-  output[20] = step[19] - step[20];
-  output[21] = step[18] - step[21];
-  output[22] = step[17] - step[22];
-  output[23] = step[16] - step[23];
-  output[24] = step[31] - step[24];
-  output[25] = step[30] - step[25];
-  output[26] = step[29] - step[26];
-  output[27] = step[28] - step[27];
-  output[28] = step[28] + step[27];
-  output[29] = step[29] + step[26];
-  output[30] = step[30] + step[25];
-  output[31] = step[31] + step[24];
-
-  range_check(output, 32, 17);
-
-  // stage 4
-  step[0] = output[0] + output[3];
-  step[1] = output[1] + output[2];
-  step[2] = output[1] - output[2];
-  step[3] = output[0] - output[3];
-  step[4] = output[4];
-  temp = output[5] * -cospi_16_64 + output[6] * cospi_16_64;
-  step[5] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[6] * cospi_16_64 + output[5] * cospi_16_64;
-  step[6] = (tran_low_t)fdct_round_shift(temp);
-  step[7] = output[7];
-  step[8] = output[8] + output[11];
-  step[9] = output[9] + output[10];
-  step[10] = output[9] - output[10];
-  step[11] = output[8] - output[11];
-  step[12] = output[15] - output[12];
-  step[13] = output[14] - output[13];
-  step[14] = output[14] + output[13];
-  step[15] = output[15] + output[12];
-  step[16] = output[16];
-  step[17] = output[17];
-  temp = output[18] * -cospi_8_64 + output[29] * cospi_24_64;
-  step[18] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[19] * -cospi_8_64 + output[28] * cospi_24_64;
-  step[19] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[20] * -cospi_24_64 + output[27] * -cospi_8_64;
-  step[20] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[21] * -cospi_24_64 + output[26] * -cospi_8_64;
-  step[21] = (tran_low_t)fdct_round_shift(temp);
-  step[22] = output[22];
-  step[23] = output[23];
-  step[24] = output[24];
-  step[25] = output[25];
-  temp = output[26] * cospi_24_64 + output[21] * -cospi_8_64;
-  step[26] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[27] * cospi_24_64 + output[20] * -cospi_8_64;
-  step[27] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[28] * cospi_8_64 + output[19] * cospi_24_64;
-  step[28] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[29] * cospi_8_64 + output[18] * cospi_24_64;
-  step[29] = (tran_low_t)fdct_round_shift(temp);
-  step[30] = output[30];
-  step[31] = output[31];
-
-  range_check(step, 32, 18);
-
-  // stage 5
-  temp = step[0] * cospi_16_64 + step[1] * cospi_16_64;
-  output[0] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[1] * -cospi_16_64 + step[0] * cospi_16_64;
-  output[1] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[2] * cospi_24_64 + step[3] * cospi_8_64;
-  output[2] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[3] * cospi_24_64 + step[2] * -cospi_8_64;
-  output[3] = (tran_low_t)fdct_round_shift(temp);
-  output[4] = step[4] + step[5];
-  output[5] = step[4] - step[5];
-  output[6] = step[7] - step[6];
-  output[7] = step[7] + step[6];
-  output[8] = step[8];
-  temp = step[9] * -cospi_8_64 + step[14] * cospi_24_64;
-  output[9] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[10] * -cospi_24_64 + step[13] * -cospi_8_64;
-  output[10] = (tran_low_t)fdct_round_shift(temp);
-  output[11] = step[11];
-  output[12] = step[12];
-  temp = step[13] * cospi_24_64 + step[10] * -cospi_8_64;
-  output[13] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[14] * cospi_8_64 + step[9] * cospi_24_64;
-  output[14] = (tran_low_t)fdct_round_shift(temp);
-  output[15] = step[15];
-  output[16] = step[16] + step[19];
-  output[17] = step[17] + step[18];
-  output[18] = step[17] - step[18];
-  output[19] = step[16] - step[19];
-  output[20] = step[23] - step[20];
-  output[21] = step[22] - step[21];
-  output[22] = step[22] + step[21];
-  output[23] = step[23] + step[20];
-  output[24] = step[24] + step[27];
-  output[25] = step[25] + step[26];
-  output[26] = step[25] - step[26];
-  output[27] = step[24] - step[27];
-  output[28] = step[31] - step[28];
-  output[29] = step[30] - step[29];
-  output[30] = step[30] + step[29];
-  output[31] = step[31] + step[28];
-
-  range_check(output, 32, 18);
-
-  // stage 6
-  step[0] = output[0];
-  step[1] = output[1];
-  step[2] = output[2];
-  step[3] = output[3];
-  temp = output[4] * cospi_28_64 + output[7] * cospi_4_64;
-  step[4] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[5] * cospi_12_64 + output[6] * cospi_20_64;
-  step[5] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[6] * cospi_12_64 + output[5] * -cospi_20_64;
-  step[6] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[7] * cospi_28_64 + output[4] * -cospi_4_64;
-  step[7] = (tran_low_t)fdct_round_shift(temp);
-  step[8] = output[8] + output[9];
-  step[9] = output[8] - output[9];
-  step[10] = output[11] - output[10];
-  step[11] = output[11] + output[10];
-  step[12] = output[12] + output[13];
-  step[13] = output[12] - output[13];
-  step[14] = output[15] - output[14];
-  step[15] = output[15] + output[14];
-  step[16] = output[16];
-  temp = output[17] * -cospi_4_64 + output[30] * cospi_28_64;
-  step[17] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[18] * -cospi_28_64 + output[29] * -cospi_4_64;
-  step[18] = (tran_low_t)fdct_round_shift(temp);
-  step[19] = output[19];
-  step[20] = output[20];
-  temp = output[21] * -cospi_20_64 + output[26] * cospi_12_64;
-  step[21] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[22] * -cospi_12_64 + output[25] * -cospi_20_64;
-  step[22] = (tran_low_t)fdct_round_shift(temp);
-  step[23] = output[23];
-  step[24] = output[24];
-  temp = output[25] * cospi_12_64 + output[22] * -cospi_20_64;
-  step[25] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[26] * cospi_20_64 + output[21] * cospi_12_64;
-  step[26] = (tran_low_t)fdct_round_shift(temp);
-  step[27] = output[27];
-  step[28] = output[28];
-  temp = output[29] * cospi_28_64 + output[18] * -cospi_4_64;
-  step[29] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[30] * cospi_4_64 + output[17] * cospi_28_64;
-  step[30] = (tran_low_t)fdct_round_shift(temp);
-  step[31] = output[31];
-
-  range_check(step, 32, 18);
-
-  // stage 7
-  output[0] = step[0];
-  output[1] = step[1];
-  output[2] = step[2];
-  output[3] = step[3];
-  output[4] = step[4];
-  output[5] = step[5];
-  output[6] = step[6];
-  output[7] = step[7];
-  temp = step[8] * cospi_30_64 + step[15] * cospi_2_64;
-  output[8] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[9] * cospi_14_64 + step[14] * cospi_18_64;
-  output[9] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[10] * cospi_22_64 + step[13] * cospi_10_64;
-  output[10] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[11] * cospi_6_64 + step[12] * cospi_26_64;
-  output[11] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[12] * cospi_6_64 + step[11] * -cospi_26_64;
-  output[12] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[13] * cospi_22_64 + step[10] * -cospi_10_64;
-  output[13] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[14] * cospi_14_64 + step[9] * -cospi_18_64;
-  output[14] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[15] * cospi_30_64 + step[8] * -cospi_2_64;
-  output[15] = (tran_low_t)fdct_round_shift(temp);
-  output[16] = step[16] + step[17];
-  output[17] = step[16] - step[17];
-  output[18] = step[19] - step[18];
-  output[19] = step[19] + step[18];
-  output[20] = step[20] + step[21];
-  output[21] = step[20] - step[21];
-  output[22] = step[23] - step[22];
-  output[23] = step[23] + step[22];
-  output[24] = step[24] + step[25];
-  output[25] = step[24] - step[25];
-  output[26] = step[27] - step[26];
-  output[27] = step[27] + step[26];
-  output[28] = step[28] + step[29];
-  output[29] = step[28] - step[29];
-  output[30] = step[31] - step[30];
-  output[31] = step[31] + step[30];
-
-  range_check(output, 32, 18);
-
-  // stage 8
-  step[0] = output[0];
-  step[1] = output[1];
-  step[2] = output[2];
-  step[3] = output[3];
-  step[4] = output[4];
-  step[5] = output[5];
-  step[6] = output[6];
-  step[7] = output[7];
-  step[8] = output[8];
-  step[9] = output[9];
-  step[10] = output[10];
-  step[11] = output[11];
-  step[12] = output[12];
-  step[13] = output[13];
-  step[14] = output[14];
-  step[15] = output[15];
-  temp = output[16] * cospi_31_64 + output[31] * cospi_1_64;
-  step[16] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[17] * cospi_15_64 + output[30] * cospi_17_64;
-  step[17] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[18] * cospi_23_64 + output[29] * cospi_9_64;
-  step[18] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[19] * cospi_7_64 + output[28] * cospi_25_64;
-  step[19] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[20] * cospi_27_64 + output[27] * cospi_5_64;
-  step[20] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[21] * cospi_11_64 + output[26] * cospi_21_64;
-  step[21] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[22] * cospi_19_64 + output[25] * cospi_13_64;
-  step[22] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[23] * cospi_3_64 + output[24] * cospi_29_64;
-  step[23] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[24] * cospi_3_64 + output[23] * -cospi_29_64;
-  step[24] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[25] * cospi_19_64 + output[22] * -cospi_13_64;
-  step[25] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[26] * cospi_11_64 + output[21] * -cospi_21_64;
-  step[26] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[27] * cospi_27_64 + output[20] * -cospi_5_64;
-  step[27] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[28] * cospi_7_64 + output[19] * -cospi_25_64;
-  step[28] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[29] * cospi_23_64 + output[18] * -cospi_9_64;
-  step[29] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[30] * cospi_15_64 + output[17] * -cospi_17_64;
-  step[30] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[31] * cospi_31_64 + output[16] * -cospi_1_64;
-  step[31] = (tran_low_t)fdct_round_shift(temp);
-
-  range_check(step, 32, 18);
-
-  // stage 9
-  output[0] = step[0];
-  output[1] = step[16];
-  output[2] = step[8];
-  output[3] = step[24];
-  output[4] = step[4];
-  output[5] = step[20];
-  output[6] = step[12];
-  output[7] = step[28];
-  output[8] = step[2];
-  output[9] = step[18];
-  output[10] = step[10];
-  output[11] = step[26];
-  output[12] = step[6];
-  output[13] = step[22];
-  output[14] = step[14];
-  output[15] = step[30];
-  output[16] = step[1];
-  output[17] = step[17];
-  output[18] = step[9];
-  output[19] = step[25];
-  output[20] = step[5];
-  output[21] = step[21];
-  output[22] = step[13];
-  output[23] = step[29];
-  output[24] = step[3];
-  output[25] = step[19];
-  output[26] = step[11];
-  output[27] = step[27];
-  output[28] = step[7];
-  output[29] = step[23];
-  output[30] = step[15];
-  output[31] = step[31];
-
-  range_check(output, 32, 18);
-}
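Every multiply against a cospi_*_64 constant above is followed by fdct_round_shift(), which brings the product back down from the Q14 fixed-point domain; the range_check() calls verify the documented bit budget per stage. A minimal sketch of the helper, assuming DCT_CONST_BITS == 14 and the ROUND_POWER_OF_TWO form from aom_dsp (reproduced from memory, so treat the exact definitions as assumptions):

#include <stdint.h>
/* Assumed to match aom_dsp/txfm_common.h at this revision: */
#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

/* The cospi_*_64 constants are round(cos(k*pi/64) * 2^14), so each product
 * lives in a Q14 domain; one rounded shift per multiply restores the
 * original magnitude, up to the transform gain. */
static int64_t fdct_round_shift_sketch(int64_t input) {
  return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
}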
-
-#ifndef AV1_DCT_GTEST
-static void fadst4(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t x0, x1, x2, x3;
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
-  x0 = input[0];
-  x1 = input[1];
-  x2 = input[2];
-  x3 = input[3];
-
-  if (!(x0 | x1 | x2 | x3)) {
-    output[0] = output[1] = output[2] = output[3] = 0;
-    return;
-  }
-
-  s0 = sinpi_1_9 * x0;
-  s1 = sinpi_4_9 * x0;
-  s2 = sinpi_2_9 * x1;
-  s3 = sinpi_1_9 * x1;
-  s4 = sinpi_3_9 * x2;
-  s5 = sinpi_4_9 * x3;
-  s6 = sinpi_2_9 * x3;
-  s7 = x0 + x1 - x3;
-
-  x0 = s0 + s2 + s5;
-  x1 = sinpi_3_9 * s7;
-  x2 = s1 - s3 + s6;
-  x3 = s4;
-
-  s0 = x0 + x3;
-  s1 = x1;
-  s2 = x2 - x3;
-  s3 = x2 - x0 + x3;
-
-  // 1-D transform scaling factor is sqrt(2).
-  output[0] = (tran_low_t)fdct_round_shift(s0);
-  output[1] = (tran_low_t)fdct_round_shift(s1);
-  output[2] = (tran_low_t)fdct_round_shift(s2);
-  output[3] = (tran_low_t)fdct_round_shift(s3);
-}
-
-static void fadst8(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
-  tran_high_t x0 = input[7];
-  tran_high_t x1 = input[0];
-  tran_high_t x2 = input[5];
-  tran_high_t x3 = input[2];
-  tran_high_t x4 = input[3];
-  tran_high_t x5 = input[4];
-  tran_high_t x6 = input[1];
-  tran_high_t x7 = input[6];
-
-  // stage 1
-  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
-  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
-  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
-  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
-  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
-  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
-  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
-  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
-
-  x0 = s0 + s4;
-  x1 = s1 + s5;
-  x2 = s2 + s6;
-  x3 = s3 + s7;
-  x4 = fdct_round_shift(s0 - s4);
-  x5 = fdct_round_shift(s1 - s5);
-  x6 = fdct_round_shift(s2 - s6);
-  x7 = fdct_round_shift(s3 - s7);
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
-  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
-  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
-  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
-
-  x0 = fdct_round_shift(s0 + s2);
-  x1 = fdct_round_shift(s1 + s3);
-  x2 = fdct_round_shift(s0 - s2);
-  x3 = fdct_round_shift(s1 - s3);
-  x4 = fdct_round_shift(s4 + s6);
-  x5 = fdct_round_shift(s5 + s7);
-  x6 = fdct_round_shift(s4 - s6);
-  x7 = fdct_round_shift(s5 - s7);
-
-  // stage 3
-  s2 = cospi_16_64 * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (x6 - x7);
-
-  x2 = fdct_round_shift(s2);
-  x3 = fdct_round_shift(s3);
-  x6 = fdct_round_shift(s6);
-  x7 = fdct_round_shift(s7);
-
-  output[0] = (tran_low_t)x0;
-  output[1] = (tran_low_t)-x4;
-  output[2] = (tran_low_t)x6;
-  output[3] = (tran_low_t)-x2;
-  output[4] = (tran_low_t)x3;
-  output[5] = (tran_low_t)-x7;
-  output[6] = (tran_low_t)x5;
-  output[7] = (tran_low_t)-x1;
-}
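fadst4() and fadst8() are file-static, so the easiest way to inspect them is from a test translation unit that #includes this file (the AV1_DCT_GTEST guard above appears to exist for exactly that kind of build). A hypothetical probe that recovers the effective 4-point basis by feeding unit impulses:

#include <stdio.h>
/* Hypothetical probe: prints the effective fadst4 basis one column at a
 * time. Assumes it is built in a TU that can see the static fadst4 (e.g.
 * a test that #includes dct.c). */
static void print_fadst4_matrix(void) {
  for (int i = 0; i < 4; ++i) {
    tran_low_t in[4] = { 0 }, out[4];
    in[i] = 256;  /* impulse, large enough that rounding noise is small */
    fadst4(in, out);
    printf("col %d: %d %d %d %d\n", i, out[0], out[1], out[2], out[3]);
  }
}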
-
-static void fadst16(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
-  tran_high_t s9, s10, s11, s12, s13, s14, s15;
-
-  tran_high_t x0 = input[15];
-  tran_high_t x1 = input[0];
-  tran_high_t x2 = input[13];
-  tran_high_t x3 = input[2];
-  tran_high_t x4 = input[11];
-  tran_high_t x5 = input[4];
-  tran_high_t x6 = input[9];
-  tran_high_t x7 = input[6];
-  tran_high_t x8 = input[7];
-  tran_high_t x9 = input[8];
-  tran_high_t x10 = input[5];
-  tran_high_t x11 = input[10];
-  tran_high_t x12 = input[3];
-  tran_high_t x13 = input[12];
-  tran_high_t x14 = input[1];
-  tran_high_t x15 = input[14];
-
-  // stage 1
-  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
-  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
-  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
-  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
-  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
-  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
-  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
-  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
-  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
-  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
-  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
-  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
-  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
-  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
-  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
-  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
-
-  x0 = s0 + s8;
-  x1 = s1 + s9;
-  x2 = s2 + s10;
-  x3 = s3 + s11;
-  x4 = s4 + s12;
-  x5 = s5 + s13;
-  x6 = s6 + s14;
-  x7 = s7 + s15;
-
-  x8 = fdct_round_shift(s0 - s8);
-  x9 = fdct_round_shift(s1 - s9);
-  x10 = fdct_round_shift(s2 - s10);
-  x11 = fdct_round_shift(s3 - s11);
-  x12 = fdct_round_shift(s4 - s12);
-  x13 = fdct_round_shift(s5 - s13);
-  x14 = fdct_round_shift(s6 - s14);
-  x15 = fdct_round_shift(s7 - s15);
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4;
-  s5 = x5;
-  s6 = x6;
-  s7 = x7;
-  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
-  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
-  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
-  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
-  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
-  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
-  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
-  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
-
-  x0 = s0 + s4;
-  x1 = s1 + s5;
-  x2 = s2 + s6;
-  x3 = s3 + s7;
-  x4 = fdct_round_shift(s0 - s4);
-  x5 = fdct_round_shift(s1 - s5);
-  x6 = fdct_round_shift(s2 - s6);
-  x7 = fdct_round_shift(s3 - s7);
-
-  x8 = s8 + s12;
-  x9 = s9 + s13;
-  x10 = s10 + s14;
-  x11 = s11 + s15;
-  x12 = fdct_round_shift(s8 - s12);
-  x13 = fdct_round_shift(s9 - s13);
-  x14 = fdct_round_shift(s10 - s14);
-  x15 = fdct_round_shift(s11 - s15);
-
-  // stage 3
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
-  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
-  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
-  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
-  s8 = x8;
-  s9 = x9;
-  s10 = x10;
-  s11 = x11;
-  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
-  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
-  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
-  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
-
-  x0 = fdct_round_shift(s0 + s2);
-  x1 = fdct_round_shift(s1 + s3);
-  x2 = fdct_round_shift(s0 - s2);
-  x3 = fdct_round_shift(s1 - s3);
-
-  x4 = fdct_round_shift(s4 + s6);
-  x5 = fdct_round_shift(s5 + s7);
-  x6 = fdct_round_shift(s4 - s6);
-  x7 = fdct_round_shift(s5 - s7);
-
-  x8 = fdct_round_shift(s8 + s10);
-  x9 = fdct_round_shift(s9 + s11);
-  x10 = fdct_round_shift(s8 - s10);
-  x11 = fdct_round_shift(s9 - s11);
-
-  x12 = fdct_round_shift(s12 + s14);
-  x13 = fdct_round_shift(s13 + s15);
-  x14 = fdct_round_shift(s12 - s14);
-  x15 = fdct_round_shift(s13 - s15);
-
-  // stage 4
-  s2 = (-cospi_16_64) * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (-x6 + x7);
-  s10 = cospi_16_64 * (x10 + x11);
-  s11 = cospi_16_64 * (-x10 + x11);
-  s14 = (-cospi_16_64) * (x14 + x15);
-  s15 = cospi_16_64 * (x14 - x15);
-
-  x2 = fdct_round_shift(s2);
-  x3 = fdct_round_shift(s3);
-  x6 = fdct_round_shift(s6);
-  x7 = fdct_round_shift(s7);
-  x10 = fdct_round_shift(s10);
-  x11 = fdct_round_shift(s11);
-  x14 = fdct_round_shift(s14);
-  x15 = fdct_round_shift(s15);
-
-  output[0] = (tran_low_t)x0;
-  output[1] = (tran_low_t)-x8;
-  output[2] = (tran_low_t)x12;
-  output[3] = (tran_low_t)-x4;
-  output[4] = (tran_low_t)x6;
-  output[5] = (tran_low_t)x14;
-  output[6] = (tran_low_t)x10;
-  output[7] = (tran_low_t)x2;
-  output[8] = (tran_low_t)x3;
-  output[9] = (tran_low_t)x11;
-  output[10] = (tran_low_t)x15;
-  output[11] = (tran_low_t)x7;
-  output[12] = (tran_low_t)x5;
-  output[13] = (tran_low_t)-x13;
-  output[14] = (tran_low_t)x9;
-  output[15] = (tran_low_t)-x1;
-}
-
-// For use in lieu of ADST
-static void fhalfright32(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  tran_low_t inputhalf[16];
-  for (i = 0; i < 16; ++i) {
-    output[16 + i] = input[i] * 4;
-  }
-  // Multiply input by sqrt(2)
-  for (i = 0; i < 16; ++i) {
-    inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 16] * Sqrt2);
-  }
-  fdct16(inputhalf, output);
-  // Note overall scaling factor is 4 times orthogonal
-}
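fhalfright32() stands in for a true 32-point ADST: the first 16 inputs pass straight through (scaled by 4) into the second half of the output, and only the last 16 inputs receive a real fdct16. The same construction reappears below as fhalfright64(); a generalized sketch of the pattern (hypothetical helper, not part of the file):

/* Hypothetical generalization of fhalfright32/fhalfright64: identity on
 * the first half of the inputs, a half-length DCT on the second half.
 * 'scale' plays the role of the *4 (or *4*Sqrt2) factor used above. */
typedef void (*dct_fn)(const tran_low_t *in, tran_low_t *out);

static void fhalfright_n(const tran_low_t *input, tran_low_t *output, int n,
                         int scale, dct_fn half_dct) {
  tran_low_t tmp[32];  /* n/2 <= 32 for the sizes used in this file */
  for (int i = 0; i < n / 2; ++i) output[n / 2 + i] = input[i] * scale;
  for (int i = 0; i < n / 2; ++i)
    tmp[i] = (tran_low_t)fdct_round_shift(input[i + n / 2] * Sqrt2);
  half_dct(tmp, output);  /* fills output[0 .. n/2-1] */
}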
-
-#if CONFIG_MRC_TX
-static void get_masked_residual32(const int16_t **input, int *input_stride,
-                                  const uint8_t *pred, int pred_stride,
-                                  int16_t *masked_input,
-                                  TxfmParam *txfm_param) {
-  int n_masked_vals = 0;
-  uint8_t *mrc_mask;
-  uint8_t mask_tmp[32 * 32];
-  if ((txfm_param->is_inter && SIGNAL_MRC_MASK_INTER) ||
-      (!txfm_param->is_inter && SIGNAL_MRC_MASK_INTRA)) {
-    mrc_mask = txfm_param->mask;
-    n_masked_vals = get_mrc_diff_mask(*input, *input_stride, mrc_mask, 32, 32,
-                                      32, txfm_param->is_inter);
-  } else {
-    mrc_mask = mask_tmp;
-    n_masked_vals = get_mrc_pred_mask(pred, pred_stride, mrc_mask, 32, 32, 32,
-                                      txfm_param->is_inter);
-  }
-
-  // Do not use MRC_DCT if mask is invalid. DCT_DCT will be used instead.
-  if (!is_valid_mrc_mask(n_masked_vals, 32, 32)) {
-    *txfm_param->valid_mask = 0;
-    return;
-  }
-  int32_t sum = 0;
-  int16_t avg;
-  // Get the masked average of the prediction
-  for (int i = 0; i < 32; ++i) {
-    for (int j = 0; j < 32; ++j) {
-      sum += mrc_mask[i * 32 + j] * (*input)[i * (*input_stride) + j];
-    }
-  }
-  avg = sum / n_masked_vals;
-  // Replace all of the unmasked pixels in the prediction with the average
-  // of the masked pixels
-  for (int i = 0; i < 32; ++i) {
-    for (int j = 0; j < 32; ++j)
-      masked_input[i * 32 + j] =
-          (mrc_mask[i * 32 + j]) ? (*input)[i * (*input_stride) + j] : avg;
-  }
-  *input = masked_input;
-  *input_stride = 32;
-  *txfm_param->valid_mask = 1;
-}
-#endif  // CONFIG_MRC_TX
-
-#if CONFIG_LGT || CONFIG_LGT_FROM_PRED
-static void flgt4(const tran_low_t *input, tran_low_t *output,
-                  const tran_high_t *lgtmtx) {
-  if (!lgtmtx) assert(0);
-#if CONFIG_LGT_FROM_PRED
-  // For DCT/ADST, use butterfly implementations
-  if (lgtmtx[0] == DCT4) {
-    fdct4(input, output);
-    return;
-  } else if (lgtmtx[0] == ADST4) {
-    fadst4(input, output);
-    return;
-  }
-#endif  // CONFIG_LGT_FROM_PRED
-
-  // evaluate s[j] = sum of all lgtmtx[j][i]*input[i] over i=1,...,4
-  tran_high_t s[4] = { 0 };
-  for (int i = 0; i < 4; ++i)
-    for (int j = 0; j < 4; ++j) s[j] += lgtmtx[j * 4 + i] * input[i];
-
-  for (int i = 0; i < 4; ++i) output[i] = (tran_low_t)fdct_round_shift(s[i]);
-}
-
-static void flgt8(const tran_low_t *input, tran_low_t *output,
-                  const tran_high_t *lgtmtx) {
-  if (!lgtmtx) assert(0);
-#if CONFIG_LGT_FROM_PRED
-  // For DCT/ADST, use butterfly implementations
-  if (lgtmtx[0] == DCT8) {
-    fdct8(input, output);
-    return;
-  } else if (lgtmtx[0] == ADST8) {
-    fadst8(input, output);
-    return;
-  }
-#endif  // CONFIG_LGT_FROM_PRED
-
-  // evaluate s[j] = sum of all lgtmtx[j][i]*input[i] over i=1,...,8
-  tran_high_t s[8] = { 0 };
-  for (int i = 0; i < 8; ++i)
-    for (int j = 0; j < 8; ++j) s[j] += lgtmtx[j * 8 + i] * input[i];
-
-  for (int i = 0; i < 8; ++i) output[i] = (tran_low_t)fdct_round_shift(s[i]);
-}
-#endif  // CONFIG_LGT || CONFIG_LGT_FROM_PRED
-
-#if CONFIG_LGT_FROM_PRED
-static void flgt16up(const tran_low_t *input, tran_low_t *output,
-                     const tran_high_t *lgtmtx) {
-  if (lgtmtx[0] == DCT16) {
-    fdct16(input, output);
-    return;
-  } else if (lgtmtx[0] == ADST16) {
-    fadst16(input, output);
-    return;
-  } else if (lgtmtx[0] == DCT32) {
-    fdct32(input, output);
-    return;
-  } else if (lgtmtx[0] == ADST32) {
-    fhalfright32(input, output);
-    return;
-  } else {
-    assert(0);
-  }
-}
-
-typedef void (*FlgtFunc)(const tran_low_t *input, tran_low_t *output,
-                         const tran_high_t *lgtmtx);
-
-static FlgtFunc flgt_func[4] = { flgt4, flgt8, flgt16up, flgt16up };
-
-typedef void (*GetLgtFunc)(const TxfmParam *txfm_param, int is_col,
-                           const tran_high_t *lgtmtx[], int ntx);
-
-static GetLgtFunc get_lgt_func[4] = { get_lgt4_from_pred, get_lgt8_from_pred,
-                                      get_lgt16up_from_pred,
-                                      get_lgt16up_from_pred };
-
-// this inline function corresponds to the up scaling before the first
-// transform in the av1_fht* functions
-static INLINE tran_low_t fwd_upscale_wrt_txsize(const tran_high_t val,
-                                                const TX_SIZE tx_size) {
-  switch (tx_size) {
-    case TX_4X4: return (tran_low_t)val << 4;
-    case TX_8X8:
-    case TX_4X16:
-    case TX_16X4:
-    case TX_8X32:
-    case TX_32X8: return (tran_low_t)val << 2;
-    case TX_4X8:
-    case TX_8X4:
-    case TX_8X16:
-    case TX_16X8: return (tran_low_t)fdct_round_shift(val * 4 * Sqrt2);
-    default: assert(0); break;
-  }
-  return 0;
-}
-
-// This inline function corresponds to the bit shift after the second
-// transform in the av1_fht* functions
-static INLINE tran_low_t fwd_downscale_wrt_txsize(const tran_low_t val,
-                                                  const TX_SIZE tx_size) {
-  switch (tx_size) {
-    case TX_4X4: return (val + 1) >> 2;
-    case TX_4X8:
-    case TX_8X4:
-    case TX_8X8:
-    case TX_4X16:
-    case TX_16X4: return (val + (val < 0)) >> 1;
-    case TX_8X16:
-    case TX_16X8: return val;
-    case TX_8X32:
-    case TX_32X8: return ROUND_POWER_OF_TWO_SIGNED(val, 2);
-    default: assert(0); break;
-  }
-  return 0;
-}
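These two helpers mirror the scaling already hard-coded in the av1_fht* functions below, so the LGT path lands on the same coefficient gain as the butterfly path. A hypothetical spot check for the TX_4X4 pairing (shift up by 16 before the transforms, divide by roughly 4 after, for a net factor of 4 around the two 1-D kernels):

#include <assert.h>
/* Hypothetical spot check of the TX_4X4 up/down pairing. The real code
 * applies the downscale to transformed coefficients, but the composed
 * arithmetic is the same. */
static void check_tx4x4_net_scale(void) {
  const tran_high_t v = 3;
  tran_low_t up = (tran_low_t)v << 4;  /* fwd_upscale_wrt_txsize(TX_4X4)   */
  tran_low_t down = (up + 1) >> 2;     /* fwd_downscale_wrt_txsize(TX_4X4) */
  assert(down == v * 4);               /* net factor of 4 for v >= 0 */
}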
-
-void flgt2d_from_pred_c(const int16_t *input, tran_low_t *output, int stride,
-                        TxfmParam *txfm_param) {
-  const TX_SIZE tx_size = txfm_param->tx_size;
-  const int w = tx_size_wide[tx_size];
-  const int h = tx_size_high[tx_size];
-  const int wlog2 = tx_size_wide_log2[tx_size];
-  const int hlog2 = tx_size_high_log2[tx_size];
-  assert(w <= 8 || h <= 8);
-
-  int i, j;
-  tran_low_t out[256];  // max size: 8x32 and 32x8
-  tran_low_t temp_in[32], temp_out[32];
-  const tran_high_t *lgtmtx_col[1];
-  const tran_high_t *lgtmtx_row[1];
-  get_lgt_func[hlog2 - 2](txfm_param, 1, lgtmtx_col, w);
-  get_lgt_func[wlog2 - 2](txfm_param, 0, lgtmtx_row, h);
-
-  // For forward transforms, to be consistent with av1_fht functions, we apply
-  // short transform first and long transform second.
-  if (w < h) {
-    // Row transforms
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; ++j)
-        temp_in[j] = fwd_upscale_wrt_txsize(input[i * stride + j], tx_size);
-      flgt_func[wlog2 - 2](temp_in, temp_out, lgtmtx_row[0]);
-      // right shift of 2 bits here in fht8x16 and fht16x8
-      for (j = 0; j < w; ++j)
-        out[j * h + i] = (tx_size == TX_16X8 || tx_size == TX_8X16)
-                             ? ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2)
-                             : temp_out[j];
-    }
-    // Column transforms
-    for (i = 0; i < w; ++i) {
-      for (j = 0; j < h; ++j) temp_in[j] = out[j + i * h];
-      flgt_func[hlog2 - 2](temp_in, temp_out, lgtmtx_col[0]);
-      for (j = 0; j < h; ++j)
-        output[j * w + i] = fwd_downscale_wrt_txsize(temp_out[j], tx_size);
-    }
-  } else {
-    // Column transforms
-    for (i = 0; i < w; ++i) {
-      for (j = 0; j < h; ++j)
-        temp_in[j] = fwd_upscale_wrt_txsize(input[j * stride + i], tx_size);
-      flgt_func[hlog2 - 2](temp_in, temp_out, lgtmtx_col[0]);
-      // fht8x16 and fht16x8 have right shift of 2 bits here
-      for (j = 0; j < h; ++j)
-        out[j * w + i] = (tx_size == TX_16X8 || tx_size == TX_8X16)
-                             ? ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2)
-                             : temp_out[j];
-    }
-    // Row transforms
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; ++j) temp_in[j] = out[j + i * w];
-      flgt_func[wlog2 - 2](temp_in, temp_out, lgtmtx_row[0]);
-      for (j = 0; j < w; ++j)
-        output[j + i * w] = fwd_downscale_wrt_txsize(temp_out[j], tx_size);
-    }
-  }
-}
-#endif  // CONFIG_LGT_FROM_PRED
-
-#if CONFIG_EXT_TX
-// TODO(sarahparker) these functions will be removed once the highbitdepth
-// codepath works properly for rectangular transforms. They have almost
-// identical versions in av1_fwd_txfm1d.c, but those are currently only
-// being used for square transforms.
-static void fidtx4(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 4; ++i) {
-    output[i] = (tran_low_t)fdct_round_shift(input[i] * Sqrt2);
-  }
-}
-
-static void fidtx8(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 8; ++i) {
-    output[i] = input[i] * 2;
-  }
-}
-
-static void fidtx16(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 16; ++i) {
-    output[i] = (tran_low_t)fdct_round_shift(input[i] * 2 * Sqrt2);
-  }
-}
-
-static void fidtx32(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 32; ++i) {
-    output[i] = input[i] * 4;
-  }
-}
-
-static void copy_block(const int16_t *src, int src_stride, int l, int w,
-                       int16_t *dest, int dest_stride) {
-  int i;
-  for (i = 0; i < l; ++i) {
-    memcpy(dest + dest_stride * i, src + src_stride * i, w * sizeof(int16_t));
-  }
-}
-
-static void fliplr(int16_t *dest, int stride, int l, int w) {
-  int i, j;
-  for (i = 0; i < l; ++i) {
-    for (j = 0; j < w / 2; ++j) {
-      const int16_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[i * stride + w - 1 - j];
-      dest[i * stride + w - 1 - j] = tmp;
-    }
-  }
-}
-
-static void flipud(int16_t *dest, int stride, int l, int w) {
-  int i, j;
-  for (j = 0; j < w; ++j) {
-    for (i = 0; i < l / 2; ++i) {
-      const int16_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[(l - 1 - i) * stride + j];
-      dest[(l - 1 - i) * stride + j] = tmp;
-    }
-  }
-}
-
-static void fliplrud(int16_t *dest, int stride, int l, int w) {
-  int i, j;
-  for (i = 0; i < l / 2; ++i) {
-    for (j = 0; j < w; ++j) {
-      const int16_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[(l - 1 - i) * stride + w - 1 - j];
-      dest[(l - 1 - i) * stride + w - 1 - j] = tmp;
-    }
-  }
-}
-
-static void copy_fliplr(const int16_t *src, int src_stride, int l, int w,
-                        int16_t *dest, int dest_stride) {
-  copy_block(src, src_stride, l, w, dest, dest_stride);
-  fliplr(dest, dest_stride, l, w);
-}
-
-static void copy_flipud(const int16_t *src, int src_stride, int l, int w,
-                        int16_t *dest, int dest_stride) {
-  copy_block(src, src_stride, l, w, dest, dest_stride);
-  flipud(dest, dest_stride, l, w);
-}
-
-static void copy_fliplrud(const int16_t *src, int src_stride, int l, int w,
-                          int16_t *dest, int dest_stride) {
-  copy_block(src, src_stride, l, w, dest, dest_stride);
-  fliplrud(dest, dest_stride, l, w);
-}
-
-static void maybe_flip_input(const int16_t **src, int *src_stride, int l,
-                             int w, int16_t *buff, TX_TYPE tx_type) {
-  switch (tx_type) {
-#if CONFIG_MRC_TX
-    case MRC_DCT:
-#endif  // CONFIG_MRC_TX
-    case DCT_DCT:
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-    case IDTX:
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST: break;
-    case FLIPADST_DCT:
-    case FLIPADST_ADST:
-    case V_FLIPADST:
-      copy_flipud(*src, *src_stride, l, w, buff, w);
-      *src = buff;
-      *src_stride = w;
-      break;
-    case DCT_FLIPADST:
-    case ADST_FLIPADST:
-    case H_FLIPADST:
-      copy_fliplr(*src, *src_stride, l, w, buff, w);
-      *src = buff;
-      *src_stride = w;
-      break;
-    case FLIPADST_FLIPADST:
-      copy_fliplrud(*src, *src_stride, l, w, buff, w);
-      *src = buff;
-      *src_stride = w;
-      break;
-    default: assert(0); break;
-  }
-}
-#endif  // CONFIG_EXT_TX
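maybe_flip_input() realises every FLIPADST variant with one flip of the residual block up front followed by the ordinary ADST kernels: FLIPADST_DCT on src is just ADST_DCT on flipud(src), and so on. A standalone illustration of the vertical flip on a 4x4 block:

/* Standalone illustration of the flip trick: FLIPADST along the vertical
 * axis is realised by reversing the rows before the regular ADST runs. */
static void demo_flipud_4x4(void) {
  int16_t blk[16];
  for (int i = 0; i < 16; ++i) blk[i] = (int16_t)i;  /* row i holds 4i..4i+3 */
  /* same swap loop as flipud() above, with stride = w = l = 4 */
  for (int j = 0; j < 4; ++j)
    for (int i = 0; i < 2; ++i) {
      const int16_t t = blk[i * 4 + j];
      blk[i * 4 + j] = blk[(3 - i) * 4 + j];
      blk[(3 - i) * 4 + j] = t;
    }
  /* rows now appear in the order 3, 2, 1, 0 */
}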
-
-void av1_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
-                  TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-#if !CONFIG_DAALA_DCT4
-  if (tx_type == DCT_DCT) {
-    aom_fdct4x4_c(input, output, stride);
-    return;
-  }
-#endif
-  {
-    static const transform_2d FHT[] = {
-#if CONFIG_DAALA_DCT4
-      { daala_fdct4, daala_fdct4 },  // DCT_DCT
-      { daala_fdst4, daala_fdct4 },  // ADST_DCT
-      { daala_fdct4, daala_fdst4 },  // DCT_ADST
-      { daala_fdst4, daala_fdst4 },  // ADST_ADST
-#if CONFIG_EXT_TX
-      { daala_fdst4, daala_fdct4 },  // FLIPADST_DCT
-      { daala_fdct4, daala_fdst4 },  // DCT_FLIPADST
-      { daala_fdst4, daala_fdst4 },  // FLIPADST_FLIPADST
-      { daala_fdst4, daala_fdst4 },  // ADST_FLIPADST
-      { daala_fdst4, daala_fdst4 },  // FLIPADST_ADST
-      { daala_idtx4, daala_idtx4 },  // IDTX
-      { daala_fdct4, daala_idtx4 },  // V_DCT
-      { daala_idtx4, daala_fdct4 },  // H_DCT
-      { daala_fdst4, daala_idtx4 },  // V_ADST
-      { daala_idtx4, daala_fdst4 },  // H_ADST
-      { daala_fdst4, daala_idtx4 },  // V_FLIPADST
-      { daala_idtx4, daala_fdst4 },  // H_FLIPADST
-#endif
-#else
-      { fdct4, fdct4 },    // DCT_DCT
-      { fadst4, fdct4 },   // ADST_DCT
-      { fdct4, fadst4 },   // DCT_ADST
-      { fadst4, fadst4 },  // ADST_ADST
-#if CONFIG_EXT_TX
-      { fadst4, fdct4 },   // FLIPADST_DCT
-      { fdct4, fadst4 },   // DCT_FLIPADST
-      { fadst4, fadst4 },  // FLIPADST_FLIPADST
-      { fadst4, fadst4 },  // ADST_FLIPADST
-      { fadst4, fadst4 },  // FLIPADST_ADST
-      { fidtx4, fidtx4 },  // IDTX
-      { fdct4, fidtx4 },   // V_DCT
-      { fidtx4, fdct4 },   // H_DCT
-      { fadst4, fidtx4 },  // V_ADST
-      { fidtx4, fadst4 },  // H_ADST
-      { fadst4, fidtx4 },  // V_FLIPADST
-      { fidtx4, fadst4 },  // H_FLIPADST
-#endif
-#endif
-    };
-    const transform_2d ht = FHT[tx_type];
-    tran_low_t out[4 * 4];
-    int i, j;
-    tran_low_t temp_in[4], temp_out[4];
-
-#if CONFIG_EXT_TX
-    int16_t flipped_input[4 * 4];
-    maybe_flip_input(&input, &stride, 4, 4, flipped_input, tx_type);
-#endif
-
-#if CONFIG_LGT
-    // Choose LGT adaptive to the prediction. We may apply different LGTs for
-    // different rows/columns, indicated by the pointers to 2D arrays
-    const tran_high_t *lgtmtx_col[1];
-    const tran_high_t *lgtmtx_row[1];
-    int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col);
-    int use_lgt_row = get_lgt4(txfm_param, 0, lgtmtx_row);
-#endif
-
-    // Columns
-    for (i = 0; i < 4; ++i) {
-      /* A C99-safe upshift by 4 for both Daala and VPx TX. */
-      for (j = 0; j < 4; ++j) temp_in[j] = input[j * stride + i] * 16;
-#if !CONFIG_DAALA_DCT4
-      if (i == 0 && temp_in[0]) temp_in[0] += 1;
-#endif
-#if CONFIG_LGT
-      if (use_lgt_col)
-        flgt4(temp_in, temp_out, lgtmtx_col[0]);
-      else
-#endif
-        ht.cols(temp_in, temp_out);
-      for (j = 0; j < 4; ++j) out[j * 4 + i] = temp_out[j];
-    }
-
-    // Rows
-    for (i = 0; i < 4; ++i) {
-      for (j = 0; j < 4; ++j) temp_in[j] = out[j + i * 4];
-#if CONFIG_LGT
-      if (use_lgt_row)
-        flgt4(temp_in, temp_out, lgtmtx_row[0]);
-      else
-#endif
-        ht.rows(temp_in, temp_out);
-#if CONFIG_DAALA_DCT4
-      /* Daala TX has orthonormal scaling; shift down by only 1 to achieve
-         the usual VPx coefficient left-shift of 3. */
-      for (j = 0; j < 4; ++j) output[j + i * 4] = temp_out[j] >> 1;
-#else
-      for (j = 0; j < 4; ++j) output[j + i * 4] = (temp_out[j] + 1) >> 2;
-#endif
-    }
-  }
-}
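Each av1_fht* function below follows the same separable pattern: select a (cols, rows) kernel pair from the FHT table, run a 1-D pass per column into a transposed scratch block, then a 1-D pass per row. A condensed sketch of that skeleton, assuming transform_1d/transform_2d match their declarations earlier in this file (outside this hunk):

/* Condensed shape of every av1_fht* body below. The typedefs are assumed
 * to match the declarations near the top of dct.c. */
typedef void (*transform_1d)(const tran_low_t *, tran_low_t *);
typedef struct {
  transform_1d cols, rows;
} transform_2d;

static void fht_nxn_sketch(const int16_t *input, tran_low_t *output,
                           int stride, int n, transform_2d ht) {
  tran_low_t out[64 * 64], in1d[64], out1d[64];
  for (int i = 0; i < n; ++i) {  /* Columns */
    for (int j = 0; j < n; ++j) in1d[j] = input[j * stride + i];  /* + upscale */
    ht.cols(in1d, out1d);
    for (int j = 0; j < n; ++j) out[j * n + i] = out1d[j];  /* transposed */
  }
  for (int i = 0; i < n; ++i) {  /* Rows */
    for (int j = 0; j < n; ++j) in1d[j] = out[j + i * n];
    ht.rows(in1d, out1d);
    for (int j = 0; j < n; ++j) output[j + i * n] = out1d[j];  /* + downscale */
  }
}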
-
-void av1_fht4x8_c(const int16_t *input, tran_low_t *output, int stride,
-                  TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-    { fdct8, fdct4 },    // DCT_DCT
-    { fadst8, fdct4 },   // ADST_DCT
-    { fdct8, fadst4 },   // DCT_ADST
-    { fadst8, fadst4 },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { fadst8, fdct4 },   // FLIPADST_DCT
-    { fdct8, fadst4 },   // DCT_FLIPADST
-    { fadst8, fadst4 },  // FLIPADST_FLIPADST
-    { fadst8, fadst4 },  // ADST_FLIPADST
-    { fadst8, fadst4 },  // FLIPADST_ADST
-    { fidtx8, fidtx4 },  // IDTX
-    { fdct8, fidtx4 },   // V_DCT
-    { fidtx8, fdct4 },   // H_DCT
-    { fadst8, fidtx4 },  // V_ADST
-    { fidtx8, fadst4 },  // H_ADST
-    { fadst8, fidtx4 },  // V_FLIPADST
-    { fidtx8, fadst4 },  // H_FLIPADST
-#endif
-  };
-  const transform_2d ht = FHT[tx_type];
-  const int n = 4;
-  const int n2 = 8;
-  tran_low_t out[8 * 4];
-  tran_low_t temp_in[8], temp_out[8];
-  int i, j;
-#if CONFIG_EXT_TX
-  int16_t flipped_input[8 * 4];
-  maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type);
-#endif
-
-#if CONFIG_LGT
-  const tran_high_t *lgtmtx_col[1];
-  const tran_high_t *lgtmtx_row[1];
-  int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col);
-  int use_lgt_row = get_lgt4(txfm_param, 0, lgtmtx_row);
-#endif
-
-  // Rows
-  for (i = 0; i < n2; ++i) {
-    for (j = 0; j < n; ++j)
-      temp_in[j] =
-          (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
-#if CONFIG_LGT
-    if (use_lgt_row)
-      flgt4(temp_in, temp_out, lgtmtx_row[0]);
-    else
-#endif
-      ht.rows(temp_in, temp_out);
-    for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j];
-  }
-
-  // Columns
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
-#if CONFIG_LGT
-    if (use_lgt_col)
-      flgt8(temp_in, temp_out, lgtmtx_col[0]);
-    else
-#endif
-      ht.cols(temp_in, temp_out);
-    for (j = 0; j < n2; ++j)
-      output[i + j * n] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-  }
-  // Note: overall scale factor of transform is 8 times unitary
-}
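The trailing "8 times unitary" note can be reproduced from the pieces: the row upscale contributes 4*sqrt(2), each 1-D kernel has gain sqrt(N/2) over an orthonormal transform (sqrt(2) for the 4-point, 2 for the 8-point), and the final shift halves the result:

    4*sqrt(2) * sqrt(2) * 2 / 2 = 8

The fidtx4/8/16/32 scalings above (sqrt(2), 2, 2*sqrt(2), 4) reproduce exactly that sqrt(N/2) gain, which is what keeps IDTX rows or columns on the same scale as their DCT/ADST counterparts.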
-
-void av1_fht8x4_c(const int16_t *input, tran_low_t *output, int stride,
-                  TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-    { fdct4, fdct8 },    // DCT_DCT
-    { fadst4, fdct8 },   // ADST_DCT
-    { fdct4, fadst8 },   // DCT_ADST
-    { fadst4, fadst8 },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { fadst4, fdct8 },   // FLIPADST_DCT
-    { fdct4, fadst8 },   // DCT_FLIPADST
-    { fadst4, fadst8 },  // FLIPADST_FLIPADST
-    { fadst4, fadst8 },  // ADST_FLIPADST
-    { fadst4, fadst8 },  // FLIPADST_ADST
-    { fidtx4, fidtx8 },  // IDTX
-    { fdct4, fidtx8 },   // V_DCT
-    { fidtx4, fdct8 },   // H_DCT
-    { fadst4, fidtx8 },  // V_ADST
-    { fidtx4, fadst8 },  // H_ADST
-    { fadst4, fidtx8 },  // V_FLIPADST
-    { fidtx4, fadst8 },  // H_FLIPADST
-#endif
-  };
-  const transform_2d ht = FHT[tx_type];
-  const int n = 4;
-  const int n2 = 8;
-  tran_low_t out[8 * 4];
-  tran_low_t temp_in[8], temp_out[8];
-  int i, j;
-#if CONFIG_EXT_TX
-  int16_t flipped_input[8 * 4];
-  maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
-#endif
-
-#if CONFIG_LGT
-  const tran_high_t *lgtmtx_col[1];
-  const tran_high_t *lgtmtx_row[1];
-  int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col);
-  int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row);
-#endif
-
-  // Columns
-  for (i = 0; i < n2; ++i) {
-    for (j = 0; j < n; ++j)
-      temp_in[j] =
-          (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
-#if CONFIG_LGT
-    if (use_lgt_col)
-      flgt4(temp_in, temp_out, lgtmtx_col[0]);
-    else
-#endif
-      ht.cols(temp_in, temp_out);
-    for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j];
-  }
-
-  // Rows
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
-#if CONFIG_LGT
-    if (use_lgt_row)
-      flgt8(temp_in, temp_out, lgtmtx_row[0]);
-    else
-#endif
-      ht.rows(temp_in, temp_out);
-    for (j = 0; j < n2; ++j)
-      output[j + i * n2] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-  }
-  // Note: overall scale factor of transform is 8 times unitary
-}
-
-void av1_fht4x16_c(const int16_t *input, tran_low_t *output, int stride,
-                   TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-    { fdct16, fdct4 },    // DCT_DCT
-    { fadst16, fdct4 },   // ADST_DCT
-    { fdct16, fadst4 },   // DCT_ADST
-    { fadst16, fadst4 },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { fadst16, fdct4 },   // FLIPADST_DCT
-    { fdct16, fadst4 },   // DCT_FLIPADST
-    { fadst16, fadst4 },  // FLIPADST_FLIPADST
-    { fadst16, fadst4 },  // ADST_FLIPADST
-    { fadst16, fadst4 },  // FLIPADST_ADST
-    { fidtx16, fidtx4 },  // IDTX
-    { fdct16, fidtx4 },   // V_DCT
-    { fidtx16, fdct4 },   // H_DCT
-    { fadst16, fidtx4 },  // V_ADST
-    { fidtx16, fadst4 },  // H_ADST
-    { fadst16, fidtx4 },  // V_FLIPADST
-    { fidtx16, fadst4 },  // H_FLIPADST
-#endif
-  };
-  const transform_2d ht = FHT[tx_type];
-  const int n = 4;
-  const int n4 = 16;
-  tran_low_t out[16 * 4];
-  tran_low_t temp_in[16], temp_out[16];
-  int i, j;
-#if CONFIG_EXT_TX
-  int16_t flipped_input[16 * 4];
-  maybe_flip_input(&input, &stride, n4, n, flipped_input, tx_type);
-#endif
-
-#if CONFIG_LGT
-  const tran_high_t *lgtmtx_row[1];
-  int use_lgt_row = get_lgt4(txfm_param, 0, lgtmtx_row);
-#endif
-
-  // Rows
-  for (i = 0; i < n4; ++i) {
-    for (j = 0; j < n; ++j) temp_in[j] = input[i * stride + j] * 4;
-#if CONFIG_LGT
-    if (use_lgt_row)
-      flgt4(temp_in, temp_out, lgtmtx_row[0]);
-    else
-#endif
-      ht.rows(temp_in, temp_out);
-    for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j];
-  }
-
-  // Columns
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4];
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < n4; ++j)
-      output[i + j * n] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-  }
-  // Note: overall scale factor of transform is 8 times unitary
-}
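Both functions above finish with (x + (x < 0)) >> 1, a branch-free halving that rounds toward zero rather than toward negative infinity, as a bare arithmetic shift would. Two asserts make the difference concrete:

#include <assert.h>
/* (x + (x < 0)) >> 1 == x / 2 with truncation toward zero. */
static void demo_round_toward_zero(void) {
  assert(((-5 + (-5 < 0)) >> 1) == -2);  /* plain -5 >> 1 would give -3 */
  assert((( 5 + ( 5 < 0)) >> 1) ==  2);
}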
-
-void av1_fht16x4_c(const int16_t *input, tran_low_t *output, int stride,
-                   TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-    { fdct4, fdct16 },    // DCT_DCT
-    { fadst4, fdct16 },   // ADST_DCT
-    { fdct4, fadst16 },   // DCT_ADST
-    { fadst4, fadst16 },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { fadst4, fdct16 },   // FLIPADST_DCT
-    { fdct4, fadst16 },   // DCT_FLIPADST
-    { fadst4, fadst16 },  // FLIPADST_FLIPADST
-    { fadst4, fadst16 },  // ADST_FLIPADST
-    { fadst4, fadst16 },  // FLIPADST_ADST
-    { fidtx4, fidtx16 },  // IDTX
-    { fdct4, fidtx16 },   // V_DCT
-    { fidtx4, fdct16 },   // H_DCT
-    { fadst4, fidtx16 },  // V_ADST
-    { fidtx4, fadst16 },  // H_ADST
-    { fadst4, fidtx16 },  // V_FLIPADST
-    { fidtx4, fadst16 },  // H_FLIPADST
-#endif
-  };
-  const transform_2d ht = FHT[tx_type];
-  const int n = 4;
-  const int n4 = 16;
-  tran_low_t out[16 * 4];
-  tran_low_t temp_in[16], temp_out[16];
-  int i, j;
-#if CONFIG_EXT_TX
-  int16_t flipped_input[16 * 4];
-  maybe_flip_input(&input, &stride, n, n4, flipped_input, tx_type);
-#endif
-
-#if CONFIG_LGT
-  const tran_high_t *lgtmtx_col[1];
-  int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col);
-#endif
-
-  // Columns
-  for (i = 0; i < n4; ++i) {
-    for (j = 0; j < n; ++j) temp_in[j] = input[j * stride + i] * 4;
-#if CONFIG_LGT
-    if (use_lgt_col)
-      flgt4(temp_in, temp_out, lgtmtx_col[0]);
-    else
-#endif
-      ht.cols(temp_in, temp_out);
-    for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j];
-  }
-
-  // Rows
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4];
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < n4; ++j)
-      output[j + i * n4] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-  }
-  // Note: overall scale factor of transform is 8 times unitary
-}
-
-void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
-                   TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-    { fdct16, fdct8 },    // DCT_DCT
-    { fadst16, fdct8 },   // ADST_DCT
-    { fdct16, fadst8 },   // DCT_ADST
-    { fadst16, fadst8 },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { fadst16, fdct8 },   // FLIPADST_DCT
-    { fdct16, fadst8 },   // DCT_FLIPADST
-    { fadst16, fadst8 },  // FLIPADST_FLIPADST
-    { fadst16, fadst8 },  // ADST_FLIPADST
-    { fadst16, fadst8 },  // FLIPADST_ADST
-    { fidtx16, fidtx8 },  // IDTX
-    { fdct16, fidtx8 },   // V_DCT
-    { fidtx16, fdct8 },   // H_DCT
-    { fadst16, fidtx8 },  // V_ADST
-    { fidtx16, fadst8 },  // H_ADST
-    { fadst16, fidtx8 },  // V_FLIPADST
-    { fidtx16, fadst8 },  // H_FLIPADST
-#endif
-  };
-  const transform_2d ht = FHT[tx_type];
-  const int n = 8;
-  const int n2 = 16;
-  tran_low_t out[16 * 8];
-  tran_low_t temp_in[16], temp_out[16];
-  int i, j;
-#if CONFIG_EXT_TX
-  int16_t flipped_input[16 * 8];
-  maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type);
-#endif
-
-#if CONFIG_LGT
-  const tran_high_t *lgtmtx_row[1];
-  int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row);
-#endif
-
-  // Rows
-  for (i = 0; i < n2; ++i) {
-    for (j = 0; j < n; ++j)
-      temp_in[j] =
-          (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
-#if CONFIG_LGT
-    if (use_lgt_row)
-      flgt8(temp_in, temp_out, lgtmtx_row[0]);
-    else
-#endif
-      ht.rows(temp_in, temp_out);
-    for (j = 0; j < n; ++j)
-      out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-  }
-
-  // Columns
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < n2; ++j) output[i + j * n] = temp_out[j];
-  }
-  // Note: overall scale factor of transform is 8 times unitary
-}
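From the 8x16 size upward the code trades the cheap truncating shift for ROUND_POWER_OF_TWO_SIGNED, which rounds the magnitude to nearest. The definitions as they appear in aom_dsp/aom_dsp_common.h (reproduced here from memory; verify before relying on them):

/* Round-to-nearest shift, plus a signed wrapper that applies it to the
 * magnitude so positive and negative inputs round symmetrically. */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))
#define ROUND_POWER_OF_TWO_SIGNED(value, n)           \
  (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
                 : ROUND_POWER_OF_TWO((value), (n)))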
-
-void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride,
-                   TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-    { fdct8, fdct16 },    // DCT_DCT
-    { fadst8, fdct16 },   // ADST_DCT
-    { fdct8, fadst16 },   // DCT_ADST
-    { fadst8, fadst16 },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { fadst8, fdct16 },   // FLIPADST_DCT
-    { fdct8, fadst16 },   // DCT_FLIPADST
-    { fadst8, fadst16 },  // FLIPADST_FLIPADST
-    { fadst8, fadst16 },  // ADST_FLIPADST
-    { fadst8, fadst16 },  // FLIPADST_ADST
-    { fidtx8, fidtx16 },  // IDTX
-    { fdct8, fidtx16 },   // V_DCT
-    { fidtx8, fdct16 },   // H_DCT
-    { fadst8, fidtx16 },  // V_ADST
-    { fidtx8, fadst16 },  // H_ADST
-    { fadst8, fidtx16 },  // V_FLIPADST
-    { fidtx8, fadst16 },  // H_FLIPADST
-#endif
-  };
-  const transform_2d ht = FHT[tx_type];
-  const int n = 8;
-  const int n2 = 16;
-  tran_low_t out[16 * 8];
-  tran_low_t temp_in[16], temp_out[16];
-  int i, j;
-#if CONFIG_EXT_TX
-  int16_t flipped_input[16 * 8];
-  maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
-#endif
-
-#if CONFIG_LGT
-  const tran_high_t *lgtmtx_col[1];
-  int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col);
-#endif
-
-  // Columns
-  for (i = 0; i < n2; ++i) {
-    for (j = 0; j < n; ++j)
-      temp_in[j] =
-          (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
-#if CONFIG_LGT
-    if (use_lgt_col)
-      flgt8(temp_in, temp_out, lgtmtx_col[0]);
-    else
-#endif
-      ht.cols(temp_in, temp_out);
-    for (j = 0; j < n; ++j)
-      out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-  }
-
-  // Rows
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j];
-  }
-  // Note: overall scale factor of transform is 8 times unitary
-}
-
-void av1_fht8x32_c(const int16_t *input, tran_low_t *output, int stride,
-                   TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-    { fdct32, fdct8 },         // DCT_DCT
-    { fhalfright32, fdct8 },   // ADST_DCT
-    { fdct32, fadst8 },        // DCT_ADST
-    { fhalfright32, fadst8 },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { fhalfright32, fdct8 },   // FLIPADST_DCT
-    { fdct32, fadst8 },        // DCT_FLIPADST
-    { fhalfright32, fadst8 },  // FLIPADST_FLIPADST
-    { fhalfright32, fadst8 },  // ADST_FLIPADST
-    { fhalfright32, fadst8 },  // FLIPADST_ADST
-    { fidtx32, fidtx8 },       // IDTX
-    { fdct32, fidtx8 },        // V_DCT
-    { fidtx32, fdct8 },        // H_DCT
-    { fhalfright32, fidtx8 },  // V_ADST
-    { fidtx32, fadst8 },       // H_ADST
-    { fhalfright32, fidtx8 },  // V_FLIPADST
-    { fidtx32, fadst8 },       // H_FLIPADST
-#endif
-  };
-  const transform_2d ht = FHT[tx_type];
-  const int n = 8;
-  const int n4 = 32;
-  tran_low_t out[32 * 8];
-  tran_low_t temp_in[32], temp_out[32];
-  int i, j;
-#if CONFIG_EXT_TX
-  int16_t flipped_input[32 * 8];
-  maybe_flip_input(&input, &stride, n4, n, flipped_input, tx_type);
-#endif
-
-#if CONFIG_LGT
-  const tran_high_t *lgtmtx_row[1];
-  int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row);
-#endif
-
-  // Rows
-  for (i = 0; i < n4; ++i) {
-    for (j = 0; j < n; ++j) temp_in[j] = input[i * stride + j] * 4;
-#if CONFIG_LGT
-    if (use_lgt_row)
-      flgt8(temp_in, temp_out, lgtmtx_row[0]);
-    else
-#endif
-      ht.rows(temp_in, temp_out);
-    for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j];
-  }
-
-  // Columns
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4];
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < n4; ++j)
-      output[i + j * n] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-  }
-  // Note: overall scale factor of transform is 8 times unitary
-}
-
-void av1_fht32x8_c(const int16_t *input, tran_low_t *output, int stride,
-                   TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-    { fdct8, fdct32 },         // DCT_DCT
-    { fadst8, fdct32 },        // ADST_DCT
-    { fdct8, fhalfright32 },   // DCT_ADST
-    { fadst8, fhalfright32 },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { fadst8, fdct32 },        // FLIPADST_DCT
-    { fdct8, fhalfright32 },   // DCT_FLIPADST
-    { fadst8, fhalfright32 },  // FLIPADST_FLIPADST
-    { fadst8, fhalfright32 },  // ADST_FLIPADST
-    { fadst8, fhalfright32 },  // FLIPADST_ADST
-    { fidtx8, fidtx32 },       // IDTX
-    { fdct8, fidtx32 },        // V_DCT
-    { fidtx8, fdct32 },        // H_DCT
-    { fadst8, fidtx32 },       // V_ADST
-    { fidtx8, fhalfright32 },  // H_ADST
-    { fadst8, fidtx32 },       // V_FLIPADST
-    { fidtx8, fhalfright32 },  // H_FLIPADST
-#endif
-  };
-  const transform_2d ht = FHT[tx_type];
-  const int n = 8;
-  const int n4 = 32;
-  tran_low_t out[32 * 8];
-  tran_low_t temp_in[32], temp_out[32];
-  int i, j;
-#if CONFIG_EXT_TX
-  int16_t flipped_input[32 * 8];
-  maybe_flip_input(&input, &stride, n, n4, flipped_input, tx_type);
-#endif
-
-#if CONFIG_LGT
-  const tran_high_t *lgtmtx_col[1];
-  int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col);
-#endif
-
-  // Columns
-  for (i = 0; i < n4; ++i) {
-    for (j = 0; j < n; ++j) temp_in[j] = input[j * stride + i] * 4;
-#if CONFIG_LGT
-    if (use_lgt_col)
-      flgt8(temp_in, temp_out, lgtmtx_col[0]);
-    else
-#endif
-      ht.cols(temp_in, temp_out);
-    for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j];
-  }
-
-  // Rows
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4];
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < n4; ++j)
-      output[j + i * n4] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-  }
-  // Note: overall scale factor of transform is 8 times unitary
-}
-
-void av1_fht16x32_c(const int16_t *input, tran_low_t *output, int stride,
-                    TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-    { fdct32, fdct16 },         // DCT_DCT
-    { fhalfright32, fdct16 },   // ADST_DCT
-    { fdct32, fadst16 },        // DCT_ADST
-    { fhalfright32, fadst16 },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { fhalfright32, fdct16 },   // FLIPADST_DCT
-    { fdct32, fadst16 },        // DCT_FLIPADST
-    { fhalfright32, fadst16 },  // FLIPADST_FLIPADST
-    { fhalfright32, fadst16 },  // ADST_FLIPADST
-    { fhalfright32, fadst16 },  // FLIPADST_ADST
-    { fidtx32, fidtx16 },       // IDTX
-    { fdct32, fidtx16 },        // V_DCT
-    { fidtx32, fdct16 },        // H_DCT
-    { fhalfright32, fidtx16 },  // V_ADST
-    { fidtx32, fadst16 },       // H_ADST
-    { fhalfright32, fidtx16 },  // V_FLIPADST
-    { fidtx32, fadst16 },       // H_FLIPADST
-#endif
-  };
-  const transform_2d ht = FHT[tx_type];
-  const int n = 16;
-  const int n2 = 32;
-  tran_low_t out[32 * 16];
-  tran_low_t temp_in[32], temp_out[32];
-  int i, j;
-#if CONFIG_EXT_TX
-  int16_t flipped_input[32 * 16];
-  maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type);
-#endif
-
-  // Rows
-  for (i = 0; i < n2; ++i) {
-    for (j = 0; j < n; ++j)
-      temp_in[j] =
-          (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < n; ++j)
-      out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
-  }
-
-  // Columns
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < n2; ++j) output[i + j * n] = temp_out[j];
-  }
-  // Note: overall scale factor of transform is 4 times unitary
-}
-
-void av1_fht32x16_c(const int16_t *input, tran_low_t *output, int stride,
-                    TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-    { fdct16, fdct32 },         // DCT_DCT
-    { fadst16, fdct32 },        // ADST_DCT
-    { fdct16, fhalfright32 },   // DCT_ADST
-    { fadst16, fhalfright32 },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { fadst16, fdct32 },        // FLIPADST_DCT
-    { fdct16, fhalfright32 },   // DCT_FLIPADST
-    { fadst16, fhalfright32 },  // FLIPADST_FLIPADST
-    { fadst16, fhalfright32 },  // ADST_FLIPADST
-    { fadst16, fhalfright32 },  // FLIPADST_ADST
-    { fidtx16, fidtx32 },       // IDTX
-    { fdct16, fidtx32 },        // V_DCT
-    { fidtx16, fdct32 },        // H_DCT
-    { fadst16, fidtx32 },       // V_ADST
-    { fidtx16, fhalfright32 },  // H_ADST
-    { fadst16, fidtx32 },       // V_FLIPADST
-    { fidtx16, fhalfright32 },  // H_FLIPADST
-#endif
-  };
-  const transform_2d ht = FHT[tx_type];
-  const int n = 16;
-  const int n2 = 32;
-  tran_low_t out[32 * 16];
-  tran_low_t temp_in[32], temp_out[32];
-  int i, j;
-#if CONFIG_EXT_TX
-  int16_t flipped_input[32 * 16];
-  maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
-#endif
-
-  // Columns
-  for (i = 0; i < n2; ++i) {
-    for (j = 0; j < n; ++j)
-      temp_in[j] =
-          (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < n; ++j)
-      out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
-  }
-
-  // Rows
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j];
-  }
-  // Note: overall scale factor of transform is 4 times unitary
-}
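The drop from "8 times unitary" to "4 times unitary" at the 16x32/32x16 sizes is again just bookkeeping: 4*sqrt(2) from the upscale times 2*sqrt(2) for the 16-point kernel gives 16, which the mid-pass shift by 4 bits cancels exactly, so the 32-point kernel's gain of 4 is all that remains:

    4*sqrt(2) * 2*sqrt(2) * 4 / 2^4 = 4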
-
-void av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
-                  TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-#if !CONFIG_DAALA_DCT8
-  if (tx_type == DCT_DCT) {
-    aom_fdct8x8_c(input, output, stride);
-    return;
-  }
-#endif
-  {
-    static const transform_2d FHT[] = {
-#if CONFIG_DAALA_DCT8
-      { daala_fdct8, daala_fdct8 },  // DCT_DCT
-      { daala_fdst8, daala_fdct8 },  // ADST_DCT
-      { daala_fdct8, daala_fdst8 },  // DCT_ADST
-      { daala_fdst8, daala_fdst8 },  // ADST_ADST
-#if CONFIG_EXT_TX
-      { daala_fdst8, daala_fdct8 },  // FLIPADST_DCT
-      { daala_fdct8, daala_fdst8 },  // DCT_FLIPADST
-      { daala_fdst8, daala_fdst8 },  // FLIPADST_FLIPADST
-      { daala_fdst8, daala_fdst8 },  // ADST_FLIPADST
-      { daala_fdst8, daala_fdst8 },  // FLIPADST_ADST
-      { daala_idtx8, daala_idtx8 },  // IDTX
-      { daala_fdct8, daala_idtx8 },  // V_DCT
-      { daala_idtx8, daala_fdct8 },  // H_DCT
-      { daala_fdst8, daala_idtx8 },  // V_ADST
-      { daala_idtx8, daala_fdst8 },  // H_ADST
-      { daala_fdst8, daala_idtx8 },  // V_FLIPADST
-      { daala_idtx8, daala_fdst8 },  // H_FLIPADST
-#endif
-#else
-      { fdct8, fdct8 },    // DCT_DCT
-      { fadst8, fdct8 },   // ADST_DCT
-      { fdct8, fadst8 },   // DCT_ADST
-      { fadst8, fadst8 },  // ADST_ADST
-#if CONFIG_EXT_TX
-      { fadst8, fdct8 },   // FLIPADST_DCT
-      { fdct8, fadst8 },   // DCT_FLIPADST
-      { fadst8, fadst8 },  // FLIPADST_FLIPADST
-      { fadst8, fadst8 },  // ADST_FLIPADST
-      { fadst8, fadst8 },  // FLIPADST_ADST
-      { fidtx8, fidtx8 },  // IDTX
-      { fdct8, fidtx8 },   // V_DCT
-      { fidtx8, fdct8 },   // H_DCT
-      { fadst8, fidtx8 },  // V_ADST
-      { fidtx8, fadst8 },  // H_ADST
-      { fadst8, fidtx8 },  // V_FLIPADST
-      { fidtx8, fadst8 },  // H_FLIPADST
-#endif
-#endif
-    };
-    const transform_2d ht = FHT[tx_type];
-    tran_low_t out[64];
-    int i, j;
-    tran_low_t temp_in[8], temp_out[8];
-
-#if CONFIG_EXT_TX
-    int16_t flipped_input[8 * 8];
-    maybe_flip_input(&input, &stride, 8, 8, flipped_input, tx_type);
-#endif
-
-#if CONFIG_LGT
-    const tran_high_t *lgtmtx_col[1];
-    const tran_high_t *lgtmtx_row[1];
-    int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col);
-    int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row);
-#endif
-
-    // Columns
-    for (i = 0; i < 8; ++i) {
-#if CONFIG_DAALA_DCT8
-      for (j = 0; j < 8; ++j) temp_in[j] = input[j * stride + i] * 16;
-#else
-      for (j = 0; j < 8; ++j) temp_in[j] = input[j * stride + i] * 4;
-#endif
-#if CONFIG_LGT
-      if (use_lgt_col)
-        flgt8(temp_in, temp_out, lgtmtx_col[0]);
-      else
-#endif
-        ht.cols(temp_in, temp_out);
-      for (j = 0; j < 8; ++j) out[j * 8 + i] = temp_out[j];
-    }
-
-    // Rows
-    for (i = 0; i < 8; ++i) {
-      for (j = 0; j < 8; ++j) temp_in[j] = out[j + i * 8];
-#if CONFIG_LGT
-      if (use_lgt_row)
-        flgt8(temp_in, temp_out, lgtmtx_row[0]);
-      else
-#endif
-        ht.rows(temp_in, temp_out);
-#if CONFIG_DAALA_DCT8
-      for (j = 0; j < 8; ++j)
-        output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-#else
-      for (j = 0; j < 8; ++j)
-        output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-#endif
-    }
-  }
-}
-
-/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
-   pixel. */
-void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
-  int i;
-  tran_high_t a1, b1, c1, d1, e1;
-  const int16_t *ip_pass0 = input;
-  const tran_low_t *ip = NULL;
-  tran_low_t *op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip_pass0[0 * stride];
-    b1 = ip_pass0[1 * stride];
-    c1 = ip_pass0[2 * stride];
-    d1 = ip_pass0[3 * stride];
-
-    a1 += b1;
-    d1 = d1 - c1;
-    e1 = (a1 - d1) >> 1;
-    b1 = e1 - b1;
-    c1 = e1 - c1;
-    a1 -= c1;
-    d1 += b1;
-    op[0] = (tran_low_t)a1;
-    op[4] = (tran_low_t)c1;
-    op[8] = (tran_low_t)d1;
-    op[12] = (tran_low_t)b1;
-
-    ip_pass0++;
-    op++;
-  }
-  ip = output;
-  op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0];
-    b1 = ip[1];
-    c1 = ip[2];
-    d1 = ip[3];
-
-    a1 += b1;
-    d1 -= c1;
-    e1 = (a1 - d1) >> 1;
-    b1 = e1 - b1;
-    c1 = e1 - c1;
-    a1 -= c1;
-    d1 += b1;
-    op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
-    op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
-    op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
-    op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
-
-    ip += 4;
-    op += 4;
-  }
-}
-
-void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
-                    TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-#if CONFIG_DAALA_DCT16
-    { daala_fdct16, daala_fdct16 },  // DCT_DCT
-    { daala_fdst16, daala_fdct16 },  // ADST_DCT
-    { daala_fdct16, daala_fdst16 },  // DCT_ADST
-    { daala_fdst16, daala_fdst16 },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { daala_fdst16, daala_fdct16 },  // FLIPADST_DCT
-    { daala_fdct16, daala_fdst16 },  // DCT_FLIPADST
-    { daala_fdst16, daala_fdst16 },  // FLIPADST_FLIPADST
-    { daala_fdst16, daala_fdst16 },  // ADST_FLIPADST
-    { daala_fdst16, daala_fdst16 },  // FLIPADST_ADST
-    { daala_idtx16, daala_idtx16 },  // IDTX
-    { daala_fdct16, daala_idtx16 },  // V_DCT
-    { daala_idtx16, daala_fdct16 },  // H_DCT
-    { daala_fdst16, daala_idtx16 },  // V_ADST
-    { daala_idtx16, daala_fdst16 },  // H_ADST
-    { daala_fdst16, daala_idtx16 },  // V_FLIPADST
-    { daala_idtx16, daala_fdst16 },  // H_FLIPADST
-#endif
-#else
-    { fdct16, fdct16 },    // DCT_DCT
-    { fadst16, fdct16 },   // ADST_DCT
-    { fdct16, fadst16 },   // DCT_ADST
-    { fadst16, fadst16 },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { fadst16, fdct16 },   // FLIPADST_DCT
-    { fdct16, fadst16 },   // DCT_FLIPADST
-    { fadst16, fadst16 },  // FLIPADST_FLIPADST
-    { fadst16, fadst16 },  // ADST_FLIPADST
-    { fadst16, fadst16 },  // FLIPADST_ADST
-    { fidtx16, fidtx16 },  // IDTX
-    { fdct16, fidtx16 },   // V_DCT
-    { fidtx16, fdct16 },   // H_DCT
-    { fadst16, fidtx16 },  // V_ADST
-    { fidtx16, fadst16 },  // H_ADST
-    { fadst16, fidtx16 },  // V_FLIPADST
-    { fidtx16, fadst16 },  // H_FLIPADST
-#endif
-#endif
-  };
-  const transform_2d ht = FHT[tx_type];
-  tran_low_t out[256];
-  int i, j;
-  tran_low_t temp_in[16], temp_out[16];
-
-#if CONFIG_EXT_TX
-  int16_t flipped_input[16 * 16];
-  maybe_flip_input(&input, &stride, 16, 16, flipped_input, tx_type);
-#endif
-
-  // Columns
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j) {
-#if CONFIG_DAALA_DCT16
-      temp_in[j] = input[j * stride + i] * 16;
-#else
-      temp_in[j] = input[j * stride + i] * 4;
-#endif
-    }
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < 16; ++j) {
-#if CONFIG_DAALA_DCT16
-      out[j * 16 + i] = temp_out[j];
-#else
-      out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
-#endif
-    }
-  }
-
-  // Rows
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j) temp_in[j] = out[j + i * 16];
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < 16; ++j) {
-#if CONFIG_DAALA_DCT16
-      output[j + i * 16] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-#else
-      output[j + i * 16] = temp_out[j];
-#endif
-    }
-  }
-}
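Both 16x16 branches land on the same net gain of 8, i.e. the "usual VPx coefficient left-shift of 3" mentioned in av1_fht4x4_c: the native kernels take x4 input, keep a 2*sqrt(2) gain per pass, and shed two bits between passes (4 * 2*sqrt(2) * 2*sqrt(2) / 4 = 8), while the orthonormal Daala kernels take x16 input and shed a single bit at the end (16 / 2 = 8).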
-
-void av1_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output,
-                          int stride) {
-  av1_fwht4x4_c(input, output, stride);
-}
-
-void av1_fht32x32_c(const int16_t *input, tran_low_t *output, int stride,
-                    TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-#if CONFIG_DAALA_DCT32
-    { daala_fdct32, daala_fdct32 },  // DCT_DCT
-#if CONFIG_EXT_TX
-    { daala_fdst32, daala_fdct32 },  // ADST_DCT
-    { daala_fdct32, daala_fdst32 },  // DCT_ADST
-    { daala_fdst32, daala_fdst32 },  // ADST_ADST
-    { daala_fdst32, daala_fdct32 },  // FLIPADST_DCT
-    { daala_fdct32, daala_fdst32 },  // DCT_FLIPADST
-    { daala_fdst32, daala_fdst32 },  // FLIPADST_FLIPADST
-    { daala_fdst32, daala_fdst32 },  // ADST_FLIPADST
-    { daala_fdst32, daala_fdst32 },  // FLIPADST_ADST
-    { daala_idtx32, daala_idtx32 },  // IDTX
-    { daala_fdct32, daala_idtx32 },  // V_DCT
-    { daala_idtx32, daala_fdct32 },  // H_DCT
-    { daala_fdst32, daala_idtx32 },  // V_ADST
-    { daala_idtx32, daala_fdst32 },  // H_ADST
-    { daala_fdst32, daala_idtx32 },  // V_FLIPADST
-    { daala_idtx32, daala_fdst32 },  // H_FLIPADST
-#endif
-#else
-    { fdct32, fdct32 },  // DCT_DCT
-#if CONFIG_EXT_TX
-    { fhalfright32, fdct32 },        // ADST_DCT
-    { fdct32, fhalfright32 },        // DCT_ADST
-    { fhalfright32, fhalfright32 },  // ADST_ADST
-    { fhalfright32, fdct32 },        // FLIPADST_DCT
-    { fdct32, fhalfright32 },        // DCT_FLIPADST
-    { fhalfright32, fhalfright32 },  // FLIPADST_FLIPADST
-    { fhalfright32, fhalfright32 },  // ADST_FLIPADST
-    { fhalfright32, fhalfright32 },  // FLIPADST_ADST
-    { fidtx32, fidtx32 },            // IDTX
-    { fdct32, fidtx32 },             // V_DCT
-    { fidtx32, fdct32 },             // H_DCT
-    { fhalfright32, fidtx32 },       // V_ADST
-    { fidtx32, fhalfright32 },       // H_ADST
-    { fhalfright32, fidtx32 },       // V_FLIPADST
-    { fidtx32, fhalfright32 },       // H_FLIPADST
-#endif
-#endif
-#if CONFIG_MRC_TX
-    { fdct32, fdct32 },  // MRC_TX
-#endif  // CONFIG_MRC_TX
-  };
-  const transform_2d ht = FHT[tx_type];
-  tran_low_t out[1024];
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-
-#if CONFIG_EXT_TX
-  int16_t flipped_input[32 * 32];
-  maybe_flip_input(&input, &stride, 32, 32, flipped_input, tx_type);
-#endif
-
-#if CONFIG_MRC_TX
-  if (tx_type == MRC_DCT) {
-    int16_t masked_input[32 * 32];
-    get_masked_residual32(&input, &stride, txfm_param->dst, txfm_param->stride,
-                          masked_input, txfm_param);
-  }
-#endif  // CONFIG_MRC_TX
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) {
-#if CONFIG_DAALA_DCT32
-      temp_in[j] = input[j * stride + i] * 16;
-#else
-      temp_in[j] = input[j * stride + i] * 4;
-#endif
-    }
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < 32; ++j) {
-#if CONFIG_DAALA_DCT32
-      out[j * 32 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-#else
-      out[j * 32 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
-#endif
-    }
-  }
-
-  // Rows
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < 32; ++j) {
-      output[j + i * 32] = temp_out[j];
-    }
-  }
-}
-
-#if CONFIG_TX64X64
-#if !CONFIG_DAALA_DCT64
-#if CONFIG_EXT_TX
-static void fidtx64(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 64; ++i)
-    output[i] = (tran_low_t)fdct_round_shift(input[i] * 4 * Sqrt2);
-}
-
-// For use in lieu of ADST
-static void fhalfright64(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  tran_low_t inputhalf[32];
-  for (i = 0; i < 32; ++i) {
-    output[32 + i] = (tran_low_t)fdct_round_shift(input[i] * 4 * Sqrt2);
-  }
-  // Multiply input by sqrt(2)
-  for (i = 0; i < 32; ++i) {
-    inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 32] * Sqrt2);
-  }
-  fdct32(inputhalf, output);
-  // Note overall scaling factor is 2 times unitary
-}
-#endif  // CONFIG_EXT_TX
-
-static void fdct64_col(const tran_low_t *input, tran_low_t *output) {
-  int32_t in[64], out[64];
-  int i;
-  for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
-  av1_fdct64_new(in, out, fwd_cos_bit_col_dct_64, fwd_stage_range_col_dct_64);
-  for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
-}
-
-static void fdct64_row(const tran_low_t *input, tran_low_t *output) {
-  int32_t in[64], out[64];
-  int i;
-  for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
-  av1_fdct64_new(in, out, fwd_cos_bit_row_dct_64, fwd_stage_range_row_dct_64);
-  for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
-}
-#endif
-
-void av1_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,
-                    TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-#if CONFIG_DAALA_DCT64
-    { daala_fdct64, daala_fdct64 },  // DCT_DCT
-#if CONFIG_EXT_TX
-    { daala_fdst64, daala_fdct64 },  // ADST_DCT
-    { daala_fdct64, daala_fdst64 },  // DCT_ADST
-    { daala_fdst64, daala_fdst64 },  // ADST_ADST
-    { daala_fdst64, daala_fdct64 },  // FLIPADST_DCT
-    { daala_fdct64, daala_fdst64 },  // DCT_FLIPADST
-    { daala_fdst64, daala_fdst64 },  // FLIPADST_FLIPADST
-    { daala_fdst64, daala_fdst64 },  // ADST_FLIPADST
-    { daala_fdst64, daala_fdst64 },  // FLIPADST_ADST
-    { daala_idtx64, daala_idtx64 },  // IDTX
-    { daala_fdct64, daala_idtx64 },  // V_DCT
-    { daala_idtx64, daala_fdct64 },  // H_DCT
-    { daala_fdst64, daala_idtx64 },  // V_ADST
-    { daala_idtx64, daala_fdst64 },  // H_ADST
-    { daala_fdst64, daala_idtx64 },  // V_FLIPADST
-    { daala_idtx64, daala_fdst64 },  // H_FLIPADST
-#endif  // CONFIG_EXT_TX
-#else
-    { fdct64_col, fdct64_row },  // DCT_DCT
-#if CONFIG_EXT_TX
-    { fhalfright64, fdct64_row },    // ADST_DCT
-    { fdct64_col, fhalfright64 },    // DCT_ADST
-    { fhalfright64, fhalfright64 },  // ADST_ADST
-    { fhalfright64, fdct64_row },    // FLIPADST_DCT
-    { fdct64_col, fhalfright64 },    // DCT_FLIPADST
-    { fhalfright64, fhalfright64 },  // FLIPADST_FLIPADST
-    { fhalfright64, fhalfright64 },  // ADST_FLIPADST
-    { fhalfright64, fhalfright64 },  // FLIPADST_ADST
-    { fidtx64, fidtx64 },            // IDTX
-    { fdct64_col, fidtx64 },         // V_DCT
-    { fidtx64, fdct64_row },         // H_DCT
-    { fhalfright64, fidtx64 },       // V_ADST
-    { fidtx64, fhalfright64 },       // H_ADST
-    { fhalfright64, fidtx64 },       // V_FLIPADST
-    { fidtx64, fhalfright64 },       // H_FLIPADST
-#endif  // CONFIG_EXT_TX
-#endif  // CONFIG_DAALA_DCT64
-  };
-  const transform_2d ht = FHT[tx_type];
-  tran_low_t out[4096];
-  int i, j;
-  tran_low_t temp_in[64], temp_out[64];
-#if CONFIG_EXT_TX
-  int16_t flipped_input[64 * 64];
-  maybe_flip_input(&input, &stride, 64, 64, flipped_input, tx_type);
-#endif
-
-  // Columns
-  for (i = 0; i < 64; ++i) {
-#if CONFIG_DAALA_DCT64
-    for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i] * 16;
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < 64; ++j)
-      out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 3;
-
-#else
-    for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i];
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < 64; ++j)
-      out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
-#endif
-  }
-
-  // Rows
-  for (i = 0; i < 64; ++i) {
-    for (j = 0; j < 64; ++j) temp_in[j] = out[j + i * 64];
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < 64; ++j)
-#if CONFIG_DAALA_DCT64
-      output[j + i * 64] = temp_out[j];
-#else
-      output[j + i * 64] =
-          (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
-#endif
-  }
-}
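For 64x64 the native path takes unscaled input and rounds away two bits after each pass; if fdct64_col/fdct64_row keep the same sqrt(N/2) gain convention as the shorter kernels (not verified here, since they route through av1_fdct64_new and its stage-range tables), that works out to sqrt(32) * sqrt(32) / (4 * 4) = 2 times unitary, which matches the Daala branch: x16 input with a 3-bit shift after the column pass gives 16 / 8 = 2.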
0)) >> 3; - -#else - for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i]; - ht.cols(temp_in, temp_out); - for (j = 0; j < 64; ++j) - out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; -#endif - } - - // Rows - for (i = 0; i < 64; ++i) { - for (j = 0; j < 64; ++j) temp_in[j] = out[j + i * 64]; - ht.rows(temp_in, temp_out); - for (j = 0; j < 64; ++j) -#if CONFIG_DAALA_DCT64 - output[j + i * 64] = temp_out[j]; -#else - output[j + i * 64] = - (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2); -#endif - } -} - -void av1_fht64x32_c(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d FHT[] = { - { fdct32, fdct64_row }, // DCT_DCT -#if CONFIG_EXT_TX - { fhalfright32, fdct64_row }, // ADST_DCT - { fdct32, fhalfright64 }, // DCT_ADST - { fhalfright32, fhalfright64 }, // ADST_ADST - { fhalfright32, fdct64_row }, // FLIPADST_DCT - { fdct32, fhalfright64 }, // DCT_FLIPADST - { fhalfright32, fhalfright64 }, // FLIPADST_FLIPADST - { fhalfright32, fhalfright64 }, // ADST_FLIPADST - { fhalfright32, fhalfright64 }, // FLIPADST_ADST - { fidtx32, fidtx64 }, // IDTX - { fdct32, fidtx64 }, // V_DCT - { fidtx32, fdct64_row }, // H_DCT - { fhalfright32, fidtx64 }, // V_ADST - { fidtx32, fhalfright64 }, // H_ADST - { fhalfright32, fidtx64 }, // V_FLIPADST - { fidtx32, fhalfright64 }, // H_FLIPADST -#endif // CONFIG_EXT_TX - }; - const transform_2d ht = FHT[tx_type]; - tran_low_t out[2048]; - int i, j; - tran_low_t temp_in[64], temp_out[64]; - const int n = 32; - const int n2 = 64; -#if CONFIG_EXT_TX - int16_t flipped_input[32 * 64]; - maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type); -#endif - - // Columns - for (i = 0; i < n2; ++i) { - for (j = 0; j < n; ++j) - temp_in[j] = (tran_low_t)fdct_round_shift(input[j * stride + i] * Sqrt2); - ht.cols(temp_in, temp_out); - for (j = 0; j < n; ++j) - out[j * n2 + i] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2); - } - - // Rows - for (i = 0; i < n; ++i) { - for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2]; - ht.rows(temp_in, temp_out); - for (j = 0; j < n2; ++j) - output[j + i * n2] = - (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2); - } -} - -void av1_fht32x64_c(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif // CONFIG_MRC_TX -#if CONFIG_DCT_ONLY - assert(tx_type == DCT_DCT); -#endif - static const transform_2d FHT[] = { - { fdct64_row, fdct32 }, // DCT_DCT -#if CONFIG_EXT_TX - { fhalfright64, fdct32 }, // ADST_DCT - { fdct64_row, fhalfright32 }, // DCT_ADST - { fhalfright64, fhalfright32 }, // ADST_ADST - { fhalfright64, fdct32 }, // FLIPADST_DCT - { fdct64_row, fhalfright32 }, // DCT_FLIPADST - { fhalfright64, fhalfright32 }, // FLIPADST_FLIPADST - { fhalfright64, fhalfright32 }, // ADST_FLIPADST - { fhalfright64, fhalfright32 }, // FLIPADST_ADST - { fidtx64, fidtx32 }, // IDTX - { fdct64_row, fidtx32 }, // V_DCT - { fidtx64, fdct32 }, // H_DCT - { fhalfright64, fidtx32 }, // V_ADST - { fidtx64, fhalfright32 }, // H_ADST - { fhalfright64, fidtx32 }, // V_FLIPADST - { fidtx64, fhalfright32 }, // H_FLIPADST -#endif // CONFIG_EXT_TX - }; - const transform_2d ht = FHT[tx_type]; - 
tran_low_t out[32 * 64]; - int i, j; - tran_low_t temp_in[64], temp_out[64]; - const int n = 32; - const int n2 = 64; -#if CONFIG_EXT_TX - int16_t flipped_input[32 * 64]; - maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type); -#endif - - // Rows - for (i = 0; i < n2; ++i) { - for (j = 0; j < n; ++j) - temp_in[j] = (tran_low_t)fdct_round_shift(input[i * stride + j] * Sqrt2); - ht.rows(temp_in, temp_out); - for (j = 0; j < n; ++j) - out[j * n2 + i] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2); - } - - // Columns - for (i = 0; i < n; ++i) { - for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2]; - ht.cols(temp_in, temp_out); - for (j = 0; j < n2; ++j) - output[i + j * n] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2); - } -} -#endif // CONFIG_TX64X64 - -#if CONFIG_EXT_TX -// Forward identity transform. -void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride, - int bsx, int bsy, TX_TYPE tx_type) { - int r, c; - const int pels = bsx * bsy; - const int shift = 3 - ((pels > 256) + (pels > 1024)); - if (tx_type == IDTX) { - for (r = 0; r < bsy; ++r) { - for (c = 0; c < bsx; ++c) coeff[c] = src_diff[c] * (1 << shift); - src_diff += stride; - coeff += bsx; - } - } -} -#endif // CONFIG_EXT_TX -#endif // !AV1_DCT_GTEST diff --git a/third_party/aom/av1/encoder/dwt.c b/third_party/aom/av1/encoder/dwt.c new file mode 100644 index 000000000..0a57ebcfb --- /dev/null +++ b/third_party/aom/av1/encoder/dwt.c @@ -0,0 +1,144 @@ +#include <assert.h> +#include <stdlib.h> +#include <math.h> + +#include "config/av1_rtcd.h" +#include "av1/encoder/dwt.h" + +// Note: block length must be even for this implementation +static void analysis_53_row(int length, tran_low_t *x, tran_low_t *lowpass, + tran_low_t *highpass) { + int n; + tran_low_t r, *a, *b; + + n = length >> 1; + b = highpass; + a = lowpass; + while (--n) { + *a++ = (r = *x++) * 2; + *b++ = *x - ((r + x[1] + 1) >> 1); + x++; + } + *a = (r = *x++) * 2; + *b = *x - r; + + n = length >> 1; + b = highpass; + a = lowpass; + r = *highpass; + while (n--) { + *a++ += (r + (*b) + 1) >> 1; + r = *b++; + } +} + +static void analysis_53_col(int length, tran_low_t *x, tran_low_t *lowpass, + tran_low_t *highpass) { + int n; + tran_low_t r, *a, *b; + + n = length >> 1; + b = highpass; + a = lowpass; + while (--n) { + *a++ = (r = *x++); + *b++ = (((*x) * 2) - (r + x[1]) + 2) >> 2; + x++; + } + *a = (r = *x++); + *b = (*x - r + 1) >> 1; + + n = length >> 1; + b = highpass; + a = lowpass; + r = *highpass; + while (n--) { + *a++ += (r + (*b) + 1) >> 1; + r = *b++; + } +} + +static void dyadic_analyze_53_uint8_input(int levels, int width, int height, + uint8_t *x, int pitch_x, + tran_low_t *c, int pitch_c, + int dwt_scale_bits, int hbd) { + int lv, i, j, nh, nw, hh = height, hw = width; + tran_low_t buffer[2 * DWT_MAX_LENGTH]; + + if (hbd) { + uint16_t *x16 = CONVERT_TO_SHORTPTR(x); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + c[i * pitch_c + j] = x16[i * pitch_x + j] << dwt_scale_bits; + } + } + } else { + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + c[i * pitch_c + j] = x[i * pitch_x + j] << dwt_scale_bits; + } + } + } + + for (lv = 0; lv < levels; lv++) { + nh = hh; + hh = (hh + 1) >> 1; + nw = hw; + hw = (hw + 1) >> 1; + if ((nh < 2) || (nw < 2)) return; + for (i = 0; i < nh; i++) { + memcpy(buffer, &c[i * pitch_c], nw * sizeof(tran_low_t)); + analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw); + } + for (j = 0; j < nw; j++) { + for (i = 0; i < nh; i++) buffer[i + 
nh] = c[i * pitch_c + j]; + analysis_53_col(nh, buffer + nh, buffer, buffer + hh); + for (i = 0; i < nh; i++) c[i * pitch_c + j] = buffer[i]; + } + } +} + +void av1_fdwt8x8_uint8_input_c(uint8_t *input, tran_low_t *output, int stride, + int hbd) { + dyadic_analyze_53_uint8_input(4, 8, 8, input, stride, output, 8, 2, hbd); +} + +int av1_haar_ac_sad(tran_low_t *output, int bw, int bh, int stride) { + int acsad = 0; + + for (int r = 0; r < bh; ++r) + for (int c = 0; c < bw; ++c) { + if (r >= bh / 2 || c >= bw / 2) acsad += abs(output[r * stride + c]); + } + return acsad; +} + +uint64_t av1_dct_ac_sad(tran_low_t *output, int bw, int bh, int stride) { + uint64_t acsad = 0; + + for (int r = 0; r < bh; ++r) + for (int c = 0; c < bw; ++c) { + if (r > 0 || c > 0) acsad += abs(output[r * stride + c]); + } + + return acsad; +} + +uint32_t av1_variance(uint8_t *input, int bw, int bh, int stride) { + int sum = 0; + uint32_t sse = 0; + + for (int r = 0; r < bh; ++r) + for (int c = 0; c < bw; ++c) { + sum += input[r * stride + c]; + sse += input[r * stride + c] * input[r * stride + c]; + } + return sse - (uint32_t)(((int64_t)sum * sum) / (bw * bh)); +} + +int av1_haar_ac_sad_8x8_uint8_input(uint8_t *input, int stride, int hbd) { + tran_low_t output[64]; + + av1_fdwt8x8_uint8_input_c(input, output, stride, hbd); + return av1_haar_ac_sad(output, 8, 8, 8); +} diff --git a/third_party/aom/av1/encoder/dwt.h b/third_party/aom/av1/encoder/dwt.h new file mode 100644 index 000000000..9a86db2f1 --- /dev/null +++ b/third_party/aom/av1/encoder/dwt.h @@ -0,0 +1,9 @@ +#include "av1/common/common.h" +#include "av1/common/enums.h" + +#define DWT_MAX_LENGTH 64 + +void av1_fdwt8x8(tran_low_t *input, tran_low_t *output, int stride); +void av1_fdwt8x8_uint8_input_c(uint8_t *input, tran_low_t *output, int stride, + int hbd); +int av1_haar_ac_sad_8x8_uint8_input(uint8_t *input, int stride, int hbd); diff --git a/third_party/aom/av1/encoder/encint.h b/third_party/aom/av1/encoder/encint.h deleted file mode 100644 index 30ea8521f..000000000 --- a/third_party/aom/av1/encoder/encint.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ -/* clang-format off */ - -#if !defined(_encint_H) -# define _encint_H (1) - -typedef struct daala_enc_ctx od_enc_ctx; -typedef struct od_params_ctx od_params_ctx; -typedef struct od_rollback_buffer od_rollback_buffer; - -# include "aom_dsp/entenc.h" -# include "av1/common/odintrin.h" -# include "av1/common/pvq_state.h" - -struct daala_enc_ctx{ - /* Stores context-adaptive CDFs for PVQ. */ - od_state state; - /* AOM entropy encoder. 
*/ - aom_writer w; - int use_activity_masking; - /* Mode of quantization matrice : FLAT (0) or HVS (1) */ - int qm; - /*Normalized PVQ lambda for use where we've already performed - quantization.*/ - double pvq_norm_lambda; - double pvq_norm_lambda_dc; -}; - -// from daalaenc.h -/**The encoder context.*/ -typedef struct daala_enc_ctx daala_enc_ctx; - -/** Holds important encoder information so we can roll back decisions */ -struct od_rollback_buffer { - od_ec_enc ec; - od_adapt_ctx adapt; -}; - -void od_encode_checkpoint(const daala_enc_ctx *enc, od_rollback_buffer *rbuf); -void od_encode_rollback(daala_enc_ctx *enc, const od_rollback_buffer *rbuf); - -#endif diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c index f79a678fb..027b80a16 100644 --- a/third_party/aom/av1/encoder/encodeframe.c +++ b/third_party/aom/av1/encoder/encodeframe.c @@ -13,9 +13,9 @@ #include <math.h> #include <stdio.h> -#include "./av1_rtcd.h" -#include "./aom_dsp_rtcd.h" -#include "./aom_config.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/binary_codes_writer.h" @@ -23,6 +23,11 @@ #include "aom_ports/aom_timer.h" #include "aom_ports/system_state.h" +#if CONFIG_MISMATCH_DEBUG +#include "aom_util/debug_util.h" +#endif // CONFIG_MISMATCH_DEBUG + +#include "av1/common/cfl.h" #include "av1/common/common.h" #include "av1/common/entropy.h" #include "av1/common/entropymode.h" @@ -36,105 +41,55 @@ #include "av1/common/seg_common.h" #include "av1/common/tile_common.h" +#include "av1/encoder/ab_partition_model_weights.h" #include "av1/encoder/aq_complexity.h" #include "av1/encoder/aq_cyclicrefresh.h" #include "av1/encoder/aq_variance.h" -#if CONFIG_SUPERTX -#include "av1/encoder/cost.h" -#endif -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION #include "av1/common/warped_motion.h" -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION -#if CONFIG_GLOBAL_MOTION #include "av1/encoder/global_motion.h" -#endif // CONFIG_GLOBAL_MOTION #include "av1/encoder/encodeframe.h" #include "av1/encoder/encodemb.h" #include "av1/encoder/encodemv.h" -#if CONFIG_LV_MAP #include "av1/encoder/encodetxb.h" -#endif #include "av1/encoder/ethread.h" #include "av1/encoder/extend.h" +#include "av1/encoder/ml.h" #include "av1/encoder/rd.h" #include "av1/encoder/rdopt.h" #include "av1/encoder/segmentation.h" #include "av1/encoder/tokenize.h" -#if CONFIG_PVQ -#include "av1/common/pvq.h" -#include "av1/encoder/pvq_encoder.h" -#endif -#if CONFIG_HIGHBITDEPTH -#define IF_HBD(...) __VA_ARGS__ -#else -#define IF_HBD(...) 
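/*
 * Editor's note (illustration, not part of the patch): analysis_53_row() and
 * analysis_53_col() in the new dwt.c above are lifting implementations of the
 * LeGall 5/3 wavelet, which av1_haar_ac_sad_8x8_uint8_input() uses to estimate
 * how much non-DC energy a block carries. Below is a minimal self-contained
 * sketch of one forward 1-D lifting pass; the name lift53_forward and the
 * plain-int types are hypothetical, and the patch's row variant additionally
 * scales the low-pass band by 2.
 */
#include <stddef.h>

static void lift53_forward(const int *x, int *lo, int *hi, size_t n) {
  const size_t half = n / 2; /* n is assumed even, as in dwt.c */
  /* Predict: high-pass = odd sample minus the mean of its even neighbours
   * (the right edge falls back to the left neighbour, as in analysis_53_row). */
  for (size_t i = 0; i < half; ++i) {
    const int left = x[2 * i];
    const int right = (2 * i + 2 < n) ? x[2 * i + 2] : x[2 * i];
    hi[i] = x[2 * i + 1] - ((left + right) >> 1);
  }
  /* Update: low-pass = even sample plus a rounded quarter of the neighbouring
   * high-pass samples (the left edge mirrors hi[0]). */
  for (size_t i = 0; i < half; ++i) {
    const int left = (i > 0) ? hi[i - 1] : hi[0];
    lo[i] = x[2 * i] + ((left + hi[i] + 2) >> 2);
  }
}
/* Applying such a pass first to rows, then to columns, and recursing on the
 * low-low quadrant gives the dyadic decomposition performed by
 * dyadic_analyze_53_uint8_input() above. */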
-#endif // CONFIG_HIGHBITDEPTH - -static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td, - TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row, - int mi_col, BLOCK_SIZE bsize, int *rate); - -#if CONFIG_SUPERTX -static int check_intra_b(PICK_MODE_CONTEXT *ctx); - -static int check_intra_sb(const AV1_COMP *cpi, const TileInfo *const tile, - int mi_row, int mi_col, BLOCK_SIZE bsize, - PC_TREE *pc_tree); -static void predict_superblock(const AV1_COMP *const cpi, ThreadData *td, - int mi_row_ori, int mi_col_ori, int mi_row_pred, - int mi_col_pred, int plane, - BLOCK_SIZE bsize_pred, int b_sub8x8, int block); -static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size, - PC_TREE *pc_tree); -static void predict_sb_complex(const AV1_COMP *const cpi, ThreadData *td, - const TileInfo *const tile, int mi_row, - int mi_col, int mi_row_ori, int mi_col_ori, - RUN_TYPE dry_run, BLOCK_SIZE bsize, - BLOCK_SIZE top_bsize, uint8_t *dst_buf[3], - int dst_stride[3], PC_TREE *pc_tree); -static void update_state_sb_supertx(const AV1_COMP *const cpi, ThreadData *td, - const TileInfo *const tile, int mi_row, - int mi_col, BLOCK_SIZE bsize, - RUN_TYPE dry_run, PC_TREE *pc_tree); -static void rd_supertx_sb(const AV1_COMP *const cpi, ThreadData *td, - const TileInfo *const tile, int mi_row, int mi_col, - BLOCK_SIZE bsize, int *tmp_rate, int64_t *tmp_dist, - TX_TYPE *best_tx, PC_TREE *pc_tree); -#endif // CONFIG_SUPERTX + +static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data, + ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int *rate); // This is used as a reference when computing the source variance for the // purposes of activity masking. // Eventually this should be replaced by custom no-reference routines, // which will be faster. 
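/*
 * Editor's note (illustration, not part of the patch): the flat 128-valued
 * AV1_VAR_OFFS table declared next is handed to the SIMD variance kernel as
 * the "reference", so the kernel's (sse - sum^2/n) output is effectively the
 * source block's own variance. A scalar sketch of that computation follows;
 * the name block_perpixel_variance is hypothetical, and the real code
 * normalizes with ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]) instead
 * of a rounded division.
 */
#include <stdint.h>

static unsigned int block_perpixel_variance(const uint8_t *src, int stride,
                                            int bw, int bh) {
  int64_t sum = 0;
  uint64_t sse = 0;
  for (int r = 0; r < bh; ++r) {
    for (int c = 0; c < bw; ++c) {
      const int d = src[r * stride + c] - 128; /* diff vs the flat reference */
      sum += d;
      sse += (uint64_t)(d * d);
    }
  }
  const int64_t n = (int64_t)bw * bh;
  const uint64_t var = sse - (uint64_t)((sum * sum) / n);
  /* Normalize to a per-pixel figure, rounding to nearest. */
  return (unsigned int)((var + (uint64_t)(n >> 1)) / (uint64_t)n);
}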
static const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = { - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, -#if CONFIG_EXT_PARTITION - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 -#endif // CONFIG_EXT_PARTITION + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128 }; -#if CONFIG_HIGHBITDEPTH static const uint16_t AV1_HIGH_VAR_OFFS_8[MAX_SB_SIZE] = { - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, -#if CONFIG_EXT_PARTITION - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 -#endif // CONFIG_EXT_PARTITION + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128 }; static const uint16_t AV1_HIGH_VAR_OFFS_10[MAX_SB_SIZE] = { @@ -146,7 +101,6 @@ static const uint16_t AV1_HIGH_VAR_OFFS_10[MAX_SB_SIZE] = { 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, -#if CONFIG_EXT_PARTITION 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, @@ -155,7 +109,6 @@ static const uint16_t 
AV1_HIGH_VAR_OFFS_10[MAX_SB_SIZE] = { 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4 -#endif // CONFIG_EXT_PARTITION }; static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = { @@ -168,8 +121,6 @@ static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = { 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, - 128 * 16, -#if CONFIG_EXT_PARTITION 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, @@ -179,10 +130,17 @@ static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = { 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, - 128 * 16 -#endif // CONFIG_EXT_PARTITION + 128 * 16, 128 * 16 }; -#endif // CONFIG_HIGHBITDEPTH + +#if CONFIG_FP_MB_STATS +static const uint8_t num_16x16_blocks_wide_lookup[BLOCK_SIZES_ALL] = { + 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 1, 1, 1, 2, 2, 4 +}; +static const uint8_t num_16x16_blocks_high_lookup[BLOCK_SIZES_ALL] = { + 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 1, 1, 2, 1, 4, 2 +}; +#endif // CONFIG_FP_MB_STATS unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi, const struct buf_2d *ref, @@ -193,7 +151,6 @@ unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi, return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); } -#if CONFIG_HIGHBITDEPTH unsigned int av1_high_get_sby_perpixel_variance(const AV1_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs, int bd) { @@ -218,7 +175,6 @@ unsigned int av1_high_get_sby_perpixel_variance(const AV1_COMP *cpi, } return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); } -#endif // CONFIG_HIGHBITDEPTH static unsigned int get_sby_perpixel_diff_variance(const AV1_COMP *const cpi, const struct buf_2d *ref, @@ -266,24 +222,21 @@ static void set_offsets_without_segment_id(const AV1_COMP *const cpi, MACROBLOCK *const x, int mi_row, int mi_col, BLOCK_SIZE bsize) { const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; const int mi_width = mi_size_wide[bsize]; const int mi_height = mi_size_high[bsize]; set_mode_info_offsets(cpi, x, xd, mi_row, mi_col); - set_skip_context(xd, mi_row, mi_col); -#if CONFIG_VAR_TX - xd->above_txfm_context = - cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2); - xd->left_txfm_context = xd->left_txfm_context_buffer + - ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2); - xd->max_tx_size = max_txsize_lookup[bsize]; -#endif + set_skip_context(xd, mi_row, mi_col, num_planes); + xd->above_txfm_context = cm->above_txfm_context[tile->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); // Set up destination pointers. av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row, - mi_col); + mi_col, 0, num_planes); // Set up limit values for MV components. // Mv beyond the range do not produce new/different prediction block. 
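/*
 * Editor's note (illustration, not part of the patch): the mv_limits fields
 * assigned in the hunk that follows clamp full-pel motion search so that even
 * the most extreme vector keeps the interpolation filter's footprint inside
 * the padded frame. A standalone sketch of the arithmetic; the struct and
 * function names are hypothetical, kMiSize is the 4-pel mode-info unit, and
 * `extend` stands in for AOM_INTERP_EXTEND.
 */
typedef struct {
  int row_min, row_max, col_min, col_max;
} MvLimitsSketch;

static void compute_fullpel_mv_limits(MvLimitsSketch *lim, int mi_row,
                                      int mi_col, int mi_height, int mi_width,
                                      int mi_rows, int mi_cols, int extend) {
  enum { kMiSize = 4 }; /* pels per mode-info unit */
  /* The most negative MVs push the block entirely above/left of the frame,
   * minus the filter border... */
  lim->row_min = -((mi_row + mi_height) * kMiSize + extend);
  lim->col_min = -((mi_col + mi_width) * kMiSize + extend);
  /* ...the most positive MVs push it entirely below/right, plus the border. */
  lim->row_max = (mi_rows - mi_row) * kMiSize + extend;
  lim->col_max = (mi_cols - mi_col) * kMiSize + extend;
}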
@@ -293,18 +246,15 @@ static void set_offsets_without_segment_id(const AV1_COMP *const cpi, x->mv_limits.row_max = (cm->mi_rows - mi_row) * MI_SIZE + AOM_INTERP_EXTEND; x->mv_limits.col_max = (cm->mi_cols - mi_col) * MI_SIZE + AOM_INTERP_EXTEND; - set_plane_n4(xd, mi_width, mi_height); + set_plane_n4(xd, mi_width, mi_height, num_planes); // Set up distance of MB to edge of frame in 1/8th pel units. assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1))); - set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, -#if CONFIG_DEPENDENT_HORZTILES - cm->dependent_horz_tiles, -#endif // CONFIG_DEPENDENT_HORZTILES - cm->mi_rows, cm->mi_cols); + set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, cm->mi_rows, + cm->mi_cols); // Set up source buffers. - av1_setup_src_planes(x, cpi->source, mi_row, mi_col); + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes); // R/D setup. x->rdmult = cpi->rd.RDMULT; @@ -323,292 +273,111 @@ static void set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile, set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); - mbmi = &xd->mi[0]->mbmi; -#if CONFIG_CFL - xd->cfl->mi_row = mi_row; - xd->cfl->mi_col = mi_col; -#endif + mbmi = xd->mi[0]; + xd->cfl.mi_row = mi_row; + xd->cfl.mi_col = mi_col; + + mbmi->segment_id = 0; // Setup segment ID. if (seg->enabled) { - if (!cpi->vaq_refresh) { + if (seg->enabled && !cpi->vaq_refresh) { const uint8_t *const map = seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - mbmi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + mbmi->segment_id = + map ? get_segment_id(cm, map, bsize, mi_row, mi_col) : 0; } av1_init_plane_quantizers(cpi, x, mbmi->segment_id); - } else { - mbmi->segment_id = 0; } - -#if CONFIG_SUPERTX - mbmi->segment_id_supertx = MAX_SEGMENTS; -#endif // CONFIG_SUPERTX -} - -#if CONFIG_SUPERTX -static void set_offsets_supertx(const AV1_COMP *const cpi, ThreadData *td, - const TileInfo *const tile, int mi_row, - int mi_col, BLOCK_SIZE bsize) { - MACROBLOCK *const x = &td->mb; - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; - const int mi_width = mi_size_wide[bsize]; - const int mi_height = mi_size_high[bsize]; -#if CONFIG_DEPENDENT_HORZTILES - set_mode_info_offsets(cpi, x, xd, mi_row, mi_col, cm->dependent_horz_tiles); -#else - set_mode_info_offsets(cpi, x, xd, mi_row, mi_col); -#endif - - // Set up distance of MB to edge of frame in 1/8th pel units. - assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1))); - set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, -#if CONFIG_DEPENDENT_HORZTILES - cm->dependent_horz_tiles, -#endif // CONFIG_DEPENDENT_HORZTILES - cm->mi_rows, cm->mi_cols); } -static void set_offsets_extend(const AV1_COMP *const cpi, ThreadData *td, - const TileInfo *const tile, int mi_row_pred, - int mi_col_pred, int mi_row_ori, int mi_col_ori, - BLOCK_SIZE bsize_pred) { - // Used in supertx - // (mi_row_ori, mi_col_ori, bsize_ori): region for mv - // (mi_row_pred, mi_col_pred, bsize_pred): region to predict - MACROBLOCK *const x = &td->mb; - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; - const int mi_width = mi_size_wide[bsize_pred]; - const int mi_height = mi_size_high[bsize_pred]; - -#if CONFIG_DEPENDENT_HORZTILES - set_mode_info_offsets(cpi, x, xd, mi_row_ori, mi_col_ori, - cm->dependent_horz_tiles); -#else - set_mode_info_offsets(cpi, x, xd, mi_row_ori, mi_col_ori); -#endif - - // Set up limit values for MV components. 
- // Mv beyond the range do not produce new/different prediction block. - x->mv_limits.row_min = - -(((mi_row_pred + mi_height) * MI_SIZE) + AOM_INTERP_EXTEND); - x->mv_limits.col_min = - -(((mi_col_pred + mi_width) * MI_SIZE) + AOM_INTERP_EXTEND); - x->mv_limits.row_max = - (cm->mi_rows - mi_row_pred) * MI_SIZE + AOM_INTERP_EXTEND; - x->mv_limits.col_max = - (cm->mi_cols - mi_col_pred) * MI_SIZE + AOM_INTERP_EXTEND; - -// Set up distance of MB to edge of frame in 1/8th pel units. -#if !CONFIG_CB4X4 - assert(!(mi_col_pred & (mi_width - mi_size_wide[BLOCK_8X8])) && - !(mi_row_pred & (mi_height - mi_size_high[BLOCK_8X8]))); -#endif - set_mi_row_col(xd, tile, mi_row_pred, mi_height, mi_col_pred, mi_width, -#if CONFIG_DEPENDENT_HORZTILES - cm->dependent_horz_tiles, -#endif // CONFIG_DEPENDENT_HORZTILES - cm->mi_rows, cm->mi_cols); - xd->up_available = (mi_row_ori > tile->mi_row_start); - xd->left_available = (mi_col_ori > tile->mi_col_start); - - // R/D setup. - x->rdmult = cpi->rd.RDMULT; -} - -static void set_segment_id_supertx(const AV1_COMP *const cpi, - MACROBLOCK *const x, const int mi_row, - const int mi_col, const BLOCK_SIZE bsize) { - const AV1_COMMON *cm = &cpi->common; - const struct segmentation *seg = &cm->seg; - const int miw = AOMMIN(mi_size_wide[bsize], cm->mi_cols - mi_col); - const int mih = AOMMIN(mi_size_high[bsize], cm->mi_rows - mi_row); - const int mi_offset = mi_row * cm->mi_stride + mi_col; - MODE_INFO **const mip = cm->mi_grid_visible + mi_offset; - int r, c; - int seg_id_supertx = MAX_SEGMENTS; - - if (!seg->enabled) { - seg_id_supertx = 0; - } else { - // Find the minimum segment_id - for (r = 0; r < mih; r++) - for (c = 0; c < miw; c++) - seg_id_supertx = - AOMMIN(mip[r * cm->mi_stride + c]->mbmi.segment_id, seg_id_supertx); - assert(0 <= seg_id_supertx && seg_id_supertx < MAX_SEGMENTS); - - // Initialize plane quantisers - av1_init_plane_quantizers(cpi, x, seg_id_supertx); - } - - // Assign the the segment_id back to segment_id_supertx - for (r = 0; r < mih; r++) - for (c = 0; c < miw; c++) - mip[r * cm->mi_stride + c]->mbmi.segment_id_supertx = seg_id_supertx; -} -#endif // CONFIG_SUPERTX - -#if CONFIG_DUAL_FILTER -static void reset_intmv_filter_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, - MB_MODE_INFO *mbmi) { +static void reset_intmv_filter_type(MB_MODE_INFO *mbmi) { InterpFilter filters[2]; - InterpFilter default_filter = av1_unswitchable_filter(cm->interp_filter); for (int dir = 0; dir < 2; ++dir) { - filters[dir] = ((!has_subpel_mv_component(xd->mi[0], xd, dir) && - (mbmi->ref_frame[1] == NONE_FRAME || - !has_subpel_mv_component(xd->mi[0], xd, dir + 2))) - ? 
default_filter - : av1_extract_interp_filter(mbmi->interp_filters, dir)); + filters[dir] = av1_extract_interp_filter(mbmi->interp_filters, dir); } mbmi->interp_filters = av1_make_interp_filters(filters[0], filters[1]); } -static void update_filter_type_count(FRAME_COUNTS *counts, +static void update_filter_type_count(uint8_t allow_update_cdf, + FRAME_COUNTS *counts, const MACROBLOCKD *xd, const MB_MODE_INFO *mbmi) { int dir; for (dir = 0; dir < 2; ++dir) { - if (has_subpel_mv_component(xd->mi[0], xd, dir) || - (mbmi->ref_frame[1] > INTRA_FRAME && - has_subpel_mv_component(xd->mi[0], xd, dir + 2))) { - const int ctx = av1_get_pred_context_switchable_interp(xd, dir); - InterpFilter filter = - av1_extract_interp_filter(mbmi->interp_filters, dir); - ++counts->switchable_interp[ctx][filter]; + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir); + ++counts->switchable_interp[ctx][filter]; + if (allow_update_cdf) { update_cdf(xd->tile_ctx->switchable_interp_cdf[ctx], filter, SWITCHABLE_FILTERS); } } } -#endif -#if CONFIG_GLOBAL_MOTION + static void update_global_motion_used(PREDICTION_MODE mode, BLOCK_SIZE bsize, const MB_MODE_INFO *mbmi, RD_COUNTS *rdc) { - if (mode == ZEROMV || mode == ZERO_ZEROMV) { - const int num_4x4s = - num_4x4_blocks_wide_lookup[bsize] * num_4x4_blocks_high_lookup[bsize]; + if (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) { + const int num_4x4s = mi_size_wide[bsize] * mi_size_high[bsize]; int ref; for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { rdc->global_motion_used[mbmi->ref_frame[ref]] += num_4x4s; } } } -#endif // CONFIG_GLOBAL_MOTION -static void reset_tx_size(MACROBLOCKD *xd, MB_MODE_INFO *mbmi, +static void reset_tx_size(MACROBLOCK *x, MB_MODE_INFO *mbmi, const TX_MODE tx_mode) { + MACROBLOCKD *const xd = &x->e_mbd; if (xd->lossless[mbmi->segment_id]) { mbmi->tx_size = TX_4X4; } else if (tx_mode != TX_MODE_SELECT) { - mbmi->tx_size = - tx_size_from_tx_mode(mbmi->sb_type, tx_mode, is_inter_block(mbmi)); - } -} - -static void set_ref_and_pred_mvs(MACROBLOCK *const x, int_mv *const mi_pred_mv, - int8_t rf_type) { - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - - const int bw = xd->n8_w << MI_SIZE_LOG2; - const int bh = xd->n8_h << MI_SIZE_LOG2; - int ref_mv_idx = mbmi->ref_mv_idx; - MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; - CANDIDATE_MV *const curr_ref_mv_stack = mbmi_ext->ref_mv_stack[rf_type]; - - if (has_second_ref(mbmi)) { - // Special case: NEAR_NEWMV and NEW_NEARMV modes use 1 + mbmi->ref_mv_idx - // (like NEARMV) instead - if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) ref_mv_idx += 1; - - if (compound_ref0_mode(mbmi->mode) == NEWMV) { - int_mv this_mv = curr_ref_mv_stack[ref_mv_idx].this_mv; - clamp_mv_ref(&this_mv.as_mv, bw, bh, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; - mbmi->pred_mv[0] = this_mv; - mi_pred_mv[0] = this_mv; - } - if (compound_ref1_mode(mbmi->mode) == NEWMV) { - int_mv this_mv = curr_ref_mv_stack[ref_mv_idx].comp_mv; - clamp_mv_ref(&this_mv.as_mv, bw, bh, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv; - mbmi->pred_mv[1] = this_mv; - mi_pred_mv[1] = this_mv; - } -#if CONFIG_COMPOUND_SINGLEREF - } else if (is_inter_singleref_comp_mode(mbmi->mode)) { - // Special case: SR_NEAR_NEWMV uses 1 + mbmi->ref_mv_idx - // (like NEARMV) instead - if (mbmi->mode == SR_NEAR_NEWMV) ref_mv_idx += 1; - - if (compound_ref0_mode(mbmi->mode) == NEWMV || - 
compound_ref1_mode(mbmi->mode) == NEWMV) { - int_mv this_mv = curr_ref_mv_stack[ref_mv_idx].this_mv; - clamp_mv_ref(&this_mv.as_mv, bw, bh, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; - mbmi->pred_mv[0] = this_mv; - mi_pred_mv[0] = this_mv; - } -#endif // CONFIG_COMPOUND_SINGLEREF + mbmi->tx_size = tx_size_from_tx_mode(mbmi->sb_type, tx_mode); } else { - if (mbmi->mode == NEWMV) { - int i; - for (i = 0; i < 1 + has_second_ref(mbmi); ++i) { - int_mv this_mv = (i == 0) ? curr_ref_mv_stack[ref_mv_idx].this_mv - : curr_ref_mv_stack[ref_mv_idx].comp_mv; - clamp_mv_ref(&this_mv.as_mv, bw, bh, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0] = this_mv; - mbmi->pred_mv[i] = this_mv; - mi_pred_mv[i] = this_mv; - } - } + BLOCK_SIZE bsize = mbmi->sb_type; + TX_SIZE min_tx_size = depth_to_tx_size(MAX_TX_DEPTH, bsize); + mbmi->tx_size = (TX_SIZE)TXSIZEMAX(mbmi->tx_size, min_tx_size); + } + if (is_inter_block(mbmi)) { + memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); } + memset(mbmi->txk_type, DCT_DCT, sizeof(mbmi->txk_type[0]) * TXK_TYPE_BUF_LEN); + av1_zero(x->blk_skip); + x->skip = 0; } -static void update_state(const AV1_COMP *const cpi, ThreadData *td, - PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, - BLOCK_SIZE bsize, RUN_TYPE dry_run) { +static void update_state(const AV1_COMP *const cpi, TileDataEnc *tile_data, + ThreadData *td, PICK_MODE_CONTEXT *ctx, int mi_row, + int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run) { int i, x_idx, y; const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); RD_COUNTS *const rdc = &td->rd_counts; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; struct macroblock_plane *const p = x->plane; struct macroblockd_plane *const pd = xd->plane; - MODE_INFO *mi = &ctx->mic; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - MODE_INFO *mi_addr = xd->mi[0]; + MB_MODE_INFO *mi = &ctx->mic; + MB_MODE_INFO *const mi_addr = xd->mi[0]; const struct segmentation *const seg = &cm->seg; - const int bw = mi_size_wide[mi->mbmi.sb_type]; - const int bh = mi_size_high[mi->mbmi.sb_type]; + const int bw = mi_size_wide[mi->sb_type]; + const int bh = mi_size_high[mi->sb_type]; const int mis = cm->mi_stride; const int mi_width = mi_size_wide[bsize]; const int mi_height = mi_size_high[bsize]; - const int unify_bsize = CONFIG_CB4X4; - int8_t rf_type; - -#if !CONFIG_SUPERTX - assert(mi->mbmi.sb_type == bsize); -#endif + assert(mi->sb_type == bsize); *mi_addr = *mi; *x->mbmi_ext = ctx->mbmi_ext; -#if CONFIG_DUAL_FILTER - reset_intmv_filter_type(cm, xd, mbmi); -#endif + reset_intmv_filter_type(mi_addr); - rf_type = av1_ref_frame_type(mbmi->ref_frame); - if (x->mbmi_ext->ref_mv_count[rf_type] > 1 && - (mbmi->sb_type >= BLOCK_8X8 || unify_bsize)) { - set_ref_and_pred_mvs(x, mi->mbmi.pred_mv, rf_type); - } + memcpy(x->blk_skip, ctx->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + + x->skip = ctx->skip; // If segmentation in use if (seg->enabled) { @@ -616,34 +385,29 @@ static void update_state(const AV1_COMP *const cpi, ThreadData *td, if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { const uint8_t *const map = seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - mi_addr->mbmi.segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); - reset_tx_size(xd, &mi_addr->mbmi, cm->tx_mode); + mi_addr->segment_id = + map ? 
get_segment_id(cm, map, bsize, mi_row, mi_col) : 0; + reset_tx_size(x, mi_addr, cm->tx_mode); } // Else for cyclic refresh mode update the segment map, set the segment id // and then update the quantizer. if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { - av1_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row, mi_col, - bsize, ctx->rate, ctx->dist, x->skip); - reset_tx_size(xd, &mi_addr->mbmi, cm->tx_mode); + av1_cyclic_refresh_update_segment(cpi, mi_addr, mi_row, mi_col, bsize, + ctx->rate, ctx->dist, x->skip); + reset_tx_size(x, mi_addr, cm->tx_mode); } + if (mi_addr->uv_mode == UV_CFL_PRED && !is_cfl_allowed(xd)) + mi_addr->uv_mode = UV_DC_PRED; } - for (i = 0; i < MAX_MB_PLANE; ++i) { + for (i = 0; i < num_planes; ++i) { p[i].coeff = ctx->coeff[i]; p[i].qcoeff = ctx->qcoeff[i]; pd[i].dqcoeff = ctx->dqcoeff[i]; -#if CONFIG_PVQ - pd[i].pvq_ref_coeff = ctx->pvq_ref_coeff[i]; -#endif p[i].eobs = ctx->eobs[i]; -#if CONFIG_LV_MAP p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i]; -#endif // CONFIG_LV_MAP } for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i]; -#if CONFIG_MRC_TX - xd->mrc_mask = ctx->mrc_mask; -#endif // CONFIG_MRC_TX // Restore the coding context of the MB to that that was in place // when the mode was picked for it for (y = 0; y < mi_height; y++) @@ -653,26 +417,7 @@ static void update_state(const AV1_COMP *const cpi, ThreadData *td, xd->mi[x_idx + y * mis] = mi_addr; } -#if !CONFIG_EXT_DELTA_Q - if (cpi->oxcf.aq_mode > NO_AQ && cpi->oxcf.aq_mode < DELTA_AQ) - av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id); -#else - if (cpi->oxcf.aq_mode) - av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id); -#endif - - if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8 && !unify_bsize) { - mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int; - mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int; - } - - x->skip = ctx->skip; - -#if CONFIG_VAR_TX - for (i = 0; i < 1; ++i) - memcpy(x->blk_skip[i], ctx->blk_skip[i], - sizeof(uint8_t) * ctx->num_4x4_blk); -#endif + if (cpi->oxcf.aq_mode) av1_init_plane_quantizers(cpi, x, mi_addr->segment_id); if (dry_run) return; @@ -687,18 +432,16 @@ static void update_state(const AV1_COMP *const cpi, ThreadData *td, THR_H_PRED /*H_PRED*/, THR_D45_PRED /*D45_PRED*/, THR_D135_PRED /*D135_PRED*/, - THR_D117_PRED /*D117_PRED*/, - THR_D153_PRED /*D153_PRED*/, - THR_D207_PRED /*D207_PRED*/, - THR_D63_PRED /*D63_PRED*/, - THR_SMOOTH, /*SMOOTH_PRED*/ -#if CONFIG_SMOOTH_HV + THR_D113_PRED /*D113_PRED*/, + THR_D157_PRED /*D157_PRED*/, + THR_D203_PRED /*D203_PRED*/, + THR_D67_PRED /*D67_PRED*/, + THR_SMOOTH, /*SMOOTH_PRED*/ THR_SMOOTH_V, /*SMOOTH_V_PRED*/ THR_SMOOTH_H, /*SMOOTH_H_PRED*/ -#endif // CONFIG_SMOOTH_HV - THR_TM /*TM_PRED*/, + THR_PAETH /*PAETH_PRED*/, }; - ++mode_chosen_counts[kf_mode_index[mbmi->mode]]; + ++mode_chosen_counts[kf_mode_index[mi_addr->mode]]; } else { // Note how often each mode chosen as best ++mode_chosen_counts[ctx->best_mode_index]; @@ -706,188 +449,17 @@ static void update_state(const AV1_COMP *const cpi, ThreadData *td, } #endif if (!frame_is_intra_only(cm)) { - if (is_inter_block(mbmi)) { - av1_update_mv_count(td); -#if CONFIG_GLOBAL_MOTION - if (bsize >= BLOCK_8X8) { - // TODO(sarahparker): global motion stats need to be handled per-tile - // to be compatible with tile-based threading. 
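/*
 * Editor's note (illustration, not part of the patch): the
 * update_global_motion_used() calls in the surrounding hunk credit every 4x4
 * unit of a GLOBALMV / GLOBAL_GLOBALMV block to the reference frame(s) it
 * predicts from, so the encoder can later weigh actual usage against the cost
 * of signalling a global motion model. A standalone sketch with hypothetical
 * names; callers must keep ref_frames[] indices below kNumRefSketch.
 */
enum { kNumRefSketch = 8 };

static void tally_global_motion_used(int is_global_mode, int mi_wide,
                                     int mi_high, const int *ref_frames,
                                     int num_refs, int used[kNumRefSketch]) {
  if (!is_global_mode) return; /* only GLOBALMV / GLOBAL_GLOBALMV count */
  const int num_4x4s = mi_wide * mi_high; /* mi units are 4x4 pels */
  for (int r = 0; r < num_refs; ++r) used[ref_frames[r]] += num_4x4s;
}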
- update_global_motion_used(mbmi->mode, bsize, mbmi, rdc); - } else { - const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; - const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; - int idx, idy; - for (idy = 0; idy < 2; idy += num_4x4_h) { - for (idx = 0; idx < 2; idx += num_4x4_w) { - const int j = idy * 2 + idx; - update_global_motion_used(mi->bmi[j].as_mode, bsize, mbmi, rdc); - } - } - } -#endif // CONFIG_GLOBAL_MOTION - if (cm->interp_filter == SWITCHABLE -#if CONFIG_WARPED_MOTION - && mbmi->motion_mode != WARPED_CAUSAL -#endif // CONFIG_WARPED_MOTION -#if CONFIG_GLOBAL_MOTION - && !is_nontrans_global_motion(xd) -#endif // CONFIG_GLOBAL_MOTION - ) { -#if CONFIG_DUAL_FILTER - update_filter_type_count(td->counts, xd, mbmi); -#else - const int switchable_ctx = av1_get_pred_context_switchable_interp(xd); - const InterpFilter filter = - av1_extract_interp_filter(mbmi->interp_filters, 0); - ++td->counts->switchable_interp[switchable_ctx][filter]; -#endif - } - } - - rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff; - rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff; - rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff; - } - - const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col); - const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row); - av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis); -} - -#if CONFIG_SUPERTX -static void update_state_supertx(const AV1_COMP *const cpi, ThreadData *td, - PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, - BLOCK_SIZE bsize, RUN_TYPE dry_run) { - int y, x_idx; -#if CONFIG_VAR_TX - int i; -#endif - const AV1_COMMON *const cm = &cpi->common; - RD_COUNTS *const rdc = &td->rd_counts; - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *mi = &ctx->mic; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - MODE_INFO *mi_addr = xd->mi[0]; - const struct segmentation *const seg = &cm->seg; - const int mis = cm->mi_stride; - const int mi_width = mi_size_wide[bsize]; - const int mi_height = mi_size_high[bsize]; - const int unify_bsize = CONFIG_CB4X4; - int8_t rf_type; - - *mi_addr = *mi; - *x->mbmi_ext = ctx->mbmi_ext; - assert(is_inter_block(mbmi)); - assert(mbmi->tx_size == ctx->mic.mbmi.tx_size); - -#if CONFIG_DUAL_FILTER - reset_intmv_filter_type(cm, xd, mbmi); -#endif - - rf_type = av1_ref_frame_type(mbmi->ref_frame); - if (x->mbmi_ext->ref_mv_count[rf_type] > 1 && - (mbmi->sb_type >= BLOCK_8X8 || unify_bsize)) { - set_ref_and_pred_mvs(x, mi->mbmi.pred_mv, rf_type); - } - - // If segmentation in use - if (seg->enabled) { - if (cpi->vaq_refresh) { - const int energy = - bsize <= BLOCK_16X16 ? x->mb_energy : av1_block_energy(cpi, x, bsize); - mi_addr->mbmi.segment_id = av1_vaq_segment_id(energy); - } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { - // For cyclic refresh mode, now update the segment map - // and set the segment id. - av1_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row, mi_col, - bsize, ctx->rate, ctx->dist, 1); - } else { - // Otherwise just set the segment id based on the current segment map - const uint8_t *const map = - seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - mi_addr->mbmi.segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + if (is_inter_block(mi_addr)) { + // TODO(sarahparker): global motion stats need to be handled per-tile + // to be compatible with tile-based threading. 
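/*
 * Editor's note (illustration, not part of the patch): when
 * allow_update_cdf is set, update_filter_type_count() (defined earlier in
 * this diff and invoked in the new code just below) both bumps a frame-level
 * counter and adapts the entropy coder's CDF toward the interpolation filter
 * that was just chosen. The exact libaom rule lives in update_cdf(); the
 * sketch below is only schematic: an exponential decay of each CDF entry
 * toward the ideal CDF for the observed symbol, with a hypothetical
 * adaptation rate.
 */
#include <stdint.h>

#define SKETCH_PROB_TOP (1 << 15) /* probability one at 15-bit precision */

static void adapt_cdf_sketch(uint16_t *cdf, int symbol, int nsymbols,
                             int rate) {
  /* cdf[i] holds P(sym <= i); the ideal CDF for the observed symbol is 0
   * below it and 1 at and above it. Step each entry 2^-rate of the way. */
  for (int i = 0; i < nsymbols - 1; ++i) {
    const int target = (i < symbol) ? 0 : SKETCH_PROB_TOP;
    const int step = (target - (int)cdf[i]) >> rate;
    cdf[i] = (uint16_t)((int)cdf[i] + step);
  }
}
/* Usage in the spirit of the hunk below (names hypothetical):
 *   adapt_cdf_sketch(switchable_interp_cdf, filter, SWITCHABLE_FILTERS, 5); */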
+ update_global_motion_used(mi_addr->mode, bsize, mi_addr, rdc); } - mi_addr->mbmi.segment_id_supertx = MAX_SEGMENTS; - } - // Restore the coding context of the MB to that that was in place - // when the mode was picked for it - for (y = 0; y < mi_height; y++) - for (x_idx = 0; x_idx < mi_width; x_idx++) - if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx && - (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) { - xd->mi[x_idx + y * mis] = mi_addr; - } - -#if !CONFIG_CB4X4 - if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8) { - mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int; - mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int; - } -#endif - - x->skip = ctx->skip; - -#if CONFIG_VAR_TX - for (i = 0; i < 1; ++i) - memcpy(x->blk_skip[i], ctx->blk_skip[i], - sizeof(uint8_t) * ctx->num_4x4_blk); - - if (!is_inter_block(mbmi) || mbmi->skip) - mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size); -#endif // CONFIG_VAR_TX -#if CONFIG_VAR_TX - { - const TX_SIZE mtx = mbmi->tx_size; - const int num_4x4_blocks_wide = tx_size_wide_unit[mtx] >> 1; - const int num_4x4_blocks_high = tx_size_high_unit[mtx] >> 1; - int idy, idx; - mbmi->inter_tx_size[0][0] = mtx; - for (idy = 0; idy < num_4x4_blocks_high; ++idy) - for (idx = 0; idx < num_4x4_blocks_wide; ++idx) - mbmi->inter_tx_size[idy][idx] = mtx; - } -#endif // CONFIG_VAR_TX - // Turn motion variation off for supertx - mbmi->motion_mode = SIMPLE_TRANSLATION; - - if (dry_run) return; - - if (!frame_is_intra_only(cm)) { - av1_update_mv_count(td); - -#if CONFIG_GLOBAL_MOTION - if (is_inter_block(mbmi)) { - if (bsize >= BLOCK_8X8) { - // TODO(sarahparker): global motion stats need to be handled per-tile - // to be compatible with tile-based threading. - update_global_motion_used(mbmi->mode, bsize, mbmi, rdc); - } else { - const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; - const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; - int idx, idy; - for (idy = 0; idy < 2; idy += num_4x4_h) { - for (idx = 0; idx < 2; idx += num_4x4_w) { - const int j = idy * 2 + idx; - update_global_motion_used(mi->bmi[j].as_mode, bsize, mbmi, rdc); - } - } - } - } -#endif // CONFIG_GLOBAL_MOTION - - if (cm->interp_filter == SWITCHABLE -#if CONFIG_GLOBAL_MOTION - && !is_nontrans_global_motion(xd) -#endif // CONFIG_GLOBAL_MOTION - ) { -#if CONFIG_DUAL_FILTER - update_filter_type_count(td->counts, xd, mbmi); -#else - const int pred_ctx = av1_get_pred_context_switchable_interp(xd); - ++td->counts->switchable_interp[pred_ctx][mbmi->interp_filter]; -#endif + if (cm->interp_filter == SWITCHABLE && + mi_addr->motion_mode != WARPED_CAUSAL && + !is_nontrans_global_motion(xd, xd->mi[0])) { + update_filter_type_count(tile_data->allow_update_cdf, td->counts, xd, + mi_addr); } rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff; @@ -895,572 +467,114 @@ static void update_state_supertx(const AV1_COMP *const cpi, ThreadData *td, rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff; } - const int x_mis = AOMMIN(mi_width, cm->mi_cols - mi_col); - const int y_mis = AOMMIN(mi_height, cm->mi_rows - mi_row); + const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col); + const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row); av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis); } -static void update_state_sb_supertx(const AV1_COMP *const cpi, ThreadData *td, - const TileInfo *const tile, int mi_row, - int mi_col, BLOCK_SIZE bsize, - RUN_TYPE dry_run, PC_TREE *pc_tree) { - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = 
&td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - struct macroblock_plane *const p = x->plane; - struct macroblockd_plane *const pd = xd->plane; - int hbs = mi_size_wide[bsize] / 2; -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif - PARTITION_TYPE partition = pc_tree->partitioning; - BLOCK_SIZE subsize = get_subsize(bsize, partition); - int i; -#if CONFIG_EXT_PARTITION_TYPES - BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT); -#endif - PICK_MODE_CONTEXT *pmc = NULL; - - if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - - if (bsize == BLOCK_16X16 && cpi->vaq_refresh) - x->mb_energy = av1_block_energy(cpi, x, bsize); - - switch (partition) { - case PARTITION_NONE: - set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); - update_state_supertx(cpi, td, &pc_tree->none, mi_row, mi_col, subsize, - dry_run); - break; - case PARTITION_VERT: - set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); - update_state_supertx(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, - subsize, dry_run); - if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize)) { - set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize); - update_state_supertx(cpi, td, &pc_tree->vertical[1], mi_row, - mi_col + hbs, subsize, dry_run); - } - pmc = &pc_tree->vertical_supertx; - break; - case PARTITION_HORZ: - set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); - update_state_supertx(cpi, td, &pc_tree->horizontal[0], mi_row, mi_col, - subsize, dry_run); - if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize)) { - set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize); - update_state_supertx(cpi, td, &pc_tree->horizontal[1], mi_row + hbs, - mi_col, subsize, dry_run); - } - pmc = &pc_tree->horizontal_supertx; - break; - case PARTITION_SPLIT: - if (bsize == BLOCK_8X8 && !unify_bsize) { - set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); - update_state_supertx(cpi, td, pc_tree->leaf_split[0], mi_row, mi_col, - subsize, dry_run); - } else { - set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); - update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, subsize, dry_run, - pc_tree->split[0]); - set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize); - update_state_sb_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize, - dry_run, pc_tree->split[1]); - set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize); - update_state_sb_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize, - dry_run, pc_tree->split[2]); - set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, subsize); - update_state_sb_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, - subsize, dry_run, pc_tree->split[3]); - } - pmc = &pc_tree->split_supertx; - break; -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION_TYPES_AB -#error HORZ/VERT_A/B partitions not yet updated in superres code -#endif - case PARTITION_HORZ_A: - set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2); - update_state_supertx(cpi, td, &pc_tree->horizontala[0], mi_row, mi_col, - bsize2, dry_run); - set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, bsize2); - update_state_supertx(cpi, td, &pc_tree->horizontala[1], mi_row, - mi_col + hbs, bsize2, dry_run); - set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize); - update_state_supertx(cpi, td, &pc_tree->horizontala[2], mi_row + hbs, - mi_col, subsize, dry_run); - pmc = &pc_tree->horizontala_supertx; - break; - case PARTITION_HORZ_B: - 
set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); - update_state_supertx(cpi, td, &pc_tree->horizontalb[0], mi_row, mi_col, - subsize, dry_run); - set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, bsize2); - update_state_supertx(cpi, td, &pc_tree->horizontalb[1], mi_row + hbs, - mi_col, bsize2, dry_run); - set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, bsize2); - update_state_supertx(cpi, td, &pc_tree->horizontalb[2], mi_row + hbs, - mi_col + hbs, bsize2, dry_run); - pmc = &pc_tree->horizontalb_supertx; - break; - case PARTITION_VERT_A: - set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2); - update_state_supertx(cpi, td, &pc_tree->verticala[0], mi_row, mi_col, - bsize2, dry_run); - set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, bsize2); - update_state_supertx(cpi, td, &pc_tree->verticala[1], mi_row + hbs, - mi_col, bsize2, dry_run); - set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize); - update_state_supertx(cpi, td, &pc_tree->verticala[2], mi_row, - mi_col + hbs, subsize, dry_run); - pmc = &pc_tree->verticala_supertx; - break; - case PARTITION_VERT_B: - set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); - update_state_supertx(cpi, td, &pc_tree->verticalb[0], mi_row, mi_col, - subsize, dry_run); - set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, bsize2); - update_state_supertx(cpi, td, &pc_tree->verticalb[1], mi_row, - mi_col + hbs, bsize2, dry_run); - set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, bsize2); - update_state_supertx(cpi, td, &pc_tree->verticalb[2], mi_row + hbs, - mi_col + hbs, bsize2, dry_run); - pmc = &pc_tree->verticalb_supertx; - break; -#endif // CONFIG_EXT_PARTITION_TYPES - default: assert(0); - } - - for (i = 0; i < MAX_MB_PLANE; ++i) { - if (pmc != NULL) { - p[i].coeff = pmc->coeff[i]; - p[i].qcoeff = pmc->qcoeff[i]; - pd[i].dqcoeff = pmc->dqcoeff[i]; - p[i].eobs = pmc->eobs[i]; - } else { - // These should never be used - p[i].coeff = NULL; - p[i].qcoeff = NULL; - pd[i].dqcoeff = NULL; - p[i].eobs = NULL; - } - } -} - -static void update_supertx_param(ThreadData *td, PICK_MODE_CONTEXT *ctx, - int best_tx, TX_SIZE supertx_size) { - MACROBLOCK *const x = &td->mb; -#if CONFIG_VAR_TX - int i; - - for (i = 0; i < 1; ++i) - memcpy(ctx->blk_skip[i], x->blk_skip[i], - sizeof(uint8_t) * ctx->num_4x4_blk); - ctx->mic.mbmi.min_tx_size = get_min_tx_size(supertx_size); -#endif // CONFIG_VAR_TX - ctx->mic.mbmi.tx_size = supertx_size; - ctx->skip = x->skip; - ctx->mic.mbmi.tx_type = best_tx; -} - -static void update_supertx_param_sb(const AV1_COMP *const cpi, ThreadData *td, - int mi_row, int mi_col, BLOCK_SIZE bsize, - int best_tx, TX_SIZE supertx_size, - PC_TREE *pc_tree) { - const AV1_COMMON *const cm = &cpi->common; - const int hbs = mi_size_wide[bsize] / 2; - PARTITION_TYPE partition = pc_tree->partitioning; - BLOCK_SIZE subsize = get_subsize(bsize, partition); -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif -#if CONFIG_EXT_PARTITION_TYPES - int i; -#endif - - if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - - switch (partition) { - case PARTITION_NONE: - update_supertx_param(td, &pc_tree->none, best_tx, supertx_size); - break; - case PARTITION_VERT: - update_supertx_param(td, &pc_tree->vertical[0], best_tx, supertx_size); - if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize)) - update_supertx_param(td, &pc_tree->vertical[1], best_tx, supertx_size); - break; - case PARTITION_HORZ: - 
update_supertx_param(td, &pc_tree->horizontal[0], best_tx, supertx_size); - if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize)) - update_supertx_param(td, &pc_tree->horizontal[1], best_tx, - supertx_size); - break; - case PARTITION_SPLIT: - if (bsize == BLOCK_8X8 && !unify_bsize) { - update_supertx_param(td, pc_tree->leaf_split[0], best_tx, supertx_size); - } else { - update_supertx_param_sb(cpi, td, mi_row, mi_col, subsize, best_tx, - supertx_size, pc_tree->split[0]); - update_supertx_param_sb(cpi, td, mi_row, mi_col + hbs, subsize, best_tx, - supertx_size, pc_tree->split[1]); - update_supertx_param_sb(cpi, td, mi_row + hbs, mi_col, subsize, best_tx, - supertx_size, pc_tree->split[2]); - update_supertx_param_sb(cpi, td, mi_row + hbs, mi_col + hbs, subsize, - best_tx, supertx_size, pc_tree->split[3]); - } - break; -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION_TYPES_AB -#error HORZ/VERT_A/B partitions not yet updated in superres code -#endif - case PARTITION_HORZ_A: - for (i = 0; i < 3; i++) - update_supertx_param(td, &pc_tree->horizontala[i], best_tx, - supertx_size); - break; - case PARTITION_HORZ_B: - for (i = 0; i < 3; i++) - update_supertx_param(td, &pc_tree->horizontalb[i], best_tx, - supertx_size); - break; - case PARTITION_VERT_A: - for (i = 0; i < 3; i++) - update_supertx_param(td, &pc_tree->verticala[i], best_tx, supertx_size); - break; - case PARTITION_VERT_B: - for (i = 0; i < 3; i++) - update_supertx_param(td, &pc_tree->verticalb[i], best_tx, supertx_size); - break; -#endif // CONFIG_EXT_PARTITION_TYPES - default: assert(0); - } -} -#endif // CONFIG_SUPERTX - -#if CONFIG_MOTION_VAR && NC_MODE_INFO -static void set_mode_info_b(const AV1_COMP *const cpi, - const TileInfo *const tile, ThreadData *td, - int mi_row, int mi_col, BLOCK_SIZE bsize, - PICK_MODE_CONTEXT *ctx) { - MACROBLOCK *const x = &td->mb; - set_offsets(cpi, tile, x, mi_row, mi_col, bsize); - update_state(cpi, td, ctx, mi_row, mi_col, bsize, 1); -} - -static void set_mode_info_sb(const AV1_COMP *const cpi, ThreadData *td, - const TileInfo *const tile, TOKENEXTRA **tp, - int mi_row, int mi_col, BLOCK_SIZE bsize, - PC_TREE *pc_tree) { - const AV1_COMMON *const cm = &cpi->common; - const int hbs = mi_size_wide[bsize] / 2; - const PARTITION_TYPE partition = pc_tree->partitioning; - BLOCK_SIZE subsize = get_subsize(bsize, partition); -#if CONFIG_EXT_PARTITION_TYPES - const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT); - const int quarter_step = mi_size_wide[bsize] / 4; -#endif -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; - assert(bsize >= BLOCK_8X8); -#endif - - if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - - switch (partition) { - case PARTITION_NONE: - set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize, &pc_tree->none); - break; - case PARTITION_VERT: - set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize, - &pc_tree->vertical[0]); - if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize)) { - set_mode_info_b(cpi, tile, td, mi_row, mi_col + hbs, subsize, - &pc_tree->vertical[1]); - } - break; - case PARTITION_HORZ: - set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize, - &pc_tree->horizontal[0]); - if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize)) { - set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col, subsize, - &pc_tree->horizontal[1]); - } - break; - case PARTITION_SPLIT: - if (bsize == BLOCK_8X8 && !unify_bsize) { - set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize, - 
pc_tree->leaf_split[0]); - } else { - set_mode_info_sb(cpi, td, tile, tp, mi_row, mi_col, subsize, - pc_tree->split[0]); - set_mode_info_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, subsize, - pc_tree->split[1]); - set_mode_info_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, subsize, - pc_tree->split[2]); - set_mode_info_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, subsize, - pc_tree->split[3]); - } - break; -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION_TYPES_AB -#error NC_MODE_INFO+MOTION_VAR not yet supported for new HORZ/VERT_AB partitions -#endif - case PARTITION_HORZ_A: - set_mode_info_b(cpi, tile, td, mi_row, mi_col, bsize2, - &pc_tree->horizontala[0]); - set_mode_info_b(cpi, tile, td, mi_row, mi_col + hbs, bsize2, - &pc_tree->horizontala[1]); - set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col, subsize, - &pc_tree->horizontala[2]); - break; - case PARTITION_HORZ_B: - set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize, - &pc_tree->horizontalb[0]); - set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col, bsize2, - &pc_tree->horizontalb[1]); - set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col + hbs, bsize2, - &pc_tree->horizontalb[2]); - break; - case PARTITION_VERT_A: - set_mode_info_b(cpi, tile, td, mi_row, mi_col, bsize2, - &pc_tree->verticala[0]); - set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col, bsize2, - &pc_tree->verticala[1]); - set_mode_info_b(cpi, tile, td, mi_row, mi_col + hbs, subsize, - &pc_tree->verticala[2]); - break; - case PARTITION_VERT_B: - set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize, - &pc_tree->verticalb[0]); - set_mode_info_b(cpi, tile, td, mi_row, mi_col + hbs, bsize2, - &pc_tree->verticalb[1]); - set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col + hbs, bsize2, - &pc_tree->verticalb[2]); - break; - case PARTITION_HORZ_4: - for (int i = 0; i < 4; ++i) { - int this_mi_row = mi_row + i * quarter_step; - if (i > 0 && this_mi_row >= cm->mi_rows) break; - - set_mode_info_b(cpi, tile, td, this_mi_row, mi_col, subsize, - &pc_tree->horizontal4[i]); - } - break; - case PARTITION_VERT_4: - for (int i = 0; i < 4; ++i) { - int this_mi_col = mi_col + i * quarter_step; - if (i > 0 && this_mi_col >= cm->mi_cols) break; - - set_mode_info_b(cpi, tile, td, mi_row, this_mi_col, subsize, - &pc_tree->vertical4[i]); - } - break; -#endif // CONFIG_EXT_PARTITION_TYPES - default: assert(0 && "Invalid partition type."); break; - } -} - -#if CONFIG_NCOBMC_ADAPT_WEIGHT -static void av1_get_ncobmc_mode_rd(const AV1_COMP *const cpi, - MACROBLOCK *const x, MACROBLOCKD *const xd, - int bsize, const int mi_row, - const int mi_col, NCOBMC_MODE *mode) { - const AV1_COMMON *const cm = &cpi->common; - const int mi_width = mi_size_wide[bsize]; - const int mi_height = mi_size_high[bsize]; - - assert(bsize >= BLOCK_8X8); - - reset_xd_boundary(xd, mi_row, mi_height, mi_col, mi_width, cm->mi_rows, - cm->mi_cols); - - // set up source buffers before calling the mode searching function - av1_setup_src_planes(x, cpi->source, mi_row, mi_col); - - *mode = get_ncobmc_mode(cpi, x, xd, mi_row, mi_col, bsize); -} -static void get_ncobmc_intrpl_pred(const AV1_COMP *const cpi, ThreadData *td, - int mi_row, int mi_col, BLOCK_SIZE bsize) { - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const int mi_width = mi_size_wide[bsize]; - const int mi_height = mi_size_high[bsize]; - const int hbs = AOMMAX(mi_size_wide[bsize] / 2, mi_size_high[bsize] / 2); - const BLOCK_SIZE sqr_blk = bsize_2_sqr_bsize[bsize]; - - if 
(mi_width > mi_height) { - // horizontal partition - av1_get_ncobmc_mode_rd(cpi, x, xd, sqr_blk, mi_row, mi_col, - &mbmi->ncobmc_mode[0]); - xd->mi += hbs; - av1_get_ncobmc_mode_rd(cpi, x, xd, sqr_blk, mi_row, mi_col + hbs, - &mbmi->ncobmc_mode[1]); - } else if (mi_height > mi_width) { - // vertical partition - av1_get_ncobmc_mode_rd(cpi, x, xd, sqr_blk, mi_row, mi_col, - &mbmi->ncobmc_mode[0]); - xd->mi += hbs * xd->mi_stride; - av1_get_ncobmc_mode_rd(cpi, x, xd, sqr_blk, mi_row + hbs, mi_col, - &mbmi->ncobmc_mode[1]); - } else { - av1_get_ncobmc_mode_rd(cpi, x, xd, sqr_blk, mi_row, mi_col, - &mbmi->ncobmc_mode[0]); - } - // restore the info - av1_setup_src_planes(x, cpi->source, mi_row, mi_col); - set_mode_info_offsets(cpi, x, xd, mi_row, mi_col); -} -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT -#endif // CONFIG_MOTION_VAR && (CONFIG_NCOBMC || CONFIG_NCOBMC_ADAPT_WEIGHT) - void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, - int mi_row, int mi_col) { - uint8_t *const buffers[3] = { src->y_buffer, src->u_buffer, src->v_buffer }; - const int widths[3] = { src->y_crop_width, src->uv_crop_width, - src->uv_crop_width }; - const int heights[3] = { src->y_crop_height, src->uv_crop_height, - src->uv_crop_height }; - const int strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; - int i; - + int mi_row, int mi_col, const int num_planes) { // Set current frame pointer. x->e_mbd.cur_buf = src; - for (i = 0; i < MAX_MB_PLANE; i++) - setup_pred_plane(&x->plane[i].src, x->e_mbd.mi[0]->mbmi.sb_type, buffers[i], - widths[i], heights[i], strides[i], mi_row, mi_col, NULL, + // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet + // the static analysis warnings. + for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); i++) { + const int is_uv = i > 0; + setup_pred_plane(&x->plane[i].src, x->e_mbd.mi[0]->sb_type, src->buffers[i], + src->crop_widths[is_uv], src->crop_heights[is_uv], + src->strides[is_uv], mi_row, mi_col, NULL, x->e_mbd.plane[i].subsampling_x, x->e_mbd.plane[i].subsampling_y); + } } static int set_segment_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, int8_t segment_id) { - int segment_qindex; const AV1_COMMON *const cm = &cpi->common; av1_init_plane_quantizers(cpi, x, segment_id); aom_clear_system_state(); - segment_qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex); + int segment_qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex); return av1_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q); } -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 -static void dist_8x8_set_sub8x8_dst(MACROBLOCK *const x, uint8_t *dst8x8, - BLOCK_SIZE bsize, int bw, int bh, - int mi_row, int mi_col) { - MACROBLOCKD *const xd = &x->e_mbd; - struct macroblockd_plane *const pd = &xd->plane[0]; - const int dst_stride = pd->dst.stride; - uint8_t *dst = pd->dst.buf; - - assert(bsize < BLOCK_8X8); - - if (bsize < BLOCK_8X8) { - int i, j; -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - uint16_t *dst8x8_16 = (uint16_t *)dst8x8; - uint16_t *dst_sub8x8 = &dst8x8_16[((mi_row & 1) * 8 + (mi_col & 1)) << 2]; - - for (j = 0; j < bh; ++j) - for (i = 0; i < bw; ++i) - dst_sub8x8[j * 8 + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i]; - } else { -#endif - uint8_t *dst_sub8x8 = &dst8x8[((mi_row & 1) * 8 + (mi_col & 1)) << 2]; +static int set_deltaq_rdmult(const AV1_COMP *const cpi, MACROBLOCKD *const xd) { + const AV1_COMMON *const cm = &cpi->common; - for (j = 0; j < bh; ++j) - for (i = 0; i < bw; ++i) - dst_sub8x8[j * 8 + i] = 
dst[j * dst_stride + i]; -#if CONFIG_HIGHBITDEPTH - } -#endif - } + return av1_compute_rd_mult( + cpi, cm->base_qindex + xd->delta_qindex + cm->y_dc_delta_q); } -#endif static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *const x, int mi_row, int mi_col, - RD_STATS *rd_cost, -#if CONFIG_SUPERTX - int *totalrate_nocoef, -#endif -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_TYPE partition, -#endif + RD_STATS *rd_cost, PARTITION_TYPE partition, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd) { const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi; + MB_MODE_INFO *ctx_mbmi = &ctx->mic; struct macroblock_plane *const p = x->plane; struct macroblockd_plane *const pd = xd->plane; const AQ_MODE aq_mode = cpi->oxcf.aq_mode; + const DELTAQ_MODE deltaq_mode = cpi->oxcf.deltaq_mode; int i, orig_rdmult; aom_clear_system_state(); -#if CONFIG_PVQ - x->pvq_speed = 1; - x->pvq_coded = 0; -#endif - set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); - mbmi = &xd->mi[0]->mbmi; - mbmi->sb_type = bsize; + + mbmi = xd->mi[0]; + + if (ctx->rd_mode_is_ready) { + assert(ctx_mbmi->sb_type == bsize); + assert(ctx_mbmi->partition == partition); + *mbmi = *ctx_mbmi; + rd_cost->rate = ctx->rate; + rd_cost->dist = ctx->dist; + rd_cost->rdcost = ctx->rdcost; + } else { + mbmi->sb_type = bsize; + mbmi->partition = partition; + } + #if CONFIG_RD_DEBUG mbmi->mi_row = mi_row; mbmi->mi_col = mi_col; #endif -#if CONFIG_SUPERTX - // We set tx_size here as skip blocks would otherwise not set it. - // tx_size needs to be set at this point as supertx_enable in - // write_modes_sb is computed based on this, and if the garbage in memory - // just happens to be the supertx_size, then the packer will code this - // block as a supertx block, even if rdopt did not pick it as such. - mbmi->tx_size = max_txsize_lookup[bsize]; -#endif -#if CONFIG_EXT_PARTITION_TYPES - mbmi->partition = partition; -#endif - for (i = 0; i < MAX_MB_PLANE; ++i) { + for (i = 0; i < num_planes; ++i) { p[i].coeff = ctx->coeff[i]; p[i].qcoeff = ctx->qcoeff[i]; pd[i].dqcoeff = ctx->dqcoeff[i]; -#if CONFIG_PVQ - pd[i].pvq_ref_coeff = ctx->pvq_ref_coeff[i]; -#endif p[i].eobs = ctx->eobs[i]; -#if CONFIG_LV_MAP p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i]; -#endif } for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i]; -#if CONFIG_MRC_TX - xd->mrc_mask = ctx->mrc_mask; -#endif // CONFIG_MRC_TX - ctx->skippable = 0; + if (!ctx->rd_mode_is_ready) { + ctx->skippable = 0; - // Set to zero to make sure we do not use the previous encoded frame stats - mbmi->skip = 0; + // Set to zero to make sure we do not use the previous encoded frame stats + mbmi->skip = 0; + + // Reset skip mode flag. 
+ mbmi->skip_mode = 0; + } -#if CONFIG_CB4X4 x->skip_chroma_rd = !is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y); -#endif -#if CONFIG_HIGHBITDEPTH + if (ctx->rd_mode_is_ready) { + x->skip = ctx->skip; + *x->mbmi_ext = ctx->mbmi_ext; + return; + } + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { x->source_variance = av1_high_get_sby_perpixel_variance( cpi, &x->plane[0].src, bsize, xd->bd); @@ -1468,10 +582,6 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data, x->source_variance = av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); } -#else - x->source_variance = - av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); -#endif // CONFIG_HIGHBITDEPTH // Save rdmult before it might be changed, so it can be restored later. orig_rdmult = x->rdmult; @@ -1481,8 +591,6 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data, const int energy = bsize <= BLOCK_16X16 ? x->mb_energy : av1_block_energy(cpi, x, bsize); mbmi->segment_id = av1_vaq_segment_id(energy); - // Re-initialise quantiser - av1_init_plane_quantizers(cpi, x, mbmi->segment_id); } x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id); } else if (aq_mode == COMPLEXITY_AQ) { @@ -1493,29 +601,20 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data, x->rdmult = av1_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); } + if (deltaq_mode > 0) x->rdmult = set_deltaq_rdmult(cpi, xd); + // Find best coding mode & reconstruct the MB so it is available // as a predictor for MBs that follow in the SB if (frame_is_intra_only(cm)) { - av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd); -#if CONFIG_SUPERTX - *totalrate_nocoef = 0; -#endif // CONFIG_SUPERTX + av1_rd_pick_intra_mode_sb(cpi, x, mi_row, mi_col, rd_cost, bsize, ctx, + best_rd); } else { if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col, rd_cost, bsize, ctx, best_rd); -#if CONFIG_SUPERTX - *totalrate_nocoef = rd_cost->rate; -#endif // CONFIG_SUPERTX } else { av1_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost, -#if CONFIG_SUPERTX - totalrate_nocoef, -#endif // CONFIG_SUPERTX bsize, ctx, best_rd); -#if CONFIG_SUPERTX - assert(*totalrate_nocoef >= 0); -#endif // CONFIG_SUPERTX } } @@ -1523,9 +622,7 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data, if ((rd_cost->rate != INT_MAX) && (aq_mode == COMPLEXITY_AQ) && (bsize >= BLOCK_16X16) && (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame || -#if CONFIG_EXT_REFS cpi->refresh_alt2_ref_frame || -#endif // CONFIG_EXT_REFS (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref))) { av1_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate); } @@ -1538,363 +635,630 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data, ctx->rate = rd_cost->rate; ctx->dist = rd_cost->dist; + ctx->rdcost = rd_cost->rdcost; } -static void update_inter_mode_stats(FRAME_COUNTS *counts, PREDICTION_MODE mode, - int16_t mode_context) { +static void update_inter_mode_stats(FRAME_CONTEXT *fc, FRAME_COUNTS *counts, + PREDICTION_MODE mode, int16_t mode_context, + uint8_t allow_update_cdf) { + (void)counts; + int16_t mode_ctx = mode_context & NEWMV_CTX_MASK; if (mode == NEWMV) { +#if CONFIG_ENTROPY_STATS ++counts->newmv_mode[mode_ctx][0]; +#endif + if (allow_update_cdf) update_cdf(fc->newmv_cdf[mode_ctx], 0, 2); return; } else { 
+#if CONFIG_ENTROPY_STATS ++counts->newmv_mode[mode_ctx][1]; +#endif + if (allow_update_cdf) update_cdf(fc->newmv_cdf[mode_ctx], 1, 2); - if (mode_context & (1 << ALL_ZERO_FLAG_OFFSET)) { - return; - } - - mode_ctx = (mode_context >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK; - if (mode == ZEROMV) { + mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + if (mode == GLOBALMV) { +#if CONFIG_ENTROPY_STATS ++counts->zeromv_mode[mode_ctx][0]; +#endif + if (allow_update_cdf) update_cdf(fc->zeromv_cdf[mode_ctx], 0, 2); return; } else { +#if CONFIG_ENTROPY_STATS ++counts->zeromv_mode[mode_ctx][1]; +#endif + if (allow_update_cdf) update_cdf(fc->zeromv_cdf[mode_ctx], 1, 2); mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; +#if CONFIG_ENTROPY_STATS + ++counts->refmv_mode[mode_ctx][mode != NEARESTMV]; +#endif + if (allow_update_cdf) + update_cdf(fc->refmv_cdf[mode_ctx], mode != NEARESTMV, 2); + } + } +} + +static void update_palette_cdf(MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi, + FRAME_COUNTS *counts, uint8_t allow_update_cdf) { + FRAME_CONTEXT *fc = xd->tile_ctx; + const BLOCK_SIZE bsize = mbmi->sb_type; + const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const int palette_bsize_ctx = av1_get_palette_bsize_ctx(bsize); + + (void)counts; - if (mode_context & (1 << SKIP_NEARESTMV_OFFSET)) mode_ctx = 6; - if (mode_context & (1 << SKIP_NEARMV_OFFSET)) mode_ctx = 7; - if (mode_context & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) mode_ctx = 8; + if (mbmi->mode == DC_PRED) { + const int n = pmi->palette_size[0]; + const int palette_mode_ctx = av1_get_palette_mode_ctx(xd); - ++counts->refmv_mode[mode_ctx][mode != NEARESTMV]; +#if CONFIG_ENTROPY_STATS + ++counts->palette_y_mode[palette_bsize_ctx][palette_mode_ctx][n > 0]; +#endif + if (allow_update_cdf) + update_cdf(fc->palette_y_mode_cdf[palette_bsize_ctx][palette_mode_ctx], + n > 0, 2); + if (n > 0) { +#if CONFIG_ENTROPY_STATS + ++counts->palette_y_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE]; +#endif + if (allow_update_cdf) { + update_cdf(fc->palette_y_size_cdf[palette_bsize_ctx], + n - PALETTE_MIN_SIZE, PALETTE_SIZES); + } + } + } + + if (mbmi->uv_mode == UV_DC_PRED) { + const int n = pmi->palette_size[1]; + const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0); + +#if CONFIG_ENTROPY_STATS + ++counts->palette_uv_mode[palette_uv_mode_ctx][n > 0]; +#endif + if (allow_update_cdf) + update_cdf(fc->palette_uv_mode_cdf[palette_uv_mode_ctx], n > 0, 2); + + if (n > 0) { +#if CONFIG_ENTROPY_STATS + ++counts->palette_uv_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE]; +#endif + if (allow_update_cdf) { + update_cdf(fc->palette_uv_size_cdf[palette_bsize_ctx], + n - PALETTE_MIN_SIZE, PALETTE_SIZES); + } } } } -static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row, - int mi_col -#if CONFIG_SUPERTX - , - int supertx_enabled +static void sum_intra_stats(const AV1_COMMON *const cm, FRAME_COUNTS *counts, + MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi, + const MB_MODE_INFO *above_mi, + const MB_MODE_INFO *left_mi, const int intraonly, + const int mi_row, const int mi_col, + uint8_t allow_update_cdf) { + FRAME_CONTEXT *fc = xd->tile_ctx; + const PREDICTION_MODE y_mode = mbmi->mode; + const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; + (void)counts; + const BLOCK_SIZE bsize = mbmi->sb_type; + + if (intraonly) { +#if CONFIG_ENTROPY_STATS + const PREDICTION_MODE above = av1_above_block_mode(above_mi); + const PREDICTION_MODE left = av1_left_block_mode(left_mi); + const int above_ctx = intra_mode_context[above]; + 
const int left_ctx = intra_mode_context[left]; + ++counts->kf_y_mode[above_ctx][left_ctx][y_mode]; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) + update_cdf(get_y_mode_cdf(fc, above_mi, left_mi), y_mode, INTRA_MODES); + } else { +#if CONFIG_ENTROPY_STATS + ++counts->y_mode[size_group_lookup[bsize]][y_mode]; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) + update_cdf(fc->y_mode_cdf[size_group_lookup[bsize]], y_mode, INTRA_MODES); + } + + if (av1_filter_intra_allowed(cm, mbmi)) { + const int use_filter_intra_mode = + mbmi->filter_intra_mode_info.use_filter_intra; +#if CONFIG_ENTROPY_STATS + ++counts->filter_intra[mbmi->sb_type][use_filter_intra_mode]; + if (use_filter_intra_mode) { + ++counts + ->filter_intra_mode[mbmi->filter_intra_mode_info.filter_intra_mode]; + } +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) { + update_cdf(fc->filter_intra_cdfs[mbmi->sb_type], use_filter_intra_mode, + 2); + if (use_filter_intra_mode) { + update_cdf(fc->filter_intra_mode_cdf, + mbmi->filter_intra_mode_info.filter_intra_mode, + FILTER_INTRA_MODES); + } + } + } + if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) { +#if CONFIG_ENTROPY_STATS + ++counts->angle_delta[mbmi->mode - V_PRED] + [mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA]; +#endif + if (allow_update_cdf) { + update_cdf(fc->angle_delta_cdf[mbmi->mode - V_PRED], + mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA, + 2 * MAX_ANGLE_DELTA + 1); + } + } + + if (!is_chroma_reference(mi_row, mi_col, bsize, + xd->plane[AOM_PLANE_U].subsampling_x, + xd->plane[AOM_PLANE_U].subsampling_y)) + return; + +#if CONFIG_ENTROPY_STATS + ++counts->uv_mode[is_cfl_allowed(xd)][y_mode][uv_mode]; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) { + const CFL_ALLOWED_TYPE cfl_allowed = is_cfl_allowed(xd); + update_cdf(fc->uv_mode_cdf[cfl_allowed][y_mode], uv_mode, + UV_INTRA_MODES - !cfl_allowed); + } + if (uv_mode == UV_CFL_PRED) { + const int joint_sign = mbmi->cfl_alpha_signs; + const int idx = mbmi->cfl_alpha_idx; + +#if CONFIG_ENTROPY_STATS + ++counts->cfl_sign[joint_sign]; +#endif + if (allow_update_cdf) + update_cdf(fc->cfl_sign_cdf, joint_sign, CFL_JOINT_SIGNS); + if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) { + aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)]; + +#if CONFIG_ENTROPY_STATS + ++counts->cfl_alpha[CFL_CONTEXT_U(joint_sign)][CFL_IDX_U(idx)]; +#endif + if (allow_update_cdf) + update_cdf(cdf_u, CFL_IDX_U(idx), CFL_ALPHABET_SIZE); + } + if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) { + aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)]; + +#if CONFIG_ENTROPY_STATS + ++counts->cfl_alpha[CFL_CONTEXT_V(joint_sign)][CFL_IDX_V(idx)]; #endif - ) { + if (allow_update_cdf) + update_cdf(cdf_v, CFL_IDX_V(idx), CFL_ALPHABET_SIZE); + } + } + if (av1_is_directional_mode(get_uv_mode(uv_mode)) && + av1_use_angle_delta(bsize)) { +#if CONFIG_ENTROPY_STATS + ++counts->angle_delta[uv_mode - UV_V_PRED] + [mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA]; +#endif + if (allow_update_cdf) { + update_cdf(fc->angle_delta_cdf[uv_mode - UV_V_PRED], + mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA, + 2 * MAX_ANGLE_DELTA + 1); + } + } + if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) + update_palette_cdf(xd, mbmi, counts, allow_update_cdf); +} + +static void update_stats(const AV1_COMMON *const cm, TileDataEnc *tile_data, + ThreadData *td, int mi_row, int mi_col) { MACROBLOCK *x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; - const MODE_INFO *const mi = xd->mi[0]; - 
const MB_MODE_INFO *const mbmi = &mi->mbmi; + const MB_MODE_INFO *const mbmi = xd->mi[0]; const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; const BLOCK_SIZE bsize = mbmi->sb_type; FRAME_CONTEXT *fc = xd->tile_ctx; + const uint8_t allow_update_cdf = tile_data->allow_update_cdf; // delta quant applies to both intra and inter - int super_block_upper_left = - ((mi_row & MAX_MIB_MASK) == 0) && ((mi_col & MAX_MIB_MASK) == 0); + const int super_block_upper_left = + ((mi_row & (cm->seq_params.mib_size - 1)) == 0) && + ((mi_col & (cm->seq_params.mib_size - 1)) == 0); + + const int seg_ref_active = + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); + + if (cm->skip_mode_flag && !seg_ref_active && is_comp_ref_allowed(bsize)) { + const int skip_mode_ctx = av1_get_skip_mode_context(xd); +#if CONFIG_ENTROPY_STATS + td->counts->skip_mode[skip_mode_ctx][mbmi->skip_mode]++; +#endif + if (allow_update_cdf) + update_cdf(fc->skip_mode_cdfs[skip_mode_ctx], mbmi->skip_mode, 2); + } + + if (!mbmi->skip_mode) { + if (!seg_ref_active) { + const int skip_ctx = av1_get_skip_context(xd); +#if CONFIG_ENTROPY_STATS + td->counts->skip[skip_ctx][mbmi->skip]++; +#endif + if (allow_update_cdf) update_cdf(fc->skip_cdfs[skip_ctx], mbmi->skip, 2); + } + } - if (cm->delta_q_present_flag && (bsize != cm->sb_size || !mbmi->skip) && + if (cm->delta_q_present_flag && + (bsize != cm->seq_params.sb_size || !mbmi->skip) && super_block_upper_left) { - const int dq = (mbmi->current_q_index - xd->prev_qindex) / cm->delta_q_res; +#if CONFIG_ENTROPY_STATS + const int dq = + (mbmi->current_qindex - xd->current_qindex) / cm->delta_q_res; const int absdq = abs(dq); - int i; - for (i = 0; i < AOMMIN(absdq, DELTA_Q_SMALL); ++i) { + for (int i = 0; i < AOMMIN(absdq, DELTA_Q_SMALL); ++i) { td->counts->delta_q[i][1]++; } if (absdq < DELTA_Q_SMALL) td->counts->delta_q[absdq][0]++; - xd->prev_qindex = mbmi->current_q_index; -#if CONFIG_EXT_DELTA_Q -#if CONFIG_LOOPFILTER_LEVEL +#endif + xd->current_qindex = mbmi->current_qindex; if (cm->delta_lf_present_flag) { if (cm->delta_lf_multi) { - for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) { + const int frame_lf_count = + av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { +#if CONFIG_ENTROPY_STATS const int delta_lf = - (mbmi->curr_delta_lf[lf_id] - xd->prev_delta_lf[lf_id]) / - cm->delta_lf_res; + (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) / cm->delta_lf_res; const int abs_delta_lf = abs(delta_lf); - for (i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) { + for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) { td->counts->delta_lf_multi[lf_id][i][1]++; } if (abs_delta_lf < DELTA_LF_SMALL) td->counts->delta_lf_multi[lf_id][abs_delta_lf][0]++; - xd->prev_delta_lf[lf_id] = mbmi->curr_delta_lf[lf_id]; +#endif + xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id]; } } else { +#if CONFIG_ENTROPY_STATS const int delta_lf = - (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) / + (mbmi->delta_lf_from_base - xd->delta_lf_from_base) / cm->delta_lf_res; const int abs_delta_lf = abs(delta_lf); - for (i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) { + for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) { td->counts->delta_lf[i][1]++; } if (abs_delta_lf < DELTA_LF_SMALL) td->counts->delta_lf[abs_delta_lf][0]++; - xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base; - } - } -#else - if (cm->delta_lf_present_flag) { - const int dlf = - (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) / - cm->delta_lf_res; - const int absdlf = abs(dlf); - for (i = 0; i < AOMMIN(absdlf, DELTA_LF_SMALL); ++i) { - td->counts->delta_lf[i][1]++; +#endif + xd->delta_lf_from_base = mbmi->delta_lf_from_base; } - if (absdlf < DELTA_LF_SMALL) td->counts->delta_lf[absdlf][0]++; - xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base; } -#endif // CONFIG_LOOPFILTER_LEVEL -#endif } + + if (!is_inter_block(mbmi)) { + sum_intra_stats(cm, td->counts, xd, mbmi, xd->above_mbmi, xd->left_mbmi, + frame_is_intra_only(cm), mi_row, mi_col, + tile_data->allow_update_cdf); + } + + if (av1_allow_intrabc(cm)) { + if (allow_update_cdf) + update_cdf(fc->intrabc_cdf, is_intrabc_block(mbmi), 2); +#if CONFIG_ENTROPY_STATS + ++td->counts->intrabc[is_intrabc_block(mbmi)]; +#endif // CONFIG_ENTROPY_STATS + } + if (!frame_is_intra_only(cm)) { - FRAME_COUNTS *const counts = td->counts; RD_COUNTS *rdc = &td->rd_counts; + + FRAME_COUNTS *const counts = td->counts; + + if (mbmi->skip_mode) { + rdc->skip_mode_used_flag = 1; + if (cm->reference_mode == REFERENCE_MODE_SELECT) { + assert(has_second_ref(mbmi)); + rdc->compound_ref_used_flag = 1; + } + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + return; + } + const int inter_block = is_inter_block(mbmi); - const int seg_ref_active = - segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); + if (!seg_ref_active) { -#if CONFIG_SUPERTX - if (!supertx_enabled) -#endif - counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++; -#if CONFIG_NEW_MULTISYMBOL - update_cdf(fc->intra_inter_cdf[av1_get_intra_inter_context(xd)], - inter_block, 2); +#if CONFIG_ENTROPY_STATS + counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++; #endif + if (allow_update_cdf) { + update_cdf(fc->intra_inter_cdf[av1_get_intra_inter_context(xd)], + inter_block, 2); + } // If the segment reference feature is enabled we have only a single // reference frame allowed for the segment so exclude it from // the reference frame counts used to work out probabilities. 
if (inter_block) { const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0]; -#if CONFIG_EXT_REFS const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1]; -#endif // CONFIG_EXT_REFS + + av1_collect_neighbors_ref_counts(xd); if (cm->reference_mode == REFERENCE_MODE_SELECT) { if (has_second_ref(mbmi)) // This flag is also updated for 4x4 blocks rdc->compound_ref_used_flag = 1; - else - // This flag is also updated for 4x4 blocks - rdc->single_ref_used_flag = 1; - if (is_comp_ref_allowed(mbmi->sb_type)) { - counts->comp_inter[av1_get_reference_mode_context(cm, xd)] + if (is_comp_ref_allowed(bsize)) { +#if CONFIG_ENTROPY_STATS + counts->comp_inter[av1_get_reference_mode_context(xd)] [has_second_ref(mbmi)]++; -#if CONFIG_NEW_MULTISYMBOL - update_cdf(av1_get_reference_mode_cdf(cm, xd), has_second_ref(mbmi), - 2); -#endif // CONFIG_NEW_MULTISYMBOL +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) { + update_cdf(av1_get_reference_mode_cdf(xd), has_second_ref(mbmi), + 2); + } } } if (has_second_ref(mbmi)) { -#if CONFIG_EXT_COMP_REFS const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi) ? UNIDIR_COMP_REFERENCE : BIDIR_COMP_REFERENCE; -#if !USE_UNI_COMP_REFS - // TODO(zoeliu): Temporarily turn off uni-directional comp refs - assert(comp_ref_type == BIDIR_COMP_REFERENCE); -#endif // !USE_UNI_COMP_REFS + if (allow_update_cdf) { + update_cdf(av1_get_comp_reference_type_cdf(xd), comp_ref_type, + COMP_REFERENCE_TYPES); + } +#if CONFIG_ENTROPY_STATS counts->comp_ref_type[av1_get_comp_reference_type_context(xd)] [comp_ref_type]++; +#endif // CONFIG_ENTROPY_STATS if (comp_ref_type == UNIDIR_COMP_REFERENCE) { const int bit = (ref0 == BWDREF_FRAME); + if (allow_update_cdf) + update_cdf(av1_get_pred_cdf_uni_comp_ref_p(xd), bit, 2); +#if CONFIG_ENTROPY_STATS counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p(xd)][0] [bit]++; +#endif // CONFIG_ENTROPY_STATS if (!bit) { const int bit1 = (ref1 == LAST3_FRAME || ref1 == GOLDEN_FRAME); + if (allow_update_cdf) + update_cdf(av1_get_pred_cdf_uni_comp_ref_p1(xd), bit1, 2); +#if CONFIG_ENTROPY_STATS counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p1(xd)][1] [bit1]++; +#endif // CONFIG_ENTROPY_STATS if (bit1) { + if (allow_update_cdf) { + update_cdf(av1_get_pred_cdf_uni_comp_ref_p2(xd), + ref1 == GOLDEN_FRAME, 2); + } +#if CONFIG_ENTROPY_STATS counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p2(xd)] [2][ref1 == GOLDEN_FRAME]++; +#endif // CONFIG_ENTROPY_STATS } } } else { -#endif // CONFIG_EXT_COMP_REFS -#if CONFIG_EXT_REFS const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME); - - counts->comp_ref[av1_get_pred_context_comp_ref_p(cm, xd)][0][bit]++; + if (allow_update_cdf) + update_cdf(av1_get_pred_cdf_comp_ref_p(xd), bit, 2); +#if CONFIG_ENTROPY_STATS + counts->comp_ref[av1_get_pred_context_comp_ref_p(xd)][0][bit]++; +#endif // CONFIG_ENTROPY_STATS if (!bit) { - counts->comp_ref[av1_get_pred_context_comp_ref_p1(cm, xd)][1] - [ref0 == LAST_FRAME]++; + if (allow_update_cdf) { + update_cdf(av1_get_pred_cdf_comp_ref_p1(xd), + ref0 == LAST2_FRAME, 2); + } +#if CONFIG_ENTROPY_STATS + counts->comp_ref[av1_get_pred_context_comp_ref_p1(xd)][1] + [ref0 == LAST2_FRAME]++; +#endif // CONFIG_ENTROPY_STATS } else { - counts->comp_ref[av1_get_pred_context_comp_ref_p2(cm, xd)][2] + if (allow_update_cdf) { + update_cdf(av1_get_pred_cdf_comp_ref_p2(xd), + ref0 == GOLDEN_FRAME, 2); + } +#if CONFIG_ENTROPY_STATS + counts->comp_ref[av1_get_pred_context_comp_ref_p2(xd)][2] [ref0 == GOLDEN_FRAME]++; +#endif // CONFIG_ENTROPY_STATS } - - 
counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(cm, xd)][0] + if (allow_update_cdf) { + update_cdf(av1_get_pred_cdf_comp_bwdref_p(xd), + ref1 == ALTREF_FRAME, 2); + } +#if CONFIG_ENTROPY_STATS + counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(xd)][0] [ref1 == ALTREF_FRAME]++; - if (ref1 != ALTREF_FRAME) - counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p1(cm, xd)] - [1][ref1 == ALTREF2_FRAME]++; -#else // !CONFIG_EXT_REFS - counts->comp_ref[av1_get_pred_context_comp_ref_p(cm, xd)][0] - [ref0 == GOLDEN_FRAME]++; -#endif // CONFIG_EXT_REFS -#if CONFIG_EXT_COMP_REFS +#endif // CONFIG_ENTROPY_STATS + if (ref1 != ALTREF_FRAME) { + if (allow_update_cdf) { + update_cdf(av1_get_pred_cdf_comp_bwdref_p1(xd), + ref1 == ALTREF2_FRAME, 2); + } +#if CONFIG_ENTROPY_STATS + counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p1(xd)][1] + [ref1 == ALTREF2_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } } -#endif // CONFIG_EXT_COMP_REFS } else { -#if CONFIG_EXT_REFS const int bit = (ref0 >= BWDREF_FRAME); - + if (allow_update_cdf) + update_cdf(av1_get_pred_cdf_single_ref_p1(xd), bit, 2); +#if CONFIG_ENTROPY_STATS counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0][bit]++; +#endif // CONFIG_ENTROPY_STATS if (bit) { assert(ref0 <= ALTREF_FRAME); + if (allow_update_cdf) { + update_cdf(av1_get_pred_cdf_single_ref_p2(xd), + ref0 == ALTREF_FRAME, 2); + } +#if CONFIG_ENTROPY_STATS counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1] [ref0 == ALTREF_FRAME]++; - if (ref0 != ALTREF_FRAME) +#endif // CONFIG_ENTROPY_STATS + if (ref0 != ALTREF_FRAME) { + if (allow_update_cdf) { + update_cdf(av1_get_pred_cdf_single_ref_p6(xd), + ref0 == ALTREF2_FRAME, 2); + } +#if CONFIG_ENTROPY_STATS counts->single_ref[av1_get_pred_context_single_ref_p6(xd)][5] [ref0 == ALTREF2_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } } else { const int bit1 = !(ref0 == LAST2_FRAME || ref0 == LAST_FRAME); + if (allow_update_cdf) + update_cdf(av1_get_pred_cdf_single_ref_p3(xd), bit1, 2); +#if CONFIG_ENTROPY_STATS counts ->single_ref[av1_get_pred_context_single_ref_p3(xd)][2][bit1]++; +#endif // CONFIG_ENTROPY_STATS if (!bit1) { + if (allow_update_cdf) { + update_cdf(av1_get_pred_cdf_single_ref_p4(xd), + ref0 != LAST_FRAME, 2); + } +#if CONFIG_ENTROPY_STATS counts->single_ref[av1_get_pred_context_single_ref_p4(xd)][3] [ref0 != LAST_FRAME]++; +#endif // CONFIG_ENTROPY_STATS } else { + if (allow_update_cdf) { + update_cdf(av1_get_pred_cdf_single_ref_p5(xd), + ref0 != LAST3_FRAME, 2); + } +#if CONFIG_ENTROPY_STATS counts->single_ref[av1_get_pred_context_single_ref_p5(xd)][4] [ref0 != LAST3_FRAME]++; +#endif // CONFIG_ENTROPY_STATS } } -#else // !CONFIG_EXT_REFS - counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0] - [ref0 != LAST_FRAME]++; - if (ref0 != LAST_FRAME) { - counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1] - [ref0 != GOLDEN_FRAME]++; - } -#endif // CONFIG_EXT_REFS } -#if CONFIG_COMPOUND_SINGLEREF - if (!has_second_ref(mbmi)) - counts->comp_inter_mode[av1_get_inter_mode_context(xd)] - [is_inter_singleref_comp_mode(mbmi->mode)]++; -#endif // CONFIG_COMPOUND_SINGLEREF - -#if CONFIG_INTERINTRA - if (cm->reference_mode != COMPOUND_REFERENCE && -#if CONFIG_SUPERTX - !supertx_enabled && -#endif - cm->allow_interintra_compound && is_interintra_allowed(mbmi)) { + if (cm->seq_params.enable_interintra_compound && + is_interintra_allowed(mbmi)) { const int bsize_group = size_group_lookup[bsize]; if (mbmi->ref_frame[1] == INTRA_FRAME) { +#if CONFIG_ENTROPY_STATS 
counts->interintra[bsize_group][1]++; -#if CONFIG_NEW_MULTISYMBOL - update_cdf(fc->interintra_cdf[bsize_group], 1, 2); #endif + if (allow_update_cdf) + update_cdf(fc->interintra_cdf[bsize_group], 1, 2); +#if CONFIG_ENTROPY_STATS counts->interintra_mode[bsize_group][mbmi->interintra_mode]++; - update_cdf(fc->interintra_mode_cdf[bsize_group], - mbmi->interintra_mode, INTERINTRA_MODES); +#endif + if (allow_update_cdf) { + update_cdf(fc->interintra_mode_cdf[bsize_group], + mbmi->interintra_mode, INTERINTRA_MODES); + } if (is_interintra_wedge_used(bsize)) { +#if CONFIG_ENTROPY_STATS counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++; -#if CONFIG_NEW_MULTISYMBOL - update_cdf(fc->wedge_interintra_cdf[bsize], - mbmi->use_wedge_interintra, 2); #endif + if (allow_update_cdf) { + update_cdf(fc->wedge_interintra_cdf[bsize], + mbmi->use_wedge_interintra, 2); + } + if (mbmi->use_wedge_interintra) { +#if CONFIG_ENTROPY_STATS + counts->wedge_idx[bsize][mbmi->interintra_wedge_index]++; +#endif + if (allow_update_cdf) { + update_cdf(fc->wedge_idx_cdf[bsize], + mbmi->interintra_wedge_index, 16); + } + } } } else { +#if CONFIG_ENTROPY_STATS counts->interintra[bsize_group][0]++; -#if CONFIG_NEW_MULTISYMBOL - update_cdf(fc->interintra_cdf[bsize_group], 0, 2); #endif + if (allow_update_cdf) + update_cdf(fc->interintra_cdf[bsize_group], 0, 2); } } -#endif // CONFIG_INTERINTRA -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION -#if CONFIG_WARPED_MOTION set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + const MOTION_MODE motion_allowed = + cm->switchable_motion_mode + ? motion_mode_allowed(xd->global_motion, xd, mbmi, + cm->allow_warped_motion) + : SIMPLE_TRANSLATION; + if (mbmi->ref_frame[1] != INTRA_FRAME) { + if (motion_allowed == WARPED_CAUSAL) { +#if CONFIG_ENTROPY_STATS + counts->motion_mode[bsize][mbmi->motion_mode]++; #endif - const MOTION_MODE motion_allowed = motion_mode_allowed( -#if CONFIG_GLOBAL_MOTION - 0, xd->global_motion, -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_WARPED_MOTION - xd, -#endif - mi); -#if CONFIG_SUPERTX - if (!supertx_enabled) -#endif // CONFIG_SUPERTX - if (mbmi->ref_frame[1] != INTRA_FRAME) -#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION - { - if (motion_allowed == WARPED_CAUSAL) { - counts->motion_mode[mbmi->sb_type][mbmi->motion_mode]++; - update_cdf(fc->motion_mode_cdf[mbmi->sb_type], mbmi->motion_mode, + if (allow_update_cdf) { + update_cdf(fc->motion_mode_cdf[bsize], mbmi->motion_mode, MOTION_MODES); -#if CONFIG_NCOBMC_ADAPT_WEIGHT - } else if (motion_allowed == NCOBMC_ADAPT_WEIGHT) { - counts->ncobmc[mbmi->sb_type][mbmi->motion_mode]++; - update_cdf(fc->ncobmc_cdf[mbmi->sb_type], mbmi->motion_mode, - OBMC_FAMILY_MODES); - } else if (motion_allowed == OBMC_CAUSAL) { - counts->obmc[mbmi->sb_type][mbmi->motion_mode == OBMC_CAUSAL]++; - update_cdf(fc->obmc_cdf[mbmi->sb_type], mbmi->motion_mode, 2); } -#else - } else if (motion_allowed == OBMC_CAUSAL) { - counts->obmc[mbmi->sb_type][mbmi->motion_mode == OBMC_CAUSAL]++; -#if CONFIG_NEW_MULTISYMBOL - update_cdf(fc->obmc_cdf[mbmi->sb_type], - mbmi->motion_mode == OBMC_CAUSAL, 2); + } else if (motion_allowed == OBMC_CAUSAL) { +#if CONFIG_ENTROPY_STATS + counts->obmc[bsize][mbmi->motion_mode == OBMC_CAUSAL]++; #endif + if (allow_update_cdf) { + update_cdf(fc->obmc_cdf[bsize], mbmi->motion_mode == OBMC_CAUSAL, + 2); } -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT - } -#else - if (motion_allowed > SIMPLE_TRANSLATION) { - counts->motion_mode[mbmi->sb_type][mbmi->motion_mode]++; - update_cdf(fc->motion_mode_cdf[mbmi->sb_type], 
mbmi->motion_mode, - MOTION_MODES); - } -#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION - -#if CONFIG_NCOBMC_ADAPT_WEIGHT - if (mbmi->motion_mode == NCOBMC_ADAPT_WEIGHT) { - ADAPT_OVERLAP_BLOCK ao_block = - adapt_overlap_block_lookup[mbmi->sb_type]; - ++counts->ncobmc_mode[ao_block][mbmi->ncobmc_mode[0]]; - update_cdf(fc->ncobmc_mode_cdf[ao_block], mbmi->ncobmc_mode[0], - MAX_NCOBMC_MODES); - if (mi_size_wide[mbmi->sb_type] != mi_size_high[mbmi->sb_type]) { - ++counts->ncobmc_mode[ao_block][mbmi->ncobmc_mode[1]]; - update_cdf(fc->ncobmc_mode_cdf[ao_block], mbmi->ncobmc_mode[1], - MAX_NCOBMC_MODES); } } + + if (has_second_ref(mbmi)) { + assert(cm->reference_mode != SINGLE_REFERENCE && + is_inter_compound_mode(mbmi->mode) && + mbmi->motion_mode == SIMPLE_TRANSLATION); + + const int masked_compound_used = + is_any_masked_compound_used(bsize) && + cm->seq_params.enable_masked_compound; + if (masked_compound_used) { + const int comp_group_idx_ctx = get_comp_group_idx_context(xd); +#if CONFIG_ENTROPY_STATS + ++counts->comp_group_idx[comp_group_idx_ctx][mbmi->comp_group_idx]; #endif + if (allow_update_cdf) { + update_cdf(fc->comp_group_idx_cdf[comp_group_idx_ctx], + mbmi->comp_group_idx, 2); + } + } -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - - if ( -#if CONFIG_COMPOUND_SINGLEREF - is_inter_anyref_comp_mode(mbmi->mode) -#else // !CONFIG_COMPOUND_SINGLEREF - cm->reference_mode != SINGLE_REFERENCE && - is_inter_compound_mode(mbmi->mode) -#endif // CONFIG_COMPOUND_SINGLEREF -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - && mbmi->motion_mode == SIMPLE_TRANSLATION -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - ) { -#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT -#if CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT - if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) { + if (mbmi->comp_group_idx == 0) { + const int comp_index_ctx = get_comp_index_context(cm, xd); +#if CONFIG_ENTROPY_STATS + ++counts->compound_index[comp_index_ctx][mbmi->compound_idx]; #endif - counts - ->compound_interinter[bsize][mbmi->interinter_compound_type]++; - update_cdf(fc->compound_type_cdf[bsize], - mbmi->interinter_compound_type, COMPOUND_TYPES); -#if CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT + if (allow_update_cdf) { + update_cdf(fc->compound_index_cdf[comp_index_ctx], + mbmi->compound_idx, 2); + } + } else { + assert(masked_compound_used); + if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) { +#if CONFIG_ENTROPY_STATS + ++counts->compound_type[bsize][mbmi->interinter_comp.type - 1]; +#endif + if (allow_update_cdf) { + update_cdf(fc->compound_type_cdf[bsize], + mbmi->interinter_comp.type - 1, COMPOUND_TYPES - 1); + } + } } + } + if (mbmi->interinter_comp.type == COMPOUND_WEDGE) { + if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) { +#if CONFIG_ENTROPY_STATS + counts->wedge_idx[bsize][mbmi->interinter_comp.wedge_index]++; #endif -#endif // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT + if (allow_update_cdf) { + update_cdf(fc->wedge_idx_cdf[bsize], + mbmi->interinter_comp.wedge_index, 16); + } + } } } } @@ -1903,37 +1267,33 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row, !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { int16_t mode_ctx; const PREDICTION_MODE mode = mbmi->mode; + + mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); if (has_second_ref(mbmi)) { - mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]]; +#if CONFIG_ENTROPY_STATS ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)]; - 
update_cdf(fc->inter_compound_mode_cdf[mode_ctx], - INTER_COMPOUND_OFFSET(mode), INTER_COMPOUND_MODES); -#if CONFIG_COMPOUND_SINGLEREF - } else if (is_inter_singleref_comp_mode(mode)) { - mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]]; - ++counts->inter_singleref_comp_mode[mode_ctx] - [INTER_SINGLEREF_COMP_OFFSET(mode)]; -#endif // CONFIG_COMPOUND_SINGLEREF +#endif + if (allow_update_cdf) + update_cdf(fc->inter_compound_mode_cdf[mode_ctx], + INTER_COMPOUND_OFFSET(mode), INTER_COMPOUND_MODES); } else { - mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, - mbmi->ref_frame, bsize, -1); - update_inter_mode_stats(counts, mode, mode_ctx); + update_inter_mode_stats(fc, counts, mode, mode_ctx, allow_update_cdf); } int mode_allowed = (mbmi->mode == NEWMV); mode_allowed |= (mbmi->mode == NEW_NEWMV); -#if CONFIG_COMPOUND_SINGLEREF - mode_allowed |= (mbmi->mode == SR_NEW_NEWMV); -#endif // CONFIG_COMPOUND_SINGLEREF if (mode_allowed) { uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); int idx; for (idx = 0; idx < 2; ++idx) { if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { +#if CONFIG_ENTROPY_STATS uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx); ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx]; +#endif if (mbmi->ref_mv_idx == idx) break; } @@ -1946,47 +1306,35 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row, for (idx = 1; idx < 3; ++idx) { if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { +#if CONFIG_ENTROPY_STATS uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx); ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1]; +#endif if (mbmi->ref_mv_idx == idx - 1) break; } } } } -#if CONFIG_INTRABC - } else { - if (av1_allow_intrabc(bsize, cm)) { - FRAME_COUNTS *const counts = td->counts; - ++counts->intrabc[mbmi->use_intrabc]; - } else { - assert(!mbmi->use_intrabc); - } -#endif } } typedef struct { - ENTROPY_CONTEXT a[2 * MAX_MIB_SIZE * MAX_MB_PLANE]; - ENTROPY_CONTEXT l[2 * MAX_MIB_SIZE * MAX_MB_PLANE]; + ENTROPY_CONTEXT a[MAX_MIB_SIZE * MAX_MB_PLANE]; + ENTROPY_CONTEXT l[MAX_MIB_SIZE * MAX_MB_PLANE]; PARTITION_CONTEXT sa[MAX_MIB_SIZE]; PARTITION_CONTEXT sl[MAX_MIB_SIZE]; -#if CONFIG_VAR_TX TXFM_CONTEXT *p_ta; TXFM_CONTEXT *p_tl; - TXFM_CONTEXT ta[2 * MAX_MIB_SIZE]; - TXFM_CONTEXT tl[2 * MAX_MIB_SIZE]; -#endif + TXFM_CONTEXT ta[MAX_MIB_SIZE]; + TXFM_CONTEXT tl[MAX_MIB_SIZE]; } RD_SEARCH_MACROBLOCK_CONTEXT; static void restore_context(MACROBLOCK *x, const RD_SEARCH_MACROBLOCK_CONTEXT *ctx, int mi_row, - int mi_col, -#if CONFIG_PVQ - od_rollback_buffer *rdo_buf, -#endif - BLOCK_SIZE bsize) { + int mi_col, BLOCK_SIZE bsize, + const int num_planes) { MACROBLOCKD *xd = &x->e_mbd; int p; const int num_4x4_blocks_wide = @@ -1995,11 +1343,9 @@ static void restore_context(MACROBLOCK *x, block_size_high[bsize] >> tx_size_high_log2[0]; int mi_width = mi_size_wide[bsize]; int mi_height = mi_size_high[bsize]; - for (p = 0; p < MAX_MB_PLANE; p++) { - int tx_col; - int tx_row; - tx_col = mi_col << (MI_SIZE_LOG2 - tx_size_wide_log2[0]); - tx_row = (mi_row & MAX_MIB_MASK) << (MI_SIZE_LOG2 - tx_size_high_log2[0]); + for (p = 0; p < num_planes; p++) { + int tx_col = mi_col; + int tx_row = mi_row & MAX_MIB_MASK; memcpy(xd->above_context[p] + (tx_col >> xd->plane[p].subsampling_x), ctx->a + num_4x4_blocks_wide * p, (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> @@ -2013,25 +1359,17 @@ static void restore_context(MACROBLOCK *x, sizeof(*xd->above_seg_context) * mi_width); 
memcpy(xd->left_seg_context + (mi_row & MAX_MIB_MASK), ctx->sl, sizeof(xd->left_seg_context[0]) * mi_height); -#if CONFIG_VAR_TX xd->above_txfm_context = ctx->p_ta; xd->left_txfm_context = ctx->p_tl; memcpy(xd->above_txfm_context, ctx->ta, - sizeof(*xd->above_txfm_context) * (mi_width << TX_UNIT_WIDE_LOG2)); + sizeof(*xd->above_txfm_context) * mi_width); memcpy(xd->left_txfm_context, ctx->tl, - sizeof(*xd->left_txfm_context) * (mi_height << TX_UNIT_HIGH_LOG2)); -#endif -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, rdo_buf); -#endif + sizeof(*xd->left_txfm_context) * mi_height); } static void save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx, - int mi_row, int mi_col, -#if CONFIG_PVQ - od_rollback_buffer *rdo_buf, -#endif - BLOCK_SIZE bsize) { + int mi_row, int mi_col, BLOCK_SIZE bsize, + const int num_planes) { const MACROBLOCKD *xd = &x->e_mbd; int p; const int num_4x4_blocks_wide = @@ -2042,11 +1380,9 @@ static void save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx, int mi_height = mi_size_high[bsize]; // buffer the above/left context information of the block in search. - for (p = 0; p < MAX_MB_PLANE; ++p) { - int tx_col; - int tx_row; - tx_col = mi_col << (MI_SIZE_LOG2 - tx_size_wide_log2[0]); - tx_row = (mi_row & MAX_MIB_MASK) << (MI_SIZE_LOG2 - tx_size_high_log2[0]); + for (p = 0; p < num_planes; ++p) { + int tx_col = mi_col; + int tx_row = mi_row & MAX_MIB_MASK; memcpy(ctx->a + num_4x4_blocks_wide * p, xd->above_context[p] + (tx_col >> xd->plane[p].subsampling_x), (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> @@ -2060,386 +1396,165 @@ static void save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx, sizeof(*xd->above_seg_context) * mi_width); memcpy(ctx->sl, xd->left_seg_context + (mi_row & MAX_MIB_MASK), sizeof(xd->left_seg_context[0]) * mi_height); -#if CONFIG_VAR_TX memcpy(ctx->ta, xd->above_txfm_context, - sizeof(*xd->above_txfm_context) * (mi_width << TX_UNIT_WIDE_LOG2)); + sizeof(*xd->above_txfm_context) * mi_width); memcpy(ctx->tl, xd->left_txfm_context, - sizeof(*xd->left_txfm_context) * (mi_height << TX_UNIT_HIGH_LOG2)); + sizeof(*xd->left_txfm_context) * mi_height); ctx->p_ta = xd->above_txfm_context; ctx->p_tl = xd->left_txfm_context; -#endif -#if CONFIG_PVQ - od_encode_checkpoint(&x->daala_enc, rdo_buf); -#endif } -static void encode_b(const AV1_COMP *const cpi, const TileInfo *const tile, +static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data, ThreadData *td, TOKENEXTRA **tp, int mi_row, int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize, -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_TYPE partition, -#endif - PICK_MODE_CONTEXT *ctx, int *rate) { + PARTITION_TYPE partition, PICK_MODE_CONTEXT *ctx, + int *rate) { + TileInfo *const tile = &tile_data->tile_info; MACROBLOCK *const x = &td->mb; -#if (CONFIG_MOTION_VAR && CONFIG_NCOBMC) | CONFIG_EXT_DELTA_Q | \ - CONFIG_NCOBMC_ADAPT_WEIGHT MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi; -#if CONFIG_MOTION_VAR && CONFIG_NCOBMC - int check_ncobmc; -#endif -#endif set_offsets(cpi, tile, x, mi_row, mi_col, bsize); -#if CONFIG_EXT_PARTITION_TYPES - x->e_mbd.mi[0]->mbmi.partition = partition; -#endif - update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run); -#if CONFIG_MOTION_VAR && (CONFIG_NCOBMC || CONFIG_NCOBMC_ADAPT_WEIGHT) - mbmi = &xd->mi[0]->mbmi; -#if CONFIG_WARPED_MOTION - set_ref_ptrs(&cpi->common, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); -#endif -#endif - -#if CONFIG_MOTION_VAR && (CONFIG_NCOBMC || CONFIG_NCOBMC_ADAPT_WEIGHT) - const 
MOTION_MODE motion_allowed = motion_mode_allowed( -#if CONFIG_GLOBAL_MOTION - 0, xd->global_motion, -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_WARPED_MOTION - xd, -#endif - xd->mi[0]); -#endif // CONFIG_MOTION_VAR && (CONFIG_NCOBMC || CONFIG_NCOBMC_ADAPT_WEIGHT) - -#if CONFIG_MOTION_VAR && CONFIG_NCOBMC - check_ncobmc = is_inter_block(mbmi) && motion_allowed >= OBMC_CAUSAL; - if (!dry_run && check_ncobmc) { - av1_check_ncobmc_rd(cpi, x, mi_row, mi_col); - av1_setup_dst_planes(x->e_mbd.plane, bsize, - get_frame_new_buffer(&cpi->common), mi_row, mi_col); - } -#endif + MB_MODE_INFO *mbmi = xd->mi[0]; + mbmi->partition = partition; + update_state(cpi, tile_data, td, ctx, mi_row, mi_col, bsize, dry_run); -#if CONFIG_LV_MAP - av1_set_coeff_buffer(cpi, x, mi_row, mi_col); -#endif + if (!dry_run) av1_set_coeff_buffer(cpi, x, mi_row, mi_col); -#if CONFIG_NCOBMC_ADAPT_WEIGHT - if (dry_run == OUTPUT_ENABLED && !frame_is_intra_only(&cpi->common)) { - if (motion_allowed >= NCOBMC_ADAPT_WEIGHT && is_inter_block(mbmi)) { - get_ncobmc_intrpl_pred(cpi, td, mi_row, mi_col, bsize); - av1_check_ncobmc_adapt_weight_rd(cpi, x, mi_row, mi_col); - } - av1_setup_dst_planes(x->e_mbd.plane, bsize, - get_frame_new_buffer(&cpi->common), mi_row, mi_col); - } -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT - - encode_superblock(cpi, td, tp, dry_run, mi_row, mi_col, bsize, rate); + encode_superblock(cpi, tile_data, td, tp, dry_run, mi_row, mi_col, bsize, + rate); -#if CONFIG_LV_MAP if (dry_run == 0) x->cb_offset += block_size_wide[bsize] * block_size_high[bsize]; -#endif if (!dry_run) { -#if CONFIG_EXT_DELTA_Q - mbmi = &xd->mi[0]->mbmi; - if (bsize == cpi->common.sb_size && mbmi->skip == 1 && + if (bsize == cpi->common.seq_params.sb_size && mbmi->skip == 1 && cpi->common.delta_lf_present_flag) { -#if CONFIG_LOOPFILTER_LEVEL - for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) - mbmi->curr_delta_lf[lf_id] = xd->prev_delta_lf[lf_id]; -#endif // CONFIG_LOOPFILTER_LEVEL - mbmi->current_delta_lf_from_base = xd->prev_delta_lf_from_base; + const int frame_lf_count = av1_num_planes(&cpi->common) > 1 + ? FRAME_LF_COUNT + : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) + mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id]; + mbmi->delta_lf_from_base = xd->delta_lf_from_base; } -#endif -#if CONFIG_SUPERTX - update_stats(&cpi->common, td, mi_row, mi_col, 0); -#else - update_stats(&cpi->common, td, mi_row, mi_col); -#endif + if (has_second_ref(mbmi)) { + if (mbmi->compound_idx == 0 || + mbmi->interinter_comp.type == COMPOUND_AVERAGE) + mbmi->comp_group_idx = 0; + else + mbmi->comp_group_idx = 1; + } + update_stats(&cpi->common, tile_data, td, mi_row, mi_col); } } static void encode_sb(const AV1_COMP *const cpi, ThreadData *td, - const TileInfo *const tile, TOKENEXTRA **tp, int mi_row, + TileDataEnc *tile_data, TOKENEXTRA **tp, int mi_row, int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize, PC_TREE *pc_tree, int *rate) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; const int hbs = mi_size_wide[bsize] / 2; -#if CONFIG_EXT_PARTITION_TYPES && CONFIG_EXT_PARTITION_TYPES_AB - const int qbs = mi_size_wide[bsize] / 4; -#endif const int is_partition_root = bsize >= BLOCK_8X8; const int ctx = is_partition_root - ? partition_plane_context(xd, mi_row, mi_col, -#if CONFIG_UNPOISON_PARTITION_CTX - mi_row + hbs < cm->mi_rows, - mi_col + hbs < cm->mi_cols, -#endif - bsize) + ? 
partition_plane_context(xd, mi_row, mi_col, bsize) : -1; const PARTITION_TYPE partition = pc_tree->partitioning; - const BLOCK_SIZE subsize = get_subsize(bsize, partition); -#if CONFIG_EXT_PARTITION_TYPES + const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); int quarter_step = mi_size_wide[bsize] / 4; int i; -#if !CONFIG_EXT_PARTITION_TYPES_AB - BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT); -#endif -#endif - -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; - assert(bsize >= BLOCK_8X8); -#endif + BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT); if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - if (!dry_run && ctx >= 0) td->counts->partition[ctx][partition]++; - -#if CONFIG_SUPERTX - if (!frame_is_intra_only(cm) && bsize <= MAX_SUPERTX_BLOCK_SIZE && - partition != PARTITION_NONE && !xd->lossless[0]) { - int supertx_enabled; - TX_SIZE supertx_size = max_txsize_lookup[bsize]; - supertx_enabled = check_supertx_sb(bsize, supertx_size, pc_tree); - if (supertx_enabled) { - const int mi_width = mi_size_wide[bsize]; - const int mi_height = mi_size_high[bsize]; - int x_idx, y_idx, i; - uint8_t *dst_buf[3]; - int dst_stride[3]; - set_skip_context(xd, mi_row, mi_col); - set_mode_info_offsets(cpi, x, xd, mi_row, mi_col); - update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, dry_run, - pc_tree); - - av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row, - mi_col); - for (i = 0; i < MAX_MB_PLANE; i++) { - dst_buf[i] = xd->plane[i].dst.buf; - dst_stride[i] = xd->plane[i].dst.stride; - } - predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, dry_run, - bsize, bsize, dst_buf, dst_stride, pc_tree); - - set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); - set_segment_id_supertx(cpi, x, mi_row, mi_col, bsize); - - if (!x->skip) { - int this_rate = 0; - av1_encode_sb_supertx((AV1_COMMON *)cm, x, bsize); - av1_tokenize_sb_supertx(cpi, td, tp, dry_run, mi_row, mi_col, bsize, - rate); - if (rate) *rate += this_rate; - } else { - xd->mi[0]->mbmi.skip = 1; - if (!dry_run) td->counts->skip[av1_get_skip_context(xd)][1]++; - av1_reset_skip_context(xd, mi_row, mi_col, bsize); - } - if (!dry_run) { - for (y_idx = 0; y_idx < mi_height; y_idx++) - for (x_idx = 0; x_idx < mi_width; x_idx++) { - if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > - x_idx && - (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > - y_idx) { - xd->mi[x_idx + y_idx * cm->mi_stride]->mbmi.skip = - xd->mi[0]->mbmi.skip; - } - } - td->counts->supertx[partition_supertx_context_lookup[partition]] - [supertx_size][1]++; - td->counts->supertx_size[supertx_size]++; + if (!dry_run && ctx >= 0) { + const int has_rows = (mi_row + hbs) < cm->mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_cols; + + if (has_rows && has_cols) { #if CONFIG_ENTROPY_STATS -#if CONFIG_EXT_TX - if (get_ext_tx_types(supertx_size, bsize, 1, cm->reduced_tx_set_used) > - 1 && - !xd->mi[0]->mbmi.skip) { - const int eset = - get_ext_tx_set(supertx_size, bsize, 1, cm->reduced_tx_set_used); - if (eset > 0) { - ++td->counts - ->inter_ext_tx[eset][supertx_size][xd->mi[0]->mbmi.tx_type]; - } - } -#else - if (supertx_size < TX_32X32 && !xd->mi[0]->mbmi.skip) { - ++td->counts->inter_ext_tx[supertx_size][xd->mi[0]->mbmi.tx_type]; - } -#endif // CONFIG_EXT_TX -#endif // CONFIG_ENTROPY_STATS - } -#if CONFIG_EXT_PARTITION_TYPES - update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, - partition); -#else - if 
(partition != PARTITION_SPLIT || bsize == BLOCK_8X8) - update_partition_context(xd, mi_row, mi_col, subsize, bsize); + td->counts->partition[ctx][partition]++; #endif -#if CONFIG_VAR_TX - set_txfm_ctxs(supertx_size, mi_width, mi_height, xd->mi[0]->mbmi.skip, - xd); -#endif // CONFIG_VAR_TX - return; - } else { - if (!dry_run) { - td->counts->supertx[partition_supertx_context_lookup[partition]] - [supertx_size][0]++; + + if (tile_data->allow_update_cdf) { + FRAME_CONTEXT *fc = xd->tile_ctx; + update_cdf(fc->partition_cdf[ctx], partition, + partition_cdf_length(bsize)); } } } -#endif // CONFIG_SUPERTX switch (partition) { case PARTITION_NONE: - encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, -#if CONFIG_EXT_PARTITION_TYPES - partition, -#endif - &pc_tree->none, rate); + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, &pc_tree->none, rate); break; case PARTITION_VERT: - encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, -#if CONFIG_EXT_PARTITION_TYPES - partition, -#endif - &pc_tree->vertical[0], rate); - if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize)) { - encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, subsize, -#if CONFIG_EXT_PARTITION_TYPES - partition, -#endif - &pc_tree->vertical[1], rate); + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, &pc_tree->vertical[0], rate); + if (mi_col + hbs < cm->mi_cols) { + encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize, + partition, &pc_tree->vertical[1], rate); } break; case PARTITION_HORZ: - encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, -#if CONFIG_EXT_PARTITION_TYPES - partition, -#endif - &pc_tree->horizontal[0], rate); - if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize)) { - encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, subsize, -#if CONFIG_EXT_PARTITION_TYPES - partition, -#endif - &pc_tree->horizontal[1], rate); + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, &pc_tree->horizontal[0], rate); + if (mi_row + hbs < cm->mi_rows) { + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize, + partition, &pc_tree->horizontal[1], rate); } break; case PARTITION_SPLIT: - if (bsize == BLOCK_8X8 && !unify_bsize) { - encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, -#if CONFIG_EXT_PARTITION_TYPES - partition, -#endif - pc_tree->leaf_split[0], rate); - } else { - encode_sb(cpi, td, tile, tp, mi_row, mi_col, dry_run, subsize, - pc_tree->split[0], rate); - encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, dry_run, subsize, - pc_tree->split[1], rate); - encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, dry_run, subsize, - pc_tree->split[2], rate); - encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, dry_run, - subsize, pc_tree->split[3], rate); - } - break; - -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION_TYPES_AB - case PARTITION_HORZ_A: - encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, - get_subsize(bsize, PARTITION_HORZ_4), partition, - &pc_tree->horizontala[0], rate); - encode_b(cpi, tile, td, tp, mi_row + qbs, mi_col, dry_run, - get_subsize(bsize, PARTITION_HORZ_4), partition, - &pc_tree->horizontala[1], rate); - encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, subsize, - partition, &pc_tree->horizontala[2], rate); - break; - case PARTITION_HORZ_B: - encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition, - &pc_tree->horizontalb[0], rate); - encode_b(cpi, 
tile, td, tp, mi_row + hbs, mi_col, dry_run, - get_subsize(bsize, PARTITION_HORZ_4), partition, - &pc_tree->horizontalb[1], rate); - if (mi_row + 3 * qbs < cm->mi_rows) - encode_b(cpi, tile, td, tp, mi_row + 3 * qbs, mi_col, dry_run, - get_subsize(bsize, PARTITION_HORZ_4), partition, - &pc_tree->horizontalb[2], rate); + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, dry_run, subsize, + pc_tree->split[0], rate); + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col + hbs, dry_run, subsize, + pc_tree->split[1], rate); + encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col, dry_run, subsize, + pc_tree->split[2], rate); + encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col + hbs, dry_run, + subsize, pc_tree->split[3], rate); break; - case PARTITION_VERT_A: - encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, - get_subsize(bsize, PARTITION_VERT_4), partition, - &pc_tree->verticala[0], rate); - encode_b(cpi, tile, td, tp, mi_row, mi_col + qbs, dry_run, - get_subsize(bsize, PARTITION_VERT_4), partition, - &pc_tree->verticala[1], rate); - encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, subsize, - partition, &pc_tree->verticala[2], rate); - break; - case PARTITION_VERT_B: - encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition, - &pc_tree->verticalb[0], rate); - encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, - get_subsize(bsize, PARTITION_VERT_4), partition, - &pc_tree->verticalb[1], rate); - if (mi_col + 3 * qbs < cm->mi_cols) - encode_b(cpi, tile, td, tp, mi_row, mi_col + 3 * qbs, dry_run, - get_subsize(bsize, PARTITION_VERT_4), partition, - &pc_tree->verticalb[2], rate); - break; -#else case PARTITION_HORZ_A: - encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, bsize2, partition, - &pc_tree->horizontala[0], rate); - encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, bsize2, + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2, + partition, &pc_tree->horizontala[0], rate); + encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2, partition, &pc_tree->horizontala[1], rate); - encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, subsize, + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize, partition, &pc_tree->horizontala[2], rate); break; case PARTITION_HORZ_B: - encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition, - &pc_tree->horizontalb[0], rate); - encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, bsize2, + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, &pc_tree->horizontalb[0], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2, partition, &pc_tree->horizontalb[1], rate); - encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2, - partition, &pc_tree->horizontalb[2], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run, + bsize2, partition, &pc_tree->horizontalb[2], rate); break; case PARTITION_VERT_A: - encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, bsize2, partition, - &pc_tree->verticala[0], rate); - encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, bsize2, + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2, + partition, &pc_tree->verticala[0], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2, partition, &pc_tree->verticala[1], rate); - encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, subsize, + encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize, 
partition, &pc_tree->verticala[2], rate); break; case PARTITION_VERT_B: - encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition, - &pc_tree->verticalb[0], rate); - encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, bsize2, + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, &pc_tree->verticalb[0], rate); + encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2, partition, &pc_tree->verticalb[1], rate); - encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2, - partition, &pc_tree->verticalb[2], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run, + bsize2, partition, &pc_tree->verticalb[2], rate); break; -#endif case PARTITION_HORZ_4: for (i = 0; i < 4; ++i) { int this_mi_row = mi_row + i * quarter_step; if (i > 0 && this_mi_row >= cm->mi_rows) break; - encode_b(cpi, tile, td, tp, this_mi_row, mi_col, dry_run, subsize, + encode_b(cpi, tile_data, td, tp, this_mi_row, mi_col, dry_run, subsize, partition, &pc_tree->horizontal4[i], rate); } break; @@ -2448,20 +1563,14 @@ static void encode_sb(const AV1_COMP *const cpi, ThreadData *td, int this_mi_col = mi_col + i * quarter_step; if (i > 0 && this_mi_col >= cm->mi_cols) break; - encode_b(cpi, tile, td, tp, mi_row, this_mi_col, dry_run, subsize, + encode_b(cpi, tile_data, td, tp, mi_row, this_mi_col, dry_run, subsize, partition, &pc_tree->vertical4[i], rate); } break; -#endif // CONFIG_EXT_PARTITION_TYPES default: assert(0 && "Invalid partition type."); break; } -#if CONFIG_EXT_PARTITION_TYPES update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); -#else - if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) - update_partition_context(xd, mi_row, mi_col, subsize, bsize); -#endif // CONFIG_EXT_PARTITION_TYPES } // Check to see if the given partition size is allowed for a specified number @@ -2483,19 +1592,19 @@ static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left, return bsize; } -static void set_partial_sb_partition(const AV1_COMMON *const cm, MODE_INFO *mi, - int bh_in, int bw_in, +static void set_partial_sb_partition(const AV1_COMMON *const cm, + MB_MODE_INFO *mi, int bh_in, int bw_in, int mi_rows_remaining, int mi_cols_remaining, BLOCK_SIZE bsize, - MODE_INFO **mib) { + MB_MODE_INFO **mib) { int bh = bh_in; int r, c; - for (r = 0; r < cm->mib_size; r += bh) { + for (r = 0; r < cm->seq_params.mib_size; r += bh) { int bw = bw_in; - for (c = 0; c < cm->mib_size; c += bw) { + for (c = 0; c < cm->seq_params.mib_size; c += bw) { const int index = r * cm->mi_stride + c; mib[index] = mi + index; - mib[index]->mbmi.sb_type = find_partition_size( + mib[index]->sb_type = find_partition_size( bsize, mi_rows_remaining - r, mi_cols_remaining - c, &bh, &bw); } } @@ -2507,26 +1616,27 @@ static void set_partial_sb_partition(const AV1_COMMON *const cm, MODE_INFO *mi, // may not be allowed in which case this code attempts to choose the largest // allowable partition. 
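A minimal sketch of the border fallback that the comment above and find_partition_size implement, assuming only that mi_size_high[]/mi_size_wide[] give block dimensions in mi units and that BLOCK_4X4 is the smallest BLOCK_SIZE value (both suggested by the surrounding code); the helper name is hypothetical and the real function may step through the size list differently.

static BLOCK_SIZE fit_partition_size_sketch(BLOCK_SIZE bsize, int rows_left,
                                            int cols_left, int *bh, int *bw) {
  int size = (int)bsize;
  if (rows_left <= 0 || cols_left <= 0) return AOMMIN(bsize, BLOCK_8X8);
  // Walk down the size list until the candidate fits the remaining frame
  // area, e.g. a 64x64 request (16x16 mi) with only 8 mi rows left keeps
  // shrinking until mi_size_high[size] <= 8.
  while (size > BLOCK_4X4 &&
         (mi_size_high[size] > rows_left || mi_size_wide[size] > cols_left))
    --size;
  *bh = mi_size_high[size];
  *bw = mi_size_wide[size];
  return (BLOCK_SIZE)size;
}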
static void set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile, - MODE_INFO **mib, int mi_row, int mi_col, + MB_MODE_INFO **mib, int mi_row, int mi_col, BLOCK_SIZE bsize) { AV1_COMMON *const cm = &cpi->common; const int mi_rows_remaining = tile->mi_row_end - mi_row; const int mi_cols_remaining = tile->mi_col_end - mi_col; int block_row, block_col; - MODE_INFO *const mi_upper_left = cm->mi + mi_row * cm->mi_stride + mi_col; + MB_MODE_INFO *const mi_upper_left = cm->mi + mi_row * cm->mi_stride + mi_col; int bh = mi_size_high[bsize]; int bw = mi_size_wide[bsize]; assert((mi_rows_remaining > 0) && (mi_cols_remaining > 0)); // Apply the requested partition size to the SB if it is all "in image" - if ((mi_cols_remaining >= cm->mib_size) && - (mi_rows_remaining >= cm->mib_size)) { - for (block_row = 0; block_row < cm->mib_size; block_row += bh) { - for (block_col = 0; block_col < cm->mib_size; block_col += bw) { + if ((mi_cols_remaining >= cm->seq_params.mib_size) && + (mi_rows_remaining >= cm->seq_params.mib_size)) { + for (block_row = 0; block_row < cm->seq_params.mib_size; block_row += bh) { + for (block_col = 0; block_col < cm->seq_params.mib_size; + block_col += bw) { int index = block_row * cm->mi_stride + block_col; mib[index] = mi_upper_left + index; - mib[index]->mbmi.sb_type = bsize; + mib[index]->sb_type = bsize; } } } else { @@ -2537,14 +1647,12 @@ static void set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile, } static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, - TileDataEnc *tile_data, MODE_INFO **mib, + TileDataEnc *tile_data, MB_MODE_INFO **mib, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate, int64_t *dist, -#if CONFIG_SUPERTX - int *rate_nocoef, -#endif int do_recon, PC_TREE *pc_tree) { AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -2552,37 +1660,23 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, const int hbs = bs / 2; int i; const int pl = (bsize >= BLOCK_8X8) - ? partition_plane_context(xd, mi_row, mi_col, -#if CONFIG_UNPOISON_PARTITION_CTX - mi_row + hbs < cm->mi_rows, - mi_col + hbs < cm->mi_cols, -#endif - bsize) + ? partition_plane_context(xd, mi_row, mi_col, bsize) : 0; const PARTITION_TYPE partition = (bsize >= BLOCK_8X8) ? 
get_partition(cm, mi_row, mi_col, bsize) : PARTITION_NONE; - const BLOCK_SIZE subsize = get_subsize(bsize, partition); + const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; RD_STATS last_part_rdc, none_rdc, chosen_rdc; BLOCK_SIZE sub_subsize = BLOCK_4X4; int splits_below = 0; - BLOCK_SIZE bs_type = mib[0]->mbmi.sb_type; + BLOCK_SIZE bs_type = mib[0]->sb_type; int do_partition_search = 1; PICK_MODE_CONTEXT *ctx_none = &pc_tree->none; - const int unify_bsize = CONFIG_CB4X4; -#if CONFIG_SUPERTX - int last_part_rate_nocoef = INT_MAX; - int none_rate_nocoef = INT_MAX; - int chosen_rate_nocoef = INT_MAX; -#endif -#if CONFIG_PVQ - od_rollback_buffer pre_rdo_buf; -#endif + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - assert(num_4x4_blocks_wide_lookup[bsize] == - num_4x4_blocks_high_lookup[bsize]); + assert(mi_size_wide[bsize] == mi_size_high[bsize]); av1_invalid_rd_stats(&last_part_rdc); av1_invalid_rd_stats(&none_rdc); @@ -2590,17 +1684,10 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, pc_tree->partitioning = partition; -#if CONFIG_VAR_TX - xd->above_txfm_context = - cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2); - xd->left_txfm_context = xd->left_txfm_context_buffer + - ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2); -#endif -#if !CONFIG_PVQ - save_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - save_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif + xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); if (bsize == BLOCK_16X16 && cpi->vaq_refresh) { set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); @@ -2612,12 +1699,12 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, cpi->sf.adjust_partitioning_from_last_frame) { // Check if any of the sub blocks are further split. 
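/* splits_below only stays 1 when every quadrant of this SPLIT block was
 * itself split in the stored partitioning; the guard below then skips the
 * PARTITION_NONE re-check for the whole block, on the reading that a region
 * which previously wanted four deeper splits is unlikely to collapse into a
 * single prediction unit. */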
if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) { - sub_subsize = get_subsize(subsize, PARTITION_SPLIT); + sub_subsize = get_partition_subsize(subsize, PARTITION_SPLIT); splits_below = 1; for (i = 0; i < 4; i++) { int jj = i >> 1, ii = i & 0x01; - MODE_INFO *this_mi = mib[jj * hbs * cm->mi_stride + ii * hbs]; - if (this_mi && this_mi->mbmi.sb_type >= sub_subsize) { + MB_MODE_INFO *this_mi = mib[jj * hbs * cm->mi_stride + ii * hbs]; + if (this_mi && this_mi->sb_type >= sub_subsize) { splits_below = 0; } } @@ -2629,28 +1716,15 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) { pc_tree->partitioning = PARTITION_NONE; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, -#if CONFIG_SUPERTX - &none_rate_nocoef, -#endif -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_NONE, -#endif - bsize, ctx_none, INT64_MAX); + PARTITION_NONE, bsize, ctx_none, INT64_MAX); if (none_rdc.rate < INT_MAX) { none_rdc.rate += x->partition_cost[pl][PARTITION_NONE]; none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist); -#if CONFIG_SUPERTX - none_rate_nocoef += x->partition_cost[pl][PARTITION_NONE]; -#endif } -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif - mib[0]->mbmi.sb_type = bs_type; + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + mib[0]->sb_type = bs_type; pc_tree->partitioning = partition; } } @@ -2658,127 +1732,65 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, switch (partition) { case PARTITION_NONE: rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, -#if CONFIG_SUPERTX - &last_part_rate_nocoef, -#endif -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_NONE, -#endif - bsize, ctx_none, INT64_MAX); + PARTITION_NONE, bsize, ctx_none, INT64_MAX); break; case PARTITION_HORZ: rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, -#if CONFIG_SUPERTX - &last_part_rate_nocoef, -#endif -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_HORZ, -#endif - subsize, &pc_tree->horizontal[0], INT64_MAX); + PARTITION_HORZ, subsize, &pc_tree->horizontal[0], + INT64_MAX); if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_row + hbs < cm->mi_rows) { RD_STATS tmp_rdc; -#if CONFIG_SUPERTX - int rt_nocoef = 0; -#endif PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0]; av1_init_rd_stats(&tmp_rdc); - update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1); - encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize, - NULL); + update_state(cpi, tile_data, td, ctx_h, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, + mi_col, subsize, NULL); rd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc, -#if CONFIG_SUPERTX - &rt_nocoef, -#endif -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_HORZ, -#endif - subsize, &pc_tree->horizontal[1], INT64_MAX); + PARTITION_HORZ, subsize, &pc_tree->horizontal[1], + INT64_MAX); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { av1_invalid_rd_stats(&last_part_rdc); -#if CONFIG_SUPERTX - last_part_rate_nocoef = INT_MAX; -#endif break; } last_part_rdc.rate += tmp_rdc.rate; last_part_rdc.dist += tmp_rdc.dist; last_part_rdc.rdcost += tmp_rdc.rdcost; -#if CONFIG_SUPERTX - last_part_rate_nocoef += rt_nocoef; -#endif } break; case PARTITION_VERT: rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, -#if CONFIG_SUPERTX - &last_part_rate_nocoef, -#endif -#if 
CONFIG_EXT_PARTITION_TYPES - PARTITION_VERT, -#endif - subsize, &pc_tree->vertical[0], INT64_MAX); + PARTITION_VERT, subsize, &pc_tree->vertical[0], + INT64_MAX); if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_col + hbs < cm->mi_cols) { RD_STATS tmp_rdc; -#if CONFIG_SUPERTX - int rt_nocoef = 0; -#endif PICK_MODE_CONTEXT *ctx_v = &pc_tree->vertical[0]; av1_init_rd_stats(&tmp_rdc); - update_state(cpi, td, ctx_v, mi_row, mi_col, subsize, 1); - encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize, - NULL); + update_state(cpi, tile_data, td, ctx_v, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, + mi_col, subsize, NULL); rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc, -#if CONFIG_SUPERTX - &rt_nocoef, -#endif -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_VERT, -#endif - subsize, &pc_tree->vertical[bsize > BLOCK_8X8], - INT64_MAX); + PARTITION_VERT, subsize, + &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { av1_invalid_rd_stats(&last_part_rdc); -#if CONFIG_SUPERTX - last_part_rate_nocoef = INT_MAX; -#endif break; } last_part_rdc.rate += tmp_rdc.rate; last_part_rdc.dist += tmp_rdc.dist; last_part_rdc.rdcost += tmp_rdc.rdcost; -#if CONFIG_SUPERTX - last_part_rate_nocoef += rt_nocoef; -#endif } break; case PARTITION_SPLIT: - if (bsize == BLOCK_8X8 && !unify_bsize) { - rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, -#if CONFIG_SUPERTX - &last_part_rate_nocoef, -#endif -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_SPLIT, -#endif - subsize, pc_tree->leaf_split[0], INT64_MAX); - break; - } last_part_rdc.rate = 0; last_part_rdc.dist = 0; last_part_rdc.rdcost = 0; -#if CONFIG_SUPERTX - last_part_rate_nocoef = 0; -#endif for (i = 0; i < 4; i++) { int x_idx = (i & 1) * hbs; int y_idx = (i >> 1) * hbs; int jj = i >> 1, ii = i & 0x01; RD_STATS tmp_rdc; -#if CONFIG_SUPERTX - int rt_nocoef; -#endif if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) continue; @@ -2786,33 +1798,21 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, rd_use_partition(cpi, td, tile_data, mib + jj * hbs * cm->mi_stride + ii * hbs, tp, mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate, - &tmp_rdc.dist, -#if CONFIG_SUPERTX - &rt_nocoef, -#endif - i != 3, pc_tree->split[i]); + &tmp_rdc.dist, i != 3, pc_tree->split[i]); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { av1_invalid_rd_stats(&last_part_rdc); -#if CONFIG_SUPERTX - last_part_rate_nocoef = INT_MAX; -#endif break; } last_part_rdc.rate += tmp_rdc.rate; last_part_rdc.dist += tmp_rdc.dist; -#if CONFIG_SUPERTX - last_part_rate_nocoef += rt_nocoef; -#endif } break; -#if CONFIG_EXT_PARTITION_TYPES case PARTITION_VERT_A: case PARTITION_VERT_B: case PARTITION_HORZ_A: case PARTITION_HORZ_B: case PARTITION_HORZ_4: case PARTITION_VERT_4: assert(0 && "Cannot handle extended partition types"); -#endif // CONFIG_EXT_PARTITION_TYPES default: assert(0); break; } @@ -2820,9 +1820,6 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, last_part_rdc.rate += x->partition_cost[pl][partition]; last_part_rdc.rdcost = RDCOST(x->rdmult, last_part_rdc.rate, last_part_rdc.dist); -#if CONFIG_SUPERTX - last_part_rate_nocoef += x->partition_cost[pl][partition]; -#endif } if (do_partition_search && cpi->sf.adjust_partitioning_from_last_frame && @@ -2830,17 +1827,11 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, partition != PARTITION_SPLIT && bsize > 
BLOCK_8X8 && (mi_row + bs < cm->mi_rows || mi_row + hbs == cm->mi_rows) && (mi_col + bs < cm->mi_cols || mi_col + hbs == cm->mi_cols)) { - BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT); + BLOCK_SIZE split_subsize = get_partition_subsize(bsize, PARTITION_SPLIT); chosen_rdc.rate = 0; chosen_rdc.dist = 0; -#if CONFIG_SUPERTX - chosen_rate_nocoef = 0; -#endif -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif + + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); pc_tree->partitioning = PARTITION_SPLIT; // Split partition. @@ -2848,175 +1839,108 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, int x_idx = (i & 1) * hbs; int y_idx = (i >> 1) * hbs; RD_STATS tmp_rdc; -#if CONFIG_SUPERTX - int rt_nocoef = 0; -#endif -#if CONFIG_PVQ - od_rollback_buffer buf; -#endif + if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) continue; -#if !CONFIG_PVQ - save_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - save_context(x, &x_ctx, mi_row, mi_col, &buf, bsize); -#endif + save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); pc_tree->split[i]->partitioning = PARTITION_NONE; rd_pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, - &tmp_rdc, -#if CONFIG_SUPERTX - &rt_nocoef, -#endif -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_SPLIT, -#endif - split_subsize, &pc_tree->split[i]->none, INT64_MAX); + &tmp_rdc, PARTITION_SPLIT, split_subsize, + &pc_tree->split[i]->none, INT64_MAX); -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &buf, bsize); -#endif + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { av1_invalid_rd_stats(&chosen_rdc); -#if CONFIG_SUPERTX - chosen_rate_nocoef = INT_MAX; -#endif break; } chosen_rdc.rate += tmp_rdc.rate; chosen_rdc.dist += tmp_rdc.dist; -#if CONFIG_SUPERTX - chosen_rate_nocoef += rt_nocoef; -#endif if (i != 3) - encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx, + encode_sb(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, OUTPUT_ENABLED, split_subsize, pc_tree->split[i], NULL); chosen_rdc.rate += x->partition_cost[pl][PARTITION_NONE]; -#if CONFIG_SUPERTX - chosen_rate_nocoef += x->partition_cost[pl][PARTITION_SPLIT]; -#endif } if (chosen_rdc.rate < INT_MAX) { chosen_rdc.rate += x->partition_cost[pl][PARTITION_SPLIT]; chosen_rdc.rdcost = RDCOST(x->rdmult, chosen_rdc.rate, chosen_rdc.dist); -#if CONFIG_SUPERTX - chosen_rate_nocoef += x->partition_cost[pl][PARTITION_NONE]; -#endif } } // If last_part is better set the partitioning to that. if (last_part_rdc.rdcost < chosen_rdc.rdcost) { - mib[0]->mbmi.sb_type = bsize; + mib[0]->sb_type = bsize; if (bsize >= BLOCK_8X8) pc_tree->partitioning = partition; chosen_rdc = last_part_rdc; -#if CONFIG_SUPERTX - chosen_rate_nocoef = last_part_rate_nocoef; -#endif } // If none was better set the partitioning to that. 
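/* By this point three candidates have been scored: the stored partitioning
 * (last_part_rdc), the freshly searched four-way split (chosen_rdc) and the
 * single PARTITION_NONE trial (none_rdc). Each comparison keeps whichever
 * has the lower RDCOST, i.e. rate weighted by x->rdmult traded against
 * distortion. */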
if (none_rdc.rdcost < chosen_rdc.rdcost) { if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE; chosen_rdc = none_rdc; -#if CONFIG_SUPERTX - chosen_rate_nocoef = none_rate_nocoef; -#endif } -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); // We must have chosen a partitioning and encoding or we'll fail later on. // No other opportunities for success. - if (bsize == cm->sb_size) + if (bsize == cm->seq_params.sb_size) assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX); if (do_recon) { - if (bsize == cm->sb_size) { + if (bsize == cm->seq_params.sb_size) { // NOTE: To get estimate for rate due to the tokens, use: // int rate_coeffs = 0; - // encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS, + // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS, // bsize, pc_tree, &rate_coeffs); - encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize, + x->cb_offset = 0; + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize, pc_tree, NULL); } else { - encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, pc_tree, NULL); } } *rate = chosen_rdc.rate; *dist = chosen_rdc.dist; -#if CONFIG_SUPERTX - *rate_nocoef = chosen_rate_nocoef; -#endif } /* clang-format off */ static const BLOCK_SIZE min_partition_size[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - BLOCK_2X2, BLOCK_2X2, BLOCK_2X2, // 2x2, 2x4, 4x2 -#endif BLOCK_4X4, // 4x4 BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, // 4x8, 8x4, 8x8 BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 8x16, 16x8, 16x16 BLOCK_8X8, BLOCK_8X8, BLOCK_16X16, // 16x32, 32x16, 32x32 BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 32x64, 64x32, 64x64 -#if CONFIG_EXT_PARTITION BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 64x128, 128x64, 128x128 -#endif // CONFIG_EXT_PARTITION BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 4x16, 16x4, 8x32 BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, // 32x8, 16x64, 64x16 -#if CONFIG_EXT_PARTITION - BLOCK_16X16, BLOCK_16X16 // 32x128, 128x32 -#endif // CONFIG_EXT_PARTITION }; static const BLOCK_SIZE max_partition_size[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, // 2x2, 2x4, 4x2 -#endif BLOCK_8X8, // 4x4 BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 4x8, 8x4, 8x8 BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, // 8x16, 16x8, 16x16 BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, // 16x32, 32x16, 32x32 BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST, // 32x64, 64x32, 64x64 -#if CONFIG_EXT_PARTITION BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST, // 64x128, 128x64, 128x128 -#endif // CONFIG_EXT_PARTITION BLOCK_16X16, BLOCK_16X16, BLOCK_32X32, // 4x16, 16x4, 8x32 BLOCK_32X32, BLOCK_LARGEST, BLOCK_LARGEST, // 32x8, 16x64, 64x16 -#if CONFIG_EXT_PARTITION - BLOCK_LARGEST, BLOCK_LARGEST // 32x128, 128x32 -#endif // CONFIG_EXT_PARTITION }; // Next square block size less than or equal to the current block size. 
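/* Reading the table below: each rectangular size maps to the largest square
 * that fits inside it (32x64 and 64x32 both give BLOCK_32X32, 16x64 gives
 * BLOCK_16X16), which is how the auto-partition range logic keeps its
 * min/max bounds square. */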
static const BLOCK_SIZE next_square_size[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - BLOCK_2X2, BLOCK_2X2, BLOCK_2X2, // 2x2, 2x4, 4x2 -#endif BLOCK_4X4, // 4x4 BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 4x8, 8x4, 8x8 BLOCK_8X8, BLOCK_8X8, BLOCK_16X16, // 8x16, 16x8, 16x16 BLOCK_16X16, BLOCK_16X16, BLOCK_32X32, // 16x32, 32x16, 32x32 BLOCK_32X32, BLOCK_32X32, BLOCK_64X64, // 32x64, 64x32, 64x64 -#if CONFIG_EXT_PARTITION BLOCK_64X64, BLOCK_64X64, BLOCK_128X128, // 64x128, 128x64, 128x128 -#endif // CONFIG_EXT_PARTITION BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 4x16, 16x4, 8x32 BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, // 32x8, 16x64, 64x16 -#if CONFIG_EXT_PARTITION - BLOCK_32X32, BLOCK_32X32 // 32x128, 128x32 -#endif // CONFIG_EXT_PARTITION }; /* clang-format on */ @@ -3029,17 +1953,17 @@ static const BLOCK_SIZE next_square_size[BLOCK_SIZES_ALL] = { // function so repeat calls can accumulate a min and max of more than one // superblock. static void get_sb_partition_size_range(const AV1_COMMON *const cm, - MACROBLOCKD *xd, MODE_INFO **mib, + MACROBLOCKD *xd, MB_MODE_INFO **mib, BLOCK_SIZE *min_block_size, BLOCK_SIZE *max_block_size) { int i, j; int index = 0; // Check the sb_type for each block that belongs to this region. - for (i = 0; i < cm->mib_size; ++i) { - for (j = 0; j < cm->mib_size; ++j) { - MODE_INFO *mi = mib[index + j]; - BLOCK_SIZE sb_type = mi ? mi->mbmi.sb_type : BLOCK_4X4; + for (i = 0; i < cm->seq_params.mib_size; ++i) { + for (j = 0; j < cm->seq_params.mib_size; ++j) { + MB_MODE_INFO *mi = mib[index + j]; + BLOCK_SIZE sb_type = mi ? mi->sb_type : BLOCK_4X4; *min_block_size = AOMMIN(*min_block_size, sb_type); *max_block_size = AOMMAX(*max_block_size, sb_type); } @@ -3047,6 +1971,68 @@ static void get_sb_partition_size_range(const AV1_COMMON *const cm, } } +// Checks to see if a super block is on a horizontal image edge. +// In most cases this is the "real" edge unless there are formatting +// bars embedded in the stream. +static int active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) { + int top_edge = 0; + int bottom_edge = cpi->common.mi_rows; + int is_active_h_edge = 0; + + // For two pass account for any formatting bars detected. + if (cpi->oxcf.pass == 2) { + const TWO_PASS *const twopass = &cpi->twopass; + + // The inactive region is specified in MBs not mi units. + // The image edge is in the following MB row. + top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2); + + bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2); + bottom_edge = AOMMAX(top_edge, bottom_edge); + } + + if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) || + ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) { + is_active_h_edge = 1; + } + return is_active_h_edge; +} + +// Checks to see if a super block is on a vertical image edge. +// In most cases this is the "real" edge unless there are formatting +// bars embedded in the stream. +static int active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) { + int left_edge = 0; + int right_edge = cpi->common.mi_cols; + int is_active_v_edge = 0; + + // For two pass account for any formatting bars detected. + if (cpi->oxcf.pass == 2) { + const TWO_PASS *const twopass = &cpi->twopass; + + // The inactive region is specified in MBs not mi units. + // The image edge is in the following MB column. 
+ left_edge += (int)(twopass->this_frame_stats.inactive_zone_cols * 2); + + right_edge -= (int)(twopass->this_frame_stats.inactive_zone_cols * 2); + right_edge = AOMMAX(left_edge, right_edge); + } + + if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) || + ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) { + is_active_v_edge = 1; + } + return is_active_v_edge; +} + +// Checks to see if a super block is at the edge of the active image. +// In most cases this is the "real" edge unless there are formatting +// bars embedded in the stream. +static int active_edge_sb(const AV1_COMP *cpi, int mi_row, int mi_col) { + return active_h_edge(cpi, mi_row, cpi->common.seq_params.mib_size) || + active_v_edge(cpi, mi_col, cpi->common.seq_params.mib_size); +} + // Look at neighboring blocks and set a min and max partition size based on // what they chose. static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile, @@ -3054,7 +2040,7 @@ static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile, int mi_col, BLOCK_SIZE *min_block_size, BLOCK_SIZE *max_block_size) { AV1_COMMON *const cm = &cpi->common; - MODE_INFO **mi = xd->mi; + MB_MODE_INFO **mi = xd->mi; const int left_in_image = xd->left_available && mi[-1]; const int above_in_image = xd->up_available && mi[-xd->mi_stride]; const int mi_rows_remaining = tile->mi_row_end - mi_row; @@ -3073,18 +2059,19 @@ static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile, // passed in values for min and max as a starting point. // Find the min and max partition used in previous frame at this location if (cm->frame_type != KEY_FRAME) { - MODE_INFO **prev_mi = + MB_MODE_INFO **prev_mi = &cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col]; get_sb_partition_size_range(cm, xd, prev_mi, &min_size, &max_size); } // Find the min and max partition sizes used in the left superblock if (left_in_image) { - MODE_INFO **left_sb_mi = &mi[-cm->mib_size]; + MB_MODE_INFO **left_sb_mi = &mi[-cm->seq_params.mib_size]; get_sb_partition_size_range(cm, xd, left_sb_mi, &min_size, &max_size); } // Find the min and max partition sizes used in the above superblock. if (above_in_image) { - MODE_INFO **above_sb_mi = &mi[-xd->mi_stride * cm->mib_size]; + MB_MODE_INFO **above_sb_mi = + &mi[-xd->mi_stride * cm->seq_params.mib_size]; get_sb_partition_size_range(cm, xd, above_sb_mi, &min_size, &max_size); } @@ -3103,7 +2090,7 @@ static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile, // Test for blocks at the edge of the active image. // This may be the actual edge of the image or where there are formatting // bars. 
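/* The inactive-zone numbers above are in macroblock units, i.e. two mi units
 * each: with, say, inactive_zone_rows == 4 from the first pass,
 * active_h_edge() treats mi row 8 as the top of the visible image, so a
 * superblock whose [mi_row, mi_row + mib_size) range straddles that boundary
 * counts as an edge block (numbers illustrative only). */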
- if (av1_active_edge_sb(cpi, mi_row, mi_col)) { + if (active_edge_sb(cpi, mi_row, mi_col)) { min_size = BLOCK_4X4; } else { min_size = AOMMIN(cpi->sf.rd_auto_partition_min_limit, min_size); @@ -3116,8 +2103,8 @@ static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile, min_size = AOMMIN(min_size, next_square_size[max_size]); } - *min_block_size = AOMMIN(min_size, cm->sb_size); - *max_block_size = AOMMIN(max_size, cm->sb_size); + *min_block_size = AOMMIN(min_size, cm->seq_params.sb_size); + *max_block_size = AOMMIN(max_size, cm->seq_params.sb_size); } // TODO(jingning) refactor functions setting partition search range @@ -3131,15 +2118,15 @@ static void set_partition_range(const AV1_COMMON *const cm, int idx, idy; const int idx_str = cm->mi_stride * mi_row + mi_col; - MODE_INFO **const prev_mi = &cm->prev_mi_grid_visible[idx_str]; - BLOCK_SIZE min_size = cm->sb_size; // default values + MB_MODE_INFO **const prev_mi = &cm->prev_mi_grid_visible[idx_str]; + BLOCK_SIZE min_size = cm->seq_params.sb_size; // default values BLOCK_SIZE max_size = BLOCK_4X4; if (prev_mi) { for (idy = 0; idy < mi_height; ++idy) { for (idx = 0; idx < mi_width; ++idx) { - const MODE_INFO *const mi = prev_mi[idy * cm->mi_stride + idx]; - const BLOCK_SIZE bs = mi ? mi->mbmi.sb_type : bsize; + const MB_MODE_INFO *const mi = prev_mi[idy * cm->mi_stride + idx]; + const BLOCK_SIZE bs = mi ? mi->sb_type : bsize; min_size = AOMMIN(min_size, bs); max_size = AOMMAX(max_size, bs); } @@ -3148,8 +2135,8 @@ static void set_partition_range(const AV1_COMMON *const cm, if (xd->left_available) { for (idy = 0; idy < mi_height; ++idy) { - const MODE_INFO *const mi = xd->mi[idy * cm->mi_stride - 1]; - const BLOCK_SIZE bs = mi ? mi->mbmi.sb_type : bsize; + const MB_MODE_INFO *const mi = xd->mi[idy * cm->mi_stride - 1]; + const BLOCK_SIZE bs = mi ? mi->sb_type : bsize; min_size = AOMMIN(min_size, bs); max_size = AOMMAX(max_size, bs); } @@ -3157,8 +2144,8 @@ static void set_partition_range(const AV1_COMMON *const cm, if (xd->up_available) { for (idx = 0; idx < mi_width; ++idx) { - const MODE_INFO *const mi = xd->mi[idx - cm->mi_stride]; - const BLOCK_SIZE bs = mi ? mi->mbmi.sb_type : bsize; + const MB_MODE_INFO *const mi = xd->mi[idx - cm->mi_stride]; + const BLOCK_SIZE bs = mi ? mi->sb_type : bsize; min_size = AOMMIN(min_size, bs); max_size = AOMMAX(max_size, bs); } @@ -3169,8 +2156,8 @@ static void set_partition_range(const AV1_COMMON *const cm, max_size = max_partition_size[max_size]; } - *min_bs = AOMMIN(min_size, cm->sb_size); - *max_bs = AOMMIN(max_size, cm->sb_size); + *min_bs = AOMMIN(min_size, cm->seq_params.sb_size); + *max_bs = AOMMIN(max_size, cm->seq_params.sb_size); } static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { @@ -3184,24 +2171,18 @@ static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { #if CONFIG_FP_MB_STATS const int qindex_skip_threshold_lookup[BLOCK_SIZES] = { 0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120, -#if CONFIG_EXT_PARTITION // TODO(debargha): What are the correct numbers here? 130, 130, 150 -#endif // CONFIG_EXT_PARTITION }; const int qindex_split_threshold_lookup[BLOCK_SIZES] = { 0, 3, 3, 7, 15, 15, 30, 40, 40, 60, 80, 80, 120, -#if CONFIG_EXT_PARTITION // TODO(debargha): What are the correct numbers here? 160, 160, 240 -#endif // CONFIG_EXT_PARTITION }; const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 6, -#if CONFIG_EXT_PARTITION // TODO(debargha): What are the correct numbers here? 
8, 8, 10 -#endif // CONFIG_EXT_PARTITION }; typedef enum { @@ -3237,7 +2218,6 @@ static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv, } #endif -#if CONFIG_EXT_PARTITION_TYPES // Try searching for an encoding for the given subblock. Returns zero if the // rdcost is already too high (to tell the caller not to bother searching for // encodings of further subblocks) @@ -3246,20 +2226,11 @@ static int rd_try_subblock(const AV1_COMP *const cpi, ThreadData *td, int is_first, int is_last, int mi_row, int mi_col, BLOCK_SIZE subsize, RD_STATS *best_rdc, RD_STATS *sum_rdc, RD_STATS *this_rdc, -#if CONFIG_SUPERTX - int64_t best_rd, int *sum_rate_nocoef, - int *this_rate_nocoef, int *abort_flag, -#endif PARTITION_TYPE partition, PICK_MODE_CONTEXT *prev_ctx, PICK_MODE_CONTEXT *this_ctx) { -#if CONFIG_SUPERTX -#define RTS_X_RATE_NOCOEF_ARG ((is_first) ? sum_rate_nocoef : this_rate_nocoef), -#define RTS_MAX_RDCOST INT64_MAX -#else #define RTS_X_RATE_NOCOEF_ARG #define RTS_MAX_RDCOST best_rdc->rdcost -#endif MACROBLOCK *const x = &td->mb; @@ -3276,32 +2247,22 @@ static int rd_try_subblock(const AV1_COMP *const cpi, ThreadData *td, RTS_X_RATE_NOCOEF_ARG partition, subsize, this_ctx, rdcost_remaining); -#if CONFIG_SUPERTX - if (is_first) *abort_flag = sum_rdc->rdcost >= best_rd; -#endif - if (!is_first) { if (this_rdc->rate == INT_MAX) { sum_rdc->rdcost = INT64_MAX; -#if CONFIG_SUPERTX - *sum_rate_nocoef = INT_MAX; -#endif } else { sum_rdc->rate += this_rdc->rate; sum_rdc->dist += this_rdc->dist; sum_rdc->rdcost += this_rdc->rdcost; -#if CONFIG_SUPERTX - *sum_rate_nocoef += *this_rate_nocoef; -#endif } } if (sum_rdc->rdcost >= RTS_MAX_RDCOST) return 0; if (!is_last) { - update_state(cpi, td, this_ctx, mi_row, mi_col, subsize, 1); - encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize, - NULL); + update_state(cpi, tile_data, td, this_ctx, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, + subsize, NULL); } return 1; @@ -3310,41 +2271,19 @@ static int rd_try_subblock(const AV1_COMP *const cpi, ThreadData *td, #undef RTS_MAX_RDCOST } -static void rd_test_partition3( - const AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, - TOKENEXTRA **tp, PC_TREE *pc_tree, RD_STATS *best_rdc, - PICK_MODE_CONTEXT ctxs[3], PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, - BLOCK_SIZE bsize, PARTITION_TYPE partition, -#if CONFIG_SUPERTX - int64_t best_rd, int *best_rate_nocoef, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx, -#endif - int mi_row0, int mi_col0, BLOCK_SIZE subsize0, int mi_row1, int mi_col1, - BLOCK_SIZE subsize1, int mi_row2, int mi_col2, BLOCK_SIZE subsize2) { +static void rd_test_partition3(const AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TOKENEXTRA **tp, + PC_TREE *pc_tree, RD_STATS *best_rdc, + PICK_MODE_CONTEXT ctxs[3], + PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, + BLOCK_SIZE bsize, PARTITION_TYPE partition, + int mi_row0, int mi_col0, BLOCK_SIZE subsize0, + int mi_row1, int mi_col1, BLOCK_SIZE subsize1, + int mi_row2, int mi_col2, BLOCK_SIZE subsize2) { MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; RD_STATS sum_rdc, this_rdc; -#if CONFIG_UNPOISON_PARTITION_CTX - const AV1_COMMON *const cm = &cpi->common; - const int hbs = mi_size_wide[bsize] / 2; - const int has_rows = mi_row + hbs < cm->mi_rows; - const int has_cols = mi_col + hbs < cm->mi_cols; -#endif // CONFIG_UNPOISON_PARTITION_CTX -#if CONFIG_SUPERTX || CONFIG_EXT_PARTITION_TYPES_AB - const AV1_COMMON *const cm = 
&cpi->common; -#endif -#if CONFIG_SUPERTX - TileInfo *const tile_info = &tile_data->tile_info; - int sum_rate_nocoef, this_rate_nocoef; - int abort_flag; - const int supertx_allowed = !frame_is_intra_only(cm) && - bsize <= MAX_SUPERTX_BLOCK_SIZE && - !xd->lossless[0]; - -#define RTP_STX_TRY_ARGS \ - best_rd, &sum_rate_nocoef, &this_rate_nocoef, &abort_flag, -#else #define RTP_STX_TRY_ARGS -#endif if (!rd_try_subblock(cpi, td, tile_data, tp, 1, 0, mi_row0, mi_col0, subsize0, best_rdc, &sum_rdc, &this_rdc, @@ -3356,131 +2295,586 @@ static void rd_test_partition3( RTP_STX_TRY_ARGS partition, &ctxs[0], &ctxs[1])) return; -// With the new layout of mixed partitions for PARTITION_HORZ_B and -// PARTITION_VERT_B, the last subblock might start past halfway through the -// main block, so we might signal it even though the subblock lies strictly -// outside the image. In that case, we won't spend any bits coding it and the -// difference (obviously) doesn't contribute to the error. -#if CONFIG_EXT_PARTITION_TYPES_AB - const int try_block2 = mi_row2 < cm->mi_rows && mi_col2 < cm->mi_cols; -#else + // With the new layout of mixed partitions for PARTITION_HORZ_B and + // PARTITION_VERT_B, the last subblock might start past halfway through the + // main block, so we might signal it even though the subblock lies strictly + // outside the image. In that case, we won't spend any bits coding it and the + // difference (obviously) doesn't contribute to the error. const int try_block2 = 1; -#endif if (try_block2 && !rd_try_subblock(cpi, td, tile_data, tp, 0, 1, mi_row2, mi_col2, subsize2, best_rdc, &sum_rdc, &this_rdc, RTP_STX_TRY_ARGS partition, &ctxs[1], &ctxs[2])) return; -#if CONFIG_SUPERTX - if (supertx_allowed && !abort_flag && sum_rdc.rdcost < INT64_MAX) { - TX_SIZE supertx_size = max_txsize_lookup[bsize]; - const PARTITION_TYPE best_partition = pc_tree->partitioning; - pc_tree->partitioning = partition; - sum_rdc.rate += av1_cost_bit( - cm->fc->supertx_prob[partition_supertx_context_lookup[partition]] - [supertx_size], - 0); - sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); - - if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) { - TX_TYPE best_tx = DCT_DCT; - RD_STATS tmp_rdc = { sum_rate_nocoef, 0, 0 }; - - restore_context(x, x_ctx, mi_row, mi_col, bsize); - - rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize, &tmp_rdc.rate, - &tmp_rdc.dist, &best_tx, pc_tree); - - tmp_rdc.rate += av1_cost_bit( - cm->fc->supertx_prob[partition_supertx_context_lookup[partition]] - [supertx_size], - 1); - tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist); - if (tmp_rdc.rdcost < sum_rdc.rdcost) { - sum_rdc = tmp_rdc; - update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx, - supertx_size, pc_tree); - } - } - - pc_tree->partitioning = best_partition; - } -#endif - if (sum_rdc.rdcost >= best_rdc->rdcost) return; - int pl = partition_plane_context(xd, mi_row, mi_col, -#if CONFIG_UNPOISON_PARTITION_CTX - has_rows, has_cols, -#endif - bsize); + int pl = partition_plane_context(xd, mi_row, mi_col, bsize); sum_rdc.rate += x->partition_cost[pl][partition]; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); -#if CONFIG_SUPERTX - sum_rate_nocoef += x->partition_cost[pl][partition]; -#endif if (sum_rdc.rdcost >= best_rdc->rdcost) return; -#if CONFIG_SUPERTX - *best_rate_nocoef = sum_rate_nocoef; - assert(*best_rate_nocoef >= 0); -#endif *best_rdc = sum_rdc; pc_tree->partitioning = partition; #undef RTP_STX_TRY_ARGS } -#endif // CONFIG_EXT_PARTITION_TYPES 
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 static int64_t dist_8x8_yuv(const AV1_COMP *const cpi, MACROBLOCK *const x, - uint8_t *y_src_8x8) { + uint8_t *src_plane_8x8[MAX_MB_PLANE], + uint8_t *dst_plane_8x8[MAX_MB_PLANE]) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; int64_t dist_8x8, dist_8x8_uv, total_dist; const int src_stride = x->plane[0].src.stride; - uint8_t *decoded_8x8; int plane; -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - decoded_8x8 = CONVERT_TO_BYTEPTR(x->decoded_8x8); - else -#endif - decoded_8x8 = (uint8_t *)x->decoded_8x8; - - dist_8x8 = av1_dist_8x8(cpi, x, y_src_8x8, src_stride, decoded_8x8, 8, - BLOCK_8X8, 8, 8, 8, 8, x->qindex) - << 4; + const int dst_stride = xd->plane[0].dst.stride; + dist_8x8 = + av1_dist_8x8(cpi, x, src_plane_8x8[0], src_stride, dst_plane_8x8[0], + dst_stride, BLOCK_8X8, 8, 8, 8, 8, x->qindex) + << 4; // Compute chroma distortion for a luma 8x8 block dist_8x8_uv = 0; - for (plane = 1; plane < MAX_MB_PLANE; ++plane) { - const int src_stride_uv = x->plane[plane].src.stride; - const int dst_stride_uv = xd->plane[plane].dst.stride; - // uv buff pointers now (i.e. the last sub8x8 block) is the same - // to those at the first sub8x8 block because - // uv buff pointer is set only once at first sub8x8 block in a 8x8. - uint8_t *src_uv = x->plane[plane].src.buf; - uint8_t *dst_uv = xd->plane[plane].dst.buf; - unsigned sse; -#if CONFIG_CHROMA_SUB8X8 - const BLOCK_SIZE plane_bsize = - AOMMAX(BLOCK_4X4, get_plane_block_size(BLOCK_8X8, &xd->plane[plane])); -#else - const BLOCK_SIZE plane_bsize = - get_plane_block_size(BLOCK_8X8, &xd->plane[plane]); -#endif - cpi->fn_ptr[plane_bsize].vf(src_uv, src_stride_uv, dst_uv, dst_stride_uv, - &sse); - dist_8x8_uv += (int64_t)sse << 4; + if (num_planes > 1) { + for (plane = 1; plane < MAX_MB_PLANE; ++plane) { + unsigned sse; + const int src_stride_uv = x->plane[plane].src.stride; + const int dst_stride_uv = xd->plane[plane].dst.stride; + const int ssx = xd->plane[plane].subsampling_x; + const int ssy = xd->plane[plane].subsampling_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(BLOCK_8X8, ssx, ssy); + + cpi->fn_ptr[plane_bsize].vf(src_plane_8x8[plane], src_stride_uv, + dst_plane_8x8[plane], dst_stride_uv, &sse); + dist_8x8_uv += (int64_t)sse << 4; + } } return total_dist = dist_8x8 + dist_8x8_uv; } -#endif // CONFIG_DIST_8X8 && CONFIG_CB4X4 +#endif // CONFIG_DIST_8X8 + +static void reset_partition(PC_TREE *pc_tree, BLOCK_SIZE bsize) { + pc_tree->partitioning = PARTITION_NONE; + pc_tree->cb_search_range = SEARCH_FULL_PLANE; + + if (bsize >= BLOCK_8X8) { + BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + for (int idx = 0; idx < 4; ++idx) + reset_partition(pc_tree->split[idx], subsize); + } +} + +static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TOKENEXTRA **tp, + int mi_row, int mi_col, BLOCK_SIZE bsize, + RD_STATS *rd_cost, int64_t best_rd, + PC_TREE *pc_tree, int64_t *none_rd) { + const AV1_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int mi_step = mi_size_wide[bsize] / 2; + RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; + const TOKENEXTRA *const tp_orig = *tp; + PICK_MODE_CONTEXT *ctx_none = &pc_tree->none; + int tmp_partition_cost[PARTITION_TYPES]; + BLOCK_SIZE subsize; + RD_STATS this_rdc, sum_rdc, 
best_rdc, pn_rdc; + const int bsize_at_least_8x8 = (bsize >= BLOCK_8X8); + int do_square_split = bsize_at_least_8x8; + const int pl = bsize_at_least_8x8 + ? partition_plane_context(xd, mi_row, mi_col, bsize) + : 0; + const int *partition_cost = + pl >= 0 ? x->partition_cost[pl] : x->partition_cost[0]; + const int num_planes = av1_num_planes(cm); + + int64_t split_rd[4] = { 0, 0, 0, 0 }; + + // Override skipping rectangular partition operations for edge blocks + const int has_rows = (mi_row + mi_step < cm->mi_rows); + const int has_cols = (mi_col + mi_step < cm->mi_cols); + + if (none_rd) *none_rd = 0; + + int partition_none_allowed = has_rows && has_cols; + + (void)*tp_orig; + (void)split_rd; + + av1_zero(pc_tree->pc_tree_stats); + pc_tree->pc_tree_stats.valid = 1; + + // Override partition costs at the edges of the frame in the same + // way as in read_partition (see decodeframe.c) + if (!(has_rows && has_cols)) { + assert(bsize_at_least_8x8 && pl >= 0); + const aom_cdf_prob *partition_cdf = cm->fc->partition_cdf[pl]; + for (int i = 0; i < PARTITION_TYPES; ++i) tmp_partition_cost[i] = INT_MAX; + if (has_cols) { + // At the bottom, the two possibilities are HORZ and SPLIT + aom_cdf_prob bot_cdf[2]; + partition_gather_vert_alike(bot_cdf, partition_cdf, bsize); + static const int bot_inv_map[2] = { PARTITION_HORZ, PARTITION_SPLIT }; + av1_cost_tokens_from_cdf(tmp_partition_cost, bot_cdf, bot_inv_map); + } else if (has_rows) { + // At the right, the two possibilities are VERT and SPLIT + aom_cdf_prob rhs_cdf[2]; + partition_gather_horz_alike(rhs_cdf, partition_cdf, bsize); + static const int rhs_inv_map[2] = { PARTITION_VERT, PARTITION_SPLIT }; + av1_cost_tokens_from_cdf(tmp_partition_cost, rhs_cdf, rhs_inv_map); + } else { + // At the bottom right, we always split + tmp_partition_cost[PARTITION_SPLIT] = 0; + } + + partition_cost = tmp_partition_cost; + } + +#ifndef NDEBUG + // Nothing should rely on the default value of this array (which is just + // leftover from encoding the previous block). Setting it to a magic number + // helps when debugging. + memset(x->blk_skip, 234, sizeof(x->blk_skip)); +#endif // NDEBUG + + assert(mi_size_wide[bsize] == mi_size_high[bsize]); + + av1_init_rd_stats(&this_rdc); + av1_init_rd_stats(&sum_rdc); + av1_invalid_rd_stats(&best_rdc); + best_rdc.rdcost = best_rd; + + set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + + if (bsize == BLOCK_16X16 && cpi->vaq_refresh) + x->mb_energy = av1_block_energy(cpi, x, bsize); + + xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + + // PARTITION_NONE + if (partition_none_allowed) { + if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE; + + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, + PARTITION_NONE, bsize, ctx_none, best_rdc.rdcost); + + pc_tree->pc_tree_stats.rdcost = ctx_none->rdcost; + pc_tree->pc_tree_stats.skip = ctx_none->skip; + + if (none_rd) *none_rd = this_rdc.rdcost; + if (this_rdc.rate != INT_MAX) { + if (bsize_at_least_8x8) { + const int pt_cost = partition_cost[PARTITION_NONE] < INT_MAX + ? partition_cost[PARTITION_NONE] + : 0; + this_rdc.rate += pt_cost; + this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist); + } + + if (this_rdc.rdcost < best_rdc.rdcost) { + // Adjust dist breakout threshold according to the partition size. 
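/* The shift applied below scales the distortion breakout threshold by block
 * area relative to a full superblock: with MAX_SB_SIZE_LOG2 == 7 the
 * constant term is 2 * (7 - 2) == 10, so a 16x16 block (mi log2 dims 2 + 2,
 * assuming the *_log2 tables are in mi units as their names suggest) uses
 * thr >> 6, i.e. 1/64th of the superblock budget; the rate threshold is
 * scaled by num_pels_log2_lookup instead. */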
+ const int64_t dist_breakout_thr = + cpi->sf.partition_search_breakout_dist_thr >> + ((2 * (MAX_SB_SIZE_LOG2 - 2)) - + (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize])); + const int rate_breakout_thr = + cpi->sf.partition_search_breakout_rate_thr * + num_pels_log2_lookup[bsize]; + + best_rdc = this_rdc; + if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE; + + pc_tree->cb_search_range = SEARCH_FULL_PLANE; + + // If all y, u, v transform blocks in this partition are skippable, and + // the dist & rate are within the thresholds, the partition search is + // terminated for current branch of the partition search tree. + // The dist & rate thresholds are set to 0 at speed 0 to disable the + // early termination at that speed. + if (!x->e_mbd.lossless[xd->mi[0]->segment_id] && + (ctx_none->skippable && best_rdc.dist < dist_breakout_thr && + best_rdc.rate < rate_breakout_thr)) { + do_square_split = 0; + } + } + } + + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + } + + // store estimated motion vector + if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx_none); + + int64_t temp_best_rdcost = best_rdc.rdcost; + pn_rdc = best_rdc; + +#if CONFIG_DIST_8X8 + uint8_t *src_plane_8x8[MAX_MB_PLANE], *dst_plane_8x8[MAX_MB_PLANE]; + + if (x->using_dist_8x8 && bsize == BLOCK_8X8) { + for (int i = 0; i < MAX_MB_PLANE; i++) { + src_plane_8x8[i] = x->plane[i].src.buf; + dst_plane_8x8[i] = xd->plane[i].dst.buf; + } + } +#endif // CONFIG_DIST_8X8 + + // PARTITION_SPLIT + if (do_square_split) { + int reached_last_index = 0; + subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + int idx; + + for (idx = 0; idx < 4 && sum_rdc.rdcost < temp_best_rdcost; ++idx) { + const int x_idx = (idx & 1) * mi_step; + const int y_idx = (idx >> 1) * mi_step; + + if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) + continue; + + if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none); + + pc_tree->split[idx]->index = idx; + int64_t *p_split_rd = &split_rd[idx]; + rd_pick_sqr_partition(cpi, td, tile_data, tp, mi_row + y_idx, + mi_col + x_idx, subsize, &this_rdc, + temp_best_rdcost - sum_rdc.rdcost, + pc_tree->split[idx], p_split_rd); + + pc_tree->pc_tree_stats.sub_block_rdcost[idx] = this_rdc.rdcost; + pc_tree->pc_tree_stats.sub_block_skip[idx] = + pc_tree->split[idx]->none.skip; + + if (this_rdc.rate == INT_MAX) { + sum_rdc.rdcost = INT64_MAX; + break; + } else { + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + sum_rdc.rdcost += this_rdc.rdcost; + } + } + reached_last_index = (idx == 4); + +#if CONFIG_DIST_8X8 + if (x->using_dist_8x8 && reached_last_index && + sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) { + sum_rdc.dist = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8); + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); + } +#endif // CONFIG_DIST_8X8 + + if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) { + sum_rdc.rate += partition_cost[PARTITION_SPLIT]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); + + if (sum_rdc.rdcost < best_rdc.rdcost) { + best_rdc = sum_rdc; + pc_tree->partitioning = PARTITION_SPLIT; + } + } + + int has_split = 0; + if (pc_tree->partitioning == PARTITION_SPLIT) { + for (int cb_idx = 0; cb_idx <= AOMMIN(idx, 3); ++cb_idx) { + if (pc_tree->split[cb_idx]->partitioning == PARTITION_SPLIT) + ++has_split; + } + + if (has_split >= 3 || sum_rdc.rdcost < (pn_rdc.rdcost >> 1)) { + pc_tree->cb_search_range = SPLIT_PLANE; + } + } + + if (pc_tree->partitioning == PARTITION_NONE) { 
+ pc_tree->cb_search_range = SEARCH_SAME_PLANE; + if (pn_rdc.dist <= sum_rdc.dist) + pc_tree->cb_search_range = NONE_PARTITION_PLANE; + } + + if (pn_rdc.rate == INT_MAX) pc_tree->cb_search_range = NONE_PARTITION_PLANE; + + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + } // if (do_split) + + pc_tree->pc_tree_stats.split = pc_tree->partitioning == PARTITION_SPLIT; + if (do_square_split) { + for (int i = 0; i < 4; ++i) { + pc_tree->pc_tree_stats.sub_block_split[i] = + pc_tree->split[i]->partitioning == PARTITION_SPLIT; + } + } + + // TODO(jbb): This code added so that we avoid static analysis + // warning related to the fact that best_rd isn't used after this + // point. This code should be refactored so that the duplicate + // checks occur in some sub function and thus are used... + (void)best_rd; + *rd_cost = best_rdc; + + if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && + pc_tree->index != 3) { + if (bsize == cm->seq_params.sb_size) { + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + } else { + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, + pc_tree, NULL); + } + } + +#if CONFIG_DIST_8X8 + if (x->using_dist_8x8 && best_rdc.rate < INT_MAX && + best_rdc.dist < INT64_MAX && bsize == BLOCK_4X4 && pc_tree->index == 3) { + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, + pc_tree, NULL); + } +#endif // CONFIG_DIST_8X8 + + if (bsize == cm->seq_params.sb_size) { + assert(best_rdc.rate < INT_MAX); + assert(best_rdc.dist < INT64_MAX); + } else { + assert(tp_orig == *tp); + } +} + +#define FEATURE_SIZE 19 +static const float two_pass_split_partition_weights_128[FEATURE_SIZE + 1] = { + 2.683936f, -0.193620f, -4.106470f, -0.141320f, -0.282289f, + 0.125296f, -1.134961f, 0.862757f, -0.418799f, -0.637666f, + 0.016232f, 0.345013f, 0.018823f, -0.393394f, -1.130700f, + 0.695357f, 0.112569f, -0.341975f, -0.513882f, 5.7488966f, +}; + +static const float two_pass_split_partition_weights_64[FEATURE_SIZE + 1] = { + 2.990993f, 0.423273f, -0.926544f, 0.454646f, -0.292698f, + -1.311632f, -0.284432f, 0.717141f, -0.419257f, -0.574760f, + -0.674444f, 0.669047f, -0.374255f, 0.380624f, -0.804036f, + 0.264021f, 0.004163f, 1.896802f, 0.924287f, 0.13490619f, +}; + +static const float two_pass_split_partition_weights_32[FEATURE_SIZE + 1] = { + 2.795181f, -0.136943f, -0.924842f, 0.405330f, -0.463505f, + -0.584076f, -0.831472f, 0.382985f, -0.597544f, -0.138915f, + -1.354350f, 0.466035f, -0.553961f, 0.213202f, -1.166429f, + 0.010776f, -0.096236f, 2.335084f, 1.699857f, -0.58178353f, +}; + +static const float two_pass_split_partition_weights_16[FEATURE_SIZE + 1] = { + 1.987888f, -0.431100f, -1.687703f, 0.262602f, -0.425298f, + -0.463870f, -1.493457f, 0.470917f, -0.528457f, -0.087700f, + -1.815092f, 0.152883f, -0.337908f, 0.093679f, -1.548267f, + -0.042387f, -0.000861f, 2.556746f, 1.619192f, 0.03643292f, +}; + +static const float two_pass_split_partition_weights_8[FEATURE_SIZE + 1] = { + 2.188344f, -0.817528f, -2.119219f, 0.000000f, -0.348167f, + -0.658074f, -1.960362f, 0.000000f, -0.403080f, 0.282699f, + -2.061088f, 0.000000f, -0.431919f, -0.127960f, -1.099550f, + 0.000000f, 0.121622f, 2.017455f, 2.058228f, -0.15475988f, +}; + +static const float two_pass_none_partition_weights_128[FEATURE_SIZE + 1] = { + -1.006689f, 0.777908f, 4.461072f, -0.395782f, -0.014610f, + -0.853863f, 0.729997f, -0.420477f, 0.282429f, -1.194595f, + 3.181220f, -0.511416f, 0.117084f, -1.149348f, 1.507990f, + -0.477212f, 0.202963f, -1.469581f, 0.624461f, 
-0.89081228f, +}; + +static const float two_pass_none_partition_weights_64[FEATURE_SIZE + 1] = { + -1.241117f, 0.844878f, 5.638803f, -0.489780f, -0.108796f, + -4.576821f, 1.540624f, -0.477519f, 0.227791f, -1.443968f, + 1.586911f, -0.505125f, 0.140764f, -0.464194f, 1.466658f, + -0.641166f, 0.195412f, 1.427905f, 2.080007f, -1.98272777f, +}; + +static const float two_pass_none_partition_weights_32[FEATURE_SIZE + 1] = { + -2.130825f, 0.476023f, 5.907343f, -0.516002f, -0.097471f, + -2.662754f, 0.614858f, -0.576728f, 0.085261f, -0.031901f, + 0.727842f, -0.600034f, 0.079326f, 0.324328f, 0.504502f, + -0.547105f, -0.037670f, 0.304995f, 0.369018f, -2.66299987f, +}; + +static const float two_pass_none_partition_weights_16[FEATURE_SIZE + 1] = { + -1.626410f, 0.872047f, 5.414965f, -0.554781f, -0.084514f, + -3.020550f, 0.467632f, -0.382280f, 0.199568f, 0.426220f, + 0.829426f, -0.467100f, 0.153098f, 0.662994f, 0.327545f, + -0.560106f, -0.141610f, 0.403372f, 0.523991f, -3.02891231f, +}; + +static const float two_pass_none_partition_weights_8[FEATURE_SIZE + 1] = { + -1.463349f, 0.375376f, 4.751430f, 0.000000f, -0.184451f, + -1.655447f, 0.443214f, 0.000000f, 0.127961f, 0.152435f, + 0.083288f, 0.000000f, 0.143105f, 0.438012f, 0.073238f, + 0.000000f, -0.278137f, 0.186134f, 0.073737f, -1.6494962f, +}; + +// split_score indicates confidence of picking split partition; +// none_score indicates confidence of picking none partition; +static int ml_prune_2pass_split_partition(const PC_TREE_STATS *pc_tree_stats, + BLOCK_SIZE bsize, int *split_score, + int *none_score) { + if (!pc_tree_stats->valid) return 0; + const float *split_weights = NULL; + const float *none_weights = NULL; + switch (bsize) { + case BLOCK_4X4: break; + case BLOCK_8X8: + split_weights = two_pass_split_partition_weights_8; + none_weights = two_pass_none_partition_weights_8; + break; + case BLOCK_16X16: + split_weights = two_pass_split_partition_weights_16; + none_weights = two_pass_none_partition_weights_16; + break; + case BLOCK_32X32: + split_weights = two_pass_split_partition_weights_32; + none_weights = two_pass_none_partition_weights_32; + break; + case BLOCK_64X64: + split_weights = two_pass_split_partition_weights_64; + none_weights = two_pass_none_partition_weights_64; + break; + case BLOCK_128X128: + split_weights = two_pass_split_partition_weights_128; + none_weights = two_pass_none_partition_weights_128; + break; + default: assert(0 && "Unexpected bsize."); + } + if (!split_weights || !none_weights) return 0; + + aom_clear_system_state(); + + float features[FEATURE_SIZE]; + int feature_index = 0; + features[feature_index++] = (float)pc_tree_stats->split; + features[feature_index++] = (float)pc_tree_stats->skip; + const int rdcost = (int)AOMMIN(INT_MAX, pc_tree_stats->rdcost); + const int rd_valid = rdcost > 0 && rdcost < 1000000000; + features[feature_index++] = (float)rd_valid; + for (int i = 0; i < 4; ++i) { + features[feature_index++] = (float)pc_tree_stats->sub_block_split[i]; + features[feature_index++] = (float)pc_tree_stats->sub_block_skip[i]; + const int sub_rdcost = + (int)AOMMIN(INT_MAX, pc_tree_stats->sub_block_rdcost[i]); + const int sub_rd_valid = sub_rdcost > 0 && sub_rdcost < 1000000000; + features[feature_index++] = (float)sub_rd_valid; + // Ratio between the sub-block RD and the whole-block RD. 
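/* Together with the three whole-block inputs gathered above (split flag,
 * skip flag, rd-valid flag), each of the four sub-blocks contributes its own
 * split/skip/valid flags plus this ratio: 3 + 4 * 4 == 19 == FEATURE_SIZE.
 * Defaulting the ratio to 1.0 when either cost is unusable makes an invalid
 * quadrant look as expensive as its parent rather than artificially cheap. */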
+ float rd_ratio = 1.0f; + if (rd_valid && sub_rd_valid && sub_rdcost < rdcost) + rd_ratio = (float)sub_rdcost / (float)rdcost; + features[feature_index++] = rd_ratio; + } + assert(feature_index == FEATURE_SIZE); + + float score_1 = split_weights[FEATURE_SIZE]; + float score_2 = none_weights[FEATURE_SIZE]; + for (int i = 0; i < FEATURE_SIZE; ++i) { + score_1 += features[i] * split_weights[i]; + score_2 += features[i] * none_weights[i]; + } + *split_score = (int)(score_1 * 100); + *none_score = (int)(score_2 * 100); + return 1; +} +#undef FEATURE_SIZE + +// Use an ML model to predict if horz_a, horz_b, vert_a, and vert_b should be +// considered. +static void ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx, + int64_t best_rd, int64_t horz_rd[2], + int64_t vert_rd[2], int64_t split_rd[4], + int *const horza_partition_allowed, + int *const horzb_partition_allowed, + int *const verta_partition_allowed, + int *const vertb_partition_allowed) { + if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return; + const NN_CONFIG *nn_config = NULL; + switch (bsize) { + case BLOCK_8X8: nn_config = NULL; break; + case BLOCK_16X16: nn_config = &av1_ab_partition_nnconfig_16; break; + case BLOCK_32X32: nn_config = &av1_ab_partition_nnconfig_32; break; + case BLOCK_64X64: nn_config = &av1_ab_partition_nnconfig_64; break; + case BLOCK_128X128: nn_config = &av1_ab_partition_nnconfig_128; break; + default: assert(0 && "Unexpected bsize."); + } + if (!nn_config) return; + + aom_clear_system_state(); + + // Generate features. + float features[10]; + int feature_index = 0; + features[feature_index++] = (float)part_ctx; + features[feature_index++] = (float)var_ctx; + const int rdcost = (int)AOMMIN(INT_MAX, best_rd); + int sub_block_rdcost[8] = { 0 }; + int rd_index = 0; + for (int i = 0; i < 2; ++i) { + if (horz_rd[i] > 0 && horz_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)horz_rd[i]; + ++rd_index; + } + for (int i = 0; i < 2; ++i) { + if (vert_rd[i] > 0 && vert_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)vert_rd[i]; + ++rd_index; + } + for (int i = 0; i < 4; ++i) { + if (split_rd[i] > 0 && split_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)split_rd[i]; + ++rd_index; + } + for (int i = 0; i < 8; ++i) { + // Ratio between the sub-block RD and the whole-block RD. + float rd_ratio = 1.0f; + if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost) + rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost; + features[feature_index++] = rd_ratio; + } + assert(feature_index == 10); + + // Calculate scores using the NN model. + float score[16] = { 0.0f }; + av1_nn_predict(features, nn_config, score); + int int_score[16]; + int max_score = -1000; + for (int i = 0; i < 16; ++i) { + int_score[i] = (int)(100 * score[i]); + max_score = AOMMAX(int_score[i], max_score); + } + + // Make decisions based on the model scores. 
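/* The 16 outputs are indexed by a 4-bit mask over {HORZ_A, HORZ_B, VERT_A,
 * VERT_B}, decoded bit 0 through bit 3 in the loop below; every label whose
 * score clears the bsize-dependent margin below max_score ORs its bits into
 * the allowed set. A hypothetical helper (not part of the patch's API) that
 * mirrors that decode:
 */
static int ab_label_allows_sketch(const int int_score[16], int thresh,
                                  int bit) {
  // Nonzero if any sufficiently confident label has `bit` set, e.g. bit 0
  // for HORZ_A: labels 0b0101 and 0b0010 together enable HORZ_A, VERT_A
  // and HORZ_B.
  for (int i = 0; i < 16; ++i)
    if (int_score[i] >= thresh && ((i >> bit) & 1)) return 1;
  return 0;
}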
+ int thresh = max_score; + switch (bsize) { + case BLOCK_16X16: thresh -= 150; break; + case BLOCK_32X32: thresh -= 100; break; + default: break; + } + *horza_partition_allowed = 0; + *horzb_partition_allowed = 0; + *verta_partition_allowed = 0; + *vertb_partition_allowed = 0; + for (int i = 0; i < 16; ++i) { + if (int_score[i] >= thresh) { + if ((i >> 0) & 1) *horza_partition_allowed = 1; + if ((i >> 1) & 1) *horzb_partition_allowed = 1; + if ((i >> 2) & 1) *verta_partition_allowed = 1; + if ((i >> 3) & 1) *vertb_partition_allowed = 1; + } + } +} // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previous rate-distortion optimization @@ -3488,12 +2882,10 @@ static int64_t dist_8x8_yuv(const AV1_COMP *const cpi, MACROBLOCK *const x, static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, - RD_STATS *rd_cost, -#if CONFIG_SUPERTX - int *rate_nocoef, -#endif - int64_t best_rd, PC_TREE *pc_tree) { + RD_STATS *rd_cost, int64_t best_rd, + PC_TREE *pc_tree, int64_t *none_rd) { const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -3501,114 +2893,87 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; const TOKENEXTRA *const tp_orig = *tp; PICK_MODE_CONTEXT *ctx_none = &pc_tree->none; -#if CONFIG_UNPOISON_PARTITION_CTX - const int hbs = mi_size_wide[bsize] / 2; - const int has_rows = mi_row + hbs < cm->mi_rows; - const int has_cols = mi_col + hbs < cm->mi_cols; -#else int tmp_partition_cost[PARTITION_TYPES]; -#endif BLOCK_SIZE subsize; RD_STATS this_rdc, sum_rdc, best_rdc; const int bsize_at_least_8x8 = (bsize >= BLOCK_8X8); int do_square_split = bsize_at_least_8x8; -#if CONFIG_CB4X4 - const int unify_bsize = 1; const int pl = bsize_at_least_8x8 - ? partition_plane_context(xd, mi_row, mi_col, -#if CONFIG_UNPOISON_PARTITION_CTX - has_rows, has_cols, -#endif - bsize) + ? partition_plane_context(xd, mi_row, mi_col, bsize) : 0; -#else - const int unify_bsize = 0; - const int pl = partition_plane_context(xd, mi_row, mi_col, -#if CONFIG_UNPOISON_PARTITION_CTX - has_rows, has_cols, -#endif - bsize); -#endif // CONFIG_CB4X4 const int *partition_cost = pl >= 0 ? 
x->partition_cost[pl] : x->partition_cost[0]; -#if CONFIG_SUPERTX - int this_rate_nocoef, sum_rate_nocoef = 0, best_rate_nocoef = INT_MAX; - int abort_flag; - const int supertx_allowed = !frame_is_intra_only(cm) && bsize >= BLOCK_8X8 && - bsize <= MAX_SUPERTX_BLOCK_SIZE && - !xd->lossless[0]; -#endif // CONFIG_SUPERTX int do_rectangular_split = 1; -#if CONFIG_EXT_PARTITION_TYPES && !CONFIG_EXT_PARTITION_TYPES_AB - BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT); -#endif + int64_t split_rd[4] = { 0, 0, 0, 0 }; + int64_t horz_rd[2] = { 0, 0 }; + int64_t vert_rd[2] = { 0, 0 }; + + int split_ctx_is_ready[2] = { 0, 0 }; + int horz_ctx_is_ready = 0; + int vert_ctx_is_ready = 0; + BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT); + + if (bsize == cm->seq_params.sb_size) x->must_find_valid_partition = 0; // Override skipping rectangular partition operations for edge blocks - const int force_horz_split = (mi_row + mi_step >= cm->mi_rows); - const int force_vert_split = (mi_col + mi_step >= cm->mi_cols); + const int has_rows = (mi_row + mi_step < cm->mi_rows); + const int has_cols = (mi_col + mi_step < cm->mi_cols); const int xss = x->e_mbd.plane[1].subsampling_x; const int yss = x->e_mbd.plane[1].subsampling_y; BLOCK_SIZE min_size = x->min_partition_size; BLOCK_SIZE max_size = x->max_partition_size; + if (none_rd) *none_rd = 0; + #if CONFIG_FP_MB_STATS unsigned int src_diff_var = UINT_MAX; int none_complexity = 0; #endif - int partition_none_allowed = !force_horz_split && !force_vert_split; - int partition_horz_allowed = - !force_vert_split && yss <= xss && bsize_at_least_8x8; - int partition_vert_allowed = - !force_horz_split && xss <= yss && bsize_at_least_8x8; - -#if CONFIG_PVQ - od_rollback_buffer pre_rdo_buf; -#endif + int partition_none_allowed = has_rows && has_cols; + int partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8; + int partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8; (void)*tp_orig; -#if !CONFIG_UNPOISON_PARTITION_CTX - if (force_horz_split || force_vert_split) { - tmp_partition_cost[PARTITION_NONE] = INT_MAX; - - if (!force_vert_split) { // force_horz_split only - tmp_partition_cost[PARTITION_VERT] = INT_MAX; - tmp_partition_cost[PARTITION_HORZ] = - av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_HORZ], 0); - tmp_partition_cost[PARTITION_SPLIT] = - av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_HORZ], 1); - } else if (!force_horz_split) { // force_vert_split only - tmp_partition_cost[PARTITION_HORZ] = INT_MAX; - tmp_partition_cost[PARTITION_VERT] = - av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_VERT], 0); - tmp_partition_cost[PARTITION_SPLIT] = - av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_VERT], 1); - } else { // force_horz_split && force_vert_split - tmp_partition_cost[PARTITION_HORZ] = INT_MAX; - tmp_partition_cost[PARTITION_VERT] = INT_MAX; + // Override partition costs at the edges of the frame in the same + // way as in read_partition (see decodeframe.c) + if (!(has_rows && has_cols)) { + assert(bsize_at_least_8x8 && pl >= 0); + const aom_cdf_prob *partition_cdf = cm->fc->partition_cdf[pl]; + for (int i = 0; i < PARTITION_TYPES; ++i) tmp_partition_cost[i] = INT_MAX; + if (has_cols) { + // At the bottom, the two possibilities are HORZ and SPLIT + aom_cdf_prob bot_cdf[2]; + partition_gather_vert_alike(bot_cdf, partition_cdf, bsize); + static const int bot_inv_map[2] = { PARTITION_HORZ, PARTITION_SPLIT }; + av1_cost_tokens_from_cdf(tmp_partition_cost, bot_cdf, bot_inv_map); + } else 
if (has_rows) { + // At the right, the two possibilities are VERT and SPLIT + aom_cdf_prob rhs_cdf[2]; + partition_gather_horz_alike(rhs_cdf, partition_cdf, bsize); + static const int rhs_inv_map[2] = { PARTITION_VERT, PARTITION_SPLIT }; + av1_cost_tokens_from_cdf(tmp_partition_cost, rhs_cdf, rhs_inv_map); + } else { + // At the bottom right, we always split tmp_partition_cost[PARTITION_SPLIT] = 0; } partition_cost = tmp_partition_cost; } -#endif -#if CONFIG_VAR_TX #ifndef NDEBUG // Nothing should rely on the default value of this array (which is just // leftover from encoding the previous block. Setting it to magic number // when debugging. - memset(x->blk_skip[0], 234, sizeof(x->blk_skip[0])); + memset(x->blk_skip, 234, sizeof(x->blk_skip)); #endif // NDEBUG -#endif // CONFIG_VAR_TX assert(mi_size_wide[bsize] == mi_size_high[bsize]); av1_init_rd_stats(&this_rdc); - av1_init_rd_stats(&sum_rdc); av1_invalid_rd_stats(&best_rdc); best_rdc.rdcost = best_rd; @@ -3634,26 +2999,70 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, // Note: Further partitioning is NOT allowed when bsize == min_size already. const int partition_allowed = (bsize <= max_size && bsize > min_size); partition_none_allowed &= no_partition_allowed; - partition_horz_allowed &= partition_allowed || force_horz_split; - partition_vert_allowed &= partition_allowed || force_vert_split; + partition_horz_allowed &= partition_allowed || !has_rows; + partition_vert_allowed &= partition_allowed || !has_cols; do_square_split &= bsize > min_size; } if (cpi->sf.use_square_partition_only) { - partition_horz_allowed &= force_horz_split; - partition_vert_allowed &= force_vert_split; + partition_horz_allowed &= !has_rows; + partition_vert_allowed &= !has_cols; + } + + if (bsize > BLOCK_4X4 && x->use_cb_search_range && + cpi->sf.auto_min_max_partition_size == 0) { + int split_score = 0; + int none_score = 0; + const int score_valid = ml_prune_2pass_split_partition( + &pc_tree->pc_tree_stats, bsize, &split_score, &none_score); + if (score_valid) { + { + const int only_split_thresh = 300; + const int no_none_thresh = 250; + const int no_split_thresh = 0; + if (split_score > only_split_thresh) { + partition_none_allowed = 0; + partition_horz_allowed = 0; + partition_vert_allowed = 0; + } else if (split_score > no_none_thresh) { + partition_none_allowed = 0; + } + if (split_score < no_split_thresh) do_square_split = 0; + } + { + const int no_split_thresh = 120; + const int no_none_thresh = -120; + if (none_score > no_split_thresh && partition_none_allowed) + do_square_split = 0; + if (none_score < no_none_thresh) partition_none_allowed = 0; + } + } else { + if (pc_tree->cb_search_range == SPLIT_PLANE) { + partition_none_allowed = 0; + partition_horz_allowed = 0; + partition_vert_allowed = 0; + } + if (pc_tree->cb_search_range == SEARCH_SAME_PLANE) do_square_split = 0; + if (pc_tree->cb_search_range == NONE_PARTITION_PLANE) { + do_square_split = 0; + partition_horz_allowed = 0; + partition_vert_allowed = 0; + } + } + + // Fall back to default values in case all partition modes are rejected. 
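+ // (Without this guard the two-pass pruning above could leave a block + // with no legal partition at all; the reset below restores the same + // defaults that were computed before pruning.)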
+ if (partition_none_allowed == 0 && do_square_split == 0 && + partition_horz_allowed == 0 && partition_vert_allowed == 0) { + do_square_split = bsize_at_least_8x8; + partition_none_allowed = has_rows && has_cols; + partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8; + partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8; + } } -#if CONFIG_VAR_TX - xd->above_txfm_context = - cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2); - xd->left_txfm_context = xd->left_txfm_context_buffer + - ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2); -#endif -#if !CONFIG_PVQ - save_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - save_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif + xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); #if CONFIG_FP_MB_STATS if (cpi->use_fp_mb_stats) { @@ -3712,16 +3121,17 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, } #endif +BEGIN_PARTITION_SEARCH: + if (x->must_find_valid_partition) { + partition_none_allowed = has_rows && has_cols; + partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8; + partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8; + } // PARTITION_NONE if (partition_none_allowed) { rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, -#if CONFIG_SUPERTX - &this_rate_nocoef, -#endif -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_NONE, -#endif - bsize, ctx_none, best_rdc.rdcost); + PARTITION_NONE, bsize, ctx_none, best_rdc.rdcost); + if (none_rd) *none_rd = this_rdc.rdcost; if (this_rdc.rate != INT_MAX) { if (bsize_at_least_8x8) { const int pt_cost = partition_cost[PARTITION_NONE] < INT_MAX @@ -3729,9 +3139,6 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, : 0; this_rdc.rate += pt_cost; this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist); -#if CONFIG_SUPERTX - this_rate_nocoef += pt_cost; -#endif } if (this_rdc.rdcost < best_rdc.rdcost) { @@ -3739,16 +3146,12 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, const int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_dist_thr >> ((2 * (MAX_SB_SIZE_LOG2 - 2)) - - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize])); + (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize])); const int rate_breakout_thr = cpi->sf.partition_search_breakout_rate_thr * num_pels_log2_lookup[bsize]; best_rdc = this_rdc; -#if CONFIG_SUPERTX - best_rate_nocoef = this_rate_nocoef; - assert(best_rate_nocoef >= 0); -#endif if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE; // If all y, u, v transform blocks in this partition are skippable, and @@ -3756,7 +3159,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, // terminated for current branch of the partition search tree. // The dist & rate thresholds are set to 0 at speed 0 to disable the // early termination at that speed. 
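// In effect, a cheap and fully skippable PARTITION_NONE result prunes // the HORZ, VERT and SPLIT searches for this block; with both thresholds // at 0 the breakout test can never pass, so speed 0 always runs the full // search.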
- if (!x->e_mbd.lossless[xd->mi[0]->mbmi.segment_id] && + if (!x->e_mbd.lossless[xd->mi[0]->segment_id] && (ctx_none->skippable && best_rdc.dist < dist_breakout_thr && best_rdc.rate < rate_breakout_thr)) { do_square_split = 0; @@ -3810,202 +3213,88 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, #endif } } -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif -#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG - if (!x->skip_chroma_rd) { - cfl_clear_sub8x8_val(xd->cfl); - } -#endif // CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG + + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } // store estimated motion vector if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx_none); -#if CONFIG_SUPERTX - int64_t temp_best_rdcost = INT64_MAX; -#else - int64_t temp_best_rdcost = best_rdc.rdcost; -#endif +#if CONFIG_DIST_8X8 + uint8_t *src_plane_8x8[MAX_MB_PLANE], *dst_plane_8x8[MAX_MB_PLANE]; + + if (x->using_dist_8x8 && bsize == BLOCK_8X8) { + for (int i = 0; i < num_planes; i++) { + src_plane_8x8[i] = x->plane[i].src.buf; + dst_plane_8x8[i] = xd->plane[i].dst.buf; + } + } +#endif // CONFIG_DIST_8X8 // PARTITION_SPLIT - // TODO(jingning): use the motion vectors given by the above search as - // the starting point of motion search in the following partition type check. if (do_square_split) { + av1_init_rd_stats(&sum_rdc); int reached_last_index = 0; - subsize = get_subsize(bsize, PARTITION_SPLIT); - if (bsize == BLOCK_8X8 && !unify_bsize) { - if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed) - pc_tree->leaf_split[0]->pred_interp_filter = - av1_extract_interp_filter(ctx_none->mic.mbmi.interp_filters, 0); - - rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, -#if CONFIG_SUPERTX - &sum_rate_nocoef, -#endif -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_SPLIT, -#endif - subsize, pc_tree->leaf_split[0], temp_best_rdcost); - if (sum_rdc.rate == INT_MAX) { - sum_rdc.rdcost = INT64_MAX; -#if CONFIG_SUPERTX - sum_rate_nocoef = INT_MAX; -#endif - } -#if CONFIG_SUPERTX - if (supertx_allowed && sum_rdc.rdcost < INT64_MAX) { - TX_SIZE supertx_size = max_txsize_lookup[bsize]; - const PARTITION_TYPE best_partition = pc_tree->partitioning; - - pc_tree->partitioning = PARTITION_SPLIT; - - sum_rdc.rate += av1_cost_bit( - cm->fc->supertx_prob[partition_supertx_context_lookup - [PARTITION_SPLIT]][supertx_size], - 0); - sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); + subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + int idx; - if (is_inter_mode(pc_tree->leaf_split[0]->mic.mbmi.mode)) { - TX_TYPE best_tx = DCT_DCT; - RD_STATS tmp_rdc; - av1_init_rd_stats(&tmp_rdc); - tmp_rdc.rate = sum_rate_nocoef; - - restore_context(x, &x_ctx, mi_row, mi_col, bsize); - - rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize, - &tmp_rdc.rate, &tmp_rdc.dist, &best_tx, pc_tree); - - tmp_rdc.rate += av1_cost_bit( - cm->fc->supertx_prob[partition_supertx_context_lookup - [PARTITION_SPLIT]][supertx_size], - 1); - tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist); - if (tmp_rdc.rdcost < sum_rdc.rdcost) { - sum_rdc = tmp_rdc; - update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx, - supertx_size, pc_tree); - } - } + for (idx = 0; idx < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++idx) { + const int x_idx = (idx & 1) * mi_step; + const int y_idx = (idx >> 1) * mi_step; - pc_tree->partitioning = best_partition; - } 
-#endif // CONFIG_SUPERTX - reached_last_index = 1; - } else { - int idx; - for (idx = 0; idx < 4 && sum_rdc.rdcost < temp_best_rdcost; ++idx) { - const int x_idx = (idx & 1) * mi_step; - const int y_idx = (idx >> 1) * mi_step; - - if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) - continue; - - if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none); - - pc_tree->split[idx]->index = idx; - rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, - mi_col + x_idx, subsize, &this_rdc, -#if CONFIG_SUPERTX - &this_rate_nocoef, -#endif - temp_best_rdcost - sum_rdc.rdcost, - pc_tree->split[idx]); - - if (this_rdc.rate == INT_MAX) { - sum_rdc.rdcost = INT64_MAX; -#if CONFIG_SUPERTX - sum_rate_nocoef = INT_MAX; -#endif // CONFIG_SUPERTX - break; - } else { - sum_rdc.rate += this_rdc.rate; - sum_rdc.dist += this_rdc.dist; - sum_rdc.rdcost += this_rdc.rdcost; -#if CONFIG_SUPERTX - sum_rate_nocoef += this_rate_nocoef; -#endif // CONFIG_SUPERTX - } - } - reached_last_index = (idx == 4); - -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 - if (x->using_dist_8x8 && reached_last_index && - sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) { - const int src_stride = x->plane[0].src.stride; - int64_t dist_8x8; - dist_8x8 = - dist_8x8_yuv(cpi, x, x->plane[0].src.buf - 4 * src_stride - 4); - sum_rdc.dist = dist_8x8; - sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); - } -#endif // CONFIG_DIST_8X8 && CONFIG_CB4X4 + if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) + continue; -#if CONFIG_SUPERTX - if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && reached_last_index) { - TX_SIZE supertx_size = max_txsize_lookup[bsize]; - const PARTITION_TYPE best_partition = pc_tree->partitioning; + if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none); - pc_tree->partitioning = PARTITION_SPLIT; + pc_tree->split[idx]->index = idx; + int64_t *p_split_rd = &split_rd[idx]; + rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, + subsize, &this_rdc, best_rdc.rdcost - sum_rdc.rdcost, + pc_tree->split[idx], p_split_rd); - sum_rdc.rate += av1_cost_bit( - cm->fc->supertx_prob[partition_supertx_context_lookup - [PARTITION_SPLIT]][supertx_size], - 0); - sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); + if (this_rdc.rate == INT_MAX) { + sum_rdc.rdcost = INT64_MAX; + break; + } else { + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + sum_rdc.rdcost += this_rdc.rdcost; - if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) { - TX_TYPE best_tx = DCT_DCT; - RD_STATS tmp_rdc; - av1_init_rd_stats(&tmp_rdc); - tmp_rdc.rate = sum_rate_nocoef; - - restore_context(x, &x_ctx, mi_row, mi_col, bsize); - - rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize, - &tmp_rdc.rate, &tmp_rdc.dist, &best_tx, pc_tree); - - tmp_rdc.rate += av1_cost_bit( - cm->fc->supertx_prob[partition_supertx_context_lookup - [PARTITION_SPLIT]][supertx_size], - 1); - tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist); - if (tmp_rdc.rdcost < sum_rdc.rdcost) { - sum_rdc = tmp_rdc; - update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx, - supertx_size, pc_tree); + if (idx <= 1 && (bsize <= BLOCK_8X8 || + pc_tree->split[idx]->partitioning == PARTITION_NONE)) { + MB_MODE_INFO *const mbmi = &(pc_tree->split[idx]->none.mic); + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + // Neither palette mode nor cfl predicted + if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) { + if (mbmi->uv_mode != UV_CFL_PRED) 
split_ctx_is_ready[idx] = 1; } } - - pc_tree->partitioning = best_partition; } -#endif // CONFIG_SUPERTX } + reached_last_index = (idx == 4); -#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG - if (!reached_last_index && sum_rdc.rdcost >= best_rdc.rdcost) - cfl_clear_sub8x8_val(xd->cfl); -#endif // CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG +#if CONFIG_DIST_8X8 + if (x->using_dist_8x8 && reached_last_index && + sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) { + int64_t dist_8x8; + dist_8x8 = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8); +#ifdef DEBUG_DIST_8X8 + // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled + if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 && 0 /*!CONFIG_CFL*/) + assert(sum_rdc.dist == dist_8x8); +#endif // DEBUG_DIST_8X8 + sum_rdc.dist = dist_8x8; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); + } +#endif // CONFIG_DIST_8X8 if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) { sum_rdc.rate += partition_cost[PARTITION_SPLIT]; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); -#if CONFIG_SUPERTX - sum_rate_nocoef += partition_cost[PARTITION_SPLIT]; -#endif // CONFIG_SUPERTX if (sum_rdc.rdcost < best_rdc.rdcost) { best_rdc = sum_rdc; -#if CONFIG_SUPERTX - best_rate_nocoef = sum_rate_nocoef; - assert(best_rate_nocoef >= 0); -#else - temp_best_rdcost = best_rdc.rdcost; -#endif // CONFIG_SUPERTX pc_tree->partitioning = PARTITION_SPLIT; } } else if (cpi->sf.less_rectangular_check) { @@ -4013,473 +3302,362 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, // gives better rd cost do_rectangular_split &= !partition_none_allowed; } -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif + + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } // if (do_split) // PARTITION_HORZ if (partition_horz_allowed && - (do_rectangular_split || av1_active_h_edge(cpi, mi_row, mi_step))) { - subsize = get_subsize(bsize, PARTITION_HORZ); + (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) { + av1_init_rd_stats(&sum_rdc); + subsize = get_partition_subsize(bsize, PARTITION_HORZ); if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->horizontal[0].pred_interp_filter = - av1_extract_interp_filter(ctx_none->mic.mbmi.interp_filters, 0); + av1_extract_interp_filter(ctx_none->mic.interp_filters, 0); rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, -#if CONFIG_SUPERTX - &sum_rate_nocoef, -#endif // CONFIG_SUPERTX -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_HORZ, -#endif - subsize, &pc_tree->horizontal[0], best_rdc.rdcost); + PARTITION_HORZ, subsize, &pc_tree->horizontal[0], + best_rdc.rdcost); + horz_rd[0] = sum_rdc.rdcost; -#if CONFIG_SUPERTX - abort_flag = - (sum_rdc.rdcost >= best_rd && (bsize > BLOCK_8X8 || unify_bsize)) || - (sum_rdc.rate == INT_MAX && bsize == BLOCK_8X8); -#endif - if (sum_rdc.rdcost < temp_best_rdcost && !force_horz_split && - (bsize > BLOCK_8X8 || unify_bsize)) { + if (sum_rdc.rdcost < best_rdc.rdcost && has_rows) { PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0]; - update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1); - encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize, - NULL); + MB_MODE_INFO *const mbmi = &(pc_tree->horizontal[0].mic); + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + // 
Neither palette mode nor cfl predicted + if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) { + if (mbmi->uv_mode != UV_CFL_PRED) horz_ctx_is_ready = 1; + } + update_state(cpi, tile_data, td, ctx_h, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, + subsize, NULL); if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_h); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->horizontal[1].pred_interp_filter = - av1_extract_interp_filter(ctx_h->mic.mbmi.interp_filters, 0); + av1_extract_interp_filter(ctx_h->mic.interp_filters, 0); -#if CONFIG_SUPERTX - rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc, - &this_rate_nocoef, -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_HORZ, -#endif - subsize, &pc_tree->horizontal[1], INT64_MAX); -#else rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc, -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_HORZ, -#endif - subsize, &pc_tree->horizontal[1], + PARTITION_HORZ, subsize, &pc_tree->horizontal[1], best_rdc.rdcost - sum_rdc.rdcost); -#endif // CONFIG_SUPERTX + horz_rd[1] = this_rdc.rdcost; -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 if (x->using_dist_8x8 && this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) { - update_state(cpi, td, &pc_tree->horizontal[1], mi_row + mi_step, mi_col, - subsize, DRY_RUN_NORMAL); - encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row + mi_step, mi_col, - subsize, NULL); + update_state(cpi, tile_data, td, &pc_tree->horizontal[1], + mi_row + mi_step, mi_col, subsize, DRY_RUN_NORMAL); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, + mi_row + mi_step, mi_col, subsize, NULL); } -#endif // CONFIG_DIST_8X8 && CONFIG_CB4X4 +#endif // CONFIG_DIST_8X8 if (this_rdc.rate == INT_MAX) { sum_rdc.rdcost = INT64_MAX; -#if CONFIG_SUPERTX - sum_rate_nocoef = INT_MAX; -#endif // CONFIG_SUPERTX } else { sum_rdc.rate += this_rdc.rate; sum_rdc.dist += this_rdc.dist; sum_rdc.rdcost += this_rdc.rdcost; -#if CONFIG_SUPERTX - sum_rate_nocoef += this_rate_nocoef; -#endif // CONFIG_SUPERTX } -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 if (x->using_dist_8x8 && sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) { - const int src_stride = x->plane[0].src.stride; int64_t dist_8x8; - dist_8x8 = dist_8x8_yuv(cpi, x, x->plane[0].src.buf - 4 * src_stride); + dist_8x8 = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8); +#ifdef DEBUG_DIST_8X8 + // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled + if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 && 0 /*!CONFIG_CFL*/) + assert(sum_rdc.dist == dist_8x8); +#endif // DEBUG_DIST_8X8 sum_rdc.dist = dist_8x8; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); } -#endif // CONFIG_DIST_8X8 && CONFIG_CB4X4 - } - -#if CONFIG_SUPERTX - if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && !abort_flag) { - TX_SIZE supertx_size = max_txsize_lookup[bsize]; - const PARTITION_TYPE best_partition = pc_tree->partitioning; - - pc_tree->partitioning = PARTITION_HORZ; - - sum_rdc.rate += av1_cost_bit( - cm->fc->supertx_prob[partition_supertx_context_lookup[PARTITION_HORZ]] - [supertx_size], - 0); - sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); - - if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) { - TX_TYPE best_tx = DCT_DCT; - RD_STATS tmp_rdc; - av1_init_rd_stats(&tmp_rdc); - tmp_rdc.rate = sum_rate_nocoef; - - restore_context(x, &x_ctx, mi_row, mi_col, bsize); - - 
rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize, &tmp_rdc.rate, - &tmp_rdc.dist, &best_tx, pc_tree); - - tmp_rdc.rate += av1_cost_bit( - cm->fc - ->supertx_prob[partition_supertx_context_lookup[PARTITION_HORZ]] - [supertx_size], - 1); - tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist); - if (tmp_rdc.rdcost < sum_rdc.rdcost) { - sum_rdc = tmp_rdc; - update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx, - supertx_size, pc_tree); - } - } - - pc_tree->partitioning = best_partition; +#endif // CONFIG_DIST_8X8 } -#endif // CONFIG_SUPERTX -#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG - cfl_clear_sub8x8_val(xd->cfl); -#endif // CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG if (sum_rdc.rdcost < best_rdc.rdcost) { sum_rdc.rate += partition_cost[PARTITION_HORZ]; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); -#if CONFIG_SUPERTX - sum_rate_nocoef += partition_cost[PARTITION_HORZ]; -#endif // CONFIG_SUPERTX if (sum_rdc.rdcost < best_rdc.rdcost) { best_rdc = sum_rdc; -#if CONFIG_SUPERTX - best_rate_nocoef = sum_rate_nocoef; - assert(best_rate_nocoef >= 0); -#endif // CONFIG_SUPERTX pc_tree->partitioning = PARTITION_HORZ; } } -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif + + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } // PARTITION_VERT if (partition_vert_allowed && - (do_rectangular_split || av1_active_v_edge(cpi, mi_col, mi_step))) { - subsize = get_subsize(bsize, PARTITION_VERT); + (do_rectangular_split || active_v_edge(cpi, mi_col, mi_step))) { + av1_init_rd_stats(&sum_rdc); + subsize = get_partition_subsize(bsize, PARTITION_VERT); if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->vertical[0].pred_interp_filter = - av1_extract_interp_filter(ctx_none->mic.mbmi.interp_filters, 0); + av1_extract_interp_filter(ctx_none->mic.interp_filters, 0); rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, -#if CONFIG_SUPERTX - &sum_rate_nocoef, -#endif // CONFIG_SUPERTX -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_VERT, -#endif - subsize, &pc_tree->vertical[0], best_rdc.rdcost); -#if CONFIG_SUPERTX - abort_flag = - (sum_rdc.rdcost >= best_rd && (bsize > BLOCK_8X8 || unify_bsize)) || - (sum_rdc.rate == INT_MAX && bsize == BLOCK_8X8); - const int64_t vert_max_rdcost = INT64_MAX; -#else + PARTITION_VERT, subsize, &pc_tree->vertical[0], + best_rdc.rdcost); + vert_rd[0] = sum_rdc.rdcost; const int64_t vert_max_rdcost = best_rdc.rdcost; -#endif // CONFIG_SUPERTX - if (sum_rdc.rdcost < vert_max_rdcost && !force_vert_split && - (bsize > BLOCK_8X8 || unify_bsize)) { - update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 1); - encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize, - NULL); + if (sum_rdc.rdcost < vert_max_rdcost && has_cols) { + MB_MODE_INFO *const mbmi = &(pc_tree->vertical[0].mic); + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + // Neither palette mode nor cfl predicted + if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) { + if (mbmi->uv_mode != UV_CFL_PRED) vert_ctx_is_ready = 1; + } + update_state(cpi, tile_data, td, &pc_tree->vertical[0], mi_row, mi_col, + subsize, 1); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, + subsize, NULL); if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none); if 
(cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->vertical[1].pred_interp_filter = - av1_extract_interp_filter(ctx_none->mic.mbmi.interp_filters, 0); + av1_extract_interp_filter(ctx_none->mic.interp_filters, 0); -#if CONFIG_SUPERTX - rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc, - &this_rate_nocoef, -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_VERT, -#endif - subsize, &pc_tree->vertical[1], - INT64_MAX - sum_rdc.rdcost); -#else rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc, -#if CONFIG_EXT_PARTITION_TYPES - PARTITION_VERT, -#endif - subsize, &pc_tree->vertical[1], + PARTITION_VERT, subsize, &pc_tree->vertical[1], best_rdc.rdcost - sum_rdc.rdcost); -#endif // CONFIG_SUPERTX + vert_rd[1] = this_rdc.rdcost; -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 if (x->using_dist_8x8 && this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) { - update_state(cpi, td, &pc_tree->vertical[1], mi_row, mi_col + mi_step, - subsize, DRY_RUN_NORMAL); - encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col + mi_step, - subsize, NULL); + update_state(cpi, tile_data, td, &pc_tree->vertical[1], mi_row, + mi_col + mi_step, subsize, DRY_RUN_NORMAL); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, + mi_col + mi_step, subsize, NULL); } -#endif // CONFIG_DIST_8X8 && CONFIG_CB4X4 +#endif // CONFIG_DIST_8X8 if (this_rdc.rate == INT_MAX) { sum_rdc.rdcost = INT64_MAX; -#if CONFIG_SUPERTX - sum_rate_nocoef = INT_MAX; -#endif // CONFIG_SUPERTX } else { sum_rdc.rate += this_rdc.rate; sum_rdc.dist += this_rdc.dist; sum_rdc.rdcost += this_rdc.rdcost; -#if CONFIG_SUPERTX - sum_rate_nocoef += this_rate_nocoef; -#endif // CONFIG_SUPERTX } -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 if (x->using_dist_8x8 && sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) { int64_t dist_8x8; - dist_8x8 = dist_8x8_yuv(cpi, x, x->plane[0].src.buf - 4); + dist_8x8 = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8); +#ifdef DEBUG_DIST_8X8 + // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled + if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 && + 0 /* !CONFIG_CFL */) + assert(sum_rdc.dist == dist_8x8); +#endif // DEBUG_DIST_8X8 sum_rdc.dist = dist_8x8; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); } -#endif // CONFIG_DIST_8X8 && CONFIG_CB4X4 - } -#if CONFIG_SUPERTX - if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && !abort_flag) { - TX_SIZE supertx_size = max_txsize_lookup[bsize]; - const PARTITION_TYPE best_partition = pc_tree->partitioning; - - pc_tree->partitioning = PARTITION_VERT; - - sum_rdc.rate += av1_cost_bit( - cm->fc->supertx_prob[partition_supertx_context_lookup[PARTITION_VERT]] - [supertx_size], - 0); - sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); - - if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) { - TX_TYPE best_tx = DCT_DCT; - RD_STATS tmp_rdc; - av1_init_rd_stats(&tmp_rdc); - tmp_rdc.rate = sum_rate_nocoef; - - restore_context(x, &x_ctx, mi_row, mi_col, bsize); - - rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize, &tmp_rdc.rate, - &tmp_rdc.dist, &best_tx, pc_tree); - - tmp_rdc.rate += av1_cost_bit( - cm->fc - ->supertx_prob[partition_supertx_context_lookup[PARTITION_VERT]] - [supertx_size], - 1); - tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist); - if (tmp_rdc.rdcost < sum_rdc.rdcost) { - sum_rdc = tmp_rdc; - update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx, - supertx_size, 
pc_tree); - } - } - - pc_tree->partitioning = best_partition; +#endif // CONFIG_DIST_8X8 } -#endif // CONFIG_SUPERTX - -#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG - cfl_clear_sub8x8_val(xd->cfl); -#endif // CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG if (sum_rdc.rdcost < best_rdc.rdcost) { sum_rdc.rate += partition_cost[PARTITION_VERT]; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); -#if CONFIG_SUPERTX - sum_rate_nocoef += partition_cost[PARTITION_VERT]; -#endif // CONFIG_SUPERTX if (sum_rdc.rdcost < best_rdc.rdcost) { best_rdc = sum_rdc; -#if CONFIG_SUPERTX - best_rate_nocoef = sum_rate_nocoef; - assert(best_rate_nocoef >= 0); -#endif // CONFIG_SUPERTX pc_tree->partitioning = PARTITION_VERT; } } -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif + + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } -#if CONFIG_EXT_PARTITION_TYPES const int ext_partition_allowed = do_rectangular_split && bsize > BLOCK_8X8 && partition_none_allowed; -#if CONFIG_EXT_PARTITION && CONFIG_EXT_PARTITION_TYPES_AB - // Don't allow A/B partitions on 128x128 blocks for now (support for - // 128x32 and 32x128 blocks doesn't yet exist). - const int ab_partition_allowed = - ext_partition_allowed && bsize < BLOCK_128X128; -#else - const int ab_partition_allowed = ext_partition_allowed; -#endif + // partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or + // PARTITION_VERT_4 for this block. This is almost the same as + // ext_partition_allowed, except that we don't allow 128x32 or 32x128 blocks, + // so we require that bsize is not BLOCK_128X128. + const int partition4_allowed = + ext_partition_allowed && bsize != BLOCK_128X128; + + // The standard AB partitions are allowed whenever ext-partition-types are + // allowed + int horzab_partition_allowed = ext_partition_allowed; + int vertab_partition_allowed = ext_partition_allowed; + + if (cpi->sf.prune_ext_partition_types_search_level) { + if (cpi->sf.prune_ext_partition_types_search_level == 1) { + horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ || + (pc_tree->partitioning == PARTITION_NONE && + x->source_variance < 32) || + pc_tree->partitioning == PARTITION_SPLIT); + vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT || + (pc_tree->partitioning == PARTITION_NONE && + x->source_variance < 32) || + pc_tree->partitioning == PARTITION_SPLIT); + } else { + horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ || + pc_tree->partitioning == PARTITION_SPLIT); + vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT || + pc_tree->partitioning == PARTITION_SPLIT); + } + horz_rd[0] = (horz_rd[0] < INT64_MAX ? horz_rd[0] : 0); + horz_rd[1] = (horz_rd[1] < INT64_MAX ? horz_rd[1] : 0); + vert_rd[0] = (vert_rd[0] < INT64_MAX ? vert_rd[0] : 0); + vert_rd[1] = (vert_rd[1] < INT64_MAX ? vert_rd[1] : 0); + split_rd[0] = (split_rd[0] < INT64_MAX ? split_rd[0] : 0); + split_rd[1] = (split_rd[1] < INT64_MAX ? split_rd[1] : 0); + split_rd[2] = (split_rd[2] < INT64_MAX ? split_rd[2] : 0); + split_rd[3] = (split_rd[3] < INT64_MAX ? 
split_rd[3] : 0); + } + int horza_partition_allowed = horzab_partition_allowed; + int horzb_partition_allowed = horzab_partition_allowed; + if (cpi->sf.prune_ext_partition_types_search_level) { + const int64_t horz_a_rd = horz_rd[1] + split_rd[0] + split_rd[1]; + const int64_t horz_b_rd = horz_rd[0] + split_rd[2] + split_rd[3]; + switch (cpi->sf.prune_ext_partition_types_search_level) { + case 1: + horza_partition_allowed &= (horz_a_rd / 16 * 14 < best_rdc.rdcost); + horzb_partition_allowed &= (horz_b_rd / 16 * 14 < best_rdc.rdcost); + break; + case 2: + default: + horza_partition_allowed &= (horz_a_rd / 16 * 15 < best_rdc.rdcost); + horzb_partition_allowed &= (horz_b_rd / 16 * 15 < best_rdc.rdcost); + break; + } + } + + int verta_partition_allowed = vertab_partition_allowed; + int vertb_partition_allowed = vertab_partition_allowed; + if (cpi->sf.prune_ext_partition_types_search_level) { + const int64_t vert_a_rd = vert_rd[1] + split_rd[0] + split_rd[2]; + const int64_t vert_b_rd = vert_rd[0] + split_rd[1] + split_rd[3]; + switch (cpi->sf.prune_ext_partition_types_search_level) { + case 1: + verta_partition_allowed &= (vert_a_rd / 16 * 14 < best_rdc.rdcost); + vertb_partition_allowed &= (vert_b_rd / 16 * 14 < best_rdc.rdcost); + break; + case 2: + default: + verta_partition_allowed &= (vert_a_rd / 16 * 15 < best_rdc.rdcost); + vertb_partition_allowed &= (vert_b_rd / 16 * 15 < best_rdc.rdcost); + break; + } + } + + if (cpi->sf.ml_prune_ab_partition && ext_partition_allowed && + partition_horz_allowed && partition_vert_allowed) { + ml_prune_ab_partition(bsize, pc_tree->partitioning, + get_unsigned_bits(x->source_variance), + best_rdc.rdcost, horz_rd, vert_rd, split_rd, + &horza_partition_allowed, &horzb_partition_allowed, + &verta_partition_allowed, &vertb_partition_allowed); + } // PARTITION_HORZ_A - if (partition_horz_allowed && ab_partition_allowed) { -#if CONFIG_EXT_PARTITION_TYPES_AB - rd_test_partition3( - cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontala, - ctx_none, mi_row, mi_col, bsize, PARTITION_HORZ_A, -#if CONFIG_SUPERTX - best_rd, &best_rate_nocoef, &x_ctx, -#endif - mi_row, mi_col, get_subsize(bsize, PARTITION_HORZ_4), - mi_row + mi_step / 2, mi_col, get_subsize(bsize, PARTITION_HORZ_4), - mi_row + mi_step, mi_col, get_subsize(bsize, PARTITION_HORZ)); -#else - subsize = get_subsize(bsize, PARTITION_HORZ_A); + if (partition_horz_allowed && horza_partition_allowed) { + subsize = get_partition_subsize(bsize, PARTITION_HORZ_A); + pc_tree->horizontala[0].rd_mode_is_ready = 0; + pc_tree->horizontala[1].rd_mode_is_ready = 0; + pc_tree->horizontala[2].rd_mode_is_ready = 0; + if (split_ctx_is_ready[0]) { + av1_copy_tree_context(&pc_tree->horizontala[0], &pc_tree->split[0]->none); + pc_tree->horizontala[0].mic.partition = PARTITION_HORZ_A; + pc_tree->horizontala[0].rd_mode_is_ready = 1; + if (split_ctx_is_ready[1]) { + av1_copy_tree_context(&pc_tree->horizontala[1], + &pc_tree->split[1]->none); + pc_tree->horizontala[1].mic.partition = PARTITION_HORZ_A; + pc_tree->horizontala[1].rd_mode_is_ready = 1; + } + } rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontala, ctx_none, mi_row, mi_col, bsize, - PARTITION_HORZ_A, -#if CONFIG_SUPERTX - best_rd, &best_rate_nocoef, &x_ctx, -#endif - mi_row, mi_col, bsize2, mi_row, mi_col + mi_step, bsize2, - mi_row + mi_step, mi_col, subsize); -#endif -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif // 
!CONFIG_PVQ + PARTITION_HORZ_A, mi_row, mi_col, bsize2, mi_row, + mi_col + mi_step, bsize2, mi_row + mi_step, mi_col, + subsize); + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } // PARTITION_HORZ_B - if (partition_horz_allowed && ab_partition_allowed) { -#if CONFIG_EXT_PARTITION_TYPES_AB - rd_test_partition3( - cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontalb, - ctx_none, mi_row, mi_col, bsize, PARTITION_HORZ_B, -#if CONFIG_SUPERTX - best_rd, &best_rate_nocoef, &x_ctx, -#endif - mi_row, mi_col, get_subsize(bsize, PARTITION_HORZ), mi_row + mi_step, - mi_col, get_subsize(bsize, PARTITION_HORZ_4), mi_row + 3 * mi_step / 2, - mi_col, get_subsize(bsize, PARTITION_HORZ_4)); -#else - subsize = get_subsize(bsize, PARTITION_HORZ_B); + if (partition_horz_allowed && horzb_partition_allowed) { + subsize = get_partition_subsize(bsize, PARTITION_HORZ_B); + pc_tree->horizontalb[0].rd_mode_is_ready = 0; + pc_tree->horizontalb[1].rd_mode_is_ready = 0; + pc_tree->horizontalb[2].rd_mode_is_ready = 0; + if (horz_ctx_is_ready) { + av1_copy_tree_context(&pc_tree->horizontalb[0], &pc_tree->horizontal[0]); + pc_tree->horizontalb[0].mic.partition = PARTITION_HORZ_B; + pc_tree->horizontalb[0].rd_mode_is_ready = 1; + } rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontalb, ctx_none, mi_row, mi_col, bsize, - PARTITION_HORZ_B, -#if CONFIG_SUPERTX - best_rd, &best_rate_nocoef, &x_ctx, -#endif - mi_row, mi_col, subsize, mi_row + mi_step, mi_col, - bsize2, mi_row + mi_step, mi_col + mi_step, bsize2); -#endif -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif // !CONFIG_PVQ + PARTITION_HORZ_B, mi_row, mi_col, subsize, + mi_row + mi_step, mi_col, bsize2, mi_row + mi_step, + mi_col + mi_step, bsize2); + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } + // PARTITION_VERT_A - if (partition_vert_allowed && ab_partition_allowed) { -#if CONFIG_EXT_PARTITION_TYPES_AB - rd_test_partition3( - cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticala, - ctx_none, mi_row, mi_col, bsize, PARTITION_VERT_A, -#if CONFIG_SUPERTX - best_rd, &best_rate_nocoef, &x_ctx, -#endif - mi_row, mi_col, get_subsize(bsize, PARTITION_VERT_4), mi_row, - mi_col + mi_step / 2, get_subsize(bsize, PARTITION_VERT_4), mi_row, - mi_col + mi_step, get_subsize(bsize, PARTITION_VERT)); -#else - subsize = get_subsize(bsize, PARTITION_VERT_A); + if (partition_vert_allowed && verta_partition_allowed) { + subsize = get_partition_subsize(bsize, PARTITION_VERT_A); + pc_tree->verticala[0].rd_mode_is_ready = 0; + pc_tree->verticala[1].rd_mode_is_ready = 0; + pc_tree->verticala[2].rd_mode_is_ready = 0; + if (split_ctx_is_ready[0]) { + av1_copy_tree_context(&pc_tree->verticala[0], &pc_tree->split[0]->none); + pc_tree->verticala[0].mic.partition = PARTITION_VERT_A; + pc_tree->verticala[0].rd_mode_is_ready = 1; + } rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticala, ctx_none, mi_row, mi_col, bsize, - PARTITION_VERT_A, -#if CONFIG_SUPERTX - best_rd, &best_rate_nocoef, &x_ctx, -#endif - mi_row, mi_col, bsize2, mi_row + mi_step, mi_col, bsize2, - mi_row, mi_col + mi_step, subsize); -#endif -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif // !CONFIG_PVQ + PARTITION_VERT_A, mi_row, mi_col, bsize2, + mi_row + mi_step, mi_col, bsize2, mi_row, + mi_col + mi_step, 
subsize); + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } // PARTITION_VERT_B - if (partition_vert_allowed && ab_partition_allowed) { -#if CONFIG_EXT_PARTITION_TYPES_AB - rd_test_partition3( - cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticalb, - ctx_none, mi_row, mi_col, bsize, PARTITION_VERT_B, -#if CONFIG_SUPERTX - best_rd, &best_rate_nocoef, &x_ctx, -#endif - mi_row, mi_col, get_subsize(bsize, PARTITION_VERT), mi_row, - mi_col + mi_step, get_subsize(bsize, PARTITION_VERT_4), mi_row, - mi_col + 3 * mi_step / 2, get_subsize(bsize, PARTITION_VERT_4)); -#else - subsize = get_subsize(bsize, PARTITION_VERT_B); + if (partition_vert_allowed && vertb_partition_allowed) { + subsize = get_partition_subsize(bsize, PARTITION_VERT_B); + pc_tree->verticalb[0].rd_mode_is_ready = 0; + pc_tree->verticalb[1].rd_mode_is_ready = 0; + pc_tree->verticalb[2].rd_mode_is_ready = 0; + if (vert_ctx_is_ready) { + av1_copy_tree_context(&pc_tree->verticalb[0], &pc_tree->vertical[0]); + pc_tree->verticalb[0].mic.partition = PARTITION_VERT_B; + pc_tree->verticalb[0].rd_mode_is_ready = 1; + } rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticalb, ctx_none, mi_row, mi_col, bsize, - PARTITION_VERT_B, -#if CONFIG_SUPERTX - best_rd, &best_rate_nocoef, &x_ctx, -#endif - mi_row, mi_col, subsize, mi_row, mi_col + mi_step, - bsize2, mi_row + mi_step, mi_col + mi_step, bsize2); -#endif -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif // !CONFIG_PVQ + PARTITION_VERT_B, mi_row, mi_col, subsize, mi_row, + mi_col + mi_step, bsize2, mi_row + mi_step, + mi_col + mi_step, bsize2); + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } -#if CONFIG_EXT_PARTITION - const int can_partition_4 = (bsize == BLOCK_128X128 || bsize == BLOCK_64X64 || - bsize == BLOCK_32X32 || bsize == BLOCK_16X16); -#else - const int can_partition_4 = - (bsize == BLOCK_64X64 || bsize == BLOCK_32X32 || bsize == BLOCK_16X16); -#endif // CONFIG_EXT_PARTITION - // PARTITION_HORZ_4 - // TODO(david.barker): For this and PARTITION_VERT_4, - // * Add support for BLOCK_16X16 once we support 2x8 and 8x2 blocks for the - // chroma plane - // * Add support for supertx - if (can_partition_4 && partition_horz_allowed && !force_horz_split && - (do_rectangular_split || av1_active_h_edge(cpi, mi_row, mi_step))) { + int partition_horz4_allowed = partition4_allowed && partition_horz_allowed; + if (cpi->sf.prune_ext_partition_types_search_level == 2) { + partition_horz4_allowed &= (pc_tree->partitioning == PARTITION_HORZ || + pc_tree->partitioning == PARTITION_HORZ_A || + pc_tree->partitioning == PARTITION_HORZ_B || + pc_tree->partitioning == PARTITION_SPLIT || + pc_tree->partitioning == PARTITION_NONE); + } + if (partition_horz4_allowed && has_rows && + (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) { + av1_init_rd_stats(&sum_rdc); const int quarter_step = mi_size_high[bsize] / 4; PICK_MODE_CONTEXT *ctx_prev = ctx_none; - subsize = get_subsize(bsize, PARTITION_HORZ_4); + subsize = get_partition_subsize(bsize, PARTITION_HORZ_4); for (int i = 0; i < 4; ++i) { int this_mi_row = mi_row + i * quarter_step; @@ -4488,6 +3666,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx_this = &pc_tree->horizontal4[i]; + ctx_this->rd_mode_is_ready = 0; if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 0), (i == 3), this_mi_row, mi_col, subsize, &best_rdc, 
&sum_rdc, &this_rdc, PARTITION_HORZ_4, ctx_prev, ctx_this)) @@ -4504,19 +3683,25 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, pc_tree->partitioning = PARTITION_HORZ_4; } } -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } + // PARTITION_VERT_4 - if (can_partition_4 && partition_vert_allowed && !force_vert_split && - (do_rectangular_split || av1_active_v_edge(cpi, mi_row, mi_step))) { + int partition_vert4_allowed = partition4_allowed && partition_vert_allowed; + if (cpi->sf.prune_ext_partition_types_search_level == 2) { + partition_vert4_allowed &= (pc_tree->partitioning == PARTITION_VERT || + pc_tree->partitioning == PARTITION_VERT_A || + pc_tree->partitioning == PARTITION_VERT_B || + pc_tree->partitioning == PARTITION_SPLIT || + pc_tree->partitioning == PARTITION_NONE); + } + if (partition_vert4_allowed && has_cols && + (do_rectangular_split || active_v_edge(cpi, mi_row, mi_step))) { + av1_init_rd_stats(&sum_rdc); const int quarter_step = mi_size_wide[bsize] / 4; PICK_MODE_CONTEXT *ctx_prev = ctx_none; - subsize = get_subsize(bsize, PARTITION_VERT_4); + subsize = get_partition_subsize(bsize, PARTITION_VERT_4); for (int i = 0; i < 4; ++i) { int this_mi_col = mi_col + i * quarter_step; @@ -4525,6 +3710,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx_this = &pc_tree->vertical4[i]; + ctx_this->rd_mode_is_ready = 0; if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 0), (i == 3), mi_row, this_mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc, PARTITION_VERT_4, ctx_prev, ctx_this)) @@ -4541,13 +3727,15 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, pc_tree->partitioning = PARTITION_VERT_4; } } -#if !CONFIG_PVQ - restore_context(x, &x_ctx, mi_row, mi_col, bsize); -#else - restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); -#endif + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + } + + if (bsize == cm->seq_params.sb_size && best_rdc.rate == INT_MAX) { + // Did not find a valid partition, go back and search again, with less + // constraint on which partition types to search. 
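+ // (The retry jumps back to BEGIN_PARTITION_SEARCH with + // must_find_valid_partition set, which re-enables the NONE, HORZ and + // VERT candidates at the top of the search so the superblock can always + // be coded.)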
+ x->must_find_valid_partition = 1; + goto BEGIN_PARTITION_SEARCH; } -#endif // CONFIG_EXT_PARTITION_TYPES // TODO(jbb): This code added so that we avoid static analysis // warning related to the fact that best_rd isn't used after this @@ -4556,44 +3744,27 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, (void)best_rd; *rd_cost = best_rdc; -#if CONFIG_SUPERTX - *rate_nocoef = best_rate_nocoef; -#endif // CONFIG_SUPERTX - if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && pc_tree->index != 3) { - if (bsize == cm->sb_size) { -#if CONFIG_MOTION_VAR && NC_MODE_INFO - set_mode_info_sb(cpi, td, tile_info, tp, mi_row, mi_col, bsize, pc_tree); -#endif - -#if CONFIG_LV_MAP + if (bsize == cm->seq_params.sb_size) { x->cb_offset = 0; -#endif - -#if CONFIG_NCOBMC_ADAPT_WEIGHT - set_sb_mi_boundaries(cm, xd, mi_row, mi_col); -#endif - encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize, + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize, pc_tree, NULL); } else { - encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, pc_tree, NULL); } } -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 if (x->using_dist_8x8 && best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && bsize == BLOCK_4X4 && pc_tree->index == 3) { - encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, pc_tree, NULL); } -#endif // CONFIG_DIST_8X8 && CONFIG_CB4X4 +#endif // CONFIG_DIST_8X8 - if (bsize == cm->sb_size) { -#if !CONFIG_PVQ && !CONFIG_LV_MAP - assert(tp_orig < *tp || (tp_orig == *tp && xd->mi[0]->mbmi.skip)); -#endif + if (bsize == cm->seq_params.sb_size) { assert(best_rdc.rate < INT_MAX); assert(best_rdc.dist < INT64_MAX); } else { @@ -4601,71 +3772,62 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, } } +// Set all the counters as max. +static void init_first_partition_pass_stats_tables( + FIRST_PARTITION_PASS_STATS *stats) { + for (int i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) { + memset(stats[i].ref0_counts, 0xff, sizeof(stats[i].ref0_counts)); + memset(stats[i].ref1_counts, 0xff, sizeof(stats[i].ref1_counts)); + stats[i].sample_counts = INT_MAX; + } +} + +// Minimum number of samples to trigger the +// mode_pruning_based_on_two_pass_partition_search feature. 
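+// Tables holding fewer samples than this are treated as uninformative: +// their reference-frame counts are reset to 0xff so that no mode is +// pruned on too little data (see the check in encode_rd_sb_row below).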
+#define FIRST_PARTITION_PASS_MIN_SAMPLES 16 + static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, int mi_row, TOKENEXTRA **tp) { AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); const TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; SPEED_FEATURES *const sf = &cpi->sf; int mi_col; -#if CONFIG_EXT_PARTITION const int leaf_nodes = 256; -#else - const int leaf_nodes = 64; -#endif // CONFIG_EXT_PARTITION // Initialize the left context for the new SB row av1_zero_left_context(xd); // Reset delta for every tile - if (cm->delta_q_present_flag) - if (mi_row == tile_info->mi_row_start) xd->prev_qindex = cm->base_qindex; -#if CONFIG_EXT_DELTA_Q - if (cm->delta_lf_present_flag) { -#if CONFIG_LOOPFILTER_LEVEL - if (mi_row == tile_info->mi_row_start) - for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) - xd->prev_delta_lf[lf_id] = 0; -#endif // CONFIG_LOOPFILTER_LEVEL - if (mi_row == tile_info->mi_row_start) xd->prev_delta_lf_from_base = 0; + if (mi_row == tile_info->mi_row_start) { + if (cm->delta_q_present_flag) xd->current_qindex = cm->base_qindex; + if (cm->delta_lf_present_flag) { + av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); + } } -#endif // Code each SB in the row for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end; - mi_col += cm->mib_size) { + mi_col += cm->seq_params.mib_size) { const struct segmentation *const seg = &cm->seg; int dummy_rate; int64_t dummy_dist; RD_STATS dummy_rdc; -#if CONFIG_SUPERTX - int dummy_rate_nocoef; -#endif // CONFIG_SUPERTX int i; int seg_skip = 0; const int idx_str = cm->mi_stride * mi_row + mi_col; - MODE_INFO **mi = cm->mi_grid_visible + idx_str; - PC_TREE *const pc_root = td->pc_root[cm->mib_size_log2 - MIN_MIB_SIZE_LOG2]; + MB_MODE_INFO **mi = cm->mi_grid_visible + idx_str; + PC_TREE *const pc_root = + td->pc_root[cm->seq_params.mib_size_log2 - MIN_MIB_SIZE_LOG2]; -#if CONFIG_LV_MAP && LV_MAP_PROB - av1_fill_coeff_costs(&td->mb, xd->tile_ctx); -#else - av1_fill_token_costs_from_cdf(x->token_head_costs, - x->e_mbd.tile_ctx->coef_head_cdfs); - av1_fill_token_costs_from_cdf(x->token_tail_costs, - x->e_mbd.tile_ctx->coef_tail_cdfs); -#endif + av1_fill_coeff_costs(&td->mb, xd->tile_ctx, num_planes); av1_fill_mode_rates(cm, x, xd->tile_ctx); if (sf->adaptive_pred_interp_filter) { -#if !CONFIG_CB4X4 - for (i = 0; i < leaf_nodes; ++i) - td->leaf_tree[i].pred_interp_filter = SWITCHABLE; -#endif - for (i = 0; i < leaf_nodes; ++i) { td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE; td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE; @@ -4674,29 +3836,43 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, } } - x->tx_rd_record.num = x->tx_rd_record.index_start = 0; + x->mb_rd_record.num = x->mb_rd_record.index_start = 0; + + av1_zero(x->txb_rd_record_8X8); + av1_zero(x->txb_rd_record_16X16); + av1_zero(x->txb_rd_record_32X32); + av1_zero(x->txb_rd_record_64X64); + av1_zero(x->txb_rd_record_intra); + av1_zero(x->pred_mv); pc_root->index = 0; if (seg->enabled) { const uint8_t *const map = seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - int segment_id = get_segment_id(cm, map, cm->sb_size, mi_row, mi_col); + int segment_id = + map ? 
get_segment_id(cm, map, cm->seq_params.sb_size, mi_row, mi_col) + : 0; seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP); } -#if CONFIG_AMVR - xd->cur_frame_mv_precision_level = cm->cur_frame_mv_precision_level; -#endif + xd->cur_frame_force_integer_mv = cm->cur_frame_force_integer_mv; if (cm->delta_q_present_flag) { - // Test mode for delta quantization - int sb_row = mi_row >> 3; - int sb_col = mi_col >> 3; - int sb_stride = (cm->width + MAX_SB_SIZE - 1) >> MAX_SB_SIZE_LOG2; - int index = ((sb_row * sb_stride + sb_col + 8) & 31) - 16; - - // Ensure divisibility of delta_qindex by delta_q_res - int offset_qindex = (index < 0 ? -index - 8 : index - 8); + // Delta-q modulation based on variance + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes); + + int offset_qindex; + if (DELTAQ_MODULATION == 1) { + const int block_wavelet_energy_level = + av1_block_wavelet_energy_level(cpi, x, cm->seq_params.sb_size); + offset_qindex = av1_compute_deltaq_from_energy_level( + cpi, block_wavelet_energy_level); + } else { + const int block_var_level = + av1_block_energy(cpi, x, cm->seq_params.sb_size); + offset_qindex = + av1_compute_deltaq_from_energy_level(cpi, block_var_level); + } int qmask = ~(cm->delta_q_res - 1); int current_qindex = clamp(cm->base_qindex + offset_qindex, cm->delta_q_res, 256 - cm->delta_q_res); @@ -4707,136 +3883,163 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, assert(current_qindex > 0); xd->delta_qindex = current_qindex - cm->base_qindex; - set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size); - xd->mi[0]->mbmi.current_q_index = current_qindex; -#if !CONFIG_EXT_DELTA_Q - xd->mi[0]->mbmi.segment_id = 0; -#endif // CONFIG_EXT_DELTA_Q - av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id); -#if CONFIG_EXT_DELTA_Q + set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size); + xd->mi[0]->current_qindex = current_qindex; + av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id); if (cpi->oxcf.deltaq_mode == DELTA_Q_LF) { int j, k; int lfmask = ~(cm->delta_lf_res - 1); - int current_delta_lf_from_base = offset_qindex / 2; - current_delta_lf_from_base = - ((current_delta_lf_from_base + cm->delta_lf_res / 2) & lfmask); + int delta_lf_from_base = offset_qindex / 2; + delta_lf_from_base = + ((delta_lf_from_base + cm->delta_lf_res / 2) & lfmask); // pre-set the delta lf for loop filter. Note that this value is set // before mi is assigned for each block in current superblock - for (j = 0; j < AOMMIN(cm->mib_size, cm->mi_rows - mi_row); j++) { - for (k = 0; k < AOMMIN(cm->mib_size, cm->mi_cols - mi_col); k++) { + for (j = 0; j < AOMMIN(cm->seq_params.mib_size, cm->mi_rows - mi_row); + j++) { + for (k = 0; k < AOMMIN(cm->seq_params.mib_size, cm->mi_cols - mi_col); + k++) { cm->mi[(mi_row + j) * cm->mi_stride + (mi_col + k)] - .mbmi.current_delta_lf_from_base = - clamp(current_delta_lf_from_base, 0, MAX_LOOP_FILTER); -#if CONFIG_LOOPFILTER_LEVEL - for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) { + .delta_lf_from_base = + clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); + const int frame_lf_count = + av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { cm->mi[(mi_row + j) * cm->mi_stride + (mi_col + k)] - .mbmi.curr_delta_lf[lf_id] = current_delta_lf_from_base; + .delta_lf[lf_id] = + clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); } -#endif // CONFIG_LOOPFILTER_LEVEL } } } -#endif // CONFIG_EXT_DELTA_Q } x->source_variance = UINT_MAX; if (sf->partition_search_type == FIXED_PARTITION || seg_skip) { BLOCK_SIZE bsize; - set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size); - bsize = seg_skip ? cm->sb_size : sf->always_this_block_size; + set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size); + bsize = seg_skip ? cm->seq_params.sb_size : sf->always_this_block_size; set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); - rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, cm->sb_size, - &dummy_rate, &dummy_dist, -#if CONFIG_SUPERTX - &dummy_rate_nocoef, -#endif // CONFIG_SUPERTX - 1, pc_root); + rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, + cm->seq_params.sb_size, &dummy_rate, &dummy_dist, 1, + pc_root); } else if (cpi->partition_search_skippable_frame) { BLOCK_SIZE bsize; - set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size); + set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size); bsize = get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col); set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); - rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, cm->sb_size, - &dummy_rate, &dummy_dist, -#if CONFIG_SUPERTX - &dummy_rate_nocoef, -#endif // CONFIG_SUPERTX - 1, pc_root); + rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, + cm->seq_params.sb_size, &dummy_rate, &dummy_dist, 1, + pc_root); } else { // If required set upper and lower partition size limits if (sf->auto_min_max_partition_size) { - set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size); + set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size); rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col, &x->min_partition_size, &x->max_partition_size); } - rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, cm->sb_size, - &dummy_rdc, -#if CONFIG_SUPERTX - &dummy_rate_nocoef, -#endif // CONFIG_SUPERTX - INT64_MAX, pc_root); + + reset_partition(pc_root, cm->seq_params.sb_size); + x->use_cb_search_range = 0; + init_first_partition_pass_stats_tables(x->first_partition_pass_stats); + if (cpi->sf.two_pass_partition_search && + mi_row + mi_size_high[cm->seq_params.sb_size] < cm->mi_rows && + mi_col + mi_size_wide[cm->seq_params.sb_size] < cm->mi_cols && + cm->frame_type != KEY_FRAME) { + x->cb_partition_scan = 1; + // Reset the stats tables. 
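(A minimal sketch of how the first-pass statistics being reset here are consumed in the second pass; the helper name is hypothetical and only the FIRST_PARTITION_PASS_STATS fields visible in this diff are assumed:)

static int prune_ref_in_second_pass(const FIRST_PARTITION_PASS_STATS *stat,
                                    MV_REFERENCE_FRAME ref) {
  // Hypothetical sketch, not part of the tree. Too few samples collected:
  // the tables are force-filled below, so no reference gets pruned.
  if (stat->sample_counts < FIRST_PARTITION_PASS_MIN_SAMPLES) return 0;
  // Prune a reference only if the square-partition scan never chose it,
  // on either side of a compound pair.
  return stat->ref0_counts[ref] == 0 && stat->ref1_counts[ref] == 0;
}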
+ if (sf->mode_pruning_based_on_two_pass_partition_search) + av1_zero(x->first_partition_pass_stats); + rd_pick_sqr_partition(cpi, td, tile_data, tp, mi_row, mi_col, + cm->seq_params.sb_size, &dummy_rdc, INT64_MAX, + pc_root, NULL); + x->cb_partition_scan = 0; + + x->source_variance = UINT_MAX; + if (sf->adaptive_pred_interp_filter) { + for (i = 0; i < leaf_nodes; ++i) { + td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE; + td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE; + td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE; + td->pc_tree[i].horizontal[1].pred_interp_filter = SWITCHABLE; + } + } + + x->mb_rd_record.num = x->mb_rd_record.index_start = 0; + av1_zero(x->txb_rd_record_8X8); + av1_zero(x->txb_rd_record_16X16); + av1_zero(x->txb_rd_record_32X32); + av1_zero(x->txb_rd_record_64X64); + av1_zero(x->txb_rd_record_intra); + av1_zero(x->pred_mv); + pc_root->index = 0; + + for (int idy = 0; idy < mi_size_high[cm->seq_params.sb_size]; ++idy) { + for (int idx = 0; idx < mi_size_wide[cm->seq_params.sb_size]; ++idx) { + const int offset = cm->mi_stride * (mi_row + idy) + (mi_col + idx); + cm->mi_grid_visible[offset] = 0; + } + } + + x->use_cb_search_range = 1; + + if (sf->mode_pruning_based_on_two_pass_partition_search) { + for (i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) { + FIRST_PARTITION_PASS_STATS *const stat = + &x->first_partition_pass_stats[i]; + if (stat->sample_counts < FIRST_PARTITION_PASS_MIN_SAMPLES) { + // If there are not enough samples collected, make all available. + memset(stat->ref0_counts, 0xff, sizeof(stat->ref0_counts)); + memset(stat->ref1_counts, 0xff, sizeof(stat->ref1_counts)); + } else if (sf->selective_ref_frame < 2) { + // ALTREF2_FRAME and BWDREF_FRAME may be skipped during the + // initial partition scan, so we don't eliminate them. + stat->ref0_counts[ALTREF2_FRAME] = 0xff; + stat->ref1_counts[ALTREF2_FRAME] = 0xff; + stat->ref0_counts[BWDREF_FRAME] = 0xff; + stat->ref1_counts[BWDREF_FRAME] = 0xff; + } + } + } + + rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, + cm->seq_params.sb_size, &dummy_rdc, INT64_MAX, + pc_root, NULL); + } else { + rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, + cm->seq_params.sb_size, &dummy_rdc, INT64_MAX, + pc_root, NULL); + } + } +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + // TODO(angiebird): Let inter_mode_rd_model_estimation support multi-tile. + if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 && + cm->tile_rows == 1) { + av1_inter_mode_data_fit(x->rdmult); } +#endif } } static void init_encode_frame_mb_context(AV1_COMP *cpi) { - MACROBLOCK *const x = &cpi->td.mb; AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCK *const x = &cpi->td.mb; MACROBLOCKD *const xd = &x->e_mbd; // Copy data over into macro block data structures. 
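(num_planes here only resolves the monochrome case; a minimal equivalent, assuming the av1_num_planes() definition used in this tree:)

static int num_planes_for(const AV1_COMMON *cm) {
  // Hypothetical sketch: monochrome sequences carry only the luma plane,
  // otherwise Y, U and V are coded.
  return cm->seq_params.monochrome ? 1 : 3;
}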
- av1_setup_src_planes(x, cpi->source, 0, 0); + av1_setup_src_planes(x, cpi->source, 0, 0, num_planes); - av1_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y); + av1_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y, num_planes); } -#if !CONFIG_REF_ADAPT -static int check_dual_ref_flags(AV1_COMP *cpi) { - const int ref_flags = cpi->ref_frame_flags; - - if (segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) { - return 0; - } else { - return (!!(ref_flags & AOM_GOLD_FLAG) + !!(ref_flags & AOM_LAST_FLAG) + -#if CONFIG_EXT_REFS - !!(ref_flags & AOM_LAST2_FLAG) + !!(ref_flags & AOM_LAST3_FLAG) + - !!(ref_flags & AOM_BWD_FLAG) + !!(ref_flags & AOM_ALT2_FLAG) + -#endif // CONFIG_EXT_REFS - !!(ref_flags & AOM_ALT_FLAG)) >= 2; - } -} -#endif // !CONFIG_REF_ADAPT - -#if !CONFIG_VAR_TX -static void reset_skip_tx_size(AV1_COMMON *cm, TX_SIZE max_tx_size) { - int mi_row, mi_col; - const int mis = cm->mi_stride; - MODE_INFO **mi_ptr = cm->mi_grid_visible; - - for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row, mi_ptr += mis) { - for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { - if (txsize_sqr_up_map[mi_ptr[mi_col]->mbmi.tx_size] > max_tx_size) - mi_ptr[mi_col]->mbmi.tx_size = max_tx_size; - } - } -} -#endif - static MV_REFERENCE_FRAME get_frame_type(const AV1_COMP *cpi) { if (frame_is_intra_only(&cpi->common)) return INTRA_FRAME; -#if CONFIG_EXT_REFS // We will not update the golden frame with an internal overlay frame else if ((cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame) || cpi->rc.is_src_frame_ext_arf) -#else - else if (cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame) -#endif // CONFIG_EXT_REFS return ALTREF_FRAME; - else if (cpi->refresh_golden_frame || -#if CONFIG_EXT_REFS - cpi->refresh_alt2_ref_frame || -#endif // CONFIG_EXT_REFS + else if (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame || cpi->refresh_alt_ref_frame) return GOLDEN_FRAME; else @@ -4846,22 +4049,19 @@ static MV_REFERENCE_FRAME get_frame_type(const AV1_COMP *cpi) { } static TX_MODE select_tx_mode(const AV1_COMP *cpi) { - if (cpi->common.all_lossless) return ONLY_4X4; -#if CONFIG_VAR_TX_NO_TX_MODE - return TX_MODE_SELECT; -#else + if (cpi->common.coded_lossless) return ONLY_4X4; if (cpi->sf.tx_size_search_method == USE_LARGESTALL) - return ALLOW_32X32 + CONFIG_TX64X64; + return TX_MODE_LARGEST; else if (cpi->sf.tx_size_search_method == USE_FULL_RD || - cpi->sf.tx_size_search_method == USE_TX_8X8) + cpi->sf.tx_size_search_method == USE_FAST_RD) return TX_MODE_SELECT; else return cpi->common.tx_mode; -#endif // CONFIG_VAR_TX_NO_TX_MODE } void av1_init_tile_data(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); const int tile_cols = cm->tile_cols; const int tile_rows = cm->tile_rows; int tile_col, tile_row; @@ -4886,29 +4086,23 @@ void av1_init_tile_data(AV1_COMP *cpi) { tile_data->mode_map[i][j] = j; } } -#if CONFIG_PVQ - // This will be dynamically increased as more pvq block is encoded. 
- tile_data->pvq_q.buf_len = 1000; - CHECK_MEM_ERROR( - cm, tile_data->pvq_q.buf, - aom_malloc(tile_data->pvq_q.buf_len * sizeof(PVQ_INFO))); - tile_data->pvq_q.curr_pos = 0; -#endif } } for (tile_row = 0; tile_row < tile_rows; ++tile_row) { for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - TileInfo *const tile_info = - &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info; + TileDataEnc *const tile_data = + &cpi->tile_data[tile_row * tile_cols + tile_col]; + TileInfo *const tile_info = &tile_data->tile_info; av1_tile_init(tile_info, cm, tile_row, tile_col); cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok; pre_tok = cpi->tile_tok[tile_row][tile_col]; - tile_tok = allocated_tokens(*tile_info); -#if CONFIG_PVQ - cpi->tile_data[tile_row * tile_cols + tile_col].pvq_q.curr_pos = 0; -#endif + tile_tok = allocated_tokens( + *tile_info, cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes); + tile_data->allow_update_cdf = !cm->large_scale_tile; + tile_data->allow_update_cdf = + tile_data->allow_update_cdf && !cm->disable_cdf_update; } } } @@ -4922,134 +4116,35 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row, TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col]; int mi_row; -#if CONFIG_DEPENDENT_HORZTILES - if ((!cm->dependent_horz_tiles) || (tile_row == 0) || - tile_info->tg_horz_boundary) { - av1_zero_above_context(cm, tile_info->mi_col_start, tile_info->mi_col_end); - } -#else - av1_zero_above_context(cm, tile_info->mi_col_start, tile_info->mi_col_end); -#endif + av1_zero_above_context(cm, tile_info->mi_col_start, tile_info->mi_col_end, + tile_row); + av1_init_above_context(cm, &td->mb.e_mbd, tile_row); // Set up pointers to per thread motion search counters. this_tile->m_search_count = 0; // Count of motion search hits. this_tile->ex_search_count = 0; // Exhaustive mesh search hits. td->mb.m_search_count_ptr = &this_tile->m_search_count; td->mb.ex_search_count_ptr = &this_tile->ex_search_count; - -#if CONFIG_PVQ - td->mb.pvq_q = &this_tile->pvq_q; - - // TODO(yushin) : activity masking info needs be signaled by a bitstream - td->mb.daala_enc.use_activity_masking = AV1_PVQ_ENABLE_ACTIVITY_MASKING; - - if (td->mb.daala_enc.use_activity_masking) - td->mb.daala_enc.qm = OD_HVS_QM; // Hard coded. Enc/dec required to sync. - else - td->mb.daala_enc.qm = OD_FLAT_QM; // Hard coded. Enc/dec required to sync. - - { - // FIXME: Multiple segments support - int segment_id = 0; - int rdmult = set_segment_rdmult(cpi, &td->mb, segment_id); - int qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex); -#if CONFIG_HIGHBITDEPTH - const int quantizer_shift = td->mb.e_mbd.bd - 8; -#else - const int quantizer_shift = 0; -#endif // CONFIG_HIGHBITDEPTH - int64_t q_ac = OD_MAXI( - 1, av1_ac_quant(qindex, 0, cpi->common.bit_depth) >> quantizer_shift); - int64_t q_dc = OD_MAXI( - 1, av1_dc_quant(qindex, 0, cpi->common.bit_depth) >> quantizer_shift); - /* td->mb.daala_enc.pvq_norm_lambda = OD_PVQ_LAMBDA; */ - td->mb.daala_enc.pvq_norm_lambda = - (double)rdmult * (64 / 16) / (q_ac * q_ac * (1 << RDDIV_BITS)); - td->mb.daala_enc.pvq_norm_lambda_dc = - (double)rdmult * (64 / 16) / (q_dc * q_dc * (1 << RDDIV_BITS)); - // printf("%f\n", td->mb.daala_enc.pvq_norm_lambda); - } - od_init_qm(td->mb.daala_enc.state.qm, td->mb.daala_enc.state.qm_inv, - td->mb.daala_enc.qm == OD_HVS_QM ? 
OD_QM8_Q4_HVS : OD_QM8_Q4_FLAT); - - if (td->mb.daala_enc.use_activity_masking) { - int pli; - int use_masking = td->mb.daala_enc.use_activity_masking; - int segment_id = 0; - int qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex); - - for (pli = 0; pli < MAX_MB_PLANE; pli++) { - int i; - int q; - - q = qindex; - if (q <= OD_DEFAULT_QMS[use_masking][0][pli].interp_q << OD_COEFF_SHIFT) { - od_interp_qm(&td->mb.daala_enc.state.pvq_qm_q4[pli][0], q, - &OD_DEFAULT_QMS[use_masking][0][pli], NULL); - } else { - i = 0; - while (OD_DEFAULT_QMS[use_masking][i + 1][pli].qm_q4 != NULL && - q > OD_DEFAULT_QMS[use_masking][i + 1][pli].interp_q - << OD_COEFF_SHIFT) { - i++; - } - od_interp_qm(&td->mb.daala_enc.state.pvq_qm_q4[pli][0], q, - &OD_DEFAULT_QMS[use_masking][i][pli], - &OD_DEFAULT_QMS[use_masking][i + 1][pli]); - } - } - } - -#if !CONFIG_ANS - od_ec_enc_init(&td->mb.daala_enc.w.ec, 65025); - od_ec_enc_reset(&td->mb.daala_enc.w.ec); -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif -#endif // #if CONFIG_PVQ - this_tile->tctx = *cm->fc; td->mb.e_mbd.tile_ctx = &this_tile->tctx; -#if CONFIG_CFL - MACROBLOCKD *const xd = &td->mb.e_mbd; - xd->cfl = &this_tile->cfl; - cfl_init(xd->cfl, cm); -#endif - -#if CONFIG_PVQ - td->mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context; -#endif // CONFIG_PVQ + cfl_init(&td->mb.e_mbd.cfl, cm); -#if CONFIG_LOOPFILTERING_ACROSS_TILES - if (!cm->loop_filter_across_tiles_enabled) - av1_setup_across_tile_boundary_info(cm, tile_info); -#endif + av1_crc32c_calculator_init(&td->mb.mb_rd_record.crc_calculator); - av1_crc_calculator_init(&td->mb.tx_rd_record.crc_calculator, 24, 0x5D6DCB); + td->intrabc_used_this_tile = 0; for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end; - mi_row += cm->mib_size) { + mi_row += cm->seq_params.mib_size) { encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok); } cpi->tok_count[tile_row][tile_col] = (unsigned int)(tok - cpi->tile_tok[tile_row][tile_col]); - assert(cpi->tok_count[tile_row][tile_col] <= allocated_tokens(*tile_info)); -#if CONFIG_PVQ -#if !CONFIG_ANS - od_ec_enc_clear(&td->mb.daala_enc.w.ec); -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." 
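(The tctx copy a few lines up is what keeps tiles independently decodable: each tile adapts a private snapshot of the frame CDFs. A minimal sketch of that pattern, with a hypothetical helper name:)

static void start_tile_context(TileDataEnc *this_tile, const FRAME_CONTEXT *fc,
                               MACROBLOCKD *xd) {
  // Hypothetical sketch, not part of the tree.
  this_tile->tctx = *fc;            // private snapshot of the frame CDFs
  xd->tile_ctx = &this_tile->tctx;  // all adaptation stays tile-local
}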
-#endif - - td->mb.pvq_q->last_pos = td->mb.pvq_q->curr_pos; - // rewind current position so that bitstream can be written - // from the 1st pvq block - td->mb.pvq_q->curr_pos = 0; - - td->mb.pvq_q = NULL; -#endif + assert(cpi->tok_count[tile_row][tile_col] <= + allocated_tokens(*tile_info, + cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, + av1_num_planes(cm))); } static void encode_tiles(AV1_COMP *cpi) { @@ -5058,9 +4153,12 @@ static void encode_tiles(AV1_COMP *cpi) { av1_init_tile_data(cpi); - for (tile_row = 0; tile_row < cm->tile_rows; ++tile_row) - for (tile_col = 0; tile_col < cm->tile_cols; ++tile_col) + for (tile_row = 0; tile_row < cm->tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < cm->tile_cols; ++tile_col) { av1_encode_tile(cpi, &cpi->td, tile_row, tile_col); + cpi->intrabc_used |= cpi->td.intrabc_used_this_tile; + } + } } #if CONFIG_FP_MB_STATS @@ -5077,52 +4175,34 @@ static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats, } #endif -#if CONFIG_GLOBAL_MOTION #define GLOBAL_TRANS_TYPES_ENC 3 // highest motion model to search static int gm_get_params_cost(const WarpedMotionParams *gm, const WarpedMotionParams *ref_gm, int allow_hp) { - assert(gm->wmtype < GLOBAL_TRANS_TYPES); int params_cost = 0; int trans_bits, trans_prec_diff; switch (gm->wmtype) { - case HOMOGRAPHY: - case HORTRAPEZOID: - case VERTRAPEZOID: - if (gm->wmtype != HORTRAPEZOID) - params_cost += aom_count_signed_primitive_refsubexpfin( - GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K, - (ref_gm->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF), - (gm->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF)); - if (gm->wmtype != VERTRAPEZOID) - params_cost += aom_count_signed_primitive_refsubexpfin( - GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K, - (ref_gm->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF), - (gm->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF)); - // Fallthrough intended case AFFINE: case ROTZOOM: params_cost += aom_count_signed_primitive_refsubexpfin( GM_ALPHA_MAX + 1, SUBEXPFIN_K, (ref_gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS), (gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); - if (gm->wmtype != VERTRAPEZOID) + params_cost += aom_count_signed_primitive_refsubexpfin( + GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_gm->wmmat[3] >> GM_ALPHA_PREC_DIFF), + (gm->wmmat[3] >> GM_ALPHA_PREC_DIFF)); + if (gm->wmtype >= AFFINE) { params_cost += aom_count_signed_primitive_refsubexpfin( GM_ALPHA_MAX + 1, SUBEXPFIN_K, - (ref_gm->wmmat[3] >> GM_ALPHA_PREC_DIFF), - (gm->wmmat[3] >> GM_ALPHA_PREC_DIFF)); - if (gm->wmtype >= AFFINE) { - if (gm->wmtype != HORTRAPEZOID) - params_cost += aom_count_signed_primitive_refsubexpfin( - GM_ALPHA_MAX + 1, SUBEXPFIN_K, - (ref_gm->wmmat[4] >> GM_ALPHA_PREC_DIFF), - (gm->wmmat[4] >> GM_ALPHA_PREC_DIFF)); + (ref_gm->wmmat[4] >> GM_ALPHA_PREC_DIFF), + (gm->wmmat[4] >> GM_ALPHA_PREC_DIFF)); params_cost += aom_count_signed_primitive_refsubexpfin( GM_ALPHA_MAX + 1, SUBEXPFIN_K, (ref_gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS), (gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); } - // Fallthrough intended + AOM_FALLTHROUGH_INTENDED; case TRANSLATION: trans_bits = (gm->wmtype == TRANSLATION) ? 
GM_ABS_TRANS_ONLY_BITS - !allow_hp @@ -5138,7 +4218,7 @@ static int gm_get_params_cost(const WarpedMotionParams *gm, (1 << trans_bits) + 1, SUBEXPFIN_K, (ref_gm->wmmat[1] >> trans_prec_diff), (gm->wmmat[1] >> trans_prec_diff)); - // Fallthrough intended + AOM_FALLTHROUGH_INTENDED; case IDENTITY: break; default: assert(0); } @@ -5152,26 +4232,16 @@ static int do_gm_search_logic(SPEED_FEATURES *const sf, int num_refs_using_gm, switch (sf->gm_search_type) { case GM_FULL_SEARCH: return 1; case GM_REDUCED_REF_SEARCH: -#if CONFIG_EXT_REFS return !(frame == LAST2_FRAME || frame == LAST3_FRAME); -#else - return (num_refs_using_gm < 2); -#endif // CONFIG_EXT_REFS case GM_DISABLE_SEARCH: return 0; default: assert(0); } return 1; } -#endif // CONFIG_GLOBAL_MOTION -// TODO(anybody) : remove this flag when PVQ supports pallete coding tool -#if !CONFIG_PVQ // Estimate if the source frame is screen content, based on the portion of // blocks that have no more than 4 (experimentally selected) luma colors. -static int is_screen_content(const uint8_t *src, -#if CONFIG_HIGHBITDEPTH - int use_hbd, int bd, -#endif // CONFIG_HIGHBITDEPTH +static int is_screen_content(const uint8_t *src, int use_hbd, int bd, int stride, int width, int height) { assert(src != NULL); int counts = 0; @@ -5180,20 +4250,198 @@ static int is_screen_content(const uint8_t *src, const int limit = 4; for (int r = 0; r + blk_h <= height; r += blk_h) { for (int c = 0; c + blk_w <= width; c += blk_w) { + int count_buf[1 << 12]; // Maximum (1 << 12) color levels. const int n_colors = -#if CONFIG_HIGHBITDEPTH use_hbd ? av1_count_colors_highbd(src + r * stride + c, stride, blk_w, - blk_h, bd) - : -#endif // CONFIG_HIGHBITDEPTH - av1_count_colors(src + r * stride + c, stride, blk_w, blk_h); + blk_h, bd, count_buf) : av1_count_colors(src + r * stride + c, stride, blk_w, blk_h, + count_buf); if (n_colors > 1 && n_colors <= limit) counts++; } } // The threshold is 10%. return counts * blk_h * blk_w * 10 > width * height; } -#endif // !CONFIG_PVQ + +// Enforce the number of references for each arbitrary frame limited to +// (INTER_REFS_PER_FRAME - 1) +static void enforce_max_ref_frames(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + static const int flag_list[REF_FRAMES] = { 0, + AOM_LAST_FLAG, + AOM_LAST2_FLAG, + AOM_LAST3_FLAG, + AOM_GOLD_FLAG, + AOM_BWD_FLAG, + AOM_ALT2_FLAG, + AOM_ALT_FLAG }; + MV_REFERENCE_FRAME ref_frame; + int total_valid_refs = 0; + + (void)flag_list; + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + if (cpi->ref_frame_flags & flag_list[ref_frame]) total_valid_refs++; + } + + // NOTE(zoeliu): When all the possible reference frames are available, we + // reduce the number of reference frames by 1, following the rules of: + // (1) Retain GOLDEN_FRAME/ALTREF_FRAME; + // (2) Check the earliest 2 remaining reference frames, and remove the one + // with the lower quality factor, otherwise if both have been coded at + // the same quality level, remove the earliest reference frame. + + if (total_valid_refs == INTER_REFS_PER_FRAME) { + unsigned int min_ref_offset = UINT_MAX; + unsigned int second_min_ref_offset = UINT_MAX; + MV_REFERENCE_FRAME earliest_ref_frames[2] = { LAST3_FRAME, LAST2_FRAME }; + int earliest_buf_idxes[2] = { 0 }; + + // Locate the earliest two reference frames except GOLDEN/ALTREF. 
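(The loop that follows is a running two-smallest selection over the references' order hints; get_relative_dist() stands in for a plain comparison because order hints wrap around. Ignoring the wrap, the selection reduces to the sketch below:)

#include <limits.h>

static void find_two_earliest(const int offset[], int n, int *min0,
                              int *min1) {
  // Hypothetical sketch, not part of the tree.
  *min0 = *min1 = INT_MAX;
  for (int i = 0; i < n; ++i) {
    if (offset[i] < *min0) {
      *min1 = *min0;  // previous minimum becomes the second earliest
      *min0 = offset[i];
    } else if (offset[i] < *min1) {
      *min1 = offset[i];
    }
  }
}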
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + // Retain GOLDEN/ALTREF + if (ref_frame == GOLDEN_FRAME || ref_frame == ALTREF_FRAME) continue; + + const int buf_idx = cm->frame_refs[ref_frame - LAST_FRAME].idx; + if (buf_idx >= 0) { + const unsigned int ref_offset = + cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset; + + if (min_ref_offset == UINT_MAX) { + min_ref_offset = ref_offset; + earliest_ref_frames[0] = ref_frame; + earliest_buf_idxes[0] = buf_idx; + } else { + if (get_relative_dist(cm, ref_offset, min_ref_offset) < 0) { + second_min_ref_offset = min_ref_offset; + earliest_ref_frames[1] = earliest_ref_frames[0]; + earliest_buf_idxes[1] = earliest_buf_idxes[0]; + + min_ref_offset = ref_offset; + earliest_ref_frames[0] = ref_frame; + earliest_buf_idxes[0] = buf_idx; + } else if (second_min_ref_offset == UINT_MAX || + get_relative_dist(cm, ref_offset, second_min_ref_offset) < + 0) { + second_min_ref_offset = ref_offset; + earliest_ref_frames[1] = ref_frame; + earliest_buf_idxes[1] = buf_idx; + } + } + } + } + // Check the coding quality factors of the two earliest reference frames. + RATE_FACTOR_LEVEL ref_rf_level[2]; + double ref_rf_deltas[2]; + for (int i = 0; i < 2; ++i) { + ref_rf_level[i] = cpi->frame_rf_level[earliest_buf_idxes[i]]; + ref_rf_deltas[i] = rate_factor_deltas[ref_rf_level[i]]; + } + (void)ref_rf_level; + (void)ref_rf_deltas; + +#define USE_RF_LEVEL_TO_ENFORCE 1 +#if USE_RF_LEVEL_TO_ENFORCE + // If both earliest two reference frames are coded using the same rate- + // factor, disable the earliest reference frame; Otherwise disable the + // reference frame that uses a lower rate-factor delta. + const MV_REFERENCE_FRAME ref_frame_to_disable = + (ref_rf_deltas[0] <= ref_rf_deltas[1]) ? earliest_ref_frames[0] + : earliest_ref_frames[1]; +#else + // Always disable the earliest reference frame + const MV_REFERENCE_FRAME ref_frame_to_disable = earliest_ref_frames[0]; +#endif // USE_RF_LEVEL_TO_ENFORCE +#undef USE_RF_LEVEL_TO_ENFORCE + + switch (ref_frame_to_disable) { + case LAST_FRAME: cpi->ref_frame_flags &= ~AOM_LAST_FLAG; break; + case LAST2_FRAME: cpi->ref_frame_flags &= ~AOM_LAST2_FLAG; break; + case LAST3_FRAME: cpi->ref_frame_flags &= ~AOM_LAST3_FLAG; break; + case BWDREF_FRAME: cpi->ref_frame_flags &= ~AOM_BWD_FLAG; break; + case ALTREF2_FRAME: cpi->ref_frame_flags &= ~AOM_ALT2_FLAG; break; + default: break; + } + } +} + +static INLINE int av1_refs_are_one_sided(const AV1_COMMON *cm) { + assert(!frame_is_intra_only(cm)); + + int one_sided_refs = 1; + for (int ref = 0; ref < INTER_REFS_PER_FRAME; ++ref) { + const int buf_idx = cm->frame_refs[ref].idx; + if (buf_idx == INVALID_IDX) continue; + + const int ref_offset = + cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset; + if (get_relative_dist(cm, ref_offset, (int)cm->frame_offset) > 0) { + one_sided_refs = 0; // bwd reference + break; + } + } + return one_sided_refs; +} + +static INLINE void get_skip_mode_ref_offsets(const AV1_COMMON *cm, + int ref_offset[2]) { + ref_offset[0] = ref_offset[1] = 0; + if (!cm->is_skip_mode_allowed) return; + + const int buf_idx_0 = cm->frame_refs[cm->ref_frame_idx_0].idx; + const int buf_idx_1 = cm->frame_refs[cm->ref_frame_idx_1].idx; + assert(buf_idx_0 != INVALID_IDX && buf_idx_1 != INVALID_IDX); + + ref_offset[0] = cm->buffer_pool->frame_bufs[buf_idx_0].cur_frame_offset; + ref_offset[1] = cm->buffer_pool->frame_bufs[buf_idx_1].cur_frame_offset; +} + +static int check_skip_mode_enabled(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; 
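(The distance test below reduces to the following once order-hint wraparound is set aside; in the real code get_relative_dist() handles the wrap. For example, cur=10 with refs at 9 and 11 passes, while refs at 7 and 11 fail since |3 - 1| > 1:)

#include <stdlib.h>

static int skip_mode_distances_ok(int cur, int ref0, int ref1) {
  // Hypothetical sketch, not part of the tree.
  const int d0 = cur - ref0;       // distance to the forward reference
  const int d1 = abs(cur - ref1);  // backward reference: distance is negated
  return abs(d0 - d1) <= 1;        // the refs must be (nearly) equidistant
}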
+ + av1_setup_skip_mode_allowed(cm); + if (!cm->is_skip_mode_allowed) return 0; + + // Turn off skip mode if the temporal distances of the reference pair to the + // current frame are different by more than 1 frame. + const int cur_offset = (int)cm->frame_offset; + int ref_offset[2]; + get_skip_mode_ref_offsets(cm, ref_offset); + const int cur_to_ref0 = get_relative_dist(cm, cur_offset, ref_offset[0]); + const int cur_to_ref1 = abs(get_relative_dist(cm, cur_offset, ref_offset[1])); + if (abs(cur_to_ref0 - cur_to_ref1) > 1) return 0; + + // High Latency: Turn off skip mode if all refs are fwd. + if (cpi->all_one_sided_refs && cpi->oxcf.lag_in_frames > 0) return 0; + + static const int flag_list[REF_FRAMES] = { 0, + AOM_LAST_FLAG, + AOM_LAST2_FLAG, + AOM_LAST3_FLAG, + AOM_GOLD_FLAG, + AOM_BWD_FLAG, + AOM_ALT2_FLAG, + AOM_ALT_FLAG }; + const int ref_frame[2] = { cm->ref_frame_idx_0 + LAST_FRAME, + cm->ref_frame_idx_1 + LAST_FRAME }; + if (!(cpi->ref_frame_flags & flag_list[ref_frame[0]]) || + !(cpi->ref_frame_flags & flag_list[ref_frame[1]])) + return 0; + + return 1; +} + +// Function to decide if we can skip the global motion parameter computation +// for a particular ref frame +static INLINE int skip_gm_frame(AV1_COMMON *const cm, int ref_frame) { + if ((ref_frame == LAST3_FRAME || ref_frame == LAST2_FRAME) && + cm->global_motion[GOLDEN_FRAME].wmtype != IDENTITY) { + return get_relative_dist( + cm, cm->cur_frame->ref_frame_offset[ref_frame - LAST_FRAME], + cm->cur_frame->ref_frame_offset[GOLDEN_FRAME - LAST_FRAME]) <= 0; + } + return 0; +} static void encode_frame_internal(AV1_COMP *cpi) { ThreadData *const td = &cpi->td; @@ -5202,16 +4450,9 @@ static void encode_frame_internal(AV1_COMP *cpi) { MACROBLOCKD *const xd = &x->e_mbd; RD_COUNTS *const rdc = &cpi->td.rd_counts; int i; -#if CONFIG_TEMPMV_SIGNALING || CONFIG_EXT_REFS - const int last_fb_buf_idx = get_ref_frame_buf_idx(cpi, LAST_FRAME); -#endif // CONFIG_TEMPMV_SIGNALING || CONFIG_EXT_REFS - -#if CONFIG_ADAPT_SCAN - av1_deliver_eob_threshold(cm, xd); -#endif - x->min_partition_size = AOMMIN(x->min_partition_size, cm->sb_size); - x->max_partition_size = AOMMIN(x->max_partition_size, cm->sb_size); + x->min_partition_size = AOMMIN(x->min_partition_size, cm->seq_params.sb_size); + x->max_partition_size = AOMMIN(x->max_partition_size, cm->seq_params.sb_size); #if CONFIG_DIST_8X8 x->using_dist_8x8 = cpi->oxcf.using_dist_8x8; x->tune_metric = cpi->oxcf.tuning; @@ -5225,23 +4466,29 @@ static void encode_frame_internal(AV1_COMP *cpi) { av1_zero(rdc->comp_pred_diff); if (frame_is_intra_only(cm)) { -// TODO(anybody) : remove this flag when PVQ supports pallete coding tool -#if !CONFIG_PVQ - cm->allow_screen_content_tools = - cpi->oxcf.content == AOM_CONTENT_SCREEN || - is_screen_content(cpi->source->y_buffer, -#if CONFIG_HIGHBITDEPTH - cpi->source->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd, -#endif // CONFIG_HIGHBITDEPTH - cpi->source->y_stride, cpi->source->y_width, - cpi->source->y_height); -#else - cm->allow_screen_content_tools = 0; -#endif // !CONFIG_PVQ + if (cm->seq_params.force_screen_content_tools == 2) { + cm->allow_screen_content_tools = + cpi->oxcf.content == AOM_CONTENT_SCREEN || + is_screen_content(cpi->source->y_buffer, + cpi->source->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd, + cpi->source->y_stride, cpi->source->y_width, + cpi->source->y_height); + } else { + cm->allow_screen_content_tools = + cm->seq_params.force_screen_content_tools; + } } -#if CONFIG_HASH_ME - if (cpi->oxcf.pass != 1 && cpi->common.allow_screen_content_tools) { + 
// Allow intrabc when screen content tools are enabled. + cm->allow_intrabc = cm->allow_screen_content_tools; + // Reset the flag. + cpi->intrabc_used = 0; + // Need to disable intrabc when superres is selected + if (av1_superres_scaled(cm)) { + cm->allow_intrabc = 0; + } + + if (cpi->oxcf.pass != 1 && av1_use_hash_me(cm)) { // add to hash table const int pic_width = cpi->source->y_crop_width; const int pic_height = cpi->source->y_crop_height; @@ -5295,6 +4542,13 @@ static void encode_frame_internal(AV1_COMP *cpi) { &cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2], pic_width, pic_height, 64); + av1_generate_block_hash_value(cpi->source, 128, block_hash_values[1], + block_hash_values[0], is_block_same[1], + is_block_same[0]); + av1_add_to_hash_map_by_row_with_precal_data( + &cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2], + pic_width, pic_height, 128); + for (k = 0; k < 2; k++) { for (j = 0; j < 2; j++) { aom_free(block_hash_values[k][j]); @@ -5305,18 +4559,71 @@ static void encode_frame_internal(AV1_COMP *cpi) { } } } -#endif -#if CONFIG_NCOBMC_ADAPT_WEIGHT - alloc_ncobmc_pred_buffer(xd); -#endif + for (i = 0; i < MAX_SEGMENTS; ++i) { + const int qindex = cm->seg.enabled + ? av1_get_qindex(&cm->seg, i, cm->base_qindex) + : cm->base_qindex; + xd->lossless[i] = qindex == 0 && cm->y_dc_delta_q == 0 && + cm->u_dc_delta_q == 0 && cm->u_ac_delta_q == 0 && + cm->v_dc_delta_q == 0 && cm->v_ac_delta_q == 0; + if (xd->lossless[i]) cpi->has_lossless_segment = 1; + xd->qindex[i] = qindex; + if (xd->lossless[i]) { + cpi->optimize_seg_arr[i] = 0; + } else { + cpi->optimize_seg_arr[i] = cpi->optimize_speed_feature; + } + } + cm->coded_lossless = is_coded_lossless(cm, xd); + cm->all_lossless = cm->coded_lossless && !av1_superres_scaled(cm); + + cm->tx_mode = select_tx_mode(cpi); + + // Fix delta q resolution for the moment + cm->delta_q_res = DEFAULT_DELTA_Q_RES; + // Set delta_q_present_flag before it is used for the first time + cm->delta_lf_res = DEFAULT_DELTA_LF_RES; + cm->delta_q_present_flag = cpi->oxcf.deltaq_mode != NO_DELTA_Q; + cm->delta_lf_present_flag = cpi->oxcf.deltaq_mode == DELTA_Q_LF; + cm->delta_lf_multi = DEFAULT_DELTA_LF_MULTI; + // update delta_q_present_flag and delta_lf_present_flag based on base_qindex + cm->delta_q_present_flag &= cm->base_qindex > 0; + cm->delta_lf_present_flag &= cm->base_qindex > 0; + + av1_frame_init_quantizer(cpi); + + av1_initialize_rd_consts(cpi); + av1_initialize_me_consts(cpi, x, cm->base_qindex); + init_encode_frame_mb_context(cpi); + + if (cm->prev_frame) + cm->last_frame_seg_map = cm->prev_frame->seg_map; + else + cm->last_frame_seg_map = NULL; + cm->current_frame_seg_map = cm->cur_frame->seg_map; + if (cm->allow_intrabc || cm->coded_lossless) { + av1_set_default_ref_deltas(cm->lf.ref_deltas); + av1_set_default_mode_deltas(cm->lf.mode_deltas); + } else if (cm->prev_frame) { + memcpy(cm->lf.ref_deltas, cm->prev_frame->ref_deltas, REF_FRAMES); + memcpy(cm->lf.mode_deltas, cm->prev_frame->mode_deltas, MAX_MODE_LF_DELTAS); + } + memcpy(cm->cur_frame->ref_deltas, cm->lf.ref_deltas, REF_FRAMES); + memcpy(cm->cur_frame->mode_deltas, cm->lf.mode_deltas, MAX_MODE_LF_DELTAS); + + // Special case: set prev_mi to NULL when the previous mode info + // context cannot be used. + cm->prev_mi = cm->allow_ref_frame_mvs ? 
cm->prev_mip : NULL; + + x->txb_split_count = 0; + av1_zero(x->blk_skip_drl); -#if CONFIG_GLOBAL_MOTION av1_zero(rdc->global_motion_used); av1_zero(cpi->gmparams_cost); if (cpi->common.frame_type == INTER_FRAME && cpi->source && !cpi->global_motion_search_done) { - YV12_BUFFER_CONFIG *ref_buf[TOTAL_REFS_PER_FRAME]; + YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES]; int frame; double params_by_motion[RANSAC_NUM_MOTIONS * (MAX_PARAMDIM - 1)]; const double *params_this_motion; @@ -5327,32 +4634,31 @@ static void encode_frame_internal(AV1_COMP *cpi) { }; int num_refs_using_gm = 0; - for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) { + for (frame = ALTREF_FRAME; frame >= LAST_FRAME; --frame) { ref_buf[frame] = get_ref_frame_buffer(cpi, frame); int pframe; cm->global_motion[frame] = default_warp_params; const WarpedMotionParams *ref_params = - cm->error_resilient_mode ? &default_warp_params - : &cm->prev_frame->global_motion[frame]; + cm->prev_frame ? &cm->prev_frame->global_motion[frame] + : &default_warp_params; // check for duplicate buffer - for (pframe = LAST_FRAME; pframe < frame; ++pframe) { + for (pframe = ALTREF_FRAME; pframe > frame; --pframe) { if (ref_buf[frame] == ref_buf[pframe]) break; } - if (pframe < frame) { + if (pframe > frame) { memcpy(&cm->global_motion[frame], &cm->global_motion[pframe], sizeof(WarpedMotionParams)); } else if (ref_buf[frame] && ref_buf[frame]->y_crop_width == cpi->source->y_crop_width && ref_buf[frame]->y_crop_height == cpi->source->y_crop_height && - do_gm_search_logic(&cpi->sf, num_refs_using_gm, frame)) { + do_gm_search_logic(&cpi->sf, num_refs_using_gm, frame) && + !(cpi->sf.selective_ref_gm && skip_gm_frame(cm, frame))) { TransformationType model; - const int64_t ref_frame_error = av1_frame_error( -#if CONFIG_HIGHBITDEPTH - xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd, -#endif // CONFIG_HIGHBITDEPTH - ref_buf[frame]->y_buffer, ref_buf[frame]->y_stride, - cpi->source->y_buffer, cpi->source->y_width, cpi->source->y_height, - cpi->source->y_stride); + const int64_t ref_frame_error = + av1_frame_error(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd, + ref_buf[frame]->y_buffer, ref_buf[frame]->y_stride, + cpi->source->y_buffer, cpi->source->y_width, + cpi->source->y_height, cpi->source->y_stride); if (ref_frame_error == 0) continue; @@ -5366,10 +4672,7 @@ static void encode_frame_internal(AV1_COMP *cpi) { } compute_global_motion_feature_based( - model, cpi->source, ref_buf[frame], -#if CONFIG_HIGHBITDEPTH - cpi->common.bit_depth, -#endif // CONFIG_HIGHBITDEPTH + model, cpi->source, ref_buf[frame], cpi->common.bit_depth, inliers_by_motion, params_by_motion, RANSAC_NUM_MOTIONS); for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) { @@ -5381,9 +4684,7 @@ static void encode_frame_internal(AV1_COMP *cpi) { if (tmp_wm_params.wmtype != IDENTITY) { const int64_t warp_error = refine_integerized_param( &tmp_wm_params, tmp_wm_params.wmtype, -#if CONFIG_HIGHBITDEPTH xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd, -#endif // CONFIG_HIGHBITDEPTH ref_buf[frame]->y_buffer, ref_buf[frame]->y_width, ref_buf[frame]->y_height, ref_buf[frame]->y_stride, cpi->source->y_buffer, cpi->source->y_width, @@ -5418,7 +4719,8 @@ static void encode_frame_internal(AV1_COMP *cpi) { if (!is_enough_erroradvantage( (double)best_warp_error / ref_frame_error, gm_get_params_cost(&cm->global_motion[frame], ref_params, - cm->allow_high_precision_mv))) { + cm->allow_high_precision_mv), + cpi->sf.gm_erroradv_type)) { cm->global_motion[frame] = default_warp_params; } if 
(cm->global_motion[frame].wmtype != IDENTITY) break; @@ -5435,91 +4737,14 @@ static void encode_frame_internal(AV1_COMP *cpi) { cpi->global_motion_search_done = 1; } memcpy(cm->cur_frame->global_motion, cm->global_motion, - TOTAL_REFS_PER_FRAME * sizeof(WarpedMotionParams)); -#endif // CONFIG_GLOBAL_MOTION - - for (i = 0; i < MAX_SEGMENTS; ++i) { - const int qindex = cm->seg.enabled - ? av1_get_qindex(&cm->seg, i, cm->base_qindex) - : cm->base_qindex; - xd->lossless[i] = qindex == 0 && cm->y_dc_delta_q == 0 && - cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0; - xd->qindex[i] = qindex; - } - cm->all_lossless = all_lossless(cm, xd); - if (!cm->seg.enabled && xd->lossless[0]) x->optimize = 0; + REF_FRAMES * sizeof(WarpedMotionParams)); - cm->tx_mode = select_tx_mode(cpi); - - // Fix delta q resolution for the moment - cm->delta_q_res = DEFAULT_DELTA_Q_RES; -// Set delta_q_present_flag before it is used for the first time -#if CONFIG_EXT_DELTA_Q - cm->delta_lf_res = DEFAULT_DELTA_LF_RES; - // update delta_q_present_flag and delta_lf_present_flag based on base_qindex - cm->delta_q_present_flag &= cm->base_qindex > 0; - cm->delta_lf_present_flag &= cm->base_qindex > 0; -#else - cm->delta_q_present_flag = - cpi->oxcf.aq_mode == DELTA_AQ && cm->base_qindex > 0; -#endif // CONFIG_EXT_DELTA_Q - - av1_frame_init_quantizer(cpi); - - av1_initialize_rd_consts(cpi); - av1_initialize_me_consts(cpi, x, cm->base_qindex); - init_encode_frame_mb_context(cpi); - -#if CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING - // NOTE(zoeliu): As cm->prev_frame can take neither a frame of - // show_exisiting_frame=1, nor can it take a frame not used as - // a reference, it is probable that by the time it is being - // referred to, the frame buffer it originally points to may - // already get expired and have been reassigned to the current - // newly coded frame. Hence, we need to check whether this is - // the case, and if yes, we have 2 choices: - // (1) Simply disable the use of previous frame mvs; or - // (2) Have cm->prev_frame point to one reference frame buffer, - // e.g. LAST_FRAME. - if (!enc_is_ref_frame_buf(cpi, cm->prev_frame)) { - // Reassign the LAST_FRAME buffer to cm->prev_frame. - cm->prev_frame = last_fb_buf_idx != INVALID_IDX - ? &cm->buffer_pool->frame_bufs[last_fb_buf_idx] - : NULL; - } -#endif // CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING - -#if CONFIG_TEMPMV_SIGNALING - cm->use_prev_frame_mvs &= frame_can_use_prev_frame_mvs(cm); -#else - if (cm->prev_frame) { - cm->use_prev_frame_mvs = !cm->error_resilient_mode && -#if CONFIG_FRAME_SUPERRES - cm->width == cm->last_width && - cm->height == cm->last_height && -#else - cm->width == cm->prev_frame->buf.y_crop_width && - cm->height == cm->prev_frame->buf.y_crop_height && -#endif // CONFIG_FRAME_SUPERRES - !cm->intra_only && cm->last_show_frame; - } else { - cm->use_prev_frame_mvs = 0; - } -#endif // CONFIG_TEMPMV_SIGNALING - - // Special case: set prev_mi to NULL when the previous mode info - // context cannot be used. - cm->prev_mi = - cm->use_prev_frame_mvs ? cm->prev_mip + cm->mi_stride + 1 : NULL; + av1_setup_motion_field(cm); -#if CONFIG_VAR_TX - x->txb_split_count = 0; - av1_zero(x->blk_skip_drl); -#endif + cpi->all_one_sided_refs = + frame_is_intra_only(cm) ? 
0 : av1_refs_are_one_sided(cm); -#if CONFIG_MFMV - av1_setup_motion_field(cm); -#endif // CONFIG_MFMV + cm->skip_mode_flag = check_skip_mode_enabled(cpi); { struct aom_usec_timer emr_timer; @@ -5532,7 +4757,9 @@ static void encode_frame_internal(AV1_COMP *cpi) { } #endif - av1_setup_frame_boundary_info(cm); +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + av1_inter_mode_data_init(); +#endif // If allowed, encoding tiles in parallel with one thread handling one tile. // TODO(geza.lore): The multi-threaded encoder is not safe with more than @@ -5543,109 +4770,72 @@ static void encode_frame_internal(AV1_COMP *cpi) { else encode_tiles(cpi); +#if CONFIG_COLLECT_INTER_MODE_RD_STATS +#if INTER_MODE_RD_TEST + if (cpi->sf.inter_mode_rd_model_estimation) { + av1_inter_mode_data_show(cm); + } +#endif +#endif + aom_usec_timer_mark(&emr_timer); cpi->time_encode_sb_row += aom_usec_timer_elapsed(&emr_timer); } -#if CONFIG_NCOBMC_ADAPT_WEIGHT - free_ncobmc_pred_buffer(xd); -#endif - -#if 0 - // Keep record of the total distortion this time around for future use - cpi->last_frame_distortion = cpi->frame_distortion; -#endif -} -static void make_consistent_compound_tools(AV1_COMMON *cm) { - (void)cm; -#if CONFIG_INTERINTRA - if (frame_is_intra_only(cm) || cm->reference_mode == COMPOUND_REFERENCE) - cm->allow_interintra_compound = 0; -#endif // CONFIG_INTERINTRA -#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE -#if CONFIG_COMPOUND_SINGLEREF - if (frame_is_intra_only(cm)) -#else // !CONFIG_COMPOUND_SINGLEREF - if (frame_is_intra_only(cm) || cm->reference_mode == SINGLE_REFERENCE) -#endif // CONFIG_COMPOUND_SINGLEREF - cm->allow_masked_compound = 0; -#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE + // If intrabc is allowed but never selected, reset the allow_intrabc flag. + if (cm->allow_intrabc && !cpi->intrabc_used) cm->allow_intrabc = 0; + if (cm->allow_intrabc) cm->delta_lf_present_flag = 0; } void av1_encode_frame(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; -#if CONFIG_EXT_TX + const int num_planes = av1_num_planes(cm); // Indicates whether or not to use a default reduced set for ext-tx // rather than the potential full set of 16 transforms cm->reduced_tx_set_used = 0; -#endif // CONFIG_EXT_TX -#if CONFIG_ADAPT_SCAN - cm->use_adapt_scan = 1; - // TODO(angiebird): call av1_init_scan_order only when use_adapt_scan - // switches from 1 to 0 - if (cm->use_adapt_scan == 0) av1_init_scan_order(cm); -#endif -#if CONFIG_FRAME_MARKER if (cm->show_frame == 0) { int arf_offset = AOMMIN( (MAX_GF_INTERVAL - 1), cpi->twopass.gf_group.arf_src_offset[cpi->twopass.gf_group.index]); -#if CONFIG_EXT_REFS int brf_offset = cpi->twopass.gf_group.brf_src_offset[cpi->twopass.gf_group.index]; arf_offset = AOMMIN((MAX_GF_INTERVAL - 1), arf_offset + brf_offset); -#endif // CONFIG_EXT_REFS cm->frame_offset = cm->current_video_frame + arf_offset; } else { cm->frame_offset = cm->current_video_frame; } - av1_setup_frame_buf_refs(cm); -#if CONFIG_FRAME_SIGN_BIAS - av1_setup_frame_sign_bias(cm); -#endif // CONFIG_FRAME_SIGN_BIAS -#endif // CONFIG_FRAME_MARKER - - // In the longer term the encoder should be generalized to match the - // decoder such that we allow compound where one of the 3 buffers has a - // different sign bias and that buffer is then the fixed ref. However, this - // requires further work in the rd loop. For now the only supported encoder - // side behavior is where the ALT ref buffer has opposite sign bias to - // the other two. 
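(The one-sided check introduced above replaces this removed sign-bias reasoning; wraparound aside, it reduces to the sketch below:)

static int refs_one_sided(const int ref_offset[], int n, int cur_offset) {
  // Hypothetical sketch: "one sided" means every active reference precedes
  // the current frame in display order, i.e. no backward reference exists.
  for (int i = 0; i < n; ++i)
    if (ref_offset[i] > cur_offset) return 0;  // found a bwd reference
  return 1;
}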
- if (!frame_is_intra_only(cm)) { -#if !CONFIG_ONE_SIDED_COMPOUND - if ((cm->ref_frame_sign_bias[ALTREF_FRAME] == - cm->ref_frame_sign_bias[GOLDEN_FRAME]) || - (cm->ref_frame_sign_bias[ALTREF_FRAME] == - cm->ref_frame_sign_bias[LAST_FRAME])) { - cpi->allow_comp_inter_inter = 0; - } else { -#endif // !CONFIG_ONE_SIDED_COMPOUND - cpi->allow_comp_inter_inter = 1; -#if CONFIG_EXT_REFS - cm->comp_fwd_ref[0] = LAST_FRAME; - cm->comp_fwd_ref[1] = LAST2_FRAME; - cm->comp_fwd_ref[2] = LAST3_FRAME; - cm->comp_fwd_ref[3] = GOLDEN_FRAME; - cm->comp_bwd_ref[0] = BWDREF_FRAME; - cm->comp_bwd_ref[1] = ALTREF2_FRAME; - cm->comp_bwd_ref[2] = ALTREF_FRAME; -#else // !CONFIG_EXT_REFS - cm->comp_fixed_ref = ALTREF_FRAME; - cm->comp_var_ref[0] = LAST_FRAME; - cm->comp_var_ref[1] = GOLDEN_FRAME; -#endif // CONFIG_EXT_REFS -#if !CONFIG_ONE_SIDED_COMPOUND // Normative in encoder + cm->frame_offset %= (1 << (cm->seq_params.order_hint_bits_minus_1 + 1)); + + // Make sure segment_id is no larger than last_active_segid. + if (cm->seg.enabled && cm->seg.update_map) { + const int mi_rows = cm->mi_rows; + const int mi_cols = cm->mi_cols; + const int last_active_segid = cm->seg.last_active_segid; + uint8_t *map = cpi->segmentation_map; + for (int mi_row = 0; mi_row < mi_rows; ++mi_row) { + for (int mi_col = 0; mi_col < mi_cols; ++mi_col) { + map[mi_col] = AOMMIN(map[mi_col], last_active_segid); + } + map += mi_cols; } -#endif // !CONFIG_ONE_SIDED_COMPOUND - } else { - cpi->allow_comp_inter_inter = 0; } + av1_setup_frame_buf_refs(cm); + if (cpi->sf.selective_ref_frame >= 2) enforce_max_ref_frames(cpi); + av1_setup_frame_sign_bias(cm); + +#if CONFIG_MISMATCH_DEBUG + mismatch_reset_frame(num_planes); +#else + (void)num_planes; +#endif + + cpi->allow_comp_inter_inter = !frame_is_intra_only(cm); + if (cpi->sf.frame_parameter_update) { int i; RD_OPT *const rd_opt = &cpi->rd; - FRAME_COUNTS *counts = cpi->td.counts; RD_COUNTS *const rdc = &cpi->td.rd_counts; // This code does a single RD pass over the whole frame assuming @@ -5662,39 +4852,20 @@ void av1_encode_frame(AV1_COMP *cpi) { int64_t *const mode_thrs = rd_opt->prediction_type_threshes[frame_type]; const int is_alt_ref = frame_type == ALTREF_FRAME; -/* prediction (compound, single or hybrid) mode selection */ -#if CONFIG_REF_ADAPT - // NOTE(zoeliu): "is_alt_ref" is true only for OVERLAY/INTNL_OVERLAY frames + /* prediction (compound, single or hybrid) mode selection */ + // NOTE: "is_alt_ref" is true only for OVERLAY/INTNL_OVERLAY frames if (is_alt_ref || !cpi->allow_comp_inter_inter) cm->reference_mode = SINGLE_REFERENCE; else cm->reference_mode = REFERENCE_MODE_SELECT; -#else -#if CONFIG_BGSPRITE - (void)is_alt_ref; - if (!cpi->allow_comp_inter_inter) -#else - if (is_alt_ref || !cpi->allow_comp_inter_inter) -#endif // CONFIG_BGSPRITE - cm->reference_mode = SINGLE_REFERENCE; - else if (mode_thrs[COMPOUND_REFERENCE] > mode_thrs[SINGLE_REFERENCE] && - mode_thrs[COMPOUND_REFERENCE] > mode_thrs[REFERENCE_MODE_SELECT] && - check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100) - cm->reference_mode = COMPOUND_REFERENCE; - else if (mode_thrs[SINGLE_REFERENCE] > mode_thrs[REFERENCE_MODE_SELECT]) - cm->reference_mode = SINGLE_REFERENCE; - else - cm->reference_mode = REFERENCE_MODE_SELECT; -#endif // CONFIG_REF_ADAPT -#if CONFIG_DUAL_FILTER cm->interp_filter = SWITCHABLE; -#endif + if (cm->large_scale_tile) cm->interp_filter = EIGHTTAP_REGULAR; - make_consistent_compound_tools(cm); + cm->switchable_motion_mode = 1; - rdc->single_ref_used_flag = 0; rdc->compound_ref_used_flag = 
0; + rdc->skip_mode_used_flag = 0; encode_frame_internal(cpi); @@ -5705,406 +4876,124 @@ void av1_encode_frame(AV1_COMP *cpi) { // Use a flag that includes 4x4 blocks if (rdc->compound_ref_used_flag == 0) { cm->reference_mode = SINGLE_REFERENCE; - av1_zero(counts->comp_inter); -#if !CONFIG_REF_ADAPT - // Use a flag that includes 4x4 blocks - } else if (rdc->single_ref_used_flag == 0) { - cm->reference_mode = COMPOUND_REFERENCE; - av1_zero(counts->comp_inter); -#endif // !CONFIG_REF_ADAPT - } - } - make_consistent_compound_tools(cm); - -#if CONFIG_VAR_TX -#if CONFIG_RECT_TX_EXT - if (cm->tx_mode == TX_MODE_SELECT && cpi->td.mb.txb_split_count == 0 && - counts->quarter_tx_size[1] == 0) -#else - if (cm->tx_mode == TX_MODE_SELECT && cpi->td.mb.txb_split_count == 0) -#endif - cm->tx_mode = ALLOW_32X32 + CONFIG_TX64X64; -#else -#if CONFIG_RECT_TX_EXT && CONFIG_EXT_TX - if (cm->tx_mode == TX_MODE_SELECT && counts->quarter_tx_size[1] == 0) -#else - if (cm->tx_mode == TX_MODE_SELECT) -#endif - { -#if CONFIG_TX64X64 - int count4x4 = 0; - int count8x8_8x8p = 0, count8x8_lp = 0; - int count16x16_16x16p = 0, count16x16_lp = 0; - int count32x32_32x32p = 0, count32x32_lp = 0; - int count64x64_64x64p = 0; - for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { - int depth; - // counts->tx_size[max_depth][context_idx][this_depth_level] - depth = tx_size_to_depth(TX_4X4); - count4x4 += counts->tx_size[TX_8X8 - TX_SIZE_CTX_MIN][i][depth]; - count4x4 += counts->tx_size[TX_16X16 - TX_SIZE_CTX_MIN][i][depth]; - count4x4 += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth]; - count4x4 += counts->tx_size[TX_64X64 - TX_SIZE_CTX_MIN][i][depth]; - - depth = tx_size_to_depth(TX_8X8); - count8x8_8x8p += counts->tx_size[TX_8X8 - TX_SIZE_CTX_MIN][i][depth]; - count8x8_lp += counts->tx_size[TX_16X16 - TX_SIZE_CTX_MIN][i][depth]; - count8x8_lp += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth]; - count8x8_lp += counts->tx_size[TX_64X64 - TX_SIZE_CTX_MIN][i][depth]; - - depth = tx_size_to_depth(TX_16X16); - count16x16_16x16p += - counts->tx_size[TX_16X16 - TX_SIZE_CTX_MIN][i][depth]; - count16x16_lp += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth]; - count16x16_lp += counts->tx_size[TX_64X64 - TX_SIZE_CTX_MIN][i][depth]; - - depth = tx_size_to_depth(TX_32X32); - count32x32_32x32p += - counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth]; - count32x32_lp += counts->tx_size[TX_64X64 - TX_SIZE_CTX_MIN][i][depth]; - - depth = tx_size_to_depth(TX_64X64); - count64x64_64x64p += - counts->tx_size[TX_64X64 - TX_SIZE_CTX_MIN][i][depth]; - } -#if CONFIG_EXT_TX && CONFIG_RECT_TX - count4x4 += counts->tx_size_implied[TX_4X4][TX_4X4]; - count4x4 += counts->tx_size_implied[TX_8X8][TX_4X4]; - count4x4 += counts->tx_size_implied[TX_16X16][TX_4X4]; - count4x4 += counts->tx_size_implied[TX_32X32][TX_4X4]; - count8x8_8x8p += counts->tx_size_implied[TX_8X8][TX_8X8]; - count8x8_lp += counts->tx_size_implied[TX_16X16][TX_8X8]; - count8x8_lp += counts->tx_size_implied[TX_32X32][TX_8X8]; - count8x8_lp += counts->tx_size_implied[TX_64X64][TX_8X8]; - count16x16_16x16p += counts->tx_size_implied[TX_16X16][TX_16X16]; - count16x16_lp += counts->tx_size_implied[TX_32X32][TX_16X16]; - count16x16_lp += counts->tx_size_implied[TX_64X64][TX_16X16]; - count32x32_32x32p += counts->tx_size_implied[TX_32X32][TX_32X32]; - count32x32_lp += counts->tx_size_implied[TX_64X64][TX_32X32]; - count64x64_64x64p += counts->tx_size_implied[TX_64X64][TX_64X64]; -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX - if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p 
== 0 && - count32x32_lp == 0 && count32x32_32x32p == 0 && -#if CONFIG_SUPERTX - cm->counts.supertx_size[TX_16X16] == 0 && - cm->counts.supertx_size[TX_32X32] == 0 && - cm->counts.supertx_size[TX_64X64] == 0 && -#endif - count64x64_64x64p == 0) { - cm->tx_mode = ALLOW_8X8; - reset_skip_tx_size(cm, TX_8X8); - } else if (count8x8_8x8p == 0 && count8x8_lp == 0 && - count16x16_16x16p == 0 && count16x16_lp == 0 && - count32x32_32x32p == 0 && count32x32_lp == 0 && -#if CONFIG_SUPERTX - cm->counts.supertx_size[TX_8X8] == 0 && - cm->counts.supertx_size[TX_16X16] == 0 && - cm->counts.supertx_size[TX_32X32] == 0 && - cm->counts.supertx_size[TX_64X64] == 0 && -#endif - count64x64_64x64p == 0) { - cm->tx_mode = ONLY_4X4; - reset_skip_tx_size(cm, TX_4X4); - } else if (count4x4 == 0 && count8x8_lp == 0 && count16x16_lp == 0 && - count32x32_lp == 0) { - cm->tx_mode = ALLOW_64X64; - } else if (count4x4 == 0 && count8x8_lp == 0 && count16x16_lp == 0 && -#if CONFIG_SUPERTX - cm->counts.supertx_size[TX_64X64] == 0 && -#endif - count64x64_64x64p == 0) { - cm->tx_mode = ALLOW_32X32; - reset_skip_tx_size(cm, TX_32X32); - } else if (count4x4 == 0 && count8x8_lp == 0 && count32x32_lp == 0 && - count32x32_32x32p == 0 && -#if CONFIG_SUPERTX - cm->counts.supertx_size[TX_32X32] == 0 && - cm->counts.supertx_size[TX_64X64] == 0 && -#endif - count64x64_64x64p == 0) { - cm->tx_mode = ALLOW_16X16; - reset_skip_tx_size(cm, TX_16X16); - } - -#else // CONFIG_TX64X64 - - int count4x4 = 0; - int count8x8_lp = 0, count8x8_8x8p = 0; - int count16x16_16x16p = 0, count16x16_lp = 0; - int count32x32 = 0; - for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { - int depth; - // counts->tx_size[max_depth][context_idx][this_depth_level] - depth = tx_size_to_depth(TX_4X4); - count4x4 += counts->tx_size[TX_8X8 - TX_SIZE_CTX_MIN][i][depth]; - count4x4 += counts->tx_size[TX_16X16 - TX_SIZE_CTX_MIN][i][depth]; - count4x4 += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth]; - - depth = tx_size_to_depth(TX_8X8); - count8x8_8x8p += counts->tx_size[TX_8X8 - TX_SIZE_CTX_MIN][i][depth]; - count8x8_lp += counts->tx_size[TX_16X16 - TX_SIZE_CTX_MIN][i][depth]; - count8x8_lp += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth]; - - depth = tx_size_to_depth(TX_16X16); - count16x16_16x16p += - counts->tx_size[TX_16X16 - TX_SIZE_CTX_MIN][i][depth]; - count16x16_lp += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth]; - - depth = tx_size_to_depth(TX_32X32); - count32x32 += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth]; - } -#if CONFIG_EXT_TX && CONFIG_RECT_TX - count4x4 += counts->tx_size_implied[TX_4X4][TX_4X4]; - count4x4 += counts->tx_size_implied[TX_8X8][TX_4X4]; - count4x4 += counts->tx_size_implied[TX_16X16][TX_4X4]; - count4x4 += counts->tx_size_implied[TX_32X32][TX_4X4]; - count8x8_8x8p += counts->tx_size_implied[TX_8X8][TX_8X8]; - count8x8_lp += counts->tx_size_implied[TX_16X16][TX_8X8]; - count8x8_lp += counts->tx_size_implied[TX_32X32][TX_8X8]; - count16x16_16x16p += counts->tx_size_implied[TX_16X16][TX_16X16]; - count16x16_lp += counts->tx_size_implied[TX_32X32][TX_16X16]; - count32x32 += counts->tx_size_implied[TX_32X32][TX_32X32]; -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX - if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 && -#if CONFIG_SUPERTX - cm->counts.supertx_size[TX_16X16] == 0 && - cm->counts.supertx_size[TX_32X32] == 0 && -#endif // CONFIG_SUPERTX - count32x32 == 0) { - cm->tx_mode = ALLOW_8X8; - reset_skip_tx_size(cm, TX_8X8); - } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 && - count8x8_lp == 0 
&& count16x16_lp == 0 && -#if CONFIG_SUPERTX - cm->counts.supertx_size[TX_8X8] == 0 && - cm->counts.supertx_size[TX_16X16] == 0 && - cm->counts.supertx_size[TX_32X32] == 0 && -#endif // CONFIG_SUPERTX - count32x32 == 0) { - cm->tx_mode = ONLY_4X4; - reset_skip_tx_size(cm, TX_4X4); - } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) { - cm->tx_mode = ALLOW_32X32; - } else if (count32x32 == 0 && count8x8_lp == 0 && -#if CONFIG_SUPERTX - cm->counts.supertx_size[TX_32X32] == 0 && -#endif // CONFIG_SUPERTX - count4x4 == 0) { - cm->tx_mode = ALLOW_16X16; - reset_skip_tx_size(cm, TX_16X16); - } -#endif // CONFIG_TX64X64 - } -#endif - } else { - make_consistent_compound_tools(cm); - encode_frame_internal(cpi); - } -} - -static void sum_intra_stats(FRAME_COUNTS *counts, MACROBLOCKD *xd, - const MODE_INFO *mi, const MODE_INFO *above_mi, - const MODE_INFO *left_mi, const int intraonly, - const int mi_row, const int mi_col) { - FRAME_CONTEXT *fc = xd->tile_ctx; - const MB_MODE_INFO *const mbmi = &mi->mbmi; - const PREDICTION_MODE y_mode = mbmi->mode; - const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; - (void)counts; - const BLOCK_SIZE bsize = mbmi->sb_type; - const int unify_bsize = CONFIG_CB4X4; - - if (bsize < BLOCK_8X8 && !unify_bsize) { - int idx, idy; - const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; - const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; - for (idy = 0; idy < 2; idy += num_4x4_h) - for (idx = 0; idx < 2; idx += num_4x4_w) { - const int bidx = idy * 2 + idx; - const PREDICTION_MODE bmode = mi->bmi[bidx].as_mode; - if (intraonly) { #if CONFIG_ENTROPY_STATS - const PREDICTION_MODE a = av1_above_block_mode(mi, above_mi, bidx); - const PREDICTION_MODE l = av1_left_block_mode(mi, left_mi, bidx); - ++counts->kf_y_mode[a][l][bmode]; + av1_zero(cpi->td.counts->comp_inter); #endif // CONFIG_ENTROPY_STATS - update_cdf(get_y_mode_cdf(fc, mi, above_mi, left_mi, bidx), bmode, - INTRA_MODES); - } else { -#if CONFIG_ENTROPY_STATS - ++counts->y_mode[0][bmode]; -#endif // CONFIG_ENTROPY_STATS - update_cdf(fc->y_mode_cdf[0], bmode, INTRA_MODES); - } } - } else { - if (intraonly) { -#if CONFIG_ENTROPY_STATS - const PREDICTION_MODE above = av1_above_block_mode(mi, above_mi, 0); - const PREDICTION_MODE left = av1_left_block_mode(mi, left_mi, 0); - ++counts->kf_y_mode[above][left][y_mode]; -#endif // CONFIG_ENTROPY_STATS - update_cdf(get_y_mode_cdf(fc, mi, above_mi, left_mi, 0), y_mode, - INTRA_MODES); - } else { -#if CONFIG_ENTROPY_STATS - ++counts->y_mode[size_group_lookup[bsize]][y_mode]; -#endif // CONFIG_ENTROPY_STATS - update_cdf(fc->y_mode_cdf[size_group_lookup[bsize]], y_mode, INTRA_MODES); } - -#if CONFIG_FILTER_INTRA - if (mbmi->mode == DC_PRED && mbmi->palette_mode_info.palette_size[0] == 0) { - const int use_filter_intra_mode = - mbmi->filter_intra_mode_info.use_filter_intra_mode[0]; - ++counts->filter_intra[0][use_filter_intra_mode]; - } - if (mbmi->uv_mode == UV_DC_PRED -#if CONFIG_CB4X4 - && - is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, - xd->plane[1].subsampling_y) -#endif - && mbmi->palette_mode_info.palette_size[1] == 0) { - const int use_filter_intra_mode = - mbmi->filter_intra_mode_info.use_filter_intra_mode[1]; - ++counts->filter_intra[1][use_filter_intra_mode]; + // Re-check on the skip mode status as reference mode may have been changed. 
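(A compact restatement of the fallback chain applied below, omitting the intra-only and large-scale-tile guards; the helper is hypothetical:)

static void finalize_after_rd_pass(AV1_COMMON *cm, const RD_COUNTS *rdc,
                                   int txb_split_count) {
  // Hypothetical sketch mirroring the checks in av1_encode_frame().
  // No block chose a compound reference: fall back to single-reference
  // signaling, which in turn rules out skip mode.
  if (rdc->compound_ref_used_flag == 0) cm->reference_mode = SINGLE_REFERENCE;
  if (cm->reference_mode == SINGLE_REFERENCE) cm->skip_mode_flag = 0;
  // Skip mode was allowed but never picked: don't spend a bit per block.
  if (cm->skip_mode_flag && rdc->skip_mode_used_flag == 0)
    cm->skip_mode_flag = 0;
  // No block ever split its transform: signal largest-only sizing.
  if (cm->tx_mode == TX_MODE_SELECT && txb_split_count == 0)
    cm->tx_mode = TX_MODE_LARGEST;
}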
+ if (frame_is_intra_only(cm) || cm->reference_mode == SINGLE_REFERENCE) { + cm->is_skip_mode_allowed = 0; + cm->skip_mode_flag = 0; } -#endif // CONFIG_FILTER_INTRA -#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP - if (av1_is_directional_mode(mbmi->mode, bsize)) { - const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd); - const int p_angle = - mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP; - if (av1_is_intra_filter_switchable(p_angle)) - ++counts->intra_filter[intra_filter_ctx][mbmi->intra_filter]; + if (cm->skip_mode_flag && rdc->skip_mode_used_flag == 0) + cm->skip_mode_flag = 0; + + if (!cm->large_scale_tile) { + if (cm->tx_mode == TX_MODE_SELECT && cpi->td.mb.txb_split_count == 0) + cm->tx_mode = TX_MODE_LARGEST; } -#endif // CONFIG_INTRA_INTERP && CONFIG_INTRA_INTERP + } else { + encode_frame_internal(cpi); } - -#if CONFIG_CB4X4 - if (!is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, - xd->plane[1].subsampling_y)) - return; -#else - (void)mi_row; - (void)mi_col; - (void)xd; -#endif -#if CONFIG_ENTROPY_STATS - ++counts->uv_mode[y_mode][uv_mode]; -#endif // CONFIG_ENTROPY_STATS - update_cdf(fc->uv_mode_cdf[y_mode], uv_mode, UV_INTRA_MODES); } -#if CONFIG_VAR_TX static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd, FRAME_COUNTS *counts, TX_SIZE tx_size, int depth, - int blk_row, int blk_col) { - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - const int tx_row = blk_row >> 1; - const int tx_col = blk_col >> 1; - const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0); - const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0); + int blk_row, int blk_col, + uint8_t allow_update_cdf) { + MB_MODE_INFO *mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->sb_type; + const int max_blocks_high = max_block_high(xd, bsize, 0); + const int max_blocks_wide = max_block_wide(xd, bsize, 0); int ctx = txfm_partition_context(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, mbmi->sb_type, tx_size); - const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col]; + const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col); + const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index]; if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; assert(tx_size > TX_4X4); if (depth == MAX_VARTX_DEPTH) { -// Don't add to counts in this case -#if CONFIG_RECT_TX_EXT - if (tx_size == plane_tx_size) -#endif - mbmi->tx_size = tx_size; + // Don't add to counts in this case + mbmi->tx_size = tx_size; txfm_partition_update(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, tx_size, tx_size); return; } -#if CONFIG_RECT_TX_EXT - if (tx_size == plane_tx_size || - mbmi->tx_size == quarter_txsize_lookup[mbmi->sb_type]) -#else - if (tx_size == plane_tx_size) -#endif - { + if (tx_size == plane_tx_size) { +#if CONFIG_ENTROPY_STATS ++counts->txfm_partition[ctx][0]; -#if CONFIG_RECT_TX_EXT - if (tx_size == plane_tx_size) #endif - mbmi->tx_size = tx_size; + if (allow_update_cdf) + update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 0, 2); + mbmi->tx_size = tx_size; txfm_partition_update(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, tx_size, tx_size); } else { const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; - const int bs = tx_size_wide_unit[sub_txs]; - int i; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; +#if CONFIG_ENTROPY_STATS ++counts->txfm_partition[ctx][1]; +#endif + if (allow_update_cdf) + 
update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 1, 2); ++x->txb_split_count; if (sub_txs == TX_4X4) { - mbmi->inter_tx_size[tx_row][tx_col] = TX_4X4; + mbmi->inter_tx_size[txb_size_index] = TX_4X4; mbmi->tx_size = TX_4X4; txfm_partition_update(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, TX_4X4, tx_size); return; } - for (i = 0; i < 4; ++i) { - int offsetr = (i >> 1) * bs; - int offsetc = (i & 0x01) * bs; - update_txfm_count(x, xd, counts, sub_txs, depth + 1, blk_row + offsetr, - blk_col + offsetc); + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { + for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { + int offsetr = row; + int offsetc = col; + + update_txfm_count(x, xd, counts, sub_txs, depth + 1, blk_row + offsetr, + blk_col + offsetc, allow_update_cdf); + } } } } static void tx_partition_count_update(const AV1_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE plane_bsize, int mi_row, - int mi_col, FRAME_COUNTS *td_counts) { + int mi_col, FRAME_COUNTS *td_counts, + uint8_t allow_update_cdf) { MACROBLOCKD *xd = &x->e_mbd; const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; - const int mi_height = block_size_high[plane_bsize] >> tx_size_wide_log2[0]; - TX_SIZE max_tx_size = get_vartx_max_txsize(&xd->mi[0]->mbmi, plane_bsize, 0); + const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0]; + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0); const int bh = tx_size_high_unit[max_tx_size]; const int bw = tx_size_wide_unit[max_tx_size]; int idx, idy; - int init_depth = - (mi_height != mi_width) ? RECT_VARTX_DEPTH_INIT : SQR_VARTX_DEPTH_INIT; - -#if CONFIG_INTRABC - // Intrabc doesn't support var-tx yet. So no need to update tx partition - // info., except for the split count (otherwise common->tx_mode may be - // modified, causing mismatch). 
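Both branches of update_txfm_count() above now feed the entropy coder directly through update_cdf() whenever allow_update_cdf is set, instead of only bumping CONFIG_ENTROPY_STATS counters. Conceptually, the update nudges the cumulative distribution toward the symbol that was just coded. A simplified sketch of that adaptation; the real libaom routine also derives the shift from a per-CDF visit counter, omitted here:

#include <stdint.h>

// cdf[i] approximates P(symbol <= i), scaled to 1 << 15.
static void update_cdf_sketch(uint16_t *cdf, int val, int nsymbs) {
  const int rate = 5;  // illustrative fixed adaptation rate
  for (int i = 0; i < nsymbs - 1; ++i) {
    if (i >= val)
      cdf[i] += (32768 - cdf[i]) >> rate;  // pull up at and above the symbol
    else
      cdf[i] -= cdf[i] >> rate;  // pull down below it
  }
}

A smaller rate adapts faster but is noisier; roughly speaking, the real coder adapts aggressively right after a context reset and settles down as the visit count grows.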
- if (is_intrabc_block(&x->e_mbd.mi[0]->mbmi)) { - if (x->e_mbd.mi[0]->mbmi.tx_size != max_tx_size) ++x->txb_split_count; - return; - } -#endif // CONFIG_INTRABC - xd->above_txfm_context = - cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2); - xd->left_txfm_context = xd->left_txfm_context_buffer + - ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2); + xd->above_txfm_context = cm->above_txfm_context[xd->tile.tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); for (idy = 0; idy < mi_height; idy += bh) for (idx = 0; idx < mi_width; idx += bw) - update_txfm_count(x, xd, td_counts, max_tx_size, init_depth, idy, idx); + update_txfm_count(x, xd, td_counts, max_tx_size, 0, idy, idx, + allow_update_cdf); } static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row, int blk_col) { - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - const int tx_row = blk_row >> 1; - const int tx_col = blk_col >> 1; - const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0); - const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0); - const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col]; + MB_MODE_INFO *mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->sb_type; + const int max_blocks_high = max_block_high(xd, bsize, 0); + const int max_blocks_wide = max_block_wide(xd, bsize, 0); + const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col); + const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index]; if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; @@ -6114,23 +5003,23 @@ static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row, xd->left_txfm_context + blk_row, tx_size, tx_size); } else { - const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; - const int bsl = tx_size_wide_unit[sub_txs]; - int i; - if (tx_size == TX_8X8) { - mbmi->inter_tx_size[tx_row][tx_col] = TX_4X4; + mbmi->inter_tx_size[txb_size_index] = TX_4X4; mbmi->tx_size = TX_4X4; txfm_partition_update(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, TX_4X4, tx_size); return; } - - assert(bsl > 0); - for (i = 0; i < 4; ++i) { - int offsetr = (i >> 1) * bsl; - int offsetc = (i & 0x01) * bsl; - set_txfm_context(xd, sub_txs, blk_row + offsetr, blk_col + offsetc); + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { + for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { + const int offsetr = blk_row + row; + const int offsetc = blk_col + col; + if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + set_txfm_context(xd, sub_txs, offsetr, offsetc); + } } } } @@ -6140,214 +5029,94 @@ static void tx_partition_set_contexts(const AV1_COMMON *const cm, int mi_row, int mi_col) { const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0]; - TX_SIZE max_tx_size = get_vartx_max_txsize(&xd->mi[0]->mbmi, plane_bsize, 0); + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0); const int bh = tx_size_high_unit[max_tx_size]; const int bw = tx_size_wide_unit[max_tx_size]; int idx, idy; - xd->above_txfm_context = - cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2); - xd->left_txfm_context = xd->left_txfm_context_buffer + - ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2); + xd->above_txfm_context = 
cm->above_txfm_context[xd->tile.tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); for (idy = 0; idy < mi_height; idy += bh) for (idx = 0; idx < mi_width; idx += bw) set_txfm_context(xd, max_tx_size, idy, idx); } -#endif -void av1_update_tx_type_count(const AV1_COMMON *cm, MACROBLOCKD *xd, -#if CONFIG_TXK_SEL - int blk_row, int blk_col, int block, int plane, -#endif - BLOCK_SIZE bsize, TX_SIZE tx_size, - FRAME_COUNTS *counts) { - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - int is_inter = is_inter_block(mbmi); - FRAME_CONTEXT *fc = xd->tile_ctx; -#if !CONFIG_ENTROPY_STATS - (void)counts; -#endif // !CONFIG_ENTROPY_STATS - -#if !CONFIG_TXK_SEL - TX_TYPE tx_type = mbmi->tx_type; -#else - (void)blk_row; - (void)blk_col; - // Only y plane's tx_type is updated - if (plane > 0) return; - TX_TYPE tx_type = - av1_get_tx_type(PLANE_TYPE_Y, xd, blk_row, blk_col, block, tx_size); -#endif -#if CONFIG_EXT_TX - if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) > 1 && - cm->base_qindex > 0 && !mbmi->skip && - !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { - const int eset = - get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used); - if (eset > 0) { -#if !CONFIG_LGT_FROM_PRED - const TxSetType tx_set_type = get_ext_tx_set_type( - tx_size, bsize, is_inter, cm->reduced_tx_set_used); - if (is_inter) { - update_cdf(fc->inter_ext_tx_cdf[eset][txsize_sqr_map[tx_size]], - av1_ext_tx_ind[tx_set_type][tx_type], - av1_num_ext_tx_set[tx_set_type]); -#if CONFIG_ENTROPY_STATS - ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]][tx_type]; -#endif // CONFIG_ENTROPY_STATS - } else { -#if CONFIG_ENTROPY_STATS - ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][mbmi->mode] - [tx_type]; -#endif // CONFIG_ENTROPY_STATS - update_cdf( - fc->intra_ext_tx_cdf[eset][txsize_sqr_map[tx_size]][mbmi->mode], - av1_ext_tx_ind[tx_set_type][tx_type], - av1_num_ext_tx_set[tx_set_type]); - } -#else - (void)tx_type; - (void)fc; - if (is_inter) { - if (LGT_FROM_PRED_INTER) { - if (is_lgt_allowed(mbmi->mode, tx_size) && !cm->reduced_tx_set_used) - ++counts->inter_lgt[txsize_sqr_map[tx_size]][mbmi->use_lgt]; -#if CONFIG_ENTROPY_STATS - if (!mbmi->use_lgt) - ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]][tx_type]; - else -#endif // CONFIG_ENTROPY_STATS - mbmi->tx_type = DCT_DCT; - } else { -#if CONFIG_ENTROPY_STATS - ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]][tx_type]; -#endif // CONFIG_ENTROPY_STATS - } - } else { - if (LGT_FROM_PRED_INTRA) { - if (is_lgt_allowed(mbmi->mode, tx_size) && !cm->reduced_tx_set_used) - ++counts->intra_lgt[txsize_sqr_map[tx_size]][mbmi->mode] - [mbmi->use_lgt]; -#if CONFIG_ENTROPY_STATS - if (!mbmi->use_lgt) - ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][mbmi->mode] - [tx_type]; - else -#endif // CONFIG_ENTROPY_STATS - mbmi->tx_type = DCT_DCT; - } else { -#if CONFIG_ENTROPY_STATS - ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][mbmi->mode] - [tx_type]; -#endif // CONFIG_ENTROPY_STATS - } - } -#endif // CONFIG_LGT_FROM_PRED - } - } -#else - (void)bsize; - if (tx_size < TX_32X32 && - ((!cm->seg.enabled && cm->base_qindex > 0) || - (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) && - !mbmi->skip && - !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { - if (is_inter) { -#if CONFIG_ENTROPY_STATS - ++counts->inter_ext_tx[tx_size][tx_type]; -#endif // CONFIG_ENTROPY_STATS - update_cdf(fc->inter_ext_tx_cdf[tx_size], av1_ext_tx_ind[tx_type], - TX_TYPES); - } else { 
-#if CONFIG_ENTROPY_STATS - ++counts->intra_ext_tx[tx_size][intra_mode_to_tx_type_context[mbmi->mode]] - [tx_type]; -#endif // CONFIG_ENTROPY_STATS - update_cdf( - fc->intra_ext_tx_cdf[tx_size] - [intra_mode_to_tx_type_context[mbmi->mode]], - av1_ext_tx_ind[tx_type], TX_TYPES); - } - } -#endif // CONFIG_EXT_TX -} - -static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td, - TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row, - int mi_col, BLOCK_SIZE bsize, int *rate) { +static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data, + ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int *rate) { const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO **mi_8x8 = xd->mi; - MODE_INFO *mi = mi_8x8[0]; - MB_MODE_INFO *mbmi = &mi->mbmi; + MB_MODE_INFO **mi_4x4 = xd->mi; + MB_MODE_INFO *mbmi = mi_4x4[0]; const int seg_skip = segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); const int mis = cm->mi_stride; const int mi_width = mi_size_wide[bsize]; const int mi_height = mi_size_high[bsize]; const int is_inter = is_inter_block(mbmi); -#if CONFIG_CB4X4 - const BLOCK_SIZE block_size = bsize; -#else - const BLOCK_SIZE block_size = AOMMAX(bsize, BLOCK_8X8); -#endif -#if CONFIG_PVQ - x->pvq_speed = 0; - x->pvq_coded = (dry_run == OUTPUT_ENABLED) ? 1 : 0; -#endif + if (cpi->sf.mode_pruning_based_on_two_pass_partition_search && + x->cb_partition_scan) { + for (int row = mi_row; row < mi_row + mi_width; + row += FIRST_PARTITION_PASS_SAMPLE_REGION) { + for (int col = mi_col; col < mi_col + mi_height; + col += FIRST_PARTITION_PASS_SAMPLE_REGION) { + const int index = av1_first_partition_pass_stats_index(row, col); + FIRST_PARTITION_PASS_STATS *const stats = + &x->first_partition_pass_stats[index]; + // Increase the counter of data samples. + ++stats->sample_counts; + // Increase the counter for ref_frame[0] and ref_frame[1]. 
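The first-partition-pass statistics gathered in this hunk live in plain uint8_t arrays, so every increment is guarded against wrap-around; the guarded increments follow below. The same idiom as a tiny helper, for illustration:

#include <stdint.h>

// Saturating increment: stick at 255 rather than wrapping back to 0.
static inline void inc_saturate_u8(uint8_t *counter) {
  if (*counter < 255) ++*counter;
}

Saturation is the safe choice here because the counts only steer a pruning speed feature; capping them loses a little resolution on large sample sets but can never make a frequently used reference frame look unused.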
+ if (stats->ref0_counts[mbmi->ref_frame[0]] < 255) + ++stats->ref0_counts[mbmi->ref_frame[0]]; + if (mbmi->ref_frame[1] >= 0 && + stats->ref1_counts[mbmi->ref_frame[0]] < 255) + ++stats->ref1_counts[mbmi->ref_frame[1]]; + } + } + } if (!is_inter) { -#if CONFIG_CFL - xd->cfl->store_y = 1; -#endif // CONFIG_CFL - int plane; + xd->cfl.is_chroma_reference = is_chroma_reference( + mi_row, mi_col, bsize, cm->subsampling_x, cm->subsampling_y); + xd->cfl.store_y = store_cfl_required(cm, xd); mbmi->skip = 1; - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { - av1_encode_intra_block_plane((AV1_COMMON *)cm, x, block_size, plane, 1, + for (int plane = 0; plane < num_planes; ++plane) { + av1_encode_intra_block_plane(cpi, x, bsize, plane, + cpi->optimize_seg_arr[mbmi->segment_id], mi_row, mi_col); } -#if CONFIG_CFL - xd->cfl->store_y = 0; -#if CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG - if (is_chroma_reference(mi_row, mi_col, bsize, xd->cfl->subsampling_x, - xd->cfl->subsampling_y) && - !xd->cfl->are_parameters_computed) { - cfl_clear_sub8x8_val(xd->cfl); - } -#endif // CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG -#endif // CONFIG_CFL - if (!dry_run) { - sum_intra_stats(td->counts, xd, mi, xd->above_mi, xd->left_mi, - frame_is_intra_only(cm), mi_row, mi_col); - } -// TODO(anybody) : remove this flag when PVQ supports pallete coding tool -#if !CONFIG_PVQ - if (bsize >= BLOCK_8X8) { - for (plane = 0; plane <= 1; ++plane) { + // If there is at least one lossless segment, force the skip for intra + // block to be 0, in order to avoid the segment_id to be changed by in + // write_segment_id(). + if (!cpi->common.seg.segid_preskip && cpi->common.seg.update_map && + cpi->has_lossless_segment) + mbmi->skip = 0; + + xd->cfl.store_y = 0; + if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) { + for (int plane = 0; plane < AOMMIN(2, num_planes); ++plane) { if (mbmi->palette_mode_info.palette_size[plane] > 0) { - if (!dry_run) - av1_tokenize_color_map(x, plane, 0, t, bsize, mbmi->tx_size, - PALETTE_MAP); - else if (dry_run == DRY_RUN_COSTCOEFFS) - rate += av1_cost_color_map(x, plane, 0, bsize, mbmi->tx_size, - PALETTE_MAP); + if (!dry_run) { + av1_tokenize_color_map(x, plane, t, bsize, mbmi->tx_size, + PALETTE_MAP, tile_data->allow_update_cdf, + td->counts); + } else if (dry_run == DRY_RUN_COSTCOEFFS) { + rate += + av1_cost_color_map(x, plane, bsize, mbmi->tx_size, PALETTE_MAP); + } } } } -#endif // !CONFIG_PVQ -#if CONFIG_VAR_TX - mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size); -#endif -#if CONFIG_LV_MAP - av1_update_txb_context(cpi, td, dry_run, block_size, rate, mi_row, mi_col); -#else // CONFIG_LV_MAP - av1_tokenize_sb(cpi, td, t, dry_run, block_size, rate, mi_row, mi_col); -#endif // CONFIG_LV_MAP + av1_update_txb_context(cpi, td, dry_run, bsize, rate, mi_row, mi_col, + tile_data->allow_update_cdf); } else { int ref; const int is_compound = has_second_ref(mbmi); @@ -6355,123 +5124,66 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td, set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); for (ref = 0; ref < 1 + is_compound; ++ref) { YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]); -#if CONFIG_INTRABC assert(IMPLIES(!is_intrabc_block(mbmi), cfg)); -#else - assert(cfg != NULL); -#endif // !CONFIG_INTRABC av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col, - &xd->block_refs[ref]->sf); + &xd->block_refs[ref]->sf, num_planes); } -#if CONFIG_COMPOUND_SINGLEREF - // Single ref compound mode - if (!is_compound && is_inter_singleref_comp_mode(mbmi->mode)) { - 
xd->block_refs[1] = xd->block_refs[0]; - YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[0]); -#if CONFIG_INTRABC - assert(IMPLIES(!is_intrabc_block(mbmi), cfg)); -#else - assert(cfg != NULL); -#endif // !CONFIG_INTRABC - av1_setup_pre_planes(xd, 1, cfg, mi_row, mi_col, &xd->block_refs[1]->sf); - } -#endif // CONFIG_COMPOUND_SINGLEREF - - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, block_size); -#if !CONFIG_NCOBMC_ADAPT_WEIGHT -#if CONFIG_MOTION_VAR - if (mbmi->motion_mode == OBMC_CAUSAL) { -#if CONFIG_NCOBMC - if (dry_run == OUTPUT_ENABLED) - av1_build_ncobmc_inter_predictors_sb(cm, xd, mi_row, mi_col); - else -#endif - av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col); - } -#endif // CONFIG_MOTION_VAR -#else - if (mbmi->motion_mode == OBMC_CAUSAL) { + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); + if (mbmi->motion_mode == OBMC_CAUSAL) av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col); - } else if (mbmi->motion_mode == NCOBMC_ADAPT_WEIGHT && - dry_run == OUTPUT_ENABLED) { - int p; - for (p = 0; p < MAX_MB_PLANE; ++p) { - get_pred_from_intrpl_buf(xd, mi_row, mi_col, block_size, p); + +#if CONFIG_MISMATCH_DEBUG + if (dry_run == OUTPUT_ENABLED) { + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + int pixel_c, pixel_r; + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, + pd->subsampling_x, pd->subsampling_y); + if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, + pd->subsampling_y)) + continue; + mismatch_record_block_pre(pd->dst.buf, pd->dst.stride, cm->frame_offset, + plane, pixel_c, pixel_r, pd->width, + pd->height, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); } } -#endif - - av1_encode_sb((AV1_COMMON *)cm, x, block_size, mi_row, mi_col); -#if CONFIG_VAR_TX - if (mbmi->skip) mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size); - av1_tokenize_sb_vartx(cpi, td, t, dry_run, mi_row, mi_col, block_size, - rate); #else -#if CONFIG_LV_MAP - av1_update_txb_context(cpi, td, dry_run, block_size, rate, mi_row, mi_col); -#else // CONFIG_LV_MAP - av1_tokenize_sb(cpi, td, t, dry_run, block_size, rate, mi_row, mi_col); -#endif // CONFIG_LV_MAP + (void)num_planes; #endif - } -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 - if (x->using_dist_8x8 && bsize < BLOCK_8X8) { - dist_8x8_set_sub8x8_dst(x, (uint8_t *)x->decoded_8x8, bsize, - block_size_wide[bsize], block_size_high[bsize], - mi_row, mi_col); + av1_encode_sb(cpi, x, bsize, mi_row, mi_col, dry_run); + av1_tokenize_sb_vartx(cpi, td, t, dry_run, mi_row, mi_col, bsize, rate, + tile_data->allow_update_cdf); } -#endif if (!dry_run) { -#if CONFIG_VAR_TX - TX_SIZE tx_size = - is_inter && !mbmi->skip ? 
mbmi->min_tx_size : mbmi->tx_size; -#else - TX_SIZE tx_size = mbmi->tx_size; -#endif + if (av1_allow_intrabc(cm) && is_intrabc_block(mbmi)) + td->intrabc_used_this_tile = 1; if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id] && -#if CONFIG_CB4X4 && (CONFIG_VAR_TX || CONFIG_EXT_TX) && CONFIG_RECT_TX - mbmi->sb_type > BLOCK_4X4 && -#else - mbmi->sb_type >= BLOCK_8X8 && -#endif - !(is_inter && (mbmi->skip || seg_skip))) { -#if CONFIG_VAR_TX + mbmi->sb_type > BLOCK_4X4 && !(is_inter && (mbmi->skip || seg_skip))) { if (is_inter) { - tx_partition_count_update(cm, x, bsize, mi_row, mi_col, td->counts); + tx_partition_count_update(cm, x, bsize, mi_row, mi_col, td->counts, + tile_data->allow_update_cdf); } else { - const int tx_size_ctx = get_tx_size_context(xd); - const int32_t tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize] - : intra_tx_size_cat_lookup[bsize]; - const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size]; - const int depth = tx_size_to_depth(coded_tx_size); - ++td->counts->tx_size[tx_size_cat][tx_size_ctx][depth]; - if (tx_size != max_txsize_rect_lookup[bsize]) ++x->txb_split_count; - } -#else - const int tx_size_ctx = get_tx_size_context(xd); - const int32_t tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize] - : intra_tx_size_cat_lookup[bsize]; - const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size]; - const int depth = tx_size_to_depth(coded_tx_size); - - ++td->counts->tx_size[tx_size_cat][tx_size_ctx][depth]; + if (mbmi->tx_size != max_txsize_rect_lookup[bsize]) + ++x->txb_split_count; + if (block_signals_txsize(bsize)) { + const int tx_size_ctx = get_tx_size_context(xd); + const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); + const int depth = tx_size_to_depth(mbmi->tx_size, bsize); + const int max_depths = bsize_to_max_depth(bsize); + + if (tile_data->allow_update_cdf) + update_cdf(xd->tile_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx], + depth, max_depths + 1); +#if CONFIG_ENTROPY_STATS + ++td->counts->intra_tx_size[tx_size_cat][tx_size_ctx][depth]; #endif - -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) - if (is_quarter_tx_allowed(xd, mbmi, is_inter) && - quarter_txsize_lookup[bsize] != max_txsize_rect_lookup[bsize] && - (mbmi->tx_size == quarter_txsize_lookup[bsize] || - mbmi->tx_size == max_txsize_rect_lookup[bsize])) { - ++td->counts - ->quarter_tx_size[mbmi->tx_size == quarter_txsize_lookup[bsize]]; + } } -#endif -#if CONFIG_EXT_TX && CONFIG_RECT_TX - assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi))); -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + assert(IMPLIES(is_rect_tx(mbmi->tx_size), is_rect_tx_allowed(xd, mbmi))); } else { int i, j; TX_SIZE intra_tx_size; @@ -6480,43 +5192,22 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td, if (xd->lossless[mbmi->segment_id]) { intra_tx_size = TX_4X4; } else { - intra_tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode, 1); + intra_tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode); } } else { -#if CONFIG_EXT_TX && CONFIG_RECT_TX - intra_tx_size = tx_size; -#else - intra_tx_size = (bsize >= BLOCK_8X8) ? 
tx_size : TX_4X4; -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + intra_tx_size = mbmi->tx_size; } -#if CONFIG_EXT_TX && CONFIG_RECT_TX - ++td->counts->tx_size_implied[max_txsize_lookup[bsize]] - [txsize_sqr_up_map[tx_size]]; -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX for (j = 0; j < mi_height; j++) for (i = 0; i < mi_width; i++) if (mi_col + i < cm->mi_cols && mi_row + j < cm->mi_rows) - mi_8x8[mis * j + i]->mbmi.tx_size = intra_tx_size; + mi_4x4[mis * j + i]->tx_size = intra_tx_size; -#if CONFIG_VAR_TX - mbmi->min_tx_size = get_min_tx_size(intra_tx_size); if (intra_tx_size != max_txsize_rect_lookup[bsize]) ++x->txb_split_count; -#endif } - -#if !CONFIG_TXK_SEL - av1_update_tx_type_count(cm, xd, bsize, tx_size, td->counts); -#endif } -#if CONFIG_VAR_TX - if (cm->tx_mode == TX_MODE_SELECT && -#if CONFIG_CB4X4 - mbmi->sb_type > BLOCK_4X4 && -#else - mbmi->sb_type >= BLOCK_8X8 && -#endif + if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(mbmi->sb_type) && is_inter && !(mbmi->skip || seg_skip) && !xd->lossless[mbmi->segment_id]) { if (dry_run) tx_partition_set_contexts(cm, xd, bsize, mi_row, mi_col); @@ -6527,1137 +5218,20 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td, if (xd->lossless[mbmi->segment_id]) { tx_size = TX_4X4; } else { - tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode, is_inter); + tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode); } } else { tx_size = (bsize > BLOCK_4X4) ? tx_size : TX_4X4; } mbmi->tx_size = tx_size; - set_txfm_ctxs(tx_size, xd->n8_w, xd->n8_h, (mbmi->skip || seg_skip), xd); - } -#endif // CONFIG_VAR_TX -#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8 - CFL_CTX *const cfl = xd->cfl; -#if CONFIG_DEBUG - if (is_chroma_reference(mi_row, mi_col, bsize, cfl->subsampling_x, - cfl->subsampling_y) && - !cfl->are_parameters_computed) { - cfl_clear_sub8x8_val(cfl); - } -#endif // CONFIG_DEBUG + set_txfm_ctxs(tx_size, xd->n8_w, xd->n8_h, + (mbmi->skip || seg_skip) && is_inter_block(mbmi), xd); + } + CFL_CTX *const cfl = &xd->cfl; if (is_inter_block(mbmi) && !is_chroma_reference(mi_row, mi_col, bsize, cfl->subsampling_x, - cfl->subsampling_y)) { + cfl->subsampling_y) && + is_cfl_allowed(xd)) { cfl_store_block(xd, mbmi->sb_type, mbmi->tx_size); } -#endif // CONFIG_CFL && CONFIG_CHROMA_SUB8X8 -} - -#if CONFIG_SUPERTX -static int check_intra_b(PICK_MODE_CONTEXT *ctx) { - if (!is_inter_mode((&ctx->mic)->mbmi.mode)) return 1; - if (ctx->mic.mbmi.ref_frame[1] == INTRA_FRAME) return 1; - return 0; -} - -static int check_intra_sb(const AV1_COMP *const cpi, const TileInfo *const tile, - int mi_row, int mi_col, BLOCK_SIZE bsize, - PC_TREE *pc_tree) { - const AV1_COMMON *const cm = &cpi->common; - const int hbs = mi_size_wide[bsize] / 2; - const PARTITION_TYPE partition = pc_tree->partitioning; - const BLOCK_SIZE subsize = get_subsize(bsize, partition); -#if CONFIG_EXT_PARTITION_TYPES - int i; -#endif -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif - -#if !CONFIG_CB4X4 - assert(bsize >= BLOCK_8X8); -#endif - - if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return 1; - - switch (partition) { - case PARTITION_NONE: return check_intra_b(&pc_tree->none); break; - case PARTITION_VERT: - if (check_intra_b(&pc_tree->vertical[0])) return 1; - if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize)) { - if (check_intra_b(&pc_tree->vertical[1])) return 1; - } - break; - case PARTITION_HORZ: - if (check_intra_b(&pc_tree->horizontal[0])) return 1; - if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || 
unify_bsize)) { - if (check_intra_b(&pc_tree->horizontal[1])) return 1; - } - break; - case PARTITION_SPLIT: - if (bsize == BLOCK_8X8 && !unify_bsize) { - if (check_intra_b(pc_tree->leaf_split[0])) return 1; - } else { - if (check_intra_sb(cpi, tile, mi_row, mi_col, subsize, - pc_tree->split[0])) - return 1; - if (check_intra_sb(cpi, tile, mi_row, mi_col + hbs, subsize, - pc_tree->split[1])) - return 1; - if (check_intra_sb(cpi, tile, mi_row + hbs, mi_col, subsize, - pc_tree->split[2])) - return 1; - if (check_intra_sb(cpi, tile, mi_row + hbs, mi_col + hbs, subsize, - pc_tree->split[3])) - return 1; - } - break; -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION_TYPES_AB -#error HORZ/VERT_A/B partitions not yet updated in superres code -#endif - case PARTITION_HORZ_A: - for (i = 0; i < 3; i++) { - if (check_intra_b(&pc_tree->horizontala[i])) return 1; - } - break; - case PARTITION_HORZ_B: - for (i = 0; i < 3; i++) { - if (check_intra_b(&pc_tree->horizontalb[i])) return 1; - } - break; - case PARTITION_VERT_A: - for (i = 0; i < 3; i++) { - if (check_intra_b(&pc_tree->verticala[i])) return 1; - } - break; - case PARTITION_VERT_B: - for (i = 0; i < 3; i++) { - if (check_intra_b(&pc_tree->verticalb[i])) return 1; - } - break; -#endif // CONFIG_EXT_PARTITION_TYPES - default: assert(0); - } - return 0; -} - -static int check_supertx_b(TX_SIZE supertx_size, PICK_MODE_CONTEXT *ctx) { - return ctx->mic.mbmi.tx_size == supertx_size; -} - -static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size, - PC_TREE *pc_tree) { - PARTITION_TYPE partition; - BLOCK_SIZE subsize; -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif - - partition = pc_tree->partitioning; - subsize = get_subsize(bsize, partition); - switch (partition) { - case PARTITION_NONE: return check_supertx_b(supertx_size, &pc_tree->none); - case PARTITION_VERT: - return check_supertx_b(supertx_size, &pc_tree->vertical[0]); - case PARTITION_HORZ: - return check_supertx_b(supertx_size, &pc_tree->horizontal[0]); - case PARTITION_SPLIT: - if (bsize == BLOCK_8X8 && !unify_bsize) - return check_supertx_b(supertx_size, pc_tree->leaf_split[0]); - else - return check_supertx_sb(subsize, supertx_size, pc_tree->split[0]); -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION_TYPES_AB -#error HORZ/VERT_A/B partitions not yet updated in superres code -#endif - case PARTITION_HORZ_A: - return check_supertx_b(supertx_size, &pc_tree->horizontala[0]); - case PARTITION_HORZ_B: - return check_supertx_b(supertx_size, &pc_tree->horizontalb[0]); - case PARTITION_VERT_A: - return check_supertx_b(supertx_size, &pc_tree->verticala[0]); - case PARTITION_VERT_B: - return check_supertx_b(supertx_size, &pc_tree->verticalb[0]); -#endif // CONFIG_EXT_PARTITION_TYPES - default: assert(0); return 0; - } -} - -static void predict_superblock(const AV1_COMP *const cpi, ThreadData *td, - int mi_row_ori, int mi_col_ori, int mi_row_pred, - int mi_col_pred, int plane, - BLOCK_SIZE bsize_pred, int b_sub8x8, int block) { - // Used in supertx - // (mi_row_ori, mi_col_ori): location for mv - // (mi_row_pred, mi_col_pred, bsize_pred): region to predict - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *mi_8x8 = xd->mi[0]; - MODE_INFO *mi = mi_8x8; - MB_MODE_INFO *mbmi = &mi->mbmi; - int ref; - const int is_compound = has_second_ref(mbmi); - - set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); - - for (ref = 0; ref < 1 + is_compound; ++ref) { 
- YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]); - av1_setup_pre_planes(xd, ref, cfg, mi_row_pred, mi_col_pred, - &xd->block_refs[ref]->sf); - } - -#if CONFIG_COMPOUND_SINGLEREF - // Single ref compound mode - if (!is_compound && is_inter_singleref_comp_mode(mbmi->mode)) { - xd->block_refs[1] = xd->block_refs[0]; - YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[0]); - av1_setup_pre_planes(xd, 1, cfg, mi_row_pred, mi_col_pred, - &xd->block_refs[1]->sf); - } -#endif // CONFIG_COMPOUND_SINGLEREF - - if (!b_sub8x8) - av1_build_inter_predictor_sb_extend(cm, xd, mi_row_ori, mi_col_ori, - mi_row_pred, mi_col_pred, plane, - bsize_pred); - else - av1_build_inter_predictor_sb_sub8x8_extend(cm, xd, mi_row_ori, mi_col_ori, - mi_row_pred, mi_col_pred, plane, - bsize_pred, block); -} - -static void predict_b_extend(const AV1_COMP *const cpi, ThreadData *td, - const TileInfo *const tile, int block, - int mi_row_ori, int mi_col_ori, int mi_row_pred, - int mi_col_pred, int mi_row_top, int mi_col_top, - int plane, uint8_t *dst_buf, int dst_stride, - BLOCK_SIZE bsize_top, BLOCK_SIZE bsize_pred, - RUN_TYPE dry_run, int b_sub8x8) { - // Used in supertx - // (mi_row_ori, mi_col_ori): location for mv - // (mi_row_pred, mi_col_pred, bsize_pred): region to predict - // (mi_row_top, mi_col_top, bsize_top): region of the top partition size - // block: sub location of sub8x8 blocks - // b_sub8x8: 1: ori is sub8x8; 0: ori is not sub8x8 - // bextend: 1: region to predict is an extension of ori; 0: not - - MACROBLOCK *const x = &td->mb; - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; - int r = (mi_row_pred - mi_row_top) * MI_SIZE; - int c = (mi_col_pred - mi_col_top) * MI_SIZE; - const int mi_width_top = mi_size_wide[bsize_top]; - const int mi_height_top = mi_size_high[bsize_top]; - - if (mi_row_pred < mi_row_top || mi_col_pred < mi_col_top || - mi_row_pred >= mi_row_top + mi_height_top || - mi_col_pred >= mi_col_top + mi_width_top || mi_row_pred >= cm->mi_rows || - mi_col_pred >= cm->mi_cols) - return; - - set_offsets_extend(cpi, td, tile, mi_row_pred, mi_col_pred, mi_row_ori, - mi_col_ori, bsize_pred); - xd->plane[plane].dst.stride = dst_stride; - xd->plane[plane].dst.buf = - dst_buf + (r >> xd->plane[plane].subsampling_y) * dst_stride + - (c >> xd->plane[plane].subsampling_x); - - predict_superblock(cpi, td, mi_row_ori, mi_col_ori, mi_row_pred, mi_col_pred, - plane, bsize_pred, b_sub8x8, block); - - if (!dry_run && (plane == 0) && (block == 0 || !b_sub8x8)) - update_stats(&cpi->common, td, mi_row_pred, mi_col_pred, 1); -} - -static void extend_dir(const AV1_COMP *const cpi, ThreadData *td, - const TileInfo *const tile, int block, BLOCK_SIZE bsize, - BLOCK_SIZE top_bsize, int mi_row_ori, int mi_col_ori, - int mi_row, int mi_col, int mi_row_top, int mi_col_top, - int plane, uint8_t *dst_buf, int dst_stride, int dir) { - // dir: 0-lower, 1-upper, 2-left, 3-right - // 4-lowerleft, 5-upperleft, 6-lowerright, 7-upperright - MACROBLOCKD *xd = &td->mb.e_mbd; - const int mi_width = mi_size_wide[bsize]; - const int mi_height = mi_size_high[bsize]; - int xss = xd->plane[1].subsampling_x; - int yss = xd->plane[1].subsampling_y; -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif - int b_sub8x8 = (bsize < BLOCK_8X8) && !unify_bsize ? 
1 : 0; - int wide_unit, high_unit; - int i, j; - int ext_offset = 0; - - BLOCK_SIZE extend_bsize; - int mi_row_pred, mi_col_pred; - - if (dir == 0 || dir == 1) { // lower and upper - extend_bsize = - (mi_width == mi_size_wide[BLOCK_8X8] || bsize < BLOCK_8X8 || xss < yss) - ? BLOCK_8X8 - : BLOCK_16X8; - -#if CONFIG_CB4X4 - if (bsize < BLOCK_8X8) { - extend_bsize = BLOCK_4X4; - ext_offset = mi_size_wide[BLOCK_8X8]; - } -#endif - wide_unit = mi_size_wide[extend_bsize]; - high_unit = mi_size_high[extend_bsize]; - - mi_row_pred = mi_row + ((dir == 0) ? mi_height : -(mi_height + ext_offset)); - mi_col_pred = mi_col; - - for (j = 0; j < mi_height + ext_offset; j += high_unit) - for (i = 0; i < mi_width + ext_offset; i += wide_unit) - predict_b_extend(cpi, td, tile, block, mi_row_ori, mi_col_ori, - mi_row_pred + j, mi_col_pred + i, mi_row_top, - mi_col_top, plane, dst_buf, dst_stride, top_bsize, - extend_bsize, 1, b_sub8x8); - } else if (dir == 2 || dir == 3) { // left and right - extend_bsize = - (mi_height == mi_size_high[BLOCK_8X8] || bsize < BLOCK_8X8 || yss < xss) - ? BLOCK_8X8 - : BLOCK_8X16; -#if CONFIG_CB4X4 - if (bsize < BLOCK_8X8) { - extend_bsize = BLOCK_4X4; - ext_offset = mi_size_wide[BLOCK_8X8]; - } -#endif - wide_unit = mi_size_wide[extend_bsize]; - high_unit = mi_size_high[extend_bsize]; - - mi_row_pred = mi_row; - mi_col_pred = mi_col + ((dir == 3) ? mi_width : -(mi_width + ext_offset)); - - for (j = 0; j < mi_height + ext_offset; j += high_unit) - for (i = 0; i < mi_width + ext_offset; i += wide_unit) - predict_b_extend(cpi, td, tile, block, mi_row_ori, mi_col_ori, - mi_row_pred + j, mi_col_pred + i, mi_row_top, - mi_col_top, plane, dst_buf, dst_stride, top_bsize, - extend_bsize, 1, b_sub8x8); - } else { - extend_bsize = BLOCK_8X8; -#if CONFIG_CB4X4 - if (bsize < BLOCK_8X8) { - extend_bsize = BLOCK_4X4; - ext_offset = mi_size_wide[BLOCK_8X8]; - } -#endif - wide_unit = mi_size_wide[extend_bsize]; - high_unit = mi_size_high[extend_bsize]; - - mi_row_pred = mi_row + ((dir == 4 || dir == 6) ? mi_height - : -(mi_height + ext_offset)); - mi_col_pred = - mi_col + ((dir == 6 || dir == 7) ? mi_width : -(mi_width + ext_offset)); - - for (j = 0; j < mi_height + ext_offset; j += high_unit) - for (i = 0; i < mi_width + ext_offset; i += wide_unit) - predict_b_extend(cpi, td, tile, block, mi_row_ori, mi_col_ori, - mi_row_pred + j, mi_col_pred + i, mi_row_top, - mi_col_top, plane, dst_buf, dst_stride, top_bsize, - extend_bsize, 1, b_sub8x8); - } -} - -static void extend_all(const AV1_COMP *const cpi, ThreadData *td, - const TileInfo *const tile, int block, BLOCK_SIZE bsize, - BLOCK_SIZE top_bsize, int mi_row_ori, int mi_col_ori, - int mi_row, int mi_col, int mi_row_top, int mi_col_top, - int plane, uint8_t *dst_buf, int dst_stride) { - assert(block >= 0 && block < 4); - for (int i = 0; i < 8; ++i) { - extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row_ori, mi_col_ori, - mi_row, mi_col, mi_row_top, mi_col_top, plane, dst_buf, - dst_stride, i); - } -} - -// This function generates prediction for multiple blocks, between which -// discontinuity around boundary is reduced by smoothing masks. The basic -// smoothing mask is a soft step function along horz/vert direction. 
In more -// complicated case when a block is split into 4 subblocks, the basic mask is -// first applied to neighboring subblocks (2 pairs) in horizontal direction and -// then applied to the 2 masked prediction mentioned above in vertical direction -// If the block is split into more than one level, at every stage, masked -// prediction is stored in dst_buf[] passed from higher level. -static void predict_sb_complex(const AV1_COMP *const cpi, ThreadData *td, - const TileInfo *const tile, int mi_row, - int mi_col, int mi_row_top, int mi_col_top, - RUN_TYPE dry_run, BLOCK_SIZE bsize, - BLOCK_SIZE top_bsize, uint8_t *dst_buf[3], - int dst_stride[3], PC_TREE *pc_tree) { - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - const int hbs = mi_size_wide[bsize] / 2; - const int is_partition_root = bsize >= BLOCK_8X8; - const int ctx = is_partition_root - ? partition_plane_context(xd, mi_row, mi_col, -#if CONFIG_UNPOISON_PARTITION_CTX - mi_row + hbs < cm->mi_rows, - mi_col + hbs < cm->mi_cols, -#endif - bsize) - : -1; - const PARTITION_TYPE partition = pc_tree->partitioning; - const BLOCK_SIZE subsize = get_subsize(bsize, partition); -#if CONFIG_EXT_PARTITION_TYPES - const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT); -#endif - - int i; - uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3]; - DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_TX_SQUARE * 2]); - DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_TX_SQUARE * 2]); - DECLARE_ALIGNED(16, uint8_t, tmp_buf3[MAX_MB_PLANE * MAX_TX_SQUARE * 2]); - int dst_stride1[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE }; - int dst_stride2[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE }; - int dst_stride3[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE }; -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; - assert(bsize >= BLOCK_8X8); -#endif - - if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - int len = sizeof(uint16_t); - dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1); - dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_TX_SQUARE * len); - dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAX_TX_SQUARE * len); - dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2); - dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_TX_SQUARE * len); - dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAX_TX_SQUARE * len); - dst_buf3[0] = CONVERT_TO_BYTEPTR(tmp_buf3); - dst_buf3[1] = CONVERT_TO_BYTEPTR(tmp_buf3 + MAX_TX_SQUARE * len); - dst_buf3[2] = CONVERT_TO_BYTEPTR(tmp_buf3 + 2 * MAX_TX_SQUARE * len); - } else { -#endif // CONFIG_HIGHBITDEPTH - dst_buf1[0] = tmp_buf1; - dst_buf1[1] = tmp_buf1 + MAX_TX_SQUARE; - dst_buf1[2] = tmp_buf1 + 2 * MAX_TX_SQUARE; - dst_buf2[0] = tmp_buf2; - dst_buf2[1] = tmp_buf2 + MAX_TX_SQUARE; - dst_buf2[2] = tmp_buf2 + 2 * MAX_TX_SQUARE; - dst_buf3[0] = tmp_buf3; - dst_buf3[1] = tmp_buf3 + MAX_TX_SQUARE; - dst_buf3[2] = tmp_buf3 + 2 * MAX_TX_SQUARE; -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - - if (!dry_run && ctx >= 0 && bsize < top_bsize) { - // Explicitly cast away const. 
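The block comment above describes the heart of supertx prediction: per-sub-block inter predictors are built with predict_b_extend(), then av1_build_masked_inter_predictor_complex() cross-fades neighboring predictions with a soft step mask. A toy version of one horizontal-seam blend; the 4-row ramp and 6-bit weights are invented for illustration and do not match libaom's actual mask tables:

#include <stdint.h>

// Cross-fade a top and a bottom predictor across a horizontal split at h/2.
// m is the weight of the top predictor in 1/64 units; it ramps 64 -> 0 over
// four rows around the seam, a "soft step".
static void blend_horz_seam(uint8_t *dst, int dst_stride,
                            const uint8_t *top, int top_stride,
                            const uint8_t *bot, int bot_stride, int w, int h) {
  for (int r = 0; r < h; ++r) {
    int m = 64;
    if (r >= h / 2 + 2)
      m = 0;
    else if (r >= h / 2 - 2)
      m = 64 - 16 * (r - (h / 2 - 2));
    for (int c = 0; c < w; ++c)
      dst[r * dst_stride + c] =
          (uint8_t)((m * top[r * top_stride + c] +
                     (64 - m) * bot[r * bot_stride + c] + 32) >> 6);
  }
}

For a four-way split the same blend runs twice across the vertical seams (the two left|right pairs) and once across the horizontal seam between the two intermediate results, which is the ordering the deleted PARTITION_SPLIT case implements.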
- FRAME_COUNTS *const frame_counts = (FRAME_COUNTS *)&cm->counts; - frame_counts->partition[ctx][partition]++; - } - - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = dst_buf[i]; - xd->plane[i].dst.stride = dst_stride[i]; - } - - switch (partition) { - case PARTITION_NONE: - assert(bsize < top_bsize); - for (i = 0; i < MAX_MB_PLANE; ++i) { - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf[i], dst_stride[i], - top_bsize, bsize, dry_run, 0); - extend_all(cpi, td, tile, 0, bsize, top_bsize, mi_row, mi_col, mi_row, - mi_col, mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i]); - } - break; - case PARTITION_HORZ: - if (bsize == BLOCK_8X8 && !unify_bsize) { - for (i = 0; i < MAX_MB_PLANE; ++i) { - // First half - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf[i], dst_stride[i], - top_bsize, BLOCK_8X8, dry_run, 1); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i]); - - // Second half - predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf1[i], - dst_stride1[i], top_bsize, BLOCK_8X8, dry_run, 1); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf1[i], - dst_stride1[i]); - } - - // Smooth - xd->plane[0].dst.buf = dst_buf[0]; - xd->plane[0].dst.stride = dst_stride[0]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf[0], dst_stride[0], dst_buf1[0], dst_stride1[0], mi_row, - mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ, - 0); - } else { - for (i = 0; i < MAX_MB_PLANE; ++i) { -#if CONFIG_CB4X4 - const struct macroblockd_plane *pd = &xd->plane[i]; - int handle_chroma_sub8x8 = need_handle_chroma_sub8x8( - subsize, pd->subsampling_x, pd->subsampling_y); - - if (handle_chroma_sub8x8) { - int mode_offset_row = CONFIG_CHROMA_SUB8X8 ? 
hbs : 0; - - predict_b_extend(cpi, td, tile, 0, mi_row + mode_offset_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, - dst_buf[i], dst_stride[i], top_bsize, bsize, - dry_run, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, bsize, top_bsize, - mi_row + mode_offset_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf[i], dst_stride[i]); - } else { -#endif - // First half - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i], top_bsize, subsize, dry_run, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i]); - else - extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i], 0); - xd->plane[i].dst.buf = dst_buf[i]; - xd->plane[i].dst.stride = dst_stride[i]; - - if (mi_row + hbs < cm->mi_rows) { - // Second half - predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, - mi_row + hbs, mi_col, mi_row_top, mi_col_top, i, - dst_buf1[i], dst_stride1[i], top_bsize, subsize, - dry_run, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, - mi_col, mi_row + hbs, mi_col, mi_row_top, mi_col_top, - i, dst_buf1[i], dst_stride1[i]); - else - extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, - mi_col, mi_row + hbs, mi_col, mi_row_top, mi_col_top, - i, dst_buf1[i], dst_stride1[i], 1); - // Smooth - xd->plane[i].dst.buf = dst_buf[i]; - xd->plane[i].dst.stride = dst_stride[i]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], - mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, - PARTITION_HORZ, i); - } -#if CONFIG_CB4X4 - } -#endif - } - } - break; - case PARTITION_VERT: - if (bsize == BLOCK_8X8 && !unify_bsize) { - for (i = 0; i < MAX_MB_PLANE; ++i) { - // First half - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf[i], dst_stride[i], - top_bsize, BLOCK_8X8, dry_run, 1); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i]); - - // Second half - predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf1[i], - dst_stride1[i], top_bsize, BLOCK_8X8, dry_run, 1); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf1[i], - dst_stride1[i]); - } - - // Smooth - xd->plane[0].dst.buf = dst_buf[0]; - xd->plane[0].dst.stride = dst_stride[0]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf[0], dst_stride[0], dst_buf1[0], dst_stride1[0], mi_row, - mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT, - 0); - } else { - for (i = 0; i < MAX_MB_PLANE; ++i) { -#if CONFIG_CB4X4 - const struct macroblockd_plane *pd = &xd->plane[i]; - int handle_chroma_sub8x8 = need_handle_chroma_sub8x8( - subsize, pd->subsampling_x, pd->subsampling_y); - - if (handle_chroma_sub8x8) { - int mode_offset_col = CONFIG_CHROMA_SUB8X8 ? 
hbs : 0; - - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + mode_offset_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, - dst_buf[i], dst_stride[i], top_bsize, bsize, - dry_run, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, bsize, top_bsize, mi_row, - mi_col + mode_offset_col, mi_row, mi_col, mi_row_top, - mi_col_top, i, dst_buf[i], dst_stride[i]); - } else { -#endif - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i], top_bsize, subsize, dry_run, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i]); - else - extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i], 3); - xd->plane[i].dst.buf = dst_buf[i]; - xd->plane[i].dst.stride = dst_stride[i]; - - if (mi_col + hbs < cm->mi_cols) { - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row, - mi_col + hbs, mi_row_top, mi_col_top, i, - dst_buf1[i], dst_stride1[i], top_bsize, subsize, - dry_run, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, - mi_col + hbs, mi_row, mi_col + hbs, mi_row_top, - mi_col_top, i, dst_buf1[i], dst_stride1[i]); - else - extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, - mi_col + hbs, mi_row, mi_col + hbs, mi_row_top, - mi_col_top, i, dst_buf1[i], dst_stride1[i], 2); - - // smooth - xd->plane[i].dst.buf = dst_buf[i]; - xd->plane[i].dst.stride = dst_stride[i]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], - mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, - PARTITION_VERT, i); - } -#if CONFIG_CB4X4 - } -#endif - } - } - break; - case PARTITION_SPLIT: - if (bsize == BLOCK_8X8 && !unify_bsize) { - for (i = 0; i < MAX_MB_PLANE; i++) { - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf[i], dst_stride[i], - top_bsize, BLOCK_8X8, dry_run, 1); - predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf1[i], - dst_stride1[i], top_bsize, BLOCK_8X8, dry_run, 1); - predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf2[i], - dst_stride2[i], top_bsize, BLOCK_8X8, dry_run, 1); - predict_b_extend(cpi, td, tile, 3, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf3[i], - dst_stride3[i], top_bsize, BLOCK_8X8, dry_run, 1); - - if (bsize < top_bsize) { - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i]); - extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf1[i], - dst_stride1[i]); - extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf2[i], - dst_stride2[i]); - extend_all(cpi, td, tile, 3, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf3[i], - dst_stride3[i]); - } - } -#if CONFIG_CB4X4 - } else if (bsize == BLOCK_8X8) { - for (i = 0; i < MAX_MB_PLANE; i++) { - const struct macroblockd_plane *pd = &xd->plane[i]; - int handle_chroma_sub8x8 = need_handle_chroma_sub8x8( - subsize, pd->subsampling_x, pd->subsampling_y); - - if (handle_chroma_sub8x8) { - int mode_offset_row = - CONFIG_CHROMA_SUB8X8 && 
mi_row + hbs < cm->mi_rows ? hbs : 0; - int mode_offset_col = - CONFIG_CHROMA_SUB8X8 && mi_col + hbs < cm->mi_cols ? hbs : 0; - - predict_b_extend(cpi, td, tile, 0, mi_row + mode_offset_row, - mi_col + mode_offset_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i], top_bsize, BLOCK_8X8, dry_run, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, BLOCK_8X8, top_bsize, - mi_row + mode_offset_row, mi_col + mode_offset_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i]); - } else { - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i], top_bsize, subsize, dry_run, 0); - if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row, - mi_col + hbs, mi_row_top, mi_col_top, i, - dst_buf1[i], dst_stride1[i], top_bsize, subsize, - dry_run, 0); - if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) - predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, - mi_row + hbs, mi_col, mi_row_top, mi_col_top, i, - dst_buf2[i], dst_stride2[i], top_bsize, subsize, - dry_run, 0); - if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) - predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs, - mi_row + hbs, mi_col + hbs, mi_row_top, - mi_col_top, i, dst_buf3[i], dst_stride3[i], - top_bsize, subsize, dry_run, 0); - - if (bsize < top_bsize) { - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], - dst_stride[i]); - if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, - mi_col + hbs, mi_row, mi_col + hbs, mi_row_top, - mi_col_top, i, dst_buf1[i], dst_stride1[i]); - if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, - mi_col, mi_row + hbs, mi_col, mi_row_top, mi_col_top, - i, dst_buf2[i], dst_stride2[i]); - if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, - mi_col + hbs, mi_row + hbs, mi_col + hbs, mi_row_top, - mi_col_top, i, dst_buf3[i], dst_stride3[i]); - } - } - } -#endif - } else { - predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row_top, - mi_col_top, dry_run, subsize, top_bsize, dst_buf, - dst_stride, pc_tree->split[0]); - if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) - predict_sb_complex(cpi, td, tile, mi_row, mi_col + hbs, mi_row_top, - mi_col_top, dry_run, subsize, top_bsize, dst_buf1, - dst_stride1, pc_tree->split[1]); - if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) - predict_sb_complex(cpi, td, tile, mi_row + hbs, mi_col, mi_row_top, - mi_col_top, dry_run, subsize, top_bsize, dst_buf2, - dst_stride2, pc_tree->split[2]); - if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) - predict_sb_complex(cpi, td, tile, mi_row + hbs, mi_col + hbs, - mi_row_top, mi_col_top, dry_run, subsize, - top_bsize, dst_buf3, dst_stride3, - pc_tree->split[3]); - } - for (i = 0; i < MAX_MB_PLANE; i++) { -#if CONFIG_CB4X4 - const struct macroblockd_plane *pd = &xd->plane[i]; - int handle_chroma_sub8x8 = need_handle_chroma_sub8x8( - subsize, pd->subsampling_x, pd->subsampling_y); - if (handle_chroma_sub8x8) continue; // Skip <4x4 chroma smoothing -#else - if (bsize == BLOCK_8X8 && i != 0) - continue; // Skip <4x4 chroma smoothing -#endif - - if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) { - 
av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], - mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, - PARTITION_VERT, i); - if (mi_row + hbs < cm->mi_rows) { - av1_build_masked_inter_predictor_complex( - xd, dst_buf2[i], dst_stride2[i], dst_buf3[i], dst_stride3[i], - mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, - PARTITION_VERT, i); - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], - mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, - PARTITION_HORZ, i); - } - } else if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) { - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], - mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, - PARTITION_HORZ, i); - } - } - break; -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION_TYPES_AB -#error HORZ/VERT_A/B partitions not yet updated in superres code -#endif - case PARTITION_HORZ_A: - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, - bsize2, dry_run, 0, 0); - extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); - - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row, - mi_col + hbs, mi_row_top, mi_col_top, dst_buf1, - dst_stride1, top_bsize, bsize2, dry_run, 0, 0); - extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs, - mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1); - - predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs, - mi_col, mi_row_top, mi_col_top, dst_buf2, dst_stride2, - top_bsize, subsize, dry_run, 0, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2); - else - extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2, 1); - - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = dst_buf[i]; - xd->plane[i].dst.stride = dst_stride[i]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row, - mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT, - i); - } - for (i = 0; i < MAX_MB_PLANE; i++) { - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], mi_row, - mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ, - i); - } - - break; - case PARTITION_VERT_A: - - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, - bsize2, dry_run, 0, 0); - extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); - - predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs, - mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1, - top_bsize, bsize2, dry_run, 0, 0); - extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1); - - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row, - mi_col + hbs, mi_row_top, mi_col_top, dst_buf2, - dst_stride2, top_bsize, subsize, dry_run, 0, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs, - mi_row_top, mi_col_top, dry_run, 
dst_buf2, dst_stride2); - else - extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs, - mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2, 2); - - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = dst_buf[i]; - xd->plane[i].dst.stride = dst_stride[i]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row, - mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ, - i); - } - for (i = 0; i < MAX_MB_PLANE; i++) { - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], mi_row, - mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT, - i); - } - break; - case PARTITION_HORZ_B: - - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, - subsize, dry_run, 0, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); - else - extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 0); - - predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs, - mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1, - top_bsize, bsize2, dry_run, 0, 0); - extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1); - - predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs, - mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top, - dst_buf2, dst_stride2, top_bsize, bsize2, dry_run, 0, 0); - extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, - mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf2, - dst_stride2); - - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = dst_buf1[i]; - xd->plane[i].dst.stride = dst_stride1[i]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf1[i], dst_stride1[i], dst_buf2[i], dst_stride2[i], - mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, - PARTITION_VERT, i); - } - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = dst_buf[i]; - xd->plane[i].dst.stride = dst_stride[i]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row, - mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ, - i); - } - break; - case PARTITION_VERT_B: - - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, - subsize, dry_run, 0, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); - else - extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 3); - - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row, - mi_col + hbs, mi_row_top, mi_col_top, dst_buf1, - dst_stride1, top_bsize, bsize2, dry_run, 0, 0); - extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs, - mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1); - - predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs, - mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top, - dst_buf2, dst_stride2, top_bsize, bsize2, dry_run, 0, 0); - extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, - mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf2, - dst_stride2); - - for (i = 0; i < 
MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = dst_buf1[i]; - xd->plane[i].dst.stride = dst_stride1[i]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf1[i], dst_stride1[i], dst_buf2[i], dst_stride2[i], - mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, - PARTITION_HORZ, i); - } - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = dst_buf[i]; - xd->plane[i].dst.stride = dst_stride[i]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row, - mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT, - i); - } - break; -#endif // CONFIG_EXT_PARTITION_TYPES - default: assert(0); - } - -#if CONFIG_EXT_PARTITION_TYPES - if (bsize < top_bsize) - update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); -#else - if (bsize < top_bsize && (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)) - update_partition_context(xd, mi_row, mi_col, subsize, bsize); -#endif // CONFIG_EXT_PARTITION_TYPES -} - -static void rd_supertx_sb(const AV1_COMP *const cpi, ThreadData *td, - const TileInfo *const tile, int mi_row, int mi_col, - BLOCK_SIZE bsize, int *tmp_rate, int64_t *tmp_dist, - TX_TYPE *best_tx, PC_TREE *pc_tree) { - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - int plane, pnskip, skippable, skippable_uv, rate_uv, this_rate, - base_rate = *tmp_rate; - int64_t sse, pnsse, sse_uv, this_dist, dist_uv; - uint8_t *dst_buf[3]; - int dst_stride[3]; - TX_SIZE tx_size; - MB_MODE_INFO *mbmi; - TX_TYPE tx_type, best_tx_nostx; - int tmp_rate_tx = 0, skip_tx = 0; - int64_t tmp_dist_tx = 0, rd_tx, bestrd_tx = INT64_MAX; - - set_skip_context(xd, mi_row, mi_col); - set_mode_info_offsets(cpi, x, xd, mi_row, mi_col); - update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, 1, pc_tree); - av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row, - mi_col); - for (plane = 0; plane < MAX_MB_PLANE; plane++) { - dst_buf[plane] = xd->plane[plane].dst.buf; - dst_stride[plane] = xd->plane[plane].dst.stride; - } - predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, 1, bsize, - bsize, dst_buf, dst_stride, pc_tree); - - set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); - set_segment_id_supertx(cpi, x, mi_row, mi_col, bsize); - - mbmi = &xd->mi[0]->mbmi; - best_tx_nostx = mbmi->tx_type; - - *best_tx = DCT_DCT; - - // chroma - skippable_uv = 1; - rate_uv = 0; - dist_uv = 0; - sse_uv = 0; - for (plane = 1; plane < MAX_MB_PLANE; ++plane) { -#if CONFIG_VAR_TX - ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE]; - ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE]; - const struct macroblockd_plane *const pd = &xd->plane[plane]; - RD_STATS this_rd_stats; - av1_init_rd_stats(&this_rd_stats); - - tx_size = max_txsize_lookup[bsize]; - tx_size = - uv_txsize_lookup[bsize][tx_size][cm->subsampling_x][cm->subsampling_y]; - av1_get_entropy_contexts(bsize, tx_size, pd, ctxa, ctxl); - - av1_subtract_plane(x, bsize, plane); - av1_tx_block_rd_b(cpi, x, tx_size, 0, 0, plane, 0, - get_plane_block_size(bsize, pd), &ctxa[0], &ctxl[0], - &this_rd_stats); - - this_rate = this_rd_stats.rate; - this_dist = this_rd_stats.dist; - pnsse = this_rd_stats.sse; - pnskip = this_rd_stats.skip; -#else - tx_size = max_txsize_lookup[bsize]; - tx_size = - uv_txsize_lookup[bsize][tx_size][cm->subsampling_x][cm->subsampling_y]; - av1_subtract_plane(x, bsize, plane); - av1_txfm_rd_in_plane_supertx(x, cpi, &this_rate, &this_dist, &pnskip, - &pnsse, INT64_MAX, 
plane, bsize, tx_size, 0);
-#endif  // CONFIG_VAR_TX
-
-    rate_uv += this_rate;
-    dist_uv += this_dist;
-    sse_uv += pnsse;
-    skippable_uv &= pnskip;
-  }
-
-  // luma
-  tx_size = max_txsize_lookup[bsize];
-  av1_subtract_plane(x, bsize, 0);
-#if CONFIG_EXT_TX
-  int ext_tx_set = get_ext_tx_set(tx_size, bsize, 1, cm->reduced_tx_set_used);
-  const TxSetType tx_set_type =
-      get_ext_tx_set_type(tx_size, bsize, 1, cm->reduced_tx_set_used);
-#endif  // CONFIG_EXT_TX
-  for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
-#if CONFIG_VAR_TX
-    ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
-    ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
-    const struct macroblockd_plane *const pd = &xd->plane[0];
-    RD_STATS this_rd_stats;
-#endif  // CONFIG_VAR_TX
-
-#if CONFIG_EXT_TX
-    if (!av1_ext_tx_used[tx_set_type][tx_type]) continue;
-#else
-    if (tx_size >= TX_32X32 && tx_type != DCT_DCT) continue;
-#endif  // CONFIG_EXT_TX
-    mbmi->tx_type = tx_type;
-
-#if CONFIG_VAR_TX
-    av1_init_rd_stats(&this_rd_stats);
-    av1_get_entropy_contexts(bsize, tx_size, pd, ctxa, ctxl);
-    av1_tx_block_rd_b(cpi, x, tx_size, 0, 0, 0, 0, bsize, &ctxa[0], &ctxl[0],
-                      &this_rd_stats);
-
-    this_rate = this_rd_stats.rate;
-    this_dist = this_rd_stats.dist;
-    pnsse = this_rd_stats.sse;
-    pnskip = this_rd_stats.skip;
-#else
-    av1_txfm_rd_in_plane_supertx(x, cpi, &this_rate, &this_dist, &pnskip,
-                                 &pnsse, INT64_MAX, 0, bsize, tx_size, 0);
-#endif  // CONFIG_VAR_TX
-
-#if CONFIG_EXT_TX
-    if (get_ext_tx_types(tx_size, bsize, 1, cm->reduced_tx_set_used) > 1 &&
-        !xd->lossless[xd->mi[0]->mbmi.segment_id] && this_rate != INT_MAX) {
-      if (ext_tx_set > 0)
-        this_rate +=
-            x->inter_tx_type_costs[ext_tx_set][mbmi->tx_size][mbmi->tx_type];
-    }
-#else
-    if (tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
-        this_rate != INT_MAX) {
-      this_rate += x->inter_tx_type_costs[tx_size][mbmi->tx_type];
-    }
-#endif  // CONFIG_EXT_TX
-    *tmp_rate = rate_uv + this_rate;
-    *tmp_dist = dist_uv + this_dist;
-    sse = sse_uv + pnsse;
-    skippable = skippable_uv && pnskip;
-    if (skippable) {
-      *tmp_rate = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
-      x->skip = 1;
-    } else {
-      if (RDCOST(x->rdmult, *tmp_rate, *tmp_dist) < RDCOST(x->rdmult, 0, sse)) {
-        *tmp_rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
-        x->skip = 0;
-      } else {
-        *tmp_dist = sse;
-        *tmp_rate = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
-        x->skip = 1;
-      }
-    }
-    *tmp_rate += base_rate;
-    rd_tx = RDCOST(x->rdmult, *tmp_rate, *tmp_dist);
-    if (rd_tx < bestrd_tx * 0.99 || tx_type == DCT_DCT) {
-      *best_tx = tx_type;
-      bestrd_tx = rd_tx;
-      tmp_rate_tx = *tmp_rate;
-      tmp_dist_tx = *tmp_dist;
-      skip_tx = x->skip;
-    }
-  }
-  *tmp_rate = tmp_rate_tx;
-  *tmp_dist = tmp_dist_tx;
-  x->skip = skip_tx;
-#if CONFIG_VAR_TX
-  for (plane = 0; plane < 1; ++plane)
-    memset(x->blk_skip[plane], x->skip,
-           sizeof(uint8_t) * pc_tree->none.num_4x4_blk);
-#endif  // CONFIG_VAR_TX
-  xd->mi[0]->mbmi.tx_type = best_tx_nostx;
-}
-#endif  // CONFIG_SUPERTX
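/* The skip decision in the deleted rd_supertx_sb above is the encoder's
 * standard rate-distortion trade-off: RDCOST() folds a rate in bits and a
 * distortion into a single cost via the multiplier x->rdmult. A minimal
 * sketch of the pattern (the names rd_coded/rd_skip and the *_bit_cost
 * variables are illustrative, standing in for the
 * av1_cost_bit(av1_get_skip_prob(cm, xd), ...) calls in the code above):
 *
 *   int64_t rd_coded = RDCOST(x->rdmult, rate + no_skip_bit_cost, dist);
 *   int64_t rd_skip = RDCOST(x->rdmult, skip_bit_cost, sse);
 *   if (rd_coded < rd_skip) {
 *     x->skip = 0;  // cheaper to transmit the quantized coefficients
 *   } else {
 *     x->skip = 1;  // cheaper to skip; distortion falls back to the raw SSE
 *     dist = sse;
 *   }
 */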
diff --git a/third_party/aom/av1/encoder/encodeframe.h b/third_party/aom/av1/encoder/encodeframe.h
index b54e54d25..62141dba4 100644
--- a/third_party/aom/av1/encoder/encodeframe.h
+++ b/third_party/aom/av1/encoder/encodeframe.h
@@ -20,6 +20,8 @@ extern "C" {
 #endif
 
+#define DELTAQ_MODULATION 0  // 0: variance based, 1: wavelet AC energy based
+
 struct macroblock;
 struct yv12_buffer_config;
 struct AV1_COMP;
@@ -27,7 +29,7 @@ struct ThreadData;
 void av1_setup_src_planes(struct macroblock *x,
                           const struct yv12_buffer_config *src, int mi_row,
-                          int mi_col);
+                          int mi_col, const int num_planes);
 
 void av1_encode_frame(struct AV1_COMP *cpi);
 
@@ -35,12 +37,6 @@ void av1_init_tile_data(struct AV1_COMP *cpi);
 void av1_encode_tile(struct AV1_COMP *cpi, struct ThreadData *td, int tile_row,
                      int tile_col);
 
-void av1_update_tx_type_count(const struct AV1Common *cm, MACROBLOCKD *xd,
-#if CONFIG_TXK_SEL
-                              int blk_row, int blk_col, int block, int plane,
-#endif
-                              BLOCK_SIZE bsize, TX_SIZE tx_size,
-                              FRAME_COUNTS *counts);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
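/* av1_setup_src_planes() above gains an explicit num_planes argument;
 * elsewhere in this patch the fixed MAX_MB_PLANE loop bounds are likewise
 * replaced by av1_num_planes(cm) (see av1_encode_sb() further down),
 * presumably so that monochrome input only touches the luma plane. A
 * plausible call site, sketched under that assumption (cpi->source being
 * the frame currently encoded):
 *
 *   const int num_planes = av1_num_planes(cm);
 *   av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes);
 */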
diff --git a/third_party/aom/av1/encoder/encodemb.c b/third_party/aom/av1/encoder/encodemb.c
index f35ce8a4f..cea8db6f9 100644
--- a/third_party/aom/av1/encoder/encodemb.c
+++ b/third_party/aom/av1/encoder/encodemb.c
@@ -9,15 +9,20 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./av1_rtcd.h"
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
 #include "aom_dsp/bitwriter.h"
 #include "aom_dsp/quantize.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
 
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif  // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/cfl.h"
 #include "av1/common/idct.h"
 #include "av1/common/reconinter.h"
 #include "av1/common/reconintra.h"
@@ -25,22 +30,10 @@
 #include "av1/encoder/av1_quantize.h"
 #include "av1/encoder/encodemb.h"
-#if CONFIG_LV_MAP
 #include "av1/encoder/encodetxb.h"
-#endif
 #include "av1/encoder/hybrid_fwd_txfm.h"
 #include "av1/encoder/rd.h"
-#include "av1/encoder/tokenize.h"
-
-#if CONFIG_PVQ
-#include "av1/encoder/encint.h"
-#include "av1/common/partition.h"
-#include "av1/encoder/pvq_encoder.h"
-#endif
-
-#if CONFIG_CFL
-#include "av1/common/cfl.h"
-#endif
+#include "av1/encoder/rdopt.h"
 
 // Check if one needs to use the C version of subtraction.
 static int check_subtract_block_size(int w, int h) { return w < 4 || h < 4; }
 
@@ -49,31 +42,23 @@ static void subtract_block(const MACROBLOCKD *xd, int rows, int cols,
                            int16_t *diff, ptrdiff_t diff_stride,
                            const uint8_t *src8, ptrdiff_t src_stride,
                            const uint8_t *pred8, ptrdiff_t pred_stride) {
-#if !CONFIG_HIGHBITDEPTH
-  (void)xd;
-#endif
-
   if (check_subtract_block_size(rows, cols)) {
-#if CONFIG_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       aom_highbd_subtract_block_c(rows, cols, diff, diff_stride, src8,
                                   src_stride, pred8, pred_stride, xd->bd);
       return;
     }
-#endif  // CONFIG_HIGHBITDEPTH
     aom_subtract_block_c(rows, cols, diff, diff_stride, src8, src_stride,
                          pred8, pred_stride);
 
     return;
   }
 
-#if CONFIG_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride,
                               pred8, pred_stride, xd->bd);
     return;
   }
-#endif  // CONFIG_HIGHBITDEPTH
   aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8,
                      pred_stride);
 }
 
@@ -101,7 +86,8 @@ void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
 void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
   struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
-  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+  const BLOCK_SIZE plane_bsize =
+      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
   const int bw = block_size_wide[plane_bsize];
   const int bh = block_size_high[plane_bsize];
   const MACROBLOCKD *xd = &x->e_mbd;
@@ -110,325 +96,26 @@ void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
                      pd->dst.buf, pd->dst.stride);
 }
 
-// Shifting negative values is undefined behaviour in C99,
-// and could mislead the optimizer, which might assume the shifted value is
-// positive. This also avoids ubsan warnings.
-// In practice, this gets inlined by the optimizer to a single instruction.
-static INLINE int signed_shift_right(int x, int shift) {
-  if (x >= 0)
-    return x >> shift;
-  else
-    return -((-x) >> shift);
-}
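/* signed_shift_right(), removed above along with the optimize_b_greedy
 * trellis below that used it, rounds toward zero, whereas a plain
 * arithmetic right shift of a negative value (where the implementation
 * defines one at all) rounds toward negative infinity. A quick worked
 * example of the difference:
 *
 *   signed_shift_right(-7, 1) == -((7) >> 1) == -3
 *   (-7) >> 1                 == -4 on typical two's-complement targets
 *
 * so the helper was not only avoiding non-portable shifts and ubsan
 * warnings; it also fixed the rounding of the distortion term dx that it
 * was applied to.
 */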
-
-#if !CONFIG_LV_MAP
-// These numbers are empirically obtained.
-static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
-  { 10, 7 }, { 8, 5 },
-};
-
-static int optimize_b_greedy(const AV1_COMMON *cm, MACROBLOCK *mb, int plane,
-                             int blk_row, int blk_col, int block,
-                             TX_SIZE tx_size, int ctx) {
+int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
+                   int block, TX_SIZE tx_size, TX_TYPE tx_type,
+                   const TXB_CTX *const txb_ctx, int fast_mode,
+                   int *rate_cost) {
   MACROBLOCKD *const xd = &mb->e_mbd;
   struct macroblock_plane *const p = &mb->plane[plane];
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  const PLANE_TYPE plane_type = pd->plane_type;
   const int eob = p->eobs[block];
-  assert(mb->qindex > 0);
-  assert((!plane_type && !plane) || (plane_type && plane));
-  assert(eob <= tx_size_2d[tx_size]);
-  const int ref = is_inter_block(&xd->mi[0]->mbmi);
-  const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
-  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  const int16_t *const dequant_ptr = pd->dequant;
-  const uint8_t *const band_translate = get_band_translate(tx_size);
-  const TX_TYPE tx_type =
-      av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
-  const SCAN_ORDER *const scan_order =
-      get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi);
-  const int16_t *const scan = scan_order->scan;
-  const int16_t *const nb = scan_order->neighbors;
-  const int shift = av1_get_tx_scale(tx_size);
-#if CONFIG_AOM_QM
-  int seg_id = xd->mi[0]->mbmi.segment_id;
-  // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms
-  const qm_val_t *iqmatrix =
-      IS_2D_TRANSFORM(tx_type)
-          ? pd->seg_iqmatrix[seg_id][!ref][tx_size]
-          : cm->giqmatrix[NUM_QM_LEVELS - 1][0][0][tx_size];
-#endif
-#if CONFIG_NEW_QUANT
-  int dq = get_dq_profile_from_ctx(mb->qindex, ctx, ref, plane_type);
-  const dequant_val_type_nuq *dequant_val = pd->dequant_val_nuq[dq];
-#endif  // CONFIG_NEW_QUANT
-  int64_t rd_cost0, rd_cost1;
-  int16_t t0, t1;
-  int i, final_eob = 0;
-  const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, xd->bd);
-  int(*head_token_costs)[COEFF_CONTEXTS][TAIL_TOKENS] =
-      mb->token_head_costs[txsize_sqr_map[tx_size]][plane_type][ref];
-  int(*tail_token_costs)[COEFF_CONTEXTS][TAIL_TOKENS] =
-      mb->token_tail_costs[txsize_sqr_map[tx_size]][plane_type][ref];
-  const int64_t rdmult = (mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1;
-  int64_t rate0, rate1;
-  int64_t eob_cost0, eob_cost1;
-  tran_low_t before_best_eob_qc = 0;
-  tran_low_t before_best_eob_dqc = 0;
-
-  uint8_t token_cache[MAX_TX_SQUARE];
-  for (i = 0; i < eob; i++) {
-    const int rc = scan[i];
-    token_cache[rc] = av1_pt_energy_class[av1_get_token(qcoeff[rc])];
-  }
-
-  /* Record the r-d cost */
-  int64_t accu_rate = 0;
-  // Initialized to the worst possible error for the largest transform size.
-  // This ensures that it never goes negative.
-  int64_t accu_error = ((int64_t)1) << 50;
-  rate0 = head_token_costs[0][ctx][0];
-  int64_t best_block_rd_cost = RDCOST(rdmult, rate0, accu_error);
-
-  // int64_t best_block_rd_cost_all0 = best_block_rd_cost;
-  const int seg_eob =
-      av1_get_tx_eob(&cm->seg, xd->mi[0]->mbmi.segment_id, tx_size);
-  for (i = 0; i < eob; i++) {
-    const int rc = scan[i];
-    const int x = qcoeff[rc];
-    const int sz = -(x < 0);
-    const int band_cur = band_translate[i];
-    const int ctx_cur = (i == 0) ? ctx : get_coef_context(nb, token_cache, i);
-    const int eob_val =
-        (i + 1 == eob) ? (i + 1 == seg_eob ?
LAST_EOB : EARLY_EOB) : NO_EOB; - const int is_first = (i == 0); - - if (x == 0) { - // no need to search when x == 0 - accu_rate += av1_get_coeff_token_cost( - ZERO_TOKEN, eob_val, is_first, head_token_costs[band_cur][ctx_cur], - tail_token_costs[band_cur][ctx_cur]); - // accu_error does not change when x==0 - } else { - /* Computing distortion - */ - // compute the distortion for the first candidate - // and the distortion for quantizing to 0. - int dx0 = abs(coeff[rc]) * (1 << shift); - dx0 >>= xd->bd - 8; - - const int64_t d0 = (int64_t)dx0 * dx0; - const int x_a = x - 2 * sz - 1; - int dqv; -#if CONFIG_AOM_QM - int iwt; - dqv = dequant_ptr[rc != 0]; - if (iqmatrix != NULL) { - iwt = iqmatrix[rc]; - dqv = ((iwt * (int)dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; - } -#else - dqv = dequant_ptr[rc != 0]; -#endif + const int segment_id = xd->mi[0]->segment_id; - int dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift); - dx = signed_shift_right(dx, xd->bd - 8); - const int64_t d2 = (int64_t)dx * dx; - - /* compute the distortion for the second candidate - * x_a = x - 2 * sz + 1; - */ - int64_t d2_a; - if (x_a != 0) { -#if CONFIG_NEW_QUANT - dx = av1_dequant_coeff_nuq(x, dqv, dequant_val[band_translate[i]]) - - (coeff[rc] * (1 << shift)); - dx >>= xd->bd - 8; -#else // CONFIG_NEW_QUANT - dx -= ((dqv >> (xd->bd - 8)) + sz) ^ sz; -#endif // CONFIG_NEW_QUANT - d2_a = (int64_t)dx * dx; - } else { - d2_a = d0; - } - // Computing RD cost - int64_t base_bits; - // rate cost of x - base_bits = av1_get_token_cost(x, &t0, cat6_bits); - rate0 = base_bits + - av1_get_coeff_token_cost(t0, eob_val, is_first, - head_token_costs[band_cur][ctx_cur], - tail_token_costs[band_cur][ctx_cur]); - // rate cost of x_a - base_bits = av1_get_token_cost(x_a, &t1, cat6_bits); - if (t1 == ZERO_TOKEN && eob_val) { - rate1 = base_bits; - } else { - rate1 = base_bits + - av1_get_coeff_token_cost(t1, eob_val, is_first, - head_token_costs[band_cur][ctx_cur], - tail_token_costs[band_cur][ctx_cur]); - } - - int64_t next_bits0 = 0, next_bits1 = 0; - if (i < eob - 1) { - int ctx_next; - const int band_next = band_translate[i + 1]; - const int token_next = av1_get_token(qcoeff[scan[i + 1]]); - const int eob_val_next = - (i + 2 == eob) ? (i + 2 == seg_eob ? LAST_EOB : EARLY_EOB) : NO_EOB; - - token_cache[rc] = av1_pt_energy_class[t0]; - ctx_next = get_coef_context(nb, token_cache, i + 1); - next_bits0 = av1_get_coeff_token_cost( - token_next, eob_val_next, 0, head_token_costs[band_next][ctx_next], - tail_token_costs[band_next][ctx_next]); - - token_cache[rc] = av1_pt_energy_class[t1]; - ctx_next = get_coef_context(nb, token_cache, i + 1); - next_bits1 = av1_get_coeff_token_cost( - token_next, eob_val_next, 0, head_token_costs[band_next][ctx_next], - tail_token_costs[band_next][ctx_next]); - } - - rd_cost0 = RDCOST(rdmult, (rate0 + next_bits0), d2); - rd_cost1 = RDCOST(rdmult, (rate1 + next_bits1), d2_a); - const int best_x = (rd_cost1 < rd_cost0); - - const int eob_v = (i + 1 == seg_eob) ? 
LAST_EOB : EARLY_EOB; - int64_t next_eob_bits0, next_eob_bits1; - int best_eob_x; - next_eob_bits0 = av1_get_coeff_token_cost( - t0, eob_v, is_first, head_token_costs[band_cur][ctx_cur], - tail_token_costs[band_cur][ctx_cur]); - eob_cost0 = - RDCOST(rdmult, (accu_rate + next_eob_bits0), (accu_error + d2 - d0)); - eob_cost1 = eob_cost0; - if (x_a != 0) { - next_eob_bits1 = av1_get_coeff_token_cost( - t1, eob_v, is_first, head_token_costs[band_cur][ctx_cur], - tail_token_costs[band_cur][ctx_cur]); - eob_cost1 = RDCOST(rdmult, (accu_rate + next_eob_bits1), - (accu_error + d2_a - d0)); - best_eob_x = (eob_cost1 < eob_cost0); - } else { - best_eob_x = 0; - } - - const int dqc = dqcoeff[rc]; - int dqc_a = 0; - if (best_x || best_eob_x) { - if (x_a != 0) { -#if CONFIG_NEW_QUANT - dqc_a = av1_dequant_abscoeff_nuq(abs(x_a), dqv, - dequant_val[band_translate[i]]); - dqc_a = shift ? ROUND_POWER_OF_TWO(dqc_a, shift) : dqc_a; - if (sz) dqc_a = -dqc_a; -#else - if (x_a < 0) - dqc_a = -((-x_a * dqv) >> shift); - else - dqc_a = (x_a * dqv) >> shift; -#endif // CONFIG_NEW_QUANT - } else { - dqc_a = 0; - } // if (x_a != 0) - } - - // record the better quantized value - if (best_x) { - assert(d2_a <= d0); - qcoeff[rc] = x_a; - dqcoeff[rc] = dqc_a; - accu_rate += rate1; - accu_error += d2_a - d0; - token_cache[rc] = av1_pt_energy_class[t1]; - } else { - assert(d2 <= d0); - accu_rate += rate0; - accu_error += d2 - d0; - token_cache[rc] = av1_pt_energy_class[t0]; - } - assert(accu_error >= 0); - - // determine whether to move the eob position to i+1 - const int use_a = (x_a != 0) && (best_eob_x); - const int64_t best_eob_cost_i = use_a ? eob_cost1 : eob_cost0; - if (best_eob_cost_i < best_block_rd_cost) { - best_block_rd_cost = best_eob_cost_i; - final_eob = i + 1; - if (use_a) { - before_best_eob_qc = x_a; - before_best_eob_dqc = dqc_a; - } else { - before_best_eob_qc = x; - before_best_eob_dqc = dqc; - } - } - } // if (x==0) - } // for (i) - - assert(final_eob <= eob); - if (final_eob > 0) { - assert(before_best_eob_qc != 0); - i = final_eob - 1; - int rc = scan[i]; - qcoeff[rc] = before_best_eob_qc; - dqcoeff[rc] = before_best_eob_dqc; - } - - for (i = final_eob; i < eob; i++) { - int rc = scan[i]; - qcoeff[rc] = 0; - dqcoeff[rc] = 0; + if (eob == 0 || !cpi->optimize_seg_arr[segment_id] || + xd->lossless[segment_id]) { + *rate_cost = av1_cost_skip_txb(mb, txb_ctx, plane, tx_size); + return eob; } - p->eobs[block] = final_eob; - return final_eob; -} -#endif // !CONFIG_LV_MAP - -int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int blk_row, - int blk_col, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, const ENTROPY_CONTEXT *a, - const ENTROPY_CONTEXT *l, int fast_mode) { - MACROBLOCKD *const xd = &mb->e_mbd; - struct macroblock_plane *const p = &mb->plane[plane]; - const int eob = p->eobs[block]; - assert((mb->qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0)); - if (eob == 0) return eob; - if (xd->lossless[xd->mi[0]->mbmi.segment_id]) return eob; - -#if CONFIG_PVQ - (void)cm; - (void)tx_size; - (void)a; - (void)l; - return eob; -#endif - -#if !CONFIG_LV_MAP - (void)plane_bsize; - (void)blk_row; - (void)blk_col; (void)fast_mode; -#if CONFIG_VAR_TX - int ctx = get_entropy_context(tx_size, a, l); -#else - int ctx = combine_entropy_contexts(*a, *l); -#endif // CONFIG_VAR_TX - return optimize_b_greedy(cm, mb, plane, blk_row, blk_col, block, tx_size, - ctx); -#else // !CONFIG_LV_MAP - TXB_CTX txb_ctx; - get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); - return 
av1_optimize_txb(cm, mb, plane, blk_row, blk_col, block, tx_size, - &txb_ctx, fast_mode); -#endif // !CONFIG_LV_MAP + return av1_optimize_txb_new(cpi, mb, plane, block, tx_size, tx_type, txb_ctx, + rate_cost, cpi->oxcf.sharpness); } -#if !CONFIG_PVQ typedef enum QUANT_FUNC { QUANT_FUNC_LOWBD = 0, QUANT_FUNC_HIGHBD = 1, @@ -437,394 +124,231 @@ typedef enum QUANT_FUNC { static AV1_QUANT_FACADE quant_func_list[AV1_XFORM_QUANT_TYPES][QUANT_FUNC_TYPES] = { -#if !CONFIG_NEW_QUANT { av1_quantize_fp_facade, av1_highbd_quantize_fp_facade }, { av1_quantize_b_facade, av1_highbd_quantize_b_facade }, { av1_quantize_dc_facade, av1_highbd_quantize_dc_facade }, -#else // !CONFIG_NEW_QUANT - { av1_quantize_fp_nuq_facade, av1_highbd_quantize_fp_nuq_facade }, - { av1_quantize_b_nuq_facade, av1_highbd_quantize_b_nuq_facade }, - { av1_quantize_dc_nuq_facade, av1_highbd_quantize_dc_nuq_facade }, -#endif // !CONFIG_NEW_QUANT { NULL, NULL } }; -#endif // !CONFIG_PVQ - -#if !CONFIG_TXMG && !CONFIG_PVQ -typedef void (*fwdTxfmFunc)(const int16_t *diff, tran_low_t *coeff, int stride, - TxfmParam *txfm_param); -static const fwdTxfmFunc fwd_txfm_func[2] = { av1_fwd_txfm, - av1_highbd_fwd_txfm }; -#endif void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, int ctx, + TX_SIZE tx_size, TX_TYPE tx_type, AV1_XFORM_QUANT xform_quant_idx) { MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; -#if !(CONFIG_PVQ || CONFIG_DIST_8X8) + MB_MODE_INFO *const mbmi = xd->mi[0]; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; -#else - struct macroblock_plane *const p = &x->plane[plane]; - struct macroblockd_plane *const pd = &xd->plane[plane]; -#endif - PLANE_TYPE plane_type = get_plane_type(plane); - TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); - -#if (CONFIG_AOM_QM || CONFIG_NEW_QUANT) && !CONFIG_PVQ - const int is_inter = is_inter_block(mbmi); -#endif + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); - const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi); tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); uint16_t *const eob = &p->eobs[block]; const int diff_stride = block_size_wide[plane_bsize]; -#if CONFIG_AOM_QM && !CONFIG_PVQ int seg_id = mbmi->segment_id; + const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size); // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms const qm_val_t *qmatrix = - IS_2D_TRANSFORM(tx_type) ? pd->seg_qmatrix[seg_id][!is_inter][tx_size] - : cm->gqmatrix[NUM_QM_LEVELS - 1][0][0][tx_size]; + IS_2D_TRANSFORM(tx_type) ? pd->seg_qmatrix[seg_id][qm_tx_size] + : cm->gqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size]; const qm_val_t *iqmatrix = IS_2D_TRANSFORM(tx_type) - ? pd->seg_iqmatrix[seg_id][!is_inter][tx_size] - : cm->giqmatrix[NUM_QM_LEVELS - 1][0][0][tx_size]; -#endif + ? 
pd->seg_iqmatrix[seg_id][qm_tx_size] + : cm->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size]; - TxfmParam txfm_param; - -#if CONFIG_PVQ || CONFIG_DIST_8X8 || CONFIG_LGT_FROM_PRED || CONFIG_MRC_TX - uint8_t *dst; - const int dst_stride = pd->dst.stride; -#if CONFIG_PVQ || CONFIG_DIST_8X8 - int16_t *pred; - const int txw = tx_size_wide[tx_size]; - const int txh = tx_size_high[tx_size]; - int i, j; -#endif -#endif - -#if !CONFIG_PVQ - const int tx2d_size = tx_size_2d[tx_size]; + const int src_offset = (blk_row * diff_stride + blk_col); + const int16_t *src_diff = &p->src_diff[src_offset << tx_size_wide_log2[0]]; QUANT_PARAM qparam; - const int16_t *src_diff; - - src_diff = - &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]]; qparam.log_scale = av1_get_tx_scale(tx_size); -#if CONFIG_NEW_QUANT qparam.tx_size = tx_size; - qparam.dq = get_dq_profile_from_ctx(x->qindex, ctx, is_inter, plane_type); -#endif // CONFIG_NEW_QUANT -#if CONFIG_AOM_QM qparam.qmatrix = qmatrix; qparam.iqmatrix = iqmatrix; -#endif // CONFIG_AOM_QM -#else - tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block); - int skip = 1; - PVQ_INFO *pvq_info = NULL; - uint8_t *src; - int16_t *src_int16; - const int src_stride = p->src.stride; - - (void)ctx; - (void)scan_order; - (void)qcoeff; - - if (x->pvq_coded) { - assert(block < MAX_PVQ_BLOCKS_IN_SB); - pvq_info = &x->pvq[block][plane]; - } - src = &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]]; - src_int16 = - &p->src_int16[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]]; - -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - for (j = 0; j < txh; j++) - for (i = 0; i < txw; i++) - src_int16[diff_stride * j + i] = - CONVERT_TO_SHORTPTR(src)[src_stride * j + i]; - } else { -#endif // CONFIG_HIGHBITDEPTH - for (j = 0; j < txh; j++) - for (i = 0; i < txw; i++) - src_int16[diff_stride * j + i] = src[src_stride * j + i]; -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH -#endif - -#if CONFIG_PVQ || CONFIG_DIST_8X8 || CONFIG_LGT_FROM_PRED || CONFIG_MRC_TX - dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; -#endif // CONFIG_PVQ || CONFIG_DIST_8X8 || CONFIG_LGT_FROM_PRED || - // CONFIG_MRC_TX - -#if CONFIG_PVQ || CONFIG_DIST_8X8 - if (CONFIG_PVQ -#if CONFIG_DIST_8X8 - || x->using_dist_8x8 -#endif // CONFIG_DIST_8X8 - ) { - pred = &pd->pred[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]]; - -// copy uint8 orig and predicted block to int16 buffer -// in order to use existing VP10 transform functions -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - for (j = 0; j < txh; j++) - for (i = 0; i < txw; i++) - pred[diff_stride * j + i] = - CONVERT_TO_SHORTPTR(dst)[dst_stride * j + i]; - } else { -#endif // CONFIG_HIGHBITDEPTH - for (j = 0; j < txh; j++) - for (i = 0; i < txw; i++) - pred[diff_stride * j + i] = dst[dst_stride * j + i]; -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_PVQ || CONFIG_DIST_8X8 - - (void)ctx; - + TxfmParam txfm_param; txfm_param.tx_type = tx_type; txfm_param.tx_size = tx_size; txfm_param.lossless = xd->lossless[mbmi->segment_id]; -#if CONFIG_MRC_TX || CONFIG_LGT - txfm_param.is_inter = is_inter_block(mbmi); -#endif -#if CONFIG_MRC_TX || CONFIG_LGT_FROM_PRED - txfm_param.dst = dst; - txfm_param.stride = dst_stride; -#if CONFIG_MRC_TX - txfm_param.valid_mask = &mbmi->valid_mrc_mask; -#if SIGNAL_ANY_MRC_MASK - txfm_param.mask = BLOCK_OFFSET(xd->mrc_mask, block); -#endif // 
SIGNAL_ANY_MRC_MASK
-#endif  // CONFIG_MRC_TX
-#if CONFIG_LGT_FROM_PRED
-  txfm_param.mode = mbmi->mode;
-  txfm_param.use_lgt = mbmi->use_lgt;
-#endif  // CONFIG_LGT_FROM_PRED
-#endif  // CONFIG_MRC_TX || CONFIG_LGT_FROM_PRED
-
-#if !CONFIG_PVQ
+  txfm_param.tx_set_type = av1_get_ext_tx_set_type(
+      txfm_param.tx_size, is_inter_block(mbmi), cm->reduced_tx_set_used);
+  txfm_param.bd = xd->bd;
-  const int is_hbd = get_bitdepth_data_path_index(xd);
+  txfm_param.is_hbd = get_bitdepth_data_path_index(xd);
-#if CONFIG_TXMG
-  av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, &txfm_param);
-#else   // CONFIG_TXMG
-  fwd_txfm_func[is_hbd](src_diff, coeff, diff_stride, &txfm_param);
-#endif  // CONFIG_TXMG
+  av1_fwd_txfm(src_diff, coeff, diff_stride, &txfm_param);
 
   if (xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
+    const int n_coeffs = av1_get_max_eob(tx_size);
     if (LIKELY(!x->skip_block)) {
-      quant_func_list[xform_quant_idx][is_hbd](
-          coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order, &qparam);
+      quant_func_list[xform_quant_idx][txfm_param.is_hbd](
+          coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, &qparam);
     } else {
-      av1_quantize_skip(tx2d_size, qcoeff, dqcoeff, eob);
+      av1_quantize_skip(n_coeffs, qcoeff, dqcoeff, eob);
     }
   }
-#if CONFIG_LV_MAP
-  p->txb_entropy_ctx[block] =
-      (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, *eob);
-#endif  // CONFIG_LV_MAP
-  return;
-#else   // CONFIG_PVQ
-  (void)xform_quant_idx;
-#if CONFIG_HIGHBITDEPTH
-  txfm_param.bd = xd->bd;
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    av1_highbd_fwd_txfm(src_int16, coeff, diff_stride, &txfm_param);
-    av1_highbd_fwd_txfm(pred, ref_coeff, diff_stride, &txfm_param);
+  // NOTE: optimize_b_following being true means av1_optimize_b will be called.
+  // When the condition for doing optimize_b is changed,
+  // this flag needs to be updated simultaneously.
+  const int optimize_b_following =
+      (xform_quant_idx != AV1_XFORM_QUANT_FP) || (txfm_param.lossless);
+  if (optimize_b_following) {
+    p->txb_entropy_ctx[block] =
+        (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, *eob);
   } else {
-#endif
-    av1_fwd_txfm(src_int16, coeff, diff_stride, &txfm_param);
-    av1_fwd_txfm(pred, ref_coeff, diff_stride, &txfm_param);
-#if CONFIG_HIGHBITDEPTH
-  }
-#endif
-
-  // PVQ for inter mode block
-  if (!x->skip_block) {
-    PVQ_SKIP_TYPE ac_dc_coded =
-        av1_pvq_encode_helper(x,
-                              coeff,        // target original vector
-                              ref_coeff,    // reference vector
-                              dqcoeff,      // de-quantized vector
-                              eob,          // End of Block marker
-                              pd->dequant,  // aom's quantizers
-                              plane,        // image plane
-                              tx_size,      // block size in log_2 - 2
-                              tx_type,
-                              &x->rate,     // rate measured
-                              x->pvq_speed,
-                              pvq_info);    // PVQ info for a block
-    skip = ac_dc_coded == PVQ_SKIP;
+    p->txb_entropy_ctx[block] = 0;
   }
-  x->pvq_skip[plane] = skip;
-
-  if (!skip) mbmi->skip = 0;
-#endif  // #if !CONFIG_PVQ
+  return;
 }
 
 static void encode_block(int plane, int block, int blk_row, int blk_col,
-                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
+                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg,
+                         int mi_row, int mi_col, RUN_TYPE dry_run) {
+  (void)mi_row;
+  (void)mi_col;
+  (void)dry_run;
   struct encode_b_args *const args = arg;
-  AV1_COMMON *cm = args->cm;
+  const AV1_COMMON *const cm = &args->cpi->common;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
-  int ctx;
+  MB_MODE_INFO *mbmi = xd->mi[0];
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-#if CONFIG_MRC_TX
&& SIGNAL_ANY_MRC_MASK - uint8_t *mrc_mask = BLOCK_OFFSET(xd->mrc_mask, block); -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK uint8_t *dst; -#if !CONFIG_PVQ ENTROPY_CONTEXT *a, *l; -#endif -#if CONFIG_VAR_TX - int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; -#endif + int dummy_rate_cost = 0; + + const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; dst = &pd->dst .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]]; -#if !CONFIG_PVQ a = &args->ta[blk_col]; l = &args->tl[blk_row]; -#if CONFIG_VAR_TX - ctx = get_entropy_context(tx_size, a, l); -#else - ctx = combine_entropy_contexts(*a, *l); -#endif -#else - ctx = 0; -#endif // CONFIG_PVQ - -#if CONFIG_VAR_TX // Assert not magic number (uninitialized). - assert(x->blk_skip[plane][blk_row * bw + blk_col] != 234); - - if (x->blk_skip[plane][blk_row * bw + blk_col] == 0) -#endif - { - av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, - ctx, AV1_XFORM_QUANT_FP); - } -#if CONFIG_VAR_TX - else { + assert(plane != 0 || x->blk_skip[blk_row * bw + blk_col] != 234); + + if ((plane != 0 || x->blk_skip[blk_row * bw + blk_col] == 0) && + !mbmi->skip_mode) { + TX_TYPE tx_type = av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col, + tx_size, cm->reduced_tx_set_used); + if (args->enable_optimize_b) { + av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, + tx_size, tx_type, AV1_XFORM_QUANT_FP); + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); + av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, 1, + &dummy_rate_cost); + } else { + av1_xform_quant( + cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type, + USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP); + } + } else { p->eobs[block] = 0; + p->txb_entropy_ctx[block] = 0; } -#endif - -#if !CONFIG_PVQ - av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size, a, - l, 0); av1_set_txb_context(x, plane, block, tx_size, a, l); - if (p->eobs[block]) *(args->skip) = 0; + if (p->eobs[block]) { + *(args->skip) = 0; - if (p->eobs[block] != 0) -#else - (void)ctx; - if (!x->pvq_skip[plane]) *(args->skip) = 0; + TX_TYPE tx_type = av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col, + tx_size, cm->reduced_tx_set_used); + av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, + pd->dst.stride, p->eobs[block], + cm->reduced_tx_set_used); + } - if (!x->pvq_skip[plane]) -#endif - { -#if CONFIG_LGT_FROM_PRED - PREDICTION_MODE mode = xd->mi[0]->mbmi.mode; -#endif // CONFIG_LGT_FROM_PRED - TX_TYPE tx_type = - av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col, block, tx_size); - av1_inverse_transform_block(xd, dqcoeff, -#if CONFIG_LGT_FROM_PRED - mode, + if (p->eobs[block] == 0 && plane == 0) { + // TODO(debargha, jingning): Temporarily disable txk_type check for eob=0 + // case. It is possible that certain collision in hash index would cause + // the assertion failure. To further optimize the rate-distortion + // performance, we need to re-visit this part and enable this assert + // again. +#if 0 + if (args->cpi->oxcf.aq_mode == NO_AQ && + args->cpi->oxcf.deltaq_mode == NO_DELTA_Q) { + // TODO(jingning,angiebird,huisu@google.com): enable txk_check when + // enable_optimize_b is true to detect potential RD bug. 
+ const uint8_t disable_txk_check = args->enable_optimize_b; + if (!disable_txk_check) { + assert(mbmi->txk_type[av1_get_txk_type_index(plane_bsize, blk_row, + blk_col)] == DCT_DCT); + } + } #endif -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - mrc_mask, -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - tx_type, tx_size, dst, pd->dst.stride, - p->eobs[block]); + update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, + DCT_DCT); } + +#if CONFIG_MISMATCH_DEBUG + if (dry_run == OUTPUT_ENABLED) { + int pixel_c, pixel_r; + BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; + int blk_w = block_size_wide[bsize]; + int blk_h = block_size_high[bsize]; + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, blk_col, blk_row, + pd->subsampling_x, pd->subsampling_y); + mismatch_record_block_tx(dst, pd->dst.stride, cm->frame_offset, plane, + pixel_c, pixel_r, blk_w, blk_h, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } +#endif } -#if CONFIG_VAR_TX static void encode_block_inter(int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - void *arg) { + void *arg, int mi_row, int mi_col, + RUN_TYPE dry_run) { + (void)mi_row; + (void)mi_col; struct encode_b_args *const args = arg; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; + MB_MODE_INFO *const mbmi = xd->mi[0]; const struct macroblockd_plane *const pd = &xd->plane[plane]; - const int tx_row = blk_row >> (1 - pd->subsampling_y); - const int tx_col = blk_col >> (1 - pd->subsampling_x); - TX_SIZE plane_tx_size; const int max_blocks_high = max_block_high(xd, plane_bsize, plane); const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; - plane_tx_size = - plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0] - : mbmi->inter_tx_size[tx_row][tx_col]; + const TX_SIZE plane_tx_size = + plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x, + pd->subsampling_y) + : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row, + blk_col)]; + if (!plane) { + assert(tx_size_wide[tx_size] >= tx_size_wide[plane_tx_size] && + tx_size_high[tx_size] >= tx_size_high[plane_tx_size]); + } - if (tx_size == plane_tx_size) { - encode_block(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); + if (tx_size == plane_tx_size || plane) { + encode_block(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg, + mi_row, mi_col, dry_run); } else { assert(tx_size < TX_SIZES_ALL); -#if CONFIG_RECT_TX_EXT - int is_qttx = plane_tx_size == quarter_txsize_lookup[plane_bsize]; - const TX_SIZE sub_txs = is_qttx ? plane_tx_size : sub_tx_size_map[tx_size]; - if (is_qttx) assert(blk_row == 0 && blk_col == 0 && block == 0); -#else const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; assert(IMPLIES(tx_size <= TX_4X4, sub_txs == tx_size)); assert(IMPLIES(tx_size > TX_4X4, sub_txs < tx_size)); -#endif // This is the square transform block partition entry point. - int bsl = tx_size_wide_unit[sub_txs]; - int i; - assert(bsl > 0); - - for (i = 0; i < 4; ++i) { -#if CONFIG_RECT_TX_EXT - int is_wide_tx = tx_size_wide_unit[sub_txs] > tx_size_high_unit[sub_txs]; - const int offsetr = - is_qttx ? (is_wide_tx ? i * tx_size_high_unit[sub_txs] : 0) - : blk_row + ((i >> 1) * bsl); - const int offsetc = - is_qttx ? (is_wide_tx ? 
0 : i * tx_size_wide_unit[sub_txs]) - : blk_col + ((i & 0x01) * bsl); -#else - const int offsetr = blk_row + ((i >> 1) * bsl); - const int offsetc = blk_col + ((i & 0x01) * bsl); -#endif - int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs]; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + const int step = bsh * bsw; + assert(bsw > 0 && bsh > 0); - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { + for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { + const int offsetr = blk_row + row; + const int offsetc = blk_col + col; - encode_block_inter(plane, block, offsetr, offsetc, plane_bsize, sub_txs, - arg); - block += step; + if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + + encode_block_inter(plane, block, offsetr, offsetc, plane_bsize, sub_txs, + arg, mi_row, mi_col, dry_run); + block += step; + } } } } -#endif typedef struct encode_block_pass1_args { AV1_COMMON *cm; @@ -843,57 +367,25 @@ static void encode_block_pass1(int plane, int block, int blk_row, int blk_col, tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); TxfmParam txfm_param; uint8_t *dst; - int ctx = 0; dst = &pd->dst .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]]; - av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, - ctx, AV1_XFORM_QUANT_B); -#if CONFIG_PVQ - if (!x->pvq_skip[plane]) { - int tx_blk_size; - int i, j; - // transform block size in pixels - tx_blk_size = tx_size_wide[tx_size]; - -// Since av1 does not have separate function which does inverse transform -// but av1_inv_txfm_add_*x*() also does addition of predicted image to -// inverse transformed image, -// pass blank dummy image to av1_inv_txfm_add_*x*(), i.e. 
set dst as zeros -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - for (j = 0; j < tx_blk_size; j++) - for (i = 0; i < tx_blk_size; i++) - CONVERT_TO_SHORTPTR(dst)[j * pd->dst.stride + i] = 0; - } else { -#endif // CONFIG_HIGHBITDEPTH - for (j = 0; j < tx_blk_size; j++) - for (i = 0; i < tx_blk_size; i++) dst[j * pd->dst.stride + i] = 0; -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_PVQ + DCT_DCT, AV1_XFORM_QUANT_B); -#if !CONFIG_PVQ - if (p->eobs[block] > 0) -#endif - { + if (p->eobs[block] > 0) { txfm_param.bd = xd->bd; + txfm_param.is_hbd = get_bitdepth_data_path_index(xd); txfm_param.tx_type = DCT_DCT; + txfm_param.tx_size = tx_size; txfm_param.eob = p->eobs[block]; - txfm_param.lossless = xd->lossless[xd->mi[0]->mbmi.segment_id]; -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + txfm_param.lossless = xd->lossless[xd->mi[0]->segment_id]; + txfm_param.tx_set_type = av1_get_ext_tx_set_type( + txfm_param.tx_size, is_inter_block(xd->mi[0]), cm->reduced_tx_set_used); + if (txfm_param.is_hbd) { av1_highbd_inv_txfm_add_4x4(dqcoeff, dst, pd->dst.stride, &txfm_param); return; } -#endif // CONFIG_HIGHBITDEPTH - if (xd->lossless[xd->mi[0]->mbmi.segment_id]) { - av1_iwht4x4_add(dqcoeff, dst, pd->dst.stride, &txfm_param); - } else { - av1_idct4x4_add(dqcoeff, dst, pd->dst.stride, &txfm_param); - } + av1_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param); } } @@ -904,20 +396,28 @@ void av1_encode_sby_pass1(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize) { encode_block_pass1, &args); } -void av1_encode_sb(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, - int mi_col) { +void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + int mi_row, int mi_col, RUN_TYPE dry_run) { + (void)dry_run; + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; struct optimize_ctx ctx; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - struct encode_b_args arg = { cm, x, &ctx, &mbmi->skip, NULL, NULL, 1 }; + MB_MODE_INFO *mbmi = xd->mi[0]; + struct encode_b_args arg = { cpi, + x, + &ctx, + &mbmi->skip, + NULL, + NULL, + cpi->optimize_seg_arr[mbmi->segment_id] }; int plane; mbmi->skip = 1; if (x->skip) return; - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { -#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 + for (plane = 0; plane < num_planes; ++plane) { const int subsampling_x = xd->plane[plane].subsampling_x; const int subsampling_y = xd->plane[plane].subsampling_y; @@ -925,41 +425,32 @@ void av1_encode_sb(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, subsampling_y)) continue; - bsize = scale_chroma_bsize(bsize, subsampling_x, subsampling_y); -#else - (void)mi_row; - (void)mi_col; -#endif + const BLOCK_SIZE bsizec = + scale_chroma_bsize(bsize, subsampling_x, subsampling_y); -#if CONFIG_VAR_TX // TODO(jingning): Clean this up. 
const struct macroblockd_plane *const pd = &xd->plane[plane]; - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsizec, pd->subsampling_x, pd->subsampling_y); const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; - const int mi_height = block_size_high[plane_bsize] >> tx_size_wide_log2[0]; - const TX_SIZE max_tx_size = get_vartx_max_txsize( - mbmi, plane_bsize, pd->subsampling_x || pd->subsampling_y); + const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0]; + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane); + const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size]; const int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0]; - const int bh = block_size_high[txb_size] >> tx_size_wide_log2[0]; + const int bh = block_size_high[txb_size] >> tx_size_high_log2[0]; int idx, idy; int block = 0; int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; - av1_get_entropy_contexts(bsize, 0, pd, ctx.ta[plane], ctx.tl[plane]); -#else - const struct macroblockd_plane *const pd = &xd->plane[plane]; - const TX_SIZE tx_size = av1_get_tx_size(plane, xd); - av1_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]); -#endif + av1_get_entropy_contexts(bsizec, pd, ctx.ta[plane], ctx.tl[plane]); + + av1_subtract_plane(x, bsizec, plane); -#if !CONFIG_PVQ - av1_subtract_plane(x, bsize, plane); -#endif arg.ta = ctx.ta[plane]; arg.tl = ctx.tl[plane]; -#if CONFIG_VAR_TX - const BLOCK_SIZE max_unit_bsize = get_plane_block_size(BLOCK_64X64, pd); + const BLOCK_SIZE max_unit_bsize = + get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y); int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0]; int mu_blocks_high = @@ -976,67 +467,14 @@ void av1_encode_sb(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, for (blk_row = idy; blk_row < unit_height; blk_row += bh) { for (blk_col = idx; blk_col < unit_width; blk_col += bw) { encode_block_inter(plane, block, blk_row, blk_col, plane_bsize, - max_tx_size, &arg); + max_tx_size, &arg, mi_row, mi_col, dry_run); block += step; } } } } -#else - av1_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block, - &arg); -#endif - } -} - -#if CONFIG_SUPERTX -void av1_encode_sb_supertx(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize) { - MACROBLOCKD *const xd = &x->e_mbd; - struct optimize_ctx ctx; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - struct encode_b_args arg = { cm, x, &ctx, &mbmi->skip, NULL, NULL, 1 }; - int plane; - - mbmi->skip = 1; - if (x->skip) return; - - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { - const struct macroblockd_plane *const pd = &xd->plane[plane]; -#if CONFIG_VAR_TX - const TX_SIZE tx_size = TX_4X4; -#else - const TX_SIZE tx_size = av1_get_tx_size(plane, xd); -#endif - av1_subtract_plane(x, bsize, plane); - av1_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]); - arg.ta = ctx.ta[plane]; - arg.tl = ctx.tl[plane]; - av1_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block, - &arg); } } -#endif // CONFIG_SUPERTX - -#if !CONFIG_PVQ -void av1_set_txb_context(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, - ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) { - (void)tx_size; - struct macroblock_plane *p = &x->plane[plane]; - -#if !CONFIG_LV_MAP - *a = *l = p->eobs[block] > 0; -#else // !CONFIG_LV_MAP - *a = *l = p->txb_entropy_ctx[block]; -#endif // !CONFIG_LV_MAP - -#if CONFIG_VAR_TX || 
CONFIG_LV_MAP - int i; - for (i = 0; i < tx_size_wide_unit[tx_size]; ++i) a[i] = a[0]; - - for (i = 0; i < tx_size_high_unit[tx_size]; ++i) l[i] = l[0]; -#endif -} -#endif static void encode_block_intra_and_set_context(int plane, int block, int blk_row, int blk_col, @@ -1044,260 +482,113 @@ static void encode_block_intra_and_set_context(int plane, int block, TX_SIZE tx_size, void *arg) { av1_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); -#if !CONFIG_PVQ + struct encode_b_args *const args = arg; MACROBLOCK *x = args->x; ENTROPY_CONTEXT *a = &args->ta[blk_col]; ENTROPY_CONTEXT *l = &args->tl[blk_row]; av1_set_txb_context(x, plane, block, tx_size, a, l); -#endif } void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { struct encode_b_args *const args = arg; - AV1_COMMON *cm = args->cm; + const AV1_COMMON *const cm = &args->cpi->common; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - uint8_t *mrc_mask = BLOCK_OFFSET(xd->mrc_mask, block); -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK PLANE_TYPE plane_type = get_plane_type(plane); - const TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); + const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, + tx_size, cm->reduced_tx_set_used); uint16_t *eob = &p->eobs[block]; const int dst_stride = pd->dst.stride; uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; + int dummy_rate_cost = 0; - av1_predict_intra_block_facade(cm, xd, plane, block, blk_col, blk_row, - tx_size); - - av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); + av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size); - const ENTROPY_CONTEXT *a = &args->ta[blk_col]; - const ENTROPY_CONTEXT *l = &args->tl[blk_row]; - int ctx = combine_entropy_contexts(*a, *l); - if (args->enable_optimize_b) { - av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, - ctx, AV1_XFORM_QUANT_FP); - av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size, - a, l, 0); + const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; + // Assert not magic number (uninitialized). + assert(plane != 0 || x->blk_skip[blk_row * bw + blk_col] != 234); + if (plane == 0 && x->blk_skip[blk_row * bw + blk_col]) { + *eob = 0; + p->txb_entropy_ctx[block] = 0; } else { - av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, - ctx, AV1_XFORM_QUANT_B); + av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); + + const ENTROPY_CONTEXT *a = &args->ta[blk_col]; + const ENTROPY_CONTEXT *l = &args->tl[blk_row]; + if (args->enable_optimize_b) { + av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, + tx_size, tx_type, AV1_XFORM_QUANT_FP); + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); + av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, 1, + &dummy_rate_cost); + } else { + av1_xform_quant( + cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type, + USE_B_QUANT_NO_TRELLIS ? 
AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP); + } } -#if CONFIG_PVQ - // *(args->skip) == mbmi->skip - if (!x->pvq_skip[plane]) *(args->skip) = 0; + if (*eob) { + av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, + dst_stride, *eob, cm->reduced_tx_set_used); + } - if (x->pvq_skip[plane]) return; -#endif // CONFIG_PVQ - av1_inverse_transform_block(xd, dqcoeff, -#if CONFIG_LGT_FROM_PRED - xd->mi[0]->mbmi.mode, -#endif -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - mrc_mask, -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - tx_type, tx_size, dst, dst_stride, *eob); -#if !CONFIG_PVQ - if (*eob) *(args->skip) = 0; -#else -// Note : *(args->skip) == mbmi->skip + if (*eob == 0 && plane == 0) { + // TODO(jingning): Temporarily disable txk_type check for eob=0 case. + // It is possible that certain collision in hash index would cause + // the assertion failure. To further optimize the rate-distortion + // performance, we need to re-visit this part and enable this assert + // again. +#if 0 + if (args->cpi->oxcf.aq_mode == NO_AQ + && args->cpi->oxcf.deltaq_mode == NO_DELTA_Q) { + assert(mbmi->txk_type[av1_get_txk_type_index(plane_bsize, blk_row, + blk_col)] == DCT_DCT); + } #endif -#if CONFIG_CFL - if (plane == AOM_PLANE_Y && xd->cfl->store_y) { + update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, + DCT_DCT); + } + + // For intra mode, skipped blocks are so rare that transmitting skip=1 is + // very expensive. + *(args->skip) = 0; + + if (plane == AOM_PLANE_Y && xd->cfl.store_y) { cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize); } -#endif // CONFIG_CFL } -void av1_encode_intra_block_plane(AV1_COMMON *cm, MACROBLOCK *x, +void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int plane, int enable_optimize_b, int mi_row, int mi_col) { const MACROBLOCKD *const xd = &x->e_mbd; - ENTROPY_CONTEXT ta[2 * MAX_MIB_SIZE] = { 0 }; - ENTROPY_CONTEXT tl[2 * MAX_MIB_SIZE] = { 0 }; + ENTROPY_CONTEXT ta[MAX_MIB_SIZE] = { 0 }; + ENTROPY_CONTEXT tl[MAX_MIB_SIZE] = { 0 }; struct encode_b_args arg = { - cm, x, NULL, &xd->mi[0]->mbmi.skip, ta, tl, enable_optimize_b + cpi, x, NULL, &(xd->mi[0]->skip), ta, tl, enable_optimize_b }; -#if CONFIG_CB4X4 if (!is_chroma_reference(mi_row, mi_col, bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y)) return; -#else - (void)mi_row; - (void)mi_col; -#endif if (enable_optimize_b) { const struct macroblockd_plane *const pd = &xd->plane[plane]; - const TX_SIZE tx_size = av1_get_tx_size(plane, xd); - av1_get_entropy_contexts(bsize, tx_size, pd, ta, tl); + av1_get_entropy_contexts(bsize, pd, ta, tl); } av1_foreach_transformed_block_in_plane( xd, bsize, plane, encode_block_intra_and_set_context, &arg); } - -#if CONFIG_PVQ -PVQ_SKIP_TYPE av1_pvq_encode_helper(MACROBLOCK *x, tran_low_t *const coeff, - tran_low_t *ref_coeff, - tran_low_t *const dqcoeff, uint16_t *eob, - const int16_t *quant, int plane, - TX_SIZE tx_size, TX_TYPE tx_type, int *rate, - int speed, PVQ_INFO *pvq_info) { - const int tx_blk_size = tx_size_wide[tx_size]; - daala_enc_ctx *daala_enc = &x->daala_enc; - PVQ_SKIP_TYPE ac_dc_coded; - int coeff_shift = 3 - av1_get_tx_scale(tx_size); - int hbd_downshift = 0; - int rounding_mask; - int pvq_dc_quant; - int use_activity_masking = daala_enc->use_activity_masking; - int tell; - int has_dc_skip = 1; - int i; - int off = od_qm_offset(tx_size, plane ? 
1 : 0); - - DECLARE_ALIGNED(16, tran_low_t, coeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]); - DECLARE_ALIGNED(16, tran_low_t, ref_coeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]); - - DECLARE_ALIGNED(16, int32_t, in_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]); - DECLARE_ALIGNED(16, int32_t, ref_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]); - DECLARE_ALIGNED(16, int32_t, out_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]); - - hbd_downshift = x->e_mbd.bd - 8; - - assert(OD_COEFF_SHIFT >= 4); - // DC quantizer for PVQ - if (use_activity_masking) - pvq_dc_quant = - OD_MAXI(1, - (quant[0] << (OD_COEFF_SHIFT - 3) >> hbd_downshift) * - daala_enc->state - .pvq_qm_q4[plane][od_qm_get_index(tx_size, 0)] >> - 4); - else - pvq_dc_quant = - OD_MAXI(1, quant[0] << (OD_COEFF_SHIFT - 3) >> hbd_downshift); - - *eob = 0; - -#if !CONFIG_ANS - tell = od_ec_enc_tell_frac(&daala_enc->w.ec); -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - - // Change coefficient ordering for pvq encoding. - od_raster_to_coding_order(coeff_pvq, tx_blk_size, tx_type, coeff, - tx_blk_size); - od_raster_to_coding_order(ref_coeff_pvq, tx_blk_size, tx_type, ref_coeff, - tx_blk_size); - - // copy int16 inputs to int32 - for (i = 0; i < tx_blk_size * tx_blk_size; i++) { - ref_int32[i] = - AOM_SIGNED_SHL(ref_coeff_pvq[i], OD_COEFF_SHIFT - coeff_shift) >> - hbd_downshift; - in_int32[i] = AOM_SIGNED_SHL(coeff_pvq[i], OD_COEFF_SHIFT - coeff_shift) >> - hbd_downshift; - } - - if (abs(in_int32[0] - ref_int32[0]) < pvq_dc_quant * 141 / 256) { /* 0.55 */ - out_int32[0] = 0; - } else { - out_int32[0] = OD_DIV_R0(in_int32[0] - ref_int32[0], pvq_dc_quant); - } - - ac_dc_coded = od_pvq_encode( - daala_enc, ref_int32, in_int32, out_int32, - OD_MAXI(1, - quant[0] << (OD_COEFF_SHIFT - 3) >> - hbd_downshift), // scale/quantizer - OD_MAXI(1, - quant[1] << (OD_COEFF_SHIFT - 3) >> - hbd_downshift), // scale/quantizer - plane, tx_size, OD_PVQ_BETA[use_activity_masking][plane][tx_size], - 0, // is_keyframe, - daala_enc->state.qm + off, daala_enc->state.qm_inv + off, - speed, // speed - pvq_info); - - // Encode residue of DC coeff, if required. - if (!has_dc_skip || out_int32[0]) { - generic_encode(&daala_enc->w, &daala_enc->state.adapt->model_dc[plane], - abs(out_int32[0]) - has_dc_skip, - &daala_enc->state.adapt->ex_dc[plane][tx_size][0], 2); - } - if (out_int32[0]) { - aom_write_bit(&daala_enc->w, out_int32[0] < 0); - } - - // need to save quantized residue of DC coeff - // so that final pvq bitstream writing can know whether DC is coded. - if (pvq_info) pvq_info->dq_dc_residue = out_int32[0]; - - out_int32[0] = out_int32[0] * pvq_dc_quant; - out_int32[0] += ref_int32[0]; - - // copy int32 result back to int16 - assert(OD_COEFF_SHIFT > coeff_shift); - rounding_mask = (1 << (OD_COEFF_SHIFT - coeff_shift - 1)) - 1; - for (i = 0; i < tx_blk_size * tx_blk_size; i++) { - out_int32[i] = AOM_SIGNED_SHL(out_int32[i], hbd_downshift); - dqcoeff_pvq[i] = (out_int32[i] + (out_int32[i] < 0) + rounding_mask) >> - (OD_COEFF_SHIFT - coeff_shift); - } - - // Back to original coefficient order - od_coding_order_to_raster(dqcoeff, tx_blk_size, tx_type, dqcoeff_pvq, - tx_blk_size); - - *eob = tx_blk_size * tx_blk_size; - -#if !CONFIG_ANS - *rate = (od_ec_enc_tell_frac(&daala_enc->w.ec) - tell) - << (AV1_PROB_COST_SHIFT - OD_BITRES); -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." 
-#endif - assert(*rate >= 0); - - return ac_dc_coded; -} - -void av1_store_pvq_enc_info(PVQ_INFO *pvq_info, int *qg, int *theta, int *k, - od_coeff *y, int nb_bands, const int *off, - int *size, int skip_rest, int skip_dir, - int bs) { // block size in log_2 -2 - int i; - const int tx_blk_size = tx_size_wide[bs]; - - for (i = 0; i < nb_bands; i++) { - pvq_info->qg[i] = qg[i]; - pvq_info->theta[i] = theta[i]; - pvq_info->k[i] = k[i]; - pvq_info->off[i] = off[i]; - pvq_info->size[i] = size[i]; - } - - memcpy(pvq_info->y, y, tx_blk_size * tx_blk_size * sizeof(od_coeff)); - - pvq_info->nb_bands = nb_bands; - pvq_info->skip_rest = skip_rest; - pvq_info->skip_dir = skip_dir; - pvq_info->bs = bs; -} -#endif diff --git a/third_party/aom/av1/encoder/encodemb.h b/third_party/aom/av1/encoder/encodemb.h index c817a94f0..673f87ea7 100644 --- a/third_party/aom/av1/encoder/encodemb.h +++ b/third_party/aom/av1/encoder/encodemb.h @@ -12,21 +12,23 @@ #ifndef AV1_ENCODER_ENCODEMB_H_ #define AV1_ENCODER_ENCODEMB_H_ -#include "./aom_config.h" +#include "config/aom_config.h" + #include "av1/common/onyxc_int.h" +#include "av1/common/txb_common.h" #include "av1/encoder/block.h" - +#include "av1/encoder/tokenize.h" #ifdef __cplusplus extern "C" { #endif struct optimize_ctx { - ENTROPY_CONTEXT ta[MAX_MB_PLANE][2 * MAX_MIB_SIZE]; - ENTROPY_CONTEXT tl[MAX_MB_PLANE][2 * MAX_MIB_SIZE]; + ENTROPY_CONTEXT ta[MAX_MB_PLANE][MAX_MIB_SIZE]; + ENTROPY_CONTEXT tl[MAX_MB_PLANE][MAX_MIB_SIZE]; }; struct encode_b_args { - AV1_COMMON *cm; + const struct AV1_COMP *cpi; MACROBLOCK *x; struct optimize_ctx *ctx; int8_t *skip; @@ -43,52 +45,39 @@ typedef enum AV1_XFORM_QUANT { AV1_XFORM_QUANT_TYPES, } AV1_XFORM_QUANT; -void av1_encode_sb(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, - int mi_col); -#if CONFIG_SUPERTX -void av1_encode_sb_supertx(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize); -#endif // CONFIG_SUPERTX +void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + int mi_row, int mi_col, RUN_TYPE dry_run); void av1_encode_sby_pass1(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize); void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, int ctx, AV1_XFORM_QUANT xform_quant_idx); + TX_SIZE tx_size, TX_TYPE tx_type, + AV1_XFORM_QUANT xform_quant_idx); -int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int blk_row, - int blk_col, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, const ENTROPY_CONTEXT *a, - const ENTROPY_CONTEXT *l, int fast_mode); +int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane, + int block, TX_SIZE tx_size, TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, int fast_mode, int *rate_cost); void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, int blk_col, int blk_row, TX_SIZE tx_size); void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane); -#if !CONFIG_PVQ -void av1_set_txb_context(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, - ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l); -#endif +static INLINE void av1_set_txb_context(MACROBLOCK *x, int plane, int block, + TX_SIZE tx_size, ENTROPY_CONTEXT *a, + ENTROPY_CONTEXT *l) { + const uint8_t ctx = x->plane[plane].txb_entropy_ctx[block]; + memset(a, ctx, tx_size_wide_unit[tx_size] * sizeof(*a)); + memset(l, ctx, tx_size_high_unit[tx_size] * sizeof(*l)); +} void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, 
TX_SIZE tx_size, void *arg); -void av1_encode_intra_block_plane(AV1_COMMON *cm, MACROBLOCK *x, +void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int plane, int enable_optimize_b, int mi_row, int mi_col); -#if CONFIG_PVQ -PVQ_SKIP_TYPE av1_pvq_encode_helper(MACROBLOCK *x, tran_low_t *const coeff, - tran_low_t *ref_coeff, - tran_low_t *const dqcoeff, uint16_t *eob, - const int16_t *quant, int plane, - TX_SIZE tx_size, TX_TYPE tx_type, int *rate, - int speed, PVQ_INFO *pvq_info); - -void av1_store_pvq_enc_info(PVQ_INFO *pvq_info, int *qg, int *theta, int *k, - od_coeff *y, int nb_bands, const int *off, - int *size, int skip_rest, int skip_dir, int bs); -#endif - #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/encoder/encodemv.c b/third_party/aom/av1/encoder/encodemv.c index f8a546999..944e2c53d 100644 --- a/third_party/aom/av1/encoder/encodemv.c +++ b/third_party/aom/av1/encoder/encodemv.c @@ -16,20 +16,9 @@ #include "av1/encoder/cost.h" #include "av1/encoder/encodemv.h" -#include "av1/encoder/subexp.h" #include "aom_dsp/aom_dsp_common.h" -static struct av1_token mv_joint_encodings[MV_JOINTS]; -static struct av1_token mv_class_encodings[MV_CLASSES]; -static struct av1_token mv_fp_encodings[MV_FP_SIZE]; - -void av1_entropy_mv_init(void) { - av1_tokens_from_tree(mv_joint_encodings, av1_mv_joint_tree); - av1_tokens_from_tree(mv_class_encodings, av1_mv_class_tree); - av1_tokens_from_tree(mv_fp_encodings, av1_mv_fp_tree); -} - static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp, MvSubpelPrecision precision) { int offset; @@ -42,38 +31,23 @@ static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp, assert(comp != 0); -// Sign -#if CONFIG_NEW_MULTISYMBOL - aom_write_bit(w, sign); -#else - aom_write(w, sign, mvcomp->sign); -#endif + // Sign + aom_write_symbol(w, sign, mvcomp->sign_cdf, 2); // Class - aom_write_symbol(w, mv_class, mvcomp->class_cdf, MV_CLASSES); + aom_write_symbol(w, mv_class, mvcomp->classes_cdf, MV_CLASSES); // Integer bits if (mv_class == MV_CLASS_0) { -#if CONFIG_NEW_MULTISYMBOL aom_write_symbol(w, d, mvcomp->class0_cdf, CLASS0_SIZE); -#else - aom_write(w, d, mvcomp->class0[0]); -#endif } else { int i; const int n = mv_class + CLASS0_BITS - 1; // number of bits -#if CONFIG_NEW_MULTISYMBOL for (i = 0; i < n; ++i) - aom_write_symbol(w, (d >> i) & 1, mvcomp->bits_cdf[(i + 1) / 2], 2); -#else - for (i = 0; i < n; ++i) aom_write(w, (d >> i) & 1, mvcomp->bits[i]); -#endif + aom_write_symbol(w, (d >> i) & 1, mvcomp->bits_cdf[i], 2); } -// Fractional bits -#if CONFIG_INTRABC || CONFIG_AMVR - if (precision > MV_SUBPEL_NONE) -#endif // CONFIG_INTRABC || CONFIG_AMVR - { + // Fractional bits + if (precision > MV_SUBPEL_NONE) { aom_write_symbol( w, fr, mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf, @@ -82,13 +56,9 @@ static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp, // High precision bit if (precision > MV_SUBPEL_LOW_PRECISION) -#if CONFIG_NEW_MULTISYMBOL aom_write_symbol( w, hp, mv_class == MV_CLASS_0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf, 2); -#else - aom_write(w, hp, mv_class == MV_CLASS_0 ? 
mvcomp->class0_hp : mvcomp->hp); -#endif } static void build_nmv_component_cost_table(int *mvcost, @@ -100,24 +70,20 @@ static void build_nmv_component_cost_table(int *mvcost, int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE], fp_cost[MV_FP_SIZE]; int class0_hp_cost[2], hp_cost[2]; - sign_cost[0] = av1_cost_zero(mvcomp->sign); - sign_cost[1] = av1_cost_one(mvcomp->sign); - av1_cost_tokens(class_cost, mvcomp->classes, av1_mv_class_tree); - av1_cost_tokens(class0_cost, mvcomp->class0, av1_mv_class0_tree); + av1_cost_tokens_from_cdf(sign_cost, mvcomp->sign_cdf, NULL); + av1_cost_tokens_from_cdf(class_cost, mvcomp->classes_cdf, NULL); + av1_cost_tokens_from_cdf(class0_cost, mvcomp->class0_cdf, NULL); for (i = 0; i < MV_OFFSET_BITS; ++i) { - bits_cost[i][0] = av1_cost_zero(mvcomp->bits[i]); - bits_cost[i][1] = av1_cost_one(mvcomp->bits[i]); + av1_cost_tokens_from_cdf(bits_cost[i], mvcomp->bits_cdf[i], NULL); } for (i = 0; i < CLASS0_SIZE; ++i) - av1_cost_tokens(class0_fp_cost[i], mvcomp->class0_fp[i], av1_mv_fp_tree); - av1_cost_tokens(fp_cost, mvcomp->fp, av1_mv_fp_tree); + av1_cost_tokens_from_cdf(class0_fp_cost[i], mvcomp->class0_fp_cdf[i], NULL); + av1_cost_tokens_from_cdf(fp_cost, mvcomp->fp_cdf, NULL); if (precision > MV_SUBPEL_LOW_PRECISION) { - class0_hp_cost[0] = av1_cost_zero(mvcomp->class0_hp); - class0_hp_cost[1] = av1_cost_one(mvcomp->class0_hp); - hp_cost[0] = av1_cost_zero(mvcomp->hp); - hp_cost[1] = av1_cost_one(mvcomp->hp); + av1_cost_tokens_from_cdf(class0_hp_cost, mvcomp->class0_hp_cdf, NULL); + av1_cost_tokens_from_cdf(hp_cost, mvcomp->hp_cdf, NULL); } mvcost[0] = 0; for (v = 1; v <= MV_MAX; ++v) { @@ -134,10 +100,7 @@ static void build_nmv_component_cost_table(int *mvcost, const int b = c + CLASS0_BITS - 1; /* number of bits */ for (i = 0; i < b; ++i) cost += bits_cost[i][((d >> i) & 1)]; } -#if CONFIG_INTRABC || CONFIG_AMVR - if (precision > MV_SUBPEL_NONE) -#endif // CONFIG_INTRABC || CONFIG_AMVR - { + if (precision > MV_SUBPEL_NONE) { if (c == MV_CLASS_0) { cost += class0_fp_cost[d][f]; } else { @@ -156,50 +119,14 @@ static void build_nmv_component_cost_table(int *mvcost, } } -#if !CONFIG_NEW_MULTISYMBOL -static void update_mv(aom_writer *w, const unsigned int ct[2], aom_prob *cur_p, - aom_prob upd_p) { - (void)upd_p; - // Just use the default maximum number of tile groups to avoid passing in the - // actual - // number - av1_cond_prob_diff_update(w, cur_p, ct, DEFAULT_MAX_NUM_TG); -} - -void av1_write_nmv_probs(AV1_COMMON *cm, int usehp, aom_writer *w, - nmv_context_counts *const nmv_counts) { - int i; - int nmv_ctx = 0; -#if CONFIG_AMVR - if (cm->cur_frame_mv_precision_level) { - return; - } -#endif - for (nmv_ctx = 0; nmv_ctx < NMV_CONTEXTS; ++nmv_ctx) { - nmv_context *const mvc = &cm->fc->nmvc[nmv_ctx]; - nmv_context_counts *const counts = &nmv_counts[nmv_ctx]; - - if (usehp) { - for (i = 0; i < 2; ++i) { - update_mv(w, counts->comps[i].class0_hp, &mvc->comps[i].class0_hp, - MV_UPDATE_PROB); - update_mv(w, counts->comps[i].hp, &mvc->comps[i].hp, MV_UPDATE_PROB); - } - } - } -} -#endif - void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref, nmv_context *mvctx, int usehp) { const MV diff = { mv->row - ref->row, mv->col - ref->col }; const MV_JOINT_TYPE j = av1_get_mv_joint(&diff); -#if CONFIG_AMVR - if (cpi->common.cur_frame_mv_precision_level) { + if (cpi->common.cur_frame_force_integer_mv) { usehp = MV_SUBPEL_NONE; } -#endif - aom_write_symbol(w, j, mvctx->joint_cdf, MV_JOINTS); + aom_write_symbol(w, j, mvctx->joints_cdf, MV_JOINTS); if 
(mv_joint_vertical(j)) encode_mv_component(w, diff.row, &mvctx->comps[0], usehp); @@ -214,212 +141,81 @@ void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref, } } -#if CONFIG_INTRABC void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref, nmv_context *mvctx) { + // DV and ref DV should not have sub-pel. + assert((mv->col & 7) == 0); + assert((mv->row & 7) == 0); + assert((ref->col & 7) == 0); + assert((ref->row & 7) == 0); const MV diff = { mv->row - ref->row, mv->col - ref->col }; const MV_JOINT_TYPE j = av1_get_mv_joint(&diff); - aom_write_symbol(w, j, mvctx->joint_cdf, MV_JOINTS); + aom_write_symbol(w, j, mvctx->joints_cdf, MV_JOINTS); if (mv_joint_vertical(j)) encode_mv_component(w, diff.row, &mvctx->comps[0], MV_SUBPEL_NONE); if (mv_joint_horizontal(j)) encode_mv_component(w, diff.col, &mvctx->comps[1], MV_SUBPEL_NONE); } -#endif // CONFIG_INTRABC void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2], const nmv_context *ctx, MvSubpelPrecision precision) { - av1_cost_tokens(mvjoint, ctx->joints, av1_mv_joint_tree); + av1_cost_tokens_from_cdf(mvjoint, ctx->joints_cdf, NULL); build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], precision); build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], precision); } -static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext, - const int_mv mvs[2], const int_mv pred_mvs[2], - nmv_context_counts *nmv_counts -#if CONFIG_AMVR - , - MvSubpelPrecision precision -#endif - ) { - int i; - PREDICTION_MODE mode = mbmi->mode; - - if (mode == NEWMV || mode == NEW_NEWMV) { - for (i = 0; i < 1 + has_second_ref(mbmi); ++i) { - const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_mv; - const MV diff = { mvs[i].as_mv.row - ref->row, - mvs[i].as_mv.col - ref->col }; - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = - av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx); - nmv_context_counts *counts = &nmv_counts[nmv_ctx]; - (void)pred_mvs; -#if CONFIG_AMVR - av1_inc_mv(&diff, counts, precision); -#else - av1_inc_mv(&diff, counts, 1); -#endif +int_mv av1_get_ref_mv_from_stack(int ref_idx, + const MV_REFERENCE_FRAME *ref_frame, + int ref_mv_idx, + const MB_MODE_INFO_EXT *mbmi_ext) { + const int8_t ref_frame_type = av1_ref_frame_type(ref_frame); + const CANDIDATE_MV *curr_ref_mv_stack = + mbmi_ext->ref_mv_stack[ref_frame_type]; + int_mv ref_mv; + ref_mv.as_int = INVALID_MV; + + if (ref_frame[1] > INTRA_FRAME) { + if (ref_idx == 0) { + ref_mv = curr_ref_mv_stack[ref_mv_idx].this_mv; + } else { + assert(ref_idx == 1); + ref_mv = curr_ref_mv_stack[ref_mv_idx].comp_mv; } - } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) { - const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_mv; - const MV diff = { mvs[1].as_mv.row - ref->row, - mvs[1].as_mv.col - ref->col }; - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = - av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx); - nmv_context_counts *counts = &nmv_counts[nmv_ctx]; -#if CONFIG_AMVR - av1_inc_mv(&diff, counts, precision); -#else - av1_inc_mv(&diff, counts, 1); -#endif - } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) { - const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv; - const MV diff = { mvs[0].as_mv.row - ref->row, - mvs[0].as_mv.col - ref->col }; - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = - av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - 
mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx); - nmv_context_counts *counts = &nmv_counts[nmv_ctx]; -#if CONFIG_AMVR - av1_inc_mv(&diff, counts, precision); -#else - av1_inc_mv(&diff, counts, 1); -#endif -#if CONFIG_COMPOUND_SINGLEREF } else { - assert( // mode == SR_NEAREST_NEWMV || - mode == SR_NEAR_NEWMV || mode == SR_ZERO_NEWMV || mode == SR_NEW_NEWMV); - const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv; - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = - av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx); - nmv_context_counts *counts = &nmv_counts[nmv_ctx]; - (void)pred_mvs; - MV diff; - if (mode == SR_NEW_NEWMV) { - diff.row = mvs[0].as_mv.row - ref->row; - diff.col = mvs[0].as_mv.col - ref->col; - av1_inc_mv(&diff, counts, 1); + assert(ref_idx == 0); + if (ref_mv_idx < mbmi_ext->ref_mv_count[ref_frame_type]) { + ref_mv = curr_ref_mv_stack[ref_mv_idx].this_mv; + } else { + ref_mv = mbmi_ext->global_mvs[ref_frame_type]; } - diff.row = mvs[1].as_mv.row - ref->row; - diff.col = mvs[1].as_mv.col - ref->col; - av1_inc_mv(&diff, counts, 1); -#endif // CONFIG_COMPOUND_SINGLEREF } + return ref_mv; } -static void inc_mvs_sub8x8(const MODE_INFO *mi, int block, const int_mv mvs[2], - const MB_MODE_INFO_EXT *mbmi_ext, - nmv_context_counts *nmv_counts -#if CONFIG_AMVR - , - MvSubpelPrecision precision -#endif - ) { - int i; - PREDICTION_MODE mode = mi->bmi[block].as_mode; - const MB_MODE_INFO *mbmi = &mi->mbmi; - - if (mode == NEWMV || mode == NEW_NEWMV) { - for (i = 0; i < 1 + has_second_ref(&mi->mbmi); ++i) { - const MV *ref = &mi->bmi[block].ref_mv[i].as_mv; - const MV diff = { mvs[i].as_mv.row - ref->row, - mvs[i].as_mv.col - ref->col }; - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = - av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx); - nmv_context_counts *counts = &nmv_counts[nmv_ctx]; -#if CONFIG_AMVR - av1_inc_mv(&diff, counts, precision); -#else - av1_inc_mv(&diff, counts, 1); -#endif - } - } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) { - const MV *ref = &mi->bmi[block].ref_mv[1].as_mv; - const MV diff = { mvs[1].as_mv.row - ref->row, - mvs[1].as_mv.col - ref->col }; - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = - av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx); - nmv_context_counts *counts = &nmv_counts[nmv_ctx]; -#if CONFIG_AMVR - av1_inc_mv(&diff, counts, precision); -#else - av1_inc_mv(&diff, counts, 1); -#endif - } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) { - const MV *ref = &mi->bmi[block].ref_mv[0].as_mv; - const MV diff = { mvs[0].as_mv.row - ref->row, - mvs[0].as_mv.col - ref->col }; - int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); - int nmv_ctx = - av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx); - nmv_context_counts *counts = &nmv_counts[nmv_ctx]; -#if CONFIG_AMVR - av1_inc_mv(&diff, counts, precision); -#else - av1_inc_mv(&diff, counts, 1); -#endif +int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx) { + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + int ref_mv_idx = mbmi->ref_mv_idx; + if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) { + assert(has_second_ref(mbmi)); + ref_mv_idx += 1; } + return av1_get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, ref_mv_idx, + x->mbmi_ext); } -void 
av1_update_mv_count(ThreadData *td) { - const MACROBLOCKD *xd = &td->mb.e_mbd; - const MODE_INFO *mi = xd->mi[0]; - const MB_MODE_INFO *const mbmi = &mi->mbmi; - const MB_MODE_INFO_EXT *mbmi_ext = td->mb.mbmi_ext; -#if CONFIG_CB4X4 - const int unify_bsize = 1; -#else - const int unify_bsize = 0; -#endif -#if CONFIG_AMVR - MvSubpelPrecision precision = 1; - if (xd->cur_frame_mv_precision_level) { - precision = MV_SUBPEL_NONE; - } -#endif - - if (mbmi->sb_type < BLOCK_8X8 && !unify_bsize) { - const int num_4x4_w = num_4x4_blocks_wide_lookup[mbmi->sb_type]; - const int num_4x4_h = num_4x4_blocks_high_lookup[mbmi->sb_type]; - int idx, idy; - - for (idy = 0; idy < 2; idy += num_4x4_h) { - for (idx = 0; idx < 2; idx += num_4x4_w) { - const int i = idy * 2 + idx; - - if (have_newmv_in_inter_mode(mi->bmi[i].as_mode)) - -#if CONFIG_AMVR - inc_mvs_sub8x8(mi, i, mi->bmi[i].as_mv, mbmi_ext, td->counts->mv, - precision); -#else - inc_mvs_sub8x8(mi, i, mi->bmi[i].as_mv, mbmi_ext, td->counts->mv); -#endif - } - } - } else { - if (have_newmv_in_inter_mode(mbmi->mode)) - -#if CONFIG_AMVR - inc_mvs(mbmi, mbmi_ext, mbmi->mv, mbmi->pred_mv, td->counts->mv, - precision); -#else - inc_mvs(mbmi, mbmi_ext, mbmi->mv, mbmi->pred_mv, td->counts->mv); -#endif - } +void av1_find_best_ref_mvs_from_stack(int allow_hp, + const MB_MODE_INFO_EXT *mbmi_ext, + MV_REFERENCE_FRAME ref_frame, + int_mv *nearest_mv, int_mv *near_mv, + int is_integer) { + const int ref_idx = 0; + MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME }; + *nearest_mv = av1_get_ref_mv_from_stack(ref_idx, ref_frames, 0, mbmi_ext); + lower_mv_precision(&nearest_mv->as_mv, allow_hp, is_integer); + *near_mv = av1_get_ref_mv_from_stack(ref_idx, ref_frames, 1, mbmi_ext); + lower_mv_precision(&near_mv->as_mv, allow_hp, is_integer); } diff --git a/third_party/aom/av1/encoder/encodemv.h b/third_party/aom/av1/encoder/encodemv.h index 8689cec27..64e9e7162 100644 --- a/third_party/aom/av1/encoder/encodemv.h +++ b/third_party/aom/av1/encoder/encodemv.h @@ -18,13 +18,6 @@ extern "C" { #endif -void av1_entropy_mv_init(void); - -#if !CONFIG_NEW_MULTISYMBOL -void av1_write_nmv_probs(AV1_COMMON *cm, int usehp, aom_writer *w, - nmv_context_counts *const counts); -#endif - void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref, nmv_context *mvctx, int usehp); @@ -34,10 +27,18 @@ void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2], void av1_update_mv_count(ThreadData *td); -#if CONFIG_INTRABC void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref, nmv_context *mvctx); -#endif // CONFIG_INTRABC +int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx); +int_mv av1_get_ref_mv_from_stack(int ref_idx, + const MV_REFERENCE_FRAME *ref_frame, + int ref_mv_idx, + const MB_MODE_INFO_EXT *mbmi_ext); +void av1_find_best_ref_mvs_from_stack(int allow_hp, + const MB_MODE_INFO_EXT *mbmi_ext, + MV_REFERENCE_FRAME ref_frame, + int_mv *nearest_mv, int_mv *near_mv, + int is_integer); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c index e9ab3c87f..196e18d8a 100644 --- a/third_party/aom/av1/encoder/encoder.c +++ b/third_party/aom/av1/encoder/encoder.c @@ -13,12 +13,13 @@ #include <math.h> #include <stdio.h> -#include "./aom_config.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" #include "av1/common/alloccommon.h" -#if CONFIG_CDEF #include "av1/common/cdef.h" -#endif // CONFIG_CDEF 
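The stack-based reference-MV getters added above (av1_get_ref_mv, av1_get_ref_mv_from_stack, av1_find_best_ref_mvs_from_stack) replace the old per-context MV count bookkeeping. A minimal caller sketch, assuming only the declarations in this diff; pick_last_frame_ref_mvs is a hypothetical name, not part of the commit:

static void pick_last_frame_ref_mvs(const AV1_COMMON *cm,
                                    const MB_MODE_INFO_EXT *mbmi_ext,
                                    int_mv *nearest_mv, int_mv *near_mv) {
  // Resolve the NEARESTMV/NEARMV candidates for LAST_FRAME from the ref-MV
  // stack; precision is clamped when the frame forces integer MVs.
  av1_find_best_ref_mvs_from_stack(cm->allow_high_precision_mv, mbmi_ext,
                                   LAST_FRAME, nearest_mv, near_mv,
                                   cm->cur_frame_force_integer_mv);
}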
#include "av1/common/filter.h" #include "av1/common/idct.h" #include "av1/common/reconinter.h" @@ -30,32 +31,17 @@ #include "av1/encoder/aq_cyclicrefresh.h" #include "av1/encoder/aq_variance.h" #include "av1/encoder/bitstream.h" -#if CONFIG_BGSPRITE -#include "av1/encoder/bgsprite.h" -#endif // CONFIG_BGSPRITE -#if CONFIG_ANS -#include "aom_dsp/buf_ans.h" -#endif #include "av1/encoder/context_tree.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encodemv.h" #include "av1/encoder/encoder.h" -#if CONFIG_LV_MAP #include "av1/encoder/encodetxb.h" -#endif #include "av1/encoder/ethread.h" #include "av1/encoder/firstpass.h" -#if CONFIG_HASH_ME #include "av1/encoder/hash_motion.h" -#endif #include "av1/encoder/mbgraph.h" -#if CONFIG_NCOBMC_ADAPT_WEIGHT -#include "av1/common/ncobmc_kernels.h" -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT #include "av1/encoder/picklpf.h" -#if CONFIG_LOOP_RESTORATION #include "av1/encoder/pickrst.h" -#endif // CONFIG_LOOP_RESTORATION #include "av1/encoder/random.h" #include "av1/encoder/ratectrl.h" #include "av1/encoder/rd.h" @@ -63,45 +49,41 @@ #include "av1/encoder/speed_features.h" #include "av1/encoder/temporal_filter.h" -#include "./av1_rtcd.h" -#include "./aom_dsp_rtcd.h" -#include "./aom_scale_rtcd.h" #include "aom_dsp/psnr.h" #if CONFIG_INTERNAL_STATS #include "aom_dsp/ssim.h" #endif +#include "av1/encoder/grain_test_vectors.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_ports/aom_timer.h" #include "aom_ports/mem.h" #include "aom_ports/system_state.h" #include "aom_scale/aom_scale.h" -#if CONFIG_BITSTREAM_DEBUG +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "aom_util/debug_util.h" -#endif // CONFIG_BITSTREAM_DEBUG +#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG + +#define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7 + +// av1 uses 10,000,000 ticks/second as time stamp +#define TICKS_PER_SEC 10000000LL #if CONFIG_ENTROPY_STATS FRAME_COUNTS aggregate_fc; -// Aggregate frame counts per frame context type -FRAME_COUNTS aggregate_fc_per_type[FRAME_CONTEXTS]; #endif // CONFIG_ENTROPY_STATS #define AM_SEGMENT_ID_INACTIVE 7 #define AM_SEGMENT_ID_ACTIVE 0 -#define SHARP_FILTER_QTHRESH 0 /* Q threshold for 8-tap sharp filter */ +// Whether to use high precision mv for altref computation. +#define ALTREF_HIGH_PRECISION_MV 1 -#define ALTREF_HIGH_PRECISION_MV 1 // Whether to use high precision mv - // for altref computation. -#define HIGH_PRECISION_MV_QTHRESH 200 // Q threshold for high precision - // mv. Choose a very high value for - // now so that HIGH_PRECISION is always - // chosen. +// Q threshold for high precision mv. Choose a very high value for now so that +// HIGH_PRECISION is always chosen. 
+#define HIGH_PRECISION_MV_QTHRESH 200 // #define OUTPUT_YUV_REC -#ifdef OUTPUT_YUV_DENOISED -FILE *yuv_denoised_file = NULL; -#endif #ifdef OUTPUT_YUV_SKINMAP FILE *yuv_skinmap_file = NULL; #endif @@ -110,20 +92,6 @@ FILE *yuv_rec_file; #define FILE_NAME_LEN 100 #endif -#if 0 -FILE *framepsnr; -FILE *kf_list; -FILE *keyfile; -#endif - -#if CONFIG_CFL -CFL_CTX NULL_CFL; -#endif - -#if CONFIG_INTERNAL_STATS -typedef enum { Y, U, V, ALL } STAT_TYPE; -#endif // CONFIG_INTERNAL_STATS - static INLINE void Scale2Ratio(AOM_SCALING mode, int *hr, int *hs) { switch (mode) { case NORMAL: @@ -180,7 +148,6 @@ static void apply_active_map(AV1_COMP *cpi) { if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i]; av1_enable_segmentation(seg); av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP); -#if CONFIG_LOOPFILTER_LEVEL av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H); av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V); av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U); @@ -194,23 +161,12 @@ static void apply_active_map(AV1_COMP *cpi) { -MAX_LOOP_FILTER); av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V, -MAX_LOOP_FILTER); -#else - av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF); - // Setting the data to -MAX_LOOP_FILTER will result in the computed loop - // filter level being zero regardless of the value of seg->abs_delta. - av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF, - -MAX_LOOP_FILTER); -#endif // CONFIG_LOOPFILTER_LEVEL } else { av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP); -#if CONFIG_LOOPFILTER_LEVEL av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H); av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V); av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U); av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V); -#else - av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF); -#endif // CONFIG_LOOPFILTER_LEVEL if (seg->enabled) { seg->update_data = 1; seg->update_map = 1; @@ -277,54 +233,45 @@ int av1_get_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows, } } -static void set_high_precision_mv(AV1_COMP *cpi, int allow_high_precision_mv -#if CONFIG_AMVR - , - int cur_frame_mv_precision_level -#endif - ) { +static void set_high_precision_mv(AV1_COMP *cpi, int allow_high_precision_mv, + int cur_frame_force_integer_mv) { MACROBLOCK *const mb = &cpi->td.mb; - cpi->common.allow_high_precision_mv = allow_high_precision_mv; - -#if CONFIG_AMVR - if (cpi->common.allow_high_precision_mv && - cur_frame_mv_precision_level == 0) { -#else - if (cpi->common.allow_high_precision_mv) { -#endif - int i; - for (i = 0; i < NMV_CONTEXTS; ++i) { - mb->mv_cost_stack[i] = mb->nmvcost_hp[i]; - } - } else { - int i; - for (i = 0; i < NMV_CONTEXTS; ++i) { - mb->mv_cost_stack[i] = mb->nmvcost[i]; - } - } + cpi->common.allow_high_precision_mv = + allow_high_precision_mv && cur_frame_force_integer_mv == 0; + const int copy_hp = + cpi->common.allow_high_precision_mv && cur_frame_force_integer_mv == 0; + int *(*src)[2] = copy_hp ? 
&mb->nmvcost_hp : &mb->nmvcost; + mb->mv_cost_stack = *src; } static BLOCK_SIZE select_sb_size(const AV1_COMP *const cpi) { -#if CONFIG_EXT_PARTITION + const AV1_COMMON *const cm = &cpi->common; + if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_64X64) return BLOCK_64X64; - - if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_128X128) - return BLOCK_128X128; +#if CONFIG_FILEOPTIONS + if (cm->options && cm->options->ext_partition) +#endif + if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_128X128) + return BLOCK_128X128; assert(cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_DYNAMIC); - assert(IMPLIES(cpi->common.tile_cols > 1, - cpi->common.tile_width % MAX_MIB_SIZE == 0)); - assert(IMPLIES(cpi->common.tile_rows > 1, - cpi->common.tile_height % MAX_MIB_SIZE == 0)); +// TODO(any): Possibly could improve this with a heuristic. +#if CONFIG_FILEOPTIONS + if (cm->options && !cm->options->ext_partition) return BLOCK_64X64; +#endif + + // When superres / resize is on, 'cm->width / height' can change between + // calls, so we don't apply this heuristic there. Also, this heuristic gives + // compression gain for speed >= 2 only. + if (cpi->oxcf.superres_mode == SUPERRES_NONE && + cpi->oxcf.resize_mode == RESIZE_NONE && cpi->oxcf.speed >= 2) { + return (cm->width >= 480 && cm->height >= 360) ? BLOCK_128X128 + : BLOCK_64X64; + } - // TODO(any): Possibly could improve this with a heuristic. return BLOCK_128X128; -#else - (void)cpi; - return BLOCK_64X64; -#endif // CONFIG_EXT_PARTITION } static void setup_frame(AV1_COMP *cpi) { @@ -334,96 +281,82 @@ static void setup_frame(AV1_COMP *cpi) { // frames where the error_resilient_mode or intra_only flag is set. For // other inter-frames the encoder currently uses only two contexts; // context 1 for ALTREF frames and context 0 for the others. - if (frame_is_intra_only(cm) || cm->error_resilient_mode) { + + cm->primary_ref_frame = PRIMARY_REF_NONE; + if (frame_is_intra_only(cm) || cm->error_resilient_mode || + cm->force_primary_ref_none) { av1_setup_past_independence(cm); + for (int i = 0; i < REF_FRAMES; i++) { + cm->fb_of_context_type[i] = -1; + } + cm->fb_of_context_type[REGULAR_FRAME] = + get_ref_frame_map_idx(cpi, GOLDEN_FRAME); + cm->frame_context_idx = REGULAR_FRAME; } else { -#if CONFIG_NO_FRAME_CONTEXT_SIGNALING -// Just use frame context from first signaled reference frame. -// This will always be LAST_FRAME for now. 
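For reference, the resolution/speed heuristic in select_sb_size() above reduces to the sketch below; the explicit AOM_SUPERBLOCK_SIZE_64X64/128X128 overrides and the CONFIG_FILEOPTIONS path are omitted, and the BLOCK_SIZE values are assumed from the aom headers:

static BLOCK_SIZE sb_size_heuristic(int width, int height, int speed,
                                    int superres_or_resize_active) {
  // The 64x64 fallback only applies when the frame size is stable
  // (no superres/resize) and speed >= 2; small frames then prefer
  // 64x64 superblocks, everything else uses 128x128.
  if (!superres_or_resize_active && speed >= 2)
    return (width >= 480 && height >= 360) ? BLOCK_128X128 : BLOCK_64X64;
  return BLOCK_128X128;
}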
-#else -#if CONFIG_EXT_REFS const GF_GROUP *gf_group = &cpi->twopass.gf_group; if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) cm->frame_context_idx = EXT_ARF_FRAME; else if (cpi->refresh_alt_ref_frame) cm->frame_context_idx = ARF_FRAME; -#else // !CONFIG_EXT_REFS - if (cpi->refresh_alt_ref_frame) cm->frame_context_idx = ARF_FRAME; -#endif // CONFIG_EXT_REFS else if (cpi->rc.is_src_frame_alt_ref) cm->frame_context_idx = OVERLAY_FRAME; else if (cpi->refresh_golden_frame) cm->frame_context_idx = GLD_FRAME; -#if CONFIG_EXT_REFS else if (cpi->refresh_bwd_ref_frame) cm->frame_context_idx = BRF_FRAME; -#endif // CONFIG_EXT_REFS else cm->frame_context_idx = REGULAR_FRAME; -#endif // CONFIG_NO_FRAME_CONTEXT_SIGNALING + int wanted_fb = cm->fb_of_context_type[cm->frame_context_idx]; + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + int fb = get_ref_frame_map_idx(cpi, ref_frame); + if (fb == wanted_fb) { + cm->primary_ref_frame = ref_frame - LAST_FRAME; + } + } } if (cm->frame_type == KEY_FRAME) { cpi->refresh_golden_frame = 1; cpi->refresh_alt_ref_frame = 1; av1_zero(cpi->interp_filter_selected); - set_sb_size(cm, select_sb_size(cpi)); -#if CONFIG_REFERENCE_BUFFER + set_sb_size(&cm->seq_params, select_sb_size(cpi)); set_use_reference_buffer(cm, 0); -#endif // CONFIG_REFERENCE_BUFFER + } else if (frame_is_sframe(cm)) { + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 1; + av1_zero(cpi->interp_filter_selected); + set_sb_size(&cm->seq_params, select_sb_size(cpi)); } else { -#if CONFIG_NO_FRAME_CONTEXT_SIGNALING - if (frame_is_intra_only(cm) || cm->error_resilient_mode || - cm->frame_refs[0].idx < 0) { - *cm->fc = cm->frame_contexts[FRAME_CONTEXT_DEFAULTS]; + if (cm->primary_ref_frame == PRIMARY_REF_NONE || + cm->frame_refs[cm->primary_ref_frame].idx < 0) { + av1_setup_past_independence(cm); + cm->seg.update_map = 1; + cm->seg.update_data = 1; } else { - *cm->fc = cm->frame_contexts[cm->frame_refs[0].idx]; + *cm->fc = cm->frame_contexts[cm->frame_refs[cm->primary_ref_frame].idx]; } -#else - *cm->fc = cm->frame_contexts[cm->frame_context_idx]; -#endif // CONFIG_NO_FRAME_CONTEXT_SIGNALING av1_zero(cpi->interp_filter_selected[0]); } -#if CONFIG_EXT_REFS -#if CONFIG_ONE_SIDED_COMPOUND && \ - !CONFIG_EXT_COMP_REFS // No change to bitstream - if (cpi->sf.recode_loop == DISALLOW_RECODE) { - cpi->refresh_bwd_ref_frame = cpi->refresh_last_frame; - cpi->rc.is_bipred_frame = 1; - } -#endif // CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS -#endif // CONFIG_EXT_REFS -#if CONFIG_NO_FRAME_CONTEXT_SIGNALING - if (frame_is_intra_only(cm) || cm->error_resilient_mode || - cm->frame_refs[0].idx < 0) { - // use default frame context values - cm->pre_fc = &cm->frame_contexts[FRAME_CONTEXT_DEFAULTS]; - } else { - *cm->fc = cm->frame_contexts[cm->frame_refs[0].idx]; - cm->pre_fc = &cm->frame_contexts[cm->frame_refs[0].idx]; - } -#else - cm->pre_fc = &cm->frame_contexts[cm->frame_context_idx]; -#endif // CONFIG_NO_FRAME_CONTEXT_SIGNALING + cm->prev_frame = get_prev_frame(cm); cpi->vaq_refresh = 0; } static void enc_setup_mi(AV1_COMMON *cm) { int i; - cm->mi = cm->mip + cm->mi_stride + 1; - memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip)); - cm->prev_mi = cm->prev_mip + cm->mi_stride + 1; + cm->mi = cm->mip; + memset(cm->mip, 0, cm->mi_stride * cm->mi_rows * sizeof(*cm->mip)); + cm->prev_mi = cm->prev_mip; // Clear top border row memset(cm->prev_mip, 0, sizeof(*cm->prev_mip) * cm->mi_stride); // Clear left border column - for (i = 1; i < 
cm->mi_rows + 1; ++i) + for (i = 0; i < cm->mi_rows; ++i) memset(&cm->prev_mip[i * cm->mi_stride], 0, sizeof(*cm->prev_mip)); - cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1; - cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1; + cm->mi_grid_visible = cm->mi_grid_base; + cm->prev_mi_grid_visible = cm->prev_mi_grid_base; memset(cm->mi_grid_base, 0, - cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base)); + cm->mi_stride * cm->mi_rows * sizeof(*cm->mi_grid_base)); } static int enc_alloc_mi(AV1_COMMON *cm, int mi_size) { @@ -433,10 +366,11 @@ static int enc_alloc_mi(AV1_COMMON *cm, int mi_size) { if (!cm->prev_mip) return 1; cm->mi_alloc_size = mi_size; - cm->mi_grid_base = (MODE_INFO **)aom_calloc(mi_size, sizeof(MODE_INFO *)); + cm->mi_grid_base = + (MB_MODE_INFO **)aom_calloc(mi_size, sizeof(MB_MODE_INFO *)); if (!cm->mi_grid_base) return 1; cm->prev_mi_grid_base = - (MODE_INFO **)aom_calloc(mi_size, sizeof(MODE_INFO *)); + (MB_MODE_INFO **)aom_calloc(mi_size, sizeof(MB_MODE_INFO *)); if (!cm->prev_mi_grid_base) return 1; return 0; @@ -456,19 +390,19 @@ static void enc_free_mi(AV1_COMMON *cm) { static void swap_mi_and_prev_mi(AV1_COMMON *cm) { // Current mip will be the prev_mip for the next frame. - MODE_INFO **temp_base = cm->prev_mi_grid_base; - MODE_INFO *temp = cm->prev_mip; + MB_MODE_INFO **temp_base = cm->prev_mi_grid_base; + MB_MODE_INFO *temp = cm->prev_mip; cm->prev_mip = cm->mip; cm->mip = temp; // Update the upper left visible macroblock ptrs. - cm->mi = cm->mip + cm->mi_stride + 1; - cm->prev_mi = cm->prev_mip + cm->mi_stride + 1; + cm->mi = cm->mip; + cm->prev_mi = cm->prev_mip; cm->prev_mi_grid_base = cm->mi_grid_base; cm->mi_grid_base = temp_base; - cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1; - cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1; + cm->mi_grid_visible = cm->mi_grid_base; + cm->prev_mi_grid_visible = cm->prev_mi_grid_base; } void av1_initialize_enc(void) { @@ -480,11 +414,7 @@ void av1_initialize_enc(void) { aom_scale_rtcd(); av1_init_intra_predictors(); av1_init_me_luts(); -#if !CONFIG_XIPHRC av1_rc_init_minq_luts(); -#endif - av1_entropy_mv_init(); - av1_encode_token_init(); av1_init_wedge_masks(); init_done = 1; } @@ -506,25 +436,47 @@ static void alloc_context_buffers_ext(AV1_COMP *cpi) { aom_calloc(mi_size, sizeof(*cpi->mbmi_ext_base))); } -static void dealloc_compressor_data(AV1_COMP *cpi) { +static void update_film_grain_parameters(struct AV1_COMP *cpi, + const AV1EncoderConfig *oxcf) { AV1_COMMON *const cm = &cpi->common; + cpi->oxcf = *oxcf; - dealloc_context_buffers_ext(cpi); + if (cm->film_grain_table) { + aom_film_grain_table_free(cm->film_grain_table); + aom_free(cm->film_grain_table); + } + cm->film_grain_table = 0; + + if (oxcf->film_grain_test_vector) { + cm->film_grain_params_present = 1; + if (cm->frame_type == KEY_FRAME) { + memcpy(&cm->film_grain_params, + film_grain_test_vectors + oxcf->film_grain_test_vector - 1, + sizeof(cm->film_grain_params)); -#if CONFIG_PVQ - if (cpi->oxcf.pass != 1) { - const int tile_cols = cm->tile_cols; - const int tile_rows = cm->tile_rows; - int tile_col, tile_row; - - for (tile_row = 0; tile_row < tile_rows; ++tile_row) - for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - TileDataEnc *tile_data = - &cpi->tile_data[tile_row * tile_cols + tile_col]; - aom_free(tile_data->pvq_q.buf); + cm->film_grain_params.bit_depth = cm->bit_depth; + if (cm->color_range == AOM_CR_FULL_RANGE) { + cm->film_grain_params.clip_to_restricted_range = 0; 
} + } + } else if (oxcf->film_grain_table_filename) { + cm->film_grain_table = aom_malloc(sizeof(*cm->film_grain_table)); + memset(cm->film_grain_table, 0, sizeof(aom_film_grain_table_t)); + + aom_film_grain_table_read(cm->film_grain_table, + oxcf->film_grain_table_filename, &cm->error); + } else { + cm->film_grain_params_present = 0; + memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params)); } -#endif +} + +static void dealloc_compressor_data(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + + dealloc_context_buffers_ext(cpi); + aom_free(cpi->tile_data); cpi->tile_data = NULL; @@ -538,7 +490,6 @@ static void dealloc_compressor_data(AV1_COMP *cpi) { aom_free(cpi->active_map.map); cpi->active_map.map = NULL; -#if CONFIG_MOTION_VAR aom_free(cpi->td.mb.above_pred_buf); cpi->td.mb.above_pred_buf = NULL; @@ -550,26 +501,17 @@ static void dealloc_compressor_data(AV1_COMP *cpi) { aom_free(cpi->td.mb.mask_buf); cpi->td.mb.mask_buf = NULL; -#endif + + aom_free(cm->tpl_mvs); + cm->tpl_mvs = NULL; av1_free_ref_frame_buffers(cm->buffer_pool); -#if CONFIG_LV_MAP av1_free_txb_buf(cpi); -#endif av1_free_context_buffers(cm); aom_free_frame_buffer(&cpi->last_frame_uf); -#if CONFIG_LOOP_RESTORATION av1_free_restoration_buffers(cm); - aom_free_frame_buffer(&cpi->last_frame_db); aom_free_frame_buffer(&cpi->trial_frame_rst); - aom_free(cpi->extra_rstbuf); - { - int i; - for (i = 0; i < MAX_MB_PLANE; ++i) - av1_free_restoration_struct(&cpi->rst_search[i]); - } -#endif // CONFIG_LOOP_RESTORATION aom_free_frame_buffer(&cpi->scaled_source); aom_free_frame_buffer(&cpi->scaled_last_source); aom_free_frame_buffer(&cpi->alt_ref_buffer); @@ -578,32 +520,22 @@ static void dealloc_compressor_data(AV1_COMP *cpi) { aom_free(cpi->tile_tok[0][0]); cpi->tile_tok[0][0] = 0; - av1_free_pc_tree(&cpi->td); + av1_free_pc_tree(&cpi->td, num_planes); aom_free(cpi->td.mb.palette_buffer); - -#if CONFIG_ANS - aom_buf_ans_free(&cpi->buf_ans); -#endif // CONFIG_ANS } static void save_coding_context(AV1_COMP *cpi) { CODING_CONTEXT *const cc = &cpi->coding_context; AV1_COMMON *cm = &cpi->common; - int i; // Stores a snapshot of key state variables which can subsequently be // restored with a call to av1_restore_coding_context. These functions are // intended for use in a re-code loop in av1_compress_frame where the // quantizer value is adjusted between loop iterations. - for (i = 0; i < NMV_CONTEXTS; ++i) { - av1_copy(cc->nmv_vec_cost[i], cpi->td.mb.nmv_vec_cost[i]); - av1_copy(cc->nmv_costs, cpi->nmv_costs); - av1_copy(cc->nmv_costs_hp, cpi->nmv_costs_hp); - } - - av1_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas); - av1_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas); + av1_copy(cc->nmv_vec_cost, cpi->td.mb.nmv_vec_cost); + av1_copy(cc->nmv_costs, cpi->nmv_costs); + av1_copy(cc->nmv_costs_hp, cpi->nmv_costs_hp); cc->fc = *cm->fc; } @@ -611,18 +543,12 @@ static void save_coding_context(AV1_COMP *cpi) { static void restore_coding_context(AV1_COMP *cpi) { CODING_CONTEXT *const cc = &cpi->coding_context; AV1_COMMON *cm = &cpi->common; - int i; // Restore key state variables to the snapshot state stored in the // previous call to av1_save_coding_context. 
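save_coding_context()/restore_coding_context() snapshot and roll back the entropy state (the frame context plus the nmv cost tables) around the recode loop. A hedged sketch of that usage; adjust_q_and_encode() and recode_needed() are hypothetical stand-ins for the logic in av1_compress_frame:

static void encode_with_recode_sketch(AV1_COMP *cpi) {
  save_coding_context(cpi);  // snapshot *cm->fc and the MV cost tables
  for (;;) {
    adjust_q_and_encode(cpi);        // hypothetical: pick q, run a pass
    if (!recode_needed(cpi)) break;  // hypothetical: rate/quality check
    restore_coding_context(cpi);     // roll back before the next attempt
  }
}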
- for (i = 0; i < NMV_CONTEXTS; ++i) { - av1_copy(cpi->td.mb.nmv_vec_cost[i], cc->nmv_vec_cost[i]); - av1_copy(cpi->nmv_costs, cc->nmv_costs); - av1_copy(cpi->nmv_costs_hp, cc->nmv_costs_hp); - } - - av1_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas); - av1_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas); + av1_copy(cpi->td.mb.nmv_vec_cost, cc->nmv_vec_cost); + av1_copy(cpi->nmv_costs, cc->nmv_costs); + av1_copy(cpi->nmv_costs_hp, cc->nmv_costs_hp); *cm->fc = cc->fc; } @@ -673,7 +599,6 @@ static void configure_static_seg_features(AV1_COMP *cpi) { qi_delta = av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875, cm->bit_depth); av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2); -#if CONFIG_LOOPFILTER_LEVEL av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2); av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2); av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2); @@ -683,15 +608,8 @@ static void configure_static_seg_features(AV1_COMP *cpi) { av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V); av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U); av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V); -#else - av1_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2); - av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF); -#endif // CONFIG_LOOPFILTER_LEVEL av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); - - // Where relevant assume segment data is delta data - seg->abs_delta = SEGMENT_DELTADATA; } } else if (seg->enabled) { // All other frames if segmentation has been enabled @@ -702,14 +620,12 @@ static void configure_static_seg_features(AV1_COMP *cpi) { if (rc->source_alt_ref_active) { seg->update_map = 0; seg->update_data = 1; - seg->abs_delta = SEGMENT_DELTADATA; qi_delta = av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125, cm->bit_depth); av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta + 2); av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); -#if CONFIG_LOOPFILTER_LEVEL av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2); av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2); av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2); @@ -719,10 +635,6 @@ static void configure_static_seg_features(AV1_COMP *cpi) { av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V); av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U); av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V); -#else - av1_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2); - av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF); -#endif // CONFIG_LOOPFILTER_LEVEL // Segment coding disabled for compred testing if (high_q || (cpi->static_mb_pct == 100)) { @@ -777,16 +689,16 @@ static void configure_static_seg_features(AV1_COMP *cpi) { static void update_reference_segmentation_map(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; - MODE_INFO **mi_8x8_ptr = cm->mi_grid_visible; - uint8_t *cache_ptr = cm->last_frame_seg_map; + MB_MODE_INFO **mi_4x4_ptr = cm->mi_grid_visible; + uint8_t *cache_ptr = cm->current_frame_seg_map; int row, col; for (row = 0; row < cm->mi_rows; row++) { - MODE_INFO **mi_8x8 = mi_8x8_ptr; + MB_MODE_INFO **mi_4x4 = mi_4x4_ptr; uint8_t *cache = cache_ptr; - for (col = 0; col < cm->mi_cols; col++, mi_8x8++, cache++) - cache[0] = mi_8x8[0]->mbmi.segment_id; - mi_8x8_ptr += cm->mi_stride; + for (col = 0; col < cm->mi_cols; col++, mi_4x4++, cache++) + cache[0] = mi_4x4[0]->segment_id; + mi_4x4_ptr += cm->mi_stride; cache_ptr += cm->mi_cols; } } @@ -796,12 +708,9 @@ static void alloc_raw_frame_buffers(AV1_COMP *cpi) { const AV1EncoderConfig *oxcf = &cpi->oxcf; if (!cpi->lookahead) - cpi->lookahead = av1_lookahead_init(oxcf->width, oxcf->height, - cm->subsampling_x, cm->subsampling_y, -#if 
CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - oxcf->lag_in_frames); + cpi->lookahead = av1_lookahead_init( + oxcf->width, oxcf->height, cm->subsampling_x, cm->subsampling_y, + cm->use_highbitdepth, oxcf->lag_in_frames); if (!cpi->lookahead) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate lag buffers"); @@ -809,11 +718,8 @@ static void alloc_raw_frame_buffers(AV1_COMP *cpi) { // TODO(agrange) Check if ARF is enabled and skip allocation if not. if (aom_realloc_frame_buffer(&cpi->alt_ref_buffer, oxcf->width, oxcf->height, cm->subsampling_x, cm->subsampling_y, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, - NULL, NULL)) + cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate altref buffer"); } @@ -822,84 +728,49 @@ static void alloc_util_frame_buffers(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; if (aom_realloc_frame_buffer(&cpi->last_frame_uf, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, - NULL, NULL)) + cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate last frame buffer"); -#if CONFIG_LOOP_RESTORATION - if (aom_realloc_frame_buffer(&cpi->last_frame_db, cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, - NULL, NULL)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, - "Failed to allocate last frame deblocked buffer"); if (aom_realloc_frame_buffer( - &cpi->trial_frame_rst, -#if CONFIG_FRAME_SUPERRES - cm->superres_upscaled_width, cm->superres_upscaled_height, -#else - cm->width, cm->height, -#endif // CONFIG_FRAME_SUPERRES - cm->subsampling_x, cm->subsampling_y, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL)) + &cpi->trial_frame_rst, cm->superres_upscaled_width, + cm->superres_upscaled_height, cm->subsampling_x, cm->subsampling_y, + cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, + NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate trial restored frame buffer"); - int extra_rstbuf_sz = RESTORATION_EXTBUF_SIZE; - if (extra_rstbuf_sz > 0) { - aom_free(cpi->extra_rstbuf); - CHECK_MEM_ERROR(cm, cpi->extra_rstbuf, - (uint8_t *)aom_malloc(extra_rstbuf_sz)); - } else { - cpi->extra_rstbuf = NULL; - } -#endif // CONFIG_LOOP_RESTORATION if (aom_realloc_frame_buffer(&cpi->scaled_source, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, - NULL, NULL)) + cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate scaled source buffer"); if (aom_realloc_frame_buffer(&cpi->scaled_last_source, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, - NULL, NULL)) + cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to 
allocate scaled last source buffer"); } static void alloc_compressor_data(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); av1_alloc_context_buffers(cm, cm->width, cm->height); -#if CONFIG_LV_MAP av1_alloc_txb_buf(cpi); -#endif alloc_context_buffers_ext(cpi); aom_free(cpi->tile_tok[0][0]); { - unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols); + unsigned int tokens = + get_token_alloc(cm->mb_rows, cm->mb_cols, MAX_SB_SIZE_LOG2, num_planes); CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0], aom_calloc(tokens, sizeof(*cpi->tile_tok[0][0]))); } @@ -909,18 +780,10 @@ static void alloc_compressor_data(AV1_COMP *cpi) { void av1_new_framerate(AV1_COMP *cpi, double framerate) { cpi->framerate = framerate < 0.1 ? 30 : framerate; -#if CONFIG_XIPHRC - if (!cpi->od_rc.cur_frame) return; - cpi->od_rc.framerate = cpi->framerate; - od_enc_rc_resize(&cpi->od_rc); -#else av1_rc_update_framerate(cpi, cpi->common.width, cpi->common.height); -#endif } -#if CONFIG_MAX_TILE - -static void set_tile_info_max_tile(AV1_COMP *cpi) { +static void set_tile_info(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; int i, start_sb; @@ -932,15 +795,15 @@ static void set_tile_info_max_tile(AV1_COMP *cpi) { cm->log2_tile_cols = AOMMAX(cpi->oxcf.tile_columns, cm->min_log2_tile_cols); cm->log2_tile_cols = AOMMIN(cm->log2_tile_cols, cm->max_log2_tile_cols); } else { - int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2); - int sb_cols = mi_cols >> MAX_MIB_SIZE_LOG2; + int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2); + int sb_cols = mi_cols >> cm->seq_params.mib_size_log2; int size_sb, j = 0; cm->uniform_tile_spacing_flag = 0; for (i = 0, start_sb = 0; start_sb < sb_cols && i < MAX_TILE_COLS; i++) { cm->tile_col_start_sb[i] = start_sb; size_sb = cpi->oxcf.tile_widths[j++]; if (j >= cpi->oxcf.tile_width_count) j = 0; - start_sb += AOMMIN(size_sb, MAX_TILE_WIDTH_SB); + start_sb += AOMMIN(size_sb, cm->max_tile_width_sb); } cm->tile_cols = i; cm->tile_col_start_sb[i] = sb_cols; @@ -952,8 +815,8 @@ static void set_tile_info_max_tile(AV1_COMP *cpi) { cm->log2_tile_rows = AOMMAX(cpi->oxcf.tile_rows, cm->min_log2_tile_rows); cm->log2_tile_rows = AOMMIN(cm->log2_tile_rows, cm->max_log2_tile_rows); } else { - int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2); - int sb_rows = mi_rows >> MAX_MIB_SIZE_LOG2; + int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2); + int sb_rows = mi_rows >> cm->seq_params.mib_size_log2; int size_sb, j = 0; for (i = 0, start_sb = 0; start_sb < sb_rows && i < MAX_TILE_ROWS; i++) { cm->tile_row_start_sb[i] = start_sb; @@ -967,158 +830,174 @@ static void set_tile_info_max_tile(AV1_COMP *cpi) { av1_calculate_tile_rows(cm); } -#endif - -static void set_tile_info(AV1_COMP *cpi) { - AV1_COMMON *const cm = &cpi->common; -#if CONFIG_DEPENDENT_HORZTILES - int tile_row, tile_col, num_tiles_in_tg; - int tg_row_start, tg_col_start; -#endif -#if CONFIG_EXT_TILE - if (cpi->oxcf.large_scale_tile) { -#if CONFIG_EXT_PARTITION - if (cpi->oxcf.superblock_size != AOM_SUPERBLOCK_SIZE_64X64) { - cm->tile_width = clamp(cpi->oxcf.tile_columns, 1, 32); - cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 32); - cm->tile_width <<= MAX_MIB_SIZE_LOG2; - cm->tile_height <<= MAX_MIB_SIZE_LOG2; - } else { - cm->tile_width = clamp(cpi->oxcf.tile_columns, 1, 64); - cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 64); - cm->tile_width <<= MAX_MIB_SIZE_LOG2 - 1; - cm->tile_height <<= MAX_MIB_SIZE_LOG2 - 1; - } -#else - 
cm->tile_width = clamp(cpi->oxcf.tile_columns, 1, 64); - cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 64); - cm->tile_width <<= MAX_MIB_SIZE_LOG2; - cm->tile_height <<= MAX_MIB_SIZE_LOG2; -#endif // CONFIG_EXT_PARTITION - - cm->tile_width = AOMMIN(cm->tile_width, cm->mi_cols); - cm->tile_height = AOMMIN(cm->tile_height, cm->mi_rows); - - assert(cm->tile_width >> MAX_MIB_SIZE <= 32); - assert(cm->tile_height >> MAX_MIB_SIZE <= 32); - - // Get the number of tiles - cm->tile_cols = 1; - while (cm->tile_cols * cm->tile_width < cm->mi_cols) ++cm->tile_cols; - - cm->tile_rows = 1; - while (cm->tile_rows * cm->tile_height < cm->mi_rows) ++cm->tile_rows; - } else { -#endif // CONFIG_EXT_TILE - -#if CONFIG_MAX_TILE - set_tile_info_max_tile(cpi); -#else - int min_log2_tile_cols, max_log2_tile_cols; - av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); - - cm->log2_tile_cols = - clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols); - cm->log2_tile_rows = cpi->oxcf.tile_rows; - - cm->tile_width = - get_tile_size(cm->mi_cols, cm->log2_tile_cols, &cm->tile_cols); - cm->tile_height = - get_tile_size(cm->mi_rows, cm->log2_tile_rows, &cm->tile_rows); -#endif // CONFIG_MAX_TILE -#if CONFIG_EXT_TILE - } -#endif // CONFIG_EXT_TILE - -#if CONFIG_DEPENDENT_HORZTILES - cm->dependent_horz_tiles = cpi->oxcf.dependent_horz_tiles; -#if CONFIG_EXT_TILE - if (cm->large_scale_tile) { - // May not needed since cpi->oxcf.dependent_horz_tiles is already adjusted. - cm->dependent_horz_tiles = 0; - } else { -#endif // CONFIG_EXT_TILE - if (cm->log2_tile_rows == 0) cm->dependent_horz_tiles = 0; -#if CONFIG_EXT_TILE - } -#endif // CONFIG_EXT_TILE - -#if CONFIG_EXT_TILE - if (!cm->large_scale_tile) { -#endif // CONFIG_EXT_TILE - if (cpi->oxcf.mtu == 0) { - cm->num_tg = cpi->oxcf.num_tile_groups; - } else { - // Use a default value for the purposes of weighting costs in probability - // updates - cm->num_tg = DEFAULT_MAX_NUM_TG; - } - num_tiles_in_tg = - (cm->tile_cols * cm->tile_rows + cm->num_tg - 1) / cm->num_tg; - tg_row_start = 0; - tg_col_start = 0; - for (tile_row = 0; tile_row < cm->tile_rows; ++tile_row) { - for (tile_col = 0; tile_col < cm->tile_cols; ++tile_col) { - if ((tile_row * cm->tile_cols + tile_col) % num_tiles_in_tg == 0) { - tg_row_start = tile_row; - tg_col_start = tile_col; - } - cm->tile_group_start_row[tile_row][tile_col] = tg_row_start; - cm->tile_group_start_col[tile_row][tile_col] = tg_col_start; - } - } -#if CONFIG_EXT_TILE - } -#endif // CONFIG_EXT_TILE -#endif - -#if CONFIG_LOOPFILTERING_ACROSS_TILES - cm->loop_filter_across_tiles_enabled = - cpi->oxcf.loop_filter_across_tiles_enabled; -#endif // CONFIG_LOOPFILTERING_ACROSS_TILES -} - static void update_frame_size(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; av1_set_mb_mi(cm, cm->width, cm->height); av1_init_context_buffers(cm); - av1_init_macroblockd(cm, xd, -#if CONFIG_PVQ - NULL, -#endif -#if CONFIG_CFL - &NULL_CFL, -#endif - NULL); + av1_init_macroblockd(cm, xd, NULL); memset(cpi->mbmi_ext_base, 0, cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base)); set_tile_info(cpi); } static void init_buffer_indices(AV1_COMP *cpi) { -#if CONFIG_EXT_REFS int fb_idx; - for (fb_idx = 0; fb_idx < LAST_REF_FRAMES; ++fb_idx) - cpi->lst_fb_idxes[fb_idx] = fb_idx; - cpi->gld_fb_idx = LAST_REF_FRAMES; - cpi->bwd_fb_idx = LAST_REF_FRAMES + 1; - cpi->alt2_fb_idx = LAST_REF_FRAMES + 2; - cpi->alt_fb_idx = LAST_REF_FRAMES + 3; - cpi->ext_fb_idx = LAST_REF_FRAMES + 4; + 
for (fb_idx = 0; fb_idx < REF_FRAMES; ++fb_idx) + cpi->ref_fb_idx[fb_idx] = fb_idx; for (fb_idx = 0; fb_idx < MAX_EXT_ARFS + 1; ++fb_idx) cpi->arf_map[fb_idx] = LAST_REF_FRAMES + 2 + fb_idx; -#else // !CONFIG_EXT_REFS - cpi->lst_fb_idx = 0; - cpi->gld_fb_idx = 1; - cpi->alt_fb_idx = 2; -#endif // CONFIG_EXT_REFS -#if CONFIG_AMVR cpi->rate_index = 0; cpi->rate_size = 0; cpi->cur_poc = -1; -#endif +} + +static INLINE int does_level_match(int width, int height, double fps, + int lvl_width, int lvl_height, + double lvl_fps, int lvl_dim_mult) { + const int64_t lvl_luma_pels = lvl_width * lvl_height; + const double lvl_display_sample_rate = lvl_luma_pels * lvl_fps; + const int64_t luma_pels = width * height; + const double display_sample_rate = luma_pels * fps; + return luma_pels <= lvl_luma_pels && + display_sample_rate <= lvl_display_sample_rate && + width <= lvl_width * lvl_dim_mult && + height <= lvl_height * lvl_dim_mult; +} + +static void set_bitstream_level_tier(SequenceHeader *seq, AV1_COMMON *cm, + const AV1EncoderConfig *oxcf) { + // TODO(any): This is a placeholder function that only addresses dimensions + // and max display sample rates. + // Need to add checks for max bit rate, max decoded luma sample rate, header + // rate, etc. that are not covered by this function. + (void)oxcf; + BitstreamLevel bl = { 9, 3 }; + if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, 512, + 288, 30.0, 4)) { + bl.major = 2; + bl.minor = 0; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 704, 396, 30.0, 4)) { + bl.major = 2; + bl.minor = 1; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 1088, 612, 30.0, 4)) { + bl.major = 3; + bl.minor = 0; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 1376, 774, 30.0, 4)) { + bl.major = 3; + bl.minor = 1; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 2048, 1152, 30.0, 3)) { + bl.major = 4; + bl.minor = 0; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 2048, 1152, 60.0, 3)) { + bl.major = 4; + bl.minor = 1; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 4096, 2176, 30.0, 2)) { + bl.major = 5; + bl.minor = 0; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 4096, 2176, 60.0, 2)) { + bl.major = 5; + bl.minor = 1; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 4096, 2176, 120.0, 2)) { + bl.major = 5; + bl.minor = 2; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 8192, 4352, 30.0, 2)) { + bl.major = 6; + bl.minor = 0; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 8192, 4352, 60.0, 2)) { + bl.major = 6; + bl.minor = 1; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 8192, 4352, 120.0, 2)) { + bl.major = 6; + bl.minor = 2; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 16384, 8704, 30.0, 2)) { + bl.major = 7; + bl.minor = 0; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 16384, 8704, 60.0, 2)) { + bl.major = 7; + bl.minor = 1; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 16384, 8704, 120.0, 2)) { + bl.major = 7; + bl.minor = 2; + } + for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { + seq->level[i] = bl; + seq->tier[i] = 0; // setting main tier by default + // Set the maximum parameters 
for bitrate and buffer size for this profile, + // level, and tier + cm->op_params[i].bitrate = max_level_bitrate( + cm->profile, major_minor_to_seq_level_idx(seq->level[i]), seq->tier[i]); + // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass the + // check + if (cm->op_params[i].bitrate == 0) + aom_internal_error( + &cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "AV1 does not support this combination of profile, level, and tier."); + // Buffer size in bits/s is bitrate in bits/s * 1 s + cm->op_params[i].buffer_size = cm->op_params[i].bitrate; + } +} + +static void init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm, + const AV1EncoderConfig *oxcf) { + seq->still_picture = (oxcf->limit == 1); + seq->reduced_still_picture_hdr = seq->still_picture; + seq->reduced_still_picture_hdr &= !oxcf->full_still_picture_hdr; + seq->force_screen_content_tools = 2; + seq->force_integer_mv = 2; + seq->enable_order_hint = oxcf->enable_order_hint; + seq->frame_id_numbers_present_flag = oxcf->large_scale_tile; + if (seq->still_picture && seq->reduced_still_picture_hdr) { + seq->enable_order_hint = 0; + seq->frame_id_numbers_present_flag = 0; + seq->force_screen_content_tools = 2; + seq->force_integer_mv = 2; + } + seq->order_hint_bits_minus_1 = + seq->enable_order_hint ? DEFAULT_EXPLICIT_ORDER_HINT_BITS - 1 : -1; + + seq->enable_dual_filter = oxcf->enable_dual_filter; + seq->enable_jnt_comp = oxcf->enable_jnt_comp; + seq->enable_jnt_comp &= seq->enable_order_hint; + seq->enable_ref_frame_mvs = oxcf->enable_ref_frame_mvs; + seq->enable_ref_frame_mvs &= seq->enable_order_hint; + seq->enable_superres = oxcf->enable_superres; + seq->enable_cdef = oxcf->enable_cdef; + seq->enable_restoration = oxcf->enable_restoration; + seq->enable_warped_motion = oxcf->enable_warped_motion; + seq->enable_interintra_compound = 1; + seq->enable_masked_compound = 1; + seq->enable_intra_edge_filter = 1; + seq->enable_filter_intra = 1; + + set_bitstream_level_tier(seq, cm, oxcf); + + if (seq->operating_points_cnt_minus_1 == 0) { + seq->operating_point_idc[0] = 0; + } else { + // Set operating_point_idc[] such that for the i-th operating point the + // first (operating_points_cnt-i) spatial layers and the first temporal + // layer are decoded Note that highest quality operating point should come + // first + for (int i = 0; i < seq->operating_points_cnt_minus_1 + 1; i++) + seq->operating_point_idc[i] = + (~(~0u << (seq->operating_points_cnt_minus_1 + 1 - i)) << 8) | 1; + } } static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) { @@ -1129,22 +1008,53 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) { cm->profile = oxcf->profile; cm->bit_depth = oxcf->bit_depth; -#if CONFIG_HIGHBITDEPTH cm->use_highbitdepth = oxcf->use_highbitdepth; -#endif - cm->color_space = oxcf->color_space; -#if CONFIG_COLORSPACE_HEADERS - cm->transfer_function = oxcf->transfer_function; + cm->color_primaries = oxcf->color_primaries; + cm->transfer_characteristics = oxcf->transfer_characteristics; + cm->matrix_coefficients = oxcf->matrix_coefficients; + cm->seq_params.monochrome = oxcf->monochrome; cm->chroma_sample_position = oxcf->chroma_sample_position; -#endif cm->color_range = oxcf->color_range; + cm->timing_info_present = oxcf->timing_info_present; + cm->timing_info.num_units_in_display_tick = + oxcf->timing_info.num_units_in_display_tick; + cm->timing_info.time_scale = oxcf->timing_info.time_scale; + cm->timing_info.equal_picture_interval = + oxcf->timing_info.equal_picture_interval; + 
cm->timing_info.num_ticks_per_picture = + oxcf->timing_info.num_ticks_per_picture; + + cm->seq_params.display_model_info_present_flag = + oxcf->display_model_info_present_flag; + cm->seq_params.decoder_model_info_present_flag = + oxcf->decoder_model_info_present_flag; + if (oxcf->decoder_model_info_present_flag) { + // set the decoder model parameters in schedule mode + cm->buffer_model.num_units_in_decoding_tick = + oxcf->buffer_model.num_units_in_decoding_tick; + cm->buffer_removal_delay_present = 1; + set_aom_dec_model_info(&cm->buffer_model); + set_dec_model_op_parameters(&cm->op_params[0]); + } else if (cm->timing_info_present && + cm->timing_info.equal_picture_interval && + !cm->seq_params.decoder_model_info_present_flag) { + // set the decoder model parameters in resource availability mode + set_resource_availability_parameters(&cm->op_params[0]); + } else { + cm->op_params[0].initial_display_delay = + 10; // Default value (not signaled) + } cm->width = oxcf->width; cm->height = oxcf->height; + set_sb_size(&cm->seq_params, + select_sb_size(cpi)); // set sb size before allocations alloc_compressor_data(cpi); + update_film_grain_parameters(cpi, oxcf); + // Single thread case: use counts in common. - cpi->td.counts = &cm->counts; + cpi->td.counts = &cpi->counts; // change includes all joint functionality av1_change_config(cpi, oxcf); @@ -1173,16 +1083,15 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc, (maximum == 0) ? bandwidth / 8 : maximum * bandwidth / 1000; } -#if CONFIG_HIGHBITDEPTH -#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \ +#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \ cpi->fn_ptr[BT].sdf = SDF; \ cpi->fn_ptr[BT].sdaf = SDAF; \ cpi->fn_ptr[BT].vf = VF; \ cpi->fn_ptr[BT].svf = SVF; \ cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx3f = SDX3F; \ - cpi->fn_ptr[BT].sdx8f = SDX8F; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; + cpi->fn_ptr[BT].sdx4df = SDX4DF; \ + cpi->fn_ptr[BT].jsdaf = JSDAF; \ + cpi->fn_ptr[BT].jsvaf = JSVAF; #define MAKE_BFP_SAD_WRAPPER(fnname) \ static unsigned int fnname##_bits8(const uint8_t *src_ptr, \ @@ -1220,47 +1129,6 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc, 4; \ } -#define MAKE_BFP_SAD3_WRAPPER(fnname) \ - static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - } \ - static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 3; i++) sad_array[i] >>= 2; \ - } \ - static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 3; i++) sad_array[i] >>= 4; \ - } - -#define MAKE_BFP_SAD8_WRAPPER(fnname) \ - static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - } \ - static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 8; 
i++) sad_array[i] >>= 2; \ - } \ - static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 8; i++) sad_array[i] >>= 4; \ - } #define MAKE_BFP_SAD4D_WRAPPER(fnname) \ static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ const uint8_t *const ref_ptr[], int ref_stride, \ @@ -1282,11 +1150,33 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc, for (i = 0; i < 4; i++) sad_array[i] >>= 4; \ } -#if CONFIG_EXT_PARTITION +#define MAKE_BFP_JSADAVG_WRAPPER(fnname) \ + static unsigned int fnname##_bits8( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred, \ + const JNT_COMP_PARAMS *jcp_param) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \ + jcp_param); \ + } \ + static unsigned int fnname##_bits10( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred, \ + const JNT_COMP_PARAMS *jcp_param) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \ + jcp_param) >> \ + 2; \ + } \ + static unsigned int fnname##_bits12( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred, \ + const JNT_COMP_PARAMS *jcp_param) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \ + jcp_param) >> \ + 4; \ + } + MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x128) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x128_avg) -MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad128x128x3) -MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad128x128x8) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x4d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x64) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x64_avg) @@ -1294,7 +1184,6 @@ MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x4d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x128) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x128_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x4d) -#endif // CONFIG_EXT_PARTITION MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x16) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x16_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x4d) @@ -1309,49 +1198,32 @@ MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x64_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x4d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x32) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x32_avg) -MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad32x32x3) -MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad32x32x8) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x4d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x64) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x64_avg) -MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad64x64x3) -MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad64x64x8) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x4d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x16) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x16_avg) -MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad16x16x3) -MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad16x16x8) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x4d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x8) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x8_avg) -MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad16x8x3) -MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad16x8x8) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x4d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x16) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x16_avg) -MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad8x16x3) -MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad8x16x8) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x4d) 
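
The _bits10 and _bits12 wrappers above shift each SAD right by 2 or 4 bits so that distortion computed on 10- and 12-bit samples comes back on the same scale as 8-bit SAD: a 10-bit sample difference is 4x, and a 12-bit difference 16x, the corresponding 8-bit one. A minimal sketch of that renormalization (the sample values are made up):

#include <assert.h>
#include <stdlib.h>

/* SAD over 16-bit samples, the way the highbd functions accumulate it. */
static unsigned sad(const unsigned short *a, const unsigned short *b, int n) {
  unsigned acc = 0;
  for (int i = 0; i < n; i++) acc += (unsigned)abs((int)a[i] - (int)b[i]);
  return acc;
}

int main(void) {
  /* The same picture content at 8-bit depth and at 10-bit depth (x4). */
  const unsigned short src8[4] = { 100, 200, 30, 10 };
  const unsigned short ref8[4] = { 101, 198, 40, 10 };
  unsigned short src10[4], ref10[4];
  for (int i = 0; i < 4; i++) {
    src10[i] = (unsigned short)(src8[i] * 4);
    ref10[i] = (unsigned short)(ref8[i] * 4);
  }
  /* The >> 2 in the _bits10 wrappers brings 10-bit SAD back to the
     8-bit scale (>> 4 does the same for 12-bit). */
  assert((sad(src10, ref10, 4) >> 2) == sad(src8, ref8, 4));
  return 0;
}
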
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x8) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x8_avg) -MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad8x8x3) -MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad8x8x8) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x4d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x4) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x4_avg) -MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad8x4x8) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x4d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x8) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x8_avg) -MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad4x8x8) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x4d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x4) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x4_avg) -MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad4x4x3) -MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad4x4x8) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d) -#if CONFIG_EXT_PARTITION_TYPES MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x16) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x16_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x4d) @@ -1370,15 +1242,29 @@ MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x4d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x16) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x16_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x4d) -#if CONFIG_EXT_PARTITION -MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x128) -MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x128_avg) -MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x128x4d) -MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x32) -MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x32_avg) -MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x32x4d) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES + +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad128x128_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad128x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x128_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x4_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad4x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad4x4_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad4x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x4_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x16_avg) #define HIGHBD_MBFP(BT, MCSDF, MCSVF) \ cpi->fn_ptr[BT].msdf = MCSDF; \ @@ -1409,11 +1295,9 @@ MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x32x4d) 4; \ } -#if CONFIG_EXT_PARTITION MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x128) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x64) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x128) -#endif // CONFIG_EXT_PARTITION MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x64) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x32) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x64) @@ -1427,21 +1311,13 @@ MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x8) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x4) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x8) 
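
The jnt (distance-weighted) SAD wrappers above score compound predictions that blend the two references with unequal weights rather than the plain (p0 + p1 + 1) >> 1 average; the weights travel in the JNT_COMP_PARAMS argument visible in the wrapper signature. A minimal sketch of such a blend, assuming a weight pair that sums to 16 (the (9, 7) pair below is illustrative, not a value read from the encoder):

#include <stdio.h>

/* Weighted compound average; w0 + w1 must equal 16 so the rounded
   result stays in pixel range. */
static unsigned char jnt_avg(unsigned char p0, unsigned char p1, int w0,
                             int w1) {
  return (unsigned char)((w0 * p0 + w1 * p1 + 8) >> 4);
}

int main(void) {
  /* Nearer reference weighted 9/16, farther one 7/16. */
  printf("%d\n", jnt_avg(100, 60, 9, 7)); /* (900 + 420 + 8) >> 4 = 83 */
  return 0;
}
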
MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x4) - -#if CONFIG_EXT_PARTITION_TYPES MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x16) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x4) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x32) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x8) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x64) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x16) -#if CONFIG_EXT_PARTITION -MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x128) -MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x32) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES -#if CONFIG_MOTION_VAR #define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \ cpi->fn_ptr[BT].osdf = OSDF; \ cpi->fn_ptr[BT].ovf = OVF; \ @@ -1464,11 +1340,9 @@ MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x32) return fnname(ref, ref_stride, wsrc, msk) >> 4; \ } -#if CONFIG_EXT_PARTITION MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x128) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x64) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x128) -#endif // CONFIG_EXT_PARTITION MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x64) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x32) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x64) @@ -1482,198 +1356,190 @@ MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x8) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x4) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x8) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x4) - -#if CONFIG_EXT_PARTITION_TYPES MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x16) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x4) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x32) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x8) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x64) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x16) -#if CONFIG_EXT_PARTITION -MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x128) -MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x32) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES -#endif // CONFIG_MOTION_VAR static void highbd_set_var_fns(AV1_COMP *const cpi) { AV1_COMMON *const cm = &cpi->common; if (cm->use_highbitdepth) { switch (cm->bit_depth) { case AOM_BITS_8: -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION - HIGHBD_BFP(BLOCK_128X32, aom_highbd_sad128x32_bits8, - aom_highbd_sad128x32_avg_bits8, aom_highbd_8_variance128x32, - aom_highbd_8_sub_pixel_variance128x32, - aom_highbd_8_sub_pixel_avg_variance128x32, NULL, NULL, - aom_highbd_sad128x32x4d_bits8) - - HIGHBD_BFP(BLOCK_32X128, aom_highbd_sad32x128_bits8, - aom_highbd_sad32x128_avg_bits8, aom_highbd_8_variance32x128, - aom_highbd_8_sub_pixel_variance32x128, - aom_highbd_8_sub_pixel_avg_variance32x128, NULL, NULL, - aom_highbd_sad32x128x4d_bits8) -#endif // CONFIG_EXT_PARTITION - HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits8, aom_highbd_sad64x16_avg_bits8, aom_highbd_8_variance64x16, aom_highbd_8_sub_pixel_variance64x16, - aom_highbd_8_sub_pixel_avg_variance64x16, NULL, NULL, - aom_highbd_sad64x16x4d_bits8) + aom_highbd_8_sub_pixel_avg_variance64x16, + aom_highbd_sad64x16x4d_bits8, + aom_highbd_jnt_sad64x16_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance64x16) HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits8, aom_highbd_sad16x64_avg_bits8, aom_highbd_8_variance16x64, aom_highbd_8_sub_pixel_variance16x64, - aom_highbd_8_sub_pixel_avg_variance16x64, NULL, NULL, - aom_highbd_sad16x64x4d_bits8) - - HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits8, - aom_highbd_sad32x8_avg_bits8, aom_highbd_8_variance32x8, - 
aom_highbd_8_sub_pixel_variance32x8, - aom_highbd_8_sub_pixel_avg_variance32x8, NULL, NULL, - aom_highbd_sad32x8x4d_bits8) - - HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits8, - aom_highbd_sad8x32_avg_bits8, aom_highbd_8_variance8x32, - aom_highbd_8_sub_pixel_variance8x32, - aom_highbd_8_sub_pixel_avg_variance8x32, NULL, NULL, - aom_highbd_sad8x32x4d_bits8) - - HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits8, - aom_highbd_sad16x4_avg_bits8, aom_highbd_8_variance16x4, - aom_highbd_8_sub_pixel_variance16x4, - aom_highbd_8_sub_pixel_avg_variance16x4, NULL, NULL, - aom_highbd_sad16x4x4d_bits8) - - HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits8, - aom_highbd_sad4x16_avg_bits8, aom_highbd_8_variance4x16, - aom_highbd_8_sub_pixel_variance4x16, - aom_highbd_8_sub_pixel_avg_variance4x16, NULL, NULL, - aom_highbd_sad4x16x4d_bits8) -#endif + aom_highbd_8_sub_pixel_avg_variance16x64, + aom_highbd_sad16x64x4d_bits8, + aom_highbd_jnt_sad16x64_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance16x64) + + HIGHBD_BFP( + BLOCK_32X8, aom_highbd_sad32x8_bits8, aom_highbd_sad32x8_avg_bits8, + aom_highbd_8_variance32x8, aom_highbd_8_sub_pixel_variance32x8, + aom_highbd_8_sub_pixel_avg_variance32x8, + aom_highbd_sad32x8x4d_bits8, aom_highbd_jnt_sad32x8_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance32x8) + + HIGHBD_BFP( + BLOCK_8X32, aom_highbd_sad8x32_bits8, aom_highbd_sad8x32_avg_bits8, + aom_highbd_8_variance8x32, aom_highbd_8_sub_pixel_variance8x32, + aom_highbd_8_sub_pixel_avg_variance8x32, + aom_highbd_sad8x32x4d_bits8, aom_highbd_jnt_sad8x32_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance8x32) + + HIGHBD_BFP( + BLOCK_16X4, aom_highbd_sad16x4_bits8, aom_highbd_sad16x4_avg_bits8, + aom_highbd_8_variance16x4, aom_highbd_8_sub_pixel_variance16x4, + aom_highbd_8_sub_pixel_avg_variance16x4, + aom_highbd_sad16x4x4d_bits8, aom_highbd_jnt_sad16x4_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance16x4) + + HIGHBD_BFP( + BLOCK_4X16, aom_highbd_sad4x16_bits8, aom_highbd_sad4x16_avg_bits8, + aom_highbd_8_variance4x16, aom_highbd_8_sub_pixel_variance4x16, + aom_highbd_8_sub_pixel_avg_variance4x16, + aom_highbd_sad4x16x4d_bits8, aom_highbd_jnt_sad4x16_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance4x16) HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits8, aom_highbd_sad32x16_avg_bits8, aom_highbd_8_variance32x16, aom_highbd_8_sub_pixel_variance32x16, - aom_highbd_8_sub_pixel_avg_variance32x16, NULL, NULL, - aom_highbd_sad32x16x4d_bits8) + aom_highbd_8_sub_pixel_avg_variance32x16, + aom_highbd_sad32x16x4d_bits8, + aom_highbd_jnt_sad32x16_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance32x16) HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits8, aom_highbd_sad16x32_avg_bits8, aom_highbd_8_variance16x32, aom_highbd_8_sub_pixel_variance16x32, - aom_highbd_8_sub_pixel_avg_variance16x32, NULL, NULL, - aom_highbd_sad16x32x4d_bits8) + aom_highbd_8_sub_pixel_avg_variance16x32, + aom_highbd_sad16x32x4d_bits8, + aom_highbd_jnt_sad16x32_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance16x32) HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits8, aom_highbd_sad64x32_avg_bits8, aom_highbd_8_variance64x32, aom_highbd_8_sub_pixel_variance64x32, - aom_highbd_8_sub_pixel_avg_variance64x32, NULL, NULL, - aom_highbd_sad64x32x4d_bits8) + aom_highbd_8_sub_pixel_avg_variance64x32, + aom_highbd_sad64x32x4d_bits8, + aom_highbd_jnt_sad64x32_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance64x32) HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits8, aom_highbd_sad32x64_avg_bits8, aom_highbd_8_variance32x64, 
aom_highbd_8_sub_pixel_variance32x64, - aom_highbd_8_sub_pixel_avg_variance32x64, NULL, NULL, - aom_highbd_sad32x64x4d_bits8) + aom_highbd_8_sub_pixel_avg_variance32x64, + aom_highbd_sad32x64x4d_bits8, + aom_highbd_jnt_sad32x64_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance32x64) HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits8, aom_highbd_sad32x32_avg_bits8, aom_highbd_8_variance32x32, aom_highbd_8_sub_pixel_variance32x32, aom_highbd_8_sub_pixel_avg_variance32x32, - aom_highbd_sad32x32x3_bits8, aom_highbd_sad32x32x8_bits8, - aom_highbd_sad32x32x4d_bits8) + aom_highbd_sad32x32x4d_bits8, + aom_highbd_jnt_sad32x32_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance32x32) HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits8, aom_highbd_sad64x64_avg_bits8, aom_highbd_8_variance64x64, aom_highbd_8_sub_pixel_variance64x64, aom_highbd_8_sub_pixel_avg_variance64x64, - aom_highbd_sad64x64x3_bits8, aom_highbd_sad64x64x8_bits8, - aom_highbd_sad64x64x4d_bits8) + aom_highbd_sad64x64x4d_bits8, + aom_highbd_jnt_sad64x64_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance64x64) HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits8, aom_highbd_sad16x16_avg_bits8, aom_highbd_8_variance16x16, aom_highbd_8_sub_pixel_variance16x16, aom_highbd_8_sub_pixel_avg_variance16x16, - aom_highbd_sad16x16x3_bits8, aom_highbd_sad16x16x8_bits8, - aom_highbd_sad16x16x4d_bits8) + aom_highbd_sad16x16x4d_bits8, + aom_highbd_jnt_sad16x16_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance16x16) HIGHBD_BFP( BLOCK_16X8, aom_highbd_sad16x8_bits8, aom_highbd_sad16x8_avg_bits8, aom_highbd_8_variance16x8, aom_highbd_8_sub_pixel_variance16x8, - aom_highbd_8_sub_pixel_avg_variance16x8, aom_highbd_sad16x8x3_bits8, - aom_highbd_sad16x8x8_bits8, aom_highbd_sad16x8x4d_bits8) + aom_highbd_8_sub_pixel_avg_variance16x8, + aom_highbd_sad16x8x4d_bits8, aom_highbd_jnt_sad16x8_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance16x8) HIGHBD_BFP( BLOCK_8X16, aom_highbd_sad8x16_bits8, aom_highbd_sad8x16_avg_bits8, aom_highbd_8_variance8x16, aom_highbd_8_sub_pixel_variance8x16, - aom_highbd_8_sub_pixel_avg_variance8x16, aom_highbd_sad8x16x3_bits8, - aom_highbd_sad8x16x8_bits8, aom_highbd_sad8x16x4d_bits8) + aom_highbd_8_sub_pixel_avg_variance8x16, + aom_highbd_sad8x16x4d_bits8, aom_highbd_jnt_sad8x16_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance8x16) - HIGHBD_BFP( - BLOCK_8X8, aom_highbd_sad8x8_bits8, aom_highbd_sad8x8_avg_bits8, - aom_highbd_8_variance8x8, aom_highbd_8_sub_pixel_variance8x8, - aom_highbd_8_sub_pixel_avg_variance8x8, aom_highbd_sad8x8x3_bits8, - aom_highbd_sad8x8x8_bits8, aom_highbd_sad8x8x4d_bits8) + HIGHBD_BFP(BLOCK_8X8, aom_highbd_sad8x8_bits8, + aom_highbd_sad8x8_avg_bits8, aom_highbd_8_variance8x8, + aom_highbd_8_sub_pixel_variance8x8, + aom_highbd_8_sub_pixel_avg_variance8x8, + aom_highbd_sad8x8x4d_bits8, aom_highbd_jnt_sad8x8_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance8x8) HIGHBD_BFP(BLOCK_8X4, aom_highbd_sad8x4_bits8, aom_highbd_sad8x4_avg_bits8, aom_highbd_8_variance8x4, aom_highbd_8_sub_pixel_variance8x4, - aom_highbd_8_sub_pixel_avg_variance8x4, NULL, - aom_highbd_sad8x4x8_bits8, aom_highbd_sad8x4x4d_bits8) + aom_highbd_8_sub_pixel_avg_variance8x4, + aom_highbd_sad8x4x4d_bits8, aom_highbd_jnt_sad8x4_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance8x4) HIGHBD_BFP(BLOCK_4X8, aom_highbd_sad4x8_bits8, aom_highbd_sad4x8_avg_bits8, aom_highbd_8_variance4x8, aom_highbd_8_sub_pixel_variance4x8, - aom_highbd_8_sub_pixel_avg_variance4x8, NULL, - aom_highbd_sad4x8x8_bits8, aom_highbd_sad4x8x4d_bits8) + 
aom_highbd_8_sub_pixel_avg_variance4x8, + aom_highbd_sad4x8x4d_bits8, aom_highbd_jnt_sad4x8_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance4x8) - HIGHBD_BFP( - BLOCK_4X4, aom_highbd_sad4x4_bits8, aom_highbd_sad4x4_avg_bits8, - aom_highbd_8_variance4x4, aom_highbd_8_sub_pixel_variance4x4, - aom_highbd_8_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x3_bits8, - aom_highbd_sad4x4x8_bits8, aom_highbd_sad4x4x4d_bits8) - -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - HIGHBD_BFP(BLOCK_2X2, NULL, NULL, aom_highbd_8_variance2x2, NULL, NULL, - NULL, NULL, NULL) - HIGHBD_BFP(BLOCK_4X2, NULL, NULL, aom_highbd_8_variance4x2, NULL, NULL, - NULL, NULL, NULL) - HIGHBD_BFP(BLOCK_2X4, NULL, NULL, aom_highbd_8_variance2x4, NULL, NULL, - NULL, NULL, NULL) -#endif + HIGHBD_BFP(BLOCK_4X4, aom_highbd_sad4x4_bits8, + aom_highbd_sad4x4_avg_bits8, aom_highbd_8_variance4x4, + aom_highbd_8_sub_pixel_variance4x4, + aom_highbd_8_sub_pixel_avg_variance4x4, + aom_highbd_sad4x4x4d_bits8, aom_highbd_jnt_sad4x4_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance4x4) -#if CONFIG_EXT_PARTITION - HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits8, - aom_highbd_sad128x128_avg_bits8, - aom_highbd_8_variance128x128, - aom_highbd_8_sub_pixel_variance128x128, - aom_highbd_8_sub_pixel_avg_variance128x128, - aom_highbd_sad128x128x3_bits8, aom_highbd_sad128x128x8_bits8, - aom_highbd_sad128x128x4d_bits8) + HIGHBD_BFP( + BLOCK_128X128, aom_highbd_sad128x128_bits8, + aom_highbd_sad128x128_avg_bits8, aom_highbd_8_variance128x128, + aom_highbd_8_sub_pixel_variance128x128, + aom_highbd_8_sub_pixel_avg_variance128x128, + aom_highbd_sad128x128x4d_bits8, aom_highbd_jnt_sad128x128_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance128x128) HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits8, aom_highbd_sad128x64_avg_bits8, aom_highbd_8_variance128x64, aom_highbd_8_sub_pixel_variance128x64, - aom_highbd_8_sub_pixel_avg_variance128x64, NULL, NULL, - aom_highbd_sad128x64x4d_bits8) + aom_highbd_8_sub_pixel_avg_variance128x64, + aom_highbd_sad128x64x4d_bits8, + aom_highbd_jnt_sad128x64_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance128x64) HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits8, aom_highbd_sad64x128_avg_bits8, aom_highbd_8_variance64x128, aom_highbd_8_sub_pixel_variance64x128, - aom_highbd_8_sub_pixel_avg_variance64x128, NULL, NULL, - aom_highbd_sad64x128x4d_bits8) -#endif // CONFIG_EXT_PARTITION + aom_highbd_8_sub_pixel_avg_variance64x128, + aom_highbd_sad64x128x4d_bits8, + aom_highbd_jnt_sad64x128_avg_bits8, + aom_highbd_8_jnt_sub_pixel_avg_variance64x128) -#if CONFIG_EXT_PARTITION HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits8, aom_highbd_8_masked_sub_pixel_variance128x128) HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits8, aom_highbd_8_masked_sub_pixel_variance128x64) HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits8, aom_highbd_8_masked_sub_pixel_variance64x128) -#endif // CONFIG_EXT_PARTITION HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits8, aom_highbd_8_masked_sub_pixel_variance64x64) HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits8, @@ -1700,35 +1566,18 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { aom_highbd_8_masked_sub_pixel_variance8x4) HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits8, aom_highbd_8_masked_sub_pixel_variance4x4) -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION - HIGHBD_MBFP(BLOCK_128X32, aom_highbd_masked_sad128x32_bits8, - aom_highbd_8_masked_sub_pixel_variance128x32) - - HIGHBD_MBFP(BLOCK_32X128, 
aom_highbd_masked_sad32x128_bits8, - aom_highbd_8_masked_sub_pixel_variance32x128) -#endif // CONFIG_EXT_PARTITION - HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits8, aom_highbd_8_masked_sub_pixel_variance64x16) - HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits8, aom_highbd_8_masked_sub_pixel_variance16x64) - HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits8, aom_highbd_8_masked_sub_pixel_variance32x8) - HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits8, aom_highbd_8_masked_sub_pixel_variance8x32) - HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits8, aom_highbd_8_masked_sub_pixel_variance16x4) - HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits8, aom_highbd_8_masked_sub_pixel_variance4x16) -#endif -#if CONFIG_MOTION_VAR -#if CONFIG_EXT_PARTITION HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits8, aom_highbd_obmc_variance128x128, aom_highbd_obmc_sub_pixel_variance128x128) @@ -1738,7 +1587,6 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits8, aom_highbd_obmc_variance64x128, aom_highbd_obmc_sub_pixel_variance64x128) -#endif // CONFIG_EXT_PARTITION HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits8, aom_highbd_obmc_variance64x64, aom_highbd_obmc_sub_pixel_variance64x64) @@ -1778,223 +1626,206 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits8, aom_highbd_obmc_variance4x4, aom_highbd_obmc_sub_pixel_variance4x4) -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION - HIGHBD_OBFP(BLOCK_128X32, aom_highbd_obmc_sad128x32_bits8, - aom_highbd_obmc_variance128x32, - aom_highbd_obmc_sub_pixel_variance128x32) - - HIGHBD_OBFP(BLOCK_32X128, aom_highbd_obmc_sad32x128_bits8, - aom_highbd_obmc_variance32x128, - aom_highbd_obmc_sub_pixel_variance32x128) -#endif // CONFIG_EXT_PARTITION - HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits8, aom_highbd_obmc_variance64x16, aom_highbd_obmc_sub_pixel_variance64x16) - HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits8, aom_highbd_obmc_variance16x64, aom_highbd_obmc_sub_pixel_variance16x64) - HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits8, aom_highbd_obmc_variance32x8, aom_highbd_obmc_sub_pixel_variance32x8) - HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits8, aom_highbd_obmc_variance8x32, aom_highbd_obmc_sub_pixel_variance8x32) - HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits8, aom_highbd_obmc_variance16x4, aom_highbd_obmc_sub_pixel_variance16x4) - HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits8, aom_highbd_obmc_variance4x16, aom_highbd_obmc_sub_pixel_variance4x16) -#endif -#endif // CONFIG_MOTION_VAR break; case AOM_BITS_10: -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION - HIGHBD_BFP(BLOCK_128X32, aom_highbd_sad128x32_bits10, - aom_highbd_sad128x32_avg_bits10, - aom_highbd_10_variance128x32, - aom_highbd_10_sub_pixel_variance128x32, - aom_highbd_10_sub_pixel_avg_variance128x32, NULL, NULL, - aom_highbd_sad128x32x4d_bits10) - - HIGHBD_BFP(BLOCK_32X128, aom_highbd_sad32x128_bits10, - aom_highbd_sad32x128_avg_bits10, - aom_highbd_10_variance32x128, - aom_highbd_10_sub_pixel_variance32x128, - aom_highbd_10_sub_pixel_avg_variance32x128, NULL, NULL, - aom_highbd_sad32x128x4d_bits10) -#endif // CONFIG_EXT_PARTITION - HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits10, aom_highbd_sad64x16_avg_bits10, aom_highbd_10_variance64x16, aom_highbd_10_sub_pixel_variance64x16, - aom_highbd_10_sub_pixel_avg_variance64x16, NULL, NULL, - 
aom_highbd_sad64x16x4d_bits10) + aom_highbd_10_sub_pixel_avg_variance64x16, + aom_highbd_sad64x16x4d_bits10, + aom_highbd_jnt_sad64x16_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance64x16); HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits10, aom_highbd_sad16x64_avg_bits10, aom_highbd_10_variance16x64, aom_highbd_10_sub_pixel_variance16x64, - aom_highbd_10_sub_pixel_avg_variance16x64, NULL, NULL, - aom_highbd_sad16x64x4d_bits10) + aom_highbd_10_sub_pixel_avg_variance16x64, + aom_highbd_sad16x64x4d_bits10, + aom_highbd_jnt_sad16x64_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance16x64); HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits10, aom_highbd_sad32x8_avg_bits10, aom_highbd_10_variance32x8, aom_highbd_10_sub_pixel_variance32x8, - aom_highbd_10_sub_pixel_avg_variance32x8, NULL, NULL, - aom_highbd_sad32x8x4d_bits10) + aom_highbd_10_sub_pixel_avg_variance32x8, + aom_highbd_sad32x8x4d_bits10, + aom_highbd_jnt_sad32x8_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance32x8); HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits10, aom_highbd_sad8x32_avg_bits10, aom_highbd_10_variance8x32, aom_highbd_10_sub_pixel_variance8x32, - aom_highbd_10_sub_pixel_avg_variance8x32, NULL, NULL, - aom_highbd_sad8x32x4d_bits10) + aom_highbd_10_sub_pixel_avg_variance8x32, + aom_highbd_sad8x32x4d_bits10, + aom_highbd_jnt_sad8x32_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance8x32); HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits10, aom_highbd_sad16x4_avg_bits10, aom_highbd_10_variance16x4, aom_highbd_10_sub_pixel_variance16x4, - aom_highbd_10_sub_pixel_avg_variance16x4, NULL, NULL, - aom_highbd_sad16x4x4d_bits10) + aom_highbd_10_sub_pixel_avg_variance16x4, + aom_highbd_sad16x4x4d_bits10, + aom_highbd_jnt_sad16x4_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance16x4); HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits10, aom_highbd_sad4x16_avg_bits10, aom_highbd_10_variance4x16, aom_highbd_10_sub_pixel_variance4x16, - aom_highbd_10_sub_pixel_avg_variance4x16, NULL, NULL, - aom_highbd_sad4x16x4d_bits10) -#endif + aom_highbd_10_sub_pixel_avg_variance4x16, + aom_highbd_sad4x16x4d_bits10, + aom_highbd_jnt_sad4x16_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance4x16); HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits10, aom_highbd_sad32x16_avg_bits10, aom_highbd_10_variance32x16, aom_highbd_10_sub_pixel_variance32x16, - aom_highbd_10_sub_pixel_avg_variance32x16, NULL, NULL, - aom_highbd_sad32x16x4d_bits10) + aom_highbd_10_sub_pixel_avg_variance32x16, + aom_highbd_sad32x16x4d_bits10, + aom_highbd_jnt_sad32x16_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance32x16); HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits10, aom_highbd_sad16x32_avg_bits10, aom_highbd_10_variance16x32, aom_highbd_10_sub_pixel_variance16x32, - aom_highbd_10_sub_pixel_avg_variance16x32, NULL, NULL, - aom_highbd_sad16x32x4d_bits10) + aom_highbd_10_sub_pixel_avg_variance16x32, + aom_highbd_sad16x32x4d_bits10, + aom_highbd_jnt_sad16x32_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance16x32); HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits10, aom_highbd_sad64x32_avg_bits10, aom_highbd_10_variance64x32, aom_highbd_10_sub_pixel_variance64x32, - aom_highbd_10_sub_pixel_avg_variance64x32, NULL, NULL, - aom_highbd_sad64x32x4d_bits10) + aom_highbd_10_sub_pixel_avg_variance64x32, + aom_highbd_sad64x32x4d_bits10, + aom_highbd_jnt_sad64x32_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance64x32); HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits10, aom_highbd_sad32x64_avg_bits10, aom_highbd_10_variance32x64, 
aom_highbd_10_sub_pixel_variance32x64, - aom_highbd_10_sub_pixel_avg_variance32x64, NULL, NULL, - aom_highbd_sad32x64x4d_bits10) + aom_highbd_10_sub_pixel_avg_variance32x64, + aom_highbd_sad32x64x4d_bits10, + aom_highbd_jnt_sad32x64_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance32x64); HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits10, aom_highbd_sad32x32_avg_bits10, aom_highbd_10_variance32x32, aom_highbd_10_sub_pixel_variance32x32, aom_highbd_10_sub_pixel_avg_variance32x32, - aom_highbd_sad32x32x3_bits10, aom_highbd_sad32x32x8_bits10, - aom_highbd_sad32x32x4d_bits10) + aom_highbd_sad32x32x4d_bits10, + aom_highbd_jnt_sad32x32_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance32x32); HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits10, aom_highbd_sad64x64_avg_bits10, aom_highbd_10_variance64x64, aom_highbd_10_sub_pixel_variance64x64, aom_highbd_10_sub_pixel_avg_variance64x64, - aom_highbd_sad64x64x3_bits10, aom_highbd_sad64x64x8_bits10, - aom_highbd_sad64x64x4d_bits10) + aom_highbd_sad64x64x4d_bits10, + aom_highbd_jnt_sad64x64_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance64x64); HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits10, aom_highbd_sad16x16_avg_bits10, aom_highbd_10_variance16x16, aom_highbd_10_sub_pixel_variance16x16, aom_highbd_10_sub_pixel_avg_variance16x16, - aom_highbd_sad16x16x3_bits10, aom_highbd_sad16x16x8_bits10, - aom_highbd_sad16x16x4d_bits10) + aom_highbd_sad16x16x4d_bits10, + aom_highbd_jnt_sad16x16_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance16x16); HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits10, aom_highbd_sad16x8_avg_bits10, aom_highbd_10_variance16x8, aom_highbd_10_sub_pixel_variance16x8, aom_highbd_10_sub_pixel_avg_variance16x8, - aom_highbd_sad16x8x3_bits10, aom_highbd_sad16x8x8_bits10, - aom_highbd_sad16x8x4d_bits10) + aom_highbd_sad16x8x4d_bits10, + aom_highbd_jnt_sad16x8_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance16x8); HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits10, aom_highbd_sad8x16_avg_bits10, aom_highbd_10_variance8x16, aom_highbd_10_sub_pixel_variance8x16, aom_highbd_10_sub_pixel_avg_variance8x16, - aom_highbd_sad8x16x3_bits10, aom_highbd_sad8x16x8_bits10, - aom_highbd_sad8x16x4d_bits10) + aom_highbd_sad8x16x4d_bits10, + aom_highbd_jnt_sad8x16_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance8x16); HIGHBD_BFP( BLOCK_8X8, aom_highbd_sad8x8_bits10, aom_highbd_sad8x8_avg_bits10, aom_highbd_10_variance8x8, aom_highbd_10_sub_pixel_variance8x8, - aom_highbd_10_sub_pixel_avg_variance8x8, aom_highbd_sad8x8x3_bits10, - aom_highbd_sad8x8x8_bits10, aom_highbd_sad8x8x4d_bits10) + aom_highbd_10_sub_pixel_avg_variance8x8, + aom_highbd_sad8x8x4d_bits10, aom_highbd_jnt_sad8x8_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance8x8); - HIGHBD_BFP(BLOCK_8X4, aom_highbd_sad8x4_bits10, - aom_highbd_sad8x4_avg_bits10, aom_highbd_10_variance8x4, - aom_highbd_10_sub_pixel_variance8x4, - aom_highbd_10_sub_pixel_avg_variance8x4, NULL, - aom_highbd_sad8x4x8_bits10, aom_highbd_sad8x4x4d_bits10) + HIGHBD_BFP( + BLOCK_8X4, aom_highbd_sad8x4_bits10, aom_highbd_sad8x4_avg_bits10, + aom_highbd_10_variance8x4, aom_highbd_10_sub_pixel_variance8x4, + aom_highbd_10_sub_pixel_avg_variance8x4, + aom_highbd_sad8x4x4d_bits10, aom_highbd_jnt_sad8x4_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance8x4); - HIGHBD_BFP(BLOCK_4X8, aom_highbd_sad4x8_bits10, - aom_highbd_sad4x8_avg_bits10, aom_highbd_10_variance4x8, - aom_highbd_10_sub_pixel_variance4x8, - aom_highbd_10_sub_pixel_avg_variance4x8, NULL, - aom_highbd_sad4x8x8_bits10, 
aom_highbd_sad4x8x4d_bits10) + HIGHBD_BFP( + BLOCK_4X8, aom_highbd_sad4x8_bits10, aom_highbd_sad4x8_avg_bits10, + aom_highbd_10_variance4x8, aom_highbd_10_sub_pixel_variance4x8, + aom_highbd_10_sub_pixel_avg_variance4x8, + aom_highbd_sad4x8x4d_bits10, aom_highbd_jnt_sad4x8_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance4x8); HIGHBD_BFP( BLOCK_4X4, aom_highbd_sad4x4_bits10, aom_highbd_sad4x4_avg_bits10, aom_highbd_10_variance4x4, aom_highbd_10_sub_pixel_variance4x4, - aom_highbd_10_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x3_bits10, - aom_highbd_sad4x4x8_bits10, aom_highbd_sad4x4x4d_bits10) - -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - HIGHBD_BFP(BLOCK_2X2, NULL, NULL, aom_highbd_10_variance2x2, NULL, NULL, - NULL, NULL, NULL) - HIGHBD_BFP(BLOCK_4X2, NULL, NULL, aom_highbd_10_variance4x2, NULL, NULL, - NULL, NULL, NULL) - HIGHBD_BFP(BLOCK_2X4, NULL, NULL, aom_highbd_10_variance2x4, NULL, NULL, - NULL, NULL, NULL) -#endif + aom_highbd_10_sub_pixel_avg_variance4x4, + aom_highbd_sad4x4x4d_bits10, aom_highbd_jnt_sad4x4_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance4x4); + + HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits10, + aom_highbd_sad128x128_avg_bits10, + aom_highbd_10_variance128x128, + aom_highbd_10_sub_pixel_variance128x128, + aom_highbd_10_sub_pixel_avg_variance128x128, + aom_highbd_sad128x128x4d_bits10, + aom_highbd_jnt_sad128x128_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance128x128); -#if CONFIG_EXT_PARTITION HIGHBD_BFP( - BLOCK_128X128, aom_highbd_sad128x128_bits10, - aom_highbd_sad128x128_avg_bits10, aom_highbd_10_variance128x128, - aom_highbd_10_sub_pixel_variance128x128, - aom_highbd_10_sub_pixel_avg_variance128x128, - aom_highbd_sad128x128x3_bits10, aom_highbd_sad128x128x8_bits10, - aom_highbd_sad128x128x4d_bits10) - - HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits10, - aom_highbd_sad128x64_avg_bits10, - aom_highbd_10_variance128x64, - aom_highbd_10_sub_pixel_variance128x64, - aom_highbd_10_sub_pixel_avg_variance128x64, NULL, NULL, - aom_highbd_sad128x64x4d_bits10) - - HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits10, - aom_highbd_sad64x128_avg_bits10, - aom_highbd_10_variance64x128, - aom_highbd_10_sub_pixel_variance64x128, - aom_highbd_10_sub_pixel_avg_variance64x128, NULL, NULL, - aom_highbd_sad64x128x4d_bits10) -#endif // CONFIG_EXT_PARTITION - -#if CONFIG_EXT_PARTITION + BLOCK_128X64, aom_highbd_sad128x64_bits10, + aom_highbd_sad128x64_avg_bits10, aom_highbd_10_variance128x64, + aom_highbd_10_sub_pixel_variance128x64, + aom_highbd_10_sub_pixel_avg_variance128x64, + aom_highbd_sad128x64x4d_bits10, aom_highbd_jnt_sad128x64_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance128x64); + + HIGHBD_BFP( + BLOCK_64X128, aom_highbd_sad64x128_bits10, + aom_highbd_sad64x128_avg_bits10, aom_highbd_10_variance64x128, + aom_highbd_10_sub_pixel_variance64x128, + aom_highbd_10_sub_pixel_avg_variance64x128, + aom_highbd_sad64x128x4d_bits10, aom_highbd_jnt_sad64x128_avg_bits10, + aom_highbd_10_jnt_sub_pixel_avg_variance64x128); + HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits10, aom_highbd_10_masked_sub_pixel_variance128x128) HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits10, aom_highbd_10_masked_sub_pixel_variance128x64) HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits10, aom_highbd_10_masked_sub_pixel_variance64x128) -#endif // CONFIG_EXT_PARTITION HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits10, aom_highbd_10_masked_sub_pixel_variance64x64) HIGHBD_MBFP(BLOCK_64X32, 
aom_highbd_masked_sad64x32_bits10, @@ -2021,35 +1852,18 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { aom_highbd_10_masked_sub_pixel_variance8x4) HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits10, aom_highbd_10_masked_sub_pixel_variance4x4) -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION - HIGHBD_MBFP(BLOCK_128X32, aom_highbd_masked_sad128x32_bits10, - aom_highbd_10_masked_sub_pixel_variance128x32) - - HIGHBD_MBFP(BLOCK_32X128, aom_highbd_masked_sad32x128_bits10, - aom_highbd_10_masked_sub_pixel_variance32x128) -#endif // CONFIG_EXT_PARTITION - HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits10, aom_highbd_10_masked_sub_pixel_variance64x16) - HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits10, aom_highbd_10_masked_sub_pixel_variance16x64) - HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits10, aom_highbd_10_masked_sub_pixel_variance32x8) - HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits10, aom_highbd_10_masked_sub_pixel_variance8x32) - HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits10, aom_highbd_10_masked_sub_pixel_variance16x4) - HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits10, aom_highbd_10_masked_sub_pixel_variance4x16) -#endif -#if CONFIG_MOTION_VAR -#if CONFIG_EXT_PARTITION HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits10, aom_highbd_10_obmc_variance128x128, aom_highbd_10_obmc_sub_pixel_variance128x128) @@ -2059,7 +1873,6 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits10, aom_highbd_10_obmc_variance64x128, aom_highbd_10_obmc_sub_pixel_variance64x128) -#endif // CONFIG_EXT_PARTITION HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits10, aom_highbd_10_obmc_variance64x64, aom_highbd_10_obmc_sub_pixel_variance64x64) @@ -2099,16 +1912,6 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits10, aom_highbd_10_obmc_variance4x4, aom_highbd_10_obmc_sub_pixel_variance4x4) -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION - HIGHBD_OBFP(BLOCK_128X32, aom_highbd_obmc_sad128x32_bits10, - aom_highbd_10_obmc_variance128x32, - aom_highbd_10_obmc_sub_pixel_variance128x32) - - HIGHBD_OBFP(BLOCK_32X128, aom_highbd_obmc_sad32x128_bits10, - aom_highbd_10_obmc_variance32x128, - aom_highbd_10_obmc_sub_pixel_variance32x128) -#endif // CONFIG_EXT_PARTITION HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits10, aom_highbd_10_obmc_variance64x16, @@ -2133,189 +1936,188 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits10, aom_highbd_10_obmc_variance4x16, aom_highbd_10_obmc_sub_pixel_variance4x16) -#endif -#endif // CONFIG_MOTION_VAR break; case AOM_BITS_12: -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION - HIGHBD_BFP(BLOCK_128X32, aom_highbd_sad128x32_bits12, - aom_highbd_sad128x32_avg_bits12, - aom_highbd_12_variance128x32, - aom_highbd_12_sub_pixel_variance128x32, - aom_highbd_12_sub_pixel_avg_variance128x32, NULL, NULL, - aom_highbd_sad128x32x4d_bits12) - - HIGHBD_BFP(BLOCK_32X128, aom_highbd_sad32x128_bits12, - aom_highbd_sad32x128_avg_bits12, - aom_highbd_12_variance32x128, - aom_highbd_12_sub_pixel_variance32x128, - aom_highbd_12_sub_pixel_avg_variance32x128, NULL, NULL, - aom_highbd_sad32x128x4d_bits12) -#endif // CONFIG_EXT_PARTITION - HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits12, aom_highbd_sad64x16_avg_bits12, aom_highbd_12_variance64x16, aom_highbd_12_sub_pixel_variance64x16, - 
aom_highbd_12_sub_pixel_avg_variance64x16, NULL, NULL, - aom_highbd_sad64x16x4d_bits12) + aom_highbd_12_sub_pixel_avg_variance64x16, + aom_highbd_sad64x16x4d_bits12, + aom_highbd_jnt_sad64x16_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance64x16); HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits12, aom_highbd_sad16x64_avg_bits12, aom_highbd_12_variance16x64, aom_highbd_12_sub_pixel_variance16x64, - aom_highbd_12_sub_pixel_avg_variance16x64, NULL, NULL, - aom_highbd_sad16x64x4d_bits12) + aom_highbd_12_sub_pixel_avg_variance16x64, + aom_highbd_sad16x64x4d_bits12, + aom_highbd_jnt_sad16x64_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance16x64); HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits12, aom_highbd_sad32x8_avg_bits12, aom_highbd_12_variance32x8, aom_highbd_12_sub_pixel_variance32x8, - aom_highbd_12_sub_pixel_avg_variance32x8, NULL, NULL, - aom_highbd_sad32x8x4d_bits12) + aom_highbd_12_sub_pixel_avg_variance32x8, + aom_highbd_sad32x8x4d_bits12, + aom_highbd_jnt_sad32x8_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance32x8); HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits12, aom_highbd_sad8x32_avg_bits12, aom_highbd_12_variance8x32, aom_highbd_12_sub_pixel_variance8x32, - aom_highbd_12_sub_pixel_avg_variance8x32, NULL, NULL, - aom_highbd_sad8x32x4d_bits12) + aom_highbd_12_sub_pixel_avg_variance8x32, + aom_highbd_sad8x32x4d_bits12, + aom_highbd_jnt_sad8x32_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance8x32); HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits12, aom_highbd_sad16x4_avg_bits12, aom_highbd_12_variance16x4, aom_highbd_12_sub_pixel_variance16x4, - aom_highbd_12_sub_pixel_avg_variance16x4, NULL, NULL, - aom_highbd_sad16x4x4d_bits12) + aom_highbd_12_sub_pixel_avg_variance16x4, + aom_highbd_sad16x4x4d_bits12, + aom_highbd_jnt_sad16x4_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance16x4); HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits12, aom_highbd_sad4x16_avg_bits12, aom_highbd_12_variance4x16, aom_highbd_12_sub_pixel_variance4x16, - aom_highbd_12_sub_pixel_avg_variance4x16, NULL, NULL, - aom_highbd_sad4x16x4d_bits12) -#endif + aom_highbd_12_sub_pixel_avg_variance4x16, + aom_highbd_sad4x16x4d_bits12, + aom_highbd_jnt_sad4x16_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance4x16); HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits12, aom_highbd_sad32x16_avg_bits12, aom_highbd_12_variance32x16, aom_highbd_12_sub_pixel_variance32x16, - aom_highbd_12_sub_pixel_avg_variance32x16, NULL, NULL, - aom_highbd_sad32x16x4d_bits12) + aom_highbd_12_sub_pixel_avg_variance32x16, + aom_highbd_sad32x16x4d_bits12, + aom_highbd_jnt_sad32x16_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance32x16); HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits12, aom_highbd_sad16x32_avg_bits12, aom_highbd_12_variance16x32, aom_highbd_12_sub_pixel_variance16x32, - aom_highbd_12_sub_pixel_avg_variance16x32, NULL, NULL, - aom_highbd_sad16x32x4d_bits12) + aom_highbd_12_sub_pixel_avg_variance16x32, + aom_highbd_sad16x32x4d_bits12, + aom_highbd_jnt_sad16x32_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance16x32); HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits12, aom_highbd_sad64x32_avg_bits12, aom_highbd_12_variance64x32, aom_highbd_12_sub_pixel_variance64x32, - aom_highbd_12_sub_pixel_avg_variance64x32, NULL, NULL, - aom_highbd_sad64x32x4d_bits12) + aom_highbd_12_sub_pixel_avg_variance64x32, + aom_highbd_sad64x32x4d_bits12, + aom_highbd_jnt_sad64x32_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance64x32); HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits12, 
aom_highbd_sad32x64_avg_bits12, aom_highbd_12_variance32x64, aom_highbd_12_sub_pixel_variance32x64, - aom_highbd_12_sub_pixel_avg_variance32x64, NULL, NULL, - aom_highbd_sad32x64x4d_bits12) + aom_highbd_12_sub_pixel_avg_variance32x64, + aom_highbd_sad32x64x4d_bits12, + aom_highbd_jnt_sad32x64_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance32x64); HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits12, aom_highbd_sad32x32_avg_bits12, aom_highbd_12_variance32x32, aom_highbd_12_sub_pixel_variance32x32, aom_highbd_12_sub_pixel_avg_variance32x32, - aom_highbd_sad32x32x3_bits12, aom_highbd_sad32x32x8_bits12, - aom_highbd_sad32x32x4d_bits12) + aom_highbd_sad32x32x4d_bits12, + aom_highbd_jnt_sad32x32_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance32x32); HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits12, aom_highbd_sad64x64_avg_bits12, aom_highbd_12_variance64x64, aom_highbd_12_sub_pixel_variance64x64, aom_highbd_12_sub_pixel_avg_variance64x64, - aom_highbd_sad64x64x3_bits12, aom_highbd_sad64x64x8_bits12, - aom_highbd_sad64x64x4d_bits12) + aom_highbd_sad64x64x4d_bits12, + aom_highbd_jnt_sad64x64_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance64x64); HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits12, aom_highbd_sad16x16_avg_bits12, aom_highbd_12_variance16x16, aom_highbd_12_sub_pixel_variance16x16, aom_highbd_12_sub_pixel_avg_variance16x16, - aom_highbd_sad16x16x3_bits12, aom_highbd_sad16x16x8_bits12, - aom_highbd_sad16x16x4d_bits12) + aom_highbd_sad16x16x4d_bits12, + aom_highbd_jnt_sad16x16_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance16x16); HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits12, aom_highbd_sad16x8_avg_bits12, aom_highbd_12_variance16x8, aom_highbd_12_sub_pixel_variance16x8, aom_highbd_12_sub_pixel_avg_variance16x8, - aom_highbd_sad16x8x3_bits12, aom_highbd_sad16x8x8_bits12, - aom_highbd_sad16x8x4d_bits12) + aom_highbd_sad16x8x4d_bits12, + aom_highbd_jnt_sad16x8_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance16x8); HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits12, aom_highbd_sad8x16_avg_bits12, aom_highbd_12_variance8x16, aom_highbd_12_sub_pixel_variance8x16, aom_highbd_12_sub_pixel_avg_variance8x16, - aom_highbd_sad8x16x3_bits12, aom_highbd_sad8x16x8_bits12, - aom_highbd_sad8x16x4d_bits12) + aom_highbd_sad8x16x4d_bits12, + aom_highbd_jnt_sad8x16_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance8x16); HIGHBD_BFP( BLOCK_8X8, aom_highbd_sad8x8_bits12, aom_highbd_sad8x8_avg_bits12, aom_highbd_12_variance8x8, aom_highbd_12_sub_pixel_variance8x8, - aom_highbd_12_sub_pixel_avg_variance8x8, aom_highbd_sad8x8x3_bits12, - aom_highbd_sad8x8x8_bits12, aom_highbd_sad8x8x4d_bits12) + aom_highbd_12_sub_pixel_avg_variance8x8, + aom_highbd_sad8x8x4d_bits12, aom_highbd_jnt_sad8x8_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance8x8); - HIGHBD_BFP(BLOCK_8X4, aom_highbd_sad8x4_bits12, - aom_highbd_sad8x4_avg_bits12, aom_highbd_12_variance8x4, - aom_highbd_12_sub_pixel_variance8x4, - aom_highbd_12_sub_pixel_avg_variance8x4, NULL, - aom_highbd_sad8x4x8_bits12, aom_highbd_sad8x4x4d_bits12) + HIGHBD_BFP( + BLOCK_8X4, aom_highbd_sad8x4_bits12, aom_highbd_sad8x4_avg_bits12, + aom_highbd_12_variance8x4, aom_highbd_12_sub_pixel_variance8x4, + aom_highbd_12_sub_pixel_avg_variance8x4, + aom_highbd_sad8x4x4d_bits12, aom_highbd_jnt_sad8x4_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance8x4); - HIGHBD_BFP(BLOCK_4X8, aom_highbd_sad4x8_bits12, - aom_highbd_sad4x8_avg_bits12, aom_highbd_12_variance4x8, - aom_highbd_12_sub_pixel_variance4x8, - 
aom_highbd_12_sub_pixel_avg_variance4x8, NULL, - aom_highbd_sad4x8x8_bits12, aom_highbd_sad4x8x4d_bits12) + HIGHBD_BFP( + BLOCK_4X8, aom_highbd_sad4x8_bits12, aom_highbd_sad4x8_avg_bits12, + aom_highbd_12_variance4x8, aom_highbd_12_sub_pixel_variance4x8, + aom_highbd_12_sub_pixel_avg_variance4x8, + aom_highbd_sad4x8x4d_bits12, aom_highbd_jnt_sad4x8_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance4x8); HIGHBD_BFP( BLOCK_4X4, aom_highbd_sad4x4_bits12, aom_highbd_sad4x4_avg_bits12, aom_highbd_12_variance4x4, aom_highbd_12_sub_pixel_variance4x4, - aom_highbd_12_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x3_bits12, - aom_highbd_sad4x4x8_bits12, aom_highbd_sad4x4x4d_bits12) - -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - HIGHBD_BFP(BLOCK_2X2, NULL, NULL, aom_highbd_12_variance2x2, NULL, NULL, - NULL, NULL, NULL) - HIGHBD_BFP(BLOCK_4X2, NULL, NULL, aom_highbd_12_variance4x2, NULL, NULL, - NULL, NULL, NULL) - HIGHBD_BFP(BLOCK_2X4, NULL, NULL, aom_highbd_12_variance2x4, NULL, NULL, - NULL, NULL, NULL) -#endif + aom_highbd_12_sub_pixel_avg_variance4x4, + aom_highbd_sad4x4x4d_bits12, aom_highbd_jnt_sad4x4_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance4x4); + + HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits12, + aom_highbd_sad128x128_avg_bits12, + aom_highbd_12_variance128x128, + aom_highbd_12_sub_pixel_variance128x128, + aom_highbd_12_sub_pixel_avg_variance128x128, + aom_highbd_sad128x128x4d_bits12, + aom_highbd_jnt_sad128x128_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance128x128); + + HIGHBD_BFP( + BLOCK_128X64, aom_highbd_sad128x64_bits12, + aom_highbd_sad128x64_avg_bits12, aom_highbd_12_variance128x64, + aom_highbd_12_sub_pixel_variance128x64, + aom_highbd_12_sub_pixel_avg_variance128x64, + aom_highbd_sad128x64x4d_bits12, aom_highbd_jnt_sad128x64_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance128x64); -#if CONFIG_EXT_PARTITION HIGHBD_BFP( - BLOCK_128X128, aom_highbd_sad128x128_bits12, - aom_highbd_sad128x128_avg_bits12, aom_highbd_12_variance128x128, - aom_highbd_12_sub_pixel_variance128x128, - aom_highbd_12_sub_pixel_avg_variance128x128, - aom_highbd_sad128x128x3_bits12, aom_highbd_sad128x128x8_bits12, - aom_highbd_sad128x128x4d_bits12) - - HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits12, - aom_highbd_sad128x64_avg_bits12, - aom_highbd_12_variance128x64, - aom_highbd_12_sub_pixel_variance128x64, - aom_highbd_12_sub_pixel_avg_variance128x64, NULL, NULL, - aom_highbd_sad128x64x4d_bits12) - - HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits12, - aom_highbd_sad64x128_avg_bits12, - aom_highbd_12_variance64x128, - aom_highbd_12_sub_pixel_variance64x128, - aom_highbd_12_sub_pixel_avg_variance64x128, NULL, NULL, - aom_highbd_sad64x128x4d_bits12) -#endif // CONFIG_EXT_PARTITION - -#if CONFIG_EXT_PARTITION + BLOCK_64X128, aom_highbd_sad64x128_bits12, + aom_highbd_sad64x128_avg_bits12, aom_highbd_12_variance64x128, + aom_highbd_12_sub_pixel_variance64x128, + aom_highbd_12_sub_pixel_avg_variance64x128, + aom_highbd_sad64x128x4d_bits12, aom_highbd_jnt_sad64x128_avg_bits12, + aom_highbd_12_jnt_sub_pixel_avg_variance64x128); + HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits12, aom_highbd_12_masked_sub_pixel_variance128x128) HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits12, aom_highbd_12_masked_sub_pixel_variance128x64) HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits12, aom_highbd_12_masked_sub_pixel_variance64x128) -#endif // CONFIG_EXT_PARTITION HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits12, 
aom_highbd_12_masked_sub_pixel_variance64x64) HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits12, @@ -2342,36 +2144,18 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { aom_highbd_12_masked_sub_pixel_variance8x4) HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits12, aom_highbd_12_masked_sub_pixel_variance4x4) -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION - HIGHBD_MBFP(BLOCK_128X32, aom_highbd_masked_sad128x32_bits12, - aom_highbd_12_masked_sub_pixel_variance128x32) - - HIGHBD_MBFP(BLOCK_32X128, aom_highbd_masked_sad32x128_bits12, - aom_highbd_12_masked_sub_pixel_variance32x128) -#endif // CONFIG_EXT_PARTITION - HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits12, aom_highbd_12_masked_sub_pixel_variance64x16) - HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits12, aom_highbd_12_masked_sub_pixel_variance16x64) - HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits12, aom_highbd_12_masked_sub_pixel_variance32x8) - HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits12, aom_highbd_12_masked_sub_pixel_variance8x32) - HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits12, aom_highbd_12_masked_sub_pixel_variance16x4) - HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits12, aom_highbd_12_masked_sub_pixel_variance4x16) -#endif - -#if CONFIG_MOTION_VAR -#if CONFIG_EXT_PARTITION HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits12, aom_highbd_12_obmc_variance128x128, aom_highbd_12_obmc_sub_pixel_variance128x128) @@ -2381,7 +2165,6 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits12, aom_highbd_12_obmc_variance64x128, aom_highbd_12_obmc_sub_pixel_variance64x128) -#endif // CONFIG_EXT_PARTITION HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits12, aom_highbd_12_obmc_variance64x64, aom_highbd_12_obmc_sub_pixel_variance64x64) @@ -2421,42 +2204,24 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits12, aom_highbd_12_obmc_variance4x4, aom_highbd_12_obmc_sub_pixel_variance4x4) -#if CONFIG_EXT_PARTITION_TYPES -#if CONFIG_EXT_PARTITION - HIGHBD_OBFP(BLOCK_128X32, aom_highbd_obmc_sad128x32_bits12, - aom_highbd_12_obmc_variance128x32, - aom_highbd_12_obmc_sub_pixel_variance128x32) - - HIGHBD_OBFP(BLOCK_32X128, aom_highbd_obmc_sad32x128_bits12, - aom_highbd_12_obmc_variance32x128, - aom_highbd_12_obmc_sub_pixel_variance32x128) -#endif // CONFIG_EXT_PARTITION - HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits12, aom_highbd_12_obmc_variance64x16, aom_highbd_12_obmc_sub_pixel_variance64x16) - HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits12, aom_highbd_12_obmc_variance16x64, aom_highbd_12_obmc_sub_pixel_variance16x64) - HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits12, aom_highbd_12_obmc_variance32x8, aom_highbd_12_obmc_sub_pixel_variance32x8) - HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits12, aom_highbd_12_obmc_variance8x32, aom_highbd_12_obmc_sub_pixel_variance8x32) - HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits12, aom_highbd_12_obmc_variance16x4, aom_highbd_12_obmc_sub_pixel_variance16x4) - HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits12, aom_highbd_12_obmc_variance4x16, aom_highbd_12_obmc_sub_pixel_variance4x16) -#endif -#endif // CONFIG_MOTION_VAR break; default: @@ -2466,7 +2231,6 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { } } } -#endif // CONFIG_HIGHBITDEPTH static void realloc_segmentation_maps(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; @@ -2487,40 +2251,59 @@ 
static void realloc_segmentation_maps(AV1_COMP *cpi) { aom_calloc(cm->mi_rows * cm->mi_cols, 1)); } -void set_compound_tools(AV1_COMMON *cm) { - (void)cm; -#if CONFIG_INTERINTRA - cm->allow_interintra_compound = 1; -#endif // CONFIG_INTERINTRA -#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT - cm->allow_masked_compound = 1; -#endif // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT -} - void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); RATE_CONTROL *const rc = &cpi->rc; MACROBLOCK *const x = &cpi->td.mb; if (cm->profile != oxcf->profile) cm->profile = oxcf->profile; cm->bit_depth = oxcf->bit_depth; - cm->color_space = oxcf->color_space; -#if CONFIG_COLORSPACE_HEADERS - cm->transfer_function = oxcf->transfer_function; + cm->color_primaries = oxcf->color_primaries; + cm->transfer_characteristics = oxcf->transfer_characteristics; + cm->matrix_coefficients = oxcf->matrix_coefficients; + cm->seq_params.monochrome = oxcf->monochrome; cm->chroma_sample_position = oxcf->chroma_sample_position; -#endif cm->color_range = oxcf->color_range; - if (cm->profile <= PROFILE_1) - assert(cm->bit_depth == AOM_BITS_8); - else - assert(cm->bit_depth > AOM_BITS_8); + assert(IMPLIES(cm->profile <= PROFILE_1, cm->bit_depth <= AOM_BITS_10)); + + cm->timing_info_present = oxcf->timing_info_present; + cm->timing_info.num_units_in_display_tick = + oxcf->timing_info.num_units_in_display_tick; + cm->timing_info.time_scale = oxcf->timing_info.time_scale; + cm->timing_info.equal_picture_interval = + oxcf->timing_info.equal_picture_interval; + cm->timing_info.num_ticks_per_picture = + oxcf->timing_info.num_ticks_per_picture; + + cm->seq_params.display_model_info_present_flag = + oxcf->display_model_info_present_flag; + cm->seq_params.decoder_model_info_present_flag = + oxcf->decoder_model_info_present_flag; + if (oxcf->decoder_model_info_present_flag) { + // set the decoder model parameters in schedule mode + cm->buffer_model.num_units_in_decoding_tick = + oxcf->buffer_model.num_units_in_decoding_tick; + cm->buffer_removal_delay_present = 1; + set_aom_dec_model_info(&cm->buffer_model); + set_dec_model_op_parameters(&cm->op_params[0]); + } else if (cm->timing_info_present && + cm->timing_info.equal_picture_interval && + !cm->seq_params.decoder_model_info_present_flag) { + // set the decoder model parameters in resource availability mode + set_resource_availability_parameters(&cm->op_params[0]); + } else { + cm->op_params[0].initial_display_delay = + 10; // Default value (not signaled) + } + + update_film_grain_parameters(cpi, oxcf); cpi->oxcf = *oxcf; + cpi->common.options = oxcf->cfg; x->e_mbd.bd = (int)cm->bit_depth; -#if CONFIG_GLOBAL_MOTION x->e_mbd.global_motion = cm->global_motion; -#endif // CONFIG_GLOBAL_MOTION if ((oxcf->pass == 0) && (oxcf->rc_mode == AOM_Q)) { rc->baseline_gf_interval = FIXED_GF_INTERVAL; @@ -2530,30 +2313,21 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { cpi->refresh_last_frame = 1; cpi->refresh_golden_frame = 0; -#if CONFIG_EXT_REFS cpi->refresh_bwd_ref_frame = 0; cpi->refresh_alt2_ref_frame = 0; -#endif // CONFIG_EXT_REFS - - cm->refresh_frame_context = - (oxcf->error_resilient_mode || oxcf->frame_parallel_decoding_mode) - ? REFRESH_FRAME_CONTEXT_FORWARD - : REFRESH_FRAME_CONTEXT_BACKWARD; -#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING - cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE; -#endif + + cm->refresh_frame_context = (oxcf->frame_parallel_decoding_mode) + ? 
REFRESH_FRAME_CONTEXT_DISABLED + : REFRESH_FRAME_CONTEXT_BACKWARD; + if (oxcf->large_scale_tile) + cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; if (x->palette_buffer == NULL) { CHECK_MEM_ERROR(cm, x->palette_buffer, aom_memalign(16, sizeof(*x->palette_buffer))); } - set_compound_tools(cm); av1_reset_segment_features(cm); -#if CONFIG_AMVR - set_high_precision_mv(cpi, 0, 0); -#else - set_high_precision_mv(cpi, 0); -#endif + set_high_precision_mv(cpi, 1, 0); set_rc_buffer_sizes(rc, &cpi->oxcf); @@ -2569,7 +2343,12 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { rc->worst_quality = cpi->oxcf.worst_allowed_q; rc->best_quality = cpi->oxcf.best_allowed_q; - cm->interp_filter = cpi->sf.default_interp_filter; + if (!oxcf->large_scale_tile) + cm->interp_filter = cpi->sf.default_interp_filter; + else + cm->interp_filter = EIGHTTAP_REGULAR; + + cm->switchable_motion_mode = 1; if (cpi->oxcf.render_width > 0 && cpi->oxcf.render_height > 0) { cm->render_width = cpi->oxcf.render_width; @@ -2581,10 +2360,17 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { cm->width = cpi->oxcf.width; cm->height = cpi->oxcf.height; - if (cpi->initial_width) { - if (cm->width > cpi->initial_width || cm->height > cpi->initial_height) { + int sb_size = cm->seq_params.sb_size; + // Superblock size should not be updated after the first key frame. + if (!cpi->seq_params_locked) { + set_sb_size(&cm->seq_params, select_sb_size(cpi)); + } + + if (cpi->initial_width || sb_size != cm->seq_params.sb_size) { + if (cm->width > cpi->initial_width || cm->height > cpi->initial_height || + cm->seq_params.sb_size != sb_size) { av1_free_context_buffers(cm); - av1_free_pc_tree(&cpi->td); + av1_free_pc_tree(&cpi->td, num_planes); alloc_compressor_data(cpi); realloc_segmentation_maps(cpi); cpi->initial_width = cpi->initial_height = 0; @@ -2595,32 +2381,24 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { cpi->alt_ref_source = NULL; rc->is_src_frame_alt_ref = 0; -#if CONFIG_EXT_REFS rc->is_bwd_ref_frame = 0; rc->is_last_bipred_frame = 0; rc->is_bipred_frame = 0; -#endif // CONFIG_EXT_REFS - -#if 0 - // Experimental RD Code - cpi->frame_distortion = 0; - cpi->last_frame_distortion = 0; -#endif set_tile_info(cpi); cpi->ext_refresh_frame_flags_pending = 0; cpi->ext_refresh_frame_context_pending = 0; -#if CONFIG_HIGHBITDEPTH highbd_set_var_fns(cpi); -#endif -#if CONFIG_ANS && ANS_MAX_SYMBOLS - cpi->common.ans_window_size_log2 = cpi->oxcf.ans_window_size_log2; -#endif // CONFIG_ANS && ANS_MAX_SYMBOLS -#if CONFIG_AMVR - cm->seq_mv_precision_level = 2; -#endif + + // Init sequence level coding tools + // This should not be called after the first key frame. + if (!cpi->seq_params_locked) { + cm->seq_params.operating_points_cnt_minus_1 = + cm->number_spatial_layers > 1 ? 
cm->number_spatial_layers - 1 : 0; + init_seq_coding_tools(&cm->seq_params, cm, oxcf); + } } AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, @@ -2644,10 +2422,6 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, cm->free_mi = enc_free_mi; cm->setup_mi = enc_setup_mi; -#if CONFIG_NCOBMC_ADAPT_WEIGHT - get_default_ncobmc_kernels(cm); -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT - CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc))); CHECK_MEM_ERROR(cm, cm->frame_contexts, @@ -2663,38 +2437,18 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, cpi->common.buffer_pool = pool; init_config(cpi, oxcf); -#if CONFIG_XIPHRC - cpi->od_rc.framerate = cpi->framerate; - cpi->od_rc.frame_width = cm->render_width; - cpi->od_rc.frame_height = cm->render_height; - cpi->od_rc.keyframe_rate = oxcf->key_freq; - cpi->od_rc.goldenframe_rate = FIXED_GF_INTERVAL; - cpi->od_rc.altref_rate = 25; - cpi->od_rc.firstpass_quant = 1; - cpi->od_rc.bit_depth = cm->bit_depth; - cpi->od_rc.minq = oxcf->best_allowed_q; - cpi->od_rc.maxq = oxcf->worst_allowed_q; - if (cpi->oxcf.rc_mode == AOM_CQ) cpi->od_rc.minq = cpi->od_rc.quality; - cpi->od_rc.quality = cpi->oxcf.rc_mode == AOM_Q ? oxcf->cq_level : -1; - cpi->od_rc.periodic_boosts = oxcf->frame_periodic_boost; - od_enc_rc_init(&cpi->od_rc, - cpi->oxcf.rc_mode == AOM_Q ? -1 : oxcf->target_bandwidth, - oxcf->maximum_buffer_size_ms); -#else av1_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc); -#endif cm->current_video_frame = 0; + cpi->seq_params_locked = 0; cpi->partition_search_skippable_frame = 0; cpi->tile_data = NULL; cpi->last_show_frame_buf_idx = INVALID_IDX; realloc_segmentation_maps(cpi); - for (i = 0; i < NMV_CONTEXTS; ++i) { - memset(cpi->nmv_costs, 0, sizeof(cpi->nmv_costs)); - memset(cpi->nmv_costs_hp, 0, sizeof(cpi->nmv_costs_hp)); - } + memset(cpi->nmv_costs, 0, sizeof(cpi->nmv_costs)); + memset(cpi->nmv_costs_hp, 0, sizeof(cpi->nmv_costs_hp)); for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0])); i++) { @@ -2715,7 +2469,6 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, #endif cpi->refresh_alt_ref_frame = 0; - cpi->multi_arf_last_grp_enabled = 0; cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; #if CONFIG_INTERNAL_STATS @@ -2753,17 +2506,14 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, #endif #if CONFIG_ENTROPY_STATS av1_zero(aggregate_fc); - av1_zero_array(aggregate_fc_per_type, FRAME_CONTEXTS); #endif // CONFIG_ENTROPY_STATS cpi->first_time_stamp_ever = INT64_MAX; - for (i = 0; i < NMV_CONTEXTS; ++i) { - cpi->td.mb.nmvcost[i][0] = &cpi->nmv_costs[i][0][MV_MAX]; - cpi->td.mb.nmvcost[i][1] = &cpi->nmv_costs[i][1][MV_MAX]; - cpi->td.mb.nmvcost_hp[i][0] = &cpi->nmv_costs_hp[i][0][MV_MAX]; - cpi->td.mb.nmvcost_hp[i][1] = &cpi->nmv_costs_hp[i][1][MV_MAX]; - } + cpi->td.mb.nmvcost[0] = &cpi->nmv_costs[0][MV_MAX]; + cpi->td.mb.nmvcost[1] = &cpi->nmv_costs[1][MV_MAX]; + cpi->td.mb.nmvcost_hp[0] = &cpi->nmv_costs_hp[0][MV_MAX]; + cpi->td.mb.nmvcost_hp[1] = &cpi->nmv_costs_hp[1][MV_MAX]; #ifdef OUTPUT_YUV_SKINMAP yuv_skinmap_file = fopen("skinmap.yuv", "ab"); @@ -2772,17 +2522,6 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, yuv_rec_file = fopen("rec.yuv", "wb"); #endif -#if 0 - framepsnr = fopen("framepsnr.stt", "a"); - kf_list = fopen("kf_list.stt", "w"); -#endif - -#if CONFIG_XIPHRC - if (oxcf->pass == 2) { - cpi->od_rc.twopass_allframes_buf = oxcf->two_pass_stats_in.buf; - cpi->od_rc.twopass_allframes_buf_size = oxcf->two_pass_stats_in.sz; - } -#else if (oxcf->pass == 1) { 
av1_init_first_pass(cpi); } else if (oxcf->pass == 2) { @@ -2808,24 +2547,15 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, av1_init_second_pass(cpi); } -#endif -#if CONFIG_MOTION_VAR -#if CONFIG_HIGHBITDEPTH - int buf_scaler = 2; -#else - int buf_scaler = 1; -#endif CHECK_MEM_ERROR( cm, cpi->td.mb.above_pred_buf, - (uint8_t *)aom_memalign(16, - buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE * - sizeof(*cpi->td.mb.above_pred_buf))); + (uint8_t *)aom_memalign(16, MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*cpi->td.mb.above_pred_buf))); CHECK_MEM_ERROR( cm, cpi->td.mb.left_pred_buf, - (uint8_t *)aom_memalign(16, - buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE * - sizeof(*cpi->td.mb.left_pred_buf))); + (uint8_t *)aom_memalign(16, MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*cpi->td.mb.left_pred_buf))); CHECK_MEM_ERROR(cm, cpi->td.mb.wsrc_buf, (int32_t *)aom_memalign( @@ -2835,143 +2565,130 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, (int32_t *)aom_memalign( 16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.mask_buf))); -#endif - av1_set_speed_features_framesize_independent(cpi); av1_set_speed_features_framesize_dependent(cpi); -#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \ +#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \ cpi->fn_ptr[BT].sdf = SDF; \ cpi->fn_ptr[BT].sdaf = SDAF; \ cpi->fn_ptr[BT].vf = VF; \ cpi->fn_ptr[BT].svf = SVF; \ cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx3f = SDX3F; \ - cpi->fn_ptr[BT].sdx8f = SDX8F; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; + cpi->fn_ptr[BT].sdx4df = SDX4DF; \ + cpi->fn_ptr[BT].jsdaf = JSDAF; \ + cpi->fn_ptr[BT].jsvaf = JSVAF; -#if CONFIG_EXT_PARTITION_TYPES BFP(BLOCK_4X16, aom_sad4x16, aom_sad4x16_avg, aom_variance4x16, - aom_sub_pixel_variance4x16, aom_sub_pixel_avg_variance4x16, NULL, NULL, - aom_sad4x16x4d) + aom_sub_pixel_variance4x16, aom_sub_pixel_avg_variance4x16, + aom_sad4x16x4d, aom_jnt_sad4x16_avg, aom_jnt_sub_pixel_avg_variance4x16) BFP(BLOCK_16X4, aom_sad16x4, aom_sad16x4_avg, aom_variance16x4, - aom_sub_pixel_variance16x4, aom_sub_pixel_avg_variance16x4, NULL, NULL, - aom_sad16x4x4d) + aom_sub_pixel_variance16x4, aom_sub_pixel_avg_variance16x4, + aom_sad16x4x4d, aom_jnt_sad16x4_avg, aom_jnt_sub_pixel_avg_variance16x4) BFP(BLOCK_8X32, aom_sad8x32, aom_sad8x32_avg, aom_variance8x32, - aom_sub_pixel_variance8x32, aom_sub_pixel_avg_variance8x32, NULL, NULL, - aom_sad8x32x4d) + aom_sub_pixel_variance8x32, aom_sub_pixel_avg_variance8x32, + aom_sad8x32x4d, aom_jnt_sad8x32_avg, aom_jnt_sub_pixel_avg_variance8x32) BFP(BLOCK_32X8, aom_sad32x8, aom_sad32x8_avg, aom_variance32x8, - aom_sub_pixel_variance32x8, aom_sub_pixel_avg_variance32x8, NULL, NULL, - aom_sad32x8x4d) + aom_sub_pixel_variance32x8, aom_sub_pixel_avg_variance32x8, + aom_sad32x8x4d, aom_jnt_sad32x8_avg, aom_jnt_sub_pixel_avg_variance32x8) BFP(BLOCK_16X64, aom_sad16x64, aom_sad16x64_avg, aom_variance16x64, - aom_sub_pixel_variance16x64, aom_sub_pixel_avg_variance16x64, NULL, NULL, - aom_sad16x64x4d) + aom_sub_pixel_variance16x64, aom_sub_pixel_avg_variance16x64, + aom_sad16x64x4d, aom_jnt_sad16x64_avg, + aom_jnt_sub_pixel_avg_variance16x64) BFP(BLOCK_64X16, aom_sad64x16, aom_sad64x16_avg, aom_variance64x16, - aom_sub_pixel_variance64x16, aom_sub_pixel_avg_variance64x16, NULL, NULL, - aom_sad64x16x4d) + aom_sub_pixel_variance64x16, aom_sub_pixel_avg_variance64x16, + aom_sad64x16x4d, aom_jnt_sad64x16_avg, + aom_jnt_sub_pixel_avg_variance64x16) -#if CONFIG_EXT_PARTITION - BFP(BLOCK_32X128, aom_sad32x128, aom_sad32x128_avg, aom_variance32x128, - 
aom_sub_pixel_variance32x128, aom_sub_pixel_avg_variance32x128, NULL, - NULL, aom_sad32x128x4d) - - BFP(BLOCK_128X32, aom_sad128x32, aom_sad128x32_avg, aom_variance128x32, - aom_sub_pixel_variance128x32, aom_sub_pixel_avg_variance128x32, NULL, - NULL, aom_sad128x32x4d) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES - -#if CONFIG_EXT_PARTITION BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128, aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128, - aom_sad128x128x3, aom_sad128x128x8, aom_sad128x128x4d) + aom_sad128x128x4d, aom_jnt_sad128x128_avg, + aom_jnt_sub_pixel_avg_variance128x128) BFP(BLOCK_128X64, aom_sad128x64, aom_sad128x64_avg, aom_variance128x64, - aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64, NULL, - NULL, aom_sad128x64x4d) + aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64, + aom_sad128x64x4d, aom_jnt_sad128x64_avg, + aom_jnt_sub_pixel_avg_variance128x64) BFP(BLOCK_64X128, aom_sad64x128, aom_sad64x128_avg, aom_variance64x128, - aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128, NULL, - NULL, aom_sad64x128x4d) -#endif // CONFIG_EXT_PARTITION + aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128, + aom_sad64x128x4d, aom_jnt_sad64x128_avg, + aom_jnt_sub_pixel_avg_variance64x128) BFP(BLOCK_32X16, aom_sad32x16, aom_sad32x16_avg, aom_variance32x16, - aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16, NULL, NULL, - aom_sad32x16x4d) + aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16, + aom_sad32x16x4d, aom_jnt_sad32x16_avg, + aom_jnt_sub_pixel_avg_variance32x16) BFP(BLOCK_16X32, aom_sad16x32, aom_sad16x32_avg, aom_variance16x32, - aom_sub_pixel_variance16x32, aom_sub_pixel_avg_variance16x32, NULL, NULL, - aom_sad16x32x4d) + aom_sub_pixel_variance16x32, aom_sub_pixel_avg_variance16x32, + aom_sad16x32x4d, aom_jnt_sad16x32_avg, + aom_jnt_sub_pixel_avg_variance16x32) BFP(BLOCK_64X32, aom_sad64x32, aom_sad64x32_avg, aom_variance64x32, - aom_sub_pixel_variance64x32, aom_sub_pixel_avg_variance64x32, NULL, NULL, - aom_sad64x32x4d) + aom_sub_pixel_variance64x32, aom_sub_pixel_avg_variance64x32, + aom_sad64x32x4d, aom_jnt_sad64x32_avg, + aom_jnt_sub_pixel_avg_variance64x32) BFP(BLOCK_32X64, aom_sad32x64, aom_sad32x64_avg, aom_variance32x64, - aom_sub_pixel_variance32x64, aom_sub_pixel_avg_variance32x64, NULL, NULL, - aom_sad32x64x4d) + aom_sub_pixel_variance32x64, aom_sub_pixel_avg_variance32x64, + aom_sad32x64x4d, aom_jnt_sad32x64_avg, + aom_jnt_sub_pixel_avg_variance32x64) BFP(BLOCK_32X32, aom_sad32x32, aom_sad32x32_avg, aom_variance32x32, aom_sub_pixel_variance32x32, aom_sub_pixel_avg_variance32x32, - aom_sad32x32x3, aom_sad32x32x8, aom_sad32x32x4d) + aom_sad32x32x4d, aom_jnt_sad32x32_avg, + aom_jnt_sub_pixel_avg_variance32x32) BFP(BLOCK_64X64, aom_sad64x64, aom_sad64x64_avg, aom_variance64x64, aom_sub_pixel_variance64x64, aom_sub_pixel_avg_variance64x64, - aom_sad64x64x3, aom_sad64x64x8, aom_sad64x64x4d) + aom_sad64x64x4d, aom_jnt_sad64x64_avg, + aom_jnt_sub_pixel_avg_variance64x64) BFP(BLOCK_16X16, aom_sad16x16, aom_sad16x16_avg, aom_variance16x16, aom_sub_pixel_variance16x16, aom_sub_pixel_avg_variance16x16, - aom_sad16x16x3, aom_sad16x16x8, aom_sad16x16x4d) + aom_sad16x16x4d, aom_jnt_sad16x16_avg, + aom_jnt_sub_pixel_avg_variance16x16) BFP(BLOCK_16X8, aom_sad16x8, aom_sad16x8_avg, aom_variance16x8, - aom_sub_pixel_variance16x8, aom_sub_pixel_avg_variance16x8, aom_sad16x8x3, - aom_sad16x8x8, aom_sad16x8x4d) + aom_sub_pixel_variance16x8, 
aom_sub_pixel_avg_variance16x8, + aom_sad16x8x4d, aom_jnt_sad16x8_avg, aom_jnt_sub_pixel_avg_variance16x8) BFP(BLOCK_8X16, aom_sad8x16, aom_sad8x16_avg, aom_variance8x16, - aom_sub_pixel_variance8x16, aom_sub_pixel_avg_variance8x16, aom_sad8x16x3, - aom_sad8x16x8, aom_sad8x16x4d) + aom_sub_pixel_variance8x16, aom_sub_pixel_avg_variance8x16, + aom_sad8x16x4d, aom_jnt_sad8x16_avg, aom_jnt_sub_pixel_avg_variance8x16) BFP(BLOCK_8X8, aom_sad8x8, aom_sad8x8_avg, aom_variance8x8, - aom_sub_pixel_variance8x8, aom_sub_pixel_avg_variance8x8, aom_sad8x8x3, - aom_sad8x8x8, aom_sad8x8x4d) + aom_sub_pixel_variance8x8, aom_sub_pixel_avg_variance8x8, aom_sad8x8x4d, + aom_jnt_sad8x8_avg, aom_jnt_sub_pixel_avg_variance8x8) BFP(BLOCK_8X4, aom_sad8x4, aom_sad8x4_avg, aom_variance8x4, - aom_sub_pixel_variance8x4, aom_sub_pixel_avg_variance8x4, NULL, - aom_sad8x4x8, aom_sad8x4x4d) + aom_sub_pixel_variance8x4, aom_sub_pixel_avg_variance8x4, aom_sad8x4x4d, + aom_jnt_sad8x4_avg, aom_jnt_sub_pixel_avg_variance8x4) BFP(BLOCK_4X8, aom_sad4x8, aom_sad4x8_avg, aom_variance4x8, - aom_sub_pixel_variance4x8, aom_sub_pixel_avg_variance4x8, NULL, - aom_sad4x8x8, aom_sad4x8x4d) + aom_sub_pixel_variance4x8, aom_sub_pixel_avg_variance4x8, aom_sad4x8x4d, + aom_jnt_sad4x8_avg, aom_jnt_sub_pixel_avg_variance4x8) BFP(BLOCK_4X4, aom_sad4x4, aom_sad4x4_avg, aom_variance4x4, - aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x3, - aom_sad4x4x8, aom_sad4x4x4d) - -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - BFP(BLOCK_2X2, NULL, NULL, aom_variance2x2, NULL, NULL, NULL, NULL, NULL) - BFP(BLOCK_2X4, NULL, NULL, aom_variance2x4, NULL, NULL, NULL, NULL, NULL) - BFP(BLOCK_4X2, NULL, NULL, aom_variance4x2, NULL, NULL, NULL, NULL, NULL) -#endif + aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x4d, + aom_jnt_sad4x4_avg, aom_jnt_sub_pixel_avg_variance4x4) -#if CONFIG_MOTION_VAR #define OBFP(BT, OSDF, OVF, OSVF) \ cpi->fn_ptr[BT].osdf = OSDF; \ cpi->fn_ptr[BT].ovf = OVF; \ cpi->fn_ptr[BT].osvf = OSVF; -#if CONFIG_EXT_PARTITION OBFP(BLOCK_128X128, aom_obmc_sad128x128, aom_obmc_variance128x128, aom_obmc_sub_pixel_variance128x128) OBFP(BLOCK_128X64, aom_obmc_sad128x64, aom_obmc_variance128x64, aom_obmc_sub_pixel_variance128x64) OBFP(BLOCK_64X128, aom_obmc_sad64x128, aom_obmc_variance64x128, aom_obmc_sub_pixel_variance64x128) -#endif // CONFIG_EXT_PARTITION OBFP(BLOCK_64X64, aom_obmc_sad64x64, aom_obmc_variance64x64, aom_obmc_sub_pixel_variance64x64) OBFP(BLOCK_64X32, aom_obmc_sad64x32, aom_obmc_variance64x32, @@ -2998,46 +2715,27 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, aom_obmc_sub_pixel_variance8x4) OBFP(BLOCK_4X4, aom_obmc_sad4x4, aom_obmc_variance4x4, aom_obmc_sub_pixel_variance4x4) - -#if CONFIG_EXT_PARTITION_TYPES OBFP(BLOCK_4X16, aom_obmc_sad4x16, aom_obmc_variance4x16, aom_obmc_sub_pixel_variance4x16) - OBFP(BLOCK_16X4, aom_obmc_sad16x4, aom_obmc_variance16x4, aom_obmc_sub_pixel_variance16x4) - OBFP(BLOCK_8X32, aom_obmc_sad8x32, aom_obmc_variance8x32, aom_obmc_sub_pixel_variance8x32) - OBFP(BLOCK_32X8, aom_obmc_sad32x8, aom_obmc_variance32x8, aom_obmc_sub_pixel_variance32x8) - OBFP(BLOCK_16X64, aom_obmc_sad16x64, aom_obmc_variance16x64, aom_obmc_sub_pixel_variance16x64) - OBFP(BLOCK_64X16, aom_obmc_sad64x16, aom_obmc_variance64x16, aom_obmc_sub_pixel_variance64x16) -#if CONFIG_EXT_PARTITION - OBFP(BLOCK_32X128, aom_obmc_sad32x128, aom_obmc_variance32x128, - aom_obmc_sub_pixel_variance32x128) - - OBFP(BLOCK_128X32, aom_obmc_sad128x32, aom_obmc_variance128x32, - 
aom_obmc_sub_pixel_variance128x32) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES -#endif // CONFIG_MOTION_VAR - #define MBFP(BT, MCSDF, MCSVF) \ cpi->fn_ptr[BT].msdf = MCSDF; \ cpi->fn_ptr[BT].msvf = MCSVF; -#if CONFIG_EXT_PARTITION MBFP(BLOCK_128X128, aom_masked_sad128x128, aom_masked_sub_pixel_variance128x128) MBFP(BLOCK_128X64, aom_masked_sad128x64, aom_masked_sub_pixel_variance128x64) MBFP(BLOCK_64X128, aom_masked_sad64x128, aom_masked_sub_pixel_variance64x128) -#endif // CONFIG_EXT_PARTITION MBFP(BLOCK_64X64, aom_masked_sad64x64, aom_masked_sub_pixel_variance64x64) MBFP(BLOCK_64X32, aom_masked_sad64x32, aom_masked_sub_pixel_variance64x32) MBFP(BLOCK_32X64, aom_masked_sad32x64, aom_masked_sub_pixel_variance32x64) @@ -3052,7 +2750,6 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, MBFP(BLOCK_8X4, aom_masked_sad8x4, aom_masked_sub_pixel_variance8x4) MBFP(BLOCK_4X4, aom_masked_sad4x4, aom_masked_sub_pixel_variance4x4) -#if CONFIG_EXT_PARTITION_TYPES MBFP(BLOCK_4X16, aom_masked_sad4x16, aom_masked_sub_pixel_variance4x16) MBFP(BLOCK_16X4, aom_masked_sad16x4, aom_masked_sub_pixel_variance16x4) @@ -3065,16 +2762,7 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, MBFP(BLOCK_64X16, aom_masked_sad64x16, aom_masked_sub_pixel_variance64x16) -#if CONFIG_EXT_PARTITION - MBFP(BLOCK_32X128, aom_masked_sad32x128, aom_masked_sub_pixel_variance32x128) - - MBFP(BLOCK_128X32, aom_masked_sad128x32, aom_masked_sub_pixel_variance128x32) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES - -#if CONFIG_HIGHBITDEPTH highbd_set_var_fns(cpi); -#endif /* av1_init_quantizer() is first called here. Add check in * av1_frame_init_quantizer() so that av1_init_quantizer is only @@ -3082,29 +2770,25 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, * av1_init_quantizer() for every frame. 
*/ av1_init_quantizer(cpi); -#if CONFIG_AOM_QM - aom_qm_init(cm); -#endif + av1_qm_init(cm); av1_loop_filter_init(cm); -#if CONFIG_FRAME_SUPERRES cm->superres_scale_denominator = SCALE_NUMERATOR; cm->superres_upscaled_width = oxcf->width; cm->superres_upscaled_height = oxcf->height; -#endif // CONFIG_FRAME_SUPERRES -#if CONFIG_LOOP_RESTORATION av1_loop_restoration_precal(); -#endif // CONFIG_LOOP_RESTORATION cm->error.setjmp = 0; return cpi; } +#if CONFIG_INTERNAL_STATS #define SNPRINT(H, T) snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T)) #define SNPRINT2(H, T, V) \ snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V)) +#endif // CONFIG_INTERNAL_STATS void av1_remove_compressor(AV1_COMP *cpi) { AV1_COMMON *cm; @@ -3114,14 +2798,14 @@ void av1_remove_compressor(AV1_COMP *cpi) { if (!cpi) return; cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + if (cm->current_video_frame > 0) { #if CONFIG_ENTROPY_STATS if (cpi->oxcf.pass != 1) { fprintf(stderr, "Writing counts.stt\n"); FILE *f = fopen("counts.stt", "wb"); fwrite(&aggregate_fc, sizeof(aggregate_fc), 1, f); - fwrite(aggregate_fc_per_type, sizeof(aggregate_fc_per_type[0]), - FRAME_CONTEXTS, f); fclose(f); } #endif // CONFIG_ENTROPY_STATS @@ -3151,16 +2835,21 @@ void av1_remove_compressor(AV1_COMP *cpi) { snprintf(headings, sizeof(headings), "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t" "AOMSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t" - "WstPsnr\tWstSsim\tWstFast\tWstHVS"); + "WstPsnr\tWstSsim\tWstFast\tWstHVS\t" + "AVPsrnY\tAPsnrCb\tAPsnrCr"); snprintf(results, sizeof(results), "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t" "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" - "%7.3f\t%7.3f\t%7.3f\t%7.3f", - dr, cpi->psnr.stat[ALL] / cpi->count, total_psnr, - cpi->psnr.stat[ALL] / cpi->count, total_psnr, total_ssim, - total_ssim, cpi->fastssim.stat[ALL] / cpi->count, - cpi->psnrhvs.stat[ALL] / cpi->count, cpi->psnr.worst, - cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst); + "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%7.3f\t%7.3f", + dr, cpi->psnr.stat[STAT_ALL] / cpi->count, total_psnr, + cpi->psnr.stat[STAT_ALL] / cpi->count, total_psnr, total_ssim, + total_ssim, cpi->fastssim.stat[STAT_ALL] / cpi->count, + cpi->psnrhvs.stat[STAT_ALL] / cpi->count, cpi->psnr.worst, + cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst, + cpi->psnr.stat[STAT_Y] / cpi->count, + cpi->psnr.stat[STAT_U] / cpi->count, + cpi->psnr.stat[STAT_V] / cpi->count); if (cpi->b_calculate_blockiness) { SNPRINT(headings, "\t Block\tWstBlck"); @@ -3184,19 +2873,7 @@ void av1_remove_compressor(AV1_COMP *cpi) { fclose(f); } - -#endif - -#if 0 - { - printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000); - printf("\n_frames recive_data encod_mb_row compress_frame Total\n"); - printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, - cpi->time_receive_data / 1000, cpi->time_encode_sb_row / 1000, - cpi->time_compress_data / 1000, - (cpi->time_receive_data + cpi->time_compress_data) / 1000); - } -#endif +#endif // CONFIG_INTERNAL_STATS } for (t = 0; t < cpi->num_workers; ++t) { @@ -3209,21 +2886,22 @@ void av1_remove_compressor(AV1_COMP *cpi) { // Deallocate allocated thread data. 
if (t < cpi->num_workers - 1) { aom_free(thread_data->td->palette_buffer); -#if CONFIG_MOTION_VAR aom_free(thread_data->td->above_pred_buf); aom_free(thread_data->td->left_pred_buf); aom_free(thread_data->td->wsrc_buf); aom_free(thread_data->td->mask_buf); -#endif // CONFIG_MOTION_VAR aom_free(thread_data->td->counts); - av1_free_pc_tree(thread_data->td); + av1_free_pc_tree(thread_data->td, num_planes); aom_free(thread_data->td); } } aom_free(cpi->tile_thr_data); aom_free(cpi->workers); - if (cpi->num_workers > 1) av1_loop_filter_dealloc(&cpi->lf_row_sync); + if (cpi->num_workers > 1) { + av1_loop_filter_dealloc(&cpi->lf_row_sync); + av1_loop_restoration_dealloc(&cpi->lr_row_sync, cpi->num_workers); + } dealloc_compressor_data(cpi); @@ -3244,6 +2922,10 @@ void av1_remove_compressor(AV1_COMP *cpi) { #endif // CONFIG_INTERNAL_STATS av1_remove_common(cm); + for (i = 0; i < FRAME_BUFFERS; ++i) { + av1_hash_table_destroy(&cm->buffer_pool->frame_bufs[i].hash_table); + } + if (cpi->sf.use_hash_based_trellis) hbt_destroy(); av1_free_ref_frame_buffers(cm->buffer_pool); aom_free(cpi); @@ -3253,30 +2935,14 @@ void av1_remove_compressor(AV1_COMP *cpi) { #ifdef OUTPUT_YUV_REC fclose(yuv_rec_file); #endif -#if 0 - - if (keyfile) - fclose(keyfile); - - if (framepsnr) - fclose(framepsnr); - - if (kf_list) - fclose(kf_list); - -#endif } static void generate_psnr_packet(AV1_COMP *cpi) { struct aom_codec_cx_pkt pkt; int i; PSNR_STATS psnr; -#if CONFIG_HIGHBITDEPTH aom_calc_highbd_psnr(cpi->source, cpi->common.frame_to_show, &psnr, cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth); -#else - aom_calc_psnr(cpi->source, cpi->common.frame_to_show, &psnr); -#endif for (i = 0; i < 4; ++i) { pkt.data.psnr.samples[i] = psnr.samples[i]; @@ -3290,22 +2956,25 @@ static void generate_psnr_packet(AV1_COMP *cpi) { int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags) { if (ref_frame_flags > ((1 << INTER_REFS_PER_FRAME) - 1)) return -1; - cpi->ref_frame_flags = ref_frame_flags; + cpi->ext_ref_frame_flags = ref_frame_flags; return 0; } -void av1_update_reference(AV1_COMP *cpi, int ref_frame_flags) { - cpi->ext_refresh_golden_frame = (ref_frame_flags & AOM_GOLD_FLAG) != 0; - cpi->ext_refresh_alt_ref_frame = (ref_frame_flags & AOM_ALT_FLAG) != 0; - cpi->ext_refresh_last_frame = (ref_frame_flags & AOM_LAST_FLAG) != 0; +void av1_update_reference(AV1_COMP *cpi, int ref_frame_upd_flags) { + cpi->ext_refresh_last_frame = (ref_frame_upd_flags & AOM_LAST_FLAG) != 0; + cpi->ext_refresh_golden_frame = (ref_frame_upd_flags & AOM_GOLD_FLAG) != 0; + cpi->ext_refresh_alt_ref_frame = (ref_frame_upd_flags & AOM_ALT_FLAG) != 0; + cpi->ext_refresh_bwd_ref_frame = (ref_frame_upd_flags & AOM_BWD_FLAG) != 0; + cpi->ext_refresh_alt2_ref_frame = (ref_frame_upd_flags & AOM_ALT2_FLAG) != 0; cpi->ext_refresh_frame_flags_pending = 1; } int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) { AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); YV12_BUFFER_CONFIG *cfg = get_ref_frame(cm, idx); if (cfg) { - aom_yv12_copy_frame(cfg, sd); + aom_yv12_copy_frame(cfg, sd, num_planes); return 0; } else { return -1; @@ -3314,9 +2983,10 @@ int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) { int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) { AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); YV12_BUFFER_CONFIG *cfg = get_ref_frame(cm, idx); if (cfg) { - aom_yv12_copy_frame(sd, cfg); + aom_yv12_copy_frame(sd, cfg, num_planes); 
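The new num_planes argument on aom_yv12_copy_frame() comes from the monochrome support added in this update: av1_num_planes() reports 1 when seq_params.monochrome is set, so chroma planes are skipped wherever the code walks planes. A minimal sketch of that pattern, assuming 8-bit buffers and the YV12_BUFFER_CONFIG field names from aom_scale/yv12config.h (copy_frame_planes itself is hypothetical, not the libaom helper):

    #include <stdint.h>
    #include <string.h>
    #include "aom_scale/yv12config.h"  // YV12_BUFFER_CONFIG

    static void copy_frame_planes(const YV12_BUFFER_CONFIG *src,
                                  YV12_BUFFER_CONFIG *dst, int num_planes) {
      // num_planes == 1 (monochrome) means only the luma plane is visited.
      for (int p = 0; p < num_planes; ++p) {
        const uint8_t *s = (p == 0) ? src->y_buffer
                           : (p == 1) ? src->u_buffer : src->v_buffer;
        uint8_t *d = (p == 0) ? dst->y_buffer
                     : (p == 1) ? dst->u_buffer : dst->v_buffer;
        const int w = (p == 0) ? src->y_width : src->uv_width;
        const int h = (p == 0) ? src->y_height : src->uv_height;
        const int sstride = (p == 0) ? src->y_stride : src->uv_stride;
        const int dstride = (p == 0) ? dst->y_stride : dst->uv_stride;
        for (int r = 0; r < h; ++r)
          memcpy(d + r * dstride, s + r * sstride, (size_t)w);
      }
    }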
return 0; } else { return -1; @@ -3361,7 +3031,6 @@ void aom_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) { } #endif -#if CONFIG_EXT_REFS && !CONFIG_XIPHRC #if USE_GF16_MULTI_LAYER static void check_show_existing_frame_gf16(AV1_COMP *cpi) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; @@ -3374,7 +3043,7 @@ static void check_show_existing_frame_gf16(AV1_COMP *cpi) { } else if (cpi->rc.is_last_bipred_frame) { cpi->rc.is_last_bipred_frame = 0; cm->show_existing_frame = 1; - cpi->existing_fb_idx_to_show = cpi->bwd_fb_idx; + cpi->existing_fb_idx_to_show = cpi->ref_fb_idx[BWDREF_FRAME - 1]; } else if (next_frame_update_type == OVERLAY_UPDATE || next_frame_update_type == INTNL_OVERLAY_UPDATE) { // Check the temporal filtering status for the next OVERLAY frame @@ -3392,8 +3061,8 @@ static void check_show_existing_frame_gf16(AV1_COMP *cpi) { cm->show_existing_frame = 1; cpi->rc.is_src_frame_alt_ref = 1; cpi->existing_fb_idx_to_show = (next_frame_update_type == OVERLAY_UPDATE) - ? cpi->alt_fb_idx - : cpi->bwd_fb_idx; + ? cpi->ref_fb_idx[ALTREF_FRAME - 1] + : cpi->ref_fb_idx[BWDREF_FRAME - 1]; cpi->is_arf_filter_off[which_arf] = 0; } } @@ -3423,7 +3092,7 @@ static void check_show_existing_frame(AV1_COMP *cpi) { // the last_fb_idxes[0] after reference frame buffer update cpi->rc.is_last_bipred_frame = 0; cm->show_existing_frame = 1; - cpi->existing_fb_idx_to_show = cpi->lst_fb_idxes[0]; + cpi->existing_fb_idx_to_show = cpi->ref_fb_idx[0]; } else if (cpi->is_arf_filter_off[which_arf] && (next_frame_update_type == OVERLAY_UPDATE || next_frame_update_type == INTNL_OVERLAY_UPDATE)) { @@ -3432,20 +3101,18 @@ static void check_show_existing_frame(AV1_COMP *cpi) { cm->show_existing_frame = 1; cpi->rc.is_src_frame_alt_ref = 1; cpi->existing_fb_idx_to_show = (next_frame_update_type == OVERLAY_UPDATE) - ? cpi->alt_fb_idx - : cpi->alt2_fb_idx; + ? cpi->ref_fb_idx[ALTREF_FRAME - 1] + : cpi->ref_fb_idx[ALTREF2_FRAME - 1]; cpi->is_arf_filter_off[which_arf] = 0; } cpi->rc.is_src_frame_ext_arf = 0; } -#endif // CONFIG_EXT_REFS && !CONFIG_XIPHRC #ifdef OUTPUT_YUV_REC void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) { uint8_t *src = s->y_buffer; int h = cm->height; if (yuv_rec_file == NULL) return; -#if CONFIG_HIGHBITDEPTH if (s->flags & YV12_FLAG_HIGHBITDEPTH) { uint16_t *src16 = CONVERT_TO_SHORTPTR(s->y_buffer); @@ -3473,7 +3140,6 @@ void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) { fflush(yuv_rec_file); return; } -#endif // CONFIG_HIGHBITDEPTH do { fwrite(src, s->y_width, 1, yuv_rec_file); @@ -3500,7 +3166,6 @@ void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) { } #endif // OUTPUT_YUV_REC -#if CONFIG_GLOBAL_MOTION #define GM_RECODE_LOOP_NUM4X4_FACTOR 192 static int recode_loop_test_global_motion(AV1_COMP *cpi) { int i; @@ -3515,12 +3180,13 @@ static int recode_loop_test_global_motion(AV1_COMP *cpi) { assert(cm->global_motion[i].wmtype == IDENTITY); cpi->gmparams_cost[i] = 0; recode = 1; - recode |= (rdc->global_motion_used[i] > 0); + // TODO(sarahparker): The earlier condition for recoding here was: + // "recode |= (rdc->global_motion_used[i] > 0);". Can we bring something + // similar to that back to speed up global motion? } } return recode; } -#endif // CONFIG_GLOBAL_MOTION // Function to test for conditions that indicate we should loop // back and recode a frame. 
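recode_loop_test_global_motion() above now requests a recode whenever a searched global-motion model gets reset to IDENTITY; the stricter usage-based condition survives only in the TODO. A self-contained sketch of the underlying pattern, assuming the libaom types WarpedMotionParams, IDENTITY and default_warp_params from av1/common/mv.h (reset_unused_gm is illustrative, not an encoder function):

    #include "av1/common/mv.h"  // WarpedMotionParams, IDENTITY, default_warp_params

    // If a model was signaled but no block ended up using it, fall back to
    // the identity model, drop its header cost, and ask for one recode so
    // the reclaimed header bits actually materialize in the bitstream.
    static int reset_unused_gm(WarpedMotionParams *gm, int *header_cost,
                               int blocks_using_model) {
      if (blocks_using_model > 0 || gm->wmtype == IDENTITY) return 0;
      *gm = default_warp_params;
      *header_cost = 0;
      return 1;  // caller should loop back and recode the frame
    }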
@@ -3602,15 +3268,15 @@ static void dump_ref_frame_images(AV1_COMP *cpi) { } #endif // DUMP_REF_FRAME_IMAGES == 1 -#if CONFIG_EXT_REFS // This function is used to shift the virtual indices of last reference frames // as follows: // LAST_FRAME -> LAST2_FRAME -> LAST3_FRAME // when the LAST_FRAME is updated. static INLINE void shift_last_ref_frames(AV1_COMP *cpi) { + // TODO(isbs): shift the scaled indices as well int ref_frame; for (ref_frame = LAST_REF_FRAMES - 1; ref_frame > 0; --ref_frame) { - cpi->lst_fb_idxes[ref_frame] = cpi->lst_fb_idxes[ref_frame - 1]; + cpi->ref_fb_idx[ref_frame] = cpi->ref_fb_idx[ref_frame - 1]; // [0] is allocated to the current coded frame. The statistics for the // reference frames start at [LAST_FRAME], i.e. [1]. @@ -3621,64 +3287,18 @@ static INLINE void shift_last_ref_frames(AV1_COMP *cpi) { } } } -#endif // CONFIG_EXT_REFS - -#if CONFIG_VAR_REFS -static void enc_check_valid_ref_frames(AV1_COMP *const cpi) { - AV1_COMMON *const cm = &cpi->common; - MV_REFERENCE_FRAME ref_frame; - - // TODO(zoeliu): To handle ALTREF_FRAME the same way as do with other - // reference frames. Current encoder invalid ALTREF when ALTREF - // is the same as LAST, but invalid all the other references - // when they are the same as ALTREF. - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - int ref_buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); - RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - LAST_FRAME]; - - if (ref_buf_idx != INVALID_IDX) { - ref_buf->is_valid = 1; - - MV_REFERENCE_FRAME ref; - for (ref = LAST_FRAME; ref < ref_frame; ++ref) { - int buf_idx = get_ref_frame_buf_idx(cpi, ref); - RefBuffer *const buf = &cm->frame_refs[ref - LAST_FRAME]; - if (buf->is_valid && buf_idx == ref_buf_idx) { - if (ref_frame != ALTREF_FRAME || ref == LAST_FRAME) { - ref_buf->is_valid = 0; - break; - } else { - buf->is_valid = 0; - } - } - } - } else { - ref_buf->is_valid = 0; - } - } -} -#endif // CONFIG_VAR_REFS -#if CONFIG_EXT_REFS #if USE_GF16_MULTI_LAYER static void update_reference_frames_gf16(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; BufferPool *const pool = cm->buffer_pool; if (cm->frame_type == KEY_FRAME) { - for (int ref_frame = 0; ref_frame < LAST_REF_FRAMES; ++ref_frame) { + for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { ref_cnt_fb(pool->frame_bufs, - &cm->ref_frame_map[cpi->lst_fb_idxes[ref_frame]], + &cm->ref_frame_map[cpi->ref_fb_idx[ref_frame]], cm->new_fb_idx); } - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], - cm->new_fb_idx); - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->bwd_fb_idx], - cm->new_fb_idx); - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt2_fb_idx], - cm->new_fb_idx); - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx], - cm->new_fb_idx); } else { if (cpi->refresh_last_frame || cpi->refresh_golden_frame || cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame || @@ -3703,7 +3323,6 @@ static void update_reference_frames_gf16(AV1_COMP *cpi) { #endif // DUMP_REF_FRAME_IMAGES } #endif // USE_GF16_MULTI_LAYER -#endif // CONFIG_EXT_REFS static void update_reference_frames(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; @@ -3712,30 +3331,28 @@ static void update_reference_frames(AV1_COMP *cpi) { // for the purpose to verify no mismatch between encoder and decoder. 
if (cm->show_frame) cpi->last_show_frame_buf_idx = cm->new_fb_idx; -#if CONFIG_EXT_REFS #if USE_GF16_MULTI_LAYER if (cpi->rc.baseline_gf_interval == 16) { update_reference_frames_gf16(cpi); return; } #endif // USE_GF16_MULTI_LAYER -#endif // CONFIG_EXT_REFS BufferPool *const pool = cm->buffer_pool; + // At this point the new frame has been encoded. // If any buffer copy / swapping is signaled it should be done here. - if (cm->frame_type == KEY_FRAME) { - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], - cm->new_fb_idx); -#if CONFIG_EXT_REFS - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->bwd_fb_idx], - cm->new_fb_idx); - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt2_fb_idx], - cm->new_fb_idx); -#endif // CONFIG_EXT_REFS - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx], - cm->new_fb_idx); - } else if (av1_preserve_existing_gf(cpi)) { + + if (cm->frame_type == KEY_FRAME || frame_is_sframe(cm)) { + for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { + ref_cnt_fb(pool->frame_bufs, + &cm->ref_frame_map[cpi->ref_fb_idx[ref_frame]], + cm->new_fb_idx); + } + return; + } + + if (av1_preserve_existing_gf(cpi)) { // We have decided to preserve the previously existing golden frame as our // new ARF frame. However, in the short term in function // av1_bitstream.c::get_refresh_mask() we left it in the GF slot and, if @@ -3746,19 +3363,17 @@ static void update_reference_frames(AV1_COMP *cpi) { // slot and, if we're updating the GF, the current frame becomes the new GF. int tmp; - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx], + ref_cnt_fb(pool->frame_bufs, + &cm->ref_frame_map[cpi->ref_fb_idx[ALTREF_FRAME - 1]], cm->new_fb_idx); - tmp = cpi->alt_fb_idx; - cpi->alt_fb_idx = cpi->gld_fb_idx; - cpi->gld_fb_idx = tmp; + tmp = cpi->ref_fb_idx[ALTREF_FRAME - 1]; + cpi->ref_fb_idx[ALTREF_FRAME - 1] = cpi->ref_fb_idx[GOLDEN_FRAME - 1]; + cpi->ref_fb_idx[GOLDEN_FRAME - 1] = tmp; -#if CONFIG_EXT_REFS // We need to modify the mapping accordingly - cpi->arf_map[0] = cpi->alt_fb_idx; -#endif // CONFIG_EXT_REFS -// TODO(zoeliu): Do we need to copy cpi->interp_filter_selected[0] over to -// cpi->interp_filter_selected[GOLDEN_FRAME]? -#if CONFIG_EXT_REFS + cpi->arf_map[0] = cpi->ref_fb_idx[ALTREF_FRAME - 1]; + // TODO(zoeliu): Do we need to copy cpi->interp_filter_selected[0] over to + // cpi->interp_filter_selected[GOLDEN_FRAME]? 
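The av1_preserve_existing_gf() path above never copies pixels: ref_cnt_fb() retargets ALTREF's entry in ref_frame_map to the new buffer, and the GOLDEN/ALTREF virtual indices are then exchanged. Reduced to a self-contained sketch (Slot stands in for RefCntBuffer, and the slot numbers are the AV1 per-frame indices, GOLDEN_FRAME - 1 == 3 and ALTREF_FRAME - 1 == 6):

    typedef struct { int ref_count; } Slot;  // stand-in for RefCntBuffer

    // Retarget *map_entry to new_idx while maintaining reference counts,
    // mirroring ref_cnt_fb() in encoder.c.
    static void retarget(Slot *bufs, int *map_entry, int new_idx) {
      if (*map_entry >= 0 && bufs[*map_entry].ref_count > 0)
        --bufs[*map_entry].ref_count;
      *map_entry = new_idx;
      ++bufs[new_idx].ref_count;
    }

    // GF preservation: ALTREF's map slot takes the new frame, then the
    // GOLDEN and ALTREF virtual indices are swapped; no pixels move.
    static void preserve_gf(Slot *bufs, int *ref_frame_map, int *ref_fb_idx,
                            int new_fb_idx) {
      enum { GOLDEN = 3, ALTREF = 6 };  // illustrative slot numbers
      retarget(bufs, &ref_frame_map[ref_fb_idx[ALTREF]], new_fb_idx);
      const int tmp = ref_fb_idx[ALTREF];
      ref_fb_idx[ALTREF] = ref_fb_idx[GOLDEN];
      ref_fb_idx[GOLDEN] = tmp;
    }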
} else if (cpi->rc.is_src_frame_ext_arf && cm->show_existing_frame) { // Deal with the special case for showing existing internal ALTREF_FRAME // Refresh the LAST_FRAME with the ALTREF_FRAME and retire the LAST3_FRAME @@ -3767,29 +3382,22 @@ static void update_reference_frames(AV1_COMP *cpi) { const int which_arf = gf_group->arf_ref_idx[gf_group->index]; assert(gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE); - const int tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]; + const int tmp = cpi->ref_fb_idx[LAST_REF_FRAMES - 1]; shift_last_ref_frames(cpi); - cpi->lst_fb_idxes[0] = cpi->alt2_fb_idx; - cpi->alt2_fb_idx = tmp; + cpi->ref_fb_idx[LAST_FRAME - 1] = cpi->ref_fb_idx[ALTREF2_FRAME - 1]; + cpi->ref_fb_idx[ALTREF2_FRAME - 1] = tmp; // We need to modify the mapping accordingly - cpi->arf_map[which_arf] = cpi->alt2_fb_idx; + cpi->arf_map[which_arf] = cpi->ref_fb_idx[ALTREF2_FRAME - 1]; memcpy(cpi->interp_filter_selected[LAST_FRAME], cpi->interp_filter_selected[ALTREF2_FRAME], sizeof(cpi->interp_filter_selected[ALTREF2_FRAME])); -#endif // CONFIG_EXT_REFS } else { /* For non key/golden frames */ // === ALTREF_FRAME === if (cpi->refresh_alt_ref_frame) { - int arf_idx = cpi->alt_fb_idx; + int arf_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1]; int which_arf = 0; -#if !CONFIG_EXT_REFS - if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - arf_idx = gf_group->arf_update_idx[gf_group->index]; - } -#endif // !CONFIG_EXT_REFS ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx); memcpy(cpi->interp_filter_selected[ALTREF_FRAME + which_arf], @@ -3799,21 +3407,19 @@ static void update_reference_frames(AV1_COMP *cpi) { // === GOLDEN_FRAME === if (cpi->refresh_golden_frame) { - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], + ref_cnt_fb(pool->frame_bufs, + &cm->ref_frame_map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]], cm->new_fb_idx); -#if !CONFIG_EXT_REFS - if (!cpi->rc.is_src_frame_alt_ref) -#endif // !CONFIG_EXT_REFS - memcpy(cpi->interp_filter_selected[GOLDEN_FRAME], - cpi->interp_filter_selected[0], - sizeof(cpi->interp_filter_selected[0])); + memcpy(cpi->interp_filter_selected[GOLDEN_FRAME], + cpi->interp_filter_selected[0], + sizeof(cpi->interp_filter_selected[0])); } -#if CONFIG_EXT_REFS // === BWDREF_FRAME === if (cpi->refresh_bwd_ref_frame) { - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->bwd_fb_idx], + ref_cnt_fb(pool->frame_bufs, + &cm->ref_frame_map[cpi->ref_fb_idx[BWDREF_FRAME - 1]], cm->new_fb_idx); memcpy(cpi->interp_filter_selected[BWDREF_FRAME], @@ -3823,18 +3429,17 @@ static void update_reference_frames(AV1_COMP *cpi) { // === ALTREF2_FRAME === if (cpi->refresh_alt2_ref_frame) { - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt2_fb_idx], + ref_cnt_fb(pool->frame_bufs, + &cm->ref_frame_map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]], cm->new_fb_idx); memcpy(cpi->interp_filter_selected[ALTREF2_FRAME], cpi->interp_filter_selected[0], sizeof(cpi->interp_filter_selected[0])); } -#endif // CONFIG_EXT_REFS } if (cpi->refresh_last_frame) { -#if CONFIG_EXT_REFS // NOTE(zoeliu): We have two layers of mapping (1) from the per-frame // reference to the reference frame buffer virtual index; and then (2) from // the virtual index to the reference frame buffer physical index: @@ -3842,7 +3447,7 @@ static void update_reference_frames(AV1_COMP *cpi) { // LAST_FRAME, ..., LAST3_FRAME, ..., ALTREF_FRAME // | | | // v v v - // lst_fb_idxes[0], ..., lst_fb_idxes[2], ..., alt_fb_idx + // 
ref_fb_idx[0], ..., ref_fb_idx[2], ..., ref_fb_idx[ALTREF_FRAME-1] // | | | // v v v // ref_frame_map[], ..., ref_frame_map[], ..., ref_frame_map[] @@ -3864,61 +3469,42 @@ static void update_reference_frames(AV1_COMP *cpi) { // LAST_FRAME, LAST2_FRAME, LAST3_FRAME // | | | // v v v - // lst_fb_idxes[2], lst_fb_idxes[0], lst_fb_idxes[1] - int ref_frame; + // ref_fb_idx[2], ref_fb_idx[0], ref_fb_idx[1] + int tmp; - if (cm->frame_type == KEY_FRAME) { - for (ref_frame = 0; ref_frame < LAST_REF_FRAMES; ++ref_frame) { - ref_cnt_fb(pool->frame_bufs, - &cm->ref_frame_map[cpi->lst_fb_idxes[ref_frame]], - cm->new_fb_idx); - } - } else { - int tmp; + ref_cnt_fb(pool->frame_bufs, + &cm->ref_frame_map[cpi->ref_fb_idx[LAST_REF_FRAMES - 1]], + cm->new_fb_idx); - ref_cnt_fb(pool->frame_bufs, - &cm->ref_frame_map[cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]], - cm->new_fb_idx); + tmp = cpi->ref_fb_idx[LAST_REF_FRAMES - 1]; - tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]; + shift_last_ref_frames(cpi); + cpi->ref_fb_idx[0] = tmp; - shift_last_ref_frames(cpi); - cpi->lst_fb_idxes[0] = tmp; + assert(cm->show_existing_frame == 0); + memcpy(cpi->interp_filter_selected[LAST_FRAME], + cpi->interp_filter_selected[0], + sizeof(cpi->interp_filter_selected[0])); + + if (cpi->rc.is_last_bipred_frame) { + // Refresh the LAST_FRAME with the BWDREF_FRAME and retire the + // LAST3_FRAME by updating the virtual indices. + // + // NOTE: The source frame for BWDREF does not have a holding position as + // the OVERLAY frame for ALTREF's. Hence, to resolve the reference + // virtual index reshuffling for BWDREF, the encoder always + // specifies a LAST_BIPRED right before BWDREF and completes the + // reshuffling job accordingly. + tmp = cpi->ref_fb_idx[LAST_REF_FRAMES - 1]; - assert(cm->show_existing_frame == 0); - memcpy(cpi->interp_filter_selected[LAST_FRAME], - cpi->interp_filter_selected[0], - sizeof(cpi->interp_filter_selected[0])); + shift_last_ref_frames(cpi); + cpi->ref_fb_idx[0] = cpi->ref_fb_idx[BWDREF_FRAME - 1]; + cpi->ref_fb_idx[BWDREF_FRAME - 1] = tmp; - if (cpi->rc.is_last_bipred_frame) { - // Refresh the LAST_FRAME with the BWDREF_FRAME and retire the - // LAST3_FRAME by updating the virtual indices. - // - // NOTE: The source frame for BWDREF does not have a holding position as - // the OVERLAY frame for ALTREF's. Hence, to resolve the reference - // virtual index reshuffling for BWDREF, the encoder always - // specifies a LAST_BIPRED right before BWDREF and completes the - // reshuffling job accordingly. 
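The reshuffling described in these comments comes down to a one-step rotation of the virtual indices plus a single reassignment, whether the incoming buffer is the newly coded frame or, in the LAST_BIPRED case, the BWDREF slot. A standalone sketch (age_last_chain is illustrative; LAST_SLOTS mirrors LAST_REF_FRAMES, which is 3):

    enum { LAST_SLOTS = 3 };  // LAST_REF_FRAMES in libaom

    // Rotate LAST -> LAST2 -> LAST3 and install `incoming` as the new LAST;
    // returns the retired LAST3 index so the caller can recycle it.
    static int age_last_chain(int idx[LAST_SLOTS], int incoming) {
      const int retired = idx[LAST_SLOTS - 1];
      for (int i = LAST_SLOTS - 1; i > 0; --i) idx[i] = idx[i - 1];
      idx[0] = incoming;
      return retired;
    }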
- tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]; - - shift_last_ref_frames(cpi); - cpi->lst_fb_idxes[0] = cpi->bwd_fb_idx; - cpi->bwd_fb_idx = tmp; - - memcpy(cpi->interp_filter_selected[LAST_FRAME], - cpi->interp_filter_selected[BWDREF_FRAME], - sizeof(cpi->interp_filter_selected[BWDREF_FRAME])); - } - } -#else // !CONFIG_EXT_REFS - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx], - cm->new_fb_idx); - if (!cpi->rc.is_src_frame_alt_ref) { memcpy(cpi->interp_filter_selected[LAST_FRAME], - cpi->interp_filter_selected[0], - sizeof(cpi->interp_filter_selected[0])); + cpi->interp_filter_selected[BWDREF_FRAME], + sizeof(cpi->interp_filter_selected[BWDREF_FRAME])); } -#endif // CONFIG_EXT_REFS } #if DUMP_REF_FRAME_IMAGES == 1 @@ -3937,19 +3523,11 @@ static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, int buffer_idx) { static void scale_references(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); MV_REFERENCE_FRAME ref_frame; const AOM_REFFRAME ref_mask[INTER_REFS_PER_FRAME] = { - AOM_LAST_FLAG, -#if CONFIG_EXT_REFS - AOM_LAST2_FLAG, - AOM_LAST3_FLAG, -#endif // CONFIG_EXT_REFS - AOM_GOLD_FLAG, -#if CONFIG_EXT_REFS - AOM_BWD_FLAG, - AOM_ALT2_FLAG, -#endif // CONFIG_EXT_REFS - AOM_ALT_FLAG + AOM_LAST_FLAG, AOM_LAST2_FLAG, AOM_LAST3_FLAG, AOM_GOLD_FLAG, + AOM_BWD_FLAG, AOM_ALT2_FLAG, AOM_ALT_FLAG }; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { @@ -3964,7 +3542,6 @@ static void scale_references(AV1_COMP *cpi) { continue; } -#if CONFIG_HIGHBITDEPTH if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { RefCntBuffer *new_fb_ptr = NULL; int force_scaling = 0; @@ -3983,35 +3560,11 @@ static void scale_references(AV1_COMP *cpi) { cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); - av1_resize_and_extend_frame(ref, &new_fb_ptr->buf, - (int)cm->bit_depth); + av1_resize_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth, + num_planes); cpi->scaled_ref_idx[ref_frame - 1] = new_fb; alloc_frame_mvs(cm, new_fb); } -#else - if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { - RefCntBuffer *new_fb_ptr = NULL; - int force_scaling = 0; - int new_fb = cpi->scaled_ref_idx[ref_frame - 1]; - if (new_fb == INVALID_IDX) { - new_fb = get_free_fb(cm); - force_scaling = 1; - } - if (new_fb == INVALID_IDX) return; - new_fb_ptr = &pool->frame_bufs[new_fb]; - if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width || - new_fb_ptr->buf.y_crop_height != cm->height) { - if (aom_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, - AOM_BORDER_IN_PIXELS, cm->byte_alignment, - NULL, NULL, NULL)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, - "Failed to allocate frame buffer"); - av1_resize_and_extend_frame(ref, &new_fb_ptr->buf); - cpi->scaled_ref_idx[ref_frame - 1] = new_fb; - alloc_frame_mvs(cm, new_fb); - } -#endif // CONFIG_HIGHBITDEPTH } else { const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); RefCntBuffer *const buf = &pool->frame_bufs[buf_idx]; @@ -4029,115 +3582,18 @@ static void scale_references(AV1_COMP *cpi) { static void release_scaled_references(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; int i; - if (cpi->oxcf.pass == 0) { - // Only release scaled references under certain conditions: - // if reference will be updated, or if scaled reference has same resolution. 
- int refresh[INTER_REFS_PER_FRAME]; - refresh[0] = (cpi->refresh_last_frame) ? 1 : 0; -#if CONFIG_EXT_REFS - refresh[1] = refresh[2] = 0; - refresh[3] = (cpi->refresh_golden_frame) ? 1 : 0; - refresh[4] = (cpi->refresh_bwd_ref_frame) ? 1 : 0; - refresh[5] = (cpi->refresh_alt2_ref_frame) ? 1 : 0; - refresh[6] = (cpi->refresh_alt_ref_frame) ? 1 : 0; -#else // !CONFIG_EXT_REFS - refresh[1] = (cpi->refresh_golden_frame) ? 1 : 0; - refresh[2] = (cpi->refresh_alt_ref_frame) ? 1 : 0; -#endif // CONFIG_EXT_REFS - for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { - const int idx = cpi->scaled_ref_idx[i - 1]; - RefCntBuffer *const buf = - idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[idx] : NULL; - const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, i); - if (buf != NULL && - (refresh[i - 1] || (buf->buf.y_crop_width == ref->y_crop_width && - buf->buf.y_crop_height == ref->y_crop_height))) { - --buf->ref_count; - cpi->scaled_ref_idx[i - 1] = INVALID_IDX; - } - } - } else { - for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) { - const int idx = cpi->scaled_ref_idx[i]; - RefCntBuffer *const buf = - idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[idx] : NULL; - if (buf != NULL) { - --buf->ref_count; - cpi->scaled_ref_idx[i] = INVALID_IDX; - } + // TODO(isbs): only refresh the necessary frames, rather than all of them + for (i = 0; i < REF_FRAMES; ++i) { + const int idx = cpi->scaled_ref_idx[i]; + RefCntBuffer *const buf = + idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[idx] : NULL; + if (buf != NULL) { + --buf->ref_count; + cpi->scaled_ref_idx[i] = INVALID_IDX; } } } -#if 0 && CONFIG_INTERNAL_STATS -static void output_frame_level_debug_stats(AV1_COMP *cpi) { - AV1_COMMON *const cm = &cpi->common; - FILE *const f = fopen("tmp.stt", cm->current_video_frame ? 
"a" : "w"); - int64_t recon_err; - - aom_clear_system_state(); - - recon_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm)); - - if (cpi->twopass.total_left_stats.coded_error != 0.0) - fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d" - "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" " - "%10"PRId64" %10"PRId64" %10d " - "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf" - "%6d %6d %5d %5d %5d " - "%10"PRId64" %10.3lf" - "%10lf %8u %10"PRId64" %10d %10d %10d\n", - cpi->common.current_video_frame, - cm->width, cm->height, - cpi->rc.source_alt_ref_pending, - cpi->rc.source_alt_ref_active, - cpi->rc.this_frame_target, - cpi->rc.projected_frame_size, - cpi->rc.projected_frame_size / cpi->common.MBs, - (cpi->rc.projected_frame_size - cpi->rc.this_frame_target), - cpi->rc.vbr_bits_off_target, - cpi->rc.vbr_bits_off_target_fast, - cpi->twopass.extend_minq, - cpi->twopass.extend_minq_fast, - cpi->rc.total_target_vs_actual, - (cpi->rc.starting_buffer_level - cpi->rc.bits_off_target), - cpi->rc.total_actual_bits, cm->base_qindex, - av1_convert_qindex_to_q(cm->base_qindex, cm->bit_depth), - (double)av1_dc_quant(cm->base_qindex, 0, cm->bit_depth) / 4.0, - av1_convert_qindex_to_q(cpi->twopass.active_worst_quality, - cm->bit_depth), - cpi->rc.avg_q, - av1_convert_qindex_to_q(cpi->oxcf.cq_level, cm->bit_depth), - cpi->refresh_last_frame, cpi->refresh_golden_frame, - cpi->refresh_alt_ref_frame, cm->frame_type, cpi->rc.gfu_boost, - cpi->twopass.bits_left, - cpi->twopass.total_left_stats.coded_error, - cpi->twopass.bits_left / - (1 + cpi->twopass.total_left_stats.coded_error), - cpi->tot_recode_hits, recon_err, cpi->rc.kf_boost, - cpi->twopass.kf_zeromotion_pct, - cpi->twopass.fr_content_type); - - fclose(f); - - if (0) { - FILE *const fmodes = fopen("Modes.stt", "a"); - int i; - - fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame, - cm->frame_type, cpi->refresh_golden_frame, - cpi->refresh_alt_ref_frame); - - for (i = 0; i < MAX_MODES; ++i) - fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]); - - fprintf(fmodes, "\n"); - - fclose(fmodes); - } -} -#endif - static void set_mv_search_params(AV1_COMP *cpi) { const AV1_COMMON *const cm = &cpi->common; const unsigned int max_mv_def = AOMMIN(cm->width, cm->height); @@ -4164,18 +3620,16 @@ static void set_mv_search_params(AV1_COMP *cpi) { } static void set_size_independent_vars(AV1_COMP *cpi) { -#if CONFIG_GLOBAL_MOTION int i; for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { cpi->common.global_motion[i] = default_warp_params; } cpi->global_motion_search_done = 0; -#endif // CONFIG_GLOBAL_MOTION av1_set_speed_features_framesize_independent(cpi); av1_set_rd_speed_thresholds(cpi); av1_set_rd_speed_thresholds_sub8x8(cpi); cpi->common.interp_filter = cpi->sf.default_interp_filter; - if (!frame_is_intra_only(&cpi->common)) set_compound_tools(&cpi->common); + cpi->common.switchable_motion_mode = 1; } static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index, @@ -4186,24 +3640,13 @@ static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index, // Setup variables that depend on the dimensions of the frame. av1_set_speed_features_framesize_dependent(cpi); -// Decide q and q bounds. -#if CONFIG_XIPHRC - int frame_type = cm->frame_type == KEY_FRAME ? OD_I_FRAME : OD_P_FRAME; - *q = od_enc_rc_select_quantizers_and_lambdas( - &cpi->od_rc, cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame, - frame_type, bottom_index, top_index); -#else + // Decide q and q bounds. 
*q = av1_rc_pick_q_and_bounds(cpi, cm->width, cm->height, bottom_index, top_index); -#endif if (!frame_is_intra_only(cm)) { -#if CONFIG_AMVR set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH, - cpi->common.cur_frame_mv_precision_level); -#else - set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH); -#endif + cpi->common.cur_frame_force_integer_mv); } // Configure experimental use of segmentation for enhanced coding of @@ -4224,10 +3667,9 @@ static void init_motion_estimation(AV1_COMP *cpi) { } } -#if CONFIG_LOOP_RESTORATION #define COUPLED_CHROMA_FROM_LUMA_RESTORATION 0 -static void set_restoration_tilesize(int width, int height, int sx, int sy, - RestorationInfo *rst) { +static void set_restoration_unit_size(int width, int height, int sx, int sy, + RestorationInfo *rst) { (void)width; (void)height; (void)sx; @@ -4238,17 +3680,13 @@ static void set_restoration_tilesize(int width, int height, int sx, int sy, int s = 0; #endif // !COUPLED_CHROMA_FROM_LUMA_RESTORATION - rst[0].restoration_tilesize = (RESTORATION_TILESIZE_MAX >> 1); - rst[1].restoration_tilesize = rst[0].restoration_tilesize >> s; - rst[2].restoration_tilesize = rst[1].restoration_tilesize; - - rst[0].procunit_width = rst[0].procunit_height = RESTORATION_PROC_UNIT_SIZE; - rst[1].procunit_width = rst[2].procunit_width = - RESTORATION_PROC_UNIT_SIZE >> sx; - rst[1].procunit_height = rst[2].procunit_height = - RESTORATION_PROC_UNIT_SIZE >> sy; + if (width * height > 352 * 288) + rst[0].restoration_unit_size = RESTORATION_UNITSIZE_MAX; + else + rst[0].restoration_unit_size = (RESTORATION_UNITSIZE_MAX >> 1); + rst[1].restoration_unit_size = rst[0].restoration_unit_size >> s; + rst[2].restoration_unit_size = rst[1].restoration_unit_size; } -#endif // CONFIG_LOOP_RESTORATION static void init_ref_frame_bufs(AV1_COMMON *cm) { int i; @@ -4258,31 +3696,23 @@ static void init_ref_frame_bufs(AV1_COMMON *cm) { cm->ref_frame_map[i] = INVALID_IDX; pool->frame_bufs[i].ref_count = 0; } -#if CONFIG_HASH_ME - for (i = 0; i < FRAME_BUFFERS; ++i) { - av1_hash_table_init(&pool->frame_bufs[i].hash_table); + if (cm->seq_params.force_screen_content_tools) { + for (i = 0; i < FRAME_BUFFERS; ++i) { + av1_hash_table_init(&pool->frame_bufs[i].hash_table); + } } -#endif } -static void check_initial_width(AV1_COMP *cpi, -#if CONFIG_HIGHBITDEPTH - int use_highbitdepth, -#endif +static void check_initial_width(AV1_COMP *cpi, int use_highbitdepth, int subsampling_x, int subsampling_y) { AV1_COMMON *const cm = &cpi->common; - if (!cpi->initial_width || -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth != use_highbitdepth || -#endif + if (!cpi->initial_width || cm->use_highbitdepth != use_highbitdepth || cm->subsampling_x != subsampling_x || cm->subsampling_y != subsampling_y) { cm->subsampling_x = subsampling_x; cm->subsampling_y = subsampling_y; -#if CONFIG_HIGHBITDEPTH cm->use_highbitdepth = use_highbitdepth; -#endif alloc_raw_frame_buffers(cpi); init_ref_frame_bufs(cm); @@ -4299,12 +3729,9 @@ static void check_initial_width(AV1_COMP *cpi, // Returns 1 if the assigned width or height was <= 0. 
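set_restoration_unit_size() above keys the luma unit size off a CIF-area threshold (352 * 288 samples), and with the chroma shift s fixed at 0 here the chroma planes inherit the luma unit size. A worked example, assuming RESTORATION_UNITSIZE_MAX is 256 as in libaom's restoration.h:

    #include <stdio.h>

    #define RESTORATION_UNITSIZE_MAX 256  // assumed; matches libaom

    static int luma_restoration_unit_size(int width, int height) {
      // Frames larger than CIF get full-size units, smaller ones half-size.
      return (width * height > 352 * 288) ? RESTORATION_UNITSIZE_MAX
                                          : (RESTORATION_UNITSIZE_MAX >> 1);
    }

    int main(void) {
      printf("%d\n", luma_restoration_unit_size(1920, 1080));  // 256
      printf("%d\n", luma_restoration_unit_size(320, 240));    // 128
      return 0;
    }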
static int set_size_literal(AV1_COMP *cpi, int width, int height) { AV1_COMMON *cm = &cpi->common; -#if CONFIG_HIGHBITDEPTH + const int num_planes = av1_num_planes(cm); check_initial_width(cpi, cm->use_highbitdepth, cm->subsampling_x, cm->subsampling_y); -#else - check_initial_width(cpi, cm->subsampling_x, cm->subsampling_y); -#endif // CONFIG_HIGHBITDEPTH if (width <= 0 || height <= 0) return 1; @@ -4314,7 +3741,7 @@ static int set_size_literal(AV1_COMP *cpi, int width, int height) { if (cpi->initial_width && cpi->initial_height && (cm->width > cpi->initial_width || cm->height > cpi->initial_height)) { av1_free_context_buffers(cm); - av1_free_pc_tree(&cpi->td); + av1_free_pc_tree(&cpi->td, num_planes); alloc_compressor_data(cpi); realloc_segmentation_maps(cpi); cpi->initial_width = cpi->initial_height = 0; @@ -4326,6 +3753,7 @@ static int set_size_literal(AV1_COMP *cpi, int width, int height) { static void set_frame_size(AV1_COMP *cpi, int width, int height) { AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; int ref_frame; @@ -4333,52 +3761,42 @@ static void set_frame_size(AV1_COMP *cpi, int width, int height) { // There has been a change in the encoded frame size set_size_literal(cpi, width, height); set_mv_search_params(cpi); + // Recalculate 'all_lossless' in case super-resolution was (un)selected. + cm->all_lossless = cm->coded_lossless && !av1_superres_scaled(cm); } -#if !CONFIG_XIPHRC if (cpi->oxcf.pass == 2) { av1_set_target_rate(cpi, cm->width, cm->height); } -#endif alloc_frame_mvs(cm, cm->new_fb_idx); + // Allocate above context buffers + if (cm->num_allocated_above_context_planes < av1_num_planes(cm) || + cm->num_allocated_above_context_mi_col < cm->mi_cols || + cm->num_allocated_above_contexts < cm->tile_rows) { + av1_free_above_context_buffers(cm, cm->num_allocated_above_contexts); + if (av1_alloc_above_context_buffers(cm, cm->tile_rows)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate context buffers"); + } + // Reset the frame pointers to the current frame size. 
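The above-context allocation in set_frame_size() follows a grow-only pattern: the buffers are rebuilt only when the plane count, mi columns, or tile rows exceed what was last allocated. A generic sketch of that guard (GrowBuf and ensure_capacity are hypothetical names, not libaom API):

    #include <stdlib.h>

    typedef struct {
      int cap;    // capacity tracked alongside the allocation
      int *data;
    } GrowBuf;

    static int ensure_capacity(GrowBuf *b, int needed) {
      if (needed <= b->cap) return 0;  // current allocation still fits
      free(b->data);
      b->data = NULL;
      b->cap = 0;
      b->data = (int *)malloc((size_t)needed * sizeof(*b->data));
      if (b->data == NULL) return -1;  // caller raises AOM_CODEC_MEM_ERROR
      b->cap = needed;
      return 0;
    }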
if (aom_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, - NULL, NULL)) + cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); -#if CONFIG_LOOP_RESTORATION - set_restoration_tilesize( -#if CONFIG_FRAME_SUPERRES - cm->superres_upscaled_width, cm->superres_upscaled_height, -#else - cm->width, cm->height, -#endif // CONFIG_FRAME_SUPERRES - cm->subsampling_x, cm->subsampling_y, cm->rst_info); - for (int i = 0; i < MAX_MB_PLANE; ++i) + const int frame_width = cm->superres_upscaled_width; + const int frame_height = cm->superres_upscaled_height; + set_restoration_unit_size(frame_width, frame_height, cm->subsampling_x, + cm->subsampling_y, cm->rst_info); + for (int i = 0; i < num_planes; ++i) cm->rst_info[i].frame_restoration_type = RESTORE_NONE; + av1_alloc_restoration_buffers(cm); - for (int i = 0; i < MAX_MB_PLANE; ++i) { - cpi->rst_search[i].restoration_tilesize = - cm->rst_info[i].restoration_tilesize; - cpi->rst_search[i].procunit_width = cm->rst_info[i].procunit_width; - cpi->rst_search[i].procunit_height = cm->rst_info[i].procunit_height; - av1_alloc_restoration_struct(cm, &cpi->rst_search[i], -#if CONFIG_FRAME_SUPERRES - cm->superres_upscaled_width, - cm->superres_upscaled_height); -#else - cm->width, cm->height); -#endif // CONFIG_FRAME_SUPERRES - } -#endif // CONFIG_LOOP_RESTORATION alloc_util_frame_buffers(cpi); // TODO(afergs): Remove? Gets called anyways. init_motion_estimation(cpi); @@ -4391,36 +3809,18 @@ static void set_frame_size(AV1_COMP *cpi, int width, int height) { if (buf_idx != INVALID_IDX) { YV12_BUFFER_CONFIG *const buf = &cm->buffer_pool->frame_bufs[buf_idx].buf; ref_buf->buf = buf; -#if CONFIG_HIGHBITDEPTH - av1_setup_scale_factors_for_frame( - &ref_buf->sf, buf->y_crop_width, buf->y_crop_height, cm->width, - cm->height, (buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
1 : 0); -#else av1_setup_scale_factors_for_frame(&ref_buf->sf, buf->y_crop_width, buf->y_crop_height, cm->width, cm->height); -#endif // CONFIG_HIGHBITDEPTH - if (av1_is_scaled(&ref_buf->sf)) aom_extend_frame_borders(buf); + if (av1_is_scaled(&ref_buf->sf)) + aom_extend_frame_borders(buf, num_planes); } else { ref_buf->buf = NULL; } } -#if CONFIG_VAR_REFS - // Check duplicate reference frames - enc_check_valid_ref_frames(cpi); -#endif // CONFIG_VAR_REFS - -#if CONFIG_INTRABC -#if CONFIG_HIGHBITDEPTH - av1_setup_scale_factors_for_frame(&xd->sf_identity, cm->width, cm->height, - cm->width, cm->height, - cm->use_highbitdepth); -#else - av1_setup_scale_factors_for_frame(&xd->sf_identity, cm->width, cm->height, + av1_setup_scale_factors_for_frame(&cm->sf_identity, cm->width, cm->height, cm->width, cm->height); -#endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_INTRABC set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME); } @@ -4432,6 +3832,7 @@ static uint8_t calculate_next_resize_scale(const AV1_COMP *cpi) { if (oxcf->pass == 1) return SCALE_NUMERATOR; uint8_t new_denom = SCALE_NUMERATOR; + if (cpi->common.seq_params.reduced_still_picture_hdr) return SCALE_NUMERATOR; switch (oxcf->resize_mode) { case RESIZE_NONE: new_denom = SCALE_NUMERATOR; break; case RESIZE_FIXED: @@ -4446,15 +3847,19 @@ static uint8_t calculate_next_resize_scale(const AV1_COMP *cpi) { return new_denom; } -#if CONFIG_FRAME_SUPERRES - static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) { // Choose an arbitrary random number static unsigned int seed = 34567; const AV1EncoderConfig *oxcf = &cpi->oxcf; if (oxcf->pass == 1) return SCALE_NUMERATOR; uint8_t new_denom = SCALE_NUMERATOR; - int bottom_index, top_index, q, qthresh; + + // Make sure that superres mode of the frame is consistent with the + // sequence-level flag. + assert(IMPLIES(oxcf->superres_mode != SUPERRES_NONE, + cpi->common.seq_params.enable_superres)); + assert(IMPLIES(!cpi->common.seq_params.enable_superres, + oxcf->superres_mode == SUPERRES_NONE)); switch (oxcf->superres_mode) { case SUPERRES_NONE: new_denom = SCALE_NUMERATOR; break; @@ -4465,21 +3870,35 @@ static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) { new_denom = oxcf->superres_scale_denominator; break; case SUPERRES_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break; - case SUPERRES_QTHRESH: - qthresh = (cpi->common.frame_type == KEY_FRAME ? oxcf->superres_kf_qthresh - : oxcf->superres_qthresh); + case SUPERRES_QTHRESH: { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + const RATE_FACTOR_LEVEL rf_level = gf_group->rf_level[gf_group->index]; + const double rate_factor_delta = rate_factor_deltas[rf_level]; + const int qthresh = (rate_factor_delta <= 1.0) + ? 
oxcf->superres_qthresh + : oxcf->superres_kf_qthresh; av1_set_target_rate(cpi, cpi->oxcf.width, cpi->oxcf.height); - q = av1_rc_pick_q_and_bounds(cpi, cpi->oxcf.width, cpi->oxcf.height, - &bottom_index, &top_index); + int bottom_index, top_index; + const int q = av1_rc_pick_q_and_bounds( + cpi, cpi->oxcf.width, cpi->oxcf.height, &bottom_index, &top_index); if (q < qthresh) { new_denom = SCALE_NUMERATOR; } else { - new_denom = SCALE_NUMERATOR + 1 + ((q - qthresh) >> 3); - new_denom = AOMMIN(SCALE_NUMERATOR << 1, new_denom); - // printf("SUPERRES: q %d, qthresh %d: denom %d\n", q, qthresh, - // new_denom); + const uint8_t min_denom = SCALE_NUMERATOR + 1; + const uint8_t denom_step = (MAXQ - qthresh + 1) >> 3; + + if (q == qthresh) { + new_denom = min_denom; + } else if (denom_step == 0) { + new_denom = SCALE_NUMERATOR << 1; + } else { + const uint8_t additional_denom = (q - qthresh) / denom_step; + new_denom = + AOMMIN(min_denom + additional_denom, SCALE_NUMERATOR << 1); + } } break; + } default: assert(0); } return new_denom; @@ -4489,15 +3908,12 @@ static int dimension_is_ok(int orig_dim, int resized_dim, int denom) { return (resized_dim * SCALE_NUMERATOR >= orig_dim * denom / 2); } -// TODO(now): Fix? static int dimensions_are_ok(int owidth, int oheight, size_params_type *rsz) { - return dimension_is_ok(owidth, rsz->resize_width, rsz->superres_denom) && - (CONFIG_HORZONLY_FRAME_SUPERRES || - dimension_is_ok(oheight, rsz->resize_height, rsz->superres_denom)); + // Only need to check the width, as scaling is horizontal only. + (void)oheight; + return dimension_is_ok(owidth, rsz->resize_width, rsz->superres_denom); } -#define DIVIDE_AND_ROUND(x, y) (((x) + ((y) >> 1)) / (y)) - static int validate_size_scales(RESIZE_MODE resize_mode, SUPERRES_MODE superres_mode, int owidth, int oheight, size_params_type *rsz) { @@ -4548,24 +3964,17 @@ static int validate_size_scales(RESIZE_MODE resize_mode, } while (!dimensions_are_ok(owidth, oheight, rsz) && (resize_denom > SCALE_NUMERATOR || rsz->superres_denom > SCALE_NUMERATOR)); - } else { // We are allowed to alter neither resize scale nor superres scale. + } else { // We are allowed to alter neither resize scale nor superres + // scale. 
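The SUPERRES_QTHRESH case above maps the picked q linearly onto the denominator range (SCALE_NUMERATOR + 1) through (SCALE_NUMERATOR << 1). A worked example, assuming SCALE_NUMERATOR == 8 and MAXQ == 255 as in libaom:

    #include <stdio.h>

    enum { SCALE_NUMERATOR = 8, MAXQ = 255 };  // assumed libaom values

    static int denom_for_q(int q, int qthresh) {
      if (q < qthresh) return SCALE_NUMERATOR;  // below threshold: no superres
      const int min_denom = SCALE_NUMERATOR + 1;
      const int denom_step = (MAXQ - qthresh + 1) >> 3;
      if (q == qthresh) return min_denom;
      if (denom_step == 0) return SCALE_NUMERATOR << 1;
      const int d = min_denom + (q - qthresh) / denom_step;
      return d < (SCALE_NUMERATOR << 1) ? d : (SCALE_NUMERATOR << 1);
    }

    int main(void) {
      // qthresh 63: step = (255 - 63 + 1) >> 3 = 24, so q = 120 gives
      // denominator 9 + 57 / 24 = 11.
      printf("%d\n", denom_for_q(120, 63));  // 11
      return 0;
    }

Since this codebase now scales horizontally only (dimensions_are_ok() checks just the width), a denominator of 11 means the coded width is the source width times 8 / 11.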
return 0; } return dimensions_are_ok(owidth, oheight, rsz); } -#undef DIVIDE_AND_ROUND -#endif // CONFIG_FRAME_SUPERRES // Calculates resize and superres params for next frame size_params_type av1_calculate_next_size_params(AV1_COMP *cpi) { const AV1EncoderConfig *oxcf = &cpi->oxcf; - size_params_type rsz = { - oxcf->width, - oxcf->height, -#if CONFIG_FRAME_SUPERRES - SCALE_NUMERATOR -#endif // CONFIG_FRAME_SUPERRES - }; + size_params_type rsz = { oxcf->width, oxcf->height, SCALE_NUMERATOR }; int resize_denom; if (oxcf->pass == 1) return rsz; if (cpi->resize_pending_width && cpi->resize_pending_height) { @@ -4579,12 +3988,10 @@ size_params_type av1_calculate_next_size_params(AV1_COMP *cpi) { av1_calculate_scaled_size(&rsz.resize_width, &rsz.resize_height, resize_denom); } -#if CONFIG_FRAME_SUPERRES rsz.superres_denom = calculate_next_superres_scale(cpi); if (!validate_size_scales(oxcf->resize_mode, oxcf->superres_mode, oxcf->width, oxcf->height, &rsz)) assert(0 && "Invalid scale parameters"); -#endif // CONFIG_FRAME_SUPERRES return rsz; } @@ -4592,14 +3999,12 @@ static void setup_frame_size_from_params(AV1_COMP *cpi, size_params_type *rsz) { int encode_width = rsz->resize_width; int encode_height = rsz->resize_height; -#if CONFIG_FRAME_SUPERRES AV1_COMMON *cm = &cpi->common; cm->superres_upscaled_width = encode_width; cm->superres_upscaled_height = encode_height; cm->superres_scale_denominator = rsz->superres_denom; av1_calculate_scaled_superres_size(&encode_width, &encode_height, rsz->superres_denom); -#endif // CONFIG_FRAME_SUPERRES set_frame_size(cpi, encode_width, encode_height); } @@ -4608,67 +4013,63 @@ static void setup_frame_size(AV1_COMP *cpi) { setup_frame_size_from_params(cpi, &rsz); } -#if CONFIG_FRAME_SUPERRES static void superres_post_encode(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + + if (!av1_superres_scaled(cm)) return; - if (av1_superres_unscaled(cm)) return; + assert(cpi->oxcf.enable_superres); + assert(!is_lossless_requested(&cpi->oxcf)); + assert(!cm->all_lossless); av1_superres_upscale(cm, NULL); // If regular resizing is occurring the source will need to be downscaled to // match the upscaled superres resolution. Otherwise the original source is // used. - if (av1_resize_unscaled(cm)) { + if (!av1_resize_scaled(cm)) { cpi->source = cpi->unscaled_source; if (cpi->last_source != NULL) cpi->last_source = cpi->unscaled_last_source; } else { assert(cpi->unscaled_source->y_crop_width != cm->superres_upscaled_width); assert(cpi->unscaled_source->y_crop_height != cm->superres_upscaled_height); - // Do downscale. cm->(width|height) has been updated by av1_superres_upscale + // Do downscale. 
cm->(width|height) has been updated by + // av1_superres_upscale if (aom_realloc_frame_buffer( &cpi->scaled_source, cm->superres_upscaled_width, cm->superres_upscaled_height, cm->subsampling_x, cm->subsampling_y, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, -#endif // CONFIG_HIGHBITDEPTH - AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL)) + cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) aom_internal_error( &cm->error, AOM_CODEC_MEM_ERROR, "Failed to reallocate scaled source buffer for superres"); assert(cpi->scaled_source.y_crop_width == cm->superres_upscaled_width); assert(cpi->scaled_source.y_crop_height == cm->superres_upscaled_height); -#if CONFIG_HIGHBITDEPTH av1_resize_and_extend_frame(cpi->unscaled_source, &cpi->scaled_source, - (int)cm->bit_depth); -#else - av1_resize_and_extend_frame(cpi->unscaled_source, &cpi->scaled_source); -#endif // CONFIG_HIGHBITDEPTH + (int)cm->bit_depth, num_planes); cpi->source = &cpi->scaled_source; } } -#endif // CONFIG_FRAME_SUPERRES static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) { + const int num_planes = av1_num_planes(cm); MACROBLOCKD *xd = &cpi->td.mb.e_mbd; - struct loopfilter *lf = &cm->lf; - int no_loopfilter = 0; - if (is_lossless_requested(&cpi->oxcf)) no_loopfilter = 1; + assert(IMPLIES(is_lossless_requested(&cpi->oxcf), + cm->coded_lossless && cm->all_lossless)); -#if CONFIG_EXT_TILE - // 0 loopfilter level is only necessary if individual tile - // decoding is required. - if (cm->single_tile_decoding) no_loopfilter = 1; -#endif // CONFIG_EXT_TILE + const int no_loopfilter = cm->coded_lossless || cm->large_scale_tile; + const int no_cdef = + !cm->seq_params.enable_cdef || cm->coded_lossless || cm->large_scale_tile; + const int no_restoration = !cm->seq_params.enable_restoration || + cm->all_lossless || cm->large_scale_tile; + + struct loopfilter *lf = &cm->lf; if (no_loopfilter) { -#if CONFIG_LOOPFILTER_LEVEL lf->filter_level[0] = 0; lf->filter_level[1] = 0; -#else - lf->filter_level = 0; -#endif } else { struct aom_usec_timer timer; @@ -4682,79 +4083,60 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) { cpi->time_pick_lpf += aom_usec_timer_elapsed(&timer); } -#if !CONFIG_LPF_SB -#if CONFIG_LOOPFILTER_LEVEL - if (lf->filter_level[0] || lf->filter_level[1]) -#else - if (lf->filter_level > 0) -#endif -#endif // CONFIG_LPF_SB - { -#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_CB4X4 -#if CONFIG_LPF_SB - av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0, 0, - 0); -#else -#if CONFIG_LOOPFILTER_LEVEL - av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level[0], - lf->filter_level[1], 0, 0); - av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level_u, - lf->filter_level_u, 1, 0); - av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level_v, - lf->filter_level_v, 2, 0); - -#else - av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0); -#endif // CONFIG_LOOPFILTER_LEVEL -#endif // CONFIG_LPF_SB + if (lf->filter_level[0] || lf->filter_level[1]) { +#if LOOP_FILTER_BITMASK + av1_loop_filter_frame(cm->frame_to_show, cm, xd, 0, num_planes, 0); #else if (cpi->num_workers > 1) - av1_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane, - lf->filter_level, 0, 0, cpi->workers, - cpi->num_workers, &cpi->lf_row_sync); + av1_loop_filter_frame_mt(cm->frame_to_show, cm, xd, 0, num_planes, 0, + cpi->workers, cpi->num_workers, + &cpi->lf_row_sync); else - av1_loop_filter_frame(cm->frame_to_show, cm, xd, 
lf->filter_level, 0, 0); + av1_loop_filter_frame(cm->frame_to_show, cm, xd, 0, num_planes, 0); #endif } -#if CONFIG_STRIPED_LOOP_RESTORATION - av1_loop_restoration_save_boundary_lines(cm->frame_to_show, cm); -#endif + if (!no_restoration) + av1_loop_restoration_save_boundary_lines(cm->frame_to_show, cm, 0); -#if CONFIG_CDEF - if (is_lossless_requested(&cpi->oxcf)) { + if (no_cdef) { cm->cdef_bits = 0; cm->cdef_strengths[0] = 0; cm->nb_cdef_strengths = 1; + cm->cdef_uv_strengths[0] = 0; } else { // Find CDEF parameters av1_cdef_search(cm->frame_to_show, cpi->source, cm, xd, - cpi->oxcf.speed > 0); + cpi->sf.fast_cdef_search); // Apply the filter av1_cdef_frame(cm->frame_to_show, cm, xd); } -#endif -#if CONFIG_FRAME_SUPERRES superres_post_encode(cpi); -#endif // CONFIG_FRAME_SUPERRES -#if CONFIG_LOOP_RESTORATION - aom_extend_frame_borders(cm->frame_to_show); - av1_pick_filter_restoration(cpi->source, cpi, cpi->sf.lpf_pick); - if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE || - cm->rst_info[1].frame_restoration_type != RESTORE_NONE || - cm->rst_info[2].frame_restoration_type != RESTORE_NONE) { - av1_loop_restoration_frame(cm->frame_to_show, cm, cm->rst_info, 7, 0, NULL); + if (no_restoration) { + cm->rst_info[0].frame_restoration_type = RESTORE_NONE; + cm->rst_info[1].frame_restoration_type = RESTORE_NONE; + cm->rst_info[2].frame_restoration_type = RESTORE_NONE; + } else { + av1_loop_restoration_save_boundary_lines(cm->frame_to_show, cm, 1); + av1_pick_filter_restoration(cpi->source, cpi); + if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE || + cm->rst_info[1].frame_restoration_type != RESTORE_NONE || + cm->rst_info[2].frame_restoration_type != RESTORE_NONE) { + if (cpi->num_workers > 1) + av1_loop_restoration_filter_frame_mt(cm->frame_to_show, cm, 0, + cpi->workers, cpi->num_workers, + &cpi->lr_row_sync, &cpi->lr_ctxt); + else + av1_loop_restoration_filter_frame(cm->frame_to_show, cm, 0, + &cpi->lr_ctxt); + } } -#endif // CONFIG_LOOP_RESTORATION - // TODO(debargha): Fix mv search range on encoder side - // aom_extend_frame_inner_borders(cm->frame_to_show); - aom_extend_frame_borders(cm->frame_to_show); } -static void encode_without_recode_loop(AV1_COMP *cpi) { +static int encode_without_recode_loop(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; int q = 0, bottom_index = 0, top_index = 0; // Dummy variables. @@ -4774,10 +4156,7 @@ static void encode_without_recode_loop(AV1_COMP *cpi) { if (cpi->unscaled_last_source != NULL) cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source, &cpi->scaled_last_source); -#if CONFIG_HIGHBITDEPTH && CONFIG_GLOBAL_MOTION cpi->source->buf_8bit_valid = 0; -#endif - if (frame_is_intra_only(cm) == 0) { scale_references(cpi); } @@ -4796,6 +4175,16 @@ static void encode_without_recode_loop(AV1_COMP *cpi) { av1_cyclic_refresh_setup(cpi); } apply_active_map(cpi); + if (cm->seg.enabled) { + if (!cm->seg.update_data && cm->prev_frame) { + segfeatures_copy(&cm->seg, &cm->prev_frame->seg); + } else { + calculate_segdata(&cm->seg); + } + } else { + memset(&cm->seg, 0, sizeof(cm->seg)); + } + segfeatures_copy(&cm->cur_frame->seg, &cm->seg); // transform / motion compensation build reconstruction frame av1_encode_frame(cpi); @@ -4810,29 +4199,25 @@ static void encode_without_recode_loop(AV1_COMP *cpi) { // seen in the last encoder iteration. 
// update_base_skip_probs(cpi); aom_clear_system_state(); + return AOM_CODEC_OK; } -static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, - uint8_t *dest) { +static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; int bottom_index, top_index; int loop_count = 0; int loop_at_this_size = 0; int loop = 0; -#if !CONFIG_XIPHRC int overshoot_seen = 0; int undershoot_seen = 0; -#endif int frame_over_shoot_limit; int frame_under_shoot_limit; int q = 0, q_low = 0, q_high = 0; set_size_independent_vars(cpi); -#if CONFIG_HIGHBITDEPTH && CONFIG_GLOBAL_MOTION cpi->source->buf_8bit_valid = 0; -#endif aom_clear_system_state(); setup_frame_size(cpi); @@ -4845,32 +4230,27 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed. set_mv_search_params(cpi); -#if !CONFIG_XIPHRC // Reset the loop state for new frame size. overshoot_seen = 0; undershoot_seen = 0; -#endif q_low = bottom_index; q_high = top_index; loop_at_this_size = 0; - } - // Decide frame size bounds first time through. - if (loop_count == 0) { + // Decide frame size bounds first time through. av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target, &frame_under_shoot_limit, &frame_over_shoot_limit); } -#if CONFIG_GLOBAL_MOTION - // if frame was scaled calculate global_motion_search again if already done + // if frame was scaled calculate global_motion_search again if already + // done if (loop_count > 0 && cpi->source && cpi->global_motion_search_done) if (cpi->source->y_crop_width != cm->width || cpi->source->y_crop_height != cm->height) cpi->global_motion_search_done = 0; -#endif // CONFIG_GLOBAL_MOTION cpi->source = av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source); if (cpi->unscaled_last_source != NULL) @@ -4884,29 +4264,18 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, scale_references(cpi); } av1_set_quantizer(cm, q); + // printf("Frame %d/%d: q = %d, frame_type = %d\n", cm->current_video_frame, + // cm->show_frame, q, cm->frame_type); if (loop_count == 0) setup_frame(cpi); -#if CONFIG_Q_ADAPT_PROBS // Base q-index may have changed, so we need to assign proper default coef // probs before every iteration. - if (frame_is_intra_only(cm) || cm->error_resilient_mode) { - int i; + if (cm->primary_ref_frame == PRIMARY_REF_NONE || + cm->frame_refs[cm->primary_ref_frame].idx < 0) { av1_default_coef_probs(cm); - if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode || - cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL) { - for (i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = *cm->fc; - } else if (cm->reset_frame_context == RESET_FRAME_CONTEXT_CURRENT) { -#if CONFIG_NO_FRAME_CONTEXT_SIGNALING - if (cm->frame_refs[0].idx >= 0) { - cm->frame_contexts[cm->frame_refs[0].idx] = *cm->fc; - } -#else - cm->frame_contexts[cm->frame_context_idx] = *cm->fc; -#endif - } + av1_setup_frame_contexts(cm); } -#endif // CONFIG_Q_ADAPT_PROBS // Variance adaptive and in frame q adjustment experiments are mutually // exclusive. 
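To make the reworked SUPERRES_QTHRESH logic in calculate_next_superres_scale() above easier to follow, here is a minimal standalone sketch of the new q-to-denominator mapping. It assumes SCALE_NUMERATOR == 8 and MAXQ == 255 as in libaom; qthresh_denom() and the driver in main() are illustrative stand-ins, not encoder API.

/* Sketch of the denominator mapping introduced above: instead of the old
 * fixed step of 8 q-units (SCALE_NUMERATOR + 1 + ((q - qthresh) >> 3)),
 * the range [qthresh, MAXQ] is split into eight equal steps, so the
 * denominator ramps from 9 up to 16 by the time q reaches MAXQ. */
#include <stdio.h>
#include <stdint.h>

#define SCALE_NUMERATOR 8
#define MAXQ 255

static uint8_t qthresh_denom(int q, int qthresh) {
  if (q < qthresh) return SCALE_NUMERATOR; /* below threshold: no superres */
  const uint8_t min_denom = SCALE_NUMERATOR + 1;
  const uint8_t denom_step = (uint8_t)((MAXQ - qthresh + 1) >> 3);
  if (q == qthresh) return min_denom;
  if (denom_step == 0) return SCALE_NUMERATOR << 1;
  const uint8_t additional = (uint8_t)((q - qthresh) / denom_step);
  const uint8_t denom = (uint8_t)(min_denom + additional);
  return denom < (SCALE_NUMERATOR << 1) ? denom
                                        : (uint8_t)(SCALE_NUMERATOR << 1);
}

int main(void) {
  /* With qthresh = 63 the denominator ramps linearly from 9 toward 16. */
  for (int q = 60; q <= MAXQ; q += 39)
    printf("q=%3d -> denom=%d\n", q, qthresh_denom(q, 63));
  return 0;
}

The practical difference from the deleted code is that the ramp now spans whatever interval is left between qthresh and MAXQ, so the maximum denominator of 2 * SCALE_NUMERATOR is only reached near MAXQ rather than a fixed 56 q-steps past the threshold.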
@@ -4915,6 +4284,16 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { av1_setup_in_frame_q_adj(cpi); } + if (cm->seg.enabled) { + if (!cm->seg.update_data && cm->prev_frame) { + segfeatures_copy(&cm->seg, &cm->prev_frame->seg); + } else { + calculate_segdata(&cm->seg); + } + } else { + memset(&cm->seg, 0, sizeof(cm->seg)); + } + segfeatures_copy(&cm->cur_frame->seg, &cm->seg); // transform / motion compensation build reconstruction frame save_coding_context(cpi); @@ -4931,7 +4310,9 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, // to recode. if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) { restore_coding_context(cpi); - av1_pack_bitstream(cpi, dest, size); + + if (av1_pack_bitstream(cpi, dest, size) != AOM_CODEC_OK) + return AOM_CODEC_ERROR; rc->projected_frame_size = (int)(*size) << 3; restore_coding_context(cpi); @@ -4950,16 +4331,11 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, int64_t high_err_target = cpi->ambient_err; int64_t low_err_target = cpi->ambient_err >> 1; -#if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) { kf_err = aom_highbd_get_y_sse(cpi->source, get_frame_new_buffer(cm)); } else { kf_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm)); } -#else - kf_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm)); -#endif // CONFIG_HIGHBITDEPTH - // Prevent possible divide by zero error below for perfect KF kf_err += !kf_err; @@ -4996,7 +4372,6 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, // Is the projected frame size out of range and are we allowed // to attempt to recode. int last_q = q; -#if !CONFIG_XIPHRC int retries = 0; // Frame size out of permitted range: @@ -5062,7 +4437,6 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, undershoot_seen = 1; } -#endif // Clamp Q to upper and lower limits: q = clamp(q, q_low, q_high); @@ -5078,11 +4452,9 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, rc->projected_frame_size < rc->max_frame_bandwidth) loop = 0; -#if CONFIG_GLOBAL_MOTION if (recode_loop_test_global_motion(cpi)) { loop = 1; } -#endif // CONFIG_GLOBAL_MOTION if (loop) { ++loop_count; @@ -5093,86 +4465,90 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, #endif } } while (loop); + + return AOM_CODEC_OK; } static int get_ref_frame_flags(const AV1_COMP *cpi) { const int *const map = cpi->common.ref_frame_map; -#if CONFIG_EXT_REFS - const int last2_is_last = - map[cpi->lst_fb_idxes[1]] == map[cpi->lst_fb_idxes[0]]; - const int last3_is_last = - map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[0]]; - const int gld_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[0]]; -#if CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS - const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idxes[0]]; - const int last3_is_last2 = - map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[1]]; - const int gld_is_last2 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[1]]; - const int gld_is_last3 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[2]]; -#else // !CONFIG_ONE_SIDED_COMPOUND || CONFIG_EXT_COMP_REFS - const int bwd_is_last = map[cpi->bwd_fb_idx] == map[cpi->lst_fb_idxes[0]]; - const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idxes[0]]; - - const int last3_is_last2 = - map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[1]]; - const int gld_is_last2 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[1]]; - const int bwd_is_last2 = map[cpi->bwd_fb_idx] == 
map[cpi->lst_fb_idxes[1]]; - - const int gld_is_last3 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[2]]; - const int bwd_is_last3 = map[cpi->bwd_fb_idx] == map[cpi->lst_fb_idxes[2]]; - - const int bwd_is_gld = map[cpi->bwd_fb_idx] == map[cpi->gld_fb_idx]; -#endif // CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS - - const int alt2_is_last = map[cpi->alt2_fb_idx] == map[cpi->lst_fb_idxes[0]]; - const int alt2_is_last2 = map[cpi->alt2_fb_idx] == map[cpi->lst_fb_idxes[1]]; - const int alt2_is_last3 = map[cpi->alt2_fb_idx] == map[cpi->lst_fb_idxes[2]]; - const int alt2_is_gld = map[cpi->alt2_fb_idx] == map[cpi->gld_fb_idx]; - const int alt2_is_bwd = map[cpi->alt2_fb_idx] == map[cpi->bwd_fb_idx]; - - const int last2_is_alt = map[cpi->lst_fb_idxes[1]] == map[cpi->alt_fb_idx]; - const int last3_is_alt = map[cpi->lst_fb_idxes[2]] == map[cpi->alt_fb_idx]; - const int gld_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx]; - const int bwd_is_alt = map[cpi->bwd_fb_idx] == map[cpi->alt_fb_idx]; - const int alt2_is_alt = map[cpi->alt2_fb_idx] == map[cpi->alt_fb_idx]; -#else // !CONFIG_EXT_REFS - const int gld_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx]; - const int gld_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx]; - const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx]; -#endif // CONFIG_EXT_REFS - - int flags = AOM_REFFRAME_ALL; - - if (gld_is_last || gld_is_alt) flags &= ~AOM_GOLD_FLAG; + // No.1 Priority: LAST_FRAME + const int last2_is_last = map[cpi->ref_fb_idx[1]] == map[cpi->ref_fb_idx[0]]; + const int last3_is_last = map[cpi->ref_fb_idx[2]] == map[cpi->ref_fb_idx[0]]; + const int gld_is_last = + map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]] == map[cpi->ref_fb_idx[0]]; + const int bwd_is_last = + map[cpi->ref_fb_idx[BWDREF_FRAME - 1]] == map[cpi->ref_fb_idx[0]]; + const int alt2_is_last = + map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]] == map[cpi->ref_fb_idx[0]]; + const int alt_is_last = + map[cpi->ref_fb_idx[ALTREF_FRAME - 1]] == map[cpi->ref_fb_idx[0]]; + + // No.2 Priority: ALTREF_FRAME + const int last2_is_alt = + map[cpi->ref_fb_idx[1]] == map[cpi->ref_fb_idx[ALTREF_FRAME - 1]]; + const int last3_is_alt = + map[cpi->ref_fb_idx[2]] == map[cpi->ref_fb_idx[ALTREF_FRAME - 1]]; + const int gld_is_alt = map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]] == + map[cpi->ref_fb_idx[ALTREF_FRAME - 1]]; + const int bwd_is_alt = map[cpi->ref_fb_idx[BWDREF_FRAME - 1]] == + map[cpi->ref_fb_idx[ALTREF_FRAME - 1]]; + const int alt2_is_alt = map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]] == + map[cpi->ref_fb_idx[ALTREF_FRAME - 1]]; + + // No.3 Priority: LAST2_FRAME + const int last3_is_last2 = map[cpi->ref_fb_idx[2]] == map[cpi->ref_fb_idx[1]]; + const int gld_is_last2 = + map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]] == map[cpi->ref_fb_idx[1]]; + const int bwd_is_last2 = + map[cpi->ref_fb_idx[BWDREF_FRAME - 1]] == map[cpi->ref_fb_idx[1]]; + const int alt2_is_last2 = + map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]] == map[cpi->ref_fb_idx[1]]; + + // No.4 Priority: LAST3_FRAME + const int gld_is_last3 = + map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]] == map[cpi->ref_fb_idx[2]]; + const int bwd_is_last3 = + map[cpi->ref_fb_idx[BWDREF_FRAME - 1]] == map[cpi->ref_fb_idx[2]]; + const int alt2_is_last3 = + map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]] == map[cpi->ref_fb_idx[2]]; + + // No.5 Priority: GOLDEN_FRAME + const int bwd_is_gld = map[cpi->ref_fb_idx[BWDREF_FRAME - 1]] == + map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]]; + const int alt2_is_gld = map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]] == + map[cpi->ref_fb_idx[GOLDEN_FRAME - 
1]]; + + // No.6 Priority: BWDREF_FRAME + const int alt2_is_bwd = map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]] == + map[cpi->ref_fb_idx[BWDREF_FRAME - 1]]; + + // No.7 Priority: ALTREF2_FRAME + + // After av1_apply_encoding_flags() is called, cpi->ref_frame_flags might be + // adjusted according to external encoder flags. + int flags = cpi->ext_ref_frame_flags; if (cpi->rc.frames_till_gf_update_due == INT_MAX) flags &= ~AOM_GOLD_FLAG; if (alt_is_last) flags &= ~AOM_ALT_FLAG; -#if CONFIG_EXT_REFS if (last2_is_last || last2_is_alt) flags &= ~AOM_LAST2_FLAG; - if (last3_is_last || last3_is_last2 || last3_is_alt) flags &= ~AOM_LAST3_FLAG; + if (last3_is_last || last3_is_alt || last3_is_last2) flags &= ~AOM_LAST3_FLAG; - if (gld_is_last2 || gld_is_last3) flags &= ~AOM_GOLD_FLAG; + if (gld_is_last || gld_is_alt || gld_is_last2 || gld_is_last3) + flags &= ~AOM_GOLD_FLAG; -#if CONFIG_ONE_SIDED_COMPOUND && \ - !CONFIG_EXT_COMP_REFS // Changes LL & HL bitstream - /* Allow biprediction between two identical frames (e.g. bwd_is_last = 1) */ - if (bwd_is_alt && (flags & AOM_BWD_FLAG)) flags &= ~AOM_BWD_FLAG; -#else // !CONFIG_ONE_SIDED_COMPOUND || CONFIG_EXT_COMP_REFS - if ((bwd_is_last || bwd_is_last2 || bwd_is_last3 || bwd_is_gld || - bwd_is_alt) && + if ((bwd_is_last || bwd_is_alt || bwd_is_last2 || bwd_is_last3 || + bwd_is_gld) && (flags & AOM_BWD_FLAG)) flags &= ~AOM_BWD_FLAG; -#endif // CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS - if ((alt2_is_last || alt2_is_last2 || alt2_is_last3 || alt2_is_gld || - alt2_is_bwd || alt2_is_alt) && + if ((alt2_is_last || alt2_is_alt || alt2_is_last2 || alt2_is_last3 || + alt2_is_gld || alt2_is_bwd) && (flags & AOM_ALT2_FLAG)) flags &= ~AOM_ALT2_FLAG; -#endif // CONFIG_EXT_REFS return flags; } @@ -5182,6 +4558,9 @@ static void set_ext_overrides(AV1_COMP *cpi) { // av1_update_reference() and av1_update_entropy() calls // Note: The overrides are valid only for the next frame passed // to encode_frame_to_data_rate() function + if (cpi->ext_use_s_frame) cpi->common.frame_type = S_FRAME; + cpi->common.force_primary_ref_none = cpi->ext_use_primary_ref_none; + if (cpi->ext_refresh_frame_context_pending) { cpi->common.refresh_frame_context = cpi->ext_refresh_frame_context; cpi->ext_refresh_frame_context_pending = 0; @@ -5190,54 +4569,23 @@ static void set_ext_overrides(AV1_COMP *cpi) { cpi->refresh_last_frame = cpi->ext_refresh_last_frame; cpi->refresh_golden_frame = cpi->ext_refresh_golden_frame; cpi->refresh_alt_ref_frame = cpi->ext_refresh_alt_ref_frame; + cpi->refresh_bwd_ref_frame = cpi->ext_refresh_bwd_ref_frame; + cpi->refresh_alt2_ref_frame = cpi->ext_refresh_alt2_ref_frame; cpi->ext_refresh_frame_flags_pending = 0; } + cpi->common.allow_ref_frame_mvs = cpi->ext_use_ref_frame_mvs; + cpi->common.error_resilient_mode = cpi->ext_use_error_resilient; } -#if !CONFIG_FRAME_SIGN_BIAS -static void set_arf_sign_bias(AV1_COMP *cpi) { - AV1_COMMON *const cm = &cpi->common; - int arf_sign_bias; -#if CONFIG_EXT_REFS - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - // The arf_sign_bias will be one for internal ARFs' - arf_sign_bias = cpi->rc.source_alt_ref_active && - (!cpi->refresh_alt_ref_frame || - gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE); -#else // !CONFIG_EXT_REFS - if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - arf_sign_bias = cpi->rc.source_alt_ref_active && - (!cpi->refresh_alt_ref_frame || - (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)); - } else { - 
arf_sign_bias = - (cpi->rc.source_alt_ref_active && !cpi->refresh_alt_ref_frame); - } -#endif // CONFIG_EXT_REFS - - cm->ref_frame_sign_bias[ALTREF_FRAME] = arf_sign_bias; -#if CONFIG_EXT_REFS - cm->ref_frame_sign_bias[BWDREF_FRAME] = cm->ref_frame_sign_bias[ALTREF_FRAME]; - cm->ref_frame_sign_bias[ALTREF2_FRAME] = - cm->ref_frame_sign_bias[ALTREF_FRAME]; -#endif // CONFIG_EXT_REFS -} -#endif // !CONFIG_FRAME_SIGN_BIAS - static int setup_interp_filter_search_mask(AV1_COMP *cpi) { InterpFilter ifilter; - int ref_total[TOTAL_REFS_PER_FRAME] = { 0 }; + int ref_total[REF_FRAMES] = { 0 }; MV_REFERENCE_FRAME ref; int mask = 0; int arf_idx = ALTREF_FRAME; -#if CONFIG_EXT_REFS if (cpi->common.last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame || cpi->refresh_alt2_ref_frame) -#else // !CONFIG_EXT_REFS - if (cpi->common.last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame) -#endif // CONFIG_EXT_REFS return mask; for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) @@ -5247,25 +4595,21 @@ static int setup_interp_filter_search_mask(AV1_COMP *cpi) { for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter) { if ((ref_total[LAST_FRAME] && cpi->interp_filter_selected[LAST_FRAME][ifilter] == 0) && -#if CONFIG_EXT_REFS (ref_total[LAST2_FRAME] == 0 || cpi->interp_filter_selected[LAST2_FRAME][ifilter] * 50 < ref_total[LAST2_FRAME]) && (ref_total[LAST3_FRAME] == 0 || cpi->interp_filter_selected[LAST3_FRAME][ifilter] * 50 < ref_total[LAST3_FRAME]) && -#endif // CONFIG_EXT_REFS (ref_total[GOLDEN_FRAME] == 0 || cpi->interp_filter_selected[GOLDEN_FRAME][ifilter] * 50 < ref_total[GOLDEN_FRAME]) && -#if CONFIG_EXT_REFS (ref_total[BWDREF_FRAME] == 0 || cpi->interp_filter_selected[BWDREF_FRAME][ifilter] * 50 < ref_total[BWDREF_FRAME]) && (ref_total[ALTREF2_FRAME] == 0 || cpi->interp_filter_selected[ALTREF2_FRAME][ifilter] * 50 < ref_total[ALTREF2_FRAME]) && -#endif // CONFIG_EXT_REFS (ref_total[ALTREF_FRAME] == 0 || cpi->interp_filter_selected[arf_idx][ifilter] * 50 < ref_total[ALTREF_FRAME])) @@ -5281,16 +4625,50 @@ static int setup_interp_filter_search_mask(AV1_COMP *cpi) { static void dump_filtered_recon_frames(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; const YV12_BUFFER_CONFIG *recon_buf = cm->frame_to_show; - int h; - char file_name[256] = "/tmp/enc_filtered_recon.yuv"; - FILE *f_recon = NULL; - if (recon_buf == NULL || !cm->show_frame) { - printf("Frame %d is not ready or no show to dump.\n", + if (recon_buf == NULL) { + printf("Frame %d is not ready.\n", cm->current_video_frame); + return; + } + + static const int flag_list[REF_FRAMES] = { 0, + AOM_LAST_FLAG, + AOM_LAST2_FLAG, + AOM_LAST3_FLAG, + AOM_GOLD_FLAG, + AOM_BWD_FLAG, + AOM_ALT2_FLAG, + AOM_ALT_FLAG }; + printf( + "\n***Frame=%d (frame_offset=%d, show_frame=%d, " + "show_existing_frame=%d) " + "[LAST LAST2 LAST3 GOLDEN BWD ALT2 ALT]=[", + cm->current_video_frame, cm->frame_offset, cm->show_frame, + cm->show_existing_frame); + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const int buf_idx = cm->frame_refs[ref_frame - LAST_FRAME].idx; + const int ref_offset = + (buf_idx >= 0) + ? (int)cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset + : -1; + printf( + " %d(%c-%d-%4.2f)", ref_offset, + (cpi->ref_frame_flags & flag_list[ref_frame]) ? 'Y' : 'N', + (buf_idx >= 0) ? (int)cpi->frame_rf_level[buf_idx] : -1, + (buf_idx >= 0) ? 
rate_factor_deltas[cpi->frame_rf_level[buf_idx]] : -1); + } + printf(" ]\n"); + + if (!cm->show_frame) { + printf("Frame %d is a no show frame, so no image dump.\n", cm->current_video_frame); return; } + int h; + char file_name[256] = "/tmp/enc_filtered_recon.yuv"; + FILE *f_recon = NULL; + if (cm->current_video_frame == 0) { if ((f_recon = fopen(file_name, "wb")) == NULL) { printf("Unable to open file %s to write.\n", file_name); @@ -5303,13 +4681,14 @@ static void dump_filtered_recon_frames(AV1_COMP *cpi) { } } printf( - "\nFrame=%5d, encode_update_type[%5d]=%1d, show_existing_frame=%d, " - "source_alt_ref_active=%d, refresh_alt_ref_frame=%d, rf_level=%d, " - "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n", + "\nFrame=%5d, encode_update_type[%5d]=%1d, frame_offset=%d, " + "show_frame=%d, show_existing_frame=%d, source_alt_ref_active=%d, " + "refresh_alt_ref_frame=%d, rf_level=%d, " + "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n\n", cm->current_video_frame, cpi->twopass.gf_group.index, cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index], - cm->show_existing_frame, cpi->rc.source_alt_ref_active, - cpi->refresh_alt_ref_frame, + cm->frame_offset, cm->show_frame, cm->show_existing_frame, + cpi->rc.source_alt_ref_active, cpi->refresh_alt_ref_frame, cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index], recon_buf->y_stride, recon_buf->uv_stride, cm->width, cm->height); #if 0 @@ -5346,49 +4725,44 @@ static void dump_filtered_recon_frames(AV1_COMP *cpi) { } #endif // DUMP_RECON_FRAMES -static void make_update_tile_list_enc(AV1_COMP *cpi, const int tile_rows, - const int tile_cols, - FRAME_CONTEXT *ec_ctxs[]) { - int i; - for (i = 0; i < tile_rows * tile_cols; ++i) - ec_ctxs[i] = &cpi->tile_data[i].tctx; -} - -static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, - uint8_t *dest, int skip_adapt, - unsigned int *frame_flags) { +static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, + int skip_adapt, + unsigned int *frame_flags) { AV1_COMMON *const cm = &cpi->common; const AV1EncoderConfig *const oxcf = &cpi->oxcf; struct segmentation *const seg = &cm->seg; - FRAME_CONTEXT **tile_ctxs = aom_malloc(cm->tile_rows * cm->tile_cols * - sizeof(&cpi->tile_data[0].tctx)); - aom_cdf_prob **cdf_ptrs = - aom_malloc(cm->tile_rows * cm->tile_cols * - sizeof(&cpi->tile_data[0].tctx.partition_cdf[0][0])); -#if CONFIG_XIPHRC - int frame_type; - int drop_this_frame = 0; -#endif // CONFIG_XIPHRC + set_ext_overrides(cpi); aom_clear_system_state(); -#if !CONFIG_FRAME_SIGN_BIAS - // Set the arf sign bias for this frame. 
- set_arf_sign_bias(cpi); -#endif // !CONFIG_FRAME_SIGN_BIAS - -#if CONFIG_TEMPMV_SIGNALING // frame type has been decided outside of this function call - cm->cur_frame->intra_only = cm->frame_type == KEY_FRAME || cm->intra_only; - cm->use_prev_frame_mvs = - !cpi->oxcf.disable_tempmv && !cm->cur_frame->intra_only; -#endif + cm->cur_frame->intra_only = frame_is_intra_only(cm); + cm->cur_frame->frame_type = cm->frame_type; + + // S_FRAMEs are always error resilient + cm->error_resilient_mode |= frame_is_sframe(cm); + + cm->large_scale_tile = cpi->oxcf.large_scale_tile; + cm->single_tile_decoding = cpi->oxcf.single_tile_decoding; + if (cm->large_scale_tile) cm->seq_params.frame_id_numbers_present_flag = 0; + + cm->allow_ref_frame_mvs &= frame_might_allow_ref_frame_mvs(cm); + // cm->allow_ref_frame_mvs needs to be written into the frame header while + // cm->large_scale_tile is 1, therefore, "cm->large_scale_tile=1" case is + // separated from frame_might_allow_ref_frame_mvs(). + cm->allow_ref_frame_mvs &= !cm->large_scale_tile; + + cm->allow_warped_motion = + cpi->oxcf.allow_warped_motion && frame_might_allow_warped_motion(cm); + + // Reset the frame packet stamp index. + if (cm->frame_type == KEY_FRAME) cm->current_video_frame = 0; -#if CONFIG_EXT_REFS // NOTE: // (1) Move the setup of the ref_frame_flags upfront as it would be // determined by the current frame properties; - // (2) The setup of the ref_frame_flags applies to both show_existing_frame's + // (2) The setup of the ref_frame_flags applies to both + // show_existing_frame's // and the other cases. if (cm->current_video_frame > 0) cpi->ref_frame_flags = get_ref_frame_flags(cpi); @@ -5415,12 +4789,20 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, cpi->rc.is_bipred_frame = 0; restore_coding_context(cpi); + // Build the bitstream - av1_pack_bitstream(cpi, dest, size); + if (av1_pack_bitstream(cpi, dest, size) != AOM_CODEC_OK) + return AOM_CODEC_ERROR; + + cpi->seq_params_locked = 1; // Set up frame to show to get ready for stats collection. cm->frame_to_show = get_frame_new_buffer(cm); + // Update current frame offset. + cm->frame_offset = + cm->buffer_pool->frame_bufs[cm->new_fb_idx].cur_frame_offset; + #if DUMP_RECON_FRAMES == 1 // NOTE(zoeliu): For debug - Output the filtered reconstructed video. dump_filtered_recon_frames(cpi); @@ -5432,9 +4814,11 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, // update has been done previously when handling the LAST_BIPRED_FRAME // right before BWDREF_FRAME (in the display order); // (2) For INTNL_OVERLAY as the show_existing_frame, the reference frame - // update will be done when the following is called, which will exchange + // update will be done when the following is called, which will + // exchange // the virtual indexes between LAST_FRAME and ALTREF2_FRAME, so that - // LAST3 will get retired, LAST2 becomes LAST3, LAST becomes LAST2, and + // LAST3 will get retired, LAST2 becomes LAST3, LAST becomes LAST2, + // and // ALTREF2_FRAME will serve as the new LAST_FRAME. update_reference_frames(cpi); @@ -5452,23 +4836,13 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, // to do post-encoding update accordingly. if (cpi->rc.is_src_frame_alt_ref) { av1_set_target_rate(cpi, cm->width, cm->height); -#if CONFIG_XIPHRC - frame_type = cm->frame_type == INTER_FRAME ? 
OD_P_FRAME : OD_I_FRAME; - drop_this_frame = od_enc_rc_update_state( - &cpi->od_rc, *size << 3, cpi->refresh_golden_frame, - cpi->refresh_alt_ref_frame, frame_type, cpi->droppable); -#else av1_rc_postencode_update(cpi, *size); -#endif } ++cm->current_video_frame; - aom_free(tile_ctxs); - aom_free(cdf_ptrs); - return; + return AOM_CODEC_OK; } -#endif // CONFIG_EXT_REFS // Set default state for segment based loop filter update flags. cm->lf.mode_ref_delta_update = 0; @@ -5477,7 +4851,7 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, cpi->sf.interp_filter_search_mask = setup_interp_filter_search_mask(cpi); // Set various flags etc to special state if it is a key frame. - if (frame_is_intra_only(cm)) { + if (frame_is_intra_only(cm) || frame_is_sframe(cm)) { // Reset the loop filter deltas and segmentation map. av1_reset_segment_features(cm); @@ -5489,19 +4863,6 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, // The alternate reference frame cannot be active for a key frame. cpi->rc.source_alt_ref_active = 0; - - cm->error_resilient_mode = oxcf->error_resilient_mode; - -#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING - // By default, encoder assumes decoder can use prev_mi. - if (cm->error_resilient_mode) { - cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE; - cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_FORWARD; - } else if (cm->intra_only) { - // Only reset the current context. - cm->reset_frame_context = RESET_FRAME_CONTEXT_CURRENT; - } -#endif } if (cpi->oxcf.mtu == 0) { cm->num_tg = cpi->oxcf.num_tile_groups; @@ -5511,33 +4872,15 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, cm->num_tg = DEFAULT_MAX_NUM_TG; } -#if CONFIG_EXT_TILE - cm->large_scale_tile = cpi->oxcf.large_scale_tile; - cm->single_tile_decoding = cpi->oxcf.single_tile_decoding; -#endif // CONFIG_EXT_TILE - -#if CONFIG_XIPHRC - if (drop_this_frame) { - av1_rc_postencode_update_drop_frame(cpi); - ++cm->current_video_frame; - aom_free(tile_ctxs); - aom_free(cdf_ptrs); - return; - } -#else // For 1 pass CBR, check if we are dropping this frame. // Never drop on key frame. 
if (oxcf->pass == 0 && oxcf->rc_mode == AOM_CBR && cm->frame_type != KEY_FRAME) { if (av1_rc_drop_frame(cpi)) { av1_rc_postencode_update_drop_frame(cpi); - ++cm->current_video_frame; - aom_free(tile_ctxs); - aom_free(cdf_ptrs); - return; + return AOM_CODEC_OK; } } -#endif aom_clear_system_state(); @@ -5546,46 +4889,59 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, MAX_MODES * sizeof(*cpi->mode_chosen_counts)); #endif -#if CONFIG_REFERENCE_BUFFER if (cm->seq_params.frame_id_numbers_present_flag) { /* Non-normative definition of current_frame_id ("frame counter" with - * wraparound) */ - const int frame_id_length = FRAME_ID_LENGTH_MINUS7 + 7; + * wraparound) */ + const int frame_id_length = FRAME_ID_LENGTH; if (cm->current_frame_id == -1) { int lsb, msb; -/* quasi-random initialization of current_frame_id for a key frame */ -#if CONFIG_HIGHBITDEPTH + /* quasi-random initialization of current_frame_id for a key frame */ if (cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) { lsb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[0] & 0xff; msb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[1] & 0xff; } else { -#endif lsb = cpi->source->y_buffer[0] & 0xff; msb = cpi->source->y_buffer[1] & 0xff; -#if CONFIG_HIGHBITDEPTH } -#endif cm->current_frame_id = ((msb << 8) + lsb) % (1 << frame_id_length); + + // S_frame is meant for stitching different streams of different + // resolutions together, so current_frame_id must be the + // same across different streams of the same content current_frame_id + // should be the same and not random. 0x37 is a chosen number as start + // point + if (cpi->oxcf.sframe_enabled) cm->current_frame_id = 0x37; } else { cm->current_frame_id = (cm->current_frame_id + 1 + (1 << frame_id_length)) % (1 << frame_id_length); } } -#endif // CONFIG_REFERENCE_BUFFER -#if CONFIG_EXT_DELTA_Q - cm->delta_q_present_flag = cpi->oxcf.deltaq_mode != NO_DELTA_Q; - cm->delta_lf_present_flag = cpi->oxcf.deltaq_mode == DELTA_Q_LF; -#if CONFIG_LOOPFILTER_LEVEL - cm->delta_lf_multi = DEFAULT_DELTA_LF_MULTI; -#endif // CONFIG_LOOPFILTER_LEVEL -#endif + switch (cpi->oxcf.cdf_update_mode) { + case 0: // No CDF update for any frames(4~6% compression loss). + cm->disable_cdf_update = 1; + break; + case 1: // Enable CDF update for all frames. + cm->disable_cdf_update = 0; + break; + case 2: + // Strategically determine at which frames to do CDF update. + // Currently only enable CDF update for all-intra and no-show frames(1.5% + // compression loss). + // TODO(huisu@google.com): design schemes for various trade-offs between + // compression quality and decoding speed. + cm->disable_cdf_update = + (frame_is_intra_only(cm) || !cm->show_frame) ? 0 : 1; + break; + } + cm->timing_info_present &= !cm->seq_params.reduced_still_picture_hdr; if (cpi->sf.recode_loop == DISALLOW_RECODE) { - encode_without_recode_loop(cpi); + if (encode_without_recode_loop(cpi) != AOM_CODEC_OK) return AOM_CODEC_ERROR; } else { - encode_with_recode_loop(cpi, size, dest); + if (encode_with_recode_loop(cpi, size, dest) != AOM_CODEC_OK) + return AOM_CODEC_ERROR; } cm->last_tile_cols = cm->tile_cols; @@ -5601,72 +4957,86 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, // fixed interval. 
Note the reconstruction error if it is the frame before // the force key frame if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) { -#if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) { cpi->ambient_err = aom_highbd_get_y_sse(cpi->source, get_frame_new_buffer(cm)); } else { cpi->ambient_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm)); } -#else - cpi->ambient_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm)); -#endif // CONFIG_HIGHBITDEPTH } - // If the encoder forced a KEY_FRAME decision - if (cm->frame_type == KEY_FRAME) { + // If the encoder forced a KEY_FRAME decision or if frame is an S_FRAME + if (cm->frame_type == KEY_FRAME || frame_is_sframe(cm)) { cpi->refresh_last_frame = 1; } cm->frame_to_show = get_frame_new_buffer(cm); - cm->frame_to_show->color_space = cm->color_space; -#if CONFIG_COLORSPACE_HEADERS - cm->frame_to_show->transfer_function = cm->transfer_function; + cm->frame_to_show->color_primaries = cm->color_primaries; + cm->frame_to_show->transfer_characteristics = cm->transfer_characteristics; + cm->frame_to_show->matrix_coefficients = cm->matrix_coefficients; + cm->frame_to_show->monochrome = cm->seq_params.monochrome; cm->frame_to_show->chroma_sample_position = cm->chroma_sample_position; -#endif cm->frame_to_show->color_range = cm->color_range; cm->frame_to_show->render_width = cm->render_width; cm->frame_to_show->render_height = cm->render_height; -#if CONFIG_EXT_REFS -// TODO(zoeliu): For non-ref frames, loop filtering may need to be turned -// off. -#endif // CONFIG_EXT_REFS + // TODO(zoeliu): For non-ref frames, loop filtering may need to be turned + // off. // Pick the loop filter level for the frame. - loopfilter_frame(cpi, cm); + if (!cm->allow_intrabc) { + loopfilter_frame(cpi, cm); + } else { + cm->lf.filter_level[0] = 0; + cm->lf.filter_level[1] = 0; + cm->cdef_bits = 0; + cm->cdef_strengths[0] = 0; + cm->nb_cdef_strengths = 1; + cm->cdef_uv_strengths[0] = 0; + cm->rst_info[0].frame_restoration_type = RESTORE_NONE; + cm->rst_info[1].frame_restoration_type = RESTORE_NONE; + cm->rst_info[2].frame_restoration_type = RESTORE_NONE; + } + + // TODO(debargha): Fix mv search range on encoder side + // aom_extend_frame_inner_borders(cm->frame_to_show, av1_num_planes(cm)); + aom_extend_frame_borders(cm->frame_to_show, av1_num_planes(cm)); #ifdef OUTPUT_YUV_REC aom_write_one_yuv_frame(cm, cm->frame_to_show); #endif // Build the bitstream - av1_pack_bitstream(cpi, dest, size); + if (av1_pack_bitstream(cpi, dest, size) != AOM_CODEC_OK) + return AOM_CODEC_ERROR; - if (skip_adapt) { - aom_free(tile_ctxs); - aom_free(cdf_ptrs); - return; - } + cpi->seq_params_locked = 1; + + if (skip_adapt) return AOM_CODEC_OK; -#if CONFIG_REFERENCE_BUFFER if (cm->seq_params.frame_id_numbers_present_flag) { int i; - /* Update reference frame id values based on the value of refresh_mask */ + // Update reference frame id values based on the value of refresh_frame_mask for (i = 0; i < REF_FRAMES; i++) { - if ((cm->refresh_mask >> i) & 1) { + if ((cpi->refresh_frame_mask >> i) & 1) { cm->ref_frame_id[i] = cm->current_frame_id; } } } -#endif // CONFIG_REFERENCE_BUFFER #if DUMP_RECON_FRAMES == 1 // NOTE(zoeliu): For debug - Output the filtered reconstructed video. 
- if (cm->show_frame) dump_filtered_recon_frames(cpi); + dump_filtered_recon_frames(cpi); #endif // DUMP_RECON_FRAMES - if (cm->seg.update_map) update_reference_segmentation_map(cpi); + if (cm->seg.enabled) { + if (cm->seg.update_map) { + update_reference_segmentation_map(cpi); + } else if (cm->last_frame_seg_map) { + memcpy(cm->current_frame_seg_map, cm->last_frame_seg_map, + cm->mi_cols * cm->mi_rows * sizeof(uint8_t)); + } + } if (frame_is_intra_only(cm) == 0) { release_scaled_references(cpi); @@ -5675,39 +5045,12 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, update_reference_frames(cpi); #if CONFIG_ENTROPY_STATS - av1_accumulate_frame_counts(&aggregate_fc, &cm->counts); - assert(cm->frame_context_idx < FRAME_CONTEXTS); - av1_accumulate_frame_counts(&aggregate_fc_per_type[cm->frame_context_idx], - &cm->counts); + av1_accumulate_frame_counts(&aggregate_fc, &cpi->counts); #endif // CONFIG_ENTROPY_STATS - if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { -#if CONFIG_LV_MAP - av1_adapt_coef_probs(cm); -#endif // CONFIG_LV_MAP - av1_adapt_intra_frame_probs(cm); - make_update_tile_list_enc(cpi, cm->tile_rows, cm->tile_cols, tile_ctxs); - av1_average_tile_coef_cdfs(cpi->common.fc, tile_ctxs, cdf_ptrs, - cm->tile_rows * cm->tile_cols); - av1_average_tile_intra_cdfs(cpi->common.fc, tile_ctxs, cdf_ptrs, - cm->tile_rows * cm->tile_cols); -#if CONFIG_PVQ - av1_average_tile_pvq_cdfs(cpi->common.fc, tile_ctxs, - cm->tile_rows * cm->tile_cols); -#endif // CONFIG_PVQ -#if CONFIG_ADAPT_SCAN - av1_adapt_scan_order(cm); -#endif // CONFIG_ADAPT_SCAN - } - if (!frame_is_intra_only(cm)) { - if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { - av1_adapt_inter_frame_probs(cm); - av1_adapt_mv_probs(cm, cm->allow_high_precision_mv); - av1_average_tile_inter_cdfs(&cpi->common, cpi->common.fc, tile_ctxs, - cdf_ptrs, cm->tile_rows * cm->tile_cols); - av1_average_tile_mv_cdfs(cpi->common.fc, tile_ctxs, cdf_ptrs, - cm->tile_rows * cm->tile_cols); - } + if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { + *cm->fc = cpi->tile_data[cm->largest_tile_id].tctx; + av1_reset_cdf_symbol_counters(cm->fc); } if (cpi->refresh_golden_frame == 1) @@ -5720,39 +5063,14 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, else cpi->frame_flags &= ~FRAMEFLAGS_ALTREF; -#if CONFIG_EXT_REFS if (cpi->refresh_bwd_ref_frame == 1) cpi->frame_flags |= FRAMEFLAGS_BWDREF; else cpi->frame_flags &= ~FRAMEFLAGS_BWDREF; -#endif // CONFIG_EXT_REFS - -#if !CONFIG_EXT_REFS - cpi->ref_frame_flags = get_ref_frame_flags(cpi); -#endif // !CONFIG_EXT_REFS cm->last_frame_type = cm->frame_type; -#if CONFIG_XIPHRC - frame_type = cm->frame_type == KEY_FRAME ? 
OD_I_FRAME : OD_P_FRAME; - - drop_this_frame = - od_enc_rc_update_state(&cpi->od_rc, *size << 3, cpi->refresh_golden_frame, - cpi->refresh_alt_ref_frame, frame_type, 0); - if (drop_this_frame) { - av1_rc_postencode_update_drop_frame(cpi); - ++cm->current_video_frame; - aom_free(tile_ctxs); - aom_free(cdf_ptrs); - return; - } -#else // !CONFIG_XIPHRC av1_rc_postencode_update(cpi, *size); -#endif // CONFIG_XIPHRC - -#if 0 - output_frame_level_debug_stats(cpi); -#endif if (cm->frame_type == KEY_FRAME) { // Tell the caller that the frame was coded as a key frame @@ -5768,90 +5086,79 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, cm->lf.mode_ref_delta_update = 0; if (cm->show_frame) { -#if CONFIG_EXT_REFS -// TODO(zoeliu): We may only swamp mi and prev_mi for those frames that are -// being used as reference. -#endif // CONFIG_EXT_REFS + // TODO(zoeliu): We may only swamp mi and prev_mi for those frames that + // are + // being used as reference. swap_mi_and_prev_mi(cm); // Don't increment frame counters if this was an altref buffer // update not a real frame ++cm->current_video_frame; } -#if CONFIG_EXT_REFS // NOTE: Shall not refer to any frame not used as reference. if (cm->is_reference_frame) { -#endif // CONFIG_EXT_REFS - cm->prev_frame = cm->cur_frame; // keep track of the last coded dimensions cm->last_width = cm->width; cm->last_height = cm->height; // reset to normal state now that we are done. cm->last_show_frame = cm->show_frame; -#if CONFIG_EXT_REFS } -#endif // CONFIG_EXT_REFS - aom_free(tile_ctxs); - aom_free(cdf_ptrs); + return AOM_CODEC_OK; } -static void Pass0Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest, - int skip_adapt, unsigned int *frame_flags) { -#if CONFIG_XIPHRC - int64_t ip_count; - int frame_type, is_golden, is_altref; - - /* Not updated during init so update it here */ - if (cpi->oxcf.rc_mode == AOM_Q) cpi->od_rc.quality = cpi->oxcf.cq_level; - - frame_type = od_frame_type(&cpi->od_rc, cpi->od_rc.cur_frame, &is_golden, - &is_altref, &ip_count); - - if (frame_type == OD_I_FRAME) { - frame_type = KEY_FRAME; - cpi->frame_flags &= FRAMEFLAGS_KEY; - } else if (frame_type == OD_P_FRAME) { - frame_type = INTER_FRAME; - } - - if (is_altref) { - cpi->refresh_alt_ref_frame = 1; - cpi->rc.source_alt_ref_active = 1; - } - - cpi->refresh_golden_frame = is_golden; - cpi->common.frame_type = frame_type; - if (is_golden) cpi->frame_flags &= FRAMEFLAGS_GOLDEN; -#else +static int Pass0Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest, + int skip_adapt, unsigned int *frame_flags) { if (cpi->oxcf.rc_mode == AOM_CBR) { av1_rc_get_one_pass_cbr_params(cpi); } else { av1_rc_get_one_pass_vbr_params(cpi); } -#endif - encode_frame_to_data_rate(cpi, size, dest, skip_adapt, frame_flags); + if (encode_frame_to_data_rate(cpi, size, dest, skip_adapt, frame_flags) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + check_show_existing_frame(cpi); + return AOM_CODEC_OK; } -#if !CONFIG_XIPHRC -static void Pass2Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest, - unsigned int *frame_flags) { - encode_frame_to_data_rate(cpi, size, dest, 0, frame_flags); +static int Pass2Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest, + unsigned int *frame_flags) { +#if CONFIG_MISMATCH_DEBUG + mismatch_move_frame_idx_w(); +#endif +#if TXCOEFF_COST_TIMER + AV1_COMMON *cm = &cpi->common; + cm->txcoeff_cost_timer = 0; + cm->txcoeff_cost_count = 0; +#endif + + if (encode_frame_to_data_rate(cpi, size, dest, 0, frame_flags) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + +#if 
TXCOEFF_COST_TIMER + cm->cum_txcoeff_cost_timer += cm->txcoeff_cost_timer; + fprintf(stderr, + "\ntxb coeff cost block number: %ld, frame time: %ld, cum time %ld " + "in us\n", + cm->txcoeff_cost_count, cm->txcoeff_cost_timer, + cm->cum_txcoeff_cost_timer); +#endif -#if CONFIG_EXT_REFS - // Do not do post-encoding update for those frames that do not have a spot in - // a gf group, but note that an OVERLAY frame always has a spot in a gf group, + // Do not do post-encoding update for those frames that do not have a spot + // in + // a gf group, but note that an OVERLAY frame always has a spot in a gf + // group, // even when show_existing_frame is used. if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref) { av1_twopass_postencode_update(cpi); } check_show_existing_frame(cpi); -#else - av1_twopass_postencode_update(cpi); -#endif // CONFIG_EXT_REFS + return AOM_CODEC_OK; } -#endif int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, @@ -5861,37 +5168,34 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, int res = 0; const int subsampling_x = sd->subsampling_x; const int subsampling_y = sd->subsampling_y; -#if CONFIG_HIGHBITDEPTH const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0; -#endif -#if CONFIG_HIGHBITDEPTH check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y); -#else - check_initial_width(cpi, subsampling_x, subsampling_y); -#endif // CONFIG_HIGHBITDEPTH aom_usec_timer_start(&timer); if (av1_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, -#if CONFIG_HIGHBITDEPTH - use_highbitdepth, -#endif // CONFIG_HIGHBITDEPTH - frame_flags)) + use_highbitdepth, frame_flags)) res = -1; aom_usec_timer_mark(&timer); cpi->time_receive_data += aom_usec_timer_elapsed(&timer); - if ((cm->profile == PROFILE_0 || cm->profile == PROFILE_2) && + if ((cm->profile == PROFILE_0) && !cm->seq_params.monochrome && (subsampling_x != 1 || subsampling_y != 1)) { aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM, - "Non-4:2:0 color format requires profile 1 or 3"); + "Non-4:2:0 color format requires profile 1 or 2"); res = -1; } - if ((cm->profile == PROFILE_1 || cm->profile == PROFILE_3) && - (subsampling_x == 1 && subsampling_y == 1)) { + if ((cm->profile == PROFILE_1) && + !(subsampling_x == 0 && subsampling_y == 0)) { aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM, - "4:2:0 color format requires profile 0 or 2"); + "Profile 1 requires 4:4:4 color format"); + res = -1; + } + if ((cm->profile == PROFILE_2) && (cm->bit_depth <= AOM_BITS_10) && + !(subsampling_x == 1 && subsampling_y == 0)) { + aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM, + "Profile 2 bit-depth < 10 requires 4:2:2 color format"); res = -1; } @@ -5902,13 +5206,10 @@ static int frame_is_reference(const AV1_COMP *cpi) { const AV1_COMMON *cm = &cpi->common; return cm->frame_type == KEY_FRAME || cpi->refresh_last_frame || - cpi->refresh_golden_frame || -#if CONFIG_EXT_REFS - cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame || -#endif // CONFIG_EXT_REFS - cpi->refresh_alt_ref_frame || !cm->error_resilient_mode || - cm->lf.mode_ref_delta_update || cm->seg.update_map || - cm->seg.update_data; + cpi->refresh_golden_frame || cpi->refresh_bwd_ref_frame || + cpi->refresh_alt2_ref_frame || cpi->refresh_alt_ref_frame || + !cm->error_resilient_mode || cm->lf.mode_ref_delta_update || + cm->seg.update_map || cm->seg.update_data; } static void adjust_frame_rate(AV1_COMP *cpi, @@ 
-5968,7 +5269,6 @@ static int get_arf_src_index(AV1_COMP *cpi) { return arf_src_index; } -#if CONFIG_EXT_REFS static int get_brf_src_index(AV1_COMP *cpi) { int brf_src_index = 0; const GF_GROUP *const gf_group = &cpi->twopass.gf_group; @@ -6002,7 +5302,6 @@ static int get_arf2_src_index(AV1_COMP *cpi) { } return arf2_src_index; } -#endif // CONFIG_EXT_REFS static void check_src_altref(AV1_COMP *cpi, const struct lookahead_entry *source) { @@ -6014,14 +5313,10 @@ static void check_src_altref(AV1_COMP *cpi, if (cpi->oxcf.pass == 2) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; rc->is_src_frame_alt_ref = -#if CONFIG_EXT_REFS (gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE) || -#endif // CONFIG_EXT_REFS (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE); -#if CONFIG_EXT_REFS rc->is_src_frame_ext_arf = gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE; -#endif // CONFIG_EXT_REFS } else { rc->is_src_frame_alt_ref = cpi->alt_ref_source && (source == cpi->alt_ref_source); @@ -6031,20 +5326,16 @@ static void check_src_altref(AV1_COMP *cpi, // Current frame is an ARF overlay frame. cpi->alt_ref_source = NULL; -#if CONFIG_EXT_REFS if (rc->is_src_frame_ext_arf && !cpi->common.show_existing_frame) { // For INTNL_OVERLAY, when show_existing_frame == 0, they do need to // refresh the LAST_FRAME, i.e. LAST3 gets retired, LAST2 becomes LAST3, // LAST becomes LAST2, and INTNL_OVERLAY becomes LAST. cpi->refresh_last_frame = 1; } else { -#endif // CONFIG_EXT_REFS // Don't refresh the last buffer for an ARF overlay frame. It will // become the GF so preserve last as an alternative prediction option. cpi->refresh_last_frame = 0; -#if CONFIG_EXT_REFS } -#endif // CONFIG_EXT_REFS } } @@ -6055,10 +5346,10 @@ extern double av1_get_blockiness(const unsigned char *img1, int img1_pitch, static void adjust_image_stat(double y, double u, double v, double all, ImageStat *s) { - s->stat[Y] += y; - s->stat[U] += u; - s->stat[V] += v; - s->stat[ALL] += all; + s->stat[STAT_Y] += y; + s->stat[STAT_U] += u; + s->stat[STAT_V] += v; + s->stat[STAT_ALL] += all; s->worst = AOMMIN(s->worst, all); } @@ -6073,12 +5364,10 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { #endif cpi->bytes += frame_bytes; -#if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) { in_bit_depth = cpi->oxcf.input_bit_depth; bit_depth = cm->bit_depth; } -#endif if (cm->show_frame) { const YV12_BUFFER_CONFIG *orig = cpi->source; const YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show; @@ -6089,28 +5378,20 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { PSNR_STATS psnr; double frame_ssim2 = 0.0, weight = 0.0; aom_clear_system_state(); -// TODO(yaowu): unify these two versions into one. -#if CONFIG_HIGHBITDEPTH + // TODO(yaowu): unify these two versions into one. aom_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth); -#else - aom_calc_psnr(orig, recon, &psnr); -#endif // CONFIG_HIGHBITDEPTH adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3], psnr.psnr[0], &cpi->psnr); cpi->total_sq_error += psnr.sse[0]; cpi->total_samples += psnr.samples[0]; samples = psnr.samples[0]; -// TODO(yaowu): unify these two versions into one. -#if CONFIG_HIGHBITDEPTH + // TODO(yaowu): unify these two versions into one. 
if (cm->use_highbitdepth) frame_ssim2 = aom_highbd_calc_ssim(orig, recon, &weight, bit_depth, in_bit_depth); else frame_ssim2 = aom_calc_ssim(orig, recon, &weight); -#else - frame_ssim2 = aom_calc_ssim(orig, recon, &weight); -#endif // CONFIG_HIGHBITDEPTH cpi->worst_ssim = AOMMIN(cpi->worst_ssim, frame_ssim2); cpi->summed_quality += frame_ssim2 * weight; @@ -6119,18 +5400,19 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { #if 0 { FILE *f = fopen("q_used.stt", "a"); + double y2 = psnr.psnr[1]; + double u2 = psnr.psnr[2]; + double v2 = psnr.psnr[3]; + double frame_psnr2 = psnr.psnr[0]; fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n", - cpi->common.current_video_frame, y2, u2, v2, + cm->current_video_frame, y2, u2, v2, frame_psnr2, frame_ssim2); fclose(f); } #endif } if (cpi->b_calculate_blockiness) { -#if CONFIG_HIGHBITDEPTH - if (!cm->use_highbitdepth) -#endif - { + if (!cm->use_highbitdepth) { const double frame_blockiness = av1_get_blockiness(orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height); @@ -6139,10 +5421,7 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { } if (cpi->b_calculate_consistency) { -#if CONFIG_HIGHBITDEPTH - if (!cm->use_highbitdepth) -#endif - { + if (!cm->use_highbitdepth) { const double this_inconsistency = aom_get_ssim_metrics( orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height, cpi->ssim_vars, &cpi->metrics, 1); @@ -6167,7 +5446,6 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { } #endif // CONFIG_INTERNAL_STATS -#if CONFIG_AMVR static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture, const YV12_BUFFER_CONFIG *last_picture, hash_table *last_hash_table) { @@ -6203,14 +5481,28 @@ static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture, p_cur += (y_pos * stride_cur + x_pos); p_ref += (y_pos * stride_ref + x_pos); - for (int tmpY = 0; tmpY < block_size && match; tmpY++) { - for (int tmpX = 0; tmpX < block_size && match; tmpX++) { - if (p_cur[tmpX] != p_ref[tmpX]) { - match = 0; + if (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *p16_cur = CONVERT_TO_SHORTPTR(p_cur); + uint16_t *p16_ref = CONVERT_TO_SHORTPTR(p_ref); + for (int tmpY = 0; tmpY < block_size && match; tmpY++) { + for (int tmpX = 0; tmpX < block_size && match; tmpX++) { + if (p16_cur[tmpX] != p16_ref[tmpX]) { + match = 0; + } } + p16_cur += stride_cur; + p16_ref += stride_ref; + } + } else { + for (int tmpY = 0; tmpY < block_size && match; tmpY++) { + for (int tmpX = 0; tmpX < block_size && match; tmpX++) { + if (p_cur[tmpX] != p_ref[tmpX]) { + match = 0; + } + } + p_cur += stride_cur; + p_ref += stride_ref; } - p_cur += stride_cur; - p_ref += stride_ref; } if (match) { @@ -6227,10 +5519,14 @@ static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture, av1_get_block_hash_value( cur_picture->y_buffer + y_pos * stride_cur + x_pos, stride_cur, - block_size, &hash_value_1, &hash_value_2); - - if (av1_has_exact_match(last_hash_table, hash_value_1, hash_value_2)) { - M++; + block_size, &hash_value_1, &hash_value_2, + (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH)); + // Hashing does not work for highbitdepth currently. + // TODO(Roger): Make it work for highbitdepth. 
+ if (av1_use_hash_me(&cpi->common)) { + if (av1_has_exact_match(last_hash_table, hash_value_1, hash_value_2)) { + M++; + } } } } @@ -6282,13 +5578,14 @@ static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture, return 0; } -#endif int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, size_t *size, uint8_t *dest, int64_t *time_stamp, - int64_t *time_end, int flush) { + int64_t *time_end, int flush, + const aom_rational_t *timebase) { const AV1EncoderConfig *const oxcf = &cpi->oxcf; AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); BufferPool *const pool = cm->buffer_pool; RATE_CONTROL *const rc = &cpi->rc; struct aom_usec_timer cmptimer; @@ -6296,15 +5593,9 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, struct lookahead_entry *last_source = NULL; struct lookahead_entry *source = NULL; int arf_src_index; -#if CONFIG_EXT_REFS int brf_src_index; -#endif // CONFIG_EXT_REFS int i; -#if CONFIG_XIPHRC - cpi->od_rc.end_of_input = flush; -#endif - #if CONFIG_BITSTREAM_DEBUG assert(cpi->oxcf.max_threads == 0 && "bitstream debug tool does not support multithreading"); @@ -6312,13 +5603,10 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, bitstream_queue_set_frame_write(cm->current_video_frame * 2 + cm->show_frame); #endif + cm->showable_frame = 0; aom_usec_timer_start(&cmptimer); -#if CONFIG_AMVR set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV, 0); -#else - set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV); -#endif // Is multi-arf enabled. // Note that at the moment multi_arf is only configured for 2 pass VBR @@ -6327,24 +5615,36 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, else cpi->multi_arf_allowed = 0; -// Normal defaults -#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING - cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE; -#endif - cm->refresh_frame_context = - (oxcf->error_resilient_mode || oxcf->frame_parallel_decoding_mode) - ? REFRESH_FRAME_CONTEXT_FORWARD - : REFRESH_FRAME_CONTEXT_BACKWARD; + // Normal defaults + cm->refresh_frame_context = oxcf->frame_parallel_decoding_mode + ? REFRESH_FRAME_CONTEXT_DISABLED + : REFRESH_FRAME_CONTEXT_BACKWARD; + if (oxcf->large_scale_tile) + cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; cpi->refresh_last_frame = 1; cpi->refresh_golden_frame = 0; -#if CONFIG_EXT_REFS cpi->refresh_bwd_ref_frame = 0; cpi->refresh_alt2_ref_frame = 0; -#endif // CONFIG_EXT_REFS cpi->refresh_alt_ref_frame = 0; -#if CONFIG_EXT_REFS && !CONFIG_XIPHRC + // TODO(zoeliu@gmail.com): To support forward-KEY_FRAME and set up the + // following flag accordingly. + cm->reset_decoder_state = 0; + + // Don't allow a show_existing_frame to coincide with an error resilient or + // S-Frame + struct lookahead_entry *lookahead_src = NULL; + if (cm->current_video_frame > 0) + lookahead_src = av1_lookahead_peek(cpi->lookahead, 0); + if (lookahead_src != NULL && + ((cpi->oxcf.error_resilient_mode | + ((lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT) != 0)) || + (cpi->oxcf.s_frame_mode | + ((lookahead_src->flags & AOM_EFLAG_SET_S_FRAME) != 0)))) { + cm->show_existing_frame = 0; + } + if (oxcf->pass == 2 && cm->show_existing_frame) { // Manage the source buffer and flush out the source frame that has been // coded already; Also get prepared for PSNR calculation if needed. 
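Note: the gate above cancels a pending show_existing_frame whenever either the global config or this frame's encode flags request error resilience or an S-frame, since such frames must be fully coded rather than replayed from a reference buffer. A hedged sketch of the combined predicate (flag names from aom/aomcx.h; the helper itself is illustrative):

#include <aom/aom_encoder.h> /* aom_enc_frame_flags_t */
#include <aom/aomcx.h> /* AOM_EFLAG_ERROR_RESILIENT, AOM_EFLAG_SET_S_FRAME */

/* Either the encoder-wide setting or the per-frame flag can force the mode. */
static int frame_must_be_coded(int cfg_error_resilient, int cfg_s_frame_mode,
                               aom_enc_frame_flags_t flags) {
  return cfg_error_resilient || (flags & AOM_EFLAG_ERROR_RESILIENT) != 0 ||
         cfg_s_frame_mode || (flags & AOM_EFLAG_SET_S_FRAME) != 0;
}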
@@ -6352,6 +5652,7 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, *size = 0; return -1; } + av1_apply_encoding_flags(cpi, source->flags); cpi->source = &source->img; // TODO(zoeliu): To track down to determine whether it's needed to adjust // the frame rate. @@ -6361,7 +5662,8 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, // We need to adjust frame rate for an overlay frame if (cpi->rc.is_src_frame_alt_ref) adjust_frame_rate(cpi, source); - // Find a free buffer for the new frame, releasing the reference previously + // Find a free buffer for the new frame, releasing the reference + // previously // held. if (cm->new_fb_idx != INVALID_IDX) { --pool->frame_bufs[cm->new_fb_idx].ref_count; @@ -6379,7 +5681,8 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, // We need to update the gf_group for show_existing overlay frame if (cpi->rc.is_src_frame_alt_ref) av1_rc_get_second_pass_params(cpi); - Pass2Encode(cpi, size, dest, frame_flags); + if (Pass2Encode(cpi, size, dest, frame_flags) != AOM_CODEC_OK) + return AOM_CODEC_ERROR; if (cpi->b_calculate_psnr) generate_psnr_packet(cpi); @@ -6393,7 +5696,6 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, cm->show_existing_frame = 0; return 0; } -#endif // CONFIG_EXT_REFS && !CONFIG_XIPHRC // Should we encode an arf frame. arf_src_index = get_arf_src_index(cpi); @@ -6415,21 +5717,13 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, assert(arf_src_index <= rc->frames_to_key); if ((source = av1_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) { + cm->showable_frame = 1; cpi->alt_ref_source = source; if (oxcf->arnr_max_frames > 0) { -// Produce the filtered ARF frame. -#if CONFIG_BGSPRITE - int bgsprite_ret = av1_background_sprite(cpi, arf_src_index); - // Do temporal filter if bgsprite not generated. - if (bgsprite_ret != 0) -#endif // CONFIG_BGSPRITE - av1_temporal_filter(cpi, -#if CONFIG_BGSPRITE - NULL, &cpi->alt_ref_buffer, -#endif // CONFIG_BGSPRITE - arf_src_index); - aom_extend_frame_borders(&cpi->alt_ref_buffer); + // Produce the filtered ARF frame. + av1_temporal_filter(cpi, arf_src_index); + aom_extend_frame_borders(&cpi->alt_ref_buffer, num_planes); force_src_buffer = &cpi->alt_ref_buffer; } @@ -6438,16 +5732,13 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, cpi->refresh_alt_ref_frame = 1; cpi->refresh_last_frame = 0; cpi->refresh_golden_frame = 0; -#if CONFIG_EXT_REFS cpi->refresh_bwd_ref_frame = 0; cpi->refresh_alt2_ref_frame = 0; -#endif // CONFIG_EXT_REFS rc->is_src_frame_alt_ref = 0; } rc->source_alt_ref_pending = 0; } -#if CONFIG_EXT_REFS // Should we encode an arf2 frame. arf_src_index = get_arf2_src_index(cpi); if (arf_src_index) { @@ -6468,16 +5759,13 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, assert(arf_src_index <= rc->frames_to_key); if ((source = av1_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) { + cm->showable_frame = 1; cpi->alt_ref_source = source; if (oxcf->arnr_max_frames > 0) { // Produce the filtered ARF frame. 
- av1_temporal_filter(cpi, -#if CONFIG_BGSPRITE - NULL, NULL, -#endif // CONFIG_BGSPRITE - arf_src_index); - aom_extend_frame_borders(&cpi->alt_ref_buffer); + av1_temporal_filter(cpi, arf_src_index); + aom_extend_frame_borders(&cpi->alt_ref_buffer, num_planes); force_src_buffer = &cpi->alt_ref_buffer; } @@ -6499,6 +5787,7 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, if (brf_src_index) { assert(brf_src_index <= rc->frames_to_key); if ((source = av1_lookahead_peek(cpi->lookahead, brf_src_index)) != NULL) { + cm->showable_frame = 1; cm->show_frame = 0; cm->intra_only = 0; @@ -6511,7 +5800,6 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, rc->is_bwd_ref_frame = 1; } } -#endif // CONFIG_EXT_REFS if (!source) { // Get last frame source. @@ -6538,16 +5826,13 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, *time_stamp = source->ts_start; *time_end = source->ts_end; + av1_apply_encoding_flags(cpi, source->flags); *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0; } else { *size = 0; if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) { -#if CONFIG_XIPHRC - od_enc_rc_2pass_out(&cpi->od_rc, cpi->output_pkt_list, 1); -#else av1_end_first_pass(cpi); /* get last stats packet */ -#endif cpi->twopass.first_pass_done = 1; } return -1; @@ -6573,20 +5858,23 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, if (cm->new_fb_idx == INVALID_IDX) return -1; + // Retain the RF_LEVEL for the current newly coded frame. + cpi->frame_rf_level[cm->new_fb_idx] = + cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index]; + cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; -#if CONFIG_HIGHBITDEPTH && CONFIG_GLOBAL_MOTION cm->cur_frame->buf.buf_8bit_valid = 0; -#endif -#if !CONFIG_EXT_REFS - if (cpi->multi_arf_allowed) { - if (cm->frame_type == KEY_FRAME) { - init_buffer_indices(cpi); - } else if (oxcf->pass == 2) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - cpi->alt_fb_idx = gf_group->arf_ref_idx[gf_group->index]; - } + + if (cm->film_grain_table) { + cm->film_grain_params_present = aom_film_grain_table_lookup( + cm->film_grain_table, *time_stamp, *time_end, 0 /* erase */, + &cm->film_grain_params); } -#endif // !CONFIG_EXT_REFS + cm->cur_frame->film_grain_params_present = cm->film_grain_params_present; + + // only one operating point supported now + cpi->common.tu_presentation_delay = + ticks_to_timebase_units(timebase, *time_stamp); // Start with a 0 size frame. 
*size = 0; @@ -6594,87 +5882,62 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, cpi->frame_flags = *frame_flags; if (oxcf->pass == 2) { -#if CONFIG_XIPHRC - if (od_enc_rc_2pass_in(&cpi->od_rc) < 0) return -1; - } -#else av1_rc_get_second_pass_params(cpi); } else if (oxcf->pass == 1) { setup_frame_size(cpi); } -#endif if (cpi->oxcf.pass != 0 || frame_is_intra_only(cm) == 1) { - for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) - cpi->scaled_ref_idx[i] = INVALID_IDX; + for (i = 0; i < REF_FRAMES; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX; } -#if CONFIG_AOM_QM cm->using_qmatrix = cpi->oxcf.using_qm; cm->min_qmlevel = cpi->oxcf.qm_minlevel; cm->max_qmlevel = cpi->oxcf.qm_maxlevel; -#endif -#if CONFIG_REFERENCE_BUFFER if (cm->seq_params.frame_id_numbers_present_flag) { if (*time_stamp == 0) { cpi->common.current_frame_id = -1; } } -#endif // CONFIG_REFERENCE_BUFFER -#if CONFIG_AMVR + cpi->cur_poc++; - if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools) { - if (cpi->common.seq_mv_precision_level == 2) { + if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools && + !frame_is_intra_only(cm)) { + if (cpi->common.seq_params.force_integer_mv == 2) { struct lookahead_entry *previous_entry = - cpi->lookahead->buf + cpi->previsous_index; - cpi->common.cur_frame_mv_precision_level = is_integer_mv( - cpi, cpi->source, &previous_entry->img, cpi->previsou_hash_table); + av1_lookahead_peek(cpi->lookahead, cpi->previous_index); + if (!previous_entry) + cpi->common.cur_frame_force_integer_mv = 0; + else + cpi->common.cur_frame_force_integer_mv = is_integer_mv( + cpi, cpi->source, &previous_entry->img, cpi->previous_hash_table); } else { - cpi->common.cur_frame_mv_precision_level = - cpi->common.seq_mv_precision_level; + cpi->common.cur_frame_force_integer_mv = + cpi->common.seq_params.force_integer_mv; } } else { - cpi->common.cur_frame_mv_precision_level = 0; + cpi->common.cur_frame_force_integer_mv = 0; } -#endif -#if CONFIG_XIPHRC - if (oxcf->pass == 1) { - size_t tmp; - if (cpi->od_rc.cur_frame == 0) Pass0Encode(cpi, &tmp, dest, 1, frame_flags); - cpi->od_rc.firstpass_quant = cpi->od_rc.target_quantizer; - Pass0Encode(cpi, &tmp, dest, 0, frame_flags); - od_enc_rc_2pass_out(&cpi->od_rc, cpi->output_pkt_list, 0); - } else if (oxcf->pass == 2) { - Pass0Encode(cpi, size, dest, 0, frame_flags); - } else { - if (cpi->od_rc.cur_frame == 0) { - size_t tmp; - Pass0Encode(cpi, &tmp, dest, 1, frame_flags); - } - Pass0Encode(cpi, size, dest, 0, frame_flags); - } -#else if (oxcf->pass == 1) { cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(oxcf); av1_first_pass(cpi, source); } else if (oxcf->pass == 2) { - Pass2Encode(cpi, size, dest, frame_flags); + if (Pass2Encode(cpi, size, dest, frame_flags) != AOM_CODEC_OK) + return AOM_CODEC_ERROR; } else { // One pass encode - Pass0Encode(cpi, size, dest, 0, frame_flags); + if (Pass0Encode(cpi, size, dest, 0, frame_flags) != AOM_CODEC_OK) + return AOM_CODEC_ERROR; } -#endif -#if CONFIG_HASH_ME if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools) { -#if CONFIG_AMVR - cpi->previsou_hash_table = &cm->cur_frame->hash_table; + cpi->previous_hash_table = &cm->cur_frame->hash_table; { int l; for (l = -MAX_PRE_FRAMES; l < cpi->lookahead->max_sz; l++) { if ((cpi->lookahead->buf + l) == source) { - cpi->previsous_index = l; + cpi->previous_index = l; break; } } @@ -6684,17 +5947,26 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, "Failed to find last frame original buffer"); } } -#endif } -#endif + if 
(!cm->large_scale_tile) { + cm->frame_contexts[cm->new_fb_idx] = *cm->fc; + } -#if CONFIG_NO_FRAME_CONTEXT_SIGNALING - cm->frame_contexts[cm->new_fb_idx] = *cm->fc; -#else - if (!cm->error_resilient_mode) - cm->frame_contexts[cm->frame_context_idx] = *cm->fc; -#endif // CONFIG_NO_FRAME_CONTEXT_SIGNALING +#define EXT_TILE_DEBUG 0 +#if EXT_TILE_DEBUG + if (cm->large_scale_tile && oxcf->pass == 2) { + char fn[20] = "./fc"; + fn[4] = cm->current_video_frame / 100 + '0'; + fn[5] = (cm->current_video_frame % 100) / 10 + '0'; + fn[6] = (cm->current_video_frame % 10) + '0'; + fn[7] = '\0'; + av1_print_frame_contexts(cm->fc, fn); + } +#endif // EXT_TILE_DEBUG +#undef EXT_TILE_DEBUG + + cm->showable_frame = !cm->show_frame && cm->showable_frame; // No frame encoded, or frame was dropped, release scaled references. if ((*size == 0) && (frame_is_intra_only(cm) == 0)) { @@ -6717,10 +5989,6 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, } #endif // CONFIG_INTERNAL_STATS -#if CONFIG_XIPHRC - cpi->od_rc.cur_frame++; -#endif - aom_clear_system_state(); return 0; @@ -6755,6 +6023,29 @@ int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame) { return 0; } +static int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + return a->y_height == b->y_height && a->y_width == b->y_width && + a->uv_height == b->uv_height && a->uv_width == b->uv_width && + a->y_stride == b->y_stride && a->uv_stride == b->uv_stride && + a->border == b->border && + (a->flags & YV12_FLAG_HIGHBITDEPTH) == + (b->flags & YV12_FLAG_HIGHBITDEPTH); +} + +aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm, + YV12_BUFFER_CONFIG *new_frame, + YV12_BUFFER_CONFIG *sd) { + const int num_planes = av1_num_planes(cm); + if (!equal_dimensions_and_border(new_frame, sd)) + aom_internal_error(&cm->error, AOM_CODEC_ERROR, + "Incorrect buffer dimensions"); + else + aom_yv12_copy_frame(new_frame, sd, num_planes); + + return cm->error.error_code; +} + int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode, AOM_SCALING vert_mode) { int hr = 0, hs = 0, vr = 0, vs = 0; @@ -6773,47 +6064,134 @@ int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode, int av1_get_quantizer(AV1_COMP *cpi) { return cpi->common.base_qindex; } +int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *frame_size) { + size_t output_size = 0; + size_t total_bytes_read = 0; + size_t remaining_size = *frame_size; + uint8_t *buff_ptr = buffer; + + // go through each OBUs + while (total_bytes_read < *frame_size) { + uint8_t saved_obu_header[2]; + uint64_t obu_payload_size; + size_t length_of_payload_size; + size_t length_of_obu_size; + uint32_t obu_header_size = (buff_ptr[0] >> 2) & 0x1 ? 
2 : 1; + size_t obu_bytes_read = obu_header_size; // bytes read for current obu + + // save the obu header (1 or 2 bytes) + memmove(saved_obu_header, buff_ptr, obu_header_size); + // clear the obu_has_size_field + saved_obu_header[0] = saved_obu_header[0] & (~0x2); + + // get the payload_size and length of payload_size + if (aom_uleb_decode(buff_ptr + obu_header_size, remaining_size, + &obu_payload_size, &length_of_payload_size) != 0) { + return AOM_CODEC_ERROR; + } + obu_bytes_read += length_of_payload_size; + + // calculate the length of size of the obu header plus payload + length_of_obu_size = + aom_uleb_size_in_bytes((uint64_t)(obu_header_size + obu_payload_size)); + + // move the rest of data to new location + memmove(buff_ptr + length_of_obu_size + obu_header_size, + buff_ptr + obu_bytes_read, remaining_size - obu_bytes_read); + obu_bytes_read += (size_t)obu_payload_size; + + // write the new obu size + const uint64_t obu_size = obu_header_size + obu_payload_size; + size_t coded_obu_size; + if (aom_uleb_encode(obu_size, sizeof(obu_size), buff_ptr, + &coded_obu_size) != 0) { + return AOM_CODEC_ERROR; + } + + // write the saved (modified) obu_header following obu size + memmove(buff_ptr + length_of_obu_size, saved_obu_header, obu_header_size); + + total_bytes_read += obu_bytes_read; + remaining_size -= obu_bytes_read; + buff_ptr += length_of_obu_size + obu_size; + output_size += length_of_obu_size + (size_t)obu_size; + } + + *frame_size = output_size; + return AOM_CODEC_OK; +} + void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) { + // TODO(yunqingwang): For what references to use, external encoding flags + // should be consistent with internal reference frame selection. Need to + // ensure that there is not conflict between the two. In AV1 encoder, the + // priority rank for 7 reference frames are: LAST, ALTREF, LAST2, LAST3, + // GOLDEN, BWDREF, ALTREF2. If only one reference frame is used, it must be + // LAST. 
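Note: the external flags handled here arrive per frame from the application through aom_codec_encode(). A minimal caller-side sketch (signature as of libaom v1.0.0; context setup and buffer handling elided):

#include <aom/aom_encoder.h>
#include <aom/aomcx.h>

/* Encode one frame that must not reference the ARF group (which, per the
 * masking logic below, also rules out BWDREF and ALTREF2) and is forced
 * error resilient. */
static aom_codec_err_t encode_restricted(aom_codec_ctx_t *ctx,
                                         const aom_image_t *img,
                                         aom_codec_pts_t pts) {
  const aom_enc_frame_flags_t flags =
      AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_ERROR_RESILIENT;
  return aom_codec_encode(ctx, img, pts, /*duration=*/1, flags);
}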
+ cpi->ext_ref_frame_flags = AOM_REFFRAME_ALL; if (flags & - (AOM_EFLAG_NO_REF_LAST | AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF)) { - int ref = AOM_REFFRAME_ALL; - + (AOM_EFLAG_NO_REF_LAST | AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD | + AOM_EFLAG_NO_REF_ARF2)) { if (flags & AOM_EFLAG_NO_REF_LAST) { - ref ^= AOM_LAST_FLAG; -#if CONFIG_EXT_REFS - ref ^= AOM_LAST2_FLAG; - ref ^= AOM_LAST3_FLAG; -#endif // CONFIG_EXT_REFS - } + cpi->ext_ref_frame_flags = 0; + } else { + int ref = AOM_REFFRAME_ALL; - if (flags & AOM_EFLAG_NO_REF_GF) ref ^= AOM_GOLD_FLAG; + if (flags & AOM_EFLAG_NO_REF_LAST2) ref ^= AOM_LAST2_FLAG; + if (flags & AOM_EFLAG_NO_REF_LAST3) ref ^= AOM_LAST3_FLAG; - if (flags & AOM_EFLAG_NO_REF_ARF) ref ^= AOM_ALT_FLAG; + if (flags & AOM_EFLAG_NO_REF_GF) ref ^= AOM_GOLD_FLAG; - av1_use_as_reference(cpi, ref); + if (flags & AOM_EFLAG_NO_REF_ARF) { + ref ^= AOM_ALT_FLAG; + ref ^= AOM_BWD_FLAG; + ref ^= AOM_ALT2_FLAG; + } else { + if (flags & AOM_EFLAG_NO_REF_BWD) ref ^= AOM_BWD_FLAG; + if (flags & AOM_EFLAG_NO_REF_ARF2) ref ^= AOM_ALT2_FLAG; + } + + av1_use_as_reference(cpi, ref); + } } if (flags & - (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF | - AOM_EFLAG_FORCE_GF | AOM_EFLAG_FORCE_ARF)) { + (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF)) { int upd = AOM_REFFRAME_ALL; - if (flags & AOM_EFLAG_NO_UPD_LAST) { - upd ^= AOM_LAST_FLAG; -#if CONFIG_EXT_REFS - upd ^= AOM_LAST2_FLAG; - upd ^= AOM_LAST3_FLAG; -#endif // CONFIG_EXT_REFS - } + // Refreshing LAST/LAST2/LAST3 is handled by 1 common flag. + if (flags & AOM_EFLAG_NO_UPD_LAST) upd ^= AOM_LAST_FLAG; if (flags & AOM_EFLAG_NO_UPD_GF) upd ^= AOM_GOLD_FLAG; - if (flags & AOM_EFLAG_NO_UPD_ARF) upd ^= AOM_ALT_FLAG; + if (flags & AOM_EFLAG_NO_UPD_ARF) { + upd ^= AOM_ALT_FLAG; + upd ^= AOM_BWD_FLAG; + upd ^= AOM_ALT2_FLAG; + } av1_update_reference(cpi, upd); } + cpi->ext_use_ref_frame_mvs = cpi->oxcf.allow_ref_frame_mvs & + ((flags & AOM_EFLAG_NO_REF_FRAME_MVS) == 0); + cpi->ext_use_error_resilient = cpi->oxcf.error_resilient_mode | + ((flags & AOM_EFLAG_ERROR_RESILIENT) != 0); + cpi->ext_use_s_frame = + cpi->oxcf.s_frame_mode | ((flags & AOM_EFLAG_SET_S_FRAME) != 0); + cpi->ext_use_primary_ref_none = (flags & AOM_EFLAG_SET_PRIMARY_REF_NONE) != 0; + if (flags & AOM_EFLAG_NO_UPD_ENTROPY) { av1_update_entropy(cpi, 0); } } + +int64_t timebase_units_to_ticks(const aom_rational_t *timebase, int64_t n) { + return n * TICKS_PER_SEC * timebase->num / timebase->den; +} + +int64_t ticks_to_timebase_units(const aom_rational_t *timebase, int64_t n) { + const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1; + return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC; +} diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h index eb779a3cd..5212db2b1 100644 --- a/third_party/aom/av1/encoder/encoder.h +++ b/third_party/aom/av1/encoder/encoder.h @@ -14,7 +14,8 @@ #include <stdio.h> -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom/aomcx.h" #include "av1/common/alloccommon.h" @@ -22,11 +23,8 @@ #include "av1/common/thread_common.h" #include "av1/common/onyxc_int.h" #include "av1/common/resize.h" +#include "av1/common/timing.h" #include "av1/encoder/aq_cyclicrefresh.h" -#if CONFIG_ANS -#include "aom_dsp/ans.h" -#include "aom_dsp/buf_ans.h" -#endif #include "av1/encoder/av1_quantize.h" #include "av1/encoder/context_tree.h" #include "av1/encoder/encodemb.h" @@ -38,9 
+36,6 @@ #include "av1/encoder/rd.h" #include "av1/encoder/speed_features.h" #include "av1/encoder/tokenize.h" -#if CONFIG_XIPHRC -#include "av1/encoder/ratectrl_xiph.h" -#endif #if CONFIG_INTERNAL_STATS #include "aom_dsp/ssim.h" @@ -54,19 +49,13 @@ extern "C" { #endif typedef struct { - int nmv_vec_cost[NMV_CONTEXTS][MV_JOINTS]; - int nmv_costs[NMV_CONTEXTS][2][MV_VALS]; - int nmv_costs_hp[NMV_CONTEXTS][2][MV_VALS]; - - // 0 = Intra, Last, GF, ARF - int8_t last_ref_lf_deltas[TOTAL_REFS_PER_FRAME]; - // 0 = ZERO_MV, MV - int8_t last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; + int nmv_vec_cost[MV_JOINTS]; + int nmv_costs[2][MV_VALS]; + int nmv_costs_hp[2][MV_VALS]; FRAME_CONTEXT fc; } CODING_CONTEXT; -#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING typedef enum { // regular inter frame REGULAR_FRAME = 0, @@ -76,14 +65,12 @@ typedef enum { OVERLAY_FRAME = 2, // golden frame GLD_FRAME = 3, -#if CONFIG_EXT_REFS // backward reference frame BRF_FRAME = 4, // extra alternate reference frame - EXT_ARF_FRAME = 5 -#endif + EXT_ARF_FRAME = 5, + FRAME_CONTEXT_INDEXES } FRAME_CONTEXT_INDEX; -#endif typedef enum { NORMAL = 0, @@ -101,13 +88,9 @@ typedef enum { typedef enum { FRAMEFLAGS_KEY = 1 << 0, FRAMEFLAGS_GOLDEN = 1 << 1, -#if CONFIG_EXT_REFS FRAMEFLAGS_BWDREF = 1 << 2, // TODO(zoeliu): To determine whether a frame flag is needed for ALTREF2_FRAME FRAMEFLAGS_ALTREF = 1 << 3, -#else // !CONFIG_EXT_REFS - FRAMEFLAGS_ALTREF = 1 << 2, -#endif // CONFIG_EXT_REFS } FRAMETYPE_FLAGS; typedef enum { @@ -115,26 +98,22 @@ typedef enum { VARIANCE_AQ = 1, COMPLEXITY_AQ = 2, CYCLIC_REFRESH_AQ = 3, -#if !CONFIG_EXT_DELTA_Q - DELTA_AQ = 4, -#endif AQ_MODE_COUNT // This should always be the last member of the enum } AQ_MODE; -#if CONFIG_EXT_DELTA_Q typedef enum { NO_DELTA_Q = 0, DELTA_Q_ONLY = 1, DELTA_Q_LF = 2, DELTAQ_MODE_COUNT // This should always be the last member of the enum } DELTAQ_MODE; -#endif + typedef enum { RESIZE_NONE = 0, // No frame resizing allowed. RESIZE_FIXED = 1, // All frames are coded at the specified scale. RESIZE_RANDOM = 2, // All frames are coded at a random scale. RESIZE_MODES } RESIZE_MODE; -#if CONFIG_FRAME_SUPERRES + typedef enum { SUPERRES_NONE = 0, // No frame superres allowed SUPERRES_FIXED = 1, // All frames are coded at the specified scale, @@ -145,13 +124,14 @@ typedef enum { // q_index SUPERRES_MODES } SUPERRES_MODE; -#endif // CONFIG_FRAME_SUPERRES typedef struct AV1EncoderConfig { BITSTREAM_PROFILE profile; aom_bit_depth_t bit_depth; // Codec bit-depth. int width; // width of data passed to the compressor int height; // height of data passed to the compressor + int forced_max_frame_width; // forced maximum width of frame (if != 0) + int forced_max_frame_height; // forced maximum height of frame (if != 0) unsigned int input_bit_depth; // Input bit depth. double init_framerate; // set to passed in framerate int64_t target_bandwidth; // bandwidth to be used in bits per second @@ -159,6 +139,7 @@ typedef struct AV1EncoderConfig { int noise_sensitivity; // pre processing blur: recommendation 0 int sharpness; // sharpening output: recommendation 0: int speed; + int dev_sf; // maximum allowed bitrate for any intra frame in % of bitrate target. unsigned int rc_max_intra_bitrate_pct; // maximum allowed bitrate for any inter frame in % of bitrate target. @@ -172,8 +153,11 @@ typedef struct AV1EncoderConfig { // Key Framing Operations int auto_key; // autodetect cut scenes and set the keyframes int key_freq; // maximum distance to key frame. 
- + int sframe_dist; + int sframe_mode; + int sframe_enabled; int lag_in_frames; // how many frames lag before we start encoding + int fwd_kf_enabled; // ---------------------------------------------------------------- // DATARATE CONTROL OPTIONS @@ -199,36 +183,33 @@ typedef struct AV1EncoderConfig { int best_allowed_q; int cq_level; AQ_MODE aq_mode; // Adaptive Quantization mode -#if CONFIG_EXT_DELTA_Q DELTAQ_MODE deltaq_mode; -#endif -#if CONFIG_AOM_QM + int enable_cdef; + int enable_restoration; + int disable_trellis_quant; int using_qm; + int qm_y; + int qm_u; + int qm_v; int qm_minlevel; int qm_maxlevel; -#endif #if CONFIG_DIST_8X8 int using_dist_8x8; #endif unsigned int num_tile_groups; unsigned int mtu; -#if CONFIG_TEMPMV_SIGNALING - unsigned int disable_tempmv; -#endif // Internal frame size scaling. RESIZE_MODE resize_mode; uint8_t resize_scale_denominator; uint8_t resize_kf_scale_denominator; -#if CONFIG_FRAME_SUPERRES // Frame Super-Resolution size scaling. SUPERRES_MODE superres_mode; uint8_t superres_scale_denominator; uint8_t superres_kf_scale_denominator; int superres_qthresh; int superres_kf_qthresh; -#endif // CONFIG_FRAME_SUPERRES // Enable feature to reduce the frame quantization every x frames. int frame_periodic_boost; @@ -241,9 +222,7 @@ typedef struct AV1EncoderConfig { // ---------------------------------------------------------------- int enable_auto_arf; -#if CONFIG_EXT_REFS int enable_auto_brf; // (b)ackward (r)ef (f)rame -#endif // CONFIG_EXT_REFS /* Bitfield defining the error resiliency features to enable. * Can provide decodable frames after losses in previous @@ -251,12 +230,16 @@ typedef struct AV1EncoderConfig { */ unsigned int error_resilient_mode; + unsigned int s_frame_mode; + /* Bitfield defining the parallel decoding mode where the * decoding in successive frames may be conducted in parallel * just by decoding the frame headers. 
*/ unsigned int frame_parallel_decoding_mode; + unsigned int limit; + int arnr_max_frames; int arnr_strength; @@ -265,18 +248,10 @@ typedef struct AV1EncoderConfig { int tile_columns; int tile_rows; -#if CONFIG_MAX_TILE int tile_width_count; int tile_height_count; int tile_widths[MAX_TILE_COLS]; int tile_heights[MAX_TILE_ROWS]; -#endif -#if CONFIG_DEPENDENT_HORZTILES - int dependent_horz_tiles; -#endif -#if CONFIG_LOOPFILTERING_ACROSS_TILES - int loop_filter_across_tiles_enabled; -#endif // CONFIG_LOOPFILTERING_ACROSS_TILES int max_threads; @@ -289,34 +264,135 @@ typedef struct AV1EncoderConfig { aom_tune_metric tuning; aom_tune_content content; -#if CONFIG_HIGHBITDEPTH int use_highbitdepth; -#endif - aom_color_space_t color_space; - aom_transfer_function_t transfer_function; + aom_color_primaries_t color_primaries; + aom_transfer_characteristics_t transfer_characteristics; + aom_matrix_coefficients_t matrix_coefficients; aom_chroma_sample_position_t chroma_sample_position; int color_range; int render_width; int render_height; - -#if CONFIG_EXT_PARTITION + aom_timing_info_type_t timing_info_type; + int timing_info_present; + aom_timing_info_t timing_info; + int decoder_model_info_present_flag; + int display_model_info_present_flag; + int buffer_removal_delay_present; + aom_dec_model_info_t buffer_model; + aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1]; + aom_op_timing_info_t op_frame_timing[MAX_NUM_OPERATING_POINTS + 1]; + int film_grain_test_vector; + const char *film_grain_table_filename; + + uint8_t cdf_update_mode; aom_superblock_size_t superblock_size; -#endif // CONFIG_EXT_PARTITION -#if CONFIG_ANS && ANS_MAX_SYMBOLS - int ans_window_size_log2; -#endif // CONFIG_ANS && ANS_MAX_SYMBOLS -#if CONFIG_EXT_TILE unsigned int large_scale_tile; unsigned int single_tile_decoding; -#endif // CONFIG_EXT_TILE - + int monochrome; + unsigned int full_still_picture_hdr; + int enable_dual_filter; unsigned int motion_vector_unit_test; + const cfg_options_t *cfg; + int enable_order_hint; + int enable_jnt_comp; + int enable_ref_frame_mvs; + unsigned int allow_ref_frame_mvs; + int enable_warped_motion; + int allow_warped_motion; + int enable_superres; + unsigned int save_as_annexb; } AV1EncoderConfig; static INLINE int is_lossless_requested(const AV1EncoderConfig *cfg) { return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0; } +typedef struct FRAME_COUNTS { +// Note: This structure should only contain 'unsigned int' fields, or +// aggregates built solely from 'unsigned int' fields/elements +#if CONFIG_ENTROPY_STATS + unsigned int kf_y_mode[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS][INTRA_MODES]; + unsigned int angle_delta[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1]; + unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES]; + unsigned int uv_mode[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES]; + unsigned int cfl_sign[CFL_JOINT_SIGNS]; + unsigned int cfl_alpha[CFL_ALPHA_CONTEXTS][CFL_ALPHABET_SIZE]; + unsigned int palette_y_mode[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2]; + unsigned int palette_uv_mode[PALETTE_UV_MODE_CONTEXTS][2]; + unsigned int palette_y_size[PALATTE_BSIZE_CTXS][PALETTE_SIZES]; + unsigned int palette_uv_size[PALATTE_BSIZE_CTXS][PALETTE_SIZES]; + unsigned int palette_y_color_index[PALETTE_SIZES] + [PALETTE_COLOR_INDEX_CONTEXTS] + [PALETTE_COLORS]; + unsigned int palette_uv_color_index[PALETTE_SIZES] + [PALETTE_COLOR_INDEX_CONTEXTS] + [PALETTE_COLORS]; + unsigned int partition[PARTITION_CONTEXTS][EXT_PARTITION_TYPES]; + unsigned int 
txb_skip[TOKEN_CDF_Q_CTXS][TX_SIZES][TXB_SKIP_CONTEXTS][2]; + unsigned int eob_extra[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] + [EOB_COEF_CONTEXTS][2]; + unsigned int dc_sign[PLANE_TYPES][DC_SIGN_CONTEXTS][2]; + unsigned int coeff_lps[TX_SIZES][PLANE_TYPES][BR_CDF_SIZE - 1][LEVEL_CONTEXTS] + [2]; + unsigned int eob_flag[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS][2]; + unsigned int eob_multi16[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][5]; + unsigned int eob_multi32[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][6]; + unsigned int eob_multi64[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][7]; + unsigned int eob_multi128[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][8]; + unsigned int eob_multi256[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][9]; + unsigned int eob_multi512[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][10]; + unsigned int eob_multi1024[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][11]; + unsigned int coeff_lps_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] + [LEVEL_CONTEXTS][BR_CDF_SIZE]; + unsigned int coeff_base_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] + [SIG_COEF_CONTEXTS][NUM_BASE_LEVELS + 2]; + unsigned int coeff_base_eob_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] + [SIG_COEF_CONTEXTS_EOB][NUM_BASE_LEVELS + 1]; + unsigned int newmv_mode[NEWMV_MODE_CONTEXTS][2]; + unsigned int zeromv_mode[GLOBALMV_MODE_CONTEXTS][2]; + unsigned int refmv_mode[REFMV_MODE_CONTEXTS][2]; + unsigned int drl_mode[DRL_MODE_CONTEXTS][2]; + unsigned int inter_compound_mode[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES]; + unsigned int wedge_idx[BLOCK_SIZES_ALL][16]; + unsigned int interintra[BLOCK_SIZE_GROUPS][2]; + unsigned int interintra_mode[BLOCK_SIZE_GROUPS][INTERINTRA_MODES]; + unsigned int wedge_interintra[BLOCK_SIZES_ALL][2]; + unsigned int compound_type[BLOCK_SIZES_ALL][COMPOUND_TYPES - 1]; + unsigned int motion_mode[BLOCK_SIZES_ALL][MOTION_MODES]; + unsigned int obmc[BLOCK_SIZES_ALL][2]; + unsigned int intra_inter[INTRA_INTER_CONTEXTS][2]; + unsigned int comp_inter[COMP_INTER_CONTEXTS][2]; + unsigned int comp_ref_type[COMP_REF_TYPE_CONTEXTS][2]; + unsigned int uni_comp_ref[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1][2]; + unsigned int single_ref[REF_CONTEXTS][SINGLE_REFS - 1][2]; + unsigned int comp_ref[REF_CONTEXTS][FWD_REFS - 1][2]; + unsigned int comp_bwdref[REF_CONTEXTS][BWD_REFS - 1][2]; + unsigned int intrabc[2]; + + unsigned int txfm_partition[TXFM_PARTITION_CONTEXTS][2]; + unsigned int intra_tx_size[MAX_TX_CATS][TX_SIZE_CONTEXTS][MAX_TX_DEPTH + 1]; + unsigned int skip_mode[SKIP_MODE_CONTEXTS][2]; + unsigned int skip[SKIP_CONTEXTS][2]; + unsigned int compound_index[COMP_INDEX_CONTEXTS][2]; + unsigned int comp_group_idx[COMP_GROUP_IDX_CONTEXTS][2]; + unsigned int delta_q[DELTA_Q_PROBS][2]; + unsigned int delta_lf_multi[FRAME_LF_COUNT][DELTA_LF_PROBS][2]; + unsigned int delta_lf[DELTA_LF_PROBS][2]; + + unsigned int inter_ext_tx[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES]; + unsigned int intra_ext_tx[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES] + [TX_TYPES]; + unsigned int filter_intra_mode[FILTER_INTRA_MODES]; + unsigned int filter_intra[BLOCK_SIZES_ALL][2]; + unsigned int switchable_restore[RESTORE_SWITCHABLE_TYPES]; + unsigned int wiener_restore[2]; + unsigned int sgrproj_restore[2]; +#endif // CONFIG_ENTROPY_STATS + + unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS]; +} FRAME_COUNTS; + // TODO(jingning) All spatially adaptive variables should go to TileDataEnc. 
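Note: the "unsigned int fields only" restriction stated at the top of the FRAME_COUNTS definition above is what lets per-thread counts be merged by treating the whole struct as a flat array of counters, mirroring the pattern the encoder's threading code uses. A hedged sketch (assumes the header above is included; the function name is illustrative):

#include <stddef.h>
#include "av1/encoder/encoder.h" /* FRAME_COUNTS as defined above */

/* Merge a worker's counts into the main thread's. Valid only because every
 * field is an unsigned int (or an aggregate of them). */
static void accumulate_frame_counts(FRAME_COUNTS *dst,
                                    const FRAME_COUNTS *src) {
  unsigned int *d = (unsigned int *)dst;
  const unsigned int *s = (const unsigned int *)src;
  for (size_t i = 0; i < sizeof(FRAME_COUNTS) / sizeof(unsigned int); ++i)
    d[i] += s[i];
}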
typedef struct TileDataEnc { TileInfo tile_info; @@ -324,42 +400,31 @@ typedef struct TileDataEnc { int mode_map[BLOCK_SIZES_ALL][MAX_MODES]; int m_search_count; int ex_search_count; -#if CONFIG_PVQ - PVQ_QUEUE pvq_q; -#endif -#if CONFIG_CFL CFL_CTX cfl; -#endif DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx); + uint8_t allow_update_cdf; } TileDataEnc; typedef struct RD_COUNTS { int64_t comp_pred_diff[REFERENCE_MODES]; -#if CONFIG_GLOBAL_MOTION // Stores number of 4x4 blocks using global motion per reference frame. - int global_motion_used[TOTAL_REFS_PER_FRAME]; -#endif // CONFIG_GLOBAL_MOTION - int single_ref_used_flag; + int global_motion_used[REF_FRAMES]; int compound_ref_used_flag; + int skip_mode_used_flag; } RD_COUNTS; typedef struct ThreadData { MACROBLOCK mb; RD_COUNTS rd_counts; FRAME_COUNTS *counts; -#if !CONFIG_CB4X4 - PICK_MODE_CONTEXT *leaf_tree; -#endif PC_TREE *pc_tree; PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1]; -#if CONFIG_MOTION_VAR int32_t *wsrc_buf; int32_t *mask_buf; uint8_t *above_pred_buf; uint8_t *left_pred_buf; -#endif - PALETTE_BUFFER *palette_buffer; + int intrabc_used_this_tile; } ThreadData; struct EncWorkerData; @@ -370,14 +435,21 @@ typedef struct ActiveMap { unsigned char *map; } ActiveMap; -#define NUM_STAT_TYPES 4 // types of stats: Y, U, V and ALL +#if CONFIG_INTERNAL_STATS +// types of stats +typedef enum { + STAT_Y, + STAT_U, + STAT_V, + STAT_ALL, + NUM_STAT_TYPES // This should always be the last member of the enum +} StatType; typedef struct IMAGE_STAT { double stat[NUM_STAT_TYPES]; double worst; } ImageStat; - -#undef NUM_STAT_TYPES +#endif // CONFIG_INTERNAL_STATS typedef struct { int ref_count; @@ -392,16 +464,18 @@ typedef struct TileBufferEnc { typedef struct AV1_COMP { QUANTS quants; ThreadData td; + FRAME_COUNTS counts; MB_MODE_INFO_EXT *mbmi_ext_base; -#if CONFIG_LV_MAP CB_COEFF_BUFFER *coeff_buffer_base; -#endif Dequants dequants; AV1_COMMON common; AV1EncoderConfig oxcf; struct lookahead_ctx *lookahead; struct lookahead_entry *alt_ref_source; + int optimize_speed_feature; + int optimize_seg_arr[MAX_SEGMENTS]; + YV12_BUFFER_CONFIG *source; YV12_BUFFER_CONFIG *last_source; // NULL for first frame and alt_ref frames YV12_BUFFER_CONFIG *unscaled_source; @@ -411,58 +485,42 @@ typedef struct AV1_COMP { // For a still frame, this flag is set to 1 to skip partition search. 
int partition_search_skippable_frame; -#if CONFIG_AMVR double csm_rate_array[32]; double m_rate_array[32]; int rate_size; int rate_index; - hash_table *previsou_hash_table; - int previsous_index; + hash_table *previous_hash_table; + int previous_index; int cur_poc; // DebugInfo -#endif - int scaled_ref_idx[TOTAL_REFS_PER_FRAME]; -#if CONFIG_EXT_REFS - int lst_fb_idxes[LAST_REF_FRAMES]; -#else - int lst_fb_idx; -#endif // CONFIG_EXT_REFS - int gld_fb_idx; -#if CONFIG_EXT_REFS - int bwd_fb_idx; // BWDREF_FRAME - int alt2_fb_idx; // ALTREF2_FRAME -#endif // CONFIG_EXT_REFS - int alt_fb_idx; -#if CONFIG_EXT_REFS - int ext_fb_idx; // extra ref frame buffer index + int scaled_ref_idx[REF_FRAMES]; + int ref_fb_idx[REF_FRAMES]; int refresh_fb_idx; // ref frame buffer index to refresh -#endif // CONFIG_EXT_REFS int last_show_frame_buf_idx; // last show frame buffer index int refresh_last_frame; int refresh_golden_frame; -#if CONFIG_EXT_REFS int refresh_bwd_ref_frame; int refresh_alt2_ref_frame; -#endif // CONFIG_EXT_REFS int refresh_alt_ref_frame; int ext_refresh_frame_flags_pending; int ext_refresh_last_frame; int ext_refresh_golden_frame; + int ext_refresh_bwd_ref_frame; + int ext_refresh_alt2_ref_frame; int ext_refresh_alt_ref_frame; int ext_refresh_frame_context_pending; int ext_refresh_frame_context; + int ext_use_ref_frame_mvs; + int ext_use_error_resilient; + int ext_use_s_frame; + int ext_use_primary_ref_none; YV12_BUFFER_CONFIG last_frame_uf; -#if CONFIG_LOOP_RESTORATION - YV12_BUFFER_CONFIG last_frame_db; YV12_BUFFER_CONFIG trial_frame_rst; - uint8_t *extra_rstbuf; // Extra buffers used in restoration search - RestorationInfo rst_search[MAX_MB_PLANE]; // Used for encoder side search -#endif // CONFIG_LOOP_RESTORATION // Ambient reconstruction err target for force key frames int64_t ambient_err; @@ -471,22 +529,17 @@ typedef struct AV1_COMP { CODING_CONTEXT coding_context; -#if CONFIG_GLOBAL_MOTION int gmtype_cost[TRANS_TYPES]; - int gmparams_cost[TOTAL_REFS_PER_FRAME]; -#endif // CONFIG_GLOBAL_MOTION + int gmparams_cost[REF_FRAMES]; - int nmv_costs[NMV_CONTEXTS][2][MV_VALS]; - int nmv_costs_hp[NMV_CONTEXTS][2][MV_VALS]; + int nmv_costs[2][MV_VALS]; + int nmv_costs_hp[2][MV_VALS]; int64_t last_time_stamp_seen; int64_t last_end_time_stamp_seen; int64_t first_time_stamp_ever; RATE_CONTROL rc; -#if CONFIG_XIPHRC - od_rc_state od_rc; -#endif double framerate; // NOTE(zoeliu): Any inter frame allows maximum of REF_FRAMES inter @@ -500,6 +553,8 @@ typedef struct AV1_COMP { int mbgraph_n_frames; // number of frames filled in the above int static_mb_pct; // % forced skip mbs by segmentation int ref_frame_flags; + int ext_ref_frame_flags; + RATE_FACTOR_LEVEL frame_rf_level[FRAME_BUFFERS]; SPEED_FEATURES sf; @@ -507,6 +562,7 @@ typedef struct AV1_COMP { int mv_step_param; int allow_comp_inter_inter; + int all_one_sided_refs; uint8_t *segmentation_map; @@ -514,7 +570,6 @@ typedef struct AV1_COMP { ActiveMap active_map; fractional_mv_step_fp *find_fractional_mv_step; - av1_full_search_fn_t full_search_sad; // It is currently unused. av1_diamond_search_fn_t diamond_search_sad; aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL]; uint64_t time_receive_data; @@ -581,8 +636,6 @@ typedef struct AV1_COMP { search_site_config ss_cfg; int multi_arf_allowed; - int multi_arf_enabled; - int multi_arf_last_grp_enabled; TileDataEnc *tile_data; int allocated_tiles; // Keep track of memory allocated for tiles. 
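Note: earlier in this struct the per-reference index fields (lst_fb_idxes[], gld_fb_idx, bwd_fb_idx, alt2_fb_idx, alt_fb_idx) collapse into the single ref_fb_idx[REF_FRAMES] array. Because MV_REFERENCE_FRAME runs LAST_FRAME = 1 through ALTREF_FRAME = 7, every named lookup reduces to one array access, as the rewritten get_ref_frame_map_idx() later in this diff shows. An illustrative standalone form (INVALID_IDX assumed to be -1 here):

/* NONE_FRAME (0) yields an invalid index; LAST_FRAME (1) .. ALTREF_FRAME (7)
 * map to slots 0 .. 6 of ref_fb_idx. */
static int ref_map_idx(const int *ref_fb_idx, int ref_frame) {
  return (ref_frame >= 1 && ref_frame <= 7) ? ref_fb_idx[ref_frame - 1] : -1;
}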
@@ -597,6 +650,11 @@ typedef struct AV1_COMP { int resize_buffer_underflow; int resize_count; + // Sequence parameters have been transmitted already and locked + // or not. Once locked av1_change_config cannot change the seq + // parameters. + int seq_params_locked; + // VARIANCE_AQ segment map refresh int vaq_refresh; @@ -604,11 +662,6 @@ typedef struct AV1_COMP { int num_workers; AVxWorker *workers; struct EncWorkerData *tile_thr_data; - AV1LfSync lf_row_sync; -#if CONFIG_ANS - struct BufAnsCoder buf_ans; -#endif -#if CONFIG_EXT_REFS int refresh_frame_mask; int existing_fb_idx_to_show; int is_arf_filter_off[MAX_EXT_ARFS + 1]; @@ -616,22 +669,24 @@ typedef struct AV1_COMP { int arf_map[MAX_EXT_ARFS + 1]; int arf_pos_in_gf[MAX_EXT_ARFS + 1]; int arf_pos_for_ovrly[MAX_EXT_ARFS + 1]; -#endif // CONFIG_EXT_REFS -#if CONFIG_GLOBAL_MOTION int global_motion_search_done; -#endif -#if CONFIG_LV_MAP tran_low_t *tcoeff_buf[MAX_MB_PLANE]; -#endif - -#if CONFIG_EXT_REFS int extra_arf_allowed; - int bwd_ref_allowed; -#endif // CONFIG_EXT_REFS + // A flag to indicate if intrabc is ever used in current frame. + int intrabc_used; + int dv_cost[2][MV_VALS]; + // TODO(huisu@google.com): we can update dv_joint_cost per SB. + int dv_joint_cost[MV_JOINTS]; + int has_lossless_segment; + + // For frame refs short signaling: + // A mapping of each reference frame from its encoder side value to the + // decoder side value obtained following the short signaling procedure. + int ref_conv[REF_FRAMES]; -#if CONFIG_BGSPRITE - int bgsprite_allowed; -#endif // CONFIG_BGSPRITE + AV1LfSync lf_row_sync; + AV1LrSync lr_row_sync; + AV1LrStruct lr_ctxt; } AV1_COMP; void av1_initialize_enc(void); @@ -650,12 +705,17 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, size_t *size, uint8_t *dest, int64_t *time_stamp, - int64_t *time_end, int flush); + int64_t *time_end, int flush, + const aom_rational_t *timebase); int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest); int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame); +aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm, + YV12_BUFFER_CONFIG *new_frame, + YV12_BUFFER_CONFIG *sd); + int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags); void av1_update_reference(AV1_COMP *cpi, int ref_frame_flags); @@ -675,6 +735,11 @@ int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode, int av1_get_quantizer(struct AV1_COMP *cpi); +int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *input_size); + +int64_t timebase_units_to_ticks(const aom_rational_t *timebase, int64_t n); +int64_t ticks_to_timebase_units(const aom_rational_t *timebase, int64_t n); + static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) { return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame || (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref); @@ -682,22 +747,7 @@ static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) { static INLINE int get_ref_frame_map_idx(const AV1_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { -#if CONFIG_EXT_REFS - if (ref_frame >= LAST_FRAME && ref_frame <= LAST3_FRAME) - return cpi->lst_fb_idxes[ref_frame - 1]; -#else - if (ref_frame == LAST_FRAME) return cpi->lst_fb_idx; -#endif // CONFIG_EXT_REFS - else if (ref_frame == GOLDEN_FRAME) - return cpi->gld_fb_idx; -#if CONFIG_EXT_REFS - else if (ref_frame == BWDREF_FRAME) - return cpi->bwd_fb_idx; - else if (ref_frame == ALTREF2_FRAME) - return cpi->alt2_fb_idx; 
-#endif // CONFIG_EXT_REFS - else - return cpi->alt_fb_idx; + return (ref_frame >= 1) ? cpi->ref_fb_idx[ref_frame - 1] : INVALID_IDX; } static INLINE int get_ref_frame_buf_idx(const AV1_COMP *cpi, @@ -707,16 +757,19 @@ static INLINE int get_ref_frame_buf_idx(const AV1_COMP *cpi, return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : INVALID_IDX; } -#if CONFIG_HASH_ME -static INLINE hash_table *get_ref_frame_hash_map(const AV1_COMP *cpi, - MV_REFERENCE_FRAME ref_frame) { +// TODO(huisu@google.com, youzhou@microsoft.com): enable hash-me for HBD. +static INLINE int av1_use_hash_me(const AV1_COMMON *const cm) { + return cm->allow_screen_content_tools; +} + +static INLINE hash_table *av1_get_ref_frame_hash_map( + const AV1_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { const AV1_COMMON *const cm = &cpi->common; const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); return buf_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[buf_idx].hash_table : NULL; } -#endif static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer( const AV1_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { @@ -726,7 +779,6 @@ static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer( : NULL; } -#if CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING static INLINE int enc_is_ref_frame_buf(AV1_COMP *cpi, RefCntBuffer *frame_buf) { MV_REFERENCE_FRAME ref_frame; AV1_COMMON *const cm = &cpi->common; @@ -737,48 +789,42 @@ static INLINE int enc_is_ref_frame_buf(AV1_COMP *cpi, RefCntBuffer *frame_buf) { } return (ref_frame <= ALTREF_FRAME); } -#endif // CONFIG_EXT_REFS -static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols) { - // We assume 3 planes all at full resolution. We assume up to 1 token per - // pixel, and then allow a head room of 1 EOSB token per 4x4 block per plane, - // plus EOSB_TOKEN per plane. - return mb_rows * mb_cols * (16 * 16 + 17) * 3; +// Token buffer is only used for palette tokens. +static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols, + int sb_size_log2, + const int num_planes) { + // Calculate the maximum number of max superblocks in the image. + const int shift = sb_size_log2 - 4; + const int sb_size = 1 << sb_size_log2; + const int sb_size_square = sb_size * sb_size; + const int sb_rows = ALIGN_POWER_OF_TWO(mb_rows, shift) >> shift; + const int sb_cols = ALIGN_POWER_OF_TWO(mb_cols, shift) >> shift; + + // One palette token for each pixel. There can be palettes on two planes. + const int sb_palette_toks = AOMMIN(2, num_planes) * sb_size_square; + + return sb_rows * sb_cols * sb_palette_toks; } // Get the allocated token size for a tile. It does the same calculation as in // the frame token allocation. 
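Note: a quick worked instance of the new get_token_alloc() above, assuming a 1920x1080 frame (mb_rows = 68, mb_cols = 120), 128x128 superblocks (sb_size_log2 = 7) and 3 planes; the per-tile wrapper below reuses the same computation:

/* shift = 7 - 4 = 3; sb_size_square = 128 * 128 = 16384             */
/* sb_rows = ALIGN_POWER_OF_TWO(68, 3) >> 3 = 72 >> 3 = 9            */
/* sb_cols = ALIGN_POWER_OF_TWO(120, 3) >> 3 = 120 >> 3 = 15         */
/* sb_palette_toks = AOMMIN(2, 3) * 16384 = 32768                    */
/* total = 9 * 15 * 32768 = 4,423,680 palette tokens for the frame   */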
-static INLINE unsigned int allocated_tokens(TileInfo tile) { -#if CONFIG_CB4X4 +static INLINE unsigned int allocated_tokens(TileInfo tile, int sb_size_log2, + int num_planes) { int tile_mb_rows = (tile.mi_row_end - tile.mi_row_start + 2) >> 2; int tile_mb_cols = (tile.mi_col_end - tile.mi_col_start + 2) >> 2; -#else - int tile_mb_rows = (tile.mi_row_end - tile.mi_row_start + 1) >> 1; - int tile_mb_cols = (tile.mi_col_end - tile.mi_col_start + 1) >> 1; -#endif - return get_token_alloc(tile_mb_rows, tile_mb_cols); + return get_token_alloc(tile_mb_rows, tile_mb_cols, sb_size_log2, num_planes); } -#if CONFIG_TEMPMV_SIGNALING -void av1_set_temporal_mv_prediction(AV1_COMP *cpi, int allow_tempmv_prediction); -#endif - void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags); +#define ALT_MIN_LAG 3 static INLINE int is_altref_enabled(const AV1_COMP *const cpi) { - return cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.enable_auto_arf; + return cpi->oxcf.lag_in_frames >= ALT_MIN_LAG && cpi->oxcf.enable_auto_arf; } // TODO(zoeliu): To set up cpi->oxcf.enable_auto_brf -#if 0 && CONFIG_EXT_REFS -static INLINE int is_bwdref_enabled(const AV1_COMP *const cpi) { - // NOTE(zoeliu): The enabling of bi-predictive frames depends on the use of - // alt_ref, and now will be off when the alt_ref interval is - // not sufficiently large. - return is_altref_enabled(cpi) && cpi->oxcf.enable_auto_brf; -} -#endif // CONFIG_EXT_REFS static INLINE void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd, MV_REFERENCE_FRAME ref0, @@ -813,22 +859,14 @@ static INLINE void uref_cnt_fb(EncRefCntBuffer *ubufs, int *uidx, ubufs[new_uidx].ref_count++; } -// Returns 1 if a frame is unscaled and 0 otherwise. -static INLINE int av1_resize_unscaled(const AV1_COMMON *cm) { -#if CONFIG_FRAME_SUPERRES - return cm->superres_upscaled_width == cm->render_width && - cm->superres_upscaled_height == cm->render_height; -#else - return cm->width == cm->render_width && cm->height == cm->render_height; -#endif // CONFIG_FRAME_SUPERRES +// Returns 1 if a frame is scaled and 0 otherwise. +static INLINE int av1_resize_scaled(const AV1_COMMON *cm) { + return !(cm->superres_upscaled_width == cm->render_width && + cm->superres_upscaled_height == cm->render_height); } -static INLINE int av1_frame_unscaled(const AV1_COMMON *cm) { -#if CONFIG_FRAME_SUPERRES - return av1_superres_unscaled(cm) && av1_resize_unscaled(cm); -#else - return av1_resize_unscaled(cm); -#endif // CONFIG_FRAME_SUPERRES +static INLINE int av1_frame_scaled(const AV1_COMMON *cm) { + return !av1_superres_scaled(cm) && av1_resize_scaled(cm); } #ifdef __cplusplus diff --git a/third_party/aom/av1/encoder/encodetxb.c b/third_party/aom/av1/encoder/encodetxb.c index 6209d6fa4..4d4802b46 100644 --- a/third_party/aom/av1/encoder/encodetxb.c +++ b/third_party/aom/av1/encoder/encodetxb.c @@ -9,65 +9,81 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#include "av1/common/scan.h" +#include "av1/encoder/encodetxb.h" + +#include "aom_ports/mem.h" #include "av1/common/blockd.h" #include "av1/common/idct.h" #include "av1/common/pred_common.h" +#include "av1/common/scan.h" #include "av1/encoder/bitstream.h" -#include "av1/encoder/encodeframe.h" #include "av1/encoder/cost.h" -#include "av1/encoder/encodetxb.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/hash.h" #include "av1/encoder/rdopt.h" -#include "av1/encoder/subexp.h" #include "av1/encoder/tokenize.h" -#define TEST_OPTIMIZE_TXB 0 +static int hbt_needs_init = 1; +static CRC32C crc_calculator; +static const int HBT_EOB = 16; // also the length in opt_qcoeff +static const int HBT_TABLE_SIZE = 65536; // 16 bit: holds 65536 'arrays' +static const int HBT_ARRAY_LENGTH = 256; // 8 bit: 256 entries +// If removed in hbt_create_hashes or increased beyond int8_t, widen deltas type +static const int HBT_KICKOUT = 3; + +typedef struct OptTxbQcoeff { + // Use larger type if larger/no kickout value is used in hbt_create_hashes + int8_t deltas[16]; + uint32_t hbt_qc_hash; + uint32_t hbt_ctx_hash; + int init; + int rate_cost; +} OptTxbQcoeff; + +OptTxbQcoeff *hbt_hash_table; + +typedef struct LevelDownStats { + int update; + tran_low_t low_qc; + tran_low_t low_dqc; + int64_t dist0; + int rate; + int rate_low; + int64_t dist; + int64_t dist_low; + int64_t rd; + int64_t rd_low; + int64_t nz_rd; + int64_t rd_diff; + int cost_diff; + int64_t dist_diff; + int new_eob; +} LevelDownStats; void av1_alloc_txb_buf(AV1_COMP *cpi) { -#if 0 - AV1_COMMON *cm = &cpi->common; - int mi_block_size = 1 << MI_SIZE_LOG2; - // TODO(angiebird): Make sure cm->subsampling_x/y is set correctly, and then - // use precise buffer size according to cm->subsampling_x/y - int pixel_stride = mi_block_size * cm->mi_cols; - int pixel_height = mi_block_size * cm->mi_rows; - int i; - for (i = 0; i < MAX_MB_PLANE; ++i) { - CHECK_MEM_ERROR( - cm, cpi->tcoeff_buf[i], - aom_malloc(sizeof(*cpi->tcoeff_buf[i]) * pixel_stride * pixel_height)); - } -#else AV1_COMMON *cm = &cpi->common; - int size = ((cm->mi_rows >> MAX_MIB_SIZE_LOG2) + 1) * - ((cm->mi_cols >> MAX_MIB_SIZE_LOG2) + 1); + int size = ((cm->mi_rows >> cm->seq_params.mib_size_log2) + 1) * + ((cm->mi_cols >> cm->seq_params.mib_size_log2) + 1); av1_free_txb_buf(cpi); // TODO(jingning): This should be further reduced. 
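Note: one source of slack in the allocation around the TODO above: the floor-plus-one grid sizing allocates a spare row or column of buffers whenever a dimension is an exact multiple of the superblock size. A worked instance, assuming 1920x1080 and 128x128 superblocks (mi units are 4x4 pixels, so mib_size_log2 = 5):

/* mi_rows = 1080 / 4 = 270 -> (270 >> 5) + 1 = 9  (ceil(270 / 32) = 9, exact) */
/* mi_cols = 1920 / 4 = 480 -> (480 >> 5) + 1 = 16 (ceil is 15: one spare col) */
/* size = 9 * 16 = 144 CB_COEFF_BUFFER entries, one per 128x128 superblock    */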
CHECK_MEM_ERROR(cm, cpi->coeff_buffer_base, - aom_malloc(sizeof(*cpi->coeff_buffer_base) * size)); -#endif + aom_memalign(32, sizeof(*cpi->coeff_buffer_base) * size)); } -void av1_free_txb_buf(AV1_COMP *cpi) { -#if 0 - int i; - for (i = 0; i < MAX_MB_PLANE; ++i) { - aom_free(cpi->tcoeff_buf[i]); - } -#else - aom_free(cpi->coeff_buffer_base); -#endif -} +void av1_free_txb_buf(AV1_COMP *cpi) { aom_free(cpi->coeff_buffer_base); } void av1_set_coeff_buffer(const AV1_COMP *const cpi, MACROBLOCK *const x, int mi_row, int mi_col) { - int stride = (cpi->common.mi_cols >> MAX_MIB_SIZE_LOG2) + 1; - int offset = - (mi_row >> MAX_MIB_SIZE_LOG2) * stride + (mi_col >> MAX_MIB_SIZE_LOG2); + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + int mib_size_log2 = cm->seq_params.mib_size_log2; + int stride = (cm->mi_cols >> mib_size_log2) + 1; + int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2); CB_COEFF_BUFFER *coeff_buf = &cpi->coeff_buffer_base[offset]; const int txb_offset = x->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN); - for (int plane = 0; plane < MAX_MB_PLANE; ++plane) { + assert(x->cb_offset < (1 << num_pels_log2_lookup[cm->seq_params.sb_size])); + for (int plane = 0; plane < num_planes; ++plane) { x->mbmi_ext->tcoeff[plane] = coeff_buf->tcoeff[plane] + x->cb_offset; x->mbmi_ext->eobs[plane] = coeff_buf->eobs[plane] + txb_offset; x->mbmi_ext->txb_skip_ctx[plane] = @@ -93,435 +109,147 @@ static void write_golomb(aom_writer *w, int level) { for (i = length - 1; i >= 0; --i) aom_write_bit(w, (x >> i) & 0x01); } -static INLINE void write_nz_map(aom_writer *w, const tran_low_t *tcoeff, - uint16_t eob, int plane, const int16_t *scan, - TX_SIZE tx_size, TX_TYPE tx_type, - FRAME_CONTEXT *fc) { - const PLANE_TYPE plane_type = get_plane_type(plane); - const TX_SIZE txs_ctx = get_txsize_context(tx_size); - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int height = tx_size_high[tx_size]; -#if CONFIG_CTX1D - const int width = tx_size_wide[tx_size]; - const int eob_offset = width + height; - const TX_CLASS tx_class = get_tx_class(tx_type); - const int seg_eob = - (tx_class == TX_CLASS_2D) ? tx_size_2d[tx_size] : eob_offset; -#else - const int seg_eob = tx_size_2d[tx_size]; -#endif -#if !LV_MAP_PROB - aom_prob *nz_map = fc->nz_map[txs_ctx][plane_type]; - aom_prob *eob_flag = fc->eob_flag[txs_ctx][plane_type]; -#endif - - for (int c = 0; c < eob; ++c) { - int coeff_ctx = get_nz_map_ctx(tcoeff, c, scan, bwl, height, tx_type); - int eob_ctx = get_eob_ctx(tcoeff, scan[c], txs_ctx, tx_type); - - tran_low_t v = tcoeff[scan[c]]; - int is_nz = (v != 0); - - if (c == seg_eob - 1) break; - -#if LV_MAP_PROB - aom_write_bin(w, is_nz, fc->nz_map_cdf[txs_ctx][plane_type][coeff_ctx], 2); -#else - aom_write(w, is_nz, nz_map[coeff_ctx]); -#endif - - if (is_nz) { -#if LV_MAP_PROB - aom_write_bin(w, c == (eob - 1), - fc->eob_flag_cdf[txs_ctx][plane_type][eob_ctx], 2); -#else - aom_write(w, c == (eob - 1), eob_flag[eob_ctx]); -#endif - } +static INLINE tran_low_t get_lower_coeff(tran_low_t qc) { + if (qc == 0) { + return 0; } + return qc > 0 ? 
qc - 1 : qc + 1; } -#if CONFIG_CTX1D -static INLINE void write_nz_map_vert(aom_writer *w, const tran_low_t *tcoeff, - uint16_t eob, int plane, - const int16_t *scan, const int16_t *iscan, - TX_SIZE tx_size, TX_TYPE tx_type, - FRAME_CONTEXT *fc) { - (void)eob; - const TX_SIZE txs_ctx = get_txsize_context(tx_size); - const PLANE_TYPE plane_type = get_plane_type(plane); - const TX_CLASS tx_class = get_tx_class(tx_type); - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int width = tx_size_wide[tx_size]; - const int height = tx_size_high[tx_size]; - int16_t eob_ls[MAX_HVTX_SIZE]; - get_eob_vert(eob_ls, tcoeff, width, height); -#if !LV_MAP_PROB - aom_prob *nz_map = fc->nz_map[txs_ctx][plane_type]; -#endif - for (int c = 0; c < width; ++c) { - int16_t veob = eob_ls[c]; - assert(veob <= height); - int el_ctx = get_empty_line_ctx(c, eob_ls); -#if LV_MAP_PROB - aom_write_bin(w, veob == 0, - fc->empty_line_cdf[txs_ctx][plane_type][tx_class][el_ctx], 2); -#else - aom_write(w, veob == 0, - fc->empty_line[txs_ctx][plane_type][tx_class][el_ctx]); -#endif - if (veob) { - for (int r = 0; r < veob; ++r) { - if (r + 1 != height) { - int coeff_idx = r * width + c; - int scan_idx = iscan[coeff_idx]; - int is_nz = tcoeff[coeff_idx] != 0; - int coeff_ctx = - get_nz_map_ctx(tcoeff, scan_idx, scan, bwl, height, tx_type); -#if LV_MAP_PROB - aom_write_bin(w, is_nz, - fc->nz_map_cdf[txs_ctx][plane_type][coeff_ctx], 2); -#else - aom_write(w, is_nz, nz_map[coeff_ctx]); -#endif - if (is_nz) { - int eob_ctx = get_hv_eob_ctx(c, r, eob_ls); -#if LV_MAP_PROB - aom_write_bin( - w, r == veob - 1, - fc->hv_eob_cdf[txs_ctx][plane_type][tx_class][eob_ctx], 2); -#else - aom_write(w, r == veob - 1, - fc->hv_eob[txs_ctx][plane_type][tx_class][eob_ctx]); -#endif - } - } - } - } - } +static INLINE tran_low_t qcoeff_to_dqcoeff(tran_low_t qc, int coeff_idx, + int dqv, int shift, + const qm_val_t *iqmatrix) { + int sign = qc < 0 ? 
-1 : 1; + if (iqmatrix != NULL) + dqv = + ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + return sign * ((abs(qc) * dqv) >> shift); } -static INLINE void write_nz_map_horiz(aom_writer *w, const tran_low_t *tcoeff, - uint16_t eob, int plane, - const int16_t *scan, const int16_t *iscan, - TX_SIZE tx_size, TX_TYPE tx_type, - FRAME_CONTEXT *fc) { - (void)scan; - (void)eob; - const TX_SIZE txs_ctx = get_txsize_context(tx_size); - const PLANE_TYPE plane_type = get_plane_type(plane); - const TX_CLASS tx_class = get_tx_class(tx_type); - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int width = tx_size_wide[tx_size]; - const int height = tx_size_high[tx_size]; - int16_t eob_ls[MAX_HVTX_SIZE]; - get_eob_horiz(eob_ls, tcoeff, width, height); -#if !LV_MAP_PROB - aom_prob *nz_map = fc->nz_map[txs_ctx][plane_type]; -#endif - for (int r = 0; r < height; ++r) { - int16_t heob = eob_ls[r]; - int el_ctx = get_empty_line_ctx(r, eob_ls); -#if LV_MAP_PROB - aom_write_bin(w, heob == 0, - fc->empty_line_cdf[txs_ctx][plane_type][tx_class][el_ctx], 2); -#else - aom_write(w, heob == 0, - fc->empty_line[txs_ctx][plane_type][tx_class][el_ctx]); -#endif - if (heob) { - for (int c = 0; c < heob; ++c) { - if (c + 1 != width) { - int coeff_idx = r * width + c; - int scan_idx = iscan[coeff_idx]; - int is_nz = tcoeff[coeff_idx] != 0; - int coeff_ctx = - get_nz_map_ctx(tcoeff, scan_idx, scan, bwl, height, tx_type); -#if LV_MAP_PROB - aom_write_bin(w, is_nz, - fc->nz_map_cdf[txs_ctx][plane_type][coeff_ctx], 2); -#else - aom_write(w, is_nz, nz_map[coeff_ctx]); -#endif - if (is_nz) { - int eob_ctx = get_hv_eob_ctx(r, c, eob_ls); -#if LV_MAP_PROB - aom_write_bin( - w, c == heob - 1, - fc->hv_eob_cdf[txs_ctx][plane_type][tx_class][eob_ctx], 2); -#else - aom_write(w, c == heob - 1, - fc->hv_eob[txs_ctx][plane_type][tx_class][eob_ctx]); -#endif - } - } - } - } - } +static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff, + int shift) { + const int64_t diff = (tcoeff - dqcoeff) * (1 << shift); + const int64_t error = diff * diff; + return error; } -#endif - -void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd, - aom_writer *w, int blk_row, int blk_col, int block, - int plane, TX_SIZE tx_size, const tran_low_t *tcoeff, - uint16_t eob, TXB_CTX *txb_ctx) { - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - const PLANE_TYPE plane_type = get_plane_type(plane); - const TX_SIZE txs_ctx = get_txsize_context(tx_size); - const TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); - const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi); - const int16_t *scan = scan_order->scan; - int c; - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int height = tx_size_high[tx_size]; - uint16_t update_eob = 0; - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - (void)blk_row; - (void)blk_col; - -#if LV_MAP_PROB - aom_write_bin(w, eob == 0, - ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2); +#if CONFIG_ENTROPY_STATS +void av1_update_eob_context(int cdf_idx, int eob, TX_SIZE tx_size, + TX_CLASS tx_class, PLANE_TYPE plane, + FRAME_CONTEXT *ec_ctx, FRAME_COUNTS *counts, + uint8_t allow_update_cdf) { #else - aom_write(w, eob == 0, ec_ctx->txb_skip[txs_ctx][txb_ctx->txb_skip_ctx]); +void av1_update_eob_context(int eob, TX_SIZE tx_size, TX_CLASS tx_class, + PLANE_TYPE plane, FRAME_CONTEXT *ec_ctx, + uint8_t allow_update_cdf) { #endif + int eob_extra; + const int eob_pt = get_eob_pos_token(eob, 
&eob_extra); + TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); - if (eob == 0) return; -#if CONFIG_TXK_SEL - av1_write_tx_type(cm, xd, blk_row, blk_col, block, plane, - get_min_tx_size(tx_size), w); -#endif + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1; -#if CONFIG_CTX1D - TX_CLASS tx_class = get_tx_class(tx_type); - if (tx_class == TX_CLASS_2D) { - write_nz_map(w, tcoeff, eob, plane, scan, tx_size, tx_type, ec_ctx); - } else { - const int width = tx_size_wide[tx_size]; - const int eob_offset = width + height; - const int eob_mode = eob > eob_offset; -#if LV_MAP_PROB - aom_write_bin(w, eob_mode, - ec_ctx->eob_mode_cdf[txs_ctx][plane_type][tx_class], 2); -#else - aom_write(w, eob_mode, ec_ctx->eob_mode[txs_ctx][plane_type][tx_class]); + switch (eob_multi_size) { + case 0: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi16[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; #endif - if (eob_mode == 0) { - write_nz_map(w, tcoeff, eob, plane, scan, tx_size, tx_type, ec_ctx); - } else { - const int16_t *iscan = scan_order->iscan; - assert(tx_class == TX_CLASS_VERT || tx_class == TX_CLASS_HORIZ); - if (tx_class == TX_CLASS_VERT) - write_nz_map_vert(w, tcoeff, eob, plane, scan, iscan, tx_size, tx_type, - ec_ctx); - else - write_nz_map_horiz(w, tcoeff, eob, plane, scan, iscan, tx_size, tx_type, - ec_ctx); - } - } -#else - write_nz_map(w, tcoeff, eob, plane, scan, tx_size, tx_type, ec_ctx); -#endif // CONFIG_CTX1D - - int i; - for (i = 0; i < NUM_BASE_LEVELS; ++i) { -#if !LV_MAP_PROB - aom_prob *coeff_base = ec_ctx->coeff_base[txs_ctx][plane_type][i]; + if (allow_update_cdf) + update_cdf(ec_ctx->eob_flag_cdf16[plane][eob_multi_ctx], eob_pt - 1, 5); + break; + case 1: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi32[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; #endif - update_eob = 0; - for (c = eob - 1; c >= 0; --c) { - tran_low_t v = tcoeff[scan[c]]; - tran_low_t level = abs(v); - int sign = (v < 0) ? 1 : 0; - int ctx; - - if (level <= i) continue; - - ctx = get_base_ctx(tcoeff, scan[c], bwl, height, i + 1); - - if (level == i + 1) { -#if LV_MAP_PROB - aom_write_bin(w, 1, ec_ctx->coeff_base_cdf[txs_ctx][plane_type][i][ctx], - 2); -#else - aom_write(w, 1, coeff_base[ctx]); + if (allow_update_cdf) + update_cdf(ec_ctx->eob_flag_cdf32[plane][eob_multi_ctx], eob_pt - 1, 6); + break; + case 2: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi64[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; #endif - if (c == 0) { -#if LV_MAP_PROB - aom_write_bin(w, sign, - ec_ctx->dc_sign_cdf[plane_type][txb_ctx->dc_sign_ctx], - 2); -#else - aom_write(w, sign, ec_ctx->dc_sign[plane_type][txb_ctx->dc_sign_ctx]); + if (allow_update_cdf) + update_cdf(ec_ctx->eob_flag_cdf64[plane][eob_multi_ctx], eob_pt - 1, 7); + break; + case 3: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi128[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; #endif - } else { - aom_write_bit(w, sign); - } - continue; + if (allow_update_cdf) { + update_cdf(ec_ctx->eob_flag_cdf128[plane][eob_multi_ctx], eob_pt - 1, + 8); } - -#if LV_MAP_PROB - aom_write_bin(w, 0, ec_ctx->coeff_base_cdf[txs_ctx][plane_type][i][ctx], - 2); -#else - aom_write(w, 0, coeff_base[ctx]); -#endif - update_eob = AOMMAX(update_eob, c); - } - } - - for (c = update_eob; c >= 0; --c) { - tran_low_t v = tcoeff[scan[c]]; - tran_low_t level = abs(v); - int sign = (v < 0) ? 
1 : 0; - int idx; - int ctx; - - if (level <= NUM_BASE_LEVELS) continue; - - if (c == 0) { -#if LV_MAP_PROB - aom_write_bin(w, sign, - ec_ctx->dc_sign_cdf[plane_type][txb_ctx->dc_sign_ctx], 2); -#else - aom_write(w, sign, ec_ctx->dc_sign[plane_type][txb_ctx->dc_sign_ctx]); + break; + case 4: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi256[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; #endif - } else { - aom_write_bit(w, sign); - } - - // level is above 1. - ctx = get_br_ctx(tcoeff, scan[c], bwl, height); - -#if BR_NODE - int base_range = level - 1 - NUM_BASE_LEVELS; - int br_set_idx = 0; - int br_base = 0; - int br_offset = 0; - - if (base_range >= COEFF_BASE_RANGE) - br_set_idx = BASE_RANGE_SETS; - else - br_set_idx = coeff_to_br_index[base_range]; - - for (idx = 0; idx < BASE_RANGE_SETS; ++idx) { - aom_write_bin(w, idx == br_set_idx, - ec_ctx->coeff_br_cdf[txs_ctx][plane_type][idx][ctx], 2); - if (idx == br_set_idx) { - br_base = br_index_to_coeff[br_set_idx]; - br_offset = base_range - br_base; - int extra_bits = (1 << br_extra_bits[idx]) - 1; - for (int tok = 0; tok < extra_bits; ++tok) { - if (tok == br_offset) { - aom_write_bin(w, 1, ec_ctx->coeff_lps_cdf[txs_ctx][plane_type][ctx], - 2); - break; - } - aom_write_bin(w, 0, ec_ctx->coeff_lps_cdf[txs_ctx][plane_type][ctx], - 2); - } - // aom_write_literal(w, br_offset, br_extra_bits[idx]); - break; + if (allow_update_cdf) { + update_cdf(ec_ctx->eob_flag_cdf256[plane][eob_multi_ctx], eob_pt - 1, + 9); } - } - - if (br_set_idx < BASE_RANGE_SETS) continue; -#else // BR_NODE - for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) { - if (level == (idx + 1 + NUM_BASE_LEVELS)) { -#if LV_MAP_PROB - aom_write_bin(w, 1, ec_ctx->coeff_lps_cdf[txs_ctx][plane_type][ctx], 2); -#else - aom_write(w, 1, ec_ctx->coeff_lps[txs_ctx][plane_type][ctx]); + break; + case 5: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi512[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; #endif - break; + if (allow_update_cdf) { + update_cdf(ec_ctx->eob_flag_cdf512[plane][eob_multi_ctx], eob_pt - 1, + 10); } -#if LV_MAP_PROB - aom_write_bin(w, 0, ec_ctx->coeff_lps_cdf[txs_ctx][plane_type][ctx], 2); -#else - aom_write(w, 0, ec_ctx->coeff_lps[txs_ctx][plane_type][ctx]); -#endif - } - if (idx < COEFF_BASE_RANGE) continue; -#endif // BR_NODE - - // use 0-th order Golomb code to handle the residual level. 
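As the comment above says, any remainder beyond NUM_BASE_LEVELS + COEFF_BASE_RANGE is handed to a 0th-order Golomb code. Below is a minimal sketch of that code, assuming write_golomb()'s conventional form (code x = v + 1 as a unary run of length - 1 zero bits followed by the length bits of x); put_bit() and exp_golomb_encode() are illustrative stand-ins, not aom API:

#include <assert.h>
#include <stdio.h>

/* Stand-in for aom_write_bit(): just print the bit. */
static void put_bit(int b) { putchar(b ? '1' : '0'); }

/* Encode a nonnegative residual v and return the number of bits spent. */
static int exp_golomb_encode(int v) {
  const int x = v + 1;
  int length = 0;
  for (int i = x; i; i >>= 1) ++length;            /* bit length of x */
  for (int i = 0; i < length - 1; ++i) put_bit(0); /* unary prefix */
  for (int i = length - 1; i >= 0; --i) put_bit((x >> i) & 1);
  return 2 * length - 1; /* the 2 * length - 1 charged by get_golomb_cost() */
}

int main(void) {
  const int bits = exp_golomb_encode(4); /* v = 4 -> x = 5 = 101b -> 00101 */
  printf(" -> %d bits\n", bits);
  assert(bits == 5);
  return 0;
}

This lines up with the new get_golomb_cost() in this patch: its r = abs_qc - COEFF_BASE_RANGE - NUM_BASE_LEVELS plays the role of x here, consistent with write_golomb() being called with r - 1.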
- write_golomb(w, level - COEFF_BASE_RANGE - 1 - NUM_BASE_LEVELS); + break; + case 6: + default: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi1024[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) { + update_cdf(ec_ctx->eob_flag_cdf1024[plane][eob_multi_ctx], eob_pt - 1, + 11); + } + break; } -} - -void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, - aom_writer *w, int plane) { - MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - BLOCK_SIZE bsize = mbmi->sb_type; - struct macroblockd_plane *pd = &xd->plane[plane]; -#if CONFIG_CHROMA_SUB8X8 - const BLOCK_SIZE plane_bsize = - AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); -#elif CONFIG_CB4X4 - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); -#else - const BLOCK_SIZE plane_bsize = - get_plane_block_size(AOMMAX(bsize, BLOCK_8X8), pd); -#endif - const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); - const int max_blocks_high = max_block_high(xd, plane_bsize, plane); - const TX_SIZE tx_size = av1_get_tx_size(plane, xd); - const int bkw = tx_size_wide_unit[tx_size]; - const int bkh = tx_size_high_unit[tx_size]; - const int step = tx_size_wide_unit[tx_size] * tx_size_high_unit[tx_size]; - int row, col; - int block = 0; - for (row = 0; row < max_blocks_high; row += bkh) { - for (col = 0; col < max_blocks_wide; col += bkw) { - tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block); - uint16_t eob = x->mbmi_ext->eobs[plane][block]; - TXB_CTX txb_ctx = { x->mbmi_ext->txb_skip_ctx[plane][block], - x->mbmi_ext->dc_sign_ctx[plane][block] }; - av1_write_coeffs_txb(cm, xd, w, row, col, block, plane, tx_size, tcoeff, - eob, &txb_ctx); - block += step; - } + if (k_eob_offset_bits[eob_pt] > 0) { + int eob_ctx = eob_pt - 3; + int eob_shift = k_eob_offset_bits[eob_pt] - 1; + int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0; +#if CONFIG_ENTROPY_STATS + counts->eob_extra[cdf_idx][txs_ctx][plane][eob_pt][bit]++; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) + update_cdf(ec_ctx->eob_extra_cdf[txs_ctx][plane][eob_ctx], bit, 2); } } -static INLINE void get_base_ctx_set(const tran_low_t *tcoeffs, - int c, // raster order - const int bwl, const int height, - int ctx_set[NUM_BASE_LEVELS]) { - const int row = c >> bwl; - const int col = c - (row << bwl); - const int stride = 1 << bwl; - int mag[NUM_BASE_LEVELS] = { 0 }; - int idx; - tran_low_t abs_coeff; - int i; - - for (idx = 0; idx < BASE_CONTEXT_POSITION_NUM; ++idx) { - int ref_row = row + base_ref_offset[idx][0]; - int ref_col = col + base_ref_offset[idx][1]; - int pos = (ref_row << bwl) + ref_col; - - if (ref_row < 0 || ref_col < 0 || ref_row >= height || ref_col >= stride) - continue; - - abs_coeff = abs(tcoeffs[pos]); - - for (i = 0; i < NUM_BASE_LEVELS; ++i) { - ctx_set[i] += abs_coeff > i; - if (base_ref_offset[idx][0] >= 0 && base_ref_offset[idx][1] >= 0) - mag[i] |= abs_coeff > (i + 1); - } - } +static int get_eob_cost(int eob, const LV_MAP_EOB_COST *txb_eob_costs, + const LV_MAP_COEFF_COST *txb_costs, TX_CLASS tx_class) { + int eob_extra; + const int eob_pt = get_eob_pos_token(eob, &eob_extra); + int eob_cost = 0; + const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1; + eob_cost = txb_eob_costs->eob_cost[eob_multi_ctx][eob_pt - 1]; + + if (k_eob_offset_bits[eob_pt] > 0) { + const int eob_ctx = eob_pt - 3; + const int eob_shift = k_eob_offset_bits[eob_pt] - 1; + const int bit = (eob_extra & (1 << eob_shift)) ? 
1 : 0; + eob_cost += txb_costs->eob_extra_cost[eob_ctx][bit]; + const int offset_bits = k_eob_offset_bits[eob_pt]; + if (offset_bits > 1) eob_cost += av1_cost_literal(offset_bits - 1); + } + return eob_cost; +} - for (i = 0; i < NUM_BASE_LEVELS; ++i) { - ctx_set[i] = get_base_ctx_from_count_mag(row, col, ctx_set[i], mag[i]); +static INLINE int get_sign_bit_cost(tran_low_t qc, int coeff_idx, + const int (*dc_sign_cost)[2], + int dc_sign_ctx) { + if (coeff_idx == 0) { + const int sign = (qc < 0) ? 1 : 0; + return dc_sign_cost[dc_sign_ctx][sign]; } - return; + return av1_cost_literal(1); } static INLINE int get_br_cost(tran_low_t abs_qc, int ctx, @@ -530,1440 +258,1522 @@ static INLINE int get_br_cost(tran_low_t abs_qc, int ctx, const tran_low_t max_level = 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE; (void)ctx; if (abs_qc >= min_level) { -#if BR_NODE - if (abs_qc >= max_level) + if (abs_qc >= max_level) { return coeff_lps[COEFF_BASE_RANGE]; // COEFF_BASE_RANGE * cost0; - else + } else { return coeff_lps[(abs_qc - min_level)]; // * cost0 + cost1; -#else - const int cost0 = coeff_lps[0]; - const int cost1 = coeff_lps[1]; - if (abs_qc >= max_level) - return COEFF_BASE_RANGE * cost0; - else - return (abs_qc - min_level) * cost0 + cost1; -#endif - } else { - return 0; + } } + return 0; } -static INLINE int get_base_cost(tran_low_t abs_qc, int ctx, - const int coeff_base[2], int base_idx) { - const int level = base_idx + 1; - (void)ctx; - if (abs_qc < level) - return 0; - else - return coeff_base[abs_qc == level]; -} - -int get_nz_eob_map_cost(const LV_MAP_COEFF_COST *coeff_costs, - const tran_low_t *qcoeff, uint16_t eob, int plane, - const int16_t *scan, TX_SIZE tx_size, TX_TYPE tx_type) { - (void)plane; - TX_SIZE txs_ctx = get_txsize_context(tx_size); - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int height = tx_size_high[tx_size]; -#if CONFIG_CTX1D - const TX_CLASS tx_class = get_tx_class(tx_type); - const int width = tx_size_wide[tx_size]; - const int eob_offset = width + height; - const int seg_eob = - (tx_class == TX_CLASS_2D) ? 
tx_size_2d[tx_size] : eob_offset; -#else - const int seg_eob = tx_size_2d[tx_size]; -#endif - int cost = 0; - for (int c = 0; c < eob; ++c) { - tran_low_t v = qcoeff[scan[c]]; - int is_nz = (v != 0); - if (c + 1 != seg_eob) { - int coeff_ctx = get_nz_map_ctx(qcoeff, c, scan, bwl, height, tx_type); - cost += coeff_costs->nz_map_cost[coeff_ctx][is_nz]; - if (is_nz) { - int eob_ctx = get_eob_ctx(qcoeff, scan[c], txs_ctx, tx_type); - cost += coeff_costs->eob_cost[eob_ctx][c == (eob - 1)]; - } - } +static INLINE int get_golomb_cost(int abs_qc) { + if (abs_qc >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) { + const int r = abs_qc - COEFF_BASE_RANGE - NUM_BASE_LEVELS; + const int length = get_msb(r) + 1; + return av1_cost_literal(2 * length - 1); } - return cost; + return 0; } -#if CONFIG_CTX1D -static INLINE int get_nz_eob_map_cost_vert(const LV_MAP_COEFF_COST *coeff_costs, - const tran_low_t *qcoeff, - uint16_t eob, int plane, - const int16_t *scan, - const int16_t *iscan, - TX_SIZE tx_size, TX_TYPE tx_type) { - (void)tx_size; - (void)scan; - (void)eob; - (void)plane; - const TX_CLASS tx_class = get_tx_class(tx_type); - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int width = tx_size_wide[tx_size]; - const int height = tx_size_high[tx_size]; - int16_t eob_ls[MAX_HVTX_SIZE]; - get_eob_vert(eob_ls, qcoeff, width, height); +static int get_coeff_cost(const tran_low_t qc, const int scan_idx, + const int is_eob, const TxbInfo *const txb_info, + const LV_MAP_COEFF_COST *const txb_costs, + const int coeff_ctx, const TX_CLASS tx_class) { + const TXB_CTX *const txb_ctx = txb_info->txb_ctx; + const int is_nz = (qc != 0); + const tran_low_t abs_qc = abs(qc); int cost = 0; - for (int c = 0; c < width; ++c) { - int16_t veob = eob_ls[c]; - assert(veob <= height); - int el_ctx = get_empty_line_ctx(c, eob_ls); - cost += coeff_costs->empty_line_cost[tx_class][el_ctx][veob == 0]; - if (veob) { - for (int r = 0; r < veob; ++r) { - if (r + 1 != height) { - int coeff_idx = r * width + c; - int scan_idx = iscan[coeff_idx]; - int is_nz = qcoeff[coeff_idx] != 0; - int coeff_ctx = - get_nz_map_ctx(qcoeff, scan_idx, scan, bwl, height, tx_type); - cost += coeff_costs->nz_map_cost[coeff_ctx][is_nz]; - if (is_nz) { - int eob_ctx = get_hv_eob_ctx(c, r, eob_ls); - cost += coeff_costs->hv_eob_cost[tx_class][eob_ctx][r == veob - 1]; - } - } - } - } + const int16_t *const scan = txb_info->scan_order->scan; + const int pos = scan[scan_idx]; + + if (is_eob) { + cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1]; + } else { + cost += txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)]; } - return cost; -} + if (is_nz) { + cost += get_sign_bit_cost(qc, scan_idx, txb_costs->dc_sign_cost, + txb_ctx->dc_sign_ctx); -static INLINE int get_nz_eob_map_cost_horiz( - const LV_MAP_COEFF_COST *coeff_costs, const tran_low_t *qcoeff, - uint16_t eob, int plane, const int16_t *scan, const int16_t *iscan, - TX_SIZE tx_size, TX_TYPE tx_type) { - (void)tx_size; - (void)scan; - (void)eob; - (void)plane; - const TX_CLASS tx_class = get_tx_class(tx_type); - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int width = tx_size_wide[tx_size]; - const int height = tx_size_high[tx_size]; - int16_t eob_ls[MAX_HVTX_SIZE]; - get_eob_horiz(eob_ls, qcoeff, width, height); - int cost = 0; - for (int r = 0; r < height; ++r) { - int16_t heob = eob_ls[r]; - assert(heob <= width); - int el_ctx = get_empty_line_ctx(r, eob_ls); - cost += coeff_costs->empty_line_cost[tx_class][el_ctx][heob == 0]; - if 
(heob) { - for (int c = 0; c < heob; ++c) { - if (c + 1 != width) { - int coeff_idx = r * width + c; - int scan_idx = iscan[coeff_idx]; - int is_nz = qcoeff[coeff_idx] != 0; - int coeff_ctx = - get_nz_map_ctx(qcoeff, scan_idx, scan, bwl, height, tx_type); - cost += coeff_costs->nz_map_cost[coeff_ctx][is_nz]; - if (is_nz) { - int eob_ctx = get_hv_eob_ctx(r, c, eob_ls); - cost += coeff_costs->hv_eob_cost[tx_class][eob_ctx][c == heob - 1]; - } - } - } + if (abs_qc > NUM_BASE_LEVELS) { + const int ctx = + get_br_ctx(txb_info->levels, pos, txb_info->bwl, tx_class); + cost += get_br_cost(abs_qc, ctx, txb_costs->lps_cost[ctx]); + cost += get_golomb_cost(abs_qc); } } return cost; } -#endif -int av1_cost_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *x, int plane, - int blk_row, int blk_col, int block, TX_SIZE tx_size, - TXB_CTX *txb_ctx) { - MACROBLOCKD *const xd = &x->e_mbd; - TX_SIZE txs_ctx = get_txsize_context(tx_size); - const PLANE_TYPE plane_type = get_plane_type(plane); - const TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - const struct macroblock_plane *p = &x->plane[plane]; - const int eob = p->eobs[block]; - const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); - int c, cost; - int txb_skip_ctx = txb_ctx->txb_skip_ctx; +static INLINE int get_nz_map_ctx(const uint8_t *const levels, + const int coeff_idx, const int bwl, + const int height, const int scan_idx, + const int is_eob, const TX_SIZE tx_size, + const TX_CLASS tx_class) { + if (is_eob) { + if (scan_idx == 0) return 0; + if (scan_idx <= (height << bwl) / 8) return 1; + if (scan_idx <= (height << bwl) / 4) return 2; + return 3; + } + const int stats = + get_nz_mag(levels + get_padded_idx(coeff_idx, bwl), bwl, tx_class); + return get_nz_map_ctx_from_stats(stats, coeff_idx, bwl, tx_size, tx_class); +} - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int height = tx_size_high[tx_size]; +static void get_dist_cost_stats(LevelDownStats *const stats, const int scan_idx, + const int is_eob, + const LV_MAP_COEFF_COST *const txb_costs, + const TxbInfo *const txb_info, + const TX_CLASS tx_class) { + const int16_t *const scan = txb_info->scan_order->scan; + const int coeff_idx = scan[scan_idx]; + const tran_low_t qc = txb_info->qcoeff[coeff_idx]; + const uint8_t *const levels = txb_info->levels; + stats->new_eob = -1; + stats->update = 0; + stats->rd_low = 0; + stats->rd = 0; + stats->nz_rd = 0; + stats->dist_low = 0; + stats->rate_low = 0; + stats->low_qc = 0; - const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi); - const int16_t *scan = scan_order->scan; + const tran_low_t tqc = txb_info->tcoeff[coeff_idx]; + const int dqv = txb_info->dequant[coeff_idx != 0]; + const int coeff_ctx = + get_nz_map_ctx(levels, coeff_idx, txb_info->bwl, txb_info->height, + scan_idx, is_eob, txb_info->tx_size, tx_class); + const int qc_cost = get_coeff_cost(qc, scan_idx, is_eob, txb_info, txb_costs, + coeff_ctx, tx_class); + assert(qc != 0); + const tran_low_t dqc = qcoeff_to_dqcoeff(qc, coeff_idx, dqv, txb_info->shift, + txb_info->iqmatrix); + const int64_t dqc_dist = get_coeff_dist(tqc, dqc, txb_info->shift); - LV_MAP_COEFF_COST *coeff_costs = &x->coeff_costs[txs_ctx][plane_type]; + // distortion difference when coefficient is quantized to 0 + const tran_low_t dqc0 = + qcoeff_to_dqcoeff(0, coeff_idx, dqv, txb_info->shift, txb_info->iqmatrix); - cost = 0; + stats->dist0 = get_coeff_dist(tqc, dqc0, txb_info->shift); + 
stats->dist = dqc_dist - stats->dist0; + stats->rate = qc_cost; - if (eob == 0) { - cost = coeff_costs->txb_skip_cost[txb_skip_ctx][1]; - return cost; - } - cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0]; + stats->rd = RDCOST(txb_info->rdmult, stats->rate, stats->dist); -#if CONFIG_TXK_SEL - cost += av1_tx_type_cost(cm, x, xd, mbmi->sb_type, plane, tx_size, tx_type); -#endif + stats->low_qc = get_lower_coeff(qc); -#if CONFIG_CTX1D - TX_CLASS tx_class = get_tx_class(tx_type); - if (tx_class == TX_CLASS_2D) { - cost += get_nz_eob_map_cost(coeff_costs, qcoeff, eob, plane, scan, tx_size, - tx_type); + if (is_eob && stats->low_qc == 0) { + stats->rd_low = stats->rd; // disable selection of low_qc in this case. } else { - const int width = tx_size_wide[tx_size]; - const int eob_offset = width + height; - const int eob_mode = eob > eob_offset; - cost += coeff_costs->eob_mode_cost[tx_class][eob_mode]; - if (eob_mode == 0) { - cost += get_nz_eob_map_cost(coeff_costs, qcoeff, eob, plane, scan, - tx_size, tx_type); + if (stats->low_qc == 0) { + stats->dist_low = 0; } else { - const int16_t *iscan = scan_order->iscan; - assert(tx_class == TX_CLASS_VERT || tx_class == TX_CLASS_HORIZ); - if (tx_class == TX_CLASS_VERT) - cost += get_nz_eob_map_cost_vert(coeff_costs, qcoeff, eob, plane, scan, - iscan, tx_size, tx_type); - else - cost += get_nz_eob_map_cost_horiz(coeff_costs, qcoeff, eob, plane, scan, - iscan, tx_size, tx_type); + stats->low_dqc = qcoeff_to_dqcoeff(stats->low_qc, coeff_idx, dqv, + txb_info->shift, txb_info->iqmatrix); + const int64_t low_dqc_dist = + get_coeff_dist(tqc, stats->low_dqc, txb_info->shift); + stats->dist_low = low_dqc_dist - stats->dist0; } + const int low_qc_cost = + get_coeff_cost(stats->low_qc, scan_idx, is_eob, txb_info, txb_costs, + coeff_ctx, tx_class); + stats->rate_low = low_qc_cost; + stats->rd_low = RDCOST(txb_info->rdmult, stats->rate_low, stats->dist_low); } -#else // CONFIG_CTX1D - cost += get_nz_eob_map_cost(coeff_costs, qcoeff, eob, plane, scan, tx_size, - tx_type); -#endif // CONFIG_CTX1D - - for (c = 0; c < eob; ++c) { - tran_low_t v = qcoeff[scan[c]]; - int is_nz = (v != 0); - int level = abs(v); - - if (is_nz) { - int ctx_ls[NUM_BASE_LEVELS] = { 0 }; - int sign = (v < 0) ? 
1 : 0; - - // sign bit cost - if (c == 0) { - int dc_sign_ctx = txb_ctx->dc_sign_ctx; - cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign]; - } else { - cost += av1_cost_bit(128, sign); - } - - get_base_ctx_set(qcoeff, scan[c], bwl, height, ctx_ls); - - int i; - for (i = 0; i < NUM_BASE_LEVELS; ++i) { - if (level <= i) continue; - - if (level == i + 1) { - cost += coeff_costs->base_cost[i][ctx_ls[i]][1]; - continue; - } - cost += coeff_costs->base_cost[i][ctx_ls[i]][0]; - } - - if (level > NUM_BASE_LEVELS) { - int ctx; - ctx = get_br_ctx(qcoeff, scan[c], bwl, height); -#if BR_NODE - int base_range = level - 1 - NUM_BASE_LEVELS; - if (base_range < COEFF_BASE_RANGE) { - cost += coeff_costs->lps_cost[ctx][base_range]; - } else { - cost += coeff_costs->lps_cost[ctx][COEFF_BASE_RANGE]; - } - -#else - for (int idx = 0; idx < COEFF_BASE_RANGE; ++idx) { - if (level == (idx + 1 + NUM_BASE_LEVELS)) { - cost += coeff_costs->lps_cost[ctx][1]; - break; - } - cost += coeff_costs->lps_cost[ctx][0]; - } -#endif - if (level >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) { - // residual cost - int r = level - COEFF_BASE_RANGE - NUM_BASE_LEVELS; - int ri = r; - int length = 0; - - while (ri) { - ri >>= 1; - ++length; - } - - for (ri = 0; ri < length - 1; ++ri) cost += av1_cost_bit(128, 0); +} - for (ri = length - 1; ri >= 0; --ri) - cost += av1_cost_bit(128, (r >> ri) & 0x01); - } - } - } - } +static void get_dist_cost_stats_with_eob( + LevelDownStats *const stats, const int scan_idx, + const LV_MAP_COEFF_COST *const txb_costs, const TxbInfo *const txb_info, + const TX_CLASS tx_class) { + const int is_eob = 0; + get_dist_cost_stats(stats, scan_idx, is_eob, txb_costs, txb_info, tx_class); - return cost; + const int16_t *const scan = txb_info->scan_order->scan; + const int coeff_idx = scan[scan_idx]; + const tran_low_t qc = txb_info->qcoeff[coeff_idx]; + const int coeff_ctx_temp = get_nz_map_ctx( + txb_info->levels, coeff_idx, txb_info->bwl, txb_info->height, scan_idx, 1, + txb_info->tx_size, tx_class); + const int qc_eob_cost = get_coeff_cost(qc, scan_idx, 1, txb_info, txb_costs, + coeff_ctx_temp, tx_class); + int64_t rd_eob = RDCOST(txb_info->rdmult, qc_eob_cost, stats->dist); + if (stats->low_qc != 0) { + const int low_qc_eob_cost = + get_coeff_cost(stats->low_qc, scan_idx, 1, txb_info, txb_costs, + coeff_ctx_temp, tx_class); + int64_t rd_eob_low = + RDCOST(txb_info->rdmult, low_qc_eob_cost, stats->dist_low); + rd_eob = (rd_eob > rd_eob_low) ? rd_eob_low : rd_eob; + } + + stats->nz_rd = AOMMIN(stats->rd_low, stats->rd) - rd_eob; } -static INLINE int has_base(tran_low_t qc, int base_idx) { - const int level = base_idx + 1; - return abs(qc) >= level; +static INLINE void update_qcoeff(const int coeff_idx, const tran_low_t qc, + const TxbInfo *const txb_info) { + txb_info->qcoeff[coeff_idx] = qc; + txb_info->levels[get_padded_idx(coeff_idx, txb_info->bwl)] = + (uint8_t)clamp(abs(qc), 0, INT8_MAX); } -static INLINE int has_br(tran_low_t qc) { - return abs(qc) >= 1 + NUM_BASE_LEVELS; +static INLINE void update_coeff(const int coeff_idx, const tran_low_t qc, + const TxbInfo *const txb_info) { + update_qcoeff(coeff_idx, qc, txb_info); + const int dqv = txb_info->dequant[coeff_idx != 0]; + txb_info->dqcoeff[coeff_idx] = qcoeff_to_dqcoeff( + qc, coeff_idx, dqv, txb_info->shift, txb_info->iqmatrix); } -static INLINE int get_sign_bit_cost(tran_low_t qc, int coeff_idx, - const int (*dc_sign_cost)[2], - int dc_sign_ctx) { - const int sign = (qc < 0) ? 
1 : 0; - // sign bit cost - if (coeff_idx == 0) { - return dc_sign_cost[dc_sign_ctx][sign]; - } else { - return av1_cost_bit(128, sign); - } -} -static INLINE int get_golomb_cost(int abs_qc) { - if (abs_qc >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) { - // residual cost - int r = abs_qc - COEFF_BASE_RANGE - NUM_BASE_LEVELS; - int ri = r; - int length = 0; - - while (ri) { - ri >>= 1; - ++length; - } +void av1_txb_init_levels_c(const tran_low_t *const coeff, const int width, + const int height, uint8_t *const levels) { + const int stride = width + TX_PAD_HOR; + uint8_t *ls = levels; - return av1_cost_literal(2 * length - 1); - } else { - return 0; - } -} + memset(levels - TX_PAD_TOP * stride, 0, + sizeof(*levels) * TX_PAD_TOP * stride); + memset(levels + stride * height, 0, + sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END)); -void gen_txb_cache(TxbCache *txb_cache, TxbInfo *txb_info) { - // gen_nz_count_arr - const int16_t *scan = txb_info->scan_order->scan; - const int bwl = txb_info->bwl; - const int height = txb_info->height; - tran_low_t *qcoeff = txb_info->qcoeff; - const BASE_CTX_TABLE *base_ctx_table = - txb_info->coeff_ctx_table->base_ctx_table; - for (int c = 0; c < txb_info->eob; ++c) { - const int coeff_idx = scan[c]; // raster order - const int row = coeff_idx >> bwl; - const int col = coeff_idx - (row << bwl); -#if REDUCE_CONTEXT_DEPENDENCY - int prev_coeff_idx; - int prev_row; - int prev_col; - if (c > MIN_SCAN_IDX_REDUCE_CONTEXT_DEPENDENCY) { - prev_coeff_idx = scan[c - 1]; // raster order - prev_row = prev_coeff_idx >> bwl; - prev_col = prev_coeff_idx - (prev_row << bwl); - } else { - prev_coeff_idx = -1; - prev_row = -1; - prev_col = -1; + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + *ls++ = (uint8_t)clamp(abs(coeff[i * width + j]), 0, INT8_MAX); } - txb_cache->nz_count_arr[coeff_idx] = - get_nz_count(qcoeff, bwl, height, row, col, prev_row, prev_col); -#else - txb_cache->nz_count_arr[coeff_idx] = - get_nz_count(qcoeff, bwl, height, row, col); -#endif - const int nz_count = txb_cache->nz_count_arr[coeff_idx]; - txb_cache->nz_ctx_arr[coeff_idx] = - get_nz_map_ctx_from_count(nz_count, coeff_idx, bwl, txb_info->tx_type); - - // gen_base_count_mag_arr - if (!has_base(qcoeff[coeff_idx], 0)) continue; - int *base_mag = txb_cache->base_mag_arr[coeff_idx]; - int count[NUM_BASE_LEVELS]; - get_base_count_mag(base_mag, count, qcoeff, bwl, height, row, col); - - for (int i = 0; i < NUM_BASE_LEVELS; ++i) { - if (!has_base(qcoeff[coeff_idx], i)) break; - txb_cache->base_count_arr[i][coeff_idx] = count[i]; - const int level = i + 1; - txb_cache->base_ctx_arr[i][coeff_idx] = - base_ctx_table[row != 0][col != 0][base_mag[0] > level][count[i]]; + for (int j = 0; j < TX_PAD_HOR; j++) { + *ls++ = 0; } - - // gen_br_count_mag_arr - if (!has_br(qcoeff[coeff_idx])) continue; - int *br_count = txb_cache->br_count_arr + coeff_idx; - int *br_mag = txb_cache->br_mag_arr[coeff_idx]; - *br_count = get_br_count_mag(br_mag, qcoeff, bwl, height, row, col, - NUM_BASE_LEVELS); - txb_cache->br_ctx_arr[coeff_idx] = - get_br_ctx_from_count_mag(row, col, *br_count, br_mag[0]); - } -} - -static INLINE const int *get_level_prob(int level, int coeff_idx, - const TxbCache *txb_cache, - const LV_MAP_COEFF_COST *txb_costs) { - if (level == 0) { - const int ctx = txb_cache->nz_ctx_arr[coeff_idx]; - return txb_costs->nz_map_cost[ctx]; - } else if (level >= 1 && level < 1 + NUM_BASE_LEVELS) { - const int idx = level - 1; - const int ctx = txb_cache->base_ctx_arr[idx][coeff_idx]; - 
return txb_costs->base_cost[idx][ctx]; - } else if (level >= 1 + NUM_BASE_LEVELS && - level < 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) { - const int ctx = txb_cache->br_ctx_arr[coeff_idx]; - return txb_costs->lps_cost[ctx]; - } else if (level >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) { - printf("get_level_prob does not support golomb\n"); - assert(0); - return 0; - } else { - assert(0); - return 0; } } -static INLINE tran_low_t get_lower_coeff(tran_low_t qc) { - if (qc == 0) { - return 0; +void av1_get_nz_map_contexts_c(const uint8_t *const levels, + const int16_t *const scan, const uint16_t eob, + const TX_SIZE tx_size, const TX_CLASS tx_class, + int8_t *const coeff_contexts) { + const int bwl = get_txb_bwl(tx_size); + const int height = get_txb_high(tx_size); + for (int i = 0; i < eob; ++i) { + const int pos = scan[i]; + coeff_contexts[pos] = get_nz_map_ctx(levels, pos, bwl, height, i, + i == eob - 1, tx_size, tx_class); } - return qc > 0 ? qc - 1 : qc + 1; } -static INLINE void update_mag_arr(int *mag_arr, int abs_qc) { - if (mag_arr[0] == abs_qc) { - mag_arr[1] -= 1; - assert(mag_arr[1] >= 0); - } -} +void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd, + aom_writer *w, int blk_row, int blk_col, int plane, + TX_SIZE tx_size, const tran_low_t *tcoeff, + uint16_t eob, TXB_CTX *txb_ctx) { + const PLANE_TYPE plane_type = get_plane_type(plane); + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, + tx_size, cm->reduced_tx_set_used); + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + const int16_t *const scan = scan_order->scan; + int c; + const int bwl = get_txb_bwl(tx_size); + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, width); + DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); -static INLINE int get_mag_from_mag_arr(const int *mag_arr) { - int mag; - if (mag_arr[1] > 0) { - mag = mag_arr[0]; - } else if (mag_arr[0] > 0) { - mag = mag_arr[0] - 1; - } else { - // no neighbor - assert(mag_arr[0] == 0 && mag_arr[1] == 0); - mag = 0; + aom_write_symbol(w, eob == 0, + ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2); + if (plane == 0 && eob == 0) { + assert(tx_type == DCT_DCT); } - return mag; -} + if (eob == 0) return; -static int neighbor_level_down_update(int *new_count, int *new_mag, int count, - const int *mag, int coeff_idx, - tran_low_t abs_nb_coeff, int nb_coeff_idx, - int level, const TxbInfo *txb_info) { - *new_count = count; - *new_mag = get_mag_from_mag_arr(mag); + av1_txb_init_levels(tcoeff, width, height, levels); - int update = 0; - // check if br_count changes - if (abs_nb_coeff == level) { - update = 1; - *new_count -= 1; - assert(*new_count >= 0); - } - const int row = coeff_idx >> txb_info->bwl; - const int col = coeff_idx - (row << txb_info->bwl); - const int nb_row = nb_coeff_idx >> txb_info->bwl; - const int nb_col = nb_coeff_idx - (nb_row << txb_info->bwl); - - // check if mag changes - if (nb_row >= row && nb_col >= col) { - if (abs_nb_coeff == mag[0]) { - assert(mag[1] > 0); - if (mag[1] == 1) { - // the nb is the only qc with max mag - *new_mag -= 1; - assert(*new_mag >= 0); - update = 1; - } - } - } - return update; -} + av1_write_tx_type(cm, xd, blk_row, blk_col, plane, tx_size, w); -static int 
try_neighbor_level_down_br(int coeff_idx, int nb_coeff_idx, - const TxbCache *txb_cache, - const LV_MAP_COEFF_COST *txb_costs, - const TxbInfo *txb_info) { - const tran_low_t qc = txb_info->qcoeff[coeff_idx]; - const tran_low_t abs_qc = abs(qc); - const int level = NUM_BASE_LEVELS + 1; - if (abs_qc < level) return 0; - - const tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx]; - const tran_low_t abs_nb_coeff = abs(nb_coeff); - const int count = txb_cache->br_count_arr[coeff_idx]; - const int *mag = txb_cache->br_mag_arr[coeff_idx]; - int new_count; - int new_mag; - const int update = - neighbor_level_down_update(&new_count, &new_mag, count, mag, coeff_idx, - abs_nb_coeff, nb_coeff_idx, level, txb_info); - if (update) { - const int row = coeff_idx >> txb_info->bwl; - const int col = coeff_idx - (row << txb_info->bwl); - const int ctx = txb_cache->br_ctx_arr[coeff_idx]; - const int org_cost = get_br_cost(abs_qc, ctx, txb_costs->lps_cost[ctx]); - - const int new_ctx = get_br_ctx_from_count_mag(row, col, new_count, new_mag); - const int new_cost = - get_br_cost(abs_qc, new_ctx, txb_costs->lps_cost[new_ctx]); - const int cost_diff = -org_cost + new_cost; - return cost_diff; - } else { - return 0; + int eob_extra; + const int eob_pt = get_eob_pos_token(eob, &eob_extra); + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1; + switch (eob_multi_size) { + case 0: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf16[plane_type][eob_multi_ctx], 5); + break; + case 1: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf32[plane_type][eob_multi_ctx], 6); + break; + case 2: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf64[plane_type][eob_multi_ctx], 7); + break; + case 3: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf128[plane_type][eob_multi_ctx], 8); + break; + case 4: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf256[plane_type][eob_multi_ctx], 9); + break; + case 5: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf512[plane_type][eob_multi_ctx], 10); + break; + default: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf1024[plane_type][eob_multi_ctx], 11); + break; } -} -static int try_neighbor_level_down_base(int coeff_idx, int nb_coeff_idx, - const TxbCache *txb_cache, - const LV_MAP_COEFF_COST *txb_costs, - const TxbInfo *txb_info) { - const tran_low_t qc = txb_info->qcoeff[coeff_idx]; - const tran_low_t abs_qc = abs(qc); - const BASE_CTX_TABLE *base_ctx_table = - txb_info->coeff_ctx_table->base_ctx_table; - - int cost_diff = 0; - for (int base_idx = 0; base_idx < NUM_BASE_LEVELS; ++base_idx) { - const int level = base_idx + 1; - if (abs_qc < level) continue; - - const tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx]; - const tran_low_t abs_nb_coeff = abs(nb_coeff); - - const int count = txb_cache->base_count_arr[base_idx][coeff_idx]; - const int *mag = txb_cache->base_mag_arr[coeff_idx]; - int new_count; - int new_mag; - const int update = - neighbor_level_down_update(&new_count, &new_mag, count, mag, coeff_idx, - abs_nb_coeff, nb_coeff_idx, level, txb_info); - if (update) { - const int row = coeff_idx >> txb_info->bwl; - const int col = coeff_idx - (row << txb_info->bwl); - const int ctx = txb_cache->base_ctx_arr[base_idx][coeff_idx]; - const int org_cost = get_base_cost( - abs_qc, ctx, txb_costs->base_cost[base_idx][ctx], base_idx); - - const int new_ctx = - base_ctx_table[row != 0][col != 0][new_mag > level][new_count]; - const int new_cost = get_base_cost( - abs_qc, 
new_ctx, txb_costs->base_cost[base_idx][new_ctx], base_idx); - cost_diff += -org_cost + new_cost; + if (k_eob_offset_bits[eob_pt] > 0) { + const int eob_ctx = eob_pt - 3; + int eob_shift = k_eob_offset_bits[eob_pt] - 1; + int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0; + aom_write_symbol(w, bit, + ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2); + for (int i = 1; i < k_eob_offset_bits[eob_pt]; i++) { + eob_shift = k_eob_offset_bits[eob_pt] - 1 - i; + bit = (eob_extra & (1 << eob_shift)) ? 1 : 0; + aom_write_bit(w, bit); } } - return cost_diff; -} -static int try_neighbor_level_down_nz(int coeff_idx, int nb_coeff_idx, - const TxbCache *txb_cache, - const LV_MAP_COEFF_COST *txb_costs, - TxbInfo *txb_info) { - // assume eob doesn't change - const tran_low_t qc = txb_info->qcoeff[coeff_idx]; - const tran_low_t abs_qc = abs(qc); - const tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx]; - const tran_low_t abs_nb_coeff = abs(nb_coeff); - if (abs_nb_coeff != 1) return 0; - const int16_t *iscan = txb_info->scan_order->iscan; - const int scan_idx = iscan[coeff_idx]; - if (scan_idx == txb_info->seg_eob) return 0; - const int nb_scan_idx = iscan[nb_coeff_idx]; - if (nb_scan_idx < scan_idx) { - const int count = txb_cache->nz_count_arr[coeff_idx]; - assert(count > 0); - txb_info->qcoeff[nb_coeff_idx] = get_lower_coeff(nb_coeff); - const int new_ctx = get_nz_map_ctx_from_count( - count - 1, coeff_idx, txb_info->bwl, txb_info->tx_type); - txb_info->qcoeff[nb_coeff_idx] = nb_coeff; - const int ctx = txb_cache->nz_ctx_arr[coeff_idx]; - const int is_nz = abs_qc > 0; - const int org_cost = txb_costs->nz_map_cost[ctx][is_nz]; - const int new_cost = txb_costs->nz_map_cost[new_ctx][is_nz]; - const int cost_diff = new_cost - org_cost; - return cost_diff; - } else { - return 0; - } -} + av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts); -static int try_self_level_down(tran_low_t *low_coeff, int coeff_idx, - const TxbCache *txb_cache, - const LV_MAP_COEFF_COST *txb_costs, - TxbInfo *txb_info) { - const tran_low_t qc = txb_info->qcoeff[coeff_idx]; - if (qc == 0) { - *low_coeff = 0; - return 0; - } - const tran_low_t abs_qc = abs(qc); - *low_coeff = get_lower_coeff(qc); - int cost_diff; - if (*low_coeff == 0) { - const int scan_idx = txb_info->scan_order->iscan[coeff_idx]; - const int *level_cost = - get_level_prob(abs_qc, coeff_idx, txb_cache, txb_costs); - const int *low_level_cost = - get_level_prob(abs(*low_coeff), coeff_idx, txb_cache, txb_costs); - if (scan_idx < txb_info->seg_eob) { - // When level-0, we code the binary of abs_qc > level - // but when level-k k > 0 we code the binary of abs_qc == level - // That's why wee need this special treatment for level-0 map - // TODO(angiebird): make leve-0 consistent to other levels - cost_diff = -level_cost[1] + low_level_cost[0] - low_level_cost[1]; + for (c = eob - 1; c >= 0; --c) { + const int pos = scan[c]; + const int coeff_ctx = coeff_contexts[pos]; + const tran_low_t v = tcoeff[pos]; + const tran_low_t level = abs(v); + + if (c == eob - 1) { + aom_write_symbol( + w, AOMMIN(level, 3) - 1, + ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx], 3); } else { - cost_diff = -level_cost[1]; + aom_write_symbol(w, AOMMIN(level, 3), + ec_ctx->coeff_base_cdf[txs_ctx][plane_type][coeff_ctx], + 4); } - - if (scan_idx < txb_info->seg_eob) { - const int eob_ctx = get_eob_ctx(txb_info->qcoeff, coeff_idx, - txb_info->txs_ctx, txb_info->tx_type); - cost_diff -= - txb_costs->eob_cost[eob_ctx][scan_idx == (txb_info->eob - 1)]; + 
if (level > NUM_BASE_LEVELS) { + // level is above 1. + const int base_range = level - 1 - NUM_BASE_LEVELS; + const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class); + for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { + const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1); + aom_write_symbol( + w, k, + ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx], + BR_CDF_SIZE); + if (k < BR_CDF_SIZE - 1) break; + } } + } - const int sign_cost = get_sign_bit_cost( - qc, coeff_idx, txb_costs->dc_sign_cost, txb_info->txb_ctx->dc_sign_ctx); - cost_diff -= sign_cost; - } else if (abs_qc <= NUM_BASE_LEVELS) { - const int *level_cost = - get_level_prob(abs_qc, coeff_idx, txb_cache, txb_costs); - const int *low_level_cost = - get_level_prob(abs(*low_coeff), coeff_idx, txb_cache, txb_costs); - cost_diff = -level_cost[1] + low_level_cost[1] - low_level_cost[0]; - } else if (abs_qc == NUM_BASE_LEVELS + 1) { - const int *level_cost = - get_level_prob(abs_qc, coeff_idx, txb_cache, txb_costs); - const int *low_level_cost = - get_level_prob(abs(*low_coeff), coeff_idx, txb_cache, txb_costs); -#if BR_NODE - cost_diff = -level_cost[0] + low_level_cost[1] - low_level_cost[0]; -#else - cost_diff = -level_cost[1] + low_level_cost[1] - low_level_cost[0]; -#endif - } else if (abs_qc < 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) { - const int *level_cost = - get_level_prob(abs_qc, coeff_idx, txb_cache, txb_costs); - const int *low_level_cost = - get_level_prob(abs(*low_coeff), coeff_idx, txb_cache, txb_costs); - -#if BR_NODE - cost_diff = -level_cost[abs_qc - 1 - NUM_BASE_LEVELS] + - low_level_cost[abs(*low_coeff) - 1 - NUM_BASE_LEVELS]; -#else - cost_diff = -level_cost[1] + low_level_cost[1] - low_level_cost[0]; -#endif - } else if (abs_qc == 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) { - const int *low_level_cost = - get_level_prob(abs(*low_coeff), coeff_idx, txb_cache, txb_costs); -#if BR_NODE - cost_diff = -get_golomb_cost(abs_qc) - low_level_cost[COEFF_BASE_RANGE] + - low_level_cost[COEFF_BASE_RANGE - 1]; -#else - cost_diff = - -get_golomb_cost(abs_qc) + low_level_cost[1] - low_level_cost[0]; -#endif - } else { - assert(abs_qc > 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE); - const tran_low_t abs_low_coeff = abs(*low_coeff); - cost_diff = -get_golomb_cost(abs_qc) + get_golomb_cost(abs_low_coeff); + // Loop to code all signs in the transform block, + // starting with the sign of DC (if applicable) + for (c = 0; c < eob; ++c) { + const tran_low_t v = tcoeff[scan[c]]; + const tran_low_t level = abs(v); + const int sign = (v < 0) ? 
1 : 0; + if (level) { + if (c == 0) { + aom_write_symbol( + w, sign, ec_ctx->dc_sign_cdf[plane_type][txb_ctx->dc_sign_ctx], 2); + } else { + aom_write_bit(w, sign); + } + if (level > COEFF_BASE_RANGE + NUM_BASE_LEVELS) + write_golomb(w, level - COEFF_BASE_RANGE - 1 - NUM_BASE_LEVELS); + } } - return cost_diff; } -#define COST_MAP_SIZE 5 -#define COST_MAP_OFFSET 2 +typedef struct encode_txb_args { + const AV1_COMMON *cm; + MACROBLOCK *x; + aom_writer *w; +} ENCODE_TXB_ARGS; -static INLINE int check_nz_neighbor(tran_low_t qc) { return abs(qc) == 1; } - -static INLINE int check_base_neighbor(tran_low_t qc) { - return abs(qc) <= 1 + NUM_BASE_LEVELS; +static void write_coeffs_txb_wrap(const AV1_COMMON *cm, MACROBLOCK *x, + aom_writer *w, int plane, int block, + int blk_row, int blk_col, TX_SIZE tx_size) { + MACROBLOCKD *xd = &x->e_mbd; + tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block); + uint16_t eob = x->mbmi_ext->eobs[plane][block]; + TXB_CTX txb_ctx = { x->mbmi_ext->txb_skip_ctx[plane][block], + x->mbmi_ext->dc_sign_ctx[plane][block] }; + av1_write_coeffs_txb(cm, xd, w, blk_row, blk_col, plane, tx_size, tcoeff, eob, + &txb_ctx); } -static INLINE int check_br_neighbor(tran_low_t qc) { - return abs(qc) > BR_MAG_OFFSET; +void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, int mi_row, + int mi_col, aom_writer *w, BLOCK_SIZE bsize) { + MACROBLOCKD *xd = &x->e_mbd; + const int num_planes = av1_num_planes(cm); + int block[MAX_MB_PLANE] = { 0 }; + int row, col; + assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x, + xd->plane[0].subsampling_y)); + const int max_blocks_wide = max_block_wide(xd, bsize, 0); + const int max_blocks_high = max_block_high(xd, bsize, 0); + const BLOCK_SIZE max_unit_bsize = BLOCK_64X64; + int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0]; + int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0]; + mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide); + mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high); + + for (row = 0; row < max_blocks_high; row += mu_blocks_high) { + for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) { + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, + pd->subsampling_y)) + continue; + const TX_SIZE tx_size = av1_get_tx_size(plane, xd); + const int stepr = tx_size_high_unit[tx_size]; + const int stepc = tx_size_wide_unit[tx_size]; + const int step = stepr * stepc; + + const int unit_height = ROUND_POWER_OF_TWO( + AOMMIN(mu_blocks_high + row, max_blocks_high), pd->subsampling_y); + const int unit_width = ROUND_POWER_OF_TWO( + AOMMIN(mu_blocks_wide + col, max_blocks_wide), pd->subsampling_x); + for (int blk_row = row >> pd->subsampling_y; blk_row < unit_height; + blk_row += stepr) { + for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width; + blk_col += stepc) { + write_coeffs_txb_wrap(cm, x, w, plane, block[plane], blk_row, + blk_col, tx_size); + block[plane] += step; + } + } + } + } + } } -#define FAST_OPTIMIZE_TXB 1 +// TODO(angiebird): use this function whenever it's possible +static int get_tx_type_cost(const AV1_COMMON *cm, const MACROBLOCK *x, + const MACROBLOCKD *xd, int plane, TX_SIZE tx_size, + TX_TYPE tx_type) { + if (plane > 0) return 0; -#if FAST_OPTIMIZE_TXB -#define ALNB_REF_OFFSET_NUM 2 -static int alnb_ref_offset[ALNB_REF_OFFSET_NUM][2] = { - { -1, 0 }, { 0, -1 }, -}; 
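This table, and nb_ref_offset just below it, are the fast-mode counterparts of the full sig_ref_offset, base_ref_offset, and br_ref_offset tables used elsewhere in this file: {row, col} deltas that try_level_down() subtracts from the current position, skipping taps that fall outside the block. A minimal sketch of that access pattern on a raster-order qcoeff array follows; count_nonzero_neighbors() is an illustrative helper, not aom API:

int count_nonzero_neighbors(const int *qcoeff, int coeff_idx, int bwl,
                            int height, const int (*offsets)[2],
                            int num_offsets) {
  const int stride = 1 << bwl;      /* row length, as in TxbInfo */
  const int row = coeff_idx >> bwl; /* raster index -> 2D position */
  const int col = coeff_idx - (row << bwl);
  int count = 0;
  for (int i = 0; i < num_offsets; ++i) {
    const int nb_row = row - offsets[i][0]; /* deltas are subtracted */
    const int nb_col = col - offsets[i][1];
    if (nb_row < 0 || nb_col < 0 || nb_row >= height || nb_col >= stride)
      continue; /* same bounds check as try_level_down() */
    count += (qcoeff[nb_row * stride + nb_col] != 0);
  }
  return count;
}

Passing alnb_ref_offset walks two taps and nb_ref_offset four, against the larger sig/base/br tables otherwise; examining fewer context taps per coefficient is the entire content of fast mode here.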
-#define NB_REF_OFFSET_NUM 4 -static int nb_ref_offset[NB_REF_OFFSET_NUM][2] = { - { -1, 0 }, { 0, -1 }, { 1, 0 }, { 0, 1 }, -}; -#endif // FAST_OPTIMIZE_TXB + const TX_SIZE square_tx_size = txsize_sqr_map[tx_size]; -// TODO(angiebird): add static to this function once it's called -int try_level_down(int coeff_idx, const TxbCache *txb_cache, - const LV_MAP_COEFF_COST *txb_costs, TxbInfo *txb_info, - int (*cost_map)[COST_MAP_SIZE], int fast_mode) { -#if !FAST_OPTIMIZE_TXB - (void)fast_mode; -#endif - if (cost_map) { - for (int i = 0; i < COST_MAP_SIZE; ++i) av1_zero(cost_map[i]); - } - - tran_low_t qc = txb_info->qcoeff[coeff_idx]; - tran_low_t low_coeff; - if (qc == 0) return 0; - int accu_cost_diff = 0; - - const int16_t *iscan = txb_info->scan_order->iscan; - const int eob = txb_info->eob; - const int scan_idx = iscan[coeff_idx]; - if (scan_idx < eob) { - const int cost_diff = try_self_level_down(&low_coeff, coeff_idx, txb_cache, - txb_costs, txb_info); - if (cost_map) - cost_map[0 + COST_MAP_OFFSET][0 + COST_MAP_OFFSET] = cost_diff; - accu_cost_diff += cost_diff; - } - - const int row = coeff_idx >> txb_info->bwl; - const int col = coeff_idx - (row << txb_info->bwl); - if (check_nz_neighbor(qc)) { -#if FAST_OPTIMIZE_TXB - int(*ref_offset)[2]; - int ref_num; - if (fast_mode) { - ref_offset = alnb_ref_offset; - ref_num = ALNB_REF_OFFSET_NUM; + const MB_MODE_INFO *mbmi = xd->mi[0]; + const int is_inter = is_inter_block(mbmi); + if (get_ext_tx_types(tx_size, is_inter, cm->reduced_tx_set_used) > 1 && + !xd->lossless[xd->mi[0]->segment_id]) { + const int ext_tx_set = + get_ext_tx_set(tx_size, is_inter, cm->reduced_tx_set_used); + if (is_inter) { + if (ext_tx_set > 0) + return x->inter_tx_type_costs[ext_tx_set][square_tx_size][tx_type]; } else { - ref_offset = sig_ref_offset; - ref_num = SIG_REF_OFFSET_NUM; - } -#else - int(*ref_offset)[2] = sig_ref_offset; - const int ref_num = SIG_REF_OFFSET_NUM; -#endif - for (int i = 0; i < ref_num; ++i) { - const int nb_row = row - ref_offset[i][0]; - const int nb_col = col - ref_offset[i][1]; - const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; - - if (nb_row < 0 || nb_col < 0 || nb_row >= txb_info->height || - nb_col >= txb_info->stride) - continue; - - const int nb_scan_idx = iscan[nb_coeff_idx]; - if (nb_scan_idx < eob) { - const int cost_diff = try_neighbor_level_down_nz( - nb_coeff_idx, coeff_idx, txb_cache, txb_costs, txb_info); - if (cost_map) - cost_map[nb_row - row + COST_MAP_OFFSET] - [nb_col - col + COST_MAP_OFFSET] += cost_diff; - accu_cost_diff += cost_diff; + if (ext_tx_set > 0) { + PREDICTION_MODE intra_dir; + if (mbmi->filter_intra_mode_info.use_filter_intra) + intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info + .filter_intra_mode]; + else + intra_dir = mbmi->mode; + return x->intra_tx_type_costs[ext_tx_set][square_tx_size][intra_dir] + [tx_type]; } } } + return 0; +} - if (check_base_neighbor(qc)) { -#if FAST_OPTIMIZE_TXB - int(*ref_offset)[2]; - int ref_num; - if (fast_mode) { - ref_offset = nb_ref_offset; - ref_num = NB_REF_OFFSET_NUM; - } else { - ref_offset = base_ref_offset; - ref_num = BASE_CONTEXT_POSITION_NUM; - } -#else - int(*ref_offset)[2] = base_ref_offset; - int ref_num = BASE_CONTEXT_POSITION_NUM; -#endif - for (int i = 0; i < ref_num; ++i) { - const int nb_row = row - ref_offset[i][0]; - const int nb_col = col - ref_offset[i][1]; - const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; - - if (nb_row < 0 || nb_col < 0 || nb_row >= txb_info->height || - nb_col >= txb_info->stride) - continue; - - 
const int nb_scan_idx = iscan[nb_coeff_idx]; - if (nb_scan_idx < eob) { - const int cost_diff = try_neighbor_level_down_base( - nb_coeff_idx, coeff_idx, txb_cache, txb_costs, txb_info); - if (cost_map) - cost_map[nb_row - row + COST_MAP_OFFSET] - [nb_col - col + COST_MAP_OFFSET] += cost_diff; - accu_cost_diff += cost_diff; +static AOM_FORCE_INLINE int warehouse_efficients_txb( + const AV1_COMMON *const cm, const MACROBLOCK *x, const int plane, + const int block, const TX_SIZE tx_size, const TXB_CTX *const txb_ctx, + const struct macroblock_plane *p, const int eob, + const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs, + const MACROBLOCKD *const xd, const TX_TYPE tx_type, + const TX_CLASS tx_class) { + const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + const int txb_skip_ctx = txb_ctx->txb_skip_ctx; + const int bwl = get_txb_bwl(tx_size); + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + const int16_t *const scan = scan_order->scan; + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, width); + DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const LV_MAP_EOB_COST *const eob_costs = + &x->eob_costs[eob_multi_size][plane_type]; + int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0]; + + av1_txb_init_levels(qcoeff, width, height, levels); + + cost += get_tx_type_cost(cm, x, xd, plane, tx_size, tx_type); + + cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class); + + av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts); + + const int(*lps_cost)[COEFF_BASE_RANGE + 1] = coeff_costs->lps_cost; + int c = eob - 1; + { + const int pos = scan[c]; + const tran_low_t v = qcoeff[pos]; + const int sign = v >> 31; + const int level = (v ^ sign) - sign; + const int coeff_ctx = coeff_contexts[pos]; + cost += coeff_costs->base_eob_cost[coeff_ctx][AOMMIN(level, 3) - 1]; + + if (v) { + // sign bit cost + if (level > NUM_BASE_LEVELS) { + const int ctx = get_br_ctx(levels, pos, bwl, tx_class); + const int base_range = + AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE); + cost += lps_cost[ctx][base_range]; + cost += get_golomb_cost(level); + } + if (c) { + cost += av1_cost_literal(1); + } else { + const int sign01 = (sign ^ sign) - sign; + const int dc_sign_ctx = txb_ctx->dc_sign_ctx; + cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01]; + return cost; } } } - - if (check_br_neighbor(qc)) { -#if FAST_OPTIMIZE_TXB - int(*ref_offset)[2]; - int ref_num; - if (fast_mode) { - ref_offset = nb_ref_offset; - ref_num = NB_REF_OFFSET_NUM; - } else { - ref_offset = br_ref_offset; - ref_num = BR_CONTEXT_POSITION_NUM; - } -#else - int(*ref_offset)[2] = br_ref_offset; - const int ref_num = BR_CONTEXT_POSITION_NUM; -#endif - for (int i = 0; i < ref_num; ++i) { - const int nb_row = row - ref_offset[i][0]; - const int nb_col = col - ref_offset[i][1]; - const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; - - if (nb_row < 0 || nb_col < 0 || nb_row >= txb_info->height || - nb_col >= txb_info->stride) - continue; - - const int nb_scan_idx = iscan[nb_coeff_idx]; - if (nb_scan_idx < eob) { - const int cost_diff = try_neighbor_level_down_br( - nb_coeff_idx, coeff_idx, txb_cache, txb_costs, txb_info); - if (cost_map) - cost_map[nb_row - row + COST_MAP_OFFSET] - [nb_col - col + COST_MAP_OFFSET] += cost_diff; - accu_cost_diff += cost_diff; + const 
int(*base_cost)[4] = coeff_costs->base_cost; + for (c = eob - 2; c >= 1; --c) { + const int pos = scan[c]; + const int coeff_ctx = coeff_contexts[pos]; + const tran_low_t v = qcoeff[pos]; + const int level = abs(v); + const int cost0 = base_cost[coeff_ctx][AOMMIN(level, 3)]; + if (v) { + // sign bit cost + cost += av1_cost_literal(1); + if (level > NUM_BASE_LEVELS) { + const int ctx = get_br_ctx(levels, pos, bwl, tx_class); + const int base_range = + AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE); + cost += lps_cost[ctx][base_range]; + cost += get_golomb_cost(level); } } + cost += cost0; } + if (c == 0) { + const int pos = scan[c]; + const tran_low_t v = qcoeff[pos]; + const int coeff_ctx = coeff_contexts[pos]; + const int sign = v >> 31; + const int level = (v ^ sign) - sign; + cost += base_cost[coeff_ctx][AOMMIN(level, 3)]; - return accu_cost_diff; -} - -static int get_low_coeff_cost(int coeff_idx, const TxbCache *txb_cache, - const LV_MAP_COEFF_COST *txb_costs, - const TxbInfo *txb_info) { - const tran_low_t qc = txb_info->qcoeff[coeff_idx]; - const int abs_qc = abs(qc); - assert(abs_qc <= 1); - int cost = 0; - const int scan_idx = txb_info->scan_order->iscan[coeff_idx]; - if (scan_idx < txb_info->seg_eob) { - const int *level_cost = get_level_prob(0, coeff_idx, txb_cache, txb_costs); - cost += level_cost[qc != 0]; - } - - if (qc != 0) { - const int base_idx = 0; - const int ctx = txb_cache->base_ctx_arr[base_idx][coeff_idx]; - cost += get_base_cost(abs_qc, ctx, txb_costs->base_cost[base_idx][ctx], - base_idx); - if (scan_idx < txb_info->seg_eob) { - const int eob_ctx = get_eob_ctx(txb_info->qcoeff, coeff_idx, - txb_info->txs_ctx, txb_info->tx_type); - cost += txb_costs->eob_cost[eob_ctx][scan_idx == (txb_info->eob - 1)]; + if (v) { + // sign bit cost + const int sign01 = (sign ^ sign) - sign; + const int dc_sign_ctx = txb_ctx->dc_sign_ctx; + cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01]; + if (level > NUM_BASE_LEVELS) { + const int ctx = get_br_ctx(levels, pos, bwl, tx_class); + const int base_range = + AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE); + cost += lps_cost[ctx][base_range]; + cost += get_golomb_cost(level); + } } - cost += get_sign_bit_cost(qc, coeff_idx, txb_costs->dc_sign_cost, - txb_info->txb_ctx->dc_sign_ctx); } return cost; } -static INLINE void set_eob(TxbInfo *txb_info, int eob) { - txb_info->eob = eob; - txb_info->seg_eob = AOMMIN(eob, tx_size_2d[txb_info->tx_size] - 1); +int av1_cost_coeffs_txb(const AV1_COMMON *const cm, const MACROBLOCK *x, + const int plane, const int blk_row, const int blk_col, + const int block, const TX_SIZE tx_size, + const TXB_CTX *const txb_ctx) { + const struct macroblock_plane *p = &x->plane[plane]; + const int eob = p->eobs[block]; + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const PLANE_TYPE plane_type = get_plane_type(plane); + const LV_MAP_COEFF_COST *const coeff_costs = + &x->coeff_costs[txs_ctx][plane_type]; + if (eob == 0) { + return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1]; + } + + const MACROBLOCKD *const xd = &x->e_mbd; + const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, + tx_size, cm->reduced_tx_set_used); + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + +#define WAREHOUSE_EFFICIENTS_TXB_CASE(tx_class_literal) \ + case tx_class_literal: \ + return warehouse_efficients_txb(cm, x, plane, block, tx_size, txb_ctx, p, \ + eob, plane_type, coeff_costs, xd, tx_type, \ + tx_class_literal); + switch (tx_class) { + 
WAREHOUSE_EFFICIENTS_TXB_CASE(TX_CLASS_2D); + WAREHOUSE_EFFICIENTS_TXB_CASE(TX_CLASS_HORIZ); + WAREHOUSE_EFFICIENTS_TXB_CASE(TX_CLASS_VERT); +#undef WAREHOUSE_EFFICIENTS_TXB_CASE + default: assert(false); return 0; + } } -// TODO(angiebird): add static to this function once it's called -int try_change_eob(int *new_eob, int coeff_idx, const TxbCache *txb_cache, - const LV_MAP_COEFF_COST *txb_costs, TxbInfo *txb_info, - int fast_mode) { - assert(txb_info->eob > 0); - const tran_low_t qc = txb_info->qcoeff[coeff_idx]; - const int abs_qc = abs(qc); - if (abs_qc != 1) { - *new_eob = -1; - return 0; - } - const int16_t *iscan = txb_info->scan_order->iscan; - const int16_t *scan = txb_info->scan_order->scan; - const int scan_idx = iscan[coeff_idx]; - *new_eob = 0; - int cost_diff = 0; - cost_diff -= get_low_coeff_cost(coeff_idx, txb_cache, txb_costs, txb_info); - // int coeff_cost = - // get_coeff_cost(qc, scan_idx, txb_info, txb_probs); - // if (-cost_diff != coeff_cost) { - // printf("-cost_diff %d coeff_cost %d\n", -cost_diff, coeff_cost); - // get_low_coeff_cost(coeff_idx, txb_cache, txb_probs, txb_info); - // get_coeff_cost(qc, scan_idx, txb_info, txb_probs); - // } - for (int si = scan_idx - 1; si >= 0; --si) { - const int ci = scan[si]; - if (txb_info->qcoeff[ci] != 0) { - *new_eob = si + 1; - break; +static int optimize_txb(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs, + const LV_MAP_EOB_COST *txb_eob_costs, int *rate_cost) { + int update = 0; + if (txb_info->eob == 0) return update; + const int16_t *const scan = txb_info->scan_order->scan; + // forward optimize the nz_map` + const int init_eob = txb_info->eob; + const TX_CLASS tx_class = tx_type_to_class[txb_info->tx_type]; + const int eob_cost = + get_eob_cost(init_eob, txb_eob_costs, txb_costs, tx_class); + + // backward optimize the level-k map + int accu_rate = eob_cost; + int64_t accu_dist = 0; + int64_t prev_eob_rd_cost = INT64_MAX; + int64_t cur_eob_rd_cost = 0; + + { + const int si = init_eob - 1; + const int coeff_idx = scan[si]; + LevelDownStats stats; + get_dist_cost_stats(&stats, si, si == init_eob - 1, txb_costs, txb_info, + tx_class); + if ((stats.rd_low < stats.rd) && (stats.low_qc != 0)) { + update = 1; + update_coeff(coeff_idx, stats.low_qc, txb_info); + accu_rate += stats.rate_low; + accu_dist += stats.dist_low; } else { - cost_diff -= get_low_coeff_cost(ci, txb_cache, txb_costs, txb_info); + accu_rate += stats.rate; + accu_dist += stats.dist; } } - const int org_eob = txb_info->eob; - set_eob(txb_info, *new_eob); - cost_diff += try_level_down(coeff_idx, txb_cache, txb_costs, txb_info, NULL, - fast_mode); - set_eob(txb_info, org_eob); + int si = init_eob - 2; + int8_t has_nz_tail = 0; + // eob is not fixed + for (; si >= 0 && has_nz_tail < 2; --si) { + assert(si != init_eob - 1); + const int coeff_idx = scan[si]; + tran_low_t qc = txb_info->qcoeff[coeff_idx]; - if (*new_eob > 0) { - // Note that get_eob_ctx does NOT actually account for qcoeff, so we don't - // need to lower down the qcoeff here - const int eob_ctx = get_eob_ctx(txb_info->qcoeff, scan[*new_eob - 1], - txb_info->txs_ctx, txb_info->tx_type); - cost_diff -= txb_costs->eob_cost[eob_ctx][0]; - cost_diff += txb_costs->eob_cost[eob_ctx][1]; - } else { - const int txb_skip_ctx = txb_info->txb_ctx->txb_skip_ctx; - cost_diff -= txb_costs->txb_skip_cost[txb_skip_ctx][0]; - cost_diff += txb_costs->txb_skip_cost[txb_skip_ctx][1]; - } - return cost_diff; -} + if (qc == 0) { + const int coeff_ctx = + get_lower_levels_ctx(txb_info->levels, coeff_idx, 
txb_info->bwl, + txb_info->tx_size, tx_class); + accu_rate += txb_costs->base_cost[coeff_ctx][0]; + } else { + LevelDownStats stats; + get_dist_cost_stats_with_eob(&stats, si, txb_costs, txb_info, tx_class); + // check if it is better to make this the last significant coefficient + int cur_eob_rate = + get_eob_cost(si + 1, txb_eob_costs, txb_costs, tx_class); + cur_eob_rd_cost = RDCOST(txb_info->rdmult, cur_eob_rate, 0); + prev_eob_rd_cost = + RDCOST(txb_info->rdmult, accu_rate, accu_dist) + stats.nz_rd; + if (cur_eob_rd_cost <= prev_eob_rd_cost) { + update = 1; + for (int j = si + 1; j < txb_info->eob; j++) { + const int coeff_pos_j = scan[j]; + update_coeff(coeff_pos_j, 0, txb_info); + } + txb_info->eob = si + 1; + + // rerun cost calculation due to change of eob + accu_rate = cur_eob_rate; + accu_dist = 0; + get_dist_cost_stats(&stats, si, 1, txb_costs, txb_info, tx_class); + if ((stats.rd_low < stats.rd) && (stats.low_qc != 0)) { + update = 1; + update_coeff(coeff_idx, stats.low_qc, txb_info); + accu_rate += stats.rate_low; + accu_dist += stats.dist_low; + } else { + accu_rate += stats.rate; + accu_dist += stats.dist; + } -static INLINE tran_low_t qcoeff_to_dqcoeff(tran_low_t qc, int dqv, int shift) { - int sgn = qc < 0 ? -1 : 1; - return sgn * ((abs(qc) * dqv) >> shift); -} + // reset non zero tail when new eob is found + has_nz_tail = 0; + } else { + int bUpdCoeff = 0; + if (stats.rd_low < stats.rd) { + if ((si < txb_info->eob - 1)) { + bUpdCoeff = 1; + update = 1; + } + } else { + ++has_nz_tail; + } -// TODO(angiebird): add static to this function it's called -void update_level_down(int coeff_idx, TxbCache *txb_cache, TxbInfo *txb_info) { - const tran_low_t qc = txb_info->qcoeff[coeff_idx]; - const int abs_qc = abs(qc); - if (qc == 0) return; - const tran_low_t low_coeff = get_lower_coeff(qc); - txb_info->qcoeff[coeff_idx] = low_coeff; - const int dqv = txb_info->dequant[coeff_idx != 0]; - txb_info->dqcoeff[coeff_idx] = - qcoeff_to_dqcoeff(low_coeff, dqv, txb_info->shift); - - const int row = coeff_idx >> txb_info->bwl; - const int col = coeff_idx - (row << txb_info->bwl); - const int eob = txb_info->eob; - const int16_t *iscan = txb_info->scan_order->iscan; - for (int i = 0; i < SIG_REF_OFFSET_NUM; ++i) { - const int nb_row = row - sig_ref_offset[i][0]; - const int nb_col = col - sig_ref_offset[i][1]; - - if (!(nb_row >= 0 && nb_col >= 0 && nb_row < txb_info->height && - nb_col < txb_info->stride)) - continue; - - const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; - const int nb_scan_idx = iscan[nb_coeff_idx]; - if (nb_scan_idx < eob) { - const int scan_idx = iscan[coeff_idx]; - if (scan_idx < nb_scan_idx) { - const int level = 1; - if (abs_qc == level) { - txb_cache->nz_count_arr[nb_coeff_idx] -= 1; - assert(txb_cache->nz_count_arr[nb_coeff_idx] >= 0); + if (bUpdCoeff) { + update_coeff(coeff_idx, stats.low_qc, txb_info); + accu_rate += stats.rate_low; + accu_dist += stats.dist_low; + } else { + accu_rate += stats.rate; + accu_dist += stats.dist; } - const int count = txb_cache->nz_count_arr[nb_coeff_idx]; - txb_cache->nz_ctx_arr[nb_coeff_idx] = get_nz_map_ctx_from_count( - count, nb_coeff_idx, txb_info->bwl, txb_info->tx_type); - // int ref_ctx = get_nz_map_ctx(txb_info->qcoeff, nb_coeff_idx, - // txb_info->bwl, tx_type); - // if (ref_ctx != txb_cache->nz_ctx_arr[nb_coeff_idx]) - // printf("nz ctx %d ref_ctx %d\n", - // txb_cache->nz_ctx_arr[nb_coeff_idx], ref_ctx); } } - } + } // for (si) + + // eob is fixed + for (; si >= 0; --si) { + assert(si != init_eob - 1); + 
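// The loop below repeats one decision per coefficient: keep qc, or lower its
// magnitude by one when that wins in rate-distortion terms. A standalone
// sketch of that comparison with RDCOST's shape written out; the shift
// constants (AV1_PROB_COST_SHIFT == 9, RDDIV_BITS == 7) are assumptions for
// illustration, and the helper name is hypothetical.
static int prefer_lower_level_sketch(int64_t rdmult, int rate, int64_t dist,
                                     int rate_low, int64_t dist_low) {
  const int64_t rd = ((rate * rdmult + 256) >> 9) + (dist << 7);
  const int64_t rd_low = ((rate_low * rdmult + 256) >> 9) + (dist_low << 7);
  return rd_low < rd;  // nonzero: adopt the lowered coefficient
}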
const int coeff_idx = scan[si]; + tran_low_t qc = txb_info->qcoeff[coeff_idx]; + + if (qc == 0) { + const int coeff_ctx = + get_lower_levels_ctx(txb_info->levels, coeff_idx, txb_info->bwl, + txb_info->tx_size, tx_class); + accu_rate += txb_costs->base_cost[coeff_ctx][0]; + } else { + LevelDownStats stats; + get_dist_cost_stats(&stats, si, 0, txb_costs, txb_info, tx_class); - const BASE_CTX_TABLE *base_ctx_table = - txb_info->coeff_ctx_table->base_ctx_table; - for (int i = 0; i < BASE_CONTEXT_POSITION_NUM; ++i) { - const int nb_row = row - base_ref_offset[i][0]; - const int nb_col = col - base_ref_offset[i][1]; - const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; - - if (!(nb_row >= 0 && nb_col >= 0 && nb_row < txb_info->height && - nb_col < txb_info->stride)) - continue; - - const tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx]; - if (!has_base(nb_coeff, 0)) continue; - const int nb_scan_idx = iscan[nb_coeff_idx]; - if (nb_scan_idx < eob) { - if (row >= nb_row && col >= nb_col) - update_mag_arr(txb_cache->base_mag_arr[nb_coeff_idx], abs_qc); - const int mag = - get_mag_from_mag_arr(txb_cache->base_mag_arr[nb_coeff_idx]); - for (int base_idx = 0; base_idx < NUM_BASE_LEVELS; ++base_idx) { - if (!has_base(nb_coeff, base_idx)) continue; - const int level = base_idx + 1; - if (abs_qc == level) { - txb_cache->base_count_arr[base_idx][nb_coeff_idx] -= 1; - assert(txb_cache->base_count_arr[base_idx][nb_coeff_idx] >= 0); + int bUpdCoeff = 0; + if (stats.rd_low < stats.rd) { + if ((si < txb_info->eob - 1)) { + bUpdCoeff = 1; + update = 1; } - const int count = txb_cache->base_count_arr[base_idx][nb_coeff_idx]; - txb_cache->base_ctx_arr[base_idx][nb_coeff_idx] = - base_ctx_table[nb_row != 0][nb_col != 0][mag > level][count]; - // int ref_ctx = get_base_ctx(txb_info->qcoeff, nb_coeff_idx, - // txb_info->bwl, level); - // if (ref_ctx != txb_cache->base_ctx_arr[base_idx][nb_coeff_idx]) { - // printf("base ctx %d ref_ctx %d\n", - // txb_cache->base_ctx_arr[base_idx][nb_coeff_idx], ref_ctx); - // } + } + if (bUpdCoeff) { + update_coeff(coeff_idx, stats.low_qc, txb_info); + accu_rate += stats.rate_low; + accu_dist += stats.dist_low; + } else { + accu_rate += stats.rate; + accu_dist += stats.dist; } } - } - - for (int i = 0; i < BR_CONTEXT_POSITION_NUM; ++i) { - const int nb_row = row - br_ref_offset[i][0]; - const int nb_col = col - br_ref_offset[i][1]; - const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; + } // for (si) - if (!(nb_row >= 0 && nb_col >= 0 && nb_row < txb_info->height && - nb_col < txb_info->stride)) - continue; + int non_zero_blk_rate = + txb_costs->txb_skip_cost[txb_info->txb_ctx->txb_skip_ctx][0]; + prev_eob_rd_cost = + RDCOST(txb_info->rdmult, accu_rate + non_zero_blk_rate, accu_dist); - const int nb_scan_idx = iscan[nb_coeff_idx]; - const tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx]; - if (!has_br(nb_coeff)) continue; - if (nb_scan_idx < eob) { - const int level = 1 + NUM_BASE_LEVELS; - if (abs_qc == level) { - txb_cache->br_count_arr[nb_coeff_idx] -= 1; - assert(txb_cache->br_count_arr[nb_coeff_idx] >= 0); - } - if (row >= nb_row && col >= nb_col) - update_mag_arr(txb_cache->br_mag_arr[nb_coeff_idx], abs_qc); - const int count = txb_cache->br_count_arr[nb_coeff_idx]; - const int mag = get_mag_from_mag_arr(txb_cache->br_mag_arr[nb_coeff_idx]); - txb_cache->br_ctx_arr[nb_coeff_idx] = - get_br_ctx_from_count_mag(nb_row, nb_col, count, mag); - // int ref_ctx = get_level_ctx(txb_info->qcoeff, nb_coeff_idx, - // txb_info->bwl); - // if (ref_ctx != 
txb_cache->br_ctx_arr[nb_coeff_idx]) { - // printf("base ctx %d ref_ctx %d\n", - // txb_cache->br_ctx_arr[nb_coeff_idx], ref_ctx); - // } + int zero_blk_rate = + txb_costs->txb_skip_cost[txb_info->txb_ctx->txb_skip_ctx][1]; + int64_t zero_blk_rd_cost = RDCOST(txb_info->rdmult, zero_blk_rate, 0); + if (zero_blk_rd_cost <= prev_eob_rd_cost) { + update = 1; + for (int j = 0; j < txb_info->eob; j++) { + const int coeff_pos_j = scan[j]; + update_coeff(coeff_pos_j, 0, txb_info); } + txb_info->eob = 0; + } + + // record total rate cost + *rate_cost = zero_blk_rd_cost <= prev_eob_rd_cost + ? zero_blk_rate + : accu_rate + non_zero_blk_rate; + + if (txb_info->eob > 0) { + *rate_cost += txb_info->tx_type_cost; } + + return update; } -static int get_coeff_cost(tran_low_t qc, int scan_idx, TxbInfo *txb_info, - const LV_MAP_COEFF_COST *txb_costs) { - const TXB_CTX *txb_ctx = txb_info->txb_ctx; - const int is_nz = (qc != 0); - const tran_low_t abs_qc = abs(qc); - int cost = 0; +// These numbers are empirically obtained. +static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { + { 17, 13 }, + { 16, 10 }, +}; + +void hbt_init() { + hbt_hash_table = + aom_malloc(sizeof(OptTxbQcoeff) * HBT_TABLE_SIZE * HBT_ARRAY_LENGTH); + memset(hbt_hash_table, 0, + sizeof(OptTxbQcoeff) * HBT_TABLE_SIZE * HBT_ARRAY_LENGTH); + av1_crc32c_calculator_init(&crc_calculator); // 31 bit: qc & ctx + + hbt_needs_init = 0; +} + +void hbt_destroy() { aom_free(hbt_hash_table); } + +int hbt_hash_miss(uint32_t hbt_ctx_hash, uint32_t hbt_qc_hash, + TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs, + const LV_MAP_EOB_COST *txb_eob_costs, + const struct macroblock_plane *p, int block, int fast_mode, + int *rate_cost) { + (void)fast_mode; const int16_t *scan = txb_info->scan_order->scan; + int prev_eob = txb_info->eob; + assert(HBT_EOB <= 16); // Lengthen array if allowing longer eob. + int32_t prev_coeff[16]; + for (int i = 0; i < prev_eob; i++) { + prev_coeff[i] = txb_info->qcoeff[scan[i]]; + } + for (int i = prev_eob; i < HBT_EOB; i++) { + prev_coeff[i] = 0; // For compiler piece of mind. + } + + av1_txb_init_levels(txb_info->qcoeff, txb_info->width, txb_info->height, + txb_info->levels); - if (scan_idx < txb_info->seg_eob) { - int coeff_ctx = - get_nz_map_ctx(txb_info->qcoeff, scan_idx, scan, txb_info->bwl, - txb_info->height, txb_info->tx_type); - cost += txb_costs->nz_map_cost[coeff_ctx][is_nz]; + const int update = + optimize_txb(txb_info, txb_costs, txb_eob_costs, rate_cost); + + // Overwrite old entry + uint16_t hbt_table_index = hbt_ctx_hash % HBT_TABLE_SIZE; + uint16_t hbt_array_index = hbt_qc_hash % HBT_ARRAY_LENGTH; + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .rate_cost = *rate_cost; + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index].init = 1; + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .hbt_qc_hash = hbt_qc_hash; + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .hbt_ctx_hash = hbt_ctx_hash; + assert(prev_eob >= txb_info->eob); // eob can't get longer + for (int i = 0; i < txb_info->eob; i++) { + // Record how coeff changed. Convention: towards zero is negative. 
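// A sketch of that convention, assuming (as the optimizer guarantees) that a
// coefficient's sign never flips and only its magnitude shrinks: folding the
// difference by the sign makes "moved toward zero" negative for positive and
// negative coefficients alike, so a single int8_t delta describes both cases.
// Hypothetical standalone form of the branch below:
static int8_t fold_delta_sketch(int32_t prev, int32_t now) {
  return (int8_t)(now > 0 ? now - prev : prev - now);
}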
+ if (txb_info->qcoeff[scan[i]] > 0) + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i] = txb_info->qcoeff[scan[i]] - prev_coeff[i]; + else + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i] = prev_coeff[i] - txb_info->qcoeff[scan[i]]; + } + for (int i = txb_info->eob; i < prev_eob; i++) { + // If eob got shorter, record that all after it changed to zero. + if (prev_coeff[i] > 0) + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i] = -prev_coeff[i]; + else + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i] = prev_coeff[i]; + } + for (int i = prev_eob; i < HBT_EOB; i++) { + // Record 'no change' after optimized coefficients run out. + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i] = 0; } - if (is_nz) { - cost += get_sign_bit_cost(qc, scan_idx, txb_costs->dc_sign_cost, - txb_ctx->dc_sign_ctx); + if (update) { + p->eobs[block] = txb_info->eob; + p->txb_entropy_ctx[block] = av1_get_txb_entropy_context( + txb_info->qcoeff, txb_info->scan_order, txb_info->eob); + } + return txb_info->eob; +} - int ctx_ls[NUM_BASE_LEVELS] = { 0 }; - get_base_ctx_set(txb_info->qcoeff, scan[scan_idx], txb_info->bwl, - txb_info->height, ctx_ls); +int hbt_hash_hit(uint32_t hbt_table_index, int hbt_array_index, + TxbInfo *txb_info, const struct macroblock_plane *p, int block, + int *rate_cost) { + const int16_t *scan = txb_info->scan_order->scan; + int new_eob = 0; + int update = 0; - int i; - for (i = 0; i < NUM_BASE_LEVELS; ++i) { - cost += get_base_cost(abs_qc, ctx_ls[i], - txb_costs->base_cost[i][ctx_ls[i]], i); - } + for (int i = 0; i < txb_info->eob; i++) { + // Delta convention is negatives go towards zero, so only apply those ones. 
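// The read side of the same convention, sketched standalone: non-negative
// deltas are ignored, and a negative delta shrinks the stored magnitude
// whichever sign the coefficient carries. Hypothetical form of the loop body
// that follows:
static int32_t apply_delta_sketch(int32_t qc, int8_t delta) {
  if (delta >= 0) return qc;                // no toward-zero movement recorded
  return qc > 0 ? qc + delta : qc - delta;  // lower |qc| by |delta|
}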
+ if (hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i] < 0) { + if (txb_info->qcoeff[scan[i]] > 0) + txb_info->qcoeff[scan[i]] += + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i]; + else + txb_info->qcoeff[scan[i]] -= + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i]; - if (abs_qc > NUM_BASE_LEVELS) { - int ctx = get_br_ctx(txb_info->qcoeff, scan[scan_idx], txb_info->bwl, - txb_info->height); - cost += get_br_cost(abs_qc, ctx, txb_costs->lps_cost[ctx]); - cost += get_golomb_cost(abs_qc); + update = 1; + update_coeff(scan[i], txb_info->qcoeff[scan[i]], txb_info); } + if (txb_info->qcoeff[scan[i]]) new_eob = i + 1; + } - if (scan_idx < txb_info->seg_eob) { - int eob_ctx = get_eob_ctx(txb_info->qcoeff, scan[scan_idx], - txb_info->txs_ctx, txb_info->tx_type); - cost += txb_costs->eob_cost[eob_ctx][scan_idx == (txb_info->eob - 1)]; - } + // Rate_cost can be calculated here instead (av1_cost_coeffs_txb), but + // it is expensive and gives little benefit as long as qc_hash is high bit + *rate_cost = + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .rate_cost; + + if (update) { + txb_info->eob = new_eob; + p->eobs[block] = txb_info->eob; + p->txb_entropy_ctx[block] = av1_get_txb_entropy_context( + txb_info->qcoeff, txb_info->scan_order, txb_info->eob); } - return cost; + + return txb_info->eob; } -#if TEST_OPTIMIZE_TXB -#define ALL_REF_OFFSET_NUM 17 -static int all_ref_offset[ALL_REF_OFFSET_NUM][2] = { - { 0, 0 }, { -2, -1 }, { -2, 0 }, { -2, 1 }, { -1, -2 }, { -1, -1 }, - { -1, 0 }, { -1, 1 }, { 0, -2 }, { 0, -1 }, { 1, -2 }, { 1, -1 }, - { 1, 0 }, { 2, 0 }, { 0, 1 }, { 0, 2 }, { 1, 1 }, -}; +int hbt_search_match(uint32_t hbt_ctx_hash, uint32_t hbt_qc_hash, + TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs, + const LV_MAP_EOB_COST *txb_eob_costs, + const struct macroblock_plane *p, int block, int fast_mode, + int *rate_cost) { + // Check for qcoeff match + int hbt_array_index = hbt_qc_hash % HBT_ARRAY_LENGTH; + int hbt_table_index = hbt_ctx_hash % HBT_TABLE_SIZE; + + if (hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .hbt_qc_hash == hbt_qc_hash && + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .hbt_ctx_hash == hbt_ctx_hash && + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .init) { + return hbt_hash_hit(hbt_table_index, hbt_array_index, txb_info, p, block, + rate_cost); + } else { + return hbt_hash_miss(hbt_ctx_hash, hbt_qc_hash, txb_info, txb_costs, + txb_eob_costs, p, block, fast_mode, rate_cost); + } +} + +int hbt_create_hashes(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs, + const LV_MAP_EOB_COST *txb_eob_costs, + const struct macroblock_plane *p, int block, + int fast_mode, int *rate_cost) { + // Initialize hash table if needed. + if (hbt_needs_init) { + hbt_init(); + } + + //// Hash creation + uint8_t txb_hash_data[256]; // Asserts below to ensure enough space. + const int16_t *scan = txb_info->scan_order->scan; + uint8_t chunk = 0; + int hash_data_index = 0; + + // Make qc_hash. + int packing_index = 0; // needed for packing. + for (int i = 0; i < txb_info->eob; i++) { + tran_low_t prechunk = txb_info->qcoeff[scan[i]]; + + // Softening: Improves speed. Aligns with signed deltas. 
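// Once softened to magnitudes and bounded by the HBT_KICKOUT check a few
// lines below, every level fits in two bits, so the qc hash input packs four
// scan positions per byte. A sketch of that packing (hypothetical helper;
// caller provides at least n/4 + 2 output bytes), including the final byte
// even when it is empty, as the code below does:
static int pack_levels_sketch(const int32_t *mags, int n, uint8_t *out) {
  int bytes = 0, shift = 0;
  out[0] = 0;
  for (int i = 0; i < n; ++i) {
    out[bytes] |= (uint8_t)((mags[i] & 3) << shift);
    if ((shift += 2) == 8) { shift = 0; out[++bytes] = 0; }
  }
  return bytes + 1;  // bytes of hash input produced
}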
+ if (prechunk < 0) prechunk *= -1; + + // Early kick out: Don't apply feature if there are large coeffs: + // If this kickout value is removed or raised beyond int8_t, + // widen deltas type in OptTxbQcoeff struct. + assert((int8_t)HBT_KICKOUT == HBT_KICKOUT); // If not, widen types. + if (prechunk > HBT_KICKOUT) { + av1_txb_init_levels(txb_info->qcoeff, txb_info->width, txb_info->height, + txb_info->levels); + + const int update = + optimize_txb(txb_info, txb_costs, txb_eob_costs, rate_cost); + + if (update) { + p->eobs[block] = txb_info->eob; + p->txb_entropy_ctx[block] = av1_get_txb_entropy_context( + txb_info->qcoeff, txb_info->scan_order, txb_info->eob); + } + return txb_info->eob; + } + + // Since coeffs are 0 to 3, only 2 bits are needed: pack into bytes + if (packing_index == 0) txb_hash_data[hash_data_index] = 0; + chunk = prechunk << packing_index; + packing_index += 2; + txb_hash_data[hash_data_index] |= chunk; -static int try_level_down_ref(int coeff_idx, const LV_MAP_COEFF_COST *txb_costs, - TxbInfo *txb_info, - int (*cost_map)[COST_MAP_SIZE]) { - if (cost_map) { - for (int i = 0; i < COST_MAP_SIZE; ++i) av1_zero(cost_map[i]); - } - tran_low_t qc = txb_info->qcoeff[coeff_idx]; - if (qc == 0) return 0; - int row = coeff_idx >> txb_info->bwl; - int col = coeff_idx - (row << txb_info->bwl); - int org_cost = 0; - for (int i = 0; i < ALL_REF_OFFSET_NUM; ++i) { - int nb_row = row - all_ref_offset[i][0]; - int nb_col = col - all_ref_offset[i][1]; - int nb_coeff_idx = nb_row * txb_info->stride + nb_col; - int nb_scan_idx = txb_info->scan_order->iscan[nb_coeff_idx]; - if (nb_scan_idx < txb_info->eob && nb_row >= 0 && nb_col >= 0 && - nb_row < txb_info->height && nb_col < txb_info->stride) { - tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx]; - int cost = get_coeff_cost(nb_coeff, nb_scan_idx, txb_info, txb_costs); - if (cost_map) - cost_map[nb_row - row + COST_MAP_OFFSET] - [nb_col - col + COST_MAP_OFFSET] -= cost; - org_cost += cost; + // Full byte: + if (packing_index == 8) { + packing_index = 0; + hash_data_index++; } } - txb_info->qcoeff[coeff_idx] = get_lower_coeff(qc); - int new_cost = 0; - for (int i = 0; i < ALL_REF_OFFSET_NUM; ++i) { - int nb_row = row - all_ref_offset[i][0]; - int nb_col = col - all_ref_offset[i][1]; - int nb_coeff_idx = nb_row * txb_info->stride + nb_col; - int nb_scan_idx = txb_info->scan_order->iscan[nb_coeff_idx]; - if (nb_scan_idx < txb_info->eob && nb_row >= 0 && nb_col >= 0 && - nb_row < txb_info->height && nb_col < txb_info->stride) { - tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx]; - int cost = get_coeff_cost(nb_coeff, nb_scan_idx, txb_info, txb_costs); - if (cost_map) - cost_map[nb_row - row + COST_MAP_OFFSET] - [nb_col - col + COST_MAP_OFFSET] += cost; - new_cost += cost; + // Needed when packing_index != 0, to include final byte. + hash_data_index++; + assert(hash_data_index <= 64); + // 31 bit qc_hash: index to array + uint32_t hbt_qc_hash = + av1_get_crc32c_value(&crc_calculator, txb_hash_data, hash_data_index); + + // Make ctx_hash. + hash_data_index = 0; + tran_low_t prechunk; + + for (int i = 0; i < txb_info->eob; i++) { + // Save as magnitudes towards or away from zero. + if (txb_info->tcoeff[scan[i]] >= 0) + prechunk = txb_info->tcoeff[scan[i]] - txb_info->dqcoeff[scan[i]]; + else + prechunk = txb_info->dqcoeff[scan[i]] - txb_info->tcoeff[scan[i]]; + + chunk = prechunk & 0xff; + txb_hash_data[hash_data_index++] = chunk; + } + + // Extra ctx data: + // Include dequants. 
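// Beyond the prediction-error bytes above, the ctx hash folds in everything
// that changes how the trellis would price the same coefficients: both
// dequant values, the txb-skip and DC-sign contexts, the eob, the low byte of
// rdmult, the tx_type, and the second byte of each base-eob, eob and dc-sign
// cost entry -- a deliberately lossy summary of the cost tables, softened the
// same way the qc hash softens signs.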
+ txb_hash_data[hash_data_index++] = txb_info->dequant[0] & 0xff; + txb_hash_data[hash_data_index++] = txb_info->dequant[1] & 0xff; + chunk = txb_info->txb_ctx->txb_skip_ctx & 0xff; + txb_hash_data[hash_data_index++] = chunk; + chunk = txb_info->txb_ctx->dc_sign_ctx & 0xff; + txb_hash_data[hash_data_index++] = chunk; + // eob + chunk = txb_info->eob & 0xff; + txb_hash_data[hash_data_index++] = chunk; + // rdmult (int64) + chunk = txb_info->rdmult & 0xff; + txb_hash_data[hash_data_index++] = chunk; + // tx_type + chunk = txb_info->tx_type & 0xff; + txb_hash_data[hash_data_index++] = chunk; + // base_eob_cost + for (int i = 1; i < 3; i++) { // i = 0 are softened away + for (int j = 0; j < SIG_COEF_CONTEXTS_EOB; j++) { + chunk = (txb_costs->base_eob_cost[j][i] & 0xff00) >> 8; + txb_hash_data[hash_data_index++] = chunk; } } - txb_info->qcoeff[coeff_idx] = qc; - return new_cost - org_cost; -} - -static void test_level_down(int coeff_idx, const TxbCache *txb_cache, - const LV_MAP_COEFF_COST *txb_costs, - TxbInfo *txb_info) { - int cost_map[COST_MAP_SIZE][COST_MAP_SIZE]; - int ref_cost_map[COST_MAP_SIZE][COST_MAP_SIZE]; - const int cost_diff = - try_level_down(coeff_idx, txb_cache, txb_costs, txb_info, cost_map, 0); - const int cost_diff_ref = - try_level_down_ref(coeff_idx, txb_costs, txb_info, ref_cost_map); - if (cost_diff != cost_diff_ref) { - printf("qc %d cost_diff %d cost_diff_ref %d\n", txb_info->qcoeff[coeff_idx], - cost_diff, cost_diff_ref); - for (int r = 0; r < COST_MAP_SIZE; ++r) { - for (int c = 0; c < COST_MAP_SIZE; ++c) { - printf("%d:%d ", cost_map[r][c], ref_cost_map[r][c]); - } - printf("\n"); + // eob_cost + for (int i = 0; i < 11; i++) { + for (int j = 0; j < 2; j++) { + chunk = (txb_eob_costs->eob_cost[j][i] & 0xff00) >> 8; + txb_hash_data[hash_data_index++] = chunk; + } + } + // dc_sign_cost + for (int i = 0; i < 2; i++) { + for (int j = 0; j < DC_SIGN_CONTEXTS; j++) { + chunk = (txb_costs->dc_sign_cost[j][i] & 0xff00) >> 8; + txb_hash_data[hash_data_index++] = chunk; } } + + assert(hash_data_index <= 256); + // 31 bit ctx_hash: used to index table + uint32_t hbt_ctx_hash = + av1_get_crc32c_value(&crc_calculator, txb_hash_data, hash_data_index); + //// End hash creation + + return hbt_search_match(hbt_ctx_hash, hbt_qc_hash, txb_info, txb_costs, + txb_eob_costs, p, block, fast_mode, rate_cost); } -#endif -// TODO(angiebird): make this static once it's called -int get_txb_cost(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs) { - int cost = 0; - int txb_skip_ctx = txb_info->txb_ctx->txb_skip_ctx; - const int16_t *scan = txb_info->scan_order->scan; - if (txb_info->eob == 0) { - cost = txb_costs->txb_skip_cost[txb_skip_ctx][1]; - return cost; - } - cost = txb_costs->txb_skip_cost[txb_skip_ctx][0]; - for (int c = 0; c < txb_info->eob; ++c) { - tran_low_t qc = txb_info->qcoeff[scan[c]]; - int coeff_cost = get_coeff_cost(qc, c, txb_info, txb_costs); - cost += coeff_cost; +static AOM_FORCE_INLINE int get_coeff_cost_simple( + int ci, tran_low_t abs_qc, int coeff_ctx, + const LV_MAP_COEFF_COST *txb_costs, int bwl, TX_CLASS tx_class, + const uint8_t *levels) { + // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0) + // and not the last (scan_idx != eob - 1) + assert(ci > 0); + int cost = txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)]; + if (abs_qc) { + cost += av1_cost_literal(1); + if (abs_qc > NUM_BASE_LEVELS) { + const int br_ctx = get_br_ctx(levels, ci, bwl, tx_class); + cost += get_br_cost(abs_qc, br_ctx, txb_costs->lps_cost[br_ctx]); + cost += 
get_golomb_cost(abs_qc); + } } return cost; } -#if TEST_OPTIMIZE_TXB -void test_try_change_eob(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs, - TxbCache *txb_cache) { - int eob = txb_info->eob; - const int16_t *scan = txb_info->scan_order->scan; - if (eob > 0) { - int last_si = eob - 1; - int last_ci = scan[last_si]; - int last_coeff = txb_info->qcoeff[last_ci]; - if (abs(last_coeff) == 1) { - int new_eob; - int cost_diff = - try_change_eob(&new_eob, last_ci, txb_cache, txb_costs, txb_info, 0); - int org_eob = txb_info->eob; - int cost = get_txb_cost(txb_info, txb_costs); - - txb_info->qcoeff[last_ci] = get_lower_coeff(last_coeff); - set_eob(txb_info, new_eob); - int new_cost = get_txb_cost(txb_info, txb_costs); - set_eob(txb_info, org_eob); - txb_info->qcoeff[last_ci] = last_coeff; - - int ref_cost_diff = -cost + new_cost; - if (cost_diff != ref_cost_diff) - printf("org_eob %d new_eob %d cost_diff %d ref_cost_diff %d\n", org_eob, - new_eob, cost_diff, ref_cost_diff); +static INLINE int get_coeff_cost_general(int is_last, int ci, tran_low_t abs_qc, + int sign, int coeff_ctx, + int dc_sign_ctx, + const LV_MAP_COEFF_COST *txb_costs, + int bwl, TX_CLASS tx_class, + const uint8_t *levels) { + int cost = 0; + if (is_last) { + cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1]; + } else { + cost += txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)]; + } + if (abs_qc != 0) { + if (ci == 0) { + cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign]; + } else { + cost += av1_cost_literal(1); + } + if (abs_qc > NUM_BASE_LEVELS) { + const int br_ctx = get_br_ctx(levels, ci, bwl, tx_class); + cost += get_br_cost(abs_qc, br_ctx, txb_costs->lps_cost[br_ctx]); + cost += get_golomb_cost(abs_qc); } } + return cost; } -#endif -static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff, - int shift) { - const int64_t diff = (tcoeff - dqcoeff) * (1 << shift); - const int64_t error = diff * diff; - return error; +static INLINE void get_qc_dqc_low(tran_low_t abs_qc, int sign, int dqv, + int shift, tran_low_t *qc_low, + tran_low_t *dqc_low) { + tran_low_t abs_qc_low = abs_qc - 1; + *qc_low = (-sign ^ abs_qc_low) + sign; + assert((sign ? -abs_qc_low : abs_qc_low) == *qc_low); + tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift; + *dqc_low = (-sign ^ abs_dqc_low) + sign; + assert((sign ? 
-abs_dqc_low : abs_dqc_low) == *dqc_low); } -typedef struct LevelDownStats { - int update; - tran_low_t low_qc; - tran_low_t low_dqc; - int64_t rd_diff; - int cost_diff; - int64_t dist_diff; - int new_eob; -} LevelDownStats; - -void try_level_down_facade(LevelDownStats *stats, int scan_idx, - const TxbCache *txb_cache, - const LV_MAP_COEFF_COST *txb_costs, - TxbInfo *txb_info, int fast_mode) { - const int16_t *scan = txb_info->scan_order->scan; - const int coeff_idx = scan[scan_idx]; - const tran_low_t qc = txb_info->qcoeff[coeff_idx]; - stats->new_eob = -1; - stats->update = 0; +static INLINE void update_coeff_general( + int *accu_rate, int64_t *accu_dist, int si, int eob, TX_SIZE tx_size, + TX_CLASS tx_class, int bwl, int height, int64_t rdmult, int shift, + int dc_sign_ctx, const int16_t *dequant, const int16_t *scan, + const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels) { + const int dqv = dequant[si != 0]; + const int ci = scan[si]; + const tran_low_t qc = qcoeff[ci]; + const int is_last = si == (eob - 1); + const int coeff_ctx = get_lower_levels_ctx_general( + is_last, si, bwl, height, levels, ci, tx_size, tx_class); if (qc == 0) { - return; + *accu_rate += txb_costs->base_cost[coeff_ctx][0]; + } else { + const int sign = (qc < 0) ? 1 : 0; + const tran_low_t abs_qc = abs(qc); + const tran_low_t tqc = tcoeff[ci]; + const tran_low_t dqc = dqcoeff[ci]; + const int64_t dist = get_coeff_dist(tqc, dqc, shift); + const int64_t dist0 = get_coeff_dist(tqc, 0, shift); + const int rate = + get_coeff_cost_general(is_last, ci, abs_qc, sign, coeff_ctx, + dc_sign_ctx, txb_costs, bwl, tx_class, levels); + const int64_t rd = RDCOST(rdmult, rate, dist); + + tran_low_t qc_low, dqc_low; + get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low); + const tran_low_t abs_qc_low = abs_qc - 1; + const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift); + const int rate_low = + get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx, + dc_sign_ctx, txb_costs, bwl, tx_class, levels); + const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low); + if (rd_low < rd) { + qcoeff[ci] = qc_low; + dqcoeff[ci] = dqc_low; + levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX); + *accu_rate += rate_low; + *accu_dist += dist_low - dist0; + } else { + *accu_rate += rate; + *accu_dist += dist - dist0; + } } +} - const tran_low_t tqc = txb_info->tcoeff[coeff_idx]; - const int dqv = txb_info->dequant[coeff_idx != 0]; - - const tran_low_t dqc = qcoeff_to_dqcoeff(qc, dqv, txb_info->shift); - const int64_t dqc_dist = get_coeff_dist(tqc, dqc, txb_info->shift); - - stats->low_qc = get_lower_coeff(qc); - stats->low_dqc = qcoeff_to_dqcoeff(stats->low_qc, dqv, txb_info->shift); - const int64_t low_dqc_dist = - get_coeff_dist(tqc, stats->low_dqc, txb_info->shift); - - stats->dist_diff = -dqc_dist + low_dqc_dist; - stats->cost_diff = 0; - stats->new_eob = txb_info->eob; - if (scan_idx == txb_info->eob - 1 && abs(qc) == 1) { - stats->cost_diff = try_change_eob(&stats->new_eob, coeff_idx, txb_cache, - txb_costs, txb_info, fast_mode); +static AOM_FORCE_INLINE void update_coeff_simple( + int *accu_rate, int si, int eob, TX_SIZE tx_size, TX_CLASS tx_class, + int bwl, int64_t rdmult, int shift, const int16_t *dequant, + const int16_t *scan, const LV_MAP_COEFF_COST *txb_costs, + const tran_low_t *tcoeff, tran_low_t *qcoeff, tran_low_t *dqcoeff, + uint8_t *levels) { + const int dqv = dequant[1]; + (void)eob; + // this simple version assumes the 
coeff's scan_idx is not DC (scan_idx != 0) + // and not the last (scan_idx != eob - 1) + assert(si != eob - 1); + assert(si > 0); + const int ci = scan[si]; + const tran_low_t qc = qcoeff[ci]; + const int coeff_ctx = + get_lower_levels_ctx(levels, ci, bwl, tx_size, tx_class); + if (qc == 0) { + *accu_rate += txb_costs->base_cost[coeff_ctx][0]; } else { - stats->cost_diff = try_level_down(coeff_idx, txb_cache, txb_costs, txb_info, - NULL, fast_mode); -#if TEST_OPTIMIZE_TXB - test_level_down(coeff_idx, txb_cache, txb_costs, txb_info); -#endif + const tran_low_t abs_qc = abs(qc); + const tran_low_t tqc = tcoeff[ci]; + const tran_low_t dqc = dqcoeff[ci]; + const int rate = get_coeff_cost_simple(ci, abs_qc, coeff_ctx, txb_costs, + bwl, tx_class, levels); + if (abs(dqc) < abs(tqc)) { + *accu_rate += rate; + return; + } + const int64_t dist = get_coeff_dist(tqc, dqc, shift); + const int64_t rd = RDCOST(rdmult, rate, dist); + + const int sign = (qc < 0) ? 1 : 0; + tran_low_t qc_low, dqc_low; + get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low); + const tran_low_t abs_qc_low = abs_qc - 1; + const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift); + const int rate_low = get_coeff_cost_simple( + ci, abs_qc_low, coeff_ctx, txb_costs, bwl, tx_class, levels); + const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low); + if (rd_low < rd) { + qcoeff[ci] = qc_low; + dqcoeff[ci] = dqc_low; + levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX); + *accu_rate += rate_low; + } else { + *accu_rate += rate; + } } - stats->rd_diff = RDCOST(txb_info->rdmult, stats->cost_diff, stats->dist_diff); - if (stats->rd_diff < 0) stats->update = 1; - return; } -static int optimize_txb(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs, - TxbCache *txb_cache, int dry_run, int fast_mode) { - int update = 0; - if (txb_info->eob == 0) return update; - int cost_diff = 0; - int64_t dist_diff = 0; - int64_t rd_diff = 0; - const int max_eob = tx_size_2d[txb_info->tx_size]; - -#if TEST_OPTIMIZE_TXB - int64_t sse; - int64_t org_dist = - av1_block_error_c(txb_info->tcoeff, txb_info->dqcoeff, max_eob, &sse) * - (1 << (2 * txb_info->shift)); - int org_cost = get_txb_cost(txb_info, txb_costs); -#endif - - tran_low_t *org_qcoeff = txb_info->qcoeff; - tran_low_t *org_dqcoeff = txb_info->dqcoeff; +static AOM_FORCE_INLINE void update_coeff_eob( + int *accu_rate, int64_t *accu_dist, int *eob, int *nz_num, int *nz_ci, + int si, TX_SIZE tx_size, TX_CLASS tx_class, int bwl, int height, + int dc_sign_ctx, int64_t rdmult, int shift, const int16_t *dequant, + const int16_t *scan, const LV_MAP_EOB_COST *txb_eob_costs, + const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, int sharpness) { + const int dqv = dequant[si != 0]; + assert(si != *eob - 1); + const int ci = scan[si]; + const tran_low_t qc = qcoeff[ci]; + const int coeff_ctx = + get_lower_levels_ctx(levels, ci, bwl, tx_size, tx_class); + if (qc == 0) { + *accu_rate += txb_costs->base_cost[coeff_ctx][0]; + } else { + int lower_level = 0; + const tran_low_t abs_qc = abs(qc); + const tran_low_t tqc = tcoeff[ci]; + const tran_low_t dqc = dqcoeff[ci]; + const int sign = (qc < 0) ? 
1 : 0; + const int64_t dist0 = get_coeff_dist(tqc, 0, shift); + int64_t dist = get_coeff_dist(tqc, dqc, shift) - dist0; + int rate = + get_coeff_cost_general(0, ci, abs_qc, sign, coeff_ctx, dc_sign_ctx, + txb_costs, bwl, tx_class, levels); + int64_t rd = RDCOST(rdmult, *accu_rate + rate, *accu_dist + dist); + + tran_low_t qc_low, dqc_low; + get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low); + const tran_low_t abs_qc_low = abs_qc - 1; + const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift) - dist0; + const int rate_low = + get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx, dc_sign_ctx, + txb_costs, bwl, tx_class, levels); + const int64_t rd_low = + RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low); + + int lower_level_new_eob = 0; + const int new_eob = si + 1; + uint8_t tmp_levels[3]; + for (int ni = 0; ni < *nz_num; ++ni) { + const int last_ci = nz_ci[ni]; + tmp_levels[ni] = levels[get_padded_idx(last_ci, bwl)]; + levels[get_padded_idx(last_ci, bwl)] = 0; + } - tran_low_t tmp_qcoeff[MAX_TX_SQUARE]; - tran_low_t tmp_dqcoeff[MAX_TX_SQUARE]; - const int org_eob = txb_info->eob; - if (dry_run) { - memcpy(tmp_qcoeff, org_qcoeff, sizeof(org_qcoeff[0]) * max_eob); - memcpy(tmp_dqcoeff, org_dqcoeff, sizeof(org_dqcoeff[0]) * max_eob); - txb_info->qcoeff = tmp_qcoeff; - txb_info->dqcoeff = tmp_dqcoeff; - } + const int coeff_ctx_new_eob = get_lower_levels_ctx_general( + 1, si, bwl, height, levels, ci, tx_size, tx_class); + const int new_eob_cost = + get_eob_cost(new_eob, txb_eob_costs, txb_costs, tx_class); + int rate_coeff_eob = + new_eob_cost + get_coeff_cost_general(1, ci, abs_qc, sign, + coeff_ctx_new_eob, dc_sign_ctx, + txb_costs, bwl, tx_class, levels); + int64_t dist_new_eob = dist; + int64_t rd_new_eob = RDCOST(rdmult, rate_coeff_eob, dist_new_eob); + + if (abs_qc_low > 0) { + const int rate_coeff_eob_low = + new_eob_cost + + get_coeff_cost_general(1, ci, abs_qc_low, sign, coeff_ctx_new_eob, + dc_sign_ctx, txb_costs, bwl, tx_class, levels); + const int64_t dist_new_eob_low = dist_low; + const int64_t rd_new_eob_low = + RDCOST(rdmult, rate_coeff_eob_low, dist_new_eob_low); + if (rd_new_eob_low < rd_new_eob) { + lower_level_new_eob = 1; + rd_new_eob = rd_new_eob_low; + rate_coeff_eob = rate_coeff_eob_low; + dist_new_eob = dist_new_eob_low; + } + } - const int16_t *scan = txb_info->scan_order->scan; + if (rd_low < rd) { + lower_level = 1; + rd = rd_low; + rate = rate_low; + dist = dist_low; + } - // forward optimize the nz_map - const int cur_eob = txb_info->eob; - for (int si = 0; si < cur_eob; ++si) { - const int coeff_idx = scan[si]; - tran_low_t qc = txb_info->qcoeff[coeff_idx]; - if (abs(qc) == 1) { - LevelDownStats stats; - try_level_down_facade(&stats, si, txb_cache, txb_costs, txb_info, - fast_mode); - if (stats.update) { - update = 1; - cost_diff += stats.cost_diff; - dist_diff += stats.dist_diff; - rd_diff += stats.rd_diff; - update_level_down(coeff_idx, txb_cache, txb_info); - set_eob(txb_info, stats.new_eob); + if (sharpness == 0 && rd_new_eob < rd) { + for (int ni = 0; ni < *nz_num; ++ni) { + int last_ci = nz_ci[ni]; + // levels[get_padded_idx(last_ci, bwl)] = 0; + qcoeff[last_ci] = 0; + dqcoeff[last_ci] = 0; + } + *eob = new_eob; + *nz_num = 0; + *accu_rate = rate_coeff_eob; + *accu_dist = dist_new_eob; + lower_level = lower_level_new_eob; + } else { + for (int ni = 0; ni < *nz_num; ++ni) { + const int last_ci = nz_ci[ni]; + levels[get_padded_idx(last_ci, bwl)] = tmp_levels[ni]; } + *accu_rate += rate; + *accu_dist += dist; } - } - // backward 
optimize the level-k map - int eob_fix = 0; - for (int si = txb_info->eob - 1; si >= 0; --si) { - const int coeff_idx = scan[si]; - if (eob_fix == 1 && txb_info->qcoeff[coeff_idx] == 1) { - // when eob is fixed, there is not need to optimize again when - // abs(qc) == 1 - continue; + if (lower_level) { + qcoeff[ci] = qc_low; + dqcoeff[ci] = dqc_low; + levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX); } - LevelDownStats stats; - try_level_down_facade(&stats, si, txb_cache, txb_costs, txb_info, - fast_mode); - if (stats.update) { -#if TEST_OPTIMIZE_TXB -// printf("si %d low_qc %d cost_diff %d dist_diff %ld rd_diff %ld eob %d new_eob -// %d\n", si, stats.low_qc, stats.cost_diff, stats.dist_diff, stats.rd_diff, -// txb_info->eob, stats.new_eob); -#endif - update = 1; - cost_diff += stats.cost_diff; - dist_diff += stats.dist_diff; - rd_diff += stats.rd_diff; - update_level_down(coeff_idx, txb_cache, txb_info); - set_eob(txb_info, stats.new_eob); + if (qcoeff[ci]) { + nz_ci[*nz_num] = ci; + ++*nz_num; } - if (eob_fix == 0 && txb_info->qcoeff[coeff_idx] != 0) eob_fix = 1; - if (si > txb_info->eob) si = txb_info->eob; - } -#if TEST_OPTIMIZE_TXB - int64_t new_dist = - av1_block_error_c(txb_info->tcoeff, txb_info->dqcoeff, max_eob, &sse) * - (1 << (2 * txb_info->shift)); - int new_cost = get_txb_cost(txb_info, txb_costs); - int64_t ref_dist_diff = new_dist - org_dist; - int ref_cost_diff = new_cost - org_cost; - if (cost_diff != ref_cost_diff || dist_diff != ref_dist_diff) - printf( - "overall rd_diff %ld\ncost_diff %d ref_cost_diff%d\ndist_diff %ld " - "ref_dist_diff %ld\neob %d new_eob %d\n\n", - rd_diff, cost_diff, ref_cost_diff, dist_diff, ref_dist_diff, org_eob, - txb_info->eob); -#endif - if (dry_run) { - txb_info->qcoeff = org_qcoeff; - txb_info->dqcoeff = org_dqcoeff; - set_eob(txb_info, org_eob); } - return update; } -// These numbers are empirically obtained. 
-static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { - { 17, 13 }, { 16, 10 }, -}; +static INLINE void update_skip(int *accu_rate, int64_t accu_dist, int *eob, + int nz_num, int *nz_ci, int64_t rdmult, + int skip_cost, int non_skip_cost, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + int sharpness) { + const int64_t rd = RDCOST(rdmult, *accu_rate + non_skip_cost, accu_dist); + const int64_t rd_new_eob = RDCOST(rdmult, skip_cost, 0); + if (sharpness == 0 && rd_new_eob < rd) { + for (int i = 0; i < nz_num; ++i) { + const int ci = nz_ci[i]; + qcoeff[ci] = 0; + dqcoeff[ci] = 0; + // no need to set up levels because this is the last step + // levels[get_padded_idx(ci, bwl)] = 0; + } + *accu_rate = 0; + *eob = 0; + } +} + +int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, int *rate_cost, + int sharpness) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + const PLANE_TYPE plane_type = get_plane_type(plane); + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + const MB_MODE_INFO *mbmi = xd->mi[0]; + const struct macroblock_plane *p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; + tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + const tran_low_t *tcoeff = BLOCK_OFFSET(p->coeff, block); + const int16_t *dequant = p->dequant_QTX; + const int bwl = get_txb_bwl(tx_size); + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + assert(width == (1 << bwl)); + const int is_inter = is_inter_block(mbmi); + const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type); + const int16_t *scan = scan_order->scan; + const LV_MAP_COEFF_COST *txb_costs = &x->coeff_costs[txs_ctx][plane_type]; + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const LV_MAP_EOB_COST *txb_eob_costs = + &x->eob_costs[eob_multi_size][plane_type]; + + const int shift = av1_get_tx_scale(tx_size); + const int64_t rdmult = + ((x->rdmult * plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8))) + + 2) >> + (sharpness + (cpi->oxcf.aq_mode == VARIANCE_AQ && mbmi->segment_id < 4 + ? 
7 - mbmi->segment_id + : 2)); + + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, width); + + av1_txb_init_levels(qcoeff, width, height, levels); + + // TODO(angirbird): check iqmatrix + + const int non_skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][0]; + const int skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1]; + int eob = p->eobs[block]; + const int eob_cost = get_eob_cost(eob, txb_eob_costs, txb_costs, tx_class); + int accu_rate = eob_cost; + int64_t accu_dist = 0; + int si = eob - 1; + const int ci = scan[si]; + const tran_low_t qc = qcoeff[ci]; + const tran_low_t abs_qc = abs(qc); + const int sign = qc < 0; + const int max_nz_num = 2; + int nz_num = 1; + int nz_ci[3] = { ci, 0, 0 }; + if (abs_qc >= 2) { + update_coeff_general(&accu_rate, &accu_dist, si, eob, tx_size, tx_class, + bwl, height, rdmult, shift, txb_ctx->dc_sign_ctx, + dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff, + levels); + --si; + } else { + assert(abs_qc == 1); + const int coeff_ctx = get_lower_levels_ctx_general( + 1, si, bwl, height, levels, ci, tx_size, tx_class); + accu_rate += get_coeff_cost_general(1, ci, abs_qc, sign, coeff_ctx, + txb_ctx->dc_sign_ctx, txb_costs, bwl, + tx_class, levels); + const tran_low_t tqc = tcoeff[ci]; + const tran_low_t dqc = dqcoeff[ci]; + const int64_t dist = get_coeff_dist(tqc, dqc, shift); + const int64_t dist0 = get_coeff_dist(tqc, 0, shift); + accu_dist += dist - dist0; + --si; + } + +#define UPDATE_COEFF_EOB_CASE(tx_class_literal) \ + case tx_class_literal: \ + for (; si >= 0 && nz_num <= max_nz_num; --si) { \ + update_coeff_eob(&accu_rate, &accu_dist, &eob, &nz_num, nz_ci, si, \ + tx_size, tx_class_literal, bwl, height, \ + txb_ctx->dc_sign_ctx, rdmult, shift, dequant, scan, \ + txb_eob_costs, txb_costs, tcoeff, qcoeff, dqcoeff, \ + levels, sharpness); \ + } \ + break; + switch (tx_class) { + UPDATE_COEFF_EOB_CASE(TX_CLASS_2D); + UPDATE_COEFF_EOB_CASE(TX_CLASS_HORIZ); + UPDATE_COEFF_EOB_CASE(TX_CLASS_VERT); +#undef UPDATE_COEFF_EOB_CASE + default: assert(false); + } + + if (si == -1 && nz_num <= max_nz_num) { + update_skip(&accu_rate, accu_dist, &eob, nz_num, nz_ci, rdmult, skip_cost, + non_skip_cost, qcoeff, dqcoeff, sharpness); + } + +#define UPDATE_COEFF_SIMPLE_CASE(tx_class_literal) \ + case tx_class_literal: \ + for (; si >= 1; --si) { \ + update_coeff_simple(&accu_rate, si, eob, tx_size, tx_class_literal, bwl, \ + rdmult, shift, dequant, scan, txb_costs, tcoeff, \ + qcoeff, dqcoeff, levels); \ + } \ + break; + switch (tx_class) { + UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_2D); + UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_HORIZ); + UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_VERT); +#undef UPDATE_COEFF_SIMPLE_CASE + default: assert(false); + } + + // DC position + if (si == 0) { + // no need to update accu_dist because it's not used after this point + int64_t dummy_dist = 0; + update_coeff_general(&accu_rate, &dummy_dist, si, eob, tx_size, tx_class, + bwl, height, rdmult, shift, txb_ctx->dc_sign_ctx, + dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff, + levels); + } + + const int tx_type_cost = get_tx_type_cost(cm, x, xd, plane, tx_size, tx_type); + if (eob == 0) + accu_rate += skip_cost; + else + accu_rate += non_skip_cost + tx_type_cost; + + p->eobs[block] = eob; + p->txb_entropy_ctx[block] = + av1_get_txb_entropy_context(qcoeff, scan_order, p->eobs[block]); + + *rate_cost = accu_rate; + return eob; +} -int av1_optimize_txb(const AV1_COMMON *cm, MACROBLOCK *x, int plane, +// This function is deprecated, but we keep it here because 
hash trellis +// is not integrated with av1_optimize_txb_new yet +int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, int blk_row, int blk_col, int block, TX_SIZE tx_size, - TXB_CTX *txb_ctx, int fast_mode) { + TXB_CTX *txb_ctx, int fast_mode, int *rate_cost) { + const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; const PLANE_TYPE plane_type = get_plane_type(plane); - const TX_SIZE txs_ctx = get_txsize_context(tx_size); - const TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); - const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, + tx_size, cm->reduced_tx_set_used); + const MB_MODE_INFO *mbmi = xd->mi[0]; const struct macroblock_plane *p = &x->plane[plane]; struct macroblockd_plane *pd = &xd->plane[plane]; const int eob = p->eobs[block]; tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); const tran_low_t *tcoeff = BLOCK_OFFSET(p->coeff, block); - const int16_t *dequant = pd->dequant; - const int seg_eob = AOMMIN(eob, tx_size_2d[tx_size] - 1); - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int stride = 1 << bwl; - const int height = tx_size_high[tx_size]; + const int16_t *dequant = p->dequant_QTX; + const int seg_eob = av1_get_max_eob(tx_size); + const int bwl = get_txb_bwl(tx_size); + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); const int is_inter = is_inter_block(mbmi); - const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi); - const LV_MAP_COEFF_COST txb_costs = x->coeff_costs[txs_ctx][plane_type]; + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + const LV_MAP_COEFF_COST *txb_costs = &x->coeff_costs[txs_ctx][plane_type]; + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const LV_MAP_EOB_COST txb_eob_costs = + x->eob_costs[eob_multi_size][plane_type]; const int shift = av1_get_tx_scale(tx_size); const int64_t rdmult = - (x->rdmult * plane_rd_mult[is_inter][plane_type] + 2) >> 2; - - TxbInfo txb_info = { qcoeff, - dqcoeff, - tcoeff, - dequant, - shift, - tx_size, - txs_ctx, - tx_type, - bwl, - stride, - height, - eob, - seg_eob, - scan_order, - txb_ctx, - rdmult, - &cm->coeff_ctx_table }; - - TxbCache txb_cache; - gen_txb_cache(&txb_cache, &txb_info); + ((x->rdmult * plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8))) + + 2) >> + 2; + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, width); + const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size); + const qm_val_t *iqmatrix = + IS_2D_TRANSFORM(tx_type) + ? pd->seg_iqmatrix[mbmi->segment_id][qm_tx_size] + : cm->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size]; + assert(width == (1 << bwl)); + const int tx_type_cost = get_tx_type_cost(cm, x, xd, plane, tx_size, tx_type); + TxbInfo txb_info = { + qcoeff, levels, dqcoeff, tcoeff, dequant, shift, + tx_size, txs_ctx, tx_type, bwl, width, height, + eob, seg_eob, scan_order, txb_ctx, rdmult, &cm->coeff_ctx_table, + iqmatrix, tx_type_cost, + }; + + // Hash based trellis (hbt) speed feature: avoid expensive optimize_txb calls + // by storing the coefficient deltas in a hash table. 
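// The table pays off when many small blocks quantize to identical level
// patterns under identical contexts (e.g. flat regions), since a hit replays
// the stored deltas instead of rerunning the trellis; eob is capped at
// HBT_EOB so the per-entry delta array stays fixed-size.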
+ // Currently disabled in speedfeatures.c + if (eob <= HBT_EOB && eob > 0 && cpi->sf.use_hash_based_trellis) { + return hbt_create_hashes(&txb_info, txb_costs, &txb_eob_costs, p, block, + fast_mode, rate_cost); + } + + av1_txb_init_levels(qcoeff, width, height, levels); const int update = - optimize_txb(&txb_info, &txb_costs, &txb_cache, 0, fast_mode); - if (update) p->eobs[block] = txb_info.eob; + optimize_txb(&txb_info, txb_costs, &txb_eob_costs, rate_cost); + + if (update) { + p->eobs[block] = txb_info.eob; + p->txb_entropy_ctx[block] = + av1_get_txb_entropy_context(qcoeff, scan_order, txb_info.eob); + } return txb_info.eob; } + int av1_get_txb_entropy_context(const tran_low_t *qcoeff, const SCAN_ORDER *scan_order, int eob) { - const int16_t *scan = scan_order->scan; + const int16_t *const scan = scan_order->scan; int cul_level = 0; int c; if (eob == 0) return 0; for (c = 0; c < eob; ++c) { cul_level += abs(qcoeff[scan[c]]); + if (cul_level > COEFF_CONTEXT_MASK) break; } cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level); @@ -1981,167 +1791,72 @@ void av1_update_txb_context_b(int plane, int block, int blk_row, int blk_col, ThreadData *const td = args->td; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; struct macroblock_plane *p = &x->plane[plane]; struct macroblockd_plane *pd = &xd->plane[plane]; const uint16_t eob = p->eobs[block]; const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); const PLANE_TYPE plane_type = pd->plane_type; - const TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); - const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi); - (void)plane_bsize; - - int cul_level = av1_get_txb_entropy_context(qcoeff, scan_order, eob); - av1_set_contexts(xd, pd, plane, tx_size, cul_level, blk_col, blk_row); -} - -static INLINE void av1_update_nz_eob_counts(FRAME_CONTEXT *fc, - FRAME_COUNTS *counts, uint16_t eob, - const tran_low_t *tcoeff, int plane, - TX_SIZE tx_size, TX_TYPE tx_type, - const int16_t *scan) { - const PLANE_TYPE plane_type = get_plane_type(plane); - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int height = tx_size_high[tx_size]; - TX_SIZE txsize_ctx = get_txsize_context(tx_size); -#if CONFIG_CTX1D - const int width = tx_size_wide[tx_size]; - const int eob_offset = width + height; - const TX_CLASS tx_class = get_tx_class(tx_type); - const int seg_eob = - (tx_class == TX_CLASS_2D) ? 
tx_size_2d[tx_size] : eob_offset; -#else - const int seg_eob = tx_size_2d[tx_size]; -#endif - unsigned int(*nz_map_count)[SIG_COEF_CONTEXTS][2] = - &counts->nz_map[txsize_ctx][plane_type]; - for (int c = 0; c < eob; ++c) { - tran_low_t v = tcoeff[scan[c]]; - int is_nz = (v != 0); - int coeff_ctx = get_nz_map_ctx(tcoeff, c, scan, bwl, height, tx_type); - int eob_ctx = get_eob_ctx(tcoeff, scan[c], txsize_ctx, tx_type); - - if (c == seg_eob - 1) break; - - ++(*nz_map_count)[coeff_ctx][is_nz]; -#if LV_MAP_PROB - update_bin(fc->nz_map_cdf[txsize_ctx][plane_type][coeff_ctx], is_nz, 2); -#endif - - if (is_nz) { - ++counts->eob_flag[txsize_ctx][plane_type][eob_ctx][c == (eob - 1)]; -#if LV_MAP_PROB - update_bin(fc->eob_flag_cdf[txsize_ctx][plane_type][eob_ctx], - c == (eob - 1), 2); -#endif - } - } + const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, + tx_size, cm->reduced_tx_set_used); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + const int cul_level = av1_get_txb_entropy_context(qcoeff, scan_order, eob); + av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, blk_col, + blk_row); } -#if CONFIG_CTX1D -static INLINE void av1_update_nz_eob_counts_vert( - FRAME_CONTEXT *fc, FRAME_COUNTS *counts, uint16_t eob, - const tran_low_t *tcoeff, int plane, TX_SIZE tx_size, TX_TYPE tx_type, - const int16_t *scan, const int16_t *iscan) { - (void)eob; - const TX_SIZE txs_ctx = get_txsize_context(tx_size); - const PLANE_TYPE plane_type = get_plane_type(plane); - const TX_CLASS tx_class = get_tx_class(tx_type); - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int width = tx_size_wide[tx_size]; - const int height = tx_size_high[tx_size]; - int16_t eob_ls[MAX_HVTX_SIZE]; - get_eob_vert(eob_ls, tcoeff, width, height); - unsigned int(*nz_map_count)[SIG_COEF_CONTEXTS][2] = - &counts->nz_map[txs_ctx][plane_type]; - for (int c = 0; c < width; ++c) { - int16_t veob = eob_ls[c]; - assert(veob <= height); - int el_ctx = get_empty_line_ctx(c, eob_ls); - ++counts->empty_line[txs_ctx][plane_type][tx_class][el_ctx][veob == 0]; -#if LV_MAP_PROB - update_bin(fc->empty_line_cdf[txs_ctx][plane_type][tx_class][el_ctx], - veob == 0, 2); -#endif - if (veob) { - for (int r = 0; r < veob; ++r) { - if (r + 1 != height) { - int coeff_idx = r * width + c; - int scan_idx = iscan[coeff_idx]; - int is_nz = tcoeff[coeff_idx] != 0; - int coeff_ctx = - get_nz_map_ctx(tcoeff, scan_idx, scan, bwl, height, tx_type); - ++(*nz_map_count)[coeff_ctx][is_nz]; -#if LV_MAP_PROB - update_bin(fc->nz_map_cdf[txs_ctx][plane_type][coeff_ctx], is_nz, 2); -#endif - if (is_nz) { - int eob_ctx = get_hv_eob_ctx(c, r, eob_ls); - ++counts->hv_eob[txs_ctx][plane_type][tx_class][eob_ctx] - [r == veob - 1]; -#if LV_MAP_PROB - update_bin(fc->hv_eob_cdf[txs_ctx][plane_type][tx_class][eob_ctx], - r == veob - 1, 2); -#endif - } +static void update_tx_type_count(const AV1_COMMON *cm, MACROBLOCKD *xd, + int blk_row, int blk_col, int plane, + TX_SIZE tx_size, FRAME_COUNTS *counts, + uint8_t allow_update_cdf) { + MB_MODE_INFO *mbmi = xd->mi[0]; + int is_inter = is_inter_block(mbmi); + FRAME_CONTEXT *fc = xd->tile_ctx; +#if !CONFIG_ENTROPY_STATS + (void)counts; +#endif // !CONFIG_ENTROPY_STATS + + // Only y plane's tx_type is updated + if (plane > 0) return; + TX_TYPE tx_type = av1_get_tx_type(PLANE_TYPE_Y, xd, blk_row, blk_col, tx_size, + cm->reduced_tx_set_used); + if (get_ext_tx_types(tx_size, is_inter, cm->reduced_tx_set_used) > 1 && + cm->base_qindex > 0 && !mbmi->skip && + 
!segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + const int eset = get_ext_tx_set(tx_size, is_inter, cm->reduced_tx_set_used); + if (eset > 0) { + const TxSetType tx_set_type = + av1_get_ext_tx_set_type(tx_size, is_inter, cm->reduced_tx_set_used); + if (is_inter) { + if (allow_update_cdf) { + update_cdf(fc->inter_ext_tx_cdf[eset][txsize_sqr_map[tx_size]], + av1_ext_tx_ind[tx_set_type][tx_type], + av1_num_ext_tx_set[tx_set_type]); } - } - } - } -} - -static INLINE void av1_update_nz_eob_counts_horiz( - FRAME_CONTEXT *fc, FRAME_COUNTS *counts, uint16_t eob, - const tran_low_t *tcoeff, int plane, TX_SIZE tx_size, TX_TYPE tx_type, - const int16_t *scan, const int16_t *iscan) { - (void)eob; - (void)scan; - const TX_SIZE txs_ctx = get_txsize_context(tx_size); - const PLANE_TYPE plane_type = get_plane_type(plane); - const TX_CLASS tx_class = get_tx_class(tx_type); - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int width = tx_size_wide[tx_size]; - const int height = tx_size_high[tx_size]; - int16_t eob_ls[MAX_HVTX_SIZE]; - get_eob_horiz(eob_ls, tcoeff, width, height); - unsigned int(*nz_map_count)[SIG_COEF_CONTEXTS][2] = - &counts->nz_map[txs_ctx][plane_type]; - for (int r = 0; r < height; ++r) { - int16_t heob = eob_ls[r]; - int el_ctx = get_empty_line_ctx(r, eob_ls); - ++counts->empty_line[txs_ctx][plane_type][tx_class][el_ctx][heob == 0]; -#if LV_MAP_PROB - update_bin(fc->empty_line_cdf[txs_ctx][plane_type][tx_class][el_ctx], - heob == 0, 2); -#endif - if (heob) { - for (int c = 0; c < heob; ++c) { - if (c + 1 != width) { - int coeff_idx = r * width + c; - int scan_idx = iscan[coeff_idx]; - int is_nz = tcoeff[coeff_idx] != 0; - int coeff_ctx = - get_nz_map_ctx(tcoeff, scan_idx, scan, bwl, height, tx_type); - ++(*nz_map_count)[coeff_ctx][is_nz]; -#if LV_MAP_PROB - update_bin(fc->nz_map_cdf[txs_ctx][plane_type][coeff_ctx], is_nz, 2); -#endif - if (is_nz) { - int eob_ctx = get_hv_eob_ctx(r, c, eob_ls); - ++counts->hv_eob[txs_ctx][plane_type][tx_class][eob_ctx] - [c == heob - 1]; -#if LV_MAP_PROB - update_bin(fc->hv_eob_cdf[txs_ctx][plane_type][tx_class][eob_ctx], - c == heob - 1, 2); -#endif - } +#if CONFIG_ENTROPY_STATS + ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]] + [av1_ext_tx_ind[tx_set_type][tx_type]]; +#endif // CONFIG_ENTROPY_STATS + } else { + PREDICTION_MODE intra_dir; + if (mbmi->filter_intra_mode_info.use_filter_intra) + intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info + .filter_intra_mode]; + else + intra_dir = mbmi->mode; +#if CONFIG_ENTROPY_STATS + ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][intra_dir] + [av1_ext_tx_ind[tx_set_type][tx_type]]; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) { + update_cdf( + fc->intra_ext_tx_cdf[eset][txsize_sqr_map[tx_size]][intra_dir], + av1_ext_tx_ind[tx_set_type][tx_type], + av1_num_ext_tx_set[tx_set_type]); } } } } } -#endif // CONFIG_CTX1D void av1_update_and_record_txb_context(int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, @@ -2154,461 +1869,164 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row, MACROBLOCKD *const xd = &x->e_mbd; struct macroblock_plane *p = &x->plane[plane]; struct macroblockd_plane *pd = &xd->plane[plane]; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - int eob = p->eobs[block], update_eob = 0; - const PLANE_TYPE plane_type = pd->plane_type; - const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); - tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block); - const int segment_id = 
mbmi->segment_id; - const TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); - const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi); - const int16_t *scan = scan_order->scan; - const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size); - int c, i; + MB_MODE_INFO *mbmi = xd->mi[0]; + const int eob = p->eobs[block]; TXB_CTX txb_ctx; get_txb_ctx(plane_bsize, tx_size, plane, pd->above_context + blk_col, pd->left_context + blk_row, &txb_ctx); - const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int height = tx_size_high[tx_size]; - int cul_level = 0; - - TX_SIZE txsize_ctx = get_txsize_context(tx_size); + const int bwl = get_txb_bwl(tx_size); + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + const uint8_t allow_update_cdf = args->allow_update_cdf; + const TX_SIZE txsize_ctx = get_txsize_entropy_ctx(tx_size); FRAME_CONTEXT *ec_ctx = xd->tile_ctx; +#if CONFIG_ENTROPY_STATS + int cdf_idx = cm->coef_cdf_category; +#endif // CONFIG_ENTROPY_STATS + +#if CONFIG_ENTROPY_STATS + ++td->counts->txb_skip[cdf_idx][txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0]; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) { + update_cdf(ec_ctx->txb_skip_cdf[txsize_ctx][txb_ctx.txb_skip_ctx], eob == 0, + 2); + } - memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob); - - ++td->counts->txb_skip[txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0]; -#if LV_MAP_PROB - update_bin(ec_ctx->txb_skip_cdf[txsize_ctx][txb_ctx.txb_skip_ctx], eob == 0, - 2); -#endif x->mbmi_ext->txb_skip_ctx[plane][block] = txb_ctx.txb_skip_ctx; - x->mbmi_ext->eobs[plane][block] = eob; if (eob == 0) { - av1_set_contexts(xd, pd, plane, tx_size, 0, blk_col, blk_row); + av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col, blk_row); return; } -#if CONFIG_TXK_SEL - av1_update_tx_type_count(cm, xd, blk_row, blk_col, block, plane, - mbmi->sb_type, get_min_tx_size(tx_size), td->counts); -#endif - -#if CONFIG_CTX1D - TX_CLASS tx_class = get_tx_class(tx_type); - if (tx_class == TX_CLASS_2D) { - av1_update_nz_eob_counts(ec_ctx, td->counts, eob, tcoeff, plane, tx_size, - tx_type, scan); - } else { - const int width = tx_size_wide[tx_size]; - const int eob_offset = width + height; - const int eob_mode = eob > eob_offset; - const TX_SIZE txs_ctx = get_txsize_context(tx_size); - ++td->counts->eob_mode[txs_ctx][plane_type][tx_class][eob_mode]; -#if LV_MAP_PROB - update_bin(ec_ctx->eob_mode_cdf[txs_ctx][plane_type][tx_class], eob_mode, - 2); -#endif - if (eob_mode == 0) { - av1_update_nz_eob_counts(ec_ctx, td->counts, eob, tcoeff, plane, tx_size, - tx_type, scan); - } else { - const int16_t *iscan = scan_order->iscan; - assert(tx_class == TX_CLASS_VERT || tx_class == TX_CLASS_HORIZ); - if (tx_class == TX_CLASS_VERT) - av1_update_nz_eob_counts_vert(ec_ctx, td->counts, eob, tcoeff, plane, - tx_size, tx_type, scan, iscan); - else - av1_update_nz_eob_counts_horiz(ec_ctx, td->counts, eob, tcoeff, plane, - tx_size, tx_type, scan, iscan); - } - } -#else // CONFIG_CTX1D - av1_update_nz_eob_counts(ec_ctx, td->counts, eob, tcoeff, plane, tx_size, - tx_type, scan); -#endif // CONFIG_CTX1D - - // Reverse process order to handle coefficient level and sign. 
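// (The removed pass below walked the block once per base level; its
// replacement further down makes a single reverse scan, updating the
// coeff_base_eob/coeff_base CDFs per position and the base-range CDF in
// chunks of BR_CDF_SIZE - 1 symbols.)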
- for (i = 0; i < NUM_BASE_LEVELS; ++i) { - update_eob = 0; - for (c = eob - 1; c >= 0; --c) { - tran_low_t v = qcoeff[scan[c]]; - tran_low_t level = abs(v); - int ctx; - - if (level <= i) continue; - - ctx = get_base_ctx(tcoeff, scan[c], bwl, height, i + 1); + tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block); + const int segment_id = mbmi->segment_id; + const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size); + const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); + memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob); - if (level == i + 1) { - ++td->counts->coeff_base[txsize_ctx][plane_type][i][ctx][1]; -#if LV_MAP_PROB - update_bin(ec_ctx->coeff_base_cdf[txsize_ctx][plane_type][i][ctx], 1, - 2); -#endif - if (c == 0) { - int dc_sign_ctx = txb_ctx.dc_sign_ctx; + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, width); + av1_txb_init_levels(tcoeff, width, height, levels); + update_tx_type_count(cm, xd, blk_row, blk_col, plane, tx_size, td->counts, + allow_update_cdf); - ++td->counts->dc_sign[plane_type][dc_sign_ctx][v < 0]; -#if LV_MAP_PROB - update_bin(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], v < 0, 2); -#endif - x->mbmi_ext->dc_sign_ctx[plane][block] = dc_sign_ctx; - } - cul_level += level; - continue; - } - ++td->counts->coeff_base[txsize_ctx][plane_type][i][ctx][0]; -#if LV_MAP_PROB - update_bin(ec_ctx->coeff_base_cdf[txsize_ctx][plane_type][i][ctx], 0, 2); + const PLANE_TYPE plane_type = pd->plane_type; + const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, + tx_size, cm->reduced_tx_set_used); + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + const int16_t *const scan = scan_order->scan; +#if CONFIG_ENTROPY_STATS + av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx, + td->counts, allow_update_cdf); +#else + av1_update_eob_context(eob, tx_size, tx_class, plane_type, ec_ctx, + allow_update_cdf); #endif - update_eob = AOMMAX(update_eob, c); - } - } - - for (c = update_eob; c >= 0; --c) { - tran_low_t v = qcoeff[scan[c]]; - tran_low_t level = abs(v); - int idx; - int ctx; - if (level <= NUM_BASE_LEVELS) continue; + DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); + av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts); - cul_level += level; - if (c == 0) { - int dc_sign_ctx = txb_ctx.dc_sign_ctx; + for (int c = eob - 1; c >= 0; --c) { + const int pos = scan[c]; + const int coeff_ctx = coeff_contexts[pos]; + const tran_low_t v = qcoeff[pos]; + const tran_low_t level = abs(v); - ++td->counts->dc_sign[plane_type][dc_sign_ctx][v < 0]; -#if LV_MAP_PROB - update_bin(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], v < 0, 2); -#endif - x->mbmi_ext->dc_sign_ctx[plane][block] = dc_sign_ctx; + if (allow_update_cdf) { + if (c == eob - 1) { + assert(coeff_ctx < 4); + update_cdf( + ec_ctx->coeff_base_eob_cdf[txsize_ctx][plane_type][coeff_ctx], + AOMMIN(level, 3) - 1, 3); + } else { + update_cdf(ec_ctx->coeff_base_cdf[txsize_ctx][plane_type][coeff_ctx], + AOMMIN(level, 3), 4); + } } - - // level is above 1. - ctx = get_br_ctx(tcoeff, scan[c], bwl, height); - -#if BR_NODE - int base_range = level - 1 - NUM_BASE_LEVELS; - int br_set_idx = base_range < COEFF_BASE_RANGE - ? 
coeff_to_br_index[base_range] - : BASE_RANGE_SETS; - - for (idx = 0; idx < BASE_RANGE_SETS; ++idx) { - if (idx == br_set_idx) { - int br_base = br_index_to_coeff[br_set_idx]; - int br_offset = base_range - br_base; - ++td->counts->coeff_br[txsize_ctx][plane_type][idx][ctx][1]; -#if LV_MAP_PROB - update_bin(ec_ctx->coeff_br_cdf[txsize_ctx][plane_type][idx][ctx], 1, - 2); -#endif - int extra_bits = (1 << br_extra_bits[idx]) - 1; - for (int tok = 0; tok < extra_bits; ++tok) { - if (br_offset == tok) { - ++td->counts->coeff_lps[txsize_ctx][plane_type][ctx][1]; -#if LV_MAP_PROB - update_bin(ec_ctx->coeff_lps_cdf[txsize_ctx][plane_type][ctx], 1, - 2); -#endif - break; - } - ++td->counts->coeff_lps[txsize_ctx][plane_type][ctx][0]; -#if LV_MAP_PROB - update_bin(ec_ctx->coeff_lps_cdf[txsize_ctx][plane_type][ctx], 0, 2); + { + if (c == eob - 1) { + assert(coeff_ctx < 4); +#if CONFIG_ENTROPY_STATS + ++td->counts->coeff_base_eob_multi[cdf_idx][txsize_ctx][plane_type] + [coeff_ctx][AOMMIN(level, 3) - 1]; + } else { + ++td->counts->coeff_base_multi[cdf_idx][txsize_ctx][plane_type] + [coeff_ctx][AOMMIN(level, 3)]; #endif - } - break; } - ++td->counts->coeff_br[txsize_ctx][plane_type][idx][ctx][0]; -#if LV_MAP_PROB - update_bin(ec_ctx->coeff_br_cdf[txsize_ctx][plane_type][idx][ctx], 0, 2); -#endif } -#else // BR_NODE - for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) { - if (level == (idx + 1 + NUM_BASE_LEVELS)) { - ++td->counts->coeff_lps[txsize_ctx][plane_type][ctx][1]; -#if LV_MAP_PROB - update_bin(ec_ctx->coeff_lps_cdf[txsize_ctx][plane_type][ctx], 1, 2); + if (level > NUM_BASE_LEVELS) { + const int base_range = level - 1 - NUM_BASE_LEVELS; + const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class); + for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { + const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1); + if (allow_update_cdf) { + update_cdf(ec_ctx->coeff_br_cdf[AOMMIN(txsize_ctx, TX_32X32)] + [plane_type][br_ctx], + k, BR_CDF_SIZE); + } + for (int lps = 0; lps < BR_CDF_SIZE - 1; lps++) { +#if CONFIG_ENTROPY_STATS + ++td->counts->coeff_lps[AOMMIN(txsize_ctx, TX_32X32)][plane_type][lps] + [br_ctx][lps == k]; +#endif // CONFIG_ENTROPY_STATS + if (lps == k) break; + } +#if CONFIG_ENTROPY_STATS + ++td->counts->coeff_lps_multi[cdf_idx][AOMMIN(txsize_ctx, TX_32X32)] + [plane_type][br_ctx][k]; #endif - break; + if (k < BR_CDF_SIZE - 1) break; } - ++td->counts->coeff_lps[txsize_ctx][plane_type][ctx][0]; -#if LV_MAP_PROB - update_bin(ec_ctx->coeff_lps_cdf[txsize_ctx][plane_type][ctx], 0, 2); -#endif } - if (idx < COEFF_BASE_RANGE) continue; -#endif // BR_NODE - // use 0-th order Golomb code to handle the residual level. } - cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level); + // Update the context needed to code the DC sign (if applicable) + if (tcoeff[0] != 0) { + const int dc_sign = (tcoeff[0] < 0) ? 1 : 0; + const int dc_sign_ctx = txb_ctx.dc_sign_ctx; +#if CONFIG_ENTROPY_STATS + ++td->counts->dc_sign[plane_type][dc_sign_ctx][dc_sign]; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) + update_cdf(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], dc_sign, 2); + x->mbmi_ext->dc_sign_ctx[plane][block] = dc_sign_ctx; + } - // DC value - set_dc_sign(&cul_level, tcoeff[0]); - av1_set_contexts(xd, pd, plane, tx_size, cul_level, blk_col, blk_row); - -#if CONFIG_ADAPT_SCAN - // Since dqcoeff is not available here, we pass qcoeff into - // av1_update_scan_count_facade(). 
The update behavior should be the same - // because av1_update_scan_count_facade() only cares if coefficients are zero - // or not. - av1_update_scan_count_facade((AV1_COMMON *)cm, td->counts, tx_size, tx_type, - qcoeff, eob); -#endif + const int cul_level = av1_get_txb_entropy_context(tcoeff, scan_order, eob); + av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, blk_col, + blk_row); } void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td, RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate, - int mi_row, int mi_col) { + int mi_row, int mi_col, uint8_t allow_update_cdf) { const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const int ctx = av1_get_skip_context(xd); - const int skip_inc = - !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); - struct tokenize_b_args arg = { cpi, td, NULL, 0 }; + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct tokenize_b_args arg = { cpi, td, NULL, 0, allow_update_cdf }; (void)rate; (void)mi_row; (void)mi_col; if (mbmi->skip) { - if (!dry_run) td->counts->skip[ctx][1] += skip_inc; - av1_reset_skip_context(xd, mi_row, mi_col, bsize); + av1_reset_skip_context(xd, mi_row, mi_col, bsize, num_planes); return; } if (!dry_run) { - td->counts->skip[ctx][0] += skip_inc; av1_foreach_transformed_block(xd, bsize, mi_row, mi_col, - av1_update_and_record_txb_context, &arg); + av1_update_and_record_txb_context, &arg, + num_planes); } else if (dry_run == DRY_RUN_NORMAL) { av1_foreach_transformed_block(xd, bsize, mi_row, mi_col, - av1_update_txb_context_b, &arg); + av1_update_txb_context_b, &arg, num_planes); } else { printf("DRY_RUN_COSTCOEFFS is not supported yet\n"); assert(0); } } - -static void find_new_prob(unsigned int *branch_cnt, aom_prob *oldp, - int *savings, int *update, aom_writer *const bc) { - const aom_prob upd = DIFF_UPDATE_PROB; - int u = 0; - aom_prob newp = get_binary_prob(branch_cnt[0], branch_cnt[1]); - int s = av1_prob_diff_update_savings_search(branch_cnt, *oldp, &newp, upd, 1); - - if (s > 0 && newp != *oldp) u = 1; - - if (u) - *savings += s - (int)(av1_cost_zero(upd)); // TODO(jingning): 1? 
- else - *savings -= (int)(av1_cost_zero(upd)); - - if (update) { - ++update[u]; - return; - } - - aom_write(bc, u, upd); - if (u) { - /* send/use new probability */ - av1_write_prob_diff_update(bc, newp, *oldp); - *oldp = newp; - } -} - -static void write_txb_probs(aom_writer *const bc, AV1_COMP *cpi, - TX_SIZE tx_size) { - FRAME_CONTEXT *fc = cpi->common.fc; - FRAME_COUNTS *counts = cpi->td.counts; - int savings = 0; - int update[2] = { 0, 0 }; - int plane, ctx, level; - - for (ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx) { - find_new_prob(counts->txb_skip[tx_size][ctx], &fc->txb_skip[tx_size][ctx], - &savings, update, bc); - } - - for (plane = 0; plane < PLANE_TYPES; ++plane) { - for (ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) { - find_new_prob(counts->nz_map[tx_size][plane][ctx], - &fc->nz_map[tx_size][plane][ctx], &savings, update, bc); - } - } - - for (plane = 0; plane < PLANE_TYPES; ++plane) { - for (ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx) { - find_new_prob(counts->eob_flag[tx_size][plane][ctx], - &fc->eob_flag[tx_size][plane][ctx], &savings, update, bc); - } - } - - for (level = 0; level < NUM_BASE_LEVELS; ++level) { - for (plane = 0; plane < PLANE_TYPES; ++plane) { - for (ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx) { - find_new_prob(counts->coeff_base[tx_size][plane][level][ctx], - &fc->coeff_base[tx_size][plane][level][ctx], &savings, - update, bc); - } - } - } - - for (plane = 0; plane < PLANE_TYPES; ++plane) { - for (ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) { - find_new_prob(counts->coeff_lps[tx_size][plane][ctx], - &fc->coeff_lps[tx_size][plane][ctx], &savings, update, bc); - } - } - - // Decide if to update the model for this tx_size - if (update[1] == 0 || savings < 0) { - aom_write_bit(bc, 0); - return; - } - aom_write_bit(bc, 1); - - for (ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx) { - find_new_prob(counts->txb_skip[tx_size][ctx], &fc->txb_skip[tx_size][ctx], - &savings, NULL, bc); - } - - for (plane = 0; plane < PLANE_TYPES; ++plane) { - for (ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) { - find_new_prob(counts->nz_map[tx_size][plane][ctx], - &fc->nz_map[tx_size][plane][ctx], &savings, NULL, bc); - } - } - - for (plane = 0; plane < PLANE_TYPES; ++plane) { - for (ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx) { - find_new_prob(counts->eob_flag[tx_size][plane][ctx], - &fc->eob_flag[tx_size][plane][ctx], &savings, NULL, bc); - } - } - - for (level = 0; level < NUM_BASE_LEVELS; ++level) { - for (plane = 0; plane < PLANE_TYPES; ++plane) { - for (ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx) { - find_new_prob(counts->coeff_base[tx_size][plane][level][ctx], - &fc->coeff_base[tx_size][plane][level][ctx], &savings, - NULL, bc); - } - } - } - - for (plane = 0; plane < PLANE_TYPES; ++plane) { - for (ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) { - find_new_prob(counts->coeff_lps[tx_size][plane][ctx], - &fc->coeff_lps[tx_size][plane][ctx], &savings, NULL, bc); - } - } -} - -void av1_write_txb_probs(AV1_COMP *cpi, aom_writer *w) { - const TX_MODE tx_mode = cpi->common.tx_mode; - const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; - TX_SIZE tx_size; - int ctx, plane; - -#if LV_MAP_PROB - return; -#endif - - for (plane = 0; plane < PLANE_TYPES; ++plane) - for (ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx) - av1_cond_prob_diff_update(w, &cpi->common.fc->dc_sign[plane][ctx], - cpi->td.counts->dc_sign[plane][ctx], 1); - - for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) - write_txb_probs(w, cpi, tx_size); -} - -#if CONFIG_TXK_SEL -int64_t av1_search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, 
- int block, int blk_row, int blk_col,
- BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
- const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l,
- int use_fast_coef_costing, RD_STATS *rd_stats) {
- const AV1_COMMON *cm = &cpi->common;
- MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
- TX_TYPE txk_start = DCT_DCT;
- TX_TYPE txk_end = TX_TYPES - 1;
- TX_TYPE best_tx_type = txk_start;
- int64_t best_rd = INT64_MAX;
- uint8_t best_eob = 0;
- const int coeff_ctx = combine_entropy_contexts(*a, *l);
- RD_STATS best_rd_stats;
- TX_TYPE tx_type;
-
- av1_invalid_rd_stats(&best_rd_stats);
-
- for (tx_type = txk_start; tx_type <= txk_end; ++tx_type) {
- if (plane == 0) mbmi->txk_type[(blk_row << 4) + blk_col] = tx_type;
- TX_TYPE ref_tx_type = av1_get_tx_type(get_plane_type(plane), xd, blk_row,
- blk_col, block, tx_size);
- if (tx_type != ref_tx_type) {
- // Use av1_get_tx_type() to check if the tx_type is valid for the current
- // mode; if it's not, we skip it here.
- continue;
- }
-
-#if CONFIG_EXT_TX
- const int is_inter = is_inter_block(mbmi);
- const TxSetType tx_set_type =
- get_ext_tx_set_type(get_min_tx_size(tx_size), mbmi->sb_type, is_inter,
- cm->reduced_tx_set_used);
- if (!av1_ext_tx_used[tx_set_type][tx_type]) continue;
-#endif // CONFIG_EXT_TX
-
- RD_STATS this_rd_stats;
- av1_invalid_rd_stats(&this_rd_stats);
- av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
- coeff_ctx, AV1_XFORM_QUANT_FP);
- av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size,
- a, l, 1);
- av1_dist_block(cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size,
- &this_rd_stats.dist, &this_rd_stats.sse,
- OUTPUT_HAS_PREDICTED_PIXELS);
- const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, mbmi);
- this_rd_stats.rate =
- av1_cost_coeffs(cpi, x, plane, blk_row, blk_col, block, tx_size,
- scan_order, a, l, use_fast_coef_costing);
- int rd = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
-
- if (rd < best_rd) {
- best_rd = rd;
- best_rd_stats = this_rd_stats;
- best_tx_type = tx_type;
- best_eob = x->plane[plane].txb_entropy_ctx[block];
- }
- }
-
- av1_merge_rd_stats(rd_stats, &best_rd_stats);
-
- if (best_eob == 0 && is_inter_block(mbmi)) best_tx_type = DCT_DCT;
-
- if (plane == 0) mbmi->txk_type[(blk_row << 4) + blk_col] = best_tx_type;
- x->plane[plane].txb_entropy_ctx[block] = best_eob;
-
- if (!is_inter_block(mbmi)) {
- // Intra mode needs the decoded result so that the next transform block
- // can use it for prediction.
- av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
- coeff_ctx, AV1_XFORM_QUANT_FP);
- av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size,
- a, l, 1);
-
- av1_inverse_transform_block_facade(xd, plane, block, blk_row, blk_col,
- x->plane[plane].eobs[block]);
- }
- return best_rd;
-}
-#endif // CONFIG_TXK_SEL
diff --git a/third_party/aom/av1/encoder/encodetxb.h b/third_party/aom/av1/encoder/encodetxb.h index 76a04bb41..aa847ad62 100644
--- a/third_party/aom/av1/encoder/encodetxb.h
+++ b/third_party/aom/av1/encoder/encodetxb.h
@@ -12,7 +12,8 @@
 #ifndef ENCODETXB_H_
 #define ENCODETXB_H_
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "av1/common/blockd.h"
 #include "av1/common/onyxc_int.h"
 #include "av1/common/txb_common.h"
@@ -25,6 +26,7 @@ extern "C" {
 typedef struct TxbInfo {
 tran_low_t *qcoeff;
+ uint8_t *levels; // absolute values, clamped to 255.
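 // This points at the padded working copy built by av1_txb_init_levels()
 // from a TX_PAD_2D-sized stack buffer (see av1_update_and_record_txb_context
 // above); the padding lets the context derivation read right/below
 // neighbours without per-sample bounds checks.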
tran_low_t *dqcoeff; const tran_low_t *tcoeff; const int16_t *dequant; @@ -33,7 +35,7 @@ typedef struct TxbInfo { TX_SIZE txs_ctx; TX_TYPE tx_type; int bwl; - int stride; + int width; int height; int eob; int seg_eob; @@ -41,51 +43,27 @@ typedef struct TxbInfo { TXB_CTX *txb_ctx; int64_t rdmult; const LV_MAP_CTX_TABLE *coeff_ctx_table; + const qm_val_t *iqmatrix; + int tx_type_cost; } TxbInfo; -typedef struct TxbCache { - int nz_count_arr[MAX_TX_SQUARE]; - int nz_ctx_arr[MAX_TX_SQUARE]; - int base_count_arr[NUM_BASE_LEVELS][MAX_TX_SQUARE]; - int base_mag_arr[MAX_TX_SQUARE] - [2]; // [0]: max magnitude [1]: num of max magnitude - int base_ctx_arr[NUM_BASE_LEVELS][MAX_TX_SQUARE]; - - int br_count_arr[MAX_TX_SQUARE]; - int br_mag_arr[MAX_TX_SQUARE] - [2]; // [0]: max magnitude [1]: num of max magnitude - int br_ctx_arr[MAX_TX_SQUARE]; -} TxbCache; - -typedef struct TxbProbs { - const aom_prob *dc_sign_prob; - const aom_prob *nz_map; - aom_prob (*coeff_base)[COEFF_BASE_CONTEXTS]; - const aom_prob *coeff_lps; - const aom_prob *eob_flag; - const aom_prob *txb_skip; -#if BR_NODE - const aom_prob *coeff_br; -#endif -} TxbProbs; - void av1_alloc_txb_buf(AV1_COMP *cpi); void av1_free_txb_buf(AV1_COMP *cpi); -int av1_cost_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *x, int plane, - int blk_row, int blk_col, int block, TX_SIZE tx_size, - TXB_CTX *txb_ctx); +int av1_cost_coeffs_txb(const AV1_COMMON *const cm, const MACROBLOCK *x, + const int plane, const int blk_row, const int blk_col, + const int block, const TX_SIZE tx_size, + const TXB_CTX *const txb_ctx); void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd, - aom_writer *w, int blk_row, int blk_col, int block, - int plane, TX_SIZE tx_size, const tran_low_t *tcoeff, + aom_writer *w, int blk_row, int blk_col, int plane, + TX_SIZE tx_size, const tran_low_t *tcoeff, uint16_t eob, TXB_CTX *txb_ctx); -void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, - aom_writer *w, int plane); +void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, int mi_row, + int mi_col, aom_writer *w, BLOCK_SIZE bsize); int av1_get_txb_entropy_context(const tran_low_t *qcoeff, const SCAN_ORDER *scan_order, int eob); void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td, RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate, - const int mi_row, const int mi_col); -void av1_write_txb_probs(AV1_COMP *cpi, aom_writer *w); + int mi_row, int mi_col, uint8_t allow_update_cdf); void av1_update_txb_context_b(int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, @@ -98,16 +76,10 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row, void av1_set_coeff_buffer(const AV1_COMP *const cpi, MACROBLOCK *const x, int mi_row, int mi_col); -#if CONFIG_TXK_SEL -int64_t av1_search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, - int block, int blk_row, int blk_col, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l, - int use_fast_coef_costing, RD_STATS *rd_stats); -#endif -int av1_optimize_txb(const AV1_COMMON *cm, MACROBLOCK *x, int plane, - int blk_row, int blk_col, int block, TX_SIZE tx_size, - TXB_CTX *txb_ctx, int fast_mode); +void hbt_destroy(); +int av1_optimize_txb_new(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, TX_TYPE tx_type, + const TXB_CTX *txb_ctx, int *rate_cost, int sharpness); #ifdef __cplusplus } #endif diff --git a/third_party/aom/av1/encoder/ethread.c 
b/third_party/aom/av1/encoder/ethread.c index edc9b1d61..404af2e7c 100644 --- a/third_party/aom/av1/encoder/ethread.c +++ b/third_party/aom/av1/encoder/ethread.c @@ -18,15 +18,13 @@ static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { for (int i = 0; i < REFERENCE_MODES; i++) td->rd_counts.comp_pred_diff[i] += td_t->rd_counts.comp_pred_diff[i]; -#if CONFIG_GLOBAL_MOTION - for (int i = 0; i < TOTAL_REFS_PER_FRAME; i++) + for (int i = 0; i < REF_FRAMES; i++) td->rd_counts.global_motion_used[i] += td_t->rd_counts.global_motion_used[i]; -#endif // CONFIG_GLOBAL_MOTION td->rd_counts.compound_ref_used_flag |= td_t->rd_counts.compound_ref_used_flag; - td->rd_counts.single_ref_used_flag |= td_t->rd_counts.single_ref_used_flag; + td->rd_counts.skip_mode_used_flag |= td_t->rd_counts.skip_mode_used_flag; } static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) { @@ -53,7 +51,7 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; const int tile_cols = cm->tile_cols; const AVxWorkerInterface *const winterface = aom_get_worker_interface(); - const int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols); + int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols); int i; av1_init_tile_data(cpi); @@ -81,29 +79,19 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) { aom_memalign(32, sizeof(*thread_data->td))); av1_zero(*thread_data->td); -// Set up pc_tree. -#if !CONFIG_CB4X4 - thread_data->td->leaf_tree = NULL; -#endif + // Set up pc_tree. thread_data->td->pc_tree = NULL; av1_setup_pc_tree(cm, thread_data->td); -#if CONFIG_MOTION_VAR -#if CONFIG_HIGHBITDEPTH - int buf_scaler = 2; -#else - int buf_scaler = 1; -#endif CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf, (uint8_t *)aom_memalign( - 16, - buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE * - sizeof(*thread_data->td->above_pred_buf))); + 16, MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*thread_data->td->above_pred_buf))); CHECK_MEM_ERROR(cm, thread_data->td->left_pred_buf, (uint8_t *)aom_memalign( - 16, - buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE * - sizeof(*thread_data->td->left_pred_buf))); + 16, MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*thread_data->td->left_pred_buf))); + CHECK_MEM_ERROR( cm, thread_data->td->wsrc_buf, (int32_t *)aom_memalign( @@ -112,7 +100,6 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) { cm, thread_data->td->mask_buf, (int32_t *)aom_memalign( 16, MAX_SB_SQUARE * sizeof(*thread_data->td->mask_buf))); -#endif // Allocate frame counters in thread data. 
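 // Each worker gets a private FRAME_COUNTS so tiles can update symbol
 // statistics without locking; the copy is seeded from cpi->counts before
 // encoding and folded back in via av1_accumulate_frame_counts() once the
 // workers finish (both steps appear further down in this function).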
CHECK_MEM_ERROR(cm, thread_data->td->counts,
 aom_calloc(1, sizeof(*thread_data->td->counts)));
@@ -133,6 +120,8 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) {
 winterface->sync(worker);
 }
+ } else {
+ num_workers = AOMMIN(num_workers, cpi->num_workers);
 }
 for (i = 0; i < num_workers; i++) {
@@ -148,16 +137,13 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) {
 if (thread_data->td != &cpi->td) {
 thread_data->td->mb = cpi->td.mb;
 thread_data->td->rd_counts = cpi->td.rd_counts;
-#if CONFIG_MOTION_VAR
 thread_data->td->mb.above_pred_buf = thread_data->td->above_pred_buf;
 thread_data->td->mb.left_pred_buf = thread_data->td->left_pred_buf;
 thread_data->td->mb.wsrc_buf = thread_data->td->wsrc_buf;
 thread_data->td->mb.mask_buf = thread_data->td->mask_buf;
-#endif
 }
- if (thread_data->td->counts != &cpi->common.counts) {
- memcpy(thread_data->td->counts, &cpi->common.counts,
- sizeof(cpi->common.counts));
+ if (thread_data->td->counts != &cpi->counts) {
+ memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts));
 }
 if (i < num_workers - 1)
@@ -187,14 +173,24 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) {
 for (i = 0; i < num_workers; i++) {
 AVxWorker *const worker = &cpi->workers[i];
 EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
-
+ cpi->intrabc_used |= thread_data->td->intrabc_used_this_tile;
 // Accumulate counters.
 if (i < cpi->num_workers - 1) {
- av1_accumulate_frame_counts(&cm->counts, thread_data->td->counts);
+ av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts);
 accumulate_rd_opt(&cpi->td, thread_data->td);
-#if CONFIG_VAR_TX
 cpi->td.mb.txb_split_count += thread_data->td->mb.txb_split_count;
-#endif
 }
 }
 }
+
+// Accumulate frame counts. FRAME_COUNTS consists solely of 'unsigned int'
+// members, so we treat it as an array, and sum over the whole length.
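// The flat summation below is only safe while that assumption holds. A
// compile-time guard in this spirit (hypothetical, not part of this commit)
// would catch a layout that breaks it:
//
//   typedef char frame_counts_layout_check
//       [(sizeof(FRAME_COUNTS) % sizeof(unsigned int) == 0) ? 1 : -1];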
+void av1_accumulate_frame_counts(FRAME_COUNTS *acc_counts, + const FRAME_COUNTS *counts) { + unsigned int *const acc = (unsigned int *)acc_counts; + const unsigned int *const cnt = (const unsigned int *)counts; + + const unsigned int n_counts = sizeof(FRAME_COUNTS) / sizeof(unsigned int); + + for (unsigned int i = 0; i < n_counts; i++) acc[i] += cnt[i]; +} diff --git a/third_party/aom/av1/encoder/ethread.h b/third_party/aom/av1/encoder/ethread.h index 6c30a3e5c..b6b1fed4e 100644 --- a/third_party/aom/av1/encoder/ethread.h +++ b/third_party/aom/av1/encoder/ethread.h @@ -27,6 +27,9 @@ typedef struct EncWorkerData { void av1_encode_tiles_mt(struct AV1_COMP *cpi); +void av1_accumulate_frame_counts(struct FRAME_COUNTS *acc_counts, + const struct FRAME_COUNTS *counts); + #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/encoder/extend.c b/third_party/aom/av1/encoder/extend.c index 007694a38..e9621a574 100644 --- a/third_party/aom/av1/encoder/extend.c +++ b/third_party/aom/av1/encoder/extend.c @@ -57,7 +57,6 @@ static void copy_and_extend_plane(const uint8_t *src, int src_pitch, } } -#if CONFIG_HIGHBITDEPTH static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch, uint8_t *dst8, int dst_pitch, int w, int h, int extend_top, int extend_left, @@ -100,7 +99,6 @@ static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch, dst_ptr2 += dst_pitch; } } -#endif // CONFIG_HIGHBITDEPTH void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst) { @@ -124,7 +122,6 @@ void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, const int eb_uv = eb_y >> uv_height_subsampling; const int er_uv = er_y >> uv_width_subsampling; -#if CONFIG_HIGHBITDEPTH if (src->flags & YV12_FLAG_HIGHBITDEPTH) { highbd_copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src->y_crop_width, @@ -139,7 +136,6 @@ void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv); return; } -#endif // CONFIG_HIGHBITDEPTH copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src->y_crop_width, src->y_crop_height, diff --git a/third_party/aom/av1/encoder/firstpass.c b/third_party/aom/av1/encoder/firstpass.c index 2a4200887..113c068c1 100644 --- a/third_party/aom/av1/encoder/firstpass.c +++ b/third_party/aom/av1/encoder/firstpass.c @@ -13,8 +13,8 @@ #include <math.h> #include <stdio.h> -#include "./aom_dsp_rtcd.h" -#include "./aom_scale_rtcd.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" @@ -27,9 +27,7 @@ #include "av1/common/entropymv.h" #include "av1/common/quant_common.h" #include "av1/common/reconinter.h" // av1_setup_dst_planes() -#if CONFIG_LV_MAP #include "av1/common/txb_common.h" -#endif #include "av1/encoder/aq_variance.h" #include "av1/encoder/av1_quantize.h" #include "av1/encoder/block.h" @@ -41,6 +39,7 @@ #include "av1/encoder/firstpass.h" #include "av1/encoder/mcomp.h" #include "av1/encoder/rd.h" +#include "av1/encoder/dwt.h" #define OUTPUT_FPF 0 #define ARF_STATS_OUTPUT 0 @@ -143,6 +142,7 @@ static void zero_stats(FIRSTPASS_STATS *section) { section->frame = 0.0; section->weight = 0.0; section->intra_error = 0.0; + section->frame_avg_wavelet_energy = 0.0; section->coded_error = 0.0; section->sr_coded_error = 0.0; section->pcnt_inter = 0.0; @@ -169,6 +169,7 @@ static void accumulate_stats(FIRSTPASS_STATS *section, section->frame += 
frame->frame; section->weight += frame->weight; section->intra_error += frame->intra_error; + section->frame_avg_wavelet_energy += frame->frame_avg_wavelet_energy; section->coded_error += frame->coded_error; section->sr_coded_error += frame->sr_coded_error; section->pcnt_inter += frame->pcnt_inter; @@ -195,6 +196,7 @@ static void subtract_stats(FIRSTPASS_STATS *section, section->frame -= frame->frame; section->weight -= frame->weight; section->intra_error -= frame->intra_error; + section->frame_avg_wavelet_energy -= frame->frame_avg_wavelet_energy; section->coded_error -= frame->coded_error; section->sr_coded_error -= frame->sr_coded_error; section->pcnt_inter -= frame->pcnt_inter; @@ -305,7 +307,6 @@ static unsigned int get_prediction_error(BLOCK_SIZE bsize, return sse; } -#if CONFIG_HIGHBITDEPTH static aom_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, int bd) { switch (bd) { @@ -345,7 +346,6 @@ static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize, fn(src->buf, src->stride, ref->buf, ref->stride, &sse); return sse; } -#endif // CONFIG_HIGHBITDEPTH // Refine the motion search range according to the frame dimension // for first pass test. @@ -361,10 +361,10 @@ static void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x, const MV *ref_mv, MV *best_mv, int *best_motion_err) { MACROBLOCKD *const xd = &x->e_mbd; - MV tmp_mv = { 0, 0 }; + MV tmp_mv = kZeroMv; MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 }; int num00, tmp_err, n; - const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; aom_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize]; const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY; @@ -376,11 +376,9 @@ static void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x, // Override the default variance function to use MSE. v_fn_ptr.vf = get_block_variance_fn(bsize); -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd); } -#endif // CONFIG_HIGHBITDEPTH // Center the initial step/diamond search on best mv. 
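 // With v_fn_ptr.vf overridden to plain MSE above, the diamond search below
 // ranks candidate mvs purely by prediction error; new_mv_mode_penalty is in
 // effect the cost a non-zero mv must overcome before it can displace the
 // caller's 0,0 baseline in *best_motion_err.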
tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv, @@ -459,7 +457,6 @@ static void set_first_pass_params(AV1_COMP *cpi) { cpi->rc.frames_to_key = INT_MAX; } -#if CONFIG_EXT_REFS static double raw_motion_error_stdev(int *raw_motion_err_list, int raw_motion_err_counts) { int64_t sum_raw_err = 0; @@ -482,7 +479,6 @@ static double raw_motion_error_stdev(int *raw_motion_err_list, raw_err_stdev = sqrt(raw_err_stdev / raw_motion_err_counts); return raw_err_stdev; } -#endif // CONFIG_EXT_REFS #define UL_INTRA_THRESH 50 #define INVALID_ROW -1 @@ -490,6 +486,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { int mb_row, mb_col; MACROBLOCK *const x = &cpi->td.mb; AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; TileInfo tile; struct macroblock_plane *const p = x->plane; @@ -500,6 +497,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { int recon_yoffset, recon_uvoffset; int64_t intra_error = 0; + int64_t frame_avg_wavelet_energy = 0; int64_t coded_error = 0; int64_t sr_coded_error = 0; @@ -515,9 +513,8 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { int image_data_start_row = INVALID_ROW; int new_mv_count = 0; int sum_in_vectors = 0; - MV lastmv = { 0, 0 }; + MV lastmv = kZeroMv; TWO_PASS *twopass = &cpi->twopass; - const MV zero_mv = { 0, 0 }; int recon_y_stride, recon_uv_stride, uv_mb_height; YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); @@ -529,18 +526,12 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { BufferPool *const pool = cm->buffer_pool; const int qindex = find_fp_qindex(cm->bit_depth); const int mb_scale = mi_size_wide[BLOCK_16X16]; -#if CONFIG_PVQ - PVQ_QUEUE pvq_q; - od_adapt_ctx pvq_context; -#endif -#if CONFIG_EXT_REFS int *raw_motion_err_list; int raw_motion_err_counts = 0; CHECK_MEM_ERROR( cm, raw_motion_err_list, aom_calloc(cm->mb_rows * cm->mb_cols, sizeof(*raw_motion_err_list))); -#endif // CONFIG_EXT_REFS // First pass code requires valid last and new frame buffers. 
assert(new_yv12 != NULL);
 assert(frame_is_intra_only(cm) || (lst_yv12 != NULL));
@@ -555,7 +546,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
 xd->mi = cm->mi_grid_visible;
 xd->mi[0] = cm->mi;
- x->e_mbd.mi[0]->mbmi.sb_type = BLOCK_16X16;
+ x->e_mbd.mi[0]->sb_type = BLOCK_16X16;
 intra_factor = 0.0;
 brightness_factor = 0.0;
@@ -564,80 +555,34 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
 set_first_pass_params(cpi);
 av1_set_quantizer(cm, qindex);
- av1_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
+ av1_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y,
+ num_planes);
- av1_setup_src_planes(x, cpi->source, 0, 0);
- av1_setup_dst_planes(xd->plane, cm->sb_size, new_yv12, 0, 0);
+ av1_setup_src_planes(x, cpi->source, 0, 0, num_planes);
+ av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, new_yv12, 0, 0, 0,
+ num_planes);
 if (!frame_is_intra_only(cm)) {
- av1_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
+ av1_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL, num_planes);
 }
 xd->mi = cm->mi_grid_visible;
 xd->mi[0] = cm->mi;
-#if CONFIG_CFL
 // Don't store luma on the first pass since chroma is not computed
- xd->cfl->store_y = 0;
-#endif // CONFIG_CFL
+ xd->cfl.store_y = 0;
 av1_frame_init_quantizer(cpi);
-#if CONFIG_PVQ
- // For pass 1 of 2-pass encoding, init here for PVQ for now.
- {
- pvq_q.buf_len = 5000;
- CHECK_MEM_ERROR(cm, pvq_q.buf,
- aom_malloc(pvq_q.buf_len * sizeof(PVQ_INFO)));
- pvq_q.curr_pos = 0;
- x->pvq_coded = 0;
-
- x->pvq_q = &pvq_q;
-
- // TODO(yushin): Since this init step is also called in 2nd pass,
- // or 1-pass encoding, consider factoring out it as a function.
- // TODO(yushin)
- // If activity masking is enabled, change below to OD_HVS_QM
- x->daala_enc.qm = OD_FLAT_QM; // Hard coded. Enc/dec required to sync.
- x->daala_enc.pvq_norm_lambda = OD_PVQ_LAMBDA;
- x->daala_enc.pvq_norm_lambda_dc = OD_PVQ_LAMBDA;
-
- od_init_qm(x->daala_enc.state.qm, x->daala_enc.state.qm_inv,
- x->daala_enc.qm == OD_HVS_QM ? OD_QM8_Q4_HVS : OD_QM8_Q4_FLAT);
-#if !CONFIG_ANS
- od_ec_enc_init(&x->daala_enc.w.ec, 65025);
- od_ec_enc_reset(&x->daala_enc.w.ec);
-#else
-#error "CONFIG_PVQ currently requires !CONFIG_ANS."
-#endif
- }
-#endif
-
- for (i = 0; i < MAX_MB_PLANE; ++i) {
+ for (i = 0; i < num_planes; ++i) {
 p[i].coeff = ctx->coeff[i];
 p[i].qcoeff = ctx->qcoeff[i];
 pd[i].dqcoeff = ctx->dqcoeff[i];
-#if CONFIG_PVQ
- pd[i].pvq_ref_coeff = ctx->pvq_ref_coeff[i];
-#endif
 p[i].eobs = ctx->eobs[i];
-#if CONFIG_LV_MAP
 p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
-#endif
 }
 av1_init_mv_probs(cm);
-#if CONFIG_LV_MAP
 av1_init_lv_map(cm);
-#endif
-#if CONFIG_ADAPT_SCAN
- av1_init_scan_order(cm);
- av1_deliver_eob_threshold(cm, xd);
-#endif
- av1_convolve_init(cm);
-#if CONFIG_PVQ
- od_adapt_ctx_reset(&pvq_context, 0);
- x->daala_enc.state.adapt = &pvq_context;
-#endif // CONFIG_PVQ
 av1_initialize_rd_consts(cpi);
 // Tiling is ignored in the first pass.
@@ -648,7 +593,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
 uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
 for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
- MV best_ref_mv = { 0, 0 };
+ MV best_ref_mv = kZeroMv;
 // Reset above block coeffs.
xd->up_available = (mb_row != 0); @@ -674,31 +619,28 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { aom_clear_system_state(); + const int idx_str = xd->mi_stride * mb_row * mb_scale + mb_col * mb_scale; + xd->mi = cm->mi_grid_visible + idx_str; + xd->mi[0] = cm->mi + idx_str; xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset; xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset; xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset; xd->left_available = (mb_col != 0); - xd->mi[0]->mbmi.sb_type = bsize; - xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME; + xd->mi[0]->sb_type = bsize; + xd->mi[0]->ref_frame[0] = INTRA_FRAME; set_mi_row_col(xd, &tile, mb_row * mb_scale, mi_size_high[bsize], - mb_col * mb_scale, mi_size_wide[bsize], -#if CONFIG_DEPENDENT_HORZTILES - cm->dependent_horz_tiles, -#endif // CONFIG_DEPENDENT_HORZTILES - cm->mi_rows, cm->mi_cols); + mb_col * mb_scale, mi_size_wide[bsize], cm->mi_rows, + cm->mi_cols); - set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize]); + set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize], num_planes); // Do intra 16x16 prediction. - xd->mi[0]->mbmi.segment_id = 0; -#if CONFIG_SUPERTX - xd->mi[0]->mbmi.segment_id_supertx = 0; -#endif // CONFIG_SUPERTX - xd->lossless[xd->mi[0]->mbmi.segment_id] = (qindex == 0); - xd->mi[0]->mbmi.mode = DC_PRED; - xd->mi[0]->mbmi.tx_size = + xd->mi[0]->segment_id = 0; + xd->lossless[xd->mi[0]->segment_id] = (qindex == 0); + xd->mi[0]->mode = DC_PRED; + xd->mi[0]->tx_size = use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4; - av1_encode_intra_block_plane(cm, x, bsize, 0, 0, mb_row * 2, mb_col * 2); + av1_encode_intra_block_plane(cpi, x, bsize, 0, 0, mb_row * 2, mb_col * 2); this_error = aom_get_mb_ss(x->plane[0].src_diff); // Keep a record of blocks that have almost no intra error residual @@ -712,7 +654,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { image_data_start_row = mb_row; } -#if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) { switch (cm->bit_depth) { case AOM_BITS_8: break; @@ -725,7 +666,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { return; } } -#endif // CONFIG_HIGHBITDEPTH aom_clear_system_state(); log_intra = log(this_error + 1.0); @@ -734,14 +674,10 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { else intra_factor += 1.0; -#if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0]; else level_sample = x->plane[0].src.buf[0]; -#else - level_sample = x->plane[0].src.buf[0]; -#endif if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample)); else @@ -759,6 +695,15 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { // Accumulate the intra error. intra_error += (int64_t)this_error; + int stride = x->plane[0].src.stride; + uint8_t *buf = x->plane[0].src.buf; + for (int r8 = 0; r8 < 2; ++r8) + for (int c8 = 0; c8 < 2; ++c8) { + int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; + frame_avg_wavelet_energy += av1_haar_ac_sad_8x8_uint8_input( + buf + c8 * 8 + r8 * 8 * stride, stride, hbd); + } + #if CONFIG_FP_MB_STATS if (cpi->use_fp_mb_stats) { // initialization @@ -775,11 +720,10 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { if (!frame_is_intra_only(cm)) { // Do a motion search int tmp_err, motion_error, raw_motion_error; // Assume 0,0 motion with no mv overhead. 
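      // Two 0,0 baselines are measured here: motion_error against the
      // reconstructed reference (the score later searches must beat) and
      // raw_motion_error against the unscaled last source, which also feeds
      // raw_motion_err_list for the GF-group stdev statistics.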
- MV mv = { 0, 0 }, tmp_mv = { 0, 0 }; + MV mv = kZeroMv, tmp_mv = kZeroMv; struct buf_2d unscaled_last_source_buf_2d; xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { motion_error = highbd_get_prediction_error( bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); @@ -787,10 +731,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { motion_error = get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); } -#else - motion_error = - get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); -#endif // CONFIG_HIGHBITDEPTH // Compute the motion error of the 0,0 motion using the last source // frame as the reference. Skip the further motion search on @@ -799,7 +739,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { cpi->unscaled_last_source->y_buffer + recon_yoffset; unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride; -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { raw_motion_error = highbd_get_prediction_error( bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd); @@ -807,10 +746,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, &unscaled_last_source_buf_2d); } -#else - raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, - &unscaled_last_source_buf_2d); -#endif // CONFIG_HIGHBITDEPTH // TODO(pengchong): Replace the hard-coded threshold if (raw_motion_error > 25) { @@ -822,7 +757,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { // 0,0 based search as well. if (!is_zero_mv(&best_ref_mv)) { tmp_err = INT_MAX; - first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err); + first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &tmp_err); if (tmp_err < motion_error) { motion_error = tmp_err; @@ -836,7 +771,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { int gf_motion_error; xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset; -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { gf_motion_error = highbd_get_prediction_error( bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); @@ -844,12 +778,8 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); } -#else - gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, - &xd->plane[0].pre[0]); -#endif // CONFIG_HIGHBITDEPTH - first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, + first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &gf_motion_error); if (gf_motion_error < motion_error && gf_motion_error < this_error) @@ -913,11 +843,11 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { mv.row *= 8; mv.col *= 8; this_error = motion_error; - xd->mi[0]->mbmi.mode = NEWMV; - xd->mi[0]->mbmi.mv[0].as_mv = mv; - xd->mi[0]->mbmi.tx_size = TX_4X4; - xd->mi[0]->mbmi.ref_frame[0] = LAST_FRAME; - xd->mi[0]->mbmi.ref_frame[1] = NONE_FRAME; + xd->mi[0]->mode = NEWMV; + xd->mi[0]->mv[0].as_mv = mv; + xd->mi[0]->tx_size = TX_4X4; + xd->mi[0]->ref_frame[0] = LAST_FRAME; + xd->mi[0]->ref_frame[1] = NONE_FRAME; av1_build_inter_predictors_sby(cm, xd, mb_row * mb_scale, mb_col * mb_scale, NULL, bsize); av1_encode_sby_pass1(cm, x, bsize); @@ -1006,9 +936,7 @@ void av1_first_pass(AV1_COMP *cpi, const 
struct lookahead_entry *source) { } } } -#if CONFIG_EXT_REFS raw_motion_err_list[raw_motion_err_counts++] = raw_motion_error; -#endif // CONFIG_EXT_REFS } else { sr_coded_error += (int64_t)this_error; } @@ -1031,25 +959,9 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { aom_clear_system_state(); } -#if CONFIG_EXT_REFS const double raw_err_stdev = raw_motion_error_stdev(raw_motion_err_list, raw_motion_err_counts); aom_free(raw_motion_err_list); -#endif // CONFIG_EXT_REFS - -#if CONFIG_PVQ -#if !CONFIG_ANS - od_ec_enc_clear(&x->daala_enc.w.ec); -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - - x->pvq_q->last_pos = x->pvq_q->curr_pos; - x->pvq_q->curr_pos = 0; - x->pvq_q = NULL; - - aom_free(pvq_q.buf); -#endif // Clamp the image start to rows/2. This number of rows is discarded top // and bottom as dead data so rows / 2 means the frame is blank. @@ -1083,6 +995,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { fps.coded_error = (double)(coded_error >> 8) + min_err; fps.sr_coded_error = (double)(sr_coded_error >> 8) + min_err; fps.intra_error = (double)(intra_error >> 8) + min_err; + fps.frame_avg_wavelet_energy = (double)frame_avg_wavelet_energy; fps.count = 1.0; fps.pcnt_inter = (double)intercount / num_mbs; fps.pcnt_second_ref = (double)second_ref_count / num_mbs; @@ -1090,9 +1003,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { fps.intra_skip_pct = (double)intra_skip_count / num_mbs; fps.inactive_zone_rows = (double)image_data_start_row; fps.inactive_zone_cols = (double)0; // TODO(paulwilkins): fix -#if CONFIG_EXT_REFS fps.raw_error_stdev = raw_err_stdev; -#endif // CONFIG_EXT_REFS if (mvcount > 0) { fps.MVr = (double)sum_mvr / mvcount; @@ -1144,41 +1055,29 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { ((twopass->this_frame_stats.intra_error / DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) { if (gld_yv12 != NULL) { -#if CONFIG_EXT_REFS - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], - cm->ref_frame_map[cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]]); -#else - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], - cm->ref_frame_map[cpi->lst_fb_idx]); -#endif // CONFIG_EXT_REFS + ref_cnt_fb(pool->frame_bufs, + &cm->ref_frame_map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]], + cm->ref_frame_map[cpi->ref_fb_idx[LAST_FRAME - 1]]); } twopass->sr_update_lag = 1; } else { ++twopass->sr_update_lag; } - aom_extend_frame_borders(new_yv12); + aom_extend_frame_borders(new_yv12, num_planes); -// The frame we just compressed now becomes the last frame. -#if CONFIG_EXT_REFS + // The frame we just compressed now becomes the last frame. ref_cnt_fb(pool->frame_bufs, - &cm->ref_frame_map[cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]], - cm->new_fb_idx); -#else - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx], + &cm->ref_frame_map[cpi->ref_fb_idx[LAST_FRAME - 1]], cm->new_fb_idx); -#endif // CONFIG_EXT_REFS // Special case for the first frame. Copy into the GF buffer as a second // reference. 
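 // ref_cnt_fb() retargets the GOLDEN map entry at the buffer LAST already
 // references, adjusting reference counts rather than copying pixels, so
 // frame 0 can serve as both references.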
- if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX) { -#if CONFIG_EXT_REFS - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], - cm->ref_frame_map[cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]]); -#else - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], - cm->ref_frame_map[cpi->lst_fb_idx]); -#endif // CONFIG_EXT_REFS + if (cm->current_video_frame == 0 && + cpi->ref_fb_idx[GOLDEN_FRAME - 1] != INVALID_IDX) { + ref_cnt_fb(pool->frame_bufs, + &cm->ref_frame_map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]], + cm->ref_frame_map[cpi->ref_fb_idx[LAST_FRAME - 1]]); } // Use this to see what the first pass reconstruction looks like. @@ -1234,7 +1133,7 @@ static int get_twopass_worst_quality(const AV1_COMP *cpi, : cpi->common.MBs; const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone)); const double av_err_per_mb = section_err / active_mbs; - const double speed_term = 1.0 + 0.04 * oxcf->speed; + const double speed_term = 1.0; double ediv_size_correction; const int target_norm_bits_per_mb = (int)((uint64_t)section_target_bandwidth << BPER_MB_NORMBITS) / @@ -1662,21 +1561,6 @@ static int calculate_boost_bits(int frame_count, int boost, 0); } -#if !CONFIG_EXT_REFS -// Current limit on maximum number of active arfs in a GF/ARF group. -#define MAX_ACTIVE_ARFS 2 -#define ARF_SLOT1 2 -#define ARF_SLOT2 3 -// This function indirects the choice of buffers for arfs. -// At the moment the values are fixed but this may change as part of -// the integration process with other codec features that swap buffers around. -static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) { - arf_buffer_indices[0] = ARF_SLOT1; - arf_buffer_indices[1] = ARF_SLOT2; -} -#endif // !CONFIG_EXT_REFS - -#if CONFIG_EXT_REFS #if USE_GF16_MULTI_LAYER // === GF Group of 16 === #define GF_INTERVAL_16 16 @@ -2146,10 +2030,8 @@ static void define_gf_group_structure_16(AV1_COMP *cpi) { gf_group->bidir_pred_enabled[frame_index] = 0; for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx) gf_group->ref_fb_idx_map[frame_index][ref_idx] = ref_idx; - gf_group->refresh_idx[frame_index] = - cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]; - gf_group->refresh_flag[frame_index] = - cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]; + gf_group->refresh_idx[frame_index] = cpi->ref_fb_idx[LAST_FRAME - 1]; + gf_group->refresh_flag[frame_index] = cpi->ref_fb_idx[LAST_FRAME - 1]; continue; } @@ -2247,19 +2129,16 @@ static void define_gf_group_structure_16(AV1_COMP *cpi) { } } #endif // USE_GF16_MULTI_LAYER -#endif // CONFIG_EXT_REFS static void define_gf_group_structure(AV1_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; -#if CONFIG_EXT_REFS #if USE_GF16_MULTI_LAYER if (rc->baseline_gf_interval == 16) { define_gf_group_structure_16(cpi); return; } #endif // USE_GF16_MULTI_LAYER -#endif // CONFIG_EXT_REFS TWO_PASS *const twopass = &cpi->twopass; GF_GROUP *const gf_group = &twopass->gf_group; @@ -2267,7 +2146,6 @@ static void define_gf_group_structure(AV1_COMP *cpi) { int frame_index = 0; const int key_frame = cpi->common.frame_type == KEY_FRAME; -#if CONFIG_EXT_REFS // The use of bi-predictive frames are only enabled when following 3 // conditions are met: // (1) ALTREF is enabled; @@ -2275,7 +2153,7 @@ static void define_gf_group_structure(AV1_COMP *cpi) { // (3) The bi-predictive group interval is strictly smaller than the // golden group interval. 
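 // In the predicate below, (1) maps to cpi->extra_arf_allowed &&
 // rc->source_alt_ref_pending, (2) to a non-zero rc->bipred_group_interval
 // (it is zeroed elsewhere when below 2), and (3) to the trailing comparison
 // against the GF interval net of the pending ARF overlay.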
const int is_bipred_enabled = - cpi->bwd_ref_allowed && rc->source_alt_ref_pending && + cpi->extra_arf_allowed && rc->source_alt_ref_pending && rc->bipred_group_interval && rc->bipred_group_interval <= (rc->baseline_gf_interval - rc->source_alt_ref_pending); @@ -2288,14 +2166,6 @@ static void define_gf_group_structure(AV1_COMP *cpi) { int subgroup_interval[MAX_EXT_ARFS + 1]; int is_sg_bipred_enabled = is_bipred_enabled; int accumulative_subgroup_interval = 0; -#else - int mid_frame_idx; - unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS]; -#endif // CONFIG_EXT_REFS - -#if !CONFIG_EXT_REFS - get_arf_buffer_indices(arf_buffer_indices); -#endif // !CONFIG_EXT_REFS // For key frames the frame target rate is already set and it // is also the golden frame. @@ -2308,25 +2178,16 @@ static void define_gf_group_structure(AV1_COMP *cpi) { gf_group->update_type[frame_index] = GF_UPDATE; gf_group->rf_level[frame_index] = GF_ARF_STD; } -#if CONFIG_EXT_REFS gf_group->arf_update_idx[frame_index] = 0; gf_group->arf_ref_idx[frame_index] = 0; -#else - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0]; - gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0]; -#endif // CONFIG_EXT_REFS } -#if CONFIG_EXT_REFS gf_group->bidir_pred_enabled[frame_index] = 0; gf_group->brf_src_offset[frame_index] = 0; -#endif // CONFIG_EXT_REFS frame_index++; -#if CONFIG_EXT_REFS bipred_frame_index++; -#endif // CONFIG_EXT_REFS // === [frame_index == 1] === if (rc->source_alt_ref_pending) { @@ -2335,21 +2196,13 @@ static void define_gf_group_structure(AV1_COMP *cpi) { gf_group->arf_src_offset[frame_index] = (unsigned char)(rc->baseline_gf_interval - 1); -#if CONFIG_EXT_REFS gf_group->arf_update_idx[frame_index] = 0; gf_group->arf_ref_idx[frame_index] = 0; gf_group->bidir_pred_enabled[frame_index] = 0; gf_group->brf_src_offset[frame_index] = 0; -// NOTE: "bidir_pred_frame_index" stays unchanged for ARF_UPDATE frames. -#else - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0]; - gf_group->arf_ref_idx[frame_index] = - arf_buffer_indices[cpi->multi_arf_last_grp_enabled && - rc->source_alt_ref_active]; -#endif // CONFIG_EXT_REFS - -#if CONFIG_EXT_REFS + // NOTE: "bidir_pred_frame_index" stays unchanged for ARF_UPDATE frames. + // Work out the ARFs' positions in this gf group // NOTE(weitinglin): ALT_REFs' are indexed inversely, but coded in display // order (except for the original ARF). In the example of three ALT_REF's, @@ -2370,11 +2223,9 @@ static void define_gf_group_structure(AV1_COMP *cpi) { subgroup_interval[cpi->num_extra_arfs] = cpi->arf_pos_for_ovrly[cpi->num_extra_arfs] - frame_index - (cpi->num_extra_arfs == 0 ? 1 : 2); -#endif // CONFIG_EXT_REFS ++frame_index; -#if CONFIG_EXT_REFS // Insert an extra ARF // === [frame_index == 2] === if (cpi->num_extra_arfs) { @@ -2387,43 +2238,12 @@ static void define_gf_group_structure(AV1_COMP *cpi) { ++frame_index; } accumulative_subgroup_interval += subgroup_interval[cpi->num_extra_arfs]; -#else // !CONFIG_EXT_ARFS - if (cpi->multi_arf_enabled) { - // Set aside a slot for a level 1 arf. 
- gf_group->update_type[frame_index] = ARF_UPDATE; - gf_group->rf_level[frame_index] = GF_ARF_LOW; - gf_group->arf_src_offset[frame_index] = - (unsigned char)((rc->baseline_gf_interval >> 1) - 1); - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[1]; - gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0]; - ++frame_index; - } -#endif // CONFIG_EXT_ARFS } -#if !CONFIG_EXT_REFS - // Define middle frame - mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1; -#endif // !CONFIG_EXT_REFS - for (i = 0; i < rc->baseline_gf_interval - rc->source_alt_ref_pending; ++i) { -#if !CONFIG_EXT_REFS - int arf_idx = 0; - - if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) { - if (frame_index <= mid_frame_idx) arf_idx = 1; - } -#endif // !CONFIG_EXT_REFS - -#if CONFIG_EXT_REFS gf_group->arf_update_idx[frame_index] = which_arf; gf_group->arf_ref_idx[frame_index] = which_arf; -#else - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[arf_idx]; - gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx]; -#endif // CONFIG_EXT_REFS -#if CONFIG_EXT_REFS // If we are going to have ARFs, check whether we can have BWDREF in this // subgroup, and further, whether we can have ARF subgroup which contains // the BWDREF subgroup but contained within the GF group: @@ -2472,18 +2292,14 @@ static void define_gf_group_structure(AV1_COMP *cpi) { bipred_group_end = 1; } } else { -#endif // CONFIG_EXT_REFS gf_group->update_type[frame_index] = LF_UPDATE; gf_group->rf_level[frame_index] = INTER_NORMAL; -#if CONFIG_EXT_REFS gf_group->bidir_pred_enabled[frame_index] = 0; gf_group->brf_src_offset[frame_index] = 0; } -#endif // CONFIG_EXT_REFS ++frame_index; -#if CONFIG_EXT_REFS // Check if we need to update the ARF. if (is_sg_bipred_enabled && cpi->num_extra_arfs && which_arf > 0 && frame_index > cpi->arf_pos_for_ovrly[which_arf]) { @@ -2503,25 +2319,19 @@ static void define_gf_group_structure(AV1_COMP *cpi) { ++frame_index; } } -#endif // CONFIG_EXT_REFS } -// NOTE: We need to configure the frame at the end of the sequence + 1 that will -// be the start frame for the next group. Otherwise prior to the call to -// av1_rc_get_second_pass_params() the data will be undefined. -#if CONFIG_EXT_REFS + // NOTE: We need to configure the frame at the end of the sequence + 1 that + // will + // be the start frame for the next group. Otherwise prior to the call to + // av1_rc_get_second_pass_params() the data will be undefined. gf_group->arf_update_idx[frame_index] = 0; gf_group->arf_ref_idx[frame_index] = 0; -#else - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0]; - gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0]; -#endif // CONFIG_EXT_REFS if (rc->source_alt_ref_pending) { gf_group->update_type[frame_index] = OVERLAY_UPDATE; gf_group->rf_level[frame_index] = INTER_NORMAL; -#if CONFIG_EXT_REFS cpi->arf_pos_in_gf[0] = 1; if (cpi->num_extra_arfs) { // Overwrite the update_type for extra-ARF's corresponding internal @@ -2534,21 +2344,13 @@ static void define_gf_group_structure(AV1_COMP *cpi) { gf_group->rf_level[cpi->arf_pos_for_ovrly[i]] = INTER_NORMAL; } } -#else - // Final setup for second arf and its overlay. 
- if (cpi->multi_arf_enabled) { - gf_group->update_type[mid_frame_idx] = OVERLAY_UPDATE; - } -#endif // CONFIG_EXT_REFS } else { gf_group->update_type[frame_index] = GF_UPDATE; gf_group->rf_level[frame_index] = GF_ARF_STD; } -#if CONFIG_EXT_REFS gf_group->bidir_pred_enabled[frame_index] = 0; gf_group->brf_src_offset[frame_index] = 0; -#endif // CONFIG_EXT_REFS } static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits, @@ -2566,18 +2368,11 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits, int64_t total_group_bits = gf_group_bits; double modified_err = 0.0; double err_fraction; - int mid_boost_bits = 0; -#if CONFIG_EXT_REFS int ext_arf_boost[MAX_EXT_ARFS]; -#else - int mid_frame_idx; -#endif // CONFIG_EXT_REFS define_gf_group_structure(cpi); -#if CONFIG_EXT_REFS av1_zero_array(ext_arf_boost, MAX_EXT_ARFS); -#endif // CONFIG_EXT_REFS key_frame = cpi->common.frame_type == KEY_FRAME; @@ -2607,24 +2402,14 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits, ++frame_index; -#if CONFIG_EXT_REFS // Skip all the extra-ARF's right after ARF at the starting segment of // the current GF group. if (cpi->num_extra_arfs) { while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE) ++frame_index; } -#else // !CONFIG_EXT_ARFS - // Set aside a slot for a level 1 arf. - if (cpi->multi_arf_enabled) ++frame_index; -#endif // CONFIG_EXT_ARFS } -#if !CONFIG_EXT_REFS - // Define middle frame - mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1; -#endif // !CONFIG_EXT_REFS - // Allocate bits to the other frames in the group. for (i = 0; i < rc->baseline_gf_interval - rc->source_alt_ref_pending; ++i) { if (EOF == input_stats(twopass, &frame_stats)) break; @@ -2638,15 +2423,9 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits, target_frame_size = (int)((double)total_group_bits * err_fraction); - if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) { - mid_boost_bits += (target_frame_size >> 4); - target_frame_size -= (target_frame_size >> 4); - } - target_frame_size = clamp(target_frame_size, 0, AOMMIN(max_bits, (int)total_group_bits)); -#if CONFIG_EXT_REFS if (gf_group->update_type[frame_index] == BRF_UPDATE) { // Boost up the allocated bits on BWDREF_FRAME gf_group->bit_allocation[frame_index] = @@ -2662,28 +2441,22 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits, } else { assert(gf_group->update_type[frame_index] == LF_UPDATE || gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE); -#endif // CONFIG_EXT_REFS gf_group->bit_allocation[frame_index] = target_frame_size; -#if CONFIG_EXT_REFS } -#endif // CONFIG_EXT_REFS ++frame_index; -#if CONFIG_EXT_REFS // Skip all the extra-ARF's. if (cpi->num_extra_arfs) { while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE) ++frame_index; } -#endif // CONFIG_EXT_REFS } // NOTE: We need to configure the frame at the end of the sequence + 1 that // will be the start frame for the next group. Otherwise prior to the // call to av1_rc_get_second_pass_params() the data will be undefined. if (rc->source_alt_ref_pending) { -#if CONFIG_EXT_REFS if (cpi->num_extra_arfs) { // NOTE: For bit allocation, move the allocated bits associated with // INTNL_OVERLAY_UPDATE to the corresponding INTNL_ARF_UPDATE. @@ -2702,18 +2475,7 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits, gf_group->bit_allocation[cpi->arf_pos_for_ovrly[i]] = 0; } } -#else - // Final setup for second arf and its overlay. 
- if (cpi->multi_arf_enabled) { - gf_group->bit_allocation[2] = - gf_group->bit_allocation[mid_frame_idx] + mid_boost_bits; - gf_group->bit_allocation[mid_frame_idx] = 0; - } -#endif // CONFIG_EXT_REFS } - - // Note whether multi-arf was enabled this group for next time. - cpi->multi_arf_last_grp_enabled = cpi->multi_arf_enabled; } // Analyse and define a gf/arf group. @@ -2761,10 +2523,7 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { const int is_key_frame = frame_is_intra_only(cm); const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active; -#if CONFIG_EXT_REFS cpi->extra_arf_allowed = 1; - cpi->bwd_ref_allowed = 1; -#endif // CONFIG_EXT_REFS // Reset the GF group data structures unless this is a key // frame in which case it will already have been done. @@ -2826,15 +2585,9 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { } } -#if CONFIG_EXT_REFS || CONFIG_BGSPRITE double avg_sr_coded_error = 0; double avg_raw_err_stdev = 0; int non_zero_stdev_count = 0; -#endif // CONFIG_EXT_REFS || CONFIG_BGSPRITE -#if CONFIG_BGSPRITE - double avg_pcnt_second_ref = 0; - int non_zero_pcnt_second_ref_count = 0; -#endif i = 0; while (i < rc->static_scene_max_gf_interval && i < rc->frames_to_key) { @@ -2859,20 +2612,12 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { accumulate_frame_motion_stats( &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, &mv_ratio_accumulator); -#if CONFIG_EXT_REFS || CONFIG_BGSPRITE // sum up the metric values of current gf group avg_sr_coded_error += next_frame.sr_coded_error; if (fabs(next_frame.raw_error_stdev) > 0.000001) { non_zero_stdev_count++; avg_raw_err_stdev += next_frame.raw_error_stdev; } -#endif // CONFIG_EXT_REFS || CONFIG_BGSPRITE -#if CONFIG_BGSPRITE - if (this_frame->pcnt_second_ref) { - avg_pcnt_second_ref += this_frame->pcnt_second_ref; - } - non_zero_pcnt_second_ref_count++; -#endif // CONFIG_BGSPRITE // Accumulate the effect of prediction quality decay. if (!flash_detected) { @@ -2912,18 +2657,14 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { (abs_mv_in_out_accumulator > 3.0) || (mv_in_out_accumulator < -2.0) || ((boost_score - old_boost_score) < BOOST_BREAKOUT)))) { -#if CONFIG_EXT_REFS // If GF group interval is < 12, we force it to be 8. Otherwise, // if it is >= 12, we keep it as is. // NOTE: 'i' is 1 more than the GF group interval candidate that is being // checked. if (i == (8 + 1) || i >= (12 + 1)) { -#endif // CONFIG_EXT_REFS boost_score = old_boost_score; break; -#if CONFIG_EXT_REFS } -#endif // CONFIG_EXT_REFS } *this_frame = next_frame; @@ -2934,12 +2675,10 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Was the group length constrained by the requirement for a new KF? rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; -#if CONFIG_EXT_REFS || CONFIG_BGSPRITE const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs : cpi->common.MBs; assert(num_mbs > 0); if (i) avg_sr_coded_error /= i; -#endif // CONFIG_EXT_REFS || CONFIG_BGSPRITE // Should we use the alternate reference frame. if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && @@ -2948,24 +2687,6 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost); rc->source_alt_ref_pending = 1; - - // Test to see if multi arf is appropriate. 
- cpi->multi_arf_enabled = - (cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) && - (zero_motion_accumulator < 0.995)) - ? 1 - : 0; -#if CONFIG_BGSPRITE - if (non_zero_pcnt_second_ref_count) { - avg_pcnt_second_ref /= non_zero_pcnt_second_ref_count; - } - - cpi->bgsprite_allowed = 1; - if (abs_mv_in_out_accumulator > 0.30 || decay_accumulator < 0.90 || - avg_sr_coded_error / num_mbs < 20 || avg_pcnt_second_ref < 0.30) { - cpi->bgsprite_allowed = 0; - } -#endif // CONFIG_BGSPRITE } else { rc->gfu_boost = AOMMAX((int)boost_score, MIN_ARF_GF_BOOST); rc->source_alt_ref_pending = 0; @@ -2973,7 +2694,6 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Set the interval until the next gf. rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending); -#if CONFIG_EXT_REFS if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count; // Disable extra altrefs and backward refs for "still" gf group: @@ -2981,13 +2701,12 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { // avg_sr_coded_error: average of the SSE per pixel of each frame; // avg_raw_err_stdev: average of the standard deviation of (0,0) // motion error per block of each frame. - assert(num_mbs > 0); const int disable_bwd_extarf = (zero_motion_accumulator > MIN_ZERO_MOTION && avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR && avg_raw_err_stdev < MAX_RAW_ERR_VAR); - if (disable_bwd_extarf) cpi->extra_arf_allowed = cpi->bwd_ref_allowed = 0; + if (disable_bwd_extarf) cpi->extra_arf_allowed = 0; if (!cpi->extra_arf_allowed) { cpi->num_extra_arfs = 0; @@ -2998,15 +2717,12 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { } // Currently at maximum two extra ARFs' are allowed assert(cpi->num_extra_arfs <= MAX_EXT_ARFS); -#endif // CONFIG_EXT_REFS rc->frames_till_gf_update_due = rc->baseline_gf_interval; -#if CONFIG_EXT_REFS rc->bipred_group_interval = BFG_INTERVAL; // The minimum bi-predictive frame group interval is 2. if (rc->bipred_group_interval < 2) rc->bipred_group_interval = 0; -#endif // CONFIG_EXT_REFS // Reset the file position. reset_fpf_position(twopass, start_pos); @@ -3226,7 +2942,6 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Clear the alt ref active flag and last group multi arf flags as they // can never be set for a key frame. rc->source_alt_ref_active = 0; - cpi->multi_arf_last_grp_enabled = 0; // KF is always a GF so clear frames till next gf counter. rc->frames_till_gf_update_due = 0; @@ -3397,6 +3112,8 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Work out how many bits to allocate for the key frame itself. kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost, twopass->kf_group_bits); + // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n", rc->kf_boost, + // kf_bits, twopass->kf_zeromotion_pct); // Work out the fraction of the kf group bits reserved for the inter frames // within the group after discounting the bits for the kf itself. 
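The key-frame bit share computed just above follows the boost-weighted split used throughout the two-pass code: the boosted frame competes against the remaining frames in the group, each weighted at a nominal 100 "chunks". A minimal sketch of that split, assuming the classic libvpx-style formulation (boost_bits_sketch is an illustrative name, not the libaom function; the real calculate_boost_bits additionally clamps the result against a maximum per-frame budget):

#include <stdint.h>

/* Illustrative sketch of a boost-weighted bit split. With kf_boost = 400
 * and 29 other frames in the group, the key frame gets
 * 400 / (29*100 + 400), roughly 12% of the group budget. */
static int boost_bits_sketch(int frame_count, int boost,
                             int64_t total_group_bits) {
  if (boost <= 0 || total_group_bits <= 0 || frame_count <= 0) return 0;
  /* Each ordinary frame counts 100 chunks; the boosted frame counts
   * `boost` chunks. */
  const int64_t allocation_chunks = (int64_t)frame_count * 100 + boost;
  return (int)(((int64_t)boost * total_group_bits) / allocation_chunks);
}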
@@ -3433,17 +3150,9 @@ void av1_ref_frame_map_idx_updates(AV1_COMP *cpi, int gf_frame_index) { int ref_fb_idx_prev[REF_FRAMES]; int ref_fb_idx_curr[REF_FRAMES]; - ref_fb_idx_prev[LAST_FRAME - LAST_FRAME] = - cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]; - ref_fb_idx_prev[LAST2_FRAME - LAST_FRAME] = - cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]; - ref_fb_idx_prev[LAST3_FRAME - LAST_FRAME] = - cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]; - ref_fb_idx_prev[GOLDEN_FRAME - LAST_FRAME] = cpi->gld_fb_idx; - ref_fb_idx_prev[BWDREF_FRAME - LAST_FRAME] = cpi->bwd_fb_idx; - ref_fb_idx_prev[ALTREF2_FRAME - LAST_FRAME] = cpi->alt2_fb_idx; - ref_fb_idx_prev[ALTREF_FRAME - LAST_FRAME] = cpi->alt_fb_idx; - ref_fb_idx_prev[REF_FRAMES - LAST_FRAME] = cpi->ext_fb_idx; + for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { + ref_fb_idx_prev[ref_frame] = cpi->ref_fb_idx[ref_frame]; + } // Update map index for each reference frame for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx) { @@ -3451,17 +3160,9 @@ void av1_ref_frame_map_idx_updates(AV1_COMP *cpi, int gf_frame_index) { ref_fb_idx_curr[ref_idx] = ref_fb_idx_prev[ref_frame - LAST_FRAME]; } - cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] = - ref_fb_idx_curr[LAST_FRAME - LAST_FRAME]; - cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] = - ref_fb_idx_curr[LAST2_FRAME - LAST_FRAME]; - cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] = - ref_fb_idx_curr[LAST3_FRAME - LAST_FRAME]; - cpi->gld_fb_idx = ref_fb_idx_curr[GOLDEN_FRAME - LAST_FRAME]; - cpi->bwd_fb_idx = ref_fb_idx_curr[BWDREF_FRAME - LAST_FRAME]; - cpi->alt2_fb_idx = ref_fb_idx_curr[ALTREF2_FRAME - LAST_FRAME]; - cpi->alt_fb_idx = ref_fb_idx_curr[ALTREF_FRAME - LAST_FRAME]; - cpi->ext_fb_idx = ref_fb_idx_curr[REF_FRAMES - LAST_FRAME]; + for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { + cpi->ref_fb_idx[ref_frame] = ref_fb_idx_curr[ref_frame]; + } } // Define the reference buffers that will be updated post encode. @@ -3487,26 +3188,36 @@ static void configure_buffer_updates_16(AV1_COMP *cpi) { // Update refresh index switch (gf_group->refresh_idx[gf_group->index]) { case LAST_FRAME: - cpi->refresh_fb_idx = cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]; + cpi->refresh_fb_idx = cpi->ref_fb_idx[LAST_FRAME - LAST_FRAME]; break; case LAST2_FRAME: - cpi->refresh_fb_idx = cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]; + cpi->refresh_fb_idx = cpi->ref_fb_idx[LAST2_FRAME - LAST_FRAME]; break; case LAST3_FRAME: - cpi->refresh_fb_idx = cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]; + cpi->refresh_fb_idx = cpi->ref_fb_idx[LAST3_FRAME - LAST_FRAME]; break; - case GOLDEN_FRAME: cpi->refresh_fb_idx = cpi->gld_fb_idx; break; + case GOLDEN_FRAME: + cpi->refresh_fb_idx = cpi->ref_fb_idx[GOLDEN_FRAME - 1]; + break; - case BWDREF_FRAME: cpi->refresh_fb_idx = cpi->bwd_fb_idx; break; + case BWDREF_FRAME: + cpi->refresh_fb_idx = cpi->ref_fb_idx[BWDREF_FRAME - 1]; + break; - case ALTREF2_FRAME: cpi->refresh_fb_idx = cpi->alt2_fb_idx; break; + case ALTREF2_FRAME: + cpi->refresh_fb_idx = cpi->ref_fb_idx[ALTREF2_FRAME - 1]; + break; - case ALTREF_FRAME: cpi->refresh_fb_idx = cpi->alt_fb_idx; break; + case ALTREF_FRAME: + cpi->refresh_fb_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1]; + break; - case REF_FRAMES: cpi->refresh_fb_idx = cpi->ext_fb_idx; break; + case REF_FRAMES: + cpi->refresh_fb_idx = cpi->ref_fb_idx[REF_FRAMES - 1]; + break; default: assert(0); break; } @@ -3579,7 +3290,6 @@ static void configure_buffer_updates(AV1_COMP *cpi) { // cpi->rc.is_$Source_Type to make this function as it is in the comment? 
cpi->rc.is_src_frame_alt_ref = 0; -#if CONFIG_EXT_REFS cpi->rc.is_bwd_ref_frame = 0; cpi->rc.is_last_bipred_frame = 0; cpi->rc.is_bipred_frame = 0; @@ -3592,22 +3302,21 @@ static void configure_buffer_updates(AV1_COMP *cpi) { return; } #endif // USE_GF16_MULTI_LAYER -#endif // CONFIG_EXT_REFS switch (twopass->gf_group.update_type[twopass->gf_group.index]) { - case KF_UPDATE: cpi->refresh_last_frame = 1; cpi->refresh_golden_frame = 1; -#if CONFIG_EXT_REFS + case KF_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 1; cpi->refresh_bwd_ref_frame = 1; cpi->refresh_alt2_ref_frame = 1; -#endif // CONFIG_EXT_REFS cpi->refresh_alt_ref_frame = 1; break; - case LF_UPDATE: cpi->refresh_last_frame = 1; cpi->refresh_golden_frame = 0; -#if CONFIG_EXT_REFS + case LF_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 0; cpi->refresh_bwd_ref_frame = 0; cpi->refresh_alt2_ref_frame = 0; -#endif // CONFIG_EXT_REFS cpi->refresh_alt_ref_frame = 0; break; @@ -3616,35 +3325,30 @@ static void configure_buffer_updates(AV1_COMP *cpi) { // needed. cpi->refresh_last_frame = 1; cpi->refresh_golden_frame = 1; -#if CONFIG_EXT_REFS cpi->refresh_bwd_ref_frame = 0; cpi->refresh_alt2_ref_frame = 0; -#endif // CONFIG_EXT_REFS cpi->refresh_alt_ref_frame = 0; break; case OVERLAY_UPDATE: cpi->refresh_last_frame = 0; cpi->refresh_golden_frame = 1; -#if CONFIG_EXT_REFS cpi->refresh_bwd_ref_frame = 0; cpi->refresh_alt2_ref_frame = 0; -#endif // CONFIG_EXT_REFS cpi->refresh_alt_ref_frame = 0; cpi->rc.is_src_frame_alt_ref = 1; break; - case ARF_UPDATE: cpi->refresh_last_frame = 0; cpi->refresh_golden_frame = 0; -#if CONFIG_EXT_REFS + case ARF_UPDATE: + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 0; // NOTE: BWDREF does not get updated along with ALTREF_FRAME. cpi->refresh_bwd_ref_frame = 0; cpi->refresh_alt2_ref_frame = 0; -#endif // CONFIG_EXT_REFS cpi->refresh_alt_ref_frame = 1; break; -#if CONFIG_EXT_REFS case BRF_UPDATE: cpi->refresh_last_frame = 0; cpi->refresh_golden_frame = 0; @@ -3693,7 +3397,6 @@ static void configure_buffer_updates(AV1_COMP *cpi) { cpi->refresh_alt2_ref_frame = 1; cpi->refresh_alt_ref_frame = 0; break; -#endif // CONFIG_EXT_REFS default: assert(0); break; } @@ -3734,11 +3437,8 @@ void av1_rc_get_second_pass_params(AV1_COMP *cpi) { // If this is an arf frame then we dont want to read the stats file or // advance the input pointer as we already have what we need. - if (gf_group->update_type[gf_group->index] == ARF_UPDATE -#if CONFIG_EXT_REFS - || gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE -#endif // CONFIG_EXT_REFS - ) { + if (gf_group->update_type[gf_group->index] == ARF_UPDATE || + gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) { configure_buffer_updates(cpi); target_rate = gf_group->bit_allocation[gf_group->index]; target_rate = av1_rc_clamp_pframe_target_size(cpi, target_rate); @@ -3850,6 +3550,8 @@ void av1_rc_get_second_pass_params(AV1_COMP *cpi) { // applied when combining MB error values for the frame. twopass->mb_av_energy = log(((this_frame.intra_error * 256.0) / num_mbs) + 1.0); + twopass->frame_avg_haar_energy = + log((this_frame.frame_avg_wavelet_energy / num_mbs) + 1.0); } // Update the total stats remaining structure. 
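The hunks above replace the per-reference index fields (lst_fb_idxes[], gld_fb_idx, bwd_fb_idx, alt2_fb_idx, alt_fb_idx, ext_fb_idx) with a single cpi->ref_fb_idx[REF_FRAMES] array, turning every eight-assignment remap into a loop. A self-contained sketch of the indexing convention the refactor relies on (enum values follow AV1's LAST_FRAME == 1 through ALTREF_FRAME == 7 layout; slot_for_ref and remap_all are illustrative helpers, not libaom API):

#include <assert.h>

enum { LAST_FRAME = 1, ALTREF_FRAME = 7, REF_FRAMES = 8 };

/* Reference type k lives at ref_fb_idx[k - LAST_FRAME]; the final slot
 * (index REF_FRAMES - 1) holds what used to be the separate ext_fb_idx. */
static int slot_for_ref(int ref_frame) {
  assert(ref_frame >= LAST_FRAME && ref_frame <= REF_FRAMES);
  return ref_frame - LAST_FRAME;
}

/* Remapping the whole table, as av1_ref_frame_map_idx_updates() does above,
 * is now one loop over a permutation instead of eight named field copies. */
static void remap_all(int ref_fb_idx[REF_FRAMES],
                      const int new_map[REF_FRAMES]) {
  int curr[REF_FRAMES];
  for (int i = 0; i < REF_FRAMES; ++i)
    curr[i] = ref_fb_idx[slot_for_ref(new_map[i])];
  for (int i = 0; i < REF_FRAMES; ++i) ref_fb_idx[i] = curr[i];
}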
diff --git a/third_party/aom/av1/encoder/firstpass.h b/third_party/aom/av1/encoder/firstpass.h index 9ac542bf3..4ff0f73b0 100644 --- a/third_party/aom/av1/encoder/firstpass.h +++ b/third_party/aom/av1/encoder/firstpass.h @@ -42,7 +42,6 @@ typedef struct { } FIRSTPASS_MB_STATS; #endif -#if CONFIG_EXT_REFS // Length of the bi-predictive frame group (BFG) // NOTE: Currently each BFG contains one backward ref (BWF) frame plus a certain // number of bi-predictive frames. @@ -64,7 +63,6 @@ typedef struct { #define MAX_SR_CODED_ERROR 40 #define MAX_RAW_ERR_VAR 2000 #define MIN_MV_IN_OUT 0.4 -#endif // CONFIG_EXT_REFS #define VLOW_MOTION_THRESHOLD 950 @@ -72,6 +70,7 @@ typedef struct { double frame; double weight; double intra_error; + double frame_avg_wavelet_energy; double coded_error; double sr_coded_error; double pcnt_inter; @@ -91,10 +90,8 @@ typedef struct { double new_mv_count; double duration; double count; -#if CONFIG_EXT_REFS || CONFIG_BGSPRITE // standard deviation for (0, 0) motion prediction error double raw_error_stdev; -#endif // CONFIG_EXT_REFS } FIRSTPASS_STATS; typedef enum { @@ -103,16 +100,12 @@ typedef enum { GF_UPDATE = 2, ARF_UPDATE = 3, OVERLAY_UPDATE = 4, -#if CONFIG_EXT_REFS BRF_UPDATE = 5, // Backward Reference Frame LAST_BIPRED_UPDATE = 6, // Last Bi-predictive Frame BIPRED_UPDATE = 7, // Bi-predictive Frame, but not the last one INTNL_OVERLAY_UPDATE = 8, // Internal Overlay Frame INTNL_ARF_UPDATE = 9, // Internal Altref Frame (candidate for ALTREF2) FRAME_UPDATE_TYPES = 10 -#else // !CONFIG_EXT_REFS - FRAME_UPDATE_TYPES = 5 -#endif // CONFIG_EXT_REFS } FRAME_UPDATE_TYPE; #define FC_ANIMATION_THRESH 0.15 @@ -129,13 +122,11 @@ typedef struct { unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1]; unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1]; unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1]; -#if CONFIG_EXT_REFS unsigned char brf_src_offset[(MAX_LAG_BUFFERS * 2) + 1]; unsigned char bidir_pred_enabled[(MAX_LAG_BUFFERS * 2) + 1]; unsigned char ref_fb_idx_map[(MAX_LAG_BUFFERS * 2) + 1][REF_FRAMES]; unsigned char refresh_idx[(MAX_LAG_BUFFERS * 2) + 1]; unsigned char refresh_flag[(MAX_LAG_BUFFERS * 2) + 1]; -#endif // CONFIG_EXT_REFS int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1]; } GF_GROUP; @@ -153,6 +144,7 @@ typedef struct { double modified_error_max; double modified_error_left; double mb_av_energy; + double frame_avg_haar_energy; #if CONFIG_FP_MB_STATS uint8_t *frame_mb_stats_buf; @@ -198,7 +190,6 @@ void av1_rc_get_second_pass_params(struct AV1_COMP *cpi); // Post encode update of the rate control parameters for 2-pass void av1_twopass_postencode_update(struct AV1_COMP *cpi); -#if CONFIG_EXT_REFS #if USE_GF16_MULTI_LAYER void av1_ref_frame_map_idx_updates(struct AV1_COMP *cpi, int gf_frame_index); #endif // USE_GF16_MULTI_LAYER @@ -213,7 +204,6 @@ static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) { else return 0; } -#endif // CONFIG_EXT_REFS #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/encoder/generic_encoder.c b/third_party/aom/av1/encoder/generic_encoder.c deleted file mode 100644 index a31bb9ef6..000000000 --- a/third_party/aom/av1/encoder/generic_encoder.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -/* clang-format off */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include <stdio.h> - -#include "aom_dsp/bitwriter.h" -#include "av1/common/generic_code.h" -#include "av1/common/odintrin.h" -#include "pvq_encoder.h" - -/** Encodes a value from 0 to N-1 (with N up to 16) based on a cdf and adapts - * the cdf accordingly. - * - * @param [in,out] w multi-symbol entropy encoder - * @param [in] val variable being encoded - * @param [in,out] cdf CDF of the variable (Q15) - * @param [in] n number of values possible - * @param [in,out] count number of symbols encoded with that cdf so far - * @param [in] rate adaptation rate shift (smaller is faster) - */ -void aom_encode_cdf_adapt_q15(aom_writer *w, int val, uint16_t *cdf, int n, - int *count, int rate) { - int i; - if (*count == 0) { - /* On the first call, we normalize the cdf to (32768 - n). This should - eventually be moved to the state init, but for now it makes it much - easier to experiment and convert symbols to the Q15 adaptation.*/ - int ft; - ft = cdf[n - 1]; - for (i = 0; i < n; i++) { - cdf[i] = AOM_ICDF(cdf[i]*32768/ft); - } - } - aom_write_cdf(w, val, cdf, n); - aom_cdf_adapt_q15(val, cdf, n, count, rate); -} - -/** Encodes a random variable using a "generic" model, assuming that the - * distribution is one-sided (zero and up), has a single mode, and decays - * exponentially past the model. - * - * @param [in,out] w multi-symbol entropy encoder - * @param [in,out] model generic probability model - * @param [in] x variable being encoded - * @param [in,out] ExQ16 expectation of x (adapted) - * @param [in] integration integration period of ExQ16 (leaky average over - * 1<<integration samples) - */ -void generic_encode(aom_writer *w, generic_encoder *model, int x, - int *ex_q16, int integration) { - int lg_q1; - int shift; - int id; - uint16_t *cdf; - int xs; - lg_q1 = log_ex(*ex_q16); - OD_LOG((OD_LOG_ENTROPY_CODER, OD_LOG_DEBUG, - "%d %d", *ex_q16, lg_q1)); - /* If expectation is too large, shift x to ensure that - all we have past xs=15 is the exponentially decaying tail - of the distribution */ - shift = OD_MAXI(0, (lg_q1 - 5) >> 1); - /* Choose the cdf to use: we have two per "octave" of ExQ16 */ - id = OD_MINI(GENERIC_TABLES - 1, lg_q1); - cdf = model->cdf[id]; - xs = (x + (1 << shift >> 1)) >> shift; - aom_write_symbol_pvq(w, OD_MINI(15, xs), cdf, 16); - if (xs >= 15) { - int e; - unsigned decay; - /* Estimate decay based on the assumption that the distribution is close - to Laplacian for large values. We should probably have an adaptive - estimate instead. Note: The 2* is a kludge that's not fully understood - yet. */ - OD_ASSERT(*ex_q16 < INT_MAX >> 1); - e = ((2**ex_q16 >> 8) + (1 << shift >> 1)) >> shift; - decay = OD_MAXI(2, OD_MINI(254, 256*e/(e + 256))); - /* Encode the tail of the distribution assuming exponential decay. */ - aom_laplace_encode_special(w, xs - 15, decay); - } - if (shift != 0) { - int special; - /* Because of the rounding, there's only half the number of possibilities - for xs=0. 
*/ - special = xs == 0; - if (shift - special > 0) { - aom_write_literal(w, x - (xs << shift) + (!special << (shift - 1)), - shift - special); - } - } - generic_model_update(ex_q16, x, integration); - OD_LOG((OD_LOG_ENTROPY_CODER, OD_LOG_DEBUG, - "enc: %d %d %d %d %d %x", *ex_q16, x, shift, id, xs, enc->rng)); -} - -/** Estimates the cost of encoding a value with generic_encode(). - * - * @param [in,out] model generic probability model - * @param [in] x variable being encoded - * @param [in,out] ExQ16 expectation of x (adapted) - * @return number of bits (approximation) - */ -double generic_encode_cost(generic_encoder *model, int x, int *ex_q16) { - int lg_q1; - int shift; - int id; - uint16_t *cdf; - int xs; - int extra; - lg_q1 = log_ex(*ex_q16); - /* If expectation is too large, shift x to ensure that - all we have past xs=15 is the exponentially decaying tail - of the distribution */ - shift = OD_MAXI(0, (lg_q1 - 5) >> 1); - /* Choose the cdf to use: we have two per "octave" of ExQ16 */ - id = OD_MINI(GENERIC_TABLES - 1, lg_q1); - cdf = model->cdf[id]; - xs = (x + (1 << shift >> 1)) >> shift; - extra = 0; - if (shift) extra = shift - (xs == 0); - xs = OD_MINI(15, xs); - /* Shortcut: assume it's going to cost 2 bits for the Laplace coder. */ - if (xs == 15) extra += 2; - return - extra - OD_LOG2((double)(cdf[xs] - (xs == 0 ? 0 : cdf[xs - 1]))/cdf[15]); -} - -/*Estimates the cost of encoding a value with a given CDF.*/ -double od_encode_cdf_cost(int val, uint16_t *cdf, int n) { - int total_prob; - int prev_prob; - double val_prob; - OD_ASSERT(n > 0); - total_prob = cdf[n - 1]; - if (val == 0) { - prev_prob = 0; - } - else { - prev_prob = cdf[val - 1]; - } - val_prob = (cdf[val] - prev_prob) / (double)total_prob; - return -OD_LOG2(val_prob); -} diff --git a/third_party/aom/av1/encoder/global_motion.c b/third_party/aom/av1/encoder/global_motion.c index 4d44e9a6f..f07d1bc00 100644 --- a/third_party/aom/av1/encoder/global_motion.c +++ b/third_party/aom/av1/encoder/global_motion.c @@ -32,12 +32,14 @@ // Border over which to compute the global motion #define ERRORADV_BORDER 0 -#define ERRORADV_MAX_THRESH 0.995 -#define ERRORADV_COST_PRODUCT_THRESH 26000 +static const double erroradv_tr[] = { 0.75, 0.70, 0.65 }; +static const double erroradv_prod_tr[] = { 22000, 20000, 18000 }; -int is_enough_erroradvantage(double best_erroradvantage, int params_cost) { - return best_erroradvantage < ERRORADV_MAX_THRESH && - best_erroradvantage * params_cost < ERRORADV_COST_PRODUCT_THRESH; +int is_enough_erroradvantage(double best_erroradvantage, int params_cost, + int erroradv_type) { + assert(erroradv_type < GM_ERRORADV_TR_TYPES); + return best_erroradvantage < erroradv_tr[erroradv_type] && + best_erroradvantage * params_cost < erroradv_prod_tr[erroradv_type]; } static void convert_to_params(const double *params, int32_t *model) { @@ -76,6 +78,7 @@ static void convert_to_params(const double *params, int32_t *model) { void convert_model_to_params(const double *params, WarpedMotionParams *model) { convert_to_params(params, model->wmmat); model->wmtype = get_gmtype(model); + model->invalid = 0; } // Adds some offset to a global motion parameter and handles @@ -110,32 +113,31 @@ static int32_t add_param_offset(int param_index, int32_t param_value, static void force_wmtype(WarpedMotionParams *wm, TransformationType wmtype) { switch (wmtype) { - case IDENTITY: wm->wmmat[0] = 0; wm->wmmat[1] = 0; + case IDENTITY: + wm->wmmat[0] = 0; + wm->wmmat[1] = 0; + AOM_FALLTHROUGH_INTENDED; case TRANSLATION: wm->wmmat[2] = 1 
<< WARPEDMODEL_PREC_BITS; wm->wmmat[3] = 0; - case ROTZOOM: wm->wmmat[4] = -wm->wmmat[3]; wm->wmmat[5] = wm->wmmat[2]; + AOM_FALLTHROUGH_INTENDED; + case ROTZOOM: + wm->wmmat[4] = -wm->wmmat[3]; + wm->wmmat[5] = wm->wmmat[2]; + AOM_FALLTHROUGH_INTENDED; case AFFINE: wm->wmmat[6] = wm->wmmat[7] = 0; break; - case HORTRAPEZOID: wm->wmmat[6] = wm->wmmat[4] = 0; break; - case VERTRAPEZOID: wm->wmmat[7] = wm->wmmat[3] = 0; break; - case HOMOGRAPHY: break; default: assert(0); } wm->wmtype = wmtype; } int64_t refine_integerized_param(WarpedMotionParams *wm, - TransformationType wmtype, -#if CONFIG_HIGHBITDEPTH - int use_hbd, int bd, -#endif // CONFIG_HIGHBITDEPTH + TransformationType wmtype, int use_hbd, int bd, uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst, int d_width, int d_height, int d_stride, int n_refinements, int64_t best_frame_error) { - static const int max_trans_model_params[TRANS_TYPES] = { - 0, 2, 4, 6, 8, 8, 8 - }; + static const int max_trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 }; const int border = ERRORADV_BORDER; int i = 0, p; int n_params = max_trans_model_params[wmtype]; @@ -147,35 +149,26 @@ int64_t refine_integerized_param(WarpedMotionParams *wm, int32_t best_param; force_wmtype(wm, wmtype); - best_error = av1_warp_error( - wm, -#if CONFIG_HIGHBITDEPTH - use_hbd, bd, -#endif // CONFIG_HIGHBITDEPTH - ref, r_width, r_height, r_stride, dst + border * d_stride + border, - border, border, d_width - 2 * border, d_height - 2 * border, d_stride, 0, - 0, SCALE_SUBPEL_SHIFTS, SCALE_SUBPEL_SHIFTS, best_frame_error); + best_error = av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, + dst + border * d_stride + border, border, border, + d_width - 2 * border, d_height - 2 * border, + d_stride, 0, 0, best_frame_error); best_error = AOMMIN(best_error, best_frame_error); step = 1 << (n_refinements - 1); for (i = 0; i < n_refinements; i++, step >>= 1) { for (p = 0; p < n_params; ++p) { int step_dir = 0; // Skip searches for parameters that are forced to be 0 - if (wmtype == HORTRAPEZOID && (p == 4 || p == 6)) continue; - if (wmtype == VERTRAPEZOID && (p == 3 || p == 7)) continue; param = param_mat + p; curr_param = *param; best_param = curr_param; // look to the left *param = add_param_offset(p, curr_param, -step); - step_error = av1_warp_error( - wm, -#if CONFIG_HIGHBITDEPTH - use_hbd, bd, -#endif // CONFIG_HIGHBITDEPTH - ref, r_width, r_height, r_stride, dst + border * d_stride + border, - border, border, d_width - 2 * border, d_height - 2 * border, d_stride, - 0, 0, SCALE_SUBPEL_SHIFTS, SCALE_SUBPEL_SHIFTS, best_error); + step_error = + av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, + dst + border * d_stride + border, border, border, + d_width - 2 * border, d_height - 2 * border, d_stride, + 0, 0, best_error); if (step_error < best_error) { best_error = step_error; best_param = *param; @@ -184,14 +177,11 @@ int64_t refine_integerized_param(WarpedMotionParams *wm, // look to the right *param = add_param_offset(p, curr_param, step); - step_error = av1_warp_error( - wm, -#if CONFIG_HIGHBITDEPTH - use_hbd, bd, -#endif // CONFIG_HIGHBITDEPTH - ref, r_width, r_height, r_stride, dst + border * d_stride + border, - border, border, d_width - 2 * border, d_height - 2 * border, d_stride, - 0, 0, SCALE_SUBPEL_SHIFTS, SCALE_SUBPEL_SHIFTS, best_error); + step_error = + av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, + dst + border * d_stride + border, border, border, + d_width - 2 * border, d_height - 2 * border, d_stride, + 
0, 0, best_error); if (step_error < best_error) { best_error = step_error; best_param = *param; @@ -203,15 +193,11 @@ int64_t refine_integerized_param(WarpedMotionParams *wm, // for the biggest step size while (step_dir) { *param = add_param_offset(p, best_param, step * step_dir); - step_error = av1_warp_error( - wm, -#if CONFIG_HIGHBITDEPTH - use_hbd, bd, -#endif // CONFIG_HIGHBITDEPTH - ref, r_width, r_height, r_stride, dst + border * d_stride + border, - border, border, d_width - 2 * border, d_height - 2 * border, - d_stride, 0, 0, SCALE_SUBPEL_SHIFTS, SCALE_SUBPEL_SHIFTS, - best_error); + step_error = + av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, + dst + border * d_stride + border, border, border, + d_width - 2 * border, d_height - 2 * border, + d_stride, 0, 0, best_error); if (step_error < best_error) { best_error = step_error; best_param = *param; @@ -229,9 +215,6 @@ int64_t refine_integerized_param(WarpedMotionParams *wm, static INLINE RansacFunc get_ransac_type(TransformationType type) { switch (type) { - case HOMOGRAPHY: return ransac_homography; - case HORTRAPEZOID: return ransac_hortrapezoid; - case VERTRAPEZOID: return ransac_vertrapezoid; case AFFINE: return ransac_affine; case ROTZOOM: return ransac_rotzoom; case TRANSLATION: return ransac_translation; @@ -239,7 +222,6 @@ static INLINE RansacFunc get_ransac_type(TransformationType type) { } } -#if CONFIG_HIGHBITDEPTH static unsigned char *downconvert_frame(YV12_BUFFER_CONFIG *frm, int bit_depth) { int i, j; @@ -257,14 +239,13 @@ static unsigned char *downconvert_frame(YV12_BUFFER_CONFIG *frm, } return buf_8bit; } -#endif -int compute_global_motion_feature_based( - TransformationType type, YV12_BUFFER_CONFIG *frm, YV12_BUFFER_CONFIG *ref, -#if CONFIG_HIGHBITDEPTH - int bit_depth, -#endif - int *num_inliers_by_motion, double *params_by_motion, int num_motions) { +int compute_global_motion_feature_based(TransformationType type, + YV12_BUFFER_CONFIG *frm, + YV12_BUFFER_CONFIG *ref, int bit_depth, + int *num_inliers_by_motion, + double *params_by_motion, + int num_motions) { int i; int num_frm_corners, num_ref_corners; int num_correspondences; @@ -274,7 +255,6 @@ int compute_global_motion_feature_based( unsigned char *ref_buffer = ref->y_buffer; RansacFunc ransac = get_ransac_type(type); -#if CONFIG_HIGHBITDEPTH if (frm->flags & YV12_FLAG_HIGHBITDEPTH) { // The frame buffer is 16-bit, so we need to convert to 8 bits for the // following code. We cache the result until the frame is released. @@ -283,7 +263,6 @@ int compute_global_motion_feature_based( if (ref->flags & YV12_FLAG_HIGHBITDEPTH) { ref_buffer = downconvert_frame(ref, bit_depth); } -#endif // compute interest points in images using FAST features num_frm_corners = fast_corner_detect(frm_buffer, frm->y_width, frm->y_height, diff --git a/third_party/aom/av1/encoder/global_motion.h b/third_party/aom/av1/encoder/global_motion.h index 7fca5327f..2c15753fd 100644 --- a/third_party/aom/av1/encoder/global_motion.h +++ b/third_party/aom/av1/encoder/global_motion.h @@ -24,16 +24,14 @@ extern "C" { void convert_model_to_params(const double *params, WarpedMotionParams *model); -int is_enough_erroradvantage(double erroradv, int params_cost); +int is_enough_erroradvantage(double best_erroradvantage, int params_cost, + int erroradv_type); // Returns the av1_warp_error between "dst" and the result of applying the // motion params that result from fine-tuning "wm" to "ref". Note that "wm" is // modified in place. 
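As the comment above notes, refine_integerized_param() fine-tunes "wm" in place. The search itself is a small coordinate-descent pattern search: for each free warp parameter it probes one step down and one step up, keeps whichever direction lowered av1_warp_error, rides that direction while the error keeps dropping, then halves the step for the next refinement pass. A minimal sketch of that loop shape, assuming n_refinements >= 1 (err stands in for av1_warp_error; refine_sketch is illustrative, not the libaom signature):

#include <stdint.h>

typedef int64_t (*err_fn)(const int32_t *params);

static int64_t refine_sketch(int32_t *params, int n_params,
                             int n_refinements, err_fn err) {
  int64_t best_error = err(params);
  /* Step sizes: 1 << (n_refinements - 1), ..., 2, 1. */
  for (int step = 1 << (n_refinements - 1); step > 0; step >>= 1) {
    for (int p = 0; p < n_params; ++p) {
      const int32_t center = params[p];
      int dir = 0;
      for (int d = -1; d <= 1; d += 2) { /* probe left, then right */
        params[p] = center + d * step;
        const int64_t e = err(params);
        if (e < best_error) {
          best_error = e;
          dir = d;
        }
      }
      params[p] = center + dir * step; /* dir == 0 restores the center */
      while (dir) { /* ride the winning direction while error drops */
        params[p] += dir * step;
        const int64_t e = err(params);
        if (e < best_error) {
          best_error = e;
        } else {
          params[p] -= dir * step; /* back off the failed probe */
          break;
        }
      }
    }
  }
  return best_error;
}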
int64_t refine_integerized_param(WarpedMotionParams *wm, - TransformationType wmtype, -#if CONFIG_HIGHBITDEPTH - int use_hbd, int bd, -#endif // CONFIG_HIGHBITDEPTH + TransformationType wmtype, int use_hbd, int bd, uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst, int d_width, int d_height, int d_stride, int n_refinements, @@ -54,12 +52,12 @@ int64_t refine_integerized_param(WarpedMotionParams *wm, number of inlier feature points for each motion. Params for which the num_inliers entry is 0 should be ignored by the caller. */ -int compute_global_motion_feature_based( - TransformationType type, YV12_BUFFER_CONFIG *frm, YV12_BUFFER_CONFIG *ref, -#if CONFIG_HIGHBITDEPTH - int bit_depth, -#endif - int *num_inliers_by_motion, double *params_by_motion, int num_motions); +int compute_global_motion_feature_based(TransformationType type, + YV12_BUFFER_CONFIG *frm, + YV12_BUFFER_CONFIG *ref, int bit_depth, + int *num_inliers_by_motion, + double *params_by_motion, + int num_motions); #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/encoder/grain_test_vectors.h b/third_party/aom/av1/encoder/grain_test_vectors.h new file mode 100644 index 000000000..45632da9b --- /dev/null +++ b/third_party/aom/av1/encoder/grain_test_vectors.h @@ -0,0 +1,781 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AV1_GRAIN_TEST_VECTORS_H_ +#define AV1_GRAIN_TEST_VECTORS_H_ + +/* Test vectors for emulation of different film grain types. + * Note that bit depth would be derived from the bitstream and + * not signaled in film grain metadata. The parameters are valid + * for any bit depth. 
+ */ +static aom_film_grain_t film_grain_test_vectors[16] = { + /* Test 1 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 16, 0 }, + { 25, 136 }, + { 33, 144 }, + { 41, 160 }, + { 48, 168 }, + { 56, 136 }, + { 67, 128 }, + { 82, 144 }, + { 97, 152 }, + { 113, 144 }, + { 128, 176 }, + { 143, 168 }, + { 158, 176 }, + { 178, 184 } }, + 14 /* num_points_y */, + { { 16, 0 }, + { 20, 64 }, + { 28, 88 }, + { 60, 104 }, + { 90, 136 }, + { 105, 160 }, + { 134, 168 }, + { 168, 208 } }, + 8 /* num_cb_points */, + { { 16, 0 }, + { 28, 96 }, + { 56, 80 }, + { 66, 96 }, + { 80, 104 }, + { 108, 96 }, + { 122, 112 }, + { 137, 112 }, + { 169, 176 } }, + 9 /* num_cr_points */, + 11 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 0, 0, -58, 0, 0, 0, -76, 100, -43, 0, -51, 82 }, + { 0, 0, -49, 0, 0, 0, -36, 22, -30, 0, -38, 7, 39 }, + { 0, 0, -47, 0, 0, 0, -31, 31, -25, 0, -32, 13, -100 }, + 8 /* ar_coeff_shift */, + 247 /* cb_mult */, + 192 /* cb_luma_mult */, + 18 /* cb_offset */, + 229 /* cr_mult */, + 192 /* cr_luma_mult */, + 54 /* cr_offset */, + 0 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /* chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 2 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 96 }, { 255, 96 } }, + 2 /* num_points_y */, + { { 0, 64 }, { 255, 64 } }, + 2 /* num_cb_points */, + { { 0, 64 }, { 255, 64 } }, + 2 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 3 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 192 }, { 255, 192 } }, + 2 /* num_points_y */, + { { 0, 128 }, { 255, 128 } }, + 2 /* num_cb_points */, + { { 0, 128 }, { 255, 128 } }, + 2 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 4, -7, 2, 4, 12, -12, 5, -8, 6, 8, -19, -16, 19, + -10, -2, 17, -42, 58, -2, -13, 9, 14, -36, 67, 0, + }, + { + 4, -7, 2, 4, 12, -12, 5, -8, 6, 8, -19, -16, 19, + -10, -2, 17, -42, 58, -2, -13, 9, 14, -36, 67, 0, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 1 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 4 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 16, 0 }, + { 24, 137 }, + { 53, 146 }, + { 63, 155 }, + { 78, 155 }, + { 107, 150 }, + { 122, 147 }, + { 136, 147 }, + { 166, 153 }, + }, + 9 /* num_points_y */, + { + { 16, 0 }, + { 20, 72 }, + { 27, 82 }, + { 33, 91 }, + { 69, 121 }, + { 95, 143 }, + { 108, 154 }, + { 134, 169 }, + { 147, 177 }, + }, + 9 /* num_cb_points */, + { + { 16, 0 }, + { 24, 95 }, + { 54, 93 
}, + { 65, 94 }, + { 79, 98 }, + { 109, 107 }, + { 124, 119 }, + { 139, 136 }, + { 169, 170 }, + }, + 9 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 7, -9, 2, 4, 7, -12, 7, -18, 18, -30, -27, -42, + 13, -20, 7, -18, 6, 107, 55, -2, -4, -9, -22, 113, + }, + { + -3, -1, -4, 3, -6, -2, 3, 1, -4, -10, -10, -5, -5, + -3, -1, -13, -28, -25, -31, -6, -4, 14, -64, 66, 0, + }, + { + 0, 4, -3, 13, 0, 1, -3, 0, -3, -10, -68, -4, -2, + -5, 2, -3, -20, 62, -31, 0, -4, -1, -8, -29, 0, + }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 5 */ + { + 1 /* apply_grain */, + 0 /* update_parameters */, + { { 0, 64 }, { 255, 64 } }, + 2 /* num_points_y */, + { + { 0, 96 }, + { 32, 90 }, + { 64, 83 }, + { 96, 76 }, + { 128, 68 }, + { 159, 59 }, + { 191, 48 }, + { 223, 34 }, + { 255, 0 }, + }, + 9 /* num_cb_points */, + { + { 0, 0 }, + { 32, 34 }, + { 64, 48 }, + { 96, 59 }, + { 128, 68 }, + { 159, 76 }, + { 191, 83 }, + { 223, 90 }, + { 255, 96 }, + }, + 9 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + -2, 2, -5, 7, -6, 4, -2, -1, 1, -2, 0, -2, 2, + -3, -5, 13, -13, 6, -14, 8, -1, 18, -36, 58, 0, + }, + { + -2, -1, -3, 14, -4, -1, -3, 0, -1, 7, -31, 7, 2, + 0, 1, 0, -7, 50, -8, -2, 2, 2, 2, -4, 0, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 1063 /* random_seed */ + }, + /* Test 6 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 0, 96 }, + { 20, 92 }, + { 39, 88 }, + { 59, 84 }, + { 78, 80 }, + { 98, 75 }, + { 118, 70 }, + { 137, 65 }, + { 157, 60 }, + { 177, 53 }, + { 196, 46 }, + { 216, 38 }, + { 235, 27 }, + { 255, 0 }, + }, + 14 /* num_points_y */, + { { 0, 0 } }, + 0 /* num_cb_points */, + { { 0, 0 } }, + 0 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 2754 /* random_seed */ + }, + /* Test 7 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 0, 0 }, + { 20, 27 }, + { 39, 38 }, + { 59, 46 }, + { 78, 53 }, + { 98, 60 }, + { 118, 65 }, + { 137, 70 }, + { 157, 75 }, + { 177, 80 }, + { 196, 84 }, + { 216, 88 }, + { 235, 92 }, + { 255, 96 }, + }, + 14 /* num_points_y */, + { { 0, 0 }, { 255, 0 } }, + 2 /* num_cb_points */, + { { 0, 0 }, { 255, 0 } }, + 2 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, 
-3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 8 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 96 }, { 255, 96 } }, + 2 /* num_points_y */, + { { 0, 62 }, { 255, 62 } }, + 2 /* num_cb_points */, + { { 0, 62 }, { 255, 62 } }, + 2 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, -2, -2, 8, 5, -1, 1, -1, 5, 16, -33, -9, 6, + -1, -3, 10, -47, 63, 0, -15, 3, 11, -42, 75, -69, + }, + { + 1, -1, -1, 9, 5, 0, 1, -1, 5, 15, -32, -10, 8, + -2, -4, 11, -46, 62, 1, -16, 3, 13, -43, 75, -55, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 9 */ + { + 1 /* apply_grain */, + 0 /* update_parameters */, + { { 0, 48 }, { 255, 48 } }, + 2 /* num_points_y */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_cb_points */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 10 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 48 }, { 255, 48 } }, + 2 /* num_points_y */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_cb_points */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, + { -7, -6, -48, -22, 2, -3, -45, 73, -11, -26, -52, 76, 0 }, + { -7, -6, -48, -22, 2, -3, -45, 73, -11, -26, -52, 76, 0 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 11 */ + { + 1 /* apply_grain */, + 0 /* update_parameters */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_points_y */, + { + { 0, 48 }, + { 32, 45 }, + { 64, 42 }, + { 96, 38 }, + { 128, 34 }, + { 159, 29 }, + { 191, 24 }, + { 223, 17 }, + { 255, 0 }, + }, + 9 /* num_cb_points */, + { + { 0, 0 }, + { 32, 17 }, + { 64, 24 }, + { 96, 29 }, + { 128, 34 }, + { 159, 38 }, + { 191, 42 }, + { 223, 45 }, + { 255, 48 }, + }, + 
9 /* num_cr_points */, + 10 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 7, -9, 2, 4, 7, -12, 7, -18, 18, -30, -27, -42, + 13, -20, 7, -18, 6, 107, 55, -2, -4, -9, -22, 113, + }, + { + -3, -1, -4, 3, -6, -2, 3, 1, -4, -10, -10, -5, -5, + -3, -1, -13, -28, -25, -31, -6, -4, 14, -64, 66, 0, + }, + { + 0, 4, -3, 13, 0, 1, -3, 0, -3, -10, -68, -4, -2, + -5, 2, -3, -20, 62, -31, 0, -4, -1, -8, -29, 0, + }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 1357 /* random_seed */ + }, + /* Test 12 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 16, 0 }, + { 24, 49 }, + { 39, 69 }, + { 46, 84 }, + { 53, 91 }, + { 63, 100 }, + { 78, 114 }, + { 92, 134 }, + { 164, 139 }, + }, + 9 /* num_points_y */, + { + { 16, 0 }, + { 20, 31 }, + { 26, 42 }, + { 33, 54 }, + { 40, 65 }, + { 47, 72 }, + { 56, 85 }, + { 84, 123 }, + { 152, 157 }, + }, + 9 /* num_cb_points */, + { + { 16, 0 }, + { 25, 14 }, + { 39, 33 }, + { 47, 40 }, + { 54, 47 }, + { 64, 62 }, + { 79, 76 }, + { 94, 83 }, + { 167, 101 }, + }, + 9 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 0, 0, -58, 0, 0, 0, -76, 100, -43, 0, -51, 82 }, + { 0, 0, -49, 0, 0, 0, -36, 22, -30, 0, -38, 7, 39 }, + { 0, 0, -47, 0, 0, 0, -31, 31, -25, 0, -32, 13, -100 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 0 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 13 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 0, 48 }, + { 20, 46 }, + { 39, 44 }, + { 59, 42 }, + { 78, 40 }, + { 98, 38 }, + { 118, 35 }, + { 137, 33 }, + { 157, 30 }, + { 177, 27 }, + { 196, 23 }, + { 216, 19 }, + { 235, 13 }, + { 255, 0 }, + }, + 14 /* num_points_y */, + { { 0, 0 }, { 255, 0 } }, + 0 /* num_cb_points */, + { { 0, 0 }, { 255, 0 } }, + 0 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 14 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 0, 0 }, + { 20, 13 }, + { 39, 19 }, + { 59, 23 }, + { 78, 27 }, + { 98, 30 }, + { 118, 33 }, + { 137, 35 }, + { 157, 38 }, + { 177, 40 }, + { 196, 42 }, + { 216, 44 }, + { 235, 46 }, + { 255, 48 }, + }, + 14 /* num_points_y */, + { { 0, 0 }, { 255, 0 } }, + 0 /* num_cb_points */, + { { 0, 0 }, { 255, 0 } }, + 0 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 
/* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 15 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 96 }, { 255, 96 } }, + 1 /* num_points_y */, + { { 0, 96 }, { 255, 96 } }, + 0 /* num_cb_points */, + { { 0, 96 }, { 255, 96 } }, + 0 /* num_cr_points */, + 11 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 5, -15, -10, -19, 0, -12, 6, 51, 30, -5, -12, 56 }, + { 2, 2, -24, -5, 1, 1, -18, 37, -2, 0, -15, 39, -70 }, + { 2, 3, -24, -5, -1, 0, -18, 38, -2, 0, -15, 39, -55 }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 1 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 16 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 16, 0 }, + { 58, 126 }, + { 87, 120 }, + { 97, 122 }, + { 112, 125 }, + { 126, 131 }, + { 141, 139 }, + { 199, 153 }, + }, + 8 /* num_points_y */, + { + { 16, 0 }, + { 59, 68 }, + { 66, 76 }, + { 73, 82 }, + { 79, 85 }, + { 86, 86 }, + { 151, 95 }, + { 192, 101 }, + }, + 8 /* num_cb_points */, + { + { 16, 0 }, + { 59, 64 }, + { 89, 80 }, + { 99, 86 }, + { 114, 90 }, + { 129, 93 }, + { 144, 97 }, + { 203, 85 }, + }, + 8 /* num_cr_points */, + 10 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, -2, -2, 8, 5, -1, 1, -1, 5, 16, -33, -9, 6, + -1, -3, 10, -47, 63, 0, -15, 3, 11, -42, 75, -69, + }, + { + 1, -1, -1, 9, 5, 0, 1, -1, 5, 15, -32, -10, 8, + -2, -4, 11, -46, 62, 1, -16, 3, 13, -43, 75, -55, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 2 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, +}; +#endif // AV1_GRAIN_TEST_VECTORS_H_ diff --git a/third_party/aom/av1/encoder/hash.c b/third_party/aom/av1/encoder/hash.c index 89c5bd8a3..180115d9f 100644 --- a/third_party/aom/av1/encoder/hash.c +++ b/third_party/aom/av1/encoder/hash.c @@ -22,7 +22,7 @@ static void crc_calculator_process_data(CRC_CALCULATOR *p_crc_calculator, } } -void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) { +static void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) { p_crc_calculator->remainder = 0; } @@ -61,9 +61,65 @@ void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits, crc_calculator_init_table(p_crc_calculator); } -uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p, - int length) { +uint32_t av1_get_crc_value(void *crc_calculator, uint8_t *p, int length) { + CRC_CALCULATOR *p_crc_calculator = (CRC_CALCULATOR *)crc_calculator; crc_calculator_reset(p_crc_calculator); crc_calculator_process_data(p_crc_calculator, p, length); return crc_calculator_get_crc(p_crc_calculator); } + +/* CRC-32C (iSCSI) polynomial in reversed bit order. */ +#define POLY 0x82f63b78 + +/* Construct table for software CRC-32C calculation. 
*/ +void av1_crc32c_calculator_init(CRC32C *p_crc32c) { + uint32_t crc; + + for (int n = 0; n < 256; n++) { + crc = n; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + p_crc32c->table[0][n] = crc; + } + for (int n = 0; n < 256; n++) { + crc = p_crc32c->table[0][n]; + for (int k = 1; k < 8; k++) { + crc = p_crc32c->table[0][crc & 0xff] ^ (crc >> 8); + p_crc32c->table[k][n] = crc; + } + } +} + +/* Table-driven software version as a fall-back. This is about 15 times slower + than using the hardware instructions. This assumes little-endian integers, + as is the case on Intel processors that the assembler code here is for. */ +uint32_t av1_get_crc32c_value_c(CRC32C *p, uint8_t *buf, size_t len) { + const uint8_t *next = (const uint8_t *)(buf); + uint64_t crc; + + crc = 0 ^ 0xffffffff; + while (len && ((uintptr_t)next & 7) != 0) { + crc = p->table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); + len--; + } + while (len >= 8) { + crc ^= *(uint64_t *)next; + crc = p->table[7][crc & 0xff] ^ p->table[6][(crc >> 8) & 0xff] ^ + p->table[5][(crc >> 16) & 0xff] ^ p->table[4][(crc >> 24) & 0xff] ^ + p->table[3][(crc >> 32) & 0xff] ^ p->table[2][(crc >> 40) & 0xff] ^ + p->table[1][(crc >> 48) & 0xff] ^ p->table[0][crc >> 56]; + next += 8; + len -= 8; + } + while (len) { + crc = p->table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); + len--; + } + return (uint32_t)crc ^ 0xffffffff; +} diff --git a/third_party/aom/av1/encoder/hash.h b/third_party/aom/av1/encoder/hash.h index a0fd54fb6..8b6227540 100644 --- a/third_party/aom/av1/encoder/hash.h +++ b/third_party/aom/av1/encoder/hash.h @@ -12,7 +12,8 @@ #ifndef AV1_ENCODER_HASH_H_ #define AV1_ENCODER_HASH_H_ -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom/aom_integer.h" #ifdef __cplusplus @@ -31,9 +32,16 @@ typedef struct _crc_calculator { // calling av1_get_crc_value(). void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits, uint32_t truncPoly); +uint32_t av1_get_crc_value(void *crc_calculator, uint8_t *p, int length); + +// CRC32C: POLY = 0x82f63b78; +typedef struct _CRC32C { + /* Table for a quadword-at-a-time software crc. 
*/ + uint32_t table[8][256]; +} CRC32C; -uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p, - int length); +// init table for software version crc32c +void av1_crc32c_calculator_init(CRC32C *p_crc32c); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/encoder/hash_motion.c b/third_party/aom/av1/encoder/hash_motion.c index 2378597ad..5a8f8cbba 100644 --- a/third_party/aom/av1/encoder/hash_motion.c +++ b/third_party/aom/av1/encoder/hash_motion.c @@ -1,7 +1,9 @@ #include <assert.h> + +#include "config/av1_rtcd.h" + #include "av1/encoder/hash.h" #include "av1/encoder/hash_motion.h" -#include "./av1_rtcd.h" static const int crc_bits = 16; static const int block_size_bits = 3; @@ -16,7 +18,7 @@ static void hash_table_clear_all(hash_table *p_hash_table) { int max_addr = 1 << (crc_bits + block_size_bits); for (int i = 0; i < max_addr; i++) { if (p_hash_table->p_lookup_table[i] != NULL) { - vector_destroy(p_hash_table->p_lookup_table[i]); + aom_vector_destroy(p_hash_table->p_lookup_table[i]); aom_free(p_hash_table->p_lookup_table[i]); p_hash_table->p_lookup_table[i] = NULL; } @@ -37,11 +39,30 @@ static void get_pixels_in_1D_char_array_by_block_2x2(uint8_t *y_src, int stride, } } +static void get_pixels_in_1D_short_array_by_block_2x2(uint16_t *y_src, + int stride, + uint16_t *p_pixels_in1D) { + uint16_t *p_pel = y_src; + int index = 0; + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 2; j++) { + p_pixels_in1D[index++] = p_pel[j]; + } + p_pel += stride; + } +} + static int is_block_2x2_row_same_value(uint8_t *p) { if (p[0] != p[1] || p[2] != p[3]) { return 0; } + return 1; +} +static int is_block16_2x2_row_same_value(uint16_t *p) { + if (p[0] != p[1] || p[2] != p[3]) { + return 0; + } return 1; } @@ -49,7 +70,13 @@ static int is_block_2x2_col_same_value(uint8_t *p) { if ((p[0] != p[2]) || (p[1] != p[3])) { return 0; } + return 1; +} +static int is_block16_2x2_col_same_value(uint16_t *p) { + if ((p[0] != p[2]) || (p[1] != p[3])) { + return 0; + } return 1; } @@ -63,6 +90,7 @@ static int hash_block_size_to_index(int block_size) { case 16: return 2; case 32: return 3; case 64: return 4; + case 128: return 5; default: return -1; } } @@ -100,11 +128,13 @@ static void hash_table_add_to_table(hash_table *p_hash_table, if (p_hash_table->p_lookup_table[hash_value] == NULL) { p_hash_table->p_lookup_table[hash_value] = aom_malloc(sizeof(p_hash_table->p_lookup_table[0][0])); - vector_setup(p_hash_table->p_lookup_table[hash_value], 10, - sizeof(curr_block_hash[0])); - vector_push_back(p_hash_table->p_lookup_table[hash_value], curr_block_hash); + aom_vector_setup(p_hash_table->p_lookup_table[hash_value], 10, + sizeof(curr_block_hash[0])); + aom_vector_push_back(p_hash_table->p_lookup_table[hash_value], + curr_block_hash); } else { - vector_push_back(p_hash_table->p_lookup_table[hash_value], curr_block_hash); + aom_vector_push_back(p_hash_table->p_lookup_table[hash_value], + curr_block_hash); } } @@ -119,7 +149,7 @@ int32_t av1_hash_table_count(hash_table *p_hash_table, uint32_t hash_value) { Iterator av1_hash_get_first_iterator(hash_table *p_hash_table, uint32_t hash_value) { assert(av1_hash_table_count(p_hash_table, hash_value) > 0); - return vector_begin(p_hash_table->p_lookup_table[hash_value]); + return aom_vector_begin(p_hash_table->p_lookup_table[hash_value]); } int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1, @@ -127,8 +157,9 @@ int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1, if 
(p_hash_table->p_lookup_table[hash_value1] == NULL) { return 0; } - Iterator iterator = vector_begin(p_hash_table->p_lookup_table[hash_value1]); - Iterator last = vector_end(p_hash_table->p_lookup_table[hash_value1]); + Iterator iterator = + aom_vector_begin(p_hash_table->p_lookup_table[hash_value1]); + Iterator last = aom_vector_end(p_hash_table->p_lookup_table[hash_value1]); for (; !iterator_equals(&iterator, &last); iterator_increment(&iterator)) { if ((*(block_hash *)iterator_get(&iterator)).hash_value2 == hash_value2) { return 1; @@ -146,25 +177,45 @@ void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture, const int y_end = picture->y_crop_height - height + 1; const int length = width * 2; - uint8_t p[4]; - - int pos = 0; - for (int y_pos = 0; y_pos < y_end; y_pos++) { - for (int x_pos = 0; x_pos < x_end; x_pos++) { - get_pixels_in_1D_char_array_by_block_2x2( - picture->y_buffer + y_pos * picture->y_stride + x_pos, - picture->y_stride, p); - pic_block_same_info[0][pos] = is_block_2x2_row_same_value(p); - pic_block_same_info[1][pos] = is_block_2x2_col_same_value(p); - - pic_block_hash[0][pos] = - av1_get_crc_value(&crc_calculator1, p, length * sizeof(p[0])); - pic_block_hash[1][pos] = - av1_get_crc_value(&crc_calculator2, p, length * sizeof(p[0])); - - pos++; + if (picture->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t p[4]; + int pos = 0; + for (int y_pos = 0; y_pos < y_end; y_pos++) { + for (int x_pos = 0; x_pos < x_end; x_pos++) { + get_pixels_in_1D_short_array_by_block_2x2( + CONVERT_TO_SHORTPTR(picture->y_buffer) + y_pos * picture->y_stride + + x_pos, + picture->y_stride, p); + pic_block_same_info[0][pos] = is_block16_2x2_row_same_value(p); + pic_block_same_info[1][pos] = is_block16_2x2_col_same_value(p); + + pic_block_hash[0][pos] = av1_get_crc_value( + &crc_calculator1, (uint8_t *)p, length * sizeof(p[0])); + pic_block_hash[1][pos] = av1_get_crc_value( + &crc_calculator2, (uint8_t *)p, length * sizeof(p[0])); + pos++; + } + pos += width - 1; + } + } else { + uint8_t p[4]; + int pos = 0; + for (int y_pos = 0; y_pos < y_end; y_pos++) { + for (int x_pos = 0; x_pos < x_end; x_pos++) { + get_pixels_in_1D_char_array_by_block_2x2( + picture->y_buffer + y_pos * picture->y_stride + x_pos, + picture->y_stride, p); + pic_block_same_info[0][pos] = is_block_2x2_row_same_value(p); + pic_block_same_info[1][pos] = is_block_2x2_col_same_value(p); + + pic_block_hash[0][pos] = + av1_get_crc_value(&crc_calculator1, p, length * sizeof(p[0])); + pic_block_hash[1][pos] = + av1_get_crc_value(&crc_calculator2, p, length * sizeof(p[0])); + pos++; + } + pos += width - 1; } - pos += width - 1; } } @@ -222,14 +273,14 @@ void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture, } if (block_size >= 4) { - const int size_minus1 = block_size - 1; + const int size_minus_1 = block_size - 1; pos = 0; for (int y_pos = 0; y_pos < y_end; y_pos++) { for (int x_pos = 0; x_pos < x_end; x_pos++) { dst_pic_block_same_info[2][pos] = (!dst_pic_block_same_info[0][pos] && !dst_pic_block_same_info[1][pos]) || - (((x_pos & size_minus1) == 0) && ((y_pos & size_minus1) == 0)); + (((x_pos & size_minus_1) == 0) && ((y_pos & size_minus_1) == 0)); pos++; } pos += block_size - 1; @@ -276,13 +327,25 @@ int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture, const int stride = picture->y_stride; const uint8_t *p = picture->y_buffer + y_start * stride + x_start; - for (int i = 0; i < block_size; i++) { - for (int j = 1; j < block_size; j++) { - if (p[j] != p[0]) { - return 0; + if 
(picture->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *p16 = CONVERT_TO_SHORTPTR(p); + for (int i = 0; i < block_size; i++) { + for (int j = 1; j < block_size; j++) { + if (p16[j] != p16[0]) { + return 0; + } } + p16 += stride; + } + } else { + for (int i = 0; i < block_size; i++) { + for (int j = 1; j < block_size; j++) { + if (p[j] != p[0]) { + return 0; + } + } + p += stride; } - p += stride; } return 1; @@ -293,26 +356,38 @@ int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture, const int stride = picture->y_stride; const uint8_t *p = picture->y_buffer + y_start * stride + x_start; - for (int i = 0; i < block_size; i++) { - for (int j = 1; j < block_size; j++) { - if (p[j * stride + i] != p[i]) { - return 0; + if (picture->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *p16 = CONVERT_TO_SHORTPTR(p); + for (int i = 0; i < block_size; i++) { + for (int j = 1; j < block_size; j++) { + if (p16[j * stride + i] != p16[i]) { + return 0; + } + } + } + } else { + for (int i = 0; i < block_size; i++) { + for (int j = 1; j < block_size; j++) { + if (p[j * stride + i] != p[i]) { + return 0; + } } } } - return 1; } // global buffer for hash value calculation of a block // used only in av1_get_block_hash_value() -static uint32_t hash_value_buffer[2][2][1024]; // [first hash/second hash] - // [two buffers used ping-pong] - // [num of 2x2 blocks in 64x64] +#define AOM_BUFFER_SIZE_FOR_BLOCK_HASH (4096) +// [first hash/second hash] +// [two buffers used ping-pong] +// [num of 2x2 blocks in 128x128] +static uint32_t hash_value_buffer[2][2][AOM_BUFFER_SIZE_FOR_BLOCK_HASH]; void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size, - uint32_t *hash_value1, uint32_t *hash_value2) { - uint8_t pixel_to_hash[4]; + uint32_t *hash_value1, uint32_t *hash_value2, + int use_highbitdepth) { uint32_t to_hash[4]; const int add_value = hash_block_size_to_index(block_size) << crc_bits; assert(add_value >= 0); @@ -320,16 +395,34 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size, // 2x2 subblock hash values in current CU int sub_block_in_width = (block_size >> 1); - for (int y_pos = 0; y_pos < block_size; y_pos += 2) { - for (int x_pos = 0; x_pos < block_size; x_pos += 2) { - int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1); - get_pixels_in_1D_char_array_by_block_2x2(y_src + y_pos * stride + x_pos, - stride, pixel_to_hash); - - hash_value_buffer[0][0][pos] = av1_get_crc_value( - &crc_calculator1, pixel_to_hash, sizeof(pixel_to_hash)); - hash_value_buffer[1][0][pos] = av1_get_crc_value( - &crc_calculator2, pixel_to_hash, sizeof(pixel_to_hash)); + if (use_highbitdepth) { + uint16_t pixel_to_hash[4]; + uint16_t *y16_src = CONVERT_TO_SHORTPTR(y_src); + for (int y_pos = 0; y_pos < block_size; y_pos += 2) { + for (int x_pos = 0; x_pos < block_size; x_pos += 2) { + int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1); + get_pixels_in_1D_short_array_by_block_2x2( + y16_src + y_pos * stride + x_pos, stride, pixel_to_hash); + assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); + hash_value_buffer[0][0][pos] = av1_get_crc_value( + &crc_calculator1, (uint8_t *)pixel_to_hash, sizeof(pixel_to_hash)); + hash_value_buffer[1][0][pos] = av1_get_crc_value( + &crc_calculator2, (uint8_t *)pixel_to_hash, sizeof(pixel_to_hash)); + } + } + } else { + uint8_t pixel_to_hash[4]; + for (int y_pos = 0; y_pos < block_size; y_pos += 2) { + for (int x_pos = 0; x_pos < block_size; x_pos += 2) { + int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1); + 
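/* [Editor's sketch, not part of the patch] The per-block hashes are built
   bottom-up: an NxN hash is just the CRC of its four (N/2)x(N/2) child
   hashes, ping-ponging between the two halves of hash_value_buffer. Roughly
   (child/parent are illustrative names for the src/dst buffer slices used
   further down):

     uint32_t to_hash[4] = { child[src_pos], child[src_pos + 1],
                             child[src_pos + src_w], child[src_pos + src_w + 1] };
     parent[dst_pos] = av1_get_crc_value(&crc_calculator1, (uint8_t *)to_hash,
                                         sizeof(to_hash));
*/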
get_pixels_in_1D_char_array_by_block_2x2(y_src + y_pos * stride + x_pos, + stride, pixel_to_hash); + assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); + hash_value_buffer[0][0][pos] = av1_get_crc_value( + &crc_calculator1, pixel_to_hash, sizeof(pixel_to_hash)); + hash_value_buffer[1][0][pos] = av1_get_crc_value( + &crc_calculator2, pixel_to_hash, sizeof(pixel_to_hash)); + } } } @@ -349,6 +442,10 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size, for (int x_pos = 0; x_pos < sub_block_in_width; x_pos++) { int srcPos = (y_pos << 1) * src_sub_block_in_width + (x_pos << 1); + assert(srcPos + 1 < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); + assert(srcPos + src_sub_block_in_width + 1 < + AOM_BUFFER_SIZE_FOR_BLOCK_HASH); + assert(dst_pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); to_hash[0] = hash_value_buffer[0][src_idx][srcPos]; to_hash[1] = hash_value_buffer[0][src_idx][srcPos + 1]; to_hash[2] = @@ -378,3 +475,5 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size, *hash_value1 = (hash_value_buffer[0][dst_idx][0] & crc_mask) + add_value; *hash_value2 = hash_value_buffer[1][dst_idx][0]; } + +#undef AOM_BUFFER_SIZE_FOR_BLOCK_HASH diff --git a/third_party/aom/av1/encoder/hash_motion.h b/third_party/aom/av1/encoder/hash_motion.h index 26e1ac46e..8deb92eb6 100644 --- a/third_party/aom/av1/encoder/hash_motion.h +++ b/third_party/aom/av1/encoder/hash_motion.h @@ -12,7 +12,8 @@ #ifndef AV1_ENCODER_HASH_MOTION_H_ #define AV1_ENCODER_HASH_MOTION_H_ -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom/aom_integer.h" #include "aom_scale/yv12config.h" #include "third_party/vector/vector.h" @@ -29,7 +30,9 @@ typedef struct _block_hash { uint32_t hash_value2; } block_hash; -typedef struct _hash_table { Vector **p_lookup_table; } hash_table; +typedef struct _hash_table { + Vector **p_lookup_table; +} hash_table; void av1_hash_table_init(hash_table *p_hash_table); void av1_hash_table_destroy(hash_table *p_hash_table); @@ -63,7 +66,8 @@ int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture, int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture, int block_size, int x_start, int y_start); void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size, - uint32_t *hash_value1, uint32_t *hash_value2); + uint32_t *hash_value1, uint32_t *hash_value2, + int use_highbitdepth); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c index 6ddeb2b77..0922557d0 100644 --- a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c +++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c @@ -9,228 +9,73 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#include "./av1_rtcd.h" -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" #include "av1/common/idct.h" #include "av1/encoder/hybrid_fwd_txfm.h" -#if CONFIG_CHROMA_2X2 -static void fwd_txfm_2x2(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { - tran_high_t a1 = src_diff[0]; - tran_high_t b1 = src_diff[1]; - tran_high_t c1 = src_diff[diff_stride]; - tran_high_t d1 = src_diff[1 + diff_stride]; - - tran_high_t a2 = a1 + c1; - tran_high_t b2 = b1 + d1; - tran_high_t c2 = a1 - c1; - tran_high_t d2 = b1 - d1; - - a1 = a2 + b2; - b1 = a2 - b2; - c1 = c2 + d2; - d1 = c2 - d2; - - coeff[0] = (tran_low_t)(4 * a1); - coeff[1] = (tran_low_t)(4 * b1); - coeff[2] = (tran_low_t)(4 * c1); - coeff[3] = (tran_low_t)(4 * d1); - - (void)txfm_param; -} -#endif - -static void fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { - if (txfm_param->lossless) { - assert(txfm_param->tx_type == DCT_DCT); - av1_fwht4x4(src_diff, coeff, diff_stride); - return; +/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per + pixel. */ +void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) { + int i; + tran_high_t a1, b1, c1, d1, e1; + const int16_t *ip_pass0 = input; + const tran_low_t *ip = NULL; + tran_low_t *op = output; + + for (i = 0; i < 4; i++) { + a1 = ip_pass0[0 * stride]; + b1 = ip_pass0[1 * stride]; + c1 = ip_pass0[2 * stride]; + d1 = ip_pass0[3 * stride]; + + a1 += b1; + d1 = d1 - c1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= c1; + d1 += b1; + op[0] = (tran_low_t)a1; + op[4] = (tran_low_t)c1; + op[8] = (tran_low_t)d1; + op[12] = (tran_low_t)b1; + + ip_pass0++; + op++; } - -#if CONFIG_LGT || CONFIG_DAALA_DCT4 - // only C version has LGTs - av1_fht4x4_c(src_diff, coeff, diff_stride, txfm_param); -#else - av1_fht4x4(src_diff, coeff, diff_stride, txfm_param); -#endif -} - -static void fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_LGT - av1_fht4x8_c(src_diff, coeff, diff_stride, txfm_param); -#else - av1_fht4x8(src_diff, coeff, diff_stride, txfm_param); -#endif -} - -static void fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_LGT - av1_fht8x4_c(src_diff, coeff, diff_stride, txfm_param); -#else - av1_fht8x4(src_diff, coeff, diff_stride, txfm_param); -#endif -} - -static void fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_LGT - av1_fht8x16_c(src_diff, coeff, diff_stride, txfm_param); -#else - av1_fht8x16(src_diff, coeff, diff_stride, txfm_param); -#endif -} - -static void fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_LGT - av1_fht16x8_c(src_diff, coeff, diff_stride, txfm_param); -#else - av1_fht16x8(src_diff, coeff, diff_stride, txfm_param); -#endif -} - -static void fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { - av1_fht16x32(src_diff, coeff, diff_stride, txfm_param); -} - -static void fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { - av1_fht32x16(src_diff, coeff, diff_stride, txfm_param); -} - -static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam 
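/* [Editor's worked example, not part of the patch] One column pass of the
   4-point reversible WHT above, on input (4, 2, 6, 2):

     a1 = 4 + 2 = 6;   d1 = 2 - 6 = -4;   e1 = (6 - (-4)) >> 1 = 5;
     b1 = 5 - 2 = 3;   c1 = 5 - 6 = -1;   a1 = 6 - (-1) = 7;   d1 = -4 + 3 = -1;

   so (op[0], op[4], op[8], op[12]) = (7, -1, -1, 3). op[0] is half the input
   sum (14 / 2), and the kernel spends 7 add/subtract ops plus 1 shift per
   4-point pass; over the two passes that is the quoted 3.5 adds and 0.5
   shifts per pixel. The row pass applies the same butterfly and scales the
   result by UNIT_QUANT_FACTOR. */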
*txfm_param) { -#if CONFIG_LGT || CONFIG_DAALA_DCT8 - av1_fht8x8_c(src_diff, coeff, diff_stride, txfm_param); -#else - av1_fht8x8(src_diff, coeff, diff_stride, txfm_param); -#endif -} - -static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_DAALA_DCT16 - av1_fht16x16_c(src_diff, coeff, diff_stride, txfm_param); -#else - av1_fht16x16(src_diff, coeff, diff_stride, txfm_param); -#endif // CONFIG_DAALA_DCT16 -} - -static void fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_MRC_TX - // MRC_DCT currently only has a C implementation - if (txfm_param->tx_type == MRC_DCT) { - av1_fht32x32_c(src_diff, coeff, diff_stride, txfm_param); - return; + ip = output; + op = output; + + for (i = 0; i < 4; i++) { + a1 = ip[0]; + b1 = ip[1]; + c1 = ip[2]; + d1 = ip[3]; + + a1 += b1; + d1 -= c1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= c1; + d1 += b1; + op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR); + op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR); + op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR); + op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR); + + ip += 4; + op += 4; } -#endif // CONFIG_MRC_TX - av1_fht32x32(src_diff, coeff, diff_stride, txfm_param); -} - -#if CONFIG_TX64X64 -static void fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_EXT_TX - if (txfm_param->tx_type == IDTX) - av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, 64, txfm_param->tx_type); - else -#endif - av1_fht64x64(src_diff, coeff, diff_stride, txfm_param); -} - -static void fwd_txfm_32x64(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_EXT_TX - if (txfm_param->tx_type == IDTX) - av1_fwd_idtx_c(src_diff, coeff, diff_stride, 32, 64, txfm_param->tx_type); - else -#endif - av1_fht32x64(src_diff, coeff, diff_stride, txfm_param); -} - -static void fwd_txfm_64x32(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_EXT_TX - if (txfm_param->tx_type == IDTX) - av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, 32, txfm_param->tx_type); - else -#endif - av1_fht64x32(src_diff, coeff, diff_stride, txfm_param); -} -#endif // CONFIG_TX64X64 - -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) -static void fwd_txfm_16x4(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_LGT - av1_fht16x4_c(src_diff, coeff, diff_stride, txfm_param); -#else - av1_fht16x4(src_diff, coeff, diff_stride, txfm_param); -#endif -} - -static void fwd_txfm_4x16(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_LGT - av1_fht4x16_c(src_diff, coeff, diff_stride, txfm_param); -#else - av1_fht4x16(src_diff, coeff, diff_stride, txfm_param); -#endif -} - -static void fwd_txfm_32x8(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_LGT - av1_fht32x8_c(src_diff, coeff, diff_stride, txfm_param); -#else - av1_fht32x8(src_diff, coeff, diff_stride, txfm_param); -#endif } -static void fwd_txfm_8x32(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TxfmParam *txfm_param) { -#if CONFIG_LGT - av1_fht8x32_c(src_diff, coeff, diff_stride, txfm_param); -#else - av1_fht8x32(src_diff, coeff, diff_stride, txfm_param); -#endif -} -#endif - -#if CONFIG_CHROMA_2X2 -static void highbd_fwd_txfm_2x2(const int16_t *src_diff, tran_low_t 
*coeff, - int diff_stride, TxfmParam *txfm_param) { - tran_high_t a1 = src_diff[0]; - tran_high_t b1 = src_diff[1]; - tran_high_t c1 = src_diff[diff_stride]; - tran_high_t d1 = src_diff[1 + diff_stride]; - - tran_high_t a2 = a1 + c1; - tran_high_t b2 = b1 + d1; - tran_high_t c2 = a1 - c1; - tran_high_t d2 = b1 - d1; - - a1 = a2 + b2; - b1 = a2 - b2; - c1 = c2 + d2; - d1 = c2 - d2; - - coeff[0] = (tran_low_t)(4 * a1); - coeff[1] = (tran_low_t)(4 * b1); - coeff[2] = (tran_low_t)(4 * c1); - coeff[3] = (tran_low_t)(4 * d1); - - (void)txfm_param; +void av1_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, + int stride) { + av1_fwht4x4_c(input, output, stride); } -#endif static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { @@ -243,22 +88,6 @@ static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, return; } switch (tx_type) { - case DCT_DCT: - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: - // fallthrough intended - av1_fwd_txfm2d_4x4(src_diff, dst_coeff, diff_stride, tx_type, bd); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - // fallthrough intended - av1_fwd_txfm2d_4x4(src_diff, dst_coeff, diff_stride, tx_type, bd); - break; // use the c version for anything including identity for now case V_DCT: case H_DCT: @@ -267,11 +96,11 @@ static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, case V_FLIPADST: case H_FLIPADST: case IDTX: - // fallthrough intended av1_fwd_txfm2d_4x4_c(src_diff, dst_coeff, diff_stride, tx_type, bd); break; -#endif // CONFIG_EXT_TX - default: assert(0); + default: + av1_fwd_txfm2d_4x4(src_diff, dst_coeff, diff_stride, tx_type, bd); + break; } } @@ -317,28 +146,40 @@ static void highbd_fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff, txfm_param->bd); } +static void highbd_fwd_txfm_16x4(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_16x4_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_4x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_4x16_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_32x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_32x8_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_8x32(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_8x32_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { int32_t *dst_coeff = (int32_t *)coeff; const TX_TYPE tx_type = txfm_param->tx_type; const int bd = txfm_param->bd; switch (tx_type) { - case DCT_DCT: - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: - // fallthrough intended - av1_fwd_txfm2d_8x8(src_diff, dst_coeff, diff_stride, tx_type, bd); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - // 
fallthrough intended - av1_fwd_txfm2d_8x8(src_diff, dst_coeff, diff_stride, tx_type, bd); - break; // use the c version for anything including identity for now case V_DCT: case H_DCT: @@ -347,11 +188,11 @@ static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff, case V_FLIPADST: case H_FLIPADST: case IDTX: - // fallthrough intended av1_fwd_txfm2d_8x8_c(src_diff, dst_coeff, diff_stride, tx_type, bd); break; -#endif // CONFIG_EXT_TX - default: assert(0); + default: + av1_fwd_txfm2d_8x8(src_diff, dst_coeff, diff_stride, tx_type, bd); + break; } } @@ -361,22 +202,6 @@ static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff, const TX_TYPE tx_type = txfm_param->tx_type; const int bd = txfm_param->bd; switch (tx_type) { - case DCT_DCT: - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: - // fallthrough intended - av1_fwd_txfm2d_16x16(src_diff, dst_coeff, diff_stride, tx_type, bd); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - // fallthrough intended - av1_fwd_txfm2d_16x16(src_diff, dst_coeff, diff_stride, tx_type, bd); - break; // use the c version for anything including identity for now case V_DCT: case H_DCT: @@ -385,11 +210,11 @@ static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff, case V_FLIPADST: case H_FLIPADST: case IDTX: - // fallthrough intended av1_fwd_txfm2d_16x16_c(src_diff, dst_coeff, diff_stride, tx_type, bd); break; -#endif // CONFIG_EXT_TX - default: assert(0); + default: + av1_fwd_txfm2d_16x16(src_diff, dst_coeff, diff_stride, tx_type, bd); + break; } } @@ -399,22 +224,6 @@ static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff, const TX_TYPE tx_type = txfm_param->tx_type; const int bd = txfm_param->bd; switch (tx_type) { - case DCT_DCT: - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: - // fallthrough intended - av1_fwd_txfm2d_32x32(src_diff, dst_coeff, diff_stride, tx_type, bd); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - // fallthrough intended - av1_fwd_txfm2d_32x32(src_diff, dst_coeff, diff_stride, tx_type, bd); - break; // use the c version for anything including identity for now case V_DCT: case H_DCT: @@ -423,206 +232,72 @@ static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff, case V_FLIPADST: case H_FLIPADST: case IDTX: - // fallthrough intended av1_fwd_txfm2d_32x32_c(src_diff, dst_coeff, diff_stride, tx_type, bd); break; -#endif // CONFIG_EXT_TX - default: assert(0); + default: + av1_fwd_txfm2d_32x32(src_diff, dst_coeff, diff_stride, tx_type, bd); + break; } } -#if CONFIG_TX64X64 static void highbd_fwd_txfm_32x64(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); int32_t *dst_coeff = (int32_t *)coeff; - const TX_TYPE tx_type = txfm_param->tx_type; const int bd = txfm_param->bd; - switch (tx_type) { - case DCT_DCT: - av1_fwd_txfm2d_32x64_c(src_diff, dst_coeff, diff_stride, tx_type, bd); - break; -#if CONFIG_EXT_TX - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: - case FLIPADST_DCT: - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - // TODO(sarahparker) - // I've deleted the 64x64 implementations that existed in lieu - // of adst, flipadst and identity 
for simplicity but will bring back - // in a later change. This shouldn't impact performance since - // DCT_DCT is the only extended type currently allowed for 64x64, - // as dictated by get_ext_tx_set_type in blockd.h. - av1_fwd_txfm2d_32x64_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); - break; - case IDTX: - av1_fwd_idtx_c(src_diff, dst_coeff, diff_stride, 32, 64, tx_type); - break; -#endif // CONFIG_EXT_TX - default: assert(0); break; - } + av1_fwd_txfm2d_32x64_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); } static void highbd_fwd_txfm_64x32(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); int32_t *dst_coeff = (int32_t *)coeff; - const TX_TYPE tx_type = txfm_param->tx_type; const int bd = txfm_param->bd; - switch (tx_type) { - case DCT_DCT: - av1_fwd_txfm2d_64x32_c(src_diff, dst_coeff, diff_stride, tx_type, bd); - break; -#if CONFIG_EXT_TX - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: - case FLIPADST_DCT: - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - // TODO(sarahparker) - // I've deleted the 64x64 implementations that existed in lieu - // of adst, flipadst and identity for simplicity but will bring back - // in a later change. This shouldn't impact performance since - // DCT_DCT is the only extended type currently allowed for 64x64, - // as dictated by get_ext_tx_set_type in blockd.h. - av1_fwd_txfm2d_64x32_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); - break; - case IDTX: - av1_fwd_idtx_c(src_diff, dst_coeff, diff_stride, 64, 32, tx_type); - break; -#endif // CONFIG_EXT_TX - default: assert(0); break; - } + av1_fwd_txfm2d_64x32_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); +} + +static void highbd_fwd_txfm_16x64(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); + int32_t *dst_coeff = (int32_t *)coeff; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_16x64_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); +} + +static void highbd_fwd_txfm_64x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); + int32_t *dst_coeff = (int32_t *)coeff; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_64x16_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); } + static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); int32_t *dst_coeff = (int32_t *)coeff; - const TX_TYPE tx_type = txfm_param->tx_type; const int bd = txfm_param->bd; - switch (tx_type) { - case DCT_DCT: - av1_fwd_txfm2d_64x64(src_diff, dst_coeff, diff_stride, tx_type, bd); - break; -#if CONFIG_EXT_TX - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: - case FLIPADST_DCT: - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - // TODO(sarahparker) - // I've deleted the 64x64 implementations that existed in lieu - // of adst, flipadst and identity for simplicity but will bring back - // in a later change. This shouldn't impact performance since - // DCT_DCT is the only extended type currently allowed for 64x64, - // as dictated by get_ext_tx_set_type in blockd.h. 
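/* [Editor's note] The rewritten 64-point wrappers below assert DCT_DCT up
   front instead of switching over tx_type: per the deleted TODO, the
   extended-transform-set logic already restricts 64-point sizes to DCT_DCT,
   so a caller-side reduction along these lines is assumed (illustrative, not
   the patch's code):

     if (txsize_sqr_up_map[tx_size] == TX_64X64) tx_type = DCT_DCT;
*/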
- av1_fwd_txfm2d_64x64_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); - break; - case IDTX: - av1_fwd_idtx_c(src_diff, dst_coeff, diff_stride, 64, 64, tx_type); - break; -#endif // CONFIG_EXT_TX - default: assert(0); break; - } + av1_fwd_txfm2d_64x64(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); } -#endif // CONFIG_TX64X64 void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { - const TX_SIZE tx_size = txfm_param->tx_size; -#if CONFIG_LGT_FROM_PRED - if (txfm_param->use_lgt) { - // if use_lgt is 1, it will override tx_type - assert(is_lgt_allowed(txfm_param->mode, tx_size)); - flgt2d_from_pred_c(src_diff, coeff, diff_stride, txfm_param); - return; - } -#endif // CONFIG_LGT_FROM_PRED - switch (tx_size) { -#if CONFIG_TX64X64 - case TX_64X64: - fwd_txfm_64x64(src_diff, coeff, diff_stride, txfm_param); - break; - case TX_32X64: - fwd_txfm_32x64(src_diff, coeff, diff_stride, txfm_param); - break; - case TX_64X32: - fwd_txfm_64x32(src_diff, coeff, diff_stride, txfm_param); - break; -#endif // CONFIG_TX64X64 - case TX_32X32: - fwd_txfm_32x32(src_diff, coeff, diff_stride, txfm_param); - break; - case TX_16X16: - fwd_txfm_16x16(src_diff, coeff, diff_stride, txfm_param); - break; - case TX_8X8: fwd_txfm_8x8(src_diff, coeff, diff_stride, txfm_param); break; - case TX_4X8: fwd_txfm_4x8(src_diff, coeff, diff_stride, txfm_param); break; - case TX_8X4: fwd_txfm_8x4(src_diff, coeff, diff_stride, txfm_param); break; - case TX_8X16: - fwd_txfm_8x16(src_diff, coeff, diff_stride, txfm_param); - break; - case TX_16X8: - fwd_txfm_16x8(src_diff, coeff, diff_stride, txfm_param); - break; - case TX_16X32: - fwd_txfm_16x32(src_diff, coeff, diff_stride, txfm_param); - break; - case TX_32X16: - fwd_txfm_32x16(src_diff, coeff, diff_stride, txfm_param); - break; - case TX_4X4: fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param); break; -#if CONFIG_CHROMA_2X2 - case TX_2X2: fwd_txfm_2x2(src_diff, coeff, diff_stride, txfm_param); break; -#endif -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) - case TX_4X16: - fwd_txfm_4x16(src_diff, coeff, diff_stride, txfm_param); - break; - case TX_16X4: - fwd_txfm_16x4(src_diff, coeff, diff_stride, txfm_param); - break; - case TX_8X32: - fwd_txfm_8x32(src_diff, coeff, diff_stride, txfm_param); - break; - case TX_32X8: - fwd_txfm_32x8(src_diff, coeff, diff_stride, txfm_param); - break; -#endif - default: assert(0); break; - } + if (txfm_param->bd == 8) + av1_lowbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param); + else + av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param); +} + +void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param); } void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const TX_SIZE tx_size = txfm_param->tx_size; switch (tx_size) { -#if CONFIG_TX64X64 case TX_64X64: highbd_fwd_txfm_64x64(src_diff, coeff, diff_stride, txfm_param); break; @@ -632,7 +307,12 @@ void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, case TX_64X32: highbd_fwd_txfm_64x32(src_diff, coeff, diff_stride, txfm_param); break; -#endif // CONFIG_TX64X64 + case TX_16X64: + highbd_fwd_txfm_16x64(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_64X16: + highbd_fwd_txfm_64x16(src_diff, coeff, diff_stride, txfm_param); + break; case TX_32X32: 
highbd_fwd_txfm_32x32(src_diff, coeff, diff_stride, txfm_param); break; @@ -663,11 +343,18 @@ void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, case TX_4X4: highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param); break; -#if CONFIG_CHROMA_2X2 - case TX_2X2: - highbd_fwd_txfm_2x2(src_diff, coeff, diff_stride, txfm_param); + case TX_4X16: + highbd_fwd_txfm_4x16(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_16X4: + highbd_fwd_txfm_16x4(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_8X32: + highbd_fwd_txfm_8x32(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_32X8: + highbd_fwd_txfm_32x8(src_diff, coeff, diff_stride, txfm_param); break; -#endif default: assert(0); break; } } diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h index b25ffb8d8..6155b255a 100644 --- a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h +++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h @@ -12,7 +12,7 @@ #ifndef AV1_ENCODER_HYBRID_FWD_TXFM_H_ #define AV1_ENCODER_HYBRID_FWD_TXFM_H_ -#include "./aom_config.h" +#include "config/aom_config.h" #ifdef __cplusplus extern "C" { diff --git a/third_party/aom/av1/encoder/k_means_template.h b/third_party/aom/av1/encoder/k_means_template.h index 3a433d9b5..9e526b88b 100644 --- a/third_party/aom/av1/encoder/k_means_template.h +++ b/third_party/aom/av1/encoder/k_means_template.h @@ -23,25 +23,23 @@ #define RENAME_(x, y) AV1_K_MEANS_RENAME(x, y) #define RENAME(x) RENAME_(x, AV1_K_MEANS_DIM) -static float RENAME(calc_dist)(const float *p1, const float *p2) { - float dist = 0; - int i; - for (i = 0; i < AV1_K_MEANS_DIM; ++i) { - const float diff = p1[i] - p2[i]; +static int RENAME(calc_dist)(const int *p1, const int *p2) { + int dist = 0; + for (int i = 0; i < AV1_K_MEANS_DIM; ++i) { + const int diff = p1[i] - p2[i]; dist += diff * diff; } return dist; } -void RENAME(av1_calc_indices)(const float *data, const float *centroids, +void RENAME(av1_calc_indices)(const int *data, const int *centroids, uint8_t *indices, int n, int k) { - int i, j; - for (i = 0; i < n; ++i) { - float min_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM, centroids); + for (int i = 0; i < n; ++i) { + int min_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM, centroids); indices[i] = 0; - for (j = 1; j < k; ++j) { - const float this_dist = RENAME(calc_dist)( - data + i * AV1_K_MEANS_DIM, centroids + j * AV1_K_MEANS_DIM); + for (int j = 1; j < k; ++j) { + const int this_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM, + centroids + j * AV1_K_MEANS_DIM); if (this_dist < min_dist) { min_dist = this_dist; indices[i] = j; @@ -50,19 +48,16 @@ void RENAME(av1_calc_indices)(const float *data, const float *centroids, } } -static void RENAME(calc_centroids)(const float *data, float *centroids, +static void RENAME(calc_centroids)(const int *data, int *centroids, const uint8_t *indices, int n, int k) { - int i, j, index; - int count[PALETTE_MAX_SIZE]; + int i, j; + int count[PALETTE_MAX_SIZE] = { 0 }; unsigned int rand_state = (unsigned int)data[0]; - assert(n <= 32768); - - memset(count, 0, sizeof(count[0]) * k); memset(centroids, 0, sizeof(centroids[0]) * k * AV1_K_MEANS_DIM); for (i = 0; i < n; ++i) { - index = indices[i]; + const int index = indices[i]; assert(index < k); ++count[index]; for (j = 0; j < AV1_K_MEANS_DIM; ++j) { @@ -76,43 +71,35 @@ static void RENAME(calc_centroids)(const float *data, float *centroids, data + (lcg_rand16(&rand_state) % n) * AV1_K_MEANS_DIM, 
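/* [Editor's note] The k-means template below was converted from float to
   integer arithmetic; the centroid update replaces "multiply by 1.0f/count,
   then roundf" with a rounding integer division. DIVIDE_AND_ROUND is
   presumably along the lines of (illustrative):

     #define DIVIDE_AND_ROUND(x, y) (((x) + ((y) >> 1)) / (y))

   e.g. a channel sum of 7 over count 2 yields centroid (7 + 1) / 2 = 4 rather
   than the truncated 3, matching the old round-to-nearest behaviour. */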
sizeof(centroids[0]) * AV1_K_MEANS_DIM); } else { - const float norm = 1.0f / count[i]; - for (j = 0; j < AV1_K_MEANS_DIM; ++j) - centroids[i * AV1_K_MEANS_DIM + j] *= norm; + for (j = 0; j < AV1_K_MEANS_DIM; ++j) { + centroids[i * AV1_K_MEANS_DIM + j] = + DIVIDE_AND_ROUND(centroids[i * AV1_K_MEANS_DIM + j], count[i]); + } } } - - // Round to nearest integers. - for (i = 0; i < k * AV1_K_MEANS_DIM; ++i) { - centroids[i] = roundf(centroids[i]); - } } -static float RENAME(calc_total_dist)(const float *data, const float *centroids, - const uint8_t *indices, int n, int k) { - float dist = 0; - int i; +static int64_t RENAME(calc_total_dist)(const int *data, const int *centroids, + const uint8_t *indices, int n, int k) { + int64_t dist = 0; (void)k; - - for (i = 0; i < n; ++i) + for (int i = 0; i < n; ++i) { dist += RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM, centroids + indices[i] * AV1_K_MEANS_DIM); - + } return dist; } -void RENAME(av1_k_means)(const float *data, float *centroids, uint8_t *indices, +void RENAME(av1_k_means)(const int *data, int *centroids, uint8_t *indices, int n, int k, int max_itr) { - int i; - float this_dist; - float pre_centroids[2 * PALETTE_MAX_SIZE]; + int pre_centroids[2 * PALETTE_MAX_SIZE]; uint8_t pre_indices[MAX_SB_SQUARE]; RENAME(av1_calc_indices)(data, centroids, indices, n, k); - this_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k); + int64_t this_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k); - for (i = 0; i < max_itr; ++i) { - const float pre_dist = this_dist; + for (int i = 0; i < max_itr; ++i) { + const int64_t pre_dist = this_dist; memcpy(pre_centroids, centroids, sizeof(pre_centroids[0]) * k * AV1_K_MEANS_DIM); memcpy(pre_indices, indices, sizeof(pre_indices[0]) * n); @@ -132,6 +119,5 @@ void RENAME(av1_k_means)(const float *data, float *centroids, uint8_t *indices, break; } } - #undef RENAME_ #undef RENAME diff --git a/third_party/aom/av1/encoder/laplace_encoder.c b/third_party/aom/av1/encoder/laplace_encoder.c deleted file mode 100644 index 54ffc88fb..000000000 --- a/third_party/aom/av1/encoder/laplace_encoder.c +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -/* clang-format off */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include <stdio.h> - -#include "aom_dsp/bitwriter.h" -#include "av1/common/odintrin.h" -#include "av1/common/pvq.h" -#include "pvq_encoder.h" - -static void aom_encode_pvq_split(aom_writer *w, od_pvq_codeword_ctx *adapt, - int count, int sum, int ctx) { - int shift; - int rest; - int fctx; - if (sum == 0) return; - shift = OD_MAXI(0, OD_ILOG(sum) - 3); - if (shift) { - rest = count & ((1 << shift) - 1); - count >>= shift; - sum >>= shift; - } - fctx = 7*ctx + sum - 1; - aom_write_symbol_pvq(w, count, adapt->pvq_split_cdf[fctx], sum + 1); - if (shift) aom_write_literal(w, rest, shift); -} - -void aom_encode_band_pvq_splits(aom_writer *w, od_pvq_codeword_ctx *adapt, - const int *y, int n, int k, int level) { - int mid; - int i; - int count_right; - if (n <= 1 || k == 0) return; - if (k == 1 && n <= 16) { - int cdf_id; - int pos; - cdf_id = od_pvq_k1_ctx(n, level == 0); - for (pos = 0; !y[pos]; pos++); - OD_ASSERT(pos < n); - aom_write_symbol_pvq(w, pos, adapt->pvq_k1_cdf[cdf_id], n); - } - else { - mid = n >> 1; - count_right = k; - for (i = 0; i < mid; i++) count_right -= abs(y[i]); - aom_encode_pvq_split(w, adapt, count_right, k, od_pvq_size_ctx(n)); - aom_encode_band_pvq_splits(w, adapt, y, mid, k - count_right, level + 1); - aom_encode_band_pvq_splits(w, adapt, y + mid, n - mid, count_right, - level + 1); - } -} - -/** Encodes the tail of a Laplace-distributed variable, i.e. it doesn't - * do anything special for the zero case. - * - * @param [in,out] enc range encoder - * @param [in] x variable to encode (has to be positive) - * @param [in] decay decay factor of the distribution in Q8 format, - * i.e. pdf ~= decay^x - */ -void aom_laplace_encode_special(aom_writer *w, int x, unsigned decay) { - int shift; - int xs; - int sym; - const uint16_t *cdf; - shift = 0; - /* We don't want a large decay value because that would require too many - symbols. 
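   [Editor's note: a worked instance of the renormalization below: decay = 240
   exceeds 235, so one squaring step gives (240 * 240 + 128) >> 8 = 225 and
   shift = 1; because (decay/256)^x = ((decay/256)^2)^(x/2), halving the
   exponent via xs = x >> shift keeps the modelled distribution consistent
   while capping the number of symbols coded.]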
*/ - while (decay > 235) { - decay = (decay*decay + 128) >> 8; - shift++; - } - decay = OD_MINI(decay, 254); - decay = OD_MAXI(decay, 2); - xs = x >> shift; - cdf = EXP_CDF_TABLE[(decay + 1) >> 1]; - OD_LOG((OD_LOG_PVQ, OD_LOG_DEBUG, "decay = %d", decay)); - do { - sym = OD_MINI(xs, 15); - { - int i; - OD_LOG((OD_LOG_PVQ, OD_LOG_DEBUG, "%d %d %d %d %d\n", x, xs, shift, - sym, max)); - for (i = 0; i < 16; i++) { - OD_LOG_PARTIAL((OD_LOG_PVQ, OD_LOG_DEBUG, "%d ", cdf[i])); - } - OD_LOG_PARTIAL((OD_LOG_PVQ, OD_LOG_DEBUG, "\n")); - } - aom_write_cdf(w, sym, cdf, 16); - xs -= 15; - } while (sym >= 15); - if (shift) aom_write_literal(w, x & ((1 << shift) - 1), shift); -} diff --git a/third_party/aom/av1/encoder/lookahead.c b/third_party/aom/av1/encoder/lookahead.c index 591ca6152..1bf8ecbac 100644 --- a/third_party/aom/av1/encoder/lookahead.c +++ b/third_party/aom/av1/encoder/lookahead.c @@ -11,10 +11,9 @@ #include <assert.h> #include <stdlib.h> -#include "./aom_config.h" +#include "config/aom_config.h" #include "av1/common/common.h" - #include "av1/encoder/encoder.h" #include "av1/encoder/extend.h" #include "av1/encoder/lookahead.h" @@ -42,14 +41,9 @@ void av1_lookahead_destroy(struct lookahead_ctx *ctx) { } } -struct lookahead_ctx *av1_lookahead_init(unsigned int width, - unsigned int height, - unsigned int subsampling_x, - unsigned int subsampling_y, -#if CONFIG_HIGHBITDEPTH - int use_highbitdepth, -#endif - unsigned int depth) { +struct lookahead_ctx *av1_lookahead_init( + unsigned int width, unsigned int height, unsigned int subsampling_x, + unsigned int subsampling_y, int use_highbitdepth, unsigned int depth) { struct lookahead_ctx *ctx = NULL; // Clamp the lookahead queue depth @@ -68,10 +62,7 @@ struct lookahead_ctx *av1_lookahead_init(unsigned int width, if (!ctx->buf) goto bail; for (i = 0; i < depth; i++) if (aom_alloc_frame_buffer(&ctx->buf[i].img, width, height, subsampling_x, - subsampling_y, -#if CONFIG_HIGHBITDEPTH - use_highbitdepth, -#endif + subsampling_y, use_highbitdepth, AOM_BORDER_IN_PIXELS, legacy_byte_alignment)) goto bail; } @@ -84,10 +75,7 @@ bail: #define USE_PARTIAL_COPY 0 int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, - int64_t ts_start, int64_t ts_end, -#if CONFIG_HIGHBITDEPTH - int use_highbitdepth, -#endif + int64_t ts_start, int64_t ts_end, int use_highbitdepth, aom_enc_frame_flags_t flags) { struct lookahead_entry *buf; #if USE_PARTIAL_COPY @@ -160,10 +148,7 @@ int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG new_img; memset(&new_img, 0, sizeof(new_img)); if (aom_alloc_frame_buffer(&new_img, width, height, subsampling_x, - subsampling_y, -#if CONFIG_HIGHBITDEPTH - use_highbitdepth, -#endif + subsampling_y, use_highbitdepth, AOM_BORDER_IN_PIXELS, 0)) return 1; aom_free_frame_buffer(&buf->img); diff --git a/third_party/aom/av1/encoder/lookahead.h b/third_party/aom/av1/encoder/lookahead.h index 19f75d7e4..3897c2a6a 100644 --- a/third_party/aom/av1/encoder/lookahead.h +++ b/third_party/aom/av1/encoder/lookahead.h @@ -44,14 +44,9 @@ struct lookahead_ctx { * The lookahead stage is a queue of frame buffers on which some analysis * may be done when buffers are enqueued. 
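   [Editor's note: with the CONFIG_HIGHBITDEPTH guard gone, callers now always
   pass the flag; illustrative usage of the updated initializer:

     struct lookahead_ctx *ctx =
         av1_lookahead_init(width, height, subsampling_x, subsampling_y,
                            use_highbitdepth, depth);   ]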
*/ -struct lookahead_ctx *av1_lookahead_init(unsigned int width, - unsigned int height, - unsigned int subsampling_x, - unsigned int subsampling_y, -#if CONFIG_HIGHBITDEPTH - int use_highbitdepth, -#endif - unsigned int depth); +struct lookahead_ctx *av1_lookahead_init( + unsigned int width, unsigned int height, unsigned int subsampling_x, + unsigned int subsampling_y, int use_highbitdepth, unsigned int depth); /**\brief Destroys the lookahead stage */ @@ -73,10 +68,7 @@ void av1_lookahead_destroy(struct lookahead_ctx *ctx); * \param[in] active_map Map that specifies which macroblock is active */ int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, - int64_t ts_start, int64_t ts_end, -#if CONFIG_HIGHBITDEPTH - int use_highbitdepth, -#endif + int64_t ts_start, int64_t ts_end, int use_highbitdepth, aom_enc_frame_flags_t flags); /**\brief Get the next source buffer to encode diff --git a/third_party/aom/av1/encoder/mbgraph.c b/third_party/aom/av1/encoder/mbgraph.c index 7d2510af9..472173634 100644 --- a/third_party/aom/av1/encoder/mbgraph.c +++ b/third_party/aom/av1/encoder/mbgraph.c @@ -11,8 +11,8 @@ #include <limits.h> -#include "./av1_rtcd.h" -#include "./aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" @@ -47,32 +47,28 @@ static unsigned int do_16x16_motion_iteration(AV1_COMP *cpi, const MV *ref_mv, av1_hex_search(x, &ref_full, step_param, x->errorperbit, 0, cond_cost_list(cpi, cost_list), &v_fn_ptr, 0, ref_mv); -// Try sub-pixel MC -// if (bestsme > error_thresh && bestsme < INT_MAX) -#if CONFIG_AMVR - if (cpi->common.cur_frame_mv_precision_level == 1) { + // Try sub-pixel MC + // if (bestsme > error_thresh && bestsme < INT_MAX) + if (cpi->common.cur_frame_force_integer_mv == 1) { x->best_mv.as_mv.row *= 8; x->best_mv.as_mv.col *= 8; } else { -#else - { -#endif int distortion; unsigned int sse; - cpi->find_fractional_mv_step(x, ref_mv, cpi->common.allow_high_precision_mv, - x->errorperbit, &v_fn_ptr, 0, - mv_sf->subpel_iters_per_step, - cond_cost_list(cpi, cost_list), NULL, NULL, - &distortion, &sse, NULL, NULL, 0, 0, 0, 0, 0); + cpi->find_fractional_mv_step( + x, &cpi->common, mb_row, mb_col, ref_mv, + cpi->common.allow_high_precision_mv, x->errorperbit, &v_fn_ptr, 0, + mv_sf->subpel_iters_per_step, cond_cost_list(cpi, cost_list), NULL, + NULL, &distortion, &sse, NULL, NULL, 0, 0, 0, 0, 0); } - if (has_second_ref(&xd->mi[0]->mbmi)) - xd->mi[0]->mbmi.mode = NEW_NEWMV; + if (has_second_ref(xd->mi[0])) + xd->mi[0]->mode = NEW_NEWMV; else - xd->mi[0]->mbmi.mode = NEWMV; + xd->mi[0]->mode = NEWMV; - xd->mi[0]->mbmi.mv[0] = x->best_mv; - xd->mi[0]->mbmi.ref_frame[1] = NONE_FRAME; + xd->mi[0]->mv[0] = x->best_mv; + xd->mi[0]->ref_frame[1] = NONE_FRAME; av1_build_inter_predictors_sby(&cpi->common, xd, mb_row, mb_col, NULL, BLOCK_16X16); @@ -108,7 +104,7 @@ static int do_16x16_motion_search(AV1_COMP *cpi, const MV *ref_mv, int mb_row, // If the current best reference mv is not centered on 0,0 then do a 0,0 // based search as well. 
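/* [Editor's note] AV1 motion vectors are stored in 1/8-pel units, so when
   cur_frame_force_integer_mv is set the full-pel winner above is simply
   promoted with *= 8 instead of running the fractional refinement: a full-pel
   MV of (2, -3) becomes (16, -24) in stored units. */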
if (ref_mv->row != 0 || ref_mv->col != 0) { - MV zero_ref_mv = { 0, 0 }; + MV zero_ref_mv = kZeroMv; tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, mb_row, mb_col); if (tmp_err < err) { @@ -144,14 +140,14 @@ static int find_best_16x16_intra(AV1_COMP *cpi, PREDICTION_MODE *pbest_mode) { // calculate SATD for each intra prediction mode; // we're intentionally not doing 4x4, we just want a rough estimate - for (mode = DC_PRED; mode <= TM_PRED; mode++) { + for (mode = DC_PRED; mode <= PAETH_PRED; mode++) { unsigned int err; - xd->mi[0]->mbmi.mode = mode; - av1_predict_intra_block(cm, xd, 16, 16, BLOCK_16X16, mode, - x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].dst.buf, xd->plane[0].dst.stride, 0, 0, - 0); + xd->mi[0]->mode = mode; + av1_predict_intra_block(cm, xd, 16, 16, TX_16X16, mode, 0, 0, + FILTER_INTRA_MODES, x->plane[0].src.buf, + x->plane[0].src.stride, xd->plane[0].dst.buf, + xd->plane[0].dst.stride, 0, 0, 0); err = aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf, xd->plane[0].dst.stride); @@ -231,8 +227,8 @@ static void update_mbgraph_frame_stats(AV1_COMP *cpi, int mb_col, mb_row, offset = 0; int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0; - MV gld_top_mv = { 0, 0 }; - MODE_INFO mi_local; + MV gld_top_mv = kZeroMv; + MB_MODE_INFO mi_local; av1_zero(mi_local); // Set up limit values for motion vectors to prevent them extending outside @@ -244,9 +240,9 @@ static void update_mbgraph_frame_stats(AV1_COMP *cpi, xd->plane[0].pre[0].stride = buf->y_stride; xd->plane[1].dst.stride = buf->uv_stride; xd->mi[0] = &mi_local; - mi_local.mbmi.sb_type = BLOCK_16X16; - mi_local.mbmi.ref_frame[0] = LAST_FRAME; - mi_local.mbmi.ref_frame[1] = NONE_FRAME; + mi_local.sb_type = BLOCK_16X16; + mi_local.ref_frame[0] = LAST_FRAME; + mi_local.ref_frame[1] = NONE_FRAME; for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { MV gld_left_mv = gld_top_mv; diff --git a/third_party/aom/av1/encoder/mbgraph.h b/third_party/aom/av1/encoder/mbgraph.h index 758e2ad15..3e0a4fa9b 100644 --- a/third_party/aom/av1/encoder/mbgraph.h +++ b/third_party/aom/av1/encoder/mbgraph.h @@ -23,10 +23,12 @@ typedef struct { int_mv mv; PREDICTION_MODE mode; } m; - } ref[TOTAL_REFS_PER_FRAME]; + } ref[REF_FRAMES]; } MBGRAPH_MB_STATS; -typedef struct { MBGRAPH_MB_STATS *mb_stats; } MBGRAPH_FRAME_STATS; +typedef struct { + MBGRAPH_MB_STATS *mb_stats; +} MBGRAPH_FRAME_STATS; struct AV1_COMP; diff --git a/third_party/aom/av1/encoder/mcomp.c b/third_party/aom/av1/encoder/mcomp.c index 6c8503da0..c4572a341 100644 --- a/third_party/aom/av1/encoder/mcomp.c +++ b/third_party/aom/av1/encoder/mcomp.c @@ -13,8 +13,8 @@ #include <math.h> #include <stdio.h> -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" @@ -22,9 +22,11 @@ #include "av1/common/common.h" #include "av1/common/mvref_common.h" +#include "av1/common/onyxc_int.h" #include "av1/common/reconinter.h" #include "av1/encoder/encoder.h" +#include "av1/encoder/encodemv.h" #include "av1/encoder/mcomp.h" #include "av1/encoder/rdopt.h" @@ -54,10 +56,9 @@ void av1_set_mv_search_range(MvLimits *mv_limits, const MV *mv) { if (mv_limits->row_max > row_max) mv_limits->row_max = row_max; } -static void av1_set_subpel_mv_search_range(const MvLimits *mv_limits, - int *col_min, int *col_max, - int *row_min, int *row_max, - const MV *ref_mv) { +static void set_subpel_mv_search_range(const MvLimits *mv_limits, int 
*col_min, + int *col_max, int *row_min, int *row_max, + const MV *ref_mv) { const int max_mv = MAX_FULL_PEL_VAL * 8; const int minc = AOMMAX(mv_limits->col_min * 8, ref_mv->col - max_mv); const int maxc = AOMMIN(mv_limits->col_max * 8, ref_mv->col + max_mv); @@ -172,57 +173,64 @@ void av1_init3smotion_compensation(search_site_config *cfg, int stride) { static INLINE int sp(int x) { return x & 7; } static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { - return &buf[(r >> 3) * stride + (c >> 3)]; + const int offset = (r >> 3) * stride + (c >> 3); + return buf + offset; } /* checks if (r, c) has better score than previous best */ -#define CHECK_BETTER(v, r, c) \ - if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - MV this_mv = { r, c }; \ - v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \ - if (second_pred == NULL) \ - thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \ - src_address, src_stride, &sse); \ - else if (mask) \ - thismse = vfp->msvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \ - src_address, src_stride, second_pred, mask, \ - mask_stride, invert_mask, &sse); \ - else \ - thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \ - src_address, src_stride, &sse, second_pred); \ - v += thismse; \ - if (v < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - } else { \ - v = INT_MAX; \ +#define CHECK_BETTER(v, r, c) \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + MV this_mv = { r, c }; \ + v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \ + if (second_pred == NULL) { \ + thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \ + src_address, src_stride, &sse); \ + } else if (mask) { \ + thismse = vfp->msvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \ + src_address, src_stride, second_pred, mask, \ + mask_stride, invert_mask, &sse); \ + } else { \ + if (xd->jcp_param.use_jnt_comp_avg) \ + thismse = vfp->jsvaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \ + src_address, src_stride, &sse, second_pred, \ + &xd->jcp_param); \ + else \ + thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \ + src_address, src_stride, &sse, second_pred); \ + } \ + v += thismse; \ + if (v < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ } #define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c) /* checks if (r, c) has better score than previous best */ -#define CHECK_BETTER1(v, r, c) \ - if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - MV this_mv = { r, c }; \ - thismse = upsampled_pref_error(xd, vfp, src_address, src_stride, \ - pre(y, y_stride, r, c), y_stride, sp(c), \ - sp(r), second_pred, mask, mask_stride, \ - invert_mask, w, h, &sse); \ - v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \ - v += thismse; \ - if (v < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - } else { \ - v = INT_MAX; \ +#define CHECK_BETTER1(v, r, c) \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + MV this_mv = { r, c }; \ + thismse = upsampled_pref_error( \ + xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride, \ + pre(y, y_stride, r, c), y_stride, sp(c), sp(r), second_pred, mask, \ + mask_stride, invert_mask, w, h, &sse); \ + v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \ + v += thismse; \ + if (v < 
besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ } #define FIRST_LEVEL_CHECKS \ @@ -294,33 +302,33 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { } \ } -#define SETUP_SUBPEL_SEARCH \ - const uint8_t *const src_address = x->plane[0].src.buf; \ - const int src_stride = x->plane[0].src.stride; \ - const MACROBLOCKD *xd = &x->e_mbd; \ - unsigned int besterr = INT_MAX; \ - unsigned int sse; \ - unsigned int whichdir; \ - int thismse; \ - MV *bestmv = &x->best_mv.as_mv; \ - const unsigned int halfiters = iters_per_step; \ - const unsigned int quarteriters = iters_per_step; \ - const unsigned int eighthiters = iters_per_step; \ - const int y_stride = xd->plane[0].pre[0].stride; \ - const int offset = bestmv->row * y_stride + bestmv->col; \ - const uint8_t *const y = xd->plane[0].pre[0].buf; \ - \ - int br = bestmv->row * 8; \ - int bc = bestmv->col * 8; \ - int hstep = 4; \ - int minc, maxc, minr, maxr; \ - int tr = br; \ - int tc = bc; \ - \ - av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, \ - ref_mv); \ - \ - bestmv->row *= 8; \ +#define SETUP_SUBPEL_SEARCH \ + const uint8_t *const src_address = x->plane[0].src.buf; \ + const int src_stride = x->plane[0].src.stride; \ + const MACROBLOCKD *xd = &x->e_mbd; \ + unsigned int besterr = INT_MAX; \ + unsigned int sse; \ + unsigned int whichdir; \ + int thismse; \ + MV *bestmv = &x->best_mv.as_mv; \ + const unsigned int halfiters = iters_per_step; \ + const unsigned int quarteriters = iters_per_step; \ + const unsigned int eighthiters = iters_per_step; \ + const int y_stride = xd->plane[0].pre[0].stride; \ + const int offset = bestmv->row * y_stride + bestmv->col; \ + const uint8_t *const y = xd->plane[0].pre[0].buf; \ + \ + int br = bestmv->row * 8; \ + int bc = bestmv->col * 8; \ + int hstep = 4; \ + int minc, maxc, minr, maxr; \ + int tr = br; \ + int tc = bc; \ + \ + set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, \ + ref_mv); \ + \ + bestmv->row *= 8; \ bestmv->col *= 8; static unsigned int setup_center_error( @@ -331,25 +339,34 @@ static unsigned int setup_center_error( int mask_stride, int invert_mask, int w, int h, int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion) { unsigned int besterr; -#if CONFIG_HIGHBITDEPTH if (second_pred != NULL) { if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]); - if (mask) + if (mask) { aom_highbd_comp_mask_pred(comp_pred16, second_pred, w, h, y + offset, y_stride, mask, mask_stride, invert_mask); - else - aom_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, - y_stride); + } else { + if (xd->jcp_param.use_jnt_comp_avg) + aom_highbd_jnt_comp_avg_pred(comp_pred16, second_pred, w, h, + y + offset, y_stride, &xd->jcp_param); + else + aom_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, + y_stride); + } besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1); } else { DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]); - if (mask) + if (mask) { aom_comp_mask_pred(comp_pred, second_pred, w, h, y + offset, y_stride, mask, mask_stride, invert_mask); - else - aom_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); + } else { + if (xd->jcp_param.use_jnt_comp_avg) + aom_jnt_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, + y_stride, &xd->jcp_param); + else + aom_comp_avg_pred(comp_pred, 
second_pred, w, h, y + offset, y_stride); + } besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); } } else { @@ -357,22 +374,6 @@ static unsigned int setup_center_error( } *distortion = besterr; besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); -#else - (void)xd; - if (second_pred != NULL) { - DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]); - if (mask) - aom_comp_mask_pred(comp_pred, second_pred, w, h, y + offset, y_stride, - mask, mask_stride, invert_mask); - else - aom_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); - besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); - } else { - besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1); - } - *distortion = besterr; - besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); -#endif // CONFIG_HIGHBITDEPTH return besterr; } @@ -401,11 +402,13 @@ static void get_cost_surf_min(int *cost_list, int *ir, int *ic, int bits) { } int av1_find_best_sub_pixel_tree_pruned_evenmore( - MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit, + MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col, + const MV *ref_mv, int allow_hp, int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask, - int mask_stride, int invert_mask, int w, int h, int use_upsampled_ref) { + int mask_stride, int invert_mask, int w, int h, + int use_accurate_subpel_search) { SETUP_SUBPEL_SEARCH; besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y, y_stride, @@ -418,7 +421,10 @@ int av1_find_best_sub_pixel_tree_pruned_evenmore( (void)allow_hp; (void)forced_stop; (void)hstep; - (void)use_upsampled_ref; + (void)use_accurate_subpel_search; + (void)cm; + (void)mi_row; + (void)mi_col; if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX && cost_list[2] != INT_MAX && cost_list[3] != INT_MAX && @@ -468,13 +474,18 @@ int av1_find_best_sub_pixel_tree_pruned_evenmore( } int av1_find_best_sub_pixel_tree_pruned_more( - MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit, + MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col, + const MV *ref_mv, int allow_hp, int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask, - int mask_stride, int invert_mask, int w, int h, int use_upsampled_ref) { + int mask_stride, int invert_mask, int w, int h, + int use_accurate_subpel_search) { SETUP_SUBPEL_SEARCH; - (void)use_upsampled_ref; + (void)use_accurate_subpel_search; + (void)cm; + (void)mi_row; + (void)mi_col; besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y, y_stride, @@ -531,13 +542,18 @@ int av1_find_best_sub_pixel_tree_pruned_more( } int av1_find_best_sub_pixel_tree_pruned( - MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit, + MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col, + const MV *ref_mv, int allow_hp, int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask, - int mask_stride, int invert_mask, int w, int h, int use_upsampled_ref) { + int mask_stride, 
int invert_mask, int w, int h, + int use_accurate_subpel_search) { SETUP_SUBPEL_SEARCH; - (void)use_upsampled_ref; + (void)use_accurate_subpel_search; + (void)cm; + (void)mi_row; + (void)mi_col; besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y, y_stride, @@ -624,7 +640,8 @@ static const MV search_step_table[12] = { }; /* clang-format on */ -static int upsampled_pref_error(const MACROBLOCKD *xd, +static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *const cm, + int mi_row, int mi_col, const MV *const mv, const aom_variance_fn_ptr_t *vfp, const uint8_t *const src, const int src_stride, const uint8_t *const y, int y_stride, @@ -633,73 +650,105 @@ static int upsampled_pref_error(const MACROBLOCKD *xd, int mask_stride, int invert_mask, int w, int h, unsigned int *sse) { unsigned int besterr; -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]); if (second_pred != NULL) { - if (mask) + if (mask) { aom_highbd_comp_mask_upsampled_pred( - pred16, second_pred, w, h, subpel_x_q3, subpel_y_q3, y, y_stride, - mask, mask_stride, invert_mask, xd->bd); - else - aom_highbd_comp_avg_upsampled_pred(pred16, second_pred, w, h, - subpel_x_q3, subpel_y_q3, y, - y_stride, xd->bd); + xd, cm, mi_row, mi_col, mv, pred16, second_pred, w, h, subpel_x_q3, + subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask, xd->bd); + } else { + if (xd->jcp_param.use_jnt_comp_avg) + aom_highbd_jnt_comp_avg_upsampled_pred( + xd, cm, mi_row, mi_col, mv, pred16, second_pred, w, h, + subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd, &xd->jcp_param); + else + aom_highbd_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16, + second_pred, w, h, subpel_x_q3, + subpel_y_q3, y, y_stride, xd->bd); + } } else { - aom_highbd_upsampled_pred(pred16, w, h, subpel_x_q3, subpel_y_q3, y, - y_stride, xd->bd); + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16, w, h, + subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd); } besterr = vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, sse); } else { DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); -#else - DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); - (void)xd; -#endif // CONFIG_HIGHBITDEPTH if (second_pred != NULL) { - if (mask) - aom_comp_mask_upsampled_pred(pred, second_pred, w, h, subpel_x_q3, - subpel_y_q3, y, y_stride, mask, - mask_stride, invert_mask); - else - aom_comp_avg_upsampled_pred(pred, second_pred, w, h, subpel_x_q3, - subpel_y_q3, y, y_stride); + if (mask) { + aom_comp_mask_upsampled_pred( + xd, cm, mi_row, mi_col, mv, pred, second_pred, w, h, subpel_x_q3, + subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask); + } else { + if (xd->jcp_param.use_jnt_comp_avg) + aom_jnt_comp_avg_upsampled_pred( + xd, cm, mi_row, mi_col, mv, pred, second_pred, w, h, subpel_x_q3, + subpel_y_q3, y, y_stride, &xd->jcp_param); + else + aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, + second_pred, w, h, subpel_x_q3, + subpel_y_q3, y, y_stride); + } } else { - aom_upsampled_pred(pred, w, h, subpel_x_q3, subpel_y_q3, y, y_stride); + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3, + subpel_y_q3, y, y_stride); } besterr = vfp->vf(pred, w, src, src_stride, sse); -#if CONFIG_HIGHBITDEPTH } -#endif return besterr; } static unsigned int upsampled_setup_center_error( - const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv, - int error_per_bit, const aom_variance_fn_ptr_t *vfp, - const uint8_t *const src, const 
int src_stride, const uint8_t *const y, - int y_stride, const uint8_t *second_pred, const uint8_t *mask, - int mask_stride, int invert_mask, int w, int h, int offset, int *mvjcost, - int *mvcost[2], unsigned int *sse1, int *distortion) { + MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, + const MV *bestmv, const MV *ref_mv, int error_per_bit, + const aom_variance_fn_ptr_t *vfp, const uint8_t *const src, + const int src_stride, const uint8_t *const y, int y_stride, + const uint8_t *second_pred, const uint8_t *mask, int mask_stride, + int invert_mask, int w, int h, int offset, int *mvjcost, int *mvcost[2], + unsigned int *sse1, int *distortion) { unsigned int besterr = upsampled_pref_error( - xd, vfp, src, src_stride, y + offset, y_stride, 0, 0, second_pred, mask, - mask_stride, invert_mask, w, h, sse1); + xd, cm, mi_row, mi_col, bestmv, vfp, src, src_stride, y + offset, + y_stride, 0, 0, second_pred, mask, mask_stride, invert_mask, w, h, sse1); *distortion = besterr; besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); return besterr; } +// when use_accurate_subpel_search == 0 +static INLINE unsigned int estimate_upsampled_pref_error( + MACROBLOCKD *xd, const aom_variance_fn_ptr_t *vfp, const uint8_t *const src, + const int src_stride, const uint8_t *const pre, int y_stride, + int subpel_x_q3, int subpel_y_q3, const uint8_t *second_pred, + const uint8_t *mask, int mask_stride, int invert_mask, unsigned int *sse) { + if (second_pred == NULL) { + return vfp->svf(pre, y_stride, subpel_x_q3, subpel_y_q3, src, src_stride, + sse); + } else if (mask) { + return vfp->msvf(pre, y_stride, subpel_x_q3, subpel_y_q3, src, src_stride, + second_pred, mask, mask_stride, invert_mask, sse); + } else { + if (xd->jcp_param.use_jnt_comp_avg) + return vfp->jsvaf(pre, y_stride, subpel_x_q3, subpel_y_q3, src, + src_stride, sse, second_pred, &xd->jcp_param); + else + return vfp->svaf(pre, y_stride, subpel_x_q3, subpel_y_q3, src, src_stride, + sse, second_pred); + } +} + int av1_find_best_sub_pixel_tree( - MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit, + MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col, + const MV *ref_mv, int allow_hp, int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask, - int mask_stride, int invert_mask, int w, int h, int use_upsampled_ref) { + int mask_stride, int invert_mask, int w, int h, + int use_accurate_subpel_search) { const uint8_t *const src_address = x->plane[0].src.buf; const int src_stride = x->plane[0].src.stride; - const MACROBLOCKD *xd = &x->e_mbd; + MACROBLOCKD *xd = &x->e_mbd; unsigned int besterr = INT_MAX; unsigned int sse; unsigned int thismse; @@ -720,8 +769,7 @@ int av1_find_best_sub_pixel_tree( int kr, kc; int minc, maxc, minr, maxr; - av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, - ref_mv); + set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, ref_mv); if (!allow_hp) if (round == 3) round = 2; @@ -729,12 +777,11 @@ int av1_find_best_sub_pixel_tree( bestmv->row *= 8; bestmv->col *= 8; - // use_upsampled_ref can be 0 or 1 - if (use_upsampled_ref) + if (use_accurate_subpel_search) besterr = upsampled_setup_center_error( - xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y, - y_stride, second_pred, mask, mask_stride, invert_mask, w, h, offset, - mvjcost, mvcost, sse1, 
distortion); + xd, cm, mi_row, mi_col, bestmv, ref_mv, error_per_bit, vfp, src_address, + src_stride, y, y_stride, second_pred, mask, mask_stride, invert_mask, w, + h, offset, mvjcost, mvcost, sse1, distortion); else besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y, y_stride, @@ -751,23 +798,16 @@ int av1_find_best_sub_pixel_tree( if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { MV this_mv = { tr, tc }; - if (use_upsampled_ref) { - thismse = upsampled_pref_error(xd, vfp, src_address, src_stride, - pre(y, y_stride, tr, tc), y_stride, - sp(tc), sp(tr), second_pred, mask, - mask_stride, invert_mask, w, h, &sse); + if (use_accurate_subpel_search) { + thismse = upsampled_pref_error( + xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride, + pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), second_pred, + mask, mask_stride, invert_mask, w, h, &sse); } else { - const uint8_t *const pre_address = pre(y, y_stride, tr, tc); - if (second_pred == NULL) - thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), - src_address, src_stride, &sse); - else if (mask) - thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr), - src_address, src_stride, second_pred, mask, - mask_stride, invert_mask, &sse); - else - thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), - src_address, src_stride, &sse, second_pred); + thismse = estimate_upsampled_pref_error( + xd, vfp, src_address, src_stride, pre(y, y_stride, tr, tc), + y_stride, sp(tc), sp(tr), second_pred, mask, mask_stride, + invert_mask, &sse); } cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, @@ -793,24 +833,16 @@ int av1_find_best_sub_pixel_tree( if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { MV this_mv = { tr, tc }; - if (use_upsampled_ref) { - thismse = upsampled_pref_error(xd, vfp, src_address, src_stride, - pre(y, y_stride, tr, tc), y_stride, - sp(tc), sp(tr), second_pred, mask, - mask_stride, invert_mask, w, h, &sse); + if (use_accurate_subpel_search) { + thismse = upsampled_pref_error( + xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride, + pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), second_pred, + mask, mask_stride, invert_mask, w, h, &sse); } else { - const uint8_t *const pre_address = pre(y, y_stride, tr, tc); - - if (second_pred == NULL) - thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, - src_stride, &sse); - else if (mask) - thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr), - src_address, src_stride, second_pred, mask, - mask_stride, invert_mask, &sse); - else - thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), - src_address, src_stride, &sse, second_pred); + thismse = estimate_upsampled_pref_error( + xd, vfp, src_address, src_stride, pre(y, y_stride, tr, tc), + y_stride, sp(tc), sp(tr), second_pred, mask, mask_stride, + invert_mask, &sse); } cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, @@ -835,7 +867,7 @@ int av1_find_best_sub_pixel_tree( } if (iters_per_step > 1 && best_idx != -1) { - if (use_upsampled_ref) { + if (use_accurate_subpel_search) { SECOND_LEVEL_CHECKS_BEST(1); } else { SECOND_LEVEL_CHECKS_BEST(0); @@ -861,63 +893,51 @@ int av1_find_best_sub_pixel_tree( #undef PRE #undef CHECK_BETTER -#if CONFIG_WARPED_MOTION unsigned int av1_compute_motion_cost(const AV1_COMP *cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int mi_row, int mi_col, const MV *this_mv) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; - MODE_INFO *mi = 
xd->mi[0]; - MB_MODE_INFO *mbmi = &mi->mbmi; const uint8_t *const src = x->plane[0].src.buf; const int src_stride = x->plane[0].src.stride; uint8_t *const dst = xd->plane[0].dst.buf; const int dst_stride = xd->plane[0].dst.stride; const aom_variance_fn_ptr_t *vfp = &cpi->fn_ptr[bsize]; - const MV ref_mv = x->mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv; + const int_mv ref_mv = av1_get_ref_mv(x, 0); unsigned int mse; unsigned int sse; av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize); mse = vfp->vf(dst, dst_stride, src, src_stride, &sse); - mse += - mv_err_cost(this_mv, &ref_mv, x->nmvjointcost, x->mvcost, x->errorperbit); + mse += mv_err_cost(this_mv, &ref_mv.as_mv, x->nmvjointcost, x->mvcost, + x->errorperbit); return mse; } // Refine MV in a small range -#if WARPED_MOTION_SORT_SAMPLES unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int mi_row, int mi_col, - int *pts0, int *pts_inref0, int *pts_mv0, + int *pts0, int *pts_inref0, int total_samples) { -#else -unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x, - BLOCK_SIZE bsize, int mi_row, int mi_col, - int *pts, int *pts_inref) { -#endif // WARPED_MOTION_SORT_SAMPLES const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; - MODE_INFO *mi = xd->mi[0]; - MB_MODE_INFO *mbmi = &mi->mbmi; + MB_MODE_INFO *mbmi = xd->mi[0]; const MV neighbors[8] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }, { 0, -2 }, { 2, 0 }, { 0, 2 }, { -2, 0 } }; - const MV ref_mv = x->mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv; + const int_mv ref_mv = av1_get_ref_mv(x, 0); int16_t br = mbmi->mv[0].as_mv.row; int16_t bc = mbmi->mv[0].as_mv.col; int16_t *tr = &mbmi->mv[0].as_mv.row; int16_t *tc = &mbmi->mv[0].as_mv.col; WarpedMotionParams best_wm_params = mbmi->wm_params[0]; -#if WARPED_MOTION_SORT_SAMPLES int best_num_proj_ref = mbmi->num_proj_ref[0]; -#endif // WARPED_MOTION_SORT_SAMPLES unsigned int bestmse; int minc, maxc, minr, maxr; const int start = cm->allow_high_precision_mv ? 
0 : 4; int ite; - av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, - &ref_mv); + set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, + &ref_mv.as_mv); // Calculate the center position's error assert(bc >= minc && bc <= maxc && br >= minr && br <= maxr); @@ -937,15 +957,13 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x, if (*tc >= minc && *tc <= maxc && *tr >= minr && *tr <= maxr) { MV this_mv = { *tr, *tc }; -#if WARPED_MOTION_SORT_SAMPLES int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0)); memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0)); if (total_samples > 1) mbmi->num_proj_ref[0] = - sortSamples(pts_mv0, &this_mv, pts, pts_inref, total_samples); -#endif // WARPED_MOTION_SORT_SAMPLES + selectSamples(&this_mv, pts, pts_inref, total_samples, bsize); if (!find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize, *tr, *tc, &mbmi->wm_params[0], mi_row, mi_col)) { @@ -955,9 +973,7 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x, if (thismse < bestmse) { best_idx = idx; best_wm_params = mbmi->wm_params[0]; -#if WARPED_MOTION_SORT_SAMPLES best_num_proj_ref = mbmi->num_proj_ref[0]; -#endif // WARPED_MOTION_SORT_SAMPLES bestmse = thismse; } } @@ -975,12 +991,9 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x, *tr = br; *tc = bc; mbmi->wm_params[0] = best_wm_params; -#if WARPED_MOTION_SORT_SAMPLES mbmi->num_proj_ref[0] = best_num_proj_ref; -#endif // WARPED_MOTION_SORT_SAMPLES return bestmse; } -#endif // CONFIG_WARPED_MOTION static INLINE int check_bounds(const MvLimits *mv_limits, int row, int col, int range) { @@ -1386,11 +1399,19 @@ int av1_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv, const MV mv = { best_mv->row * 8, best_mv->col * 8 }; unsigned int unused; - return vfp->svaf(get_buf_from_mv(in_what, best_mv), in_what->stride, 0, 0, - what->buf, what->stride, &unused, second_pred) + - (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost, - x->errorperbit) - : 0); + if (xd->jcp_param.use_jnt_comp_avg) + return vfp->jsvaf(get_buf_from_mv(in_what, best_mv), in_what->stride, 0, 0, + what->buf, what->stride, &unused, second_pred, + &xd->jcp_param) + + (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost, + x->errorperbit) + : 0); + else + return vfp->svaf(get_buf_from_mv(in_what, best_mv), in_what->stride, 0, 0, + what->buf, what->stride, &unused, second_pred) + + (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost, + x->errorperbit) + : 0); } int av1_get_mvpred_mask_var(const MACROBLOCK *x, const MV *best_mv, @@ -1785,205 +1806,6 @@ int av1_diamond_search_sad_c(MACROBLOCK *x, const search_site_config *cfg, return bestsad; } -static int vector_match(int16_t *ref, int16_t *src, int bwl) { - int best_sad = INT_MAX; - int this_sad; - int d; - int center, offset = 0; - int bw = 4 << bwl; // redundant variable, to be changed in the experiments. 
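A note on the helper being removed here: vector_match() is a coarse-to-fine 1-D search over the row/column projection profiles, one pass at stride 16, then refinement at offsets of 8, 4, 2 and 1 around the running best. A minimal standalone sketch of that schedule, assuming a caller-supplied cost callback (coarse_to_fine_1d and its parameters are illustrative names, not aom API):

#include <limits.h>

/* Minimize cost() over [0, bw] with the same step schedule as the
 * deleted vector_match(): stride-16 scan, then +/-8, +/-4, +/-2, +/-1. */
static int coarse_to_fine_1d(int bw, int (*cost)(int pos, void *ctx),
                             void *ctx) {
  int best = INT_MAX, center = 0;
  for (int d = 0; d <= bw; d += 16) { /* coarse scan */
    const int c = cost(d, ctx);
    if (c < best) { best = c; center = d; }
  }
  for (int step = 8; step >= 1; step >>= 1) {
    const int base = center; /* refine around the previous level's best */
    for (int sign = -1; sign <= 1; sign += 2) {
      const int pos = base + sign * step;
      if (pos < 0 || pos > bw) continue; /* stay inside the search window */
      const int c = cost(pos, ctx);
      if (c < best) { best = c; center = pos; }
    }
  }
  return center - (bw >> 1); /* offset relative to the window center */
}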
- for (d = 0; d <= bw; d += 16) { - this_sad = aom_vector_var(&ref[d], src, bwl); - if (this_sad < best_sad) { - best_sad = this_sad; - offset = d; - } - } - center = offset; - - for (d = -8; d <= 8; d += 16) { - int this_pos = offset + d; - // check limit - if (this_pos < 0 || this_pos > bw) continue; - this_sad = aom_vector_var(&ref[this_pos], src, bwl); - if (this_sad < best_sad) { - best_sad = this_sad; - center = this_pos; - } - } - offset = center; - - for (d = -4; d <= 4; d += 8) { - int this_pos = offset + d; - // check limit - if (this_pos < 0 || this_pos > bw) continue; - this_sad = aom_vector_var(&ref[this_pos], src, bwl); - if (this_sad < best_sad) { - best_sad = this_sad; - center = this_pos; - } - } - offset = center; - - for (d = -2; d <= 2; d += 4) { - int this_pos = offset + d; - // check limit - if (this_pos < 0 || this_pos > bw) continue; - this_sad = aom_vector_var(&ref[this_pos], src, bwl); - if (this_sad < best_sad) { - best_sad = this_sad; - center = this_pos; - } - } - offset = center; - - for (d = -1; d <= 1; d += 2) { - int this_pos = offset + d; - // check limit - if (this_pos < 0 || this_pos > bw) continue; - this_sad = aom_vector_var(&ref[this_pos], src, bwl); - if (this_sad < best_sad) { - best_sad = this_sad; - center = this_pos; - } - } - - return (center - (bw >> 1)); -} - -static const MV search_pos[4] = { - { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 }, -}; - -unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, int mi_row, - int mi_col) { - MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } }; - DECLARE_ALIGNED(16, int16_t, hbuf[2 * MAX_SB_SIZE]); - DECLARE_ALIGNED(16, int16_t, vbuf[2 * MAX_SB_SIZE]); - DECLARE_ALIGNED(16, int16_t, src_hbuf[MAX_SB_SQUARE]); - DECLARE_ALIGNED(16, int16_t, src_vbuf[MAX_SB_SQUARE]); - int idx; - const int src_stride = x->plane[0].src.stride; - const int ref_stride = xd->plane[0].pre[0].stride; - uint8_t const *ref_buf, *src_buf; - MV *tmp_mv = &xd->mi[0]->mbmi.mv[0].as_mv; - unsigned int best_sad, tmp_sad, sad_arr[4]; - MV this_mv; - const YV12_BUFFER_CONFIG *scaled_ref_frame = - av1_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]); - - if (scaled_ref_frame) { - int i; - // Swap out the reference frame for a version that's been scaled to - // match the resolution of the current frame, allowing the existing - // motion search code to be used without additional modifications. 
- for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0]; - av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL); - } - -#if CONFIG_HIGHBITDEPTH - { - unsigned int this_sad; - tmp_mv->row = 0; - tmp_mv->col = 0; - this_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride, - xd->plane[0].pre[0].buf, ref_stride); - - if (scaled_ref_frame) { - int i; - for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; - } - return this_sad; - } -#endif - - const int bw = 4 << b_width_log2_lookup[bsize]; - const int bh = 4 << b_height_log2_lookup[bsize]; - const int search_width = bw << 1; - const int search_height = bh << 1; - const int norm_factor = 3 + (bw >> 5); - - // Set up prediction 1-D reference set - ref_buf = xd->plane[0].pre[0].buf - (bw >> 1); - for (idx = 0; idx < search_width; idx += 16) { - aom_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh); - ref_buf += 16; - } - - ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride; - for (idx = 0; idx < search_height; ++idx) { - vbuf[idx] = aom_int_pro_col(ref_buf, bw) >> norm_factor; - ref_buf += ref_stride; - } - - // Set up src 1-D reference set - for (idx = 0; idx < bw; idx += 16) { - src_buf = x->plane[0].src.buf + idx; - aom_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh); - } - - src_buf = x->plane[0].src.buf; - for (idx = 0; idx < bh; ++idx) { - src_vbuf[idx] = aom_int_pro_col(src_buf, bw) >> norm_factor; - src_buf += src_stride; - } - - // Find the best match per 1-D search - tmp_mv->col = vector_match(hbuf, src_hbuf, b_width_log2_lookup[bsize]); - tmp_mv->row = vector_match(vbuf, src_vbuf, b_height_log2_lookup[bsize]); - - this_mv = *tmp_mv; - src_buf = x->plane[0].src.buf; - ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col; - best_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); - - { - const uint8_t *const pos[4] = { - ref_buf - ref_stride, ref_buf - 1, ref_buf + 1, ref_buf + ref_stride, - }; - - cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, sad_arr); - } - - for (idx = 0; idx < 4; ++idx) { - if (sad_arr[idx] < best_sad) { - best_sad = sad_arr[idx]; - tmp_mv->row = search_pos[idx].row + this_mv.row; - tmp_mv->col = search_pos[idx].col + this_mv.col; - } - } - - if (sad_arr[0] < sad_arr[3]) - this_mv.row -= 1; - else - this_mv.row += 1; - - if (sad_arr[1] < sad_arr[2]) - this_mv.col -= 1; - else - this_mv.col += 1; - - ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col; - - tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); - if (best_sad > tmp_sad) { - *tmp_mv = this_mv; - best_sad = tmp_sad; - } - - tmp_mv->row *= 8; - tmp_mv->col *= 8; - - if (scaled_ref_frame) { - int i; - for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; - } - - return best_sad; -} - /* do_refine: If last step (1-away) of n-step search doesn't pick the center point as the best match, we will do a final 1-away diamond refining search */ @@ -2110,197 +1932,6 @@ static int full_pixel_exhaustive(const AV1_COMP *const cpi, MACROBLOCK *x, return bestsme; } -int av1_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv, - int sad_per_bit, int distance, - const aom_variance_fn_ptr_t *fn_ptr, - const MV *center_mv, MV *best_mv) { - int r, c; - const MACROBLOCKD *const xd = &x->e_mbd; - const struct buf_2d *const what = &x->plane[0].src; - const struct buf_2d *const in_what = &xd->plane[0].pre[0]; - const int row_min = AOMMAX(ref_mv->row - distance, x->mv_limits.row_min); 
- const int row_max = AOMMIN(ref_mv->row + distance, x->mv_limits.row_max); - const int col_min = AOMMAX(ref_mv->col - distance, x->mv_limits.col_min); - const int col_max = AOMMIN(ref_mv->col + distance, x->mv_limits.col_max); - const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - int best_sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), - in_what->stride) + - mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - *best_mv = *ref_mv; - - for (r = row_min; r < row_max; ++r) { - for (c = col_min; c < col_max; ++c) { - const MV mv = { r, c }; - const int sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv), - in_what->stride) + - mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - } - return best_sad; -} - -int av1_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv, - int sad_per_bit, int distance, - const aom_variance_fn_ptr_t *fn_ptr, - const MV *center_mv, MV *best_mv) { - int r; - const MACROBLOCKD *const xd = &x->e_mbd; - const struct buf_2d *const what = &x->plane[0].src; - const struct buf_2d *const in_what = &xd->plane[0].pre[0]; - const int row_min = AOMMAX(ref_mv->row - distance, x->mv_limits.row_min); - const int row_max = AOMMIN(ref_mv->row + distance, x->mv_limits.row_max); - const int col_min = AOMMAX(ref_mv->col - distance, x->mv_limits.col_min); - const int col_max = AOMMIN(ref_mv->col + distance, x->mv_limits.col_max); - const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - unsigned int best_sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), - in_what->stride) + - mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - *best_mv = *ref_mv; - - for (r = row_min; r < row_max; ++r) { - int c = col_min; - const uint8_t *check_here = &in_what->buf[r * in_what->stride + c]; - - if (fn_ptr->sdx3f != NULL) { - while ((c + 2) < col_max) { - int i; - DECLARE_ALIGNED(16, uint32_t, sads[3]); - - fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride, - sads); - - for (i = 0; i < 3; ++i) { - unsigned int sad = sads[i]; - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - } - - while (c < col_max) { - unsigned int sad = - fn_ptr->sdf(what->buf, what->stride, check_here, in_what->stride); - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - - return best_sad; -} - -int av1_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv, - int sad_per_bit, int distance, - const aom_variance_fn_ptr_t *fn_ptr, - const MV *center_mv, MV *best_mv) { - int r; - const MACROBLOCKD *const xd = &x->e_mbd; - const struct buf_2d *const what = &x->plane[0].src; - const struct buf_2d *const in_what = &xd->plane[0].pre[0]; - const int row_min = AOMMAX(ref_mv->row - distance, x->mv_limits.row_min); - const int row_max = AOMMIN(ref_mv->row + distance, x->mv_limits.row_max); - const int col_min = AOMMAX(ref_mv->col - distance, x->mv_limits.col_min); - const int col_max = AOMMIN(ref_mv->col + distance, x->mv_limits.col_max); - const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - unsigned int best_sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), - in_what->stride) + - 
mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - *best_mv = *ref_mv; - - for (r = row_min; r < row_max; ++r) { - int c = col_min; - const uint8_t *check_here = &in_what->buf[r * in_what->stride + c]; - - if (fn_ptr->sdx8f != NULL) { - while ((c + 7) < col_max) { - int i; - DECLARE_ALIGNED(16, uint32_t, sads[8]); - - fn_ptr->sdx8f(what->buf, what->stride, check_here, in_what->stride, - sads); - - for (i = 0; i < 8; ++i) { - unsigned int sad = sads[i]; - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - } - - if (fn_ptr->sdx3f != NULL) { - while ((c + 2) < col_max) { - int i; - DECLARE_ALIGNED(16, uint32_t, sads[3]); - - fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride, - sads); - - for (i = 0; i < 3; ++i) { - unsigned int sad = sads[i]; - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - } - - while (c < col_max) { - unsigned int sad = - fn_ptr->sdf(what->buf, what->stride, check_here, in_what->stride); - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - - return best_sad; -} - int av1_refining_search_sad(MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, const aom_variance_fn_ptr_t *fn_ptr, @@ -2394,16 +2025,23 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range, clamp_mv(best_mv, x->mv_limits.col_min, x->mv_limits.col_max, x->mv_limits.row_min, x->mv_limits.row_max); - if (mask) + if (mask) { best_sad = fn_ptr->msdf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv), in_what->stride, second_pred, mask, mask_stride, invert_mask) + mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit); - else - best_sad = - fn_ptr->sdaf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv), - in_what->stride, second_pred) + - mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit); + } else { + if (xd->jcp_param.use_jnt_comp_avg) + best_sad = fn_ptr->jsdaf(what->buf, what->stride, + get_buf_from_mv(in_what, best_mv), + in_what->stride, second_pred, &xd->jcp_param) + + mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit); + else + best_sad = fn_ptr->sdaf(what->buf, what->stride, + get_buf_from_mv(in_what, best_mv), + in_what->stride, second_pred) + + mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit); + } for (i = 0; i < search_range; ++i) { int best_site = -1; @@ -2414,14 +2052,20 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range, if (is_mv_in(&x->mv_limits, &mv)) { unsigned int sad; - if (mask) + if (mask) { sad = fn_ptr->msdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv), in_what->stride, second_pred, mask, mask_stride, invert_mask); - else - sad = fn_ptr->sdaf(what->buf, what->stride, - get_buf_from_mv(in_what, &mv), in_what->stride, - second_pred); + } else { + if (xd->jcp_param.use_jnt_comp_avg) + sad = fn_ptr->jsdaf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride, + second_pred, &xd->jcp_param); + else + sad = fn_ptr->sdaf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride, + second_pred); + } if (sad < best_sad) { sad += mvsad_err_cost(x, &mv, &fcenter_mv, 
error_per_bit); if (sad < best_sad) { @@ -2454,45 +2098,10 @@ static int is_exhaustive_allowed(const AV1_COMP *const cpi, MACROBLOCK *x) { (*x->ex_search_count_ptr <= max_ex) && !cpi->rc.is_src_frame_alt_ref; } -#if CONFIG_HASH_ME -#define MAX_HASH_MV_TABLE_SIZE 5 -static void add_to_sort_table(block_hash block_hashes[MAX_HASH_MV_TABLE_SIZE], - int costs[MAX_HASH_MV_TABLE_SIZE], int *existing, - int max_size, block_hash curr_block, - int curr_cost) { - if (*existing < max_size) { - block_hashes[*existing] = curr_block; - costs[*existing] = curr_cost; - (*existing)++; - } else { - int max_cost = 0; - int max_cost_idx = 0; - for (int i = 0; i < max_size; i++) { - if (costs[i] > max_cost) { - max_cost = costs[i]; - max_cost_idx = i; - } - } - - if (curr_cost < max_cost) { - block_hashes[max_cost_idx] = curr_block; - costs[max_cost_idx] = curr_cost; - } - } -} -#endif - -#if CONFIG_HASH_ME int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MV *mvp_full, int step_param, int error_per_bit, int *cost_list, const MV *ref_mv, int var_max, int rd, int x_pos, int y_pos, int intra) { -#else -int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - MV *mvp_full, int step_param, int error_per_bit, - int *cost_list, const MV *ref_mv, int var_max, - int rd) { -#endif const SPEED_FEATURES *const sf = &cpi->sf; const SEARCH_METHODS method = sf->mv.search_method; const aom_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; @@ -2539,7 +2148,7 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, if (is_exhaustive_allowed(cpi, x)) { int exhuastive_thr = sf->exhaustive_searches_thresh; exhuastive_thr >>= - 10 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + 10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]); // Threshold variance for an exhaustive full search. if (var > exhuastive_thr) { @@ -2556,44 +2165,37 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, } } break; - - break; default: assert(0 && "Invalid search method."); } if (method != NSTEP && rd && var < var_max) var = av1_get_mvpred_var(x, &x->best_mv.as_mv, ref_mv, fn_ptr, 1); -#if CONFIG_HASH_ME do { - if (!cpi->common.allow_screen_content_tools) { - break; - } + if (!av1_use_hash_me(&cpi->common)) break; + // already single ME // get block size and original buffer of current block const int block_height = block_size_high[bsize]; const int block_width = block_size_wide[bsize]; if (block_height == block_width && x_pos >= 0 && y_pos >= 0) { if (block_width == 4 || block_width == 8 || block_width == 16 || - block_width == 32 || block_width == 64) { + block_width == 32 || block_width == 64 || block_width == 128) { uint8_t *what = x->plane[0].src.buf; const int what_stride = x->plane[0].src.stride; - block_hash block_hashes[MAX_HASH_MV_TABLE_SIZE]; - int costs[MAX_HASH_MV_TABLE_SIZE]; - int existing = 0; - int i; uint32_t hash_value1, hash_value2; MV best_hash_mv; int best_hash_cost = INT_MAX; // for the hashMap hash_table *ref_frame_hash = - intra ? &cpi->common.cur_frame->hash_table - : get_ref_frame_hash_map(cpi, - x->e_mbd.mi[0]->mbmi.ref_frame[0]); + intra + ? 
&cpi->common.cur_frame->hash_table + : av1_get_ref_frame_hash_map(cpi, x->e_mbd.mi[0]->ref_frame[0]); - av1_get_block_hash_value(what, what_stride, block_width, &hash_value1, - &hash_value2); + av1_get_block_hash_value( + what, what_stride, block_width, &hash_value1, &hash_value2, + x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); const int count = av1_hash_table_count(ref_frame_hash, hash_value1); // for intra, at least one match can be found: itself. @@ -2603,44 +2205,31 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, Iterator iterator = av1_hash_get_first_iterator(ref_frame_hash, hash_value1); - for (i = 0; i < count; i++, iterator_increment(&iterator)) { + for (int i = 0; i < count; i++, iterator_increment(&iterator)) { block_hash ref_block_hash = *(block_hash *)(iterator_get(&iterator)); if (hash_value2 == ref_block_hash.hash_value2) { - // for intra, make sure the prediction is from valid area - // not predict from current block. - // TODO(roger): check if the constraint is necessary - if (intra && - ref_block_hash.y + block_height > - ((y_pos >> MAX_SB_SIZE_LOG2) << MAX_SB_SIZE_LOG2) && - ref_block_hash.x + block_width > - ((x_pos >> MAX_SB_SIZE_LOG2) << MAX_SB_SIZE_LOG2)) { - continue; + // For intra, make sure the prediction is from a valid area. + if (intra) { + const int mi_col = x_pos / MI_SIZE; + const int mi_row = y_pos / MI_SIZE; + const MV dv = { 8 * (ref_block_hash.y - y_pos), + 8 * (ref_block_hash.x - x_pos) }; + if (!av1_is_dv_valid(dv, &cpi->common, &x->e_mbd, mi_row, mi_col, + bsize, cpi->common.seq_params.mib_size_log2)) + continue; + } + MV hash_mv; + hash_mv.col = ref_block_hash.x - x_pos; + hash_mv.row = ref_block_hash.y - y_pos; + if (!is_mv_in(&x->mv_limits, &hash_mv)) continue; + const int refCost = + av1_get_mvpred_var(x, &hash_mv, ref_mv, fn_ptr, 1); + if (refCost < best_hash_cost) { + best_hash_cost = refCost; + best_hash_mv = hash_mv; } - int refCost = - abs(ref_block_hash.x - x_pos) + abs(ref_block_hash.y - y_pos); - add_to_sort_table(block_hashes, costs, &existing, - MAX_HASH_MV_TABLE_SIZE, ref_block_hash, refCost); - } - } - - if (existing == 0) { - break; - } - - for (i = 0; i < existing; i++) { - MV hash_mv; - hash_mv.col = block_hashes[i].x - x_pos; - hash_mv.row = block_hashes[i].y - y_pos; - if (!is_mv_in(&x->mv_limits, &hash_mv)) { - continue; - } - int currHashCost = av1_get_mvpred_var(x, &hash_mv, ref_mv, fn_ptr, 1); - if (currHashCost < best_hash_cost) { - best_hash_cost = currHashCost; - best_hash_mv = hash_mv; - } } } - if (best_hash_cost < var) { x->second_best_mv = x->best_mv; x->best_mv.as_mv = best_hash_mv; @@ -2649,12 +2238,10 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, } } } while (0); -#endif return var; } -#if CONFIG_MOTION_VAR /* returns subpixel variance error function */ #define DIST(r, c) \ vfp->osvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, mask, &sse) @@ -2687,20 +2274,21 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, #define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c) #undef CHECK_BETTER1 -#define CHECK_BETTER1(v, r, c) \ - if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - thismse = \ - upsampled_obmc_pref_error(xd, mask, vfp, z, pre(y, y_stride, r, c), \ - y_stride, sp(c), sp(r), w, h, &sse); \ - if ((v = MVC(r, c) + thismse) < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - } else { \ - v = INT_MAX; \ +#define CHECK_BETTER1(v, r, c) \ + if (c >= 
minc && c <= maxc && r >= minr && r <= maxr) { \ + MV this_mv = { r, c }; \ + thismse = upsampled_obmc_pref_error(xd, cm, mi_row, mi_col, &this_mv, \ + mask, vfp, z, pre(y, y_stride, r, c), \ + y_stride, sp(c), sp(r), w, h, &sse); \ + if ((v = MVC(r, c) + thismse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ } static unsigned int setup_obmc_center_error( @@ -2715,60 +2303,55 @@ static unsigned int setup_obmc_center_error( return besterr; } -static int upsampled_obmc_pref_error(const MACROBLOCKD *xd, const int32_t *mask, - const aom_variance_fn_ptr_t *vfp, - const int32_t *const wsrc, - const uint8_t *const y, int y_stride, - int subpel_x_q3, int subpel_y_q3, int w, - int h, unsigned int *sse) { +static int upsampled_obmc_pref_error( + MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, + const MV *const mv, const int32_t *mask, const aom_variance_fn_ptr_t *vfp, + const int32_t *const wsrc, const uint8_t *const y, int y_stride, + int subpel_x_q3, int subpel_y_q3, int w, int h, unsigned int *sse) { unsigned int besterr; -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]); - aom_highbd_upsampled_pred(pred16, w, h, subpel_x_q3, subpel_y_q3, y, - y_stride, xd->bd); + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16, w, h, + subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd); besterr = vfp->ovf(CONVERT_TO_BYTEPTR(pred16), w, wsrc, mask, sse); } else { DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); -#else - DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); - (void)xd; -#endif // CONFIG_HIGHBITDEPTH - aom_upsampled_pred(pred, w, h, subpel_x_q3, subpel_y_q3, y, y_stride); + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3, + subpel_y_q3, y, y_stride); besterr = vfp->ovf(pred, w, wsrc, mask, sse); -#if CONFIG_HIGHBITDEPTH } -#endif return besterr; } static unsigned int upsampled_setup_obmc_center_error( - const MACROBLOCKD *xd, const int32_t *mask, const MV *bestmv, - const MV *ref_mv, int error_per_bit, const aom_variance_fn_ptr_t *vfp, - const int32_t *const wsrc, const uint8_t *const y, int y_stride, int w, - int h, int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1, - int *distortion) { - unsigned int besterr = upsampled_obmc_pref_error( - xd, mask, vfp, wsrc, y + offset, y_stride, 0, 0, w, h, sse1); + MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, + const int32_t *mask, const MV *bestmv, const MV *ref_mv, int error_per_bit, + const aom_variance_fn_ptr_t *vfp, const int32_t *const wsrc, + const uint8_t *const y, int y_stride, int w, int h, int offset, + int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion) { + unsigned int besterr = + upsampled_obmc_pref_error(xd, cm, mi_row, mi_col, bestmv, mask, vfp, wsrc, + y + offset, y_stride, 0, 0, w, h, sse1); *distortion = besterr; besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); return besterr; } int av1_find_best_obmc_sub_pixel_tree_up( - MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, - int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop, - int iters_per_step, int *mvjcost, int *mvcost[2], int *distortion, - unsigned int *sse1, int is_second, int use_upsampled_ref) { + MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col, + MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit, + const aom_variance_fn_ptr_t *vfp, int 
forced_stop, int iters_per_step, + int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1, + int is_second, int use_accurate_subpel_search) { const int32_t *wsrc = x->wsrc_buf; const int32_t *mask = x->mask_buf; const int *const z = wsrc; const int *const src_address = z; MACROBLOCKD *xd = &x->e_mbd; struct macroblockd_plane *const pd = &xd->plane[0]; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *mbmi = xd->mi[0]; unsigned int besterr = INT_MAX; unsigned int sse; unsigned int thismse; @@ -2794,8 +2377,7 @@ int av1_find_best_obmc_sub_pixel_tree_up( int minc, maxc, minr, maxr; - av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, - ref_mv); + set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, ref_mv); y = pd->pre[is_second].buf; y_stride = pd->pre[is_second].stride; @@ -2806,11 +2388,11 @@ int av1_find_best_obmc_sub_pixel_tree_up( bestmv->row *= 8; bestmv->col *= 8; - // use_upsampled_ref can be 0 or 1 - if (use_upsampled_ref) + // use_accurate_subpel_search can be 0 or 1 + if (use_accurate_subpel_search) besterr = upsampled_setup_obmc_center_error( - xd, mask, bestmv, ref_mv, error_per_bit, vfp, z, y, y_stride, w, h, - offset, mvjcost, mvcost, sse1, distortion); + xd, cm, mi_row, mi_col, mask, bestmv, ref_mv, error_per_bit, vfp, z, y, + y_stride, w, h, offset, mvjcost, mvcost, sse1, distortion); else besterr = setup_obmc_center_error(mask, bestmv, ref_mv, error_per_bit, vfp, z, y, y_stride, offset, mvjcost, mvcost, @@ -2823,15 +2405,13 @@ int av1_find_best_obmc_sub_pixel_tree_up( tc = bc + search_step[idx].col; if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { MV this_mv = { tr, tc }; - const uint8_t *const pre_address = pre(y, y_stride, tr, tc); - - if (use_upsampled_ref) { - thismse = - upsampled_obmc_pref_error(xd, mask, vfp, src_address, pre_address, - y_stride, sp(tc), sp(tr), w, h, &sse); + if (use_accurate_subpel_search) { + thismse = upsampled_obmc_pref_error( + xd, cm, mi_row, mi_col, &this_mv, mask, vfp, src_address, + pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse); } else { - thismse = vfp->osvf(pre_address, y_stride, sp(tc), sp(tr), - src_address, mask, &sse); + thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc), + sp(tr), src_address, mask, &sse); } cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, @@ -2856,10 +2436,10 @@ int av1_find_best_obmc_sub_pixel_tree_up( if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { MV this_mv = { tr, tc }; - if (use_upsampled_ref) { - thismse = upsampled_obmc_pref_error(xd, mask, vfp, src_address, - pre(y, y_stride, tr, tc), y_stride, - sp(tc), sp(tr), w, h, &sse); + if (use_accurate_subpel_search) { + thismse = upsampled_obmc_pref_error( + xd, cm, mi_row, mi_col, &this_mv, mask, vfp, src_address, + pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse); } else { thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), src_address, mask, &sse); @@ -2887,7 +2467,7 @@ int av1_find_best_obmc_sub_pixel_tree_up( } if (iters_per_step > 1 && best_idx != -1) { - if (use_upsampled_ref) { + if (use_accurate_subpel_search) { SECOND_LEVEL_CHECKS_BEST(1); } else { SECOND_LEVEL_CHECKS_BEST(0); @@ -3123,89 +2703,98 @@ int av1_obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x, } return bestsme; } -#endif // CONFIG_MOTION_VAR // Note(yunqingwang): The following 2 functions are only used in the motion // vector unit test, which return extreme motion vectors allowed by the MV // limits. 
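For reference, a hypothetical harness call that drives these stubs through the fractional_mv_step_fp signature this patch introduces (see the typedef in mcomp.h below); the setup of x, cm, mi_row/mi_col, the MV ref_mv, vfp and the mvjcost/mvcost tables is assumed, and the block size literals are arbitrary:

int distortion;
unsigned int sse;
fractional_mv_step_fp *step = av1_return_max_sub_pixel_mv;
(void)step(x, cm, mi_row, mi_col, &ref_mv, /*allow_hp=*/1,
           /*error_per_bit=*/0, vfp, /*forced_stop=*/0, /*iters_per_step=*/1,
           /*cost_list=*/NULL, mvjcost, mvcost, &distortion, &sse,
           /*second_pred=*/NULL, /*mask=*/NULL, /*mask_stride=*/0,
           /*invert_mask=*/0, /*w=*/16, /*h=*/16,
           /*use_accurate_subpel_search=*/0);
/* bestmv (aliasing x->best_mv in SETUP_SUBPEL_SEARCH) is now pinned to the
 * extreme corner allowed by x->mv_limits. */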
-#define COMMON_MV_TEST \ - SETUP_SUBPEL_SEARCH; \ - \ - (void)error_per_bit; \ - (void)vfp; \ - (void)src_address; \ - (void)src_stride; \ - (void)y; \ - (void)y_stride; \ - (void)second_pred; \ - (void)w; \ - (void)h; \ - (void)use_upsampled_ref; \ - (void)offset; \ - (void)mvjcost; \ - (void)mvcost; \ - (void)sse1; \ - (void)distortion; \ - \ - (void)halfiters; \ - (void)quarteriters; \ - (void)eighthiters; \ - (void)whichdir; \ - (void)forced_stop; \ - (void)hstep; \ - \ - (void)tr; \ - (void)tc; \ - (void)sse; \ - (void)thismse; \ +#define COMMON_MV_TEST \ + SETUP_SUBPEL_SEARCH; \ + \ + (void)error_per_bit; \ + (void)vfp; \ + (void)src_address; \ + (void)src_stride; \ + (void)y; \ + (void)y_stride; \ + (void)second_pred; \ + (void)w; \ + (void)h; \ + (void)use_accurate_subpel_search; \ + (void)offset; \ + (void)mvjcost; \ + (void)mvcost; \ + (void)sse1; \ + (void)distortion; \ + \ + (void)halfiters; \ + (void)quarteriters; \ + (void)eighthiters; \ + (void)whichdir; \ + (void)forced_stop; \ + (void)hstep; \ + \ + (void)tr; \ + (void)tc; \ + (void)sse; \ + (void)thismse; \ (void)cost_list; // Return the maximum MV. -int av1_return_max_sub_pixel_mv( - MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit, - const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, - int *cost_list, int *mvjcost, int *mvcost[2], int *distortion, - unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask, - int mask_stride, int invert_mask, int w, int h, int use_upsampled_ref) { +int av1_return_max_sub_pixel_mv(MACROBLOCK *x, const AV1_COMMON *const cm, + int mi_row, int mi_col, const MV *ref_mv, + int allow_hp, int error_per_bit, + const aom_variance_fn_ptr_t *vfp, + int forced_stop, int iters_per_step, + int *cost_list, int *mvjcost, int *mvcost[2], + int *distortion, unsigned int *sse1, + const uint8_t *second_pred, const uint8_t *mask, + int mask_stride, int invert_mask, int w, int h, + int use_accurate_subpel_search) { COMMON_MV_TEST; (void)mask; (void)mask_stride; (void)invert_mask; (void)minr; (void)minc; + + (void)cm; + (void)mi_row; + (void)mi_col; + bestmv->row = maxr; bestmv->col = maxc; besterr = 0; -// In the sub-pel motion search, if hp is not used, then the last bit of mv -// has to be 0. -#if CONFIG_AMVR + // In the sub-pel motion search, if hp is not used, then the last bit of mv + // has to be 0. lower_mv_precision(bestmv, allow_hp, 0); -#else - lower_mv_precision(bestmv, allow_hp); -#endif return besterr; } // Return the minimum MV. 
-int av1_return_min_sub_pixel_mv( - MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit, - const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, - int *cost_list, int *mvjcost, int *mvcost[2], int *distortion, - unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask, - int mask_stride, int invert_mask, int w, int h, int use_upsampled_ref) { +int av1_return_min_sub_pixel_mv(MACROBLOCK *x, const AV1_COMMON *const cm, + int mi_row, int mi_col, const MV *ref_mv, + int allow_hp, int error_per_bit, + const aom_variance_fn_ptr_t *vfp, + int forced_stop, int iters_per_step, + int *cost_list, int *mvjcost, int *mvcost[2], + int *distortion, unsigned int *sse1, + const uint8_t *second_pred, const uint8_t *mask, + int mask_stride, int invert_mask, int w, int h, + int use_accurate_subpel_search) { COMMON_MV_TEST; (void)maxr; (void)maxc; (void)mask; (void)mask_stride; (void)invert_mask; + + (void)cm; + (void)mi_row; + (void)mi_col; + bestmv->row = minr; bestmv->col = minc; besterr = 0; -// In the sub-pel motion search, if hp is not used, then the last bit of mv -// has to be 0. -#if CONFIG_AMVR + // In the sub-pel motion search, if hp is not used, then the last bit of mv + // has to be 0. lower_mv_precision(bestmv, allow_hp, 0); -#else - lower_mv_precision(bestmv, allow_hp); -#endif return besterr; } diff --git a/third_party/aom/av1/encoder/mcomp.h b/third_party/aom/av1/encoder/mcomp.h index 2c53075cc..539e8f4e4 100644 --- a/third_party/aom/av1/encoder/mcomp.h +++ b/third_party/aom/av1/encoder/mcomp.h @@ -69,10 +69,9 @@ struct SPEED_FEATURES; int av1_init_search_range(int size); -int av1_refining_search_sad(struct macroblock *x, struct mv *ref_mv, - int sad_per_bit, int distance, - const aom_variance_fn_ptr_t *fn_ptr, - const struct mv *center_mv); +int av1_refining_search_sad(struct macroblock *x, MV *ref_mv, int sad_per_bit, + int distance, const aom_variance_fn_ptr_t *fn_ptr, + const MV *center_mv); // Runs sequence of diamond searches in smaller steps for RD. int av1_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x, @@ -81,24 +80,20 @@ int av1_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x, const aom_variance_fn_ptr_t *fn_ptr, const MV *ref_mv, MV *dst_mv); -// Perform integral projection based motion estimation. 
-unsigned int av1_int_pro_motion_estimation(const struct AV1_COMP *cpi, - MACROBLOCK *x, BLOCK_SIZE bsize, - int mi_row, int mi_col); - int av1_hex_search(MACROBLOCK *x, MV *start_mv, int search_param, int sad_per_bit, int do_init_search, int *cost_list, const aom_variance_fn_ptr_t *vfp, int use_mvcost, const MV *center_mv); typedef int(fractional_mv_step_fp)( - MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit, + MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col, + const MV *ref_mv, int allow_hp, int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop, // 0 - full, 1 - qtr only, 2 - half only int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask, int mask_stride, int invert_mask, int w, int h, - int use_upsampled_ref); + int use_accurate_subpel_search); extern fractional_mv_step_fp av1_find_best_sub_pixel_tree; extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned; @@ -123,52 +118,33 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range, int invert_mask, const MV *center_mv, const uint8_t *second_pred); -struct AV1_COMP; - -#if CONFIG_HASH_ME int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MV *mvp_full, int step_param, int error_per_bit, int *cost_list, const MV *ref_mv, int var_max, int rd, int x_pos, int y_pos, int intra); -#else -int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, MV *mvp_full, int step_param, - int error_per_bit, int *cost_list, const MV *ref_mv, - int var_max, int rd); -#endif -#if CONFIG_MOTION_VAR int av1_obmc_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x, MV *mvp_full, int step_param, int sadpb, int further_steps, int do_refine, const aom_variance_fn_ptr_t *fn_ptr, const MV *ref_mv, MV *dst_mv, int is_second); int av1_find_best_obmc_sub_pixel_tree_up( - MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, - int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop, - int iters_per_step, int *mvjcost, int *mvcost[2], int *distortion, - unsigned int *sse1, int is_second, int use_upsampled_ref); -#endif // CONFIG_MOTION_VAR -#ifdef __cplusplus -} // extern "C" -#endif + MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col, + MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit, + const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, + int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1, + int is_second, int use_accurate_subpel_search); -#if CONFIG_WARPED_MOTION unsigned int av1_compute_motion_cost(const struct AV1_COMP *cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int mi_row, int mi_col, const MV *this_mv); -#if WARPED_MOTION_SORT_SAMPLES unsigned int av1_refine_warped_mv(const struct AV1_COMP *cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int mi_row, int mi_col, int *pts0, - int *pts_inref0, int *pts_mv0, - int total_samples); -#else -unsigned int av1_refine_warped_mv(const struct AV1_COMP *cpi, - MACROBLOCK *const x, BLOCK_SIZE bsize, - int mi_row, int mi_col, int *pts, - int *pts_inref); -#endif // WARPED_MOTION_SORT_SAMPLES -#endif // CONFIG_WARPED_MOTION + int *pts_inref0, int total_samples); + +#ifdef __cplusplus +} // extern "C" +#endif #endif // AV1_ENCODER_MCOMP_H_ diff --git a/third_party/aom/av1/encoder/mips/msa/error_msa.c b/third_party/aom/av1/encoder/mips/msa/error_msa.c index 8d13af7ad..2e86dee43 100644 --- 
a/third_party/aom/av1/encoder/mips/msa/error_msa.c +++ b/third_party/aom/av1/encoder/mips/msa/error_msa.c @@ -9,7 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "./av1_rtcd.h" +#include "config/av1_rtcd.h" + #include "aom_dsp/mips/macros_msa.h" #define BLOCK_ERROR_BLOCKSIZE_MSA(BSize) \ diff --git a/third_party/aom/av1/encoder/mips/msa/fdct16x16_msa.c b/third_party/aom/av1/encoder/mips/msa/fdct16x16_msa.c deleted file mode 100644 index 4b0364d6c..000000000 --- a/third_party/aom/av1/encoder/mips/msa/fdct16x16_msa.c +++ /dev/null @@ -1,436 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> - -#include "av1/common/enums.h" -#include "av1/encoder/mips/msa/fdct_msa.h" -#include "aom_dsp/mips/fwd_txfm_msa.h" - -static void fadst16_cols_step1_msa(const int16_t *input, int32_t stride, - const int32_t *const0, int16_t *int_buf) { - v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; - v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3; - v4i32 k0, k1, k2, k3; - - /* load input data */ - r0 = LD_SH(input); - r15 = LD_SH(input + 15 * stride); - r7 = LD_SH(input + 7 * stride); - r8 = LD_SH(input + 8 * stride); - SLLI_4V(r0, r15, r7, r8, 2); - - /* stage 1 */ - LD_SW2(const0, 4, k0, k1); - LD_SW2(const0 + 8, 4, k2, k3); - MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3); - - r3 = LD_SH(input + 3 * stride); - r4 = LD_SH(input + 4 * stride); - r11 = LD_SH(input + 11 * stride); - r12 = LD_SH(input + 12 * stride); - SLLI_4V(r3, r4, r11, r12, 2); - - LD_SW2(const0 + 4 * 4, 4, k0, k1); - LD_SW2(const0 + 4 * 6, 4, k2, k3); - MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11); - - /* stage 2 */ - BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1); - ST_SH2(tp0, tp2, int_buf, 8); - ST_SH2(tp1, tp3, int_buf + 4 * 8, 8); - - LD_SW2(const0 + 4 * 8, 4, k0, k1); - k2 = LD_SW(const0 + 4 * 10); - MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3); - - ST_SH2(h0, h1, int_buf + 8 * 8, 8); - ST_SH2(h3, h2, int_buf + 12 * 8, 8); - - r9 = LD_SH(input + 9 * stride); - r6 = LD_SH(input + 6 * stride); - r1 = LD_SH(input + stride); - r14 = LD_SH(input + 14 * stride); - SLLI_4V(r9, r6, r1, r14, 2); - - LD_SW2(const0 + 4 * 11, 4, k0, k1); - LD_SW2(const0 + 4 * 13, 4, k2, k3); - MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3); - - ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8); - - r13 = LD_SH(input + 13 * stride); - r2 = LD_SH(input + 2 * stride); - r5 = LD_SH(input + 5 * stride); - r10 = LD_SH(input + 10 * stride); - SLLI_4V(r13, r2, r5, r10, 2); - - LD_SW2(const0 + 4 * 15, 4, k0, k1); - LD_SW2(const0 + 4 * 17, 4, k2, k3); - MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3); - - ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8); - - BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3); - ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8); -} - -static void fadst16_step2_msa_helper(int16_t *int_buf, const int32_t *const0, - int16_t *out, int16_t *out_ptr) { - v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15; - v8i16 h0, 
h1, h2, h3, h4, h5, h6, h7, h10, h11; - v8i16 out0, out1, out2, out3, out4, out5, out6, out7; - v8i16 out8, out9, out10, out11, out12, out13, out14, out15; - v4i32 k0, k1, k2, k3; - - LD_SH2(int_buf + 3 * 8, 4 * 8, g13, g15); - LD_SH2(int_buf + 11 * 8, 4 * 8, g5, g7); - LD_SW2(const0 + 4 * 19, 4, k0, k1); - k2 = LD_SW(const0 + 4 * 21); - MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7); - - tp0 = LD_SH(int_buf + 4 * 8); - tp1 = LD_SH(int_buf + 5 * 8); - tp3 = LD_SH(int_buf + 10 * 8); - tp2 = LD_SH(int_buf + 14 * 8); - LD_SW2(const0 + 4 * 22, 4, k0, k1); - k2 = LD_SW(const0 + 4 * 24); - MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7); - out4 = -out4; - ST_SH(out4, (out + 3 * 16)); - ST_SH(out5, (out_ptr + 4 * 16)); - - h1 = LD_SH(int_buf + 9 * 8); - h3 = LD_SH(int_buf + 12 * 8); - MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15); - out13 = -out13; - ST_SH(out12, (out + 2 * 16)); - ST_SH(out13, (out_ptr + 5 * 16)); - - tp0 = LD_SH(int_buf); - tp1 = LD_SH(int_buf + 8); - tp2 = LD_SH(int_buf + 2 * 8); - tp3 = LD_SH(int_buf + 6 * 8); - - BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10); - out1 = -out1; - ST_SH(out0, (out)); - ST_SH(out1, (out_ptr + 7 * 16)); - - h0 = LD_SH(int_buf + 8 * 8); - h2 = LD_SH(int_buf + 13 * 8); - - BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10); - out8 = -out8; - ST_SH(out8, (out + 16)); - ST_SH(out9, (out_ptr + 6 * 16)); - - /* stage 4 */ - LD_SW2(const0 + 4 * 25, 4, k0, k1); - LD_SW2(const0 + 4 * 27, 4, k2, k3); - MADD_SHORT(h10, h11, k1, k2, out2, out3); - ST_SH(out2, (out + 7 * 16)); - ST_SH(out3, (out_ptr)); - - MADD_SHORT(out6, out7, k0, k3, out6, out7); - ST_SH(out6, (out + 4 * 16)); - ST_SH(out7, (out_ptr + 3 * 16)); - - MADD_SHORT(out10, out11, k0, k3, out10, out11); - ST_SH(out10, (out + 6 * 16)); - ST_SH(out11, (out_ptr + 16)); - - MADD_SHORT(out14, out15, k1, k2, out14, out15); - ST_SH(out14, (out + 5 * 16)); - ST_SH(out15, (out_ptr + 2 * 16)); -} - -static void fadst16_cols_step2_msa(int16_t *int_buf, const int32_t *const0, - int16_t *out) { - fadst16_step2_msa_helper(int_buf, const0, out, out + 128); -} - -static void fadst16_transpose_postproc_msa(int16_t *input, int16_t *out) { - v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; - v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; - - /* load input data */ - LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7); - TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6, - r7); - FDCT_POSTPROC_2V_NEG_H(r0, r1); - FDCT_POSTPROC_2V_NEG_H(r2, r3); - FDCT_POSTPROC_2V_NEG_H(r4, r5); - FDCT_POSTPROC_2V_NEG_H(r6, r7); - ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8); - out += 64; - - LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15); - TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11, - r12, r13, r14, r15); - FDCT_POSTPROC_2V_NEG_H(r8, r9); - FDCT_POSTPROC_2V_NEG_H(r10, r11); - FDCT_POSTPROC_2V_NEG_H(r12, r13); - FDCT_POSTPROC_2V_NEG_H(r14, r15); - ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8); - out += 64; - - /* load input data */ - input += 128; - LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7); - TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6, - r7); - FDCT_POSTPROC_2V_NEG_H(r0, r1); - FDCT_POSTPROC_2V_NEG_H(r2, r3); - FDCT_POSTPROC_2V_NEG_H(r4, r5); - FDCT_POSTPROC_2V_NEG_H(r6, r7); - ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8); - out += 64; - - LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, 
l15); - TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11, - r12, r13, r14, r15); - FDCT_POSTPROC_2V_NEG_H(r8, r9); - FDCT_POSTPROC_2V_NEG_H(r10, r11); - FDCT_POSTPROC_2V_NEG_H(r12, r13); - FDCT_POSTPROC_2V_NEG_H(r14, r15); - ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8); -} - -static void fadst16_rows_step1_msa(int16_t *input, const int32_t *const0, - int16_t *int_buf) { - v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; - v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3; - v4i32 k0, k1, k2, k3; - - /* load input data */ - r0 = LD_SH(input); - r7 = LD_SH(input + 7 * 8); - r8 = LD_SH(input + 8 * 8); - r15 = LD_SH(input + 15 * 8); - - /* stage 1 */ - LD_SW2(const0, 4, k0, k1); - LD_SW2(const0 + 4 * 2, 4, k2, k3); - MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3); - - r3 = LD_SH(input + 3 * 8); - r4 = LD_SH(input + 4 * 8); - r11 = LD_SH(input + 11 * 8); - r12 = LD_SH(input + 12 * 8); - - LD_SW2(const0 + 4 * 4, 4, k0, k1); - LD_SW2(const0 + 4 * 6, 4, k2, k3); - MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11); - - /* stage 2 */ - BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1); - ST_SH2(tp0, tp1, int_buf, 4 * 8); - ST_SH2(tp2, tp3, int_buf + 8, 4 * 8); - - LD_SW2(const0 + 4 * 8, 4, k0, k1); - k2 = LD_SW(const0 + 4 * 10); - MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3); - ST_SH2(h0, h3, int_buf + 8 * 8, 4 * 8); - ST_SH2(h1, h2, int_buf + 9 * 8, 4 * 8); - - r1 = LD_SH(input + 8); - r6 = LD_SH(input + 6 * 8); - r9 = LD_SH(input + 9 * 8); - r14 = LD_SH(input + 14 * 8); - - LD_SW2(const0 + 4 * 11, 4, k0, k1); - LD_SW2(const0 + 4 * 13, 4, k2, k3); - MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3); - ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8); - - r2 = LD_SH(input + 2 * 8); - r5 = LD_SH(input + 5 * 8); - r10 = LD_SH(input + 10 * 8); - r13 = LD_SH(input + 13 * 8); - - LD_SW2(const0 + 4 * 15, 4, k0, k1); - LD_SW2(const0 + 4 * 17, 4, k2, k3); - MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3); - ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8); - BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3); - ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8); -} - -static void fadst16_rows_step2_msa(int16_t *int_buf, const int32_t *const0, - int16_t *out) { - fadst16_step2_msa_helper(int_buf, const0, out, out + 8); -} - -static void fadst16_transpose_msa(int16_t *input, int16_t *out) { - v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; - v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; - - /* load input data */ - LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, - l7, l15); - TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6, - r7); - TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11, - r12, r13, r14, r15); - ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8); - ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8); - out += 16 * 8; - - /* load input data */ - input += 128; - LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, - l7, l15); - TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6, - r7); - TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11, - r12, r13, r14, r15); - ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8); - ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8); -} - -static void postproc_fdct16x8_1d_row(int16_t *intermediate, int16_t *output) { - int16_t *temp = 
intermediate; - int16_t *out = output; - v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11; - v8i16 in12, in13, in14, in15; - - LD_SH8(temp, 16, in0, in1, in2, in3, in4, in5, in6, in7); - temp = intermediate + 8; - LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, - in10, in11, in12, in13, in14, in15); - FDCT_POSTPROC_2V_NEG_H(in0, in1); - FDCT_POSTPROC_2V_NEG_H(in2, in3); - FDCT_POSTPROC_2V_NEG_H(in4, in5); - FDCT_POSTPROC_2V_NEG_H(in6, in7); - FDCT_POSTPROC_2V_NEG_H(in8, in9); - FDCT_POSTPROC_2V_NEG_H(in10, in11); - FDCT_POSTPROC_2V_NEG_H(in12, in13); - FDCT_POSTPROC_2V_NEG_H(in14, in15); - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, - in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, - tmp7, in8, in9, in10, in11, in12, in13, in14, in15); - temp = intermediate; - ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, temp, 16); - FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, - tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); - temp = intermediate; - LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15); - FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3, - in4, in5, in6, in7); - TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0, - tmp1, in1, tmp2, in2, tmp3, in3); - ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, out, 16); - TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4, - tmp5, in5, tmp6, in6, tmp7, in7); - out = output + 8; - ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, out, 16); -} - -void av1_fht16x16_msa(const int16_t *input, int16_t *output, int32_t stride, - int32_t tx_type) { - DECLARE_ALIGNED(32, int16_t, tmp[256]); - DECLARE_ALIGNED(32, int16_t, trans_buf[256]); - DECLARE_ALIGNED(32, int16_t, tmp_buf[128]); - int32_t i; - int16_t *ptmpbuf = &tmp_buf[0]; - int16_t *trans = &trans_buf[0]; - const int32_t const_arr[29 * 4] = { - 52707308, 52707308, 52707308, 52707308, -1072430300, - -1072430300, -1072430300, -1072430300, 795618043, 795618043, - 795618043, 795618043, -721080468, -721080468, -721080468, - -721080468, 459094491, 459094491, 459094491, 459094491, - -970646691, -970646691, -970646691, -970646691, 1010963856, - 1010963856, 1010963856, 1010963856, -361743294, -361743294, - -361743294, -361743294, 209469125, 209469125, 209469125, - 209469125, -1053094788, -1053094788, -1053094788, -1053094788, - 1053160324, 1053160324, 1053160324, 1053160324, 639644520, - 639644520, 639644520, 639644520, -862444000, -862444000, - -862444000, -862444000, 1062144356, 1062144356, 1062144356, - 1062144356, -157532337, -157532337, -157532337, -157532337, - 260914709, 260914709, 260914709, 260914709, -1041559667, - -1041559667, -1041559667, -1041559667, 920985831, 920985831, - 920985831, 920985831, -551995675, -551995675, -551995675, - -551995675, 596522295, 596522295, 596522295, 596522295, - 892853362, 892853362, 892853362, 892853362, -892787826, - -892787826, -892787826, -892787826, 410925857, 410925857, - 410925857, 410925857, -992012162, -992012162, -992012162, - -992012162, 992077698, 992077698, 992077698, 992077698, - 759246145, 759246145, 759246145, 759246145, -759180609, - -759180609, -759180609, -759180609, -759222975, -759222975, - -759222975, -759222975, 759288511, 
759288511, 759288511, - 759288511 - }; - - switch (tx_type) { - case DCT_DCT: - /* column transform */ - for (i = 0; i < 2; ++i) { - fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride); - } - - /* row transform */ - for (i = 0; i < 2; ++i) { - fdct16x8_1d_row(tmp + (128 * i), output + (128 * i)); - } - break; - case ADST_DCT: - /* column transform */ - for (i = 0; i < 2; ++i) { - fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf); - fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3)); - } - - /* row transform */ - for (i = 0; i < 2; ++i) { - postproc_fdct16x8_1d_row(tmp + (128 * i), output + (128 * i)); - } - break; - case DCT_ADST: - /* column transform */ - for (i = 0; i < 2; ++i) { - fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride); - } - - fadst16_transpose_postproc_msa(tmp, trans); - - /* row transform */ - for (i = 0; i < 2; ++i) { - fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf); - fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7)); - } - - fadst16_transpose_msa(tmp, output); - break; - case ADST_ADST: - /* column transform */ - for (i = 0; i < 2; ++i) { - fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf); - fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3)); - } - - fadst16_transpose_postproc_msa(tmp, trans); - - /* row transform */ - for (i = 0; i < 2; ++i) { - fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf); - fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7)); - } - - fadst16_transpose_msa(tmp, output); - break; - default: assert(0); break; - } -} diff --git a/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c b/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c index da1ac74f0..085c08bfb 100644 --- a/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c +++ b/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c @@ -12,7 +12,6 @@ #include <assert.h> #include "av1/common/enums.h" -#include "av1/encoder/mips/msa/fdct_msa.h" void av1_fwht4x4_msa(const int16_t *input, int16_t *output, int32_t src_stride) { @@ -45,54 +44,3 @@ void av1_fwht4x4_msa(const int16_t *input, int16_t *output, ST4x2_UB(in1, output + 8, 4); ST4x2_UB(in2, output + 12, 4); } - -void av1_fht4x4_msa(const int16_t *input, int16_t *output, int32_t stride, - int32_t tx_type) { - v8i16 in0, in1, in2, in3; - - LD_SH4(input, stride, in0, in1, in2, in3); - - /* fdct4 pre-process */ - { - v8i16 temp, mask; - v16i8 zero = { 0 }; - v16i8 one = __msa_ldi_b(1); - - mask = (v8i16)__msa_sldi_b(zero, one, 15); - SLLI_4V(in0, in1, in2, in3, 4); - temp = __msa_ceqi_h(in0, 0); - temp = (v8i16)__msa_xori_b((v16u8)temp, 255); - temp = mask & temp; - in0 += temp; - } - - switch (tx_type) { - case DCT_DCT: - AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); - TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); - AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); - break; - case ADST_DCT: - AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); - TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); - AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); - break; - case DCT_ADST: - AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); - TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); - AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); - break; - case ADST_ADST: - AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); - TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); - AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); - break; - default: assert(0); break; - } - - TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, 
in1, in2, in3); - ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); - SRA_4V(in0, in1, in2, in3, 2); - PCKEV_D2_SH(in1, in0, in3, in2, in0, in2); - ST_SH2(in0, in2, output, 8); -} diff --git a/third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c b/third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c deleted file mode 100644 index 4cbf60a11..000000000 --- a/third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> - -#include "av1/common/enums.h" -#include "av1/encoder/mips/msa/fdct_msa.h" - -void av1_fht8x8_msa(const int16_t *input, int16_t *output, int32_t stride, - int32_t tx_type) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - - LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7); - SLLI_4V(in0, in1, in2, in3, 2); - SLLI_4V(in4, in5, in6, in7, 2); - - switch (tx_type) { - case DCT_DCT: - AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, - in5, in6, in7); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, - in3, in4, in5, in6, in7); - AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, - in5, in6, in7); - break; - case ADST_DCT: - AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, - in5, in6, in7); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, - in3, in4, in5, in6, in7); - AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, - in5, in6, in7); - break; - case DCT_ADST: - AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, - in5, in6, in7); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, - in3, in4, in5, in6, in7); - AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, - in5, in6, in7); - break; - case ADST_ADST: - AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, - in5, in6, in7); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, - in3, in4, in5, in6, in7); - AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, - in5, in6, in7); - break; - default: assert(0); break; - } - - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); - ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); -} diff --git a/third_party/aom/av1/encoder/mips/msa/fdct_msa.h b/third_party/aom/av1/encoder/mips/msa/fdct_msa.h deleted file mode 100644 index 52bcf790c..000000000 --- a/third_party/aom/av1/encoder/mips/msa/fdct_msa.h +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AV1_ENCODER_MIPS_MSA_AV1_FDCT_MSA_H_ -#define AV1_ENCODER_MIPS_MSA_AV1_FDCT_MSA_H_ - -#include "aom_dsp/mips/fwd_txfm_msa.h" -#include "aom_dsp/mips/txfm_macros_msa.h" -#include "aom_ports/mem.h" - -#define AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ - out3, out4, out5, out6, out7) \ - { \ - v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \ - v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \ - v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \ - cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \ - v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64, \ - cospi_24_64, -cospi_24_64, 0, 0 }; \ - \ - SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \ - cnst2_m = -cnst0_m; \ - ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ - SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \ - cnst4_m = -cnst2_m; \ - ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ - \ - ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \ - ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \ - DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \ - cnst2_m, cnst3_m, in7, in0, in4, in3); \ - \ - SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \ - cnst2_m = -cnst0_m; \ - ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ - SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \ - cnst4_m = -cnst2_m; \ - ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ - \ - ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ - ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ - \ - DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \ - cnst2_m, cnst3_m, in5, in2, in6, in1); \ - BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \ - out7 = -s0_m; \ - out0 = s1_m; \ - \ - SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m); \ - \ - ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \ - cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - cnst1_m = cnst0_m; \ - \ - ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \ - ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ - DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m, \ - cnst3_m, cnst1_m, out1, out6, s0_m, s1_m); \ - \ - SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ - cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - \ - ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ - ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \ - out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ - out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ - out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ - \ - out1 = -out1; \ - out3 = -out3; \ - out5 = -out5; \ - } - -#define AOM_FADST4(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - v4i32 s0_m, s1_m, s2_m, s3_m, constant_m; \ - v4i32 in0_r_m, in1_r_m, in2_r_m, in3_r_m; \ - \ - UNPCK_R_SH_SW(in0, in0_r_m); \ - UNPCK_R_SH_SW(in1, in1_r_m); \ - UNPCK_R_SH_SW(in2, in2_r_m); \ - UNPCK_R_SH_SW(in3, in3_r_m); \ - \ - constant_m = __msa_fill_w(sinpi_4_9); \ - MUL2(in0_r_m, constant_m, in3_r_m, constant_m, s1_m, s0_m); \ - \ - constant_m = __msa_fill_w(sinpi_1_9); \ - s0_m += in0_r_m * constant_m; \ - s1_m -= in1_r_m * constant_m; \ - \ - constant_m = __msa_fill_w(sinpi_2_9); \ - s0_m += in1_r_m * constant_m; \ - s1_m += in3_r_m * constant_m; \ - \ - s2_m 
= in0_r_m + in1_r_m - in3_r_m;                                  \
-                                                                \
-    constant_m = __msa_fill_w(sinpi_3_9);                        \
-    MUL2(in2_r_m, constant_m, s2_m, constant_m, s3_m, in1_r_m);  \
-                                                                \
-    in0_r_m = s0_m + s3_m;                                       \
-    s2_m = s1_m - s3_m;                                          \
-    s3_m = s1_m - s0_m + s3_m;                                   \
-                                                                \
-    SRARI_W4_SW(in0_r_m, in1_r_m, s2_m, s3_m, DCT_CONST_BITS);   \
-    PCKEV_H4_SH(in0_r_m, in0_r_m, in1_r_m, in1_r_m, s2_m, s2_m, s3_m, s3_m, \
-                out0, out1, out2, out3);                         \
-  }
-#endif  // AV1_ENCODER_MIPS_MSA_AV1_FDCT_MSA_H_
diff --git a/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c b/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c
index 4ec679642..531ae090a 100644
--- a/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c
+++ b/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c
@@ -9,7 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./av1_rtcd.h"
+#include "config/av1_rtcd.h"
+
 #include "aom_dsp/mips/macros_msa.h"
 
 static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr, uint32_t stride,
diff --git a/third_party/aom/av1/encoder/ml.c b/third_party/aom/av1/encoder/ml.c
new file mode 100644
index 000000000..3a27e5845
--- /dev/null
+++ b/third_party/aom/av1/encoder/ml.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/encoder/ml.h"
+
+void av1_nn_predict(const float *features, const NN_CONFIG *nn_config,
+                    float *output) {
+  int num_input_nodes = nn_config->num_inputs;
+  int buf_index = 0;
+  float buf[2][NN_MAX_NODES_PER_LAYER];
+  const float *input_nodes = features;
+
+  // Propagate hidden layers.
+  const int num_layers = nn_config->num_hidden_layers;
+  assert(num_layers <= NN_MAX_HIDDEN_LAYERS);
+  for (int layer = 0; layer < num_layers; ++layer) {
+    const float *weights = nn_config->weights[layer];
+    const float *bias = nn_config->bias[layer];
+    float *output_nodes = buf[buf_index];
+    const int num_output_nodes = nn_config->num_hidden_nodes[layer];
+    assert(num_output_nodes < NN_MAX_NODES_PER_LAYER);
+    for (int node = 0; node < num_output_nodes; ++node) {
+      float val = 0.0f;
+      for (int i = 0; i < num_input_nodes; ++i)
+        val += weights[i] * input_nodes[i];
+      val += bias[node];
+      // ReLU as activation function.
+      val = val > 0.0f ? val : 0.0f;  // Could use AOMMAX().
+      output_nodes[node] = val;
+      weights += num_input_nodes;
+    }
+    num_input_nodes = num_output_nodes;
+    input_nodes = output_nodes;
+    buf_index = 1 - buf_index;
+  }
+
+  // Final output layer.
+  const float *weights = nn_config->weights[num_layers];
+  for (int node = 0; node < nn_config->num_outputs; ++node) {
+    const float *bias = nn_config->bias[num_layers];
+    float val = 0.0f;
+    for (int i = 0; i < num_input_nodes; ++i)
+      val += weights[i] * input_nodes[i];
+    output[node] = val + bias[node];
+    weights += num_input_nodes;
+  }
+}
diff --git a/third_party/aom/av1/encoder/ml.h b/third_party/aom/av1/encoder/ml.h
new file mode 100644
index 000000000..614cb60bb
--- /dev/null
+++ b/third_party/aom/av1/encoder/ml.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_ML_H_
+#define AV1_ENCODER_ML_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define NN_MAX_HIDDEN_LAYERS 10
+#define NN_MAX_NODES_PER_LAYER 128
+
+typedef struct {
+  int num_inputs;         // Number of input nodes, i.e. features.
+  int num_outputs;        // Number of output nodes.
+  int num_hidden_layers;  // Number of hidden layers, maximum 10.
+  // Number of nodes for each hidden layer.
+  int num_hidden_nodes[NN_MAX_HIDDEN_LAYERS];
+  // Weight parameters, indexed by layer.
+  const float *weights[NN_MAX_HIDDEN_LAYERS + 1];
+  // Bias parameters, indexed by layer.
+  const float *bias[NN_MAX_HIDDEN_LAYERS + 1];
+} NN_CONFIG;
+
+// Calculate prediction based on the given input features and neural net
+// config. Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each
+// hidden layer.
+void av1_nn_predict(const float *features, const NN_CONFIG *nn_config,
+                    float *output);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AV1_ENCODER_ML_H_
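NN_CONFIG and av1_nn_predict() above form a plain fully-connected network with ReLU hidden units and a linear output layer. A minimal usage sketch, assuming made-up layer sizes, weights, and names (demo_net, demo_score); only the struct layout and call signature come from the files above. Note that weights[l] is stored node-major, with num_inputs coefficients per node:

#include "av1/encoder/ml.h"

// Hypothetical 2-input, 3-hidden-node, 1-output model. Each hidden node owns
// two consecutive weights (node-major layout, num_inputs per node).
static const float hidden_weights[3 * 2] = { 0.50f, -0.50f, 1.00f,
                                             0.25f, -1.00f, 0.75f };
static const float hidden_bias[3] = { 0.0f, 0.1f, -0.1f };
static const float out_weights[1 * 3] = { 1.0f, -1.0f, 0.5f };
static const float out_bias[1] = { 0.2f };

static const NN_CONFIG demo_net = {
  2,                                // num_inputs
  1,                                // num_outputs
  1,                                // num_hidden_layers
  { 3 },                            // num_hidden_nodes[0]
  { hidden_weights, out_weights },  // weights, indexed by layer
  { hidden_bias, out_bias },        // bias, indexed by layer
};

static float demo_score(float f0, float f1) {
  const float features[2] = { f0, f1 };
  float score;
  av1_nn_predict(features, &demo_net, &score);
  return score;  // ReLU hidden layer, linear output node
}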
diff --git a/third_party/aom/av1/encoder/palette.c b/third_party/aom/av1/encoder/palette.c
index f34b82544..e61cd02ce 100644
--- a/third_party/aom/av1/encoder/palette.c
+++ b/third_party/aom/av1/encoder/palette.c
@@ -23,16 +23,14 @@
 #include "av1/encoder/k_means_template.h"
 #undef AV1_K_MEANS_DIM
 
-static int float_comparer(const void *a, const void *b) {
-  const float fa = *(const float *)a;
-  const float fb = *(const float *)b;
-  return (fa > fb) - (fa < fb);
+static int int_comparer(const void *a, const void *b) {
+  return (*(int *)a - *(int *)b);
 }
 
-int av1_remove_duplicates(float *centroids, int num_centroids) {
+int av1_remove_duplicates(int *centroids, int num_centroids) {
   int num_unique;  // number of unique centroids
   int i;
-  qsort(centroids, num_centroids, sizeof(*centroids), float_comparer);
+  qsort(centroids, num_centroids, sizeof(*centroids), int_comparer);
 
   // Remove duplicates.
   num_unique = 1;
   for (i = 1; i < num_centroids; ++i) {
@@ -43,7 +41,6 @@ int av1_remove_duplicates(float *centroids, int num_centroids) {
   return num_unique;
 }
 
-#if CONFIG_PALETTE_DELTA_ENCODING
 static int delta_encode_cost(const int *colors, int num, int bit_depth,
                              int min_val) {
   if (num <= 0) return 0;
@@ -116,15 +113,11 @@ int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi,
   }
   return AOMMAX(av1_ceil_log2(max_d + 1), *min_bits);
 }
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
 
 int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi,
-#if CONFIG_PALETTE_DELTA_ENCODING
                              uint16_t *color_cache, int n_cache,
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
                              int bit_depth) {
   const int n = pmi->palette_size[0];
-#if CONFIG_PALETTE_DELTA_ENCODING
   int out_cache_colors[PALETTE_MAX_SIZE];
   uint8_t cache_color_found[2 * PALETTE_MAX_SIZE];
   const int n_out_cache =
@@ -132,19 +125,13 @@ int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi,
                             cache_color_found, out_cache_colors);
   const int total_bits =
       n_cache + delta_encode_cost(out_cache_colors, n_out_cache, bit_depth, 1);
-  return total_bits * av1_cost_bit(128, 0);
-#else
-  return bit_depth * n * av1_cost_bit(128, 0);
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
+  return av1_cost_literal(total_bits);
 }
 
 int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
-#if CONFIG_PALETTE_DELTA_ENCODING
                               uint16_t *color_cache, int n_cache,
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
                               int bit_depth) {
   const int n = pmi->palette_size[1];
-#if CONFIG_PALETTE_DELTA_ENCODING
   int total_bits = 0;
   // U channel palette color cost.
   int out_cache_colors[PALETTE_MAX_SIZE];
@@ -163,8 +150,5 @@ int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
       2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count;
   const int bits_using_raw = bit_depth * n;
   total_bits += 1 + AOMMIN(bits_using_delta, bits_using_raw);
-  return total_bits * av1_cost_bit(128, 0);
-#else
-  return 2 * bit_depth * n * av1_cost_bit(128, 0);
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
+  return av1_cost_literal(total_bits);
 }
diff --git a/third_party/aom/av1/encoder/palette.h b/third_party/aom/av1/encoder/palette.h
index efd89f66f..bbdd50784 100644
--- a/third_party/aom/av1/encoder/palette.h
+++ b/third_party/aom/av1/encoder/palette.h
@@ -20,22 +20,22 @@ extern "C" {
 
 #define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim
 
-void AV1_K_MEANS_RENAME(av1_calc_indices, 1)(const float *data,
-                                             const float *centroids,
+void AV1_K_MEANS_RENAME(av1_calc_indices, 1)(const int *data,
+                                             const int *centroids,
                                              uint8_t *indices, int n, int k);
-void AV1_K_MEANS_RENAME(av1_calc_indices, 2)(const float *data,
-                                             const float *centroids,
+void AV1_K_MEANS_RENAME(av1_calc_indices, 2)(const int *data,
+                                             const int *centroids,
                                              uint8_t *indices, int n, int k);
-void AV1_K_MEANS_RENAME(av1_k_means, 1)(const float *data, float *centroids,
+void AV1_K_MEANS_RENAME(av1_k_means, 1)(const int *data, int *centroids,
                                         uint8_t *indices, int n, int k,
                                         int max_itr);
-void AV1_K_MEANS_RENAME(av1_k_means, 2)(const float *data, float *centroids,
+void AV1_K_MEANS_RENAME(av1_k_means, 2)(const int *data, int *centroids,
                                         uint8_t *indices, int n, int k,
                                         int max_itr);
 
 // Given 'n' 'data' points and 'k' 'centroids' each of dimension 'dim',
 // calculate the centroid 'indices' for the data points.
-static INLINE void av1_calc_indices(const float *data, const float *centroids,
+static INLINE void av1_calc_indices(const int *data, const int *centroids,
                                     uint8_t *indices, int n, int k, int dim) {
   if (dim == 1) {
     AV1_K_MEANS_RENAME(av1_calc_indices, 1)(data, centroids, indices, n, k);
@@ -50,7 +50,7 @@ static INLINE void av1_calc_indices(const float *data, const float *centroids,
 // dimension 'dim', runs up to 'max_itr' iterations of k-means algorithm to get
 // updated 'centroids' and the centroid 'indices' for elements in 'data'.
 // Note: the output centroids are rounded off to nearest integers.
-static INLINE void av1_k_means(const float *data, float *centroids,
+static INLINE void av1_k_means(const int *data, int *centroids,
                                uint8_t *indices, int n, int k, int dim,
                                int max_itr) {
   if (dim == 1) {
@@ -66,9 +66,8 @@ static INLINE void av1_k_means(const float *data, float *centroids,
 // puts these unique centroids in first 'k' indices of 'centroids' array.
 // Ideally, the centroids should be rounded to integers before calling this
 // method.
-int av1_remove_duplicates(float *centroids, int num_centroids);
+int av1_remove_duplicates(int *centroids, int num_centroids);
 
-#if CONFIG_PALETTE_DELTA_ENCODING
 // Given a color cache and a set of base colors, find if each cache color is
 // present in the base colors, record the binary results in "cache_color_found".
 // Record the colors that are not in the color cache in "out_cache_colors".
@@ -80,20 +79,14 @@ int av1_index_color_cache(const uint16_t *color_cache, int n_cache,
 // assign zero_count with the number of deltas being 0.
 int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi,
                                  int bit_depth, int *zero_count, int *min_bits);
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
 
 // Return the rate cost for transmitting luma palette color values.
 int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi,
-#if CONFIG_PALETTE_DELTA_ENCODING
-                             uint16_t *color_cache, int n_cache,
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
-                             int bit_depth);
+                             uint16_t *color_cache, int n_cache, int bit_depth);
 
 // Return the rate cost for transmitting chroma palette color values.
 int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
-#if CONFIG_PALETTE_DELTA_ENCODING
                               uint16_t *color_cache, int n_cache,
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
                               int bit_depth);
 
 #ifdef __cplusplus
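After this change the palette search runs on integer samples end to end: seed the centroids, run av1_k_means(), then call av1_remove_duplicates() before computing rate costs. A short sketch of that call order, assuming an illustrative 16x16 block, evenly spaced seeds, and an arbitrary iteration cap (demo_palette_colors is a hypothetical name, not encoder code):

#include "av1/encoder/palette.h"

// Sketch: cluster 256 one-dimensional luma samples into at most
// PALETTE_MAX_SIZE colors, then collapse duplicate centroids.
static int demo_palette_colors(const int data[256], int *centroids) {
  uint8_t indices[256];
  // Seed the centroids; the encoder derives its seeds from the block's color
  // distribution, but evenly spaced 8-bit values are enough for a sketch.
  for (int i = 0; i < PALETTE_MAX_SIZE; ++i)
    centroids[i] = (i * 255) / (PALETTE_MAX_SIZE - 1);
  av1_k_means(data, centroids, indices, /*n=*/256, /*k=*/PALETTE_MAX_SIZE,
              /*dim=*/1, /*max_itr=*/50);
  // Unique colors are packed to the front; the return value is their count.
  return av1_remove_duplicates(centroids, PALETTE_MAX_SIZE);
}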
diff --git a/third_party/aom/av1/encoder/pickcdef.c b/third_party/aom/av1/encoder/pickcdef.c
index accc97e57..4f6265617 100644
--- a/third_party/aom/av1/encoder/pickcdef.c
+++ b/third_party/aom/av1/encoder/pickcdef.c
@@ -12,7 +12,8 @@
 #include <math.h>
 #include <string.h>
 
-#include "./aom_scale_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "av1/common/cdef.h"
 #include "av1/common/onyxc_int.h"
@@ -23,7 +24,7 @@
 #define REDUCED_TOTAL_STRENGTHS (REDUCED_PRI_STRENGTHS * CDEF_SEC_STRENGTHS)
 #define TOTAL_STRENGTHS (CDEF_PRI_STRENGTHS * CDEF_SEC_STRENGTHS)
 
-static int priconv[REDUCED_PRI_STRENGTHS] = { 0, 1, 2, 3, 4, 7, 12, 25 };
+static int priconv[REDUCED_PRI_STRENGTHS] = { 0, 1, 2, 3, 5, 7, 10, 13 };
 
 /* Search for the best strength to add as an option, knowing we already
    selected nb_strengths options. */
@@ -68,16 +69,11 @@ static uint64_t search_one_dual(int *lev0, int *lev1, int nb_strengths,
                                 uint64_t (**mse)[TOTAL_STRENGTHS], int sb_count,
                                 int fast) {
   uint64_t tot_mse[TOTAL_STRENGTHS][TOTAL_STRENGTHS];
-#if !CONFIG_CDEF_SINGLEPASS
-  const int total_strengths = fast ?
REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS; -#endif int i, j; uint64_t best_tot_mse = (uint64_t)1 << 63; int best_id0 = 0; int best_id1 = 0; -#if CONFIG_CDEF_SINGLEPASS const int total_strengths = fast ? REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS; -#endif memset(tot_mse, 0, sizeof(tot_mse)); for (i = 0; i < sb_count; i++) { int gi; @@ -204,10 +200,9 @@ static INLINE uint64_t dist_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src, svar = sum_s2 - ((sum_s * sum_s + 32) >> 6); dvar = sum_d2 - ((sum_d * sum_d + 32) >> 6); return (uint64_t)floor( - .5 + - (sum_d2 + sum_s2 - 2 * sum_sd) * .5 * - (svar + dvar + (400 << 2 * coeff_shift)) / - (sqrt((20000 << 4 * coeff_shift) + svar * (double)dvar))); + .5 + (sum_d2 + sum_s2 - 2 * sum_sd) * .5 * + (svar + dvar + (400 << 2 * coeff_shift)) / + (sqrt((20000 << 4 * coeff_shift) + svar * (double)dvar))); } static INLINE uint64_t mse_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src, @@ -290,7 +285,7 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, int fbr, fbc; uint16_t *src[3]; uint16_t *ref_coeff[3]; - cdef_list dlist[MI_SIZE_64X64 * MI_SIZE_64X64]; + static cdef_list dlist[MI_SIZE_128X128 * MI_SIZE_128X128]; int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } }; int var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } }; int stride[3]; @@ -310,32 +305,27 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, int *sb_index = aom_malloc(nvfb * nhfb * sizeof(*sb_index)); int *selected_strength = aom_malloc(nvfb * nhfb * sizeof(*sb_index)); uint64_t(*mse[2])[TOTAL_STRENGTHS]; -#if CONFIG_CDEF_SINGLEPASS int pri_damping = 3 + (cm->base_qindex >> 6); -#else - int pri_damping = 6; -#endif int sec_damping = 3 + (cm->base_qindex >> 6); int i; int nb_strengths; int nb_strength_bits; int quantizer; double lambda; - int nplanes = 3; + const int num_planes = av1_num_planes(cm); const int total_strengths = fast ? 
REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS; DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]); uint16_t *in; - DECLARE_ALIGNED(32, uint16_t, tmp_dst[CDEF_BLOCKSIZE * CDEF_BLOCKSIZE]); - int chroma_cdef = xd->plane[1].subsampling_x == xd->plane[1].subsampling_y && - xd->plane[2].subsampling_x == xd->plane[2].subsampling_y; + DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]); quantizer = - av1_ac_quant(cm->base_qindex, 0, cm->bit_depth) >> (cm->bit_depth - 8); + av1_ac_quant_Q3(cm->base_qindex, 0, cm->bit_depth) >> (cm->bit_depth - 8); lambda = .12 * quantizer * quantizer / 256.; - av1_setup_dst_planes(xd->plane, cm->sb_size, frame, 0, 0); + av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0, + num_planes); mse[0] = aom_malloc(sizeof(**mse) * nvfb * nhfb); mse[1] = aom_malloc(sizeof(**mse) * nvfb * nhfb); - for (pli = 0; pli < nplanes; pli++) { + for (pli = 0; pli < num_planes; pli++) { uint8_t *ref_buffer; int ref_stride; switch (pli) { @@ -371,20 +361,16 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, for (r = 0; r < frame_height; ++r) { for (c = 0; c < frame_width; ++c) { -#if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) { src[pli][r * stride[pli] + c] = CONVERT_TO_SHORTPTR( xd->plane[pli].dst.buf)[r * xd->plane[pli].dst.stride + c]; ref_coeff[pli][r * stride[pli] + c] = CONVERT_TO_SHORTPTR(ref_buffer)[r * ref_stride + c]; } else { -#endif src[pli][r * stride[pli] + c] = xd->plane[pli].dst.buf[r * xd->plane[pli].dst.stride + c]; ref_coeff[pli][r * stride[pli] + c] = ref_buffer[r * ref_stride + c]; -#if CONFIG_HIGHBITDEPTH } -#endif } } } @@ -397,13 +383,33 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, int dirinit = 0; nhb = AOMMIN(MI_SIZE_64X64, cm->mi_cols - MI_SIZE_64X64 * fbc); nvb = AOMMIN(MI_SIZE_64X64, cm->mi_rows - MI_SIZE_64X64 * fbr); - cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride + - MI_SIZE_64X64 * fbc] - ->mbmi.cdef_strength = -1; + int hb_step = 1; + int vb_step = 1; + BLOCK_SIZE bs = BLOCK_64X64; + MB_MODE_INFO *const mbmi = + cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride + + MI_SIZE_64X64 * fbc]; + if (((fbc & 1) && + (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_128X64)) || + ((fbr & 1) && + (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_64X128))) + continue; + if (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_128X64 || + mbmi->sb_type == BLOCK_64X128) + bs = mbmi->sb_type; + if (bs == BLOCK_128X128 || bs == BLOCK_128X64) { + nhb = AOMMIN(MI_SIZE_128X128, cm->mi_cols - MI_SIZE_64X64 * fbc); + hb_step = 2; + } + if (bs == BLOCK_128X128 || bs == BLOCK_64X128) { + nvb = AOMMIN(MI_SIZE_128X128, cm->mi_rows - MI_SIZE_64X64 * fbr); + vb_step = 2; + } + // No filtering if the entire filter block is skipped if (sb_all_skip(cm, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64)) continue; cdef_count = sb_compute_cdef_list(cm, fbr * MI_SIZE_64X64, - fbc * MI_SIZE_64X64, dlist, 1); - for (pli = 0; pli < nplanes; pli++) { + fbc * MI_SIZE_64X64, dlist, bs); + for (pli = 0; pli < num_planes; pli++) { for (i = 0; i < CDEF_INBUF_SIZE; i++) inbuf[i] = CDEF_VERY_LARGE; for (gi = 0; gi < total_strengths; gi++) { int threshold; @@ -411,7 +417,6 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, int sec_strength; threshold = gi / CDEF_SEC_STRENGTHS; if (fast) threshold = priconv[threshold]; - if (pli > 0 && !chroma_cdef) threshold = 0; /* We avoid filtering the pixels for which some of the pixels to average 
are outside the frame. We could change the filter instead, but it @@ -419,11 +424,10 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, int yoff = CDEF_VBORDER * (fbr != 0); int xoff = CDEF_HBORDER * (fbc != 0); int ysize = (nvb << mi_high_l2[pli]) + - CDEF_VBORDER * (fbr != nvfb - 1) + yoff; + CDEF_VBORDER * (fbr + vb_step < nvfb) + yoff; int xsize = (nhb << mi_wide_l2[pli]) + - CDEF_HBORDER * (fbc != nhfb - 1) + xoff; + CDEF_HBORDER * (fbc + hb_step < nhfb) + xoff; sec_strength = gi % CDEF_SEC_STRENGTHS; -#if CONFIG_CDEF_SINGLEPASS copy_sb16_16(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE, src[pli], (fbr * MI_SIZE_64X64 << mi_high_l2[pli]) - yoff, @@ -433,19 +437,6 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, dir, &dirinit, var, pli, dlist, cdef_count, threshold, sec_strength + (sec_strength == 3), pri_damping, sec_damping, coeff_shift); -#else - if (sec_strength == 0) - copy_sb16_16(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE, - src[pli], - (fbr * MI_SIZE_64X64 << mi_high_l2[pli]) - yoff, - (fbc * MI_SIZE_64X64 << mi_wide_l2[pli]) - xoff, - stride[pli], ysize, xsize); - cdef_filter_fb(sec_strength ? NULL : (uint8_t *)in, CDEF_BSTRIDE, - tmp_dst, in, xdec[pli], ydec[pli], dir, &dirinit, var, - pli, dlist, cdef_count, threshold, - sec_strength + (sec_strength == 3), sec_damping, - pri_damping, coeff_shift, sec_strength != 0, 1); -#endif curr_mse = compute_cdef_dist( ref_coeff[pli] + (fbr * MI_SIZE_64X64 << mi_high_l2[pli]) * stride[pli] + @@ -470,7 +461,7 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, int best_lev0[CDEF_MAX_STRENGTHS]; int best_lev1[CDEF_MAX_STRENGTHS] = { 0 }; nb_strengths = 1 << i; - if (nplanes >= 3) + if (num_planes >= 3) tot_mse = joint_strength_search_dual(best_lev0, best_lev1, nb_strengths, mse, sb_count, fast); else @@ -500,14 +491,14 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, best_gi = 0; for (gi = 0; gi < cm->nb_cdef_strengths; gi++) { uint64_t curr = mse[0][i][cm->cdef_strengths[gi]]; - if (nplanes >= 3) curr += mse[1][i][cm->cdef_uv_strengths[gi]]; + if (num_planes >= 3) curr += mse[1][i][cm->cdef_uv_strengths[gi]]; if (curr < best_mse) { best_gi = gi; best_mse = curr; } } selected_strength[i] = best_gi; - cm->mi_grid_visible[sb_index[i]]->mbmi.cdef_strength = best_gi; + cm->mi_grid_visible[sb_index[i]]->cdef_strength = best_gi; } if (fast) { @@ -526,7 +517,7 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, cm->cdef_sec_damping = sec_damping; aom_free(mse[0]); aom_free(mse[1]); - for (pli = 0; pli < nplanes; pli++) { + for (pli = 0; pli < num_planes; pli++) { aom_free(src[pli]); aom_free(ref_coeff[pli]); } diff --git a/third_party/aom/av1/encoder/picklpf.c b/third_party/aom/av1/encoder/picklpf.c index d8b6f9074..5f802a707 100644 --- a/third_party/aom/av1/encoder/picklpf.c +++ b/third_party/aom/av1/encoder/picklpf.c @@ -12,7 +12,7 @@ #include <assert.h> #include <limits.h> -#include "./aom_scale_rtcd.h" +#include "config/aom_scale_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/psnr.h" @@ -27,74 +27,6 @@ #include "av1/encoder/encoder.h" #include "av1/encoder/picklpf.h" -#if CONFIG_LPF_SB -#if CONFIG_HIGHBITDEPTH -static int compute_sb_y_sse_highbd(const YV12_BUFFER_CONFIG *src, - const YV12_BUFFER_CONFIG *frame, - AV1_COMMON *const cm, int mi_row, - int mi_col) { - int sse = 0; - const int mi_row_start = AOMMAX(0, mi_row - FILT_BOUNDARY_MI_OFFSET); - const int 
mi_col_start = AOMMAX(0, mi_col - FILT_BOUNDARY_MI_OFFSET); - const int mi_row_range = mi_row - FILT_BOUNDARY_MI_OFFSET + MAX_MIB_SIZE; - const int mi_col_range = mi_col - FILT_BOUNDARY_MI_OFFSET + MAX_MIB_SIZE; - const int mi_row_end = AOMMIN(mi_row_range, cm->mi_rows); - const int mi_col_end = AOMMIN(mi_col_range, cm->mi_cols); - - const int row = mi_row_start * MI_SIZE; - const int col = mi_col_start * MI_SIZE; - const uint16_t *src_y = - CONVERT_TO_SHORTPTR(src->y_buffer) + row * src->y_stride + col; - const uint16_t *frame_y = - CONVERT_TO_SHORTPTR(frame->y_buffer) + row * frame->y_stride + col; - const int row_end = (mi_row_end - mi_row_start) * MI_SIZE; - const int col_end = (mi_col_end - mi_col_start) * MI_SIZE; - - int x, y; - for (y = 0; y < row_end; ++y) { - for (x = 0; x < col_end; ++x) { - const int diff = src_y[x] - frame_y[x]; - sse += diff * diff; - } - src_y += src->y_stride; - frame_y += frame->y_stride; - } - return sse; -} -#endif - -static int compute_sb_y_sse(const YV12_BUFFER_CONFIG *src, - const YV12_BUFFER_CONFIG *frame, - AV1_COMMON *const cm, int mi_row, int mi_col) { - int sse = 0; - const int mi_row_start = AOMMAX(0, mi_row - FILT_BOUNDARY_MI_OFFSET); - const int mi_col_start = AOMMAX(0, mi_col - FILT_BOUNDARY_MI_OFFSET); - const int mi_row_range = mi_row - FILT_BOUNDARY_MI_OFFSET + MAX_MIB_SIZE; - const int mi_col_range = mi_col - FILT_BOUNDARY_MI_OFFSET + MAX_MIB_SIZE; - const int mi_row_end = AOMMIN(mi_row_range, cm->mi_rows); - const int mi_col_end = AOMMIN(mi_col_range, cm->mi_cols); - - const int row = mi_row_start * MI_SIZE; - const int col = mi_col_start * MI_SIZE; - const uint8_t *src_y = src->y_buffer + row * src->y_stride + col; - const uint8_t *frame_y = frame->y_buffer + row * frame->y_stride + col; - const int row_end = (mi_row_end - mi_row_start) * MI_SIZE; - const int col_end = (mi_col_end - mi_col_start) * MI_SIZE; - - int x, y; - for (y = 0; y < row_end; ++y) { - for (x = 0; x < col_end; ++x) { - const int diff = src_y[x] - frame_y[x]; - sse += diff * diff; - } - src_y += src->y_stride; - frame_y += frame->y_stride; - } - return sse; -} -#endif // CONFIG_LPF_SB - -#if !CONFIG_LPF_SB static void yv12_copy_plane(const YV12_BUFFER_CONFIG *src_bc, YV12_BUFFER_CONFIG *dst_bc, int plane) { switch (plane) { @@ -104,7 +36,6 @@ static void yv12_copy_plane(const YV12_BUFFER_CONFIG *src_bc, default: assert(plane >= 0 && plane <= 2); break; } } -#endif // CONFIG_LPF_SB int av1_get_max_filter_level(const AV1_COMP *cpi) { if (cpi->oxcf.pass == 2) { @@ -115,195 +46,58 @@ int av1_get_max_filter_level(const AV1_COMP *cpi) { } } -#if CONFIG_LPF_SB -// TODO(chengchen): reduce memory usage by copy superblock instead of frame -static int try_filter_superblock(const YV12_BUFFER_CONFIG *sd, - AV1_COMP *const cpi, int filt_level, - int partial_frame, int mi_row, int mi_col) { - AV1_COMMON *const cm = &cpi->common; - int filt_err; - -#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_CB4X4 - av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, 1, - partial_frame, mi_row, mi_col); -#else - if (cpi->num_workers > 1) - av1_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane, - filt_level, 1, partial_frame, cpi->workers, - cpi->num_workers, &cpi->lf_row_sync); - else - av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, - 1, partial_frame); -#endif - -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) { - filt_err = - compute_sb_y_sse_highbd(sd, cm->frame_to_show, cm, mi_row, mi_col); - } else { - filt_err = 
compute_sb_y_sse(sd, cm->frame_to_show, cm, mi_row, mi_col); - } -#else - filt_err = compute_sb_y_sse(sd, cm->frame_to_show, cm, mi_row, mi_col); -#endif // CONFIG_HIGHBITDEPTH - - // TODO(chengchen): Copy the superblock only - // Re-instate the unfiltered frame - aom_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); - - return filt_err; -} - -static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, - int partial_frame, double *best_cost_ret, - int mi_row, int mi_col, int last_lvl) { - assert(partial_frame == 1); - assert(last_lvl >= 0); - - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCK *x = &cpi->td.mb; - - int min_filter_level = AOMMAX(0, last_lvl - MAX_LPF_OFFSET); - int max_filter_level = - AOMMIN(av1_get_max_filter_level(cpi), last_lvl + MAX_LPF_OFFSET); - - // search a larger range for the start superblock - if (mi_row == 0 && mi_col == 0) { - min_filter_level = 0; - max_filter_level = av1_get_max_filter_level(cpi); - } - - // TODO(chengchen): Copy for superblock only - // Make a copy of the unfiltered / processed recon buffer - aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf); - - int estimate_err = - try_filter_superblock(sd, cpi, last_lvl, partial_frame, mi_row, mi_col); - - int best_err = estimate_err; - int filt_best = last_lvl; - - int i; - for (i = min_filter_level; i <= max_filter_level; i += LPF_STEP) { - if (i == last_lvl) continue; - - int filt_err = - try_filter_superblock(sd, cpi, i, partial_frame, mi_row, mi_col); - - if (filt_err < best_err) { - best_err = filt_err; - filt_best = i; - } - } - - // If previous sb filter level has similar filtering performance as current - // best filter level, use previous level such that we can only send one bit - // to indicate current filter level is the same as the previous. 
- int threshold = 400; - - // ratio = the filtering area / a superblock size - int ratio = 1; - if (mi_row + MAX_MIB_SIZE > cm->mi_rows) { - ratio *= (cm->mi_rows - mi_row); - } else { - if (mi_row == 0) { - ratio *= (MAX_MIB_SIZE - FILT_BOUNDARY_MI_OFFSET); - } else { - ratio *= MAX_MIB_SIZE; - } - } - if (mi_col + MAX_MIB_SIZE > cm->mi_cols) { - ratio *= (cm->mi_cols - mi_col); - } else { - if (mi_col == 0) { - ratio *= (MAX_MIB_SIZE - FILT_BOUNDARY_MI_OFFSET); - } else { - ratio *= MAX_MIB_SIZE; - } - } - threshold = threshold * ratio / (MAX_MIB_SIZE * MAX_MIB_SIZE); - - const int diff = abs(estimate_err - best_err); - - const int percent_thresh = (int)((double)estimate_err * 0.01); - threshold = AOMMAX(threshold, percent_thresh); - if (diff < threshold) { - best_err = estimate_err; - filt_best = last_lvl; - } - - // Compute rdcost to determine whether to reuse previous filter lvl - if (filt_best != last_lvl) { - } - - if (best_cost_ret) *best_cost_ret = RDCOST_DBL(x->rdmult, 0, best_err); - return filt_best; -} - -#else // CONFIG_LPF_SB static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd, AV1_COMP *const cpi, int filt_level, - int partial_frame -#if CONFIG_LOOPFILTER_LEVEL - , - int plane, int dir -#endif - ) { + int partial_frame, int plane, int dir) { AV1_COMMON *const cm = &cpi->common; int64_t filt_err; -#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_CB4X4 -#if CONFIG_LOOPFILTER_LEVEL assert(plane >= 0 && plane <= 2); int filter_level[2] = { filt_level, filt_level }; if (plane == 0 && dir == 0) filter_level[1] = cm->lf.filter_level[1]; if (plane == 0 && dir == 1) filter_level[0] = cm->lf.filter_level[0]; - av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, - filter_level[0], filter_level[1], plane, partial_frame); -#else - av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, 1, - partial_frame); -#endif // CONFIG_LOOPFILTER_LEVEL + // set base filters for use of get_filter_level when in DELTA_Q_LF mode + switch (plane) { + case 0: + cm->lf.filter_level[0] = filter_level[0]; + cm->lf.filter_level[1] = filter_level[1]; + break; + case 1: cm->lf.filter_level_u = filter_level[0]; break; + case 2: cm->lf.filter_level_v = filter_level[0]; break; + } + + // TODO(any): please enable multi-thread and remove the flag when loop + // filter mask is compatible with multi-thread. 
+#if LOOP_FILTER_BITMASK + av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, plane, + plane + 1, partial_frame); #else if (cpi->num_workers > 1) - av1_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane, - filt_level, 1, partial_frame, cpi->workers, + av1_loop_filter_frame_mt(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, plane, + plane + 1, partial_frame, cpi->workers, cpi->num_workers, &cpi->lf_row_sync); else - av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, - 1, partial_frame); + av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, plane, + plane + 1, partial_frame); #endif int highbd = 0; -#if CONFIG_HIGHBITDEPTH highbd = cm->use_highbitdepth; -#endif // CONFIG_HIGHBITDEPTH -#if CONFIG_LOOPFILTER_LEVEL filt_err = aom_get_sse_plane(sd, cm->frame_to_show, plane, highbd); // Re-instate the unfiltered frame yv12_copy_plane(&cpi->last_frame_uf, cm->frame_to_show, plane); -#else - filt_err = aom_get_sse_plane(sd, cm->frame_to_show, 0, highbd); - - // Re-instate the unfiltered frame - yv12_copy_plane(&cpi->last_frame_uf, cm->frame_to_show, 0); -#endif // CONFIG_LOOPFILTER_LEVEL return filt_err; } static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, - int partial_frame, double *best_cost_ret -#if CONFIG_LOOPFILTER_LEVEL - , - int plane, int dir -#endif - ) { + int partial_frame, + const int *last_frame_filter_level, + double *best_cost_ret, int plane, int dir) { const AV1_COMMON *const cm = &cpi->common; - const struct loopfilter *const lf = &cm->lf; const int min_filter_level = 0; const int max_filter_level = av1_get_max_filter_level(cpi); int filt_direction = 0; @@ -311,39 +105,24 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, int filt_best; MACROBLOCK *x = &cpi->td.mb; -// Start the search at the previous frame filter level unless it is now out of -// range. -#if CONFIG_LOOPFILTER_LEVEL + // Start the search at the previous frame filter level unless it is now out of + // range. int lvl; switch (plane) { - case 0: lvl = (dir == 1) ? lf->filter_level[1] : lf->filter_level[0]; break; - case 1: lvl = lf->filter_level_u; break; - case 2: lvl = lf->filter_level_v; break; + case 0: lvl = last_frame_filter_level[dir]; break; + case 1: lvl = last_frame_filter_level[2]; break; + case 2: lvl = last_frame_filter_level[3]; break; default: assert(plane >= 0 && plane <= 2); return 0; } int filt_mid = clamp(lvl, min_filter_level, max_filter_level); -#else - int filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level); -#endif // CONFIG_LOOPFILTER_LEVEL int filter_step = filt_mid < 16 ? 
4 : filt_mid / 4; // Sum squared error at each filter level int64_t ss_err[MAX_LOOP_FILTER + 1]; // Set each entry to -1 memset(ss_err, 0xFF, sizeof(ss_err)); - -#if CONFIG_LOOPFILTER_LEVEL yv12_copy_plane(cm->frame_to_show, &cpi->last_frame_uf, plane); -#else - // Make a copy of the unfiltered / processed recon buffer - aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf); -#endif // CONFIG_LOOPFILTER_LEVEL - -#if CONFIG_LOOPFILTER_LEVEL best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame, plane, dir); -#else - best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame); -#endif // CONFIG_LOOPFILTER_LEVEL filt_best = filt_mid; ss_err[filt_mid] = best_err; @@ -363,12 +142,8 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, if (filt_direction <= 0 && filt_low != filt_mid) { // Get Low filter error score if (ss_err[filt_low] < 0) { -#if CONFIG_LOOPFILTER_LEVEL ss_err[filt_low] = try_filter_frame(sd, cpi, filt_low, partial_frame, plane, dir); -#else - ss_err[filt_low] = try_filter_frame(sd, cpi, filt_low, partial_frame); -#endif // CONFIG_LOOPFILTER_LEVEL } // If value is close to the best so far then bias towards a lower loop // filter value. @@ -384,12 +159,8 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, // Now look at filt_high if (filt_direction >= 0 && filt_high != filt_mid) { if (ss_err[filt_high] < 0) { -#if CONFIG_LOOPFILTER_LEVEL ss_err[filt_high] = try_filter_frame(sd, cpi, filt_high, partial_frame, plane, dir); -#else - ss_err[filt_high] = try_filter_frame(sd, cpi, filt_high, partial_frame); -#endif // CONFIG_LOOPFILTER_LEVEL } // If value is significantly better than previous best, bias added against // raising filter value @@ -415,33 +186,36 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, if (best_cost_ret) *best_cost_ret = RDCOST_DBL(x->rdmult, 0, best_err); return filt_best; } -#endif // CONFIG_LPF_SB void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, LPF_PICK_METHOD method) { AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); struct loopfilter *const lf = &cm->lf; + (void)sd; - lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0 : cpi->oxcf.sharpness; + lf->sharpness_level = 0; if (method == LPF_PICK_MINIMAL_LPF) { -#if CONFIG_LOOPFILTER_LEVEL lf->filter_level[0] = 0; lf->filter_level[1] = 0; -#else - lf->filter_level = 0; -#endif } else if (method >= LPF_PICK_FROM_Q) { const int min_filter_level = 0; const int max_filter_level = av1_get_max_filter_level(cpi); - const int q = av1_ac_quant(cm->base_qindex, 0, cm->bit_depth); -// These values were determined by linear fitting the result of the -// searched level, filt_guess = q * 0.316206 + 3.87252 -#if CONFIG_HIGHBITDEPTH + const int q = av1_ac_quant_Q3(cm->base_qindex, 0, cm->bit_depth); + // These values were determined by linear fitting the result of the + // searched level for 8 bit depth: + // Keyframes: filt_guess = q * 0.06699 - 1.60817 + // Other frames: filt_guess = q * 0.02295 + 2.48225 + // + // And high bit depth separately: + // filt_guess = q * 0.316206 + 3.87252 int filt_guess; switch (cm->bit_depth) { case AOM_BITS_8: - filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18); + filt_guess = (cm->frame_type == KEY_FRAME) + ? 
ROUND_POWER_OF_TWO(q * 17563 - 421574, 18) + : ROUND_POWER_OF_TWO(q * 6017 + 650707, 18); break; case AOM_BITS_10: filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20); @@ -455,58 +229,36 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, "or AOM_BITS_12"); return; } -#else - int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18); -#endif // CONFIG_HIGHBITDEPTH - if (cm->frame_type == KEY_FRAME) filt_guess -= 4; -#if CONFIG_LOOPFILTER_LEVEL + if (cm->bit_depth != AOM_BITS_8 && cm->frame_type == KEY_FRAME) + filt_guess -= 4; + // TODO(chengchen): retrain the model for Y, U, V filter levels lf->filter_level[0] = clamp(filt_guess, min_filter_level, max_filter_level); lf->filter_level[1] = clamp(filt_guess, min_filter_level, max_filter_level); -#else - lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level); -#endif + lf->filter_level_u = clamp(filt_guess, min_filter_level, max_filter_level); + lf->filter_level_v = clamp(filt_guess, min_filter_level, max_filter_level); } else { -#if CONFIG_LPF_SB - int mi_row, mi_col; - // TODO(chengchen): init last_lvl using previous frame's info? - int last_lvl = 0; - // TODO(chengchen): if the frame size makes the last superblock very small, - // consider merge it to the previous superblock to save bits. - // Example, if frame size 1080x720, then in the last row of superblock, - // there're (FILT_BOUNDAR_OFFSET + 16) pixels. - for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MAX_MIB_SIZE) { - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) { - int lvl = - search_filter_level(sd, cpi, 1, NULL, mi_row, mi_col, last_lvl); - - av1_loop_filter_sb_level_init(cm, mi_row, mi_col, lvl); - - // For the superblock at row start, its previous filter level should be - // the one above it, not the one at the end of last row - if (mi_col + MAX_MIB_SIZE >= cm->mi_cols) { - last_lvl = cm->mi_grid_visible[mi_row * cm->mi_stride]->mbmi.filt_lvl; - } else { - last_lvl = lvl; - } - } + const int last_frame_filter_level[4] = { lf->filter_level[0], + lf->filter_level[1], + lf->filter_level_u, + lf->filter_level_v }; + + lf->filter_level[0] = lf->filter_level[1] = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, NULL, 0, 2); + lf->filter_level[0] = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, NULL, 0, 0); + lf->filter_level[1] = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, NULL, 0, 1); + + if (num_planes > 1) { + lf->filter_level_u = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, NULL, 1, 0); + lf->filter_level_v = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, NULL, 2, 0); } -#else // CONFIG_LPF_SB -#if CONFIG_LOOPFILTER_LEVEL - lf->filter_level[0] = lf->filter_level[1] = search_filter_level( - sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 0, 2); - lf->filter_level[0] = search_filter_level( - sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 0, 0); - lf->filter_level[1] = search_filter_level( - sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 0, 1); - - lf->filter_level_u = search_filter_level( - sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 1, 0); - lf->filter_level_v = search_filter_level( - sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 2, 0); -#else - lf->filter_level = - search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL); -#endif // CONFIG_LOOPFILTER_LEVEL -#endif 
// CONFIG_LPF_SB } } diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c index a2262b6fc..93ea09690 100644 --- a/third_party/aom/av1/encoder/pickrst.c +++ b/third_party/aom/av1/encoder/pickrst.c @@ -14,7 +14,7 @@ #include <limits.h> #include <math.h> -#include "./aom_scale_rtcd.h" +#include "config/aom_scale_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/binary_codes_writer.h" @@ -40,150 +40,156 @@ static const RestorationType force_restore_type = RESTORE_TYPES; // Number of Wiener iterations #define NUM_WIENER_ITERS 5 -typedef double (*search_restore_type)(const YV12_BUFFER_CONFIG *src, - AV1_COMP *cpi, int partial_frame, - int plane, RestorationInfo *info, - RestorationType *rest_level, - int64_t *best_tile_cost, - YV12_BUFFER_CONFIG *dst_frame); +// Penalty factor for use of dual sgr +#define DUAL_SGR_PENALTY_MULT 0.01 const int frame_level_restore_bits[RESTORE_TYPES] = { 2, 2, 2, 2 }; -static int64_t sse_restoration_tile(const YV12_BUFFER_CONFIG *src, - const YV12_BUFFER_CONFIG *dst, - const AV1_COMMON *cm, int h_start, - int width, int v_start, int height, - int components_pattern) { - int64_t filt_err = 0; - (void)cm; - // Y and UV components cannot be mixed - assert(components_pattern == 1 || components_pattern == 2 || - components_pattern == 4 || components_pattern == 6); -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) { - if ((components_pattern >> AOM_PLANE_Y) & 1) { - filt_err += - aom_highbd_get_y_sse_part(src, dst, h_start, width, v_start, height); - } - if ((components_pattern >> AOM_PLANE_U) & 1) { - filt_err += - aom_highbd_get_u_sse_part(src, dst, h_start, width, v_start, height); - } - if ((components_pattern >> AOM_PLANE_V) & 1) { - filt_err += - aom_highbd_get_v_sse_part(src, dst, h_start, width, v_start, height); - } - return filt_err; - } -#endif // CONFIG_HIGHBITDEPTH - if ((components_pattern >> AOM_PLANE_Y) & 1) { - filt_err += aom_get_y_sse_part(src, dst, h_start, width, v_start, height); - } - if ((components_pattern >> AOM_PLANE_U) & 1) { - filt_err += aom_get_u_sse_part(src, dst, h_start, width, v_start, height); - } - if ((components_pattern >> AOM_PLANE_V) & 1) { - filt_err += aom_get_v_sse_part(src, dst, h_start, width, v_start, height); - } - return filt_err; +typedef int64_t (*sse_extractor_type)(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b); +typedef int64_t (*sse_part_extractor_type)(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, + int hstart, int width, int vstart, + int height); + +#define NUM_EXTRACTORS (3 * (1 + 1)) + +static const sse_part_extractor_type sse_part_extractors[NUM_EXTRACTORS] = { + aom_get_y_sse_part, aom_get_u_sse_part, + aom_get_v_sse_part, aom_highbd_get_y_sse_part, + aom_highbd_get_u_sse_part, aom_highbd_get_v_sse_part, +}; + +static int64_t sse_restoration_unit(const RestorationTileLimits *limits, + const YV12_BUFFER_CONFIG *src, + const YV12_BUFFER_CONFIG *dst, int plane, + int highbd) { + return sse_part_extractors[3 * highbd + plane]( + src, dst, limits->h_start, limits->h_end - limits->h_start, + limits->v_start, limits->v_end - limits->v_start); } -static int64_t sse_restoration_frame(AV1_COMMON *const cm, - const YV12_BUFFER_CONFIG *src, - const YV12_BUFFER_CONFIG *dst, - int components_pattern) { - int64_t filt_err = 0; -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) { - if ((components_pattern >> AOM_PLANE_Y) & 1) { - filt_err += aom_highbd_get_y_sse(src, dst); - } - if ((components_pattern >> AOM_PLANE_U) & 1) { - filt_err += 
aom_highbd_get_u_sse(src, dst); - } - if ((components_pattern >> AOM_PLANE_V) & 1) { - filt_err += aom_highbd_get_v_sse(src, dst); - } - return filt_err; - } -#else - (void)cm; -#endif // CONFIG_HIGHBITDEPTH - if ((components_pattern >> AOM_PLANE_Y) & 1) { - filt_err = aom_get_y_sse(src, dst); - } - if ((components_pattern >> AOM_PLANE_U) & 1) { - filt_err += aom_get_u_sse(src, dst); - } - if ((components_pattern >> AOM_PLANE_V) & 1) { - filt_err += aom_get_v_sse(src, dst); - } - return filt_err; +typedef struct { + // The best coefficients for Wiener or Sgrproj restoration + WienerInfo wiener; + SgrprojInfo sgrproj; + + // The sum of squared errors for this rtype. + int64_t sse[RESTORE_SWITCHABLE_TYPES]; + + // The rtype to use for this unit given a frame rtype as + // index. Indices: WIENER, SGRPROJ, SWITCHABLE. + RestorationType best_rtype[RESTORE_TYPES - 1]; +} RestUnitSearchInfo; + +typedef struct { + const YV12_BUFFER_CONFIG *src; + YV12_BUFFER_CONFIG *dst; + + const AV1_COMMON *cm; + const MACROBLOCK *x; + int plane; + int plane_width; + int plane_height; + RestUnitSearchInfo *rusi; + + // Speed features + const SPEED_FEATURES *sf; + + uint8_t *dgd_buffer; + int dgd_stride; + const uint8_t *src_buffer; + int src_stride; + + // sse and bits are initialised by reset_rsc in search_rest_type + int64_t sse; + int64_t bits; + int tile_y0, tile_stripe0; + + // sgrproj and wiener are initialised by rsc_on_tile when starting the first + // tile in the frame. + SgrprojInfo sgrproj; + WienerInfo wiener; + AV1PixelRect tile_rect; +} RestSearchCtxt; + +static void rsc_on_tile(int tile_row, int tile_col, void *priv) { + (void)tile_col; + + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + set_default_sgrproj(&rsc->sgrproj); + set_default_wiener(&rsc->wiener); + + rsc->tile_stripe0 = + (tile_row == 0) ? 0 : rsc->cm->rst_end_stripe[tile_row - 1]; } -static int64_t try_restoration_tile(const YV12_BUFFER_CONFIG *src, - AV1_COMP *const cpi, RestorationInfo *rsi, - int components_pattern, int partial_frame, - int tile_idx, - YV12_BUFFER_CONFIG *dst_frame) { - AV1_COMMON *const cm = &cpi->common; - int64_t filt_err; - int tile_width, tile_height, nhtiles, nvtiles; - int ntiles, width, height; - - // Y and UV components cannot be mixed - assert(components_pattern == 1 || components_pattern == 2 || - components_pattern == 4 || components_pattern == 6); - - if (components_pattern == 1) { // Y only - width = src->y_crop_width; - height = src->y_crop_height; - } else { // Color - width = src->uv_crop_width; - height = src->uv_crop_height; - } - ntiles = av1_get_rest_ntiles( - width, height, cm->rst_info[components_pattern > 1].restoration_tilesize, - &tile_width, &tile_height, &nhtiles, &nvtiles); - (void)ntiles; - - av1_loop_restoration_frame(cm->frame_to_show, cm, rsi, components_pattern, - partial_frame, dst_frame); - RestorationTileLimits limits = av1_get_rest_tile_limits( - tile_idx, nhtiles, nvtiles, tile_width, tile_height, width, -#if CONFIG_STRIPED_LOOP_RESTORATION - height, components_pattern > 1 ? 
cm->subsampling_y : 0); -#else - height); -#endif - filt_err = sse_restoration_tile( - src, dst_frame, cm, limits.h_start, limits.h_end - limits.h_start, - limits.v_start, limits.v_end - limits.v_start, components_pattern); - - return filt_err; +static void reset_rsc(RestSearchCtxt *rsc) { + rsc->sse = 0; + rsc->bits = 0; } -static int64_t try_restoration_frame(const YV12_BUFFER_CONFIG *src, - AV1_COMP *const cpi, RestorationInfo *rsi, - int components_pattern, int partial_frame, - YV12_BUFFER_CONFIG *dst_frame) { - AV1_COMMON *const cm = &cpi->common; - int64_t filt_err; - av1_loop_restoration_frame(cm->frame_to_show, cm, rsi, components_pattern, - partial_frame, dst_frame); - filt_err = sse_restoration_frame(cm, src, dst_frame, components_pattern); - return filt_err; +static void init_rsc(const YV12_BUFFER_CONFIG *src, const AV1_COMMON *cm, + const MACROBLOCK *x, const SPEED_FEATURES *sf, int plane, + RestUnitSearchInfo *rusi, YV12_BUFFER_CONFIG *dst, + RestSearchCtxt *rsc) { + rsc->src = src; + rsc->dst = dst; + rsc->cm = cm; + rsc->x = x; + rsc->plane = plane; + rsc->rusi = rusi; + rsc->sf = sf; + + const YV12_BUFFER_CONFIG *dgd = cm->frame_to_show; + const int is_uv = plane != AOM_PLANE_Y; + rsc->plane_width = src->crop_widths[is_uv]; + rsc->plane_height = src->crop_heights[is_uv]; + rsc->src_buffer = src->buffers[plane]; + rsc->src_stride = src->strides[is_uv]; + rsc->dgd_buffer = dgd->buffers[plane]; + rsc->dgd_stride = dgd->strides[is_uv]; + rsc->tile_rect = av1_whole_frame_rect(cm, is_uv); + assert(src->crop_widths[is_uv] == dgd->crop_widths[is_uv]); + assert(src->crop_heights[is_uv] == dgd->crop_heights[is_uv]); +} + +static int64_t try_restoration_unit(const RestSearchCtxt *rsc, + const RestorationTileLimits *limits, + const AV1PixelRect *tile_rect, + const RestorationUnitInfo *rui) { + const AV1_COMMON *const cm = rsc->cm; + const int plane = rsc->plane; + const int is_uv = plane > 0; + const RestorationInfo *rsi = &cm->rst_info[plane]; + RestorationLineBuffers rlbs; + const int bit_depth = cm->bit_depth; + const int highbd = cm->use_highbitdepth; + + const YV12_BUFFER_CONFIG *fts = cm->frame_to_show; + // TODO(yunqing): For now, only use optimized LR filter in decoder. Can be + // also used in encoder. 
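Aside on the hunk above: the six aom_[highbd_]get_{y,u,v}_sse_part extractors are now reached through a flat function-pointer table indexed by 3 * highbd + plane, replacing the old per-plane if/else chains guarded by CONFIG_HIGHBITDEPTH. A minimal, self-contained sketch of the same dispatch pattern follows; the names are hypothetical and it uses one function per bit depth only, so it is illustrative rather than the library's API.

#include <stdint.h>
#include <stdio.h>

typedef int64_t (*sse_fn)(const uint8_t *a, const uint8_t *b, int n);

static int64_t sse_lowbd(const uint8_t *a, const uint8_t *b, int n) {
  int64_t sse = 0;
  for (int i = 0; i < n; ++i) {
    const int d = a[i] - b[i];
    sse += (int64_t)d * d;
  }
  return sse;
}

static int64_t sse_highbd(const uint8_t *a8, const uint8_t *b8, int n) {
  /* libaom aliases 16-bit samples behind a uint8_t pointer; mimic that. */
  const uint16_t *a = (const uint16_t *)a8;
  const uint16_t *b = (const uint16_t *)b8;
  int64_t sse = 0;
  for (int i = 0; i < n; ++i) {
    const int d = a[i] - b[i];
    sse += (int64_t)d * d;
  }
  return sse;
}

int main(void) {
  /* One entry per bit depth; the patch widens this to three planes per row. */
  static const sse_fn table[2] = { sse_lowbd, sse_highbd };
  const uint8_t a[4] = { 1, 2, 3, 4 }, b[4] = { 2, 2, 3, 0 };
  const int highbd = 0;
  printf("sse = %lld\n", (long long)table[highbd](a, b, 4));
  return 0;
}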
+ const int optimized_lr = 0; + + av1_loop_restoration_filter_unit( + limits, rui, &rsi->boundaries, &rlbs, tile_rect, rsc->tile_stripe0, + is_uv && cm->subsampling_x, is_uv && cm->subsampling_y, highbd, bit_depth, + fts->buffers[plane], fts->strides[is_uv], rsc->dst->buffers[plane], + rsc->dst->strides[is_uv], cm->rst_tmpbuf, optimized_lr); + + return sse_restoration_unit(limits, rsc->src, rsc->dst, plane, highbd); } static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int use_highbitdepth, - int32_t *flt1, int flt1_stride, - int32_t *flt2, int flt2_stride, int *xqd) { + int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int *xqd, + const sgr_params_type *params) { int i, j; int64_t err = 0; int xq[2]; - decode_xq(xqd, xq); + decode_xq(xqd, xq, params); if (!use_highbitdepth) { const uint8_t *src = src8; const uint8_t *dat = dat8; @@ -191,9 +197,9 @@ static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height, for (j = 0; j < width; ++j) { const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); - const int32_t f1 = (int32_t)flt1[i * flt1_stride + j] - u; - const int32_t f2 = (int32_t)flt2[i * flt2_stride + j] - u; - const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS); + int32_t v = u << SGRPROJ_PRJ_BITS; + if (params->r[0] > 0) v += xq[0] * (flt0[i * flt0_stride + j] - u); + if (params->r[1] > 0) v += xq[1] * (flt1[i * flt1_stride + j] - u); const int32_t e = ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[i * src_stride + j]; @@ -203,17 +209,67 @@ static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height, } else { const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); - for (i = 0; i < height; ++i) { - for (j = 0; j < width; ++j) { - const int32_t u = - (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); - const int32_t f1 = (int32_t)flt1[i * flt1_stride + j] - u; - const int32_t f2 = (int32_t)flt2[i * flt2_stride + j] - u; - const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS); - const int32_t e = - ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - - src[i * src_stride + j]; - err += e * e; + const int32_t half = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1); + if (params->r[0] > 0 && params->r[1] > 0) { + int xq0 = xq[0]; + int xq1 = xq[1]; + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int32_t d = dat[j]; + const int32_t s = src[j]; + const int32_t u = (int32_t)(d << SGRPROJ_RST_BITS); + int32_t v0 = flt0[j] - u; + int32_t v1 = flt1[j] - u; + int32_t v = half; + v += xq0 * v0; + v += xq1 * v1; + const int32_t e = + (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s; + err += e * e; + } + dat += dat_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + src += src_stride; + } + } else if (params->r[0] > 0 || params->r[1] > 0) { + int exq; + int32_t *flt; + int flt_stride; + if (params->r[0] > 0) { + exq = xq[0]; + flt = flt0; + flt_stride = flt0_stride; + } else { + exq = xq[1]; + flt = flt1; + flt_stride = flt1_stride; + } + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int32_t d = dat[j]; + const int32_t s = src[j]; + const int32_t u = (int32_t)(d << SGRPROJ_RST_BITS); + int32_t v = half; + v += exq * (flt[j] - u); + const int32_t e = + (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s; + err += e * e; + } + dat += dat_stride; + flt += flt_stride; + src += src_stride; + } 
+ } else { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int32_t d = dat[j]; + const int32_t s = src[j]; + const int32_t e = d - s; + err += e * e; + } + dat += dat_stride; + src += src_stride; } } } @@ -223,11 +279,12 @@ static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height, #define USE_SGRPROJ_REFINEMENT_SEARCH 1 static int64_t finer_search_pixel_proj_error( const uint8_t *src8, int width, int height, int src_stride, - const uint8_t *dat8, int dat_stride, int use_highbitdepth, int32_t *flt1, - int flt1_stride, int32_t *flt2, int flt2_stride, int start_step, int *xqd) { - int64_t err = get_pixel_proj_error(src8, width, height, src_stride, dat8, - dat_stride, use_highbitdepth, flt1, - flt1_stride, flt2, flt2_stride, xqd); + const uint8_t *dat8, int dat_stride, int use_highbitdepth, int32_t *flt0, + int flt0_stride, int32_t *flt1, int flt1_stride, int start_step, int *xqd, + const sgr_params_type *params) { + int64_t err = get_pixel_proj_error( + src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0, + flt0_stride, flt1, flt1_stride, xqd, params); (void)start_step; #if USE_SGRPROJ_REFINEMENT_SEARCH int64_t err2; @@ -235,13 +292,17 @@ static int64_t finer_search_pixel_proj_error( int tap_max[] = { SGRPROJ_PRJ_MAX0, SGRPROJ_PRJ_MAX1 }; for (int s = start_step; s >= 1; s >>= 1) { for (int p = 0; p < 2; ++p) { + if ((params->r[0] == 0 && p == 0) || (params->r[1] == 0 && p == 1)) { + continue; + } int skip = 0; do { if (xqd[p] - s >= tap_min[p]) { xqd[p] -= s; - err2 = get_pixel_proj_error(src8, width, height, src_stride, dat8, - dat_stride, use_highbitdepth, flt1, - flt1_stride, flt2, flt2_stride, xqd); + err2 = + get_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, use_highbitdepth, flt0, + flt0_stride, flt1, flt1_stride, xqd, params); if (err2 > err) { xqd[p] += s; } else { @@ -257,9 +318,10 @@ static int64_t finer_search_pixel_proj_error( do { if (xqd[p] + s <= tap_max[p]) { xqd[p] += s; - err2 = get_pixel_proj_error(src8, width, height, src_stride, dat8, - dat_stride, use_highbitdepth, flt1, - flt1_stride, flt2, flt2_stride, xqd); + err2 = + get_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, use_highbitdepth, flt0, + flt0_stride, flt1, flt1_stride, xqd, params); if (err2 > err) { xqd[p] -= s; } else { @@ -277,10 +339,11 @@ static int64_t finer_search_pixel_proj_error( } static void get_proj_subspace(const uint8_t *src8, int width, int height, - int src_stride, uint8_t *dat8, int dat_stride, - int use_highbitdepth, int32_t *flt1, - int flt1_stride, int32_t *flt2, int flt2_stride, - int *xq) { + int src_stride, const uint8_t *dat8, + int dat_stride, int use_highbitdepth, + int32_t *flt0, int flt0_stride, int32_t *flt1, + int flt1_stride, int *xq, + const sgr_params_type *params) { int i, j; double H[2][2] = { { 0, 0 }, { 0, 0 } }; double C[2] = { 0, 0 }; @@ -301,8 +364,10 @@ static void get_proj_subspace(const uint8_t *src8, int width, int height, const double u = (double)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); const double s = (double)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; - const double f1 = (double)flt1[i * flt1_stride + j] - u; - const double f2 = (double)flt2[i * flt2_stride + j] - u; + const double f1 = + (params->r[0] > 0) ? (double)flt0[i * flt0_stride + j] - u : 0; + const double f2 = + (params->r[1] > 0) ? 
(double)flt1[i * flt1_stride + j] - u : 0; H[0][0] += f1 * f1; H[1][1] += f2 * f2; H[0][1] += f1 * f2; @@ -318,8 +383,10 @@ static void get_proj_subspace(const uint8_t *src8, int width, int height, const double u = (double)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); const double s = (double)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; - const double f1 = (double)flt1[i * flt1_stride + j] - u; - const double f2 = (double)flt2[i * flt2_stride + j] - u; + const double f1 = + (params->r[0] > 0) ? (double)flt0[i * flt0_stride + j] - u : 0; + const double f2 = + (params->r[1] > 0) ? (double)flt1[i * flt1_stride + j] - u : 0; H[0][0] += f1 * f1; H[1][1] += f2 * f2; H[0][1] += f1 * f2; @@ -334,99 +401,103 @@ static void get_proj_subspace(const uint8_t *src8, int width, int height, H[1][0] = H[0][1]; C[0] /= size; C[1] /= size; - Det = (H[0][0] * H[1][1] - H[0][1] * H[1][0]); - if (Det < 1e-8) return; // ill-posed, return default values - x[0] = (H[1][1] * C[0] - H[0][1] * C[1]) / Det; - x[1] = (H[0][0] * C[1] - H[1][0] * C[0]) / Det; - xq[0] = (int)rint(x[0] * (1 << SGRPROJ_PRJ_BITS)); - xq[1] = (int)rint(x[1] * (1 << SGRPROJ_PRJ_BITS)); + if (params->r[0] == 0) { + // H matrix is now only the scalar H[1][1] + // C vector is now only the scalar C[1] + Det = H[1][1]; + if (Det < 1e-8) return; // ill-posed, return default values + x[0] = 0; + x[1] = C[1] / Det; + + xq[0] = 0; + xq[1] = (int)rint(x[1] * (1 << SGRPROJ_PRJ_BITS)); + } else if (params->r[1] == 0) { + // H matrix is now only the scalar H[0][0] + // C vector is now only the scalar C[0] + Det = H[0][0]; + if (Det < 1e-8) return; // ill-posed, return default values + x[0] = C[0] / Det; + x[1] = 0; + + xq[0] = (int)rint(x[0] * (1 << SGRPROJ_PRJ_BITS)); + xq[1] = 0; + } else { + Det = (H[0][0] * H[1][1] - H[0][1] * H[1][0]); + if (Det < 1e-8) return; // ill-posed, return default values + x[0] = (H[1][1] * C[0] - H[0][1] * C[1]) / Det; + x[1] = (H[0][0] * C[1] - H[1][0] * C[0]) / Det; + + xq[0] = (int)rint(x[0] * (1 << SGRPROJ_PRJ_BITS)); + xq[1] = (int)rint(x[1] * (1 << SGRPROJ_PRJ_BITS)); + } } -void encode_xq(int *xq, int *xqd) { - xqd[0] = xq[0]; - xqd[0] = clamp(xqd[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0); - xqd[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[0] - xq[1]; - xqd[1] = clamp(xqd[1], SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1); +void encode_xq(int *xq, int *xqd, const sgr_params_type *params) { + if (params->r[0] == 0) { + xqd[0] = 0; + xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xq[1], SGRPROJ_PRJ_MIN1, + SGRPROJ_PRJ_MAX1); + } else if (params->r[1] == 0) { + xqd[0] = clamp(xq[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0); + xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xqd[0], SGRPROJ_PRJ_MIN1, + SGRPROJ_PRJ_MAX1); + } else { + xqd[0] = clamp(xq[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0); + xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xqd[0] - xq[1], SGRPROJ_PRJ_MIN1, + SGRPROJ_PRJ_MAX1); + } } -static void search_selfguided_restoration(uint8_t *dat8, int width, int height, - int dat_stride, const uint8_t *src8, - int src_stride, int use_highbitdepth, - int bit_depth, int pu_width, - int pu_height, int *eps, int *xqd, - int32_t *rstbuf) { - int32_t *flt1 = rstbuf; - int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX; +// Apply the self-guided filter across an entire restoration unit. 
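Aside on get_proj_subspace above: when both self-guided radii are active, it solves the 2x2 normal equations H x = C by Cramer's rule and falls back to the default projection whenever the determinant is below 1e-8. A standalone sketch of that solve (the solve_2x2 helper is hypothetical, and the quantization step assumes SGRPROJ_PRJ_BITS is 7, as defined in libaom's restoration.h):

#include <math.h>
#include <stdio.h>

/* Solve H * x = C; returns 0 (leaving x alone) when the system is ill-posed. */
static int solve_2x2(const double H[2][2], const double C[2], double x[2]) {
  const double det = H[0][0] * H[1][1] - H[0][1] * H[1][0];
  if (det < 1e-8) return 0; /* same guard as the encoder search */
  x[0] = (H[1][1] * C[0] - H[0][1] * C[1]) / det;
  x[1] = (H[0][0] * C[1] - H[1][0] * C[0]) / det;
  return 1;
}

int main(void) {
  const double H[2][2] = { { 4.0, 1.0 }, { 1.0, 3.0 } };
  const double C[2] = { 1.0, 2.0 };
  double x[2] = { 0.0, 0.0 };
  const int prj_bits = 7; /* assumed value of SGRPROJ_PRJ_BITS */
  if (solve_2x2(H, C, x))
    printf("x = (%.4f, %.4f), xq = (%d, %d)\n", x[0], x[1],
           (int)rint(x[0] * (1 << prj_bits)),
           (int)rint(x[1] * (1 << prj_bits)));
  return 0;
}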
+static void apply_sgr(int sgr_params_idx, const uint8_t *dat8, int width, + int height, int dat_stride, int use_highbd, int bit_depth, + int pu_width, int pu_height, int32_t *flt0, int32_t *flt1, + int flt_stride) { + for (int i = 0; i < height; i += pu_height) { + const int h = AOMMIN(pu_height, height - i); + int32_t *flt0_row = flt0 + i * flt_stride; + int32_t *flt1_row = flt1 + i * flt_stride; + const uint8_t *dat8_row = dat8 + i * dat_stride; + + // Iterate over the stripe in blocks of width pu_width + for (int j = 0; j < width; j += pu_width) { + const int w = AOMMIN(pu_width, width - j); + av1_selfguided_restoration(dat8_row + j, w, h, dat_stride, flt0_row + j, + flt1_row + j, flt_stride, sgr_params_idx, + bit_depth, use_highbd); + } + } +} + +static SgrprojInfo search_selfguided_restoration( + const uint8_t *dat8, int width, int height, int dat_stride, + const uint8_t *src8, int src_stride, int use_highbitdepth, int bit_depth, + int pu_width, int pu_height, int32_t *rstbuf) { + int32_t *flt0 = rstbuf; + int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; int ep, bestep = 0; - int64_t err, besterr = -1; + int64_t besterr = -1; int exqd[2], bestxqd[2] = { 0, 0 }; - int flt1_stride = ((width + 7) & ~7) + 8; - int flt2_stride = ((width + 7) & ~7) + 8; + int flt_stride = ((width + 7) & ~7) + 8; assert(pu_width == (RESTORATION_PROC_UNIT_SIZE >> 1) || pu_width == RESTORATION_PROC_UNIT_SIZE); assert(pu_height == (RESTORATION_PROC_UNIT_SIZE >> 1) || pu_height == RESTORATION_PROC_UNIT_SIZE); -#if !CONFIG_HIGHBITDEPTH - (void)bit_depth; -#endif for (ep = 0; ep < SGRPROJ_PARAMS; ep++) { int exq[2]; -#if CONFIG_HIGHBITDEPTH - if (use_highbitdepth) { - uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); - for (int i = 0; i < height; i += pu_height) - for (int j = 0; j < width; j += pu_width) { - const int w = AOMMIN(pu_width, width - j); - const int h = AOMMIN(pu_height, height - i); - uint16_t *dat_p = dat + i * dat_stride + j; - int32_t *flt1_p = flt1 + i * flt1_stride + j; - int32_t *flt2_p = flt2 + i * flt2_stride + j; -#if USE_HIGHPASS_IN_SGRPROJ - av1_highpass_filter_highbd(dat_p, w, h, dat_stride, flt1_p, - flt1_stride, sgr_params[ep].corner, - sgr_params[ep].edge); -#else - av1_selfguided_restoration_highbd( - dat_p, w, h, dat_stride, flt1_p, flt1_stride, bit_depth, - sgr_params[ep].r1, sgr_params[ep].e1); -#endif // USE_HIGHPASS_IN_SGRPROJ - av1_selfguided_restoration_highbd( - dat_p, w, h, dat_stride, flt2_p, flt2_stride, bit_depth, - sgr_params[ep].r2, sgr_params[ep].e2); - } - } else { -#endif - for (int i = 0; i < height; i += pu_height) - for (int j = 0; j < width; j += pu_width) { - const int w = AOMMIN(pu_width, width - j); - const int h = AOMMIN(pu_height, height - i); - uint8_t *dat_p = dat8 + i * dat_stride + j; - int32_t *flt1_p = flt1 + i * flt1_stride + j; - int32_t *flt2_p = flt2 + i * flt2_stride + j; -#if USE_HIGHPASS_IN_SGRPROJ - av1_highpass_filter(dat_p, w, h, dat_stride, flt1_p, flt1_stride, - sgr_params[ep].corner, sgr_params[ep].edge); -#else - av1_selfguided_restoration(dat_p, w, h, dat_stride, flt1_p, flt1_stride, - sgr_params[ep].r1, sgr_params[ep].e1); -#endif // USE_HIGHPASS_IN_SGRPROJ - av1_selfguided_restoration(dat_p, w, h, dat_stride, flt2_p, - flt2_stride, sgr_params[ep].r2, - sgr_params[ep].e2); - } -#if CONFIG_HIGHBITDEPTH - } -#endif + apply_sgr(ep, dat8, width, height, dat_stride, use_highbitdepth, bit_depth, + pu_width, pu_height, flt0, flt1, flt_stride); aom_clear_system_state(); + const sgr_params_type *const params = &sgr_params[ep]; get_proj_subspace(src8, 
width, height, src_stride, dat8, dat_stride, - use_highbitdepth, flt1, flt1_stride, flt2, flt2_stride, - exq); + use_highbitdepth, flt0, flt_stride, flt1, flt_stride, exq, + params); aom_clear_system_state(); - encode_xq(exq, exqd); - err = finer_search_pixel_proj_error( + encode_xq(exq, exqd, params); + int64_t err = finer_search_pixel_proj_error( src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, - flt1, flt1_stride, flt2, flt2_stride, 2, exqd); + flt0, flt_stride, flt1, flt_stride, 2, exqd, params); if (besterr == -1 || err < besterr) { bestep = ep; besterr = err; @@ -434,273 +505,86 @@ static void search_selfguided_restoration(uint8_t *dat8, int width, int height, bestxqd[1] = exqd[1]; } } - *eps = bestep; - xqd[0] = bestxqd[0]; - xqd[1] = bestxqd[1]; + + SgrprojInfo ret; + ret.ep = bestep; + ret.xqd[0] = bestxqd[0]; + ret.xqd[1] = bestxqd[1]; + return ret; } static int count_sgrproj_bits(SgrprojInfo *sgrproj_info, SgrprojInfo *ref_sgrproj_info) { int bits = SGRPROJ_PARAMS_BITS; - bits += aom_count_primitive_refsubexpfin( - SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, - ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, - sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0); - bits += aom_count_primitive_refsubexpfin( - SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, - ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, - sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1); + const sgr_params_type *params = &sgr_params[sgrproj_info->ep]; + if (params->r[0] > 0) + bits += aom_count_primitive_refsubexpfin( + SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, + sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0); + if (params->r[1] > 0) + bits += aom_count_primitive_refsubexpfin( + SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, + sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1); return bits; } -struct rest_search_ctxt { - const YV12_BUFFER_CONFIG *src; - AV1_COMP *cpi; - uint8_t *dgd_buffer; - const uint8_t *src_buffer; - int dgd_stride; - int src_stride; - int partial_frame; - RestorationInfo *info; - RestorationType *type; - int64_t *best_tile_cost; - int plane; - int plane_width; - int plane_height; - int nrtiles_x; - int nrtiles_y; - YV12_BUFFER_CONFIG *dst_frame; -}; - -// Fill in ctxt. 
Returns the number of restoration tiles for this plane -static INLINE int init_rest_search_ctxt( - const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, int partial_frame, int plane, - RestorationInfo *info, RestorationType *type, int64_t *best_tile_cost, - YV12_BUFFER_CONFIG *dst_frame, struct rest_search_ctxt *ctxt) { - AV1_COMMON *const cm = &cpi->common; - ctxt->src = src; - ctxt->cpi = cpi; - ctxt->partial_frame = partial_frame; - ctxt->info = info; - ctxt->type = type; - ctxt->best_tile_cost = best_tile_cost; - ctxt->plane = plane; - ctxt->dst_frame = dst_frame; - - const YV12_BUFFER_CONFIG *dgd = cm->frame_to_show; - if (plane == AOM_PLANE_Y) { - ctxt->plane_width = src->y_crop_width; - ctxt->plane_height = src->y_crop_height; - ctxt->src_buffer = src->y_buffer; - ctxt->src_stride = src->y_stride; - ctxt->dgd_buffer = dgd->y_buffer; - ctxt->dgd_stride = dgd->y_stride; - assert(ctxt->plane_width == dgd->y_crop_width); - assert(ctxt->plane_height == dgd->y_crop_height); - assert(ctxt->plane_width == src->y_crop_width); - assert(ctxt->plane_height == src->y_crop_height); - } else { - ctxt->plane_width = src->uv_crop_width; - ctxt->plane_height = src->uv_crop_height; - ctxt->src_stride = src->uv_stride; - ctxt->dgd_stride = dgd->uv_stride; - ctxt->src_buffer = plane == AOM_PLANE_U ? src->u_buffer : src->v_buffer; - ctxt->dgd_buffer = plane == AOM_PLANE_U ? dgd->u_buffer : dgd->v_buffer; - assert(ctxt->plane_width == dgd->uv_crop_width); - assert(ctxt->plane_height == dgd->uv_crop_height); - } - - return av1_get_rest_ntiles(ctxt->plane_width, ctxt->plane_height, - cm->rst_info[plane].restoration_tilesize, NULL, - NULL, &ctxt->nrtiles_x, &ctxt->nrtiles_y); -} - -typedef void (*rtile_visitor_t)(const struct rest_search_ctxt *search_ctxt, - int rtile_idx, - const RestorationTileLimits *limits, void *arg); - -static void foreach_rtile_in_tile(const struct rest_search_ctxt *ctxt, - int tile_row, int tile_col, - rtile_visitor_t fun, void *arg) { - const AV1_COMMON *const cm = &ctxt->cpi->common; - const RestorationInfo *rsi = ctxt->cpi->rst_search; - TileInfo tile_info; - - av1_tile_set_row(&tile_info, cm, tile_row); - av1_tile_set_col(&tile_info, cm, tile_col); - - int tile_col_start = tile_info.mi_col_start * MI_SIZE; - int tile_col_end = tile_info.mi_col_end * MI_SIZE; - int tile_row_start = tile_info.mi_row_start * MI_SIZE; - int tile_row_end = tile_info.mi_row_end * MI_SIZE; - if (ctxt->plane > 0) { - tile_col_start = ROUND_POWER_OF_TWO(tile_col_start, cm->subsampling_x); - tile_col_end = ROUND_POWER_OF_TWO(tile_col_end, cm->subsampling_x); - tile_row_start = ROUND_POWER_OF_TWO(tile_row_start, cm->subsampling_y); - tile_row_end = ROUND_POWER_OF_TWO(tile_row_end, cm->subsampling_y); - } +static void search_sgrproj(const RestorationTileLimits *limits, + const AV1PixelRect *tile, int rest_unit_idx, + void *priv, int32_t *tmpbuf, + RestorationLineBuffers *rlbs) { + (void)rlbs; + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx]; -#if CONFIG_FRAME_SUPERRES - // If upscaling is enabled, the tile limits need scaling to match the - // upscaled frame where the restoration tiles live. To do this, scale up the - // top-left and bottom-right of the tile. 
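Aside: the deleted foreach_rtile_in_tile mapped a coding tile's pixel range onto restoration-tile indices with ceiling division, clamped to the tile count. The same arithmetic in isolation, with toy sizes (illustrative only, not the library's code):

#include <stdio.h>

static int ceil_div(int a, int b) { return (a + b - 1) / b; }

int main(void) {
  const int rtile_size = 256; /* restoration tile side in pixels (toy value) */
  const int nrtiles_x = 8;    /* restoration tiles across the plane */
  const int tile_col_start = 512, tile_col_end = 1280;
  const int rtile_col0 = ceil_div(tile_col_start, rtile_size);
  int rtile_col1 = ceil_div(tile_col_end, rtile_size);
  if (rtile_col1 > nrtiles_x) rtile_col1 = nrtiles_x; /* clamp, as AOMMIN did */
  printf("restoration tile columns [%d, %d) cover pixels [%d, %d)\n",
         rtile_col0, rtile_col1, tile_col_start, tile_col_end);
  return 0;
}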
- if (!av1_superres_unscaled(cm)) { - av1_calculate_unscaled_superres_size(&tile_col_start, &tile_row_start, - cm->superres_scale_denominator); - av1_calculate_unscaled_superres_size(&tile_col_end, &tile_row_end, - cm->superres_scale_denominator); - // Make sure we don't fall off the bottom-right of the frame. - tile_col_end = AOMMIN(tile_col_end, ctxt->plane_width); - tile_row_end = AOMMIN(tile_row_end, ctxt->plane_height); - } -#endif // CONFIG_FRAME_SUPERRES - - const int rtile_size = rsi->restoration_tilesize; - const int rtile_col0 = (tile_col_start + rtile_size - 1) / rtile_size; - const int rtile_col1 = - AOMMIN((tile_col_end + rtile_size - 1) / rtile_size, ctxt->nrtiles_x); - const int rtile_row0 = (tile_row_start + rtile_size - 1) / rtile_size; - const int rtile_row1 = - AOMMIN((tile_row_end + rtile_size - 1) / rtile_size, ctxt->nrtiles_y); - - const int rtile_width = AOMMIN(tile_col_end - tile_col_start, rtile_size); - const int rtile_height = AOMMIN(tile_row_end - tile_row_start, rtile_size); - - for (int rtile_row = rtile_row0; rtile_row < rtile_row1; ++rtile_row) { - for (int rtile_col = rtile_col0; rtile_col < rtile_col1; ++rtile_col) { - const int rtile_idx = rtile_row * ctxt->nrtiles_x + rtile_col; - RestorationTileLimits limits = av1_get_rest_tile_limits( - rtile_idx, ctxt->nrtiles_x, ctxt->nrtiles_y, rtile_width, - rtile_height, ctxt->plane_width, -#if CONFIG_STRIPED_LOOP_RESTORATION - ctxt->plane_height, ctxt->plane > 0 ? cm->subsampling_y : 0); -#else - ctxt->plane_height); -#endif - fun(ctxt, rtile_idx, &limits, arg); - } - } -} + const MACROBLOCK *const x = rsc->x; + const AV1_COMMON *const cm = rsc->cm; + const int highbd = cm->use_highbitdepth; + const int bit_depth = cm->bit_depth; -static void search_sgrproj_for_rtile(const struct rest_search_ctxt *ctxt, - int rtile_idx, - const RestorationTileLimits *limits, - void *arg) { - const MACROBLOCK *const x = &ctxt->cpi->td.mb; - const AV1_COMMON *const cm = &ctxt->cpi->common; - RestorationInfo *rsi = ctxt->cpi->rst_search; - SgrprojInfo *sgrproj_info = ctxt->info->sgrproj_info; - - SgrprojInfo *ref_sgrproj_info = (SgrprojInfo *)arg; - - int64_t err = - sse_restoration_tile(ctxt->src, cm->frame_to_show, cm, limits->h_start, - limits->h_end - limits->h_start, limits->v_start, - limits->v_end - limits->v_start, (1 << ctxt->plane)); - // #bits when a tile is not restored - int bits = av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, 0); - double cost_norestore = RDCOST_DBL(x->rdmult, (bits >> 4), err); - ctxt->best_tile_cost[rtile_idx] = INT64_MAX; - - RestorationInfo *plane_rsi = &rsi[ctxt->plane]; - SgrprojInfo *rtile_sgrproj_info = &plane_rsi->sgrproj_info[rtile_idx]; uint8_t *dgd_start = - ctxt->dgd_buffer + limits->v_start * ctxt->dgd_stride + limits->h_start; + rsc->dgd_buffer + limits->v_start * rsc->dgd_stride + limits->h_start; const uint8_t *src_start = - ctxt->src_buffer + limits->v_start * ctxt->src_stride + limits->h_start; + rsc->src_buffer + limits->v_start * rsc->src_stride + limits->h_start; - search_selfguided_restoration( - dgd_start, limits->h_end - limits->h_start, - limits->v_end - limits->v_start, ctxt->dgd_stride, src_start, - ctxt->src_stride, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, cm->bit_depth, -#else - 0, 8, -#endif // CONFIG_HIGHBITDEPTH - rsi[ctxt->plane].procunit_width, rsi[ctxt->plane].procunit_height, - &rtile_sgrproj_info->ep, rtile_sgrproj_info->xqd, - cm->rst_internal.tmpbuf); - plane_rsi->restoration_type[rtile_idx] = RESTORE_SGRPROJ; - err = try_restoration_tile(ctxt->src, ctxt->cpi, 
rsi, (1 << ctxt->plane), - ctxt->partial_frame, rtile_idx, ctxt->dst_frame); - bits = - count_sgrproj_bits(&plane_rsi->sgrproj_info[rtile_idx], ref_sgrproj_info) - << AV1_PROB_COST_SHIFT; - bits += av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, 1); - double cost_sgrproj = RDCOST_DBL(x->rdmult, (bits >> 4), err); - if (cost_sgrproj >= cost_norestore) { - ctxt->type[rtile_idx] = RESTORE_NONE; - } else { - ctxt->type[rtile_idx] = RESTORE_SGRPROJ; - *ref_sgrproj_info = sgrproj_info[rtile_idx] = - plane_rsi->sgrproj_info[rtile_idx]; - ctxt->best_tile_cost[rtile_idx] = err; - } - plane_rsi->restoration_type[rtile_idx] = RESTORE_NONE; -} + const int is_uv = rsc->plane > 0; + const int ss_x = is_uv && cm->subsampling_x; + const int ss_y = is_uv && cm->subsampling_y; + const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x; + const int procunit_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; -static double search_sgrproj(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, - int partial_frame, int plane, - RestorationInfo *info, RestorationType *type, - int64_t *best_tile_cost, - YV12_BUFFER_CONFIG *dst_frame) { - struct rest_search_ctxt ctxt; - const int nrtiles = - init_rest_search_ctxt(src, cpi, partial_frame, plane, info, type, - best_tile_cost, dst_frame, &ctxt); - - RestorationInfo *plane_rsi = &cpi->rst_search[plane]; - plane_rsi->frame_restoration_type = RESTORE_SGRPROJ; - for (int rtile_idx = 0; rtile_idx < nrtiles; ++rtile_idx) { - plane_rsi->restoration_type[rtile_idx] = RESTORE_NONE; - } - - // Compute best Sgrproj filters for each rtile, one (encoder/decoder) - // tile at a time. - const AV1_COMMON *const cm = &cpi->common; -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) - extend_frame_highbd(CONVERT_TO_SHORTPTR(ctxt.dgd_buffer), ctxt.plane_width, - ctxt.plane_height, ctxt.dgd_stride, SGRPROJ_BORDER_HORZ, - SGRPROJ_BORDER_VERT); - else -#endif - extend_frame(ctxt.dgd_buffer, ctxt.plane_width, ctxt.plane_height, - ctxt.dgd_stride, SGRPROJ_BORDER_HORZ, SGRPROJ_BORDER_VERT); - - for (int tile_row = 0; tile_row < cm->tile_rows; ++tile_row) { - for (int tile_col = 0; tile_col < cm->tile_cols; ++tile_col) { - SgrprojInfo ref_sgrproj_info; - set_default_sgrproj(&ref_sgrproj_info); - foreach_rtile_in_tile(&ctxt, tile_row, tile_col, search_sgrproj_for_rtile, - &ref_sgrproj_info); - } - } - - // Cost for Sgrproj filtering - SgrprojInfo ref_sgrproj_info; - set_default_sgrproj(&ref_sgrproj_info); - SgrprojInfo *sgrproj_info = info->sgrproj_info; - - int bits = frame_level_restore_bits[plane_rsi->frame_restoration_type] - << AV1_PROB_COST_SHIFT; - for (int rtile_idx = 0; rtile_idx < nrtiles; ++rtile_idx) { - bits += av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, - type[rtile_idx] != RESTORE_NONE); - plane_rsi->sgrproj_info[rtile_idx] = sgrproj_info[rtile_idx]; - if (type[rtile_idx] == RESTORE_SGRPROJ) { - bits += count_sgrproj_bits(&plane_rsi->sgrproj_info[rtile_idx], - &ref_sgrproj_info) - << AV1_PROB_COST_SHIFT; - ref_sgrproj_info = plane_rsi->sgrproj_info[rtile_idx]; - } - plane_rsi->restoration_type[rtile_idx] = type[rtile_idx]; - } - int64_t err = try_restoration_frame(src, cpi, cpi->rst_search, (1 << plane), - partial_frame, dst_frame); - double cost_sgrproj = RDCOST_DBL(cpi->td.mb.rdmult, (bits >> 4), err); - return cost_sgrproj; + rusi->sgrproj = search_selfguided_restoration( + dgd_start, limits->h_end - limits->h_start, + limits->v_end - limits->v_start, rsc->dgd_stride, src_start, + rsc->src_stride, highbd, bit_depth, procunit_width, procunit_height, + tmpbuf); + + RestorationUnitInfo rui; + 
rui.restoration_type = RESTORE_SGRPROJ; + rui.sgrproj_info = rusi->sgrproj; + + rusi->sse[RESTORE_SGRPROJ] = try_restoration_unit(rsc, limits, tile, &rui); + + const int64_t bits_none = x->sgrproj_restore_cost[0]; + const int64_t bits_sgr = x->sgrproj_restore_cost[1] + + (count_sgrproj_bits(&rusi->sgrproj, &rsc->sgrproj) + << AV1_PROB_COST_SHIFT); + + double cost_none = + RDCOST_DBL(x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE]); + double cost_sgr = + RDCOST_DBL(x->rdmult, bits_sgr >> 4, rusi->sse[RESTORE_SGRPROJ]); + if (rusi->sgrproj.ep < 10) + cost_sgr *= (1 + DUAL_SGR_PENALTY_MULT * rsc->sf->dual_sgr_penalty_level); + + RestorationType rtype = + (cost_sgr < cost_none) ? RESTORE_SGRPROJ : RESTORE_NONE; + rusi->best_rtype[RESTORE_SGRPROJ - 1] = rtype; + + rsc->sse += rusi->sse[rtype]; + rsc->bits += (cost_sgr < cost_none) ? bits_sgr : bits_none; + if (cost_sgr < cost_none) rsc->sgrproj = rusi->sgrproj; } static double find_average(const uint8_t *src, int h_start, int h_end, @@ -758,7 +642,6 @@ static void compute_stats(int wiener_win, const uint8_t *dgd, } } -#if CONFIG_HIGHBITDEPTH static double find_average_highbd(const uint16_t *src, int h_start, int h_end, int v_start, int v_end, int stride) { uint64_t sum = 0; @@ -771,10 +654,10 @@ static double find_average_highbd(const uint16_t *src, int h_start, int h_end, return avg; } -static void compute_stats_highbd(int wiener_win, const uint8_t *dgd8, - const uint8_t *src8, int h_start, int h_end, - int v_start, int v_end, int dgd_stride, - int src_stride, double *M, double *H) { +static AOM_FORCE_INLINE void compute_stats_highbd( + int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, + int h_end, int v_start, int v_end, int dgd_stride, int src_stride, + double *M, double *H) { int i, j, k, l; double Y[WIENER_WIN2]; const int wiener_win2 = wiener_win * wiener_win; @@ -798,13 +681,15 @@ static void compute_stats_highbd(int wiener_win, const uint8_t *dgd8, } assert(idx == wiener_win2); for (k = 0; k < wiener_win2; ++k) { - M[k] += Y[k] * X; - H[k * wiener_win2 + k] += Y[k] * Y[k]; + double Yk = Y[k]; + M[k] += Yk * X; + double *H2 = &H[k * wiener_win2]; + H2[k] += Yk * Yk; for (l = k + 1; l < wiener_win2; ++l) { // H is a symmetric matrix, so we only need to fill out the upper // triangle here. We can copy it down to the lower triangle outside // the (i, j) loops. 
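Aside: the comment above describes a standard optimization, accumulate only the upper triangle of the symmetric matrix H inside the hot pixel loop and mirror it down once afterwards, roughly halving the multiply count. The trick in isolation on a 3x3 toy case:

#include <stdio.h>
#define DIM 3

int main(void) {
  const double y[2][DIM] = { { 1, 2, 3 }, { 4, 5, 6 } }; /* two toy samples */
  double h[DIM][DIM] = { { 0 } };
  for (int s = 0; s < 2; ++s)
    for (int k = 0; k < DIM; ++k)
      for (int l = k; l < DIM; ++l) /* upper triangle only */
        h[k][l] += y[s][k] * y[s][l];
  /* Mirror once, outside the accumulation loop. */
  for (int k = 0; k < DIM; ++k)
    for (int l = k + 1; l < DIM; ++l) h[l][k] = h[k][l];
  for (int k = 0; k < DIM; ++k)
    printf("%6.1f %6.1f %6.1f\n", h[k][0], h[k][1], h[k][2]);
  return 0;
}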
- H[k * wiener_win2 + l] += Y[k] * Y[l]; + H2[l] += Yk * Y[l]; } } } @@ -815,7 +700,6 @@ static void compute_stats_highbd(int wiener_win, const uint8_t *dgd8, } } } -#endif // CONFIG_HIGHBITDEPTH static INLINE int wrap_index(int i, int wiener_win) { const int wiener_halfwin1 = (wiener_win >> 1) + 1; @@ -1059,37 +943,37 @@ static int count_wiener_bits(int wiener_win, WienerInfo *wiener_info, } #define USE_WIENER_REFINEMENT_SEARCH 1 -static int64_t finer_tile_search_wiener(const YV12_BUFFER_CONFIG *src, - AV1_COMP *cpi, RestorationInfo *rsi, - int start_step, int plane, - int wiener_win, int tile_idx, - int partial_frame, - YV12_BUFFER_CONFIG *dst_frame) { +static int64_t finer_tile_search_wiener(const RestSearchCtxt *rsc, + const RestorationTileLimits *limits, + const AV1PixelRect *tile, + RestorationUnitInfo *rui, + int wiener_win) { const int plane_off = (WIENER_WIN - wiener_win) >> 1; - int64_t err = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame, - tile_idx, dst_frame); - (void)start_step; + int64_t err = try_restoration_unit(rsc, limits, tile, rui); #if USE_WIENER_REFINEMENT_SEARCH int64_t err2; int tap_min[] = { WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP2_MINV }; int tap_max[] = { WIENER_FILT_TAP0_MAXV, WIENER_FILT_TAP1_MAXV, WIENER_FILT_TAP2_MAXV }; + + WienerInfo *plane_wiener = &rui->wiener_info; + // printf("err pre = %"PRId64"\n", err); + const int start_step = 4; for (int s = start_step; s >= 1; s >>= 1) { for (int p = plane_off; p < WIENER_HALFWIN; ++p) { int skip = 0; do { - if (rsi[plane].wiener_info[tile_idx].hfilter[p] - s >= tap_min[p]) { - rsi[plane].wiener_info[tile_idx].hfilter[p] -= s; - rsi[plane].wiener_info[tile_idx].hfilter[WIENER_WIN - p - 1] -= s; - rsi[plane].wiener_info[tile_idx].hfilter[WIENER_HALFWIN] += 2 * s; - err2 = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame, - tile_idx, dst_frame); + if (plane_wiener->hfilter[p] - s >= tap_min[p]) { + plane_wiener->hfilter[p] -= s; + plane_wiener->hfilter[WIENER_WIN - p - 1] -= s; + plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s; + err2 = try_restoration_unit(rsc, limits, tile, rui); if (err2 > err) { - rsi[plane].wiener_info[tile_idx].hfilter[p] += s; - rsi[plane].wiener_info[tile_idx].hfilter[WIENER_WIN - p - 1] += s; - rsi[plane].wiener_info[tile_idx].hfilter[WIENER_HALFWIN] -= 2 * s; + plane_wiener->hfilter[p] += s; + plane_wiener->hfilter[WIENER_WIN - p - 1] += s; + plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s; } else { err = err2; skip = 1; @@ -1101,16 +985,15 @@ static int64_t finer_tile_search_wiener(const YV12_BUFFER_CONFIG *src, } while (1); if (skip) break; do { - if (rsi[plane].wiener_info[tile_idx].hfilter[p] + s <= tap_max[p]) { - rsi[plane].wiener_info[tile_idx].hfilter[p] += s; - rsi[plane].wiener_info[tile_idx].hfilter[WIENER_WIN - p - 1] += s; - rsi[plane].wiener_info[tile_idx].hfilter[WIENER_HALFWIN] -= 2 * s; - err2 = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame, - tile_idx, dst_frame); + if (plane_wiener->hfilter[p] + s <= tap_max[p]) { + plane_wiener->hfilter[p] += s; + plane_wiener->hfilter[WIENER_WIN - p - 1] += s; + plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s; + err2 = try_restoration_unit(rsc, limits, tile, rui); if (err2 > err) { - rsi[plane].wiener_info[tile_idx].hfilter[p] -= s; - rsi[plane].wiener_info[tile_idx].hfilter[WIENER_WIN - p - 1] -= s; - rsi[plane].wiener_info[tile_idx].hfilter[WIENER_HALFWIN] += 2 * s; + plane_wiener->hfilter[p] -= s; + plane_wiener->hfilter[WIENER_WIN - p - 1] -= s; + 
plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s; } else { err = err2; // At the highest step size continue moving in the same direction @@ -1123,16 +1006,15 @@ static int64_t finer_tile_search_wiener(const YV12_BUFFER_CONFIG *src, for (int p = plane_off; p < WIENER_HALFWIN; ++p) { int skip = 0; do { - if (rsi[plane].wiener_info[tile_idx].vfilter[p] - s >= tap_min[p]) { - rsi[plane].wiener_info[tile_idx].vfilter[p] -= s; - rsi[plane].wiener_info[tile_idx].vfilter[WIENER_WIN - p - 1] -= s; - rsi[plane].wiener_info[tile_idx].vfilter[WIENER_HALFWIN] += 2 * s; - err2 = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame, - tile_idx, dst_frame); + if (plane_wiener->vfilter[p] - s >= tap_min[p]) { + plane_wiener->vfilter[p] -= s; + plane_wiener->vfilter[WIENER_WIN - p - 1] -= s; + plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s; + err2 = try_restoration_unit(rsc, limits, tile, rui); if (err2 > err) { - rsi[plane].wiener_info[tile_idx].vfilter[p] += s; - rsi[plane].wiener_info[tile_idx].vfilter[WIENER_WIN - p - 1] += s; - rsi[plane].wiener_info[tile_idx].vfilter[WIENER_HALFWIN] -= 2 * s; + plane_wiener->vfilter[p] += s; + plane_wiener->vfilter[WIENER_WIN - p - 1] += s; + plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s; } else { err = err2; skip = 1; @@ -1144,16 +1026,15 @@ static int64_t finer_tile_search_wiener(const YV12_BUFFER_CONFIG *src, } while (1); if (skip) break; do { - if (rsi[plane].wiener_info[tile_idx].vfilter[p] + s <= tap_max[p]) { - rsi[plane].wiener_info[tile_idx].vfilter[p] += s; - rsi[plane].wiener_info[tile_idx].vfilter[WIENER_WIN - p - 1] += s; - rsi[plane].wiener_info[tile_idx].vfilter[WIENER_HALFWIN] -= 2 * s; - err2 = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame, - tile_idx, dst_frame); + if (plane_wiener->vfilter[p] + s <= tap_max[p]) { + plane_wiener->vfilter[p] += s; + plane_wiener->vfilter[WIENER_WIN - p - 1] += s; + plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s; + err2 = try_restoration_unit(rsc, limits, tile, rui); if (err2 > err) { - rsi[plane].wiener_info[tile_idx].vfilter[p] -= s; - rsi[plane].wiener_info[tile_idx].vfilter[WIENER_WIN - p - 1] -= s; - rsi[plane].wiener_info[tile_idx].vfilter[WIENER_HALFWIN] += 2 * s; + plane_wiener->vfilter[p] -= s; + plane_wiener->vfilter[WIENER_WIN - p - 1] -= s; + plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s; } else { err = err2; // At the highest step size continue moving in the same direction @@ -1169,372 +1050,264 @@ static int64_t finer_tile_search_wiener(const YV12_BUFFER_CONFIG *src, return err; } -static void search_wiener_for_rtile(const struct rest_search_ctxt *ctxt, - int rtile_idx, - const RestorationTileLimits *limits, - void *arg) { - const MACROBLOCK *const x = &ctxt->cpi->td.mb; - const AV1_COMMON *const cm = &ctxt->cpi->common; - RestorationInfo *rsi = ctxt->cpi->rst_search; +static void search_wiener(const RestorationTileLimits *limits, + const AV1PixelRect *tile_rect, int rest_unit_idx, + void *priv, int32_t *tmpbuf, + RestorationLineBuffers *rlbs) { + (void)tmpbuf; + (void)rlbs; + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx]; const int wiener_win = - (ctxt->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA; + (rsc->plane == AOM_PLANE_Y) ? 
WIENER_WIN : WIENER_WIN_CHROMA; double M[WIENER_WIN2]; double H[WIENER_WIN2 * WIENER_WIN2]; double vfilterd[WIENER_WIN], hfilterd[WIENER_WIN]; - WienerInfo *ref_wiener_info = (WienerInfo *)arg; - - int64_t err = - sse_restoration_tile(ctxt->src, cm->frame_to_show, cm, limits->h_start, - limits->h_end - limits->h_start, limits->v_start, - limits->v_end - limits->v_start, (1 << ctxt->plane)); - // #bits when a tile is not restored - int bits = av1_cost_bit(RESTORE_NONE_WIENER_PROB, 0); - double cost_norestore = RDCOST_DBL(x->rdmult, (bits >> 4), err); - ctxt->best_tile_cost[rtile_idx] = INT64_MAX; - -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) - compute_stats_highbd(wiener_win, ctxt->dgd_buffer, ctxt->src_buffer, + const AV1_COMMON *const cm = rsc->cm; + if (cm->use_highbitdepth) { + compute_stats_highbd(wiener_win, rsc->dgd_buffer, rsc->src_buffer, limits->h_start, limits->h_end, limits->v_start, - limits->v_end, ctxt->dgd_stride, ctxt->src_stride, M, - H); - else -#endif // CONFIG_HIGHBITDEPTH - compute_stats(wiener_win, ctxt->dgd_buffer, ctxt->src_buffer, - limits->h_start, limits->h_end, limits->v_start, - limits->v_end, ctxt->dgd_stride, ctxt->src_stride, M, H); + limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H); + } else { + compute_stats(wiener_win, rsc->dgd_buffer, rsc->src_buffer, limits->h_start, + limits->h_end, limits->v_start, limits->v_end, + rsc->dgd_stride, rsc->src_stride, M, H); + } - ctxt->type[rtile_idx] = RESTORE_WIENER; + const MACROBLOCK *const x = rsc->x; + const int64_t bits_none = x->wiener_restore_cost[0]; if (!wiener_decompose_sep_sym(wiener_win, M, H, vfilterd, hfilterd)) { - ctxt->type[rtile_idx] = RESTORE_NONE; + rsc->bits += bits_none; + rsc->sse += rusi->sse[RESTORE_NONE]; + rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE; + rusi->sse[RESTORE_WIENER] = INT64_MAX; return; } - RestorationInfo *plane_rsi = &rsi[ctxt->plane]; - WienerInfo *rtile_wiener_info = &plane_rsi->wiener_info[rtile_idx]; - quantize_sym_filter(wiener_win, vfilterd, rtile_wiener_info->vfilter); - quantize_sym_filter(wiener_win, hfilterd, rtile_wiener_info->hfilter); + RestorationUnitInfo rui; + memset(&rui, 0, sizeof(rui)); + rui.restoration_type = RESTORE_WIENER; + quantize_sym_filter(wiener_win, vfilterd, rui.wiener_info.vfilter); + quantize_sym_filter(wiener_win, hfilterd, rui.wiener_info.hfilter); // Filter score computes the value of the function x'*A*x - x'*b for the // learned filter and compares it against identity filter. 
If there is no // reduction in the function, the filter is reverted back to identity - double score = compute_score(wiener_win, M, H, rtile_wiener_info->vfilter, - rtile_wiener_info->hfilter); - if (score > 0.0) { - ctxt->type[rtile_idx] = RESTORE_NONE; + if (compute_score(wiener_win, M, H, rui.wiener_info.vfilter, + rui.wiener_info.hfilter) > 0) { + rsc->bits += bits_none; + rsc->sse += rusi->sse[RESTORE_NONE]; + rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE; + rusi->sse[RESTORE_WIENER] = INT64_MAX; return; } + aom_clear_system_state(); - plane_rsi->restoration_type[rtile_idx] = RESTORE_WIENER; - err = finer_tile_search_wiener(ctxt->src, ctxt->cpi, rsi, 4, ctxt->plane, - wiener_win, rtile_idx, ctxt->partial_frame, - ctxt->dst_frame); + rusi->sse[RESTORE_WIENER] = + finer_tile_search_wiener(rsc, limits, tile_rect, &rui, wiener_win); + rusi->wiener = rui.wiener_info; + if (wiener_win != WIENER_WIN) { - assert(rtile_wiener_info->vfilter[0] == 0 && - rtile_wiener_info->vfilter[WIENER_WIN - 1] == 0); - assert(rtile_wiener_info->hfilter[0] == 0 && - rtile_wiener_info->hfilter[WIENER_WIN - 1] == 0); - } - bits = count_wiener_bits(wiener_win, rtile_wiener_info, ref_wiener_info) - << AV1_PROB_COST_SHIFT; - bits += av1_cost_bit(RESTORE_NONE_WIENER_PROB, 1); - double cost_wiener = RDCOST_DBL(x->rdmult, (bits >> 4), err); - if (cost_wiener >= cost_norestore) { - ctxt->type[rtile_idx] = RESTORE_NONE; - } else { - ctxt->type[rtile_idx] = RESTORE_WIENER; - *ref_wiener_info = ctxt->info->wiener_info[rtile_idx] = *rtile_wiener_info; - ctxt->best_tile_cost[rtile_idx] = err; + assert(rui.wiener_info.vfilter[0] == 0 && + rui.wiener_info.vfilter[WIENER_WIN - 1] == 0); + assert(rui.wiener_info.hfilter[0] == 0 && + rui.wiener_info.hfilter[WIENER_WIN - 1] == 0); } - plane_rsi->restoration_type[rtile_idx] = RESTORE_NONE; -} -static double search_wiener(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, - int partial_frame, int plane, RestorationInfo *info, - RestorationType *type, int64_t *best_tile_cost, - YV12_BUFFER_CONFIG *dst_frame) { - struct rest_search_ctxt ctxt; - const int nrtiles = - init_rest_search_ctxt(src, cpi, partial_frame, plane, info, type, - best_tile_cost, dst_frame, &ctxt); - - RestorationInfo *plane_rsi = &cpi->rst_search[plane]; - plane_rsi->frame_restoration_type = RESTORE_WIENER; - for (int tile_idx = 0; tile_idx < nrtiles; ++tile_idx) { - plane_rsi->restoration_type[tile_idx] = RESTORE_NONE; - } + const int64_t bits_wiener = + x->wiener_restore_cost[1] + + (count_wiener_bits(wiener_win, &rusi->wiener, &rsc->wiener) + << AV1_PROB_COST_SHIFT); - AV1_COMMON *const cm = &cpi->common; -// Construct a (WIENER_HALFWIN)-pixel border around the frame -// Note use this border to gather stats even though the actual filter -// may use less border on the top/bottom of a processing unit. -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) - extend_frame_highbd(CONVERT_TO_SHORTPTR(ctxt.dgd_buffer), ctxt.plane_width, - ctxt.plane_height, ctxt.dgd_stride, WIENER_HALFWIN, - WIENER_HALFWIN); - else -#endif - extend_frame(ctxt.dgd_buffer, ctxt.plane_width, ctxt.plane_height, - ctxt.dgd_stride, WIENER_HALFWIN, WIENER_HALFWIN); - - // Compute best Wiener filters for each rtile, one (encoder/decoder) - // tile at a time. 
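Aside on finer_tile_search_wiener above: it refines the quantized taps by coordinate descent, trying steps of s = 4, 2, 1 on each tap (mirrored across the filter, with the centre tap compensated to preserve the DC constraint) and keeping a move only when the measured error drops. A one-dimensional stand-in for that loop, with a toy quadratic error in place of try_restoration_unit:

#include <stdio.h>

static double err_of(double tap) { /* toy cost standing in for real SSE */
  return (tap - 3.7) * (tap - 3.7);
}

int main(void) {
  double tap = 0.0, err = err_of(tap);
  for (int s = 4; s >= 1; s >>= 1) { /* halve the step, as the search does */
    for (;;) {
      int moved = 0;
      if (err_of(tap - s) < err) {
        tap -= s; err = err_of(tap); moved = 1;
      } else if (err_of(tap + s) < err) {
        tap += s; err = err_of(tap); moved = 1;
      }
      if (!moved) break; /* no improvement at this step size */
    }
  }
  printf("tap = %g, err = %g\n", tap, err);
  return 0;
}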
- for (int tile_row = 0; tile_row < cm->tile_rows; ++tile_row) { - for (int tile_col = 0; tile_col < cm->tile_cols; ++tile_col) { - WienerInfo ref_wiener_info; - set_default_wiener(&ref_wiener_info); - - foreach_rtile_in_tile(&ctxt, tile_row, tile_col, search_wiener_for_rtile, - &ref_wiener_info); - } - } + double cost_none = + RDCOST_DBL(x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE]); + double cost_wiener = + RDCOST_DBL(x->rdmult, bits_wiener >> 4, rusi->sse[RESTORE_WIENER]); - // cost for Wiener filtering - WienerInfo ref_wiener_info; - set_default_wiener(&ref_wiener_info); - int bits = frame_level_restore_bits[plane_rsi->frame_restoration_type] - << AV1_PROB_COST_SHIFT; - WienerInfo *wiener_info = info->wiener_info; - const int wiener_win = - (plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA; - - for (int tile_idx = 0; tile_idx < nrtiles; ++tile_idx) { - bits += - av1_cost_bit(RESTORE_NONE_WIENER_PROB, type[tile_idx] != RESTORE_NONE); - plane_rsi->wiener_info[tile_idx] = wiener_info[tile_idx]; - - if (type[tile_idx] == RESTORE_WIENER) { - bits += count_wiener_bits(wiener_win, &plane_rsi->wiener_info[tile_idx], - &ref_wiener_info) - << AV1_PROB_COST_SHIFT; - ref_wiener_info = plane_rsi->wiener_info[tile_idx]; - } - plane_rsi->restoration_type[tile_idx] = type[tile_idx]; - } - int64_t err = try_restoration_frame(src, cpi, cpi->rst_search, 1 << plane, - partial_frame, dst_frame); - double cost_wiener = RDCOST_DBL(cpi->td.mb.rdmult, (bits >> 4), err); + RestorationType rtype = + (cost_wiener < cost_none) ? RESTORE_WIENER : RESTORE_NONE; + rusi->best_rtype[RESTORE_WIENER - 1] = rtype; - return cost_wiener; + rsc->sse += rusi->sse[rtype]; + rsc->bits += (cost_wiener < cost_none) ? bits_wiener : bits_none; + if (cost_wiener < cost_none) rsc->wiener = rusi->wiener; } -static double search_norestore(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, - int partial_frame, int plane, - RestorationInfo *info, RestorationType *type, - int64_t *best_tile_cost, - YV12_BUFFER_CONFIG *dst_frame) { - int64_t err; - double cost_norestore; - int bits; - MACROBLOCK *x = &cpi->td.mb; - AV1_COMMON *const cm = &cpi->common; - int tile_idx, tile_width, tile_height, nhtiles, nvtiles; - int width, height; - if (plane == AOM_PLANE_Y) { - width = src->y_crop_width; - height = src->y_crop_height; - } else { - width = src->uv_crop_width; - height = src->uv_crop_height; - } - const int ntiles = av1_get_rest_ntiles( - width, height, cm->rst_info[plane].restoration_tilesize, &tile_width, - &tile_height, &nhtiles, &nvtiles); - (void)info; - (void)dst_frame; - (void)partial_frame; - - info->frame_restoration_type = RESTORE_NONE; - for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) { - RestorationTileLimits limits = av1_get_rest_tile_limits( - tile_idx, nhtiles, nvtiles, tile_width, tile_height, width, -#if CONFIG_STRIPED_LOOP_RESTORATION - height, plane != AOM_PLANE_Y ? 
cm->subsampling_y : 0); -#else - height); -#endif - err = sse_restoration_tile(src, cm->frame_to_show, cm, limits.h_start, - limits.h_end - limits.h_start, limits.v_start, - limits.v_end - limits.v_start, 1 << plane); - type[tile_idx] = RESTORE_NONE; - best_tile_cost[tile_idx] = err; - } - // RD cost associated with no restoration - err = sse_restoration_frame(cm, src, cm->frame_to_show, (1 << plane)); - bits = frame_level_restore_bits[RESTORE_NONE] << AV1_PROB_COST_SHIFT; - cost_norestore = RDCOST_DBL(x->rdmult, (bits >> 4), err); - return cost_norestore; -} +static void search_norestore(const RestorationTileLimits *limits, + const AV1PixelRect *tile_rect, int rest_unit_idx, + void *priv, int32_t *tmpbuf, + RestorationLineBuffers *rlbs) { + (void)tile_rect; + (void)tmpbuf; + (void)rlbs; -struct switchable_rest_search_ctxt { - SgrprojInfo sgrproj_info; - WienerInfo wiener_info; - RestorationType *const *restore_types; - int64_t *const *tile_cost; - double cost_switchable; -}; + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx]; -static void search_switchable_for_rtile(const struct rest_search_ctxt *ctxt, - int rtile_idx, - const RestorationTileLimits *limits, - void *arg) { - const MACROBLOCK *x = &ctxt->cpi->td.mb; - RestorationInfo *rsi = &ctxt->cpi->common.rst_info[ctxt->plane]; - struct switchable_rest_search_ctxt *swctxt = - (struct switchable_rest_search_ctxt *)arg; + const int highbd = rsc->cm->use_highbitdepth; + rusi->sse[RESTORE_NONE] = sse_restoration_unit( + limits, rsc->src, rsc->cm->frame_to_show, rsc->plane, highbd); + + rsc->sse += rusi->sse[RESTORE_NONE]; +} +static void search_switchable(const RestorationTileLimits *limits, + const AV1PixelRect *tile_rect, int rest_unit_idx, + void *priv, int32_t *tmpbuf, + RestorationLineBuffers *rlbs) { (void)limits; + (void)tile_rect; + (void)tmpbuf; + (void)rlbs; + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx]; - double best_cost = - RDCOST_DBL(x->rdmult, (x->switchable_restore_cost[RESTORE_NONE] >> 4), - swctxt->tile_cost[RESTORE_NONE][rtile_idx]); - rsi->restoration_type[rtile_idx] = RESTORE_NONE; - for (RestorationType r = 1; r < RESTORE_SWITCHABLE_TYPES; r++) { - if (force_restore_type != RESTORE_TYPES) - if (r != force_restore_type) continue; - int tilebits = 0; - if (swctxt->restore_types[r][rtile_idx] != r) continue; - if (r == RESTORE_WIENER) - tilebits += count_wiener_bits( - (ctxt->plane == AOM_PLANE_Y ? WIENER_WIN : WIENER_WIN - 2), - &rsi->wiener_info[rtile_idx], &swctxt->wiener_info); - else if (r == RESTORE_SGRPROJ) - tilebits += count_sgrproj_bits(&rsi->sgrproj_info[rtile_idx], - &swctxt->sgrproj_info); - tilebits <<= AV1_PROB_COST_SHIFT; - tilebits += x->switchable_restore_cost[r]; - double cost = - RDCOST_DBL(x->rdmult, tilebits >> 4, swctxt->tile_cost[r][rtile_idx]); - - if (cost < best_cost) { - rsi->restoration_type[rtile_idx] = r; - best_cost = cost; + const MACROBLOCK *const x = rsc->x; + + const int wiener_win = + (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA; + + double best_cost = 0; + int64_t best_bits = 0; + RestorationType best_rtype = RESTORE_NONE; + + for (RestorationType r = 0; r < RESTORE_SWITCHABLE_TYPES; ++r) { + // Check for the condition that wiener or sgrproj search could not + // find a solution or the solution was worse than RESTORE_NONE. + // In either case the best_rtype will be set as RESTORE_NONE. These + // should be skipped from the test below. 
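Aside: search_switchable, continued below, is an argmin over the available restoration types of a rate-distortion cost of the form lambda * bits + sse, with dual-radius SGRPROJ choices additionally inflated by DUAL_SGR_PENALTY_MULT times the speed-feature level. A toy version of the selection (made-up numbers and lambda; the real code uses the fixed-point RDCOST_DBL macro):

#include <stdio.h>

int main(void) {
  const char *names[3] = { "NONE", "WIENER", "SGRPROJ" };
  const long long sse[3] = { 10000, 7600, 7900 }; /* per-type distortion */
  const long long bits[3] = { 2, 210, 140 };      /* per-type signalling */
  const double lambda = 8.0, dual_sgr_penalty = 1.01;
  int best = 0;
  double best_cost = 0;
  for (int r = 0; r < 3; ++r) {
    double cost = lambda * (double)bits[r] + (double)sse[r];
    if (r == 2) cost *= dual_sgr_penalty; /* stands in for the ep < 10 case */
    if (r == 0 || cost < best_cost) { best_cost = cost; best = r; }
  }
  printf("best = %s (cost %.1f)\n", names[best], best_cost);
  return 0;
}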
+ if (r > RESTORE_NONE) { + if (rusi->best_rtype[r - 1] == RESTORE_NONE) continue; } - } - if (rsi->restoration_type[rtile_idx] == RESTORE_WIENER) - swctxt->wiener_info = rsi->wiener_info[rtile_idx]; - else if (rsi->restoration_type[rtile_idx] == RESTORE_SGRPROJ) - swctxt->sgrproj_info = rsi->sgrproj_info[rtile_idx]; - if (force_restore_type != RESTORE_TYPES) - assert(rsi->restoration_type[rtile_idx] == force_restore_type || - rsi->restoration_type[rtile_idx] == RESTORE_NONE); - swctxt->cost_switchable += best_cost; -} -static double search_switchable_restoration( - const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, int partial_frame, int plane, - RestorationType *const restore_types[RESTORE_SWITCHABLE_TYPES], - int64_t *const tile_cost[RESTORE_SWITCHABLE_TYPES], RestorationInfo *rsi) { - const AV1_COMMON *const cm = &cpi->common; - struct rest_search_ctxt ctxt; - init_rest_search_ctxt(src, cpi, partial_frame, plane, NULL, NULL, NULL, NULL, - &ctxt); - struct switchable_rest_search_ctxt swctxt; - swctxt.restore_types = restore_types; - swctxt.tile_cost = tile_cost; - - rsi->frame_restoration_type = RESTORE_SWITCHABLE; - int bits = frame_level_restore_bits[rsi->frame_restoration_type] - << AV1_PROB_COST_SHIFT; - swctxt.cost_switchable = RDCOST_DBL(cpi->td.mb.rdmult, bits >> 4, 0); - - for (int tile_row = 0; tile_row < cm->tile_rows; ++tile_row) { - for (int tile_col = 0; tile_col < cm->tile_cols; ++tile_col) { - set_default_sgrproj(&swctxt.sgrproj_info); - set_default_wiener(&swctxt.wiener_info); - foreach_rtile_in_tile(&ctxt, tile_row, tile_col, - search_switchable_for_rtile, &swctxt); + const int64_t sse = rusi->sse[r]; + int64_t coeff_pcost = 0; + switch (r) { + case RESTORE_NONE: coeff_pcost = 0; break; + case RESTORE_WIENER: + coeff_pcost = + count_wiener_bits(wiener_win, &rusi->wiener, &rsc->wiener); + break; + case RESTORE_SGRPROJ: + coeff_pcost = count_sgrproj_bits(&rusi->sgrproj, &rsc->sgrproj); + break; + default: assert(0); break; + } + const int64_t coeff_bits = coeff_pcost << AV1_PROB_COST_SHIFT; + const int64_t bits = x->switchable_restore_cost[r] + coeff_bits; + double cost = RDCOST_DBL(x->rdmult, bits >> 4, sse); + if (r == RESTORE_SGRPROJ && rusi->sgrproj.ep < 10) + cost *= (1 + DUAL_SGR_PENALTY_MULT * rsc->sf->dual_sgr_penalty_level); + if (r == 0 || cost < best_cost) { + best_cost = cost; + best_bits = bits; + best_rtype = r; } } - return swctxt.cost_switchable; + rusi->best_rtype[RESTORE_SWITCHABLE - 1] = best_rtype; + + rsc->sse += rusi->sse[best_rtype]; + rsc->bits += best_bits; + if (best_rtype == RESTORE_WIENER) rsc->wiener = rusi->wiener; + if (best_rtype == RESTORE_SGRPROJ) rsc->sgrproj = rusi->sgrproj; } -void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, - LPF_PICK_METHOD method) { - static search_restore_type search_restore_fun[RESTORE_SWITCHABLE_TYPES] = { - search_norestore, search_wiener, search_sgrproj, +static void copy_unit_info(RestorationType frame_rtype, + const RestUnitSearchInfo *rusi, + RestorationUnitInfo *rui) { + assert(frame_rtype > 0); + rui->restoration_type = rusi->best_rtype[frame_rtype - 1]; + if (rui->restoration_type == RESTORE_WIENER) + rui->wiener_info = rusi->wiener; + else + rui->sgrproj_info = rusi->sgrproj; +} + +static double search_rest_type(RestSearchCtxt *rsc, RestorationType rtype) { + static const rest_unit_visitor_t funs[RESTORE_TYPES] = { + search_norestore, search_wiener, search_sgrproj, search_switchable }; - AV1_COMMON *const cm = &cpi->common; - double cost_restore[RESTORE_TYPES]; - int64_t 
*tile_cost[RESTORE_SWITCHABLE_TYPES]; - RestorationType *restore_types[RESTORE_SWITCHABLE_TYPES]; - double best_cost_restore; - RestorationType r, best_restore; - const int ywidth = src->y_crop_width; - const int yheight = src->y_crop_height; - const int uvwidth = src->uv_crop_width; - const int uvheight = src->uv_crop_height; - - const int ntiles_y = - av1_get_rest_ntiles(ywidth, yheight, cm->rst_info[0].restoration_tilesize, - NULL, NULL, NULL, NULL); - const int ntiles_uv = av1_get_rest_ntiles( - uvwidth, uvheight, cm->rst_info[1].restoration_tilesize, NULL, NULL, NULL, - NULL); - - // Assume ntiles_uv is never larger that ntiles_y and so the same arrays work. - for (r = 0; r < RESTORE_SWITCHABLE_TYPES; r++) { - tile_cost[r] = (int64_t *)aom_malloc(sizeof(*tile_cost[0]) * ntiles_y); - restore_types[r] = - (RestorationType *)aom_malloc(sizeof(*restore_types[0]) * ntiles_y); - } - for (int plane = AOM_PLANE_Y; plane <= AOM_PLANE_V; ++plane) { - for (r = 0; r < RESTORE_SWITCHABLE_TYPES; ++r) { - cost_restore[r] = DBL_MAX; - if (force_restore_type != RESTORE_TYPES) - if (r != RESTORE_NONE && r != force_restore_type) continue; - cost_restore[r] = - search_restore_fun[r](src, cpi, method == LPF_PICK_FROM_SUBIMAGE, - plane, &cm->rst_info[plane], restore_types[r], - tile_cost[r], &cpi->trial_frame_rst); - } - if (plane == AOM_PLANE_Y) - cost_restore[RESTORE_SWITCHABLE] = search_switchable_restoration( - src, cpi, method == LPF_PICK_FROM_SUBIMAGE, plane, restore_types, - tile_cost, &cm->rst_info[plane]); - else - cost_restore[RESTORE_SWITCHABLE] = DBL_MAX; - best_cost_restore = DBL_MAX; - best_restore = 0; - for (r = 0; r < RESTORE_TYPES; ++r) { - if (force_restore_type != RESTORE_TYPES) - if (r != RESTORE_NONE && r != force_restore_type) continue; - if (cost_restore[r] < best_cost_restore) { - best_restore = r; - best_cost_restore = cost_restore[r]; + reset_rsc(rsc); + rsc_on_tile(LR_TILE_ROW, LR_TILE_COL, rsc); + av1_foreach_rest_unit_in_plane(rsc->cm, rsc->plane, funs[rtype], rsc, + &rsc->tile_rect, rsc->cm->rst_tmpbuf, NULL); + return RDCOST_DBL(rsc->x->rdmult, rsc->bits >> 4, rsc->sse); +} + +static int rest_tiles_in_plane(const AV1_COMMON *cm, int plane) { + const RestorationInfo *rsi = &cm->rst_info[plane]; + return rsi->units_per_tile; +} + +void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + assert(!cm->all_lossless); + + int ntiles[2]; + for (int is_uv = 0; is_uv < 2; ++is_uv) + ntiles[is_uv] = rest_tiles_in_plane(cm, is_uv); + + assert(ntiles[1] <= ntiles[0]); + RestUnitSearchInfo *rusi = + (RestUnitSearchInfo *)aom_memalign(16, sizeof(*rusi) * ntiles[0]); + + // If the restoration unit dimensions are not multiples of + // rsi->restoration_unit_size then some elements of the rusi array may be + // left uninitialised when we reach copy_unit_info(...). This is not a + // problem, as these elements are ignored later, but in order to quiet + // Valgrind's warnings we initialise the array below. + memset(rusi, 0, sizeof(*rusi) * ntiles[0]); + + RestSearchCtxt rsc; + const int plane_start = AOM_PLANE_Y; + const int plane_end = num_planes > 1 ? AOM_PLANE_V : AOM_PLANE_Y; + for (int plane = plane_start; plane <= plane_end; ++plane) { + init_rsc(src, &cpi->common, &cpi->td.mb, &cpi->sf, plane, rusi, + &cpi->trial_frame_rst, &rsc); + + const int plane_ntiles = ntiles[plane > 0]; + const RestorationType num_rtypes = + (plane_ntiles > 1) ? 
RESTORE_TYPES : RESTORE_SWITCHABLE_TYPES; + + double best_cost = 0; + RestorationType best_rtype = RESTORE_NONE; + + const int highbd = rsc.cm->use_highbitdepth; + extend_frame(rsc.dgd_buffer, rsc.plane_width, rsc.plane_height, + rsc.dgd_stride, RESTORATION_BORDER, RESTORATION_BORDER, + highbd); + + for (RestorationType r = 0; r < num_rtypes; ++r) { + if ((force_restore_type != RESTORE_TYPES) && (r != RESTORE_NONE) && + (r != force_restore_type)) + continue; + + double cost = search_rest_type(&rsc, r); + + if (r == 0 || cost < best_cost) { + best_cost = cost; + best_rtype = r; } } - cm->rst_info[plane].frame_restoration_type = best_restore; + + cm->rst_info[plane].frame_restoration_type = best_rtype; if (force_restore_type != RESTORE_TYPES) - assert(best_restore == force_restore_type || - best_restore == RESTORE_NONE); - if (best_restore != RESTORE_SWITCHABLE) { - const int nt = (plane == AOM_PLANE_Y ? ntiles_y : ntiles_uv); - memcpy(cm->rst_info[plane].restoration_type, restore_types[best_restore], - nt * sizeof(restore_types[best_restore][0])); + assert(best_rtype == force_restore_type || best_rtype == RESTORE_NONE); + + if (best_rtype != RESTORE_NONE) { + for (int u = 0; u < plane_ntiles; ++u) { + copy_unit_info(best_rtype, &rusi[u], &cm->rst_info[plane].unit_info[u]); + } } } - /* - printf("Frame %d/%d restore types: %d %d %d\n", cm->current_video_frame, - cm->show_frame, cm->rst_info[0].frame_restoration_type, - cm->rst_info[1].frame_restoration_type, - cm->rst_info[2].frame_restoration_type); - printf("Frame %d/%d frame_restore_type %d : %f %f %f %f\n", - cm->current_video_frame, cm->show_frame, - cm->rst_info[0].frame_restoration_type, cost_restore[0], - cost_restore[1], cost_restore[2], cost_restore[3]); - */ - - for (r = 0; r < RESTORE_SWITCHABLE_TYPES; r++) { - aom_free(tile_cost[r]); - aom_free(restore_types[r]); - } + + aom_free(rusi); } diff --git a/third_party/aom/av1/encoder/pickrst.h b/third_party/aom/av1/encoder/pickrst.h index f6096ed1d..179b89ff9 100644 --- a/third_party/aom/av1/encoder/pickrst.h +++ b/third_party/aom/av1/encoder/pickrst.h @@ -20,8 +20,7 @@ extern "C" { struct yv12_buffer_config; struct AV1_COMP; -void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, - LPF_PICK_METHOD method); +void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/encoder/pustats.h b/third_party/aom/av1/encoder/pustats.h new file mode 100644 index 000000000..ef333b6d8 --- /dev/null +++ b/third_party/aom/av1/encoder/pustats.h @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AV1_ENCODER_PUSTATS_H_ +#define AV1_ENCODER_PUSTATS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/ml.h" + +#define NUM_FEATURES 20 +#define NUM_HIDDEN_LAYERS 2 +#define HIDDEN_LAYERS_0_NODES 10 +#define HIDDEN_LAYERS_1_NODES 10 +#define LOGITS_NODES 1 + +static const float + av1_pustats_rate_hiddenlayer_0_kernel[NUM_FEATURES * + HIDDEN_LAYERS_0_NODES] = { + 13.8498f, 19.6630f, 13.3036f, 5.2448f, -18.0270f, 21.6671f, + -0.2135f, -0.0060f, 0.1211f, -0.3549f, -0.3550f, 0.0190f, + 0.0167f, -0.1192f, 0.2003f, 8.6663f, 32.0264f, 9.9558f, + 9.0935f, -110.4994f, 51.8056f, 64.8041f, 58.5392f, 53.0189f, + -61.6300f, 4.7540f, -0.0140f, 0.0185f, -15.8050f, 0.0790f, + 0.0707f, 0.0784f, 0.0766f, -0.3030f, 0.0392f, 49.3312f, + 63.3326f, 61.4025f, 54.2723f, -62.2769f, -147.1736f, -84.9432f, + -82.5422f, -70.4857f, 46.7622f, -1.0285f, -0.4809f, 0.0068f, + 1.0888f, -0.0515f, -0.0384f, -0.0232f, -0.0396f, 0.2429f, + 0.2040f, -144.4016f, -88.0868f, -80.3134f, -70.6685f, 66.8528f, + -53.8097f, -45.4011f, -52.8680f, -58.7226f, 99.7830f, 2.3728f, + 0.0229f, 0.0002f, -0.3288f, -0.0563f, -0.0550f, -0.0552f, + -0.0563f, 0.2214f, 0.0139f, -60.8965f, -45.5251f, -50.4188f, + -51.5623f, 85.7369f, 77.3415f, 47.4930f, 53.8120f, 58.2311f, + -45.9650f, -2.4938f, 0.1639f, -0.5270f, -75.4622f, -0.0026f, + 0.0031f, 0.0047f, 0.0015f, 0.0092f, 0.0654f, 75.6402f, + 54.7447f, 54.8156f, 52.6834f, -9.1246f, -34.0108f, -35.6423f, + -34.2911f, -38.5444f, 72.1123f, 10.9750f, -0.1595f, 0.1983f, + 22.5724f, -0.0556f, -0.0618f, -0.0571f, -0.0608f, 0.2439f, + -0.0805f, -32.5107f, -28.9688f, -33.7284f, -48.1365f, 61.5297f, + 39.2492f, -35.1928f, -11.5000f, 7.7038f, -94.2469f, 13.5586f, + 0.7541f, 0.0105f, 4.4041f, 0.1799f, 0.1339f, 0.1567f, + -0.6668f, -0.7384f, 0.2185f, 17.1700f, -26.4601f, -1.8970f, + 38.9635f, -30.1916f, 31.8139f, 14.6157f, 10.0565f, 3.3340f, + -40.6985f, -2.1186f, 0.0116f, 0.0962f, 0.7115f, -1.4071f, + -1.3701f, -1.4728f, -1.3404f, -1.7286f, 5.5632f, 28.4998f, + 5.4087f, 16.2668f, 11.8693f, -39.4153f, 106.3281f, 38.3075f, + 39.4933f, 47.3805f, -15.0514f, -21.2421f, -0.2358f, -0.0024f, + 0.3505f, -0.0429f, -0.0377f, -0.0322f, -0.0344f, 0.2020f, + 0.1417f, 99.6711f, 35.3896f, 43.1117f, 59.8879f, -17.8250f, + -16.6976f, 18.5100f, 6.3383f, 25.3020f, -55.8824f, 25.1027f, + -0.9926f, -0.0738f, -1.4892f, 0.0269f, -0.0051f, -5.8168f, + -0.0579f, -0.1500f, 0.7224f, 8.3066f, -3.8805f, -12.1482f, + 14.3492f, -20.8118f, + }; + +static const float av1_pustats_rate_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = + { + 17.6566f, 62.2217f, -107.2644f, -56.2255f, 68.2252f, + -37.5662f, 9.587f, 18.5206f, 69.6873f, 4.3903f, + }; + +static const float + av1_pustats_rate_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES * + HIDDEN_LAYERS_1_NODES] = { + -0.0494f, 0.3505f, -0.0461f, -1.3451f, 0.0198f, -0.0746f, -0.2217f, + -0.9525f, 0.0633f, -0.0737f, -0.3568f, 1.8569f, -0.0189f, -1.8269f, + 0.6281f, -1.3266f, -0.9202f, 2.8978f, -0.6437f, -0.8709f, -1.5066f, + -1.0582f, -1.9509f, -0.0417f, -0.1315f, -0.3368f, 0.0014f, -0.5734f, + -1.4640f, -1.6042f, 3.3911f, -1.6815f, -1.9026f, -4.8702f, -0.1012f, + -1.4517f, -3.2156f, 0.8448f, 0.2331f, -0.1593f, 2.6627f, -0.8451f, + -1.7382f, 0.9303f, 2.3003f, -0.0659f, 0.5772f, 0.4253f, 0.2083f, + 0.3649f, -0.9198f, -0.2183f, -0.5381f, -1.0831f, 2.0359f, 0.0040f, + -0.0871f, -0.1715f, 2.2453f, 0.5099f, -0.5900f, -0.6313f, -1.3028f, + -1.7257f, 1.4130f, -0.7189f, -0.4336f, 1.9266f, 1.7495f, -0.3321f, + 0.2827f, 0.4015f, -0.5044f, -1.0420f, -0.1258f, -0.0342f, -0.1190f, + 
-3.1263f, 0.7485f, -0.3161f, -0.2224f, 2.5533f, -0.2121f, -1.3389f, + 0.5556f, -0.9407f, -0.7456f, 1.4137f, -0.0353f, -0.0521f, 2.4382f, + 0.1493f, -11.5631f, -1.6178f, 3.5538f, -3.6538f, -0.5972f, -3.0038f, + -2.1640f, 0.5754f, + }; + +static const float av1_pustats_rate_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = + { + 69.1995f, 41.7369f, -1.4885f, -35.785f, 26.1678f, + 58.4472f, 36.2223f, 66.327f, 50.8867f, 2.8306f, + }; + +static const float + av1_pustats_rate_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = { + 1.811f, 0.9009f, 0.0694f, -0.9985f, -0.039f, + 0.2076f, 0.5643f, 0.5408f, 0.6071f, 0.277f, + }; + +static const float av1_pustats_rate_logits_bias[LOGITS_NODES] = { + 39.5529f, +}; + +static const NN_CONFIG av1_pustats_rate_nnconfig = { + NUM_FEATURES, // num_inputs + LOGITS_NODES, // num_outputs + NUM_HIDDEN_LAYERS, // num_hidden_layers + { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes + { + av1_pustats_rate_hiddenlayer_0_kernel, + av1_pustats_rate_hiddenlayer_1_kernel, + av1_pustats_rate_logits_kernel, + }, + { + av1_pustats_rate_hiddenlayer_0_bias, + av1_pustats_rate_hiddenlayer_1_bias, + av1_pustats_rate_logits_bias, + }, +}; + +static const float + av1_pustats_dist_hiddenlayer_0_kernel[NUM_FEATURES * + HIDDEN_LAYERS_0_NODES] = { + -39.0787f, -212.9998f, -174.2088f, -264.1454f, 292.7151f, -60.8750f, + -5.9915f, 0.0712f, -60.2312f, -0.2020f, -0.2135f, -0.1663f, + -0.0711f, 0.2267f, 0.9152f, -36.1294f, -159.9320f, -222.9809f, + -270.2556f, 300.7162f, 159.9224f, -172.5735f, -7.6852f, 54.3985f, + 110.6721f, 19.2907f, -15.1039f, -0.0457f, 0.3289f, 0.4529f, + -8.2222f, 1.3213f, -0.8378f, -0.2605f, 3.9600f, 17.3407f, + 113.1116f, 34.6326f, 11.6688f, 109.3541f, 240.8123f, 45.0615f, + 80.7443f, 39.2500f, -21.0931f, -27.1989f, -0.4264f, -0.1345f, + 1.6269f, -0.0716f, 0.0989f, -0.1382f, 0.0248f, 0.0913f, + 4.3903f, 244.1014f, 32.2567f, 58.6171f, 62.2273f, -2.8647f, + -227.5659f, 16.0031f, -70.5256f, 23.8071f, 290.7356f, 13.6094f, + -2.1842f, 0.0104f, -2.8760f, 0.3708f, 0.8501f, -3.2964f, + -0.2088f, -0.4474f, 1.2248f, 40.5180f, -130.7891f, -188.1583f, + -174.0906f, 205.9622f, 0.3425f, 0.2531f, 0.2822f, 0.0488f, + 0.1416f, -0.0433f, -0.1195f, -0.0413f, -0.0708f, -0.0787f, + -0.0889f, -0.4022f, -0.5055f, -0.4715f, 0.2315f, 0.1021f, + -0.3676f, -0.3499f, -0.0715f, 0.1913f, 205.7521f, 125.2265f, + 92.0640f, 77.5566f, -164.4280f, -19.3715f, -0.1346f, -0.4060f, + 0.5042f, -0.2395f, -0.1329f, -0.1397f, 0.2175f, 0.2895f, + 5.5019f, 198.9799f, 114.0018f, 94.9015f, 86.8434f, -183.4237f, + 121.5626f, 94.8945f, 65.0803f, 93.6487f, -346.5279f, -47.6168f, + 0.0633f, 0.0135f, -0.0692f, -0.1015f, -0.1146f, -0.1341f, + -0.1175f, 0.4186f, 0.1505f, 130.7402f, 107.8443f, 62.8497f, + 65.3501f, -312.7407f, 282.8321f, 98.1531f, 75.6648f, 25.8733f, + -176.9298f, -37.2695f, -0.3760f, 0.0017f, 0.1030f, -0.1483f, + 0.0787f, -0.0962f, 0.4109f, -0.2292f, 9.1681f, 274.3607f, + 60.9538f, 75.9405f, 68.3776f, -167.3098f, -335.1045f, -69.2583f, + -76.3441f, -16.5793f, 218.5244f, 28.2405f, 0.9169f, -0.0026f, + -0.8077f, -1.5756f, -0.0804f, 0.1404f, 1.2656f, 0.0272f, + -0.2529f, -340.8659f, -112.7778f, -58.3890f, -4.1224f, 108.1709f, + -180.7382f, -93.7114f, -77.8686f, -131.8134f, 353.3893f, 4.8233f, + 0.0205f, 0.0000f, -1.1654f, -0.0161f, -0.0255f, -0.0358f, + -0.0412f, 0.1103f, 0.1041f, -188.9934f, -110.1792f, -88.6301f, + -93.7226f, 336.9746f, + }; + +static const float av1_pustats_dist_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = + { -175.6918f, 43.4519f, 154.196f, -81.1015f, -0.0758f, + 
136.5695f, 110.8713f, 142.029f, -153.0901f, -145.2688f }; + +static const float + av1_pustats_dist_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES * + HIDDEN_LAYERS_1_NODES] = { + -0.1727f, -0.2859f, -0.3757f, -0.4260f, -0.5441f, -0.0666f, -0.3792f, + -0.1335f, -0.1521f, -0.0821f, -3.1590f, 0.2711f, 0.5889f, 0.0878f, + 0.4693f, 0.7773f, -9.2989f, 0.0414f, 0.4485f, 22.8958f, -3.7024f, + -2.4672f, -43.2908f, 0.0956f, 0.4431f, 2.3429f, 1.7183f, 0.3985f, + -0.2275f, -3.1583f, -0.3485f, 0.3280f, 0.3763f, 0.2069f, 0.4231f, + 0.7366f, -6.9527f, 0.0713f, 0.1359f, 16.6500f, -1.7655f, -0.1651f, + 0.1280f, -0.2678f, -0.2120f, 1.6243f, 1.8773f, -0.7543f, -0.3292f, + -0.7627f, -0.2001f, -0.1125f, -0.8100f, -0.1866f, 0.0567f, -0.4002f, + 3.2429f, 0.6427f, -0.3759f, -11.6518f, -2.2893f, 0.7708f, -1.8637f, + 1.7148f, 0.3124f, -0.7129f, -0.4927f, 0.1964f, -0.2570f, -25.0783f, + 2.5061f, 0.1457f, -1.1239f, 0.0570f, -0.2526f, -0.0669f, 0.6791f, + 1.1531f, -0.7246f, -0.3180f, -0.0015f, -0.0061f, -0.1626f, -0.0181f, + 0.1271f, -0.0140f, -0.6027f, 0.0736f, -0.0157f, 1.2420f, -6.4055f, + 0.2128f, -0.0386f, 0.3446f, 0.1840f, -0.7208f, -1.6979f, -0.0442f, + 0.3230f, -1.9745f, + }; + +static const float av1_pustats_dist_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = + { 0.f, 70.3414f, 9.6036f, -118.1096f, 49.2507f, + 95.1849f, 81.8015f, 167.0967f, -337.7945f, 169.8344f }; + +static const float + av1_pustats_dist_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = { + -0.3627f, 1.2272f, 0.2201f, -1.7406f, -0.6885f, + 0.8487f, -0.2761f, 0.7731f, -5.2096f, -0.7351f, + }; + +static const float av1_pustats_dist_logits_bias[LOGITS_NODES] = { + 48.2331f, +}; + +static const NN_CONFIG av1_pustats_dist_nnconfig = { + NUM_FEATURES, // num_inputs + LOGITS_NODES, // num_outputs + NUM_HIDDEN_LAYERS, // num_hidden_layers + { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes + { + av1_pustats_dist_hiddenlayer_0_kernel, + av1_pustats_dist_hiddenlayer_1_kernel, + av1_pustats_dist_logits_kernel, + }, + { + av1_pustats_dist_hiddenlayer_0_bias, + av1_pustats_dist_hiddenlayer_1_bias, + av1_pustats_dist_logits_bias, + }, +}; + +#undef NUM_FEATURES +#undef NUM_HIDDEN_LAYERS +#undef HIDDEN_LAYERS_0_NODES +#undef HIDDEN_LAYERS_1_NODES +#undef LOGITS_NODES + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_PUSTATS_H_ diff --git a/third_party/aom/av1/encoder/pvq_encoder.c b/third_party/aom/av1/encoder/pvq_encoder.c deleted file mode 100644 index 9d5133012..000000000 --- a/third_party/aom/av1/encoder/pvq_encoder.c +++ /dev/null @@ -1,988 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -/* clang-format off */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include <math.h> -#include <stdio.h> -#include <stdlib.h> -#include "aom_dsp/entcode.h" -#include "aom_dsp/entenc.h" -#include "av1/common/blockd.h" -#include "av1/common/odintrin.h" -#include "av1/common/partition.h" -#include "av1/common/pvq_state.h" -#include "av1/encoder/encodemb.h" -#include "av1/encoder/pvq_encoder.h" -#include "aom_ports/system_state.h" - -/*Shift to ensure that the upper bound (i.e. for the max blocksize) of the - dot-product of the 1st band of chroma with the luma ref doesn't overflow.*/ -#define OD_CFL_FLIP_SHIFT (OD_LIMIT_BSIZE_MAX + 0) - -void aom_write_symbol_pvq(aom_writer *w, int symb, aom_cdf_prob *cdf, - int nsymbs) { - if (cdf[0] == 0) - aom_cdf_init_q15_1D(cdf, nsymbs, CDF_SIZE(nsymbs)); - aom_write_symbol(w, symb, cdf, nsymbs); -} - -static void aom_encode_pvq_codeword(aom_writer *w, od_pvq_codeword_ctx *adapt, - const od_coeff *in, int n, int k) { - int i; - aom_encode_band_pvq_splits(w, adapt, in, n, k, 0); - for (i = 0; i < n; i++) if (in[i]) aom_write_bit(w, in[i] < 0); -} - -/* Computes 1/sqrt(i) using a table for small values. */ -static double od_rsqrt_table(int i) { - static double table[16] = { - 1.000000, 0.707107, 0.577350, 0.500000, - 0.447214, 0.408248, 0.377964, 0.353553, - 0.333333, 0.316228, 0.301511, 0.288675, - 0.277350, 0.267261, 0.258199, 0.250000}; - if (i <= 16) return table[i-1]; - else return 1./sqrt(i); -} - -/*Computes 1/sqrt(start+2*i+1) using a lookup table containing the results - where 0 <= i < table_size.*/ -static double od_custom_rsqrt_dynamic_table(const double* table, - const int table_size, const double start, const int i) { - if (i < table_size) return table[i]; - else return od_rsqrt_table((int)(start + 2*i + 1)); -} - -/*Fills tables used in od_custom_rsqrt_dynamic_table for a given start.*/ -static void od_fill_dynamic_rsqrt_table(double *table, const int table_size, - const double start) { - int i; - for (i = 0; i < table_size; i++) - table[i] = od_rsqrt_table((int)(start + 2*i + 1)); -} - -/** Find the codepoint on the given PSphere closest to the desired - * vector. Double-precision PVQ search just to make sure our tests - * aren't limited by numerical accuracy. - * - * @param [in] xcoeff input vector to quantize (x in the math doc) - * @param [in] n number of dimensions - * @param [in] k number of pulses - * @param [out] ypulse optimal codevector found (y in the math doc) - * @param [out] g2 multiplier for the distortion (typically squared - * gain units) - * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO - * @param [in] prev_k number of pulses already in ypulse that we should - * reuse for the search (or 0 for a new search) - * @return cosine distance between x and y (between 0 and 1) - */ -double pvq_search_rdo_double_c(const od_val16 *xcoeff, int n, int k, - od_coeff *ypulse, double g2, double pvq_norm_lambda, int prev_k) { - int i, j; - double xy; - double yy; - /* TODO - This blows our 8kB stack space budget and should be fixed when - converting PVQ to fixed point. */ - double x[MAXN]; - double xx; - double lambda; - double norm_1; - int rdo_pulses; - double delta_rate; - xx = xy = yy = 0; - for (j = 0; j < n; j++) { - x[j] = fabs((float)xcoeff[j]); - xx += x[j]*x[j]; - } - norm_1 = 1./sqrt(1e-30 + xx); - lambda = pvq_norm_lambda/(1e-30 + g2); - i = 0; - if (prev_k > 0 && prev_k <= k) { - /* We reuse pulses from a previous search so we don't have to search them - again. 
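- (Callers sort their candidate searches in ascending order of k, so the
- codevector found for the previous, smaller k is always a valid partial
- solution for the current, larger one.)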
*/ - for (j = 0; j < n; j++) { - ypulse[j] = abs(ypulse[j]); - xy += x[j]*ypulse[j]; - yy += ypulse[j]*ypulse[j]; - i += ypulse[j]; - } - } - else if (k > 2) { - double l1_norm; - double l1_inv; - l1_norm = 0; - for (j = 0; j < n; j++) l1_norm += x[j]; - l1_inv = 1./OD_MAXF(l1_norm, 1e-100); - for (j = 0; j < n; j++) { - double tmp; - tmp = k*x[j]*l1_inv; - ypulse[j] = OD_MAXI(0, (int)floor(tmp)); - xy += x[j]*ypulse[j]; - yy += ypulse[j]*ypulse[j]; - i += ypulse[j]; - } - } - else OD_CLEAR(ypulse, n); - - /* Only use RDO on the last few pulses. This not only saves CPU, but using - RDO on all pulses actually makes the results worse for reasons I don't - fully understand. */ - rdo_pulses = 1 + k/4; - /* Rough assumption for now, the last position costs about 3 bits more than - the first. */ - delta_rate = 3./n; - /* Search one pulse at a time */ - for (; i < k - rdo_pulses; i++) { - int pos; - double best_xy; - double best_yy; - pos = 0; - best_xy = -10; - best_yy = 1; - for (j = 0; j < n; j++) { - double tmp_xy; - double tmp_yy; - tmp_xy = xy + x[j]; - tmp_yy = yy + 2*ypulse[j] + 1; - tmp_xy *= tmp_xy; - if (j == 0 || tmp_xy*best_yy > best_xy*tmp_yy) { - best_xy = tmp_xy; - best_yy = tmp_yy; - pos = j; - } - } - xy = xy + x[pos]; - yy = yy + 2*ypulse[pos] + 1; - ypulse[pos]++; - } - /* Search last pulses with RDO. Distortion is D = (x-y)^2 = x^2 - 2*x*y + y^2 - and since x^2 and y^2 are constant, we just maximize x*y, plus a - lambda*rate term. Note that since x and y aren't normalized here, - we need to divide by sqrt(x^2)*sqrt(y^2). */ - for (; i < k; i++) { - double rsqrt_table[4]; - int rsqrt_table_size = 4; - int pos; - double best_cost; - pos = 0; - best_cost = -1e5; - /*Fill the small rsqrt lookup table with inputs relative to yy. - Specifically, the table of n values is filled with - rsqrt(yy + 1), rsqrt(yy + 2 + 1) .. rsqrt(yy + 2*(n-1) + 1).*/ - od_fill_dynamic_rsqrt_table(rsqrt_table, rsqrt_table_size, yy); - for (j = 0; j < n; j++) { - double tmp_xy; - double tmp_yy; - tmp_xy = xy + x[j]; - /*Calculate rsqrt(yy + 2*ypulse[j] + 1) using an optimized method.*/ - tmp_yy = od_custom_rsqrt_dynamic_table(rsqrt_table, rsqrt_table_size, - yy, ypulse[j]); - tmp_xy = 2*tmp_xy*norm_1*tmp_yy - lambda*j*delta_rate; - if (j == 0 || tmp_xy > best_cost) { - best_cost = tmp_xy; - pos = j; - } - } - xy = xy + x[pos]; - yy = yy + 2*ypulse[pos] + 1; - ypulse[pos]++; - } - for (i = 0; i < n; i++) { - if (xcoeff[i] < 0) ypulse[i] = -ypulse[i]; - } - return xy/(1e-100 + sqrt(xx*yy)); -} - -/** Encodes the gain so that the return value increases with the - * distance |x-ref|, so that we can encode a zero when x=ref. The - * value x=0 is not covered because it is only allowed in the noref - * case. - * - * @param [in] x quantized gain to encode - * @param [in] ref quantized gain of the reference - * @return interleave-encoded quantized gain value - */ -static int neg_interleave(int x, int ref) { - if (x < ref) return -2*(x - ref) - 1; - else if (x < 2*ref) return 2*(x - ref); - else return x-1; -} - -int od_vector_is_null(const od_coeff *x, int len) { - int i; - for (i = 0; i < len; i++) if (x[i]) return 0; - return 1; -} - -static double od_pvq_rate(int qg, int icgr, int theta, int ts, - const od_adapt_ctx *adapt, const od_coeff *y0, int k, int n, int speed) { - double rate; - if (k == 0) rate = 0; - else if (speed > 0) { - int i; - int sum; - double f; - /* Compute "center of mass" of the pulse vector. 
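- Each pulse is weighted by its position i and the total is normalised by
- k*n, so f is 0 when all k pulses sit on the first coefficient and tends
- towards 1 as they migrate to the tail; e.g. k = 4 pulses at i = n - 1
- give f = (n - 1)/n.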
*/ - sum = 0; - for (i = 0; i < n - (theta != -1); i++) sum += i*abs(y0[i]); - f = sum/(double)(k*n); - /* Estimates the number of bits it will cost to encode K pulses in - N dimensions based on hand-tuned fit for bitrate vs K, N and - "center of mass". */ - rate = (1 + .4*f)*n*OD_LOG2(1 + OD_MAXF(0, log(n*2*(1*f + .025))*k/n)) + 3; - } - else { - aom_writer w; - od_pvq_codeword_ctx cd; - int tell; -#if !CONFIG_ANS - od_ec_enc_init(&w.ec, 1000); -#else -# error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - OD_COPY(&cd, &adapt->pvq.pvq_codeword_ctx, 1); -#if !CONFIG_ANS - tell = od_ec_enc_tell_frac(&w.ec); -#else -# error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - aom_encode_pvq_codeword(&w, &cd, y0, n - (theta != -1), k); -#if !CONFIG_ANS - rate = (od_ec_enc_tell_frac(&w.ec)-tell)/8.; - od_ec_enc_clear(&w.ec); -#else -# error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - } - if (qg > 0 && theta >= 0) { - /* Approximate cost of entropy-coding theta */ - rate += .9*OD_LOG2(ts); - if (qg == icgr) rate -= .5; - } - return rate; -} - -#define MAX_PVQ_ITEMS (20) -/* This stores the information about a PVQ search candidate, so we can sort - based on K. */ -typedef struct { - int gain; - int k; - od_val32 qtheta; - int theta; - int ts; - od_val32 qcg; -} pvq_search_item; - -int items_compare(pvq_search_item *a, pvq_search_item *b) { - /* Break ties in K with gain to ensure a stable sort. - Otherwise, the order depends on qsort implementation. */ - return a->k == b->k ? a->gain - b->gain : a->k - b->k; -} - -/** Perform PVQ quantization with prediction, trying several - * possible gains and angles. See draft-valin-videocodec-pvq and - * http://jmvalin.ca/slides/pvq.pdf for more details. - * - * @param [out] out coefficients after quantization - * @param [in] x0 coefficients before quantization - * @param [in] r0 reference, aka predicted coefficients - * @param [in] n number of dimensions - * @param [in] q0 quantization step size - * @param [out] y pulse vector (i.e. selected PVQ codevector) - * @param [out] itheta angle between input and reference (-1 if noref) - * @param [out] vk total number of pulses - * @param [in] beta per-band activity masking beta param - * @param [out] skip_diff distortion cost of skipping this block - * (accumulated) - * @param [in] is_keyframe whether we're encoding a keyframe - * @param [in] pli plane index - * @param [in] adapt probability adaptation context - * @param [in] qm QM with magnitude compensation - * @param [in] qm_inv Inverse of QM with magnitude compensation - * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO - * @param [in] speed Make search faster by making approximations - * @return gain index of the quatized gain -*/ -static int pvq_theta(od_coeff *out, const od_coeff *x0, const od_coeff *r0, - int n, int q0, od_coeff *y, int *itheta, int *vk, - od_val16 beta, double *skip_diff, int is_keyframe, int pli, - const od_adapt_ctx *adapt, const int16_t *qm, const int16_t *qm_inv, - double pvq_norm_lambda, int speed) { - od_val32 g; - od_val32 gr; - od_coeff y_tmp[MAXN + 3]; - int i; - /* Number of pulses. */ - int k; - /* Companded gain of x and reference, normalized to q. */ - od_val32 cg; - od_val32 cgr; - int icgr; - int qg; - /* Best RDO cost (D + lamdba*R) so far. */ - double best_cost; - double dist0; - /* Distortion (D) that corresponds to the best RDO cost. */ - double best_dist; - double dist; - /* Sign of Householder reflection. */ - int s; - /* Dimension on which Householder reflects. 
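- The reference vector is reflected onto this axis, so that the angle
- theta between the input and the reference can be quantized and coded
- in the reflected domain.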
*/ - int m; - od_val32 theta; - double corr; - int best_k; - od_val32 best_qtheta; - od_val32 gain_offset; - int noref; - double skip_dist; - int cfl_enabled; - int skip; - double gain_weight; - od_val16 x16[MAXN]; - od_val16 r16[MAXN]; - int xshift; - int rshift; - /* Give more weight to gain error when calculating the total distortion. */ - gain_weight = 1.0; - OD_ASSERT(n > 1); - corr = 0; -#if !defined(OD_FLOAT_PVQ) - /* Shift needed to make x fit in 16 bits even after rotation. - This shift value is not normative (it can be changed without breaking - the bitstream) */ - xshift = OD_MAXI(0, od_vector_log_mag(x0, n) - 15); - /* Shift needed to make the reference fit in 15 bits, so that the Householder - vector can fit in 16 bits. - This shift value *is* normative, and has to match the decoder. */ - rshift = OD_MAXI(0, od_vector_log_mag(r0, n) - 14); -#else - xshift = 0; - rshift = 0; -#endif - for (i = 0; i < n; i++) { -#if defined(OD_FLOAT_PVQ) - /*This is slightly different from the original float PVQ code, - where the qm was applied in the accumulation in od_pvq_compute_gain and - the vectors were od_coeffs, not od_val16 (i.e. double).*/ - x16[i] = x0[i]*(double)qm[i]*OD_QM_SCALE_1; - r16[i] = r0[i]*(double)qm[i]*OD_QM_SCALE_1; -#else - x16[i] = OD_SHR_ROUND(x0[i]*qm[i], OD_QM_SHIFT + xshift); - r16[i] = OD_SHR_ROUND(r0[i]*qm[i], OD_QM_SHIFT + rshift); -#endif - corr += OD_MULT16_16(x16[i], r16[i]); - } - cfl_enabled = is_keyframe && pli != 0 && !OD_DISABLE_CFL; - cg = od_pvq_compute_gain(x16, n, q0, &g, beta, xshift); - cgr = od_pvq_compute_gain(r16, n, q0, &gr, beta, rshift); - if (cfl_enabled) cgr = OD_CGAIN_SCALE; - /* gain_offset is meant to make sure one of the quantized gains has - exactly the same gain as the reference. */ -#if defined(OD_FLOAT_PVQ) - icgr = (int)floor(.5 + cgr); -#else - icgr = OD_SHR_ROUND(cgr, OD_CGAIN_SHIFT); -#endif - gain_offset = cgr - OD_SHL(icgr, OD_CGAIN_SHIFT); - /* Start search with null case: gain=0, no pulse. */ - qg = 0; - dist = gain_weight*cg*cg*OD_CGAIN_SCALE_2; - best_dist = dist; - best_cost = dist + pvq_norm_lambda*od_pvq_rate(0, 0, -1, 0, adapt, NULL, 0, - n, speed); - noref = 1; - best_k = 0; - *itheta = -1; - OD_CLEAR(y, n); - best_qtheta = 0; - m = 0; - s = 1; - corr = corr/(1e-100 + g*(double)gr/OD_SHL(1, xshift + rshift)); - corr = OD_MAXF(OD_MINF(corr, 1.), -1.); - if (is_keyframe) skip_dist = gain_weight*cg*cg*OD_CGAIN_SCALE_2; - else { - skip_dist = gain_weight*(cg - cgr)*(cg - cgr) - + cgr*(double)cg*(2 - 2*corr); - skip_dist *= OD_CGAIN_SCALE_2; - } - if (!is_keyframe) { - /* noref, gain=0 isn't allowed, but skip is allowed. */ - od_val32 scgr; - scgr = OD_MAXF(0,gain_offset); - if (icgr == 0) { - best_dist = gain_weight*(cg - scgr)*(cg - scgr) - + scgr*(double)cg*(2 - 2*corr); - best_dist *= OD_CGAIN_SCALE_2; - } - best_cost = best_dist + pvq_norm_lambda*od_pvq_rate(0, icgr, 0, 0, adapt, - NULL, 0, n, speed); - best_qtheta = 0; - *itheta = 0; - noref = 0; - } - dist0 = best_dist; - if (n <= OD_MAX_PVQ_SIZE && !od_vector_is_null(r0, n) && corr > 0) { - od_val16 xr[MAXN]; - int gain_bound; - int prev_k; - pvq_search_item items[MAX_PVQ_ITEMS]; - int idx; - int nitems; - double cos_dist; - idx = 0; - gain_bound = OD_SHR(cg - gain_offset, OD_CGAIN_SHIFT); - /* Perform theta search only if prediction is useful. 
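- "Useful" means the guard above holds: the reference is non-null, its
- correlation with the input is positive, and the vector is small enough
- (n <= OD_MAX_PVQ_SIZE) to search; otherwise only the no-reference
- search further down is attempted.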
*/ - theta = OD_ROUND32(OD_THETA_SCALE*acos(corr)); - m = od_compute_householder(r16, n, gr, &s, rshift); - od_apply_householder(xr, x16, r16, n); - prev_k = 0; - for (i = m; i < n - 1; i++) xr[i] = xr[i + 1]; - /* Compute all candidate PVQ searches within a reasonable range of gain - and theta. */ - for (i = OD_MAXI(1, gain_bound - 1); i <= gain_bound + 1; i++) { - int j; - od_val32 qcg; - int ts; - int theta_lower; - int theta_upper; - /* Quantized companded gain */ - qcg = OD_SHL(i, OD_CGAIN_SHIFT) + gain_offset; - /* Set angular resolution (in ra) to match the encoded gain */ - ts = od_pvq_compute_max_theta(qcg, beta); - theta_lower = OD_MAXI(0, (int)floor(.5 + - theta*OD_THETA_SCALE_1*2/M_PI*ts) - 2); - theta_upper = OD_MINI(ts - 1, (int)ceil(theta*OD_THETA_SCALE_1*2/M_PI*ts)); - /* Include the angles within a reasonable range. */ - for (j = theta_lower; j <= theta_upper; j++) { - od_val32 qtheta; - qtheta = od_pvq_compute_theta(j, ts); - k = od_pvq_compute_k(qcg, j, 0, n, beta); - items[idx].gain = i; - items[idx].theta = j; - items[idx].k = k; - items[idx].qcg = qcg; - items[idx].qtheta = qtheta; - items[idx].ts = ts; - idx++; - OD_ASSERT(idx < MAX_PVQ_ITEMS); - } - } - nitems = idx; - cos_dist = 0; - /* Sort PVQ search candidates in ascending order of pulses K so that - we can reuse all the previously searched pulses across searches. */ - qsort(items, nitems, sizeof(items[0]), - (int (*)(const void *, const void *))items_compare); - /* Search for the best gain/theta in order. */ - for (idx = 0; idx < nitems; idx++) { - int j; - od_val32 qcg; - int ts; - double cost; - double dist_theta; - double sin_prod; - od_val32 qtheta; - /* Quantized companded gain */ - qcg = items[idx].qcg; - i = items[idx].gain; - j = items[idx].theta; - /* Set angular resolution (in ra) to match the encoded gain */ - ts = items[idx].ts; - /* Search for the best angle within a reasonable range. */ - qtheta = items[idx].qtheta; - k = items[idx].k; - /* Compute the minimal possible distortion by not taking the PVQ - cos_dist into account. */ - dist_theta = 2 - 2.*od_pvq_cos(theta - qtheta)*OD_TRIG_SCALE_1; - dist = gain_weight*(qcg - cg)*(qcg - cg) + qcg*(double)cg*dist_theta; - dist *= OD_CGAIN_SCALE_2; - /* If we have no hope of beating skip (including a 1-bit worst-case - penalty), stop now. */ - if (dist > dist0 + 1.0*pvq_norm_lambda && k != 0) continue; - sin_prod = od_pvq_sin(theta)*OD_TRIG_SCALE_1*od_pvq_sin(qtheta)* - OD_TRIG_SCALE_1; - /* PVQ search, using a gain of qcg*cg*sin(theta)*sin(qtheta) since - that's the factor by which cos_dist is multiplied to get the - distortion metric. */ - if (k == 0) { - cos_dist = 0; - OD_CLEAR(y_tmp, n-1); - } - else if (k != prev_k) { - cos_dist = pvq_search_rdo_double(xr, n - 1, k, y_tmp, - qcg*(double)cg*sin_prod*OD_CGAIN_SCALE_2, pvq_norm_lambda, prev_k); - } - prev_k = k; - /* See Jmspeex' Journal of Dubious Theoretical Results. */ - dist_theta = 2 - 2.*od_pvq_cos(theta - qtheta)*OD_TRIG_SCALE_1 - + sin_prod*(2 - 2*cos_dist); - dist = gain_weight*(qcg - cg)*(qcg - cg) + qcg*(double)cg*dist_theta; - dist *= OD_CGAIN_SCALE_2; - /* Do approximate RDO. */ - cost = dist + pvq_norm_lambda*od_pvq_rate(i, icgr, j, ts, adapt, y_tmp, - k, n, speed); - if (cost < best_cost) { - best_cost = cost; - best_dist = dist; - qg = i; - best_k = k; - best_qtheta = qtheta; - *itheta = j; - noref = 0; - OD_COPY(y, y_tmp, n - 1); - } - } - } - /* Don't bother with no-reference version if there's a reasonable - correlation. 
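- The condition below admits this search only when corr < .5 or the
- companded gain is below 2, i.e. when the prediction is weak enough
- that dropping it may pay off.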
*/ - if (n <= OD_MAX_PVQ_SIZE && (corr < .5 - || cg < (od_val32)(OD_SHL(2, OD_CGAIN_SHIFT)))) { - int gain_bound; - int prev_k; - gain_bound = OD_SHR(cg, OD_CGAIN_SHIFT); - prev_k = 0; - /* Search for the best gain (haven't determined reasonable range yet). */ - for (i = OD_MAXI(1, gain_bound); i <= gain_bound + 1; i++) { - double cos_dist; - double cost; - od_val32 qcg; - qcg = OD_SHL(i, OD_CGAIN_SHIFT); - k = od_pvq_compute_k(qcg, -1, 1, n, beta); - /* Compute the minimal possible distortion by not taking the PVQ - cos_dist into account. */ - dist = gain_weight*(qcg - cg)*(qcg - cg); - dist *= OD_CGAIN_SCALE_2; - if (dist > dist0 && k != 0) continue; - cos_dist = pvq_search_rdo_double(x16, n, k, y_tmp, - qcg*(double)cg*OD_CGAIN_SCALE_2, pvq_norm_lambda, prev_k); - prev_k = k; - /* See Jmspeex' Journal of Dubious Theoretical Results. */ - dist = gain_weight*(qcg - cg)*(qcg - cg) - + qcg*(double)cg*(2 - 2*cos_dist); - dist *= OD_CGAIN_SCALE_2; - /* Do approximate RDO. */ - cost = dist + pvq_norm_lambda*od_pvq_rate(i, 0, -1, 0, adapt, y_tmp, k, - n, speed); - if (cost <= best_cost) { - best_cost = cost; - best_dist = dist; - qg = i; - noref = 1; - best_k = k; - *itheta = -1; - OD_COPY(y, y_tmp, n); - } - } - } - k = best_k; - theta = best_qtheta; - skip = 0; - if (noref) { - if (qg == 0) skip = OD_PVQ_SKIP_ZERO; - } - else { - if (!is_keyframe && qg == 0) { - skip = (icgr ? OD_PVQ_SKIP_ZERO : OD_PVQ_SKIP_COPY); - } - if (qg == icgr && *itheta == 0 && !cfl_enabled) skip = OD_PVQ_SKIP_COPY; - } - /* Synthesize like the decoder would. */ - if (skip) { - if (skip == OD_PVQ_SKIP_COPY) OD_COPY(out, r0, n); - else OD_CLEAR(out, n); - } - else { - if (noref) gain_offset = 0; - g = od_gain_expand(OD_SHL(qg, OD_CGAIN_SHIFT) + gain_offset, q0, beta); - od_pvq_synthesis_partial(out, y, r16, n, noref, g, theta, m, s, - qm_inv); - } - *vk = k; - *skip_diff += skip_dist - best_dist; - /* Encode gain differently depending on whether we use prediction or not. - Special encoding on inter frames where qg=0 is allowed for noref=0 - but not noref=1.*/ - if (is_keyframe) return noref ? qg : neg_interleave(qg, icgr); - else return noref ? 
qg - 1 : neg_interleave(qg + 1, icgr + 1); -} - -/** Encodes a single vector of integers (eg, a partition within a - * coefficient block) using PVQ - * - * @param [in,out] w multi-symbol entropy encoder - * @param [in] qg quantized gain - * @param [in] theta quantized post-prediction theta - * @param [in] in coefficient vector to code - * @param [in] n number of coefficients in partition - * @param [in] k number of pulses in partition - * @param [in,out] model entropy encoder state - * @param [in,out] adapt adaptation context - * @param [in,out] exg ExQ16 expectation of gain value - * @param [in,out] ext ExQ16 expectation of theta value - * @param [in] cdf_ctx selects which cdf context to use - * @param [in] is_keyframe whether we're encoding a keyframe - * @param [in] code_skip whether the "skip rest" flag is allowed - * @param [in] skip_rest when set, we skip all higher bands - * @param [in] encode_flip whether we need to encode the CfL flip flag now - * @param [in] flip value of the CfL flip flag - */ -void pvq_encode_partition(aom_writer *w, - int qg, - int theta, - const od_coeff *in, - int n, - int k, - generic_encoder model[3], - od_adapt_ctx *adapt, - int *exg, - int *ext, - int cdf_ctx, - int is_keyframe, - int code_skip, - int skip_rest, - int encode_flip, - int flip) { - int noref; - int id; - noref = (theta == -1); - id = (qg > 0) + 2*OD_MINI(theta + 1,3) + 8*code_skip*skip_rest; - if (is_keyframe) { - OD_ASSERT(id != 8); - if (id >= 8) id--; - } - else { - OD_ASSERT(id != 10); - if (id >= 10) id--; - } - /* Jointly code gain, theta and noref for small values. Then we handle - larger gain and theta values. For noref, theta = -1. */ - aom_write_symbol_pvq(w, id, &adapt->pvq.pvq_gaintheta_cdf[cdf_ctx][0], - 8 + 7*code_skip); - if (encode_flip) { - /* We could eventually do some smarter entropy coding here, but it would - have to be good enough to overcome the overhead of the entropy coder. - An early attempt using a "toogle" flag with simple adaptation wasn't - worth the trouble. */ - aom_write_bit(w, flip); - } - if (qg > 0) { - int tmp; - tmp = *exg; - generic_encode(w, &model[!noref], qg - 1, &tmp, 2); - OD_IIR_DIADIC(*exg, qg << 16, 2); - } - if (theta > 1) { - int tmp; - tmp = *ext; - generic_encode(w, &model[2], theta - 2, &tmp, 2); - OD_IIR_DIADIC(*ext, theta << 16, 2); - } - aom_encode_pvq_codeword(w, &adapt->pvq.pvq_codeword_ctx, in, - n - (theta != -1), k); -} - -/** Quantizes a scalar with rate-distortion optimization (RDO) - * @param [in] x unquantized value - * @param [in] q quantization step size - * @param [in] delta0 rate increase for encoding a 1 instead of a 0 - * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO - * @retval quantized value - */ -int od_rdo_quant(od_coeff x, int q, double delta0, double pvq_norm_lambda) { - int n; - /* Optimal quantization threshold is 1/2 + lambda*delta_rate/2. See - Jmspeex' Journal of Dubious Theoretical Results for details. 
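- In other words, the usual half-step deadzone is widened by the rate
- cost of coding a nonzero level: with q = 8, |x| = 11 and
- lambda*delta0 = 1, n comes out as 1 and the threshold is
- 1/2 + 1/2 = 1, which 11/8 exceeds, so the value is kept; raising
- lambda*delta0 to 2 lifts the threshold to 1.5 and the same input
- quantizes to zero.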
*/ - n = OD_DIV_R0(abs(x), q); - if ((double)abs(x)/q < (double)n/2 + pvq_norm_lambda*delta0/(2*n)) { - return 0; - } - else { - return OD_DIV_R0(x, q); - } -} - -/** Encode a coefficient block (excepting DC) using PVQ - * - * @param [in,out] enc daala encoder context - * @param [in] ref 'reference' (prediction) vector - * @param [in] in coefficient block to quantize and encode - * @param [out] out quantized coefficient block - * @param [in] q0 scale/quantizer - * @param [in] pli plane index - * @param [in] bs log of the block size minus two - * @param [in] beta per-band activity masking beta param - * @param [in] is_keyframe whether we're encoding a keyframe - * @param [in] qm QM with magnitude compensation - * @param [in] qm_inv Inverse of QM with magnitude compensation - * @param [in] speed Make search faster by making approximations - * @param [in] pvq_info If null, conisdered as RDO search mode - * @return Returns block skip info indicating whether DC/AC are coded. - * bit0: DC is coded, bit1: AC is coded (1 means coded) - * - */ -PVQ_SKIP_TYPE od_pvq_encode(daala_enc_ctx *enc, - od_coeff *ref, - const od_coeff *in, - od_coeff *out, - int q_dc, - int q_ac, - int pli, - int bs, - const od_val16 *beta, - int is_keyframe, - const int16_t *qm, - const int16_t *qm_inv, - int speed, - PVQ_INFO *pvq_info){ - int theta[PVQ_MAX_PARTITIONS]; - int qg[PVQ_MAX_PARTITIONS]; - int k[PVQ_MAX_PARTITIONS]; - od_coeff y[OD_TXSIZE_MAX*OD_TXSIZE_MAX]; - int *exg; - int *ext; - int nb_bands; - int i; - const int *off; - int size[PVQ_MAX_PARTITIONS]; - generic_encoder *model; - double skip_diff; - int tell; - uint16_t *skip_cdf; - od_rollback_buffer buf; - int dc_quant; - int flip; - int cfl_encoded; - int skip_rest; - int skip_dir; - int skip_theta_value; - const unsigned char *pvq_qm; - double dc_rate; - int use_masking; - PVQ_SKIP_TYPE ac_dc_coded; - - aom_clear_system_state(); - - use_masking = enc->use_activity_masking; - - if (use_masking) - pvq_qm = &enc->state.pvq_qm_q4[pli][0]; - else - pvq_qm = 0; - - exg = &enc->state.adapt->pvq.pvq_exg[pli][bs][0]; - ext = enc->state.adapt->pvq.pvq_ext + bs*PVQ_MAX_PARTITIONS; - skip_cdf = enc->state.adapt->skip_cdf[2*bs + (pli != 0)]; - model = enc->state.adapt->pvq.pvq_param_model; - nb_bands = OD_BAND_OFFSETS[bs][0]; - off = &OD_BAND_OFFSETS[bs][1]; - - if (use_masking) - dc_quant = OD_MAXI(1, q_dc * pvq_qm[od_qm_get_index(bs, 0)] >> 4); - else - dc_quant = OD_MAXI(1, q_dc); - - tell = 0; - for (i = 0; i < nb_bands; i++) size[i] = off[i+1] - off[i]; - skip_diff = 0; - flip = 0; - /*If we are coding a chroma block of a keyframe, we are doing CfL.*/ - if (pli != 0 && is_keyframe) { - od_val32 xy; - xy = 0; - /*Compute the dot-product of the first band of chroma with the luma ref.*/ - for (i = off[0]; i < off[1]; i++) { -#if defined(OD_FLOAT_PVQ) - xy += ref[i]*(double)qm[i]*OD_QM_SCALE_1* - (double)in[i]*(double)qm[i]*OD_QM_SCALE_1; -#else - od_val32 rq; - od_val32 inq; - rq = ref[i]*qm[i]; - inq = in[i]*qm[i]; - xy += OD_SHR(rq*(int64_t)inq, OD_SHL(OD_QM_SHIFT + OD_CFL_FLIP_SHIFT, - 1)); -#endif - } - /*If cos(theta) < 0, then |theta| > pi/2 and we should negate the ref.*/ - if (xy < 0) { - flip = 1; - for(i = off[0]; i < off[nb_bands]; i++) ref[i] = -ref[i]; - } - } - for (i = 0; i < nb_bands; i++) { - int q; - - if (use_masking) - q = OD_MAXI(1, q_ac * pvq_qm[od_qm_get_index(bs, i + 1)] >> 4); - else - q = OD_MAXI(1, q_ac); - - qg[i] = pvq_theta(out + off[i], in + off[i], ref + off[i], size[i], - q, y + off[i], &theta[i], &k[i], beta[i], &skip_diff, 
is_keyframe, - pli, enc->state.adapt, qm + off[i], qm_inv + off[i], - enc->pvq_norm_lambda, speed); - } - od_encode_checkpoint(enc, &buf); - if (is_keyframe) out[0] = 0; - else { - int n; - n = OD_DIV_R0(abs(in[0] - ref[0]), dc_quant); - if (n == 0) { - out[0] = 0; - } else { - int tell2; - od_rollback_buffer dc_buf; - - dc_rate = -OD_LOG2((double)(OD_ICDF(skip_cdf[3]) - OD_ICDF(skip_cdf[2]))/ - (double)(OD_ICDF(skip_cdf[2]) - OD_ICDF(skip_cdf[1]))); - dc_rate += 1; - -#if !CONFIG_ANS - tell2 = od_ec_enc_tell_frac(&enc->w.ec); -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - od_encode_checkpoint(enc, &dc_buf); - generic_encode(&enc->w, &enc->state.adapt->model_dc[pli], - n - 1, &enc->state.adapt->ex_dc[pli][bs][0], 2); -#if !CONFIG_ANS - tell2 = od_ec_enc_tell_frac(&enc->w.ec) - tell2; -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - dc_rate += tell2/8.0; - od_encode_rollback(enc, &dc_buf); - - out[0] = od_rdo_quant(in[0] - ref[0], dc_quant, dc_rate, - enc->pvq_norm_lambda); - } - } -#if !CONFIG_ANS - tell = od_ec_enc_tell_frac(&enc->w.ec); -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - /* Code as if we're not skipping. */ - aom_write_symbol(&enc->w, 2 + (out[0] != 0), skip_cdf, 4); - ac_dc_coded = AC_CODED + (out[0] != 0); - cfl_encoded = 0; - skip_rest = 1; - skip_theta_value = is_keyframe ? -1 : 0; - for (i = 1; i < nb_bands; i++) { - if (theta[i] != skip_theta_value || qg[i]) skip_rest = 0; - } - skip_dir = 0; - if (nb_bands > 1) { - for (i = 0; i < 3; i++) { - int j; - int tmp; - tmp = 1; - // ToDo(yaowu): figure out better stop condition without gcc warning. - for (j = i + 1; j < nb_bands && j < PVQ_MAX_PARTITIONS; j += 3) { - if (theta[j] != skip_theta_value || qg[j]) tmp = 0; - } - skip_dir |= tmp << i; - } - } - if (theta[0] == skip_theta_value && qg[0] == 0 && skip_rest) nb_bands = 0; - - /* NOTE: There was no other better place to put this function. */ - if (pvq_info) - av1_store_pvq_enc_info(pvq_info, qg, theta, k, y, nb_bands, off, size, - skip_rest, skip_dir, bs); - - for (i = 0; i < nb_bands; i++) { - int encode_flip; - /* Encode CFL flip bit just after the first time it's used. */ - encode_flip = pli != 0 && is_keyframe && theta[i] != -1 && !cfl_encoded; - if (i == 0 || (!skip_rest && !(skip_dir & (1 << ((i - 1)%3))))) { - pvq_encode_partition(&enc->w, qg[i], theta[i], y + off[i], - size[i], k[i], model, enc->state.adapt, exg + i, ext + i, - (pli != 0)*OD_TXSIZES*PVQ_MAX_PARTITIONS + bs*PVQ_MAX_PARTITIONS + i, - is_keyframe, i == 0 && (i < nb_bands - 1), skip_rest, encode_flip, flip); - } - if (i == 0 && !skip_rest && bs > 0) { - aom_write_symbol(&enc->w, skip_dir, - &enc->state.adapt->pvq.pvq_skip_dir_cdf[(pli != 0) + 2*(bs - 1)][0], 7); - } - if (encode_flip) cfl_encoded = 1; - } -#if !CONFIG_ANS - tell = od_ec_enc_tell_frac(&enc->w.ec) - tell; -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - /* Account for the rate of skipping the AC, based on the same DC decision - we made when trying to not skip AC. 
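- Both rate estimates are read from the same skip_cdf, so the coded and
- skipped alternatives are compared with consistent entropy-coder costs
- before the rollback decision below.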
*/ - { - double skip_rate; - if (out[0] != 0) { - skip_rate = -OD_LOG2((OD_ICDF(skip_cdf[1]) - OD_ICDF(skip_cdf[0]))/ - (double)OD_ICDF(skip_cdf[3])); - } - else { - skip_rate = -OD_LOG2(OD_ICDF(skip_cdf[0])/ - (double)OD_ICDF(skip_cdf[3])); - } - tell -= (int)floor(.5+8*skip_rate); - } - if (nb_bands == 0 || skip_diff <= enc->pvq_norm_lambda/8*tell) { - if (is_keyframe) out[0] = 0; - else { - int n; - n = OD_DIV_R0(abs(in[0] - ref[0]), dc_quant); - if (n == 0) { - out[0] = 0; - } else { - int tell2; - od_rollback_buffer dc_buf; - - dc_rate = -OD_LOG2((double)(OD_ICDF(skip_cdf[1]) - OD_ICDF(skip_cdf[0]))/ - (double)OD_ICDF(skip_cdf[0])); - dc_rate += 1; - -#if !CONFIG_ANS - tell2 = od_ec_enc_tell_frac(&enc->w.ec); -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - od_encode_checkpoint(enc, &dc_buf); - generic_encode(&enc->w, &enc->state.adapt->model_dc[pli], - n - 1, &enc->state.adapt->ex_dc[pli][bs][0], 2); -#if !CONFIG_ANS - tell2 = od_ec_enc_tell_frac(&enc->w.ec) - tell2; -#else -#error "CONFIG_PVQ currently requires !CONFIG_ANS." -#endif - dc_rate += tell2/8.0; - od_encode_rollback(enc, &dc_buf); - - out[0] = od_rdo_quant(in[0] - ref[0], dc_quant, dc_rate, - enc->pvq_norm_lambda); - } - } - /* We decide to skip, roll back everything as it was before. */ - od_encode_rollback(enc, &buf); - aom_write_symbol(&enc->w, out[0] != 0, skip_cdf, 4); - ac_dc_coded = (out[0] != 0); - if (is_keyframe) for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = 0; - else for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = ref[i]; - } - if (pvq_info) - pvq_info->ac_dc_coded = ac_dc_coded; - return ac_dc_coded; -} diff --git a/third_party/aom/av1/encoder/pvq_encoder.h b/third_party/aom/av1/encoder/pvq_encoder.h deleted file mode 100644 index b84c8961b..000000000 --- a/third_party/aom/av1/encoder/pvq_encoder.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -/* clang-format off */ - -#if !defined(_pvq_encoder_H) -# define _pvq_encoder_H (1) -# include "aom_dsp/bitwriter.h" -# include "aom_dsp/entenc.h" -# include "av1/common/blockd.h" -# include "av1/common/pvq.h" -# include "av1/encoder/encint.h" - -void aom_write_symbol_pvq(aom_writer *w, int symb, aom_cdf_prob *cdf, - int nsymbs); - -void aom_encode_band_pvq_splits(aom_writer *w, od_pvq_codeword_ctx *adapt, - const int *y, int n, int k, int level); - -void aom_laplace_encode_special(aom_writer *w, int x, unsigned decay); - -void pvq_encode_partition(aom_writer *w, - int qg, - int theta, - const od_coeff *in, - int n, - int k, - generic_encoder model[3], - od_adapt_ctx *adapt, - int *exg, - int *ext, - int cdf_ctx, - int is_keyframe, - int code_skip, - int skip_rest, - int encode_flip, - int flip); - -PVQ_SKIP_TYPE od_pvq_encode(daala_enc_ctx *enc, od_coeff *ref, - const od_coeff *in, od_coeff *out, int q_dc, int q_ac, int pli, int bs, - const od_val16 *beta, int is_keyframe, - const int16_t *qm, const int16_t *qm_inv, int speed, - PVQ_INFO *pvq_info); - -#endif diff --git a/third_party/aom/av1/encoder/ransac.c b/third_party/aom/av1/encoder/ransac.c index 6d2eb4183..781f528eb 100644 --- a/third_party/aom/av1/encoder/ransac.c +++ b/third_party/aom/av1/encoder/ransac.c @@ -80,60 +80,6 @@ static void project_points_double_affine(double *mat, double *points, } } -static void project_points_double_hortrapezoid(double *mat, double *points, - double *proj, const int n, - const int stride_points, - const int stride_proj) { - int i; - double x, y, Z, Z_inv; - for (i = 0; i < n; ++i) { - x = *(points++), y = *(points++); - Z_inv = mat[7] * y + 1; - assert(fabs(Z_inv) > 0.000001); - Z = 1. / Z_inv; - *(proj++) = (mat[2] * x + mat[3] * y + mat[0]) * Z; - *(proj++) = (mat[5] * y + mat[1]) * Z; - points += stride_points - 2; - proj += stride_proj - 2; - } -} - -static void project_points_double_vertrapezoid(double *mat, double *points, - double *proj, const int n, - const int stride_points, - const int stride_proj) { - int i; - double x, y, Z, Z_inv; - for (i = 0; i < n; ++i) { - x = *(points++), y = *(points++); - Z_inv = mat[6] * x + 1; - assert(fabs(Z_inv) > 0.000001); - Z = 1. / Z_inv; - *(proj++) = (mat[2] * x + mat[0]) * Z; - *(proj++) = (mat[4] * x + mat[5] * y + mat[1]) * Z; - points += stride_points - 2; - proj += stride_proj - 2; - } -} - -static void project_points_double_homography(double *mat, double *points, - double *proj, const int n, - const int stride_points, - const int stride_proj) { - int i; - double x, y, Z, Z_inv; - for (i = 0; i < n; ++i) { - x = *(points++), y = *(points++); - Z_inv = mat[6] * x + mat[7] * y + 1; - assert(fabs(Z_inv) > 0.000001); - Z = 1. 
/ Z_inv; - *(proj++) = (mat[2] * x + mat[3] * y + mat[0]) * Z; - *(proj++) = (mat[4] * x + mat[5] * y + mat[1]) * Z; - points += stride_points - 2; - proj += stride_proj - 2; - } -} - static void normalize_homography(double *pts, int n, double *T) { double *p = pts; double mean[2] = { 0, 0 }; @@ -193,22 +139,6 @@ static void denormalize_homography(double *params, double *T1, double *T2) { multiply_mat(iT2, params2, params, 3, 3, 3); } -static void denormalize_homography_reorder(double *params, double *T1, - double *T2) { - double params_denorm[MAX_PARAMDIM]; - memcpy(params_denorm, params, sizeof(*params) * 8); - params_denorm[8] = 1.0; - denormalize_homography(params_denorm, T1, T2); - params[0] = params_denorm[2]; - params[1] = params_denorm[5]; - params[2] = params_denorm[0]; - params[3] = params_denorm[1]; - params[4] = params_denorm[3]; - params[5] = params_denorm[4]; - params[6] = params_denorm[6]; - params[7] = params_denorm[7]; -} - static void denormalize_affine_reorder(double *params, double *T1, double *T2) { double params_denorm[MAX_PARAMDIM]; params_denorm[0] = params[0]; @@ -377,217 +307,6 @@ static int find_affine(int np, double *pts1, double *pts2, double *mat) { return 0; } -static int find_vertrapezoid(int np, double *pts1, double *pts2, double *mat) { - const int np3 = np * 3; - double *a = (double *)aom_malloc(sizeof(*a) * np3 * 14); - double *U = a + np3 * 7; - double S[7], V[7 * 7], H[9]; - int i, mini; - double sx, sy, dx, dy; - double T1[9], T2[9]; - - normalize_homography(pts1, np, T1); - normalize_homography(pts2, np, T2); - - for (i = 0; i < np; ++i) { - dx = *(pts2++); - dy = *(pts2++); - sx = *(pts1++); - sy = *(pts1++); - - a[i * 3 * 7 + 0] = a[i * 3 * 7 + 1] = 0; - a[i * 3 * 7 + 2] = -sx; - a[i * 3 * 7 + 3] = -sy; - a[i * 3 * 7 + 4] = -1; - a[i * 3 * 7 + 5] = dy * sx; - a[i * 3 * 7 + 6] = dy; - - a[(i * 3 + 1) * 7 + 0] = sx; - a[(i * 3 + 1) * 7 + 1] = 1; - a[(i * 3 + 1) * 7 + 2] = a[(i * 3 + 1) * 7 + 3] = a[(i * 3 + 1) * 7 + 4] = - 0; - a[(i * 3 + 1) * 7 + 5] = -dx * sx; - a[(i * 3 + 1) * 7 + 6] = -dx; - - a[(i * 3 + 2) * 7 + 0] = -dy * sx; - a[(i * 3 + 2) * 7 + 1] = -dy; - a[(i * 3 + 2) * 7 + 2] = dx * sx; - a[(i * 3 + 2) * 7 + 3] = dx * sy; - a[(i * 3 + 2) * 7 + 4] = dx; - a[(i * 3 + 2) * 7 + 5] = a[(i * 3 + 2) * 7 + 6] = 0; - } - if (SVD(U, S, V, a, np3, 7)) { - aom_free(a); - return 1; - } else { - double minS = 1e12; - mini = -1; - for (i = 0; i < 7; ++i) { - if (S[i] < minS) { - minS = S[i]; - mini = i; - } - } - } - H[1] = H[7] = 0; - for (i = 0; i < 1; i++) H[i] = V[i * 7 + mini]; - for (; i < 6; i++) H[i + 1] = V[i * 7 + mini]; - for (; i < 7; i++) H[i + 2] = V[i * 7 + mini]; - - denormalize_homography_reorder(H, T1, T2); - aom_free(a); - if (H[8] == 0.0) { - return 1; - } else { - // normalize - double f = 1.0 / H[8]; - for (i = 0; i < 8; i++) mat[i] = f * H[i]; - } - return 0; -} - -static int find_hortrapezoid(int np, double *pts1, double *pts2, double *mat) { - const int np3 = np * 3; - double *a = (double *)aom_malloc(sizeof(*a) * np3 * 14); - double *U = a + np3 * 7; - double S[7], V[7 * 7], H[9]; - int i, mini; - double sx, sy, dx, dy; - double T1[9], T2[9]; - - normalize_homography(pts1, np, T1); - normalize_homography(pts2, np, T2); - - for (i = 0; i < np; ++i) { - dx = *(pts2++); - dy = *(pts2++); - sx = *(pts1++); - sy = *(pts1++); - - a[i * 3 * 7 + 0] = a[i * 3 * 7 + 1] = a[i * 3 * 7 + 2] = 0; - a[i * 3 * 7 + 3] = -sy; - a[i * 3 * 7 + 4] = -1; - a[i * 3 * 7 + 5] = dy * sy; - a[i * 3 * 7 + 6] = dy; - - a[(i * 3 + 1) * 7 + 0] = sx; - a[(i * 
3 + 1) * 7 + 1] = sy; - a[(i * 3 + 1) * 7 + 2] = 1; - a[(i * 3 + 1) * 7 + 3] = a[(i * 3 + 1) * 7 + 4] = 0; - a[(i * 3 + 1) * 7 + 5] = -dx * sy; - a[(i * 3 + 1) * 7 + 6] = -dx; - - a[(i * 3 + 2) * 7 + 0] = -dy * sx; - a[(i * 3 + 2) * 7 + 1] = -dy * sy; - a[(i * 3 + 2) * 7 + 2] = -dy; - a[(i * 3 + 2) * 7 + 3] = dx * sy; - a[(i * 3 + 2) * 7 + 4] = dx; - a[(i * 3 + 2) * 7 + 5] = a[(i * 3 + 2) * 7 + 6] = 0; - } - - if (SVD(U, S, V, a, np3, 7)) { - aom_free(a); - return 1; - } else { - double minS = 1e12; - mini = -1; - for (i = 0; i < 7; ++i) { - if (S[i] < minS) { - minS = S[i]; - mini = i; - } - } - } - H[3] = H[6] = 0; - for (i = 0; i < 3; i++) H[i] = V[i * 7 + mini]; - for (; i < 5; i++) H[i + 1] = V[i * 7 + mini]; - for (; i < 7; i++) H[i + 2] = V[i * 7 + mini]; - - denormalize_homography_reorder(H, T1, T2); - aom_free(a); - if (H[8] == 0.0) { - return 1; - } else { - // normalize - double f = 1.0 / H[8]; - for (i = 0; i < 8; i++) mat[i] = f * H[i]; - } - return 0; -} - -static int find_homography(int np, double *pts1, double *pts2, double *mat) { - // Implemented from Peter Kovesi's normalized implementation - const int np3 = np * 3; - double *a = (double *)aom_malloc(sizeof(*a) * np3 * 18); - double *U = a + np3 * 9; - double S[9], V[9 * 9], H[9]; - int i, mini; - double sx, sy, dx, dy; - double T1[9], T2[9]; - - normalize_homography(pts1, np, T1); - normalize_homography(pts2, np, T2); - - for (i = 0; i < np; ++i) { - dx = *(pts2++); - dy = *(pts2++); - sx = *(pts1++); - sy = *(pts1++); - - a[i * 3 * 9 + 0] = a[i * 3 * 9 + 1] = a[i * 3 * 9 + 2] = 0; - a[i * 3 * 9 + 3] = -sx; - a[i * 3 * 9 + 4] = -sy; - a[i * 3 * 9 + 5] = -1; - a[i * 3 * 9 + 6] = dy * sx; - a[i * 3 * 9 + 7] = dy * sy; - a[i * 3 * 9 + 8] = dy; - - a[(i * 3 + 1) * 9 + 0] = sx; - a[(i * 3 + 1) * 9 + 1] = sy; - a[(i * 3 + 1) * 9 + 2] = 1; - a[(i * 3 + 1) * 9 + 3] = a[(i * 3 + 1) * 9 + 4] = a[(i * 3 + 1) * 9 + 5] = - 0; - a[(i * 3 + 1) * 9 + 6] = -dx * sx; - a[(i * 3 + 1) * 9 + 7] = -dx * sy; - a[(i * 3 + 1) * 9 + 8] = -dx; - - a[(i * 3 + 2) * 9 + 0] = -dy * sx; - a[(i * 3 + 2) * 9 + 1] = -dy * sy; - a[(i * 3 + 2) * 9 + 2] = -dy; - a[(i * 3 + 2) * 9 + 3] = dx * sx; - a[(i * 3 + 2) * 9 + 4] = dx * sy; - a[(i * 3 + 2) * 9 + 5] = dx; - a[(i * 3 + 2) * 9 + 6] = a[(i * 3 + 2) * 9 + 7] = a[(i * 3 + 2) * 9 + 8] = - 0; - } - - if (SVD(U, S, V, a, np3, 9)) { - aom_free(a); - return 1; - } else { - double minS = 1e12; - mini = -1; - for (i = 0; i < 9; ++i) { - if (S[i] < minS) { - minS = S[i]; - mini = i; - } - } - } - - for (i = 0; i < 9; i++) H[i] = V[i * 9 + mini]; - denormalize_homography_reorder(H, T1, T2); - aom_free(a); - if (H[8] == 0.0) { - return 1; - } else { - // normalize - double f = 1.0 / H[8]; - for (i = 0; i < 8; i++) mat[i] = f * H[i]; - } - return 0; -} - static int get_rand_indices(int npoints, int minpts, int *indices, unsigned int *seed) { int i, j; @@ -860,11 +579,6 @@ static int is_degenerate_affine(double *p) { return is_collinear3(p, p + 2, p + 4); } -static int is_degenerate_homography(double *p) { - return is_collinear3(p, p + 2, p + 4) || is_collinear3(p, p + 2, p + 6) || - is_collinear3(p, p + 4, p + 6) || is_collinear3(p + 2, p + 4, p + 6); -} - int ransac_translation(int *matched_points, int npoints, int *num_inliers_by_motion, double *params_by_motion, int num_desired_motions) { @@ -887,30 +601,3 @@ int ransac_affine(int *matched_points, int npoints, int *num_inliers_by_motion, params_by_motion, num_desired_motions, 3, is_degenerate_affine, find_affine, project_points_double_affine); } - -int 
ransac_homography(int *matched_points, int npoints, - int *num_inliers_by_motion, double *params_by_motion, - int num_desired_motions) { - return ransac(matched_points, npoints, num_inliers_by_motion, - params_by_motion, num_desired_motions, 4, - is_degenerate_homography, find_homography, - project_points_double_homography); -} - -int ransac_hortrapezoid(int *matched_points, int npoints, - int *num_inliers_by_motion, double *params_by_motion, - int num_desired_motions) { - return ransac(matched_points, npoints, num_inliers_by_motion, - params_by_motion, num_desired_motions, 4, - is_degenerate_homography, find_hortrapezoid, - project_points_double_hortrapezoid); -} - -int ransac_vertrapezoid(int *matched_points, int npoints, - int *num_inliers_by_motion, double *params_by_motion, - int num_desired_motions) { - return ransac(matched_points, npoints, num_inliers_by_motion, - params_by_motion, num_desired_motions, 4, - is_degenerate_homography, find_vertrapezoid, - project_points_double_vertrapezoid); -} diff --git a/third_party/aom/av1/encoder/ransac.h b/third_party/aom/av1/encoder/ransac.h index f611add36..1019055ed 100644 --- a/third_party/aom/av1/encoder/ransac.h +++ b/third_party/aom/av1/encoder/ransac.h @@ -25,17 +25,8 @@ typedef int (*RansacFunc)(int *matched_points, int npoints, /* Each of these functions fits a motion model from a set of corresponding points in 2 frames using RANSAC. */ -int ransac_homography(int *matched_points, int npoints, - int *num_inliers_by_motion, double *params_by_motion, - int num_motions); int ransac_affine(int *matched_points, int npoints, int *num_inliers_by_motion, double *params_by_motion, int num_motions); -int ransac_hortrapezoid(int *matched_points, int npoints, - int *num_inliers_by_motion, double *params_by_motion, - int num_motions); -int ransac_vertrapezoid(int *matched_points, int npoints, - int *num_inliers_by_motion, double *params_by_motion, - int num_motions); int ransac_rotzoom(int *matched_points, int npoints, int *num_inliers_by_motion, double *params_by_motion, int num_motions); int ransac_translation(int *matched_points, int npoints, diff --git a/third_party/aom/av1/encoder/ratectrl.c b/third_party/aom/av1/encoder/ratectrl.c index a90cb880e..ac9392fa1 100644 --- a/third_party/aom/av1/encoder/ratectrl.c +++ b/third_party/aom/av1/encoder/ratectrl.c @@ -44,7 +44,6 @@ #define MAX_BPB_FACTOR 50 #define FRAME_OVERHEAD_BITS 200 -#if CONFIG_HIGHBITDEPTH #define ASSIGN_MINQ_TABLE(bit_depth, name) \ do { \ switch (bit_depth) { \ @@ -58,13 +57,6 @@ name = NULL; \ } \ } while (0) -#else -#define ASSIGN_MINQ_TABLE(bit_depth, name) \ - do { \ - (void)bit_depth; \ - name = name##_8; \ - } while (0) -#endif // Tables relating active max Q to active min Q static int kf_low_motion_minq_8[QINDEX_RANGE]; @@ -74,7 +66,6 @@ static int arfgf_high_motion_minq_8[QINDEX_RANGE]; static int inter_minq_8[QINDEX_RANGE]; static int rtc_minq_8[QINDEX_RANGE]; -#if CONFIG_HIGHBITDEPTH static int kf_low_motion_minq_10[QINDEX_RANGE]; static int kf_high_motion_minq_10[QINDEX_RANGE]; static int arfgf_low_motion_minq_10[QINDEX_RANGE]; @@ -87,7 +78,6 @@ static int arfgf_low_motion_minq_12[QINDEX_RANGE]; static int arfgf_high_motion_minq_12[QINDEX_RANGE]; static int inter_minq_12[QINDEX_RANGE]; static int rtc_minq_12[QINDEX_RANGE]; -#endif static int gf_high = 2000; static int gf_low = 400; @@ -97,7 +87,6 @@ static int kf_low = 400; // How many times less pixels there are to encode given the current scaling. // Temporary replacement for rcf_mult and rate_thresh_mult. 
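The helper defined just below reduces to a pixel-count ratio. As a rough standalone illustration (hypothetical names, not the encoder's API):

    #include <stdio.h>

    /* Sketch of the ratio computed by resize_rate_factor(): how many times
       fewer pixels the coded frame has than the configured frame size. */
    static double pixel_ratio(int cfg_w, int cfg_h, int coded_w, int coded_h) {
      return (double)(cfg_w * cfg_h) / (coded_w * coded_h);
    }

    int main(void) {
      /* Coding 1920x1080 content at 960x540 leaves 4x fewer pixels,
         so rate thresholds scale by 4.0. */
      printf("%.1f\n", pixel_ratio(1920, 1080, 960, 540));
      return 0;
    }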
static double resize_rate_factor(const AV1_COMP *cpi, int width, int height) { - (void)cpi; return (double)(cpi->oxcf.width * cpi->oxcf.height) / (width * height); } @@ -140,33 +129,27 @@ void av1_rc_init_minq_luts(void) { init_minq_luts(kf_low_motion_minq_8, kf_high_motion_minq_8, arfgf_low_motion_minq_8, arfgf_high_motion_minq_8, inter_minq_8, rtc_minq_8, AOM_BITS_8); -#if CONFIG_HIGHBITDEPTH init_minq_luts(kf_low_motion_minq_10, kf_high_motion_minq_10, arfgf_low_motion_minq_10, arfgf_high_motion_minq_10, inter_minq_10, rtc_minq_10, AOM_BITS_10); init_minq_luts(kf_low_motion_minq_12, kf_high_motion_minq_12, arfgf_low_motion_minq_12, arfgf_high_motion_minq_12, inter_minq_12, rtc_minq_12, AOM_BITS_12); -#endif } // These functions use formulaic calculations to make playing with the // quantizer tables easier. If necessary they can be replaced by lookup // tables if and when things settle down in the experimental bitstream double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth) { -// Convert the index to a real Q value (scaled down to match old Q values) -#if CONFIG_HIGHBITDEPTH + // Convert the index to a real Q value (scaled down to match old Q values) switch (bit_depth) { - case AOM_BITS_8: return av1_ac_quant(qindex, 0, bit_depth) / 4.0; - case AOM_BITS_10: return av1_ac_quant(qindex, 0, bit_depth) / 16.0; - case AOM_BITS_12: return av1_ac_quant(qindex, 0, bit_depth) / 64.0; + case AOM_BITS_8: return av1_ac_quant_Q3(qindex, 0, bit_depth) / 4.0; + case AOM_BITS_10: return av1_ac_quant_Q3(qindex, 0, bit_depth) / 16.0; + case AOM_BITS_12: return av1_ac_quant_Q3(qindex, 0, bit_depth) / 64.0; default: assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); return -1.0; } -#else - return av1_ac_quant(qindex, 0, bit_depth) / 4.0; -#endif } int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex, @@ -196,12 +179,8 @@ int av1_rc_clamp_pframe_target_size(const AV1_COMP *const cpi, int target) { const AV1EncoderConfig *oxcf = &cpi->oxcf; const int min_frame_target = AOMMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5); -// Clip the frame target to the minimum setup value. -#if CONFIG_EXT_REFS + // Clip the frame target to the minimum setup value. if (cpi->rc.is_src_frame_alt_ref) { -#else - if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) { -#endif // CONFIG_EXT_REFS // If there is an active ARF at this location use the minimum // bits on this frame even if it is a constructed arf. // The active maximum quantizer insures that an appropriate @@ -239,14 +218,10 @@ static void update_buffer_level(AV1_COMP *cpi, int encoded_frame_size) { const AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; -// Non-viewable frames are a special case and are treated as pure overhead. -#if CONFIG_EXT_REFS + // Non-viewable frames are a special case and are treated as pure overhead. // TODO(zoeliu): To further explore whether we should treat BWDREF_FRAME // differently, since it is a no-show frame. if (!cm->show_frame && !rc->is_bwd_ref_frame) -#else - if (!cm->show_frame) -#endif // CONFIG_EXT_REFS rc->bits_off_target -= encoded_frame_size; else rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size; @@ -590,11 +565,9 @@ static int calc_active_worst_quality_one_pass_vbr(const AV1_COMP *cpi) { active_worst_quality = curr_frame == 0 ? 
rc->worst_quality : rc->last_q[KEY_FRAME] * 2; } else { - if (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame || -#if CONFIG_EXT_REFS - cpi->refresh_alt2_ref_frame || -#endif // CONFIG_EXT_REFS - cpi->refresh_alt_ref_frame)) { + if (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame || + cpi->refresh_alt_ref_frame)) { active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] * 5 / 4 : rc->last_q[INTER_FRAME]; } else { @@ -931,26 +904,9 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width, } int av1_frame_type_qdelta(const AV1_COMP *cpi, int rf_level, int q) { - static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = { - 1.00, // INTER_NORMAL -#if CONFIG_EXT_REFS - 0.80, // INTER_LOW - 1.50, // INTER_HIGH - 1.25, // GF_ARF_LOW -#else - 1.00, // INTER_HIGH - 1.50, // GF_ARF_LOW -#endif // CONFIG_EXT_REFS - 2.00, // GF_ARF_STD - 2.00, // KF_STD + static const FRAME_TYPE frame_type[RATE_FACTOR_LEVELS] = { + INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME }; - static const FRAME_TYPE frame_type[RATE_FACTOR_LEVELS] = -#if CONFIG_EXT_REFS - { INTER_FRAME, INTER_FRAME, INTER_FRAME, - INTER_FRAME, INTER_FRAME, KEY_FRAME }; -#else - { INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME }; -#endif // CONFIG_EXT_REFS const AV1_COMMON *const cm = &cpi->common; int qdelta = av1_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level], q, @@ -1020,11 +976,9 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, active_best_quality += av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth); } - } else if (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame || -#if CONFIG_EXT_REFS - cpi->refresh_alt2_ref_frame || -#endif // CONFIG_EXT_REFS - cpi->refresh_alt_ref_frame)) { + } else if (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame || + cpi->refresh_alt_ref_frame)) { // Use the lower of active_worst_quality and recent // average Q as basis for GF/ARF best Q limit unless last frame was // a key frame. @@ -1044,11 +998,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, active_best_quality = active_best_quality * 15 / 16; } else if (oxcf->rc_mode == AOM_Q) { -#if CONFIG_EXT_REFS if (!cpi->refresh_alt_ref_frame && !cpi->refresh_alt2_ref_frame) { -#else - if (!cpi->refresh_alt_ref_frame) { -#endif // CONFIG_EXT_REFS active_best_quality = cq_level; } else { active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); @@ -1080,11 +1030,9 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, if ((cpi->oxcf.rc_mode != AOM_Q) && (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) { if (frame_is_intra_only(cm) || - (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame || -#if CONFIG_EXT_REFS - cpi->refresh_alt2_ref_frame || -#endif // CONFIG_EXT_REFS - cpi->refresh_alt_ref_frame))) { + (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame || + cpi->refresh_alt_ref_frame))) { active_best_quality -= (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast); active_worst_quality += (cpi->twopass.extend_maxq / 2); @@ -1106,7 +1054,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, } // Modify active_best_quality for downscaled normal frames. 
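The qdelta helper above now keys both of its tables by the six EXT_REFS rate-factor levels, and its call into av1_compute_qdelta_by_rate is split across the hunk. Here is the same shape reassembled as a sketch, with the argument order taken from the second call site visible in this diff; rate_factor_deltas is the multiplier table this change moves into ratectrl.h further down:

/* Sketch of the table-driven boost: each RATE_FACTOR_LEVEL names a
   target-rate multiplier (rate_factor_deltas) and the frame type whose
   rate model should be used when converting that multiplier to a qdelta. */
int sketch_frame_type_qdelta(const AV1_COMP *cpi, int rf_level, int q) {
  static const FRAME_TYPE frame_type[RATE_FACTOR_LEVELS] = {
    INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME
  };
  /* GF_ARF_STD, for example, targets 2.0x the normal rate, which yields
     a negative qdelta (a lower, higher-quality qindex). */
  return av1_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level], q,
                                    rate_factor_deltas[rf_level],
                                    cpi->common.bit_depth);
}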
- if (!av1_frame_unscaled(cm) && !frame_is_kf_gf_arf(cpi)) { + if (av1_frame_scaled(cm) && !frame_is_kf_gf_arf(cpi)) { int qdelta = av1_compute_qdelta_by_rate( rc, cm->frame_type, active_best_quality, 2.0, cm->bit_depth); active_best_quality = @@ -1193,7 +1141,7 @@ static void rc_set_frame_target(AV1_COMP *cpi, int target, int width, rc->this_frame_target = target; // Modify frame size target when down-scaled. - if (!av1_frame_unscaled(cm)) + if (av1_frame_scaled(cm)) rc->this_frame_target = (int)(rc->this_frame_target * resize_rate_factor(cpi, width, height)); @@ -1217,21 +1165,13 @@ static void update_alt_ref_frame_stats(AV1_COMP *cpi) { static void update_golden_frame_stats(AV1_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; -#if CONFIG_EXT_REFS // Update the Golden frame usage counts. // NOTE(weitinglin): If we use show_existing_frame for an OVERLAY frame, // only the virtual indices for the reference frame will be // updated and cpi->refresh_golden_frame will still be zero. if (cpi->refresh_golden_frame || rc->is_src_frame_alt_ref) { -#else // !CONFIG_EXT_REFS - // Update the Golden frame usage counts. - if (cpi->refresh_golden_frame) { -#endif // CONFIG_EXT_REFS - -#if CONFIG_EXT_REFS // We will not use internal overlay frames to replace the golden frame if (!rc->is_src_frame_ext_arf) -#endif // CONFIG_EXT_REFS // this frame refreshes means next frames don't unless specified by user rc->frames_since_golden = 0; @@ -1248,11 +1188,7 @@ static void update_golden_frame_stats(AV1_COMP *cpi) { // Decrement count down till next gf if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--; -#if CONFIG_EXT_REFS } else if (!cpi->refresh_alt_ref_frame && !cpi->refresh_alt2_ref_frame) { -#else - } else if (!cpi->refresh_alt_ref_frame) { -#endif // CONFIG_EXT_REFS // Decrement count down till next gf if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--; @@ -1282,10 +1218,7 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2); } else { if (!rc->is_src_frame_alt_ref && - !(cpi->refresh_golden_frame || -#if CONFIG_EXT_REFS - cpi->refresh_alt2_ref_frame || -#endif // CONFIG_EXT_REFS + !(cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame || cpi->refresh_alt_ref_frame)) { rc->last_q[INTER_FRAME] = qindex; rc->avg_frame_qindex[INTER_FRAME] = @@ -1307,10 +1240,7 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { // This is used to help set quality in forced key frames to reduce popping if ((qindex < rc->last_boosted_qindex) || (cm->frame_type == KEY_FRAME) || (!rc->constrained_gf_group && - (cpi->refresh_alt_ref_frame || -#if CONFIG_EXT_REFS - cpi->refresh_alt2_ref_frame || -#endif // CONFIG_EXT_REFS + (cpi->refresh_alt_ref_frame || cpi->refresh_alt2_ref_frame || (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) { rc->last_boosted_qindex = qindex; } @@ -1320,7 +1250,7 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { // Rolling monitors of whether we are over or underspending used to help // regulate min and Max Q in two pass. 
- if (!av1_frame_unscaled(cm)) + if (av1_frame_scaled(cm)) rc->this_frame_target = (int)(rc->this_frame_target / resize_rate_factor(cpi, cm->width, cm->height)); @@ -1337,14 +1267,10 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { // Actual bits spent rc->total_actual_bits += rc->projected_frame_size; -#if CONFIG_EXT_REFS // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME // differently here for rc->avg_frame_bandwidth. rc->total_target_bits += (cm->show_frame || rc->is_bwd_ref_frame) ? rc->avg_frame_bandwidth : 0; -#else - rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0; -#endif // CONFIG_EXT_REFS rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits; @@ -1358,13 +1284,9 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { if (cm->frame_type == KEY_FRAME) rc->frames_since_key = 0; -#if CONFIG_EXT_REFS // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME // differently here for rc->avg_frame_bandwidth. if (cm->show_frame || rc->is_bwd_ref_frame) { -#else - if (cm->show_frame) { -#endif // CONFIG_EXT_REFS rc->frames_since_key++; rc->frames_to_key--; } @@ -1417,6 +1339,10 @@ void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; int target; + int altref_enabled = is_altref_enabled(cpi); + int sframe_dist = cpi->oxcf.sframe_dist; + int sframe_mode = cpi->oxcf.sframe_mode; + int sframe_enabled = cpi->oxcf.sframe_enabled; // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic. if (!cpi->refresh_alt_ref_frame && (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) || @@ -1429,6 +1355,37 @@ void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) { rc->source_alt_ref_active = 0; } else { cm->frame_type = INTER_FRAME; + if (sframe_enabled) { + if (altref_enabled) { + if (sframe_mode == 1) { + // sframe_mode == 1: insert sframe if it matches altref frame. 
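The S-frame logic that follows can be read as one cadence predicate plus a deferral flag: every sframe_dist-th non-key frame wants to become an S_FRAME, and when altrefs are enabled with sframe_mode != 1 the request is parked in rc->sframe_due until an altref refresh actually arrives. Distilled into a sketch with illustrative names:

/* Does this frame index fall on the S-frame cadence at all? */
static int on_sframe_cadence(unsigned frame, int sframe_dist, int is_key) {
  return sframe_dist > 0 && frame != 0 && frame % sframe_dist == 0 && !is_key;
}
/* With altrefs enabled and sframe_mode != 1, hitting the cadence only
   arms rc->sframe_due; the promotion to S_FRAME then waits for the next
   refresh_alt_ref_frame, as the code below shows. */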
+ + if (cm->current_video_frame % sframe_dist == 0 && + cm->frame_type != KEY_FRAME && cm->current_video_frame != 0 && + cpi->refresh_alt_ref_frame) { + cm->frame_type = S_FRAME; + } + } else { + // sframe_mode != 1: if sframe will be inserted at the next available + // altref frame + + if (cm->current_video_frame % sframe_dist == 0 && + cm->frame_type != KEY_FRAME && cm->current_video_frame != 0) { + rc->sframe_due = 1; + } + + if (rc->sframe_due && cpi->refresh_alt_ref_frame) { + cm->frame_type = S_FRAME; + rc->sframe_due = 0; + } + } + } else { + if (cm->current_video_frame % sframe_dist == 0 && + cm->frame_type != KEY_FRAME && cm->current_video_frame != 0) { + cm->frame_type = S_FRAME; + } + } + } } if (rc->frames_till_gf_update_due == 0) { rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2; @@ -1444,6 +1401,10 @@ void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) { rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS; rc->gfu_boost = DEFAULT_GF_BOOST; } + + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + av1_cyclic_refresh_update_parameters(cpi); + if (cm->frame_type == KEY_FRAME) target = calc_iframe_target_size_one_pass_vbr(cpi); else diff --git a/third_party/aom/av1/encoder/ratectrl.h b/third_party/aom/av1/encoder/ratectrl.h index 8b410e778..81157ce72 100644 --- a/third_party/aom/av1/encoder/ratectrl.h +++ b/third_party/aom/av1/encoder/ratectrl.h @@ -28,7 +28,6 @@ extern "C" { #define MAX_GF_INTERVAL 16 #define FIXED_GF_INTERVAL 8 // Used in some testing modes only -#if CONFIG_EXT_REFS typedef enum { INTER_NORMAL = 0, INTER_LOW = 1, @@ -38,23 +37,20 @@ typedef enum { KF_STD = 5, RATE_FACTOR_LEVELS = 6 } RATE_FACTOR_LEVEL; -#else -typedef enum { - INTER_NORMAL = 0, - INTER_HIGH = 1, - GF_ARF_LOW = 2, - GF_ARF_STD = 3, - KF_STD = 4, - RATE_FACTOR_LEVELS = 5 -} RATE_FACTOR_LEVEL; -#endif // CONFIG_EXT_REFS + +static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = { + 1.00, // INTER_NORMAL + 0.80, // INTER_LOW + 1.50, // INTER_HIGH + 1.25, // GF_ARF_LOW + 2.00, // GF_ARF_STD + 2.00, // KF_STD +}; typedef struct { int resize_width; int resize_height; -#if CONFIG_FRAME_SUPERRES uint8_t superres_denom; -#endif // CONFIG_FRAME_SUPERRES } size_params_type; typedef struct { @@ -88,8 +84,8 @@ typedef struct { int source_alt_ref_pending; int source_alt_ref_active; int is_src_frame_alt_ref; + int sframe_due; -#if CONFIG_EXT_REFS // Length of the bi-predictive frame group interval int bipred_group_interval; @@ -99,7 +95,6 @@ typedef struct { int is_last_bipred_frame; int is_bipred_frame; int is_src_frame_ext_arf; -#endif // CONFIG_EXT_REFS int avg_frame_bandwidth; // Average frame size target for clip int min_frame_bandwidth; // Minimum allocation used for any frame diff --git a/third_party/aom/av1/encoder/ratectrl_xiph.c b/third_party/aom/av1/encoder/ratectrl_xiph.c index b9f827528..e69de29bb 100644 --- a/third_party/aom/av1/encoder/ratectrl_xiph.c +++ b/third_party/aom/av1/encoder/ratectrl_xiph.c @@ -1,1244 +0,0 @@ -/* - * Copyright (c) 2001-2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <limits.h> -#include "av1/common/odintrin.h" -#include "av1/encoder/ratectrl_xiph.h" - -#define OD_Q57(v) ((int64_t)((uint64_t)(v) << 57)) -#define OD_F_Q45(v) ((int64_t)(((v) * ((int64_t)1 << 45)))) -#define OD_F_Q12(v) ((int32_t)(((v) * ((int32_t)1 << 12)))) - -/*A rough lookup table for tan(x), 0 <= x < pi/2. - The values are Q12 fixed-point and spaced at 5 degree intervals. - These decisions are somewhat arbitrary, but sufficient for the 2nd order - Bessel follower below. - Values of x larger than 85 degrees are extrapolated from the last interval, - which is way off, but "good enough".*/ -static uint16_t OD_ROUGH_TAN_LOOKUP[18] = { 0, 358, 722, 1098, 1491, - 1910, 2365, 2868, 3437, 4096, - 4881, 5850, 7094, 8784, 11254, - 15286, 23230, 46817 }; - -/*alpha is Q24 in the range [0,0.5). - The return values is 5.12.*/ -static int od_warp_alpha(int alpha) { - int i; - int d; - int t0; - int t1; - i = alpha * 36 >> 24; - if (i >= 17) i = 16; - t0 = OD_ROUGH_TAN_LOOKUP[i]; - t1 = OD_ROUGH_TAN_LOOKUP[i + 1]; - d = alpha * 36 - (i << 24); - return (int)((((int64_t)t0 << 32) + ((t1 - t0) << 8) * (int64_t)d) >> 32); -} - -static const int64_t OD_ATANH_LOG2[32] = { - 0x32B803473F7AD0F4LL, 0x2F2A71BD4E25E916LL, 0x2E68B244BB93BA06LL, - 0x2E39FB9198CE62E4LL, 0x2E2E683F68565C8FLL, 0x2E2B850BE2077FC1LL, - 0x2E2ACC58FE7B78DBLL, 0x2E2A9E2DE52FD5F2LL, 0x2E2A92A338D53EECLL, - 0x2E2A8FC08F5E19B6LL, 0x2E2A8F07E51A485ELL, 0x2E2A8ED9BA8AF388LL, - 0x2E2A8ECE2FE7384ALL, 0x2E2A8ECB4D3E4B1ALL, 0x2E2A8ECA94940FE8LL, - 0x2E2A8ECA6669811DLL, 0x2E2A8ECA5ADEDD6ALL, 0x2E2A8ECA57FC347ELL, - 0x2E2A8ECA57438A43LL, 0x2E2A8ECA57155FB4LL, 0x2E2A8ECA5709D510LL, - 0x2E2A8ECA5706F267LL, 0x2E2A8ECA570639BDLL, 0x2E2A8ECA57060B92LL, - 0x2E2A8ECA57060008LL, 0x2E2A8ECA5705FD25LL, 0x2E2A8ECA5705FC6CLL, - 0x2E2A8ECA5705FC3ELL, 0x2E2A8ECA5705FC33LL, 0x2E2A8ECA5705FC30LL, - 0x2E2A8ECA5705FC2FLL, 0x2E2A8ECA5705FC2FLL -}; - -static int od_ilog64(int64_t v) { - static const unsigned char OD_DEBRUIJN_IDX64[64] = { - 0, 1, 2, 7, 3, 13, 8, 19, 4, 25, 14, 28, 9, 34, 20, 40, - 5, 17, 26, 38, 15, 46, 29, 48, 10, 31, 35, 54, 21, 50, 41, 57, - 63, 6, 12, 18, 24, 27, 33, 39, 16, 37, 45, 47, 30, 53, 49, 56, - 62, 11, 23, 32, 36, 44, 52, 55, 61, 22, 43, 51, 60, 42, 59, 58 - }; - int ret; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - v |= v >> 32; - ret = (int)v & 1; - v = (v >> 1) + 1; - ret += OD_DEBRUIJN_IDX64[v * UINT64_C(0x218A392CD3D5DBF) >> 58 & 0x3F]; - return ret; -} - -/*Computes the binary exponential of logq57. - input: a log base 2 in Q57 format - output: a 64 bit integer in Q0 (no fraction) */ -static int64_t od_bexp64(int64_t logq57) { - int64_t w; - int64_t z; - int ipart; - ipart = (int)(logq57 >> 57); - if (ipart < 0) return 0; - if (ipart >= 63) return 0x7FFFFFFFFFFFFFFFLL; - z = logq57 - OD_Q57(ipart); - if (z) { - int64_t mask; - int64_t wlo; - int i; - /*C doesn't give us 64x64->128 muls, so we use CORDIC. - This is not particularly fast, but it's not being used in time-critical - code; it is very accurate.*/ - /*z is the fractional part of the log in Q62 format. - We need 1 bit of headroom since the magnitude can get larger than 1 - during the iteration, and a sign bit.*/ - z <<= 5; - /*w is the exponential in Q61 format (since it also needs headroom and can - get as large as 2.0); we could get another bit if we dropped the sign, - but we'll recover that bit later anyway. 
- Ideally this should start out as - \lim_{n->\infty} 2^{61}/\product_{i=1}^n \sqrt{1-2^{-2i}} - but in order to guarantee convergence we have to repeat iterations 4, - 13 (=3*4+1), and 40 (=3*13+1, etc.), so it winds up somewhat larger.*/ - w = 0x26A3D0E401DD846DLL; - for (i = 0;; i++) { - mask = -(z < 0); - w += ((w >> (i + 1)) + mask) ^ mask; - z -= (OD_ATANH_LOG2[i] + mask) ^ mask; - /*Repeat iteration 4.*/ - if (i >= 3) break; - z *= 2; - } - for (;; i++) { - mask = -(z < 0); - w += ((w >> (i + 1)) + mask) ^ mask; - z -= (OD_ATANH_LOG2[i] + mask) ^ mask; - /*Repeat iteration 13.*/ - if (i >= 12) break; - z *= 2; - } - for (; i < 32; i++) { - mask = -(z < 0); - w += ((w >> (i + 1)) + mask) ^ mask; - z = (z - ((OD_ATANH_LOG2[i] + mask) ^ mask)) * 2; - } - wlo = 0; - /*Skip the remaining iterations unless we really require that much - precision. - We could have bailed out earlier for smaller iparts, but that would - require initializing w from a table, as the limit doesn't converge to - 61-bit precision until n=30.*/ - if (ipart > 30) { - /*For these iterations, we just update the low bits, as the high bits - can't possibly be affected. - OD_ATANH_LOG2 has also converged (it actually did so one iteration - earlier, but that's no reason for an extra special case).*/ - for (;; i++) { - mask = -(z < 0); - wlo += ((w >> i) + mask) ^ mask; - z -= (OD_ATANH_LOG2[31] + mask) ^ mask; - /*Repeat iteration 40.*/ - if (i >= 39) break; - z <<= 1; - } - for (; i < 61; i++) { - mask = -(z < 0); - wlo += ((w >> i) + mask) ^ mask; - z = (z - ((OD_ATANH_LOG2[31] + mask) ^ mask)) << 1; - } - } - w = (w << 1) + wlo; - } else { - w = (int64_t)1 << 62; - } - if (ipart < 62) { - w = ((w >> (61 - ipart)) + 1) >> 1; - } - return w; -} - -/*Computes the binary log of w - input: a 64-bit integer in Q0 (no fraction) - output: a 64-bit log in Q57 */ -static int64_t od_blog64(int64_t w) { - int64_t z; - int ipart; - if (w <= 0) return -1; - ipart = od_ilog64(w) - 1; - if (ipart > 61) { - w >>= ipart - 61; - } else { - w <<= 61 - ipart; - } - z = 0; - if (w & (w - 1)) { - int64_t x; - int64_t y; - int64_t u; - int64_t mask; - int i; - /*C doesn't give us 64x64->128 muls, so we use CORDIC. - This is not particularly fast, but it's not being used in time-critical - code; it is very accurate.*/ - /*z is the fractional part of the log in Q61 format.*/ - /*x and y are the cosh() and sinh(), respectively, in Q61 format. 
- We are computing z = 2*atanh(y/x) = 2*atanh((w - 1)/(w + 1)).*/ - x = w + ((int64_t)1 << 61); - y = w - ((int64_t)1 << 61); - for (i = 0; i < 4; i++) { - mask = -(y < 0); - z += ((OD_ATANH_LOG2[i] >> i) + mask) ^ mask; - u = x >> (i + 1); - x -= ((y >> (i + 1)) + mask) ^ mask; - y -= (u + mask) ^ mask; - } - /*Repeat iteration 4.*/ - for (i--; i < 13; i++) { - mask = -(y < 0); - z += ((OD_ATANH_LOG2[i] >> i) + mask) ^ mask; - u = x >> (i + 1); - x -= ((y >> (i + 1)) + mask) ^ mask; - y -= (u + mask) ^ mask; - } - /*Repeat iteration 13.*/ - for (i--; i < 32; i++) { - mask = -(y < 0); - z += ((OD_ATANH_LOG2[i] >> i) + mask) ^ mask; - u = x >> (i + 1); - x -= ((y >> (i + 1)) + mask) ^ mask; - y -= (u + mask) ^ mask; - } - /*OD_ATANH_LOG2 has converged.*/ - for (; i < 40; i++) { - mask = -(y < 0); - z += ((OD_ATANH_LOG2[31] >> i) + mask) ^ mask; - u = x >> (i + 1); - x -= ((y >> (i + 1)) + mask) ^ mask; - y -= (u + mask) ^ mask; - } - /*Repeat iteration 40.*/ - for (i--; i < 62; i++) { - mask = -(y < 0); - z += ((OD_ATANH_LOG2[31] >> i) + mask) ^ mask; - u = x >> (i + 1); - x -= ((y >> (i + 1)) + mask) ^ mask; - y -= (u + mask) ^ mask; - } - z = (z + 8) >> 4; - } - return OD_Q57(ipart) + z; -} - -/*Convenience function converts Q57 value to a clamped 32-bit Q24 value - in: input in Q57 format. - Return: same number in Q24 */ -static int32_t od_q57_to_q24(int64_t in) { - int64_t ret; - ret = (in + ((int64_t)1 << 32)) >> 33; - /*0x80000000 is automatically converted to unsigned on 32-bit systems. - -0x7FFFFFFF-1 is needed to avoid "promoting" the whole expression to - unsigned.*/ - return (int32_t)OD_CLAMPI(-0x7FFFFFFF - 1, ret, 0x7FFFFFFF); -} - -/*Binary exponential of log_scale with 24-bit fractional precision and - saturation. - log_scale: A binary logarithm in Q57 format. - Return: The binary exponential in Q24 format, saturated to 2**31-1 if - log_scale was too large.*/ -static int32_t od_bexp64_q24(int64_t log_scale) { - if (log_scale < OD_Q57(8)) { - int64_t ret; - ret = od_bexp64(log_scale + OD_Q57(24)); - return ret < 0x7FFFFFFF ? (int32_t)ret : 0x7FFFFFFF; - } - return 0x7FFFFFFF; -} - -/*Re-initialize Bessel filter coefficients with the specified delay. - This does not alter the x/y state, but changes the reaction time of the - filter. - Altering the time constant of a reactive filter without alterning internal - state is something that has to be done carefuly, but our design operates at - high enough delays and with small enough time constant changes to make it - safe.*/ -static void od_iir_bessel2_reinit(od_iir_bessel2 *f, int delay) { - int alpha; - int64_t one48; - int64_t warp; - int64_t k1; - int64_t k2; - int64_t d; - int64_t a; - int64_t ik2; - int64_t b1; - int64_t b2; - /*This borrows some code from an unreleased version of Postfish. 
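od_bexp64 and od_blog64 above are the integer exp2/log2 pair that lets the rate controller evaluate its power-law model without floating point: quantities are carried as Q57 binary logs, and the per-frame-type exponent is Q6. A minimal sketch of the model evaluation exactly as it appears later in this file:

/* Predicted bits for one frame under rate = scale * quantizer^-exp,
   computed wholly in the Q57 log2 domain. log_scale, log_npixels and
   log_quantizer are Q57; exp_q6 is the Q6 exponent, matching the
   "(log_quantizer >> 6) * exp" pattern used below. */
static int64_t predict_bits(int64_t log_scale, int64_t log_npixels,
                            int64_t log_quantizer, int64_t exp_q6) {
  return od_bexp64(log_scale + log_npixels - (log_quantizer >> 6) * exp_q6);
}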
- See the recipe at http://unicorn.us.com/alex/2polefilters.html for details - on deriving the filter coefficients.*/ - /*alpha is Q24*/ - alpha = (1 << 24) / delay; - one48 = (int64_t)1 << 48; - /*warp is 7.12*/ - warp = OD_MAXI(od_warp_alpha(alpha), 1); - /*k1 is 9.12*/ - k1 = 3 * warp; - /*k2 is 16.24.*/ - k2 = k1 * warp; - /*d is 16.15.*/ - d = ((((1 << 12) + k1) << 12) + k2 + 256) >> 9; - /*a is 0.32, since d is larger than both 1.0 and k2.*/ - a = (k2 << 23) / d; - /*ik2 is 25.24.*/ - ik2 = one48 / k2; - /*b1 is Q56; in practice, the integer ranges between -2 and 2.*/ - b1 = 2 * a * (ik2 - (1 << 24)); - /*b2 is Q56; in practice, the integer ranges between -2 and 2.*/ - b2 = (one48 << 8) - ((4 * a) << 24) - b1; - /*All of the filter parameters are Q24.*/ - f->c[0] = (int32_t)((b1 + ((int64_t)1 << 31)) >> 32); - f->c[1] = (int32_t)((b2 + ((int64_t)1 << 31)) >> 32); - f->g = (int32_t)((a + 128) >> 8); -} - -/*Initialize a 2nd order low-pass Bessel filter with the corresponding delay - and initial value. - value is Q24.*/ -static void od_iir_bessel2_init(od_iir_bessel2 *f, int delay, int32_t value) { - od_iir_bessel2_reinit(f, delay); - f->y[1] = f->y[0] = f->x[1] = f->x[0] = value; -} - -static int64_t od_iir_bessel2_update(od_iir_bessel2 *f, int32_t x) { - int64_t c0; - int64_t c1; - int64_t g; - int64_t x0; - int64_t x1; - int64_t y0; - int64_t y1; - int64_t ya; - c0 = f->c[0]; - c1 = f->c[1]; - g = f->g; - x0 = f->x[0]; - x1 = f->x[1]; - y0 = f->y[0]; - y1 = f->y[1]; - ya = ((x + x0 * 2 + x1) * g + y0 * c0 + y1 * c1 + (1 << 23)) >> 24; - f->x[1] = (int32_t)x0; - f->x[0] = x; - f->y[1] = (int32_t)y0; - f->y[0] = (int32_t)ya; - return ya; -} - -static void od_enc_rc_reset(od_rc_state *rc) { - int64_t npixels; - int64_t ibpp; - rc->bits_per_frame = (int64_t)(rc->target_bitrate / rc->framerate); - /*Insane framerates or frame sizes mean insane bitrates. - Let's not get carried away.*/ - if (rc->bits_per_frame > 0x400000000000LL) { - rc->bits_per_frame = (int64_t)0x400000000000LL; - } else { - if (rc->bits_per_frame < 32) { - rc->bits_per_frame = 32; - } - } - rc->reservoir_frame_delay = OD_MAXI(rc->reservoir_frame_delay, 12); - rc->reservoir_max = rc->bits_per_frame * rc->reservoir_frame_delay; - /*Start with a buffer fullness and fullness target of 50% */ - rc->reservoir_target = (rc->reservoir_max + 1) >> 1; - rc->reservoir_fullness = rc->reservoir_target; - /*Pick exponents and initial scales for quantizer selection.*/ - npixels = rc->frame_width * (int64_t)rc->frame_height; - rc->log_npixels = od_blog64(npixels); - ibpp = npixels / rc->bits_per_frame; - /*All of these initial scale/exp values are from Theora, and have not yet - been adapted to Daala, so they're certainly wrong. 
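Using the follower defined above is a three-step affair: initialize with a delay (reaction time in frames) and a Q24 starting value, push one Q24 sample per coded frame, and re-derive the coefficients with od_iir_bessel2_reinit when the delay target changes, which deliberately leaves the x/y state untouched. A minimal sketch with illustrative Q24 values:

/* Smooth a per-frame estimate with the critically damped 2nd-order
   Bessel low-pass, the way the rate controller does below. */
static int64_t follower_demo(void) {
  od_iir_bessel2 f;
  od_iir_bessel2_init(&f, 4, 1 << 24);        /* ~4-frame delay, start at 1.0 */
  (void)od_iir_bessel2_update(&f, 3 << 23);   /* feed a 1.5 (Q24) sample */
  od_iir_bessel2_reinit(&f, 10);              /* lengthen the time constant */
  return od_iir_bessel2_update(&f, 1 << 23);  /* feed 0.5; smoothed Q24 out */
}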
- The B-frame values especially are simply copies of the P-frame values.*/ - if (ibpp < 1) { - rc->exp[OD_I_FRAME] = 59; - rc->log_scale[OD_I_FRAME] = od_blog64(1997) - OD_Q57(OD_COEFF_SHIFT); - } else if (ibpp < 2) { - rc->exp[OD_I_FRAME] = 55; - rc->log_scale[OD_I_FRAME] = od_blog64(1604) - OD_Q57(OD_COEFF_SHIFT); - } else { - rc->exp[OD_I_FRAME] = 48; - rc->log_scale[OD_I_FRAME] = od_blog64(834) - OD_Q57(OD_COEFF_SHIFT); - } - if (ibpp < 4) { - rc->exp[OD_P_FRAME] = 100; - rc->log_scale[OD_P_FRAME] = od_blog64(2249) - OD_Q57(OD_COEFF_SHIFT); - } else if (ibpp < 8) { - rc->exp[OD_P_FRAME] = 95; - rc->log_scale[OD_P_FRAME] = od_blog64(1751) - OD_Q57(OD_COEFF_SHIFT); - } else { - rc->exp[OD_P_FRAME] = 73; - rc->log_scale[OD_P_FRAME] = od_blog64(1260) - OD_Q57(OD_COEFF_SHIFT); - } - /*Golden P-frames both use the same log_scale and exp modeling - values as regular P-frames and the same scale follower. - For convenience in the rate calculation code, we maintain a copy of - the scale and exp values in OD_GOLDEN_P_FRAME.*/ - rc->exp[OD_GOLDEN_P_FRAME] = rc->exp[OD_P_FRAME]; - rc->log_scale[OD_GOLDEN_P_FRAME] = rc->log_scale[OD_P_FRAME]; - rc->exp[OD_ALTREF_P_FRAME] = rc->exp[OD_P_FRAME]; - rc->log_scale[OD_ALTREF_P_FRAME] = rc->log_scale[OD_P_FRAME]; - /*We clamp the actual I and B frame delays to a minimum of 10 to work within - the range of values where later incrementing the delay works as designed. - 10 is not an exact choice, but rather a good working trade-off.*/ - rc->inter_p_delay = 10; - rc->inter_delay_target = rc->reservoir_frame_delay >> 1; - memset(rc->frame_count, 0, sizeof(rc->frame_count)); - /*Drop-frame tracking is concerned with more than just the basic three frame - types. - It needs to track boosted and cut subtypes (of which there is only one - right now, OD_GOLDEN_P_FRAME). */ - rc->prev_drop_count[OD_I_FRAME] = 0; - rc->log_drop_scale[OD_I_FRAME] = OD_Q57(0); - rc->prev_drop_count[OD_P_FRAME] = 0; - rc->log_drop_scale[OD_P_FRAME] = OD_Q57(0); - rc->prev_drop_count[OD_GOLDEN_P_FRAME] = 0; - rc->log_drop_scale[OD_GOLDEN_P_FRAME] = OD_Q57(0); - rc->prev_drop_count[OD_ALTREF_P_FRAME] = 0; - rc->log_drop_scale[OD_ALTREF_P_FRAME] = OD_Q57(0); - /*Set up second order followers, initialized according to corresponding - time constants.*/ - od_iir_bessel2_init(&rc->scalefilter[OD_I_FRAME], 4, - od_q57_to_q24(rc->log_scale[OD_I_FRAME])); - od_iir_bessel2_init(&rc->scalefilter[OD_P_FRAME], rc->inter_p_delay, - od_q57_to_q24(rc->log_scale[OD_P_FRAME])); - od_iir_bessel2_init(&rc->vfrfilter[OD_I_FRAME], 4, - od_bexp64_q24(rc->log_drop_scale[OD_I_FRAME])); - od_iir_bessel2_init(&rc->vfrfilter[OD_P_FRAME], 4, - od_bexp64_q24(rc->log_drop_scale[OD_P_FRAME])); - od_iir_bessel2_init(&rc->vfrfilter[OD_GOLDEN_P_FRAME], 4, - od_bexp64_q24(rc->log_drop_scale[OD_GOLDEN_P_FRAME])); - od_iir_bessel2_init(&rc->vfrfilter[OD_ALTREF_P_FRAME], 4, - od_bexp64_q24(rc->log_drop_scale[OD_ALTREF_P_FRAME])); -} - -int od_enc_rc_resize(od_rc_state *rc) { - /*If encoding has not yet begun, reset the buffer state.*/ - if (rc->cur_frame == 0) { - od_enc_rc_reset(rc); - } else { - int idt; - /*Otherwise, update the bounds on the buffer, but not the current - fullness.*/ - rc->bits_per_frame = (int64_t)(rc->target_bitrate / rc->framerate); - /*Insane framerates or frame sizes mean insane bitrates. 
- Let's not get carried away.*/ - if (rc->bits_per_frame > 0x400000000000LL) { - rc->bits_per_frame = (int64_t)0x400000000000LL; - } else { - if (rc->bits_per_frame < 32) { - rc->bits_per_frame = 32; - } - } - rc->reservoir_frame_delay = OD_MAXI(rc->reservoir_frame_delay, 12); - rc->reservoir_max = rc->bits_per_frame * rc->reservoir_frame_delay; - rc->reservoir_target = - ((rc->reservoir_max + 1) >> 1) + - ((rc->bits_per_frame + 2) >> 2) * - OD_MINI(rc->keyframe_rate, rc->reservoir_frame_delay); - /*Update the INTER-frame scale filter delay. - We jump to it immediately if we've already seen enough frames; otherwise - it is simply set as the new target.*/ - rc->inter_delay_target = idt = OD_MAXI(rc->reservoir_frame_delay >> 1, 10); - if (idt < OD_MINI(rc->inter_p_delay, rc->frame_count[OD_P_FRAME])) { - od_iir_bessel2_init(&rc->scalefilter[OD_P_FRAME], idt, - rc->scalefilter[OD_P_FRAME].y[0]); - rc->inter_p_delay = idt; - } - } - return 0; -} - -int od_enc_rc_init(od_rc_state *rc, int64_t bitrate, int delay_ms) { - if (rc->framerate <= 0) return 1; - if (rc->target_bitrate > 0) { - /*State has already been initialized; rather than reinitialize, - adjust the buffering for the new target rate. */ - rc->target_bitrate = bitrate; - return od_enc_rc_resize(rc); - } - rc->target_quantizer = 0; - rc->target_bitrate = bitrate; - rc->rate_bias = 0; - if (bitrate > 0) { - /* The buffer size is clamped between [12, 256], this interval is short - enough to - allow reaction, but long enough to allow looking into the next GOP - (avoiding - the case where the last frames before an I-frame get starved). - The 12 frame minimum gives us some chance to distribute bit estimation - errors in the worst case. The 256 frame maximum means we'll require 8-10 - seconds - of pre-buffering at 24-30 fps, which is not unreasonable.*/ - rc->reservoir_frame_delay = - (int)OD_MINI((delay_ms / 1000) * rc->framerate, 256); - rc->drop_frames = 1; - rc->cap_overflow = 1; - rc->cap_underflow = 0; - rc->twopass_state = 0; - od_enc_rc_reset(rc); - } - return 0; -} - -/*Scale the number of frames by the number of expected drops/duplicates.*/ -static int od_rc_scale_drop(od_rc_state *rc, int frame_type, int nframes) { - if (rc->prev_drop_count[frame_type] > 0 || - rc->log_drop_scale[frame_type] > OD_Q57(0)) { - int64_t dup_scale; - dup_scale = od_bexp64(((rc->log_drop_scale[frame_type] + - od_blog64(rc->prev_drop_count[frame_type] + 1)) >> - 1) + - OD_Q57(8)); - if (dup_scale < nframes << 8) { - int dup_scalei; - dup_scalei = (int)dup_scale; - if (dup_scalei > 0) { - nframes = ((nframes << 8) + dup_scalei - 1) / dup_scalei; - } - } else { - nframes = !!nframes; - } - } - return nframes; -} - -/*Closed form version of frame determination code. - Used by rate control to predict frame types and subtypes into the future. - No side effects, may be called any number of times. - Note that it ignores end-of-file conditions; one-pass planning *should* - ignore end-of-file. */ -int od_frame_type(od_rc_state *rc, int64_t coding_frame_count, int *is_golden, - int *is_altref, int64_t *ip_count) { - int frame_type; - if (coding_frame_count == 0) { - *is_golden = 1; - *is_altref = 1; - *ip_count = 0; - frame_type = OD_I_FRAME; - } else { - int keyrate = rc->keyframe_rate; - if (rc->closed_gop) { - int ip_per_gop; - int gop_n; - int gop_i; - ip_per_gop = (keyrate - 1) / 2; - gop_n = coding_frame_count / keyrate; - gop_i = coding_frame_count - gop_n * keyrate; - *ip_count = gop_n * ip_per_gop + (gop_i > 0) + (gop_i - 1); - frame_type = gop_i == 0 ? 
OD_I_FRAME : OD_P_FRAME; - } else { - int ip_per_gop; - int gop_n; - int gop_i; - ip_per_gop = (keyrate); - gop_n = (coding_frame_count - 1) / keyrate; - gop_i = coding_frame_count - gop_n * keyrate - 1; - *ip_count = (coding_frame_count > 0) + gop_n * ip_per_gop + (gop_i); - frame_type = gop_i / 1 < ip_per_gop - 1 ? OD_P_FRAME : OD_I_FRAME; - } - } - *is_golden = - (*ip_count % rc->goldenframe_rate) == 0 || frame_type == OD_I_FRAME; - *is_altref = (*ip_count % rc->altref_rate) == 0 || frame_type == OD_I_FRAME; - return frame_type; -} - -/*Count frames types forward from the current frame up to but not including - the last I-frame in reservoir_frame_delay. - If reservoir_frame_delay contains no I-frames (or the current frame is the - only I-frame), count all reservoir_frame_delay frames. - Returns the number of frames counted. - Right now, this implementation is simple, brute-force, and expensive. - It is also easy to understand and debug. - TODO: replace with a virtual FIFO that keeps running totals as - repeating the counting over-and-over will have a performance impact on - whole-file 2pass usage.*/ -static int frame_type_count(od_rc_state *rc, int nframes[OD_FRAME_NSUBTYPES]) { - int i; - int j; - int acc[OD_FRAME_NSUBTYPES]; - int count; - int reservoir_frames; - int reservoir_frame_delay; - memset(nframes, 0, OD_FRAME_NSUBTYPES * sizeof(*nframes)); - memset(acc, 0, sizeof(acc)); - count = 0; - reservoir_frames = 0; -#if 1 - /*Go ahead and count past end-of-stream. - We won't nail the exact bitrate on short files that end with a partial - GOP, but we also won't [potentially] destroy the quality of the last few - frames in that same case when we suddenly find out the stream is ending - before the original planning horizon.*/ - reservoir_frame_delay = rc->reservoir_frame_delay; -#else - /*Don't count past the end of the stream (once we know where end-of-stream - is).*/ - reservoir_frame_delay = - rc->end_of_input ? rc->input_size + 1 : rc->reservoir_frame_delay; -#endif - for (i = 0; i < reservoir_frame_delay; i++) { - int frame_type; - int is_golden; - int is_altref; - int64_t dummy; - frame_type = - od_frame_type(rc, rc->cur_frame + i, &is_golden, &is_altref, &dummy); - switch (frame_type) { - case OD_I_FRAME: { - for (j = 0; j < OD_FRAME_NSUBTYPES; j++) nframes[j] += acc[j]; - reservoir_frames += count; - memset(acc, 0, sizeof(acc)); - acc[OD_I_FRAME] = 1; - count = 1; - break; - } - case OD_P_FRAME: { - if (is_golden) { - ++acc[OD_GOLDEN_P_FRAME]; - ++count; - } else if (is_altref) { - ++acc[OD_ALTREF_P_FRAME]; - ++count; - } else { - ++acc[OD_P_FRAME]; - ++count; - } - break; - } - } - } - /*If there were no I-frames at all, or only the first frame was an I-frame, - the accumulators never flushed and still contain the counts for the - entire buffer. - In both these cases, we return these counts. 
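Because od_frame_type is a pure function of the coding frame index, the planner can replay the upcoming schedule as often as it likes; frame_type_count above does exactly that across the reservoir window. The same pattern in isolation, as a sketch:

/* Enumerate the predicted (sub)types of the next n frames. */
static void preview_schedule(od_rc_state *rc, int n) {
  int i;
  for (i = 0; i < n; i++) {
    int is_golden;
    int is_altref;
    int64_t ip_count;
    int type = od_frame_type(rc, rc->cur_frame + i, &is_golden, &is_altref,
                             &ip_count);
    /* type is OD_I_FRAME or OD_P_FRAME; the golden/altref boosts that
       frame_type_count buckets separately arrive as flags. */
    (void)type;
  }
}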
- Otherwise, we discard what remains in the accumulators as they contain - the counts from and past the last I-frame.*/ - if (reservoir_frames == 0) { - for (i = 0; i < OD_FRAME_NSUBTYPES; i++) nframes[i] = acc[i]; - reservoir_frames += count; - } - return reservoir_frames; -} - -static int convert_to_ac_quant(int q, int bit_depth) { - return lrint(av1_convert_qindex_to_q(q, bit_depth)); -} - -int od_enc_rc_select_quantizers_and_lambdas(od_rc_state *rc, - int is_golden_frame, - int is_altref_frame, int frame_type, - int *bottom_idx, int *top_idx) { - int frame_subtype; - int64_t log_cur_scale; - int lossy_quantizer_min; - int lossy_quantizer_max; - double mqp_i = OD_MQP_I; - double mqp_p = OD_MQP_P; - double mqp_gp = OD_MQP_GP; - double mqp_ap = OD_MQP_AP; - int reservoir_frames; - int nframes[OD_FRAME_NSUBTYPES]; - int32_t mqp_Q12[OD_FRAME_NSUBTYPES]; - int64_t dqp_Q45[OD_FRAME_NSUBTYPES]; - /*Verify the closed-form frame type determination code matches what the - input queue set.*/ - /*One pseudo-non-closed-form caveat: - Once we've seen end-of-input, the batched frame determination code - suppresses the last open-GOP's I-frame (since it would only be - useful for the next GOP, which doesn't exist). - Thus, don't check one the input queue is drained.*/ - if (!rc->end_of_input) { - int closed_form_type; - int closed_form_golden; - int closed_form_altref; - int64_t closed_form_cur_frame; - closed_form_type = - od_frame_type(rc, rc->cur_frame, &closed_form_golden, - &closed_form_altref, &closed_form_cur_frame); - OD_UNUSED(closed_form_type); - OD_UNUSED(is_altref_frame); - assert(closed_form_type == frame_type); - assert(closed_form_cur_frame == rc->cur_frame); - assert(closed_form_altref == is_altref_frame); - assert(closed_form_golden == is_golden_frame); - } - - log_cur_scale = (int64_t)rc->scalefilter[frame_type].y[0] << 33; - - /*Count the various types and classes of frames.*/ - reservoir_frames = frame_type_count(rc, nframes); - nframes[OD_I_FRAME] = od_rc_scale_drop(rc, OD_I_FRAME, nframes[OD_I_FRAME]); - nframes[OD_P_FRAME] = od_rc_scale_drop(rc, OD_P_FRAME, nframes[OD_P_FRAME]); - nframes[OD_GOLDEN_P_FRAME] = - od_rc_scale_drop(rc, OD_GOLDEN_P_FRAME, nframes[OD_GOLDEN_P_FRAME]); - nframes[OD_ALTREF_P_FRAME] = - od_rc_scale_drop(rc, OD_ALTREF_P_FRAME, nframes[OD_ALTREF_P_FRAME]); - - switch (rc->twopass_state) { - default: break; - case 1: { - /*Pass 1 mode: use a fixed qi value.*/ - return rc->firstpass_quant; - } break; - case 2: { - int i; - int64_t scale_sum[OD_FRAME_NSUBTYPES]; - int qti; - /*Pass 2 mode: we know exactly how much of each frame type there is in - the current buffer window, and have estimates for the scales.*/ - for (i = 0; i < OD_FRAME_NSUBTYPES; i++) { - nframes[i] = rc->nframes[i]; - nframes[i] = rc->nframes[i]; - scale_sum[i] = rc->scale_sum[i]; - } - /*If we're not using the same frame type as in pass 1 (because someone - changed the keyframe interval), remove that scale estimate. - We'll add in a replacement for the correct frame type below.*/ - qti = rc->cur_metrics.frame_type; - if (qti != frame_type) { - nframes[qti]--; - scale_sum[qti] -= od_bexp64_q24(rc->cur_metrics.log_scale); - } - /*Compute log_scale estimates for each frame type from the pass-1 scales - we measured in the current window.*/ - for (qti = 0; qti < OD_FRAME_NSUBTYPES; qti++) { - rc->log_scale[qti] = nframes[qti] > 0 - ? 
od_blog64(scale_sum[qti]) - - od_blog64(nframes[qti]) - OD_Q57(24) - : -rc->log_npixels; - } - /*If we're not using the same frame type as in pass 1, add a scale - estimate for the corresponding frame using the current low-pass - filter value. - This is mostly to ensure we have a valid estimate even when pass 1 had - no frames of this type in the buffer window. - TODO: We could also plan ahead and figure out how many keyframes we'll - be forced to add in the current buffer window.*/ - qti = rc->cur_metrics.frame_type; - if (qti != frame_type) { - int64_t scale; - scale = rc->log_scale[frame_type] < OD_Q57(23) - ? od_bexp64(rc->log_scale[frame_type] + OD_Q57(24)) - : 0x7FFFFFFFFFFFLL; - scale *= nframes[frame_type]; - nframes[frame_type]++; - scale += od_bexp64_q24(log_cur_scale >> 33); - rc->log_scale[frame_type] = - od_blog64(scale) - od_blog64(nframes[qti]) - OD_Q57(24); - } else { - log_cur_scale = (int64_t)rc->cur_metrics.log_scale << 33; - } - } break; - } - - /*Quantizer selection sticks to the codable, lossy portion of the quantizer - range.*/ - lossy_quantizer_min = convert_to_ac_quant(rc->minq, rc->bit_depth); - lossy_quantizer_max = convert_to_ac_quant(rc->maxq, rc->bit_depth); - frame_subtype = frame_type; - /*Stash quantizer modulation by frame type.*/ - mqp_Q12[OD_I_FRAME] = OD_F_Q12(mqp_i); - mqp_Q12[OD_P_FRAME] = OD_F_Q12(mqp_p); - mqp_Q12[OD_GOLDEN_P_FRAME] = OD_F_Q12(mqp_gp); - mqp_Q12[OD_ALTREF_P_FRAME] = OD_F_Q12(mqp_ap); - dqp_Q45[OD_I_FRAME] = OD_F_Q45(OD_DQP_I); - dqp_Q45[OD_P_FRAME] = OD_F_Q45(OD_DQP_P); - dqp_Q45[OD_GOLDEN_P_FRAME] = OD_F_Q45(OD_DQP_GP); - dqp_Q45[OD_ALTREF_P_FRAME] = OD_F_Q45(OD_DQP_AP); - /*Is rate control active?*/ - if (rc->target_bitrate <= 0) { - /*Rate control is not active; derive quantizer directly from - quality parameter and frame type. */ - /*Can't use the OD_LOSSLESS macro, as it uses state.quantizer to intuit, - and we've not set it yet.*/ - if (rc->quality == 0) { - /*Lossless coding requested.*/ - rc->base_quantizer = 0; - rc->target_quantizer = 0; - } else { - int64_t log_quantizer; - - /* Adjust the modulation constants using the last frame's quantizer. */ - double mqp_delta = (255 - rc->target_quantizer) / 2000.0f; - mqp_i -= mqp_delta; - mqp_p += mqp_delta; - mqp_gp -= mqp_delta; - mqp_Q12[OD_I_FRAME] = OD_F_Q12(mqp_i); - mqp_Q12[OD_P_FRAME] = OD_F_Q12(mqp_p); - mqp_Q12[OD_GOLDEN_P_FRAME] = OD_F_Q12(mqp_gp); - mqp_Q12[OD_ALTREF_P_FRAME] = OD_F_Q12(mqp_ap); - - if (rc->quality == -1) { - /*A quality of -1 means quality was unset; use a default.*/ - rc->base_quantizer = convert_to_ac_quant(10, rc->bit_depth); - } else { - rc->base_quantizer = convert_to_ac_quant(rc->quality, rc->bit_depth); - } - - if (rc->periodic_boosts && !is_golden_frame) { - int pattern_rate = (rc->goldenframe_rate >> 1); - int dist_to_golden = rc->cur_frame % pattern_rate; - int dist_away_golden = pattern_rate - dist_to_golden; - int boost = dist_to_golden; - if (dist_away_golden > dist_to_golden) boost = dist_away_golden; - boost -= pattern_rate; - boost *= (rc->base_quantizer) / OD_PERIODIC_BOOST_DIV; - rc->base_quantizer = rc->base_quantizer + boost; - } - - /*As originally written, qp modulation is applied to the coded quantizer. - Because we now have and use a more precise target quantizer for various - calculation, that needs to be modulated as well. - Calculate what is, effectively, a fractional coded quantizer. 
*/ - /*Get the log2 quantizer in Q57 (normalized for coefficient shift).*/ - log_quantizer = od_blog64(rc->base_quantizer) - OD_Q57(OD_COEFF_SHIFT); - /*log_quantizer to Q21.*/ - log_quantizer >>= 36; - /*scale log quantizer, result is Q33.*/ - log_quantizer *= OD_LOG_QUANTIZER_BASE_Q12; - /*Add Q33 offset to Q33 log_quantizer.*/ - log_quantizer += OD_LOG_QUANTIZER_OFFSET_Q45 >> 12; - /*Modulate quantizer according to frame type; result is Q45.*/ - log_quantizer *= mqp_Q12[frame_subtype]; - /*Add Q45 boost/cut to Q45 fractional coded quantizer.*/ - log_quantizer += dqp_Q45[frame_subtype]; - /*Back to log2 quantizer in Q57.*/ - log_quantizer = (log_quantizer - OD_LOG_QUANTIZER_OFFSET_Q45) * - OD_LOG_QUANTIZER_EXP_Q12 + - OD_Q57(OD_COEFF_SHIFT); - /*Convert Q57 log2 quantizer to unclamped linear target quantizer value.*/ - rc->target_quantizer = od_bexp64(log_quantizer); - } - } else { - int clamp; - int64_t rate_bias; - int64_t rate_total; - int base_quantizer; - int64_t log_quantizer; - int qlo; - int qhi; - int i; - /*We clamp the allowed amount of qi change (after initialization).*/ - clamp = rc->cur_frame > 0; - /*Figure out how to re-distribute bits so that we hit our fullness target - before the last keyframe in our current buffer window (after the current - frame), or the end of the buffer window, whichever comes first.*/ - /*Single pass only right now.*/ - /*If we've been missing our target, add a penalty term.*/ - rate_bias = (rc->rate_bias / (rc->cur_frame + 1000)) * reservoir_frames; - /*rate_total is the total bits available over the next - reservoir_frames frames.*/ - rate_total = rc->reservoir_fullness - rc->reservoir_target + rate_bias + - reservoir_frames * rc->bits_per_frame; - /*Find a target quantizer that meets our rate target for the specific mix - of frame types we'll have over the next frame_delay frames. - We model the rate<->quantizer relationship as: - rate = scale*(quantizer**-exp) - In this case, we have our desired rate, an exponent selected in setup, - and a scale that's been measured over our frame history, so we're - solving for the quantizer. - Exponentiation with arbitrary exponents is expensive, so we work in - the binary log domain (binary exp and log aren't too bad): - rate = e2(log2_scale - log2_quantizer * exp) - There's no easy closed form solution, so we bisection search for it.*/ - /*We do not currently allow rate control to select lossless encoding.*/ - qlo = 1; - /*If there's a quality specified, it's used to select the - coarsest base quantizer we can select. 
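Stripped of its fixed-point staging (Q21, then Q33, then Q45, then Q57), the modulation chain above computes a "fractional coded quantizer", scales and offsets it per frame type, and maps the result back to a linear quantizer. A floating-point restatement as a sketch, using the 6.307/6.235 constants documented in ratectrl_xiph.h further down; the names here are illustrative:

#include <math.h>
/* fcq ~= log2(q / (1 << coeff_shift)) * 6.307 + 6.235; the boost/cut is
   applied in that domain, then inverted back to a linear quantizer. */
static double modulate_quantizer(double q, int coeff_shift, double mqp,
                                 double dqp) {
  double fcq = log2(q / (1 << coeff_shift)) * 6.307 + 6.235;
  double fcq_mod = fcq * mqp + dqp; /* per-frame-type modulation */
  return (1 << coeff_shift) * exp2((fcq_mod - 6.235) / 6.307);
}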
- Otherwise we can use up to and including the coarsest codable - quantizer.*/ - if (rc->quality > 0) - qhi = convert_to_ac_quant(rc->quality, rc->bit_depth); - else - qhi = lossy_quantizer_max; - base_quantizer = (qlo + qhi) >> 1; - while (qlo < qhi) { - volatile int64_t log_base_quantizer; - int64_t diff; - int64_t bits; - /*Count bits contributed by each frame type using the model.*/ - bits = 0; - log_base_quantizer = od_blog64(base_quantizer); - for (i = 0; i < OD_FRAME_NSUBTYPES; i++) { - /*Modulate base quantizer by frame type.*/ - /*Get the log2 quantizer in Q57 (normalized for coefficient shift).*/ - log_quantizer = log_base_quantizer - OD_Q57(OD_COEFF_SHIFT); - /*log_quantizer to Q21.*/ - log_quantizer >>= 36; - /*scale log quantizer, result is Q33.*/ - log_quantizer *= OD_LOG_QUANTIZER_BASE_Q12; - /*Add Q33 offset to Q33 log_quantizer.*/ - log_quantizer += OD_LOG_QUANTIZER_OFFSET_Q45 >> 12; - /*Modulate quantizer according to frame type; result is Q45.*/ - log_quantizer *= mqp_Q12[i]; - /*Add Q45 boost/cut to Q45 fractional coded quantizer.*/ - log_quantizer += dqp_Q45[i]; - /*Back to log2 quantizer in Q57.*/ - log_quantizer = (log_quantizer - OD_LOG_QUANTIZER_OFFSET_Q45) * - OD_LOG_QUANTIZER_EXP_Q12 + - OD_Q57(OD_COEFF_SHIFT); - /*Clamp modulated quantizer values.*/ - log_quantizer = OD_CLAMPI(od_blog64(lossy_quantizer_min), log_quantizer, - od_blog64(lossy_quantizer_max)); - /* All the fields here are Q57 except for the exponent which is Q6.*/ - bits += nframes[i] * od_bexp64(rc->log_scale[i] + rc->log_npixels - - (log_quantizer >> 6) * rc->exp[i]); - } - diff = bits - rate_total; - if (diff > 0) { - qlo = base_quantizer + 1; - } else if (diff < 0) { - qhi = base_quantizer - 1; - } else { - break; - } - base_quantizer = (qlo + qhi) >> 1; - } - /*If this was not one of the initial frames, limit the change in base - quantizer to within [0.8*Q,1.2*Q], where Q is the previous frame's - base quantizer.*/ - if (clamp) { - base_quantizer = OD_CLAMPI((rc->base_quantizer * 0x0CCCD + 0x8000) >> 16, - base_quantizer, - (rc->base_quantizer * 0x13333 + 0x8000) >> 16); - } - /*Modulate chosen base quantizer to produce target quantizer.*/ - log_quantizer = od_blog64(base_quantizer); - /*Get the log2 quantizer in Q57 (normalized for coefficient shift).*/ - log_quantizer -= OD_Q57(OD_COEFF_SHIFT); - /*log_quantizer to Q21.*/ - log_quantizer >>= 36; - /*scale log quantizer, result is Q33.*/ - log_quantizer *= OD_LOG_QUANTIZER_BASE_Q12; - /*Add Q33 offset to Q33 log_quantizer.*/ - log_quantizer += OD_LOG_QUANTIZER_OFFSET_Q45 >> 12; - /*Modulate quantizer according to frame type; result is Q45.*/ - log_quantizer *= mqp_Q12[frame_subtype]; - /*Add Q45 boost/cut to Q45 fractional coded quantizer.*/ - log_quantizer += dqp_Q45[frame_subtype]; - /*Back to log2 quantizer in Q57.*/ - log_quantizer = (log_quantizer - OD_LOG_QUANTIZER_OFFSET_Q45) * - OD_LOG_QUANTIZER_EXP_Q12 + - OD_Q57(OD_COEFF_SHIFT); - /*Clamp modulated quantizer values.*/ - log_quantizer = OD_CLAMPI(od_blog64(lossy_quantizer_min), log_quantizer, - od_blog64(lossy_quantizer_max)); - /*The above allocation looks only at the total rate we'll accumulate in - the next reservoir_frame_delay frames. - However we could overflow the bit reservoir on the very next frame, so - check for that here if we're not using a soft target.*/ - if (rc->cap_overflow) { - int64_t margin; - int64_t soft_limit; - int64_t log_soft_limit; - int64_t log_scale_pixels; - int64_t exp; - int64_t log_qexp; - /*Allow 3% of the buffer for prediction error. 
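Because each frame type contributes scale_i * q^-exp_i with its own modulation, there is no closed form for the base quantizer, so the loop above bisects. The same search in a simplified floating-point form, with the fixed-point staging and per-type modulation elided; all names are illustrative:

#include <math.h>
/* Bisect q in [qlo, qhi] so the modeled total just meets rate_total. */
static int solve_base_quantizer(const double *scale, const double *exp_q,
                                const int *nframes, int ntypes,
                                double rate_total, int qlo, int qhi) {
  int q = (qlo + qhi) >> 1;
  while (qlo < qhi) {
    double bits = 0;
    int i;
    for (i = 0; i < ntypes; i++)
      bits += nframes[i] * scale[i] * pow((double)q, -exp_q[i]);
    if (bits > rate_total) {
      qlo = q + 1; /* over budget: move toward coarser quantizers */
    } else if (bits < rate_total) {
      qhi = q - 1; /* under budget: move toward finer quantizers */
    } else {
      break;
    }
    q = (qlo + qhi) >> 1;
  }
  return q;
}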
- This should be plenty, and we don't mind if we go a bit over; we only - want to keep these bits from being completely wasted.*/ - margin = (rc->reservoir_max + 31) >> 5; - /*We want to use at least this many bits next frame.*/ - soft_limit = rc->reservoir_fullness + rc->bits_per_frame - - (rc->reservoir_max - margin); - log_soft_limit = od_blog64(soft_limit); - /*If we're predicting we won't use that many bits...*/ - log_scale_pixels = rc->log_scale[frame_subtype] + rc->log_npixels; - exp = rc->exp[frame_subtype]; - log_qexp = (log_quantizer >> 6) * exp; - if (log_scale_pixels - log_qexp < log_soft_limit) { - /*Scale the adjustment based on how far into the margin we are.*/ - log_qexp += ((log_scale_pixels - log_soft_limit - log_qexp) >> 32) * - (OD_MINI(margin, soft_limit) << 32) / margin; - log_quantizer = (((log_qexp + (exp >> 1)) / exp) << 6); - } - } - /*We just checked we don't overflow the reservoir next frame, now check - we don't underflow and bust the budget (when not using a soft target). - Disabled when a quality bound is set; if we saturate quantizer to the - maximum possible size when we have a limiting max quality, the - resulting lambda can cause strange behavior.*/ - if (rc->quality == -1) { - int64_t exp; - int64_t log_qexp; - int64_t log_scale_pixels; - int64_t log_hard_limit; - /*Compute the maximum number of bits we can use in the next frame. - Allow 50% of the rate for a single frame for prediction error. - This may not be enough for keyframes or sudden changes in - complexity.*/ - log_hard_limit = - od_blog64(rc->reservoir_fullness + (rc->bits_per_frame >> 1)); - /*If we're predicting we'll use more than this...*/ - log_scale_pixels = rc->log_scale[frame_subtype] + rc->log_npixels; - exp = rc->exp[frame_subtype]; - log_qexp = (log_quantizer >> 6) * exp; - if (log_scale_pixels - log_qexp > log_hard_limit) { - /*Force the target to hit our limit exactly.*/ - log_qexp = log_scale_pixels - log_hard_limit; - log_quantizer = (log_qexp + (exp >> 1)) / exp << 6; - /*If that target is unreasonable, oh well; we'll have to drop.*/ - log_quantizer = OD_MAXI(log_quantizer, od_blog64(lossy_quantizer_max)); - } - } - /*Compute a final estimate of the number of bits we plan to use, update - the running rate bias measurement.*/ - { - int64_t log_qexp; - int64_t log_scale_pixels; - log_scale_pixels = rc->log_scale[frame_subtype] + rc->log_npixels; - log_qexp = (log_quantizer >> 6) * rc->exp[frame_subtype]; - rc->rate_bias += od_bexp64(log_scale_pixels - log_qexp); - } - rc->target_quantizer = od_bexp64(log_quantizer); - /*The various cappings and adjustments may have altered the log_quantizer - target significantly. - We can either update the base quantizer to be consistent with the - target or let it track separately. - Theora behavior effectively keeps them consistent, as it regenerates - the effective base quantizer from the target each frame rather than - saving both. - For Daala, it's easier to allow them to track separately. 
- For now, allow them to track separately and see how it behaves.*/ - rc->base_quantizer = base_quantizer; - } - *bottom_idx = lossy_quantizer_min; - *top_idx = lossy_quantizer_max; - rc->target_quantizer = av1_qindex_from_ac( - OD_CLAMPI(lossy_quantizer_min, rc->target_quantizer, lossy_quantizer_max), - rc->bit_depth); - return rc->target_quantizer; -} - -int od_enc_rc_update_state(od_rc_state *rc, int64_t bits, int is_golden_frame, - int is_altref_frame, int frame_type, int droppable) { - int dropped; - dropped = 0; - /*Update rate control only if rate control is active.*/ - if (rc->target_bitrate > 0) { - int64_t log_scale; - int frame_subtype; - frame_subtype = frame_type; - /*Track non-golden and golden P frame drops separately.*/ - if (is_golden_frame && frame_type == OD_P_FRAME) - frame_subtype = OD_GOLDEN_P_FRAME; - else if (is_altref_frame && frame_type == OD_P_FRAME) - frame_subtype = OD_ALTREF_P_FRAME; - if (bits <= 0) { - /*We didn't code any blocks in this frame.*/ - log_scale = OD_Q57(-64); - bits = 0; - ++rc->prev_drop_count[frame_subtype]; - } else { - int64_t log_bits; - int64_t log_qexp; - /*Compute the estimated scale factor for this frame type.*/ - log_bits = od_blog64(bits); - log_qexp = od_blog64(rc->target_quantizer); - log_qexp = (log_qexp >> 6) * (rc->exp[frame_type]); - log_scale = OD_MINI(log_bits - rc->log_npixels + log_qexp, OD_Q57(16)); - } - - switch (rc->twopass_state) { - case 1: { - int golden, altref; - int64_t ipc; - rc->cur_metrics.frame_type = - od_frame_type(rc, rc->cur_frame, &golden, &altref, &ipc); - /*Pass 1 mode: save the metrics for this frame.*/ - rc->cur_metrics.log_scale = od_q57_to_q24(log_scale); - } break; - case 2: { - /*Pass 2 mode:*/ - int m_frame_type = rc->cur_metrics.frame_type; - rc->nframes[m_frame_type]--; - rc->scale_sum[m_frame_type] -= od_bexp64_q24(rc->cur_metrics.log_scale); - } break; - } - - if (bits > 0) { - od_iir_bessel2 *f; - /*If this is the first example of the given frame type we've - seen, we immediately replace the default scale factor guess - with the estimate we just computed using the first frame.*/ - if (rc->frame_count[frame_type] == 0) { - f = rc->scalefilter + frame_type; - f->y[1] = f->y[0] = f->x[1] = f->x[0] = od_q57_to_q24(log_scale); - rc->log_scale[frame_type] = log_scale; - } else { - /*Lengthen the time constant for the inter filters as we collect more - frame statistics, until we reach our target.*/ - if (frame_type != OD_I_FRAME && - rc->inter_p_delay < rc->inter_delay_target && - rc->frame_count[frame_type] >= rc->inter_p_delay) { - od_iir_bessel2_reinit(&rc->scalefilter[frame_type], - ++rc->inter_p_delay); - } - /*Update the low-pass scale filter for this frame type - regardless of whether or not we drop this frame.*/ - rc->log_scale[frame_type] = - od_iir_bessel2_update(rc->scalefilter + frame_type, - od_q57_to_q24(log_scale)) - << 33; - } - /*If this frame busts our budget, it must be dropped.*/ - if (droppable && rc->reservoir_fullness + rc->bits_per_frame < bits) { - ++rc->prev_drop_count[frame_subtype]; - bits = 0; - dropped = 1; - } else { - uint32_t drop_count; - /*Update a low-pass filter to estimate the "real" frame rate taking - drops into account. 
- This is only done if the frame is coded, as it needs the final - count of dropped frames.*/ - drop_count = rc->prev_drop_count[frame_subtype] + 1; - if (drop_count > 0x7F) { - drop_count = 0x7FFFFFFF; - } else { - drop_count <<= 24; - } - rc->log_drop_scale[frame_subtype] = - od_blog64(od_iir_bessel2_update(rc->vfrfilter + frame_subtype, - drop_count)) - - OD_Q57(24); - /*Zero the drop count for this frame. - It will be increased if we drop frames.*/ - rc->prev_drop_count[frame_subtype] = 0; - } - /*Increment the frame count for filter adaptation purposes.*/ - if (!rc->twopass_state) rc->frame_count[frame_type]++; - } - rc->reservoir_fullness += rc->bits_per_frame - bits; - /*If we're too quick filling the buffer and overflow is capped, - that rate is lost forever.*/ - if (rc->cap_overflow && rc->reservoir_fullness > rc->reservoir_max) { - rc->reservoir_fullness = rc->reservoir_max; - } - /*If we're too quick draining the buffer and underflow is capped, - don't try to make up that rate later.*/ - if (rc->cap_underflow && rc->reservoir_fullness < 0) { - rc->reservoir_fullness = 0; - } - /*Adjust the bias for the real bits we've used.*/ - rc->rate_bias -= bits; - } - return dropped; -} - -static INLINE void od_rc_buffer_val(od_rc_state *rc, int64_t val, int bytes) { - while (bytes-- > 0) { - rc->twopass_buffer[rc->twopass_buffer_bytes++] = (uint8_t)(val & 0xFF); - val >>= 8; - } -} - -static INLINE int64_t od_rc_unbuffer_val(od_rc_state *rc, int bytes) { - int64_t ret = 0; - int shift = 0; - while (bytes-- > 0) { - ret |= ((int64_t)rc->twopass_buffer[rc->twopass_buffer_bytes++]) << shift; - shift += 8; - } - return ret; -} - -int od_enc_rc_2pass_out(od_rc_state *rc, struct aom_codec_pkt_list *pkt_list, - int summary) { - int i; - struct aom_codec_cx_pkt pkt; - rc->twopass_buffer = rc->firstpass_buffer; - rc->twopass_buffer_bytes = 0; - if (!rc->twopass_state) { - rc->twopass_state = 1; - for (i = 0; i < OD_FRAME_NSUBTYPES; i++) { - rc->frame_count[i] = 0; - rc->exp[i] = 0; - rc->scale_sum[i] = 0; - } - } - if (summary) { - od_rc_buffer_val(rc, OD_RC_2PASS_MAGIC, 4); - od_rc_buffer_val(rc, OD_RC_2PASS_VERSION, 1); - for (i = 0; i < OD_FRAME_NSUBTYPES; i++) { - od_rc_buffer_val(rc, rc->frame_count[i], 4); - od_rc_buffer_val(rc, rc->exp[i], 4); - od_rc_buffer_val(rc, rc->scale_sum[i], 8); - } - } else { - int frame_type = rc->cur_metrics.frame_type; - rc->scale_sum[frame_type] += od_bexp64_q24(rc->cur_metrics.log_scale); - rc->frame_count[frame_type]++; - od_rc_buffer_val(rc, rc->cur_metrics.frame_type, 1); - od_rc_buffer_val(rc, rc->cur_metrics.log_scale, 4); - } - pkt.data.twopass_stats.buf = rc->firstpass_buffer; - pkt.data.twopass_stats.sz = rc->twopass_buffer_bytes; - pkt.kind = AOM_CODEC_STATS_PKT; - aom_codec_pkt_list_add(pkt_list, &pkt); - return 0; -} - -int od_enc_rc_2pass_in(od_rc_state *rc) { - /* Enable pass 2 mode if this is the first call. 
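The writer above emits one little-endian (frame_type, log_scale) record per frame plus a trailing summary of per-subtype counts, exponents, and scale sums; the reader below locates that summary by seeking to the end of the buffer. Its size arithmetic as a sketch, matching the OD_RC_2PASS_* constants defined in ratectrl_xiph.h further down (OD_FRAME_NSUBTYPES is 4):

#include <stddef.h>
#include <stdint.h>
/* Total pass-1 bytestream size: per-frame packets plus one summary. */
static size_t pass1_stream_size(uint32_t total_frames) {
  const size_t packet_sz = 1 + 4; /* frame_type byte + Q24 log_scale */
  const size_t summary_sz = 4 + 1 + (4 + 4 + 8) * 4; /* magic, version,
                                                         4 subtypes' stats */
  return (size_t)total_frames * packet_sz + summary_sz;
}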
*/ - if (rc->twopass_state == 0) { - uint32_t i, total_frames = 0; - - if (!rc->twopass_allframes_buf || - rc->twopass_allframes_buf_size < OD_RC_2PASS_MIN) - return -1; - - /* Find summary packet at the end */ - rc->twopass_buffer = rc->twopass_allframes_buf; - rc->twopass_buffer += - rc->twopass_allframes_buf_size - OD_RC_2PASS_SUMMARY_SZ; - rc->twopass_buffer_bytes = 0; - - if (od_rc_unbuffer_val(rc, 4) != OD_RC_2PASS_MAGIC) return -1; - if (od_rc_unbuffer_val(rc, 1) != OD_RC_2PASS_VERSION) return -1; - - for (i = 0; i < OD_FRAME_NSUBTYPES; i++) { - rc->frame_count[i] = od_rc_unbuffer_val(rc, 4); - rc->exp[i] = od_rc_unbuffer_val(rc, 4); - rc->scale_sum[i] = od_rc_unbuffer_val(rc, 8); - rc->nframes[i] = rc->frame_count[i]; - total_frames += rc->frame_count[i]; - } - - if (total_frames < 1) return -1; - - if (total_frames * OD_RC_2PASS_PACKET_SZ > rc->twopass_allframes_buf_size) - return -1; - - od_enc_rc_reset(rc); - - /* Everything looks ok */ - rc->twopass_buffer = rc->twopass_allframes_buf; - rc->twopass_state = 2; - rc->twopass_buffer_bytes = 0; - } - - rc->cur_metrics.frame_type = od_rc_unbuffer_val(rc, 1); - rc->cur_metrics.log_scale = od_rc_unbuffer_val(rc, 4); - - return 0; -} diff --git a/third_party/aom/av1/encoder/ratectrl_xiph.h b/third_party/aom/av1/encoder/ratectrl_xiph.h index a4a9052fa..e69de29bb 100644 --- a/third_party/aom/av1/encoder/ratectrl_xiph.h +++ b/third_party/aom/av1/encoder/ratectrl_xiph.h @@ -1,200 +0,0 @@ -/* - * Copyright (c) 2001-2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
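The od_rc_buffer_val()/od_rc_unbuffer_val() pair above stores multi-byte values low byte first, which is why pass 2 can seek to size - OD_RC_2PASS_SUMMARY_SZ and validate the magic and version from the tail of the concatenated stats. A self-contained sketch of the same packing scheme (buf and pos are hypothetical stand-ins for the rc buffer fields):

#include <stddef.h>
#include <stdint.h>

static void pack_le(uint8_t *buf, size_t *pos, int64_t val, int bytes) {
  while (bytes-- > 0) {
    buf[(*pos)++] = (uint8_t)(val & 0xFF);  // low byte first
    val >>= 8;
  }
}

static int64_t unpack_le(const uint8_t *buf, size_t *pos, int bytes) {
  int64_t ret = 0;
  int shift = 0;
  while (bytes-- > 0) {
    ret |= (int64_t)buf[(*pos)++] << shift;  // rebuild from the low byte up
    shift += 8;
  }
  return ret;
}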
- */ - -#if !defined(_ratectrl_xiph_H) -#define _ratectrl_xiph_H (1) - -#include "av1/encoder/ratectrl.h" -#include "aom/internal/aom_codec_internal.h" - -/*Frame types.*/ -#define OD_I_FRAME (0) -#define OD_P_FRAME (1) -#define OD_GOLDEN_P_FRAME (2) -#define OD_ALTREF_P_FRAME (3) - -#define OD_FRAME_NSUBTYPES (OD_ALTREF_P_FRAME + 1) - -/* Periodic boost (in between golden frames) strength - lower is more */ -#define OD_PERIODIC_BOOST_DIV (10) - -/* Constants for frame QP modulation <- tweak these - * Adjusts how the rate control system decides the quantizers per frame - * (sub)type */ -#define OD_MQP_I (0.98) -#define OD_MQP_P (1.06) -#define OD_MQP_GP (0.99) -#define OD_MQP_AP (0.92) -#define OD_DQP_I (-2) -#define OD_DQP_P (0) -#define OD_DQP_GP (-2) -#define OD_DQP_AP (-2) - -/*Fractional_coded_quantizer ~= - log2(quantizer / (1 << OD_COEFF_SHIFT))*6.307 + 6.235*/ -/*Base/scale factor for linear quantizer to fractional coded quantizer - conversion (6.307 * 2^12) */ -#define OD_LOG_QUANTIZER_BASE_Q12 (0x0064EB) -/*Inverse of above scale factor.*/ -#define OD_LOG_QUANTIZER_EXP_Q12 (0x000289) -/*Offset for linear quantizer to fractional coded quantizer - conversion (6.235 * 2^45) */ -#define OD_LOG_QUANTIZER_OFFSET_Q45 (0x0000C7851EB851ECLL) - -#define OD_RC_2PASS_MAGIC (0x53015641) /* [A, V, 1, S] in little endian */ -#define OD_RC_2PASS_SUMMARY_SZ (4 + 1 + (4 + 4 + 8) * OD_FRAME_NSUBTYPES) -#define OD_RC_2PASS_PACKET_SZ (1 + 4) -#define OD_RC_2PASS_MIN (OD_RC_2PASS_PACKET_SZ + OD_RC_2PASS_SUMMARY_SZ) -#define OD_RC_2PASS_VERSION (1) - -/*A 2nd order low-pass Bessel follower. - We use this for rate control because it has fast reaction time, but is - critically damped.*/ -typedef struct od_iir_bessel2 { - int32_t c[2]; - int64_t g; - int32_t x[2]; - int32_t y[2]; -} od_iir_bessel2; - -/* The 2-pass metrics associated with a single frame. */ -typedef struct od_frame_metrics { - /*The log base 2 of the scale factor for this frame in Q24 format.*/ - int64_t log_scale; - /*The frame type from pass 1.*/ - unsigned frame_type : 1; -} od_frame_metrics; - -/*Rate control setup and working state information.*/ -typedef struct od_rc_state { - /* Image format */ - int frame_width; - int frame_height; - int bit_depth; - - /* Framerate */ - double framerate; - /* Keyframe rate */ - int keyframe_rate; - /* Golden frame period */ - int goldenframe_rate; - /* Altref frame period */ - int altref_rate; - /*The target bit-rate in bits per second.*/ - int64_t target_bitrate; - /* Quality level for non-bitrate-targeting */ - int quality; - /* Copied from oxcf->frame_periodic_boost */ - int periodic_boosts; - /* Max Q */ - int maxq; - /* Min Q */ - int minq; - /* Quantizer to use for the first pass */ - int firstpass_quant; - - /* 2-pass metrics */ - od_frame_metrics cur_metrics; - - /* 2-pass state */ - int64_t scale_sum[OD_FRAME_NSUBTYPES]; - int nframes[OD_FRAME_NSUBTYPES]; - - /* 2-pass bytestream reader/writer context */ - uint8_t *twopass_buffer; - int twopass_buffer_bytes; - - /* Pass 1 stats packet storage */ - uint8_t firstpass_buffer[OD_RC_2PASS_SUMMARY_SZ]; - - /* Every state packet from the first pass in a single buffer */ - uint8_t *twopass_allframes_buf; - size_t twopass_allframes_buf_size; - - /* Actual returned quantizer */ - int target_quantizer; - /*The full-precision, unmodulated quantizer upon which - our modulated quantizers are based.*/ - int base_quantizer; - - /* Increments by 1 for each frame. 
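The od_iir_bessel2 follower defined above smooths the noisy per-frame scale and drop estimates; being critically damped, it reacts quickly without ringing. A floating-point biquad sketch of the same idea (the real filter is fixed point, Q24 state with precomputed per-delay coefficients, so the coefficient form and names here are assumptions):

typedef struct {
  double b0, b1, b2;  // feedforward (zeros)
  double a1, a2;      // feedback (poles), a0 normalized to 1
  double x1, x2;      // input history
  double y1, y2;      // output history
} lowpass2;

// Direct form I:
//   y[n] = b0*x[n] + b1*x[n-1] + b2*x[n-2] - a1*y[n-1] - a2*y[n-2]
static double lowpass2_update(lowpass2 *f, double x) {
  const double y = f->b0 * x + f->b1 * f->x1 + f->b2 * f->x2 -
                   f->a1 * f->y1 - f->a2 * f->y2;
  f->x2 = f->x1; f->x1 = x;
  f->y2 = f->y1; f->y1 = y;
  return y;
}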
*/ - int64_t cur_frame; - - /* End of input flag */ - int end_of_input; - /* Closed GOP flag */ - int closed_gop; - /*The number of frames over which to distribute the reservoir usage.*/ - int reservoir_frame_delay; - /*Will we drop frames to meet bitrate target?*/ - unsigned char drop_frames; - /*Do we respect the maximum reservoir fullness?*/ - unsigned char cap_overflow; - /*Can the reservoir go negative?*/ - unsigned char cap_underflow; - /*Two-pass mode state. - 0 => 1-pass encoding. - 1 => 1st pass of 2-pass encoding. - 2 => 2nd pass of 2-pass encoding.*/ - int twopass_state; - /*The log of the number of pixels in a frame in Q57 format.*/ - int64_t log_npixels; - /*The target average bits per frame.*/ - int64_t bits_per_frame; - /*The current bit reservoir fullness (bits available to be used).*/ - int64_t reservoir_fullness; - /*The target buffer fullness. - This is where we'd like to be by the last keyframe the appears in the next - buf_delay frames.*/ - int64_t reservoir_target; - /*The maximum buffer fullness (total size of the buffer).*/ - int64_t reservoir_max; - /*The log of estimated scale factor for the rate model in Q57 format.*/ - int64_t log_scale[OD_FRAME_NSUBTYPES]; - /*The exponent used in the rate model in Q8 format.*/ - unsigned exp[OD_FRAME_NSUBTYPES]; - /*The log of an estimated scale factor used to obtain the real framerate, for - VFR sources or, e.g., 12 fps content doubled to 24 fps, etc.*/ - int64_t log_drop_scale[OD_FRAME_NSUBTYPES]; - /*The total drop count from the previous frame.*/ - uint32_t prev_drop_count[OD_FRAME_NSUBTYPES]; - /*Second-order lowpass filters to track scale and VFR/drops.*/ - od_iir_bessel2 scalefilter[OD_FRAME_NSUBTYPES]; - od_iir_bessel2 vfrfilter[OD_FRAME_NSUBTYPES]; - int frame_count[OD_FRAME_NSUBTYPES]; - int inter_p_delay; - int inter_delay_target; - /*The total accumulated estimation bias.*/ - int64_t rate_bias; -} od_rc_state; - -int od_enc_rc_init(od_rc_state *rc, int64_t bitrate, int delay_ms); - -int od_enc_rc_select_quantizers_and_lambdas(od_rc_state *rc, - int is_golden_frame, - int is_altref_frame, int frame_type, - int *bottom_idx, int *top_idx); - -/* Returns 1 if the frame should be dropped */ -int od_enc_rc_update_state(od_rc_state *rc, int64_t bits, int is_golden_frame, - int is_altref_frame, int frame_type, int droppable); - -int od_frame_type(od_rc_state *rc, int64_t coding_frame_count, int *is_golden, - int *is_altref, int64_t *ip_count); - -int od_enc_rc_resize(od_rc_state *rc); - -int od_enc_rc_2pass_out(od_rc_state *rc, struct aom_codec_pkt_list *pkt_list, - int summary); - -int od_enc_rc_2pass_in(od_rc_state *rc); - -#endif diff --git a/third_party/aom/av1/encoder/rd.c b/third_party/aom/av1/encoder/rd.c index 5dd485334..17f23e5ec 100644 --- a/third_party/aom/av1/encoder/rd.c +++ b/third_party/aom/av1/encoder/rd.c @@ -13,7 +13,7 @@ #include <math.h> #include <stdio.h> -#include "./av1_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" @@ -36,9 +36,7 @@ #include "av1/encoder/encodemb.h" #include "av1/encoder/encodemv.h" #include "av1/encoder/encoder.h" -#if CONFIG_LV_MAP #include "av1/encoder/encodetxb.h" -#endif #include "av1/encoder/mcomp.h" #include "av1/encoder/ratectrl.h" #include "av1/encoder/rd.h" @@ -54,114 +52,96 @@ // This table is used to correct for block size. // The factors here are << 2 (2 = x0.5, 32 = x8 etc). 
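The OD_LOG_QUANTIZER_* constants in the header above are fixed-point encodings of the linear-to-fractional coded-quantizer mapping quoted in its comment. In floating point the conversion is simply (OD_COEFF_SHIFT == 4 is an assumption of this sketch):

#include <math.h>

static double frac_coded_quantizer(double quantizer) {
  // ~= log2(quantizer / (1 << OD_COEFF_SHIFT)) * 6.307 + 6.235
  return log2(quantizer / 16.0) * 6.307 + 6.235;
}

The Q12 base (0x0064EB ~= 6.307 * 4096) and the Q45 offset encode the same two constants for the integer code paths.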
static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES_ALL] = { -#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 - 2, 2, 2, -#endif - 2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32, -#if CONFIG_EXT_PARTITION - 48, 48, 64, -#endif // CONFIG_EXT_PARTITION - 4, 4, 8, 8, 16, 16, -#if CONFIG_EXT_PARTITION - 32, 32 -#endif // CONFIG_EXT_PARTITION + 2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32, 48, 48, 64, 4, 4, 8, 8, 16, 16 }; -#if CONFIG_EXT_TX static const int use_intra_ext_tx_for_txsize[EXT_TX_SETS_INTRA][EXT_TX_SIZES] = { -#if CONFIG_CHROMA_2X2 - { 1, 1, 1, 1, 1 }, // unused - { 0, 1, 1, 0, 0 }, - { 0, 0, 0, 1, 0 }, -#if CONFIG_MRC_TX - { 0, 0, 0, 0, 1 }, -#endif // CONFIG_MRC_TX -#else // CONFIG_CHROMA_2X2 { 1, 1, 1, 1 }, // unused { 1, 1, 0, 0 }, { 0, 0, 1, 0 }, -#if CONFIG_MRC_TX - { 0, 0, 0, 1 }, -#endif // CONFIG_MRC_TX -#endif // CONFIG_CHROMA_2X2 }; static const int use_inter_ext_tx_for_txsize[EXT_TX_SETS_INTER][EXT_TX_SIZES] = { -#if CONFIG_CHROMA_2X2 - { 1, 1, 1, 1, 1 }, // unused - { 0, 1, 1, 0, 0 }, { 0, 0, 0, 1, 0 }, { 0, 0, 0, 0, 1 }, -#if CONFIG_MRC_TX - { 0, 0, 0, 0, 1 }, -#endif // CONFIG_MRC_TX -#else // CONFIG_CHROMA_2X2 { 1, 1, 1, 1 }, // unused - { 1, 1, 0, 0 }, { 0, 0, 1, 0 }, { 0, 0, 0, 1 }, -#if CONFIG_MRC_TX + { 1, 1, 0, 0 }, + { 0, 0, 1, 0 }, { 0, 0, 0, 1 }, -#endif // CONFIG_MRC_TX -#endif // CONFIG_CHROMA_2X2 }; -#endif // CONFIG_EXT_TX + +static const int av1_ext_tx_set_idx_to_type[2][AOMMAX(EXT_TX_SETS_INTRA, + EXT_TX_SETS_INTER)] = { + { + // Intra + EXT_TX_SET_DCTONLY, + EXT_TX_SET_DTT4_IDTX_1DDCT, + EXT_TX_SET_DTT4_IDTX, + }, + { + // Inter + EXT_TX_SET_DCTONLY, + EXT_TX_SET_ALL16, + EXT_TX_SET_DTT9_IDTX_1DDCT, + EXT_TX_SET_DCT_IDTX, + }, +}; void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x, FRAME_CONTEXT *fc) { int i, j; - if (cm->frame_type == KEY_FRAME) { - for (i = 0; i < PARTITION_CONTEXTS_PRIMARY; ++i) - av1_cost_tokens_from_cdf(x->partition_cost[i], fc->partition_cdf[i], + for (i = 0; i < PARTITION_CONTEXTS; ++i) + av1_cost_tokens_from_cdf(x->partition_cost[i], fc->partition_cdf[i], NULL); + + if (cm->skip_mode_flag) { + for (i = 0; i < SKIP_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->skip_mode_cost[i], fc->skip_mode_cdfs[i], NULL); -#if CONFIG_UNPOISON_PARTITION_CTX - for (; i < PARTITION_CONTEXTS_PRIMARY + PARTITION_BLOCK_SIZES; ++i) { - aom_prob p = fc->partition_prob[i][PARTITION_VERT]; - assert(p > 0); - x->partition_cost[i][PARTITION_NONE] = INT_MAX; - x->partition_cost[i][PARTITION_HORZ] = INT_MAX; - x->partition_cost[i][PARTITION_VERT] = av1_cost_bit(p, 0); - x->partition_cost[i][PARTITION_SPLIT] = av1_cost_bit(p, 1); - } - for (; i < PARTITION_CONTEXTS_PRIMARY + 2 * PARTITION_BLOCK_SIZES; ++i) { - aom_prob p = fc->partition_prob[i][PARTITION_HORZ]; - assert(p > 0); - x->partition_cost[i][PARTITION_NONE] = INT_MAX; - x->partition_cost[i][PARTITION_HORZ] = av1_cost_bit(p, 0); - x->partition_cost[i][PARTITION_VERT] = INT_MAX; - x->partition_cost[i][PARTITION_SPLIT] = av1_cost_bit(p, 1); } - x->partition_cost[PARTITION_CONTEXTS][PARTITION_NONE] = INT_MAX; - x->partition_cost[PARTITION_CONTEXTS][PARTITION_HORZ] = INT_MAX; - x->partition_cost[PARTITION_CONTEXTS][PARTITION_VERT] = INT_MAX; - x->partition_cost[PARTITION_CONTEXTS][PARTITION_SPLIT] = 0; -#endif // CONFIG_UNPOISON_PARTITION_CTX } -#if CONFIG_KF_CTX + for (i = 0; i < SKIP_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->skip_cost[i], fc->skip_cdfs[i], NULL); + } + for (i = 0; i < KF_MODE_CONTEXTS; ++i) for (j = 0; j < KF_MODE_CONTEXTS; ++j) 
av1_cost_tokens_from_cdf(x->y_mode_costs[i][j], fc->kf_y_cdf[i][j], NULL); -#else - for (i = 0; i < INTRA_MODES; ++i) - for (j = 0; j < INTRA_MODES; ++j) - av1_cost_tokens_from_cdf(x->y_mode_costs[i][j], fc->kf_y_cdf[i][j], NULL); -#endif for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) av1_cost_tokens_from_cdf(x->mbmode_cost[i], fc->y_mode_cdf[i], NULL); - for (i = 0; i < INTRA_MODES; ++i) - av1_cost_tokens_from_cdf(x->intra_uv_mode_cost[i], fc->uv_mode_cdf[i], - NULL); + for (i = 0; i < CFL_ALLOWED_TYPES; ++i) + for (j = 0; j < INTRA_MODES; ++j) + av1_cost_tokens_from_cdf(x->intra_uv_mode_cost[i][j], + fc->uv_mode_cdf[i][j], NULL); + + av1_cost_tokens_from_cdf(x->filter_intra_mode_cost, fc->filter_intra_mode_cdf, + NULL); + for (i = 0; i < BLOCK_SIZES_ALL; ++i) { + if (av1_filter_intra_allowed_bsize(cm, i)) + av1_cost_tokens_from_cdf(x->filter_intra_cost[i], + fc->filter_intra_cdfs[i], NULL); + } for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) av1_cost_tokens_from_cdf(x->switchable_interp_costs[i], fc->switchable_interp_cdf[i], NULL); - for (i = 0; i < PALETTE_BLOCK_SIZES; ++i) { + for (i = 0; i < PALATTE_BSIZE_CTXS; ++i) { av1_cost_tokens_from_cdf(x->palette_y_size_cost[i], fc->palette_y_size_cdf[i], NULL); av1_cost_tokens_from_cdf(x->palette_uv_size_cost[i], fc->palette_uv_size_cdf[i], NULL); + for (j = 0; j < PALETTE_Y_MODE_CONTEXTS; ++j) { + av1_cost_tokens_from_cdf(x->palette_y_mode_cost[i][j], + fc->palette_y_mode_cdf[i][j], NULL); + } + } + + for (i = 0; i < PALETTE_UV_MODE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->palette_uv_mode_cost[i], + fc->palette_uv_mode_cdf[i], NULL); } for (i = 0; i < PALETTE_SIZES; ++i) { @@ -172,60 +152,38 @@ void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x, fc->palette_uv_color_index_cdf[i][j], NULL); } } -#if CONFIG_MRC_TX - for (i = 0; i < PALETTE_SIZES; ++i) { - for (j = 0; j < PALETTE_COLOR_INDEX_CONTEXTS; ++j) { - av1_cost_tokens_from_cdf(x->mrc_mask_inter_cost[i][j], - fc->mrc_mask_inter_cdf[i][j], NULL); - av1_cost_tokens_from_cdf(x->mrc_mask_intra_cost[i][j], - fc->mrc_mask_intra_cdf[i][j], NULL); - } - } -#endif // CONFIG_MRC_TX -#if CONFIG_CFL int sign_cost[CFL_JOINT_SIGNS]; av1_cost_tokens_from_cdf(sign_cost, fc->cfl_sign_cdf, NULL); for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) { - const aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)]; - const aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)]; int *cost_u = x->cfl_cost[joint_sign][CFL_PRED_U]; int *cost_v = x->cfl_cost[joint_sign][CFL_PRED_V]; - if (CFL_SIGN_U(joint_sign) == CFL_SIGN_ZERO) + if (CFL_SIGN_U(joint_sign) == CFL_SIGN_ZERO) { memset(cost_u, 0, CFL_ALPHABET_SIZE * sizeof(*cost_u)); - else + } else { + const aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)]; av1_cost_tokens_from_cdf(cost_u, cdf_u, NULL); - if (CFL_SIGN_V(joint_sign) == CFL_SIGN_ZERO) + } + if (CFL_SIGN_V(joint_sign) == CFL_SIGN_ZERO) { memset(cost_v, 0, CFL_ALPHABET_SIZE * sizeof(*cost_v)); - else + } else { + const aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)]; av1_cost_tokens_from_cdf(cost_v, cdf_v, NULL); + } for (int u = 0; u < CFL_ALPHABET_SIZE; u++) cost_u[u] += sign_cost[joint_sign]; } -#endif // CONFIG_CFL - for (i = 0; i < MAX_TX_DEPTH; ++i) + for (i = 0; i < MAX_TX_CATS; ++i) for (j = 0; j < TX_SIZE_CONTEXTS; ++j) av1_cost_tokens_from_cdf(x->tx_size_cost[i][j], fc->tx_size_cdf[i][j], NULL); -#if CONFIG_EXT_TX -#if CONFIG_LGT_FROM_PRED - if (LGT_FROM_PRED_INTRA) { - for (i = 0; i < LGT_SIZES; ++i) { - 
for (j = 0; j < INTRA_MODES; ++j) { - x->intra_lgt_cost[i][j][0] = av1_cost_bit(fc->intra_lgt_prob[i][j], 0); - x->intra_lgt_cost[i][j][1] = av1_cost_bit(fc->intra_lgt_prob[i][j], 1); - } - } - } - if (LGT_FROM_PRED_INTER) { - for (i = 0; i < LGT_SIZES; ++i) { - x->inter_lgt_cost[i][0] = av1_cost_bit(fc->inter_lgt_prob[i], 0); - x->inter_lgt_cost[i][1] = av1_cost_bit(fc->inter_lgt_prob[i], 1); - } + for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->txfm_partition_cost[i], + fc->txfm_partition_cdf[i], NULL); } -#endif // CONFIG_LGT_FROM_PRED + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { int s; for (s = 1; s < EXT_TX_SETS_INTER; ++s) { @@ -245,125 +203,124 @@ void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x, } } } -#else - for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { - for (j = 0; j < TX_TYPES; ++j) - av1_cost_tokens_from_cdf(x->intra_tx_type_costs[i][j], - fc->intra_ext_tx_cdf[i][j], av1_ext_tx_inv); - } - for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { - av1_cost_tokens_from_cdf(x->inter_tx_type_costs[i], fc->inter_ext_tx_cdf[i], - av1_ext_tx_inv); - } -#endif // CONFIG_EXT_TX -#if CONFIG_EXT_INTRA -#if CONFIG_INTRA_INTERP - for (i = 0; i < INTRA_FILTERS + 1; ++i) - av1_cost_tokens_from_cdf(x->intra_filter_cost[i], fc->intra_filter_cdf[i], + for (i = 0; i < DIRECTIONAL_MODES; ++i) { + av1_cost_tokens_from_cdf(x->angle_delta_cost[i], fc->angle_delta_cdf[i], NULL); -#endif // CONFIG_INTRA_INTERP -#endif // CONFIG_EXT_INTRA -#if CONFIG_LOOP_RESTORATION - av1_cost_tokens(x->switchable_restore_cost, fc->switchable_restore_prob, - av1_switchable_restore_tree); -#endif // CONFIG_LOOP_RESTORATION -#if CONFIG_INTRABC + } + av1_cost_tokens_from_cdf(x->switchable_restore_cost, + fc->switchable_restore_cdf, NULL); + av1_cost_tokens_from_cdf(x->wiener_restore_cost, fc->wiener_restore_cdf, + NULL); + av1_cost_tokens_from_cdf(x->sgrproj_restore_cost, fc->sgrproj_restore_cdf, + NULL); av1_cost_tokens_from_cdf(x->intrabc_cost, fc->intrabc_cdf, NULL); -#endif // CONFIG_INTRABC if (!frame_is_intra_only(cm)) { + for (i = 0; i < COMP_INTER_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->comp_inter_cost[i], fc->comp_inter_cdf[i], + NULL); + } + + for (i = 0; i < REF_CONTEXTS; ++i) { + for (j = 0; j < SINGLE_REFS - 1; ++j) { + av1_cost_tokens_from_cdf(x->single_ref_cost[i][j], + fc->single_ref_cdf[i][j], NULL); + } + } + + for (i = 0; i < COMP_REF_TYPE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->comp_ref_type_cost[i], + fc->comp_ref_type_cdf[i], NULL); + } + + for (i = 0; i < UNI_COMP_REF_CONTEXTS; ++i) { + for (j = 0; j < UNIDIR_COMP_REFS - 1; ++j) { + av1_cost_tokens_from_cdf(x->uni_comp_ref_cost[i][j], + fc->uni_comp_ref_cdf[i][j], NULL); + } + } + + for (i = 0; i < REF_CONTEXTS; ++i) { + for (j = 0; j < FWD_REFS - 1; ++j) { + av1_cost_tokens_from_cdf(x->comp_ref_cost[i][j], fc->comp_ref_cdf[i][j], + NULL); + } + } + + for (i = 0; i < REF_CONTEXTS; ++i) { + for (j = 0; j < BWD_REFS - 1; ++j) { + av1_cost_tokens_from_cdf(x->comp_bwdref_cost[i][j], + fc->comp_bwdref_cdf[i][j], NULL); + } + } + + for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->intra_inter_cost[i], fc->intra_inter_cdf[i], + NULL); + } + for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) { -#if CONFIG_NEW_MULTISYMBOL av1_cost_tokens_from_cdf(x->newmv_mode_cost[i], fc->newmv_cdf[i], NULL); -#else - x->newmv_mode_cost[i][0] = av1_cost_bit(fc->newmv_prob[i], 0); - x->newmv_mode_cost[i][1] = av1_cost_bit(fc->newmv_prob[i], 1); -#endif } - for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i) { -#if 
CONFIG_NEW_MULTISYMBOL + for (i = 0; i < GLOBALMV_MODE_CONTEXTS; ++i) { av1_cost_tokens_from_cdf(x->zeromv_mode_cost[i], fc->zeromv_cdf[i], NULL); -#else - x->zeromv_mode_cost[i][0] = av1_cost_bit(fc->zeromv_prob[i], 0); - x->zeromv_mode_cost[i][1] = av1_cost_bit(fc->zeromv_prob[i], 1); -#endif } for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) { -#if CONFIG_NEW_MULTISYMBOL av1_cost_tokens_from_cdf(x->refmv_mode_cost[i], fc->refmv_cdf[i], NULL); -#else - x->refmv_mode_cost[i][0] = av1_cost_bit(fc->refmv_prob[i], 0); - x->refmv_mode_cost[i][1] = av1_cost_bit(fc->refmv_prob[i], 1); -#endif } for (i = 0; i < DRL_MODE_CONTEXTS; ++i) { -#if CONFIG_NEW_MULTISYMBOL av1_cost_tokens_from_cdf(x->drl_mode_cost0[i], fc->drl_cdf[i], NULL); -#else - x->drl_mode_cost0[i][0] = av1_cost_bit(fc->drl_prob[i], 0); - x->drl_mode_cost0[i][1] = av1_cost_bit(fc->drl_prob[i], 1); -#endif } for (i = 0; i < INTER_MODE_CONTEXTS; ++i) av1_cost_tokens_from_cdf(x->inter_compound_mode_cost[i], fc->inter_compound_mode_cdf[i], NULL); -#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT for (i = 0; i < BLOCK_SIZES_ALL; ++i) av1_cost_tokens_from_cdf(x->compound_type_cost[i], fc->compound_type_cdf[i], NULL); -#endif // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT -#if CONFIG_COMPOUND_SINGLEREF - for (i = 0; i < INTER_MODE_CONTEXTS; ++i) - av1_cost_tokens_from_cdf(x->inter_singleref_comp_mode_cost[i], - fc->inter_singleref_comp_mode_cdf[i], NULL); -#endif // CONFIG_COMPOUND_SINGLEREF -#if CONFIG_INTERINTRA - for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) + for (i = 0; i < BLOCK_SIZES_ALL; ++i) { + if (get_interinter_wedge_bits(i)) { + av1_cost_tokens_from_cdf(x->wedge_idx_cost[i], fc->wedge_idx_cdf[i], + NULL); + } + } + for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) { + av1_cost_tokens_from_cdf(x->interintra_cost[i], fc->interintra_cdf[i], + NULL); av1_cost_tokens_from_cdf(x->interintra_mode_cost[i], fc->interintra_mode_cdf[i], NULL); -#endif // CONFIG_INTERINTRA -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + } + for (i = 0; i < BLOCK_SIZES_ALL; ++i) { + av1_cost_tokens_from_cdf(x->wedge_interintra_cost[i], + fc->wedge_interintra_cdf[i], NULL); + } for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) { av1_cost_tokens_from_cdf(x->motion_mode_cost[i], fc->motion_mode_cdf[i], NULL); } -#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) { -#if CONFIG_NCOBMC_ADAPT_WEIGHT - av1_cost_tokens_from_cdf(x->motion_mode_cost2[i], fc->ncobmc_cdf[i], - NULL); -#endif -#if CONFIG_NEW_MULTISYMBOL || CONFIG_NCOBMC_ADAPT_WEIGHT av1_cost_tokens_from_cdf(x->motion_mode_cost1[i], fc->obmc_cdf[i], NULL); -#else - x->motion_mode_cost1[i][0] = av1_cost_bit(fc->obmc_prob[i], 0); - x->motion_mode_cost1[i][1] = av1_cost_bit(fc->obmc_prob[i], 1); -#endif } -#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION -#if CONFIG_MOTION_VAR && CONFIG_NCOBMC_ADAPT_WEIGHT - for (i = ADAPT_OVERLAP_BLOCK_8X8; i < ADAPT_OVERLAP_BLOCKS; ++i) { - av1_cost_tokens_from_cdf(x->ncobmc_mode_cost[i], fc->ncobmc_mode_cdf[i], + for (i = 0; i < COMP_INDEX_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->comp_idx_cost[i], fc->compound_index_cdf[i], NULL); } -#endif // CONFIG_MOTION_VAR && CONFIG_NCOBMC_ADAPT_WEIGHT -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + for (i = 0; i < COMP_GROUP_IDX_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->comp_group_idx_cost[i], + fc->comp_group_idx_cdf[i], NULL); + } } } // Values are now correlated to quantizer. 
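Nearly every block in av1_fill_mode_rates() above funnels through av1_cost_tokens_from_cdf(), which converts a 15-bit CDF into per-symbol bit costs once per frame so the RD search can use plain table lookups. A simplified sketch of that conversion (the plain, non-inverted CDF layout, nonzero symbol probabilities, and the 1/512-bit cost unit implied by AV1_PROB_COST_SHIFT == 9 are assumptions of the sketch):

#include <math.h>

static void costs_from_cdf(int *costs, const unsigned *cdf, int nsymbs) {
  unsigned prev = 0;
  for (int i = 0; i < nsymbs; i++) {
    const unsigned p15 = cdf[i] - prev;  // symbol probability out of 32768
    prev = cdf[i];
    // cost ~= -log2(p) in 1/512-bit units
    costs[i] = (int)lround(-log2(p15 / 32768.0) * 512.0);
  }
}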
static int sad_per_bit16lut_8[QINDEX_RANGE]; static int sad_per_bit4lut_8[QINDEX_RANGE]; - -#if CONFIG_HIGHBITDEPTH static int sad_per_bit16lut_10[QINDEX_RANGE]; static int sad_per_bit4lut_10[QINDEX_RANGE]; static int sad_per_bit16lut_12[QINDEX_RANGE]; static int sad_per_bit4lut_12[QINDEX_RANGE]; -#endif static void init_me_luts_bd(int *bit16lut, int *bit4lut, int range, aom_bit_depth_t bit_depth) { @@ -381,31 +338,26 @@ static void init_me_luts_bd(int *bit16lut, int *bit4lut, int range, void av1_init_me_luts(void) { init_me_luts_bd(sad_per_bit16lut_8, sad_per_bit4lut_8, QINDEX_RANGE, AOM_BITS_8); -#if CONFIG_HIGHBITDEPTH init_me_luts_bd(sad_per_bit16lut_10, sad_per_bit4lut_10, QINDEX_RANGE, AOM_BITS_10); init_me_luts_bd(sad_per_bit16lut_12, sad_per_bit4lut_12, QINDEX_RANGE, AOM_BITS_12); -#endif } static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12, 8, 8, 4, 4, 2, 2, 1, 0 }; static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128, 128, 144, -#if CONFIG_EXT_REFS // TODO(zoeliu): To adjust further following factor values. 128, 128, 128, // TODO(weitinglin): We should investigate if the values should be the same // as the value used by OVERLAY frame 144, // INTNL_OVERLAY_UPDATE 128 // INTNL_ARF_UPDATE -#endif // CONFIG_EXT_REFS }; int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) { - const int64_t q = av1_dc_quant(qindex, 0, cpi->common.bit_depth); -#if CONFIG_HIGHBITDEPTH + const int64_t q = av1_dc_quant_Q3(qindex, 0, cpi->common.bit_depth); int64_t rdmult = 0; switch (cpi->common.bit_depth) { case AOM_BITS_8: rdmult = 88 * q * q / 24; break; @@ -415,9 +367,6 @@ int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) { assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); return -1; } -#else - int64_t rdmult = 88 * q * q / 24; -#endif // CONFIG_HIGHBITDEPTH if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index]; @@ -432,25 +381,19 @@ int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) { static int compute_rd_thresh_factor(int qindex, aom_bit_depth_t bit_depth) { double q; -#if CONFIG_HIGHBITDEPTH switch (bit_depth) { - case AOM_BITS_8: q = av1_dc_quant(qindex, 0, AOM_BITS_8) / 4.0; break; - case AOM_BITS_10: q = av1_dc_quant(qindex, 0, AOM_BITS_10) / 16.0; break; - case AOM_BITS_12: q = av1_dc_quant(qindex, 0, AOM_BITS_12) / 64.0; break; + case AOM_BITS_8: q = av1_dc_quant_Q3(qindex, 0, AOM_BITS_8) / 4.0; break; + case AOM_BITS_10: q = av1_dc_quant_Q3(qindex, 0, AOM_BITS_10) / 16.0; break; + case AOM_BITS_12: q = av1_dc_quant_Q3(qindex, 0, AOM_BITS_12) / 64.0; break; default: assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); return -1; } -#else - (void)bit_depth; - q = av1_dc_quant(qindex, 0, AOM_BITS_8) / 4.0; -#endif // CONFIG_HIGHBITDEPTH // TODO(debargha): Adjust the function below. 
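A worked instance of the 8-bit branch of av1_compute_rd_mult() above (the q value is illustrative; real values come from av1_dc_quant_Q3()):

#include <stdint.h>

// Lambda grows with the square of the DC quantizer.
static int64_t rd_mult_8bit(int64_t q) { return 88 * q * q / 24; }
// e.g. q = 24 gives 88 * 24 * 24 / 24 = 2112

The 10- and 12-bit branches apply the same formula followed by a rounded right shift to compensate for the wider quantizer range.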
return AOMMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8); } void av1_initialize_me_consts(const AV1_COMP *cpi, MACROBLOCK *x, int qindex) { -#if CONFIG_HIGHBITDEPTH switch (cpi->common.bit_depth) { case AOM_BITS_8: x->sadperbit16 = sad_per_bit16lut_8[qindex]; @@ -467,11 +410,6 @@ void av1_initialize_me_consts(const AV1_COMP *cpi, MACROBLOCK *x, int qindex) { default: assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); } -#else - (void)cpi; - x->sadperbit16 = sad_per_bit16lut_8[qindex]; - x->sadperbit4 = sad_per_bit4lut_8[qindex]; -#endif // CONFIG_HIGHBITDEPTH } static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd) { @@ -490,195 +428,89 @@ static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd) { const int t = q * rd_thresh_block_size_factor[bsize]; const int thresh_max = INT_MAX / t; -#if CONFIG_CB4X4 for (i = 0; i < MAX_MODES; ++i) rd->threshes[segment_id][bsize][i] = rd->thresh_mult[i] < thresh_max ? rd->thresh_mult[i] * t / 4 : INT_MAX; -#else - if (bsize >= BLOCK_8X8) { - for (i = 0; i < MAX_MODES; ++i) - rd->threshes[segment_id][bsize][i] = rd->thresh_mult[i] < thresh_max - ? rd->thresh_mult[i] * t / 4 - : INT_MAX; - } else { - for (i = 0; i < MAX_REFS; ++i) - rd->threshes[segment_id][bsize][i] = - rd->thresh_mult_sub8x8[i] < thresh_max - ? rd->thresh_mult_sub8x8[i] * t / 4 - : INT_MAX; - } -#endif } } } -void av1_set_mvcost(MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int ref, - int ref_mv_idx) { - MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext; - int8_t rf_type = av1_ref_frame_type(x->e_mbd.mi[0]->mbmi.ref_frame); - int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], - mbmi_ext->ref_mv_stack[rf_type], ref, ref_mv_idx); - (void)ref_frame; - x->mvcost = x->mv_cost_stack[nmv_ctx]; - x->nmvjointcost = x->nmv_vec_cost[nmv_ctx]; +void av1_set_mvcost(MACROBLOCK *x, int ref, int ref_mv_idx) { + (void)ref; + (void)ref_mv_idx; + x->mvcost = x->mv_cost_stack; + x->nmvjointcost = x->nmv_vec_cost; } -#if CONFIG_LV_MAP -#if !LV_MAP_PROB -static void get_rate_cost(aom_prob p, int cost[2]) { - cost[0] = av1_cost_bit(p, 0); - cost[1] = av1_cost_bit(p, 1); -} -#endif // !LV_MAP_PROB - -void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc) { +void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc, + const int num_planes) { + const int nplanes = AOMMIN(num_planes, PLANE_TYPES); + for (int eob_multi_size = 0; eob_multi_size < 7; ++eob_multi_size) { + for (int plane = 0; plane < nplanes; ++plane) { + LV_MAP_EOB_COST *pcost = &x->eob_costs[eob_multi_size][plane]; + + for (int ctx = 0; ctx < 2; ++ctx) { + aom_cdf_prob *pcdf; + switch (eob_multi_size) { + case 0: pcdf = fc->eob_flag_cdf16[plane][ctx]; break; + case 1: pcdf = fc->eob_flag_cdf32[plane][ctx]; break; + case 2: pcdf = fc->eob_flag_cdf64[plane][ctx]; break; + case 3: pcdf = fc->eob_flag_cdf128[plane][ctx]; break; + case 4: pcdf = fc->eob_flag_cdf256[plane][ctx]; break; + case 5: pcdf = fc->eob_flag_cdf512[plane][ctx]; break; + case 6: + default: pcdf = fc->eob_flag_cdf1024[plane][ctx]; break; + } + av1_cost_tokens_from_cdf(pcost->eob_cost[ctx], pcdf, NULL); + } + } + } for (int tx_size = 0; tx_size < TX_SIZES; ++tx_size) { - for (int plane = 0; plane < PLANE_TYPES; ++plane) { + for (int plane = 0; plane < nplanes; ++plane) { LV_MAP_COEFF_COST *pcost = &x->coeff_costs[tx_size][plane]; -#if LV_MAP_PROB for (int ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx) av1_cost_tokens_from_cdf(pcost->txb_skip_cost[ctx], fc->txb_skip_cdf[tx_size][ctx], NULL); + for (int ctx = 0; ctx < SIG_COEF_CONTEXTS_EOB; 
++ctx) + av1_cost_tokens_from_cdf(pcost->base_eob_cost[ctx], + fc->coeff_base_eob_cdf[tx_size][plane][ctx], + NULL); for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) - av1_cost_tokens_from_cdf(pcost->nz_map_cost[ctx], - fc->nz_map_cdf[tx_size][plane][ctx], NULL); + av1_cost_tokens_from_cdf(pcost->base_cost[ctx], + fc->coeff_base_cdf[tx_size][plane][ctx], NULL); for (int ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx) - av1_cost_tokens_from_cdf(pcost->eob_cost[ctx], - fc->eob_flag_cdf[tx_size][plane][ctx], NULL); + av1_cost_tokens_from_cdf(pcost->eob_extra_cost[ctx], + fc->eob_extra_cdf[tx_size][plane][ctx], NULL); for (int ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx) av1_cost_tokens_from_cdf(pcost->dc_sign_cost[ctx], fc->dc_sign_cdf[plane][ctx], NULL); - for (int layer = 0; layer < NUM_BASE_LEVELS; ++layer) - for (int ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx) - av1_cost_tokens_from_cdf( - pcost->base_cost[layer][ctx], - fc->coeff_base_cdf[tx_size][plane][layer][ctx], NULL); - -#if BR_NODE - for (int br = 0; br < BASE_RANGE_SETS; ++br) - for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) - av1_cost_tokens_from_cdf(pcost->br_cost[br][ctx], - fc->coeff_br_cdf[tx_size][plane][br][ctx], - NULL); - for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) { - int lps_rate[2]; - av1_cost_tokens_from_cdf(lps_rate, - fc->coeff_lps_cdf[tx_size][plane][ctx], NULL); - - for (int base_range = 0; base_range < COEFF_BASE_RANGE + 1; - ++base_range) { - int br_set_idx = base_range < COEFF_BASE_RANGE - ? coeff_to_br_index[base_range] - : BASE_RANGE_SETS; - - pcost->lps_cost[ctx][base_range] = 0; - - for (int idx = 0; idx < BASE_RANGE_SETS; ++idx) { - if (idx == br_set_idx) { - pcost->lps_cost[ctx][base_range] += pcost->br_cost[idx][ctx][1]; - - int br_base = br_index_to_coeff[br_set_idx]; - int br_offset = base_range - br_base; - int extra_bits = (1 << br_extra_bits[idx]) - 1; - for (int tok = 0; tok < extra_bits; ++tok) { - if (tok == br_offset) { - pcost->lps_cost[ctx][base_range] += lps_rate[1]; - break; - } else { - pcost->lps_cost[ctx][base_range] += lps_rate[0]; - } - } - break; - } else { - pcost->lps_cost[ctx][base_range] += pcost->br_cost[idx][ctx][0]; - } - } - // load the base range cost - } - } -#else // BR_NODE - for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) - av1_cost_tokens_from_cdf(pcost->lps_cost[ctx], - fc->coeff_lps_cdf[tx_size][plane][ctx], NULL); -#endif // BR_NODE -#if CONFIG_CTX1D - for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class) - av1_cost_tokens_from_cdf(pcost->eob_mode_cost[tx_class], - fc->eob_mode_cdf[tx_size][plane][tx_class], + int br_rate[BR_CDF_SIZE]; + int prev_cost = 0; + int i, j; + av1_cost_tokens_from_cdf(br_rate, fc->coeff_br_cdf[tx_size][plane][ctx], NULL); - - for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class) - for (int ctx = 0; ctx < EMPTY_LINE_CONTEXTS; ++ctx) - av1_cost_tokens_from_cdf( - pcost->empty_line_cost[tx_class][ctx], - fc->empty_line_cdf[tx_size][plane][tx_class][ctx], NULL); - - for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class) - for (int ctx = 0; ctx < HV_EOB_CONTEXTS; ++ctx) - av1_cost_tokens_from_cdf( - pcost->hv_eob_cost[tx_class][ctx], - fc->hv_eob_cdf[tx_size][plane][tx_class][ctx], NULL); -#endif // CONFIG_CTX1D -#else // LV_MAP_PROB - for (int ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx) - get_rate_cost(fc->txb_skip[tx_size][ctx], pcost->txb_skip_cost[ctx]); - - for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) - get_rate_cost(fc->nz_map[tx_size][plane][ctx], pcost->nz_map_cost[ctx]); - - for (int ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx) - 
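The replacement loop a few lines below builds pcost->lps_cost cumulatively: a base-range level v is coded as a run of maximal "keep going" symbols, each worth rate[BR_CDF_SIZE - 1], plus one terminating symbol v % (BR_CDF_SIZE - 1), so each group of entries reuses the accumulated escape cost. A standalone sketch with the sizes fixed to BR_CDF_SIZE == 4 and COEFF_BASE_RANGE == 12 (AV1's values, assumed here):

enum { GROUP = 3, MAX_RANGE = 12 };  // BR_CDF_SIZE - 1, COEFF_BASE_RANGE

static void build_level_costs(int cost[MAX_RANGE + 1],
                              const int rate[GROUP + 1]) {
  int prev = 0, i, j;
  for (i = 0; i < MAX_RANGE; i += GROUP) {
    for (j = 0; j < GROUP; j++)
      cost[i + j] = prev + rate[j];  // escapes so far + terminator j
    prev += rate[GROUP];             // one more maximal symbol
  }
  cost[i] = prev;  // v == MAX_RANGE needs no terminator
}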
get_rate_cost(fc->eob_flag[tx_size][plane][ctx], pcost->eob_cost[ctx]); - - for (int ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx) - get_rate_cost(fc->dc_sign[plane][ctx], pcost->dc_sign_cost[ctx]); - - for (int layer = 0; layer < NUM_BASE_LEVELS; ++layer) - for (int ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx) - get_rate_cost(fc->coeff_base[tx_size][plane][layer][ctx], - pcost->base_cost[layer][ctx]); - - for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) - get_rate_cost(fc->coeff_lps[tx_size][plane][ctx], pcost->lps_cost[ctx]); - -#if CONFIG_CTX1D - for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class) - get_rate_cost(fc->eob_mode[tx_size][plane][tx_class], - pcost->eob_mode_cost[tx_class]); - - for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class) - for (int ctx = 0; ctx < EMPTY_LINE_CONTEXTS; ++ctx) - get_rate_cost(fc->empty_line[tx_size][plane][tx_class][ctx], - pcost->empty_line_cost[tx_class][ctx]); - - for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class) - for (int ctx = 0; ctx < HV_EOB_CONTEXTS; ++ctx) - get_rate_cost(fc->hv_eob[tx_size][plane][tx_class][ctx], - pcost->hv_eob_cost[tx_class][ctx]); -#endif // CONFIG_CTX1D -#endif // LV_MAP_PROB - } - } -} -#endif // CONFIG_LV_MAP - -void av1_fill_token_costs_from_cdf(av1_coeff_cost *cost, - coeff_cdf_model (*cdf)[PLANE_TYPES]) { - for (int tx = 0; tx < TX_SIZES; ++tx) { - for (int pt = 0; pt < PLANE_TYPES; ++pt) { - for (int rt = 0; rt < REF_TYPES; ++rt) { - for (int band = 0; band < COEF_BANDS; ++band) { - for (int ctx = 0; ctx < BAND_COEFF_CONTEXTS(band); ++ctx) { - av1_cost_tokens_from_cdf(cost[tx][pt][rt][band][ctx], - cdf[tx][pt][rt][band][ctx], NULL); + // printf("br_rate: "); + // for(j = 0; j < BR_CDF_SIZE; j++) + // printf("%4d ", br_rate[j]); + // printf("\n"); + for (i = 0; i < COEFF_BASE_RANGE; i += BR_CDF_SIZE - 1) { + for (j = 0; j < BR_CDF_SIZE - 1; j++) { + pcost->lps_cost[ctx][i + j] = prev_cost + br_rate[j]; } + prev_cost += br_rate[j]; } + pcost->lps_cost[ctx][i] = prev_cost; + // printf("lps_cost: %d %d %2d : ", tx_size, plane, ctx); + // for (i = 0; i <= COEFF_BASE_RANGE; i++) + // printf("%5d ", pcost->lps_cost[ctx][i]); + // printf("\n"); } } } @@ -688,7 +520,6 @@ void av1_initialize_rd_consts(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->td.mb; RD_OPT *const rd = &cpi->rd; - int nmv_ctx; aom_clear_system_state(); @@ -698,56 +529,35 @@ void av1_initialize_rd_consts(AV1_COMP *cpi) { set_block_thresholds(cm, rd); - for (nmv_ctx = 0; nmv_ctx < NMV_CONTEXTS; ++nmv_ctx) { -#if CONFIG_AMVR - if (cm->cur_frame_mv_precision_level) { - av1_build_nmv_cost_table(x->nmv_vec_cost[nmv_ctx], x->nmvcost[nmv_ctx], - &cm->fc->nmvc[nmv_ctx], MV_SUBPEL_NONE); - } else { - av1_build_nmv_cost_table( - x->nmv_vec_cost[nmv_ctx], - cm->allow_high_precision_mv ? x->nmvcost_hp[nmv_ctx] - : x->nmvcost[nmv_ctx], - &cm->fc->nmvc[nmv_ctx], cm->allow_high_precision_mv); - } - -#else + if (cm->cur_frame_force_integer_mv) { + av1_build_nmv_cost_table(x->nmv_vec_cost, x->nmvcost, &cm->fc->nmvc, + MV_SUBPEL_NONE); + } else { av1_build_nmv_cost_table( - x->nmv_vec_cost[nmv_ctx], - cm->allow_high_precision_mv ? x->nmvcost_hp[nmv_ctx] - : x->nmvcost[nmv_ctx], - &cm->fc->nmvc[nmv_ctx], cm->allow_high_precision_mv); -#endif + x->nmv_vec_cost, + cm->allow_high_precision_mv ? 
x->nmvcost_hp : x->nmvcost, &cm->fc->nmvc, + cm->allow_high_precision_mv); } - x->mvcost = x->mv_cost_stack[0]; - x->nmvjointcost = x->nmv_vec_cost[0]; -#if CONFIG_INTRABC + x->mvcost = x->mv_cost_stack; + x->nmvjointcost = x->nmv_vec_cost; + if (frame_is_intra_only(cm) && cm->allow_screen_content_tools && cpi->oxcf.pass != 1) { - av1_build_nmv_cost_table( - x->nmv_vec_cost[0], - cm->allow_high_precision_mv ? x->nmvcost_hp[0] : x->nmvcost[0], - &cm->fc->ndvc, MV_SUBPEL_NONE); + int *dvcost[2] = { &cpi->dv_cost[0][MV_MAX], &cpi->dv_cost[1][MV_MAX] }; + av1_build_nmv_cost_table(cpi->dv_joint_cost, dvcost, &cm->fc->ndvc, + MV_SUBPEL_NONE); } -#endif -#if CONFIG_GLOBAL_MOTION if (cpi->oxcf.pass != 1) { for (int i = 0; i < TRANS_TYPES; ++i) -#if GLOBAL_TRANS_TYPES > 4 - cpi->gmtype_cost[i] = (1 + (i > 0 ? GLOBAL_TYPE_BITS : 0)) - << AV1_PROB_COST_SHIFT; -#else // IDENTITY: 1 bit // TRANSLATION: 3 bits // ROTZOOM: 2 bits // AFFINE: 3 bits cpi->gmtype_cost[i] = (1 + (i > 0 ? (i == ROTZOOM ? 1 : 2) : 0)) << AV1_PROB_COST_SHIFT; -#endif // GLOBAL_TRANS_TYPES > 4 } -#endif // CONFIG_GLOBAL_MOTION } static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { @@ -840,288 +650,32 @@ void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n_log2, } } -static void get_entropy_contexts_plane( - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, const struct macroblockd_plane *pd, - ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE], - ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]) { +static void get_entropy_contexts_plane(BLOCK_SIZE plane_bsize, + const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[MAX_MIB_SIZE], + ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) { const int num_4x4_w = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; const int num_4x4_h = block_size_high[plane_bsize] >> tx_size_high_log2[0]; const ENTROPY_CONTEXT *const above = pd->above_context; const ENTROPY_CONTEXT *const left = pd->left_context; -#if CONFIG_LV_MAP memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); - return; -#endif // CONFIG_LV_MAP - - int i; - -#if CONFIG_CHROMA_2X2 - switch (tx_size) { - case TX_2X2: - memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); - memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); - break; - case TX_4X4: - for (i = 0; i < num_4x4_w; i += 2) - t_above[i] = !!*(const uint16_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 2) - t_left[i] = !!*(const uint16_t *)&left[i]; - break; - case TX_8X8: - for (i = 0; i < num_4x4_w; i += 4) - t_above[i] = !!*(const uint32_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 4) - t_left[i] = !!*(const uint32_t *)&left[i]; - break; - case TX_16X16: - for (i = 0; i < num_4x4_w; i += 8) - t_above[i] = !!*(const uint64_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 8) - t_left[i] = !!*(const uint64_t *)&left[i]; - break; - case TX_32X32: - for (i = 0; i < num_4x4_w; i += 16) - t_above[i] = - !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]); - for (i = 0; i < num_4x4_h; i += 16) - t_left[i] = - !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]); - break; -#if CONFIG_TX64X64 - case TX_32X64: - for (i = 0; i < num_4x4_w; i += 16) - t_above[i] = - !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]); - for (i = 0; i < num_4x4_h; i += 32) - t_left[i] = - !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8] | - *(const uint64_t *)&left[i + 16] | - *(const uint64_t *)&left[i + 24]); - break; - case TX_64X32: - for (i = 0; i < 
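The gmtype_cost expression above spells out a tiny fixed prefix code for the global motion type: IDENTITY costs 1 bit, ROTZOOM 2 bits, TRANSLATION and AFFINE 3 bits each, and the Kraft sum 1/2 + 1/4 + 1/8 + 1/8 = 1 shows the code is complete. An equivalent table form (enum order follows the comment; AV1_PROB_COST_SHIFT == 9 is assumed):

enum gm_type { IDENTITY, TRANSLATION, ROTZOOM, AFFINE, N_GM_TYPES };

static const int gm_bits[N_GM_TYPES] = { 1, 3, 2, 3 };  // code lengths

static int gm_type_cost(enum gm_type t) {
  return gm_bits[t] << 9;  // bits -> 1/512-bit cost units
}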
num_4x4_w; i += 32) - t_above[i] = - !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8] | - *(const uint64_t *)&above[i + 16] | - *(const uint64_t *)&above[i + 24]); - for (i = 0; i < num_4x4_h; i += 16) - t_left[i] = - !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]); - break; - case TX_64X64: - for (i = 0; i < num_4x4_w; i += 32) - t_above[i] = - !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8] | - *(const uint64_t *)&above[i + 16] | - *(const uint64_t *)&above[i + 24]); - for (i = 0; i < num_4x4_h; i += 32) - t_left[i] = - !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8] | - *(const uint64_t *)&left[i + 16] | - *(const uint64_t *)&left[i + 24]); - break; -#endif // CONFIG_TX64X64 - case TX_4X8: - for (i = 0; i < num_4x4_w; i += 2) - t_above[i] = !!*(const uint16_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 4) - t_left[i] = !!*(const uint32_t *)&left[i]; - break; - case TX_8X4: - for (i = 0; i < num_4x4_w; i += 4) - t_above[i] = !!*(const uint32_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 2) - t_left[i] = !!*(const uint16_t *)&left[i]; - break; - case TX_8X16: - for (i = 0; i < num_4x4_w; i += 4) - t_above[i] = !!*(const uint32_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 8) - t_left[i] = !!*(const uint64_t *)&left[i]; - break; - case TX_16X8: - for (i = 0; i < num_4x4_w; i += 8) - t_above[i] = !!*(const uint64_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 4) - t_left[i] = !!*(const uint32_t *)&left[i]; - break; - case TX_16X32: - for (i = 0; i < num_4x4_w; i += 8) - t_above[i] = !!*(const uint64_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 16) - t_left[i] = - !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]); - break; - case TX_32X16: - for (i = 0; i < num_4x4_w; i += 16) - t_above[i] = - !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]); - for (i = 0; i < num_4x4_h; i += 8) - t_left[i] = !!*(const uint64_t *)&left[i]; - break; -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) - case TX_4X16: - for (i = 0; i < num_4x4_w; i += 2) - t_above[i] = !!*(const uint16_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 8) - t_left[i] = !!*(const uint64_t *)&left[i]; - break; - case TX_16X4: - for (i = 0; i < num_4x4_w; i += 8) - t_above[i] = !!*(const uint64_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 2) - t_left[i] = !!*(const uint16_t *)&left[i]; - break; - case TX_8X32: - for (i = 0; i < num_4x4_w; i += 4) - t_above[i] = !!*(const uint32_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 16) - t_left[i] = - !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]); - break; - case TX_32X8: - for (i = 0; i < num_4x4_w; i += 16) - t_above[i] = - !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]); - for (i = 0; i < num_4x4_h; i += 4) - t_left[i] = !!*(const uint32_t *)&left[i]; - break; -#endif - - default: assert(0 && "Invalid transform size."); break; - } - return; -#endif // CONFIG_CHROMA_2X2 - - switch (tx_size) { - case TX_4X4: - memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); - memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); - break; - case TX_8X8: - for (i = 0; i < num_4x4_w; i += 2) - t_above[i] = !!*(const uint16_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 2) - t_left[i] = !!*(const uint16_t *)&left[i]; - break; - case TX_16X16: - for (i = 0; i < num_4x4_w; i += 4) - t_above[i] = !!*(const uint32_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 4) - t_left[i] = !!*(const uint32_t *)&left[i]; - break; - 
case TX_32X32: - for (i = 0; i < num_4x4_w; i += 8) - t_above[i] = !!*(const uint64_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 8) - t_left[i] = !!*(const uint64_t *)&left[i]; - break; -#if CONFIG_TX64X64 - case TX_32X64: - for (i = 0; i < num_4x4_w; i += 8) - t_above[i] = !!*(const uint64_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 16) - t_left[i] = - !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]); - break; - case TX_64X32: - for (i = 0; i < num_4x4_w; i += 16) - t_above[i] = - !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]); - for (i = 0; i < num_4x4_h; i += 8) - t_left[i] = !!*(const uint64_t *)&left[i]; - break; - case TX_64X64: - for (i = 0; i < num_4x4_w; i += 16) - t_above[i] = - !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]); - for (i = 0; i < num_4x4_h; i += 16) - t_left[i] = - !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]); - break; -#endif // CONFIG_TX64X64 - case TX_4X8: - memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); - for (i = 0; i < num_4x4_h; i += 2) - t_left[i] = !!*(const uint16_t *)&left[i]; - break; - case TX_8X4: - for (i = 0; i < num_4x4_w; i += 2) - t_above[i] = !!*(const uint16_t *)&above[i]; - memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); - break; - case TX_8X16: - for (i = 0; i < num_4x4_w; i += 2) - t_above[i] = !!*(const uint16_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 4) - t_left[i] = !!*(const uint32_t *)&left[i]; - break; - case TX_16X8: - for (i = 0; i < num_4x4_w; i += 4) - t_above[i] = !!*(const uint32_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 2) - t_left[i] = !!*(const uint16_t *)&left[i]; - break; - case TX_16X32: - for (i = 0; i < num_4x4_w; i += 4) - t_above[i] = !!*(const uint32_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 8) - t_left[i] = !!*(const uint64_t *)&left[i]; - break; - case TX_32X16: - for (i = 0; i < num_4x4_w; i += 8) - t_above[i] = !!*(const uint64_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 4) - t_left[i] = !!*(const uint32_t *)&left[i]; - break; -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) - case TX_4X16: - memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); - for (i = 0; i < num_4x4_h; i += 4) - t_left[i] = !!*(const uint32_t *)&left[i]; - break; - case TX_16X4: - for (i = 0; i < num_4x4_w; i += 4) - t_above[i] = !!*(const uint32_t *)&above[i]; - memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); - break; - case TX_8X32: - for (i = 0; i < num_4x4_w; i += 2) - t_above[i] = !!*(const uint16_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 8) - t_left[i] = !!*(const uint64_t *)&left[i]; - break; - case TX_32X8: - for (i = 0; i < num_4x4_w; i += 8) - t_above[i] = !!*(const uint64_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 2) - t_left[i] = !!*(const uint16_t *)&left[i]; - break; -#endif - default: assert(0 && "Invalid transform size."); break; - } } -void av1_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, +void av1_get_entropy_contexts(BLOCK_SIZE bsize, const struct macroblockd_plane *pd, - ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE], - ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]) { -#if CONFIG_CHROMA_SUB8X8 + ENTROPY_CONTEXT t_above[MAX_MIB_SIZE], + ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) { const BLOCK_SIZE plane_bsize = - AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); -#else - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); -#endif - get_entropy_contexts_plane(plane_bsize, tx_size, pd, t_above, t_left); + get_plane_block_size(bsize, 
pd->subsampling_x, pd->subsampling_y); + get_entropy_contexts_plane(plane_bsize, pd, t_above, t_left); } void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) { int i; int zero_seen = 0; - int best_index = 0; int best_sad = INT_MAX; int this_sad = INT_MAX; int max_mv = 0; @@ -1129,11 +683,15 @@ void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, uint8_t *ref_y_ptr; MV pred_mv[MAX_MV_REF_CANDIDATES + 1]; int num_mv_refs = 0; - - pred_mv[num_mv_refs++] = x->mbmi_ext->ref_mvs[ref_frame][0].as_mv; - if (x->mbmi_ext->ref_mvs[ref_frame][0].as_int != - x->mbmi_ext->ref_mvs[ref_frame][1].as_int) { - pred_mv[num_mv_refs++] = x->mbmi_ext->ref_mvs[ref_frame][1].as_mv; + const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME }; + const int_mv ref_mv = + av1_get_ref_mv_from_stack(0, ref_frames, 0, x->mbmi_ext); + const int_mv ref_mv1 = + av1_get_ref_mv_from_stack(0, ref_frames, 1, x->mbmi_ext); + + pred_mv[num_mv_refs++] = ref_mv.as_mv; + if (ref_mv.as_int != ref_mv1.as_int) { + pred_mv[num_mv_refs++] = ref_mv1.as_mv; } if (cpi->sf.adaptive_motion_search && block_size < x->max_partition_size) pred_mv[num_mv_refs++] = x->pred_mv[ref_frame]; @@ -1158,12 +716,10 @@ void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, // Note if it is the best so far. if (this_sad < best_sad) { best_sad = this_sad; - best_index = i; } } // Note the index of the mv that worked best in the reference list. - x->mv_best_ref_index[ref_frame] = best_index; x->max_mv_context[ref_frame] = max_mv; x->pred_mv_sad[ref_frame] = best_sad; } @@ -1172,7 +728,8 @@ void av1_setup_pred_block(const MACROBLOCKD *xd, struct buf_2d dst[MAX_MB_PLANE], const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, const struct scale_factors *scale, - const struct scale_factors *scale_uv) { + const struct scale_factors *scale_uv, + const int num_planes) { int i; dst[0].buf = src->y_buffer; @@ -1181,8 +738,8 @@ void av1_setup_pred_block(const MACROBLOCKD *xd, dst[2].buf = src->v_buffer; dst[1].stride = dst[2].stride = src->uv_stride; - for (i = 0; i < MAX_MB_PLANE; ++i) { - setup_pred_plane(dst + i, xd->mi[0]->mbmi.sb_type, dst[i].buf, + for (i = 0; i < num_planes; ++i) { + setup_pred_plane(dst + i, xd->mi[0]->sb_type, dst[i].buf, i ? src->uv_crop_width : src->y_crop_width, i ? src->uv_crop_height : src->y_crop_height, dst[i].stride, mi_row, mi_col, i ? 
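av1_mv_pred() above now pulls its two candidates from the reference MV stack and keeps only the lowest full-pel SAD; the best_index bookkeeping disappears along with x->mv_best_ref_index. The scan reduces to a plain minimum, sketched here (sad_fn is a stand-in for the encoder's fn_ptr[block_size].sdf call):

#include <limits.h>

typedef struct { short row, col; } mv_t;

static int best_pred_sad(const mv_t *cand, int n,
                         int (*sad_fn)(const mv_t *)) {
  int best = INT_MAX;
  for (int i = 0; i < n; i++) {
    const int s = sad_fn(&cand[i]);
    if (s < best) best = s;  // track the smallest candidate SAD
  }
  return best;
}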
scale_uv : scale, @@ -1192,7 +749,7 @@ void av1_setup_pred_block(const MACROBLOCKD *xd, int av1_raster_block_offset(BLOCK_SIZE plane_bsize, int raster_block, int stride) { - const int bw = b_width_log2_lookup[plane_bsize]; + const int bw = mi_size_wide_log2[plane_bsize]; const int y = 4 * (raster_block >> bw); const int x = 4 * (raster_block & ((1 << bw) - 1)); return y * stride + x; @@ -1214,43 +771,24 @@ YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const AV1_COMP *cpi, : NULL; } -#if CONFIG_DUAL_FILTER int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x, const MACROBLOCKD *xd) { if (cm->interp_filter == SWITCHABLE) { - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const MB_MODE_INFO *const mbmi = xd->mi[0]; int inter_filter_cost = 0; int dir; for (dir = 0; dir < 2; ++dir) { - if (has_subpel_mv_component(xd->mi[0], xd, dir) || - (mbmi->ref_frame[1] > INTRA_FRAME && - has_subpel_mv_component(xd->mi[0], xd, dir + 2))) { - const int ctx = av1_get_pred_context_switchable_interp(xd, dir); - const InterpFilter filter = - av1_extract_interp_filter(mbmi->interp_filters, dir); - inter_filter_cost += x->switchable_interp_costs[ctx][filter]; - } + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + const InterpFilter filter = + av1_extract_interp_filter(mbmi->interp_filters, dir); + inter_filter_cost += x->switchable_interp_costs[ctx][filter]; } return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost; } else { return 0; } } -#else -int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x, - const MACROBLOCKD *xd) { - if (cm->interp_filter == SWITCHABLE) { - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const int ctx = av1_get_pred_context_switchable_interp(xd); - const InterpFilter filter = - av1_extract_interp_filter(mbmi->interp_filters, 0); - return SWITCHABLE_INTERP_RATE_FACTOR * - x->switchable_interp_costs[ctx][filter]; - } - return 0; -} -#endif void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { int i; @@ -1262,22 +800,18 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { if (sf->adaptive_rd_thresh) { rd->thresh_mult[THR_NEARESTMV] = 300; -#if CONFIG_EXT_REFS rd->thresh_mult[THR_NEARESTL2] = 300; rd->thresh_mult[THR_NEARESTL3] = 300; rd->thresh_mult[THR_NEARESTB] = 300; rd->thresh_mult[THR_NEARESTA2] = 300; -#endif // CONFIG_EXT_REFS rd->thresh_mult[THR_NEARESTA] = 300; rd->thresh_mult[THR_NEARESTG] = 300; } else { rd->thresh_mult[THR_NEARESTMV] = 0; -#if CONFIG_EXT_REFS rd->thresh_mult[THR_NEARESTL2] = 0; rd->thresh_mult[THR_NEARESTL3] = 0; rd->thresh_mult[THR_NEARESTB] = 0; rd->thresh_mult[THR_NEARESTA2] = 0; -#endif // CONFIG_EXT_REFS rd->thresh_mult[THR_NEARESTA] = 0; rd->thresh_mult[THR_NEARESTG] = 0; } @@ -1285,92 +819,35 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_DC] += 1000; rd->thresh_mult[THR_NEWMV] += 1000; -#if CONFIG_EXT_REFS rd->thresh_mult[THR_NEWL2] += 1000; rd->thresh_mult[THR_NEWL3] += 1000; rd->thresh_mult[THR_NEWB] += 1000; rd->thresh_mult[THR_NEWA2] = 1000; -#endif // CONFIG_EXT_REFS rd->thresh_mult[THR_NEWA] += 1000; rd->thresh_mult[THR_NEWG] += 1000; rd->thresh_mult[THR_NEARMV] += 1000; -#if CONFIG_EXT_REFS rd->thresh_mult[THR_NEARL2] += 1000; rd->thresh_mult[THR_NEARL3] += 1000; rd->thresh_mult[THR_NEARB] += 1000; rd->thresh_mult[THR_NEARA2] = 1000; -#endif // CONFIG_EXT_REFS rd->thresh_mult[THR_NEARA] += 1000; rd->thresh_mult[THR_NEARG] += 1000; - rd->thresh_mult[THR_ZEROMV] += 2000; -#if CONFIG_EXT_REFS - rd->thresh_mult[THR_ZEROL2] += 2000; - rd->thresh_mult[THR_ZEROL3] += 
2000; - rd->thresh_mult[THR_ZEROB] += 2000; - rd->thresh_mult[THR_ZEROA2] = 2000; -#endif // CONFIG_EXT_REFS - rd->thresh_mult[THR_ZEROG] += 2000; - rd->thresh_mult[THR_ZEROA] += 2000; - - rd->thresh_mult[THR_TM] += 1000; - -#if CONFIG_COMPOUND_SINGLEREF - rd->thresh_mult[THR_SR_NEAREST_NEARMV] += 1200; -#if CONFIG_EXT_REFS - rd->thresh_mult[THR_SR_NEAREST_NEARL2] += 1200; - rd->thresh_mult[THR_SR_NEAREST_NEARL3] += 1200; - rd->thresh_mult[THR_SR_NEAREST_NEARB] += 1200; -#endif // CONFIG_EXT_REFS - rd->thresh_mult[THR_SR_NEAREST_NEARA] += 1200; - rd->thresh_mult[THR_SR_NEAREST_NEARG] += 1200; - - /* - rd->thresh_mult[THR_SR_NEAREST_NEWMV] += 1200; -#if CONFIG_EXT_REFS - rd->thresh_mult[THR_SR_NEAREST_NEWL2] += 1200; - rd->thresh_mult[THR_SR_NEAREST_NEWL3] += 1200; - rd->thresh_mult[THR_SR_NEAREST_NEWB] += 1200; -#endif // CONFIG_EXT_REFS - rd->thresh_mult[THR_SR_NEAREST_NEWA] += 1200; - rd->thresh_mult[THR_SR_NEAREST_NEWG] += 1200;*/ - - rd->thresh_mult[THR_SR_NEAR_NEWMV] += 1500; -#if CONFIG_EXT_REFS - rd->thresh_mult[THR_SR_NEAR_NEWL2] += 1500; - rd->thresh_mult[THR_SR_NEAR_NEWL3] += 1500; - rd->thresh_mult[THR_SR_NEAR_NEWB] += 1500; -#endif // CONFIG_EXT_REFS - rd->thresh_mult[THR_SR_NEAR_NEWA] += 1500; - rd->thresh_mult[THR_SR_NEAR_NEWG] += 1500; - - rd->thresh_mult[THR_SR_ZERO_NEWMV] += 2000; -#if CONFIG_EXT_REFS - rd->thresh_mult[THR_SR_ZERO_NEWL2] += 2000; - rd->thresh_mult[THR_SR_ZERO_NEWL3] += 2000; - rd->thresh_mult[THR_SR_ZERO_NEWB] += 2000; -#endif // CONFIG_EXT_REFS - rd->thresh_mult[THR_SR_ZERO_NEWA] += 2000; - rd->thresh_mult[THR_SR_ZERO_NEWG] += 2000; - - rd->thresh_mult[THR_SR_NEW_NEWMV] += 1700; -#if CONFIG_EXT_REFS - rd->thresh_mult[THR_SR_NEW_NEWL2] += 1700; - rd->thresh_mult[THR_SR_NEW_NEWL3] += 1700; - rd->thresh_mult[THR_SR_NEW_NEWB] += 1700; -#endif // CONFIG_EXT_REFS - rd->thresh_mult[THR_SR_NEW_NEWA] += 1700; - rd->thresh_mult[THR_SR_NEW_NEWG] += 1700; -#endif // CONFIG_COMPOUND_SINGLEREF + rd->thresh_mult[THR_GLOBALMV] += 2000; + rd->thresh_mult[THR_GLOBALL2] += 2000; + rd->thresh_mult[THR_GLOBALL3] += 2000; + rd->thresh_mult[THR_GLOBALB] += 2000; + rd->thresh_mult[THR_GLOBALA2] = 2000; + rd->thresh_mult[THR_GLOBALG] += 2000; + rd->thresh_mult[THR_GLOBALA] += 2000; + + rd->thresh_mult[THR_PAETH] += 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] += 1000; -#if CONFIG_EXT_REFS rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] += 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] += 1000; -#endif // CONFIG_EXT_REFS rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] += 1000; -#if CONFIG_EXT_REFS rd->thresh_mult[THR_COMP_NEAREST_NEARESTLB] += 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2B] += 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3B] += 1000; @@ -1380,13 +857,10 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A2] += 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA2] += 1000; -#if CONFIG_EXT_COMP_REFS - rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL2] += 1000; - rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL3] += 1000; - rd->thresh_mult[THR_COMP_NEAREST_NEARESTLG] += 1000; - rd->thresh_mult[THR_COMP_NEAREST_NEARESTBA] += 1000; -#endif // CONFIG_EXT_COMP_REFS -#endif // CONFIG_EXT_REFS + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL2] += 2000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL3] += 2000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLG] += 2000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTBA] += 2000; rd->thresh_mult[THR_COMP_NEAR_NEARLA] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWLA] += 1500; @@ -1394,16 +868,15 @@ void 
av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAR_NEWLA] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARLA] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWLA] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROLA] += 2500; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA] += 2500; -#if CONFIG_EXT_REFS rd->thresh_mult[THR_COMP_NEAR_NEARL2A] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWL2A] += 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTL2A] += 1500; rd->thresh_mult[THR_COMP_NEAR_NEWL2A] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARL2A] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWL2A] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROL2A] += 2500; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A] += 2500; rd->thresh_mult[THR_COMP_NEAR_NEARL3A] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWL3A] += 1500; @@ -1411,8 +884,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAR_NEWL3A] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARL3A] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWL3A] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROL3A] += 2500; -#endif // CONFIG_EXT_REFS + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A] += 2500; rd->thresh_mult[THR_COMP_NEAR_NEARGA] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWGA] += 1500; @@ -1420,16 +892,15 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAR_NEWGA] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARGA] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWGA] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROGA] += 2500; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA] += 2500; -#if CONFIG_EXT_REFS rd->thresh_mult[THR_COMP_NEAR_NEARLB] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWLB] += 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTLB] += 1500; rd->thresh_mult[THR_COMP_NEAR_NEWLB] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARLB] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWLB] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROLB] += 2500; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLB] += 2500; rd->thresh_mult[THR_COMP_NEAR_NEARL2B] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWL2B] += 1500; @@ -1437,7 +908,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAR_NEWL2B] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARL2B] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWL2B] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROL2B] += 2500; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2B] += 2500; rd->thresh_mult[THR_COMP_NEAR_NEARL3B] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWL3B] += 1500; @@ -1445,7 +916,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAR_NEWL3B] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARL3B] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWL3B] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROL3B] += 2500; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3B] += 2500; rd->thresh_mult[THR_COMP_NEAR_NEARGB] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWGB] += 1500; @@ -1453,7 +924,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAR_NEWGB] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARGB] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWGB] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROGB] += 2500; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGB] += 2500; rd->thresh_mult[THR_COMP_NEAR_NEARLA2] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWLA2] += 1500; @@ -1461,7 +932,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAR_NEWLA2] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARLA2] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWLA2] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROLA2] += 2500; + 
rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA2] += 2500; rd->thresh_mult[THR_COMP_NEAR_NEARL2A2] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWL2A2] += 1500; @@ -1469,7 +940,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAR_NEWL2A2] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARL2A2] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWL2A2] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROL2A2] += 2500; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A2] += 2500; rd->thresh_mult[THR_COMP_NEAR_NEARL3A2] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWL3A2] += 1500; @@ -1477,7 +948,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAR_NEWL3A2] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARL3A2] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWL3A2] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROL3A2] += 2500; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A2] += 2500; rd->thresh_mult[THR_COMP_NEAR_NEARGA2] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWGA2] += 1500; @@ -1485,124 +956,55 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAR_NEWGA2] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARGA2] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWGA2] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROGA2] += 2500; - -#if CONFIG_EXT_COMP_REFS - rd->thresh_mult[THR_COMP_NEAR_NEARLL2] += 1200; - rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] += 1500; - rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] += 1500; - rd->thresh_mult[THR_COMP_NEAR_NEWLL2] += 1700; - rd->thresh_mult[THR_COMP_NEW_NEARLL2] += 1700; - rd->thresh_mult[THR_COMP_NEW_NEWLL2] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROLL2] += 2500; - - rd->thresh_mult[THR_COMP_NEAR_NEARLL3] += 1200; - rd->thresh_mult[THR_COMP_NEAREST_NEWLL3] += 1500; - rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] += 1500; - rd->thresh_mult[THR_COMP_NEAR_NEWLL3] += 1700; - rd->thresh_mult[THR_COMP_NEW_NEARLL3] += 1700; - rd->thresh_mult[THR_COMP_NEW_NEWLL3] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROLL3] += 2500; - - rd->thresh_mult[THR_COMP_NEAR_NEARLG] += 1200; - rd->thresh_mult[THR_COMP_NEAREST_NEWLG] += 1500; - rd->thresh_mult[THR_COMP_NEW_NEARESTLG] += 1500; - rd->thresh_mult[THR_COMP_NEAR_NEWLG] += 1700; - rd->thresh_mult[THR_COMP_NEW_NEARLG] += 1700; - rd->thresh_mult[THR_COMP_NEW_NEWLG] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROLG] += 2500; - - rd->thresh_mult[THR_COMP_NEAR_NEARBA] += 1200; - rd->thresh_mult[THR_COMP_NEAREST_NEWBA] += 1500; - rd->thresh_mult[THR_COMP_NEW_NEARESTBA] += 1500; - rd->thresh_mult[THR_COMP_NEAR_NEWBA] += 1700; - rd->thresh_mult[THR_COMP_NEW_NEARBA] += 1700; - rd->thresh_mult[THR_COMP_NEW_NEWBA] += 2000; - rd->thresh_mult[THR_COMP_ZERO_ZEROBA] += 2500; -#endif // CONFIG_EXT_COMP_REFS -#endif // CONFIG_EXT_REFS + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] += 2500; rd->thresh_mult[THR_H_PRED] += 2000; rd->thresh_mult[THR_V_PRED] += 2000; rd->thresh_mult[THR_D135_PRED] += 2500; - rd->thresh_mult[THR_D207_PRED] += 2500; - rd->thresh_mult[THR_D153_PRED] += 2500; - rd->thresh_mult[THR_D63_PRED] += 2500; - rd->thresh_mult[THR_D117_PRED] += 2500; + rd->thresh_mult[THR_D203_PRED] += 2500; + rd->thresh_mult[THR_D157_PRED] += 2500; + rd->thresh_mult[THR_D67_PRED] += 2500; + rd->thresh_mult[THR_D113_PRED] += 2500; rd->thresh_mult[THR_D45_PRED] += 2500; - rd->thresh_mult[THR_COMP_INTERINTRA_ZEROL] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARL] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEWL] += 2000; - -#if CONFIG_EXT_REFS - 
rd->thresh_mult[THR_COMP_INTERINTRA_ZEROL2] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTL2] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARL2] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEWL2] += 2000; - - rd->thresh_mult[THR_COMP_INTERINTRA_ZEROL3] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTL3] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARL3] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEWL3] += 2000; -#endif // CONFIG_EXT_REFS - - rd->thresh_mult[THR_COMP_INTERINTRA_ZEROG] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARG] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEWG] += 2000; - -#if CONFIG_EXT_REFS - rd->thresh_mult[THR_COMP_INTERINTRA_ZEROB] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTB] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARB] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEWB] += 2000; - - rd->thresh_mult[THR_COMP_INTERINTRA_ZEROA2] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTA2] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARA2] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEWA2] += 2000; -#endif // CONFIG_EXT_REFS - - rd->thresh_mult[THR_COMP_INTERINTRA_ZEROA] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEARA] += 1500; - rd->thresh_mult[THR_COMP_INTERINTRA_NEWA] += 2000; + rd->thresh_mult[THR_COMP_NEAR_NEARLL2] += 1600; + rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] += 2000; + rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] += 2000; + rd->thresh_mult[THR_COMP_NEAR_NEWLL2] += 2200; + rd->thresh_mult[THR_COMP_NEW_NEARLL2] += 2200; + rd->thresh_mult[THR_COMP_NEW_NEWLL2] += 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL2] += 3200; + + rd->thresh_mult[THR_COMP_NEAR_NEARLL3] += 1600; + rd->thresh_mult[THR_COMP_NEAREST_NEWLL3] += 2000; + rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] += 2000; + rd->thresh_mult[THR_COMP_NEAR_NEWLL3] += 2200; + rd->thresh_mult[THR_COMP_NEW_NEARLL3] += 2200; + rd->thresh_mult[THR_COMP_NEW_NEWLL3] += 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL3] += 3200; + + rd->thresh_mult[THR_COMP_NEAR_NEARLG] += 1600; + rd->thresh_mult[THR_COMP_NEAREST_NEWLG] += 2000; + rd->thresh_mult[THR_COMP_NEW_NEARESTLG] += 2000; + rd->thresh_mult[THR_COMP_NEAR_NEWLG] += 2200; + rd->thresh_mult[THR_COMP_NEW_NEARLG] += 2200; + rd->thresh_mult[THR_COMP_NEW_NEWLG] += 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLG] += 3200; + + rd->thresh_mult[THR_COMP_NEAR_NEARBA] += 1600; + rd->thresh_mult[THR_COMP_NEAREST_NEWBA] += 2000; + rd->thresh_mult[THR_COMP_NEW_NEARESTBA] += 2000; + rd->thresh_mult[THR_COMP_NEAR_NEWBA] += 2200; + rd->thresh_mult[THR_COMP_NEW_NEARBA] += 2200; + rd->thresh_mult[THR_COMP_NEW_NEWBA] += 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALBA] += 3200; } void av1_set_rd_speed_thresholds_sub8x8(AV1_COMP *cpi) { - static const int thresh_mult[MAX_REFS] = { -#if CONFIG_EXT_REFS - 2500, - 2500, - 2500, - 2500, - 2500, - 2500, - 2500, - 4500, - 4500, - 4500, - 4500, - 4500, - 4500, - 4500, - 4500, - 4500, - 4500, - 4500, - 4500, - 2500 -#else // !CONFIG_EXT_REFS - 2500, - 2500, - 2500, - 4500, - 4500, - 2500 -#endif // CONFIG_EXT_REFS - }; + static const int thresh_mult[MAX_REFS] = { 2500, 2500, 2500, 2500, 2500, + 2500, 2500, 4500, 4500, 4500, + 4500, 4500, 4500, 4500, 4500, + 4500, 4500, 4500, 4500, 2500 }; RD_OPT *const rd = &cpi->rd; memcpy(rd->thresh_mult_sub8x8, thresh_mult, sizeof(thresh_mult)); } @@ -1611,15 +1013,12 @@ void av1_update_rd_thresh_fact(const AV1_COMMON *const cm, 
int (*factor_buf)[MAX_MODES], int rd_thresh, int bsize, int best_mode_index) { if (rd_thresh > 0) { -#if CONFIG_CB4X4 const int top_mode = MAX_MODES; -#else - const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES; -#endif int mode; for (mode = 0; mode < top_mode; ++mode) { const BLOCK_SIZE min_size = AOMMAX(bsize - 1, BLOCK_4X4); - const BLOCK_SIZE max_size = AOMMIN(bsize + 2, (int)cm->sb_size); + const BLOCK_SIZE max_size = + AOMMIN(bsize + 2, (int)cm->seq_params.sb_size); BLOCK_SIZE bs; for (bs = min_size; bs <= max_size; ++bs) { int *const fact = &factor_buf[bs][mode]; @@ -1635,8 +1034,7 @@ void av1_update_rd_thresh_fact(const AV1_COMMON *const cm, int av1_get_intra_cost_penalty(int qindex, int qdelta, aom_bit_depth_t bit_depth) { - const int q = av1_dc_quant(qindex, qdelta, bit_depth); -#if CONFIG_HIGHBITDEPTH + const int q = av1_dc_quant_Q3(qindex, qdelta, bit_depth); switch (bit_depth) { case AOM_BITS_8: return 20 * q; case AOM_BITS_10: return 5 * q; @@ -1645,7 +1043,4 @@ int av1_get_intra_cost_penalty(int qindex, int qdelta, assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); return -1; } -#else - return 20 * q; -#endif // CONFIG_HIGHBITDEPTH } diff --git a/third_party/aom/av1/encoder/rd.h b/third_party/aom/av1/encoder/rd.h index 35ada8e6c..281b676b0 100644 --- a/third_party/aom/av1/encoder/rd.h +++ b/third_party/aom/av1/encoder/rd.h @@ -14,9 +14,6 @@ #include <limits.h> -#if CONFIG_ANS -#include "aom_dsp/ans.h" -#endif // CONFIG_ANS #include "av1/common/blockd.h" #include "av1/encoder/block.h" @@ -30,9 +27,9 @@ extern "C" { #define RDDIV_BITS 7 #define RD_EPB_SHIFT 6 -#define RDCOST(RM, R, D) \ - (ROUND_POWER_OF_TWO(((int64_t)R) * (RM), AV1_PROB_COST_SHIFT) + \ - (D << RDDIV_BITS)) +#define RDCOST(RM, R, D) \ + (ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT) + \ + ((D) * (1 << RDDIV_BITS))) #define RDCOST_DBL(RM, R, D) \ (((((double)(R)) * (RM)) / (double)(1 << AV1_PROB_COST_SHIFT)) + \ @@ -50,102 +47,43 @@ extern "C" { // const MODE_DEFINITION av1_mode_order[MAX_MODES] used in the rd code. 
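The RDCOST hunk above deserves a note: besides reformatting, it parenthesizes the macro arguments and replaces "D << RDDIV_BITS" with a multiplication by (1 << RDDIV_BITS), which is well defined for signed operands where a left shift is not. Below is a minimal standalone sketch of the cost the macro computes; it assumes AV1_PROB_COST_SHIFT is 9 (its usual value elsewhere in libaom) and takes RDDIV_BITS = 7 from this header.

#include <stdint.h>

/* Assumed constants for the sketch: AV1_PROB_COST_SHIFT = 9, RDDIV_BITS = 7. */
enum { SKETCH_PROB_COST_SHIFT = 9, SKETCH_RDDIV_BITS = 7 };

static int64_t rdcost_sketch(int rdmult, int rate, int64_t dist) {
  /* Rate term: fixed-point product of the rate cost and the Lagrange
   * multiplier rdmult, with rounding; this is what ROUND_POWER_OF_TWO
   * does in the macro. */
  const int64_t rate_term =
      ((int64_t)rate * rdmult + (1 << (SKETCH_PROB_COST_SHIFT - 1))) >>
      SKETCH_PROB_COST_SHIFT;
  /* Distortion term: scaled up by 2^RDDIV_BITS via multiplication rather
   * than a shift, matching the rewritten macro. */
  return rate_term + dist * (1 << SKETCH_RDDIV_BITS);
}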
typedef enum { THR_NEARESTMV, -#if CONFIG_EXT_REFS THR_NEARESTL2, THR_NEARESTL3, THR_NEARESTB, THR_NEARESTA2, -#endif // CONFIG_EXT_REFS THR_NEARESTA, THR_NEARESTG, THR_DC, THR_NEWMV, -#if CONFIG_EXT_REFS THR_NEWL2, THR_NEWL3, THR_NEWB, THR_NEWA2, -#endif // CONFIG_EXT_REFS THR_NEWA, THR_NEWG, THR_NEARMV, -#if CONFIG_EXT_REFS THR_NEARL2, THR_NEARL3, THR_NEARB, THR_NEARA2, -#endif // CONFIG_EXT_REFS THR_NEARA, THR_NEARG, - THR_ZEROMV, -#if CONFIG_EXT_REFS - THR_ZEROL2, - THR_ZEROL3, - THR_ZEROB, - THR_ZEROA2, -#endif // CONFIG_EXT_REFS - THR_ZEROA, - THR_ZEROG, - -#if CONFIG_COMPOUND_SINGLEREF - THR_SR_NEAREST_NEARMV, -#if CONFIG_EXT_REFS - THR_SR_NEAREST_NEARL2, - THR_SR_NEAREST_NEARL3, - THR_SR_NEAREST_NEARB, -#endif // CONFIG_EXT_REFS - THR_SR_NEAREST_NEARG, - THR_SR_NEAREST_NEARA, - - /* - THR_SR_NEAREST_NEWMV, -#if CONFIG_EXT_REFS - THR_SR_NEAREST_NEWL2, - THR_SR_NEAREST_NEWL3, - THR_SR_NEAREST_NEWB, -#endif // CONFIG_EXT_REFS - THR_SR_NEAREST_NEWG, - THR_SR_NEAREST_NEWA,*/ - - THR_SR_NEAR_NEWMV, -#if CONFIG_EXT_REFS - THR_SR_NEAR_NEWL2, - THR_SR_NEAR_NEWL3, - THR_SR_NEAR_NEWB, -#endif // CONFIG_EXT_REFS - THR_SR_NEAR_NEWG, - THR_SR_NEAR_NEWA, - - THR_SR_ZERO_NEWMV, -#if CONFIG_EXT_REFS - THR_SR_ZERO_NEWL2, - THR_SR_ZERO_NEWL3, - THR_SR_ZERO_NEWB, -#endif // CONFIG_EXT_REFS - THR_SR_ZERO_NEWG, - THR_SR_ZERO_NEWA, - - THR_SR_NEW_NEWMV, -#if CONFIG_EXT_REFS - THR_SR_NEW_NEWL2, - THR_SR_NEW_NEWL3, - THR_SR_NEW_NEWB, -#endif // CONFIG_EXT_REFS - THR_SR_NEW_NEWG, - THR_SR_NEW_NEWA, -#endif // CONFIG_COMPOUND_SINGLEREF + THR_GLOBALMV, + THR_GLOBALL2, + THR_GLOBALL3, + THR_GLOBALB, + THR_GLOBALA2, + THR_GLOBALA, + THR_GLOBALG, THR_COMP_NEAREST_NEARESTLA, -#if CONFIG_EXT_REFS THR_COMP_NEAREST_NEARESTL2A, THR_COMP_NEAREST_NEARESTL3A, -#endif // CONFIG_EXT_REFS THR_COMP_NEAREST_NEARESTGA, -#if CONFIG_EXT_REFS THR_COMP_NEAREST_NEARESTLB, THR_COMP_NEAREST_NEARESTL2B, THR_COMP_NEAREST_NEARESTL3B, @@ -154,21 +92,16 @@ typedef enum { THR_COMP_NEAREST_NEARESTL2A2, THR_COMP_NEAREST_NEARESTL3A2, THR_COMP_NEAREST_NEARESTGA2, -#if CONFIG_EXT_COMP_REFS THR_COMP_NEAREST_NEARESTLL2, THR_COMP_NEAREST_NEARESTLL3, THR_COMP_NEAREST_NEARESTLG, THR_COMP_NEAREST_NEARESTBA, -#endif // CONFIG_EXT_COMP_REFS -#endif // CONFIG_EXT_REFS - THR_TM, + THR_PAETH, THR_SMOOTH, -#if CONFIG_SMOOTH_HV THR_SMOOTH_V, THR_SMOOTH_H, -#endif // CONFIG_SMOOTH_HV THR_COMP_NEAR_NEARLA, THR_COMP_NEW_NEARESTLA, @@ -176,16 +109,15 @@ typedef enum { THR_COMP_NEW_NEARLA, THR_COMP_NEAR_NEWLA, THR_COMP_NEW_NEWLA, - THR_COMP_ZERO_ZEROLA, + THR_COMP_GLOBAL_GLOBALLA, -#if CONFIG_EXT_REFS THR_COMP_NEAR_NEARL2A, THR_COMP_NEW_NEARESTL2A, THR_COMP_NEAREST_NEWL2A, THR_COMP_NEW_NEARL2A, THR_COMP_NEAR_NEWL2A, THR_COMP_NEW_NEWL2A, - THR_COMP_ZERO_ZEROL2A, + THR_COMP_GLOBAL_GLOBALL2A, THR_COMP_NEAR_NEARL3A, THR_COMP_NEW_NEARESTL3A, @@ -193,8 +125,7 @@ typedef enum { THR_COMP_NEW_NEARL3A, THR_COMP_NEAR_NEWL3A, THR_COMP_NEW_NEWL3A, - THR_COMP_ZERO_ZEROL3A, -#endif // CONFIG_EXT_REFS + THR_COMP_GLOBAL_GLOBALL3A, THR_COMP_NEAR_NEARGA, THR_COMP_NEW_NEARESTGA, @@ -202,16 +133,15 @@ typedef enum { THR_COMP_NEW_NEARGA, THR_COMP_NEAR_NEWGA, THR_COMP_NEW_NEWGA, - THR_COMP_ZERO_ZEROGA, + THR_COMP_GLOBAL_GLOBALGA, -#if CONFIG_EXT_REFS THR_COMP_NEAR_NEARLB, THR_COMP_NEW_NEARESTLB, THR_COMP_NEAREST_NEWLB, THR_COMP_NEW_NEARLB, THR_COMP_NEAR_NEWLB, THR_COMP_NEW_NEWLB, - THR_COMP_ZERO_ZEROLB, + THR_COMP_GLOBAL_GLOBALLB, THR_COMP_NEAR_NEARL2B, THR_COMP_NEW_NEARESTL2B, @@ -219,7 +149,7 @@ typedef enum { THR_COMP_NEW_NEARL2B, THR_COMP_NEAR_NEWL2B, THR_COMP_NEW_NEWL2B, - 
THR_COMP_ZERO_ZEROL2B, + THR_COMP_GLOBAL_GLOBALL2B, THR_COMP_NEAR_NEARL3B, THR_COMP_NEW_NEARESTL3B, @@ -227,7 +157,7 @@ typedef enum { THR_COMP_NEW_NEARL3B, THR_COMP_NEAR_NEWL3B, THR_COMP_NEW_NEWL3B, - THR_COMP_ZERO_ZEROL3B, + THR_COMP_GLOBAL_GLOBALL3B, THR_COMP_NEAR_NEARGB, THR_COMP_NEW_NEARESTGB, @@ -235,7 +165,7 @@ typedef enum { THR_COMP_NEW_NEARGB, THR_COMP_NEAR_NEWGB, THR_COMP_NEW_NEWGB, - THR_COMP_ZERO_ZEROGB, + THR_COMP_GLOBAL_GLOBALGB, THR_COMP_NEAR_NEARLA2, THR_COMP_NEW_NEARESTLA2, @@ -243,7 +173,7 @@ typedef enum { THR_COMP_NEW_NEARLA2, THR_COMP_NEAR_NEWLA2, THR_COMP_NEW_NEWLA2, - THR_COMP_ZERO_ZEROLA2, + THR_COMP_GLOBAL_GLOBALLA2, THR_COMP_NEAR_NEARL2A2, THR_COMP_NEW_NEARESTL2A2, @@ -251,7 +181,7 @@ typedef enum { THR_COMP_NEW_NEARL2A2, THR_COMP_NEAR_NEWL2A2, THR_COMP_NEW_NEWL2A2, - THR_COMP_ZERO_ZEROL2A2, + THR_COMP_GLOBAL_GLOBALL2A2, THR_COMP_NEAR_NEARL3A2, THR_COMP_NEW_NEARESTL3A2, @@ -259,7 +189,7 @@ typedef enum { THR_COMP_NEW_NEARL3A2, THR_COMP_NEAR_NEWL3A2, THR_COMP_NEW_NEWL3A2, - THR_COMP_ZERO_ZEROL3A2, + THR_COMP_GLOBAL_GLOBALL3A2, THR_COMP_NEAR_NEARGA2, THR_COMP_NEW_NEARESTGA2, @@ -267,16 +197,24 @@ typedef enum { THR_COMP_NEW_NEARGA2, THR_COMP_NEAR_NEWGA2, THR_COMP_NEW_NEWGA2, - THR_COMP_ZERO_ZEROGA2, + THR_COMP_GLOBAL_GLOBALGA2, + + THR_H_PRED, + THR_V_PRED, + THR_D135_PRED, + THR_D203_PRED, + THR_D157_PRED, + THR_D67_PRED, + THR_D113_PRED, + THR_D45_PRED, -#if CONFIG_EXT_COMP_REFS THR_COMP_NEAR_NEARLL2, THR_COMP_NEW_NEARESTLL2, THR_COMP_NEAREST_NEWLL2, THR_COMP_NEW_NEARLL2, THR_COMP_NEAR_NEWLL2, THR_COMP_NEW_NEWLL2, - THR_COMP_ZERO_ZEROLL2, + THR_COMP_GLOBAL_GLOBALLL2, THR_COMP_NEAR_NEARLL3, THR_COMP_NEW_NEARESTLL3, @@ -284,7 +222,7 @@ typedef enum { THR_COMP_NEW_NEARLL3, THR_COMP_NEAR_NEWLL3, THR_COMP_NEW_NEWLL3, - THR_COMP_ZERO_ZEROLL3, + THR_COMP_GLOBAL_GLOBALLL3, THR_COMP_NEAR_NEARLG, THR_COMP_NEW_NEARESTLG, @@ -292,7 +230,7 @@ typedef enum { THR_COMP_NEW_NEARLG, THR_COMP_NEAR_NEWLG, THR_COMP_NEW_NEWLG, - THR_COMP_ZERO_ZEROLG, + THR_COMP_GLOBAL_GLOBALLG, THR_COMP_NEAR_NEARBA, THR_COMP_NEW_NEARESTBA, @@ -300,79 +238,25 @@ typedef enum { THR_COMP_NEW_NEARBA, THR_COMP_NEAR_NEWBA, THR_COMP_NEW_NEWBA, - THR_COMP_ZERO_ZEROBA, -#endif // CONFIG_EXT_COMP_REFS -#endif // CONFIG_EXT_REFS + THR_COMP_GLOBAL_GLOBALBA, - THR_H_PRED, - THR_V_PRED, - THR_D135_PRED, - THR_D207_PRED, - THR_D153_PRED, - THR_D63_PRED, - THR_D117_PRED, - THR_D45_PRED, - - THR_COMP_INTERINTRA_ZEROL, - THR_COMP_INTERINTRA_NEARESTL, - THR_COMP_INTERINTRA_NEARL, - THR_COMP_INTERINTRA_NEWL, - -#if CONFIG_EXT_REFS - THR_COMP_INTERINTRA_ZEROL2, - THR_COMP_INTERINTRA_NEARESTL2, - THR_COMP_INTERINTRA_NEARL2, - THR_COMP_INTERINTRA_NEWL2, - - THR_COMP_INTERINTRA_ZEROL3, - THR_COMP_INTERINTRA_NEARESTL3, - THR_COMP_INTERINTRA_NEARL3, - THR_COMP_INTERINTRA_NEWL3, -#endif // CONFIG_EXT_REFS - - THR_COMP_INTERINTRA_ZEROG, - THR_COMP_INTERINTRA_NEARESTG, - THR_COMP_INTERINTRA_NEARG, - THR_COMP_INTERINTRA_NEWG, - -#if CONFIG_EXT_REFS - THR_COMP_INTERINTRA_ZEROB, - THR_COMP_INTERINTRA_NEARESTB, - THR_COMP_INTERINTRA_NEARB, - THR_COMP_INTERINTRA_NEWB, - - THR_COMP_INTERINTRA_ZEROA2, - THR_COMP_INTERINTRA_NEARESTA2, - THR_COMP_INTERINTRA_NEARA2, - THR_COMP_INTERINTRA_NEWA2, -#endif // CONFIG_EXT_REFS - - THR_COMP_INTERINTRA_ZEROA, - THR_COMP_INTERINTRA_NEARESTA, - THR_COMP_INTERINTRA_NEARA, - THR_COMP_INTERINTRA_NEWA, MAX_MODES } THR_MODES; typedef enum { THR_LAST, -#if CONFIG_EXT_REFS THR_LAST2, THR_LAST3, THR_BWDR, THR_ALTR2, -#endif // CONFIG_EXT_REFS THR_GOLD, THR_ALTR, THR_COMP_LA, -#if CONFIG_EXT_REFS 
THR_COMP_L2A, THR_COMP_L3A, -#endif // CONFIG_EXT_REFS THR_COMP_GA, -#if CONFIG_EXT_REFS THR_COMP_LB, THR_COMP_L2B, THR_COMP_L3B, @@ -382,7 +266,6 @@ typedef enum { THR_COMP_L2A2, THR_COMP_L3A2, THR_COMP_GA2, -#endif // CONFIG_EXT_REFS THR_INTRA, @@ -399,7 +282,7 @@ typedef struct RD_OPT { int threshes[MAX_SEGMENTS][BLOCK_SIZES_ALL][MAX_MODES]; - int64_t prediction_type_threshes[TOTAL_REFS_PER_FRAME][REFERENCE_MODES]; + int64_t prediction_type_threshes[REF_FRAMES][REFERENCE_MODES]; int RDMULT; } RD_OPT; @@ -417,16 +300,16 @@ static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) { rd_stats->invalid_rate = 0; rd_stats->ref_rdcost = INT64_MAX; #if CONFIG_RD_DEBUG + // This may run into problems when monochrome video is + // encoded, as there will only be 1 plane for (plane = 0; plane < MAX_MB_PLANE; ++plane) { rd_stats->txb_coeff_cost[plane] = 0; -#if CONFIG_VAR_TX { int r, c; for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) rd_stats->txb_coeff_cost_map[plane][r][c] = 0; } -#endif } #endif } @@ -444,16 +327,16 @@ static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) { rd_stats->invalid_rate = 1; rd_stats->ref_rdcost = INT64_MAX; #if CONFIG_RD_DEBUG + // This may run into problems when monochrome video is + // encoded, as there will only be 1 plane for (plane = 0; plane < MAX_MB_PLANE; ++plane) { rd_stats->txb_coeff_cost[plane] = INT_MAX; -#if CONFIG_VAR_TX { int r, c; for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) rd_stats->txb_coeff_cost_map[plane][r][c] = INT_MAX; } -#endif } #endif } @@ -464,14 +347,17 @@ static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst, int plane; #endif rd_stats_dst->rate += rd_stats_src->rate; + if (!rd_stats_dst->zero_rate) + rd_stats_dst->zero_rate = rd_stats_src->zero_rate; rd_stats_dst->dist += rd_stats_src->dist; rd_stats_dst->sse += rd_stats_src->sse; rd_stats_dst->skip &= rd_stats_src->skip; rd_stats_dst->invalid_rate &= rd_stats_src->invalid_rate; #if CONFIG_RD_DEBUG + // This may run into problems when monochrome video is + // encoded, as there will only be 1 plane for (plane = 0; plane < MAX_MB_PLANE; ++plane) { rd_stats_dst->txb_coeff_cost[plane] += rd_stats_src->txb_coeff_cost[plane]; -#if CONFIG_VAR_TX { // TODO(angiebird): optimize this part int r, c; @@ -484,21 +370,10 @@ static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst, } assert(ref_txb_coeff_cost == rd_stats_dst->txb_coeff_cost[plane]); } -#endif } #endif } -static INLINE int av1_get_coeff_token_cost(int token, int eob_val, int is_first, - const int *head_cost_table, - const int *tail_cost_table) { - if (eob_val == LAST_EOB) return av1_cost_zero(128); - const int comb_symb = 2 * AOMMIN(token, TWO_TOKEN) - eob_val + is_first; - int cost = head_cost_table[comb_symb]; - if (token > ONE_TOKEN) cost += tail_cost_table[token - TWO_TOKEN]; - return cost; -} - struct TileInfo; struct TileDataEnc; struct AV1_COMP; @@ -528,13 +403,12 @@ YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const struct AV1_COMP *cpi, void av1_init_me_luts(void); -void av1_set_mvcost(MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int ref, - int ref_mv_idx); +void av1_set_mvcost(MACROBLOCK *x, int ref, int ref_mv_idx); -void av1_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, +void av1_get_entropy_contexts(BLOCK_SIZE bsize, const struct macroblockd_plane *pd, - ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE], - ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]); + ENTROPY_CONTEXT t_above[MAX_MIB_SIZE], + ENTROPY_CONTEXT 
t_left[MAX_MIB_SIZE]); void av1_set_rd_speed_thresholds(struct AV1_COMP *cpi); @@ -562,7 +436,8 @@ void av1_setup_pred_block(const MACROBLOCKD *xd, struct buf_2d dst[MAX_MB_PLANE], const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, const struct scale_factors *scale, - const struct scale_factors *scale_uv); + const struct scale_factors *scale_uv, + const int num_planes); int av1_get_intra_cost_penalty(int qindex, int qdelta, aom_bit_depth_t bit_depth); @@ -570,12 +445,8 @@ int av1_get_intra_cost_penalty(int qindex, int qdelta, void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x, FRAME_CONTEXT *fc); -#if CONFIG_LV_MAP -void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc); -#endif - -void av1_fill_token_costs_from_cdf(av1_coeff_cost *cost, - coeff_cdf_model (*cdf)[PLANE_TYPES]); +void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc, + const int num_planes); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c index 607db9b86..6f4fced87 100644 --- a/third_party/aom/av1/encoder/rdopt.c +++ b/third_party/aom/av1/encoder/rdopt.c @@ -12,18 +12,17 @@ #include <assert.h> #include <math.h> -#include "./aom_dsp_rtcd.h" -#include "./av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/blend.h" #include "aom_mem/aom_mem.h" +#include "aom_ports/aom_timer.h" #include "aom_ports/mem.h" #include "aom_ports/system_state.h" -#if CONFIG_CFL #include "av1/common/cfl.h" -#endif #include "av1/common/common.h" #include "av1/common/common_data.h" #include "av1/common/entropy.h" @@ -37,12 +36,8 @@ #include "av1/common/reconintra.h" #include "av1/common/scan.h" #include "av1/common/seg_common.h" -#if CONFIG_LV_MAP #include "av1/common/txb_common.h" -#endif -#if CONFIG_WARPED_MOTION #include "av1/common/warped_motion.h" -#endif // CONFIG_WARPED_MOTION #include "av1/encoder/aq_variance.h" #include "av1/encoder/av1_quantize.h" @@ -50,105 +45,37 @@ #include "av1/encoder/encodemb.h" #include "av1/encoder/encodemv.h" #include "av1/encoder/encoder.h" -#if CONFIG_LV_MAP #include "av1/encoder/encodetxb.h" -#endif #include "av1/encoder/hybrid_fwd_txfm.h" #include "av1/encoder/mcomp.h" +#include "av1/encoder/ml.h" #include "av1/encoder/palette.h" +#include "av1/encoder/pustats.h" +#include "av1/encoder/random.h" #include "av1/encoder/ratectrl.h" #include "av1/encoder/rd.h" #include "av1/encoder/rdopt.h" #include "av1/encoder/tokenize.h" -#if CONFIG_PVQ -#include "av1/encoder/pvq_encoder.h" -#include "av1/common/pvq.h" -#endif // CONFIG_PVQ -#if CONFIG_DUAL_FILTER +#include "av1/encoder/tx_prune_model_weights.h" + +// Set this macro as 1 to collect data about tx size selection. 
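The macro defined on the next line is a compile-time instrumentation switch: judging from the guarded code, setting it to 1 makes the transform-size search in this file log its decisions to the file named by av1_tx_size_data_output_file ("tx_size_data.txt") for offline analysis, while the default of 0 compiles the logging out.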
+#define COLLECT_TX_SIZE_DATA 0 +#if COLLECT_TX_SIZE_DATA +static const char av1_tx_size_data_output_file[] = "tx_size_data.txt"; +#endif + #define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS) -#if USE_EXTRA_FILTER -static const int filter_sets[DUAL_FILTER_SET_SIZE][2] = { - { 0, 0 }, { 0, 1 }, { 0, 2 }, { 0, 3 }, { 1, 0 }, { 1, 1 }, - { 1, 2 }, { 1, 3 }, { 2, 0 }, { 2, 1 }, { 2, 2 }, { 2, 3 }, - { 3, 0 }, { 3, 1 }, { 3, 2 }, { 3, 3 }, +static const InterpFilters filter_sets[DUAL_FILTER_SET_SIZE] = { + 0x00000000, 0x00010000, 0x00020000, // y = 0 + 0x00000001, 0x00010001, 0x00020001, // y = 1 + 0x00000002, 0x00010002, 0x00020002, // y = 2 }; -#else // USE_EXTRA_FILTER -static const int filter_sets[DUAL_FILTER_SET_SIZE][2] = { - { 0, 0 }, { 0, 1 }, { 0, 2 }, { 1, 0 }, { 1, 1 }, - { 1, 2 }, { 2, 0 }, { 2, 1 }, { 2, 2 }, -}; -#endif // USE_EXTRA_FILTER -#endif // CONFIG_DUAL_FILTER - -#if CONFIG_EXT_REFS - -#define LAST_FRAME_MODE_MASK \ - ((1 << INTRA_FRAME) | (1 << LAST2_FRAME) | (1 << LAST3_FRAME) | \ - (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) | \ - (1 << ALTREF_FRAME)) -#define LAST2_FRAME_MODE_MASK \ - ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST3_FRAME) | \ - (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) | \ - (1 << ALTREF_FRAME)) -#define LAST3_FRAME_MODE_MASK \ - ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \ - (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) | \ - (1 << ALTREF_FRAME)) -#define GOLDEN_FRAME_MODE_MASK \ - ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \ - (1 << LAST3_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) | \ - (1 << ALTREF_FRAME)) -#define BWDREF_FRAME_MODE_MASK \ - ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \ - (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << ALTREF2_FRAME) | \ - (1 << ALTREF_FRAME)) -#define ALTREF2_FRAME_MODE_MASK \ - ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \ - (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | \ - (1 << ALTREF_FRAME)) -#define ALTREF_FRAME_MODE_MASK \ - ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \ - (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | \ - (1 << ALTREF2_FRAME)) - -#else // !CONFIG_EXT_REFS - -#define LAST_FRAME_MODE_MASK \ - ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME)) -#define GOLDEN_FRAME_MODE_MASK \ - ((1 << LAST_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME)) -#define ALTREF_FRAME_MODE_MASK \ - ((1 << LAST_FRAME) | (1 << GOLDEN_FRAME) | (1 << INTRA_FRAME)) - -#endif // CONFIG_EXT_REFS - -#if CONFIG_EXT_REFS -#if CONFIG_EXT_COMP_REFS + #define SECOND_REF_FRAME_MASK \ ((1 << ALTREF_FRAME) | (1 << ALTREF2_FRAME) | (1 << BWDREF_FRAME) | \ (1 << GOLDEN_FRAME) | (1 << LAST2_FRAME) | 0x01) -#else // !CONFIG_EXT_COMP_REFS -#define SECOND_REF_FRAME_MASK \ - ((1 << ALTREF_FRAME) | (1 << ALTREF2_FRAME) | (1 << BWDREF_FRAME) | 0x01) -#endif // CONFIG_EXT_COMP_REFS -#else // !CONFIG_EXT_REFS -#define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | 0x01) -#endif // CONFIG_EXT_REFS - -#define MIN_EARLY_TERM_INDEX 3 -#define NEW_MV_DISCOUNT_FACTOR 8 -#if CONFIG_EXT_INTRA #define ANGLE_SKIP_THRESH 10 -#define FILTER_FAST_SEARCH 1 -#endif // CONFIG_EXT_INTRA - -// Setting this to 1 will disable trellis optimization within the -// transform search. Trellis optimization will still be applied -// in the final encode. 
-#ifndef DISABLE_TRELLISQ_SEARCH -#define DISABLE_TRELLISQ_SEARCH 0 -#endif static const double ADST_FLIP_SVM[8] = { /* vertical */ @@ -162,122 +89,72 @@ typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } MODE_DEFINITION; -typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } REF_DEFINITION; +typedef struct { + MV_REFERENCE_FRAME ref_frame[2]; +} REF_DEFINITION; + +typedef enum { + FTXS_NONE = 0, + FTXS_DCT_AND_1D_DCT_ONLY = 1 << 0, + FTXS_DISABLE_TRELLIS_OPT = 1 << 1, + FTXS_USE_TRANSFORM_DOMAIN = 1 << 2 +} FAST_TX_SEARCH_MODE; struct rdcost_block_args { const AV1_COMP *cpi; MACROBLOCK *x; - ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE]; - ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]; + ENTROPY_CONTEXT t_above[MAX_MIB_SIZE]; + ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]; RD_STATS rd_stats; int64_t this_rd; int64_t best_rd; int exit_early; int use_fast_coef_costing; + FAST_TX_SEARCH_MODE ftxs_mode; }; #define LAST_NEW_MV_INDEX 6 static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEARESTMV, { LAST_FRAME, NONE_FRAME } }, -#if CONFIG_EXT_REFS { NEARESTMV, { LAST2_FRAME, NONE_FRAME } }, { NEARESTMV, { LAST3_FRAME, NONE_FRAME } }, { NEARESTMV, { BWDREF_FRAME, NONE_FRAME } }, { NEARESTMV, { ALTREF2_FRAME, NONE_FRAME } }, -#endif // CONFIG_EXT_REFS { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } }, { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } }, { DC_PRED, { INTRA_FRAME, NONE_FRAME } }, { NEWMV, { LAST_FRAME, NONE_FRAME } }, -#if CONFIG_EXT_REFS { NEWMV, { LAST2_FRAME, NONE_FRAME } }, { NEWMV, { LAST3_FRAME, NONE_FRAME } }, { NEWMV, { BWDREF_FRAME, NONE_FRAME } }, { NEWMV, { ALTREF2_FRAME, NONE_FRAME } }, -#endif // CONFIG_EXT_REFS { NEWMV, { ALTREF_FRAME, NONE_FRAME } }, { NEWMV, { GOLDEN_FRAME, NONE_FRAME } }, { NEARMV, { LAST_FRAME, NONE_FRAME } }, -#if CONFIG_EXT_REFS { NEARMV, { LAST2_FRAME, NONE_FRAME } }, { NEARMV, { LAST3_FRAME, NONE_FRAME } }, { NEARMV, { BWDREF_FRAME, NONE_FRAME } }, { NEARMV, { ALTREF2_FRAME, NONE_FRAME } }, -#endif // CONFIG_EXT_REFS { NEARMV, { ALTREF_FRAME, NONE_FRAME } }, { NEARMV, { GOLDEN_FRAME, NONE_FRAME } }, - { ZEROMV, { LAST_FRAME, NONE_FRAME } }, -#if CONFIG_EXT_REFS - { ZEROMV, { LAST2_FRAME, NONE_FRAME } }, - { ZEROMV, { LAST3_FRAME, NONE_FRAME } }, - { ZEROMV, { BWDREF_FRAME, NONE_FRAME } }, - { ZEROMV, { ALTREF2_FRAME, NONE_FRAME } }, -#endif // CONFIG_EXT_REFS - { ZEROMV, { GOLDEN_FRAME, NONE_FRAME } }, - { ZEROMV, { ALTREF_FRAME, NONE_FRAME } }, - -// TODO(zoeliu): May need to reconsider the order on the modes to check - -#if CONFIG_COMPOUND_SINGLEREF - // Single ref comp mode - { SR_NEAREST_NEARMV, { LAST_FRAME, NONE_FRAME } }, -#if CONFIG_EXT_REFS - { SR_NEAREST_NEARMV, { LAST2_FRAME, NONE_FRAME } }, - { SR_NEAREST_NEARMV, { LAST3_FRAME, NONE_FRAME } }, - { SR_NEAREST_NEARMV, { BWDREF_FRAME, NONE_FRAME } }, -#endif // CONFIG_EXT_REFS - { SR_NEAREST_NEARMV, { GOLDEN_FRAME, NONE_FRAME } }, - { SR_NEAREST_NEARMV, { ALTREF_FRAME, NONE_FRAME } }, - - /* - { SR_NEAREST_NEWMV, { LAST_FRAME, NONE_FRAME } }, -#if CONFIG_EXT_REFS - { SR_NEAREST_NEWMV, { LAST2_FRAME, NONE_FRAME } }, - { SR_NEAREST_NEWMV, { LAST3_FRAME, NONE_FRAME } }, - { SR_NEAREST_NEWMV, { BWDREF_FRAME, NONE_FRAME } }, -#endif // CONFIG_EXT_REFS - { SR_NEAREST_NEWMV, { GOLDEN_FRAME, NONE_FRAME } }, - { SR_NEAREST_NEWMV, { ALTREF_FRAME, NONE_FRAME } },*/ - - { SR_NEAR_NEWMV, { LAST_FRAME, NONE_FRAME } }, -#if CONFIG_EXT_REFS - { SR_NEAR_NEWMV, { LAST2_FRAME, NONE_FRAME } }, - { SR_NEAR_NEWMV, { LAST3_FRAME, NONE_FRAME } }, - { SR_NEAR_NEWMV, { BWDREF_FRAME, NONE_FRAME } }, -#endif // CONFIG_EXT_REFS - { 
SR_NEAR_NEWMV, { GOLDEN_FRAME, NONE_FRAME } }, - { SR_NEAR_NEWMV, { ALTREF_FRAME, NONE_FRAME } }, - - { SR_ZERO_NEWMV, { LAST_FRAME, NONE_FRAME } }, -#if CONFIG_EXT_REFS - { SR_ZERO_NEWMV, { LAST2_FRAME, NONE_FRAME } }, - { SR_ZERO_NEWMV, { LAST3_FRAME, NONE_FRAME } }, - { SR_ZERO_NEWMV, { BWDREF_FRAME, NONE_FRAME } }, -#endif // CONFIG_EXT_REFS - { SR_ZERO_NEWMV, { GOLDEN_FRAME, NONE_FRAME } }, - { SR_ZERO_NEWMV, { ALTREF_FRAME, NONE_FRAME } }, - - { SR_NEW_NEWMV, { LAST_FRAME, NONE_FRAME } }, -#if CONFIG_EXT_REFS - { SR_NEW_NEWMV, { LAST2_FRAME, NONE_FRAME } }, - { SR_NEW_NEWMV, { LAST3_FRAME, NONE_FRAME } }, - { SR_NEW_NEWMV, { BWDREF_FRAME, NONE_FRAME } }, -#endif // CONFIG_EXT_REFS - { SR_NEW_NEWMV, { GOLDEN_FRAME, NONE_FRAME } }, - { SR_NEW_NEWMV, { ALTREF_FRAME, NONE_FRAME } }, -#endif // CONFIG_COMPOUND_SINGLEREF + { GLOBALMV, { LAST_FRAME, NONE_FRAME } }, + { GLOBALMV, { LAST2_FRAME, NONE_FRAME } }, + { GLOBALMV, { LAST3_FRAME, NONE_FRAME } }, + { GLOBALMV, { BWDREF_FRAME, NONE_FRAME } }, + { GLOBALMV, { ALTREF2_FRAME, NONE_FRAME } }, + { GLOBALMV, { GOLDEN_FRAME, NONE_FRAME } }, + { GLOBALMV, { ALTREF_FRAME, NONE_FRAME } }, + + // TODO(zoeliu): May need to reconsider the order on the modes to check { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, -#if CONFIG_EXT_REFS { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } }, { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } }, -#endif // CONFIG_EXT_REFS { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } }, -#if CONFIG_EXT_REFS { NEAREST_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } }, { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } }, { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } }, @@ -287,21 +164,16 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } }, { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, -#if CONFIG_EXT_COMP_REFS { NEAREST_NEARESTMV, { LAST_FRAME, LAST2_FRAME } }, { NEAREST_NEARESTMV, { LAST_FRAME, LAST3_FRAME } }, { NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } }, { NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } }, -#endif // CONFIG_EXT_COMP_REFS -#endif // CONFIG_EXT_REFS - { TM_PRED, { INTRA_FRAME, NONE_FRAME } }, + { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } }, { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } }, -#if CONFIG_SMOOTH_HV { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } }, { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } }, -#endif // CONFIG_SMOOTH_HV { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } }, { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, @@ -309,16 +181,15 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEARMV, { LAST_FRAME, ALTREF_FRAME } }, { NEAR_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, - { ZERO_ZEROMV, { LAST_FRAME, ALTREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF_FRAME } }, -#if CONFIG_EXT_REFS { NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } }, { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } }, { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, { NEW_NEARMV, { LAST2_FRAME, ALTREF_FRAME } }, { NEAR_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, - { ZERO_ZEROMV, { LAST2_FRAME, ALTREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF_FRAME } }, { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } }, { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } }, @@ -326,8 +197,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEARMV, { LAST3_FRAME, ALTREF_FRAME } }, { NEAR_NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, { 
NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, - { ZERO_ZEROMV, { LAST3_FRAME, ALTREF_FRAME } }, -#endif // CONFIG_EXT_REFS + { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF_FRAME } }, { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } }, { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } }, @@ -335,16 +205,15 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } }, { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, - { ZERO_ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF_FRAME } }, -#if CONFIG_EXT_REFS { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } }, { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } }, { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } }, { NEW_NEARMV, { LAST_FRAME, BWDREF_FRAME } }, { NEAR_NEWMV, { LAST_FRAME, BWDREF_FRAME } }, { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } }, - { ZERO_ZEROMV, { LAST_FRAME, BWDREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, BWDREF_FRAME } }, { NEAR_NEARMV, { LAST2_FRAME, BWDREF_FRAME } }, { NEW_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } }, @@ -352,7 +221,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEARMV, { LAST2_FRAME, BWDREF_FRAME } }, { NEAR_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, - { ZERO_ZEROMV, { LAST2_FRAME, BWDREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST2_FRAME, BWDREF_FRAME } }, { NEAR_NEARMV, { LAST3_FRAME, BWDREF_FRAME } }, { NEW_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } }, @@ -360,7 +229,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEARMV, { LAST3_FRAME, BWDREF_FRAME } }, { NEAR_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, - { ZERO_ZEROMV, { LAST3_FRAME, BWDREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST3_FRAME, BWDREF_FRAME } }, { NEAR_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } }, { NEW_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } }, @@ -368,7 +237,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } }, { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, - { ZERO_ZEROMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { GLOBAL_GLOBALMV, { GOLDEN_FRAME, BWDREF_FRAME } }, { NEAR_NEARMV, { LAST_FRAME, ALTREF2_FRAME } }, { NEW_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } }, @@ -376,7 +245,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEARMV, { LAST_FRAME, ALTREF2_FRAME } }, { NEAR_NEWMV, { LAST_FRAME, ALTREF2_FRAME } }, { NEW_NEWMV, { LAST_FRAME, ALTREF2_FRAME } }, - { ZERO_ZEROMV, { LAST_FRAME, ALTREF2_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF2_FRAME } }, { NEAR_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } }, { NEW_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } }, @@ -384,7 +253,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } }, { NEAR_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } }, { NEW_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } }, - { ZERO_ZEROMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF2_FRAME } }, { NEAR_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } }, { NEW_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } }, @@ -392,7 +261,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } }, { NEAR_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } }, { NEW_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } }, - { ZERO_ZEROMV, { LAST3_FRAME, ALTREF2_FRAME } }, + { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF2_FRAME } 
}, { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, @@ -400,16 +269,24 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, - { ZERO_ZEROMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + + { H_PRED, { INTRA_FRAME, NONE_FRAME } }, + { V_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D135_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D203_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D157_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D67_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D113_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D45_PRED, { INTRA_FRAME, NONE_FRAME } }, -#if CONFIG_EXT_COMP_REFS { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } }, { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } }, { NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } }, { NEW_NEARMV, { LAST_FRAME, LAST2_FRAME } }, { NEAR_NEWMV, { LAST_FRAME, LAST2_FRAME } }, { NEW_NEWMV, { LAST_FRAME, LAST2_FRAME } }, - { ZERO_ZEROMV, { LAST_FRAME, LAST2_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, LAST2_FRAME } }, { NEAR_NEARMV, { LAST_FRAME, LAST3_FRAME } }, { NEW_NEARESTMV, { LAST_FRAME, LAST3_FRAME } }, @@ -417,7 +294,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEARMV, { LAST_FRAME, LAST3_FRAME } }, { NEAR_NEWMV, { LAST_FRAME, LAST3_FRAME } }, { NEW_NEWMV, { LAST_FRAME, LAST3_FRAME } }, - { ZERO_ZEROMV, { LAST_FRAME, LAST3_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, LAST3_FRAME } }, { NEAR_NEARMV, { LAST_FRAME, GOLDEN_FRAME } }, { NEW_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } }, @@ -425,7 +302,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEARMV, { LAST_FRAME, GOLDEN_FRAME } }, { NEAR_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, { NEW_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, - { ZERO_ZEROMV, { LAST_FRAME, GOLDEN_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, GOLDEN_FRAME } }, { NEAR_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } }, { NEW_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } }, @@ -433,89 +310,400 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } }, { NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, - { ZERO_ZEROMV, { BWDREF_FRAME, ALTREF_FRAME } }, -#endif // CONFIG_EXT_COMP_REFS -#endif // CONFIG_EXT_REFS + { GLOBAL_GLOBALMV, { BWDREF_FRAME, ALTREF_FRAME } }, +}; - { H_PRED, { INTRA_FRAME, NONE_FRAME } }, - { V_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D135_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D207_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D153_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D63_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D117_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D45_PRED, { INTRA_FRAME, NONE_FRAME } }, +static const int16_t intra_to_mode_idx[INTRA_MODE_NUM] = { + 7, // DC_PRED, + 134, // V_PRED, + 133, // H_PRED, + 140, // D45_PRED, + 135, // D135_PRED, + 139, // D113_PRED, + 137, // D157_PRED, + 136, // D203_PRED, + 138, // D67_PRED, + 46, // SMOOTH_PRED, + 47, // SMOOTH_V_PRED, + 48, // SMOOTH_H_PRED, + 45, // PAETH_PRED, +}; + +/* clang-format off */ +static const int16_t single_inter_to_mode_idx[SINGLE_INTER_MODE_NUM] + [REF_FRAMES] = { + // NEARESTMV, + { -1, 0, 1, 2, 6, 3, 4, 5, }, + // NEARMV, + { -1, 15, 16, 17, 21, 18, 19, 20, }, + // GLOBALMV, + { -1, 22, 23, 24, 27, 25, 26, 28, }, + // NEWMV, + { -1, 8, 9, 10, 14, 11, 12, 13, }, +}; +/* clang-format on 
*/ - { ZEROMV, { LAST_FRAME, INTRA_FRAME } }, - { NEARESTMV, { LAST_FRAME, INTRA_FRAME } }, - { NEARMV, { LAST_FRAME, INTRA_FRAME } }, - { NEWMV, { LAST_FRAME, INTRA_FRAME } }, - -#if CONFIG_EXT_REFS - { ZEROMV, { LAST2_FRAME, INTRA_FRAME } }, - { NEARESTMV, { LAST2_FRAME, INTRA_FRAME } }, - { NEARMV, { LAST2_FRAME, INTRA_FRAME } }, - { NEWMV, { LAST2_FRAME, INTRA_FRAME } }, - - { ZEROMV, { LAST3_FRAME, INTRA_FRAME } }, - { NEARESTMV, { LAST3_FRAME, INTRA_FRAME } }, - { NEARMV, { LAST3_FRAME, INTRA_FRAME } }, - { NEWMV, { LAST3_FRAME, INTRA_FRAME } }, -#endif // CONFIG_EXT_REFS - - { ZEROMV, { GOLDEN_FRAME, INTRA_FRAME } }, - { NEARESTMV, { GOLDEN_FRAME, INTRA_FRAME } }, - { NEARMV, { GOLDEN_FRAME, INTRA_FRAME } }, - { NEWMV, { GOLDEN_FRAME, INTRA_FRAME } }, - -#if CONFIG_EXT_REFS - { ZEROMV, { BWDREF_FRAME, INTRA_FRAME } }, - { NEARESTMV, { BWDREF_FRAME, INTRA_FRAME } }, - { NEARMV, { BWDREF_FRAME, INTRA_FRAME } }, - { NEWMV, { BWDREF_FRAME, INTRA_FRAME } }, - - { ZEROMV, { ALTREF2_FRAME, INTRA_FRAME } }, - { NEARESTMV, { ALTREF2_FRAME, INTRA_FRAME } }, - { NEARMV, { ALTREF2_FRAME, INTRA_FRAME } }, - { NEWMV, { ALTREF2_FRAME, INTRA_FRAME } }, -#endif // CONFIG_EXT_REFS - - { ZEROMV, { ALTREF_FRAME, INTRA_FRAME } }, - { NEARESTMV, { ALTREF_FRAME, INTRA_FRAME } }, - { NEARMV, { ALTREF_FRAME, INTRA_FRAME } }, - { NEWMV, { ALTREF_FRAME, INTRA_FRAME } }, +/* clang-format off */ +static const int16_t comp_inter_to_mode_idx[COMP_INTER_MODE_NUM][REF_FRAMES] + [REF_FRAMES] = { + // NEAREST_NEARESTMV, + { + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, 41, 42, 43, 33, 37, 29, }, + { -1, -1, -1, -1, -1, 34, 38, 30, }, + { -1, -1, -1, -1, -1, 35, 39, 31, }, + { -1, -1, -1, -1, -1, 36, 40, 32, }, + { -1, -1, -1, -1, -1, -1, -1, 44, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + }, + // NEAR_NEARMV, + { + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, 141, 148, 155, 77, 105, 49, }, + { -1, -1, -1, -1, -1, 84, 112, 56, }, + { -1, -1, -1, -1, -1, 91, 119, 63, }, + { -1, -1, -1, -1, -1, 98, 126, 70, }, + { -1, -1, -1, -1, -1, -1, -1, 162, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + }, + // NEAREST_NEWMV, + { + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, 143, 150, 157, 79, 107, 51, }, + { -1, -1, -1, -1, -1, 86, 114, 58, }, + { -1, -1, -1, -1, -1, 93, 121, 65, }, + { -1, -1, -1, -1, -1, 100, 128, 72, }, + { -1, -1, -1, -1, -1, -1, -1, 164, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + }, + // NEW_NEARESTMV, + { + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, 142, 149, 156, 78, 106, 50, }, + { -1, -1, -1, -1, -1, 85, 113, 57, }, + { -1, -1, -1, -1, -1, 92, 120, 64, }, + { -1, -1, -1, -1, -1, 99, 127, 71, }, + { -1, -1, -1, -1, -1, -1, -1, 163, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + }, + // NEAR_NEWMV, + { + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, 145, 152, 159, 81, 109, 53, }, + { -1, -1, -1, -1, -1, 88, 116, 60, }, + { -1, -1, -1, -1, -1, 95, 123, 67, }, + { -1, -1, -1, -1, -1, 102, 130, 74, }, + { -1, -1, -1, -1, -1, -1, -1, 166, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + }, + // NEW_NEARMV, + { + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, 144, 151, 158, 80, 108, 52, }, + { -1, -1, -1, -1, -1, 87, 115, 59, }, + { -1, -1, -1, -1, -1, 94, 122, 66, }, + { -1, -1, -1, -1, -1, 101, 129, 73, }, + { -1, -1, -1, -1, -1, -1, -1, 165, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, -1, -1, 
-1, -1, -1, -1, }, + }, + // GLOBAL_GLOBALMV, + { + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, 147, 154, 161, 83, 111, 55, }, + { -1, -1, -1, -1, -1, 90, 118, 62, }, + { -1, -1, -1, -1, -1, 97, 125, 69, }, + { -1, -1, -1, -1, -1, 104, 132, 76, }, + { -1, -1, -1, -1, -1, -1, -1, 168, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + }, + // NEW_NEWMV, + { + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, 146, 153, 160, 82, 110, 54, }, + { -1, -1, -1, -1, -1, 89, 117, 61, }, + { -1, -1, -1, -1, -1, 96, 124, 68, }, + { -1, -1, -1, -1, -1, 103, 131, 75, }, + { -1, -1, -1, -1, -1, -1, -1, 167, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + { -1, -1, -1, -1, -1, -1, -1, -1, }, + }, }; +/* clang-format on */ + +static int get_prediction_mode_idx(PREDICTION_MODE this_mode, + MV_REFERENCE_FRAME ref_frame, + MV_REFERENCE_FRAME second_ref_frame) { + if (this_mode < INTRA_MODE_END) { + assert(ref_frame == INTRA_FRAME); + assert(second_ref_frame == NONE_FRAME); + return intra_to_mode_idx[this_mode - INTRA_MODE_START]; + } + if (this_mode >= SINGLE_INTER_MODE_START && + this_mode < SINGLE_INTER_MODE_END) { + assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME)); + assert(second_ref_frame == NONE_FRAME); + return single_inter_to_mode_idx[this_mode - SINGLE_INTER_MODE_START] + [ref_frame]; + } + if (this_mode >= COMP_INTER_MODE_START && this_mode < COMP_INTER_MODE_END) { + assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME)); + assert((second_ref_frame > INTRA_FRAME) && + (second_ref_frame <= ALTREF_FRAME)); + return comp_inter_to_mode_idx[this_mode - COMP_INTER_MODE_START][ref_frame] + [second_ref_frame]; + } + assert(0); + return -1; +} static const PREDICTION_MODE intra_rd_search_mode_order[INTRA_MODES] = { - DC_PRED, H_PRED, V_PRED, SMOOTH_PRED, TM_PRED, -#if CONFIG_SMOOTH_HV - SMOOTH_V_PRED, SMOOTH_H_PRED, -#endif // CONFIG_SMOOTH_HV - D135_PRED, D207_PRED, D153_PRED, D63_PRED, D117_PRED, D45_PRED, + DC_PRED, H_PRED, V_PRED, SMOOTH_PRED, PAETH_PRED, + SMOOTH_V_PRED, SMOOTH_H_PRED, D135_PRED, D203_PRED, D157_PRED, + D67_PRED, D113_PRED, D45_PRED, }; -#if CONFIG_CFL static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = { - UV_DC_PRED, UV_CFL_PRED, UV_H_PRED, - UV_V_PRED, UV_SMOOTH_PRED, UV_TM_PRED, -#if CONFIG_SMOOTH_HV - UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED, -#endif // CONFIG_SMOOTH_HV - UV_D135_PRED, UV_D207_PRED, UV_D153_PRED, - UV_D63_PRED, UV_D117_PRED, UV_D45_PRED, + UV_DC_PRED, UV_CFL_PRED, UV_H_PRED, UV_V_PRED, + UV_SMOOTH_PRED, UV_PAETH_PRED, UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED, + UV_D135_PRED, UV_D203_PRED, UV_D157_PRED, UV_D67_PRED, + UV_D113_PRED, UV_D45_PRED, }; -#else -#define uv_rd_search_mode_order intra_rd_search_mode_order -#endif // CONFIG_CFL + +typedef struct InterModeSearchState { + int64_t best_rd; + MB_MODE_INFO best_mbmode; + int best_rate_y; + int best_rate_uv; + int best_mode_skippable; + int best_skip2; + int best_mode_index; + int skip_intra_modes; + int num_available_refs; + int64_t dist_refs[REF_FRAMES]; + int dist_order_refs[REF_FRAMES]; + int64_t mode_threshold[MAX_MODES]; + PREDICTION_MODE best_intra_mode; + int64_t best_intra_rd; + int angle_stats_ready; + uint8_t directional_mode_skip_mask[INTRA_MODES]; + unsigned int best_pred_sse; + int rate_uv_intra[TX_SIZES_ALL]; + int rate_uv_tokenonly[TX_SIZES_ALL]; + int64_t dist_uvs[TX_SIZES_ALL]; + int skip_uvs[TX_SIZES_ALL]; + UV_PREDICTION_MODE mode_uv[TX_SIZES_ALL]; + PALETTE_MODE_INFO pmi_uv[TX_SIZES_ALL]; + int8_t uv_angle_delta[TX_SIZES_ALL]; 
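  /* The TX_SIZES_ALL-indexed arrays above cache the chroma (UV) intra search
   * result per chroma transform size; the pattern in rdopt.c is to fill an
   * entry at most once and then reuse it across the luma mode loop. */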
+ int64_t best_pred_rd[REFERENCE_MODES]; + int64_t best_pred_diff[REFERENCE_MODES]; + // Save a set of single_newmv for each checked ref_mv. + int_mv single_newmv[MAX_REF_MV_SERCH][REF_FRAMES]; + int single_newmv_rate[MAX_REF_MV_SERCH][REF_FRAMES]; + int single_newmv_valid[MAX_REF_MV_SERCH][REF_FRAMES]; + int64_t modelled_rd[MB_MODE_COUNT][REF_FRAMES]; +} InterModeSearchState; + +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + +typedef struct InterModeRdModel { + int ready; + double a; + double b; + double dist_mean; + int skip_count; + int non_skip_count; + int fp_skip_count; + int bracket_idx; +} InterModeRdModel; + +InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL]; + +#define INTER_MODE_RD_DATA_OVERALL_SIZE 6400 +static int inter_mode_data_idx[4]; +static int64_t inter_mode_data_sse[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; +static int64_t inter_mode_data_dist[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; +static int inter_mode_data_residue_cost[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; +static int inter_mode_data_all_cost[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; +static int64_t inter_mode_data_ref_best_rd[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; + +int inter_mode_data_block_idx(BLOCK_SIZE bsize) { + if (bsize == BLOCK_8X8) return 1; + if (bsize == BLOCK_16X16) return 2; + if (bsize == BLOCK_32X32) return 3; + return -1; +} + +void av1_inter_mode_data_init() { + for (int i = 0; i < BLOCK_SIZES_ALL; ++i) { + const int block_idx = inter_mode_data_block_idx(i); + if (block_idx != -1) inter_mode_data_idx[block_idx] = 0; + InterModeRdModel *md = &inter_mode_rd_models[i]; + md->ready = 0; + md->skip_count = 0; + md->non_skip_count = 0; + md->fp_skip_count = 0; + md->bracket_idx = 0; + } +} + +void av1_inter_mode_data_show(const AV1_COMMON *cm) { + printf("frame_offset %d\n", cm->frame_offset); + for (int i = 0; i < BLOCK_SIZES_ALL; ++i) { + const int block_idx = inter_mode_data_block_idx(i); + if (block_idx != -1) inter_mode_data_idx[block_idx] = 0; + InterModeRdModel *md = &inter_mode_rd_models[i]; + if (md->ready) { + printf("bsize %d non_skip_count %d skip_count %d fp_skip_count %d\n", i, + md->non_skip_count, md->skip_count, md->fp_skip_count); + } + } +} + +static int64_t get_est_rd(BLOCK_SIZE bsize, int rdmult, int64_t sse, + int curr_cost) { + aom_clear_system_state(); + InterModeRdModel *md = &inter_mode_rd_models[bsize]; + if (md->ready) { + const double est_ld = md->a * sse + md->b; + const double est_residue_cost = (sse - md->dist_mean) / est_ld; + const int64_t est_cost = (int64_t)round(est_residue_cost) + curr_cost; + const int64_t int64_dist_mean = (int64_t)round(md->dist_mean); + const int64_t est_rd = RDCOST(rdmult, est_cost, int64_dist_mean); + return est_rd; + } + return 0; +} + +#define DATA_BRACKETS 7 +static const int data_num_threshold[DATA_BRACKETS] = { + 200, 400, 800, 1600, 3200, 6400, INT32_MAX +}; + +void av1_inter_mode_data_fit(int rdmult) { + aom_clear_system_state(); + for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { + const int block_idx = inter_mode_data_block_idx(bsize); + InterModeRdModel *md = &inter_mode_rd_models[bsize]; + if (block_idx == -1) continue; + int data_num = inter_mode_data_idx[block_idx]; + if (data_num < data_num_threshold[md->bracket_idx]) { + continue; + } + double my = 0; + double mx = 0; + double dx = 0; + double dxy = 0; + double dist_mean = 0; + const int train_num = data_num; + for (int i = 0; i < train_num; ++i) { + const double sse = (double)inter_mode_data_sse[block_idx][i]; + const double dist = (double)inter_mode_data_dist[block_idx][i]; + const 
double residue_cost = inter_mode_data_residue_cost[block_idx][i]; + const double ld = (sse - dist) / residue_cost; + dist_mean += dist; + my += ld; + mx += sse; + dx += sse * sse; + dxy += sse * ld; + } + dist_mean = dist_mean / data_num; + my = my / train_num; + mx = mx / train_num; + dx = sqrt(dx / train_num); + dxy = dxy / train_num; + + md->dist_mean = dist_mean; + md->a = (dxy - mx * my) / (dx * dx - mx * mx); + md->b = my - md->a * mx; + ++md->bracket_idx; + md->ready = 1; + assert(md->bracket_idx < DATA_BRACKETS); + + (void)rdmult; +#if 0 + int skip_count = 0; + int fp_skip_count = 0; + double avg_error = 0; + const int test_num = data_num; + for (int i = 0; i < data_num; ++i) { + const int64_t sse = inter_mode_data_sse[block_idx][i]; + const int64_t dist = inter_mode_data_dist[block_idx][i]; + const int64_t residue_cost = inter_mode_data_residue_cost[block_idx][i]; + const int64_t all_cost = inter_mode_data_all_cost[block_idx][i]; + const int64_t est_rd = + get_est_rd(bsize, rdmult, sse, all_cost - residue_cost); + const int64_t real_rd = RDCOST(rdmult, all_cost, dist); + const int64_t ref_best_rd = inter_mode_data_ref_best_rd[block_idx][i]; + if (est_rd > ref_best_rd) { + ++skip_count; + if (real_rd < ref_best_rd) { + ++fp_skip_count; + } + } + avg_error += abs(est_rd - real_rd) * 100. / real_rd; + } + avg_error /= test_num; + printf("test_num %d bsize %d avg_error %f skip_count %d fp_skip_count %d\n", + test_num, bsize, avg_error, skip_count, fp_skip_count); +#endif + } +} + +static void inter_mode_data_push(BLOCK_SIZE bsize, int64_t sse, int64_t dist, + int residue_cost, int all_cost, + int64_t ref_best_rd) { + if (residue_cost == 0 || sse == dist) return; + const int block_idx = inter_mode_data_block_idx(bsize); + if (block_idx == -1) return; + if (inter_mode_data_idx[block_idx] < INTER_MODE_RD_DATA_OVERALL_SIZE) { + const int data_idx = inter_mode_data_idx[block_idx]; + inter_mode_data_sse[block_idx][data_idx] = sse; + inter_mode_data_dist[block_idx][data_idx] = dist; + inter_mode_data_residue_cost[block_idx][data_idx] = residue_cost; + inter_mode_data_all_cost[block_idx][data_idx] = all_cost; + inter_mode_data_ref_best_rd[block_idx][data_idx] = ref_best_rd; + ++inter_mode_data_idx[block_idx]; + } +} +#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS static INLINE int write_uniform_cost(int n, int v) { const int l = get_unsigned_bits(n); const int m = (1 << l) - n; if (l == 0) return 0; if (v < m) - return (l - 1) * av1_cost_bit(128, 0); + return av1_cost_literal(l - 1); else - return l * av1_cost_bit(128, 0); + return av1_cost_literal(l); +} + +// Similar to store_cfl_required(), but for use during the RDO process, +// where we haven't yet determined whether this block uses CfL. +static INLINE CFL_ALLOWED_TYPE store_cfl_required_rdo(const AV1_COMMON *cm, + const MACROBLOCK *x) { + const MACROBLOCKD *xd = &x->e_mbd; + + if (cm->seq_params.monochrome || x->skip_chroma_rd) return CFL_DISALLOWED; + + if (!xd->cfl.is_chroma_reference) { + // For non-chroma-reference blocks, we should always store the luma pixels, + // in case the corresponding chroma-reference block uses CfL. + // Note that this can only happen for block sizes which are <8 on + // their shortest side, as otherwise they would be chroma reference + // blocks. + return CFL_ALLOWED; + } + + // For chroma reference blocks, we should store data in the encoder iff we're + // allowed to try out CfL. 
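  // (For reference: is_cfl_allowed() is essentially a block-size gate, since
  // CfL can only be signalled for sufficiently small blocks; when it returns
  // CFL_DISALLOWED the stored luma pixels would never be consumed.)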
+ return is_cfl_allowed(xd); } // constants for prune 1 and prune 2 decision boundaries @@ -524,6 +712,10 @@ static INLINE int write_uniform_cost(int n, int v) { #define FAST_EXT_TX_CORR_MARGIN 0.5 #define FAST_EXT_TX_EDST_MARGIN 0.3 +static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode); + static unsigned pixel_dist_visible_only( const AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src, const int src_stride, const uint8_t *dst, const int dst_stride, @@ -531,15 +723,10 @@ static unsigned pixel_dist_visible_only( int visible_cols) { unsigned sse; - if (txb_rows == visible_rows && txb_cols == visible_cols -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) - && tx_bsize < BLOCK_SIZES -#endif - ) { + if (txb_rows == visible_rows && txb_cols == visible_cols) { cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); return sse; } -#if CONFIG_HIGHBITDEPTH const MACROBLOCKD *xd = &x->e_mbd; if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { @@ -547,9 +734,6 @@ static unsigned pixel_dist_visible_only( visible_cols, visible_rows); return (unsigned int)ROUND_POWER_OF_TWO(sse64, (xd->bd - 8) * 2); } -#else - (void)x; -#endif // CONFIG_HIGHBITDEPTH sse = aom_sse_odd_size(src, src_stride, dst, dst_stride, visible_cols, visible_rows); return sse; @@ -588,10 +772,9 @@ static uint64_t cdef_dist_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src, const uint64_t c1 = (400 * a << 2 * coeff_shift); const uint64_t c2 = (b * 20000 * a * a << 4 * coeff_shift); - dist = - (uint64_t)floor(.5 + - (sum_d2 + sum_s2 - 2 * sum_sd) * .5 * (svar + dvar + c1) / - (sqrt(svar * (double)dvar + c2))); + dist = (uint64_t)floor(.5 + (sum_d2 + sum_s2 - 2 * sum_sd) * .5 * + (svar + dvar + c1) / + (sqrt(svar * (double)dvar + c2))); // Calibrate dist to have similar rate for the same QP with MSE only // distortion (as in master branch) @@ -729,11 +912,9 @@ static double od_compute_dist_common(int activity_masking, uint16_t *x, static double od_compute_dist(uint16_t *x, uint16_t *y, int bsize_w, int bsize_h, int qindex) { assert(bsize_w >= 8 && bsize_h >= 8); -#if CONFIG_PVQ - int activity_masking = 1; -#else + int activity_masking = 0; -#endif + int i, j; DECLARE_ALIGNED(16, od_coeff, e[MAX_TX_SQUARE]); DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]); @@ -760,11 +941,9 @@ static double od_compute_dist(uint16_t *x, uint16_t *y, int bsize_w, static double od_compute_dist_diff(uint16_t *x, int16_t *e, int bsize_w, int bsize_h, int qindex) { assert(bsize_w >= 8 && bsize_h >= 8); -#if CONFIG_PVQ - int activity_masking = 1; -#else + int activity_masking = 0; -#endif + DECLARE_ALIGNED(16, uint16_t, y[MAX_TX_SQUARE]); DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]); DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]); @@ -806,7 +985,6 @@ int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x, if (x->tune_metric == AOM_TUNE_CDEF_DIST || x->tune_metric == AOM_TUNE_DAALA_DIST) { -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { for (j = 0; j < bsh; j++) for (i = 0; i < bsw; i++) @@ -834,7 +1012,6 @@ int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x, } } } else { -#endif for (j = 0; j < bsh; j++) for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i]; @@ -858,9 +1035,7 @@ int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x, rec[j * bsw + i] = src[j * src_stride + i]; } } -#if CONFIG_HIGHBITDEPTH } -#endif // 
CONFIG_HIGHBITDEPTH } if (x->tune_metric == AOM_TUNE_DAALA_DIST) { @@ -874,10 +1049,8 @@ int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x, bsw, coeff_shift); } } -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) d = ((uint64_t)d) >> 2 * coeff_shift; -#endif } else { // Otherwise, MSE by default d = pixel_dist_visible_only(cpi, x, src, src_stride, dst, dst_stride, @@ -887,10 +1060,10 @@ int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x, return d; } -static int64_t av1_dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src, - int src_stride, const int16_t *diff, - int diff_stride, int bsw, int bsh, - int visible_w, int visible_h, int qindex) { +static int64_t dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src, + int src_stride, const int16_t *diff, + int diff_stride, int bsw, int bsh, int visible_w, + int visible_h, int qindex) { int64_t d = 0; int i, j; const MACROBLOCKD *xd = &x->e_mbd; @@ -905,18 +1078,14 @@ static int64_t av1_dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src, if (x->tune_metric == AOM_TUNE_CDEF_DIST || x->tune_metric == AOM_TUNE_DAALA_DIST) { -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { for (j = 0; j < bsh; j++) for (i = 0; i < bsw; i++) orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i]; } else { -#endif for (j = 0; j < bsh; j++) for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i]; -#if CONFIG_HIGHBITDEPTH } -#endif // CONFIG_HIGHBITDEPTH if ((bsw == visible_w) && (bsh == visible_h)) { for (j = 0; j < bsh; j++) @@ -971,7 +1140,8 @@ static int64_t av1_dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src, static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize, const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - double *hordist, double *verdist) { + int need_4th, double *hordist, + double *verdist) { const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; unsigned int esq[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; @@ -980,7 +1150,6 @@ static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize, if (f_index < 0) { const int w_shift = bw == 8 ? 1 : 2; const int h_shift = bh == 8 ? 
1 : 2; -#if CONFIG_HIGHBITDEPTH if (cpi->common.use_highbitdepth) { const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); @@ -992,17 +1161,13 @@ static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize, (src16[j + i * src_stride] - dst16[j + i * dst_stride]); } } else { -#endif // CONFIG_HIGHBITDEPTH - for (int i = 0; i < bh; ++i) for (int j = 0; j < bw; ++j) { const int index = (j >> w_shift) + ((i >> h_shift) << 2); esq[index] += (src[j + i * src_stride] - dst[j + i * dst_stride]) * (src[j + i * src_stride] - dst[j + i * dst_stride]); } -#if CONFIG_HIGHBITDEPTH } -#endif // CONFIG_HIGHBITDEPTH } else { cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[0]); cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, @@ -1051,13 +1216,22 @@ static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize, hordist[0] = ((double)esq[0] + esq[4] + esq[8] + esq[12]) * e_recip; hordist[1] = ((double)esq[1] + esq[5] + esq[9] + esq[13]) * e_recip; hordist[2] = ((double)esq[2] + esq[6] + esq[10] + esq[14]) * e_recip; + if (need_4th) { + hordist[3] = ((double)esq[3] + esq[7] + esq[11] + esq[15]) * e_recip; + } verdist[0] = ((double)esq[0] + esq[1] + esq[2] + esq[3]) * e_recip; verdist[1] = ((double)esq[4] + esq[5] + esq[6] + esq[7]) * e_recip; verdist[2] = ((double)esq[8] + esq[9] + esq[10] + esq[11]) * e_recip; + if (need_4th) { + verdist[3] = ((double)esq[12] + esq[13] + esq[14] + esq[15]) * e_recip; + } } else { hordist[0] = verdist[0] = 0.25; hordist[1] = verdist[1] = 0.25; hordist[2] = verdist[2] = 0.25; + if (need_4th) { + hordist[3] = verdist[3] = 0.25; + } } } @@ -1067,7 +1241,7 @@ static int adst_vs_flipadst(const AV1_COMP *cpi, BLOCK_SIZE bsize, int prune_bitmask = 0; double svm_proj_h = 0, svm_proj_v = 0; double hdist[3] = { 0, 0, 0 }, vdist[3] = { 0, 0, 0 }; - get_energy_distribution_fine(cpi, bsize, src, src_stride, dst, dst_stride, + get_energy_distribution_fine(cpi, bsize, src, src_stride, dst, dst_stride, 0, hdist, vdist); svm_proj_v = vdist[0] * ADST_FLIP_SVM[0] + vdist[1] * ADST_FLIP_SVM[1] + @@ -1087,7 +1261,6 @@ static int adst_vs_flipadst(const AV1_COMP *cpi, BLOCK_SIZE bsize, return prune_bitmask; } -#if CONFIG_EXT_TX static void get_horver_correlation(const int16_t *diff, int stride, int w, int h, double *hcorr, double *vcorr) { // Returns hor/ver correlation coefficient @@ -1132,7 +1305,7 @@ static void get_horver_correlation(const int16_t *diff, int stride, int w, } } -int dct_vs_idtx(const int16_t *diff, int stride, int w, int h) { +static int dct_vs_idtx(const int16_t *diff, int stride, int w, int h) { double hcorr, vcorr; int prune_bitmask = 0; get_horver_correlation(diff, stride, w, h, &hcorr, &vcorr); @@ -1164,14 +1337,13 @@ static int prune_two_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize, if (dct_idtx) { av1_subtract_plane(x, bsize, 0); const struct macroblock_plane *const p = &x->plane[0]; - const int bw = 4 << (b_width_log2_lookup[bsize]); - const int bh = 4 << (b_height_log2_lookup[bsize]); + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; prune |= dct_vs_idtx(p->src_diff, bw, bw, bh); } return prune; } -#endif // CONFIG_EXT_TX // Performance drop: 0.3%, Speed improvement: 5% static int prune_one_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize, @@ -1182,61 +1354,342 @@ static int prune_one_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize, pd->dst.stride); } -#if CONFIG_EXT_TX // 1D Transforms used in inter set, this needs to be 
changed if // ext_tx_used_inter is changed static const int ext_tx_used_inter_1D[EXT_TX_SETS_INTER][TX_TYPES_1D] = { - { 1, 0, 0, 0 }, { 1, 1, 1, 1 }, { 1, 1, 1, 1 }, { 1, 0, 0, 1 }, -#if CONFIG_MRC_TX + { 1, 0, 0, 0 }, + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, { 1, 0, 0, 1 }, -#endif // CONFIG_MRC_TX }; -#endif // CONFIG_EXT_TX -static int prune_tx_types(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, - const MACROBLOCKD *const xd, int tx_set) { -#if CONFIG_EXT_TX - const int *tx_set_1D = tx_set >= 0 ? ext_tx_used_inter_1D[tx_set] : NULL; -#else - const int tx_set_1D[TX_TYPES_1D] = { 0 }; -#endif // CONFIG_EXT_TX +static void get_energy_distribution_finer(const int16_t *diff, int stride, + int bw, int bh, float *hordist, + float *verdist) { + // First compute downscaled block energy values (esq); downscale factors + // are defined by w_shift and h_shift. + unsigned int esq[256]; + const int w_shift = bw <= 8 ? 0 : 1; + const int h_shift = bh <= 8 ? 0 : 1; + const int esq_w = bw <= 8 ? bw : bw / 2; + const int esq_h = bh <= 8 ? bh : bh / 2; + const int esq_sz = esq_w * esq_h; + int i, j; + memset(esq, 0, esq_sz * sizeof(esq[0])); + for (i = 0; i < bh; i++) { + unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w; + const int16_t *cur_diff_row = diff + i * stride; + for (j = 0; j < bw; j++) { + cur_esq_row[j >> w_shift] += cur_diff_row[j] * cur_diff_row[j]; + } + } + uint64_t total = 0; + for (i = 0; i < esq_sz; i++) total += esq[i]; + + // Output hordist and verdist arrays are normalized 1D projections of esq + if (total == 0) { + float hor_val = 1.0f / esq_w; + for (j = 0; j < esq_w - 1; j++) hordist[j] = hor_val; + float ver_val = 1.0f / esq_h; + for (i = 0; i < esq_h - 1; i++) verdist[i] = ver_val; + return; + } + + const float e_recip = 1.0f / (float)total; + memset(hordist, 0, (esq_w - 1) * sizeof(hordist[0])); + memset(verdist, 0, (esq_h - 1) * sizeof(verdist[0])); + const unsigned int *cur_esq_row; + for (i = 0; i < esq_h - 1; i++) { + cur_esq_row = esq + i * esq_w; + for (j = 0; j < esq_w - 1; j++) { + hordist[j] += (float)cur_esq_row[j]; + verdist[i] += (float)cur_esq_row[j]; + } + verdist[i] += (float)cur_esq_row[j]; + } + cur_esq_row = esq + i * esq_w; + for (j = 0; j < esq_w - 1; j++) hordist[j] += (float)cur_esq_row[j]; + + for (j = 0; j < esq_w - 1; j++) hordist[j] *= e_recip; + for (i = 0; i < esq_h - 1; i++) verdist[i] *= e_recip; +} + +// Similar to get_horver_correlation, but also takes into account first +// row/column, when computing horizontal/vertical correlation. 
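// (Concretely, hcorr below is the sample Pearson correlation between each
// pixel and its left neighbour over the h*(w-1) horizontal pairs, and vcorr
// the same over the (h-1)*w vertical pairs; negative correlations are clamped
// to 0 and zero-variance cases fall back to a correlation of 1.)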
+static void get_horver_correlation_full(const int16_t *diff, int stride, int w, + int h, float *hcorr, float *vcorr) { + const float num_hor = (float)(h * (w - 1)); + const float num_ver = (float)((h - 1) * w); + int i, j; + + // The following notation is used: + // x - current pixel + // y - left neighbor pixel + // z - top neighbor pixel + int64_t xy_sum = 0, xz_sum = 0; + int64_t xhor_sum = 0, xver_sum = 0, y_sum = 0, z_sum = 0; + int64_t x2hor_sum = 0, x2ver_sum = 0, y2_sum = 0, z2_sum = 0; + + int16_t x, y, z; + for (j = 1; j < w; ++j) { + x = diff[j]; + y = diff[j - 1]; + xy_sum += x * y; + xhor_sum += x; + y_sum += y; + x2hor_sum += x * x; + y2_sum += y * y; + } + for (i = 1; i < h; ++i) { + x = diff[i * stride]; + z = diff[(i - 1) * stride]; + xz_sum += x * z; + xver_sum += x; + z_sum += z; + x2ver_sum += x * x; + z2_sum += z * z; + for (j = 1; j < w; ++j) { + x = diff[i * stride + j]; + y = diff[i * stride + j - 1]; + z = diff[(i - 1) * stride + j]; + xy_sum += x * y; + xz_sum += x * z; + xhor_sum += x; + xver_sum += x; + y_sum += y; + z_sum += z; + x2hor_sum += x * x; + x2ver_sum += x * x; + y2_sum += y * y; + z2_sum += z * z; + } + } + const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor; + const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor; + const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor; + const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver; + const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver; + const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver; + + *hcorr = *vcorr = 1; + if (xhor_var_n > 0 && y_var_n > 0) { + *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n); + *hcorr = *hcorr < 0 ? 0 : *hcorr; + } + if (xver_var_n > 0 && z_var_n > 0) { + *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n); + *vcorr = *vcorr < 0 ? 0 : *vcorr; + } +} + +// Transforms raw scores into a probability distribution across 16 TX types +static void score_2D_transform_pow8(float *scores_2D, float shift) { + float sum = 0.0f; + int i; + + for (i = 0; i < 16; i++) { + float v, v2, v4; + v = AOMMAX(scores_2D[i] + shift, 0.0f); + v2 = v * v; + v4 = v2 * v2; + scores_2D[i] = v4 * v4; + sum += scores_2D[i]; + } + for (i = 0; i < 16; i++) scores_2D[i] /= sum; +} + +// These thresholds were calibrated to provide a certain number of TX types +// pruned by the model on average, i.e. 
selecting a threshold with index i +// will lead to pruning i+1 TX types on average +static const float *prune_2D_adaptive_thresholds[] = { + // TX_4X4 + (float[]){ 0.02014f, 0.02722f, 0.03430f, 0.04114f, 0.04724f, 0.05212f, + 0.05627f, 0.06018f, 0.06409f, 0.06824f, 0.07312f, 0.07849f, + 0.08606f, 0.09827f }, + // TX_8X8 + (float[]){ 0.00745f, 0.01355f, 0.02039f, 0.02795f, 0.03625f, 0.04407f, + 0.05042f, 0.05579f, 0.06067f, 0.06604f, 0.07239f, 0.08093f, + 0.09363f, 0.11682f }, + // TX_16X16 + (float[]){ 0.01404f, 0.02820f, 0.04211f, 0.05164f, 0.05798f, 0.06335f, + 0.06897f, 0.07629f, 0.08875f, 0.11169f }, + // TX_32X32 + NULL, + // TX_64X64 + NULL, + // TX_4X8 + (float[]){ 0.01282f, 0.02087f, 0.02844f, 0.03601f, 0.04285f, 0.04871f, + 0.05359f, 0.05823f, 0.06287f, 0.06799f, 0.07361f, 0.08093f, + 0.09119f, 0.10828f }, + // TX_8X4 + (float[]){ 0.01184f, 0.01941f, 0.02722f, 0.03503f, 0.04187f, 0.04822f, + 0.05359f, 0.05823f, 0.06287f, 0.06799f, 0.07361f, 0.08093f, + 0.09167f, 0.10974f }, + // TX_8X16 + (float[]){ 0.00525f, 0.01135f, 0.01819f, 0.02576f, 0.03357f, 0.04114f, + 0.04773f, 0.05383f, 0.05920f, 0.06506f, 0.07190f, 0.08118f, + 0.09509f, 0.12097f }, + // TX_16X8 + (float[]){ 0.00525f, 0.01160f, 0.01819f, 0.02527f, 0.03308f, 0.04065f, + 0.04773f, 0.05383f, 0.05969f, 0.06531f, 0.07214f, 0.08118f, + 0.09485f, 0.12048f }, + // TX_16X32 + (float[]){ 0.01257f, 0.02576f, 0.03723f, 0.04578f, 0.05212f, 0.05798f, + 0.06506f, 0.07385f, 0.08606f, 0.10925f }, + // TX_32X16 + (float[]){ 0.01233f, 0.02527f, 0.03699f, 0.04602f, 0.05286f, 0.05896f, + 0.06531f, 0.07336f, 0.08582f, 0.11072f }, + // TX_32X64 + NULL, + // TX_64X32 + NULL, + // TX_4X16 + NULL, + // TX_16X4 + NULL, + // TX_8X32 + NULL, + // TX_32X8 + NULL, + // TX_16X64 + NULL, + // TX_64X16 + NULL, +}; + +static int prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size, + int blk_row, int blk_col, TxSetType tx_set_type, + TX_TYPE_PRUNE_MODE prune_mode) { + static const int tx_type_table_2D[16] = { + DCT_DCT, DCT_ADST, DCT_FLIPADST, V_DCT, + ADST_DCT, ADST_ADST, ADST_FLIPADST, V_ADST, + FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST, + H_DCT, H_ADST, H_FLIPADST, IDTX + }; + if (tx_set_type != EXT_TX_SET_ALL16 && + tx_set_type != EXT_TX_SET_DTT9_IDTX_1DDCT) + return 0; + const NN_CONFIG *nn_config_hor = av1_tx_type_nnconfig_map_hor[tx_size]; + const NN_CONFIG *nn_config_ver = av1_tx_type_nnconfig_map_ver[tx_size]; + if (!nn_config_hor || !nn_config_ver) return 0; // Model not established yet. + + aom_clear_system_state(); + float hfeatures[16], vfeatures[16]; + float hscores[4], vscores[4]; + float scores_2D[16]; + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + const int hfeatures_num = bw <= 8 ? bw : bw / 2; + const int vfeatures_num = bh <= 8 ? 
bh : bh / 2; + assert(hfeatures_num <= 16); + assert(vfeatures_num <= 16); + + const struct macroblock_plane *const p = &x->plane[0]; + const int diff_stride = block_size_wide[bsize]; + const int16_t *diff = p->src_diff + 4 * blk_row * diff_stride + 4 * blk_col; + get_energy_distribution_finer(diff, diff_stride, bw, bh, hfeatures, + vfeatures); + get_horver_correlation_full(diff, diff_stride, bw, bh, + &hfeatures[hfeatures_num - 1], + &vfeatures[vfeatures_num - 1]); + av1_nn_predict(hfeatures, nn_config_hor, hscores); + av1_nn_predict(vfeatures, nn_config_ver, vscores); + + float score_2D_average = 0.0f; + for (int i = 0; i < 4; i++) { + float *cur_scores_2D = scores_2D + i * 4; + cur_scores_2D[0] = vscores[i] * hscores[0]; + cur_scores_2D[1] = vscores[i] * hscores[1]; + cur_scores_2D[2] = vscores[i] * hscores[2]; + cur_scores_2D[3] = vscores[i] * hscores[3]; + score_2D_average += cur_scores_2D[0] + cur_scores_2D[1] + cur_scores_2D[2] + + cur_scores_2D[3]; + } + score_2D_average /= 16; + score_2D_transform_pow8(scores_2D, (20 - score_2D_average)); + + // Always keep the TX type with the highest score, prune all others with + // score below score_thresh. + int max_score_i = 0; + float max_score = 0.0f; + for (int i = 0; i < 16; i++) { + if (scores_2D[i] > max_score && + av1_ext_tx_used[tx_set_type][tx_type_table_2D[i]]) { + max_score = scores_2D[i]; + max_score_i = i; + } + } + + int pruning_aggressiveness = 0; + if (prune_mode == PRUNE_2D_ACCURATE) { + if (tx_set_type == EXT_TX_SET_ALL16) + pruning_aggressiveness = 6; + else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) + pruning_aggressiveness = 4; + } else if (prune_mode == PRUNE_2D_FAST) { + if (tx_set_type == EXT_TX_SET_ALL16) + pruning_aggressiveness = 10; + else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) + pruning_aggressiveness = 7; + } + const float score_thresh = + prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness - 1]; + + int prune_bitmask = 0; + for (int i = 0; i < 16; i++) { + if (scores_2D[i] < score_thresh && i != max_score_i) + prune_bitmask |= (1 << tx_type_table_2D[i]); + } + return prune_bitmask; +} + +static void prune_tx(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, + const MACROBLOCKD *const xd, int tx_set_type) { + av1_zero(x->tx_search_prune); + x->tx_split_prune_flag = 0; + const MB_MODE_INFO *mbmi = xd->mi[0]; + if (!is_inter_block(mbmi) || cpi->sf.tx_type_search.prune_mode == NO_PRUNE || + x->use_default_inter_tx_type || xd->lossless[mbmi->segment_id] || + x->cb_partition_scan) + return; + int tx_set = ext_tx_set_index[1][tx_set_type]; + assert(tx_set >= 0); + const int *tx_set_1D = ext_tx_used_inter_1D[tx_set]; switch (cpi->sf.tx_type_search.prune_mode) { - case NO_PRUNE: return 0; break; + case NO_PRUNE: return; case PRUNE_ONE: - if ((tx_set >= 0) && !(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) - return 0; - return prune_one_for_sby(cpi, bsize, x, xd); + if (!(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) return; + x->tx_search_prune[tx_set_type] = prune_one_for_sby(cpi, bsize, x, xd); break; -#if CONFIG_EXT_TX case PRUNE_TWO: - if ((tx_set >= 0) && !(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) { - if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) return 0; - return prune_two_for_sby(cpi, bsize, x, xd, 0, 1); + if (!(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) { + if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) return; + x->tx_search_prune[tx_set_type] = + prune_two_for_sby(cpi, bsize, x, xd, 0, 1); + } + if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) { + 
x->tx_search_prune[tx_set_type] = + prune_two_for_sby(cpi, bsize, x, xd, 1, 0); } - if ((tx_set >= 0) && !(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) - return prune_two_for_sby(cpi, bsize, x, xd, 1, 0); - return prune_two_for_sby(cpi, bsize, x, xd, 1, 1); + x->tx_search_prune[tx_set_type] = + prune_two_for_sby(cpi, bsize, x, xd, 1, 1); break; -#endif // CONFIG_EXT_TX + case PRUNE_2D_ACCURATE: + case PRUNE_2D_FAST: break; + default: assert(0); } - assert(0); - return 0; } -static int do_tx_type_search(TX_TYPE tx_type, int prune) { -// TODO(sarahparker) implement for non ext tx -#if CONFIG_EXT_TX - return !(((prune >> vtx_tab[tx_type]) & 1) | - ((prune >> (htx_tab[tx_type] + 8)) & 1)); -#else - // temporary to avoid compiler warnings - (void)vtx_tab; - (void)htx_tab; - (void)tx_type; - (void)prune; - return 1; -#endif // CONFIG_EXT_TX +static int do_tx_type_search(TX_TYPE tx_type, int prune, + TX_TYPE_PRUNE_MODE mode) { + // TODO(sarahparker) implement for non ext tx + if (mode >= PRUNE_2D_ACCURATE) { + return !((prune >> tx_type) & 1); + } else { + return !(((prune >> vtx_tab[tx_type]) & 1) | + ((prune >> (htx_tab[tx_type] + 8)) & 1)); + } } static void model_rd_from_sse(const AV1_COMP *const cpi, @@ -1245,16 +1698,12 @@ static void model_rd_from_sse(const AV1_COMP *const cpi, int64_t *dist) { const struct macroblockd_plane *const pd = &xd->plane[plane]; const int dequant_shift = -#if CONFIG_HIGHBITDEPTH - (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : -#endif // CONFIG_HIGHBITDEPTH - 3; + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; // Fast approximate the modelling function. if (cpi->sf.simple_model_rd_from_var) { const int64_t square_error = sse; - int quantizer = (pd->dequant[1] >> dequant_shift); - + int quantizer = (pd->dequant_Q3[1] >> dequant_shift); if (quantizer < 120) *rate = (int)((square_error * (280 - quantizer)) >> (16 - AV1_PROB_COST_SHIFT)); @@ -1263,22 +1712,48 @@ static void model_rd_from_sse(const AV1_COMP *const cpi, *dist = (square_error * quantizer) >> 8; } else { av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[bsize], - pd->dequant[1] >> dequant_shift, rate, dist); + pd->dequant_Q3[1] >> dequant_shift, rate, + dist); } - *dist <<= 4; } +#if CONFIG_COLLECT_INTER_MODE_RD_STATS +static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) { + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + int64_t total_sse = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bs = get_plane_block_size(mbmi->sb_type, pd->subsampling_x, + pd->subsampling_y); + unsigned int sse; + + if (x->skip_chroma_rd && plane) continue; + + cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, + &sse); + total_sse += sse; + } + total_sse <<= 4; + return total_sse; +} +#endif + static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum, int *skip_txfm_sb, - int64_t *skip_sse_sb) { + int64_t *skip_sse_sb, int *plane_rate, + int64_t *plane_sse, int64_t *plane_dist) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. 
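// For reference, the simple_model_rd_from_var fast path in model_rd_from_sse()
// above boils down to the closed form below; a hedged standalone sketch with
// the constants read straight off that code (AV1_PROB_COST_SHIFT comes from
// av1/encoder/cost.h and is assumed to be 9 here):
#include <stdint.h>

static void sketch_fast_rd_from_sse(int64_t sse, int q_step, int *rate,
                                    int64_t *dist) {
  const int prob_cost_shift = 9;  // assumed value of AV1_PROB_COST_SHIFT
  // Rate decays linearly in the quantizer step and is modeled as 0 once the
  // step reaches 120.
  *rate = q_step < 120 ? (int)((sse * (280 - q_step)) >> (16 - prob_cost_shift))
                       : 0;
  // Distortion grows linearly in the quantizer step.
  *dist = (sse * q_step) >> 8;
}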
int plane; - const int ref = xd->mi[0]->mbmi.ref_frame[0]; + const int ref = xd->mi[0]->ref_frame[0]; int64_t rate_sum = 0; int64_t dist_sum = 0; @@ -1289,19 +1764,13 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize, for (plane = plane_from; plane <= plane_to; ++plane) { struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; -#if CONFIG_CHROMA_SUB8X8 - const BLOCK_SIZE bs = AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); -#else - const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); -#endif // CONFIG_CHROMA_SUB8X8 - + const BLOCK_SIZE bs = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); unsigned int sse; int rate; int64_t dist; -#if CONFIG_CB4X4 if (x->skip_chroma_rd && plane) continue; -#endif // CONFIG_CB4X4 // TODO(geza): Write direct sse functions that do not compute // variance as well. @@ -1316,14 +1785,54 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize, rate_sum += rate; dist_sum += dist; + if (plane_rate) plane_rate[plane] = rate; + if (plane_sse) plane_sse[plane] = sse; + if (plane_dist) plane_dist[plane] = dist; } - *skip_txfm_sb = total_sse == 0; - *skip_sse_sb = total_sse << 4; + if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0; + if (skip_sse_sb) *skip_sse_sb = total_sse << 4; *out_rate_sum = (int)rate_sum; *out_dist_sum = dist_sum; } +static void check_block_skip(const AV1_COMP *const cpi, BLOCK_SIZE bsize, + MACROBLOCK *x, MACROBLOCKD *xd, int plane_from, + int plane_to, int *skip_txfm_sb) { + *skip_txfm_sb = 1; + for (int plane = plane_from; plane <= plane_to; ++plane) { + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bs = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + unsigned int sse; + + if (x->skip_chroma_rd && plane) continue; + + // Since fast HBD variance functions scale down sse by 4 bit, we first use + // fast vf implementation to rule out blocks with non-zero scaled sse. Then, + // only if the source is HBD and the scaled sse is 0, accurate sse + // computation is applied to determine if the sse is really 0. This step is + // necessary for HBD lossless coding. 
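// (That is: a non-zero fast SSE is trusted as-is, while a zero result on
// high-bit-depth input must be re-checked with the exact
// aom_highbd_sse_odd_size() call below, because the 4-bit downscaling in the
// fast HBD variance kernels can round a small non-zero SSE to zero.)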
+ cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, + &sse); + if (sse) { + *skip_txfm_sb = 0; + return; + } else if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint64_t sse64 = aom_highbd_sse_odd_size( + p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, + block_size_wide[bs], block_size_high[bs]); + + if (sse64) { + *skip_txfm_sb = 0; + return; + } + } + } + return; +} + int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz) { int i; @@ -1339,20 +1848,6 @@ int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, return error; } -int64_t av1_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, - int block_size) { - int i; - int64_t error = 0; - - for (i = 0; i < block_size; i++) { - const int diff = coeff[i] - dqcoeff[i]; - error += diff * diff; - } - - return error; -} - -#if CONFIG_HIGHBITDEPTH int64_t av1_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd) { @@ -1373,236 +1868,13 @@ int64_t av1_highbd_block_error_c(const tran_low_t *coeff, *ssz = sqcoeff; return error; } -#endif // CONFIG_HIGHBITDEPTH - -#if CONFIG_PVQ -// Without PVQ, av1_block_error_c() return two kind of errors, -// 1) reconstruction (i.e. decoded) error and -// 2) Squared sum of transformed residue (i.e. 'coeff') -// However, if PVQ is enabled, coeff does not keep the transformed residue -// but instead a transformed original is kept. -// Hence, new parameter ref vector (i.e. transformed predicted signal) -// is required to derive the residue signal, -// i.e. coeff - ref = residue (all transformed). - -#if CONFIG_HIGHBITDEPTH -static int64_t av1_highbd_block_error2_c(const tran_low_t *coeff, - const tran_low_t *dqcoeff, - const tran_low_t *ref, - intptr_t block_size, int64_t *ssz, - int bd) { - int64_t error; - int64_t sqcoeff; - int shift = 2 * (bd - 8); - int rounding = shift > 0 ? 1 << (shift - 1) : 0; - // Use the existing sse codes for calculating distortion of decoded signal: - // i.e. (orig - decoded)^2 - // For high bit depth, throw away ssz until a 32-bit version of - // av1_block_error_fp is written. - int64_t ssz_trash; - error = av1_block_error(coeff, dqcoeff, block_size, &ssz_trash); - // prediction residue^2 = (orig - ref)^2 - sqcoeff = av1_block_error(coeff, ref, block_size, &ssz_trash); - error = (error + rounding) >> shift; - sqcoeff = (sqcoeff + rounding) >> shift; - *ssz = sqcoeff; - return error; -} -#else -// TODO(yushin) : Since 4x4 case does not need ssz, better to refactor into -// a separate function that does not do the extra computations for ssz. -static int64_t av1_block_error2_c(const tran_low_t *coeff, - const tran_low_t *dqcoeff, - const tran_low_t *ref, intptr_t block_size, - int64_t *ssz) { - int64_t error; - int64_t ssz_trash; - // Use the existing sse codes for calculating distortion of decoded signal: - // i.e. 
(orig - decoded)^2 - error = av1_block_error(coeff, dqcoeff, block_size, &ssz_trash); - // prediction residue^2 = (orig - ref)^2 - *ssz = av1_block_error(coeff, ref, block_size, &ssz_trash); - return error; -} -#endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_PVQ - -#if !CONFIG_PVQ || CONFIG_VAR_TX -#if !CONFIG_LV_MAP -static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane, - int block, TX_SIZE tx_size, const SCAN_ORDER *scan_order, - const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l, - int use_fast_coef_costing) { - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - const struct macroblock_plane *p = &x->plane[plane]; - const struct macroblockd_plane *pd = &xd->plane[plane]; - const PLANE_TYPE type = pd->plane_type; - const uint16_t *band_count = &band_count_table[tx_size][1]; - const int eob = p->eobs[block]; - const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); - const TX_SIZE tx_size_ctx = txsize_sqr_map[tx_size]; - uint8_t token_cache[MAX_TX_SQUARE]; - int pt = combine_entropy_contexts(*a, *l); - int c, cost; - const int16_t *scan = scan_order->scan; - const int16_t *nb = scan_order->neighbors; - const int ref = is_inter_block(mbmi); - int(*head_token_costs)[COEFF_CONTEXTS][TAIL_TOKENS] = - x->token_head_costs[tx_size_ctx][type][ref]; - int(*tail_token_costs)[COEFF_CONTEXTS][TAIL_TOKENS] = - x->token_tail_costs[tx_size_ctx][type][ref]; - const int seg_eob = av1_get_tx_eob(&cm->seg, mbmi->segment_id, tx_size); - int eob_val; - -#if CONFIG_HIGHBITDEPTH - const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, xd->bd); -#else - const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, 8); -#endif // CONFIG_HIGHBITDEPTH - -#if !CONFIG_VAR_TX && !CONFIG_SUPERTX - // Check for consistency of tx_size with mode info - assert(tx_size == av1_get_tx_size(plane, xd)); -#endif // !CONFIG_VAR_TX && !CONFIG_SUPERTX - (void)cm; - - if (eob == 0) { - // block zero - cost = (*head_token_costs)[pt][0]; - } else { - if (use_fast_coef_costing) { - int band_left = *band_count++; - - // dc token - int v = qcoeff[0]; - int16_t prev_t; - cost = av1_get_token_cost(v, &prev_t, cat6_bits); - eob_val = (eob == 1) ? EARLY_EOB : NO_EOB; - cost += av1_get_coeff_token_cost( - prev_t, eob_val, 1, (*head_token_costs)[pt], (*tail_token_costs)[pt]); - - token_cache[0] = av1_pt_energy_class[prev_t]; - ++head_token_costs; - ++tail_token_costs; - - // ac tokens - for (c = 1; c < eob; c++) { - const int rc = scan[c]; - int16_t t; - - v = qcoeff[rc]; - cost += av1_get_token_cost(v, &t, cat6_bits); - eob_val = - (c + 1 == eob) ? (c + 1 == seg_eob ? LAST_EOB : EARLY_EOB) : NO_EOB; - cost += av1_get_coeff_token_cost(t, eob_val, 0, - (*head_token_costs)[!prev_t], - (*tail_token_costs)[!prev_t]); - prev_t = t; - if (!--band_left) { - band_left = *band_count++; - ++head_token_costs; - ++tail_token_costs; - } - } - } else { // !use_fast_coef_costing - int band_left = *band_count++; - - // dc token - int v = qcoeff[0]; - int16_t tok; - cost = av1_get_token_cost(v, &tok, cat6_bits); - eob_val = (eob == 1) ? EARLY_EOB : NO_EOB; - cost += av1_get_coeff_token_cost(tok, eob_val, 1, (*head_token_costs)[pt], - (*tail_token_costs)[pt]); - - token_cache[0] = av1_pt_energy_class[tok]; - ++head_token_costs; - ++tail_token_costs; - - // ac tokens - for (c = 1; c < eob; c++) { - const int rc = scan[c]; - - v = qcoeff[rc]; - cost += av1_get_token_cost(v, &tok, cat6_bits); - pt = get_coef_context(nb, token_cache, c); - eob_val = - (c + 1 == eob) ? (c + 1 == seg_eob ? 
LAST_EOB : EARLY_EOB) : NO_EOB; - cost += av1_get_coeff_token_cost( - tok, eob_val, 0, (*head_token_costs)[pt], (*tail_token_costs)[pt]); - token_cache[rc] = av1_pt_energy_class[tok]; - if (!--band_left) { - band_left = *band_count++; - ++head_token_costs; - ++tail_token_costs; - } - } - } - } - - return cost; -} -#endif // !CONFIG_LV_MAP - -int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, - int blk_row, int blk_col, int block, TX_SIZE tx_size, - const SCAN_ORDER *scan_order, const ENTROPY_CONTEXT *a, - const ENTROPY_CONTEXT *l, int use_fast_coef_costing) { - const AV1_COMMON *const cm = &cpi->common; -#if !CONFIG_LV_MAP - (void)blk_row; - (void)blk_col; -#if CONFIG_MRC_TX - const MACROBLOCKD *xd = &x->e_mbd; - const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - const TX_TYPE tx_type = av1_get_tx_type(xd->plane[plane].plane_type, xd, - blk_row, blk_col, block, tx_size); - const int is_inter = is_inter_block(mbmi); - if (tx_type == MRC_DCT && ((is_inter && SIGNAL_MRC_MASK_INTER) || - (!is_inter && SIGNAL_MRC_MASK_INTRA))) { - const int mrc_mask_cost = - av1_cost_color_map(x, plane, block, mbmi->sb_type, tx_size, MRC_MAP); - return cost_coeffs(cm, x, plane, block, tx_size, scan_order, a, l, - use_fast_coef_costing) + - mrc_mask_cost; - } -#endif - return cost_coeffs(cm, x, plane, block, tx_size, scan_order, a, l, - use_fast_coef_costing); -#else // !CONFIG_LV_MAP - (void)scan_order; - (void)use_fast_coef_costing; - const MACROBLOCKD *xd = &x->e_mbd; - const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - const struct macroblockd_plane *pd = &xd->plane[plane]; - const BLOCK_SIZE bsize = mbmi->sb_type; -#if CONFIG_CHROMA_SUB8X8 - const BLOCK_SIZE plane_bsize = - AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); -#elif CONFIG_CB4X4 - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); -#else // CONFIG_CB4X4 - const BLOCK_SIZE plane_bsize = - get_plane_block_size(AOMMAX(BLOCK_8X8, bsize), pd); -#endif // CONFIG_CB4X4 - - TXB_CTX txb_ctx; - get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); - return av1_cost_coeffs_txb(cm, x, plane, blk_row, blk_col, block, tx_size, - &txb_ctx); -#endif // !CONFIG_LV_MAP -} -#endif // !CONFIG_PVQ || CONFIG_VAR_TX // Get transform block visible dimensions cropped to the MI units. static void get_txb_dimensions(const MACROBLOCKD *xd, int plane, BLOCK_SIZE plane_bsize, int blk_row, int blk_col, BLOCK_SIZE tx_bsize, int *width, int *height, int *visible_width, int *visible_height) { -#if !(CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)) assert(tx_bsize <= plane_bsize); -#endif int txb_height = block_size_high[tx_bsize]; int txb_width = block_size_wide[tx_bsize]; const int block_height = block_size_high[plane_bsize]; @@ -1659,234 +1931,900 @@ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x, // Compute the pixel domain distortion from diff on all visible 4x4s in the // transform block. 
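// The default branch of pixel_diff_dist() below reduces to
// aom_sum_squares_2d_i16() over the visible part of the block; a plain-C
// sketch of that reduction (illustrative, not the optimized aom kernel):
#include <stdint.h>

static int64_t sketch_sum_squares_2d_i16(const int16_t *diff, int stride,
                                         int cols, int rows) {
  int64_t ss = 0;
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c) {
      const int64_t d = diff[r * stride + c];
      ss += d * d;  // squared pixel-domain residual
    }
  }
  return ss;
}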
-static int64_t pixel_diff_dist(const MACROBLOCK *x, int plane, - const int16_t *diff, const int diff_stride, - int blk_row, int blk_col, - const BLOCK_SIZE plane_bsize, - const BLOCK_SIZE tx_bsize) { +static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane, + int blk_row, int blk_col, + const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize) { int visible_rows, visible_cols; const MACROBLOCKD *xd = &x->e_mbd; -#if CONFIG_DIST_8X8 - int txb_height = block_size_high[tx_bsize]; - int txb_width = block_size_wide[tx_bsize]; - const int src_stride = x->plane[plane].src.stride; - const int src_idx = (blk_row * src_stride + blk_col) << tx_size_wide_log2[0]; - const uint8_t *src = &x->plane[plane].src.buf[src_idx]; -#endif - get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL, NULL, &visible_cols, &visible_rows); - + const int diff_stride = block_size_wide[plane_bsize]; + const int16_t *diff = x->plane[plane].src_diff; #if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && plane == 0 && txb_width >= 8 && txb_height >= 8) - return av1_dist_8x8_diff(x, src, src_stride, diff, diff_stride, txb_width, - txb_height, visible_cols, visible_rows, x->qindex); - else + int txb_height = block_size_high[tx_bsize]; + int txb_width = block_size_wide[tx_bsize]; + if (x->using_dist_8x8 && plane == 0 && txb_width >= 8 && txb_height >= 8) { + const int src_stride = x->plane[plane].src.stride; + const int src_idx = (blk_row * src_stride + blk_col) + << tx_size_wide_log2[0]; + const uint8_t *src = &x->plane[plane].src.buf[src_idx]; + return dist_8x8_diff(x, src, src_stride, diff, diff_stride, txb_width, + txb_height, visible_cols, visible_rows, x->qindex); + } #endif - return aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, - visible_rows); + diff += ((blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]); + return aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows); } -int av1_count_colors(const uint8_t *src, int stride, int rows, int cols) { - int val_count[256]; - memset(val_count, 0, sizeof(val_count)); +int av1_count_colors(const uint8_t *src, int stride, int rows, int cols, + int *val_count) { + const int max_pix_val = 1 << 8; + memset(val_count, 0, max_pix_val * sizeof(val_count[0])); for (int r = 0; r < rows; ++r) { for (int c = 0; c < cols; ++c) { - ++val_count[src[r * stride + c]]; + const int this_val = src[r * stride + c]; + assert(this_val < max_pix_val); + ++val_count[this_val]; } } int n = 0; - for (int i = 0; i < 256; ++i) { + for (int i = 0; i < max_pix_val; ++i) { if (val_count[i]) ++n; } return n; } -#if CONFIG_HIGHBITDEPTH int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols, - int bit_depth) { + int bit_depth, int *val_count) { assert(bit_depth <= 12); + const int max_pix_val = 1 << bit_depth; const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - int val_count[1 << 12]; - memset(val_count, 0, (1 << 12) * sizeof(val_count[0])); + memset(val_count, 0, max_pix_val * sizeof(val_count[0])); for (int r = 0; r < rows; ++r) { for (int c = 0; c < cols; ++c) { - ++val_count[src[r * stride + c]]; + const int this_val = src[r * stride + c]; + assert(this_val < max_pix_val); + if (this_val >= max_pix_val) return 0; + ++val_count[this_val]; } } int n = 0; - for (int i = 0; i < (1 << bit_depth); ++i) { + for (int i = 0; i < max_pix_val; ++i) { if (val_count[i]) ++n; } return n; } -#endif // CONFIG_HIGHBITDEPTH -void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane, - BLOCK_SIZE plane_bsize, int block, int blk_row, 
int blk_col, - TX_SIZE tx_size, int64_t *out_dist, int64_t *out_sse, - OUTPUT_STATUS output_status) { +static void inverse_transform_block_facade(MACROBLOCKD *xd, int plane, + int block, int blk_row, int blk_col, + int eob, int reduced_tx_set) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + const PLANE_TYPE plane_type = get_plane_type(plane); + const TX_SIZE tx_size = av1_get_tx_size(plane, xd); + const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, + tx_size, reduced_tx_set); + const int dst_stride = pd->dst.stride; + uint8_t *dst = + &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; + av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, + dst_stride, eob, reduced_tx_set); +} + +static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record, const uint32_t hash); + +static uint32_t get_intra_txb_hash(MACROBLOCK *x, int plane, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size) { + int16_t tmp_data[64 * 64]; + const int diff_stride = block_size_wide[plane_bsize]; + const int16_t *diff = x->plane[plane].src_diff; + const int16_t *cur_diff_row = diff + 4 * blk_row * diff_stride + 4 * blk_col; + const int txb_w = tx_size_wide[tx_size]; + const int txb_h = tx_size_high[tx_size]; + uint8_t *hash_data = (uint8_t *)cur_diff_row; + if (txb_w != diff_stride) { + int16_t *cur_hash_row = tmp_data; + for (int i = 0; i < txb_h; i++) { + memcpy(cur_hash_row, cur_diff_row, sizeof(*diff) * txb_w); + cur_hash_row += txb_w; + cur_diff_row += diff_stride; + } + hash_data = (uint8_t *)tmp_data; + } + CRC32C *crc = &x->mb_rd_record.crc_calculator; + const uint32_t hash = av1_get_crc32c_value(crc, hash_data, 2 * txb_w * txb_h); + return (hash << 5) + tx_size; +} + +static INLINE void dist_block_tx_domain(MACROBLOCK *x, int plane, int block, + TX_SIZE tx_size, int64_t *out_dist, + int64_t *out_sse) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; -#if CONFIG_DIST_8X8 - struct macroblockd_plane *const pd = &xd->plane[plane]; -#else // CONFIG_DIST_8X8 const struct macroblockd_plane *const pd = &xd->plane[plane]; -#endif // CONFIG_DIST_8X8 + // Transform domain distortion computation is more efficient as it does + // not involve an inverse transform, but it is less accurate. + const int buffer_length = av1_get_max_eob(tx_size); + int64_t this_sse; + // TX-domain results need to shift down to Q2/D10 to match pixel + // domain distortion values which are in Q2^2 + int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2; + tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - if (cpi->sf.use_transform_domain_distortion -#if CONFIG_DIST_8X8 - && !x->using_dist_8x8 -#endif - ) { - // Transform domain distortion computation is more efficient as it does - // not involve an inverse transform, but it is less accurate. - const int buffer_length = tx_size_2d[tx_size]; - int64_t this_sse; - int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2; - tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); - tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); -#if CONFIG_PVQ - tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block); - -#if CONFIG_HIGHBITDEPTH - const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
xd->bd : 8; - *out_dist = av1_highbd_block_error2_c(coeff, dqcoeff, ref_coeff, - buffer_length, &this_sse, bd); -#else - *out_dist = - av1_block_error2_c(coeff, dqcoeff, ref_coeff, buffer_length, &this_sse); -#endif // CONFIG_HIGHBITDEPTH -#else // !CONFIG_PVQ -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, - &this_sse, xd->bd); - else -#endif - *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse); -#endif // CONFIG_PVQ - *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift); - *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse, + xd->bd); + else + *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse); + + *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift); + *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift); +} + +static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x, + int plane, BLOCK_SIZE plane_bsize, + int block, int blk_row, int blk_col, + TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const uint16_t eob = p->eobs[block]; + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; + const int bsw = block_size_wide[tx_bsize]; + const int bsh = block_size_high[tx_bsize]; + const int src_stride = x->plane[plane].src.stride; + const int dst_stride = xd->plane[plane].dst.stride; + // Scale the transform block index to pixel unit. + const int src_idx = (blk_row * src_stride + blk_col) << tx_size_wide_log2[0]; + const int dst_idx = (blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]; + const uint8_t *src = &x->plane[plane].src.buf[src_idx]; + const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx]; + const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + + assert(cpi != NULL); + assert(tx_size_wide_log2[0] == tx_size_high_log2[0]); + + uint8_t *recon; + DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]); + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + recon = CONVERT_TO_BYTEPTR(recon16); + av1_highbd_convolve_2d_copy_sr(CONVERT_TO_SHORTPTR(dst), dst_stride, + CONVERT_TO_SHORTPTR(recon), MAX_TX_SIZE, bsw, + bsh, NULL, NULL, 0, 0, NULL, xd->bd); } else { - const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; -#if !CONFIG_PVQ || CONFIG_DIST_8X8 - const int bsw = block_size_wide[tx_bsize]; - const int bsh = block_size_high[tx_bsize]; -#endif - const int src_stride = x->plane[plane].src.stride; - const int dst_stride = xd->plane[plane].dst.stride; - // Scale the transform block index to pixel unit. 
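// (In these index computations, tx_size_wide_log2[0] is log2 of the 4x4
// transform unit, so (blk_row * stride + blk_col) << tx_size_wide_log2[0]
// converts a position counted in 4-pixel units into a pixel offset.)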
- const int src_idx = (blk_row * src_stride + blk_col) - << tx_size_wide_log2[0]; - const int dst_idx = (blk_row * dst_stride + blk_col) - << tx_size_wide_log2[0]; - const uint8_t *src = &x->plane[plane].src.buf[src_idx]; - const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx]; - const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - const uint16_t eob = p->eobs[block]; + recon = (uint8_t *)recon16; + av1_convolve_2d_copy_sr(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh, NULL, + NULL, 0, 0, NULL); + } - assert(cpi != NULL); - assert(tx_size_wide_log2[0] == tx_size_high_log2[0]); + const PLANE_TYPE plane_type = get_plane_type(plane); + TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, tx_size, + cpi->common.reduced_tx_set_used); + av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, recon, + MAX_TX_SIZE, eob, + cpi->common.reduced_tx_set_used); +#if CONFIG_DIST_8X8 + if (x->using_dist_8x8 && plane == 0 && (bsw < 8 || bsh < 8)) { + // Save decoded pixels for inter block in pd->pred to avoid + // block_8x8_rd_txfm_daala_dist() need to produce them + // by calling av1_inverse_transform_block() again. + const int pred_stride = block_size_wide[plane_bsize]; + const int pred_idx = (blk_row * pred_stride + blk_col) + << tx_size_wide_log2[0]; + int16_t *pred = &x->pred_luma[pred_idx]; + int i, j; - { - const int diff_stride = block_size_wide[plane_bsize]; - const int diff_idx = (blk_row * diff_stride + blk_col) - << tx_size_wide_log2[0]; - const int16_t *diff = &p->src_diff[diff_idx]; - *out_sse = pixel_diff_dist(x, plane, diff, diff_stride, blk_row, blk_col, - plane_bsize, tx_bsize); -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - *out_sse = ROUND_POWER_OF_TWO(*out_sse, (xd->bd - 8) * 2); -#endif // CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + for (j = 0; j < bsh; j++) + for (i = 0; i < bsw; i++) + pred[j * pred_stride + i] = + CONVERT_TO_SHORTPTR(recon)[j * MAX_TX_SIZE + i]; + } else { + for (j = 0; j < bsh; j++) + for (i = 0; i < bsw; i++) + pred[j * pred_stride + i] = recon[j * MAX_TX_SIZE + i]; } - *out_sse *= 16; + } +#endif // CONFIG_DIST_8X8 + return 16 * pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE, + blk_row, blk_col, plane_bsize, tx_bsize); +} - if (eob) { - if (output_status == OUTPUT_HAS_DECODED_PIXELS) { - *out_dist = pixel_dist(cpi, x, plane, src, src_stride, dst, dst_stride, - blk_row, blk_col, plane_bsize, tx_bsize); - } else { -#if CONFIG_HIGHBITDEPTH - uint8_t *recon; - DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]); - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - recon = CONVERT_TO_BYTEPTR(recon16); - else - recon = (uint8_t *)recon16; -#else - DECLARE_ALIGNED(16, uint8_t, recon[MAX_TX_SQUARE]); -#endif // CONFIG_HIGHBITDEPTH - -#if !CONFIG_PVQ -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - aom_highbd_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, NULL, 0, - NULL, 0, bsw, bsh, xd->bd); - } else { -#endif // CONFIG_HIGHBITDEPTH - aom_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, NULL, 0, NULL, - 0, bsw, bsh); -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH -#else - (void)dst; -#endif // !CONFIG_PVQ - -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - uint8_t *mrc_mask = BLOCK_OFFSET(xd->mrc_mask, block); -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - const PLANE_TYPE plane_type = get_plane_type(plane); - TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); - 
av1_inverse_transform_block(xd, dqcoeff, -#if CONFIG_LGT_FROM_PRED - xd->mi[0]->mbmi.mode, -#endif -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - mrc_mask, -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - tx_type, tx_size, recon, MAX_TX_SIZE, eob); +static double get_mean(const int16_t *diff, int stride, int w, int h) { + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + sum += diff[j * stride + i]; + } + } + assert(w > 0 && h > 0); + return sum / (w * h); +} + +static double get_sse_norm(const int16_t *diff, int stride, int w, int h) { + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + const int err = diff[j * stride + i]; + sum += err * err; + } + } + assert(w > 0 && h > 0); + return sum / (w * h); +} + +static double get_sad_norm(const int16_t *diff, int stride, int w, int h) { + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + sum += abs(diff[j * stride + i]); + } + } + assert(w > 0 && h > 0); + return sum / (w * h); +} + +static void get_2x2_normalized_sses_and_sads( + const AV1_COMP *const cpi, BLOCK_SIZE tx_bsize, const uint8_t *const src, + int src_stride, const uint8_t *const dst, int dst_stride, + const int16_t *const src_diff, int diff_stride, double *const sse_norm_arr, + double *const sad_norm_arr) { + const BLOCK_SIZE tx_bsize_half = + get_partition_subsize(tx_bsize, PARTITION_SPLIT); + if (tx_bsize_half == BLOCK_INVALID) { // manually calculate stats + const int half_width = block_size_wide[tx_bsize] / 2; + const int half_height = block_size_high[tx_bsize] / 2; + for (int row = 0; row < 2; ++row) { + for (int col = 0; col < 2; ++col) { + const int16_t *const this_src_diff = + src_diff + row * half_height * diff_stride + col * half_width; + sse_norm_arr[row * 2 + col] = + get_sse_norm(this_src_diff, diff_stride, half_width, half_height); + sad_norm_arr[row * 2 + col] = + get_sad_norm(this_src_diff, diff_stride, half_width, half_height); + } + } + } else { // use function pointers to calculate stats + const int half_width = block_size_wide[tx_bsize_half]; + const int half_height = block_size_high[tx_bsize_half]; + const int num_samples_half = half_width * half_height; + for (int row = 0; row < 2; ++row) { + for (int col = 0; col < 2; ++col) { + const uint8_t *const this_src = + src + row * half_height * src_stride + col * half_width; + const uint8_t *const this_dst = + dst + row * half_height * dst_stride + col * half_width; + + unsigned int this_sse; + cpi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst, + dst_stride, &this_sse); + sse_norm_arr[row * 2 + col] = (double)this_sse / num_samples_half; + + const unsigned int this_sad = cpi->fn_ptr[tx_bsize_half].sdf( + this_src, src_stride, this_dst, dst_stride); + sad_norm_arr[row * 2 + col] = (double)this_sad / num_samples_half; + } + } + } +} + +#if CONFIG_COLLECT_RD_STATS +// NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values +// 0: Do not collect any RD stats +// 1: Collect RD stats for transform units +// 2: Collect RD stats for partition units +static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, + const RD_STATS *const rd_stats, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, TX_TYPE tx_type) { + if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return; + + // Generate small sample to restrict output size. 
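// (The guard below keeps a transform unit only when lcg_rand16(&seed) % 100
// is 0, i.e. it logs roughly 1% of units so tu_stats.txt stays small.)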
+ static unsigned int seed = 21743; + if (lcg_rand16(&seed) % 100 > 0) return; + + const char output_file[] = "tu_stats.txt"; + FILE *fout = fopen(output_file, "a"); + if (!fout) return; + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; + const MACROBLOCKD *const xd = &x->e_mbd; + const int plane = 0; + struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int txw = tx_size_wide[tx_size]; + const int txh = tx_size_high[tx_size]; + const int dequant_shift = + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; + const int q_step = pd->dequant_Q3[1] >> dequant_shift; + const double num_samples = txw * txh; + + const double rate_norm = (double)rd_stats->rate / num_samples; + const double dist_norm = (double)rd_stats->dist / num_samples; + + fprintf(fout, "%g %g", rate_norm, dist_norm); + + const int src_stride = p->src.stride; + const uint8_t *const src = + &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]]; + const int dst_stride = pd->dst.stride; + const uint8_t *const dst = + &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; + unsigned int sse; + cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); + const double sse_norm = (double)sse / num_samples; + + const unsigned int sad = + cpi->fn_ptr[tx_bsize].sdf(src, src_stride, dst, dst_stride); + const double sad_norm = (double)sad / num_samples; + + fprintf(fout, " %g %g", sse_norm, sad_norm); + + const int diff_stride = block_size_wide[plane_bsize]; + const int16_t *const src_diff = + &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]]; + + double sse_norm_arr[4], sad_norm_arr[4]; + get_2x2_normalized_sses_and_sads(cpi, tx_bsize, src, src_stride, dst, + dst_stride, src_diff, diff_stride, + sse_norm_arr, sad_norm_arr); + for (int i = 0; i < 4; ++i) { + fprintf(fout, " %g", sse_norm_arr[i]); + } + for (int i = 0; i < 4; ++i) { + fprintf(fout, " %g", sad_norm_arr[i]); + } + + const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type]; + const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type]; + + fprintf(fout, " %d %d %d %d %d", q_step, tx_size_wide[tx_size], + tx_size_high[tx_size], tx_type_1d_row, tx_type_1d_col); + + int model_rate; + int64_t model_dist; + model_rd_from_sse(cpi, xd, tx_bsize, plane, sse, &model_rate, &model_dist); + const double model_rate_norm = (double)model_rate / num_samples; + const double model_dist_norm = (double)model_dist / num_samples; + fprintf(fout, " %g %g", model_rate_norm, model_dist_norm); + + const double mean = get_mean(src_diff, diff_stride, txw, txh); + double hor_corr, vert_corr; + get_horver_correlation(src_diff, diff_stride, txw, txh, &hor_corr, + &vert_corr); + fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr); + + double hdist[4] = { 0 }, vdist[4] = { 0 }; + get_energy_distribution_fine(cpi, tx_bsize, src, src_stride, dst, dst_stride, + 1, hdist, vdist); + fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2], + hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]); + + fprintf(fout, "\n"); + fclose(fout); +} + +#if CONFIG_COLLECT_RD_STATS == 2 +static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, + const RD_STATS *const rd_stats, + BLOCK_SIZE plane_bsize) { + if (rd_stats->invalid_rate) return; + if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return; + + // Generate small sample to restrict output size. 
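// (Same ~1% subsampling scheme as PrintTransformUnitStats() above, with its
// own seed, emitting one row per sampled prediction unit to pu_stats.txt.)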
+ static unsigned int seed = 95014; + if (lcg_rand16(&seed) % 100 > 0) return; + + const char output_file[] = "pu_stats.txt"; + FILE *fout = fopen(output_file, "a"); + if (!fout) return; + + const MACROBLOCKD *const xd = &x->e_mbd; + const int plane = 0; + struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + const int dequant_shift = + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; + const int q_step = pd->dequant_Q3[1] >> dequant_shift; + const double num_samples = bw * bh; + + const double rate_norm = (double)rd_stats->rate / num_samples; + const double dist_norm = (double)rd_stats->dist / num_samples; + + fprintf(fout, "%g %g", rate_norm, dist_norm); + + const int src_stride = p->src.stride; + const uint8_t *const src = p->src.buf; + const int dst_stride = pd->dst.stride; + const uint8_t *const dst = pd->dst.buf; + unsigned int sse; + cpi->fn_ptr[plane_bsize].vf(src, src_stride, dst, dst_stride, &sse); + const double sse_norm = (double)sse / num_samples; + + const unsigned int sad = + cpi->fn_ptr[plane_bsize].sdf(src, src_stride, dst, dst_stride); + const double sad_norm = (double)sad / num_samples; + + fprintf(fout, " %g %g", sse_norm, sad_norm); + + const int diff_stride = block_size_wide[plane_bsize]; + const int16_t *const src_diff = p->src_diff; + + double sse_norm_arr[4], sad_norm_arr[4]; + get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst, + dst_stride, src_diff, diff_stride, + sse_norm_arr, sad_norm_arr); + for (int i = 0; i < 4; ++i) { + fprintf(fout, " %g", sse_norm_arr[i]); + } + for (int i = 0; i < 4; ++i) { + fprintf(fout, " %g", sad_norm_arr[i]); + } + + fprintf(fout, " %d %d %d", q_step, bw, bh); + + int model_rate; + int64_t model_dist; + model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &model_rate, &model_dist); + const double model_rate_norm = (double)model_rate / num_samples; + const double model_dist_norm = (double)model_dist / num_samples; + fprintf(fout, " %g %g", model_rate_norm, model_dist_norm); + + const double mean = get_mean(src_diff, diff_stride, bw, bh); + double hor_corr, vert_corr; + get_horver_correlation(src_diff, diff_stride, bw, bh, &hor_corr, &vert_corr); + fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr); + + double hdist[4] = { 0 }, vdist[4] = { 0 }; + get_energy_distribution_fine(cpi, plane_bsize, src, src_stride, dst, + dst_stride, 1, hdist, vdist); + fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2], + hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]); + + fprintf(fout, "\n"); + fclose(fout); +} +#endif // CONFIG_COLLECT_RD_STATS == 2 +#endif // CONFIG_COLLECT_RD_STATS + +static void model_rd_with_dnn(const AV1_COMP *const cpi, + const MACROBLOCK *const x, BLOCK_SIZE bsize, + int plane, unsigned int *rsse, int *rate, + int64_t *dist) { + const MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const int log_numpels = num_pels_log2_lookup[plane_bsize]; + const int num_samples = (1 << log_numpels); + + const struct macroblock_plane *const p = &x->plane[plane]; + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + const int dequant_shift = + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
xd->bd - 5 : 3; + const int q_step = pd->dequant_Q3[1] >> dequant_shift; + + const int src_stride = p->src.stride; + const uint8_t *const src = p->src.buf; + const int dst_stride = pd->dst.stride; + const uint8_t *const dst = pd->dst.buf; + unsigned int sse; + cpi->fn_ptr[plane_bsize].vf(src, src_stride, dst, dst_stride, &sse); + const double sse_norm = (double)sse / num_samples; + + const int diff_stride = block_size_wide[plane_bsize]; + const int16_t *const src_diff = p->src_diff; + + double sse_norm_arr[4], sad_norm_arr[4]; + get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst, + dst_stride, src_diff, diff_stride, + sse_norm_arr, sad_norm_arr); + const double mean = get_mean(src_diff, diff_stride, bw, bh); + const double variance = sse_norm - mean * mean; + const double q_sqr = (double)(q_step * q_step); + const double q_sqr_by_variance = q_sqr / variance; + double hor_corr, vert_corr; + get_horver_correlation(src_diff, diff_stride, bw, bh, &hor_corr, &vert_corr); + double hdist[4] = { 0 }, vdist[4] = { 0 }; + get_energy_distribution_fine(cpi, plane_bsize, src, src_stride, dst, + dst_stride, 1, hdist, vdist); + + float features[20]; + features[0] = (float)hdist[0]; + features[1] = (float)hdist[1]; + features[2] = (float)hdist[2]; + features[3] = (float)hdist[3]; + features[4] = (float)hor_corr; + features[5] = (float)log_numpels; + features[6] = (float)mean; + features[7] = (float)q_sqr; + features[8] = (float)q_sqr_by_variance; + features[9] = (float)sse_norm_arr[0]; + features[10] = (float)sse_norm_arr[1]; + features[11] = (float)sse_norm_arr[2]; + features[12] = (float)sse_norm_arr[3]; + features[13] = (float)sse_norm_arr[3]; + features[14] = (float)variance; + features[15] = (float)vdist[0]; + features[16] = (float)vdist[1]; + features[17] = (float)vdist[2]; + features[18] = (float)vdist[3]; + features[19] = (float)vert_corr; + + float rate_f, dist_f; + av1_nn_predict(features, &av1_pustats_dist_nnconfig, &dist_f); + av1_nn_predict(features, &av1_pustats_rate_nnconfig, &rate_f); + const int rate_i = (int)(AOMMAX(0.0, rate_f * (1 << log_numpels)) + 0.5); + const int64_t dist_i = + (int64_t)(AOMMAX(0.0, dist_f * (1 << log_numpels)) + 0.5); + if (rate) *rate = rate_i; + if (dist) *dist = dist_i; + if (rsse) *rsse = sse; + return; +} + +void model_rd_for_sb_with_dnn(const AV1_COMP *const cpi, BLOCK_SIZE bsize, + MACROBLOCK *x, MACROBLOCKD *xd, int plane_from, + int plane_to, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, + int64_t *skip_sse_sb, int *plane_rate, + int64_t *plane_sse, int64_t *plane_dist) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. 
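// Note on model_rd_with_dnn() above: the two networks predict per-pixel rate
// and distortion, which are then clamped and rescaled to block totals. A
// sketch of that post-processing, assuming rate_f/dist_f are the raw network
// outputs:
#include <stdint.h>

static void sketch_scale_nn_outputs(float rate_f, float dist_f,
                                    int log_numpels, int *rate,
                                    int64_t *dist) {
  const float numpels = (float)(1 << log_numpels);  // pixels in the block
  const float r = rate_f * numpels;
  const float d = dist_f * numpels;
  // Negative predictions are clamped to zero; +0.5 rounds to nearest.
  *rate = (int)((r > 0.0f ? r : 0.0f) + 0.5f);
  *dist = (int64_t)((d > 0.0f ? d : 0.0f) + 0.5f);
}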
+ const int ref = xd->mi[0]->ref_frame[0]; + + int64_t rate_sum = 0; + int64_t dist_sum = 0; + int64_t total_sse = 0; + + x->pred_sse[ref] = 0; + + for (int plane = plane_from; plane <= plane_to; ++plane) { + unsigned int sse; + int rate; + int64_t dist; + + if (x->skip_chroma_rd && plane) continue; + + model_rd_with_dnn(cpi, x, bsize, plane, &sse, &rate, &dist); + + if (plane == 0) x->pred_sse[ref] = sse; + + total_sse += sse; + rate_sum += rate; + dist_sum += dist; + + if (plane_rate) plane_rate[plane] = rate; + if (plane_sse) plane_sse[plane] = sse; + if (plane_dist) plane_dist[plane] = dist; + } + + if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0; + if (skip_sse_sb) *skip_sse_sb = total_sse << 4; + *out_rate_sum = (int)rate_sum; + *out_dist_sum = dist_sum; +} + +static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + const TXB_CTX *const txb_ctx, + FAST_TX_SEARCH_MODE ftxs_mode, + int use_fast_coef_costing, int64_t ref_best_rd, + RD_STATS *best_rd_stats) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + MB_MODE_INFO *mbmi = xd->mi[0]; + const int is_inter = is_inter_block(mbmi); + int64_t best_rd = INT64_MAX; + uint16_t best_eob = 0; + TX_TYPE best_tx_type = DCT_DCT; + TX_TYPE last_tx_type = TX_TYPES; + const int fast_tx_search = ftxs_mode & FTXS_DCT_AND_1D_DCT_ONLY; + // The buffer used to swap dqcoeff in macroblockd_plane so we can keep dqcoeff + // of the best tx_type + DECLARE_ALIGNED(32, tran_low_t, this_dqcoeff[MAX_SB_SQUARE]); + tran_low_t *orig_dqcoeff = pd->dqcoeff; + tran_low_t *best_dqcoeff = this_dqcoeff; + const int txk_type_idx = + av1_get_txk_type_index(plane_bsize, blk_row, blk_col); + av1_invalid_rd_stats(best_rd_stats); + + TXB_RD_INFO *intra_txb_rd_info = NULL; + uint16_t cur_joint_ctx = 0; + const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); + const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); + const int within_border = + mi_row >= xd->tile.mi_row_start && + (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) && + mi_col >= xd->tile.mi_col_start && + (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end); + if (within_border && cpi->sf.use_intra_txb_hash && frame_is_intra_only(cm) && + !is_inter && plane == 0 && + tx_size_wide[tx_size] == tx_size_high[tx_size]) { + const uint32_t intra_hash = + get_intra_txb_hash(x, plane, blk_row, blk_col, plane_bsize, tx_size); + const int intra_hash_idx = + find_tx_size_rd_info(&x->txb_rd_record_intra, intra_hash); + intra_txb_rd_info = &x->txb_rd_record_intra.tx_rd_info[intra_hash_idx]; + + cur_joint_ctx = (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx; + if (intra_hash_idx > 0 && + intra_txb_rd_info->entropy_context == cur_joint_ctx && + x->txb_rd_record_intra.tx_rd_info[intra_hash_idx].valid) { + mbmi->txk_type[txk_type_idx] = intra_txb_rd_info->tx_type; + const TX_TYPE ref_tx_type = + av1_get_tx_type(get_plane_type(plane), &x->e_mbd, blk_row, blk_col, + tx_size, cpi->common.reduced_tx_set_used); + if (ref_tx_type == intra_txb_rd_info->tx_type) { + best_rd_stats->rate = intra_txb_rd_info->rate; + best_rd_stats->dist = intra_txb_rd_info->dist; + best_rd_stats->sse = intra_txb_rd_info->sse; + best_rd_stats->skip = intra_txb_rd_info->eob == 0; + x->plane[plane].eobs[block] = intra_txb_rd_info->eob; + x->plane[plane].txb_entropy_ctx[block] = + intra_txb_rd_info->txb_entropy_ctx; + best_rd = 
RDCOST(x->rdmult, best_rd_stats->rate, best_rd_stats->dist); + best_eob = intra_txb_rd_info->eob; + best_tx_type = intra_txb_rd_info->tx_type; + update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, + best_tx_type); + goto RECON_INTRA; + } + } + } + + int rate_cost = 0; + TX_TYPE txk_start = DCT_DCT; + TX_TYPE txk_end = TX_TYPES - 1; + if (!(!is_inter && x->use_default_intra_tx_type) && + !(is_inter && x->use_default_inter_tx_type)) + if (x->rd_model == LOW_TXFM_RD || x->cb_partition_scan) + if (plane == 0) txk_end = DCT_DCT; + + uint8_t best_txb_ctx = 0; + const TxSetType tx_set_type = + av1_get_ext_tx_set_type(tx_size, is_inter, cm->reduced_tx_set_used); + int prune = 0; + const int do_prune = plane == 0 && !fast_tx_search && txk_end != DCT_DCT && + !(!is_inter && x->use_default_intra_tx_type) && + !(is_inter && x->use_default_inter_tx_type) && + cpi->sf.tx_type_search.prune_mode > NO_PRUNE; + if (do_prune && is_inter) { + if (cpi->sf.tx_type_search.prune_mode >= PRUNE_2D_ACCURATE) { + prune = prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col, + tx_set_type, cpi->sf.tx_type_search.prune_mode); + } else { + prune = x->tx_search_prune[tx_set_type]; + } + } + + TX_TYPE uv_tx_type = DCT_DCT; + if (plane) { + // tx_type of PLANE_TYPE_UV should be the same as PLANE_TYPE_Y + uv_tx_type = txk_start = txk_end = + av1_get_tx_type(get_plane_type(plane), xd, blk_row, blk_col, tx_size, + cm->reduced_tx_set_used); + } + if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) { + txk_start = txk_end = DCT_DCT; + } + + int8_t allowed_tx_mask[TX_TYPES] = { 0 }; // 1: allow; 0: skip. + int allowed_tx_num = 0; + if (fast_tx_search) { + allowed_tx_mask[DCT_DCT] = 1; + allowed_tx_mask[H_DCT] = 1; + allowed_tx_mask[V_DCT] = 1; + } else { + memset(allowed_tx_mask + txk_start, 1, txk_end - txk_start + 1); + } + for (TX_TYPE tx_type = txk_start; tx_type <= txk_end; ++tx_type) { + if (do_prune) { + if (!do_tx_type_search(tx_type, prune, cpi->sf.tx_type_search.prune_mode)) + allowed_tx_mask[tx_type] = 0; + } + if (plane == 0 && allowed_tx_mask[tx_type]) { + if (!av1_ext_tx_used[tx_set_type][tx_type]) + allowed_tx_mask[tx_type] = 0; + else if (!is_inter && x->use_default_intra_tx_type && + tx_type != get_default_tx_type(0, xd, tx_size)) + allowed_tx_mask[tx_type] = 0; + else if (is_inter && x->use_default_inter_tx_type && + tx_type != get_default_tx_type(0, xd, tx_size)) + allowed_tx_mask[tx_type] = 0; + } + allowed_tx_num += allowed_tx_mask[tx_type]; + } + // Need to have at least one transform type allowed. + if (allowed_tx_num == 0) { + allowed_tx_mask[plane ? uv_tx_type : DCT_DCT] = 1; + } + + int use_transform_domain_distortion = + (cpi->sf.use_transform_domain_distortion > 0) && + // Any 64-pt transforms only preserves half the coefficients. + // Therefore transform domain distortion is not valid for these + // transform sizes. + txsize_sqr_up_map[tx_size] != TX_64X64; #if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && plane == 0 && (bsw < 8 || bsh < 8)) { - // Save decoded pixels for inter block in pd->pred to avoid - // block_8x8_rd_txfm_daala_dist() need to produce them - // by calling av1_inverse_transform_block() again. 
- const int pred_stride = block_size_wide[plane_bsize]; - const int pred_idx = (blk_row * pred_stride + blk_col) - << tx_size_wide_log2[0]; - int16_t *pred = &pd->pred[pred_idx]; - int i, j; - -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - for (j = 0; j < bsh; j++) - for (i = 0; i < bsw; i++) - pred[j * pred_stride + i] = - CONVERT_TO_SHORTPTR(recon)[j * MAX_TX_SIZE + i]; - } else { + if (x->using_dist_8x8) use_transform_domain_distortion = 0; #endif - for (j = 0; j < bsh; j++) - for (i = 0; i < bsw; i++) - pred[j * pred_stride + i] = recon[j * MAX_TX_SIZE + i]; -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_DIST_8X8 - *out_dist = - pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE, - blk_row, blk_col, plane_bsize, tx_bsize); + + int calc_pixel_domain_distortion_final = + cpi->sf.use_transform_domain_distortion == 1 && + use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD && + !x->cb_partition_scan; + if (calc_pixel_domain_distortion_final && allowed_tx_num <= 1) + calc_pixel_domain_distortion_final = use_transform_domain_distortion = 0; + + const uint16_t *eobs_ptr = x->plane[plane].eobs; + + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; + int64_t block_sse = + pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2); + block_sse *= 16; + + for (TX_TYPE tx_type = txk_start; tx_type <= txk_end; ++tx_type) { + if (!allowed_tx_mask[tx_type]) continue; + if (plane == 0) mbmi->txk_type[txk_type_idx] = tx_type; + RD_STATS this_rd_stats; + av1_invalid_rd_stats(&this_rd_stats); + + if (!cpi->optimize_seg_arr[mbmi->segment_id]) { + av1_xform_quant( + cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type, + USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP); + rate_cost = av1_cost_coeffs(cm, x, plane, blk_row, blk_col, block, + tx_size, txb_ctx, use_fast_coef_costing); + } else { + av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, + tx_size, tx_type, AV1_XFORM_QUANT_FP); + if (cpi->sf.optimize_b_precheck && best_rd < INT64_MAX && + eobs_ptr[block] >= 4) { + // Calculate distortion quickly in transform domain. + dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist, + &this_rd_stats.sse); + rate_cost = av1_cost_coeffs(cm, x, plane, blk_row, blk_col, block, + tx_size, txb_ctx, use_fast_coef_costing); + const int64_t rd_estimate = + AOMMIN(RDCOST(x->rdmult, rate_cost, this_rd_stats.dist), + RDCOST(x->rdmult, 0, this_rd_stats.sse)); + if (rd_estimate - (rd_estimate >> 3) > AOMMIN(best_rd, ref_best_rd)) + continue; } - *out_dist *= 16; + av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx, 1, + &rate_cost); + } + if (eobs_ptr[block] == 0) { + // When eob is 0, pixel domain distortion is more efficient and accurate. 
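+      // Editor's note (reasoning, not from the patch): eob == 0 means every
+      // coefficient quantized to zero, so the reconstruction is exactly the
+      // prediction. The distortion therefore equals the source-vs-prediction
+      // SSE already computed above as block_sse, and no inverse transform or
+      // pixel-domain recomputation is needed.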
+ this_rd_stats.dist = this_rd_stats.sse = block_sse; + } else if (use_transform_domain_distortion) { + dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist, + &this_rd_stats.sse); } else { - *out_dist = *out_sse; + this_rd_stats.dist = dist_block_px_domain( + cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size); + this_rd_stats.sse = block_sse; + } + + this_rd_stats.rate = rate_cost; + + const int64_t rd = + RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); + + if (rd < best_rd) { + best_rd = rd; + *best_rd_stats = this_rd_stats; + best_tx_type = tx_type; + best_txb_ctx = x->plane[plane].txb_entropy_ctx[block]; + best_eob = x->plane[plane].eobs[block]; + last_tx_type = best_tx_type; + + // Swap qcoeff and dqcoeff buffers + tran_low_t *const tmp_dqcoeff = best_dqcoeff; + best_dqcoeff = pd->dqcoeff; + pd->dqcoeff = tmp_dqcoeff; + } + +#if CONFIG_COLLECT_RD_STATS == 1 + if (plane == 0) { + PrintTransformUnitStats(cpi, x, &this_rd_stats, blk_row, blk_col, + plane_bsize, tx_size, tx_type); + } +#endif // CONFIG_COLLECT_RD_STATS == 1 + + if (cpi->sf.adaptive_txb_search_level) { + if ((best_rd - (best_rd >> cpi->sf.adaptive_txb_search_level)) > + ref_best_rd) { + break; + } + } + + // Skip transform type search when we found the block has been quantized to + // all zero and at the same time, it has better rdcost than doing transform. + if (cpi->sf.tx_type_search.skip_tx_search && !best_eob) break; + } + + assert(best_rd != INT64_MAX); + + best_rd_stats->skip = best_eob == 0; + if (best_eob == 0) best_tx_type = DCT_DCT; + if (plane == 0) { + update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, + best_tx_type); + } + x->plane[plane].txb_entropy_ctx[block] = best_txb_ctx; + x->plane[plane].eobs[block] = best_eob; + + pd->dqcoeff = best_dqcoeff; + + if (calc_pixel_domain_distortion_final && best_eob) { + best_rd_stats->dist = dist_block_px_domain( + cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size); + best_rd_stats->sse = block_sse; + } + + if (intra_txb_rd_info != NULL) { + intra_txb_rd_info->valid = 1; + intra_txb_rd_info->entropy_context = cur_joint_ctx; + intra_txb_rd_info->rate = best_rd_stats->rate; + intra_txb_rd_info->dist = best_rd_stats->dist; + intra_txb_rd_info->sse = best_rd_stats->sse; + intra_txb_rd_info->eob = best_eob; + intra_txb_rd_info->txb_entropy_ctx = best_txb_ctx; + if (plane == 0) intra_txb_rd_info->tx_type = best_tx_type; + } + +RECON_INTRA: + if (!is_inter && best_eob && + (blk_row + tx_size_high_unit[tx_size] < mi_size_high[plane_bsize] || + blk_col + tx_size_wide_unit[tx_size] < mi_size_wide[plane_bsize])) { + // intra mode needs decoded result such that the next transform block + // can use it for prediction. + // if the last search tx_type is the best tx_type, we don't need to + // do this again + if (best_tx_type != last_tx_type) { + if (!cpi->optimize_seg_arr[mbmi->segment_id]) { + av1_xform_quant( + cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + best_tx_type, + USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP); + } else { + av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, + tx_size, best_tx_type, AV1_XFORM_QUANT_FP); + av1_optimize_b(cpi, x, plane, block, tx_size, best_tx_type, txb_ctx, 1, + &rate_cost); + } + } + + inverse_transform_block_facade(xd, plane, block, blk_row, blk_col, + x->plane[plane].eobs[block], + cm->reduced_tx_set_used); + + // This may happen because of hash collision. 
The eob stored in the hash + // table is non-zero, but the real eob is zero. We need to make sure tx_type + // is DCT_DCT in this case. + if (plane == 0 && x->plane[plane].eobs[block] == 0 && + best_tx_type != DCT_DCT) { + update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, + DCT_DCT); } } + pd->dqcoeff = orig_dqcoeff; + + return best_rd; } static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, @@ -1894,7 +2832,7 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, struct rdcost_block_args *args = arg; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const MB_MODE_INFO *const mbmi = xd->mi[0]; const AV1_COMP *cpi = args->cpi; ENTROPY_CONTEXT *a = args->t_above + blk_col; ENTROPY_CONTEXT *l = args->t_left + blk_row; @@ -1909,122 +2847,44 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, // (new distortion metric) are different. // Exception is: dist-8x8 is enabled but still MSE is used, // i.e. "--tune=" encoder option is not used. + int bw = block_size_wide[plane_bsize]; + int bh = block_size_high[plane_bsize]; int disable_early_skip = - x->using_dist_8x8 && plane == 0 && plane_bsize >= BLOCK_8X8 && + x->using_dist_8x8 && plane == AOM_PLANE_Y && bw >= 8 && bh >= 8 && (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4) && x->tune_metric != AOM_TUNE_PSNR; #endif // CONFIG_DIST_8X8 -#if !CONFIG_SUPERTX && !CONFIG_VAR_TX - assert(tx_size == av1_get_tx_size(plane, xd)); -#endif // !CONFIG_SUPERTX - av1_init_rd_stats(&this_rd_stats); if (args->exit_early) return; if (!is_inter_block(mbmi)) { - av1_predict_intra_block_facade(cm, xd, plane, block, blk_col, blk_row, - tx_size); + av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size); av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); } + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); + search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + &txb_ctx, args->ftxs_mode, args->use_fast_coef_costing, + args->best_rd - args->this_rd, &this_rd_stats); -#if !CONFIG_TXK_SEL - // full forward transform and quantization - const int coeff_ctx = combine_entropy_contexts(*a, *l); -#if DISABLE_TRELLISQ_SEARCH - av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, - coeff_ctx, AV1_XFORM_QUANT_B); -#else - av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, - coeff_ctx, AV1_XFORM_QUANT_FP); - - const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2; - tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block); - tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block); - const int buffer_length = tx_size_2d[tx_size]; - int64_t tmp_dist; - int64_t tmp; -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - tmp_dist = - av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp, xd->bd); - else -#endif - tmp_dist = av1_block_error(coeff, dqcoeff, buffer_length, &tmp); - tmp_dist = RIGHT_SIGNED_SHIFT(tmp_dist, shift); - - if ( -#if CONFIG_DIST_8X8 - disable_early_skip || -#endif - RDCOST(x->rdmult, 0, tmp_dist) + args->this_rd < args->best_rd) { - av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size, - a, l, 1); - } else { - args->exit_early = 1; - return; - } -#endif // DISABLE_TRELLISQ_SEARCH - -#if CONFIG_MRC_TX - if (mbmi->tx_type == MRC_DCT && !mbmi->valid_mrc_mask) { - 
args->exit_early = 1; - return; - } -#endif // CONFIG_MRC_TX - - if (!is_inter_block(mbmi)) { - struct macroblock_plane *const p = &x->plane[plane]; - av1_inverse_transform_block_facade(xd, plane, block, blk_row, blk_col, - p->eobs[block]); - av1_dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, - tx_size, &this_rd_stats.dist, &this_rd_stats.sse, - OUTPUT_HAS_DECODED_PIXELS); - } else { - av1_dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, - tx_size, &this_rd_stats.dist, &this_rd_stats.sse, - OUTPUT_HAS_PREDICTED_PIXELS); - } -#if CONFIG_CFL - if (plane == AOM_PLANE_Y && xd->cfl->store_y) { -#if CONFIG_CHROMA_SUB8X8 + if (plane == AOM_PLANE_Y && xd->cfl.store_y) { assert(!is_inter_block(mbmi) || plane_bsize < BLOCK_8X8); -#else - assert(!is_inter_block(mbmi)); -#endif // CONFIG_CHROMA_SUB8X8 cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize); } -#endif // CONFIG_CFL - rd = RDCOST(x->rdmult, 0, this_rd_stats.dist); - if (args->this_rd + rd > args->best_rd) { - args->exit_early = 1; - return; - } -#if !CONFIG_PVQ - const PLANE_TYPE plane_type = get_plane_type(plane); - const TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); - const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, mbmi); - this_rd_stats.rate = - av1_cost_coeffs(cpi, x, plane, blk_row, blk_col, block, tx_size, - scan_order, a, l, args->use_fast_coef_costing); -#else // !CONFIG_PVQ - this_rd_stats.rate = x->rate; -#endif // !CONFIG_PVQ -#else // !CONFIG_TXK_SEL - av1_search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, - tx_size, a, l, args->use_fast_coef_costing, - &this_rd_stats); -#endif // !CONFIG_TXK_SEL - -#if !CONFIG_PVQ + #if CONFIG_RD_DEBUG av1_update_txb_coeff_cost(&this_rd_stats, plane, tx_size, blk_row, blk_col, this_rd_stats.rate); #endif // CONFIG_RD_DEBUG av1_set_txb_context(x, plane, block, tx_size, a, l); -#endif // !CONFIG_PVQ + + if (plane == 0) { + x->blk_skip[blk_row * + (block_size_wide[plane_bsize] >> tx_size_wide_log2[0]) + + blk_col] = (x->plane[plane].eobs[block] == 0); + } rd1 = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); rd2 = RDCOST(x->rdmult, 0, this_rd_stats.sse); @@ -2032,11 +2892,8 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, // TODO(jingning): temporarily enabled only for luma component rd = AOMMIN(rd1, rd2); -#if !CONFIG_PVQ this_rd_stats.skip &= !x->plane[plane].eobs[block]; -#else - this_rd_stats.skip &= x->pvq_skip[plane]; -#endif // !CONFIG_PVQ + av1_merge_rd_stats(&args->rd_stats, &this_rd_stats); args->this_rd += rd; @@ -2057,12 +2914,12 @@ static void dist_8x8_sub8x8_txfm_rd(const AV1_COMP *const cpi, MACROBLOCK *x, MACROBLOCKD *const xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[0]; const struct macroblock_plane *const p = &x->plane[0]; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; const int src_stride = p->src.stride; const int dst_stride = pd->dst.stride; const uint8_t *src = &p->src.buf[0]; const uint8_t *dst = &pd->dst.buf[0]; - const int16_t *pred = &pd->pred[0]; + const int16_t *pred = &x->pred_luma[0]; int bw = block_size_wide[bsize]; int bh = block_size_high[bsize]; int visible_w = bw; @@ -2070,7 +2927,7 @@ static void dist_8x8_sub8x8_txfm_rd(const AV1_COMP *const cpi, MACROBLOCK *x, int i, j; int64_t rd, rd1, rd2; - unsigned int tmp1, tmp2; + int64_t sse = INT64_MAX, dist = INT64_MAX; int qindex = x->qindex; assert((bw & 0x07) == 0); @@ -2079,53 +2936,51 @@ 
static void dist_8x8_sub8x8_txfm_rd(const AV1_COMP *const cpi, MACROBLOCK *x, get_txb_dimensions(xd, 0, bsize, 0, 0, bsize, &bw, &bh, &visible_w, &visible_h); -#if CONFIG_HIGHBITDEPTH - uint8_t *pred8; - DECLARE_ALIGNED(16, uint16_t, pred16[MAX_TX_SQUARE]); - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - pred8 = CONVERT_TO_BYTEPTR(pred16); - else - pred8 = (uint8_t *)pred16; -#else - DECLARE_ALIGNED(16, uint8_t, pred8[MAX_TX_SQUARE]); -#endif // CONFIG_HIGHBITDEPTH - -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - for (j = 0; j < bh; j++) - for (i = 0; i < bw; i++) - CONVERT_TO_SHORTPTR(pred8)[j * bw + i] = pred[j * bw + i]; - } else { -#endif - for (j = 0; j < bh; j++) - for (i = 0; i < bw; i++) pred8[j * bw + i] = (uint8_t)pred[j * bw + i]; -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - - tmp1 = (unsigned)av1_dist_8x8(cpi, x, src, src_stride, pred8, bw, bsize, bw, - bh, visible_w, visible_h, qindex); - tmp2 = (unsigned)av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, bsize, - bw, bh, visible_w, visible_h, qindex); + const int diff_stride = block_size_wide[bsize]; + const int16_t *diff = p->src_diff; + sse = dist_8x8_diff(x, src, src_stride, diff, diff_stride, bw, bh, visible_w, + visible_h, qindex); + sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2); + sse *= 16; if (!is_inter_block(mbmi)) { - if (x->tune_metric == AOM_TUNE_PSNR) { - assert(args->rd_stats.sse == tmp1 * 16); - assert(args->rd_stats.dist == tmp2 * 16); - } - args->rd_stats.sse = (int64_t)tmp1 * 16; - args->rd_stats.dist = (int64_t)tmp2 * 16; + dist = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, bsize, bw, bh, + visible_w, visible_h, qindex); + dist *= 16; } else { - // For inter mode, the decoded pixels are provided in pd->pred, + // For inter mode, the decoded pixels are provided in x->pred_luma, // while the predicted pixels are in dst. 
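+    // Editor's sketch of the surrounding logic (not from the patch): below,
+    // the int16_t decoded pixels are repacked into a byte buffer so that
+    // av1_dist_8x8() can read them through its uint8_t interface; for high
+    // bit depth the uint16_t storage is passed via CONVERT_TO_BYTEPTR(), the
+    // usual aom convention. Like sse above, the result is scaled by 16 to
+    // stay in the Q4 distortion units that RDCOST consumes.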
- if (x->tune_metric == AOM_TUNE_PSNR) { - assert(args->rd_stats.sse == tmp2 * 16); - assert(args->rd_stats.dist == tmp1 * 16); + uint8_t *pred8; + DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]); + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + pred8 = CONVERT_TO_BYTEPTR(pred16); + else + pred8 = (uint8_t *)pred16; + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + for (j = 0; j < bh; j++) + for (i = 0; i < bw; i++) + CONVERT_TO_SHORTPTR(pred8)[j * bw + i] = pred[j * bw + i]; + } else { + for (j = 0; j < bh; j++) + for (i = 0; i < bw; i++) pred8[j * bw + i] = (uint8_t)pred[j * bw + i]; } - args->rd_stats.sse = (int64_t)tmp2 * 16; - args->rd_stats.dist = (int64_t)tmp1 * 16; + + dist = av1_dist_8x8(cpi, x, src, src_stride, pred8, bw, bsize, bw, bh, + visible_w, visible_h, qindex); + dist *= 16; + } + +#ifdef DEBUG_DIST_8X8 + if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8) { + assert(args->rd_stats.sse == sse); + assert(args->rd_stats.dist == dist); } +#endif // DEBUG_DIST_8X8 + + args->rd_stats.sse = sse; + args->rd_stats.dist = dist; rd1 = RDCOST(x->rdmult, args->rd_stats.rate, args->rd_stats.dist); rd2 = RDCOST(x->rdmult, 0, args->rd_stats.sse); @@ -2141,7 +2996,8 @@ static void dist_8x8_sub8x8_txfm_rd(const AV1_COMP *const cpi, MACROBLOCK *x, static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, RD_STATS *rd_stats, int64_t ref_best_rd, int plane, BLOCK_SIZE bsize, TX_SIZE tx_size, - int use_fast_coef_casting) { + int use_fast_coef_casting, + FAST_TX_SEARCH_MODE ftxs_mode) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[plane]; struct rdcost_block_args args; @@ -2150,18 +3006,21 @@ static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, args.cpi = cpi; args.best_rd = ref_best_rd; args.use_fast_coef_costing = use_fast_coef_casting; + args.ftxs_mode = ftxs_mode; av1_init_rd_stats(&args.rd_stats); - if (plane == 0) xd->mi[0]->mbmi.tx_size = tx_size; + if (plane == 0) xd->mi[0]->tx_size = tx_size; - av1_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left); + av1_get_entropy_contexts(bsize, pd, args.t_above, args.t_left); av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm, &args); #if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && !args.exit_early && plane == 0 && - bsize >= BLOCK_8X8 && - (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4)) + int bw = block_size_wide[bsize]; + int bh = block_size_high[bsize]; + + if (x->using_dist_8x8 && !args.exit_early && plane == 0 && bw >= 8 && + bh >= 8 && (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4)) dist_8x8_sub8x8_txfm_rd(cpi, x, bsize, &args); #endif @@ -2172,183 +3031,48 @@ static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, } } -#if CONFIG_SUPERTX -void av1_txfm_rd_in_plane_supertx(MACROBLOCK *x, const AV1_COMP *cpi, int *rate, - int64_t *distortion, int *skippable, - int64_t *sse, int64_t ref_best_rd, int plane, - BLOCK_SIZE bsize, TX_SIZE tx_size, - int use_fast_coef_casting) { - MACROBLOCKD *const xd = &x->e_mbd; - const struct macroblockd_plane *const pd = &xd->plane[plane]; - struct rdcost_block_args args; - av1_zero(args); - args.cpi = cpi; - args.x = x; - args.best_rd = ref_best_rd; - args.use_fast_coef_costing = use_fast_coef_casting; - -#if CONFIG_EXT_TX - assert(tx_size < TX_SIZES); -#endif // CONFIG_EXT_TX - - if (plane == 0) xd->mi[0]->mbmi.tx_size = tx_size; - - av1_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left); - - block_rd_txfm(plane, 0, 0, 0, 
get_plane_block_size(bsize, pd), tx_size, - &args); - - if (args.exit_early) { - *rate = INT_MAX; - *distortion = INT64_MAX; - *sse = INT64_MAX; - *skippable = 0; - } else { - *distortion = args.rd_stats.dist; - *rate = args.rd_stats.rate; - *sse = args.rd_stats.sse; - *skippable = !x->plane[plane].eobs[0]; - } -} -#endif // CONFIG_SUPERTX - -static int tx_size_cost(const AV1_COMP *const cpi, const MACROBLOCK *const x, +static int tx_size_cost(const AV1_COMMON *const cm, const MACROBLOCK *const x, BLOCK_SIZE bsize, TX_SIZE tx_size) { - const AV1_COMMON *const cm = &cpi->common; const MACROBLOCKD *const xd = &x->e_mbd; - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const MB_MODE_INFO *const mbmi = xd->mi[0]; if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(mbmi->sb_type)) { - const int is_inter = is_inter_block(mbmi); - const int32_t tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize] - : intra_tx_size_cat_lookup[bsize]; - const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size]; - const int depth = tx_size_to_depth(coded_tx_size); + const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); + const int depth = tx_size_to_depth(tx_size, bsize); const int tx_size_ctx = get_tx_size_context(xd); int r_tx_size = x->tx_size_cost[tx_size_cat][tx_size_ctx][depth]; -#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) - if (is_quarter_tx_allowed(xd, mbmi, is_inter) && tx_size != coded_tx_size) - r_tx_size += av1_cost_bit(cm->fc->quarter_tx_size_prob, - tx_size == quarter_txsize_lookup[bsize]); -#endif return r_tx_size; } else { return 0; } } -#if CONFIG_LGT_FROM_PRED -int av1_lgt_cost(const AV1_COMMON *cm, const MACROBLOCK *x, - const MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, - TX_SIZE tx_size, int use_lgt) { - if (plane > 0) return 0; - const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - const int is_inter = is_inter_block(mbmi); - - assert(is_lgt_allowed(mbmi->mode, tx_size)); - if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) > 1 && - !xd->lossless[xd->mi[0]->mbmi.segment_id]) { - const int ext_tx_set = - get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used); - if (LGT_FROM_PRED_INTRA && !is_inter && ext_tx_set > 0 && - ALLOW_INTRA_EXT_TX) - return x->intra_lgt_cost[txsize_sqr_map[tx_size]][mbmi->mode][use_lgt]; - if (LGT_FROM_PRED_INTRA && is_inter && ext_tx_set > 0) - return x->inter_lgt_cost[txsize_sqr_map[tx_size]][use_lgt]; - } - return 0; -} -#endif // CONFIG_LGT_FROM_PRED - -// TODO(angiebird): use this function whenever it's possible -int av1_tx_type_cost(const AV1_COMMON *cm, const MACROBLOCK *x, - const MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, - TX_SIZE tx_size, TX_TYPE tx_type) { - if (plane > 0) return 0; - -#if CONFIG_LGT_FROM_PRED - assert(!xd->mi[0]->mbmi.use_lgt); -#endif -#if CONFIG_VAR_TX - tx_size = get_min_tx_size(tx_size); -#endif - - const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - const int is_inter = is_inter_block(mbmi); -#if CONFIG_EXT_TX - if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) > 1 && - !xd->lossless[xd->mi[0]->mbmi.segment_id]) { - const int ext_tx_set = - get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used); - if (is_inter) { - if (ext_tx_set > 0) - return x - ->inter_tx_type_costs[ext_tx_set][txsize_sqr_map[tx_size]][tx_type]; - } else { - if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX) - return x->intra_tx_type_costs[ext_tx_set][txsize_sqr_map[tx_size]] - [mbmi->mode][tx_type]; - } - } -#else - (void)bsize; - (void)cm; - if (tx_size < TX_32X32 && 
!xd->lossless[xd->mi[0]->mbmi.segment_id] && - !FIXED_TX_TYPE) { - if (is_inter) { - return x->inter_tx_type_costs[tx_size][tx_type]; - } else { - return x->intra_tx_type_costs[tx_size] - [intra_mode_to_tx_type_context[mbmi->mode]] - [tx_type]; - } - } -#endif // CONFIG_EXT_TX - return 0; -} static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs, - TX_TYPE tx_type, TX_SIZE tx_size) { + TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; int64_t rd = INT64_MAX; - aom_prob skip_prob = av1_get_skip_prob(cm, xd); + const int skip_ctx = av1_get_skip_context(xd); int s0, s1; const int is_inter = is_inter_block(mbmi); const int tx_select = - cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8; - - const int r_tx_size = tx_size_cost(cpi, x, bs, tx_size); + cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(mbmi->sb_type); + int ctx = txfm_partition_context( + xd->above_txfm_context, xd->left_txfm_context, mbmi->sb_type, tx_size); + const int r_tx_size = is_inter ? x->txfm_partition_cost[ctx][0] + : tx_size_cost(cm, x, bs, tx_size); -#if CONFIG_PVQ - assert(tx_size >= TX_4X4); -#endif // CONFIG_PVQ - assert(skip_prob > 0); -#if CONFIG_EXT_TX && CONFIG_RECT_TX assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs))); -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX - s0 = av1_cost_bit(skip_prob, 0); - s1 = av1_cost_bit(skip_prob, 1); + s0 = x->skip_cost[skip_ctx][0]; + s1 = x->skip_cost[skip_ctx][1]; - mbmi->tx_type = tx_type; mbmi->tx_size = tx_size; - txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, tx_size, - cpi->sf.use_fast_coef_costing); + txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, AOM_PLANE_Y, bs, tx_size, + cpi->sf.use_fast_coef_costing, ftxs_mode); if (rd_stats->rate == INT_MAX) return INT64_MAX; -#if !CONFIG_TXK_SEL - int plane = 0; -#if CONFIG_LGT_FROM_PRED - if (is_lgt_allowed(mbmi->mode, tx_size)) - rd_stats->rate += - av1_lgt_cost(cm, x, xd, bs, plane, tx_size, mbmi->use_lgt); - if (!mbmi->use_lgt) - rd_stats->rate += av1_tx_type_cost(cm, x, xd, bs, plane, tx_size, tx_type); -#else - rd_stats->rate += av1_tx_type_cost(cm, x, xd, bs, plane, tx_size, tx_type); -#endif // CONFIG_LGT_FROM_PRED -#endif if (rd_stats->skip) { if (is_inter) { @@ -2363,545 +3087,136 @@ static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, if (tx_select) rd_stats->rate += r_tx_size; - if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] && - !(rd_stats->skip)) + if (is_inter && !xd->lossless[xd->mi[0]->segment_id] && !(rd_stats->skip)) rd = AOMMIN(rd, RDCOST(x->rdmult, s1, rd_stats->sse)); return rd; } -static int skip_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs, - TX_TYPE tx_type, TX_SIZE tx_size) { - const MACROBLOCKD *const xd = &x->e_mbd; - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const TX_SIZE max_tx_size = max_txsize_lookup[bs]; - const int is_inter = is_inter_block(mbmi); - int prune = 0; - if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE) - // passing -1 in for tx_type indicates that all 1D - // transforms should be considered for pruning - prune = prune_tx_types(cpi, bs, x, xd, -1); - -#if CONFIG_MRC_TX - // MRC_DCT only implemented for TX_32X32 so only include this tx in - // the search for TX_32X32 - if (tx_type == MRC_DCT && - ((is_inter && !USE_MRC_INTER) || (!is_inter && !USE_MRC_INTRA) || 
- tx_size != TX_32X32)) - return 1; -#endif // CONFIG_MRC_TX -#if CONFIG_LGT_FROM_PRED - if (mbmi->use_lgt && mbmi->ref_mv_idx > 0) return 1; -#endif // CONFIG_LGT_FROM_PRED - if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) return 1; - if (FIXED_TX_TYPE && tx_type != get_default_tx_type(0, xd, 0, tx_size)) - return 1; - if (!is_inter && x->use_default_intra_tx_type && - tx_type != get_default_tx_type(0, xd, 0, tx_size)) - return 1; - if (is_inter && x->use_default_inter_tx_type && - tx_type != get_default_tx_type(0, xd, 0, tx_size)) - return 1; - if (max_tx_size >= TX_32X32 && tx_size == TX_4X4) return 1; -#if CONFIG_EXT_TX - const AV1_COMMON *const cm = &cpi->common; - const TxSetType tx_set_type = - get_ext_tx_set_type(tx_size, bs, is_inter, cm->reduced_tx_set_used); - if (!av1_ext_tx_used[tx_set_type][tx_type]) return 1; - if (is_inter) { - if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) { - if (!do_tx_type_search(tx_type, prune)) return 1; - } - } else { - if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) { - if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) return 1; - } - } -#else // CONFIG_EXT_TX - if (tx_size >= TX_32X32 && tx_type != DCT_DCT) return 1; - if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE && - !do_tx_type_search(tx_type, prune)) - return 1; -#endif // CONFIG_EXT_TX - return 0; -} - -#if (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT || CONFIG_INTERINTRA) static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs, MACROBLOCK *x, int *r, int64_t *d, int *s, int64_t *sse, int64_t ref_best_rd) { RD_STATS rd_stats; - int64_t rd = txfm_yrd(cpi, x, &rd_stats, ref_best_rd, bs, DCT_DCT, - max_txsize_lookup[bs]); + x->rd_model = LOW_TXFM_RD; + int64_t rd = txfm_yrd(cpi, x, &rd_stats, ref_best_rd, bs, + max_txsize_rect_lookup[bs], FTXS_NONE); + x->rd_model = FULL_TXFM_RD; *r = rd_stats.rate; *d = rd_stats.dist; *s = rd_stats.skip; *sse = rd_stats.sse; return rd; } -#endif // (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT) static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - TX_TYPE tx_type, best_tx_type = DCT_DCT; - int64_t this_rd, best_rd = INT64_MAX; - aom_prob skip_prob = av1_get_skip_prob(cm, xd); - int s0 = av1_cost_bit(skip_prob, 0); - int s1 = av1_cost_bit(skip_prob, 1); + MB_MODE_INFO *const mbmi = xd->mi[0]; const int is_inter = is_inter_block(mbmi); - int prune = 0; - const int plane = 0; -#if CONFIG_LGT_FROM_PRED - int is_lgt_best = 0; - int search_lgt = is_inter - ? 
LGT_FROM_PRED_INTER && !x->use_default_inter_tx_type && - !cpi->sf.tx_type_search.prune_mode > NO_PRUNE - : LGT_FROM_PRED_INTRA && !x->use_default_intra_tx_type && - ALLOW_INTRA_EXT_TX; -#endif // CONFIG_LGT_FROM_PRED - av1_invalid_rd_stats(rd_stats); - - mbmi->tx_size = tx_size_from_tx_mode(bs, cm->tx_mode, is_inter); -#if CONFIG_VAR_TX - mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size); -#endif // CONFIG_VAR_TX -#if CONFIG_EXT_TX - int ext_tx_set = - get_ext_tx_set(mbmi->tx_size, bs, is_inter, cm->reduced_tx_set_used); + mbmi->tx_size = tx_size_from_tx_mode(bs, cm->tx_mode); const TxSetType tx_set_type = - get_ext_tx_set_type(mbmi->tx_size, bs, is_inter, cm->reduced_tx_set_used); -#endif // CONFIG_EXT_TX - - if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE) -#if CONFIG_EXT_TX - prune = prune_tx_types(cpi, bs, x, xd, ext_tx_set); -#else - prune = prune_tx_types(cpi, bs, x, xd, 0); -#endif // CONFIG_EXT_TX -#if CONFIG_EXT_TX - if (get_ext_tx_types(mbmi->tx_size, bs, is_inter, cm->reduced_tx_set_used) > - 1 && - !xd->lossless[mbmi->segment_id]) { -#if CONFIG_PVQ - od_rollback_buffer pre_buf, post_buf; - - od_encode_checkpoint(&x->daala_enc, &pre_buf); - od_encode_checkpoint(&x->daala_enc, &post_buf); -#endif // CONFIG_PVQ - - for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) { - if (!av1_ext_tx_used[tx_set_type][tx_type]) continue; - RD_STATS this_rd_stats; - if (is_inter) { - if (x->use_default_inter_tx_type && - tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size)) - continue; - if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) { - if (!do_tx_type_search(tx_type, prune)) continue; - } - } else { - if (x->use_default_intra_tx_type && - tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size)) - continue; - if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) { - if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) continue; - } - } - - mbmi->tx_type = tx_type; - - txfm_rd_in_plane(x, cpi, &this_rd_stats, ref_best_rd, 0, bs, - mbmi->tx_size, cpi->sf.use_fast_coef_costing); -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &pre_buf); -#endif // CONFIG_PVQ - if (this_rd_stats.rate == INT_MAX) continue; - av1_tx_type_cost(cm, x, xd, bs, plane, mbmi->tx_size, tx_type); - - if (this_rd_stats.skip) - this_rd = RDCOST(x->rdmult, s1, this_rd_stats.sse); - else - this_rd = - RDCOST(x->rdmult, this_rd_stats.rate + s0, this_rd_stats.dist); - if (is_inter_block(mbmi) && !xd->lossless[mbmi->segment_id] && - !this_rd_stats.skip) - this_rd = AOMMIN(this_rd, RDCOST(x->rdmult, s1, this_rd_stats.sse)); - - if (this_rd < best_rd) { - best_rd = this_rd; - best_tx_type = mbmi->tx_type; - *rd_stats = this_rd_stats; -#if CONFIG_PVQ - od_encode_checkpoint(&x->daala_enc, &post_buf); -#endif // CONFIG_PVQ - } - } -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &post_buf); -#endif // CONFIG_PVQ -#if CONFIG_LGT_FROM_PRED - // search LGT - if (search_lgt && is_lgt_allowed(mbmi->mode, mbmi->tx_size) && - !cm->reduced_tx_set_used) { - RD_STATS this_rd_stats; - mbmi->use_lgt = 1; - txfm_rd_in_plane(x, cpi, &this_rd_stats, ref_best_rd, 0, bs, - mbmi->tx_size, cpi->sf.use_fast_coef_costing); - if (this_rd_stats.rate != INT_MAX) { - av1_lgt_cost(cm, x, xd, bs, plane, mbmi->tx_size, 1); - if (this_rd_stats.skip) - this_rd = RDCOST(x->rdmult, s1, this_rd_stats.sse); - else - this_rd = - RDCOST(x->rdmult, this_rd_stats.rate + s0, this_rd_stats.dist); - if (is_inter_block(mbmi) && !xd->lossless[mbmi->segment_id] && - !this_rd_stats.skip) - this_rd = AOMMIN(this_rd, RDCOST(x->rdmult, s1, 
this_rd_stats.sse)); - if (this_rd < best_rd) { - best_rd = this_rd; - is_lgt_best = 1; - *rd_stats = this_rd_stats; - } - } - mbmi->use_lgt = 0; - } -#endif // CONFIG_LGT_FROM_PRED - } else { - mbmi->tx_type = DCT_DCT; - txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size, - cpi->sf.use_fast_coef_costing); - } -#else // CONFIG_EXT_TX - if (mbmi->tx_size < TX_32X32 && !xd->lossless[mbmi->segment_id]) { - for (tx_type = 0; tx_type < TX_TYPES; ++tx_type) { - RD_STATS this_rd_stats; - if (!is_inter && x->use_default_intra_tx_type && - tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size)) - continue; - if (is_inter && x->use_default_inter_tx_type && - tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size)) - continue; - mbmi->tx_type = tx_type; - txfm_rd_in_plane(x, cpi, &this_rd_stats, ref_best_rd, 0, bs, - mbmi->tx_size, cpi->sf.use_fast_coef_costing); - if (this_rd_stats.rate == INT_MAX) continue; - - av1_tx_type_cost(cm, x, xd, bs, plane, mbmi->tx_size, tx_type); - if (is_inter) { - if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE && - !do_tx_type_search(tx_type, prune)) - continue; - } - if (this_rd_stats.skip) - this_rd = RDCOST(x->rdmult, s1, this_rd_stats.sse); - else - this_rd = - RDCOST(x->rdmult, this_rd_stats.rate + s0, this_rd_stats.dist); - if (is_inter && !xd->lossless[mbmi->segment_id] && !this_rd_stats.skip) - this_rd = AOMMIN(this_rd, RDCOST(x->rdmult, s1, this_rd_stats.sse)); - - if (this_rd < best_rd) { - best_rd = this_rd; - best_tx_type = mbmi->tx_type; - *rd_stats = this_rd_stats; - } - } - } else { - mbmi->tx_type = DCT_DCT; - txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size, - cpi->sf.use_fast_coef_costing); - } -#endif // CONFIG_EXT_TX - mbmi->tx_type = best_tx_type; -#if CONFIG_LGT_FROM_PRED - mbmi->use_lgt = is_lgt_best; -#endif // CONFIG_LGT_FROM_PRED + av1_get_ext_tx_set_type(mbmi->tx_size, is_inter, cm->reduced_tx_set_used); + prune_tx(cpi, bs, x, xd, tx_set_type); + txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, AOM_PLANE_Y, bs, + mbmi->tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE); + // Reset the pruning flags. + av1_zero(x->tx_search_prune); + x->tx_split_prune_flag = 0; } static void choose_smallest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs) { MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; mbmi->tx_size = TX_4X4; - mbmi->tx_type = DCT_DCT; -#if CONFIG_VAR_TX - mbmi->min_tx_size = get_min_tx_size(TX_4X4); -#endif // CONFIG_VAR_TX - txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size, - cpi->sf.use_fast_coef_costing); + cpi->sf.use_fast_coef_costing, FTXS_NONE); } -#if CONFIG_TXK_SEL || CONFIG_VAR_TX static INLINE int bsize_to_num_blk(BLOCK_SIZE bsize) { int num_blk = 1 << (num_pels_log2_lookup[bsize] - 2 * tx_size_wide_log2[0]); return num_blk; } -#endif // CONFIG_TXK_SEL || CONFIG_VAR_TX + +static int get_search_init_depth(int mi_width, int mi_height, int is_inter, + const SPEED_FEATURES *sf) { + if (sf->tx_size_search_method == USE_LARGESTALL) return MAX_VARTX_DEPTH; + + if (sf->tx_size_search_lgr_block) { + if (mi_width > mi_size_wide[BLOCK_64X64] || + mi_height > mi_size_high[BLOCK_64X64]) + return MAX_VARTX_DEPTH; + } + + if (is_inter) { + return (mi_height != mi_width) ? sf->inter_tx_size_search_init_depth_rect + : sf->inter_tx_size_search_init_depth_sqr; + } else { + return (mi_height != mi_width) ? 
sf->intra_tx_size_search_init_depth_rect + : sf->intra_tx_size_search_init_depth_sqr; + } +} static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; int64_t rd = INT64_MAX; int n; - int start_tx, end_tx; - int64_t best_rd = INT64_MAX, last_rd = INT64_MAX; - const TX_SIZE max_tx_size = max_txsize_lookup[bs]; - TX_SIZE best_tx_size = max_tx_size; - TX_TYPE best_tx_type = DCT_DCT; -#if CONFIG_LGT_FROM_PRED - int breakout = 0; - int is_lgt_best = 0; - mbmi->use_lgt = 0; -#endif // CONFIG_LGT_FROM_PRED -#if CONFIG_TXK_SEL - TX_TYPE best_txk_type[MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; -#endif // CONFIG_TXK_SEL + int start_tx; + int depth; + int64_t best_rd = INT64_MAX; + const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bs]; + TX_SIZE best_tx_size = max_rect_tx_size; + TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN]; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + const int n4 = bsize_to_num_blk(bs); const int tx_select = cm->tx_mode == TX_MODE_SELECT; - const int is_inter = is_inter_block(mbmi); -#if CONFIG_PVQ - od_rollback_buffer buf; - od_encode_checkpoint(&x->daala_enc, &buf); -#endif // CONFIG_PVQ av1_invalid_rd_stats(rd_stats); -#if CONFIG_EXT_TX && CONFIG_RECT_TX - int evaluate_rect_tx = 0; if (tx_select) { - evaluate_rect_tx = is_rect_tx_allowed(xd, mbmi); + start_tx = max_rect_tx_size; + depth = get_search_init_depth(mi_size_wide[bs], mi_size_high[bs], + is_inter_block(mbmi), &cpi->sf); } else { - const TX_SIZE chosen_tx_size = - tx_size_from_tx_mode(bs, cm->tx_mode, is_inter); - evaluate_rect_tx = is_rect_tx(chosen_tx_size); - assert(IMPLIES(evaluate_rect_tx, is_rect_tx_allowed(xd, mbmi))); - } - if (evaluate_rect_tx) { - TX_TYPE tx_start = DCT_DCT; - TX_TYPE tx_end = TX_TYPES; -#if CONFIG_TXK_SEL - // The tx_type becomes dummy when lv_map is on. 
The tx_type search will be - // performed in av1_search_txk_type() - tx_end = DCT_DCT + 1; -#endif - TX_TYPE tx_type; - for (tx_type = tx_start; tx_type < tx_end; ++tx_type) { - if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) continue; - const TX_SIZE rect_tx_size = max_txsize_rect_lookup[bs]; - RD_STATS this_rd_stats; - const TxSetType tx_set_type = get_ext_tx_set_type( - rect_tx_size, bs, is_inter, cm->reduced_tx_set_used); - if (av1_ext_tx_used[tx_set_type][tx_type]) { - rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, tx_type, - rect_tx_size); - ref_best_rd = AOMMIN(rd, ref_best_rd); - if (rd < best_rd) { -#if CONFIG_TXK_SEL - memcpy(best_txk_type, mbmi->txk_type, sizeof(best_txk_type[0]) * 256); -#endif - best_tx_type = tx_type; - best_tx_size = rect_tx_size; - best_rd = rd; - *rd_stats = this_rd_stats; - } - } -#if CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4 - const int is_inter = is_inter_block(mbmi); - if (mbmi->sb_type < BLOCK_8X8 && is_inter) break; -#endif // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4 - } -#if CONFIG_LGT_FROM_PRED - const TX_SIZE rect_tx_size = max_txsize_rect_lookup[bs]; - if (is_lgt_allowed(mbmi->mode, rect_tx_size) && !cm->reduced_tx_set_used) { - RD_STATS this_rd_stats; - mbmi->use_lgt = 1; - rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, 0, rect_tx_size); - if (rd < best_rd) { - is_lgt_best = 1; - best_tx_size = rect_tx_size; - best_rd = rd; - *rd_stats = this_rd_stats; - } - mbmi->use_lgt = 0; - } -#endif // CONFIG_LGT_FROM_PRED - } - -#if CONFIG_RECT_TX_EXT - // test 1:4/4:1 tx - int evaluate_quarter_tx = 0; - if (is_quarter_tx_allowed(xd, mbmi, is_inter)) { - if (tx_select) { - evaluate_quarter_tx = 1; - } else { - const TX_SIZE chosen_tx_size = - tx_size_from_tx_mode(bs, cm->tx_mode, is_inter); - evaluate_quarter_tx = chosen_tx_size == quarter_txsize_lookup[bs]; - } - } - if (evaluate_quarter_tx) { - TX_TYPE tx_start = DCT_DCT; - TX_TYPE tx_end = TX_TYPES; -#if CONFIG_TXK_SEL - // The tx_type becomes dummy when lv_map is on. 
The tx_type search will be - // performed in av1_search_txk_type() - tx_end = DCT_DCT + 1; -#endif - TX_TYPE tx_type; - for (tx_type = tx_start; tx_type < tx_end; ++tx_type) { - if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) continue; - const TX_SIZE tx_size = quarter_txsize_lookup[bs]; - RD_STATS this_rd_stats; - const TxSetType tx_set_type = - get_ext_tx_set_type(tx_size, bs, is_inter, cm->reduced_tx_set_used); - if (av1_ext_tx_used[tx_set_type][tx_type]) { - rd = - txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, tx_type, tx_size); - if (rd < best_rd) { -#if CONFIG_TXK_SEL - memcpy(best_txk_type, mbmi->txk_type, - sizeof(best_txk_type[0]) * num_blk); -#endif - best_tx_type = tx_type; -#if CONFIG_LGT_FROM_PRED - is_lgt_best = 0; -#endif - best_tx_size = tx_size; - best_rd = rd; - *rd_stats = this_rd_stats; - } - } -#if CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4 - const int is_inter = is_inter_block(mbmi); - if (mbmi->sb_type < BLOCK_8X8 && is_inter) break; -#endif // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4 - } -#if CONFIG_LGT_FROM_PRED - if (is_lgt_allowed(mbmi->mode, tx_size) && !cm->reduced_tx_set_used) { - const TX_SIZE tx_size = quarter_txsize_lookup[bs]; - RD_STATS this_rd_stats; - mbmi->use_lgt = 1; - rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, 0, tx_size); - if (rd < best_rd) { - is_lgt_best = 1; - best_tx_size = tx_size; - best_rd = rd; - *rd_stats = this_rd_stats; - } - mbmi->use_lgt = 0; - } -#endif // CONFIG_LGT_FROM_PRED + const TX_SIZE chosen_tx_size = tx_size_from_tx_mode(bs, cm->tx_mode); + start_tx = chosen_tx_size; + depth = MAX_TX_DEPTH; } -#endif // CONFIG_RECT_TX_EXT -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX - if (tx_select) { - start_tx = max_tx_size; - end_tx = (max_tx_size >= TX_32X32) ? TX_8X8 : TX_4X4; - } else { - const TX_SIZE chosen_tx_size = - tx_size_from_tx_mode(bs, cm->tx_mode, is_inter); - start_tx = chosen_tx_size; - end_tx = chosen_tx_size; - } - - last_rd = INT64_MAX; - for (n = start_tx; n >= end_tx; --n) { -#if CONFIG_EXT_TX && CONFIG_RECT_TX - if (is_rect_tx(n)) break; -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX - TX_TYPE tx_start = DCT_DCT; - TX_TYPE tx_end = TX_TYPES; -#if CONFIG_TXK_SEL - // The tx_type becomes dummy when lv_map is on. The tx_type search will be - // performed in av1_search_txk_type() - tx_end = DCT_DCT + 1; -#endif - TX_TYPE tx_type; - for (tx_type = tx_start; tx_type < tx_end; ++tx_type) { - RD_STATS this_rd_stats; - if (skip_txfm_search(cpi, x, bs, tx_type, n)) continue; - rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, tx_type, n); -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &buf); -#endif // CONFIG_PVQ - // Early termination in transform size search. 
- if (cpi->sf.tx_size_search_breakout && - (rd == INT64_MAX || - (this_rd_stats.skip == 1 && tx_type != DCT_DCT && n < start_tx) || - (n < (int)max_tx_size && rd > last_rd))) { -#if CONFIG_LGT_FROM_PRED - breakout = 1; -#endif - break; - } + prune_tx(cpi, bs, x, xd, EXT_TX_SET_ALL16); - last_rd = rd; - ref_best_rd = AOMMIN(rd, ref_best_rd); - if (rd < best_rd) { -#if CONFIG_TXK_SEL - memcpy(best_txk_type, mbmi->txk_type, sizeof(best_txk_type[0]) * 256); -#endif - best_tx_type = tx_type; -#if CONFIG_LGT_FROM_PRED - is_lgt_best = 0; -#endif - best_tx_size = n; - best_rd = rd; - *rd_stats = this_rd_stats; - } -#if CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4 - const int is_inter = is_inter_block(mbmi); - if (mbmi->sb_type < BLOCK_8X8 && is_inter) break; -#endif // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4 - } -#if CONFIG_LGT_FROM_PRED - mbmi->use_lgt = 1; - if (is_lgt_allowed(mbmi->mode, n) && !skip_txfm_search(cpi, x, bs, 0, n) && - !breakout) { - RD_STATS this_rd_stats; - rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, 0, n); - if (rd < best_rd) { - is_lgt_best = 1; - best_tx_size = n; - best_rd = rd; - *rd_stats = this_rd_stats; - } - } - mbmi->use_lgt = 0; -#endif // CONFIG_LGT_FROM_PRED + for (n = start_tx; depth <= MAX_TX_DEPTH; depth++, n = sub_tx_size_map[n]) { + RD_STATS this_rd_stats; + if (mbmi->ref_mv_idx > 0) x->rd_model = LOW_TXFM_RD; + rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, n, FTXS_NONE); + x->rd_model = FULL_TXFM_RD; + + if (rd < best_rd) { + memcpy(best_txk_type, mbmi->txk_type, + sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN); + memcpy(best_blk_skip, x->blk_skip, sizeof(best_blk_skip[0]) * n4); + best_tx_size = n; + best_rd = rd; + *rd_stats = this_rd_stats; + } + if (n == TX_4X4) break; } mbmi->tx_size = best_tx_size; - mbmi->tx_type = best_tx_type; -#if CONFIG_LGT_FROM_PRED - mbmi->use_lgt = is_lgt_best; - assert(!is_lgt_best || is_lgt_allowed(mbmi->mode, mbmi->tx_size)); -#endif // CONFIG_LGT_FROM_PRED -#if CONFIG_TXK_SEL - memcpy(mbmi->txk_type, best_txk_type, sizeof(best_txk_type[0]) * 256); -#endif - -#if CONFIG_VAR_TX - mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size); -#endif // CONFIG_VAR_TX + memcpy(mbmi->txk_type, best_txk_type, + sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN); + memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4); -#if !CONFIG_EXT_TX - if (mbmi->tx_size >= TX_32X32) assert(mbmi->tx_type == DCT_DCT); -#endif // !CONFIG_EXT_TX -#if CONFIG_PVQ - if (best_rd != INT64_MAX) { - txfm_yrd(cpi, x, rd_stats, ref_best_rd, bs, best_tx_type, best_tx_size); - } -#endif // CONFIG_PVQ + // Reset the pruning flags. + av1_zero(x->tx_search_prune); + x->tx_split_prune_flag = 0; } static void super_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, @@ -2910,9 +3225,9 @@ static void super_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, MACROBLOCKD *xd = &x->e_mbd; av1_init_rd_stats(rd_stats); - assert(bs == xd->mi[0]->mbmi.sb_type); + assert(bs == xd->mi[0]->sb_type); - if (xd->lossless[xd->mi[0]->mbmi.segment_id]) { + if (xd->lossless[xd->mi[0]->segment_id]) { choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs); } else if (cpi->sf.tx_size_search_method == USE_LARGESTALL) { choose_largest_tx_size(cpi, x, rd_stats, ref_best_rd, bs); @@ -2921,18 +3236,117 @@ static void super_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, } } +// Return the rate cost for luma prediction mode info. of intra blocks. 
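+// (Editor's summary, inferred from the function body: on top of mode_cost
+// this adds the palette flag plus palette size/colors/color map, the
+// filter-intra flag and mode, the angle delta for directional modes, and
+// the intrabc flag.)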
+static int intra_mode_info_cost_y(const AV1_COMP *cpi, const MACROBLOCK *x, + const MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, + int mode_cost) { + int total_rate = mode_cost; + const int use_palette = mbmi->palette_mode_info.palette_size[0] > 0; + const int use_filter_intra = mbmi->filter_intra_mode_info.use_filter_intra; + const int use_intrabc = mbmi->use_intrabc; + // Can only activate one mode. + assert(((mbmi->mode != DC_PRED) + use_palette + use_intrabc + + use_filter_intra) <= 1); + const int try_palette = + av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type); + if (try_palette && mbmi->mode == DC_PRED) { + const MACROBLOCKD *xd = &x->e_mbd; + const int bsize_ctx = av1_get_palette_bsize_ctx(bsize); + const int mode_ctx = av1_get_palette_mode_ctx(xd); + total_rate += x->palette_y_mode_cost[bsize_ctx][mode_ctx][use_palette]; + if (use_palette) { + const uint8_t *const color_map = xd->plane[0].color_index_map; + int block_width, block_height, rows, cols; + av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows, + &cols); + const int plt_size = mbmi->palette_mode_info.palette_size[0]; + int palette_mode_cost = + x->palette_y_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] + + write_uniform_cost(plt_size, color_map[0]); + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 0, color_cache); + palette_mode_cost += + av1_palette_color_cost_y(&mbmi->palette_mode_info, color_cache, + n_cache, cpi->common.bit_depth); + palette_mode_cost += + av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP); + total_rate += palette_mode_cost; + } + } + if (av1_filter_intra_allowed(&cpi->common, mbmi)) { + total_rate += x->filter_intra_cost[mbmi->sb_type][use_filter_intra]; + if (use_filter_intra) { + total_rate += x->filter_intra_mode_cost[mbmi->filter_intra_mode_info + .filter_intra_mode]; + } + } + if (av1_is_directional_mode(mbmi->mode)) { + if (av1_use_angle_delta(bsize)) { + total_rate += x->angle_delta_cost[mbmi->mode - V_PRED] + [MAX_ANGLE_DELTA + + mbmi->angle_delta[PLANE_TYPE_Y]]; + } + } + if (av1_allow_intrabc(&cpi->common)) + total_rate += x->intrabc_cost[use_intrabc]; + return total_rate; +} + +// Return the rate cost for chroma prediction mode info. of intra blocks. +static int intra_mode_info_cost_uv(const AV1_COMP *cpi, const MACROBLOCK *x, + const MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, + int mode_cost) { + int total_rate = mode_cost; + const int use_palette = mbmi->palette_mode_info.palette_size[1] > 0; + const UV_PREDICTION_MODE mode = mbmi->uv_mode; + // Can only activate one mode. 
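+  // Editor's note: a non-DC chroma mode, a chroma palette, and intrabc are
+  // mutually exclusive by construction; the assert below double-checks that
+  // at most one of them is active.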
+ assert(((mode != UV_DC_PRED) + use_palette + mbmi->use_intrabc) <= 1); + + const int try_palette = + av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type); + if (try_palette && mode == UV_DC_PRED) { + const PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info; + total_rate += + x->palette_uv_mode_cost[pmi->palette_size[0] > 0][use_palette]; + if (use_palette) { + const int bsize_ctx = av1_get_palette_bsize_ctx(bsize); + const int plt_size = pmi->palette_size[1]; + const MACROBLOCKD *xd = &x->e_mbd; + const uint8_t *const color_map = xd->plane[1].color_index_map; + int palette_mode_cost = + x->palette_uv_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] + + write_uniform_cost(plt_size, color_map[0]); + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 1, color_cache); + palette_mode_cost += av1_palette_color_cost_uv(pmi, color_cache, n_cache, + cpi->common.bit_depth); + palette_mode_cost += + av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP); + total_rate += palette_mode_cost; + } + } + if (av1_is_directional_mode(get_uv_mode(mode))) { + if (av1_use_angle_delta(bsize)) { + total_rate += + x->angle_delta_cost[mode - V_PRED][mbmi->angle_delta[PLANE_TYPE_UV] + + MAX_ANGLE_DELTA]; + } + } + return total_rate; +} + static int conditional_skipintra(PREDICTION_MODE mode, PREDICTION_MODE best_intra_mode) { - if (mode == D117_PRED && best_intra_mode != V_PRED && + if (mode == D113_PRED && best_intra_mode != V_PRED && best_intra_mode != D135_PRED) return 1; - if (mode == D63_PRED && best_intra_mode != V_PRED && + if (mode == D67_PRED && best_intra_mode != V_PRED && best_intra_mode != D45_PRED) return 1; - if (mode == D207_PRED && best_intra_mode != H_PRED && + if (mode == D203_PRED && best_intra_mode != H_PRED && best_intra_mode != D45_PRED) return 1; - if (mode == D153_PRED && best_intra_mode != H_PRED && + if (mode == D157_PRED && best_intra_mode != H_PRED && best_intra_mode != D135_PRED) return 1; return 0; @@ -2943,48 +3357,42 @@ static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int mode_cost) { const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; assert(!is_inter_block(mbmi)); RD_STATS this_rd_stats; int row, col; int64_t temp_sse, this_rd; - const TX_SIZE tx_size = tx_size_from_tx_mode(bsize, cpi->common.tx_mode, 0); + TX_SIZE tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode); const int stepr = tx_size_high_unit[tx_size]; const int stepc = tx_size_wide_unit[tx_size]; const int max_blocks_wide = max_block_wide(xd, bsize, 0); const int max_blocks_high = max_block_high(xd, bsize, 0); mbmi->tx_size = tx_size; // Prediction. - const int step = stepr * stepc; - int block = 0; for (row = 0; row < max_blocks_high; row += stepr) { for (col = 0; col < max_blocks_wide; col += stepc) { - av1_predict_intra_block_facade(cm, xd, 0, block, col, row, tx_size); - block += step; + av1_predict_intra_block_facade(cm, xd, 0, col, row, tx_size); } } // RD estimation. 
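+  // Editor's note (assumption about intent): intra_model_yrd() ranks intra
+  // candidates with the cheap closed-form model_rd_for_sb() estimate rather
+  // than a full transform search; the three trailing NULLs appear to opt out
+  // of per-plane rate/sse/dist outputs like those the DNN variant above
+  // exposes.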
model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &this_rd_stats.rate, - &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse); -#if CONFIG_EXT_INTRA - if (av1_is_directional_mode(mbmi->mode, bsize) && - av1_use_angle_delta(bsize)) { - mode_cost += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, - MAX_ANGLE_DELTA + mbmi->angle_delta[0]); - } -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - if (mbmi->mode == DC_PRED) { - const aom_prob prob = cpi->common.fc->filter_intra_probs[0]; - if (mbmi->filter_intra_mode_info.use_filter_intra_mode[0]) { - const int mode = mbmi->filter_intra_mode_info.filter_intra_mode[0]; - mode_cost += (av1_cost_bit(prob, 1) + - write_uniform_cost(FILTER_INTRA_MODES, mode)); + &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse, NULL, + NULL, NULL); + if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) { + mode_cost += + x->angle_delta_cost[mbmi->mode - V_PRED] + [MAX_ANGLE_DELTA + mbmi->angle_delta[PLANE_TYPE_Y]]; + } + if (mbmi->mode == DC_PRED && + av1_filter_intra_allowed_bsize(cm, mbmi->sb_type)) { + if (mbmi->filter_intra_mode_info.use_filter_intra) { + const int mode = mbmi->filter_intra_mode_info.filter_intra_mode; + mode_cost += x->filter_intra_cost[mbmi->sb_type][1] + + x->filter_intra_mode_cost[mode]; } else { - mode_cost += av1_cost_bit(prob, 0); + mode_cost += x->filter_intra_cost[mbmi->sb_type][0]; } } -#endif // CONFIG_FILTER_INTRA this_rd = RDCOST(x->rdmult, this_rd_stats.rate + mode_cost, this_rd_stats.dist); return this_rd; @@ -3014,42 +3422,99 @@ static void extend_palette_color_map(uint8_t *const color_map, int orig_width, } } -#if CONFIG_PALETTE_DELTA_ENCODING // Bias toward using colors in the cache. // TODO(huisu): Try other schemes to improve compression. static void optimize_palette_colors(uint16_t *color_cache, int n_cache, - int n_colors, int stride, - float *centroids) { + int n_colors, int stride, int *centroids) { if (n_cache <= 0) return; for (int i = 0; i < n_colors * stride; i += stride) { - float min_diff = fabsf(centroids[i] - color_cache[0]); + int min_diff = abs(centroids[i] - (int)color_cache[0]); int idx = 0; for (int j = 1; j < n_cache; ++j) { - float this_diff = fabsf(centroids[i] - color_cache[j]); + const int this_diff = abs(centroids[i] - color_cache[j]); if (this_diff < min_diff) { min_diff = this_diff; idx = j; } } - if (min_diff < 1.5) centroids[i] = color_cache[idx]; + if (min_diff <= 1) centroids[i] = color_cache[idx]; } } -#endif // CONFIG_PALETTE_DELTA_ENCODING -static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, int palette_ctx, - int dc_mode_cost, MB_MODE_INFO *best_mbmi, - uint8_t *best_palette_color_map, - int64_t *best_rd, int64_t *best_model_rd, - int *rate, int *rate_tokenonly, - int64_t *distortion, int *skippable) { +// Given the base colors as specified in centroids[], calculate the RD cost +// of palette mode. +static void palette_rd_y( + const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi, + BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *centroids, int n, + uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi, + uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd, + int *rate, int *rate_tokenonly, int *rate_overhead, int64_t *distortion, + int *skippable, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip) { + optimize_palette_colors(color_cache, n_cache, n, 1, centroids); + int k = av1_remove_duplicates(centroids, n); + if (k < PALETTE_MIN_SIZE) { + // Too few unique colors to create a palette. 
And DC_PRED will work + // well for that case anyway. So skip. + return; + } + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + if (cpi->common.use_highbitdepth) + for (int i = 0; i < k; ++i) + pmi->palette_colors[i] = + clip_pixel_highbd((int)centroids[i], cpi->common.bit_depth); + else + for (int i = 0; i < k; ++i) + pmi->palette_colors[i] = clip_pixel(centroids[i]); + pmi->palette_size[0] = k; + MACROBLOCKD *const xd = &x->e_mbd; + uint8_t *const color_map = xd->plane[0].color_index_map; + int block_width, block_height, rows, cols; + av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows, + &cols); + av1_calc_indices(data, centroids, color_map, rows * cols, k, 1); + extend_palette_color_map(color_map, cols, rows, block_width, block_height); + const int palette_mode_cost = + intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost); + int64_t this_model_rd = intra_model_yrd(cpi, x, bsize, palette_mode_cost); + if (*best_model_rd != INT64_MAX && + this_model_rd > *best_model_rd + (*best_model_rd >> 1)) + return; + if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd; + RD_STATS tokenonly_rd_stats; + super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd); + if (tokenonly_rd_stats.rate == INT_MAX) return; + int this_rate = tokenonly_rd_stats.rate + palette_mode_cost; + int64_t this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); + if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->sb_type)) { + tokenonly_rd_stats.rate -= + tx_size_cost(&cpi->common, x, bsize, mbmi->tx_size); + } + if (this_rd < *best_rd) { + *best_rd = this_rd; + memcpy(best_palette_color_map, color_map, + block_width * block_height * sizeof(color_map[0])); + *best_mbmi = *mbmi; + memcpy(blk_skip, x->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + *rate_overhead = this_rate - tokenonly_rd_stats.rate; + if (rate) *rate = this_rate; + if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate; + if (distortion) *distortion = tokenonly_rd_stats.dist; + if (skippable) *skippable = tokenonly_rd_stats.skip; + } +} + +static int rd_pick_palette_intra_sby( + const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, + int64_t *best_rd, int64_t *best_model_rd, int *rate, int *rate_tokenonly, + int64_t *distortion, int *skippable, PICK_MODE_CONTEXT *ctx, + uint8_t *best_blk_skip) { int rate_overhead = 0; MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *const mic = xd->mi[0]; - MB_MODE_INFO *const mbmi = &mic->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; assert(!is_inter_block(mbmi)); - assert(bsize >= BLOCK_8X8); - int this_rate, colors, n; + assert(av1_allow_palette(cpi->common.allow_screen_content_tools, bsize)); + int colors, n; const int src_stride = x->plane[0].src.stride; const uint8_t *const src = x->plane[0].src.buf; uint8_t *const color_map = xd->plane[0].color_index_map; @@ -3057,37 +3522,26 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows, &cols); - assert(cpi->common.allow_screen_content_tools); - -#if CONFIG_HIGHBITDEPTH + int count_buf[1 << 12]; // Maximum (1 << 12) color levels. 
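For reference, the count_buf histogram introduced above serves two purposes: av1_count_colors()/av1_count_colors_highbd() fill it while counting distinct sample values, and the dominant-color scan below repeatedly extracts its argmax. A minimal sketch of the 8-bit histogram count (the helper name here is illustrative, not the aom API):

#include <stdint.h>
#include <string.h>

static int count_colors_sketch(const uint8_t *src, int stride, int rows,
                               int cols, int *count_buf /* >= 256 entries */) {
  // Build a histogram of sample values, then count the non-empty bins.
  memset(count_buf, 0, 256 * sizeof(*count_buf));
  for (int r = 0; r < rows; ++r)
    for (int c = 0; c < cols; ++c) ++count_buf[src[r * stride + c]];
  int n = 0;
  for (int i = 0; i < 256; ++i) n += (count_buf[i] > 0);
  return n;  // Histogram is left in count_buf for later reuse.
}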
if (cpi->common.use_highbitdepth) colors = av1_count_colors_highbd(src, src_stride, rows, cols, - cpi->common.bit_depth); + cpi->common.bit_depth, count_buf); else -#endif // CONFIG_HIGHBITDEPTH - colors = av1_count_colors(src, src_stride, rows, cols); -#if CONFIG_FILTER_INTRA - mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; -#endif // CONFIG_FILTER_INTRA + colors = av1_count_colors(src, src_stride, rows, cols, count_buf); + mbmi->filter_intra_mode_info.use_filter_intra = 0; if (colors > 1 && colors <= 64) { - int r, c, i, k, palette_mode_cost; + int r, c, i; const int max_itr = 50; - float *const data = x->palette_buffer->kmeans_data_buf; - float centroids[PALETTE_MAX_SIZE]; - float lb, ub, val; - RD_STATS tokenonly_rd_stats; - int64_t this_rd, this_model_rd; - PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; -#if CONFIG_HIGHBITDEPTH + int *const data = x->palette_buffer->kmeans_data_buf; + int centroids[PALETTE_MAX_SIZE]; + int lb, ub, val; uint16_t *src16 = CONVERT_TO_SHORTPTR(src); if (cpi->common.use_highbitdepth) lb = ub = src16[0]; else -#endif // CONFIG_HIGHBITDEPTH lb = ub = src[0]; -#if CONFIG_HIGHBITDEPTH if (cpi->common.use_highbitdepth) { for (r = 0; r < rows; ++r) { for (c = 0; c < cols; ++c) { @@ -3100,7 +3554,6 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, } } } else { -#endif // CONFIG_HIGHBITDEPTH for (r = 0; r < rows; ++r) { for (c = 0; c < cols; ++c) { val = src[r * src_stride + c]; @@ -3111,99 +3564,57 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, ub = val; } } -#if CONFIG_HIGHBITDEPTH } -#endif // CONFIG_HIGHBITDEPTH mbmi->mode = DC_PRED; -#if CONFIG_FILTER_INTRA - mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; -#endif // CONFIG_FILTER_INTRA - - if (rows * cols > PALETTE_MAX_BLOCK_SIZE) return 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; -#if CONFIG_PALETTE_DELTA_ENCODING uint16_t color_cache[2 * PALETTE_MAX_SIZE]; const int n_cache = av1_get_palette_cache(xd, 0, color_cache); -#endif // CONFIG_PALETTE_DELTA_ENCODING - for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2; - --n) { + // Find the dominant colors, stored in top_colors[]. + int top_colors[PALETTE_MAX_SIZE] = { 0 }; + for (i = 0; i < AOMMIN(colors, PALETTE_MAX_SIZE); ++i) { + int max_count = 0; + for (int j = 0; j < (1 << cpi->common.bit_depth); ++j) { + if (count_buf[j] > max_count) { + max_count = count_buf[j]; + top_colors[i] = j; + } + } + assert(max_count > 0); + count_buf[top_colors[i]] = 0; + } + + // Try the dominant colors directly. + // TODO(huisu@google.com): Try to avoid duplicate computation in cases + // where the dominant colors and the k-means results are similar. + for (n = AOMMIN(colors, PALETTE_MAX_SIZE); n >= 2; --n) { + for (i = 0; i < n; ++i) centroids[i] = top_colors[i]; + palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n, + color_cache, n_cache, best_mbmi, best_palette_color_map, + best_rd, best_model_rd, rate, rate_tokenonly, &rate_overhead, + distortion, skippable, ctx, best_blk_skip); + } + + // K-means clustering. + for (n = AOMMIN(colors, PALETTE_MAX_SIZE); n >= 2; --n) { if (colors == PALETTE_MIN_SIZE) { // Special case: These colors automatically become the centroids. 
assert(colors == n); assert(colors == 2); centroids[0] = lb; centroids[1] = ub; - k = 2; } else { for (i = 0; i < n; ++i) { centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2; } av1_k_means(data, centroids, color_map, rows * cols, n, 1, max_itr); -#if CONFIG_PALETTE_DELTA_ENCODING - optimize_palette_colors(color_cache, n_cache, n, 1, centroids); -#endif // CONFIG_PALETTE_DELTA_ENCODING - k = av1_remove_duplicates(centroids, n); - if (k < PALETTE_MIN_SIZE) { - // Too few unique colors to create a palette. And DC_PRED will work - // well for that case anyway. So skip. - continue; - } - } - -#if CONFIG_HIGHBITDEPTH - if (cpi->common.use_highbitdepth) - for (i = 0; i < k; ++i) - pmi->palette_colors[i] = - clip_pixel_highbd((int)centroids[i], cpi->common.bit_depth); - else -#endif // CONFIG_HIGHBITDEPTH - for (i = 0; i < k; ++i) - pmi->palette_colors[i] = clip_pixel((int)centroids[i]); - pmi->palette_size[0] = k; - - av1_calc_indices(data, centroids, color_map, rows * cols, k, 1); - extend_palette_color_map(color_map, cols, rows, block_width, - block_height); - palette_mode_cost = - dc_mode_cost + - x->palette_y_size_cost[bsize - BLOCK_8X8][k - PALETTE_MIN_SIZE] + - write_uniform_cost(k, color_map[0]) + - av1_cost_bit( - av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], - 1); - palette_mode_cost += av1_palette_color_cost_y(pmi, -#if CONFIG_PALETTE_DELTA_ENCODING - color_cache, n_cache, -#endif // CONFIG_PALETTE_DELTA_ENCODING - cpi->common.bit_depth); - palette_mode_cost += - av1_cost_color_map(x, 0, 0, bsize, mbmi->tx_size, PALETTE_MAP); - this_model_rd = intra_model_yrd(cpi, x, bsize, palette_mode_cost); - if (*best_model_rd != INT64_MAX && - this_model_rd > *best_model_rd + (*best_model_rd >> 1)) - continue; - if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd; - super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd); - if (tokenonly_rd_stats.rate == INT_MAX) continue; - this_rate = tokenonly_rd_stats.rate + palette_mode_cost; - this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); - if (!xd->lossless[mbmi->segment_id] && - block_signals_txsize(mbmi->sb_type)) { - tokenonly_rd_stats.rate -= tx_size_cost(cpi, x, bsize, mbmi->tx_size); - } - if (this_rd < *best_rd) { - *best_rd = this_rd; - memcpy(best_palette_color_map, color_map, - block_width * block_height * sizeof(color_map[0])); - *best_mbmi = *mbmi; - rate_overhead = this_rate - tokenonly_rd_stats.rate; - if (rate) *rate = this_rate; - if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate; - if (distortion) *distortion = tokenonly_rd_stats.dist; - if (skippable) *skippable = tokenonly_rd_stats.skip; } + palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n, + color_cache, n_cache, best_mbmi, best_palette_color_map, + best_rd, best_model_rd, rate, rate_tokenonly, &rate_overhead, + distortion, skippable, ctx, best_blk_skip); } } @@ -3215,663 +3626,30 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, return rate_overhead; } -static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( - const AV1_COMP *const cpi, MACROBLOCK *x, int row, int col, - PREDICTION_MODE *best_mode, const int *bmode_costs, ENTROPY_CONTEXT *a, - ENTROPY_CONTEXT *l, int *bestrate, int *bestratey, int64_t *bestdistortion, - BLOCK_SIZE bsize, TX_SIZE tx_size, int *y_skip, int64_t rd_thresh) { - const AV1_COMMON *const cm = &cpi->common; - PREDICTION_MODE mode; - MACROBLOCKD *const xd = &x->e_mbd; - assert(!is_inter_block(&xd->mi[0]->mbmi)); - int64_t best_rd = rd_thresh; - 
struct macroblock_plane *p = &x->plane[0]; - struct macroblockd_plane *pd = &xd->plane[0]; - const int src_stride = p->src.stride; - const int dst_stride = pd->dst.stride; - const uint8_t *src_init = &p->src.buf[row * 4 * src_stride + col * 4]; - uint8_t *dst_init = &pd->dst.buf[row * 4 * dst_stride + col * 4]; -#if CONFIG_CHROMA_2X2 - // TODO(jingning): This is a temporal change. The whole function should be - // out when cb4x4 is enabled. - ENTROPY_CONTEXT ta[4], tempa[4]; - ENTROPY_CONTEXT tl[4], templ[4]; -#else - ENTROPY_CONTEXT ta[2], tempa[2]; - ENTROPY_CONTEXT tl[2], templ[2]; -#endif // CONFIG_CHROMA_2X2 - - const int pred_width_in_4x4_blocks = num_4x4_blocks_wide_lookup[bsize]; - const int pred_height_in_4x4_blocks = num_4x4_blocks_high_lookup[bsize]; - const int tx_width_unit = tx_size_wide_unit[tx_size]; - const int tx_height_unit = tx_size_high_unit[tx_size]; - const int pred_block_width = block_size_wide[bsize]; - const int pred_block_height = block_size_high[bsize]; - const int tx_width = tx_size_wide[tx_size]; - const int tx_height = tx_size_high[tx_size]; - const int pred_width_in_transform_blocks = pred_block_width / tx_width; - const int pred_height_in_transform_blocks = pred_block_height / tx_height; - int idx, idy; - int best_can_skip = 0; - uint8_t best_dst[8 * 8]; -#if CONFIG_HIGHBITDEPTH - uint16_t best_dst16[8 * 8]; -#endif // CONFIG_HIGHBITDEPTH - const int is_lossless = xd->lossless[xd->mi[0]->mbmi.segment_id]; -#if CONFIG_EXT_TX && CONFIG_RECT_TX - const int sub_bsize = bsize; -#else - const int sub_bsize = BLOCK_4X4; -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX - -#if CONFIG_PVQ - od_rollback_buffer pre_buf, post_buf; - od_encode_checkpoint(&x->daala_enc, &pre_buf); - od_encode_checkpoint(&x->daala_enc, &post_buf); -#endif // CONFIG_PVQ - - assert(bsize < BLOCK_8X8); - assert(tx_width < 8 || tx_height < 8); -#if CONFIG_EXT_TX && CONFIG_RECT_TX - if (is_lossless) - assert(tx_width == 4 && tx_height == 4); - else - assert(tx_width == pred_block_width && tx_height == pred_block_height); -#else - assert(tx_width == 4 && tx_height == 4); -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX - - memcpy(ta, a, pred_width_in_transform_blocks * sizeof(a[0])); - memcpy(tl, l, pred_height_in_transform_blocks * sizeof(l[0])); - - xd->mi[0]->mbmi.tx_size = tx_size; - - xd->mi[0]->mbmi.palette_mode_info.palette_size[0] = 0; - -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { -#if CONFIG_PVQ - od_encode_checkpoint(&x->daala_enc, &pre_buf); -#endif - for (mode = DC_PRED; mode <= TM_PRED; ++mode) { - int64_t this_rd; - int ratey = 0; - int64_t distortion = 0; - int rate = bmode_costs[mode]; - int can_skip = 1; - - if (!(cpi->sf.intra_y_mode_mask[txsize_sqr_up_map[tx_size]] & - (1 << mode))) - continue; - - // Only do the oblique modes if the best so far is - // one of the neighboring directional modes - if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { - if (conditional_skipintra(mode, *best_mode)) continue; - } - - memcpy(tempa, ta, pred_width_in_transform_blocks * sizeof(ta[0])); - memcpy(templ, tl, pred_height_in_transform_blocks * sizeof(tl[0])); - - for (idy = 0; idy < pred_height_in_transform_blocks; ++idy) { - for (idx = 0; idx < pred_width_in_transform_blocks; ++idx) { - const int block_raster_idx = (row + idy) * 2 + (col + idx); - const int block = - av1_raster_order_to_block_index(tx_size, block_raster_idx); - const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride]; - uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * 
dst_stride]; -#if !CONFIG_PVQ - int16_t *const src_diff = av1_raster_block_offset_int16( - BLOCK_8X8, block_raster_idx, p->src_diff); -#endif - int skip; - assert(block < 4); - assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4, - idx == 0 && idy == 0)); - assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4, - block == 0 || block == 2)); - xd->mi[0]->bmi[block_raster_idx].as_mode = mode; - av1_predict_intra_block( - cm, xd, pd->width, pd->height, txsize_to_bsize[tx_size], mode, - dst, dst_stride, dst, dst_stride, col + idx, row + idy, 0); -#if !CONFIG_PVQ - aom_highbd_subtract_block(tx_height, tx_width, src_diff, 8, src, - src_stride, dst, dst_stride, xd->bd); -#endif - if (is_lossless) { - TX_TYPE tx_type = - av1_get_tx_type(PLANE_TYPE_Y, xd, 0, 0, block, tx_size); - const SCAN_ORDER *scan_order = - get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi); - const int coeff_ctx = - combine_entropy_contexts(tempa[idx], templ[idy]); -#if !CONFIG_PVQ - av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8, - tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); - ratey += av1_cost_coeffs(cpi, x, 0, 0, 0, block, tx_size, - scan_order, tempa + idx, templ + idy, - cpi->sf.use_fast_coef_costing); - skip = (p->eobs[block] == 0); - can_skip &= skip; - tempa[idx] = !skip; - templ[idy] = !skip; -#if CONFIG_EXT_TX - if (tx_size == TX_8X4) { - tempa[idx + 1] = tempa[idx]; - } else if (tx_size == TX_4X8) { - templ[idy + 1] = templ[idy]; - } -#endif // CONFIG_EXT_TX -#else - (void)scan_order; - - av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8, - tx_size, coeff_ctx, AV1_XFORM_QUANT_B); - - ratey += x->rate; - skip = x->pvq_skip[0]; - tempa[idx] = !skip; - templ[idy] = !skip; - can_skip &= skip; -#endif - if (RDCOST(x->rdmult, ratey, distortion) >= best_rd) - goto next_highbd; -#if CONFIG_PVQ - if (!skip) -#endif - av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block), -#if CONFIG_LGT_FROM_PRED - mode, -#endif -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - BLOCK_OFFSET(xd->mrc_mask, block), -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - DCT_DCT, tx_size, dst, dst_stride, - p->eobs[block]); - } else { - int64_t dist; - unsigned int tmp; - TX_TYPE tx_type = - av1_get_tx_type(PLANE_TYPE_Y, xd, 0, 0, block, tx_size); - const SCAN_ORDER *scan_order = - get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi); - const int coeff_ctx = - combine_entropy_contexts(tempa[idx], templ[idy]); -#if !CONFIG_PVQ -#if DISABLE_TRELLISQ_SEARCH - av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8, - tx_size, coeff_ctx, AV1_XFORM_QUANT_B); -#else - av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8, - tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); - av1_optimize_b(cm, x, 0, 0, 0, block, BLOCK_8X8, tx_size, - tempa + idx, templ + idy, 1); -#endif // DISABLE_TRELLISQ_SEARCH - ratey += av1_cost_coeffs(cpi, x, 0, 0, 0, block, tx_size, - scan_order, tempa + idx, templ + idy, - cpi->sf.use_fast_coef_costing); - skip = (p->eobs[block] == 0); - can_skip &= skip; - tempa[idx] = !skip; - templ[idy] = !skip; -#if CONFIG_EXT_TX - if (tx_size == TX_8X4) { - tempa[idx + 1] = tempa[idx]; - } else if (tx_size == TX_4X8) { - templ[idy + 1] = templ[idy]; - } -#endif // CONFIG_EXT_TX -#else - (void)scan_order; - - av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8, - tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); - ratey += x->rate; - skip = x->pvq_skip[0]; - tempa[idx] = !skip; - templ[idy] = !skip; - can_skip &= skip; -#endif -#if CONFIG_PVQ - if (!skip) -#endif - av1_inverse_transform_block(xd, 
BLOCK_OFFSET(pd->dqcoeff, block), -#if CONFIG_LGT_FROM_PRED - mode, -#endif -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - BLOCK_OFFSET(xd->mrc_mask, block), -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - tx_type, tx_size, dst, dst_stride, - p->eobs[block]); - cpi->fn_ptr[sub_bsize].vf(src, src_stride, dst, dst_stride, &tmp); - dist = (int64_t)tmp << 4; - distortion += dist; - if (RDCOST(x->rdmult, ratey, distortion) >= best_rd) - goto next_highbd; - } - } - } - - rate += ratey; - this_rd = RDCOST(x->rdmult, rate, distortion); - - if (this_rd < best_rd) { - *bestrate = rate; - *bestratey = ratey; - *bestdistortion = distortion; - best_rd = this_rd; - best_can_skip = can_skip; - *best_mode = mode; - memcpy(a, tempa, pred_width_in_transform_blocks * sizeof(tempa[0])); - memcpy(l, templ, pred_height_in_transform_blocks * sizeof(templ[0])); -#if CONFIG_PVQ - od_encode_checkpoint(&x->daala_enc, &post_buf); -#endif - for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy) { - memcpy(best_dst16 + idy * 8, - CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride), - pred_width_in_transform_blocks * 4 * sizeof(uint16_t)); - } - } - next_highbd : {} -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &pre_buf); -#endif - } - - if (best_rd >= rd_thresh) return best_rd; - -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &post_buf); -#endif - - if (y_skip) *y_skip &= best_can_skip; - - for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy) { - memcpy(CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride), - best_dst16 + idy * 8, - pred_width_in_transform_blocks * 4 * sizeof(uint16_t)); - } - - return best_rd; - } -#endif // CONFIG_HIGHBITDEPTH - -#if CONFIG_PVQ - od_encode_checkpoint(&x->daala_enc, &pre_buf); -#endif // CONFIG_PVQ - - for (mode = DC_PRED; mode <= TM_PRED; ++mode) { - int64_t this_rd; - int ratey = 0; - int64_t distortion = 0; - int rate = bmode_costs[mode]; - int can_skip = 1; - - if (!(cpi->sf.intra_y_mode_mask[txsize_sqr_up_map[tx_size]] & - (1 << mode))) { - continue; - } - - // Only do the oblique modes if the best so far is - // one of the neighboring directional modes - if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { - if (conditional_skipintra(mode, *best_mode)) continue; - } - - memcpy(tempa, ta, pred_width_in_transform_blocks * sizeof(ta[0])); - memcpy(templ, tl, pred_height_in_transform_blocks * sizeof(tl[0])); - - for (idy = 0; idy < pred_height_in_4x4_blocks; idy += tx_height_unit) { - for (idx = 0; idx < pred_width_in_4x4_blocks; idx += tx_width_unit) { - const int block_raster_idx = (row + idy) * 2 + (col + idx); - int block = av1_raster_order_to_block_index(tx_size, block_raster_idx); - const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride]; - uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride]; -#if !CONFIG_PVQ - int16_t *const src_diff = av1_raster_block_offset_int16( - BLOCK_8X8, block_raster_idx, p->src_diff); -#endif // !CONFIG_PVQ - int skip; - assert(block < 4); - assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4, - idx == 0 && idy == 0)); - assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4, - block == 0 || block == 2)); - xd->mi[0]->bmi[block_raster_idx].as_mode = mode; - av1_predict_intra_block(cm, xd, pd->width, pd->height, - txsize_to_bsize[tx_size], mode, dst, dst_stride, - dst, dst_stride, -#if CONFIG_CB4X4 - 2 * (col + idx), 2 * (row + idy), -#else - col + idx, row + idy, -#endif // CONFIG_CB4X4 - 0); -#if !CONFIG_PVQ - aom_subtract_block(tx_height, tx_width, src_diff, 8, src, src_stride, - dst, 
dst_stride); -#endif // !CONFIG_PVQ - TX_TYPE tx_type = - av1_get_tx_type(PLANE_TYPE_Y, xd, 0, 0, block, tx_size); - const SCAN_ORDER *scan_order = - get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi); - const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); -#if CONFIG_CB4X4 - block = 4 * block; -#endif // CONFIG_CB4X4 -#if !CONFIG_PVQ -#if DISABLE_TRELLISQ_SEARCH - av1_xform_quant(cm, x, 0, block, -#if CONFIG_CB4X4 - 2 * (row + idy), 2 * (col + idx), -#else - row + idy, col + idx, -#endif // CONFIG_CB4X4 - BLOCK_8X8, tx_size, coeff_ctx, AV1_XFORM_QUANT_B); -#else - const AV1_XFORM_QUANT xform_quant = - is_lossless ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP; - av1_xform_quant(cm, x, 0, block, -#if CONFIG_CB4X4 - 2 * (row + idy), 2 * (col + idx), -#else - row + idy, col + idx, -#endif // CONFIG_CB4X4 - BLOCK_8X8, tx_size, coeff_ctx, xform_quant); - - av1_optimize_b(cm, x, 0, 0, 0, block, BLOCK_8X8, tx_size, tempa + idx, - templ + idy, 1); -#endif // DISABLE_TRELLISQ_SEARCH - ratey += av1_cost_coeffs(cpi, x, 0, 0, 0, block, tx_size, scan_order, - tempa + idx, templ + idy, - cpi->sf.use_fast_coef_costing); - skip = (p->eobs[block] == 0); - can_skip &= skip; - tempa[idx] = !skip; - templ[idy] = !skip; -#if CONFIG_EXT_TX - if (tx_size == TX_8X4) { - tempa[idx + 1] = tempa[idx]; - } else if (tx_size == TX_4X8) { - templ[idy + 1] = templ[idy]; - } -#endif // CONFIG_EXT_TX -#else - (void)scan_order; - - av1_xform_quant(cm, x, 0, block, -#if CONFIG_CB4X4 - 2 * (row + idy), 2 * (col + idx), -#else - row + idy, col + idx, -#endif // CONFIG_CB4X4 - BLOCK_8X8, tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); - - ratey += x->rate; - skip = x->pvq_skip[0]; - tempa[idx] = !skip; - templ[idy] = !skip; - can_skip &= skip; -#endif // !CONFIG_PVQ - - if (!is_lossless) { // To use the pixel domain distortion, we need to - // calculate inverse txfm *before* calculating RD - // cost. Compared to calculating the distortion in - // the frequency domain, the overhead of encoding - // effort is low. -#if CONFIG_PVQ - if (!skip) -#endif // CONFIG_PVQ - av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block), -#if CONFIG_LGT_FROM_PRED - mode, -#endif -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - BLOCK_OFFSET(xd->mrc_mask, block), -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - tx_type, tx_size, dst, dst_stride, - p->eobs[block]); - unsigned int tmp; - cpi->fn_ptr[sub_bsize].vf(src, src_stride, dst, dst_stride, &tmp); - const int64_t dist = (int64_t)tmp << 4; - distortion += dist; - } - - if (RDCOST(x->rdmult, ratey, distortion) >= best_rd) goto next; - - if (is_lossless) { // Calculate inverse txfm *after* RD cost. 
-#if CONFIG_PVQ - if (!skip) -#endif // CONFIG_PVQ - av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block), -#if CONFIG_LGT_FROM_PRED - mode, -#endif -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - BLOCK_OFFSET(xd->mrc_mask, block), -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - DCT_DCT, tx_size, dst, dst_stride, - p->eobs[block]); - } - } - } - - rate += ratey; - this_rd = RDCOST(x->rdmult, rate, distortion); - - if (this_rd < best_rd) { - *bestrate = rate; - *bestratey = ratey; - *bestdistortion = distortion; - best_rd = this_rd; - best_can_skip = can_skip; - *best_mode = mode; - memcpy(a, tempa, pred_width_in_transform_blocks * sizeof(tempa[0])); - memcpy(l, templ, pred_height_in_transform_blocks * sizeof(templ[0])); -#if CONFIG_PVQ - od_encode_checkpoint(&x->daala_enc, &post_buf); -#endif // CONFIG_PVQ - for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy) - memcpy(best_dst + idy * 8, dst_init + idy * dst_stride, - pred_width_in_transform_blocks * 4); - } - next : {} -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &pre_buf); -#endif // CONFIG_PVQ - } // mode decision loop - - if (best_rd >= rd_thresh) return best_rd; - -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &post_buf); -#endif // CONFIG_PVQ - - if (y_skip) *y_skip &= best_can_skip; - - for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy) - memcpy(dst_init + idy * dst_stride, best_dst + idy * 8, - pred_width_in_transform_blocks * 4); - - return best_rd; -} - -static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi, - MACROBLOCK *mb, int *rate, - int *rate_y, int64_t *distortion, - int *y_skip, int64_t best_rd) { - const MACROBLOCKD *const xd = &mb->e_mbd; - MODE_INFO *const mic = xd->mi[0]; - const MODE_INFO *above_mi = xd->above_mi; - const MODE_INFO *left_mi = xd->left_mi; - MB_MODE_INFO *const mbmi = &mic->mbmi; - assert(!is_inter_block(mbmi)); - const BLOCK_SIZE bsize = mbmi->sb_type; - const int pred_width_in_4x4_blocks = num_4x4_blocks_wide_lookup[bsize]; - const int pred_height_in_4x4_blocks = num_4x4_blocks_high_lookup[bsize]; - int idx, idy; - int cost = 0; - int64_t total_distortion = 0; - int tot_rate_y = 0; - int64_t total_rd = 0; - const int *bmode_costs = mb->mbmode_cost[0]; - const int is_lossless = xd->lossless[mbmi->segment_id]; -#if CONFIG_EXT_TX && CONFIG_RECT_TX - const TX_SIZE tx_size = is_lossless ? TX_4X4 : max_txsize_rect_lookup[bsize]; -#else - const TX_SIZE tx_size = TX_4X4; -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX - -#if CONFIG_EXT_INTRA -#if CONFIG_INTRA_INTERP - mbmi->intra_filter = INTRA_FILTER_LINEAR; -#endif // CONFIG_INTRA_INTERP -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; -#endif // CONFIG_FILTER_INTRA - - // TODO(any): Add search of the tx_type to improve rd performance at the - // expense of speed. - mbmi->tx_type = DCT_DCT; - mbmi->tx_size = tx_size; -#if CONFIG_LGT_FROM_PRED - mbmi->use_lgt = 0; -#endif - - if (y_skip) *y_skip = 1; - - // Pick modes for each prediction sub-block (of size 4x4, 4x8, or 8x4) in this - // 8x8 coding block. 
- for (idy = 0; idy < 2; idy += pred_height_in_4x4_blocks) { - for (idx = 0; idx < 2; idx += pred_width_in_4x4_blocks) { - PREDICTION_MODE best_mode = DC_PRED; - int r = INT_MAX, ry = INT_MAX; - int64_t d = INT64_MAX, this_rd = INT64_MAX; - int j; - const int pred_block_idx = idy * 2 + idx; - if (cpi->common.frame_type == KEY_FRAME) { - const PREDICTION_MODE A = - av1_above_block_mode(mic, above_mi, pred_block_idx); - const PREDICTION_MODE L = - av1_left_block_mode(mic, left_mi, pred_block_idx); - -#if CONFIG_KF_CTX - const int above_ctx = intra_mode_context[A]; - const int left_ctx = intra_mode_context[L]; - bmode_costs = mb->y_mode_costs[above_ctx][left_ctx]; -#else - bmode_costs = mb->y_mode_costs[A][L]; -#endif - } - this_rd = rd_pick_intra_sub_8x8_y_subblock_mode( - cpi, mb, idy, idx, &best_mode, bmode_costs, - xd->plane[0].above_context + idx, xd->plane[0].left_context + idy, &r, - &ry, &d, bsize, tx_size, y_skip, best_rd - total_rd); -#if CONFIG_DIST_8X8 - if (!cpi->oxcf.using_dist_8x8) -#endif - if (this_rd >= best_rd - total_rd) return INT64_MAX; - - total_rd += this_rd; - cost += r; - total_distortion += d; - tot_rate_y += ry; - - mic->bmi[pred_block_idx].as_mode = best_mode; - for (j = 1; j < pred_height_in_4x4_blocks; ++j) - mic->bmi[pred_block_idx + j * 2].as_mode = best_mode; - for (j = 1; j < pred_width_in_4x4_blocks; ++j) - mic->bmi[pred_block_idx + j].as_mode = best_mode; - - if (total_rd >= best_rd) return INT64_MAX; - } - } - mbmi->mode = mic->bmi[3].as_mode; - -#if CONFIG_DIST_8X8 - if (cpi->oxcf.using_dist_8x8) { - const struct macroblock_plane *p = &mb->plane[0]; - const struct macroblockd_plane *pd = &xd->plane[0]; - const int src_stride = p->src.stride; - const int dst_stride = pd->dst.stride; - uint8_t *src = p->src.buf; - uint8_t *dst = pd->dst.buf; - - // Daala-defined distortion computed for the block of 8x8 pixels - total_distortion = av1_dist_8x8(cpi, mb, src, src_stride, dst, dst_stride, - BLOCK_8X8, 8, 8, 8, 8, mb->qindex) - << 4; - } -#endif // CONFIG_DIST_8X8 - // Add in the cost of the transform type - if (!is_lossless) { - int rate_tx_type = 0; -#if CONFIG_EXT_TX - if (get_ext_tx_types(tx_size, bsize, 0, cpi->common.reduced_tx_set_used) > - 1) { - const int eset = - get_ext_tx_set(tx_size, bsize, 0, cpi->common.reduced_tx_set_used); -#if CONFIG_LGT_FROM_PRED - if (LGT_FROM_PRED_INTRA && is_lgt_allowed(mbmi->mode, tx_size)) - rate_tx_type += mb->intra_lgt_cost[txsize_sqr_map[tx_size]][mbmi->mode] - [mbmi->use_lgt]; - if (!LGT_FROM_PRED_INTRA || !mbmi->use_lgt) -#endif // CONFIG_LGT_FROM_PRED - rate_tx_type += mb->intra_tx_type_costs[eset][txsize_sqr_map[tx_size]] - [mbmi->mode][mbmi->tx_type]; - } -#else - rate_tx_type = - mb->intra_tx_type_costs[txsize_sqr_map[tx_size]] - [intra_mode_to_tx_type_context[mbmi->mode]] - [mbmi->tx_type]; -#endif // CONFIG_EXT_TX - assert(mbmi->tx_size == tx_size); - cost += rate_tx_type; - tot_rate_y += rate_tx_type; - } - - *rate = cost; - *rate_y = tot_rate_y; - *distortion = total_distortion; - - return RDCOST(mb->rdmult, cost, total_distortion); -} - -#if CONFIG_FILTER_INTRA // Return 1 if a filter intra mode is selected; return 0 otherwise.
static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, BLOCK_SIZE bsize, int mode_cost, int64_t *best_rd, int64_t *best_model_rd, - uint16_t skip_mask) { + PICK_MODE_CONTEXT *ctx) { MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *const mic = xd->mi[0]; - MB_MODE_INFO *mbmi = &mic->mbmi; + MB_MODE_INFO *mbmi = xd->mi[0]; int filter_intra_selected_flag = 0; FILTER_INTRA_MODE mode; - TX_SIZE best_tx_size = TX_4X4; + TX_SIZE best_tx_size = TX_8X8; FILTER_INTRA_MODE_INFO filter_intra_mode_info; - TX_TYPE best_tx_type; -#if CONFIG_LGT_FROM_PRED - int use_lgt_when_selected; -#endif - + TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN]; + (void)ctx; av1_zero(filter_intra_mode_info); - mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 1; + mbmi->filter_intra_mode_info.use_filter_intra = 1; mbmi->mode = DC_PRED; mbmi->palette_mode_info.palette_size[0] = 0; for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) { - int this_rate; int64_t this_rd, this_model_rd; RD_STATS tokenonly_rd_stats; - if (skip_mask & (1 << mode)) continue; - mbmi->filter_intra_mode_info.filter_intra_mode[0] = mode; + mbmi->filter_intra_mode_info.filter_intra_mode = mode; this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost); if (*best_model_rd != INT64_MAX && this_model_rd > *best_model_rd + (*best_model_rd >> 1)) @@ -3879,19 +3657,19 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd; super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd); if (tokenonly_rd_stats.rate == INT_MAX) continue; - this_rate = tokenonly_rd_stats.rate + - av1_cost_bit(cpi->common.fc->filter_intra_probs[0], 1) + - write_uniform_cost(FILTER_INTRA_MODES, mode) + mode_cost; + const int this_rate = + tokenonly_rd_stats.rate + + intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost); this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < *best_rd) { *best_rd = this_rd; - best_tx_size = mic->mbmi.tx_size; + best_tx_size = mbmi->tx_size; filter_intra_mode_info = mbmi->filter_intra_mode_info; - best_tx_type = mic->mbmi.tx_type; -#if CONFIG_LGT_FROM_PRED - use_lgt_when_selected = mic->mbmi.use_lgt; -#endif + memcpy(best_txk_type, mbmi->txk_type, + sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN); + memcpy(ctx->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); *rate = this_rate; *rate_tokenonly = tokenonly_rd_stats.rate; *distortion = tokenonly_rd_stats.dist; @@ -3903,43 +3681,31 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, if (filter_intra_selected_flag) { mbmi->mode = DC_PRED; mbmi->tx_size = best_tx_size; -#if CONFIG_LGT_FROM_PRED - mbmi->use_lgt = use_lgt_when_selected; -#endif - mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = - filter_intra_mode_info.use_filter_intra_mode[0]; - mbmi->filter_intra_mode_info.filter_intra_mode[0] = - filter_intra_mode_info.filter_intra_mode[0]; - mbmi->tx_type = best_tx_type; + mbmi->filter_intra_mode_info = filter_intra_mode_info; + memcpy(mbmi->txk_type, best_txk_type, + sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN); return 1; } else { return 0; } } -#endif // CONFIG_FILTER_INTRA -#if CONFIG_EXT_INTRA // Run RD calculation with given luma intra prediction angle, and return // the RD cost. Update the best mode info if the RD cost is the best so far.
static int64_t calc_rd_given_intra_angle( const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mode_cost, int64_t best_rd_in, int8_t angle_delta, int max_angle_delta, int *rate, RD_STATS *rd_stats, int *best_angle_delta, TX_SIZE *best_tx_size, - TX_TYPE *best_tx_type, -#if CONFIG_LGT_FROM_PRED - int *use_lgt_when_selected, -#endif -#if CONFIG_INTRA_INTERP - INTRA_FILTER *best_filter, -#endif // CONFIG_INTRA_INTERP - int64_t *best_rd, int64_t *best_model_rd) { + int64_t *best_rd, int64_t *best_model_rd, TX_TYPE *best_txk_type, + uint8_t *best_blk_skip) { int this_rate; RD_STATS tokenonly_rd_stats; int64_t this_rd, this_model_rd; - MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi; + MB_MODE_INFO *mbmi = x->e_mbd.mi[0]; + const int n4 = bsize_to_num_blk(bsize); assert(!is_inter_block(mbmi)); - mbmi->angle_delta[0] = angle_delta; + mbmi->angle_delta[PLANE_TYPE_Y] = angle_delta; this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost); if (*best_model_rd != INT64_MAX && this_model_rd > *best_model_rd + (*best_model_rd >> 1)) @@ -3948,22 +3714,19 @@ static int64_t calc_rd_given_intra_angle( super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in); if (tokenonly_rd_stats.rate == INT_MAX) return INT64_MAX; - this_rate = tokenonly_rd_stats.rate + mode_cost + - write_uniform_cost(2 * max_angle_delta + 1, - mbmi->angle_delta[0] + max_angle_delta); + this_rate = + tokenonly_rd_stats.rate + mode_cost + + x->angle_delta_cost[mbmi->mode - V_PRED] + [max_angle_delta + mbmi->angle_delta[PLANE_TYPE_Y]]; this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < *best_rd) { + memcpy(best_txk_type, mbmi->txk_type, + sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN); + memcpy(best_blk_skip, x->blk_skip, sizeof(best_blk_skip[0]) * n4); *best_rd = this_rd; - *best_angle_delta = mbmi->angle_delta[0]; + *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_Y]; *best_tx_size = mbmi->tx_size; -#if CONFIG_INTRA_INTERP - *best_filter = mbmi->intra_filter; -#endif // CONFIG_INTRA_INTERP - *best_tx_type = mbmi->tx_type; -#if CONFIG_LGT_FROM_PRED - *use_lgt_when_selected = mbmi->use_lgt; -#endif *rate = this_rate; rd_stats->rate = tokenonly_rd_stats.rate; rd_stats->dist = tokenonly_rd_stats.dist; @@ -3980,131 +3743,60 @@ static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x, int64_t best_rd, int64_t *best_model_rd) { MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *const mic = xd->mi[0]; - MB_MODE_INFO *mbmi = &mic->mbmi; + MB_MODE_INFO *mbmi = xd->mi[0]; assert(!is_inter_block(mbmi)); int i, angle_delta, best_angle_delta = 0; int first_try = 1; -#if CONFIG_INTRA_INTERP - int p_angle; - const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd); - INTRA_FILTER filter, best_filter = INTRA_FILTER_LINEAR; -#endif // CONFIG_INTRA_INTERP int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)]; - TX_SIZE best_tx_size = mic->mbmi.tx_size; - TX_TYPE best_tx_type = mbmi->tx_type; -#if CONFIG_LGT_FROM_PRED - int use_lgt_when_selected = mbmi->use_lgt; -#endif + TX_SIZE best_tx_size = mbmi->tx_size; + const int n4 = bsize_to_num_blk(bsize); + TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN]; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX; for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { -#if CONFIG_INTRA_INTERP - for (filter = INTRA_FILTER_LINEAR; filter < INTRA_FILTERS; ++filter) { - if (FILTER_FAST_SEARCH && filter != INTRA_FILTER_LINEAR) continue; - mic->mbmi.intra_filter = 
filter; -#endif // CONFIG_INTRA_INTERP - for (i = 0; i < 2; ++i) { - best_rd_in = (best_rd == INT64_MAX) - ? INT64_MAX - : (best_rd + (best_rd >> (first_try ? 3 : 5))); - this_rd = calc_rd_given_intra_angle( - cpi, x, bsize, -#if CONFIG_INTRA_INTERP - mode_cost + x->intra_filter_cost[intra_filter_ctx][filter], -#else - mode_cost, -#endif // CONFIG_INTRA_INTERP - best_rd_in, (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate, - rd_stats, &best_angle_delta, &best_tx_size, &best_tx_type, -#if CONFIG_LGT_FROM_PRED - &use_lgt_when_selected, -#endif -#if CONFIG_INTRA_INTERP - &best_filter, -#endif // CONFIG_INTRA_INTERP - &best_rd, best_model_rd); - rd_cost[2 * angle_delta + i] = this_rd; - if (first_try && this_rd == INT64_MAX) return best_rd; - first_try = 0; - if (angle_delta == 0) { - rd_cost[1] = this_rd; - break; - } + for (i = 0; i < 2; ++i) { + best_rd_in = (best_rd == INT64_MAX) + ? INT64_MAX + : (best_rd + (best_rd >> (first_try ? 3 : 5))); + this_rd = calc_rd_given_intra_angle( + cpi, x, bsize, mode_cost, best_rd_in, (1 - 2 * i) * angle_delta, + MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size, + &best_rd, best_model_rd, best_txk_type, best_blk_skip); + rd_cost[2 * angle_delta + i] = this_rd; + if (first_try && this_rd == INT64_MAX) return best_rd; + first_try = 0; + if (angle_delta == 0) { + rd_cost[1] = this_rd; + break; } -#if CONFIG_INTRA_INTERP } -#endif // CONFIG_INTRA_INTERP } assert(best_rd != INT64_MAX); for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { int64_t rd_thresh; -#if CONFIG_INTRA_INTERP - for (filter = INTRA_FILTER_LINEAR; filter < INTRA_FILTERS; ++filter) { - if (FILTER_FAST_SEARCH && filter != INTRA_FILTER_LINEAR) continue; - mic->mbmi.intra_filter = filter; -#endif // CONFIG_INTRA_INTERP - for (i = 0; i < 2; ++i) { - int skip_search = 0; - rd_thresh = best_rd + (best_rd >> 5); - if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh && - rd_cost[2 * (angle_delta - 1) + i] > rd_thresh) - skip_search = 1; - if (!skip_search) { - calc_rd_given_intra_angle( - cpi, x, bsize, -#if CONFIG_INTRA_INTERP - mode_cost + x->intra_filter_cost[intra_filter_ctx][filter], -#else - mode_cost, -#endif // CONFIG_INTRA_INTERP - best_rd, (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate, - rd_stats, &best_angle_delta, &best_tx_size, &best_tx_type, -#if CONFIG_LGT_FROM_PRED - &use_lgt_when_selected, -#endif -#if CONFIG_INTRA_INTERP - &best_filter, -#endif // CONFIG_INTRA_INTERP - &best_rd, best_model_rd); - } - } -#if CONFIG_INTRA_INTERP - } -#endif // CONFIG_INTRA_INTERP - } - -#if CONFIG_INTRA_INTERP - if (FILTER_FAST_SEARCH && rd_stats->rate < INT_MAX) { - p_angle = mode_to_angle_map[mbmi->mode] + best_angle_delta * ANGLE_STEP; - if (av1_is_intra_filter_switchable(p_angle)) { - for (filter = INTRA_FILTER_LINEAR + 1; filter < INTRA_FILTERS; ++filter) { - mic->mbmi.intra_filter = filter; - this_rd = calc_rd_given_intra_angle( - cpi, x, bsize, - mode_cost + x->intra_filter_cost[intra_filter_ctx][filter], best_rd, - best_angle_delta, MAX_ANGLE_DELTA, rate, rd_stats, - &best_angle_delta, &best_tx_size, &best_tx_type, -#if CONFIG_LGT_FROM_PRED - &use_lgt_when_selected, -#endif - &best_filter, &best_rd, best_model_rd); + for (i = 0; i < 2; ++i) { + int skip_search = 0; + rd_thresh = best_rd + (best_rd >> 5); + if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh && + rd_cost[2 * (angle_delta - 1) + i] > rd_thresh) + skip_search = 1; + if (!skip_search) { + calc_rd_given_intra_angle( + cpi, x, bsize, mode_cost, best_rd, (1 - 2 * i) * angle_delta, + 
MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size, + &best_rd, best_model_rd, best_txk_type, best_blk_skip); } } } -#endif // CONFIG_INTRA_INTERP mbmi->tx_size = best_tx_size; - mbmi->angle_delta[0] = best_angle_delta; -#if CONFIG_INTRA_INTERP - mic->mbmi.intra_filter = best_filter; -#endif // CONFIG_INTRA_INTERP - mbmi->tx_type = best_tx_type; -#if CONFIG_LGT_FROM_PRED - mbmi->use_lgt = use_lgt_when_selected; -#endif + mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta; + memcpy(mbmi->txk_type, best_txk_type, + sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN); + memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4); return best_rd; } @@ -4173,7 +3865,7 @@ static void angle_estimation(const uint8_t *src, int src_stride, int rows, uint64_t hist_sum = 0; for (i = 0; i < DIRECTIONAL_MODES; ++i) hist_sum += hist[i]; for (i = 0; i < INTRA_MODES; ++i) { - if (av1_is_directional_mode(i, bsize)) { + if (av1_is_directional_mode(i)) { const uint8_t angle_bin = mode_to_angle_bin[i]; uint64_t score = 2 * hist[angle_bin]; int weight = 2; @@ -4191,7 +3883,6 @@ static void angle_estimation(const uint8_t *src, int src_stride, int rows, } } -#if CONFIG_HIGHBITDEPTH static void highbd_angle_estimation(const uint8_t *src8, int src_stride, int rows, int cols, BLOCK_SIZE bsize, uint8_t *directional_mode_skip_mask) { @@ -4229,7 +3920,7 @@ static void highbd_angle_estimation(const uint8_t *src8, int src_stride, uint64_t hist_sum = 0; for (i = 0; i < DIRECTIONAL_MODES; ++i) hist_sum += hist[i]; for (i = 0; i < INTRA_MODES; ++i) { - if (av1_is_directional_mode(i, bsize)) { + if (av1_is_directional_mode(i)) { const uint8_t angle_bin = mode_to_angle_bin[i]; uint64_t score = 2 * hist[angle_bin]; int weight = 2; @@ -4246,119 +3937,102 @@ static void highbd_angle_estimation(const uint8_t *src8, int src_stride, } } } -#endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_EXT_INTRA + +// Given selected prediction mode, search for the best tx type and size. +static void intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, const int *bmode_costs, + int64_t *best_rd, int *rate, int *rate_tokenonly, + int64_t *distortion, int *skippable, + MB_MODE_INFO *best_mbmi, PICK_MODE_CONTEXT *ctx) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + RD_STATS rd_stats; + super_block_yrd(cpi, x, &rd_stats, bsize, *best_rd); + if (rd_stats.rate == INT_MAX) return; + int this_rate_tokenonly = rd_stats.rate; + if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->sb_type)) { + // super_block_yrd above includes the cost of the tx_size in the + // tokenonly rate, but for intra blocks, tx_size is always coded + // (prediction granularity), so we account for it in the full rate, + // not the tokenonly rate. 
+ this_rate_tokenonly -= tx_size_cost(&cpi->common, x, bsize, mbmi->tx_size); + } + const int this_rate = + rd_stats.rate + + intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode]); + const int64_t this_rd = RDCOST(x->rdmult, this_rate, rd_stats.dist); + if (this_rd < *best_rd) { + *best_mbmi = *mbmi; + *best_rd = this_rd; + *rate = this_rate; + *rate_tokenonly = this_rate_tokenonly; + *distortion = rd_stats.dist; + *skippable = rd_stats.skip; + memcpy(ctx->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + } +} // This function is used only for intra_only frames static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, - BLOCK_SIZE bsize, int64_t best_rd) { + BLOCK_SIZE bsize, int64_t best_rd, + PICK_MODE_CONTEXT *ctx) { MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *const mic = xd->mi[0]; - MB_MODE_INFO *const mbmi = &mic->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; assert(!is_inter_block(mbmi)); - MB_MODE_INFO best_mbmi = *mbmi; int64_t best_model_rd = INT64_MAX; -#if CONFIG_EXT_INTRA const int rows = block_size_high[bsize]; const int cols = block_size_wide[bsize]; -#if CONFIG_INTRA_INTERP - const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd); -#endif // CONFIG_INTRA_INTERP int is_directional_mode; uint8_t directional_mode_skip_mask[INTRA_MODES]; const int src_stride = x->plane[0].src.stride; const uint8_t *src = x->plane[0].src.buf; -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA int beat_best_rd = 0; - uint16_t filter_intra_mode_skip_mask = (1 << FILTER_INTRA_MODES) - 1; -#endif // CONFIG_FILTER_INTRA const int *bmode_costs; PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; - int palette_y_mode_ctx = 0; const int try_palette = av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type); uint8_t *best_palette_color_map = try_palette ? 
x->palette_buffer->best_palette_color_map : NULL; - const MODE_INFO *above_mi = xd->above_mi; - const MODE_INFO *left_mi = xd->left_mi; - const PREDICTION_MODE A = av1_above_block_mode(mic, above_mi, 0); - const PREDICTION_MODE L = av1_left_block_mode(mic, left_mi, 0); - const PREDICTION_MODE FINAL_MODE_SEARCH = TM_PRED + 1; -#if CONFIG_PVQ - od_rollback_buffer pre_buf, post_buf; - - od_encode_checkpoint(&x->daala_enc, &pre_buf); - od_encode_checkpoint(&x->daala_enc, &post_buf); -#endif // CONFIG_PVQ - -#if CONFIG_KF_CTX + const MB_MODE_INFO *above_mi = xd->above_mbmi; + const MB_MODE_INFO *left_mi = xd->left_mbmi; + const PREDICTION_MODE A = av1_above_block_mode(above_mi); + const PREDICTION_MODE L = av1_left_block_mode(left_mi); const int above_ctx = intra_mode_context[A]; const int left_ctx = intra_mode_context[L]; bmode_costs = x->y_mode_costs[above_ctx][left_ctx]; -#else - bmode_costs = x->y_mode_costs[A][L]; -#endif -#if CONFIG_EXT_INTRA - mbmi->angle_delta[0] = 0; -#if CONFIG_HIGHBITDEPTH + mbmi->angle_delta[PLANE_TYPE_Y] = 0; if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) highbd_angle_estimation(src, src_stride, rows, cols, bsize, directional_mode_skip_mask); else -#endif // CONFIG_HIGHBITDEPTH angle_estimation(src, src_stride, rows, cols, bsize, directional_mode_skip_mask); -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; -#endif // CONFIG_FILTER_INTRA + mbmi->filter_intra_mode_info.use_filter_intra = 0; pmi->palette_size[0] = 0; - if (try_palette) { - if (above_mi) { - palette_y_mode_ctx += - (above_mi->mbmi.palette_mode_info.palette_size[0] > 0); - } - if (left_mi) { - palette_y_mode_ctx += - (left_mi->mbmi.palette_mode_info.palette_size[0] > 0); - } - } if (cpi->sf.tx_type_search.fast_intra_tx_type_search) x->use_default_intra_tx_type = 1; else x->use_default_intra_tx_type = 0; + MB_MODE_INFO best_mbmi = *mbmi; /* Y Search for intra prediction mode */ - for (int mode_idx = DC_PRED; mode_idx <= FINAL_MODE_SEARCH; ++mode_idx) { + for (int mode_idx = DC_PRED; mode_idx < INTRA_MODES; ++mode_idx) { RD_STATS this_rd_stats; int this_rate, this_rate_tokenonly, s; int64_t this_distortion, this_rd, this_model_rd; - if (mode_idx == FINAL_MODE_SEARCH) { - if (x->use_default_intra_tx_type == 0) break; - mbmi->mode = best_mbmi.mode; - x->use_default_intra_tx_type = 0; - } else { - assert(mode_idx < INTRA_MODES); - mbmi->mode = intra_rd_search_mode_order[mode_idx]; - } -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &pre_buf); -#endif // CONFIG_PVQ -#if CONFIG_EXT_INTRA - mbmi->angle_delta[0] = 0; -#endif // CONFIG_EXT_INTRA + mbmi->mode = intra_rd_search_mode_order[mode_idx]; + mbmi->angle_delta[PLANE_TYPE_Y] = 0; this_model_rd = intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode]); if (best_model_rd != INT64_MAX && this_model_rd > best_model_rd + (best_model_rd >> 1)) continue; if (this_model_rd < best_model_rd) best_model_rd = this_model_rd; -#if CONFIG_EXT_INTRA - is_directional_mode = av1_is_directional_mode(mbmi->mode, bsize); + is_directional_mode = av1_is_directional_mode(mbmi->mode); if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue; if (is_directional_mode && av1_use_angle_delta(bsize)) { this_rd_stats.rate = INT_MAX; @@ -4367,97 +4041,61 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, } else { super_block_yrd(cpi, x, &this_rd_stats, bsize, best_rd); } -#else - super_block_yrd(cpi, x, &this_rd_stats, bsize, best_rd); -#endif // CONFIG_EXT_INTRA 
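As an aside on the pruning idiom that recurs throughout these search loops (palette, filter intra, the angle search, and the Y mode loop here): a candidate is dropped as soon as its model RD estimate exceeds the best estimate seen so far by 50%. A self-contained sketch of that gate (the helper name is illustrative, not aom's):

#include <stdint.h>

static int prune_by_model_rd(int64_t this_model_rd, int64_t best_model_rd) {
  // Nothing to compare against until a first estimate exists.
  if (best_model_rd == INT64_MAX) return 0;
  // best + (best >> 1) is the integer form of 1.5 * best, so this prunes
  // candidates whose estimated RD cost is more than 1.5x the current best.
  return this_model_rd > best_model_rd + (best_model_rd >> 1);
}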
this_rate_tokenonly = this_rd_stats.rate; this_distortion = this_rd_stats.dist; s = this_rd_stats.skip; if (this_rate_tokenonly == INT_MAX) continue; - this_rate = this_rate_tokenonly + bmode_costs[mbmi->mode]; - if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->sb_type)) { // super_block_yrd above includes the cost of the tx_size in the // tokenonly rate, but for intra blocks, tx_size is always coded // (prediction granularity), so we account for it in the full rate, // not the tokenonly rate. - this_rate_tokenonly -= tx_size_cost(cpi, x, bsize, mbmi->tx_size); - } - if (try_palette && mbmi->mode == DC_PRED) { - this_rate += - av1_cost_bit(av1_default_palette_y_mode_prob[bsize - BLOCK_8X8] - [palette_y_mode_ctx], - 0); - } -#if CONFIG_FILTER_INTRA - if (mbmi->mode == DC_PRED) - this_rate += av1_cost_bit(cpi->common.fc->filter_intra_probs[0], 0); -#endif // CONFIG_FILTER_INTRA -#if CONFIG_EXT_INTRA - if (is_directional_mode) { -#if CONFIG_INTRA_INTERP - const int p_angle = - mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP; - if (av1_is_intra_filter_switchable(p_angle)) - this_rate += x->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter]; -#endif // CONFIG_INTRA_INTERP - if (av1_use_angle_delta(bsize)) { - this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, - MAX_ANGLE_DELTA + mbmi->angle_delta[0]); - } - } -#endif // CONFIG_EXT_INTRA -#if CONFIG_INTRABC - if (bsize >= BLOCK_8X8 && cpi->common.allow_screen_content_tools) - this_rate += x->intrabc_cost[0]; -#endif // CONFIG_INTRABC - this_rd = RDCOST(x->rdmult, this_rate, this_distortion); -#if CONFIG_FILTER_INTRA - if (best_rd == INT64_MAX || this_rd - best_rd < (best_rd >> 4)) { - filter_intra_mode_skip_mask ^= (1 << mbmi->mode); + this_rate_tokenonly -= + tx_size_cost(&cpi->common, x, bsize, mbmi->tx_size); } -#endif // CONFIG_FILTER_INTRA - + this_rate = + this_rd_stats.rate + + intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode]); + this_rd = RDCOST(x->rdmult, this_rate, this_distortion); if (this_rd < best_rd) { best_mbmi = *mbmi; best_rd = this_rd; -#if CONFIG_FILTER_INTRA beat_best_rd = 1; -#endif // CONFIG_FILTER_INTRA *rate = this_rate; *rate_tokenonly = this_rate_tokenonly; *distortion = this_distortion; *skippable = s; -#if CONFIG_PVQ - od_encode_checkpoint(&x->daala_enc, &post_buf); -#endif // CONFIG_PVQ + memcpy(ctx->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); } } -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &post_buf); -#endif // CONFIG_PVQ - if (try_palette) { - rd_pick_palette_intra_sby(cpi, x, bsize, palette_y_mode_ctx, - bmode_costs[DC_PRED], &best_mbmi, + rd_pick_palette_intra_sby(cpi, x, bsize, bmode_costs[DC_PRED], &best_mbmi, best_palette_color_map, &best_rd, &best_model_rd, - rate, rate_tokenonly, distortion, skippable); + rate, rate_tokenonly, distortion, skippable, ctx, + ctx->blk_skip); } -#if CONFIG_FILTER_INTRA - if (beat_best_rd) { + if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize)) { if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion, skippable, bsize, bmode_costs[DC_PRED], - &best_rd, &best_model_rd, - filter_intra_mode_skip_mask)) { + &best_rd, &best_model_rd, ctx)) { best_mbmi = *mbmi; } } -#endif // CONFIG_FILTER_INTRA + + // If previous searches use only the default tx type, do an extra search for + // the best tx type. 
+ if (x->use_default_intra_tx_type) { + *mbmi = best_mbmi; + x->use_default_intra_tx_type = 0; + intra_block_yrd(cpi, x, bsize, bmode_costs, &best_rd, rate, rate_tokenonly, + distortion, skippable, &best_mbmi, ctx); + } *mbmi = best_mbmi; return best_rd; @@ -4469,33 +4107,29 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, BLOCK_SIZE bsize, int64_t ref_best_rd) { MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const TX_SIZE uv_tx_size = av1_get_uv_tx_size(mbmi, &xd->plane[1]); + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_U]; + const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd); int plane; int is_cost_valid = 1; av1_init_rd_stats(rd_stats); if (ref_best_rd < 0) is_cost_valid = 0; -#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 if (x->skip_chroma_rd) return is_cost_valid; - bsize = scale_chroma_bsize(bsize, xd->plane[1].subsampling_x, - xd->plane[1].subsampling_y); -#endif // CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 + bsize = scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y); -#if !CONFIG_PVQ if (is_inter_block(mbmi) && is_cost_valid) { for (plane = 1; plane < MAX_MB_PLANE; ++plane) av1_subtract_plane(x, bsize, plane); } -#endif // !CONFIG_PVQ if (is_cost_valid) { for (plane = 1; plane < MAX_MB_PLANE; ++plane) { RD_STATS pn_rd_stats; txfm_rd_in_plane(x, cpi, &pn_rd_stats, ref_best_rd, plane, bsize, - uv_tx_size, cpi->sf.use_fast_coef_costing); + uv_tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE); if (pn_rd_stats.rate == INT_MAX) { is_cost_valid = 0; break; @@ -4517,283 +4151,222 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, return is_cost_valid; } -#if CONFIG_VAR_TX -void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, - int blk_row, int blk_col, int plane, int block, - int plane_bsize, const ENTROPY_CONTEXT *a, - const ENTROPY_CONTEXT *l, RD_STATS *rd_stats) { - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *xd = &x->e_mbd; +static void tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, + int blk_row, int blk_col, int plane, int block, + int plane_bsize, const ENTROPY_CONTEXT *a, + const ENTROPY_CONTEXT *l, RD_STATS *rd_stats, + FAST_TX_SEARCH_MODE ftxs_mode, int64_t ref_rdcost, + TXB_RD_INFO *rd_info_array) { const struct macroblock_plane *const p = &x->plane[plane]; - struct macroblockd_plane *const pd = &xd->plane[plane]; - -#if CONFIG_TXK_SEL - av1_search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, - tx_size, a, l, 0, rd_stats); - return; -#endif - - int64_t tmp; - tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - uint8_t *mrc_mask = BLOCK_OFFSET(xd->mrc_mask, block); -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - PLANE_TYPE plane_type = get_plane_type(plane); - TX_TYPE tx_type = - av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); - const SCAN_ORDER *const scan_order = - get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi); - BLOCK_SIZE txm_bsize = txsize_to_bsize[tx_size]; - int bh = block_size_high[txm_bsize]; - int bw = block_size_wide[txm_bsize]; - int src_stride = p->src.stride; - uint8_t *src = - &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]]; - uint8_t *dst = - &pd->dst - .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]]; -#if CONFIG_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, rec_buffer16[MAX_TX_SQUARE]); - uint8_t *rec_buffer; 
-#else - DECLARE_ALIGNED(16, uint8_t, rec_buffer[MAX_TX_SQUARE]); -#endif // CONFIG_HIGHBITDEPTH - const int diff_stride = block_size_wide[plane_bsize]; - const int16_t *diff = - &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]]; - int txb_coeff_cost; - - assert(tx_size < TX_SIZES_ALL); - - int coeff_ctx = get_entropy_context(tx_size, a, l); - - tmp = pixel_diff_dist(x, plane, diff, diff_stride, blk_row, blk_col, - plane_bsize, txm_bsize); - -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - tmp = ROUND_POWER_OF_TWO(tmp, (xd->bd - 8) * 2); -#endif // CONFIG_HIGHBITDEPTH - rd_stats->sse += tmp << 4; - - if (rd_stats->invalid_rate) { - rd_stats->dist += tmp << 4; - rd_stats->rate += rd_stats->zero_rate; - rd_stats->skip = 1; - return; - } - -// TODO(any): Use av1_dist_block to compute distortion -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - rec_buffer = CONVERT_TO_BYTEPTR(rec_buffer16); - aom_highbd_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, - 0, NULL, 0, bw, bh, xd->bd); - } else { - rec_buffer = (uint8_t *)rec_buffer16; - aom_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, 0, - NULL, 0, bw, bh); + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); + const uint16_t cur_joint_ctx = + (txb_ctx.dc_sign_ctx << 8) + txb_ctx.txb_skip_ctx; + + const int txk_type_idx = + av1_get_txk_type_index(plane_bsize, blk_row, blk_col); + // Look up RD and terminate early in case when we've already processed exactly + // the same residual with exactly the same entropy context. + if (rd_info_array != NULL && rd_info_array->valid && + rd_info_array->entropy_context == cur_joint_ctx) { + if (plane == 0) + x->e_mbd.mi[0]->txk_type[txk_type_idx] = rd_info_array->tx_type; + const TX_TYPE ref_tx_type = + av1_get_tx_type(get_plane_type(plane), &x->e_mbd, blk_row, blk_col, + tx_size, cpi->common.reduced_tx_set_used); + if (ref_tx_type == rd_info_array->tx_type) { + rd_stats->rate += rd_info_array->rate; + rd_stats->dist += rd_info_array->dist; + rd_stats->sse += rd_info_array->sse; + rd_stats->skip &= rd_info_array->eob == 0; + p->eobs[block] = rd_info_array->eob; + p->txb_entropy_ctx[block] = rd_info_array->txb_entropy_ctx; + return; + } } -#else - aom_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, 0, NULL, - 0, bw, bh); -#endif // CONFIG_HIGHBITDEPTH - -#if DISABLE_TRELLISQ_SEARCH - av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, - coeff_ctx, AV1_XFORM_QUANT_B); - -#else - av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, - coeff_ctx, AV1_XFORM_QUANT_FP); - - const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2; - tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); - const int buffer_length = tx_size_2d[tx_size]; - int64_t tmp_dist, tmp_sse; -#if CONFIG_DIST_8X8 - int disable_early_skip = - x->using_dist_8x8 && plane == 0 && plane_bsize >= BLOCK_8X8 && - (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4) && - x->tune_metric != AOM_TUNE_PSNR; -#endif // CONFIG_DIST_8X8 -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - tmp_dist = - av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp_sse, xd->bd); - else -#endif - tmp_dist = av1_block_error(coeff, dqcoeff, buffer_length, &tmp_sse); - - tmp_dist = RIGHT_SIGNED_SHIFT(tmp_dist, shift); - -#if CONFIG_MRC_TX - if (tx_type == MRC_DCT && !xd->mi[0]->mbmi.valid_mrc_mask) { - 
av1_invalid_rd_stats(rd_stats); - return; - } -#endif // CONFIG_MRC_TX - if ( -#if CONFIG_DIST_8X8 - disable_early_skip || -#endif - RDCOST(x->rdmult, 0, tmp_dist) < rd_stats->ref_rdcost) { - av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size, - a, l, 1); - } else { - rd_stats->rate += rd_stats->zero_rate; - rd_stats->dist += tmp << 4; - rd_stats->skip = 1; - rd_stats->invalid_rate = 1; - return; + RD_STATS this_rd_stats; + search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + &txb_ctx, ftxs_mode, 0, ref_rdcost, &this_rd_stats); + + av1_merge_rd_stats(rd_stats, &this_rd_stats); + + // Save RD results for possible reuse in future. + if (rd_info_array != NULL) { + rd_info_array->valid = 1; + rd_info_array->entropy_context = cur_joint_ctx; + rd_info_array->rate = this_rd_stats.rate; + rd_info_array->dist = this_rd_stats.dist; + rd_info_array->sse = this_rd_stats.sse; + rd_info_array->eob = p->eobs[block]; + rd_info_array->txb_entropy_ctx = p->txb_entropy_ctx[block]; + if (plane == 0) { + rd_info_array->tx_type = x->e_mbd.mi[0]->txk_type[txk_type_idx]; + } + } +} + +static void get_mean_and_dev(const int16_t *data, int stride, int bw, int bh, + float *mean, float *dev) { + int x_sum = 0; + uint64_t x2_sum = 0; + for (int i = 0; i < bh; ++i) { + for (int j = 0; j < bw; ++j) { + const int val = data[j]; + x_sum += val; + x2_sum += val * val; + } + data += stride; + } + + const int num = bw * bh; + const float e_x = (float)x_sum / num; + const float e_x2 = (float)((double)x2_sum / num); + const float diff = e_x2 - e_x * e_x; + *dev = (diff > 0) ? sqrtf(diff) : 0; + *mean = e_x; +} + +static void get_mean_and_dev_float(const float *data, int stride, int bw, + int bh, float *mean, float *dev) { + float x_sum = 0; + float x2_sum = 0; + for (int i = 0; i < bh; ++i) { + for (int j = 0; j < bw; ++j) { + const float val = data[j]; + x_sum += val; + x2_sum += val * val; + } + data += stride; + } + + const int num = bw * bh; + const float e_x = x_sum / num; + const float e_x2 = x2_sum / num; + const float diff = e_x2 - e_x * e_x; + *dev = (diff > 0) ? sqrtf(diff) : 0; + *mean = e_x; +} + +// Feature used by the model to predict tx split: the mean and standard +// deviation values of the block and sub-blocks. +static void get_mean_dev_features(const int16_t *data, int stride, int bw, + int bh, int levels, float *feature) { + int feature_idx = 0; + int width = bw; + int height = bh; + const int16_t *const data_ptr = &data[0]; + for (int lv = 0; lv < levels; ++lv) { + if (width < 2 || height < 2) break; + float mean_buf[16]; + float dev_buf[16]; + int blk_idx = 0; + for (int row = 0; row < bh; row += height) { + for (int col = 0; col < bw; col += width) { + float mean, dev; + get_mean_and_dev(data_ptr + row * stride + col, stride, width, height, + &mean, &dev); + feature[feature_idx++] = mean; + feature[feature_idx++] = dev; + mean_buf[blk_idx] = mean; + dev_buf[blk_idx++] = dev; + } + } + if (blk_idx > 1) { + float mean, dev; + // Deviation of means. + get_mean_and_dev_float(mean_buf, 1, 1, blk_idx, &mean, &dev); + feature[feature_idx++] = dev; + // Mean of deviations. + get_mean_and_dev_float(dev_buf, 1, 1, blk_idx, &mean, &dev); + feature[feature_idx++] = mean; + } + // Reduce the block size when proceeding to the next level. 
+ if (height == width) { + height = height >> 1; + width = width >> 1; + } else if (height > width) { + height = height >> 1; + } else { + width = width >> 1; + } } -#endif // DISABLE_TRELLISQ_SEARCH +} - const int eob = p->eobs[block]; +static int ml_predict_tx_split(MACROBLOCK *x, BLOCK_SIZE bsize, int blk_row, + int blk_col, TX_SIZE tx_size) { + const NN_CONFIG *nn_config = av1_tx_split_nnconfig_map[tx_size]; + if (!nn_config) return -1; - av1_inverse_transform_block(xd, dqcoeff, -#if CONFIG_LGT_FROM_PRED - xd->mi[0]->mbmi.mode, -#endif -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - mrc_mask, -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - tx_type, tx_size, rec_buffer, MAX_TX_SIZE, eob); - if (eob > 0) { -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && plane == 0 && (bw < 8 && bh < 8)) { - // Save sub8x8 luma decoded pixels - // since 8x8 luma decoded pixels are not available for daala-dist - // after recursive split of BLOCK_8x8 is done. - const int pred_stride = block_size_wide[plane_bsize]; - const int pred_idx = (blk_row * pred_stride + blk_col) - << tx_size_wide_log2[0]; - int16_t *decoded = &pd->pred[pred_idx]; - int i, j; + const int diff_stride = block_size_wide[bsize]; + const int16_t *diff = + x->plane[0].src_diff + 4 * blk_row * diff_stride + 4 * blk_col; + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + aom_clear_system_state(); -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - for (j = 0; j < bh; j++) - for (i = 0; i < bw; i++) - decoded[j * pred_stride + i] = - CONVERT_TO_SHORTPTR(rec_buffer)[j * MAX_TX_SIZE + i]; - } else { -#endif - for (j = 0; j < bh; j++) - for (i = 0; i < bw; i++) - decoded[j * pred_stride + i] = rec_buffer[j * MAX_TX_SIZE + i]; -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_DIST_8X8 - tmp = pixel_dist(cpi, x, plane, src, src_stride, rec_buffer, MAX_TX_SIZE, - blk_row, blk_col, plane_bsize, txm_bsize); - } - rd_stats->dist += tmp * 16; - txb_coeff_cost = av1_cost_coeffs(cpi, x, plane, blk_row, blk_col, block, - tx_size, scan_order, a, l, 0); - rd_stats->rate += txb_coeff_cost; - rd_stats->skip &= (eob == 0); + float features[64] = { 0.0f }; + get_mean_dev_features(diff, diff_stride, bw, bh, 2, features); -#if CONFIG_RD_DEBUG - av1_update_txb_coeff_cost(rd_stats, plane, tx_size, blk_row, blk_col, - txb_coeff_cost); -#endif // CONFIG_RD_DEBUG + float score = 0.0f; + av1_nn_predict(features, nn_config, &score); + if (score > 8.0f) return 100; + if (score < -8.0f) return 0; + score = 1.0f / (1.0f + (float)exp(-score)); + return (int)(score * 100); } +// Search for the best tx partition/type for a given luma block. 
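ml_predict_tx_split() above maps the raw network score to an integer percentage through a sigmoid, saturating beyond |score| > 8 so the exp() call can be skipped where the answer is already certain. A self-contained sketch of just that mapping:

/* Hedged sketch: squash a raw network score into an integer percentage,
 * mirroring the tail of ml_predict_tx_split() above. */
#include <math.h>

static int score_to_percent(float score) {
  if (score > 8.0f) return 100;  /* sigmoid(8) is ~0.9997: saturate high */
  if (score < -8.0f) return 0;   /* sigmoid(-8) is ~0.0003: saturate low */
  const float p = 1.0f / (1.0f + expf(-score));
  return (int)(p * 100.0f);
}

In a later hunk, select_tx_block() compares this percentage against sf.tx_type_search.ml_tx_split_thresh and drops the split branch when the predicted split probability falls below the threshold.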
static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, - int blk_col, int plane, int block, TX_SIZE tx_size, - int depth, BLOCK_SIZE plane_bsize, - ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl, - TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, - RD_STATS *rd_stats, int64_t ref_best_rd, - int *is_cost_valid) { - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - struct macroblock_plane *const p = &x->plane[plane]; - struct macroblockd_plane *const pd = &xd->plane[plane]; - const int tx_row = blk_row >> (1 - pd->subsampling_y); - const int tx_col = blk_col >> (1 - pd->subsampling_x); - TX_SIZE(*const inter_tx_size) - [MAX_MIB_SIZE] = - (TX_SIZE(*)[MAX_MIB_SIZE]) & mbmi->inter_tx_size[tx_row][tx_col]; - const int max_blocks_high = max_block_high(xd, plane_bsize, plane); - const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); - const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; - int64_t this_rd = INT64_MAX; - ENTROPY_CONTEXT *pta = ta + blk_col; - ENTROPY_CONTEXT *ptl = tl + blk_row; - int i; - int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row, - mbmi->sb_type, tx_size); - int64_t sum_rd = INT64_MAX; - int tmp_eob = 0; - int zero_blk_rate; - RD_STATS sum_rd_stats; -#if CONFIG_TXK_SEL - TX_TYPE best_tx_type = TX_TYPES; - int txk_idx = (blk_row << 4) + blk_col; -#endif -#if CONFIG_RECT_TX_EXT - TX_SIZE quarter_txsize = quarter_txsize_lookup[mbmi->sb_type]; - int check_qttx = is_quarter_tx_allowed(xd, mbmi, is_inter_block(mbmi)) && - tx_size == max_txsize_rect_lookup[mbmi->sb_type] && - quarter_txsize != tx_size; - int is_qttx_picked = 0; - int eobs_qttx[2] = { 0, 0 }; - int skip_qttx[2] = { 0, 0 }; - int block_offset_qttx = check_qttx - ? tx_size_wide_unit[quarter_txsize] * - tx_size_high_unit[quarter_txsize] - : 0; - int blk_row_offset, blk_col_offset; - int is_wide_qttx = - tx_size_wide_unit[quarter_txsize] > tx_size_high_unit[quarter_txsize]; - blk_row_offset = is_wide_qttx ? tx_size_high_unit[quarter_txsize] : 0; - blk_col_offset = is_wide_qttx ? 
0 : tx_size_wide_unit[quarter_txsize]; -#endif - - av1_init_rd_stats(&sum_rd_stats); - + int blk_col, int block, TX_SIZE tx_size, int depth, + BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta, + ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, + TXFM_CONTEXT *tx_left, RD_STATS *rd_stats, + int64_t ref_best_rd, int *is_cost_valid, + FAST_TX_SEARCH_MODE ftxs_mode, + TXB_RD_INFO_NODE *rd_info_node) { assert(tx_size < TX_SIZES_ALL); - + av1_init_rd_stats(rd_stats); if (ref_best_rd < 0) { *is_cost_valid = 0; return; } - av1_init_rd_stats(rd_stats); - + MACROBLOCKD *const xd = &x->e_mbd; + const int max_blocks_high = max_block_high(xd, plane_bsize, 0); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0); if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; -#if CONFIG_LV_MAP - TX_SIZE txs_ctx = get_txsize_context(tx_size); - TXB_CTX txb_ctx; - get_txb_ctx(plane_bsize, tx_size, plane, pta, ptl, &txb_ctx); - -#if LV_MAP_PROB - zero_blk_rate = x->coeff_costs[txs_ctx][get_plane_type(plane)] - .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; -#else - zero_blk_rate = - av1_cost_bit(xd->fc->txb_skip[txs_ctx][txb_ctx.txb_skip_ctx], 1); -#endif // LV_MAP_PROB -#else - TX_SIZE tx_size_ctx = txsize_sqr_map[tx_size]; - int coeff_ctx = get_entropy_context(tx_size, pta, ptl); - zero_blk_rate = - x->token_head_costs[tx_size_ctx][pd->plane_type][1][0][coeff_ctx][0]; -#endif - - rd_stats->ref_rdcost = ref_best_rd; - rd_stats->zero_rate = zero_blk_rate; - if (cpi->common.tx_mode == TX_MODE_SELECT || tx_size == TX_4X4) { - inter_tx_size[0][0] = tx_size; - av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block, - plane_bsize, pta, ptl, rd_stats); - if (rd_stats->rate == INT_MAX) return; + const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; + ENTROPY_CONTEXT *pta = ta + blk_col; + ENTROPY_CONTEXT *ptl = tl + blk_row; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row, + mbmi->sb_type, tx_size); + struct macroblock_plane *const p = &x->plane[0]; + + const int try_no_split = 1; + int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH; + + int64_t no_split_rd = INT64_MAX; + int no_split_txb_entropy_ctx = 0; + TX_TYPE no_split_tx_type = TX_TYPES; + // TX no split + if (try_no_split) { + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, 0, pta, ptl, &txb_ctx); + const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_Y] + .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; + + rd_stats->ref_rdcost = ref_best_rd; + rd_stats->zero_rate = zero_blk_rate; + const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col); + mbmi->inter_tx_size[index] = tx_size; + tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, pta, + ptl, rd_stats, ftxs_mode, ref_best_rd, + rd_info_node != NULL ? 
rd_info_node->rd_info_array : NULL); + assert(rd_stats->rate < INT_MAX); if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) || @@ -4806,187 +4379,111 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, rd_stats->rate = zero_blk_rate; rd_stats->dist = rd_stats->sse; rd_stats->skip = 1; - x->blk_skip[plane][blk_row * bw + blk_col] = 1; + x->blk_skip[blk_row * bw + blk_col] = 1; p->eobs[block] = 0; -#if CONFIG_TXK_SEL - mbmi->txk_type[txk_idx] = DCT_DCT; -#endif + update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, + DCT_DCT); } else { - x->blk_skip[plane][blk_row * bw + blk_col] = 0; + x->blk_skip[blk_row * bw + blk_col] = 0; rd_stats->skip = 0; } if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) - rd_stats->rate += - av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0); -#if CONFIG_RECT_TX_EXT - if (check_qttx) { - assert(blk_row == 0 && blk_col == 0); - rd_stats->rate += av1_cost_bit(cpi->common.fc->quarter_tx_size_prob, 0); + rd_stats->rate += x->txfm_partition_cost[ctx][0]; + no_split_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (cpi->sf.adaptive_txb_search_level && + (no_split_rd - + (no_split_rd >> (1 + cpi->sf.adaptive_txb_search_level))) > + ref_best_rd) { + *is_cost_valid = 0; + return; } -#endif - this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); -#if CONFIG_LV_MAP - tmp_eob = p->txb_entropy_ctx[block]; -#else - tmp_eob = p->eobs[block]; -#endif - -#if CONFIG_TXK_SEL - best_tx_type = mbmi->txk_type[txk_idx]; -#endif - -#if CONFIG_RECT_TX_EXT - if (check_qttx) { - assert(blk_row == 0 && blk_col == 0 && block == 0 && plane == 0); - RD_STATS rd_stats_tmp, rd_stats_qttx; - int64_t rd_qttx; + no_split_txb_entropy_ctx = p->txb_entropy_ctx[block]; + const int txk_type_idx = + av1_get_txk_type_index(plane_bsize, blk_row, blk_col); + no_split_tx_type = mbmi->txk_type[txk_type_idx]; - av1_init_rd_stats(&rd_stats_qttx); - av1_init_rd_stats(&rd_stats_tmp); - - av1_tx_block_rd_b(cpi, x, quarter_txsize, 0, 0, plane, 0, plane_bsize, - pta, ptl, &rd_stats_qttx); - if (rd_stats->rate == INT_MAX) return; - - tx_size_ctx = txsize_sqr_map[quarter_txsize]; - coeff_ctx = get_entropy_context(quarter_txsize, pta, ptl); - zero_blk_rate = - x->token_head_costs[tx_size_ctx][pd->plane_type][1][0][coeff_ctx][0]; - if ((RDCOST(x->rdmult, rd_stats_qttx.rate, rd_stats_qttx.dist) >= - RDCOST(x->rdmult, zero_blk_rate, rd_stats_qttx.sse) || - rd_stats_qttx.skip == 1) && - !xd->lossless[mbmi->segment_id]) { -#if CONFIG_RD_DEBUG - av1_update_txb_coeff_cost(&rd_stats_qttx, plane, quarter_txsize, 0, 0, - zero_blk_rate - rd_stats_qttx.rate); -#endif // CONFIG_RD_DEBUG - rd_stats_qttx.rate = zero_blk_rate; - rd_stats_qttx.dist = rd_stats_qttx.sse; - rd_stats_qttx.skip = 1; - x->blk_skip[plane][blk_row * bw + blk_col] = 1; - skip_qttx[0] = 1; - p->eobs[block] = 0; - } else { - x->blk_skip[plane][blk_row * bw + blk_col] = 0; - skip_qttx[0] = 0; - rd_stats->skip = 0; - } - - // Second tx block - av1_tx_block_rd_b(cpi, x, quarter_txsize, blk_row_offset, blk_col_offset, - plane, block_offset_qttx, plane_bsize, pta, ptl, - &rd_stats_tmp); - - if (rd_stats->rate == INT_MAX) return; - -#if !CONFIG_PVQ - av1_set_txb_context(x, plane, 0, quarter_txsize, pta, ptl); -#endif // !CONFIG_PVQ - coeff_ctx = get_entropy_context(quarter_txsize, pta + blk_col_offset, - ptl + blk_row_offset); - zero_blk_rate = - x->token_head_costs[tx_size_ctx][pd->plane_type][1][0][coeff_ctx][0]; - if ((RDCOST(x->rdmult, 
rd_stats_tmp.rate, rd_stats_tmp.dist) >= - RDCOST(x->rdmult, zero_blk_rate, rd_stats_tmp.sse) || - rd_stats_tmp.skip == 1) && - !xd->lossless[mbmi->segment_id]) { -#if CONFIG_RD_DEBUG - av1_update_txb_coeff_cost(&rd_stats_tmp, plane, quarter_txsize, 0, 0, - zero_blk_rate - rd_stats_tmp.rate); -#endif // CONFIG_RD_DEBUG - rd_stats_tmp.rate = zero_blk_rate; - rd_stats_tmp.dist = rd_stats_tmp.sse; - rd_stats_tmp.skip = 1; - x->blk_skip[plane][blk_row_offset * bw + blk_col_offset] = 1; - skip_qttx[1] = 1; - p->eobs[block_offset_qttx] = 0; - } else { - x->blk_skip[plane][blk_row_offset * bw + blk_col_offset] = 0; - skip_qttx[1] = 0; - rd_stats_tmp.skip = 0; - } - - av1_merge_rd_stats(&rd_stats_qttx, &rd_stats_tmp); + if (cpi->sf.txb_split_cap) + if (p->eobs[block] == 0) try_split = 0; + } - if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) { - rd_stats_qttx.rate += - av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0); - } - rd_stats_qttx.rate += - av1_cost_bit(cpi->common.fc->quarter_tx_size_prob, 1); - rd_qttx = RDCOST(x->rdmult, rd_stats_qttx.rate, rd_stats_qttx.dist); -#if CONFIG_LV_MAP - eobs_qttx[0] = p->txb_entropy_ctx[0]; - eobs_qttx[1] = p->txb_entropy_ctx[block_offset_qttx]; -#else - eobs_qttx[0] = p->eobs[0]; - eobs_qttx[1] = p->eobs[block_offset_qttx]; -#endif - if (rd_qttx < this_rd) { - is_qttx_picked = 1; - this_rd = rd_qttx; - rd_stats->rate = rd_stats_qttx.rate; - rd_stats->dist = rd_stats_qttx.dist; - rd_stats->sse = rd_stats_qttx.sse; - rd_stats->skip = rd_stats_qttx.skip; - rd_stats->rdcost = rd_stats_qttx.rdcost; - } - av1_get_entropy_contexts(plane_bsize, 0, pd, ta, tl); + if (x->e_mbd.bd == 8 && !x->cb_partition_scan && try_split) { + const int threshold = cpi->sf.tx_type_search.ml_tx_split_thresh; + if (threshold >= 0) { + const int split_score = + ml_predict_tx_split(x, plane_bsize, blk_row, blk_col, tx_size); + if (split_score >= 0 && split_score < threshold) try_split = 0; } -#endif } - if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH -#if CONFIG_MRC_TX - // If the tx type we are trying is MRC_DCT, we cannot partition the - // transform into anything smaller than TX_32X32 - && mbmi->tx_type != MRC_DCT -#endif // CONFIG_MRC_TX - ) { +#if COLLECT_TX_SIZE_DATA + // Do not skip tx_split when collecting tx size data. 
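Two prunings appear in the hunks above; the adaptive_txb_search_level early-out is worth spelling out. It terminates the search when the no-split cost, reduced by a level-dependent safety margin, still cannot beat the best cost seen so far; a higher level shrinks the margin, so the check fires more often and prunes more aggressively. A minimal sketch of the margin check:

/* Hedged sketch of the adaptive_txb_search_level early termination. */
#include <stdint.h>

static int should_terminate(int64_t no_split_rd, int64_t ref_best_rd,
                            int level) {
  if (level == 0) return 0;  /* feature disabled */
  /* margin = no_split_rd / 2^(1 + level); higher level => thinner margin. */
  const int64_t margin = no_split_rd >> (1 + level);
  return (no_split_rd - margin) > ref_best_rd;
}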
+ try_split = 1; +#endif + + // TX split + int64_t split_rd = INT64_MAX; + RD_STATS split_rd_stats; + av1_init_rd_stats(&split_rd_stats); + if (try_split) { const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; - const int bsl = tx_size_wide_unit[sub_txs]; - int sub_step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs]; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + const int sub_step = bsw * bsh; RD_STATS this_rd_stats; int this_cost_valid = 1; int64_t tmp_rd = 0; #if CONFIG_DIST_8X8 - int sub8x8_eob[4]; + int sub8x8_eob[4] = { 0, 0, 0, 0 }; + struct macroblockd_plane *const pd = &xd->plane[0]; #endif - sum_rd_stats.rate = - av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 1); + split_rd_stats.rate = x->txfm_partition_cost[ctx][1]; assert(tx_size < TX_SIZES_ALL); - ref_best_rd = AOMMIN(this_rd, ref_best_rd); - - for (i = 0; i < 4 && this_cost_valid; ++i) { - int offsetr = blk_row + (i >> 1) * bsl; - int offsetc = blk_col + (i & 0x01) * bsl; - - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + ref_best_rd = AOMMIN(no_split_rd, ref_best_rd); + + int blk_idx = 0; + for (int r = 0; r < tx_size_high_unit[tx_size]; r += bsh) { + for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw, ++blk_idx) { + const int offsetr = blk_row + r; + const int offsetc = blk_col + c; + if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + assert(blk_idx < 4); + select_tx_block( + cpi, x, offsetr, offsetc, block, sub_txs, depth + 1, plane_bsize, + ta, tl, tx_above, tx_left, &this_rd_stats, ref_best_rd - tmp_rd, + &this_cost_valid, ftxs_mode, + (rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL); - select_tx_block(cpi, x, offsetr, offsetc, plane, block, sub_txs, - depth + 1, plane_bsize, ta, tl, tx_above, tx_left, - &this_rd_stats, ref_best_rd - tmp_rd, &this_cost_valid); #if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && plane == 0 && tx_size == TX_8X8) { - sub8x8_eob[i] = p->eobs[block]; - } + if (!x->using_dist_8x8) +#endif + if (!this_cost_valid) goto LOOP_EXIT; +#if CONFIG_DIST_8X8 + if (x->using_dist_8x8 && tx_size == TX_8X8) { + sub8x8_eob[2 * (r / bsh) + (c / bsw)] = p->eobs[block]; + } #endif // CONFIG_DIST_8X8 - av1_merge_rd_stats(&sum_rd_stats, &this_rd_stats); + av1_merge_rd_stats(&split_rd_stats, &this_rd_stats); - tmp_rd = RDCOST(x->rdmult, sum_rd_stats.rate, sum_rd_stats.dist); + tmp_rd = RDCOST(x->rdmult, split_rd_stats.rate, split_rd_stats.dist); #if CONFIG_DIST_8X8 - if (!x->using_dist_8x8) + if (!x->using_dist_8x8) #endif - if (this_rd < tmp_rd) break; - block += sub_step; + if (no_split_rd < tmp_rd) { + this_cost_valid = 0; + goto LOOP_EXIT; + } + block += sub_step; + } } + + LOOP_EXIT : {} + #if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && this_cost_valid && plane == 0 && - tx_size == TX_8X8) { + if (x->using_dist_8x8 && this_cost_valid && tx_size == TX_8X8) { const int src_stride = p->src.stride; const int dst_stride = pd->dst.stride; @@ -4997,34 +4494,33 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, .buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; int64_t dist_8x8; - int qindex = x->qindex; + const int qindex = x->qindex; const int pred_stride = block_size_wide[plane_bsize]; const int pred_idx = (blk_row * pred_stride + blk_col) << tx_size_wide_log2[0]; - int16_t *pred = &pd->pred[pred_idx]; - int j; + const int16_t *pred = &x->pred_luma[pred_idx]; + int i, j; int row, col; -#if CONFIG_HIGHBITDEPTH uint8_t *pred8; DECLARE_ALIGNED(16, 
uint16_t, pred8_16[8 * 8]); -#else - DECLARE_ALIGNED(16, uint8_t, pred8[8 * 8]); -#endif // CONFIG_HIGHBITDEPTH dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, BLOCK_8X8, 8, 8, 8, 8, qindex) * 16; - sum_rd_stats.sse = dist_8x8; -#if CONFIG_HIGHBITDEPTH +#ifdef DEBUG_DIST_8X8 + if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8) + assert(sum_rd_stats.sse == dist_8x8); +#endif // DEBUG_DIST_8X8 + + split_rd_stats.sse = dist_8x8; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) pred8 = CONVERT_TO_BYTEPTR(pred8_16); else pred8 = (uint8_t *)pred8_16; -#endif -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { for (row = 0; row < 2; ++row) { for (col = 0; col < 2; ++col) { @@ -5047,7 +4543,6 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, } } } else { -#endif for (row = 0; row < 2; ++row) { for (col = 0; col < 2; ++col) { int idx = row * 2 + col; @@ -5066,87 +4561,99 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, } } } -#if CONFIG_HIGHBITDEPTH } -#endif // CONFIG_HIGHBITDEPTH dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, pred8, 8, BLOCK_8X8, 8, 8, 8, 8, qindex) * 16; - sum_rd_stats.dist = dist_8x8; - tmp_rd = RDCOST(x->rdmult, sum_rd_stats.rate, sum_rd_stats.dist); - } -#endif // CONFIG_DIST_8X8 - if (this_cost_valid) sum_rd = tmp_rd; - } - if (this_rd < sum_rd) { - int idx, idy; -#if CONFIG_RECT_TX_EXT - TX_SIZE tx_size_selected = is_qttx_picked ? quarter_txsize : tx_size; -#else - TX_SIZE tx_size_selected = tx_size; -#endif +#ifdef DEBUG_DIST_8X8 + if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8) + assert(sum_rd_stats.dist == dist_8x8); +#endif // DEBUG_DIST_8X8 -#if CONFIG_RECT_TX_EXT - if (is_qttx_picked) { - assert(blk_row == 0 && blk_col == 0 && plane == 0); -#if CONFIG_LV_MAP - p->txb_entropy_ctx[0] = eobs_qttx[0]; - p->txb_entropy_ctx[block_offset_qttx] = eobs_qttx[1]; -#else - p->eobs[0] = eobs_qttx[0]; - p->eobs[block_offset_qttx] = eobs_qttx[1]; -#endif - } else { -#endif -#if CONFIG_LV_MAP - p->txb_entropy_ctx[block] = tmp_eob; -#else - p->eobs[block] = tmp_eob; -#endif -#if CONFIG_RECT_TX_EXT + split_rd_stats.dist = dist_8x8; + tmp_rd = RDCOST(x->rdmult, split_rd_stats.rate, split_rd_stats.dist); } -#endif +#endif // CONFIG_DIST_8X8 + if (this_cost_valid) split_rd = tmp_rd; + } -#if !CONFIG_PVQ - av1_set_txb_context(x, plane, block, tx_size_selected, pta, ptl); -#if CONFIG_RECT_TX_EXT - if (is_qttx_picked) - av1_set_txb_context(x, plane, block_offset_qttx, tx_size_selected, - pta + blk_col_offset, ptl + blk_row_offset); -#endif // CONFIG_RECT_TX_EXT -#endif // !CONFIG_PVQ +#if COLLECT_TX_SIZE_DATA + do { + if (tx_size <= TX_4X4 || depth >= MAX_VARTX_DEPTH) break; +#if 0 + // Randomly select blocks to collect data to reduce output file size. + const int rnd_val = rand() % 2; + if (rnd_val) break; +#endif + + const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); + const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); + const int within_border = + mi_row >= xd->tile.mi_row_start && + (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) && + mi_col >= xd->tile.mi_col_start && + (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end); + if (!within_border) break; + + FILE *fp = fopen(av1_tx_size_data_output_file, "a"); + if (!fp) break; + + // Split decision, RD cost, block type(inter/intra), q-index, rdmult, + // and block size. 
+ const int split_selected = sum_rd < this_rd; + const int is_inter = 1; + const int txb_w = tx_size_wide[tx_size]; + const int txb_h = tx_size_high[tx_size]; + fprintf(fp, "%d,%lld,%lld,%d,%d,%d,%d,%d,", split_selected, + (long long)this_rd, (long long)sum_rd, cpi->common.base_qindex, + x->rdmult, is_inter, txb_w, txb_h); + + // Residue signal. + const int diff_stride = block_size_wide[plane_bsize]; + const int16_t *src_diff = + &p->src_diff[(blk_row * diff_stride + blk_col) * 4]; + for (int r = 0; r < txb_h; ++r) { + for (int c = 0; c < txb_w; ++c) { + fprintf(fp, "%d,", src_diff[c]); + } + src_diff += diff_stride; + } + fprintf(fp, "\n"); + + fclose(fp); + } while (0); +#endif // COLLECT_TX_SIZE_DATA + + if (no_split_rd < split_rd) { + const TX_SIZE tx_size_selected = tx_size; + p->txb_entropy_ctx[block] = no_split_txb_entropy_ctx; + av1_set_txb_context(x, 0, block, tx_size_selected, pta, ptl); txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size, tx_size); - inter_tx_size[0][0] = tx_size_selected; - for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy) - for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx) - inter_tx_size[idy][idx] = tx_size_selected; - mbmi->tx_size = tx_size_selected; -#if CONFIG_TXK_SEL - mbmi->txk_type[txk_idx] = best_tx_type; -#endif - if (this_rd == INT64_MAX) *is_cost_valid = 0; -#if CONFIG_RECT_TX_EXT - if (is_qttx_picked) { - x->blk_skip[plane][0] = skip_qttx[0]; - x->blk_skip[plane][blk_row_offset * bw + blk_col_offset] = skip_qttx[1]; - } else { -#endif - x->blk_skip[plane][blk_row * bw + blk_col] = rd_stats->skip; -#if CONFIG_RECT_TX_EXT + for (int idy = 0; idy < tx_size_high_unit[tx_size]; ++idy) { + for (int idx = 0; idx < tx_size_wide_unit[tx_size]; ++idx) { + const int index = + av1_get_txb_size_index(plane_bsize, blk_row + idy, blk_col + idx); + mbmi->inter_tx_size[index] = tx_size_selected; + } } -#endif + mbmi->tx_size = tx_size_selected; + update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, + no_split_tx_type); + x->blk_skip[blk_row * bw + blk_col] = rd_stats->skip; } else { - *rd_stats = sum_rd_stats; - if (sum_rd == INT64_MAX) *is_cost_valid = 0; + *rd_stats = split_rd_stats; + if (split_rd == INT64_MAX) *is_cost_valid = 0; } } -static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, - RD_STATS *rd_stats, BLOCK_SIZE bsize, - int64_t ref_best_rd) { +static void select_inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t ref_best_rd, + FAST_TX_SEARCH_MODE ftxs_mode, + TXB_RD_INFO_NODE *rd_info_tree) { MACROBLOCKD *const xd = &x->e_mbd; int is_cost_valid = 1; int64_t this_rd = 0; @@ -5157,48 +4664,57 @@ static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, if (is_cost_valid) { const struct macroblockd_plane *const pd = &xd->plane[0]; - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); - const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; - const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const int mi_width = mi_size_wide[plane_bsize]; + const int mi_height = mi_size_high[plane_bsize]; const TX_SIZE max_tx_size = max_txsize_rect_lookup[plane_bsize]; const int bh = tx_size_high_unit[max_tx_size]; const int bw = tx_size_wide_unit[max_tx_size]; int idx, idy; int block = 0; - int init_depth = - (mi_height != mi_width) ? 
RECT_VARTX_DEPTH_INIT : SQR_VARTX_DEPTH_INIT; int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; - ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE]; - ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE]; - TXFM_CONTEXT tx_above[MAX_MIB_SIZE * 2]; - TXFM_CONTEXT tx_left[MAX_MIB_SIZE * 2]; + ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE]; + ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE]; + TXFM_CONTEXT tx_above[MAX_MIB_SIZE]; + TXFM_CONTEXT tx_left[MAX_MIB_SIZE]; RD_STATS pn_rd_stats; + const int init_depth = + get_search_init_depth(mi_width, mi_height, 1, &cpi->sf); av1_init_rd_stats(&pn_rd_stats); - av1_get_entropy_contexts(bsize, 0, pd, ctxa, ctxl); + av1_get_entropy_contexts(bsize, pd, ctxa, ctxl); memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width); memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height); for (idy = 0; idy < mi_height; idy += bh) { for (idx = 0; idx < mi_width; idx += bw) { - select_tx_block(cpi, x, idy, idx, 0, block, max_tx_size, init_depth, + select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth, plane_bsize, ctxa, ctxl, tx_above, tx_left, - &pn_rd_stats, ref_best_rd - this_rd, &is_cost_valid); - if (pn_rd_stats.rate == INT_MAX) { + &pn_rd_stats, ref_best_rd - this_rd, &is_cost_valid, + ftxs_mode, rd_info_tree); + if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) { av1_invalid_rd_stats(rd_stats); return; } av1_merge_rd_stats(rd_stats, &pn_rd_stats); - this_rd += AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist), - RDCOST(x->rdmult, 0, pn_rd_stats.sse)); + this_rd += + AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist), + RDCOST(x->rdmult, pn_rd_stats.zero_rate, pn_rd_stats.sse)); block += step; + if (rd_info_tree != NULL) rd_info_tree += 1; } } } - - this_rd = AOMMIN(RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist), - RDCOST(x->rdmult, 0, rd_stats->sse)); + int64_t zero_rd = RDCOST(x->rdmult, rd_stats->zero_rate, rd_stats->sse); + this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (zero_rd < this_rd) { + this_rd = zero_rd; + rd_stats->rate = rd_stats->zero_rate; + rd_stats->dist = rd_stats->sse; + rd_stats->skip = 1; + } if (this_rd > ref_best_rd) is_cost_valid = 0; if (!is_cost_valid) { @@ -5209,541 +4725,711 @@ static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_stats, BLOCK_SIZE bsize, - int64_t ref_best_rd, TX_TYPE tx_type) { - const AV1_COMMON *const cm = &cpi->common; + int64_t ref_best_rd, + TXB_RD_INFO_NODE *rd_info_tree) { + const int fast_tx_search = cpi->sf.tx_size_search_method > USE_FULL_RD; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; const int is_inter = is_inter_block(mbmi); - aom_prob skip_prob = av1_get_skip_prob(cm, xd); - int s0 = av1_cost_bit(skip_prob, 0); - int s1 = av1_cost_bit(skip_prob, 1); + const int skip_ctx = av1_get_skip_context(xd); + int s0 = x->skip_cost[skip_ctx][0]; + int s1 = x->skip_cost[skip_ctx][1]; int64_t rd; - int row, col; - const int max_blocks_high = max_block_high(xd, bsize, 0); - const int max_blocks_wide = max_block_wide(xd, bsize, 0); - mbmi->tx_type = tx_type; - inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd); - mbmi->min_tx_size = get_min_tx_size(mbmi->inter_tx_size[0][0]); + // TODO(debargha): enable this as a speed feature where the + // select_inter_block_yrd() function above will use a simplified search + // such as not using full optimize, but the inter_block_yrd() 
function + // will use more complex search given that the transform partitions have + // already been decided. + + int64_t rd_thresh = ref_best_rd; + if (fast_tx_search && rd_thresh < INT64_MAX) { + if (INT64_MAX - rd_thresh > (rd_thresh >> 3)) rd_thresh += (rd_thresh >> 3); + } + assert(rd_thresh > 0); + FAST_TX_SEARCH_MODE ftxs_mode = + fast_tx_search ? FTXS_DCT_AND_1D_DCT_ONLY : FTXS_NONE; + select_inter_block_yrd(cpi, x, rd_stats, bsize, rd_thresh, ftxs_mode, + rd_info_tree); if (rd_stats->rate == INT_MAX) return INT64_MAX; - for (row = 0; row < max_blocks_high / 2; ++row) - for (col = 0; col < max_blocks_wide / 2; ++col) - mbmi->min_tx_size = AOMMIN( - mbmi->min_tx_size, get_min_tx_size(mbmi->inter_tx_size[row][col])); - -#if !CONFIG_TXK_SEL -#if CONFIG_EXT_TX - if (get_ext_tx_types(mbmi->min_tx_size, bsize, is_inter, - cm->reduced_tx_set_used) > 1 && - !xd->lossless[xd->mi[0]->mbmi.segment_id]) { - const int ext_tx_set = get_ext_tx_set(mbmi->min_tx_size, bsize, is_inter, - cm->reduced_tx_set_used); -#if CONFIG_LGT_FROM_PRED - if (is_lgt_allowed(mbmi->mode, mbmi->min_tx_size)) { - if (LGT_FROM_PRED_INTRA && !is_inter && ext_tx_set > 0 && - ALLOW_INTRA_EXT_TX) - rd_stats->rate += x->intra_lgt_cost[txsize_sqr_map[mbmi->min_tx_size]] - [mbmi->mode][mbmi->use_lgt]; - if (LGT_FROM_PRED_INTER && is_inter && ext_tx_set > 0) - rd_stats->rate += - x->inter_lgt_cost[txsize_sqr_map[mbmi->min_tx_size]][mbmi->use_lgt]; - } - if (!mbmi->use_lgt) { -#endif // CONFIG_LGT_FROM_PRED - if (is_inter) { - if (ext_tx_set > 0) - rd_stats->rate += - x->inter_tx_type_costs[ext_tx_set] - [txsize_sqr_map[mbmi->min_tx_size]] - [mbmi->tx_type]; - } else { - if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX) - rd_stats->rate += - x->intra_tx_type_costs[ext_tx_set][mbmi->min_tx_size][mbmi->mode] - [mbmi->tx_type]; - } - } -#if CONFIG_LGT_FROM_PRED + // If fast_tx_search is true, only DCT and 1D DCT were tested in + // select_inter_block_yrd() above. Do a better search for tx type with + // tx sizes already decided. 
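The threshold handling above deserves a note: when the fast two-pass search is active, the pruning threshold handed to select_inter_block_yrd() is widened by one eighth, with an explicit guard against int64 overflow, so the restricted DCT-only pass does not discard partitions the full tx-type refinement could still turn into a win. A minimal sketch of exactly that relaxation:

/* Hedged sketch: widen the RD pruning threshold by 12.5% for the fast
 * first pass, mirroring the rd_thresh code above. */
#include <stdint.h>

static int64_t relax_threshold(int64_t rd_thresh, int fast_search) {
  if (fast_search && rd_thresh < INT64_MAX) {
    const int64_t slack = rd_thresh >> 3;  /* 1/8 of the threshold */
    if (INT64_MAX - rd_thresh > slack) rd_thresh += slack;  /* no overflow */
  }
  return rd_thresh;
}

The refinement pass over tx types, with tx sizes already fixed, follows in the next hunk.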
+ if (fast_tx_search) { + if (!inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd, FTXS_NONE)) + return INT64_MAX; } -#endif -#else - if (mbmi->min_tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id]) - rd_stats->rate += x->inter_tx_type_costs[mbmi->min_tx_size][mbmi->tx_type]; -#endif // CONFIG_EXT_TX -#endif // CONFIG_TXK_SEL if (rd_stats->skip) rd = RDCOST(x->rdmult, s1, rd_stats->sse); else rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist); - if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] && - !(rd_stats->skip)) + if (is_inter && !xd->lossless[xd->mi[0]->segment_id] && !(rd_stats->skip)) rd = AOMMIN(rd, RDCOST(x->rdmult, s1, rd_stats->sse)); return rd; } -static uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) { - const int rows = block_size_high[bsize]; - const int cols = block_size_wide[bsize]; - const int diff_stride = cols; - const struct macroblock_plane *const p = &x->plane[0]; - const int16_t *diff = &p->src_diff[0]; - uint8_t hash_data[MAX_SB_SQUARE]; - for (int r = 0; r < rows; ++r) { - for (int c = 0; c < cols; ++c) { - hash_data[cols * r + c] = clip_pixel(diff[c] + 128); +// Finds rd cost for a y block, given the transform size partitions +static void tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, + int blk_col, int block, TX_SIZE tx_size, + BLOCK_SIZE plane_bsize, int depth, + ENTROPY_CONTEXT *above_ctx, ENTROPY_CONTEXT *left_ctx, + TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, + int64_t ref_best_rd, RD_STATS *rd_stats, + FAST_TX_SEARCH_MODE ftxs_mode) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int max_blocks_high = max_block_high(xd, plane_bsize, 0); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0); + + assert(tx_size < TX_SIZES_ALL); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + const TX_SIZE plane_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index( + plane_bsize, blk_row, blk_col)]; + + int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row, + mbmi->sb_type, tx_size); + + av1_init_rd_stats(rd_stats); + if (tx_size == plane_tx_size) { + ENTROPY_CONTEXT *ta = above_ctx + blk_col; + ENTROPY_CONTEXT *tl = left_ctx + blk_row; + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, 0, ta, tl, &txb_ctx); + + const int zero_blk_rate = x->coeff_costs[txs_ctx][get_plane_type(0)] + .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; + rd_stats->zero_rate = zero_blk_rate; + rd_stats->ref_rdcost = ref_best_rd; + tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, ta, + tl, rd_stats, ftxs_mode, ref_best_rd, NULL); + const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; + if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= + RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) || + rd_stats->skip == 1) { + rd_stats->rate = zero_blk_rate; + rd_stats->dist = rd_stats->sse; + rd_stats->skip = 1; + x->blk_skip[blk_row * mi_width + blk_col] = 1; + x->plane[0].eobs[block] = 0; + x->plane[0].txb_entropy_ctx[block] = 0; + update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, + DCT_DCT); + } else { + rd_stats->skip = 0; + x->blk_skip[blk_row * mi_width + blk_col] = 0; } - diff += diff_stride; + if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) + rd_stats->rate += x->txfm_partition_cost[ctx][0]; + av1_set_txb_context(x, 0, block, tx_size, ta, tl); + txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size, 
+ tx_size); + } else { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + const int step = bsh * bsw; + RD_STATS pn_rd_stats; + int64_t this_rd = 0; + assert(bsw > 0 && bsh > 0); + + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { + for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { + const int offsetr = blk_row + row; + const int offsetc = blk_col + col; + + if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + + av1_init_rd_stats(&pn_rd_stats); + tx_block_yrd(cpi, x, offsetr, offsetc, block, sub_txs, plane_bsize, + depth + 1, above_ctx, left_ctx, tx_above, tx_left, + ref_best_rd - this_rd, &pn_rd_stats, ftxs_mode); + if (pn_rd_stats.rate == INT_MAX) { + av1_invalid_rd_stats(rd_stats); + return; + } + av1_merge_rd_stats(rd_stats, &pn_rd_stats); + this_rd += RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist); + block += step; + } + } + + if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) + rd_stats->rate += x->txfm_partition_cost[ctx][1]; } - return (av1_get_crc_value(&x->tx_rd_record.crc_calculator, hash_data, - rows * cols) - << 7) + - bsize; +} + +// Return value 0: early termination triggered, no valid rd cost available; +// 1: rd cost values are valid. +static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode) { + MACROBLOCKD *const xd = &x->e_mbd; + int is_cost_valid = 1; + int64_t this_rd = 0; + + if (ref_best_rd < 0) is_cost_valid = 0; + + av1_init_rd_stats(rd_stats); + + if (is_cost_valid) { + const struct macroblockd_plane *const pd = &xd->plane[0]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const int mi_width = mi_size_wide[plane_bsize]; + const int mi_height = mi_size_high[plane_bsize]; + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0); + const int bh = tx_size_high_unit[max_tx_size]; + const int bw = tx_size_wide_unit[max_tx_size]; + const int init_depth = + get_search_init_depth(mi_width, mi_height, 1, &cpi->sf); + int idx, idy; + int block = 0; + int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; + ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE]; + ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE]; + TXFM_CONTEXT tx_above[MAX_MIB_SIZE]; + TXFM_CONTEXT tx_left[MAX_MIB_SIZE]; + RD_STATS pn_rd_stats; + + av1_get_entropy_contexts(bsize, pd, ctxa, ctxl); + memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width); + memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height); + + for (idy = 0; idy < mi_height; idy += bh) { + for (idx = 0; idx < mi_width; idx += bw) { + av1_init_rd_stats(&pn_rd_stats); + tx_block_yrd(cpi, x, idy, idx, block, max_tx_size, plane_bsize, + init_depth, ctxa, ctxl, tx_above, tx_left, + ref_best_rd - this_rd, &pn_rd_stats, ftxs_mode); + if (pn_rd_stats.rate == INT_MAX) { + av1_invalid_rd_stats(rd_stats); + return 0; + } + av1_merge_rd_stats(rd_stats, &pn_rd_stats); + this_rd += + AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist), + RDCOST(x->rdmult, pn_rd_stats.zero_rate, pn_rd_stats.sse)); + block += step; + } + } + } + int64_t zero_rd = RDCOST(x->rdmult, rd_stats->zero_rate, rd_stats->sse); + this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (zero_rd < this_rd) { + this_rd = zero_rd; + rd_stats->rate = rd_stats->zero_rate; + rd_stats->dist = rd_stats->sse; + rd_stats->skip = 
1; + } + if (this_rd > ref_best_rd) is_cost_valid = 0; + + if (!is_cost_valid) { + // reset cost value + av1_invalid_rd_stats(rd_stats); + } + return is_cost_valid; +} + +static INLINE uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) { + const int rows = block_size_high[bsize]; + const int cols = block_size_wide[bsize]; + const int16_t *diff = x->plane[0].src_diff; + const uint32_t hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator, + (uint8_t *)diff, 2 * rows * cols); + return (hash << 5) + bsize; } static void save_tx_rd_info(int n4, uint32_t hash, const MACROBLOCK *const x, const RD_STATS *const rd_stats, - TX_RD_INFO *const tx_rd_info) { + MB_RD_RECORD *tx_rd_record) { + int index; + if (tx_rd_record->num < RD_RECORD_BUFFER_LEN) { + index = + (tx_rd_record->index_start + tx_rd_record->num) % RD_RECORD_BUFFER_LEN; + ++tx_rd_record->num; + } else { + index = tx_rd_record->index_start; + tx_rd_record->index_start = + (tx_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN; + } + MB_RD_INFO *const tx_rd_info = &tx_rd_record->tx_rd_info[index]; const MACROBLOCKD *const xd = &x->e_mbd; - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const MB_MODE_INFO *const mbmi = xd->mi[0]; tx_rd_info->hash_value = hash; - tx_rd_info->tx_type = mbmi->tx_type; tx_rd_info->tx_size = mbmi->tx_size; -#if CONFIG_VAR_TX - tx_rd_info->min_tx_size = mbmi->min_tx_size; - memcpy(tx_rd_info->blk_skip, x->blk_skip[0], + memcpy(tx_rd_info->blk_skip, x->blk_skip, sizeof(tx_rd_info->blk_skip[0]) * n4); - for (int idy = 0; idy < xd->n8_h; ++idy) - for (int idx = 0; idx < xd->n8_w; ++idx) - tx_rd_info->inter_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx]; -#endif // CONFIG_VAR_TX -#if CONFIG_TXK_SEL + av1_copy(tx_rd_info->inter_tx_size, mbmi->inter_tx_size); av1_copy(tx_rd_info->txk_type, mbmi->txk_type); -#endif // CONFIG_TXK_SEL tx_rd_info->rd_stats = *rd_stats; } -static void fetch_tx_rd_info(int n4, const TX_RD_INFO *const tx_rd_info, +static void fetch_tx_rd_info(int n4, const MB_RD_INFO *const tx_rd_info, RD_STATS *const rd_stats, MACROBLOCK *const x) { MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - mbmi->tx_type = tx_rd_info->tx_type; + MB_MODE_INFO *const mbmi = xd->mi[0]; mbmi->tx_size = tx_rd_info->tx_size; -#if CONFIG_VAR_TX - mbmi->min_tx_size = tx_rd_info->min_tx_size; - memcpy(x->blk_skip[0], tx_rd_info->blk_skip, + memcpy(x->blk_skip, tx_rd_info->blk_skip, sizeof(tx_rd_info->blk_skip[0]) * n4); - for (int idy = 0; idy < xd->n8_h; ++idy) - for (int idx = 0; idx < xd->n8_w; ++idx) - mbmi->inter_tx_size[idy][idx] = tx_rd_info->inter_tx_size[idy][idx]; -#endif // CONFIG_VAR_TX -#if CONFIG_TXK_SEL + av1_copy(mbmi->inter_tx_size, tx_rd_info->inter_tx_size); av1_copy(mbmi->txk_type, tx_rd_info->txk_type); -#endif // CONFIG_TXK_SEL *rd_stats = tx_rd_info->rd_stats; } +static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record, + const uint32_t hash) { + // Linear search through the circular buffer to find matching hash. 
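select_inter_block_yrd() and inter_block_yrd() above both end with the same fallback: price the block once as coded and once as fully skipped, and keep the cheaper of the two. A minimal sketch of that comparison, where rd_cost() is a deliberately simplified stand-in for the RDCOST macro:

/* Hedged sketch: whole-block zero-rate fallback, as in the hunks above. */
#include <stdint.h>

typedef struct { int rate, zero_rate, skip; int64_t dist, sse; } stats_t;

/* Simplified stand-in for RDCOST(): lambda * rate + scaled distortion. */
static int64_t rd_cost(int64_t rdmult, int rate, int64_t dist) {
  return rdmult * rate + (dist << 4);
}

static int64_t pick_coded_or_zero(stats_t *s, int64_t rdmult) {
  const int64_t zero_rd = rd_cost(rdmult, s->zero_rate, s->sse);
  const int64_t coded_rd = rd_cost(rdmult, s->rate, s->dist);
  if (zero_rd < coded_rd) {  /* cheaper to signal "no residual" at all */
    s->rate = s->zero_rate;
    s->dist = s->sse;        /* distortion becomes the full SSE */
    s->skip = 1;
    return zero_rd;
  }
  return coded_rd;
}

The hunks that follow flesh out the hash-record plumbing announced in the closing comment above.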
+ int index; + for (int i = cur_record->num - 1; i >= 0; i--) { + index = (cur_record->index_start + i) % TX_SIZE_RD_RECORD_BUFFER_LEN; + if (cur_record->hash_vals[index] == hash) return index; + } + + // If not found - add new RD info into the buffer and return its index + if (cur_record->num < TX_SIZE_RD_RECORD_BUFFER_LEN) { + index = (cur_record->index_start + cur_record->num) % + TX_SIZE_RD_RECORD_BUFFER_LEN; + cur_record->num++; + } else { + index = cur_record->index_start; + cur_record->index_start = + (cur_record->index_start + 1) % TX_SIZE_RD_RECORD_BUFFER_LEN; + } + + cur_record->hash_vals[index] = hash; + av1_zero(cur_record->tx_rd_info[index]); + return index; +} + +// Go through all TX blocks that could be used in TX size search, compute +// residual hash values for them and find matching RD info that stores previous +// RD search results for these TX blocks. The idea is to prevent repeated +// rate/distortion computations that happen because of the combination of +// partition and TX size search. The resulting RD info records are returned in +// the form of a quadtree for easier access in actual TX size search. +static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, + int mi_col, TXB_RD_INFO_NODE *dst_rd_info) { + TXB_RD_RECORD *rd_records_table[4] = { x->txb_rd_record_8X8, + x->txb_rd_record_16X16, + x->txb_rd_record_32X32, + x->txb_rd_record_64X64 }; + const TX_SIZE max_square_tx_size = max_txsize_lookup[bsize]; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + + // Hashing is performed only for square TX sizes larger than TX_4X4 + if (max_square_tx_size < TX_8X8) return 0; + + const int bw_mi = mi_size_wide[bsize]; + const int diff_stride = bw; + const struct macroblock_plane *const p = &x->plane[0]; + const int16_t *diff = &p->src_diff[0]; + + // Coordinates of the top-left corner of current block within the superblock + // measured in pixels: + const int mi_row_in_sb = (mi_row % MAX_MIB_SIZE) << MI_SIZE_LOG2; + const int mi_col_in_sb = (mi_col % MAX_MIB_SIZE) << MI_SIZE_LOG2; + int cur_rd_info_idx = 0; + int cur_tx_depth = 0; + uint8_t parent_idx_buf[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 }; + uint8_t child_idx_buf[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 }; + TX_SIZE cur_tx_size = max_txsize_rect_lookup[bsize]; + while (cur_tx_depth <= MAX_VARTX_DEPTH) { + const int cur_tx_bw = tx_size_wide[cur_tx_size]; + const int cur_tx_bh = tx_size_high[cur_tx_size]; + if (cur_tx_bw < 8 || cur_tx_bh < 8) break; + const TX_SIZE next_tx_size = sub_tx_size_map[cur_tx_size]; + for (int row = 0; row < bh; row += cur_tx_bh) { + for (int col = 0; col < bw; col += cur_tx_bw) { + if (cur_tx_bw != cur_tx_bh) { + // Use dummy nodes for all rectangular transforms within the + // TX size search tree. + dst_rd_info[cur_rd_info_idx].rd_info_array = NULL; + } else { + // Get spatial location of this TX block within the superblock + // (measured in cur_tx_bsize units). 
+ const int row_in_sb = (mi_row_in_sb + row) / cur_tx_bh; + const int col_in_sb = (mi_col_in_sb + col) / cur_tx_bw; + + int16_t hash_data[MAX_SB_SQUARE]; + int16_t *cur_hash_row = hash_data; + const int16_t *cur_diff_row = diff + row * diff_stride + col; + for (int i = 0; i < cur_tx_bh; i++) { + memcpy(cur_hash_row, cur_diff_row, sizeof(*hash_data) * cur_tx_bw); + cur_hash_row += cur_tx_bw; + cur_diff_row += diff_stride; + } + const int hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator, + (uint8_t *)hash_data, + 2 * cur_tx_bw * cur_tx_bh); + + // Find corresponding RD info based on the hash value. + const int rd_record_idx = + row_in_sb * (MAX_MIB_SIZE >> (cur_tx_size + 1 - TX_8X8)) + + col_in_sb; + + int idx = find_tx_size_rd_info( + &rd_records_table[cur_tx_size - TX_8X8][rd_record_idx], hash); + dst_rd_info[cur_rd_info_idx].rd_info_array = + &rd_records_table[cur_tx_size - TX_8X8][rd_record_idx] + .tx_rd_info[idx]; + } + + // Update the output quadtree RD info structure. + av1_zero(dst_rd_info[cur_rd_info_idx].children); + const int this_mi_row = row / MI_SIZE; + const int this_mi_col = col / MI_SIZE; + if (cur_tx_depth > 0) { // Set up child pointers. + const int mi_index = this_mi_row * bw_mi + this_mi_col; + const int child_idx = child_idx_buf[mi_index]; + assert(child_idx < 4); + dst_rd_info[parent_idx_buf[mi_index]].children[child_idx] = + &dst_rd_info[cur_rd_info_idx]; + } + if (cur_tx_depth < MAX_VARTX_DEPTH) { // Set up parent and child idx. + const int tx_bh_mi = cur_tx_bh / MI_SIZE; + const int tx_bw_mi = cur_tx_bw / MI_SIZE; + for (int i = this_mi_row; i < this_mi_row + tx_bh_mi; ++i) { + memset(parent_idx_buf + i * bw_mi + this_mi_col, cur_rd_info_idx, + tx_bw_mi); + } + int child_idx = 0; + const int next_tx_bh_mi = tx_size_wide_unit[next_tx_size]; + const int next_tx_bw_mi = tx_size_wide_unit[next_tx_size]; + for (int i = this_mi_row; i < this_mi_row + tx_bh_mi; + i += next_tx_bh_mi) { + for (int j = this_mi_col; j < this_mi_col + tx_bw_mi; + j += next_tx_bw_mi) { + assert(child_idx < 4); + child_idx_buf[i * bw_mi + j] = child_idx++; + } + } + } + ++cur_rd_info_idx; + } + } + cur_tx_size = next_tx_size; + ++cur_tx_depth; + } + return 1; +} + +// origin_threshold * 128 / 100 +static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = { + { + 64, 64, 64, 70, 60, 60, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 64, 64, 70, 70, 68, 68, + }, + { + 88, 88, 88, 86, 87, 87, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 88, 88, 86, 86, 68, 68, + }, + { + 90, 93, 93, 90, 93, 93, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 90, 90, 90, 90, 74, 74, + }, +}; + +// lookup table for predict_skip_flag +// int max_tx_size = max_txsize_rect_lookup[bsize]; +// if (tx_size_high[max_tx_size] > 16 || tx_size_wide[max_tx_size] > 16) +// max_tx_size = AOMMIN(max_txsize_lookup[bsize], TX_16X16); +static const TX_SIZE max_predict_sf_tx_size[BLOCK_SIZES_ALL] = { + TX_4X4, TX_4X8, TX_8X4, TX_8X8, TX_8X16, TX_16X8, + TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, + TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_4X16, TX_16X4, + TX_8X8, TX_8X8, TX_16X16, TX_16X16, +}; + // Uses simple features on top of DCT coefficients to quickly predict // whether optimal RD decision is to skip encoding the residual. 
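find_tx_size_rd_info() and save_tx_rd_info() in the hunks above share one circular-buffer policy: append while there is room, otherwise overwrite the oldest record and advance the start index. A minimal sketch of that policy, with CAP standing in for the real buffer-length constants (RD_RECORD_BUFFER_LEN, TX_SIZE_RD_RECORD_BUFFER_LEN):

/* Hedged sketch of the shared ring-buffer insertion/eviction policy. */
#define CAP 8  /* stand-in for the real buffer-length constants */

typedef struct {
  int index_start;  /* slot holding the oldest record */
  int num;          /* occupied slots, <= CAP */
} ring_t;

static int ring_push_slot(ring_t *r) {
  int index;
  if (r->num < CAP) {
    index = (r->index_start + r->num) % CAP;  /* append at the tail */
    ++r->num;
  } else {
    index = r->index_start;                   /* evict the oldest record */
    r->index_start = (r->index_start + 1) % CAP;
  }
  return index;  /* caller writes the new record into this slot */
}

The skip predictor announced in the closing comment above follows in the next hunk.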
-static int predict_skip_flag_8bit(const MACROBLOCK *x, BLOCK_SIZE bsize) { - if (bsize > BLOCK_16X16) return 0; - // Tuned for target false-positive rate of 5% for all block sizes: - const uint32_t threshold_table[] = { 50, 50, 50, 55, 47, 47, 53, 22, 22, 37 }; - const struct macroblock_plane *const p = &x->plane[0]; +// The sse value is stored in dist. +static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist, + int reduced_tx_set) { const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; - tran_low_t DCT_coefs[32 * 32]; + const MACROBLOCKD *xd = &x->e_mbd; + const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd); + + *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize); + const int64_t mse = *dist / bw / bh; + // Normalized quantizer takes the transform upscaling factor (8 for tx size + // smaller than 32) into account. + const int16_t normalized_dc_q = dc_q >> 3; + const int64_t mse_thresh = (int64_t)normalized_dc_q * normalized_dc_q / 8; + // Predict not to skip when mse is larger than threshold. + if (mse > mse_thresh) return 0; + + const int max_tx_size = max_predict_sf_tx_size[bsize]; + const int tx_h = tx_size_high[max_tx_size]; + const int tx_w = tx_size_wide[max_tx_size]; + DECLARE_ALIGNED(32, tran_low_t, coefs[32 * 32]); TxfmParam param; param.tx_type = DCT_DCT; -#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX) - param.tx_size = max_txsize_rect_lookup[bsize]; -#else - param.tx_size = max_txsize_lookup[bsize]; -#endif - param.bd = 8; + param.tx_size = max_tx_size; + param.bd = xd->bd; + param.is_hbd = get_bitdepth_data_path_index(xd); param.lossless = 0; - av1_fwd_txfm(p->src_diff, DCT_coefs, bw, ¶m); - - uint32_t dc = (uint32_t)av1_dc_quant(x->qindex, 0, AOM_BITS_8); - uint32_t ac = (uint32_t)av1_ac_quant(x->qindex, 0, AOM_BITS_8); - uint32_t max_quantized_coef = (100 * (uint32_t)abs(DCT_coefs[0])) / dc; - for (int i = 1; i < bw * bh; i++) { - uint32_t cur_quantized_coef = (100 * (uint32_t)abs(DCT_coefs[i])) / ac; - if (cur_quantized_coef > max_quantized_coef) - max_quantized_coef = cur_quantized_coef; + param.tx_set_type = av1_get_ext_tx_set_type( + param.tx_size, is_inter_block(xd->mi[0]), reduced_tx_set); + const int bd_idx = (xd->bd == 8) ? 0 : ((xd->bd == 10) ? 1 : 2); + const uint32_t max_qcoef_thresh = skip_pred_threshold[bd_idx][bsize]; + const int16_t *src_diff = x->plane[0].src_diff; + const int n_coeff = tx_w * tx_h; + const int16_t ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd); + const uint32_t dc_thresh = max_qcoef_thresh * dc_q; + const uint32_t ac_thresh = max_qcoef_thresh * ac_q; + for (int row = 0; row < bh; row += tx_h) { + for (int col = 0; col < bw; col += tx_w) { + av1_fwd_txfm(src_diff + col, coefs, bw, ¶m); + // Operating on TX domain, not pixels; we want the QTX quantizers + const uint32_t dc_coef = (((uint32_t)abs(coefs[0])) << 7); + if (dc_coef >= dc_thresh) return 0; + for (int i = 1; i < n_coeff; ++i) { + const uint32_t ac_coef = (((uint32_t)abs(coefs[i])) << 7); + if (ac_coef >= ac_thresh) return 0; + } + } + src_diff += tx_h * bw; } - - return max_quantized_coef < threshold_table[AOMMAX(bsize - BLOCK_4X4, 0)]; + return 1; } // Used to set proper context for early termination with skip = 1. 
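predict_skip_flag() above is a two-stage test: a cheap mean-squared-error gate against the downscaled DC quantizer, then a forward transform whose coefficients must all stay below threshold * quantizer. The tables are stored as origin_threshold * 128 / 100 (per the comment above them), which is why coefficient magnitudes are scaled by 128 (<< 7) before the comparison. A simplified sketch of both stages, assuming plain int32 coefficients:

/* Hedged sketch of predict_skip_flag()'s two-stage test. */
#include <stdint.h>

/* Stage 1: returns 1 when the MSE is small enough that skipping the
 * residual remains plausible. */
static int mse_gate(int64_t sse, int bw, int bh, int16_t dc_q) {
  const int64_t mse = sse / bw / bh;
  const int16_t ndq = dc_q >> 3;  /* undo the 8x transform upscaling */
  return mse <= (int64_t)ndq * ndq / 8;
}

/* Stage 2: every |coef| << 7 must stay under threshold * quantizer. */
static int coefs_below_thresh(const int32_t *coefs, int n, uint32_t thresh,
                              int16_t dc_q, int16_t ac_q) {
  for (int i = 0; i < n; ++i) {
    const uint32_t mag = (uint32_t)(coefs[i] < 0 ? -coefs[i] : coefs[i]);
    const uint32_t q = (uint32_t)(i == 0 ? dc_q : ac_q);
    if ((mag << 7) >= thresh * q) return 0;  /* too big: do not skip */
  }
  return 1;  /* all coefficients would quantize near zero: predict skip */
}

set_skip_flag(), which follows, is the consumer of a positive prediction.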
-static void set_skip_flag(const AV1_COMP *cpi, MACROBLOCK *x, - RD_STATS *rd_stats, int bsize) { +static void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats, int bsize, + int64_t dist) { MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; const int n4 = bsize_to_num_blk(bsize); -#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX) const TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; -#else - const TX_SIZE tx_size = max_txsize_lookup[bsize]; -#endif - mbmi->tx_type = DCT_DCT; - for (int idy = 0; idy < xd->n8_h; ++idy) - for (int idx = 0; idx < xd->n8_w; ++idx) - mbmi->inter_tx_size[idy][idx] = tx_size; + memset(mbmi->txk_type, DCT_DCT, sizeof(mbmi->txk_type[0]) * TXK_TYPE_BUF_LEN); + memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size)); mbmi->tx_size = tx_size; - mbmi->min_tx_size = get_min_tx_size(tx_size); - memset(x->blk_skip[0], 1, sizeof(uint8_t) * n4); + memset(x->blk_skip, 1, sizeof(x->blk_skip[0]) * n4); rd_stats->skip = 1; // Rate. - const int tx_size_ctx = txsize_sqr_map[tx_size]; - ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE]; - ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE]; - av1_get_entropy_contexts(bsize, 0, &xd->plane[0], ctxa, ctxl); - int coeff_ctx = get_entropy_context(tx_size, ctxa, ctxl); - int rate = x->token_head_costs[tx_size_ctx][PLANE_TYPE_Y][1][0][coeff_ctx][0]; + const int tx_size_ctx = get_txsize_entropy_ctx(tx_size); + ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE]; + ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE]; + av1_get_entropy_contexts(bsize, &xd->plane[0], ctxa, ctxl); + TXB_CTX txb_ctx; + // Because plane is 0, plane_bsize equal to bsize + get_txb_ctx(bsize, tx_size, 0, ctxa, ctxl, &txb_ctx); + int rate = x->coeff_costs[tx_size_ctx][PLANE_TYPE_Y] + .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; if (tx_size > TX_4X4) { int ctx = txfm_partition_context( xd->above_txfm_context, xd->left_txfm_context, mbmi->sb_type, tx_size); - rate += av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0); + rate += x->txfm_partition_cost[ctx][0]; } -#if !CONFIG_TXK_SEL -#if CONFIG_EXT_TX - const AV1_COMMON *cm = &cpi->common; - const int ext_tx_set = get_ext_tx_set(max_txsize_lookup[bsize], bsize, 1, - cm->reduced_tx_set_used); - if (get_ext_tx_types(mbmi->min_tx_size, bsize, 1, cm->reduced_tx_set_used) > - 1 && - !xd->lossless[xd->mi[0]->mbmi.segment_id]) { - if (ext_tx_set > 0) - rate += - x->inter_tx_type_costs[ext_tx_set][txsize_sqr_map[mbmi->min_tx_size]] - [mbmi->tx_type]; - } -#else - if (mbmi->min_tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id]) - rd_stats->rate += x->inter_tx_type_costs[mbmi->min_tx_size][mbmi->tx_type]; -#endif // CONFIG_EXT_TX -#endif // CONFIG_TXK_SEL rd_stats->rate = rate; - - // Distortion. 
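One detail worth noting before the distortion lines below: with a high-bit-depth source, pixel differences carry (bd - 8) extra bits, so the squared error is rescaled down by 2*(bd - 8) bits to stay comparable with 8-bit costs. A minimal sketch of that rounded rescale, mirroring the ROUND_POWER_OF_TWO usage:

/* Hedged sketch: normalize high-bit-depth SSE back to 8-bit scale. */
static long long normalize_hbd_sse(long long sse, int bd) {
  const int shift = (bd - 8) * 2;                 /* 0, 4 or 8 bits */
  return (sse + ((1LL << shift) >> 1)) >> shift;  /* rounded right shift */
}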
-  int64_t tmp = pixel_diff_dist(x, 0, x->plane[0].src_diff,
-                                block_size_wide[bsize], 0, 0, bsize, bsize);
-#if CONFIG_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-    tmp = ROUND_POWER_OF_TWO(tmp, (xd->bd - 8) * 2);
-#endif  // CONFIG_HIGHBITDEPTH
-  rd_stats->dist = rd_stats->sse = (tmp << 4);
+    dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2);
+  rd_stats->dist = rd_stats->sse = (dist << 4);
 }
 
 static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
-                               RD_STATS *rd_stats, BLOCK_SIZE bsize,
-                               int64_t ref_best_rd) {
+                               RD_STATS *rd_stats, BLOCK_SIZE bsize, int mi_row,
+                               int mi_col, int64_t ref_best_rd) {
   const AV1_COMMON *cm = &cpi->common;
-  const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   int64_t rd = INT64_MAX;
   int64_t best_rd = INT64_MAX;
-  TX_TYPE tx_type, best_tx_type = DCT_DCT;
   const int is_inter = is_inter_block(mbmi);
-  TX_SIZE best_tx_size[MAX_MIB_SIZE][MAX_MIB_SIZE];
-  TX_SIZE best_tx = max_txsize_lookup[bsize];
-  TX_SIZE best_min_tx_size = TX_SIZES_ALL;
-  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8];
-  TX_TYPE txk_start = DCT_DCT;
-#if CONFIG_TXK_SEL
-  TX_TYPE txk_end = DCT_DCT + 1;
-#else
-  TX_TYPE txk_end = TX_TYPES;
-#endif
   const int n4 = bsize_to_num_blk(bsize);
-  int idx, idy;
-  int prune = 0;
-#if CONFIG_EXT_TX
-  const TxSetType tx_set_type = get_ext_tx_set_type(
-      max_tx_size, bsize, is_inter, cm->reduced_tx_set_used);
-  const int ext_tx_set =
-      get_ext_tx_set(max_tx_size, bsize, is_inter, cm->reduced_tx_set_used);
-#endif  // CONFIG_EXT_TX
+  // Get the tx_size 1 level down
+  const TX_SIZE min_tx_size = sub_tx_size_map[max_txsize_rect_lookup[bsize]];
+  const TxSetType tx_set_type =
+      av1_get_ext_tx_set_type(min_tx_size, is_inter, cm->reduced_tx_set_used);
+  const int within_border =
+      mi_row >= xd->tile.mi_row_start &&
+      (mi_row + mi_size_high[bsize] < xd->tile.mi_row_end) &&
+      mi_col >= xd->tile.mi_col_start &&
+      (mi_col + mi_size_wide[bsize] < xd->tile.mi_col_end);
   av1_invalid_rd_stats(rd_stats);
-#if CONFIG_LGT_FROM_PRED
-  mbmi->use_lgt = 0;
-  int search_lgt = is_inter
-                       ? LGT_FROM_PRED_INTER &&
-                             (!cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
-                       : LGT_FROM_PRED_INTRA && ALLOW_INTRA_EXT_TX;
-#endif  // CONFIG_LGT_FROM_PRED
+  if (cpi->sf.model_based_prune_tx_search_level && ref_best_rd != INT64_MAX) {
+    int model_rate;
+    int64_t model_dist;
+    int model_skip;
+    model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &model_rate, &model_dist,
+                    &model_skip, NULL, NULL, NULL, NULL);
+    const int64_t model_rd = RDCOST(x->rdmult, model_rate, model_dist);
+    // If the modeled rd is a lot worse than the best so far, break out.
+    // TODO(debargha, urvang): Improve the model and make the check below
+    // tighter.
+    assert(cpi->sf.model_based_prune_tx_search_level >= 0 &&
+           cpi->sf.model_based_prune_tx_search_level <= 2);
+    if (!model_skip &&
+        model_rd / (5 - cpi->sf.model_based_prune_tx_search_level) >
+            ref_best_rd)
+      return;
+  }
 
   const uint32_t hash = get_block_residue_hash(x, bsize);
-  TX_RD_RECORD *tx_rd_record = &x->tx_rd_record;
+  MB_RD_RECORD *mb_rd_record = &x->mb_rd_record;
 
-  if (ref_best_rd != INT64_MAX) {
-    for (int i = 0; i < tx_rd_record->num; ++i) {
-      const int index = (tx_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN;
+  if (ref_best_rd != INT64_MAX && within_border && cpi->sf.use_mb_rd_hash) {
+    for (int i = 0; i < mb_rd_record->num; ++i) {
+      const int index = (mb_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN;
       // If there is a match in the tx_rd_record, fetch the RD decision and
       // terminate early.
-      if (tx_rd_record->tx_rd_info[index].hash_value == hash) {
-        TX_RD_INFO *tx_rd_info = &tx_rd_record->tx_rd_info[index];
+      if (mb_rd_record->tx_rd_info[index].hash_value == hash) {
+        MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[index];
         fetch_tx_rd_info(n4, tx_rd_info, rd_stats, x);
         return;
       }
    }
  }
 
-// If we predict that skip is the optimal RD decision - set the respective
-// context and terminate early.
-#if CONFIG_HIGHBITDEPTH
-  if (!(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH))
-#endif  // CONFIG_HIGHBITDEPTH
-  {
-    if (is_inter && cpi->sf.tx_type_search.use_skip_flag_prediction &&
-        predict_skip_flag_8bit(x, bsize)) {
-      set_skip_flag(cpi, x, rd_stats, bsize);
-      return;
-    }
+  // If we predict that skip is the optimal RD decision - set the respective
+  // context and terminate early.
+  int64_t dist;
+  if (is_inter && cpi->sf.tx_type_search.use_skip_flag_prediction &&
+      predict_skip_flag(x, bsize, &dist, cm->reduced_tx_set_used)) {
+    set_skip_flag(x, rd_stats, bsize, dist);
+    // Save the RD search results into tx_rd_record.
+    if (within_border) save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
+    return;
   }
 
-  if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
-#if CONFIG_EXT_TX
-    prune = prune_tx_types(cpi, bsize, x, xd, ext_tx_set);
-#else
-    prune = prune_tx_types(cpi, bsize, x, xd, 0);
-#endif  // CONFIG_EXT_TX
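Editor's note: the hash lookup above scans a small ring buffer keyed by a residual hash. A self-contained sketch of that structure, with an illustrative payload (the real MB_RD_INFO caches the full rate/distortion and transform decisions):

#include <stddef.h>
#include <stdint.h>

#define RD_RECORD_BUFFER_LEN 8

typedef struct {
  uint32_t hash_value;
  int rate;     // illustrative payload only
  int64_t dist;
} RdInfo;

typedef struct {
  RdInfo info[RD_RECORD_BUFFER_LEN];
  int index_start;  // index of the oldest entry
  int num;          // number of valid entries
} RdRecord;

// Linear scan over the valid window; the buffer is tiny, so this is cheap.
static RdInfo *rd_record_find(RdRecord *rec, uint32_t hash) {
  for (int i = 0; i < rec->num; ++i) {
    const int idx = (rec->index_start + i) % RD_RECORD_BUFFER_LEN;
    if (rec->info[idx].hash_value == hash) return &rec->info[idx];
  }
  return NULL;
}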
+  // Precompute residual hashes and find existing or add new RD records to
+  // store and reuse rate and distortion values to speed up TX size search.
+  TXB_RD_INFO_NODE matched_rd_info[16 + 64 + 256];
+  int found_rd_info = 0;
+  if (ref_best_rd != INT64_MAX && within_border && cpi->sf.use_inter_txb_hash) {
+    found_rd_info =
+        find_tx_size_rd_records(x, bsize, mi_row, mi_col, matched_rd_info);
+  }
+
+  prune_tx(cpi, bsize, x, xd, tx_set_type);
 
   int found = 0;
 
-  for (tx_type = txk_start; tx_type < txk_end; ++tx_type) {
-    RD_STATS this_rd_stats;
-    av1_init_rd_stats(&this_rd_stats);
-#if CONFIG_MRC_TX
-    // MRC_DCT only implemented for TX_32X32 so only include this tx in
-    // the search for TX_32X32
-    if (tx_type == MRC_DCT &&
-        (max_tx_size != TX_32X32 || (is_inter && !USE_MRC_INTER) ||
-         (!is_inter && !USE_MRC_INTRA)))
-      continue;
-#endif  // CONFIG_MRC_TX
-#if CONFIG_EXT_TX
-    if (!av1_ext_tx_used[tx_set_type][tx_type]) continue;
-    if (is_inter) {
-      if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
-        if (!do_tx_type_search(tx_type, prune)) continue;
-      }
-    } else {
-      if (!ALLOW_INTRA_EXT_TX && bsize >= BLOCK_8X8) {
-        if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) continue;
-      }
-    }
-#else   // CONFIG_EXT_TX
-    if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
-        !do_tx_type_search(tx_type, prune))
-      continue;
-#endif  // CONFIG_EXT_TX
-    if (is_inter && x->use_default_inter_tx_type &&
-        tx_type != get_default_tx_type(0, xd, 0, max_tx_size))
-      continue;
+  RD_STATS this_rd_stats;
+  av1_init_rd_stats(&this_rd_stats);
 
-    if (xd->lossless[mbmi->segment_id])
-      if (tx_type != DCT_DCT) continue;
+  rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd,
+                               found_rd_info ? matched_rd_info : NULL);
 
-    rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd,
-                                 tx_type);
-    ref_best_rd = AOMMIN(rd, ref_best_rd);
-    if (rd < best_rd) {
-      best_rd = rd;
-      *rd_stats = this_rd_stats;
-      best_tx_type = mbmi->tx_type;
-      best_tx = mbmi->tx_size;
-      best_min_tx_size = mbmi->min_tx_size;
-      memcpy(best_blk_skip, x->blk_skip[0], sizeof(best_blk_skip[0]) * n4);
-      found = 1;
-      for (idy = 0; idy < xd->n8_h; ++idy)
-        for (idx = 0; idx < xd->n8_w; ++idx)
-          best_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx];
-    }
+  ref_best_rd = AOMMIN(rd, ref_best_rd);
+  if (rd < best_rd) {
+    *rd_stats = this_rd_stats;
+    found = 1;
   }
 
+  // Reset the pruning flags.
+  av1_zero(x->tx_search_prune);
+  x->tx_split_prune_flag = 0;
+
   // We should always find at least one candidate unless ref_best_rd is less
   // than INT64_MAX (in which case, all the calls to select_tx_size_fix_type
   // might have failed to find something better)
   assert(IMPLIES(!found, ref_best_rd != INT64_MAX));
   if (!found) return;
 
-#if CONFIG_LGT_FROM_PRED
-  if (search_lgt && is_lgt_allowed(mbmi->mode, max_tx_size) &&
-      !cm->reduced_tx_set_used) {
-    RD_STATS this_rd_stats;
-    mbmi->use_lgt = 1;
-    rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd, 0);
-    if (rd < best_rd) {
-      best_rd = rd;
-      *rd_stats = this_rd_stats;
-      best_tx = mbmi->tx_size;
-      best_min_tx_size = mbmi->min_tx_size;
-      memcpy(best_blk_skip, x->blk_skip[0], sizeof(best_blk_skip[0]) * n4);
-      for (idy = 0; idy < xd->n8_h; ++idy)
-        for (idx = 0; idx < xd->n8_w; ++idx)
-          best_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx];
-    } else {
-      mbmi->use_lgt = 0;
-    }
-  }
-#endif  // CONFIG_LGT_FROM_PRED
-
-  // We found a candidate transform to use. Copy our results from the "best"
-  // array into mbmi.
-  mbmi->tx_type = best_tx_type;
-  for (idy = 0; idy < xd->n8_h; ++idy)
-    for (idx = 0; idx < xd->n8_w; ++idx)
-      mbmi->inter_tx_size[idy][idx] = best_tx_size[idy][idx];
-  mbmi->tx_size = best_tx;
-  mbmi->min_tx_size = best_min_tx_size;
-  memcpy(x->blk_skip[0], best_blk_skip, sizeof(best_blk_skip[0]) * n4);
-
-  // Save the RD search results into tx_rd_record.
-  int index;
-  if (tx_rd_record->num < RD_RECORD_BUFFER_LEN) {
-    index =
-        (tx_rd_record->index_start + tx_rd_record->num) % RD_RECORD_BUFFER_LEN;
-    ++tx_rd_record->num;
-  } else {
-    index = tx_rd_record->index_start;
-    tx_rd_record->index_start =
-        (tx_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN;
-  }
-  save_tx_rd_info(n4, hash, x, rd_stats, &tx_rd_record->tx_rd_info[index]);
+  // Save the RD search results into tx_rd_record.
+  if (within_border && cpi->sf.use_mb_rd_hash)
+    save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
 }
 
-static void tx_block_rd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
-                        int blk_col, int plane, int block, TX_SIZE tx_size,
-                        BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *above_ctx,
-                        ENTROPY_CONTEXT *left_ctx, RD_STATS *rd_stats) {
+static void tx_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
+                          int blk_col, int plane, int block, TX_SIZE tx_size,
+                          BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *above_ctx,
+                          ENTROPY_CONTEXT *left_ctx, RD_STATS *rd_stats,
+                          FAST_TX_SEARCH_MODE ftxs_mode) {
+  assert(plane > 0);
+  assert(tx_size < TX_SIZES_ALL);
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
-  const int tx_row = blk_row >> (1 - pd->subsampling_y);
-  const int tx_col = blk_col >> (1 - pd->subsampling_x);
-  TX_SIZE plane_tx_size;
   const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
   const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
-
-  assert(tx_size < TX_SIZES_ALL);
-
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
-  plane_tx_size =
-      plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0]
-            : mbmi->inter_tx_size[tx_row][tx_col];
-
-  if (tx_size == plane_tx_size) {
-    ENTROPY_CONTEXT *ta = above_ctx + blk_col;
-    ENTROPY_CONTEXT *tl = left_ctx + blk_row;
-    av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
-                      plane_bsize, ta, tl, rd_stats);
-#if !CONFIG_PVQ
-    av1_set_txb_context(x, plane, block, tx_size, ta, tl);
-#endif  // !CONFIG_PVQ
-  } else {
-    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
-    const int bsl = tx_size_wide_unit[sub_txs];
-    int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
-    int i;
-
-    assert(bsl > 0);
-
-    for (i = 0; i < 4; ++i) {
-      int offsetr = blk_row + (i >> 1) * bsl;
-      int offsetc = blk_col + (i & 0x01) * bsl;
-
-      if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
-
-      tx_block_rd(cpi, x, offsetr, offsetc, plane, block, sub_txs, plane_bsize,
-                  above_ctx, left_ctx, rd_stats);
-      block += step;
-    }
-  }
+  ENTROPY_CONTEXT *ta = above_ctx + blk_col;
+  ENTROPY_CONTEXT *tl = left_ctx + blk_row;
+  tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block, plane_bsize,
+                ta, tl, rd_stats, ftxs_mode, INT64_MAX, NULL);
+  av1_set_txb_context(x, plane, block, tx_size, ta, tl);
 }
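Editor's note: the removed bookkeeping at the top of this hunk spelled out the eviction policy that save_tx_rd_info now hides: append while there is room, otherwise overwrite the oldest entry and advance the start index. As a sketch against the RdRecord type introduced above:

static RdInfo *rd_record_slot_for_insert(RdRecord *rec) {
  int index;
  if (rec->num < RD_RECORD_BUFFER_LEN) {
    index = (rec->index_start + rec->num) % RD_RECORD_BUFFER_LEN;
    ++rec->num;
  } else {
    index = rec->index_start;  // evict the oldest record
    rec->index_start = (rec->index_start + 1) % RD_RECORD_BUFFER_LEN;
  }
  return &rec->info[index];
}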
 
 // Return value 0: early termination triggered, no valid rd cost available;
 //              1: rd cost values are valid.
 static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
                             RD_STATS *rd_stats, BLOCK_SIZE bsize,
-                            int64_t ref_best_rd) {
+                            int64_t ref_best_rd,
+                            FAST_TX_SEARCH_MODE ftxs_mode) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   int plane;
   int is_cost_valid = 1;
-  int64_t this_rd;
+  int64_t this_rd = 0;
 
   if (ref_best_rd < 0) is_cost_valid = 0;
 
   av1_init_rd_stats(rd_stats);
 
-#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
   if (x->skip_chroma_rd) return is_cost_valid;
-  bsize = scale_chroma_bsize(mbmi->sb_type, xd->plane[1].subsampling_x,
-                             xd->plane[1].subsampling_y);
-#endif  // CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
-
-#if CONFIG_EXT_TX && CONFIG_RECT_TX
-  if (is_rect_tx(mbmi->tx_size)) {
-    return super_block_uvrd(cpi, x, rd_stats, bsize, ref_best_rd);
-  }
-#endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
+  const BLOCK_SIZE bsizec = scale_chroma_bsize(
+      bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
 
   if (is_inter_block(mbmi) && is_cost_valid) {
     for (plane = 1; plane < MAX_MB_PLANE; ++plane)
-      av1_subtract_plane(x, bsize, plane);
+      av1_subtract_plane(x, bsizec, plane);
   }
 
-  for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
-    const struct macroblockd_plane *const pd = &xd->plane[plane];
-    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
-    const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
-    const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0];
-    const TX_SIZE max_tx_size = max_txsize_rect_lookup[plane_bsize];
-    const int bh = tx_size_high_unit[max_tx_size];
-    const int bw = tx_size_wide_unit[max_tx_size];
-    int idx, idy;
-    int block = 0;
-    const int step = bh * bw;
-    ENTROPY_CONTEXT ta[2 * MAX_MIB_SIZE];
-    ENTROPY_CONTEXT tl[2 * MAX_MIB_SIZE];
-    RD_STATS pn_rd_stats;
-    av1_init_rd_stats(&pn_rd_stats);
-
-    av1_get_entropy_contexts(bsize, 0, pd, ta, tl);
-
-    for (idy = 0; idy < mi_height; idy += bh) {
-      for (idx = 0; idx < mi_width; idx += bw) {
-        tx_block_rd(cpi, x, idy, idx, plane, block, max_tx_size, plane_bsize,
-                    ta, tl, &pn_rd_stats);
-        block += step;
+  if (is_cost_valid) {
+    for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const BLOCK_SIZE plane_bsize =
+          get_plane_block_size(bsizec, pd->subsampling_x, pd->subsampling_y);
+      const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+      const int mi_height =
+          block_size_high[plane_bsize] >> tx_size_high_log2[0];
+      const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
+      const int bh = tx_size_high_unit[max_tx_size];
+      const int bw = tx_size_wide_unit[max_tx_size];
+      int idx, idy;
+      int block = 0;
+      const int step = bh * bw;
+      ENTROPY_CONTEXT ta[MAX_MIB_SIZE];
+      ENTROPY_CONTEXT tl[MAX_MIB_SIZE];
+      RD_STATS pn_rd_stats;
+      av1_init_rd_stats(&pn_rd_stats);
+      av1_get_entropy_contexts(bsizec, pd, ta, tl);
+
+      for (idy = 0; idy < mi_height; idy += bh) {
+        for (idx = 0; idx < mi_width; idx += bw) {
+          tx_block_uvrd(cpi, x, idy, idx, plane, block, max_tx_size,
+                        plane_bsize, ta, tl, &pn_rd_stats, ftxs_mode);
+          block += step;
+        }
       }
-    }
 
-    if (pn_rd_stats.rate == INT_MAX) {
-      is_cost_valid = 0;
-      break;
-    }
+      if (pn_rd_stats.rate == INT_MAX) {
+        is_cost_valid = 0;
+        break;
+      }
 
-    av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+      av1_merge_rd_stats(rd_stats, &pn_rd_stats);
 
-    this_rd = AOMMIN(RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist),
-                     RDCOST(x->rdmult, 0, rd_stats->sse));
+      this_rd = AOMMIN(RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist),
+                       RDCOST(x->rdmult, rd_stats->zero_rate, rd_stats->sse));
 
-    if (this_rd > ref_best_rd) {
-      is_cost_valid = 0;
-      break;
+      if (this_rd > ref_best_rd) {
+        is_cost_valid = 0;
+        break;
+      }
     }
   }
 
@@ -5754,7 +5440,6 @@ static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
 
   return is_cost_valid;
 }
-#endif  // CONFIG_VAR_TX
 
 static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
                                        int dc_mode_cost,
@@ -5764,11 +5449,12 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
                                        int *rate_tokenonly, int64_t *distortion,
                                        int *skippable) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   assert(!is_inter_block(mbmi));
+  assert(
+      av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type));
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
   const BLOCK_SIZE bsize = mbmi->sb_type;
-  assert(bsize >= BLOCK_8X8);
   int this_rate;
   int64_t this_rd;
   int colors_u, colors_v, colors;
@@ -5780,42 +5466,32 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
   int plane_block_width, plane_block_height, rows, cols;
   av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
                            &plane_block_height, &rows, &cols);
-  if (rows * cols > PALETTE_MAX_BLOCK_SIZE) return;
 
   mbmi->uv_mode = UV_DC_PRED;
-#if CONFIG_FILTER_INTRA
-  mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
-#endif  // CONFIG_FILTER_INTRA
 
-#if CONFIG_HIGHBITDEPTH
+  int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
   if (cpi->common.use_highbitdepth) {
     colors_u = av1_count_colors_highbd(src_u, src_stride, rows, cols,
-                                       cpi->common.bit_depth);
+                                       cpi->common.bit_depth, count_buf);
     colors_v = av1_count_colors_highbd(src_v, src_stride, rows, cols,
-                                       cpi->common.bit_depth);
+                                       cpi->common.bit_depth, count_buf);
   } else {
-#endif  // CONFIG_HIGHBITDEPTH
-    colors_u = av1_count_colors(src_u, src_stride, rows, cols);
-    colors_v = av1_count_colors(src_v, src_stride, rows, cols);
-#if CONFIG_HIGHBITDEPTH
+    colors_u = av1_count_colors(src_u, src_stride, rows, cols, count_buf);
+    colors_v = av1_count_colors(src_v, src_stride, rows, cols, count_buf);
   }
-#endif  // CONFIG_HIGHBITDEPTH
 
-#if CONFIG_PALETTE_DELTA_ENCODING
   uint16_t color_cache[2 * PALETTE_MAX_SIZE];
   const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
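Editor's note: av1_count_colors (and its high-bit-depth variant) now takes a caller-provided count buffer, which is why count_buf is declared above. The underlying screen is a plain histogram; a sketch for the 8-bit case, under the assumption that the library routine behaves equivalently:

#include <stdint.h>
#include <string.h>

static int count_colors_8bit(const uint8_t *src, int stride, int rows,
                             int cols, int *count_buf /* >= 256 entries */) {
  memset(count_buf, 0, 256 * sizeof(*count_buf));
  for (int r = 0; r < rows; ++r)
    for (int c = 0; c < cols; ++c) ++count_buf[src[r * stride + c]];
  int n = 0;
  for (int v = 0; v < 256; ++v) n += (count_buf[v] > 0);
  return n;
}

A low count means the block is palette-friendly; the encoder only tries palette sizes up to the number of distinct colors found.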
   colors = colors_u > colors_v ? colors_u : colors_v;
 
   if (colors > 1 && colors <= 64) {
     int r, c, n, i, j;
     const int max_itr = 50;
-    float lb_u, ub_u, val_u;
-    float lb_v, ub_v, val_v;
-    float *const data = x->palette_buffer->kmeans_data_buf;
-    float centroids[2 * PALETTE_MAX_SIZE];
+    int lb_u, ub_u, val_u;
+    int lb_v, ub_v, val_v;
+    int *const data = x->palette_buffer->kmeans_data_buf;
+    int centroids[2 * PALETTE_MAX_SIZE];
 
-#if CONFIG_HIGHBITDEPTH
     uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u);
     uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v);
     if (cpi->common.use_highbitdepth) {
@@ -5824,32 +5500,25 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
       lb_v = src_v16[0];
       ub_v = src_v16[0];
     } else {
-#endif  // CONFIG_HIGHBITDEPTH
       lb_u = src_u[0];
       ub_u = src_u[0];
       lb_v = src_v[0];
       ub_v = src_v[0];
-#if CONFIG_HIGHBITDEPTH
     }
-#endif  // CONFIG_HIGHBITDEPTH
 
     for (r = 0; r < rows; ++r) {
       for (c = 0; c < cols; ++c) {
-#if CONFIG_HIGHBITDEPTH
         if (cpi->common.use_highbitdepth) {
           val_u = src_u16[r * src_stride + c];
           val_v = src_v16[r * src_stride + c];
           data[(r * cols + c) * 2] = val_u;
           data[(r * cols + c) * 2 + 1] = val_v;
         } else {
-#endif  // CONFIG_HIGHBITDEPTH
           val_u = src_u[r * src_stride + c];
           val_v = src_v[r * src_stride + c];
          data[(r * cols + c) * 2] = val_u;
          data[(r * cols + c) * 2 + 1] = val_v;
-#if CONFIG_HIGHBITDEPTH
         }
-#endif  // CONFIG_HIGHBITDEPTH
         if (val_u < lb_u)
           lb_u = val_u;
         else if (val_u > ub_u)
@@ -5868,34 +5537,30 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
       centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2;
     }
     av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr);
-#if CONFIG_PALETTE_DELTA_ENCODING
     optimize_palette_colors(color_cache, n_cache, n, 2, centroids);
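Editor's note: av1_k_means clusters the interleaved (U, V) samples prepared above; note this hunk's switch from float to int arithmetic. One Lloyd iteration in two dimensions looks roughly like the sketch below (simplified, no SIMD, empty clusters left in place; names are illustrative):

#include <stdint.h>

static void kmeans_iter_2d(const int *data, int *centroids, uint8_t *labels,
                           int n_samples, int k) {
  // Assignment step: nearest centroid by squared Euclidean distance.
  for (int s = 0; s < n_samples; ++s) {
    int best = 0;
    int64_t best_d = INT64_MAX;
    for (int c = 0; c < k; ++c) {
      const int64_t du = data[2 * s] - centroids[2 * c];
      const int64_t dv = data[2 * s + 1] - centroids[2 * c + 1];
      const int64_t d = du * du + dv * dv;
      if (d < best_d) best_d = d, best = c;
    }
    labels[s] = (uint8_t)best;
  }
  // Update step: move each centroid to the mean of its assigned samples.
  for (int c = 0; c < k; ++c) {
    int64_t su = 0, sv = 0, cnt = 0;
    for (int s = 0; s < n_samples; ++s) {
      if (labels[s] != c) continue;
      su += data[2 * s], sv += data[2 * s + 1], ++cnt;
    }
    if (cnt) {
      centroids[2 * c] = (int)(su / cnt);
      centroids[2 * c + 1] = (int)(sv / cnt);
    }
  }
}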
     // Sort the U channel colors in ascending order.
     for (i = 0; i < 2 * (n - 1); i += 2) {
       int min_idx = i;
-      float min_val = centroids[i];
+      int min_val = centroids[i];
       for (j = i + 2; j < 2 * n; j += 2)
         if (centroids[j] < min_val) min_val = centroids[j], min_idx = j;
       if (min_idx != i) {
-        float temp_u = centroids[i], temp_v = centroids[i + 1];
+        int temp_u = centroids[i], temp_v = centroids[i + 1];
         centroids[i] = centroids[min_idx];
         centroids[i + 1] = centroids[min_idx + 1];
         centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v;
       }
     }
     av1_calc_indices(data, centroids, color_map, rows * cols, n, 2);
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
     extend_palette_color_map(color_map, cols, rows, plane_block_width,
                              plane_block_height);
     pmi->palette_size[1] = n;
     for (i = 1; i < 3; ++i) {
       for (j = 0; j < n; ++j) {
-#if CONFIG_HIGHBITDEPTH
         if (cpi->common.use_highbitdepth)
           pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd(
               (int)centroids[j * 2 + i - 1], cpi->common.bit_depth);
         else
-#endif  // CONFIG_HIGHBITDEPTH
           pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
               clip_pixel((int)centroids[j * 2 + i - 1]);
       }
@@ -5903,19 +5568,8 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
     super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
     if (tokenonly_rd_stats.rate == INT_MAX) continue;
-    this_rate =
-        tokenonly_rd_stats.rate + dc_mode_cost +
-        x->palette_uv_size_cost[bsize - BLOCK_8X8][n - PALETTE_MIN_SIZE] +
-        write_uniform_cost(n, color_map[0]) +
-        av1_cost_bit(
-            av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0], 1);
-    this_rate += av1_palette_color_cost_uv(pmi,
-#if CONFIG_PALETTE_DELTA_ENCODING
-                                           color_cache, n_cache,
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
-                                           cpi->common.bit_depth);
-    this_rate +=
-        av1_cost_color_map(x, 1, 0, bsize, mbmi->tx_size, PALETTE_MAP);
+    this_rate = tokenonly_rd_stats.rate +
+                intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost);
     this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
     if (this_rd < *best_rd) {
       *best_rd = this_rd;
@@ -5937,68 +5591,13 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
   }
 }
 
-#if CONFIG_FILTER_INTRA
-// Return 1 if a filter intra mode is selected; return 0 otherwise.
-static int rd_pick_filter_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                     int *rate, int *rate_tokenonly,
-                                     int64_t *distortion, int *skippable,
-                                     BLOCK_SIZE bsize, int64_t *best_rd) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  int filter_intra_selected_flag = 0;
-  int this_rate;
-  int64_t this_rd;
-  FILTER_INTRA_MODE mode;
-  FILTER_INTRA_MODE_INFO filter_intra_mode_info;
-  RD_STATS tokenonly_rd_stats;
-
-  av1_zero(filter_intra_mode_info);
-  mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 1;
-  mbmi->uv_mode = UV_DC_PRED;
-  mbmi->palette_mode_info.palette_size[1] = 0;
-
-  for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
-    mbmi->filter_intra_mode_info.filter_intra_mode[1] = mode;
-    if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd))
-      continue;
-
-    this_rate = tokenonly_rd_stats.rate +
-                av1_cost_bit(cpi->common.fc->filter_intra_probs[1], 1) +
-                x->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode] +
-                write_uniform_cost(FILTER_INTRA_MODES, mode);
-    this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
-    if (this_rd < *best_rd) {
-      *best_rd = this_rd;
-      *rate = this_rate;
-      *rate_tokenonly = tokenonly_rd_stats.rate;
-      *distortion = tokenonly_rd_stats.dist;
-      *skippable = tokenonly_rd_stats.skip;
-      filter_intra_mode_info = mbmi->filter_intra_mode_info;
-      filter_intra_selected_flag = 1;
-    }
-  }
-
-  if (filter_intra_selected_flag) {
-    mbmi->uv_mode = UV_DC_PRED;
-    mbmi->filter_intra_mode_info.use_filter_intra_mode[1] =
-        filter_intra_mode_info.use_filter_intra_mode[1];
-    mbmi->filter_intra_mode_info.filter_intra_mode[1] =
-        filter_intra_mode_info.filter_intra_mode[1];
-    return 1;
-  } else {
-    return 0;
-  }
-}
-#endif  // CONFIG_FILTER_INTRA
-
-#if CONFIG_EXT_INTRA
 // Run RD calculation with given chroma intra prediction angle, and return
 // the RD cost. Update the best mode info if the RD cost is the best so far.
 static int64_t pick_intra_angle_routine_sbuv(
     const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
     int rate_overhead, int64_t best_rd_in, int *rate, RD_STATS *rd_stats,
     int *best_angle_delta, int64_t *best_rd) {
-  MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi;
+  MB_MODE_INFO *mbmi = x->e_mbd.mi[0];
   assert(!is_inter_block(mbmi));
   int this_rate;
   int64_t this_rd;
@@ -6006,11 +5605,12 @@ static int64_t pick_intra_angle_routine_sbuv(
   if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in))
     return INT64_MAX;
-  this_rate = tokenonly_rd_stats.rate + rate_overhead;
+  this_rate = tokenonly_rd_stats.rate +
+              intra_mode_info_cost_uv(cpi, x, mbmi, bsize, rate_overhead);
   this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
   if (this_rd < *best_rd) {
     *best_rd = this_rd;
-    *best_angle_delta = mbmi->angle_delta[1];
+    *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV];
     *rate = this_rate;
     rd_stats->rate = tokenonly_rd_stats.rate;
     rd_stats->dist = tokenonly_rd_stats.dist;
@@ -6026,7 +5626,7 @@ static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
                                     int64_t best_rd, int *rate,
                                     RD_STATS *rd_stats) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *mbmi = xd->mi[0];
   assert(!is_inter_block(mbmi));
   int i, angle_delta, best_angle_delta = 0;
   int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
@@ -6041,7 +5641,7 @@ static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
     best_rd_in = (best_rd == INT64_MAX)
                      ? INT64_MAX
                      : (best_rd + (best_rd >> ((angle_delta == 0) ? 3 : 5)));
-    mbmi->angle_delta[1] = (1 - 2 * i) * angle_delta;
+    mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta;
     this_rd = pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead,
                                             best_rd_in, rate, rd_stats,
                                             &best_angle_delta, &best_rd);
@@ -6064,7 +5664,7 @@ static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
         rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
       skip_search = 1;
     if (!skip_search) {
-      mbmi->angle_delta[1] = (1 - 2 * i) * angle_delta;
+      mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta;
       pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead, best_rd,
                                     rate, rd_stats, &best_angle_delta,
                                     &best_rd);
@@ -6072,202 +5672,137 @@ static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
     }
   }
 
-  mbmi->angle_delta[1] = best_angle_delta;
+  mbmi->angle_delta[PLANE_TYPE_UV] = best_angle_delta;
   return rd_stats->rate != INT_MAX;
 }
-#endif  // CONFIG_EXT_INTRA
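Editor's note: the threshold schedule in rd_pick_intra_angle_sbuv is worth calling out: the first (zero-delta) probe may exceed the best RD cost so far by one eighth of it, later deltas by only one thirty-second, so unpromising directions are abandoned quickly. As a sketch:

#include <stdint.h>

static int64_t angle_probe_budget(int64_t best_rd, int angle_delta) {
  if (best_rd == INT64_MAX) return INT64_MAX;  // nothing to beat yet
  return best_rd + (best_rd >> ((angle_delta == 0) ? 3 : 5));
}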
 
-#if CONFIG_CFL
-static int64_t cfl_alpha_dist_lbd(const int16_t *pred_buf_q3,
-                                  const uint8_t *src, int src_stride, int width,
-                                  int height, int dc_pred, int alpha_q3,
-                                  int64_t *dist_neg_out) {
-  int64_t dist = 0;
-  int diff;
-
-  if (alpha_q3 == 0) {
-    for (int j = 0; j < height; j++) {
-      for (int i = 0; i < width; i++) {
-        diff = src[i] - dc_pred;
-        dist += diff * diff;
-      }
-      src += src_stride;
-    }
-
-    if (dist_neg_out) *dist_neg_out = dist;
-
-    return dist;
-  }
-
-  int64_t dist_neg = 0;
-  for (int j = 0; j < height; j++) {
-    for (int i = 0; i < width; i++) {
-      const int uv = src[i];
-      const int scaled_luma = get_scaled_luma_q0(alpha_q3, pred_buf_q3[i]);
-
-      diff = uv - clip_pixel(scaled_luma + dc_pred);
-      dist += diff * diff;
+#define PLANE_SIGN_TO_JOINT_SIGN(plane, a, b) \
+  (plane == CFL_PRED_U ? a * CFL_SIGNS + b - 1 : b * CFL_SIGNS + a - 1)
+static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi,
+                             TX_SIZE tx_size, int64_t best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
 
-      diff = uv - clip_pixel(-scaled_luma + dc_pred);
-      dist_neg += diff * diff;
-    }
-    pred_buf_q3 += MAX_SB_SIZE;
-    src += src_stride;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+#if CONFIG_DEBUG
+  assert(is_cfl_allowed(xd));
+  const int ssx = xd->plane[AOM_PLANE_U].subsampling_x;
+  const int ssy = xd->plane[AOM_PLANE_U].subsampling_y;
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi->sb_type, ssx, ssy);
+  (void)plane_bsize;
+  assert(plane_bsize < BLOCK_SIZES_ALL);
+  if (!xd->lossless[mbmi->segment_id]) {
+    assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]);
+    assert(block_size_high[plane_bsize] == tx_size_high[tx_size]);
   }
+#endif  // CONFIG_DEBUG
 
-  if (dist_neg_out) *dist_neg_out = dist_neg;
-
-  return dist;
-}
-#if CONFIG_HIGHBITDEPTH
-static int64_t cfl_alpha_dist_hbd(const int16_t *pred_buf_q3,
-                                  const uint16_t *src, int src_stride,
-                                  int width, int height, int dc_pred,
-                                  int alpha_q3, int bit_depth,
-                                  int64_t *dist_neg_out) {
-  const int shift = 2 * (bit_depth - 8);
-  const int rounding = shift > 0 ? (1 << shift) >> 1 : 0;
-  int64_t dist = 0;
-  int diff;
-
-  if (alpha_q3 == 0) {
-    for (int j = 0; j < height; j++) {
-      for (int i = 0; i < width; i++) {
-        diff = src[i] - dc_pred;
-        dist += diff * diff;
+  xd->cfl.use_dc_pred_cache = 1;
+  const int64_t mode_rd =
+      RDCOST(x->rdmult,
+             x->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED], 0);
+  int64_t best_rd_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES];
+  int best_c[CFL_JOINT_SIGNS][CFL_PRED_PLANES];
+#if CONFIG_DEBUG
+  int best_rate_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES];
+#endif  // CONFIG_DEBUG
+
+  for (int plane = 0; plane < CFL_PRED_PLANES; plane++) {
+    RD_STATS rd_stats;
+    av1_init_rd_stats(&rd_stats);
+    for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) {
+      best_rd_uv[joint_sign][plane] = INT64_MAX;
+      best_c[joint_sign][plane] = 0;
+    }
+    // Collect RD stats for an alpha value of zero in this plane.
+    // Skip i == CFL_SIGN_ZERO as (0, 0) is invalid.
+    for (int i = CFL_SIGN_NEG; i < CFL_SIGNS; i++) {
+      const int joint_sign = PLANE_SIGN_TO_JOINT_SIGN(plane, CFL_SIGN_ZERO, i);
+      if (i == CFL_SIGN_NEG) {
+        mbmi->cfl_alpha_idx = 0;
+        mbmi->cfl_alpha_signs = joint_sign;
+        txfm_rd_in_plane(x, cpi, &rd_stats, best_rd, plane + 1, bsize, tx_size,
+                         cpi->sf.use_fast_coef_costing, FTXS_NONE);
+        if (rd_stats.rate == INT_MAX) break;
+      }
+      const int alpha_rate = x->cfl_cost[joint_sign][plane][0];
+      best_rd_uv[joint_sign][plane] =
+          RDCOST(x->rdmult, rd_stats.rate + alpha_rate, rd_stats.dist);
+#if CONFIG_DEBUG
+      best_rate_uv[joint_sign][plane] = rd_stats.rate;
+#endif  // CONFIG_DEBUG
+    }
+  }
+
+  int best_joint_sign = -1;
+
+  for (int plane = 0; plane < CFL_PRED_PLANES; plane++) {
+    for (int pn_sign = CFL_SIGN_NEG; pn_sign < CFL_SIGNS; pn_sign++) {
+      int progress = 0;
+      for (int c = 0; c < CFL_ALPHABET_SIZE; c++) {
+        int flag = 0;
+        RD_STATS rd_stats;
+        if (c > 2 && progress < c) break;
+        av1_init_rd_stats(&rd_stats);
+        for (int i = 0; i < CFL_SIGNS; i++) {
+          const int joint_sign = PLANE_SIGN_TO_JOINT_SIGN(plane, pn_sign, i);
+          if (i == 0) {
+            mbmi->cfl_alpha_idx = (c << CFL_ALPHABET_SIZE_LOG2) + c;
+            mbmi->cfl_alpha_signs = joint_sign;
+            txfm_rd_in_plane(x, cpi, &rd_stats, best_rd, plane + 1, bsize,
+                             tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE);
+            if (rd_stats.rate == INT_MAX) break;
+          }
+          const int alpha_rate = x->cfl_cost[joint_sign][plane][c];
+          int64_t this_rd =
+              RDCOST(x->rdmult, rd_stats.rate + alpha_rate, rd_stats.dist);
+          if (this_rd >= best_rd_uv[joint_sign][plane]) continue;
+          best_rd_uv[joint_sign][plane] = this_rd;
+          best_c[joint_sign][plane] = c;
+#if CONFIG_DEBUG
+          best_rate_uv[joint_sign][plane] = rd_stats.rate;
+#endif  // CONFIG_DEBUG
+          flag = 2;
+          if (best_rd_uv[joint_sign][!plane] == INT64_MAX) continue;
+          this_rd += mode_rd + best_rd_uv[joint_sign][!plane];
+          if (this_rd >= best_rd) continue;
+          best_rd = this_rd;
+          best_joint_sign = joint_sign;
+        }
+        progress += flag;
       }
-      src += src_stride;
-    }
-    dist = (dist + rounding) >> shift;
-
-    if (dist_neg_out) *dist_neg_out = dist;
-
-    return dist;
-  }
-
-  int64_t dist_neg = 0;
-  for (int j = 0; j < height; j++) {
-    for (int i = 0; i < width; i++) {
-      const int uv = src[i];
-      const int scaled_luma = get_scaled_luma_q0(alpha_q3, pred_buf_q3[i]);
-
-      diff = uv - clip_pixel_highbd(scaled_luma + dc_pred, bit_depth);
-      dist += diff * diff;
-
-      diff = uv - clip_pixel_highbd(-scaled_luma + dc_pred, bit_depth);
-      dist_neg += diff * diff;
     }
-    pred_buf_q3 += MAX_SB_SIZE;
-    src += src_stride;
-  }
-
-  if (dist_neg_out) *dist_neg_out = (dist_neg + rounding) >> shift;
-
-  return (dist + rounding) >> shift;
-}
-#endif  // CONFIG_HIGHBITDEPTH
-static int64_t cfl_alpha_dist(const int16_t *pred_buf_q3, const uint8_t *src,
-                              int src_stride, int width, int height,
-                              int dc_pred, int alpha_q3, int use_hbd,
-                              int bit_depth, int64_t *dist_neg_out) {
-#if CONFIG_HIGHBITDEPTH
-  if (use_hbd) {
-    const uint16_t *src_16 = CONVERT_TO_SHORTPTR(src);
-    return cfl_alpha_dist_hbd(pred_buf_q3, src_16, src_stride, width, height,
-                              dc_pred, alpha_q3, bit_depth, dist_neg_out);
-  }
-#endif  // CONFIG_HIGHBITDEPTH
-  (void)use_hbd;
-  (void)bit_depth;
-  return cfl_alpha_dist_lbd(pred_buf_q3, src, src_stride, width, height,
-                            dc_pred, alpha_q3, dist_neg_out);
-}
-
-static int cfl_rd_pick_alpha(MACROBLOCK *const x, TX_SIZE tx_size) {
-  const struct macroblock_plane *const p_u = &x->plane[AOM_PLANE_U];
-  const struct macroblock_plane *const p_v = &x->plane[AOM_PLANE_V];
-  const uint8_t *const src_u = p_u->src.buf;
-  const uint8_t *const src_v = p_v->src.buf;
-  const int src_stride_u = p_u->src.stride;
-  const int src_stride_v = p_v->src.stride;
-
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-
-  CFL_CTX *const cfl = xd->cfl;
-  cfl_compute_parameters(xd, tx_size);
-  const int width = cfl->uv_width;
-  const int height = cfl->uv_height;
-  const int dc_pred_u = cfl->dc_pred[CFL_PRED_U];
-  const int dc_pred_v = cfl->dc_pred[CFL_PRED_V];
-  const int16_t *pred_buf_q3 = cfl->pred_buf_q3;
-  const int use_hbd = get_bitdepth_data_path_index(xd);
-
-  int64_t sse[CFL_PRED_PLANES][CFL_MAGS_SIZE];
-  sse[CFL_PRED_U][0] =
-      cfl_alpha_dist(pred_buf_q3, src_u, src_stride_u, width, height, dc_pred_u,
-                     0, use_hbd, xd->bd, NULL);
-  sse[CFL_PRED_V][0] =
-      cfl_alpha_dist(pred_buf_q3, src_v, src_stride_v, width, height, dc_pred_v,
-                     0, use_hbd, xd->bd, NULL);
-
-  for (int c = 0; c < CFL_ALPHABET_SIZE; c++) {
-    const int m = c * 2 + 1;
-    const int abs_alpha_q3 = c + 1;
-    sse[CFL_PRED_U][m] = cfl_alpha_dist(
-        pred_buf_q3, src_u, src_stride_u, width, height, dc_pred_u,
-        abs_alpha_q3, use_hbd, xd->bd, &sse[CFL_PRED_U][m + 1]);
-    sse[CFL_PRED_V][m] = cfl_alpha_dist(
-        pred_buf_q3, src_v, src_stride_v, width, height, dc_pred_v,
-        abs_alpha_q3, use_hbd, xd->bd, &sse[CFL_PRED_V][m + 1]);
-  }
-
-  int64_t dist;
-  int64_t cost;
-  int64_t best_cost = INT64_MAX;
-  int best_rate = 0;
-
-  // Compute least squares parameter of the entire block
+  int best_rate_overhead = INT_MAX;
   int ind = 0;
-  int signs = 0;
-
-  for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) {
-    const int sign_u = CFL_SIGN_U(joint_sign);
-    const int sign_v = CFL_SIGN_V(joint_sign);
-    const int size_u = (sign_u == CFL_SIGN_ZERO) ? 1 : CFL_ALPHABET_SIZE;
-    const int size_v = (sign_v == CFL_SIGN_ZERO) ? 1 : CFL_ALPHABET_SIZE;
-    for (int u = 0; u < size_u; u++) {
-      const int idx_u = (sign_u == CFL_SIGN_ZERO) ? 0 : u * 2 + 1;
-      for (int v = 0; v < size_v; v++) {
-        const int idx_v = (sign_v == CFL_SIGN_ZERO) ? 0 : v * 2 + 1;
-        dist = sse[CFL_PRED_U][idx_u + (sign_u == CFL_SIGN_NEG)] +
-               sse[CFL_PRED_V][idx_v + (sign_v == CFL_SIGN_NEG)];
-        dist *= 16;
-        const int rate = x->cfl_cost[joint_sign][CFL_PRED_U][u] +
-                         x->cfl_cost[joint_sign][CFL_PRED_V][v];
-        cost = RDCOST(x->rdmult, rate, dist);
-        if (cost < best_cost) {
-          best_cost = cost;
-          best_rate = rate;
-          ind = (u << CFL_ALPHABET_SIZE_LOG2) + v;
-          signs = joint_sign;
-        }
-      }
-    }
+  if (best_joint_sign >= 0) {
+    const int u = best_c[best_joint_sign][CFL_PRED_U];
+    const int v = best_c[best_joint_sign][CFL_PRED_V];
+    ind = (u << CFL_ALPHABET_SIZE_LOG2) + v;
+    best_rate_overhead = x->cfl_cost[best_joint_sign][CFL_PRED_U][u] +
+                         x->cfl_cost[best_joint_sign][CFL_PRED_V][v];
+#if CONFIG_DEBUG
+    xd->cfl.rate = x->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED] +
+                   best_rate_overhead +
+                   best_rate_uv[best_joint_sign][CFL_PRED_U] +
+                   best_rate_uv[best_joint_sign][CFL_PRED_V];
+#endif  // CONFIG_DEBUG
+  } else {
+    best_joint_sign = 0;
   }
 
   mbmi->cfl_alpha_idx = ind;
-  mbmi->cfl_alpha_signs = signs;
-  return best_rate;
+  mbmi->cfl_alpha_signs = best_joint_sign;
+  xd->cfl.use_dc_pred_cache = 0;
+  xd->cfl.dc_pred_is_cached[0] = 0;
+  xd->cfl.dc_pred_is_cached[1] = 0;
+  return best_rate_overhead;
 }
-#endif  // CONFIG_CFL
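Editor's note: the final bookkeeping above packs the two per-plane magnitudes into a single index and keeps the joint sign separately. A sketch of the packing and of what PLANE_SIGN_TO_JOINT_SIGN computes; the constants match the AV1 bitstream (a 16-entry alpha alphabet and three signs per plane, with the all-zero sign pair excluded), and the SK_ prefixes are only there to avoid colliding with the encoder's own definitions:

#define SK_CFL_ALPHABET_SIZE_LOG2 4
#define SK_CFL_SIGNS 3
enum { SK_CFL_PRED_U = 0, SK_CFL_PRED_V = 1 };

static int cfl_pack_idx(int u_mag, int v_mag) {
  return (u_mag << SK_CFL_ALPHABET_SIZE_LOG2) + v_mag;
}
static int cfl_idx_u(int idx) { return idx >> SK_CFL_ALPHABET_SIZE_LOG2; }
static int cfl_idx_v(int idx) {
  return idx & ((1 << SK_CFL_ALPHABET_SIZE_LOG2) - 1);
}
// Joint sign of (sign_u, sign_v); the "- 1" drops the invalid (ZERO, ZERO)
// combination from the code space.
static int cfl_joint_sign(int sign_u, int sign_v) {
  return sign_u * SK_CFL_SIGNS + sign_v - 1;
}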
 
 static void init_sbuv_mode(MB_MODE_INFO *const mbmi) {
   mbmi->uv_mode = UV_DC_PRED;
   mbmi->palette_mode_info.palette_size[1] = 0;
-#if CONFIG_FILTER_INTRA
-  mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
-#endif  // CONFIG_FILTER_INTRA
 }
 
 static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
@@ -6275,83 +5810,53 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
                                        int64_t *distortion, int *skippable,
                                        BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *mbmi = xd->mi[0];
   assert(!is_inter_block(mbmi));
   MB_MODE_INFO best_mbmi = *mbmi;
   int64_t best_rd = INT64_MAX, this_rd;
-#if CONFIG_PVQ
-  od_rollback_buffer buf;
-  od_encode_checkpoint(&x->daala_enc, &buf);
-#endif  // CONFIG_PVQ
-  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  const int try_palette =
-      av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
 
   for (int mode_idx = 0; mode_idx < UV_INTRA_MODES; ++mode_idx) {
     int this_rate;
     RD_STATS tokenonly_rd_stats;
     UV_PREDICTION_MODE mode = uv_rd_search_mode_order[mode_idx];
-#if CONFIG_EXT_INTRA
-    const int is_directional_mode =
-        av1_is_directional_mode(get_uv_mode(mode), mbmi->sb_type);
-#endif  // CONFIG_EXT_INTRA
+    const int is_directional_mode = av1_is_directional_mode(get_uv_mode(mode));
     if (!(cpi->sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] &
           (1 << mode)))
       continue;
 
     mbmi->uv_mode = mode;
-#if CONFIG_CFL
     int cfl_alpha_rate = 0;
     if (mode == UV_CFL_PRED) {
+      if (!is_cfl_allowed(xd)) continue;
       assert(!is_directional_mode);
-      const TX_SIZE uv_tx_size = av1_get_uv_tx_size(mbmi, &xd->plane[1]);
-      cfl_alpha_rate = cfl_rd_pick_alpha(x, uv_tx_size);
+      const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
+      cfl_alpha_rate = cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd);
+      if (cfl_alpha_rate == INT_MAX) continue;
     }
-#endif
-#if CONFIG_EXT_INTRA
-    mbmi->angle_delta[1] = 0;
+    mbmi->angle_delta[PLANE_TYPE_UV] = 0;
     if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type)) {
-      const int rate_overhead = x->intra_uv_mode_cost[mbmi->mode][mode] +
-                                write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, 0);
+      const int rate_overhead =
+          x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode];
       if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd,
                                     &this_rate, &tokenonly_rd_stats))
         continue;
     } else {
-#endif  // CONFIG_EXT_INTRA
       if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd)) {
-#if CONFIG_PVQ
-        od_encode_rollback(&x->daala_enc, &buf);
-#endif  // CONFIG_PVQ
         continue;
       }
-#if CONFIG_EXT_INTRA
     }
-#endif  // CONFIG_EXT_INTRA
-    this_rate =
-        tokenonly_rd_stats.rate + x->intra_uv_mode_cost[mbmi->mode][mode];
-
-#if CONFIG_CFL
+    const int mode_cost =
+        x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode] +
+        cfl_alpha_rate;
+    this_rate = tokenonly_rd_stats.rate +
+                intra_mode_info_cost_uv(cpi, x, mbmi, bsize, mode_cost);
     if (mode == UV_CFL_PRED) {
-      this_rate += cfl_alpha_rate;
+      assert(is_cfl_allowed(xd));
+#if CONFIG_DEBUG
+      if (!xd->lossless[mbmi->segment_id])
+        assert(xd->cfl.rate == tokenonly_rd_stats.rate + mode_cost);
+#endif  // CONFIG_DEBUG
    }
-#endif
-#if CONFIG_EXT_INTRA
-    if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type)) {
-      this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
-                                      MAX_ANGLE_DELTA + mbmi->angle_delta[1]);
-    }
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_FILTER_INTRA
-    if (mbmi->sb_type >= BLOCK_8X8 && mode == UV_DC_PRED)
-      this_rate += av1_cost_bit(cpi->common.fc->filter_intra_probs[1], 0);
-#endif  // CONFIG_FILTER_INTRA
-    if (try_palette && mode == UV_DC_PRED)
-      this_rate += av1_cost_bit(
-          av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0], 0);
-
-#if CONFIG_PVQ
-    od_encode_rollback(&x->daala_enc, &buf);
-#endif  // CONFIG_PVQ
     this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
 
     if (this_rd < best_rd) {
@@ -6364,22 +5869,17 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
    }
  }
 
+  const int try_palette =
+      av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
   if (try_palette) {
     uint8_t *best_palette_color_map = x->palette_buffer->best_palette_color_map;
-    rd_pick_palette_intra_sbuv(cpi, x,
-                               x->intra_uv_mode_cost[mbmi->mode][UV_DC_PRED],
-                               best_palette_color_map, &best_mbmi, &best_rd,
-                               rate, rate_tokenonly, distortion, skippable);
+    rd_pick_palette_intra_sbuv(
+        cpi, x,
+        x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][UV_DC_PRED],
+        best_palette_color_map, &best_mbmi, &best_rd, rate, rate_tokenonly,
+        distortion, skippable);
   }
 
-#if CONFIG_FILTER_INTRA
-  if (mbmi->sb_type >= BLOCK_8X8) {
-    if (rd_pick_filter_intra_sbuv(cpi, x, rate, rate_tokenonly, distortion,
-                                  skippable, bsize, &best_rd))
-      best_mbmi = *mbmi;
-  }
-#endif  // CONFIG_FILTER_INTRA
-
   *mbmi = best_mbmi;
   // Make sure we actually chose a mode
   assert(best_rd < INT64_MAX);
@@ -6391,13 +5891,14 @@ static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
                                  int *rate_uv, int *rate_uv_tokenonly,
                                  int64_t *dist_uv, int *skip_uv,
                                  UV_PREDICTION_MODE *mode_uv) {
+  const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
+  const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
   // Use an estimated rd for uv_intra based on DC_PRED if the
   // appropriate speed flag is set.
   init_sbuv_mode(mbmi);
-#if CONFIG_CB4X4
-#if !CONFIG_CHROMA_2X2
   if (x->skip_chroma_rd) {
     *rate_uv = 0;
     *rate_uv_tokenonly = 0;
@@ -6406,31 +5907,20 @@ static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
     *mode_uv = UV_DC_PRED;
     return;
   }
+  xd->cfl.is_chroma_reference = is_chroma_reference(
+      mi_row, mi_col, bsize, cm->subsampling_x, cm->subsampling_y);
   bsize = scale_chroma_bsize(bsize, xd->plane[AOM_PLANE_U].subsampling_x,
                              xd->plane[AOM_PLANE_U].subsampling_y);
-#endif  // !CONFIG_CHROMA_2X2
-#if CONFIG_CFL
   // Only store reconstructed luma when there's chroma RDO. When there's no
   // chroma RDO, the reconstructed luma will be stored in encode_superblock().
-  xd->cfl->store_y = !x->skip_chroma_rd;
-#endif  // CONFIG_CFL
-#else
-  bsize = bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize;
-#if CONFIG_CFL
-  xd->cfl->store_y = 1;
-#endif  // CONFIG_CFL
-#endif  // CONFIG_CB4X4
-#if CONFIG_CFL
-  if (xd->cfl->store_y) {
-    // Perform one extra call to txfm_rd_in_plane(), with the values chosen
-    // during luma RDO, so we can store reconstructed luma values
-    RD_STATS this_rd_stats;
-    txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, AOM_PLANE_Y,
-                     mbmi->sb_type, mbmi->tx_size,
-                     cpi->sf.use_fast_coef_costing);
-    xd->cfl->store_y = 0;
+  xd->cfl.store_y = store_cfl_required_rdo(cm, x);
+  if (xd->cfl.store_y) {
+    // Restore reconstructed luma values.
+    av1_encode_intra_block_plane(cpi, x, mbmi->sb_type, AOM_PLANE_Y,
+                                 cpi->optimize_seg_arr[mbmi->segment_id],
+                                 mi_row, mi_col);
+    xd->cfl.store_y = 0;
   }
-#endif  // CONFIG_CFL
   rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
                           bsize, max_tx_size);
   *mode_uv = mbmi->uv_mode;
@@ -6441,16 +5931,10 @@ static int cost_mv_ref(const MACROBLOCK *const x, PREDICTION_MODE mode,
   if (is_inter_compound_mode(mode)) {
     return x
         ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)];
-#if CONFIG_COMPOUND_SINGLEREF
-  } else if (is_inter_singleref_comp_mode(mode)) {
-    return x->inter_singleref_comp_mode_cost[mode_context]
-                                            [INTER_SINGLEREF_COMP_OFFSET(mode)];
-#endif  // CONFIG_COMPOUND_SINGLEREF
   }
 
   int mode_cost = 0;
   int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
-  int16_t is_all_zero_mv = mode_context & (1 << ALL_ZERO_FLAG_OFFSET);
 
   assert(is_inter_mode(mode));
 
@@ -6459,43 +5943,34 @@ static int cost_mv_ref(const MACROBLOCK *const x, PREDICTION_MODE mode,
     return mode_cost;
   } else {
     mode_cost = x->newmv_mode_cost[mode_ctx][1];
-    mode_ctx = (mode_context >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
-
-    if (is_all_zero_mv) return mode_cost;
+    mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
 
-    if (mode == ZEROMV) {
+    if (mode == GLOBALMV) {
       mode_cost += x->zeromv_mode_cost[mode_ctx][0];
       return mode_cost;
     } else {
       mode_cost += x->zeromv_mode_cost[mode_ctx][1];
       mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
-
-      if (mode_context & (1 << SKIP_NEARESTMV_OFFSET)) mode_ctx = 6;
-      if (mode_context & (1 << SKIP_NEARMV_OFFSET)) mode_ctx = 7;
-      if (mode_context & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) mode_ctx = 8;
-
       mode_cost += x->refmv_mode_cost[mode_ctx][mode != NEARESTMV];
       return mode_cost;
    }
  }
 }
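Editor's note: after this change cost_mv_ref encodes an inter mode as a cascade of up to three binary decisions (is it NEWMV? is it GLOBALMV? NEARESTMV vs NEARMV?). The shape of that computation, with hypothetical two-entry cost tables standing in for the context-indexed ones in the encoder:

static int inter_mode_bits_sketch(int is_newmv, int is_globalmv,
                                  int is_nearestmv, const int newmv_cost[2],
                                  const int globalmv_cost[2],
                                  const int refmv_cost[2]) {
  if (is_newmv) return newmv_cost[0];
  int cost = newmv_cost[1];           // paid for "not NEWMV"
  if (is_globalmv) return cost + globalmv_cost[0];
  cost += globalmv_cost[1];           // paid for "not GLOBALMV"
  return cost + refmv_cost[is_nearestmv ? 0 : 1];
}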
 
-#if (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT)
-static int get_interinter_compound_type_bits(BLOCK_SIZE bsize,
-                                             COMPOUND_TYPE comp_type) {
-  (void)bsize;
-  switch (comp_type) {
+static int get_interinter_compound_mask_rate(const MACROBLOCK *const x,
+                                             const MB_MODE_INFO *const mbmi) {
+  switch (mbmi->interinter_comp.type) {
     case COMPOUND_AVERAGE: return 0;
-#if CONFIG_WEDGE
-    case COMPOUND_WEDGE: return get_interinter_wedge_bits(bsize);
-#endif  // CONFIG_WEDGE
-#if CONFIG_COMPOUND_SEGMENT
-    case COMPOUND_SEG: return 1;
-#endif  // CONFIG_COMPOUND_SEGMENT
+    case COMPOUND_WEDGE:
+      return get_interinter_wedge_bits(mbmi->sb_type) > 0
+                 ? av1_cost_literal(1) +
+                       x->wedge_idx_cost[mbmi->sb_type]
+                                        [mbmi->interinter_comp.wedge_index]
+                 : 0;
+    case COMPOUND_DIFFWTD: return av1_cost_literal(1);
     default: assert(0); return 0;
   }
 }
-#endif  // (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT)
 
 typedef struct {
   int eobs;
@@ -6508,13 +5983,8 @@ typedef struct {
   int_mv pred_mv[2];
   int_mv ref_mv[2];
-#if CONFIG_CHROMA_2X2
-  ENTROPY_CONTEXT ta[4];
-  ENTROPY_CONTEXT tl[4];
-#else
   ENTROPY_CONTEXT ta[2];
   ENTROPY_CONTEXT tl[2];
-#endif  // CONFIG_CHROMA_2X2
 } SEG_RDSTAT;
 
 typedef struct {
@@ -6527,12 +5997,7 @@ typedef struct {
   int64_t sse;
   int segment_yrate;
   PREDICTION_MODE modes[4];
-#if CONFIG_COMPOUND_SINGLEREF
-  SEG_RDSTAT rdstat[4][INTER_MODES + INTER_SINGLEREF_COMP_MODES +
-                       INTER_COMPOUND_MODES];
-#else   // !CONFIG_COMPOUND_SINGLEREF
   SEG_RDSTAT rdstat[4][INTER_MODES + INTER_COMPOUND_MODES];
-#endif  // CONFIG_COMPOUND_SINGLEREF
   int mvthresh;
 } BEST_SEG_INFO;
 
@@ -6543,149 +6008,103 @@ static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) {
          (mv->col >> 3) > mv_limits->col_max;
 }
 
-// Check if NEARESTMV/NEARMV/ZEROMV is the cheapest way to encode zero motion.
-// TODO(aconverse): Find out if this is still productive then clean up or remove
-static int check_best_zero_mv(
-    const AV1_COMP *const cpi, const MACROBLOCK *const x,
-    const int16_t mode_context[TOTAL_REFS_PER_FRAME],
-    const int16_t compound_mode_context[TOTAL_REFS_PER_FRAME],
-    int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME], int this_mode,
-    const MV_REFERENCE_FRAME ref_frames[2], const BLOCK_SIZE bsize, int block,
-    int mi_row, int mi_col) {
-  int_mv zeromv[2] = { { .as_int = 0 } };
-#if CONFIG_GLOBAL_MOTION
-  int comp_pred_mode = ref_frames[1] > INTRA_FRAME;
-#endif
-  (void)mi_row;
-  (void)mi_col;
-  (void)cpi;
-#if CONFIG_GLOBAL_MOTION
-  if (this_mode == ZEROMV || this_mode == ZERO_ZEROMV) {
-    for (int cur_frm = 0; cur_frm < 1 + comp_pred_mode; cur_frm++) {
-      zeromv[cur_frm].as_int =
-          gm_get_motion_vector(&cpi->common.global_motion[ref_frames[cur_frm]],
-                               cpi->common.allow_high_precision_mv, bsize,
-                               mi_col, mi_row, block
-#if CONFIG_AMVR
-                               ,
-                               cpi->common.cur_frame_mv_precision_level
-#endif
-                               )
-              .as_int;
-    }
+static INLINE int get_single_mode(int this_mode, int ref_idx,
+                                  int is_comp_pred) {
+  int single_mode;
+  if (is_comp_pred) {
+    single_mode =
+        ref_idx ? compound_ref1_mode(this_mode) : compound_ref0_mode(this_mode);
+  } else {
+    single_mode = this_mode;
   }
-#endif  // CONFIG_GLOBAL_MOTION
-
-  if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
-      frame_mv[this_mode][ref_frames[0]].as_int == zeromv[0].as_int &&
-      (ref_frames[1] <= INTRA_FRAME ||
-       frame_mv[this_mode][ref_frames[1]].as_int == zeromv[1].as_int)) {
-    int16_t rfc =
-        av1_mode_context_analyzer(mode_context, ref_frames, bsize, block);
-    int c1 = cost_mv_ref(x, NEARMV, rfc);
-    int c2 = cost_mv_ref(x, NEARESTMV, rfc);
-    int c3 = cost_mv_ref(x, ZEROMV, rfc);
+  return single_mode;
+}
 
+/* If the current mode shares the same mv with other modes with higher priority,
+ * skip this mode. This priority order is nearest > global > near.
+ */
+static int skip_repeated_mv(const AV1_COMMON *const cm,
+                            const MACROBLOCK *const x, int this_mode,
+                            const MV_REFERENCE_FRAME ref_frames[2]) {
+  const int is_comp_pred = ref_frames[1] > INTRA_FRAME;
+  const uint8_t ref_frame_type = av1_ref_frame_type(ref_frames);
+  const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  if (!is_comp_pred) {
     if (this_mode == NEARMV) {
-      if (c1 > c3) return 0;
-    } else if (this_mode == NEARESTMV) {
-      if (c2 > c3) return 0;
-    } else {
-      assert(this_mode == ZEROMV);
-      if (ref_frames[1] <= INTRA_FRAME) {
-        if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) ||
-            (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0))
-          return 0;
-      } else {
-        if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0 &&
-             frame_mv[NEARESTMV][ref_frames[1]].as_int == 0) ||
-            (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0 &&
-             frame_mv[NEARMV][ref_frames[1]].as_int == 0))
-          return 0;
+      if (mbmi_ext->ref_mv_count[ref_frame_type] == 0) {
+        // NEARMV has the same motion vector as NEARESTMV
+        return 1;
+      }
+      if (mbmi_ext->ref_mv_count[ref_frame_type] == 1 &&
+          cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) {
+        // NEARMV has the same motion vector as GLOBALMV
+        return 1;
       }
     }
-  } else if ((this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV ||
-              this_mode == ZERO_ZEROMV) &&
-             frame_mv[this_mode][ref_frames[0]].as_int == zeromv[0].as_int &&
-             frame_mv[this_mode][ref_frames[1]].as_int == zeromv[1].as_int) {
-    int16_t rfc = compound_mode_context[ref_frames[0]];
-    int c2 = cost_mv_ref(x, NEAREST_NEARESTMV, rfc);
-    int c3 = cost_mv_ref(x, ZERO_ZEROMV, rfc);
-    int c5 = cost_mv_ref(x, NEAR_NEARMV, rfc);
-
-    if (this_mode == NEAREST_NEARESTMV) {
-      if (c2 > c3) return 0;
-    } else if (this_mode == NEAR_NEARMV) {
-      if (c5 > c3) return 0;
-    } else {
-      assert(this_mode == ZERO_ZEROMV);
-      if ((c3 >= c2 && frame_mv[NEAREST_NEARESTMV][ref_frames[0]].as_int == 0 &&
-           frame_mv[NEAREST_NEARESTMV][ref_frames[1]].as_int == 0) ||
-          (c3 >= c5 && frame_mv[NEAR_NEARMV][ref_frames[0]].as_int == 0 &&
-           frame_mv[NEAR_NEARMV][ref_frames[1]].as_int == 0))
-        return 0;
+    if (this_mode == GLOBALMV) {
+      if (mbmi_ext->ref_mv_count[ref_frame_type] == 0 &&
+          cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) {
+        // GLOBALMV has the same motion vector as NEARESTMV
+        return 1;
+      }
+    }
+  } else {
+    for (int i = 0; i < 2; ++i) {
+      const int single_mode = get_single_mode(this_mode, i, is_comp_pred);
+      if (single_mode == NEARMV) {
+        if (mbmi_ext->ref_mv_count[ref_frame_type] == 0) {
+          // NEARMV has the same motion vector as NEARESTMV in compound mode
+          return 1;
+        }
+      }
+    }
+    if (this_mode == NEAR_NEARMV) {
+      if (mbmi_ext->ref_mv_count[ref_frame_type] == 1 &&
+          cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION &&
+          cm->global_motion[ref_frames[1]].wmtype <= TRANSLATION) {
+        // NEAR_NEARMV has the same motion vector as GLOBAL_GLOBALMV
+        return 1;
+      }
+    }
+    if (this_mode == GLOBAL_GLOBALMV) {
+      if (mbmi_ext->ref_mv_count[ref_frame_type] == 0 &&
+          cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION &&
+          cm->global_motion[ref_frames[1]].wmtype <= TRANSLATION) {
+        // GLOBAL_GLOBALMV has the same motion vector as NEAREST_NEARESTMV
+        return 1;
+      }
     }
   }
-  return 1;
+  return 0;
 }
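Editor's note: skip_repeated_mv leans on get_single_mode, which decomposes every compound inter mode into the pair of single-reference modes it is built from. A sketch of that mapping with illustrative enums and prefixed names (the real tables live in the common mode definitions):

typedef enum { SM_NEARESTMV, SM_NEARMV, SM_GLOBALMV, SM_NEWMV } SingleMode;
typedef enum {
  CM_NEAREST_NEARESTMV, CM_NEAR_NEARMV, CM_NEAREST_NEWMV, CM_NEW_NEARESTMV,
  CM_NEAR_NEWMV, CM_NEW_NEARMV, CM_GLOBAL_GLOBALMV, CM_NEW_NEWMV
} CompoundMode;

// Mode used for the first reference of each compound mode.
static SingleMode compound_ref0_sketch(CompoundMode m) {
  static const SingleMode tab[] = { SM_NEARESTMV, SM_NEARMV, SM_NEARESTMV,
                                    SM_NEWMV,     SM_NEARMV, SM_NEWMV,
                                    SM_GLOBALMV,  SM_NEWMV };
  return tab[m];
}
// Mode used for the second reference of each compound mode.
static SingleMode compound_ref1_sketch(CompoundMode m) {
  static const SingleMode tab[] = { SM_NEARESTMV, SM_NEARMV,    SM_NEWMV,
                                    SM_NEARESTMV, SM_NEWMV,     SM_NEARMV,
                                    SM_GLOBALMV,  SM_NEWMV };
  return tab[m];
}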
 
 static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
-                                BLOCK_SIZE bsize, int_mv *frame_mv,
-#if CONFIG_COMPOUND_SINGLEREF
-                                int_mv *frame_comp_mv,
-#endif  // CONFIG_COMPOUND_SINGLEREF
-                                int mi_row, int mi_col,
-                                int_mv *ref_mv_sub8x8[2], const uint8_t *mask,
-                                int mask_stride, int *rate_mv,
-                                const int block) {
+                                BLOCK_SIZE bsize, int_mv *cur_mv, int mi_row,
+                                int mi_col, int_mv *ref_mv_sub8x8[2],
+                                const uint8_t *mask, int mask_stride,
+                                int *rate_mv, const int block) {
   const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   const int pw = block_size_wide[bsize];
   const int ph = block_size_high[bsize];
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-// This function should only ever be called for compound modes
-#if CONFIG_COMPOUND_SINGLEREF
-  if (!has_second_ref(mbmi)) {
-    assert(is_inter_singleref_comp_mode(mbmi->mode));
-    assert(frame_comp_mv);
-  }
-  assert(has_second_ref(mbmi) || is_inter_singleref_comp_mode(mbmi->mode));
-  const int refs[2] = { mbmi->ref_frame[0],
-                        has_second_ref(mbmi) ? mbmi->ref_frame[1]
-                                             : mbmi->ref_frame[0] };
-#else
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  // This function should only ever be called for compound modes
   assert(has_second_ref(mbmi));
   const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] };
-#endif  // CONFIG_COMPOUND_SINGLEREF
   int_mv ref_mv[2];
   int ite, ref;
-  struct scale_factors sf;
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
   // ic and ir are the 4x4 coordinates of the sub8x8 at index "block"
   const int ic = block & 1;
   const int ir = (block - ic) >> 1;
   struct macroblockd_plane *const pd = &xd->plane[0];
   const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic;
   const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir;
-#if CONFIG_GLOBAL_MOTION
   int is_global[2];
-#if CONFIG_COMPOUND_SINGLEREF
-  for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref)
-#else
-  for (ref = 0; ref < 2; ++ref)
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  {
-    WarpedMotionParams *const wm =
-        &xd->global_motion[xd->mi[0]->mbmi.ref_frame[ref]];
-    is_global[ref] = is_global_mv_block(xd->mi[0], block, wm->wmtype);
-  }
-#if CONFIG_COMPOUND_SINGLEREF
-  if (!has_second_ref(mbmi)) is_global[1] = is_global[0];
-#endif  // CONFIG_COMPOUND_SINGLEREF
-#endif  // CONFIG_GLOBAL_MOTION
-#else   // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-  (void)block;
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+  for (ref = 0; ref < 2; ++ref) {
+    const WarpedMotionParams *const wm =
+        &xd->global_motion[xd->mi[0]->ref_frame[ref]];
+    is_global[ref] = is_global_mv_block(xd->mi[0], wm->wmtype);
+  }
 
   // Do joint motion search in compound mode to get more accurate mv.
   struct buf_2d backup_yv12[2][MAX_MB_PLANE];
@@ -6695,82 +6114,14 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
                                         av1_get_scaled_ref_frame(cpi, refs[1]) };
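Editor's note: what follows is the iterative core of the joint search: up to four passes, alternating which reference is refined while the prediction from the other reference is held fixed. Its control flow reduced to a compilable skeleton (types and step comments are placeholders, not the encoder's API):

typedef struct { int row, col; } MvSketch;

static void joint_search_skeleton(MvSketch cur_mv[2]) {
  for (int ite = 0; ite < 4; ++ite) {
    const int id = ite & 1;  // which reference to refine this pass
    // 1) Build a prediction from the *other* reference at cur_mv[!id].
    // 2) Run a small-range compound search that updates cur_mv[id]
    //    against that fixed second prediction.
    // 3) Break out early when the pass fails to lower the error.
    (void)cur_mv[id];
  }
}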
 
-// Prediction buffer from second frame.
-#if CONFIG_HIGHBITDEPTH
+  // Prediction buffer from second frame.
   DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]);
   uint8_t *second_pred;
-#else
-  DECLARE_ALIGNED(16, uint8_t, second_pred[MAX_SB_SQUARE]);
-#endif  // CONFIG_HIGHBITDEPTH
 
-#if CONFIG_CB4X4
   (void)ref_mv_sub8x8;
-#endif  // CONFIG_CB4X4
-
-#if CONFIG_COMPOUND_SINGLEREF
-  for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref)
-#else
-  for (ref = 0; ref < 2; ++ref)
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  {
-#if !CONFIG_CB4X4
-    if (bsize < BLOCK_8X8 && ref_mv_sub8x8 != NULL)
-      ref_mv[ref].as_int = ref_mv_sub8x8[ref]->as_int;
-    else
-#endif  // !CONFIG_CB4X4
-      ref_mv[ref] = x->mbmi_ext->ref_mvs[refs[ref]][0];
-
-    if (scaled_ref_frame[ref]) {
-      int i;
-      // Swap out the reference frame for a version that's been scaled to
-      // match the resolution of the current frame, allowing the existing
-      // motion search code to be used without additional modifications.
-      for (i = 0; i < MAX_MB_PLANE; i++)
-        backup_yv12[ref][i] = xd->plane[i].pre[ref];
-      av1_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
-                           NULL);
-    }
-  }
-
-#if CONFIG_COMPOUND_SINGLEREF
-  if (!has_second_ref(mbmi)) {
-    assert(is_inter_singleref_comp_mode(mbmi->mode));
-    // NOTE: For single ref comp mode, set up the 2nd set of ref_mv/pre_planes
-    // all from the 1st reference frame, i.e. refs[0].
-    ref_mv[1] = x->mbmi_ext->ref_mvs[refs[0]][0];
-    if (scaled_ref_frame[0]) {
-      int i;
-      // Swap out the reference frame for a version that's been scaled to
-      // match the resolution of the current frame, allowing the existing
-      // motion search code to be used without additional modifications.
-      for (i = 0; i < MAX_MB_PLANE; i++)
-        backup_yv12[1][i] = xd->plane[i].pre[1];
-      av1_setup_pre_planes(xd, 1, scaled_ref_frame[0], mi_row, mi_col, NULL);
-    }
-  }
-#endif  // CONFIG_COMPOUND_SINGLEREF
-
-// Since we have scaled the reference frames to match the size of the current
-// frame we must use a unit scaling factor during mode selection.
-#if CONFIG_HIGHBITDEPTH
-  av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width,
-                                    cm->height, cm->use_highbitdepth);
-#else
-  av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width,
-                                    cm->height);
-#endif  // CONFIG_HIGHBITDEPTH
-
-// Allow joint search multiple times iteratively for each reference frame
-// and break out of the search loop if it couldn't find a better mv.
-#if CONFIG_COMPOUND_SINGLEREF
-  const int num_ites =
-      (has_second_ref(mbmi) || mbmi->mode == SR_NEW_NEWMV) ? 4 : 1;
-  const int start_ite = has_second_ref(mbmi) ? 0 : 1;
-  for (ite = start_ite; ite < (start_ite + num_ites); ite++)
-#else
-  for (ite = 0; ite < 4; ite++)
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  {
+  // Allow joint search multiple times iteratively for each reference frame
+  // and break out of the search loop if it couldn't find a better mv.
+  for (ite = 0; ite < 4; ite++) {
     struct buf_2d ref_yv12[2];
     int bestsme = INT_MAX;
     int sadpb = x->sadperbit16;
@@ -6782,84 +6133,78 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
     // odd iterations search in the second. The predictor
     // found for the 'other' reference frame is factored in.
    const int plane = 0;
-   ConvolveParams conv_params = get_conv_params(!id, 0, plane);
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+   ConvolveParams conv_params = get_conv_params(!id, 0, plane, xd->bd);
+   conv_params.use_jnt_comp_avg = 0;
    WarpTypesAllowed warp_types;
-#if CONFIG_GLOBAL_MOTION
    warp_types.global_warp_allowed = is_global[!id];
-#endif  // CONFIG_GLOBAL_MOTION
-#if CONFIG_WARPED_MOTION
    warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL;
-#endif  // CONFIG_WARPED_MOTION
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-   // Initialized here because of compiler problem in Visual Studio.
+   for (ref = 0; ref < 2; ++ref) {
+     ref_mv[ref] = av1_get_ref_mv(x, ref);
+     // Swap out the reference frame for a version that's been scaled to
+     // match the resolution of the current frame, allowing the existing
+     // motion search code to be used without additional modifications.
+     if (scaled_ref_frame[ref]) {
+       int i;
+       for (i = 0; i < num_planes; i++)
+         backup_yv12[ref][i] = xd->plane[i].pre[ref];
+       av1_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
+                            NULL, num_planes);
+     }
+   }
+
+   assert(IMPLIES(scaled_ref_frame[0] != NULL,
+                  cm->width == scaled_ref_frame[0]->y_crop_width &&
+                      cm->height == scaled_ref_frame[0]->y_crop_height));
+   assert(IMPLIES(scaled_ref_frame[1] != NULL,
+                  cm->width == scaled_ref_frame[1]->y_crop_width &&
+                      cm->height == scaled_ref_frame[1]->y_crop_height));
+
+   // Initialize based on (possibly scaled) prediction buffers.
    ref_yv12[0] = xd->plane[plane].pre[0];
    ref_yv12[1] = xd->plane[plane].pre[1];

-// Get the prediction block from the 'other' reference frame.
-#if CONFIG_COMPOUND_SINGLEREF
- MV *const the_other_mv = (has_second_ref(mbmi) || id)
-                              ? &frame_mv[refs[!id]].as_mv
-                              : &frame_comp_mv[refs[0]].as_mv;
-#endif  // CONFIG_COMPOUND_SINGLEREF
+   // Get the prediction block from the 'other' reference frame.
+   InterpFilters interp_filters = EIGHTTAP_REGULAR;

-#if CONFIG_HIGHBITDEPTH
+   // Since we have scaled the reference frames to match the size of the
+   // current frame we must use a unit scaling factor during mode selection.
    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
      second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
      av1_highbd_build_inter_predictor(
          ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw,
-#if CONFIG_COMPOUND_SINGLEREF
-         the_other_mv,
-#else   // !(CONFIG_COMPOUND_SINGLEREF)
-         &frame_mv[refs[!id]].as_mv,
-#endif  // CONFIG_COMPOUND_SINGLEREF
-         &sf, pw, ph, 0, mbmi->interp_filters,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-         &warp_types, p_col, p_row,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-         plane, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd);
+         &cur_mv[!id].as_mv, &cm->sf_identity, pw, ph, 0, interp_filters,
+         &warp_types, p_col, p_row, plane, MV_PRECISION_Q3, mi_col * MI_SIZE,
+         mi_row * MI_SIZE, xd, cm->allow_warped_motion);
    } else {
      second_pred = (uint8_t *)second_pred_alloc_16;
-#endif  // CONFIG_HIGHBITDEPTH
-     av1_build_inter_predictor(
-         ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw,
-#if CONFIG_COMPOUND_SINGLEREF
-         the_other_mv,
-#else   // !(CONFIG_COMPOUND_SINGLEREF)
-         &frame_mv[refs[!id]].as_mv,
-#endif  // CONFIG_COMPOUND_SINGLEREF
-         &sf, pw, ph, &conv_params, mbmi->interp_filters,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-         &warp_types, p_col, p_row, plane, !id,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-         MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd);
-#if CONFIG_HIGHBITDEPTH
-   }
-#endif  // CONFIG_HIGHBITDEPTH
-
-   // Do compound motion search on the current reference frame.
+     av1_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride,
+                               second_pred, pw, &cur_mv[!id].as_mv,
+                               &cm->sf_identity, pw, ph, &conv_params,
+                               interp_filters, &warp_types, p_col, p_row,
+                               plane, !id, MV_PRECISION_Q3, mi_col * MI_SIZE,
+                               mi_row * MI_SIZE, xd, cm->allow_warped_motion);
+   }
+
+   const int order_idx = id != 0;
+   av1_jnt_comp_weight_assign(cm, mbmi, order_idx, &xd->jcp_param.fwd_offset,
+                              &xd->jcp_param.bck_offset,
+                              &xd->jcp_param.use_jnt_comp_avg, 1);
+
+   // Do full-pixel compound motion search on the current reference frame.
    if (id) xd->plane[plane].pre[0] = ref_yv12[id];
    av1_set_mv_search_range(&x->mv_limits, &ref_mv[id].as_mv);

-// Use the mv result from the single mode as mv predictor.
-// Use the mv result from the single mode as mv predictor.
-#if CONFIG_COMPOUND_SINGLEREF
-   if (!has_second_ref(mbmi) && id)
-     *best_mv = frame_comp_mv[refs[0]].as_mv;
-   else
-#endif  // CONFIG_COMPOUND_SINGLEREF
-     *best_mv = frame_mv[refs[id]].as_mv;
+   // Use the mv result from the single mode as mv predictor.
+   // Use the mv result from the single mode as mv predictor.
+   *best_mv = cur_mv[id].as_mv;

    best_mv->col >>= 3;
    best_mv->row >>= 3;

-#if CONFIG_COMPOUND_SINGLEREF
-   if (!has_second_ref(mbmi))
-     av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx);
-   else
-#endif  // CONFIG_COMPOUND_SINGLEREF
-     av1_set_mvcost(x, refs[id], id, mbmi->ref_mv_idx);
+   av1_set_mvcost(
+       x, id,
+       mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0));

    // Small-range full-pixel motion search.
    bestsme = av1_refining_search_8p_c(x, sadpb, search_range,
@@ -6877,42 +6222,44 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,

    x->mv_limits = tmp_mv_limits;

-#if CONFIG_AMVR
-   if (cpi->common.cur_frame_mv_precision_level) {
+   // Restore the pointer to the first (possibly scaled) prediction buffer.
+   if (id) xd->plane[plane].pre[0] = ref_yv12[0];
+
+   for (ref = 0; ref < 2; ++ref) {
+     if (scaled_ref_frame[ref]) {
+       // Swap back the original buffers for subpel motion search.
+       for (int i = 0; i < num_planes; i++) {
+         xd->plane[i].pre[ref] = backup_yv12[ref][i];
+       }
+       // Re-initialize based on unscaled prediction buffers.
+       ref_yv12[ref] = xd->plane[plane].pre[ref];
+     }
+   }
+
+   // Do sub-pixel compound motion search on the current reference frame.
+   if (id) xd->plane[plane].pre[0] = ref_yv12[id];
+
+   if (cpi->common.cur_frame_force_integer_mv) {
      x->best_mv.as_mv.row *= 8;
      x->best_mv.as_mv.col *= 8;
    }
-   if (bestsme < INT_MAX && cpi->common.cur_frame_mv_precision_level == 0)
-#else
-   if (bestsme < INT_MAX)
-#endif
-   {
+   if (bestsme < INT_MAX && cpi->common.cur_frame_force_integer_mv == 0) {
      int dis; /* TODO: use dis in distortion calculation later. */
      unsigned int sse;
      bestsme = cpi->find_fractional_mv_step(
-         x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv,
-         x->errorperbit, &cpi->fn_ptr[bsize], 0,
-         cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost,
-         &dis, &sse, second_pred, mask, mask_stride, id, pw, ph,
-         cpi->sf.use_upsampled_references);
+         x, cm, mi_row, mi_col, &ref_mv[id].as_mv,
+         cpi->common.allow_high_precision_mv, x->errorperbit,
+         &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL,
+         x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, mask,
+         mask_stride, id, pw, ph, cpi->sf.use_accurate_subpel_search);
    }

-   // Restore the pointer to the first (possibly scaled) prediction buffer.
+   // Restore the pointer to the first prediction buffer.
    if (id) xd->plane[plane].pre[0] = ref_yv12[0];

    if (bestsme < last_besterr[id]) {
-#if CONFIG_COMPOUND_SINGLEREF
-     // NOTE: For single ref comp mode, frame_mv stores the first mv and
-     // frame_comp_mv stores the second mv.
-     if (!has_second_ref(mbmi) && id)
-       frame_comp_mv[refs[0]].as_mv = *best_mv;
-     else
-#endif  // CONFIG_COMPOUND_SINGLEREF
-       frame_mv[refs[id]].as_mv = *best_mv;
+     cur_mv[id].as_mv = *best_mv;
      last_besterr[id] = bestsme;
-#if CONFIG_COMPOUND_SINGLEREF
-     if (!has_second_ref(mbmi)) last_besterr[!id] = last_besterr[id];
-#endif  // CONFIG_COMPOUND_SINGLEREF
    } else {
      break;
    }
@@ -6920,216 +6267,124 @@

  *rate_mv = 0;

-#if CONFIG_COMPOUND_SINGLEREF
- for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref)
-#else
- for (ref = 0; ref < 2; ++ref)
-#endif  // CONFIG_COMPOUND_SINGLEREF
- {
-   if (scaled_ref_frame[ref]) {
-     // Restore the prediction frame pointers to their unscaled versions.
-     int i;
-     for (i = 0; i < MAX_MB_PLANE; i++)
-       xd->plane[i].pre[ref] = backup_yv12[ref][i];
-   }
-
-#if CONFIG_COMPOUND_SINGLEREF
-   if (!has_second_ref(mbmi))
-     av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx);
-   else
-#endif  // CONFIG_COMPOUND_SINGLEREF
-     av1_set_mvcost(x, refs[ref], ref, mbmi->ref_mv_idx);
-
-#if CONFIG_COMPOUND_SINGLEREF
-   if (!has_second_ref(mbmi)) {
-     // NOTE: For single ref comp mode, i.e. !has_second_ref(mbmi) is true, the
-     // first mv is stored in frame_mv[] and the second mv is stored in
-     // frame_comp_mv[].
-     if (compound_ref0_mode(mbmi->mode) == NEWMV)  // SR_NEW_NEWMV
-       *rate_mv += av1_mv_bit_cost(&frame_mv[refs[0]].as_mv,
-                                   &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
-                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
-     assert(compound_ref1_mode(mbmi->mode) == NEWMV);
-     *rate_mv += av1_mv_bit_cost(&frame_comp_mv[refs[0]].as_mv,
-                                 &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
-                                 x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
-   } else {
-#endif  // CONFIG_COMPOUND_SINGLEREF
-#if !CONFIG_CB4X4
-     if (bsize >= BLOCK_8X8)
-#endif  // !CONFIG_CB4X4
-       *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
-                                   &x->mbmi_ext->ref_mvs[refs[ref]][0].as_mv,
-                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
-#if !CONFIG_CB4X4
-     else
-       *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
-                                   &ref_mv_sub8x8[ref]->as_mv, x->nmvjointcost,
-                                   x->mvcost, MV_COST_WEIGHT);
-#endif  // !CONFIG_CB4X4
-#if CONFIG_COMPOUND_SINGLEREF
-   }
-#endif  // CONFIG_COMPOUND_SINGLEREF
- }
+ for (ref = 0; ref < 2; ++ref) {
+   av1_set_mvcost(
+       x, ref,
+       mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0));

-#if CONFIG_COMPOUND_SINGLEREF
- if (!has_second_ref(mbmi)) {
-   if (scaled_ref_frame[0]) {
-     // Restore the prediction frame pointers to their unscaled versions.
-     int i;
-     for (i = 0; i < MAX_MB_PLANE; i++)
-       xd->plane[i].pre[1] = backup_yv12[1][i];
-   }
+   const int_mv curr_ref_mv = av1_get_ref_mv(x, ref);
+   *rate_mv += av1_mv_bit_cost(&cur_mv[ref].as_mv, &curr_ref_mv.as_mv,
+                               x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
  }
-#endif  // CONFIG_COMPOUND_SINGLEREF
}

static void estimate_ref_frame_costs(
-   const AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id,
-   unsigned int *ref_costs_single,
-#if CONFIG_EXT_COMP_REFS
-   unsigned int (*ref_costs_comp)[TOTAL_REFS_PER_FRAME],
-#else
-   unsigned int *ref_costs_comp,
-#endif  // CONFIG_EXT_COMP_REFS
-   aom_prob *comp_mode_p) {
+   const AV1_COMMON *cm, const MACROBLOCKD *xd, const MACROBLOCK *x,
+   int segment_id, unsigned int *ref_costs_single,
+   unsigned int (*ref_costs_comp)[REF_FRAMES]) {
  int seg_ref_active =
      segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
  if (seg_ref_active) {
-   memset(ref_costs_single, 0,
-          TOTAL_REFS_PER_FRAME * sizeof(*ref_costs_single));
-#if CONFIG_EXT_COMP_REFS
+   memset(ref_costs_single, 0, REF_FRAMES * sizeof(*ref_costs_single));
    int ref_frame;
-   for (ref_frame = 0; ref_frame < TOTAL_REFS_PER_FRAME; ++ref_frame)
+   for (ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame)
      memset(ref_costs_comp[ref_frame], 0,
-            TOTAL_REFS_PER_FRAME * sizeof((*ref_costs_comp)[0]));
-#else
-   memset(ref_costs_comp, 0, TOTAL_REFS_PER_FRAME * sizeof(*ref_costs_comp));
-#endif  // CONFIG_EXT_COMP_REFS
-
-   *comp_mode_p = 128;
+           REF_FRAMES * sizeof((*ref_costs_comp)[0]));
  } else {
-   aom_prob intra_inter_p = av1_get_intra_inter_prob(cm, xd);
-   aom_prob comp_inter_p = 128;
-
-   if (cm->reference_mode == REFERENCE_MODE_SELECT) {
-     comp_inter_p = av1_get_reference_mode_prob(cm, xd);
-     *comp_mode_p = comp_inter_p;
-   } else {
-     *comp_mode_p = 128;
-   }
-
-   ref_costs_single[INTRA_FRAME] = av1_cost_bit(intra_inter_p, 0);
-
-   if (cm->reference_mode != COMPOUND_REFERENCE) {
-     aom_prob ref_single_p1 = av1_get_pred_prob_single_ref_p1(cm, xd);
-     aom_prob ref_single_p2 = av1_get_pred_prob_single_ref_p2(cm, xd);
-#if CONFIG_EXT_REFS
-     aom_prob ref_single_p3 = av1_get_pred_prob_single_ref_p3(cm, xd);
-     aom_prob ref_single_p4 = av1_get_pred_prob_single_ref_p4(cm, xd);
-     aom_prob ref_single_p5 = av1_get_pred_prob_single_ref_p5(cm, xd);
-     aom_prob ref_single_p6 = av1_get_pred_prob_single_ref_p6(cm, xd);
-#endif  // CONFIG_EXT_REFS
-
-     unsigned int base_cost = av1_cost_bit(intra_inter_p, 1);
-
-     ref_costs_single[LAST_FRAME] =
-#if CONFIG_EXT_REFS
-         ref_costs_single[LAST2_FRAME] = ref_costs_single[LAST3_FRAME] =
-             ref_costs_single[BWDREF_FRAME] = ref_costs_single[ALTREF2_FRAME] =
-#endif  // CONFIG_EXT_REFS
-                 ref_costs_single[GOLDEN_FRAME] =
-                     ref_costs_single[ALTREF_FRAME] = base_cost;
-
-#if CONFIG_EXT_REFS
-     ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p1, 0);
-     ref_costs_single[LAST2_FRAME] += av1_cost_bit(ref_single_p1, 0);
-     ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p1, 0);
-     ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p1, 0);
-     ref_costs_single[BWDREF_FRAME] += av1_cost_bit(ref_single_p1, 1);
-     ref_costs_single[ALTREF2_FRAME] += av1_cost_bit(ref_single_p1, 1);
-     ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p1, 1);
-
-     ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p3, 0);
-     ref_costs_single[LAST2_FRAME] += av1_cost_bit(ref_single_p3, 0);
-     ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p3, 1);
-     ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p3, 1);
-
-     ref_costs_single[BWDREF_FRAME] += av1_cost_bit(ref_single_p2, 0);
-     ref_costs_single[ALTREF2_FRAME] += av1_cost_bit(ref_single_p2, 0);
-     ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p2, 1);
-
-     ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p4, 0);
-     ref_costs_single[LAST2_FRAME] += av1_cost_bit(ref_single_p4, 1);
-
-     ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p5, 0);
-     ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p5, 1);
-
-     ref_costs_single[BWDREF_FRAME] += av1_cost_bit(ref_single_p6, 0);
-     ref_costs_single[ALTREF2_FRAME] += av1_cost_bit(ref_single_p6, 1);
-#else   // !CONFIG_EXT_REFS
-     ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p1, 0);
-     ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p1, 1);
-     ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p1, 1);
-
-     ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p2, 0);
-     ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p2, 1);
-#endif  // CONFIG_EXT_REFS
-   } else {
-     ref_costs_single[LAST_FRAME] = 512;
-#if CONFIG_EXT_REFS
-     ref_costs_single[LAST2_FRAME] = 512;
-     ref_costs_single[LAST3_FRAME] = 512;
-     ref_costs_single[BWDREF_FRAME] = 512;
-     ref_costs_single[ALTREF2_FRAME] = 512;
-#endif  // CONFIG_EXT_REFS
-     ref_costs_single[GOLDEN_FRAME] = 512;
-     ref_costs_single[ALTREF_FRAME] = 512;
-   }
+   int intra_inter_ctx = av1_get_intra_inter_context(xd);
+   ref_costs_single[INTRA_FRAME] = x->intra_inter_cost[intra_inter_ctx][0];
+   unsigned int base_cost = x->intra_inter_cost[intra_inter_ctx][1];
+
+   for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i)
+     ref_costs_single[i] = base_cost;
+
+   const int ctx_p1 = av1_get_pred_context_single_ref_p1(xd);
+   const int ctx_p2 = av1_get_pred_context_single_ref_p2(xd);
+   const int ctx_p3 = av1_get_pred_context_single_ref_p3(xd);
+   const int ctx_p4 = av1_get_pred_context_single_ref_p4(xd);
+   const int ctx_p5 = av1_get_pred_context_single_ref_p5(xd);
+   const int ctx_p6 = av1_get_pred_context_single_ref_p6(xd);
+
+   // Determine cost of a single ref frame, where frame types are represented
+   // by a tree:
+   // Level 0: add cost whether this ref is a forward or backward ref
+   ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p1][0][0];
+   ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p1][0][0];
+   ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p1][0][0];
+   ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p1][0][0];
+   ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p1][0][1];
+   ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p1][0][1];
+   ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p1][0][1];
+
+   // Level 1: if this ref is forward ref,
+   // add cost whether it is last/last2 or last3/golden
+   ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p3][2][0];
+   ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p3][2][0];
+   ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p3][2][1];
+   ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p3][2][1];
+
+   // Level 1: if this ref is backward ref
+   // then add cost whether this ref is altref or backward ref
+   ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p2][1][0];
+   ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p2][1][0];
+   ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p2][1][1];
+
+   // Level 2: further add cost whether this ref is last or last2
+   ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p4][3][0];
+   ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p4][3][1];
+
+   // Level 2: last3 or golden
+   ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p5][4][0];
+   ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p5][4][1];
+
+   // Level 2: bwdref or altref2
+   ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p6][5][0];
+   ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p6][5][1];

    if (cm->reference_mode != SINGLE_REFERENCE) {
-     aom_prob ref_comp_p = av1_get_pred_prob_comp_ref_p(cm, xd);
-#if CONFIG_EXT_REFS
-     aom_prob ref_comp_p1 = av1_get_pred_prob_comp_ref_p1(cm, xd);
-     aom_prob ref_comp_p2 = av1_get_pred_prob_comp_ref_p2(cm, xd);
-     aom_prob bwdref_comp_p = av1_get_pred_prob_comp_bwdref_p(cm, xd);
-     aom_prob bwdref_comp_p1 = av1_get_pred_prob_comp_bwdref_p1(cm, xd);
-#endif  // CONFIG_EXT_REFS
-
-     unsigned int base_cost = av1_cost_bit(intra_inter_p, 1);
+     // Similar to single ref, determine cost of compound ref frames.
+     // cost_compound_refs = cost_first_ref + cost_second_ref
+     const int bwdref_comp_ctx_p = av1_get_pred_context_comp_bwdref_p(xd);
+     const int bwdref_comp_ctx_p1 = av1_get_pred_context_comp_bwdref_p1(xd);
+     const int ref_comp_ctx_p = av1_get_pred_context_comp_ref_p(xd);
+     const int ref_comp_ctx_p1 = av1_get_pred_context_comp_ref_p1(xd);
+     const int ref_comp_ctx_p2 = av1_get_pred_context_comp_ref_p2(xd);

-#if CONFIG_EXT_COMP_REFS
-     aom_prob comp_ref_type_p = av1_get_comp_reference_type_prob(cm, xd);
-     unsigned int ref_bicomp_costs[TOTAL_REFS_PER_FRAME] = { 0 };
+     const int comp_ref_type_ctx = av1_get_comp_reference_type_context(xd);
+     unsigned int ref_bicomp_costs[REF_FRAMES] = { 0 };

      ref_bicomp_costs[LAST_FRAME] = ref_bicomp_costs[LAST2_FRAME] =
          ref_bicomp_costs[LAST3_FRAME] = ref_bicomp_costs[GOLDEN_FRAME] =
-#if USE_UNI_COMP_REFS
-             base_cost + av1_cost_bit(comp_ref_type_p, 1);
-#else
-             base_cost;
-#endif  // USE_UNI_COMP_REFS
+             base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][1];
      ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF2_FRAME] = 0;
      ref_bicomp_costs[ALTREF_FRAME] = 0;

-     ref_bicomp_costs[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0);
-     ref_bicomp_costs[LAST2_FRAME] += av1_cost_bit(ref_comp_p, 0);
-     ref_bicomp_costs[LAST3_FRAME] += av1_cost_bit(ref_comp_p, 1);
-     ref_bicomp_costs[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p, 1);
+     // cost of first ref frame
+     ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0];
+     ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0];
+     ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1];
+     ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1];

-     ref_bicomp_costs[LAST_FRAME] += av1_cost_bit(ref_comp_p1, 1);
-     ref_bicomp_costs[LAST2_FRAME] += av1_cost_bit(ref_comp_p1, 0);
+     ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][0];
+     ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][1];

-     ref_bicomp_costs[LAST3_FRAME] += av1_cost_bit(ref_comp_p2, 0);
-     ref_bicomp_costs[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p2, 1);
+     ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][0];
+     ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][1];

-     ref_bicomp_costs[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p, 0);
-     ref_bicomp_costs[ALTREF2_FRAME] += av1_cost_bit(bwdref_comp_p, 0);
-     ref_bicomp_costs[ALTREF_FRAME] += av1_cost_bit(bwdref_comp_p, 1);
+     // cost of second ref frame
+     ref_bicomp_costs[BWDREF_FRAME] +=
+         x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
+     ref_bicomp_costs[ALTREF2_FRAME] +=
+         x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
+     ref_bicomp_costs[ALTREF_FRAME] +=
+         x->comp_bwdref_cost[bwdref_comp_ctx_p][0][1];

-     ref_bicomp_costs[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p1, 0);
-     ref_bicomp_costs[ALTREF2_FRAME] += av1_cost_bit(bwdref_comp_p1, 1);
+     ref_bicomp_costs[BWDREF_FRAME] +=
+         x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][0];
+     ref_bicomp_costs[ALTREF2_FRAME] +=
+         x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][1];

+     // cost: if one ref frame is forward ref, the other ref is backward ref
      int ref0, ref1;
      for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
        for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) {
@@ -7138,66 +6393,28 @@ static void estimate_ref_frame_costs(
        }
      }

-     aom_prob uni_comp_ref_p = av1_get_pred_prob_uni_comp_ref_p(cm, xd);
-     aom_prob uni_comp_ref_p1 = av1_get_pred_prob_uni_comp_ref_p1(cm, xd);
-     aom_prob uni_comp_ref_p2 = av1_get_pred_prob_uni_comp_ref_p2(cm, xd);
-
+     // cost: if both ref frames are the same side.
+     const int uni_comp_ref_ctx_p = av1_get_pred_context_uni_comp_ref_p(xd);
+     const int uni_comp_ref_ctx_p1 = av1_get_pred_context_uni_comp_ref_p1(xd);
+     const int uni_comp_ref_ctx_p2 = av1_get_pred_context_uni_comp_ref_p2(xd);
      ref_costs_comp[LAST_FRAME][LAST2_FRAME] =
-         base_cost + av1_cost_bit(comp_ref_type_p, 0) +
-         av1_cost_bit(uni_comp_ref_p, 0) + av1_cost_bit(uni_comp_ref_p1, 0);
+         base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
+         x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+         x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][0];
      ref_costs_comp[LAST_FRAME][LAST3_FRAME] =
-         base_cost + av1_cost_bit(comp_ref_type_p, 0) +
-         av1_cost_bit(uni_comp_ref_p, 0) + av1_cost_bit(uni_comp_ref_p1, 1) +
-         av1_cost_bit(uni_comp_ref_p2, 0);
+         base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
+         x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+         x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
+         x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][0];
      ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] =
-         base_cost + av1_cost_bit(comp_ref_type_p, 0) +
-         av1_cost_bit(uni_comp_ref_p, 0) + av1_cost_bit(uni_comp_ref_p1, 1) +
-         av1_cost_bit(uni_comp_ref_p2, 1);
-
+         base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
+         x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+         x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
+         x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][1];
      ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] =
-         base_cost + av1_cost_bit(comp_ref_type_p, 0) +
-         av1_cost_bit(uni_comp_ref_p, 1);
-
-#else  // !CONFIG_EXT_COMP_REFS
-
-     ref_costs_comp[LAST_FRAME] =
-#if CONFIG_EXT_REFS
-         ref_costs_comp[LAST2_FRAME] = ref_costs_comp[LAST3_FRAME] =
-#endif  // CONFIG_EXT_REFS
-             ref_costs_comp[GOLDEN_FRAME] = base_cost;
-
-#if CONFIG_EXT_REFS
-     ref_costs_comp[BWDREF_FRAME] = ref_costs_comp[ALTREF2_FRAME] =
-         ref_costs_comp[ALTREF_FRAME] = 0;
-#endif  // CONFIG_EXT_REFS
-
-#if CONFIG_EXT_REFS
-     ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0);
-     ref_costs_comp[LAST2_FRAME] += av1_cost_bit(ref_comp_p, 0);
-     ref_costs_comp[LAST3_FRAME] += av1_cost_bit(ref_comp_p, 1);
-     ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p, 1);
-
-     ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p1, 1);
-     ref_costs_comp[LAST2_FRAME] += av1_cost_bit(ref_comp_p1, 0);
-
-     ref_costs_comp[LAST3_FRAME] += av1_cost_bit(ref_comp_p2, 0);
-     ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p2, 1);
-
-     // NOTE(zoeliu): BWDREF and ALTREF each add an extra cost by coding 1
-     // more bit.
-     ref_costs_comp[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p, 0);
-     ref_costs_comp[ALTREF2_FRAME] += av1_cost_bit(bwdref_comp_p, 0);
-     ref_costs_comp[ALTREF_FRAME] += av1_cost_bit(bwdref_comp_p, 1);
-
-     ref_costs_comp[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p1, 0);
-     ref_costs_comp[ALTREF2_FRAME] += av1_cost_bit(bwdref_comp_p1, 1);
-#else   // !CONFIG_EXT_REFS
-     ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0);
-     ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p, 1);
-#endif  // CONFIG_EXT_REFS
-#endif  // CONFIG_EXT_COMP_REFS
+         base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
+         x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][1];
    } else {
-#if CONFIG_EXT_COMP_REFS
      int ref0, ref1;
      for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
        for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1)
@@ -7207,17 +6424,6 @@ static void estimate_ref_frame_costs(
      ref_costs_comp[LAST_FRAME][LAST3_FRAME] = 512;
      ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = 512;
      ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = 512;
-#else   // !CONFIG_EXT_COMP_REFS
-     ref_costs_comp[LAST_FRAME] = 512;
-#if CONFIG_EXT_REFS
-     ref_costs_comp[LAST2_FRAME] = 512;
-     ref_costs_comp[LAST3_FRAME] = 512;
-     ref_costs_comp[BWDREF_FRAME] = 512;
-     ref_costs_comp[ALTREF2_FRAME] = 512;
-#endif  // CONFIG_EXT_REFS
-     ref_costs_comp[GOLDEN_FRAME] = 512;
-#endif  // CONFIG_EXT_COMP_REFS
    }
  }
}
@@ -7240,17 +6446,15 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
  ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
}

-static void setup_buffer_inter(
+static void setup_buffer_ref_mvs_inter(
    const AV1_COMP *const cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
    BLOCK_SIZE block_size, int mi_row, int mi_col,
-   int_mv frame_nearest_mv[TOTAL_REFS_PER_FRAME],
-   int_mv frame_near_mv[TOTAL_REFS_PER_FRAME],
-   struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME][MAX_MB_PLANE]) {
+   struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
  const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
  MACROBLOCKD *const xd = &x->e_mbd;
- MODE_INFO *const mi = xd->mi[0];
- int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
+ MB_MODE_INFO *const mbmi = xd->mi[0];
  const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
@@ -7258,35 +6462,20 @@ static void setup_buffer_inter(
  // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
  // use the UV scaling factors.
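The estimate_ref_frame_costs() rewrite above swaps per-bit probability lookups (av1_cost_bit on aom_prob values) for precomputed, context-indexed rate tables on MACROBLOCK, but the underlying structure is unchanged: each single reference is priced by summing branch costs along a fixed binary tree (forward vs backward, then last/last2 vs last3/golden, and so on). Below is a self-contained sketch of that tree walk; the flat branch_cost table and base_cost value are made-up stand-ins for the context-dependent x->single_ref_cost[][][] entries, not the library's data.

    #include <stdio.h>

    /* Hypothetical rate (arbitrary units) of signalling `bit` at tree level
     * `level`; the encoder looks these up per spatial context (ctx_p1..p6)
     * rather than from one flat table. */
    static int branch_cost(int level, int bit) {
      static const int cost[3][2] = { { 10, 14 }, { 9, 12 }, { 8, 8 } };
      return cost[level][bit];
    }

    int main(void) {
      /* Branch decisions per single reference frame:
       * level 0: forward (0) vs backward (1)
       * level 1: last/last2 vs last3/golden, or bwdref/altref2 vs altref
       * level 2: final pick within the pair (ALTREF stops at level 1). */
      static const struct {
        const char *name;
        int depth;
        int path[3];
      } refs[] = {
        { "LAST",    3, { 0, 0, 0 } }, { "LAST2",   3, { 0, 0, 1 } },
        { "LAST3",   3, { 0, 1, 0 } }, { "GOLDEN",  3, { 0, 1, 1 } },
        { "BWDREF",  3, { 1, 0, 0 } }, { "ALTREF2", 3, { 1, 0, 1 } },
        { "ALTREF",  2, { 1, 1, 0 } },
      };
      const int base_cost = 100; /* stand-in for the "inter, single ref" rate */
      for (int i = 0; i < (int)(sizeof(refs) / sizeof(refs[0])); ++i) {
        int total = base_cost;
        for (int l = 0; l < refs[i].depth; ++l)
          total += branch_cost(l, refs[i].path[l]);
        printf("%-8s -> %d\n", refs[i].name, total);
      }
      return 0;
    }

The compound-reference path in the patch works the same way, except that the total is the sum of two such walks (first ref, then second ref) plus the comp_ref_type signalling cost.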
- av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
+ av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf,
+                      num_planes);

  // Gets an initial list of candidate vectors from neighbours and orders them
- av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame],
-                  mbmi_ext->ref_mv_stack[ref_frame],
-                  mbmi_ext->compound_mode_context, candidates, mi_row, mi_col,
-                  NULL, NULL, mbmi_ext->mode_context);
-
-// Candidate refinement carried out at encoder and decoder
-#if CONFIG_AMVR
- av1_find_best_ref_mvs(cm->allow_high_precision_mv, candidates,
-                       &frame_nearest_mv[ref_frame], &frame_near_mv[ref_frame],
-                       cm->cur_frame_mv_precision_level);
-#else
- av1_find_best_ref_mvs(cm->allow_high_precision_mv, candidates,
-                       &frame_nearest_mv[ref_frame],
-                       &frame_near_mv[ref_frame]);
-#endif
-// Further refinement that is encode side only to test the top few candidates
-// in full and choose the best as the centre point for subsequent searches.
-// The current implementation doesn't support scaling.
-#if CONFIG_CB4X4
+ av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
+                  mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row,
+                  mi_col, mbmi_ext->mode_context);
+
+ // Further refinement that is encode side only to test the top few candidates
+ // in full and choose the best as the centre point for subsequent searches.
+ // The current implementation doesn't support scaling.
+ (void)block_size;
  av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
              block_size);
-#else
- if (!av1_is_scaled(sf) && block_size >= BLOCK_8X8)
-   av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
-               block_size);
-#endif  // CONFIG_CB4X4
}

static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
@@ -7294,19 +6483,15 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
                                 int ref_idx, int *rate_mv) {
  MACROBLOCKD *xd = &x->e_mbd;
  const AV1_COMMON *cm = &cpi->common;
- MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ const int num_planes = av1_num_planes(cm);
+ MB_MODE_INFO *mbmi = xd->mi[0];
  struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
  int bestsme = INT_MAX;
  int step_param;
  int sadpb = x->sadperbit16;
  MV mvp_full;
-#if CONFIG_COMPOUND_SINGLEREF
- int ref =
-     has_second_ref(mbmi) ? mbmi->ref_frame[ref_idx] : mbmi->ref_frame[0];
-#else   // !CONFIG_COMPOUND_SINGLEREF
  int ref = mbmi->ref_frame[ref_idx];
-#endif  // CONFIG_COMPOUND_SINGLEREF
- MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
+ MV ref_mv = av1_get_ref_mv(x, ref_idx).as_mv;

  MvLimits tmp_mv_limits = x->mv_limits;
  int cost_list[5];
@@ -7314,25 +6499,21 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
  const YV12_BUFFER_CONFIG *scaled_ref_frame =
      av1_get_scaled_ref_frame(cpi, ref);

- MV pred_mv[3];
- pred_mv[0] = x->mbmi_ext->ref_mvs[ref][0].as_mv;
- pred_mv[1] = x->mbmi_ext->ref_mvs[ref][1].as_mv;
- pred_mv[2] = x->pred_mv[ref];
-
  if (scaled_ref_frame) {
-   int i;
    // Swap out the reference frame for a version that's been scaled to
    // match the resolution of the current frame, allowing the existing
-   // motion search code to be used without additional modifications.
-   for (i = 0; i < MAX_MB_PLANE; i++)
+   // full-pixel motion search code to be used without additional
+   // modifications.
+   for (int i = 0; i < num_planes; i++) {
      backup_yv12[i] = xd->plane[i].pre[ref_idx];
-
-   av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL);
+   }
+   av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
+                        num_planes);
  }

- av1_set_mv_search_range(&x->mv_limits, &ref_mv);
-
- av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx);
+ av1_set_mvcost(
+     x, ref_idx,
+     mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0));

  // Work out the size of the first step in the mv step search.
  // 0 here is maximum length first step. 1 is AOMMAX >> 1 etc.
@@ -7347,16 +6528,16 @@
    step_param = cpi->mv_step_param;
  }

- if (cpi->sf.adaptive_motion_search && bsize < cm->sb_size) {
+ if (cpi->sf.adaptive_motion_search && bsize < cm->seq_params.sb_size) {
    int boffset =
-       2 * (b_width_log2_lookup[cm->sb_size] -
-            AOMMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
+       2 * (mi_size_wide_log2[cm->seq_params.sb_size] -
+            AOMMIN(mi_size_high_log2[bsize], mi_size_wide_log2[bsize]));
    step_param = AOMMAX(step_param, boffset);
  }

  if (cpi->sf.adaptive_motion_search) {
-   int bwl = b_width_log2_lookup[bsize];
-   int bhl = b_height_log2_lookup[bsize];
+   int bwl = mi_size_wide_log2[bsize];
+   int bhl = mi_size_high_log2[bsize];
    int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);

    if (tlevel < 5) {
@@ -7374,8 +6555,8 @@
          x->best_mv.as_int = INVALID_MV;

          if (scaled_ref_frame) {
-           int j;
-           for (j = 0; j < MAX_MB_PLANE; ++j)
+           // Swap back the original buffers before returning.
+           for (int j = 0; j < num_planes; ++j)
              xd->plane[j].pre[ref_idx] = backup_yv12[j];
          }
          return;
@@ -7384,35 +6565,26 @@
    }
  }

+ // Note: MV limits are modified here. Always restore the original values
+ // after full-pixel motion search.
  av1_set_mv_search_range(&x->mv_limits, &ref_mv);

-#if CONFIG_MOTION_VAR
  if (mbmi->motion_mode != SIMPLE_TRANSLATION)
    mvp_full = mbmi->mv[0].as_mv;
  else
-#endif  // CONFIG_MOTION_VAR
-   mvp_full = pred_mv[x->mv_best_ref_index[ref]];
+   mvp_full = ref_mv;

  mvp_full.col >>= 3;
  mvp_full.row >>= 3;

  x->best_mv.as_int = x->second_best_mv.as_int = INVALID_MV;

-#if CONFIG_MOTION_VAR
  switch (mbmi->motion_mode) {
    case SIMPLE_TRANSLATION:
-#endif  // CONFIG_MOTION_VAR
-#if CONFIG_HASH_ME
      bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
                                      sadpb, cond_cost_list(cpi, cost_list),
                                      &ref_mv, INT_MAX, 1, (MI_SIZE * mi_col),
                                      (MI_SIZE * mi_row), 0);
-#else
-     bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
-                                     sadpb, cond_cost_list(cpi, cost_list),
-                                     &ref_mv, INT_MAX, 1);
-#endif
-#if CONFIG_MOTION_VAR
      break;
    case OBMC_CAUSAL:
      bestsme = av1_obmc_full_pixel_diamond(
@@ -7422,25 +6594,27 @@
      break;
    default: assert(0 && "Invalid motion mode!\n");
  }
-#endif  // CONFIG_MOTION_VAR
+
+ if (scaled_ref_frame) {
+   // Swap back the original buffers for subpel motion search.
+   for (int i = 0; i < num_planes; i++) {
+     xd->plane[i].pre[ref_idx] = backup_yv12[i];
+   }
+ }

  x->mv_limits = tmp_mv_limits;

-#if CONFIG_AMVR
- if (cpi->common.cur_frame_mv_precision_level) {
+ if (cpi->common.cur_frame_force_integer_mv) {
    x->best_mv.as_mv.row *= 8;
    x->best_mv.as_mv.col *= 8;
  }
- if (bestsme < INT_MAX && cpi->common.cur_frame_mv_precision_level == 0) {
-#else
- if (bestsme < INT_MAX) {
-#endif
+ const int use_fractional_mv =
+     bestsme < INT_MAX && cpi->common.cur_frame_force_integer_mv == 0;
+ if (use_fractional_mv) {
    int dis; /* TODO: use dis in distortion calculation later. */
-#if CONFIG_MOTION_VAR
    switch (mbmi->motion_mode) {
      case SIMPLE_TRANSLATION:
-#endif  // CONFIG_MOTION_VAR
-       if (cpi->sf.use_upsampled_references) {
+       if (cpi->sf.use_accurate_subpel_search) {
          int best_mv_var;
          const int try_second = x->second_best_mv.as_int != INVALID_MV &&
                                 x->second_best_mv.as_int != x->best_mv.as_int;
@@ -7448,8 +6622,8 @@
          const int ph = block_size_high[bsize];

          best_mv_var = cpi->find_fractional_mv_step(
-             x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
-             &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+             x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv,
+             x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
              cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
             x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL,
             NULL, 0, 0, pw, ph, 1);
@@ -7472,8 +6646,9 @@
              x->best_mv.as_mv.col * 8 <= maxc &&
              x->best_mv.as_mv.col * 8 >= minc) {
            this_var = cpi->find_fractional_mv_step(
-               x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
-               &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+               x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv,
+               x->errorperbit, &cpi->fn_ptr[bsize],
+               cpi->sf.mv.subpel_force_stop,
                cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
                x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL,
                NULL, 0, 0, pw, ph, 1);
@@ -7483,45 +6658,35 @@
          }
        } else {
          cpi->find_fractional_mv_step(
-             x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
-             &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+             x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv,
+             x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
              cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
              x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL,
              NULL, 0, 0, 0, 0, 0);
        }
-#if CONFIG_MOTION_VAR
        break;
      case OBMC_CAUSAL:
        av1_find_best_obmc_sub_pixel_tree_up(
-           x, &x->best_mv.as_mv, &ref_mv, cm->allow_high_precision_mv,
-           x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
-           cpi->sf.mv.subpel_iters_per_step, x->nmvjointcost, x->mvcost, &dis,
-           &x->pred_sse[ref], 0, cpi->sf.use_upsampled_references);
+           x, cm, mi_row, mi_col, &x->best_mv.as_mv, &ref_mv,
+           cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize],
+           cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step,
+           x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], 0,
+           cpi->sf.use_accurate_subpel_search);
        break;
      default: assert(0 && "Invalid motion mode!\n");
    }
-#endif  // CONFIG_MOTION_VAR
  }
  *rate_mv = av1_mv_bit_cost(&x->best_mv.as_mv, &ref_mv, x->nmvjointcost,
                             x->mvcost, MV_COST_WEIGHT);

-#if CONFIG_MOTION_VAR
  if (cpi->sf.adaptive_motion_search && mbmi->motion_mode == SIMPLE_TRANSLATION)
-#else
- if (cpi->sf.adaptive_motion_search)
-#endif  // CONFIG_MOTION_VAR
    x->pred_mv[ref] = x->best_mv.as_mv;
-
- if (scaled_ref_frame) {
-   int i;
-   for (i = 0; i < MAX_MB_PLANE; i++)
-     xd->plane[i].pre[ref_idx] = backup_yv12[i];
- }
}

-static INLINE void restore_dst_buf(MACROBLOCKD *xd, BUFFER_SET dst) {
+static INLINE void restore_dst_buf(MACROBLOCKD *xd, BUFFER_SET dst,
+                                   const int num_planes) {
  int i;
- for (i = 0; i < MAX_MB_PLANE; i++) {
+ for (i = 0; i < num_planes; i++) {
    xd->plane[i].dst.buf = dst.plane[i];
    xd->plane[i].dst.stride = dst.stride[i];
  }
@@ -7535,106 +6700,50 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x,
  const int pw = block_size_wide[bsize];
  const int ph = block_size_high[bsize];
  MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-#if CONFIG_COMPOUND_SINGLEREF
- const int other_ref =
-     has_second_ref(mbmi) ? mbmi->ref_frame[!ref_idx] : mbmi->ref_frame[0];
-#else   // !CONFIG_COMPOUND_SINGLEREF
+ MB_MODE_INFO *mbmi = xd->mi[0];
  const int other_ref = mbmi->ref_frame[!ref_idx];
-#endif  // CONFIG_COMPOUND_SINGLEREF
- struct scale_factors sf;
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
  struct macroblockd_plane *const pd = &xd->plane[0];
  // ic and ir are the 4x4 coordinates of the sub8x8 at index "block"
  const int ic = block & 1;
  const int ir = (block - ic) >> 1;
  const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic;
  const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir;
-#if CONFIG_GLOBAL_MOTION
- WarpedMotionParams *const wm = &xd->global_motion[other_ref];
- int is_global = is_global_mv_block(xd->mi[0], block, wm->wmtype);
-#endif  // CONFIG_GLOBAL_MOTION
-#else
- (void)block;
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ const WarpedMotionParams *const wm = &xd->global_motion[other_ref];
+ int is_global = is_global_mv_block(xd->mi[0], wm->wmtype);

-// This function should only ever be called for compound modes
-#if CONFIG_COMPOUND_SINGLEREF
- assert(has_second_ref(mbmi) || is_inter_singleref_comp_mode(mbmi->mode));
-#else   // !CONFIG_COMPOUND_SINGLEREF
+ // This function should only ever be called for compound modes
  assert(has_second_ref(mbmi));
-#endif  // CONFIG_COMPOUND_SINGLEREF
-
- struct buf_2d backup_yv12[MAX_MB_PLANE];
- const YV12_BUFFER_CONFIG *const scaled_ref_frame =
-     av1_get_scaled_ref_frame(cpi, other_ref);
-
- if (scaled_ref_frame) {
-   int i;
-   // Swap out the reference frame for a version that's been scaled to
-   // match the resolution of the current frame, allowing the existing
-   // motion search code to be used without additional modifications.
-   for (i = 0; i < MAX_MB_PLANE; i++)
-     backup_yv12[i] = xd->plane[i].pre[!ref_idx];
-   av1_setup_pre_planes(xd, !ref_idx, scaled_ref_frame, mi_row, mi_col, NULL);
- }
-
-// Since we have scaled the reference frames to match the size of the current
-// frame we must use a unit scaling factor during mode selection.
-#if CONFIG_HIGHBITDEPTH
- av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width,
-                                   cm->height, cm->use_highbitdepth);
-#else
- av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width,
-                                   cm->height);
-#endif  // CONFIG_HIGHBITDEPTH
+ const int plane = 0;
+ struct buf_2d ref_yv12 = xd->plane[plane].pre[!ref_idx];

- struct buf_2d ref_yv12;
+ struct scale_factors sf;
+ av1_setup_scale_factors_for_frame(&sf, ref_yv12.width, ref_yv12.height,
+                                   cm->width, cm->height);

- const int plane = 0;
- ConvolveParams conv_params = get_conv_params(!ref_idx, 0, plane);
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ ConvolveParams conv_params = get_conv_params(!ref_idx, 0, plane, xd->bd);
  WarpTypesAllowed warp_types;
-#if CONFIG_GLOBAL_MOTION
  warp_types.global_warp_allowed = is_global;
-#endif  // CONFIG_GLOBAL_MOTION
-#if CONFIG_WARPED_MOTION
  warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL;
-#endif  // CONFIG_WARPED_MOTION
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
- // Initialized here because of compiler problem in Visual Studio.
- ref_yv12 = xd->plane[plane].pre[!ref_idx];

-// Get the prediction block from the 'other' reference frame.
-#if CONFIG_HIGHBITDEPTH
+ // Get the prediction block from the 'other' reference frame.
  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
    av1_highbd_build_inter_predictor(
        ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph,
-       0, mbmi->interp_filters,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-       &warp_types, p_col, p_row,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-       plane, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd);
+       0, mbmi->interp_filters, &warp_types, p_col, p_row, plane,
+       MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd,
+       cm->allow_warped_motion);
  } else {
-#endif  // CONFIG_HIGHBITDEPTH
    av1_build_inter_predictor(
        ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph,
-       &conv_params, mbmi->interp_filters,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-       &warp_types, p_col, p_row, plane, !ref_idx,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-       MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd);
-#if CONFIG_HIGHBITDEPTH
+       &conv_params, mbmi->interp_filters, &warp_types, p_col, p_row, plane,
+       !ref_idx, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd,
+       cm->allow_warped_motion);
  }
-#endif  // CONFIG_HIGHBITDEPTH

- if (scaled_ref_frame) {
-   // Restore the prediction frame pointers to their unscaled versions.
-   int i;
-   for (i = 0; i < MAX_MB_PLANE; i++)
-     xd->plane[i].pre[!ref_idx] = backup_yv12[i];
- }
+ av1_jnt_comp_weight_assign(cm, mbmi, 0, &xd->jcp_param.fwd_offset,
+                            &xd->jcp_param.bck_offset,
+                            &xd->jcp_param.use_jnt_comp_avg, 1);
}

// Search for the best mv for one component of a compound,
@@ -7645,45 +6754,41 @@ static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
                                          const uint8_t *second_pred,
                                          const uint8_t *mask, int mask_stride,
                                          int *rate_mv, int ref_idx) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
  const int pw = block_size_wide[bsize];
  const int ph = block_size_high[bsize];
  MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-#if CONFIG_COMPOUND_SINGLEREF
- const int ref =
-     has_second_ref(mbmi) ? mbmi->ref_frame[ref_idx] : mbmi->ref_frame[0];
-#else
+ MB_MODE_INFO *mbmi = xd->mi[0];
  const int ref = mbmi->ref_frame[ref_idx];
-#endif  // CONFIG_COMPOUND_SINGLEREF
- int_mv ref_mv = x->mbmi_ext->ref_mvs[ref][0];
+ const int_mv ref_mv = av1_get_ref_mv(x, ref_idx);
  struct macroblockd_plane *const pd = &xd->plane[0];

  struct buf_2d backup_yv12[MAX_MB_PLANE];
  const YV12_BUFFER_CONFIG *const scaled_ref_frame =
      av1_get_scaled_ref_frame(cpi, ref);

-// Check that this is either an interinter or an interintra block
-#if CONFIG_COMPOUND_SINGLEREF
- assert(has_second_ref(mbmi) ||
-        // or a single ref comp pred mode
-        is_inter_singleref_comp_mode(mbmi->mode) ||
-        (ref_idx == 0 && mbmi->ref_frame[1] == INTRA_FRAME));
-#else
- assert(has_second_ref(mbmi) ||
-        (ref_idx == 0 && mbmi->ref_frame[1] == INTRA_FRAME));
-#endif  // CONFIG_COMPOUND_SINGLEREF
+ // Check that this is either an interinter or an interintra block
+ assert(has_second_ref(mbmi) || (ref_idx == 0 && is_interintra_mode(mbmi)));
+
+ // Store the first prediction buffer.
+ struct buf_2d orig_yv12;
+ if (ref_idx) {
+   orig_yv12 = pd->pre[0];
+   pd->pre[0] = pd->pre[ref_idx];
+ }

  if (scaled_ref_frame) {
    int i;
    // Swap out the reference frame for a version that's been scaled to
    // match the resolution of the current frame, allowing the existing
-   // motion search code to be used without additional modifications.
-   for (i = 0; i < MAX_MB_PLANE; i++)
-     backup_yv12[i] = xd->plane[i].pre[ref_idx];
-   av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL);
+   // full-pixel motion search code to be used without additional
+   // modifications.
+   for (i = 0; i < num_planes; i++) backup_yv12[i] = xd->plane[i].pre[ref_idx];
+   av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
+                        num_planes);
  }

- struct buf_2d orig_yv12;
  int bestsme = INT_MAX;
  int sadpb = x->sadperbit16;
  MV *const best_mv = &x->best_mv.as_mv;
@@ -7691,12 +6796,6 @@ static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,

  MvLimits tmp_mv_limits = x->mv_limits;

- // Initialized here because of compiler problem in Visual Studio.
- if (ref_idx) {
-   orig_yv12 = pd->pre[0];
-   pd->pre[0] = pd->pre[ref_idx];
- }
-
  // Do compound motion search on the current reference frame.
  av1_set_mv_search_range(&x->mv_limits, &ref_mv.as_mv);
@@ -7706,12 +6805,9 @@
  best_mv->col >>= 3;
  best_mv->row >>= 3;

-#if CONFIG_COMPOUND_SINGLEREF
- if (!has_second_ref(mbmi))
-   av1_set_mvcost(x, ref, 0, mbmi->ref_mv_idx);
- else
-#endif  // CONFIG_COMPOUND_SINGLEREF
-   av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx);
+ av1_set_mvcost(
+     x, ref_idx,
+     mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0));

  // Small-range full-pixel motion search.
  bestsme = av1_refining_search_8p_c(x, sadpb, search_range,
@@ -7729,44 +6825,40 @@

  x->mv_limits = tmp_mv_limits;

-#if CONFIG_AMVR
- if (cpi->common.cur_frame_mv_precision_level) {
+ if (scaled_ref_frame) {
+   // Swap back the original buffers for subpel motion search.
+   for (int i = 0; i < num_planes; i++) {
+     xd->plane[i].pre[ref_idx] = backup_yv12[i];
+   }
+ }
+
+ if (cpi->common.cur_frame_force_integer_mv) {
    x->best_mv.as_mv.row *= 8;
    x->best_mv.as_mv.col *= 8;
  }
- if (bestsme < INT_MAX && cpi->common.cur_frame_mv_precision_level == 0) {
-#else
- if (bestsme < INT_MAX) {
-#endif
+ const int use_fractional_mv =
+     bestsme < INT_MAX && cpi->common.cur_frame_force_integer_mv == 0;
+ if (use_fractional_mv) {
    int dis; /* TODO: use dis in distortion calculation later. */
    unsigned int sse;
    bestsme = cpi->find_fractional_mv_step(
-       x, &ref_mv.as_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
+       x, cm, mi_row, mi_col, &ref_mv.as_mv,
+       cpi->common.allow_high_precision_mv, x->errorperbit,
        &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL,
        x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, mask, mask_stride,
-       ref_idx, pw, ph, cpi->sf.use_upsampled_references);
+       ref_idx, pw, ph, cpi->sf.use_accurate_subpel_search);
  }

- // Restore the pointer to the first (possibly scaled) prediction buffer.
+ // Restore the pointer to the first unscaled prediction buffer.
  if (ref_idx) pd->pre[0] = orig_yv12;

  if (bestsme < INT_MAX) *this_mv = *best_mv;

  *rate_mv = 0;

- if (scaled_ref_frame) {
-   // Restore the prediction frame pointers to their unscaled versions.
-   int i;
-   for (i = 0; i < MAX_MB_PLANE; i++)
-     xd->plane[i].pre[ref_idx] = backup_yv12[i];
- }
-
-#if CONFIG_COMPOUND_SINGLEREF
- if (!has_second_ref(mbmi))
-   av1_set_mvcost(x, ref, 0, mbmi->ref_mv_idx);
- else
-#endif  // CONFIG_COMPOUND_SINGLEREF
-   av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx);
+ av1_set_mvcost(
+     x, ref_idx,
+     mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0));
  *rate_mv += av1_mv_bit_cost(this_mv, &ref_mv.as_mv, x->nmvjointcost,
                              x->mvcost, MV_COST_WEIGHT);
}
@@ -7774,51 +6866,23 @@
// Wrapper for compound_single_motion_search, for the common case
// where the second prediction is also an inter mode.
static void compound_single_motion_search_interinter(
-   const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *frame_mv,
-#if CONFIG_COMPOUND_SINGLEREF
-   int_mv *frame_comp_mv,
-#endif  // CONFIG_COMPOUND_SINGLEREF
+   const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv,
    int mi_row, int mi_col, const uint8_t *mask, int mask_stride, int *rate_mv,
    const int block, int ref_idx) {
  MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-
-// This function should only ever be called for compound modes
-#if CONFIG_COMPOUND_SINGLEREF
- int is_singleref_comp_mode =
-     !has_second_ref(mbmi) && is_inter_singleref_comp_mode(mbmi->mode);
- assert(has_second_ref(mbmi) || is_singleref_comp_mode);
- if (is_singleref_comp_mode && ref_idx) assert(frame_comp_mv);
-#else   // !CONFIG_COMPOUND_SINGLEREF
- assert(has_second_ref(mbmi));
-#endif  // CONFIG_COMPOUND_SINGLEREF
+ // This function should only ever be called for compound modes
+ assert(has_second_ref(xd->mi[0]));

-// Prediction buffer from second frame.
-#if CONFIG_HIGHBITDEPTH
+ // Prediction buffer from second frame.
  DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]);
  uint8_t *second_pred;
  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
    second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
  else
    second_pred = (uint8_t *)second_pred_alloc_16;
-#else
- DECLARE_ALIGNED(16, uint8_t, second_pred[MAX_SB_SQUARE]);
-#endif  // CONFIG_HIGHBITDEPTH
-
-#if CONFIG_COMPOUND_SINGLEREF
- MV *this_mv = has_second_ref(mbmi)
-                   ? &frame_mv[mbmi->ref_frame[ref_idx]].as_mv
-                   : (ref_idx ? &frame_comp_mv[mbmi->ref_frame[0]].as_mv
-                              : &frame_mv[mbmi->ref_frame[0]].as_mv);
- const MV *other_mv =
-     has_second_ref(mbmi)
-         ? &frame_mv[mbmi->ref_frame[!ref_idx]].as_mv
-         : (ref_idx ? &frame_mv[mbmi->ref_frame[0]].as_mv
-                    : &frame_comp_mv[mbmi->ref_frame[0]].as_mv);
-#else   // !CONFIG_COMPOUND_SINGLEREF
- MV *this_mv = &frame_mv[mbmi->ref_frame[ref_idx]].as_mv;
- const MV *other_mv = &frame_mv[mbmi->ref_frame[!ref_idx]].as_mv;
-#endif  // CONFIG_COMPOUND_SINGLEREF
+
+ MV *this_mv = &cur_mv[ref_idx].as_mv;
+ const MV *other_mv = &cur_mv[!ref_idx].as_mv;

  build_second_inter_pred(cpi, x, bsize, other_mv, mi_row, mi_col, block,
                          ref_idx, second_pred);
@@ -7828,58 +6892,33 @@ static void compound_single_motion_search_interinter(
                                 ref_idx);
}

-#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
static void do_masked_motion_search_indexed(
    const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
    const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE bsize,
    int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv, int which) {
  // NOTE: which values: 0 - 0 only, 1 - 1 only, 2 - both
  MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ MB_MODE_INFO *mbmi = xd->mi[0];
  BLOCK_SIZE sb_type = mbmi->sb_type;
  const uint8_t *mask;
  const int mask_stride = block_size_wide[bsize];

  mask = av1_get_compound_type_mask(comp_data, sb_type);

- int_mv frame_mv[TOTAL_REFS_PER_FRAME];
-#if CONFIG_COMPOUND_SINGLEREF
- int_mv frame_comp_mv[TOTAL_REFS_PER_FRAME];
-#endif  // CONFIG_COMPOUND_SINGLEREF
- MV_REFERENCE_FRAME rf[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] };
- assert(bsize >= BLOCK_8X8 || CONFIG_CB4X4);
-
- frame_mv[rf[0]].as_int = cur_mv[0].as_int;
-#if CONFIG_COMPOUND_SINGLEREF
- if (!has_second_ref(mbmi))
-   frame_comp_mv[rf[0]].as_int = cur_mv[1].as_int;
- else
-#endif  // CONFIG_COMPOUND_SINGLEREF
-   frame_mv[rf[1]].as_int = cur_mv[1].as_int;
+ tmp_mv[0].as_int = cur_mv[0].as_int;
+ tmp_mv[1].as_int = cur_mv[1].as_int;
  if (which == 0 || which == 1) {
-   compound_single_motion_search_interinter(
-       cpi, x, bsize, frame_mv,
-#if CONFIG_COMPOUND_SINGLEREF
-       has_second_ref(mbmi) ? NULL : frame_comp_mv,
-#endif  // CONFIG_COMPOUND_SINGLEREF
-       mi_row, mi_col, mask, mask_stride, rate_mv, 0, which);
+   compound_single_motion_search_interinter(cpi, x, bsize, tmp_mv, mi_row,
                                             mi_col, mask, mask_stride, rate_mv,
+                                            0, which);
  } else if (which == 2) {
-   joint_motion_search(cpi, x, bsize, frame_mv,
-#if CONFIG_COMPOUND_SINGLEREF
-                       has_second_ref(mbmi) ? NULL : frame_comp_mv,
-#endif  // CONFIG_COMPOUND_SINGLEREF
-                       mi_row, mi_col, NULL, mask, mask_stride, rate_mv, 0);
- }
- tmp_mv[0].as_int = frame_mv[rf[0]].as_int;
-#if CONFIG_COMPOUND_SINGLEREF
- if (!has_second_ref(mbmi))
-   tmp_mv[1].as_int = frame_comp_mv[rf[0]].as_int;
- else  // comp ref
-#endif  // CONFIG_COMPOUND_SINGLEREF
-   tmp_mv[1].as_int = frame_mv[rf[1]].as_int;
-}
-#endif  // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+   joint_motion_search(cpi, x, bsize, tmp_mv, mi_row, mi_col, NULL, mask,
+                       mask_stride, rate_mv, 0);
+ }
+}

+#define USE_DISCOUNT_NEWMV_TEST 0
+#if USE_DISCOUNT_NEWMV_TEST
// In some situations we want to discount the apparent cost of a new motion
// vector. Where there is a subtle motion field and especially where there is
// low spatial complexity then it can be hard to cover the cost of a new motion
@@ -7887,17 +6926,42 @@ static void do_masked_motion_search_indexed(
// However, once established that vector may be usable through the nearest and
// near mv modes to reduce distortion in subsequent blocks and also improve
// visual quality.
-static int discount_newmv_test(const AV1_COMP *const cpi, int this_mode,
-                               int_mv this_mv,
-                               int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME],
-                               int ref_frame) {
- return (!cpi->rc.is_src_frame_alt_ref && (this_mode == NEWMV) &&
-         (this_mv.as_int != 0) &&
-         ((mode_mv[NEARESTMV][ref_frame].as_int == 0) ||
-          (mode_mv[NEARESTMV][ref_frame].as_int == INVALID_MV)) &&
-         ((mode_mv[NEARMV][ref_frame].as_int == 0) ||
-          (mode_mv[NEARMV][ref_frame].as_int == INVALID_MV)));
+#define NEW_MV_DISCOUNT_FACTOR 8
+static INLINE void get_this_mv(int_mv *this_mv, int this_mode, int ref_idx,
+                               int ref_mv_idx,
+                               const MV_REFERENCE_FRAME *ref_frame,
+                               const MB_MODE_INFO_EXT *mbmi_ext);
+static int discount_newmv_test(const AV1_COMP *const cpi, const MACROBLOCK *x,
+                               int this_mode, int_mv this_mv) {
+ if (this_mode == NEWMV && this_mv.as_int != 0 &&
+     !cpi->rc.is_src_frame_alt_ref) {
+   // Only discount new_mv when nearst_mv and all near_mv are zero, and the
+   // new_mv is not equal to global_mv
+   const AV1_COMMON *const cm = &cpi->common;
+   const MACROBLOCKD *const xd = &x->e_mbd;
+   const MB_MODE_INFO *const mbmi = xd->mi[0];
+   const MV_REFERENCE_FRAME tmp_ref_frames[2] = { mbmi->ref_frame[0],
+                                                  NONE_FRAME };
+   const uint8_t ref_frame_type = av1_ref_frame_type(tmp_ref_frames);
+   int_mv nearest_mv;
+   get_this_mv(&nearest_mv, NEARESTMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
+   int ret = nearest_mv.as_int == 0;
+   for (int ref_mv_idx = 0;
+        ref_mv_idx < x->mbmi_ext->ref_mv_count[ref_frame_type];
+        ++ref_mv_idx) {
+     int_mv near_mv;
+     get_this_mv(&near_mv, NEARMV, 0, ref_mv_idx, tmp_ref_frames, x->mbmi_ext);
+     ret &= near_mv.as_int == 0;
+   }
+   if (cm->global_motion[tmp_ref_frames[0]].wmtype <= TRANSLATION) {
+     int_mv global_mv;
+     get_this_mv(&global_mv, GLOBALMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
+     ret &= global_mv.as_int != this_mv.as_int;
+   }
+   return ret;
+ }
+ return 0;
}
+#endif

#define LEFT_TOP_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3)
#define RIGHT_BOTTOM_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3)
@@ -7910,25 +6974,42 @@ static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
           xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
}

-#if CONFIG_WEDGE
static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
                               const BLOCK_SIZE bsize, const uint8_t *pred0,
                               int stride0, const uint8_t *pred1,
                               int stride1) {
+ static const BLOCK_SIZE split_qtr[BLOCK_SIZES_ALL] = {
+   // 4X4
+   BLOCK_INVALID,
+   // 4X8, 8X4, 8X8
+   BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4,
+   // 8X16, 16X8, 16X16
+   BLOCK_4X8, BLOCK_8X4, BLOCK_8X8,
+   // 16X32, 32X16, 32X32
+   BLOCK_8X16, BLOCK_16X8, BLOCK_16X16,
+   // 32X64, 64X32, 64X64
+   BLOCK_16X32, BLOCK_32X16, BLOCK_32X32,
+   // 64x128, 128x64, 128x128
+   BLOCK_32X64, BLOCK_64X32, BLOCK_64X64,
+   // 4X16, 16X4, 8X32
+   BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16,
+   // 32X8, 16X64, 64X16
+   BLOCK_16X4, BLOCK_8X32, BLOCK_32X8
+ };
  const struct macroblock_plane *const p = &x->plane[0];
  const uint8_t *src = p->src.buf;
  int src_stride = p->src.stride;
- const int f_index = bsize - BLOCK_8X8;
  const int bw = block_size_wide[bsize];
  const int bh = block_size_high[bsize];
  uint32_t esq[2][4];
  int64_t tl, br;

-#if CONFIG_HIGHBITDEPTH
+ const BLOCK_SIZE f_index = split_qtr[bsize];
+ assert(f_index != BLOCK_INVALID);
+
  if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
    pred0 = CONVERT_TO_BYTEPTR(pred0);
    pred1 = CONVERT_TO_BYTEPTR(pred1);
  }
-#endif  // CONFIG_HIGHBITDEPTH

  cpi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]);
  cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, pred0 + bw / 2, stride0,
@@ -7947,100 +7028,14 @@ static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
                         pred1 + bh / 2 * stride1 + bw / 2, stride0,
                         &esq[1][3]);

- tl = (int64_t)(esq[0][0] + esq[0][1] + esq[0][2]) -
-      (int64_t)(esq[1][0] + esq[1][1] + esq[1][2]);
- br = (int64_t)(esq[1][3] + esq[1][1] + esq[1][2]) -
-      (int64_t)(esq[0][3] + esq[0][1] + esq[0][2]);
+ tl = ((int64_t)esq[0][0] + esq[0][1] + esq[0][2]) -
+      ((int64_t)esq[1][0] + esq[1][1] + esq[1][2]);
+ br = ((int64_t)esq[1][3] + esq[1][1] + esq[1][2]) -
+      ((int64_t)esq[0][3] + esq[0][1] + esq[0][2]);
  return (tl + br > 0);
}
-#endif  // CONFIG_WEDGE
-
-#if !CONFIG_DUAL_FILTER
-static InterpFilter predict_interp_filter(
-   const AV1_COMP *cpi, const MACROBLOCK *x, const BLOCK_SIZE bsize,
-   const int mi_row, const int mi_col,
-   InterpFilter (*single_filter)[TOTAL_REFS_PER_FRAME]) {
- InterpFilter best_filter = SWITCHABLE;
- const AV1_COMMON *cm = &cpi->common;
- const MACROBLOCKD *xd = &x->e_mbd;
- int bsl = mi_width_log2_lookup[bsize];
- int pred_filter_search =
-     cpi->sf.cb_pred_filter_search
-         ? (((mi_row + mi_col) >> bsl) +
-            get_chessboard_index(cm->current_video_frame)) &
-               0x1
-         : 0;
- MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
- const int is_comp_pred = has_second_ref(mbmi);
- const int this_mode = mbmi->mode;
- int refs[2] = { mbmi->ref_frame[0],
-                 (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
- if (pred_filter_search) {
-   InterpFilter af = SWITCHABLE, lf = SWITCHABLE;
-   if (xd->up_available)
-     af = av1_extract_interp_filter(
-         xd->mi[-xd->mi_stride]->mbmi.interp_filters, 0);
-   if (xd->left_available)
-     lf = av1_extract_interp_filter(xd->mi[-1]->mbmi.interp_filters, 0);
-
-   if ((this_mode != NEWMV && this_mode != NEW_NEWMV) || (af == lf))
-     best_filter = af;
- }
- if (is_comp_pred) {
-   if (cpi->sf.adaptive_mode_search) {
-     switch (this_mode) {
-       case NEAREST_NEARESTMV:
-         if (single_filter[NEARESTMV][refs[0]] ==
-             single_filter[NEARESTMV][refs[1]])
-           best_filter = single_filter[NEARESTMV][refs[0]];
-         break;
-       case NEAR_NEARMV:
-         if (single_filter[NEARMV][refs[0]] == single_filter[NEARMV][refs[1]])
-           best_filter = single_filter[NEARMV][refs[0]];
-         break;
-       case ZERO_ZEROMV:
-         if (single_filter[ZEROMV][refs[0]] == single_filter[ZEROMV][refs[1]])
-           best_filter = single_filter[ZEROMV][refs[0]];
-         break;
-       case NEW_NEWMV:
-         if (single_filter[NEWMV][refs[0]] == single_filter[NEWMV][refs[1]])
-           best_filter = single_filter[NEWMV][refs[0]];
-         break;
-       case NEAREST_NEWMV:
-         if (single_filter[NEARESTMV][refs[0]] ==
-             single_filter[NEWMV][refs[1]])
-           best_filter = single_filter[NEARESTMV][refs[0]];
-         break;
-       case NEAR_NEWMV:
-         if (single_filter[NEARMV][refs[0]] == single_filter[NEWMV][refs[1]])
-           best_filter = single_filter[NEARMV][refs[0]];
-         break;
-       case NEW_NEARESTMV:
-         if (single_filter[NEWMV][refs[0]] ==
-             single_filter[NEARESTMV][refs[1]])
-           best_filter = single_filter[NEWMV][refs[0]];
-         break;
-       case NEW_NEARMV:
-         if (single_filter[NEWMV][refs[0]] == single_filter[NEARMV][refs[1]])
-           best_filter = single_filter[NEWMV][refs[0]];
-         break;
-       default:
-         if (single_filter[this_mode][refs[0]] ==
-             single_filter[this_mode][refs[1]])
-           best_filter = single_filter[this_mode][refs[0]];
-         break;
-     }
-   }
- }
- if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
-   best_filter = EIGHTTAP_REGULAR;
- }
- return best_filter;
-}
-#endif  // !CONFIG_DUAL_FILTER

// Choose the best wedge index and sign
-#if CONFIG_WEDGE
static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
                          const BLOCK_SIZE bsize, const uint8_t *const p0,
                          const uint8_t *const p1, int *const best_wedge_sign,
@@ -8058,12 +7053,8 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
  int wedge_types = (1 << get_wedge_bits_lookup(bsize));
  const uint8_t *mask;
  uint64_t sse;
-#if CONFIG_HIGHBITDEPTH
  const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
  const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
-#else
- const int bd_round = 0;
-#endif  // CONFIG_HIGHBITDEPTH

  DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
  DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
@@ -8072,7 +7063,6 @@

  int64_t sign_limit;

-#if CONFIG_HIGHBITDEPTH
  if (hbd) {
    aom_highbd_subtract_block(bh, bw, r0, bw, src->buf, src->stride,
                              CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
@@ -8080,9 +7070,7 @@
                              CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
    aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw,
                              CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
- } else  // NOLINT
-#endif  // CONFIG_HIGHBITDEPTH
- {
+ } else {
    aom_subtract_block(bh, bw, r0, bw, src->buf, src->stride, p0, bw);
    aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
    aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
@@ -8114,6 +7102,7 @@
    sse = ROUND_POWER_OF_TWO(sse, bd_round);

    model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
+   rate += x->wedge_idx_cost[bsize][wedge_index];
    rd = RDCOST(x->rdmult, rate, dist);

    if (rd < best_rd) {
@@ -8123,7 +7112,8 @@
    }
  }

- return best_rd;
+ return best_rd -
+        RDCOST(x->rdmult, x->wedge_idx_cost[bsize][*best_wedge_index], 0);
}

// Choose the best wedge index the specified sign
@@ -8143,25 +7133,18 @@ static int64_t pick_wedge_fixed_sign(
  int wedge_types = (1 << get_wedge_bits_lookup(bsize));
  const uint8_t *mask;
  uint64_t sse;
-#if CONFIG_HIGHBITDEPTH
  const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
  const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
-#else
- const int bd_round = 0;
-#endif  // CONFIG_HIGHBITDEPTH

  DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
  DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]);

-#if CONFIG_HIGHBITDEPTH
  if (hbd) {
    aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride,
                              CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
    aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw,
                              CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
- } else  // NOLINT
-#endif  // CONFIG_HIGHBITDEPTH
- {
+ } else {
    aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
    aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
  }
@@ -8175,6 +7158,7 @@
    sse = ROUND_POWER_OF_TWO(sse, bd_round);

    model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
+   rate += x->wedge_idx_cost[bsize][wedge_index];
    rd = RDCOST(x->rdmult, rate, dist);

    if (rd < best_rd) {
@@ -8183,7 +7167,8 @@
      *best_wedge_index = wedge_index;
    }
  }

- return best_rd;
+ return best_rd -
+        RDCOST(x->rdmult, x->wedge_idx_cost[bsize][*best_wedge_index], 0);
}

static int64_t pick_interinter_wedge(const AV1_COMP *const cpi,
@@ -8192,7 +7177,7 @@ static int64_t pick_interinter_wedge(const AV1_COMP *const cpi,
                                     const uint8_t *const p0,
                                     const uint8_t *const p1) {
  MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
  const int bw = block_size_wide[bsize];

  int64_t rd;
@@ -8200,7 +7185,7 @@ static int64_t pick_interinter_wedge(const AV1_COMP *const cpi,
  int wedge_sign = 0;

  assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
- assert(cpi->common.allow_masked_compound);
+ assert(cpi->common.seq_params.enable_masked_compound);

  if (cpi->sf.fast_wedge_sign_estimate) {
    wedge_sign = estimate_wedge_sign(cpi, x,
bsize, p0, bw, p1, bw); @@ -8209,19 +7194,17 @@ static int64_t pick_interinter_wedge(const AV1_COMP *const cpi, rd = pick_wedge(cpi, x, bsize, p0, p1, &wedge_sign, &wedge_index); } - mbmi->wedge_sign = wedge_sign; - mbmi->wedge_index = wedge_index; + mbmi->interinter_comp.wedge_sign = wedge_sign; + mbmi->interinter_comp.wedge_index = wedge_index; return rd; } -#endif // CONFIG_WEDGE -#if CONFIG_COMPOUND_SEGMENT static int64_t pick_interinter_seg(const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize, const uint8_t *const p0, const uint8_t *const p1) { MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; const struct buf_2d *const src = &x->plane[0].src; const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; @@ -8230,20 +7213,15 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi, uint64_t sse; int64_t dist; int64_t rd0; - SEG_MASK_TYPE cur_mask_type; + DIFFWTD_MASK_TYPE cur_mask_type; int64_t best_rd = INT64_MAX; - SEG_MASK_TYPE best_mask_type = 0; -#if CONFIG_HIGHBITDEPTH + DIFFWTD_MASK_TYPE best_mask_type = 0; const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; const int bd_round = hbd ? (xd->bd - 8) * 2 : 0; -#else - const int bd_round = 0; -#endif // CONFIG_HIGHBITDEPTH DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]); DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]); DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]); -#if CONFIG_HIGHBITDEPTH if (hbd) { aom_highbd_subtract_block(bh, bw, r0, bw, src->buf, src->stride, CONVERT_TO_BYTEPTR(p0), bw, xd->bd); @@ -8251,26 +7229,22 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi, CONVERT_TO_BYTEPTR(p1), bw, xd->bd); aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw, CONVERT_TO_BYTEPTR(p0), bw, xd->bd); - } else // NOLINT -#endif // CONFIG_HIGHBITDEPTH - { + } else { aom_subtract_block(bh, bw, r0, bw, src->buf, src->stride, p0, bw); aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw); aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw); } // try each mask type and its inverse - for (cur_mask_type = 0; cur_mask_type < SEG_MASK_TYPES; cur_mask_type++) { -// build mask and inverse -#if CONFIG_HIGHBITDEPTH + for (cur_mask_type = 0; cur_mask_type < DIFFWTD_MASK_TYPES; cur_mask_type++) { + // build mask and inverse if (hbd) - build_compound_seg_mask_highbd( + av1_build_compound_diffwtd_mask_highbd( xd->seg_mask, cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw, - CONVERT_TO_BYTEPTR(p1), bw, bsize, bh, bw, xd->bd); + CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd); else -#endif // CONFIG_HIGHBITDEPTH - build_compound_seg_mask(xd->seg_mask, cur_mask_type, p0, bw, p1, bw, - bsize, bh, bw); + av1_build_compound_diffwtd_mask(xd->seg_mask, cur_mask_type, p0, bw, p1, + bw, bh, bw); // compute rd for mask sse = av1_wedge_sse_from_residuals(r1, d10, xd->seg_mask, N); @@ -8286,35 +7260,31 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi, } // make final mask - mbmi->mask_type = best_mask_type; -#if CONFIG_HIGHBITDEPTH + mbmi->interinter_comp.mask_type = best_mask_type; if (hbd) - build_compound_seg_mask_highbd( - xd->seg_mask, mbmi->mask_type, CONVERT_TO_BYTEPTR(p0), bw, - CONVERT_TO_BYTEPTR(p1), bw, bsize, bh, bw, xd->bd); + av1_build_compound_diffwtd_mask_highbd( + xd->seg_mask, mbmi->interinter_comp.mask_type, CONVERT_TO_BYTEPTR(p0), + bw, CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd); else -#endif // CONFIG_HIGHBITDEPTH - build_compound_seg_mask(xd->seg_mask, mbmi->mask_type, p0, 
bw, p1, bw, - bsize, bh, bw); + av1_build_compound_diffwtd_mask( + xd->seg_mask, mbmi->interinter_comp.mask_type, p0, bw, p1, bw, bh, bw); return best_rd; } -#endif // CONFIG_COMPOUND_SEGMENT -#if CONFIG_WEDGE && CONFIG_INTERINTRA static int64_t pick_interintra_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x, const BLOCK_SIZE bsize, const uint8_t *const p0, const uint8_t *const p1) { const MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; int64_t rd; int wedge_index = -1; assert(is_interintra_wedge_used(bsize)); - assert(cpi->common.allow_interintra_compound); + assert(cpi->common.seq_params.enable_interintra_compound); rd = pick_wedge_fixed_sign(cpi, x, bsize, p0, p1, 0, &wedge_index); @@ -8322,22 +7292,15 @@ static int64_t pick_interintra_wedge(const AV1_COMP *const cpi, mbmi->interintra_wedge_index = wedge_index; return rd; } -#endif // CONFIG_WEDGE && CONFIG_INTERINTRA -#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE static int64_t pick_interinter_mask(const AV1_COMP *const cpi, MACROBLOCK *x, const BLOCK_SIZE bsize, const uint8_t *const p0, const uint8_t *const p1) { - const COMPOUND_TYPE compound_type = - x->e_mbd.mi[0]->mbmi.interinter_compound_type; + const COMPOUND_TYPE compound_type = x->e_mbd.mi[0]->interinter_comp.type; switch (compound_type) { -#if CONFIG_WEDGE case COMPOUND_WEDGE: return pick_interinter_wedge(cpi, x, bsize, p0, p1); -#endif // CONFIG_WEDGE -#if CONFIG_COMPOUND_SEGMENT - case COMPOUND_SEG: return pick_interinter_seg(cpi, x, bsize, p0, p1); -#endif // CONFIG_COMPOUND_SEGMENT + case COMPOUND_DIFFWTD: return pick_interinter_seg(cpi, x, bsize, p0, p1); default: assert(0); return 0; } } @@ -8346,46 +7309,23 @@ static int interinter_compound_motion_search( const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, const BLOCK_SIZE bsize, const int this_mode, int mi_row, int mi_col) { MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; int_mv tmp_mv[2]; int tmp_rate_mv = 0; - const INTERINTER_COMPOUND_DATA compound_data = { -#if CONFIG_WEDGE - mbmi->wedge_index, - mbmi->wedge_sign, -#endif // CONFIG_WEDGE -#if CONFIG_COMPOUND_SEGMENT - mbmi->mask_type, - xd->seg_mask, -#endif // CONFIG_COMPOUND_SEGMENT - mbmi->interinter_compound_type - }; -#if CONFIG_COMPOUND_SINGLEREF - // NOTE: Mode is needed to identify the compound mode prediction, regardless - // of comp refs or single ref. 
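/*
 * A hedged aside on pick_interinter_seg() above: with the residuals
 * r1 = src - p1 and d10 = p1 - p0 precomputed once, the SSE of every
 * candidate mask comes from av1_wedge_sse_from_residuals() alone, so each
 * of the DIFFWTD_MASK_TYPES candidates costs one mask build plus one model
 * evaluation instead of a full re-prediction. A minimal sketch of that
 * loop, with build_mask() and model_rd() as hypothetical stand-ins for
 * av1_build_compound_diffwtd_mask() and the model_rd_from_sse() pipeline:
 *
 *   int64_t best_rd = INT64_MAX;
 *   DIFFWTD_MASK_TYPE best_type = 0;
 *   for (DIFFWTD_MASK_TYPE t = 0; t < DIFFWTD_MASK_TYPES; ++t) {
 *     build_mask(xd->seg_mask, t, p0, p1);  // mask and its inverse
 *     const uint64_t sse = av1_wedge_sse_from_residuals(r1, d10,
 *                                                       xd->seg_mask, N);
 *     const int64_t rd = model_rd(sse);     // modelled rate/distortion only
 *     if (rd < best_rd) { best_rd = rd; best_type = t; }
 *   }
 */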
- mbmi->mode = this_mode; -#endif // CONFIG_COMPOUND_SINGLEREF - - if (this_mode == NEW_NEWMV -#if CONFIG_COMPOUND_SINGLEREF - || this_mode == SR_NEW_NEWMV -#endif // CONFIG_COMPOUND_SINGLEREF - ) { - do_masked_motion_search_indexed(cpi, x, cur_mv, &compound_data, bsize, + mbmi->interinter_comp.seg_mask = xd->seg_mask; + const INTERINTER_COMPOUND_DATA *compound_data = &mbmi->interinter_comp; + + if (this_mode == NEW_NEWMV) { + do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize, mi_row, mi_col, tmp_mv, &tmp_rate_mv, 2); mbmi->mv[0].as_int = tmp_mv[0].as_int; mbmi->mv[1].as_int = tmp_mv[1].as_int; } else if (this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV) { - do_masked_motion_search_indexed(cpi, x, cur_mv, &compound_data, bsize, + do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize, mi_row, mi_col, tmp_mv, &tmp_rate_mv, 0); mbmi->mv[0].as_int = tmp_mv[0].as_int; - } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV -#if CONFIG_COMPOUND_SINGLEREF - // || this_mode == SR_NEAREST_NEWMV - || this_mode == SR_NEAR_NEWMV || this_mode == SR_ZERO_NEWMV -#endif // CONFIG_COMPOUND_SINGLEREF - ) { - do_masked_motion_search_indexed(cpi, x, cur_mv, &compound_data, bsize, + } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) { + do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize, mi_row, mi_col, tmp_mv, &tmp_rate_mv, 1); mbmi->mv[1].as_int = tmp_mv[1].as_int; } @@ -8394,22 +7334,23 @@ static int interinter_compound_motion_search( static int64_t build_and_cost_compound_type( const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, - const BLOCK_SIZE bsize, const int this_mode, int rs2, int rate_mv, + const BLOCK_SIZE bsize, const int this_mode, int *rs2, int rate_mv, BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, uint8_t **preds1, int *strides, int mi_row, int mi_col) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; int rate_sum; int64_t dist_sum; int64_t best_rd_cur = INT64_MAX; int64_t rd = INT64_MAX; int tmp_skip_txfm_sb; int64_t tmp_skip_sse_sb; - const COMPOUND_TYPE compound_type = mbmi->interinter_compound_type; + const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type; best_rd_cur = pick_interinter_mask(cpi, x, bsize, *preds0, *preds1); - best_rd_cur += RDCOST(x->rdmult, rs2 + rate_mv, 0); + *rs2 += get_interinter_compound_mask_rate(x, mbmi); + best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0); if (have_newmv_in_inter_mode(this_mode) && use_masked_motion_search(compound_type)) { @@ -8417,80 +7358,74 @@ static int64_t build_and_cost_compound_type( this_mode, mi_row, mi_col); av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize); model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb); - rd = RDCOST(x->rdmult, rs2 + *out_rate_mv + rate_sum, dist_sum); + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); + rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum); if (rd >= best_rd_cur) { mbmi->mv[0].as_int = cur_mv[0].as_int; mbmi->mv[1].as_int = cur_mv[1].as_int; *out_rate_mv = rate_mv; - av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, -#if CONFIG_SUPERTX - 0, 0, -#endif // CONFIG_SUPERTX - preds0, strides, preds1, - strides); + av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides, + preds1, strides); } av1_subtract_plane(x, bsize, 0); rd = estimate_yrd_for_sb(cpi, bsize, x, 
&rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, rs2 + *out_rate_mv + rate_sum, dist_sum); + rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum); best_rd_cur = rd; } else { - av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, -#if CONFIG_SUPERTX - 0, 0, -#endif // CONFIG_SUPERTX - preds0, strides, preds1, strides); + av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides, + preds1, strides); av1_subtract_plane(x, bsize, 0); rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, rs2 + rate_mv + rate_sum, dist_sum); + rd = RDCOST(x->rdmult, *rs2 + rate_mv + rate_sum, dist_sum); best_rd_cur = rd; } return best_rd_cur; } -#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE typedef struct { -#if CONFIG_MOTION_VAR - // Inter prediction buffers and respective strides + // OBMC secondary prediction buffers and respective strides uint8_t *above_pred_buf[MAX_MB_PLANE]; int above_pred_stride[MAX_MB_PLANE]; uint8_t *left_pred_buf[MAX_MB_PLANE]; int left_pred_stride[MAX_MB_PLANE]; -#endif // CONFIG_MOTION_VAR int_mv *single_newmv; // Pointer to array of motion vectors to use for each ref and their rates // Should point to first of 2 arrays in 2D array int *single_newmv_rate; + int *single_newmv_valid; // Pointer to array of predicted rate-distortion // Should point to first of 2 arrays in 2D array - int64_t (*modelled_rd)[TOTAL_REFS_PER_FRAME]; - InterpFilter single_filter[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME]; + int64_t (*modelled_rd)[REF_FRAMES]; + InterpFilter single_filter[MB_MODE_COUNT][REF_FRAMES]; + int ref_frame_cost; + int single_comp_cost; } HandleInterModeArgs; +static INLINE int clamp_and_check_mv(int_mv *out_mv, int_mv in_mv, + const AV1_COMMON *cm, + const MACROBLOCK *x) { + const MACROBLOCKD *const xd = &x->e_mbd; + *out_mv = in_mv; + lower_mv_precision(&out_mv->as_mv, cm->allow_high_precision_mv, + cm->cur_frame_force_integer_mv); + clamp_mv2(&out_mv->as_mv, xd); + return !mv_check_bounds(&x->mv_limits, &out_mv->as_mv); +} + static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, - const BLOCK_SIZE bsize, - int_mv (*const mode_mv)[TOTAL_REFS_PER_FRAME], -#if CONFIG_COMPOUND_SINGLEREF - int_mv (*const mode_comp_mv)[TOTAL_REFS_PER_FRAME], -#endif // CONFIG_COMPOUND_SINGLEREF + const BLOCK_SIZE bsize, int_mv *cur_mv, const int mi_row, const int mi_col, - int *const rate_mv, int_mv *const single_newmv, + int *const rate_mv, HandleInterModeArgs *const args) { const MACROBLOCKD *const xd = &x->e_mbd; - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const MB_MODE_INFO *const mbmi = xd->mi[0]; const int is_comp_pred = has_second_ref(mbmi); const PREDICTION_MODE this_mode = mbmi->mode; - const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME); - int_mv *const frame_mv = mode_mv[this_mode]; -#if CONFIG_COMPOUND_SINGLEREF - int_mv *const frame_comp_mv = mode_comp_mv[this_mode]; -#endif // CONFIG_COMPOUND_SINGLEREF const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] < 0 ? 
0 : mbmi->ref_frame[1] }; int i; @@ -8498,392 +7433,338 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, (void)args; if (is_comp_pred) { - for (i = 0; i < 2; ++i) { - single_newmv[refs[i]].as_int = args->single_newmv[refs[i]].as_int; - } - if (this_mode == NEW_NEWMV) { - frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; - frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; + cur_mv[0].as_int = args->single_newmv[refs[0]].as_int; + cur_mv[1].as_int = args->single_newmv[refs[1]].as_int; if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { - joint_motion_search(cpi, x, bsize, frame_mv, -#if CONFIG_COMPOUND_SINGLEREF - NULL, // int_mv *frame_comp_mv -#endif // CONFIG_COMPOUND_SINGLEREF - mi_row, mi_col, NULL, NULL, 0, rate_mv, 0); + joint_motion_search(cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, NULL, + 0, rate_mv, 0); } else { *rate_mv = 0; for (i = 0; i < 2; ++i) { - av1_set_mvcost(x, refs[i], i, mbmi->ref_mv_idx); - *rate_mv += av1_mv_bit_cost( - &frame_mv[refs[i]].as_mv, &mbmi_ext->ref_mvs[refs[i]][0].as_mv, - x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + const int_mv ref_mv = av1_get_ref_mv(x, i); + av1_set_mvcost(x, i, mbmi->ref_mv_idx); + *rate_mv += + av1_mv_bit_cost(&cur_mv[i].as_mv, &ref_mv.as_mv, x->nmvjointcost, + x->mvcost, MV_COST_WEIGHT); } } } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) { - frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; + cur_mv[1].as_int = args->single_newmv[refs[1]].as_int; if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { - frame_mv[refs[0]].as_int = - mode_mv[compound_ref0_mode(this_mode)][refs[0]].as_int; - compound_single_motion_search_interinter(cpi, x, bsize, frame_mv, -#if CONFIG_COMPOUND_SINGLEREF - NULL, -#endif // CONFIG_COMPOUND_SINGLEREF - mi_row, mi_col, NULL, 0, - rate_mv, 0, 1); + compound_single_motion_search_interinter( + cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, 0, rate_mv, 0, 1); } else { - av1_set_mvcost(x, refs[1], 1, mbmi->ref_mv_idx); - *rate_mv = av1_mv_bit_cost(&frame_mv[refs[1]].as_mv, - &mbmi_ext->ref_mvs[refs[1]][0].as_mv, + av1_set_mvcost(x, 1, + mbmi->ref_mv_idx + (this_mode == NEAR_NEWMV ? 1 : 0)); + const int_mv ref_mv = av1_get_ref_mv(x, 1); + *rate_mv = av1_mv_bit_cost(&cur_mv[1].as_mv, &ref_mv.as_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); } } else { assert(this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV); - frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; + cur_mv[0].as_int = args->single_newmv[refs[0]].as_int; if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { - frame_mv[refs[1]].as_int = - mode_mv[compound_ref1_mode(this_mode)][refs[1]].as_int; - compound_single_motion_search_interinter(cpi, x, bsize, frame_mv, -#if CONFIG_COMPOUND_SINGLEREF - NULL, -#endif // CONFIG_COMPOUND_SINGLEREF - mi_row, mi_col, NULL, 0, - rate_mv, 0, 0); + compound_single_motion_search_interinter( + cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, 0, rate_mv, 0, 0); } else { - av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx); - *rate_mv = av1_mv_bit_cost(&frame_mv[refs[0]].as_mv, - &mbmi_ext->ref_mvs[refs[0]][0].as_mv, + const int_mv ref_mv = av1_get_ref_mv(x, 0); + av1_set_mvcost(x, 0, + mbmi->ref_mv_idx + (this_mode == NEW_NEARMV ? 
1 : 0)); + *rate_mv = av1_mv_bit_cost(&cur_mv[0].as_mv, &ref_mv.as_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); } } -#if CONFIG_COMPOUND_SINGLEREF - } else if (is_inter_singleref_comp_mode(this_mode)) { - // Single ref comp mode - const int mode0 = compound_ref0_mode(this_mode); - - single_newmv[refs[0]].as_int = args->single_newmv[refs[0]].as_int; - frame_mv[refs[0]].as_int = (mode0 == NEWMV) - ? single_newmv[refs[0]].as_int - : mode_mv[mode0][refs[0]].as_int; - assert(compound_ref1_mode(this_mode) == NEWMV); - frame_comp_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; - - if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { - if (this_mode == SR_NEW_NEWMV) { - joint_motion_search(cpi, x, bsize, frame_mv, frame_comp_mv, mi_row, - mi_col, NULL, NULL, 0, rate_mv, 0); - } else { - assert( // this_mode == SR_NEAREST_NEWMV || - this_mode == SR_NEAR_NEWMV || this_mode == SR_ZERO_NEWMV); - compound_single_motion_search_interinter(cpi, x, bsize, frame_mv, - frame_comp_mv, mi_row, mi_col, - NULL, 0, rate_mv, 0, 1); - } - } else { - *rate_mv = 0; - av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx); - if (mode0 == NEWMV) - *rate_mv += av1_mv_bit_cost(&frame_mv[refs[0]].as_mv, - &mbmi_ext->ref_mvs[refs[0]][0].as_mv, - x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); - *rate_mv += av1_mv_bit_cost(&frame_comp_mv[refs[0]].as_mv, - &mbmi_ext->ref_mvs[refs[0]][0].as_mv, - x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); - } -#endif // CONFIG_COMPOUND_SINGLEREF } else { - if (is_comp_interintra_pred) { - x->best_mv = args->single_newmv[refs[0]]; - *rate_mv = args->single_newmv_rate[refs[0]]; - } else { - single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, rate_mv); - args->single_newmv[refs[0]] = x->best_mv; - args->single_newmv_rate[refs[0]] = *rate_mv; - } - + single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, rate_mv); if (x->best_mv.as_int == INVALID_MV) return INT64_MAX; - frame_mv[refs[0]] = x->best_mv; - xd->mi[0]->bmi[0].as_mv[0] = x->best_mv; + args->single_newmv[refs[0]] = x->best_mv; + args->single_newmv_rate[refs[0]] = *rate_mv; + args->single_newmv_valid[refs[0]] = 1; + + cur_mv[0].as_int = x->best_mv.as_int; +#if USE_DISCOUNT_NEWMV_TEST // Estimate the rate implications of a new mv but discount this // under certain circumstances where we want to help initiate a weak // motion field, where the distortion gain for a single block may not // be enough to overcome the cost of a new mv. 
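/*
 * Concretely, the discount applied below caps the signalled MV rate at
 * roughly one eighth of its true cost (NEW_MV_DISCOUNT_FACTOR is defined
 * as 8 above), floored at 1 so a NEWMV candidate never becomes rate-free:
 *
 *   if (discount_newmv_test(cpi, x, this_mode, x->best_mv))
 *     *rate_mv = AOMMAX(*rate_mv / NEW_MV_DISCOUNT_FACTOR, 1);
 *
 * For example, a candidate whose MV costs 100 rate units is charged only
 * 12, which can tip the RD comparison toward establishing a motion vector
 * that NEARESTMV/NEARMV in subsequent blocks may then reuse almost for
 * free.
 */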
- if (discount_newmv_test(cpi, this_mode, x->best_mv, mode_mv, refs[0])) { + if (discount_newmv_test(cpi, x, this_mode, x->best_mv)) { *rate_mv = AOMMAX(*rate_mv / NEW_MV_DISCOUNT_FACTOR, 1); } +#endif } return 0; } -int64_t interpolation_filter_search( +static INLINE void swap_dst_buf(MACROBLOCKD *xd, const BUFFER_SET *dst_bufs[2], + int num_planes) { + const BUFFER_SET *buf0 = dst_bufs[0]; + dst_bufs[0] = dst_bufs[1]; + dst_bufs[1] = buf0; + restore_dst_buf(xd, *dst_bufs[0], num_planes); +} + +// calculate the rdcost of given interpolation_filter +static INLINE int64_t interpolation_filter_rd( + MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col, BUFFER_SET *const orig_dst, int64_t *const rd, + int *const switchable_rate, int *const skip_txfm_sb, + int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2], int filter_idx) { + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + int tmp_rate, tmp_skip_sb = 0; + int64_t tmp_dist, tmp_skip_sse = INT64_MAX; + + const InterpFilters last_best = mbmi->interp_filters; + mbmi->interp_filters = filter_sets[filter_idx]; + const int tmp_rs = av1_get_switchable_rate(cm, x, xd); + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); + model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, &tmp_dist, + &tmp_skip_sb, &tmp_skip_sse, NULL, NULL, NULL); + int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist); + if (tmp_rd < *rd) { + *rd = tmp_rd; + *switchable_rate = tmp_rs; + *skip_txfm_sb = tmp_skip_sb; + *skip_sse_sb = tmp_skip_sse; + swap_dst_buf(xd, dst_bufs, num_planes); + return 1; + } + mbmi->interp_filters = last_best; + return 0; +} + +// check if there is saved result match with this search +static INLINE int is_interp_filter_match(const INTERPOLATION_FILTER_STATS *st, + MB_MODE_INFO *const mi) { + for (int i = 0; i < 2; ++i) { + if ((st->ref_frames[i] != mi->ref_frame[i]) || + (st->mv[i].as_int != mi->mv[i].as_int)) { + return 0; + } + } + return 1; +} + +static INLINE int find_interp_filter_in_stats(MACROBLOCK *x, + MB_MODE_INFO *const mbmi) { + const int comp_idx = mbmi->compound_idx; + const int offset = x->interp_filter_stats_idx[comp_idx]; + for (int j = 0; j < offset; ++j) { + const INTERPOLATION_FILTER_STATS *st = &x->interp_filter_stats[comp_idx][j]; + if (is_interp_filter_match(st, mbmi)) { + mbmi->interp_filters = st->filters; + return j; + } + } + return -1; // no match result found +} + +static INLINE void save_interp_filter_search_stat(MACROBLOCK *x, + MB_MODE_INFO *const mbmi) { + const int comp_idx = mbmi->compound_idx; + const int offset = x->interp_filter_stats_idx[comp_idx]; + if (offset < MAX_INTERP_FILTER_STATS) { + INTERPOLATION_FILTER_STATS stat = { + mbmi->interp_filters, + { mbmi->mv[0], mbmi->mv[1] }, + { mbmi->ref_frame[0], mbmi->ref_frame[1] }, + }; + x->interp_filter_stats[comp_idx][offset] = stat; + x->interp_filter_stats_idx[comp_idx]++; + } +} + +static int64_t interpolation_filter_search( MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, const BUFFER_SET *const tmp_dst, - BUFFER_SET *const orig_dst, - InterpFilter (*const single_filter)[TOTAL_REFS_PER_FRAME], + BUFFER_SET *const orig_dst, InterpFilter (*const single_filter)[REF_FRAMES], int64_t *const rd, int *const switchable_rate, int *const skip_txfm_sb, int64_t *const skip_sse_sb) { const AV1_COMMON *cm = &cpi->common; + const int 
num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - int i; - int tmp_rate; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int need_search = + av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd); + int i, tmp_rate; int64_t tmp_dist; (void)single_filter; - - InterpFilter assign_filter = SWITCHABLE; - - if (cm->interp_filter == SWITCHABLE) { -#if !CONFIG_DUAL_FILTER - assign_filter = av1_is_interp_needed(xd) - ? predict_interp_filter(cpi, x, bsize, mi_row, mi_col, - single_filter) - : cm->interp_filter; -#endif // !CONFIG_DUAL_FILTER - } else { - assign_filter = cm->interp_filter; + int match_found = -1; + const InterpFilter assign_filter = cm->interp_filter; + if (cpi->sf.skip_repeat_interpolation_filter_search && need_search) { + match_found = find_interp_filter_in_stats(x, mbmi); + } + if (!need_search || match_found == -1) { + set_default_interp_filters(mbmi, assign_filter); } - - set_default_interp_filters(mbmi, assign_filter); - *switchable_rate = av1_get_switchable_rate(cm, x, xd); av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); - model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, &tmp_dist, - skip_txfm_sb, skip_sse_sb); + model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, &tmp_dist, + skip_txfm_sb, skip_sse_sb, NULL, NULL, NULL); *rd = RDCOST(x->rdmult, *switchable_rate + tmp_rate, tmp_dist); - if (assign_filter == SWITCHABLE) { - // do interp_filter search - if (av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd)) { -#if CONFIG_DUAL_FILTER - const int filter_set_size = DUAL_FILTER_SET_SIZE; -#else - const int filter_set_size = SWITCHABLE_FILTERS; -#endif // CONFIG_DUAL_FILTER - int best_in_temp = 0; - InterpFilters best_filters = mbmi->interp_filters; - restore_dst_buf(xd, *tmp_dst); - // EIGHTTAP_REGULAR mode is calculated beforehand - for (i = 1; i < filter_set_size; ++i) { - int tmp_skip_sb = 0; - int64_t tmp_skip_sse = INT64_MAX; - int tmp_rs; - int64_t tmp_rd; -#if CONFIG_DUAL_FILTER - mbmi->interp_filters = - av1_make_interp_filters(filter_sets[i][0], filter_sets[i][1]); -#else - mbmi->interp_filters = av1_broadcast_interp_filter((InterpFilter)i); -#endif // CONFIG_DUAL_FILTER - tmp_rs = av1_get_switchable_rate(cm, x, xd); - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); - model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, - &tmp_dist, &tmp_skip_sb, &tmp_skip_sse); - tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist); - - if (tmp_rd < *rd) { - *rd = tmp_rd; - *switchable_rate = av1_get_switchable_rate(cm, x, xd); - best_filters = mbmi->interp_filters; - *skip_txfm_sb = tmp_skip_sb; - *skip_sse_sb = tmp_skip_sse; - best_in_temp = !best_in_temp; - if (best_in_temp) { - restore_dst_buf(xd, *orig_dst); - } else { - restore_dst_buf(xd, *tmp_dst); - } - } - } - if (best_in_temp) { - restore_dst_buf(xd, *tmp_dst); - } else { - restore_dst_buf(xd, *orig_dst); + if (assign_filter != SWITCHABLE || match_found != -1) { + return 0; + } + if (!need_search) { + assert(mbmi->interp_filters == + av1_broadcast_interp_filter(EIGHTTAP_REGULAR)); + return 0; + } + // do interp_filter search + const int filter_set_size = DUAL_FILTER_SET_SIZE; + restore_dst_buf(xd, *tmp_dst, num_planes); + const BUFFER_SET *dst_bufs[2] = { tmp_dst, orig_dst }; + if (cpi->sf.use_fast_interpolation_filter_search && + cm->seq_params.enable_dual_filter) { + // default to (R,R): EIGHTTAP_REGULARxEIGHTTAP_REGULAR + int best_dual_mode = 0; + // 
Find best of {R}x{R,Sm,Sh} + // EIGHTTAP_REGULAR mode is calculated beforehand + for (i = 1; i < SWITCHABLE_FILTERS; ++i) { + if (interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd, + switchable_rate, skip_txfm_sb, skip_sse_sb, + dst_bufs, i)) { + best_dual_mode = i; + } + } + // From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes + for (i = best_dual_mode + SWITCHABLE_FILTERS; i < filter_set_size; + i += SWITCHABLE_FILTERS) { + interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd, + switchable_rate, skip_txfm_sb, skip_sse_sb, + dst_bufs, i); + } + } else { + // EIGHTTAP_REGULAR mode is calculated beforehand + for (i = 1; i < filter_set_size; ++i) { + if (cm->seq_params.enable_dual_filter == 0) { + const int16_t filter_y = filter_sets[i] & 0xffff; + const int16_t filter_x = filter_sets[i] >> 16; + if (filter_x != filter_y) continue; } - mbmi->interp_filters = best_filters; - } else { - assert(mbmi->interp_filters == - av1_broadcast_interp_filter(EIGHTTAP_REGULAR)); + interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd, + switchable_rate, skip_txfm_sb, skip_sse_sb, + dst_bufs, i); } } - + swap_dst_buf(xd, dst_bufs, num_planes); + // save search results + if (cpi->sf.skip_repeat_interpolation_filter_search) { + assert(match_found == -1); + save_interp_filter_search_stat(x, mbmi); + } return 0; } -#if CONFIG_DUAL_FILTER -static InterpFilters condition_interp_filters_on_mv( - InterpFilters interp_filters, const MACROBLOCKD *xd) { - InterpFilter filters[2]; - for (int i = 0; i < 2; ++i) - filters[i] = (has_subpel_mv_component(xd->mi[0], xd, i)) - ? av1_extract_interp_filter(interp_filters, i) - : EIGHTTAP_REGULAR; - - return av1_make_interp_filters(filters[0], filters[1]); -} -#endif - // TODO(afergs): Refactor the MBMI references in here - there's four // TODO(afergs): Refactor optional args - add them to a struct or remove -static int64_t motion_mode_rd( - const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, - RD_STATS *rd_stats, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, - int *disable_skip, int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME], int mi_row, - int mi_col, HandleInterModeArgs *const args, const int64_t ref_best_rd, - const int *refs, int rate_mv, -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - // only used when WARPED_MOTION is on? 
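/*
 * A note on the use_fast_interpolation_filter_search path above: with
 * SWITCHABLE_FILTERS == 3 the dual-filter grid has DUAL_FILTER_SET_SIZE
 * == 9 (horizontal, vertical) pairs, but only 5 are ever evaluated:
 * index 0, (REGULAR, REGULAR), beforehand; the two remaining entries
 * along one axis; then the two entries along the other axis starting from
 * that winner. A minimal sketch, where try_filter() is a hypothetical
 * stand-in for interpolation_filter_rd() (which returns 1 on an RD
 * improvement and records the winning filters in mbmi as a side effect):
 *
 *   int best_dual_mode = 0;  // (REGULAR, REGULAR), already evaluated
 *   for (int i = 1; i < SWITCHABLE_FILTERS; ++i)
 *     if (try_filter(i)) best_dual_mode = i;          // first axis
 *   for (int i = best_dual_mode + SWITCHABLE_FILTERS;
 *        i < DUAL_FILTER_SET_SIZE; i += SWITCHABLE_FILTERS)
 *     try_filter(i);                                  // second axis
 */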
- int_mv *const single_newmv, int rate2_bmc_nocoeff, - MB_MODE_INFO *best_bmc_mbmi, int rate_mv_bmc, -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - int rs, int *skip_txfm_sb, int64_t *skip_sse_sb, BUFFER_SET *orig_dst) { +static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, RD_STATS *rd_stats, + RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, + int *disable_skip, int mi_row, int mi_col, + HandleInterModeArgs *const args, + int64_t ref_best_rd, const int *refs, int rate_mv, + BUFFER_SET *orig_dst +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + , + int64_t *best_est_rd +#endif +) { const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); MACROBLOCKD *xd = &x->e_mbd; - MODE_INFO *mi = xd->mi[0]; - MB_MODE_INFO *mbmi = &mi->mbmi; + MB_MODE_INFO *mbmi = xd->mi[0]; const int is_comp_pred = has_second_ref(mbmi); const PREDICTION_MODE this_mode = mbmi->mode; - - (void)mode_mv; - (void)mi_row; - (void)mi_col; - (void)args; - (void)refs; - (void)rate_mv; - (void)is_comp_pred; - (void)this_mode; -#if !CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR - (void)single_newmv; -#endif - -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - MOTION_MODE motion_mode, last_motion_mode_allowed; int rate2_nocoeff = 0, best_xskip, best_disable_skip = 0; RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv; MB_MODE_INFO base_mbmi, best_mbmi; -#if CONFIG_VAR_TX - uint8_t best_blk_skip[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 4]; -#endif // CONFIG_VAR_TX -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - -#if CONFIG_WARPED_MOTION -#if WARPED_MOTION_SORT_SAMPLES + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + int interintra_allowed = cm->seq_params.enable_interintra_compound && + is_interintra_allowed(mbmi) && mbmi->compound_idx; int pts0[SAMPLES_ARRAY_SIZE], pts_inref0[SAMPLES_ARRAY_SIZE]; - int pts_mv0[SAMPLES_ARRAY_SIZE]; int total_samples; -#else - int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; -#endif // WARPED_MOTION_SORT_SAMPLES -#endif // CONFIG_WARPED_MOTION -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + (void)rate_mv; + av1_invalid_rd_stats(&best_rd_stats); -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - if (cm->interp_filter == SWITCHABLE) rd_stats->rate += rs; -#if CONFIG_WARPED_MOTION aom_clear_system_state(); -#if WARPED_MOTION_SORT_SAMPLES - mbmi->num_proj_ref[0] = - findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0, pts_mv0); + mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0); total_samples = mbmi->num_proj_ref[0]; -#else - mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref); -#endif // WARPED_MOTION_SORT_SAMPLES - best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0]; -#endif // CONFIG_WARPED_MOTION -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION rate2_nocoeff = rd_stats->rate; - last_motion_mode_allowed = motion_mode_allowed( -#if CONFIG_GLOBAL_MOTION - 0, xd->global_motion, -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_WARPED_MOTION - xd, -#endif - mi); base_mbmi = *mbmi; -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + MOTION_MODE last_motion_mode_allowed = + cm->switchable_motion_mode + ? 
motion_mode_allowed(xd->global_motion, xd, mbmi, + cm->allow_warped_motion) + : SIMPLE_TRANSLATION; + assert(mbmi->ref_frame[1] != INTRA_FRAME); + const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1]; -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION int64_t best_rd = INT64_MAX; - for (motion_mode = SIMPLE_TRANSLATION; - motion_mode <= last_motion_mode_allowed; motion_mode++) { + + for (int mode_index = (int)SIMPLE_TRANSLATION; + mode_index <= (int)last_motion_mode_allowed + interintra_allowed; + mode_index++) { int64_t tmp_rd = INT64_MAX; - int tmp_rate; - int64_t tmp_dist; - int tmp_rate2 = - motion_mode != SIMPLE_TRANSLATION ? rate2_bmc_nocoeff : rate2_nocoeff; - -#if CONFIG_NCOBMC_ADAPT_WEIGHT - // We cannot estimate the rd cost for the motion mode NCOBMC_ADAPT_WEIGHT - // right now since it requires mvs from all neighboring blocks. We will - // check if this mode is beneficial after all the mv's in the current - // superblock are selected. - if (motion_mode == NCOBMC_ADAPT_WEIGHT) continue; -#endif + int tmp_rate2 = rate2_nocoeff; + int is_interintra_mode = mode_index > (int)last_motion_mode_allowed; + int skip_txfm_sb = 0; *mbmi = base_mbmi; - mbmi->motion_mode = motion_mode; -#if CONFIG_MOTION_VAR - if (mbmi->motion_mode == OBMC_CAUSAL) { - *mbmi = *best_bmc_mbmi; + if (is_interintra_mode) { + mbmi->motion_mode = SIMPLE_TRANSLATION; + } else { + mbmi->motion_mode = (MOTION_MODE)mode_index; + assert(mbmi->ref_frame[1] != INTRA_FRAME); + } + + if (mbmi->motion_mode == SIMPLE_TRANSLATION && !is_interintra_mode) { + // SIMPLE_TRANSLATION mode: no need to recalculate. + // The prediction is calculated before motion_mode_rd() is called in + // handle_inter_mode() + } else if (mbmi->motion_mode == OBMC_CAUSAL) { mbmi->motion_mode = OBMC_CAUSAL; - if (!is_comp_pred && -#if CONFIG_COMPOUND_SINGLEREF - !is_inter_singleref_comp_mode(this_mode) && -#endif // CONFIG_COMPOUND_SINGLEREF - have_newmv_in_inter_mode(this_mode)) { + if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) { int tmp_rate_mv = 0; single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, &tmp_rate_mv); mbmi->mv[0].as_int = x->best_mv.as_int; - if (discount_newmv_test(cpi, this_mode, mbmi->mv[0], mode_mv, - refs[0])) { +#if USE_DISCOUNT_NEWMV_TEST + if (discount_newmv_test(cpi, x, this_mode, mbmi->mv[0])) { tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1); } - tmp_rate2 = rate2_bmc_nocoeff - rate_mv_bmc + tmp_rate_mv; -#if CONFIG_DUAL_FILTER - mbmi->interp_filters = - condition_interp_filters_on_mv(mbmi->interp_filters, xd); -#endif // CONFIG_DUAL_FILTER - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); - } else { - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); +#endif + tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv; } + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); av1_build_obmc_inter_prediction( cm, xd, mi_row, mi_col, args->above_pred_buf, args->above_pred_stride, args->left_pred_buf, args->left_pred_stride); - model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, - &tmp_dist, skip_txfm_sb, skip_sse_sb); - } -#endif // CONFIG_MOTION_VAR - -#if CONFIG_WARPED_MOTION - if (mbmi->motion_mode == WARPED_CAUSAL) { -#if WARPED_MOTION_SORT_SAMPLES + } else if (mbmi->motion_mode == WARPED_CAUSAL) { int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; -#endif // WARPED_MOTION_SORT_SAMPLES - *mbmi = *best_bmc_mbmi; mbmi->motion_mode = WARPED_CAUSAL; mbmi->wm_params[0].wmtype = DEFAULT_WMTYPE; mbmi->interp_filters = 
av1_broadcast_interp_filter( av1_unswitchable_filter(cm->interp_filter)); -#if WARPED_MOTION_SORT_SAMPLES memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0)); memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0)); - // Rank the samples by motion vector difference + // Select the samples according to motion vector difference if (mbmi->num_proj_ref[0] > 1) { - mbmi->num_proj_ref[0] = sortSamples(pts_mv0, &mbmi->mv[0].as_mv, pts, - pts_inref, mbmi->num_proj_ref[0]); - best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0]; + mbmi->num_proj_ref[0] = selectSamples( + &mbmi->mv[0].as_mv, pts, pts_inref, mbmi->num_proj_ref[0], bsize); } -#endif // WARPED_MOTION_SORT_SAMPLES if (!find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize, mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col, @@ -8892,144 +7773,299 @@ static int64_t motion_mode_rd( if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) { int tmp_rate_mv = 0; const int_mv mv0 = mbmi->mv[0]; - WarpedMotionParams wm_params0 = mbmi->wm_params[0]; -#if WARPED_MOTION_SORT_SAMPLES + const WarpedMotionParams wm_params0 = mbmi->wm_params[0]; int num_proj_ref0 = mbmi->num_proj_ref[0]; // Refine MV in a small range. av1_refine_warped_mv(cpi, x, bsize, mi_row, mi_col, pts0, pts_inref0, - pts_mv0, total_samples); -#else - // Refine MV in a small range. - av1_refine_warped_mv(cpi, x, bsize, mi_row, mi_col, pts, pts_inref); -#endif // WARPED_MOTION_SORT_SAMPLES + total_samples); // Keep the refined MV and WM parameters. if (mv0.as_int != mbmi->mv[0].as_int) { const int ref = refs[0]; - const MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv; - + const int_mv ref_mv = av1_get_ref_mv(x, 0); tmp_rate_mv = - av1_mv_bit_cost(&mbmi->mv[0].as_mv, &ref_mv, x->nmvjointcost, - x->mvcost, MV_COST_WEIGHT); + av1_mv_bit_cost(&mbmi->mv[0].as_mv, &ref_mv.as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); if (cpi->sf.adaptive_motion_search) x->pred_mv[ref] = mbmi->mv[0].as_mv; - single_newmv[ref] = mbmi->mv[0]; - - if (discount_newmv_test(cpi, this_mode, mbmi->mv[0], mode_mv, - refs[0])) { +#if USE_DISCOUNT_NEWMV_TEST + if (discount_newmv_test(cpi, x, this_mode, mbmi->mv[0])) { tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1); } -#if WARPED_MOTION_SORT_SAMPLES - best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0]; -#endif // WARPED_MOTION_SORT_SAMPLES - tmp_rate2 = rate2_bmc_nocoeff - rate_mv_bmc + tmp_rate_mv; -#if CONFIG_DUAL_FILTER - mbmi->interp_filters = - condition_interp_filters_on_mv(mbmi->interp_filters, xd); -#endif // CONFIG_DUAL_FILTER +#endif + tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv; } else { // Restore the old MV and WM parameters. 
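/*
 * Why the restore below: the warped refinement is adopted only when it
 * actually moved the starting MV, in which case the MV rate is recomputed
 * and tmp_rate2 swaps the old rate_mv for the refined tmp_rate_mv; when
 * the MV is unchanged the refinement bought nothing, so the values saved
 * in mv0, wm_params0 and num_proj_ref0 are reinstated. As a sketch of the
 * rate bookkeeping (all names from the surrounding code):
 *
 *   if (mv0.as_int != mbmi->mv[0].as_int) {
 *     tmp_rate_mv = av1_mv_bit_cost(&mbmi->mv[0].as_mv, &ref_mv.as_mv,
 *                                   x->nmvjointcost, x->mvcost,
 *                                   MV_COST_WEIGHT);
 *     tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv;  // swap MV rates
 *   }
 */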
mbmi->mv[0] = mv0; mbmi->wm_params[0] = wm_params0; -#if WARPED_MOTION_SORT_SAMPLES mbmi->num_proj_ref[0] = num_proj_ref0; -#endif // WARPED_MOTION_SORT_SAMPLES } } av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); - model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, - &tmp_dist, skip_txfm_sb, skip_sse_sb); } else { continue; } + } else if (is_interintra_mode) { + INTERINTRA_MODE best_interintra_mode = II_DC_PRED; + int64_t rd, best_interintra_rd = INT64_MAX; + int rmode, rate_sum; + int64_t dist_sum; + int j; + int tmp_rate_mv = 0; + int tmp_skip_txfm_sb; + int bw = block_size_wide[bsize]; + int64_t tmp_skip_sse_sb; + DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]); + uint8_t *tmp_buf, *intrapred; + const int *const interintra_mode_cost = + x->interintra_mode_cost[size_group_lookup[bsize]]; + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf_); + intrapred = CONVERT_TO_BYTEPTR(intrapred_); + } else { + tmp_buf = tmp_buf_; + intrapred = intrapred_; + } + const int_mv mv0 = mbmi->mv[0]; + + mbmi->ref_frame[1] = NONE_FRAME; + xd->plane[0].dst.buf = tmp_buf; + xd->plane[0].dst.stride = bw; + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize); + + restore_dst_buf(xd, *orig_dst, num_planes); + mbmi->ref_frame[1] = INTRA_FRAME; + mbmi->use_wedge_interintra = 0; + for (j = 0; j < INTERINTRA_MODES; ++j) { + mbmi->interintra_mode = (INTERINTRA_MODE)j; + rmode = interintra_mode_cost[mbmi->interintra_mode]; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); + rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum); + if (rd < best_interintra_rd) { + best_interintra_rd = rd; + best_interintra_mode = mbmi->interintra_mode; + } + } + mbmi->interintra_mode = best_interintra_mode; + rmode = interintra_mode_cost[mbmi->interintra_mode]; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + av1_subtract_plane(x, bsize, 0); + rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); + if (rd != INT64_MAX) + rd = RDCOST(x->rdmult, rate_mv + rmode + rate_sum, dist_sum); + best_interintra_rd = rd; + + if (ref_best_rd < INT64_MAX && (best_interintra_rd >> 1) > ref_best_rd) { + // restore ref_frame[1] + mbmi->ref_frame[1] = ref_frame_1; + continue; + } + + if (is_interintra_wedge_used(bsize)) { + int64_t best_interintra_rd_nowedge = INT64_MAX; + int64_t best_interintra_rd_wedge = INT64_MAX; + int_mv tmp_mv; + InterpFilters backup_interp_filters = mbmi->interp_filters; + int rwedge = x->wedge_interintra_cost[bsize][0]; + if (rd != INT64_MAX) + rd = RDCOST(x->rdmult, rate_mv + rmode + rate_sum + rwedge, dist_sum); + best_interintra_rd_nowedge = rd; + + // Disable wedge search if source variance is small + if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) { + mbmi->use_wedge_interintra = 1; + + rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) + + x->wedge_interintra_cost[bsize][1]; + + best_interintra_rd_wedge = + pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); + + best_interintra_rd_wedge += + 
RDCOST(x->rdmult, rmode + rate_mv + rwedge, 0); + // Refine motion vector. + if (have_newmv_in_inter_mode(mbmi->mode)) { + // get negative of mask + const uint8_t *mask = av1_get_contiguous_soft_mask( + mbmi->interintra_wedge_index, 1, bsize); + tmp_mv = av1_get_ref_mv(x, 0); + compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row, + mi_col, intrapred, mask, bw, + &tmp_rate_mv, 0); + mbmi->mv[0].as_int = tmp_mv.as_int; + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, + bsize); + model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, + NULL); + rd = RDCOST(x->rdmult, tmp_rate_mv + rmode + rate_sum + rwedge, + dist_sum); + if (rd >= best_interintra_rd_wedge) { + tmp_mv.as_int = mv0.as_int; + tmp_rate_mv = rate_mv; + mbmi->interp_filters = backup_interp_filters; + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + } + } else { + tmp_mv.as_int = mv0.as_int; + tmp_rate_mv = rate_mv; + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + } + // Evaluate closer to true rd + av1_subtract_plane(x, bsize, 0); + rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, + INT64_MAX); + if (rd != INT64_MAX) + rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum, + dist_sum); + best_interintra_rd_wedge = rd; + if (best_interintra_rd_wedge < best_interintra_rd_nowedge) { + mbmi->use_wedge_interintra = 1; + mbmi->mv[0].as_int = tmp_mv.as_int; + tmp_rate2 += tmp_rate_mv - rate_mv; + } else { + mbmi->use_wedge_interintra = 0; + mbmi->mv[0].as_int = mv0.as_int; + mbmi->interp_filters = backup_interp_filters; + } + } else { + mbmi->use_wedge_interintra = 0; + } + } // if (is_interintra_wedge_used(bsize)) + restore_dst_buf(xd, *orig_dst, num_planes); + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); } -#endif // CONFIG_WARPED_MOTION + + if (!cpi->common.all_lossless) + check_block_skip(cpi, bsize, x, xd, 0, num_planes - 1, &skip_txfm_sb); + x->skip = 0; rd_stats->dist = 0; rd_stats->sse = 0; rd_stats->skip = 1; rd_stats->rate = tmp_rate2; - if (last_motion_mode_allowed > SIMPLE_TRANSLATION) { -#if CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR - if (last_motion_mode_allowed == WARPED_CAUSAL) -#endif // CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR + if (av1_is_interp_needed(xd)) + rd_stats->rate += av1_get_switchable_rate(cm, x, xd); + if (interintra_allowed) { + rd_stats->rate += x->interintra_cost[size_group_lookup[bsize]] + [mbmi->ref_frame[1] == INTRA_FRAME]; + if (mbmi->ref_frame[1] == INTRA_FRAME) { + rd_stats->rate += x->interintra_mode_cost[size_group_lookup[bsize]] + [mbmi->interintra_mode]; + if (is_interintra_wedge_used(bsize)) { + rd_stats->rate += + x->wedge_interintra_cost[bsize][mbmi->use_wedge_interintra]; + if (mbmi->use_wedge_interintra) { + rd_stats->rate += + av1_cost_literal(get_interintra_wedge_bits(bsize)); + } + } + } + } + if ((last_motion_mode_allowed > SIMPLE_TRANSLATION) && + (mbmi->ref_frame[1] != INTRA_FRAME)) { + if (last_motion_mode_allowed == WARPED_CAUSAL) { rd_stats->rate += x->motion_mode_cost[bsize][mbmi->motion_mode]; -#if CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR - else + } else { rd_stats->rate += x->motion_mode_cost1[bsize][mbmi->motion_mode]; -#endif // CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR - } -#if CONFIG_WARPED_MOTION - if (mbmi->motion_mode == WARPED_CAUSAL) { - rd_stats->rate -= rs; + } } -#endif // CONFIG_WARPED_MOTION -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - 
if (!*skip_txfm_sb) { + if (!skip_txfm_sb) { +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + int64_t est_rd = 0; + int est_skip = 0; + if (cpi->sf.inter_mode_rd_model_estimation) { + InterModeRdModel *md = &inter_mode_rd_models[mbmi->sb_type]; + if (md->ready) { + const int64_t curr_sse = get_sse(cpi, x); + est_rd = + get_est_rd(mbmi->sb_type, x->rdmult, curr_sse, rd_stats->rate); + est_skip = est_rd * 0.8 > *best_est_rd; +#if INTER_MODE_RD_TEST + if (est_rd < *best_est_rd) { + *best_est_rd = est_rd; + } +#else // INTER_MODE_RD_TEST + if (est_skip) { + ++md->skip_count; + mbmi->ref_frame[1] = ref_frame_1; + continue; + } else { + if (est_rd < *best_est_rd) { + *best_est_rd = est_rd; + } + ++md->non_skip_count; + } +#endif // INTER_MODE_RD_TEST + } + } +#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS + int64_t rdcosty = INT64_MAX; int is_cost_valid_uv = 0; // cost and distortion av1_subtract_plane(x, bsize, 0); -#if CONFIG_VAR_TX if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { - select_tx_type_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd); + // Motion mode + select_tx_type_yrd(cpi, x, rd_stats_y, bsize, mi_row, mi_col, + ref_best_rd); +#if CONFIG_COLLECT_RD_STATS == 2 + PrintPredictionUnitStats(cpi, x, rd_stats_y, bsize); +#endif // CONFIG_COLLECT_RD_STATS == 2 } else { - int idx, idy; super_block_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd); - for (idy = 0; idy < xd->n8_h; ++idy) - for (idx = 0; idx < xd->n8_w; ++idx) - mbmi->inter_tx_size[idy][idx] = mbmi->tx_size; - memset(x->blk_skip[0], rd_stats_y->skip, - sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4); + memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); + memset(x->blk_skip, rd_stats_y->skip, + sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); } -#else - /* clang-format off */ - super_block_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd); -/* clang-format on */ -#endif // CONFIG_VAR_TX if (rd_stats_y->rate == INT_MAX) { av1_invalid_rd_stats(rd_stats); -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - if (mbmi->motion_mode != SIMPLE_TRANSLATION) { + if (mbmi->motion_mode != SIMPLE_TRANSLATION || + mbmi->ref_frame[1] == INTRA_FRAME) { + mbmi->ref_frame[1] = ref_frame_1; continue; } else { -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - restore_dst_buf(xd, *orig_dst); + restore_dst_buf(xd, *orig_dst, num_planes); + mbmi->ref_frame[1] = ref_frame_1; return INT64_MAX; -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION } -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION } av1_merge_rd_stats(rd_stats, rd_stats_y); rdcosty = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); rdcosty = AOMMIN(rdcosty, RDCOST(x->rdmult, 0, rd_stats->sse)); -/* clang-format off */ -#if CONFIG_VAR_TX - is_cost_valid_uv = - inter_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_rd - rdcosty); -#else - is_cost_valid_uv = - super_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_rd - rdcosty); -#endif // CONFIG_VAR_TX - if (!is_cost_valid_uv) { -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - continue; -#else - restore_dst_buf(xd, *orig_dst); - return INT64_MAX; -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + if (num_planes > 1) { + /* clang-format off */ + is_cost_valid_uv = + inter_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_rd - rdcosty, + FTXS_NONE); + if (!is_cost_valid_uv) { + mbmi->ref_frame[1] = ref_frame_1; + continue; + } + /* clang-format on */ + av1_merge_rd_stats(rd_stats, rd_stats_uv); + } else { + av1_init_rd_stats(rd_stats_uv); } - /* clang-format on */ - av1_merge_rd_stats(rd_stats, rd_stats_uv); #if 
CONFIG_RD_DEBUG // record transform block coefficient cost // TODO(angiebird): So far rd_debug tool only detects discrepancy of @@ -9038,812 +8074,766 @@ static int64_t motion_mode_rd( // other place when we need to compare non-coefficient cost. mbmi->rd_stats = *rd_stats; #endif // CONFIG_RD_DEBUG -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + const int skip_ctx = av1_get_skip_context(xd); if (rd_stats->skip) { rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate; rd_stats_y->rate = 0; rd_stats_uv->rate = 0; - rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); + rd_stats->rate += x->skip_cost[skip_ctx][1]; mbmi->skip = 0; // here mbmi->skip temporarily plays a role as what this_skip2 does } else if (!xd->lossless[mbmi->segment_id] && (RDCOST(x->rdmult, rd_stats_y->rate + rd_stats_uv->rate + - av1_cost_bit(av1_get_skip_prob(cm, xd), 0), - rd_stats->dist) >= - RDCOST(x->rdmult, av1_cost_bit(av1_get_skip_prob(cm, xd), 1), - rd_stats->sse))) { + x->skip_cost[skip_ctx][0], + rd_stats->dist) >= RDCOST(x->rdmult, + x->skip_cost[skip_ctx][1], + rd_stats->sse))) { rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate; - rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); + rd_stats->rate += x->skip_cost[skip_ctx][1]; rd_stats->dist = rd_stats->sse; rd_stats_y->rate = 0; rd_stats_uv->rate = 0; mbmi->skip = 1; } else { - rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); + rd_stats->rate += x->skip_cost[skip_ctx][0]; mbmi->skip = 0; } *disable_skip = 0; -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 && + cm->tile_rows == 1) { +#if INTER_MODE_RD_TEST + if (md->ready) { + int64_t real_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (est_skip) { + ++md->skip_count; + if (real_rd < ref_best_rd) { + ++md->fp_skip_count; + } + // int fp_skip = real_rd < ref_best_rd; + // printf("est_skip %d fp_skip %d est_rd %ld best_est_rd %ld real_rd + // %ld ref_best_rd %ld\n", + // est_skip, fp_skip, est_rd, *best_est_rd, real_rd, + // ref_best_rd); + } else { + ++md->non_skip_count; + } + } +#endif // INTER_MODE_RD_TEST + inter_mode_data_push(mbmi->sb_type, rd_stats->sse, rd_stats->dist, + rd_stats_y->rate + rd_stats_uv->rate + + x->skip_cost[skip_ctx][mbmi->skip], + rd_stats->rate, ref_best_rd); + } +#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS + int64_t curr_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (curr_rd < ref_best_rd) { + ref_best_rd = curr_rd; + } } else { x->skip = 1; *disable_skip = 1; - mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode, 1); + mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode); -// The cost of skip bit needs to be added. -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + // The cost of skip bit needs to be added. 
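/*
 * The skip decision above (taken outside lossless segments) is a pure RD
 * comparison: either transmit the coefficients (rate_y + rate_uv plus the
 * skip_cost[ctx][0] flag, charged against the coded distortion) or signal
 * skip (the skip_cost[ctx][1] flag alone, charged against the prediction
 * SSE). As a sketch, using the names from the surrounding code:
 *
 *   const int64_t rd_coded = RDCOST(x->rdmult,
 *       rd_stats_y->rate + rd_stats_uv->rate + x->skip_cost[skip_ctx][0],
 *       rd_stats->dist);
 *   const int64_t rd_skip = RDCOST(x->rdmult, x->skip_cost[skip_ctx][1],
 *                                  rd_stats->sse);
 *   const int use_skip = rd_coded >= rd_skip;  // ties go to skip
 */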
mbmi->skip = 0; -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); + rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][1]; - rd_stats->dist = *skip_sse_sb; - rd_stats->sse = *skip_sse_sb; + rd_stats->dist = 0; + rd_stats->sse = 0; rd_stats_y->rate = 0; rd_stats_uv->rate = 0; rd_stats->skip = 1; } -#if CONFIG_GLOBAL_MOTION - if (this_mode == ZEROMV || this_mode == ZERO_ZEROMV) { - if (is_nontrans_global_motion(xd)) { - rd_stats->rate -= rs; + if (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV) { + if (is_nontrans_global_motion(xd, xd->mi[0])) { mbmi->interp_filters = av1_broadcast_interp_filter( av1_unswitchable_filter(cm->interp_filter)); } } -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); - if (mbmi->motion_mode == SIMPLE_TRANSLATION || (tmp_rd < best_rd)) { + if ((mbmi->motion_mode == SIMPLE_TRANSLATION && + mbmi->ref_frame[1] != INTRA_FRAME) || + (tmp_rd < best_rd)) { best_mbmi = *mbmi; best_rd = tmp_rd; best_rd_stats = *rd_stats; best_rd_stats_y = *rd_stats_y; - best_rd_stats_uv = *rd_stats_uv; -#if CONFIG_VAR_TX - for (int i = 0; i < MAX_MB_PLANE; ++i) - memcpy(best_blk_skip[i], x->blk_skip[i], - sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4); -#endif // CONFIG_VAR_TX + if (num_planes > 1) best_rd_stats_uv = *rd_stats_uv; + memcpy(best_blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); best_xskip = x->skip; best_disable_skip = *disable_skip; + if (best_xskip) break; } } + mbmi->ref_frame[1] = ref_frame_1; if (best_rd == INT64_MAX) { av1_invalid_rd_stats(rd_stats); - restore_dst_buf(xd, *orig_dst); + restore_dst_buf(xd, *orig_dst, num_planes); return INT64_MAX; } *mbmi = best_mbmi; *rd_stats = best_rd_stats; *rd_stats_y = best_rd_stats_y; - *rd_stats_uv = best_rd_stats_uv; -#if CONFIG_VAR_TX - for (int i = 0; i < MAX_MB_PLANE; ++i) - memcpy(x->blk_skip[i], best_blk_skip[i], - sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4); -#endif // CONFIG_VAR_TX + if (num_planes > 1) *rd_stats_uv = best_rd_stats_uv; + memcpy(x->blk_skip, best_blk_skip, + sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); x->skip = best_xskip; *disable_skip = best_disable_skip; -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - restore_dst_buf(xd, *orig_dst); + restore_dst_buf(xd, *orig_dst, num_planes); + return 0; +} + +static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi, + MACROBLOCK *const x, BLOCK_SIZE bsize, int mi_row, + int mi_col, BUFFER_SET *const orig_dst) { + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); + + int64_t total_sse = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + + av1_subtract_plane(x, bsize, plane); + int64_t sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh); + sse = sse << 4; + total_sse += sse; + } + const int skip_mode_ctx = av1_get_skip_mode_context(xd); + rd_stats->dist = rd_stats->sse = total_sse; + rd_stats->rate = x->skip_mode_cost[skip_mode_ctx][1]; + rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, 
rd_stats->dist); + + restore_dst_buf(xd, *orig_dst, num_planes); return 0; } +#ifndef NDEBUG +static INLINE int is_single_inter_mode(int this_mode) { + return this_mode >= SINGLE_INTER_MODE_START && + this_mode < SINGLE_INTER_MODE_END; +} +#endif + +static INLINE int get_ref_mv_offset(int single_mode, uint8_t ref_mv_idx) { + assert(is_single_inter_mode(single_mode)); + int ref_mv_offset; + if (single_mode == NEARESTMV) { + ref_mv_offset = 0; + } else if (single_mode == NEARMV) { + ref_mv_offset = ref_mv_idx + 1; + } else { + ref_mv_offset = -1; + } + return ref_mv_offset; +} + +static INLINE void get_this_mv(int_mv *this_mv, int this_mode, int ref_idx, + int ref_mv_idx, + const MV_REFERENCE_FRAME *ref_frame, + const MB_MODE_INFO_EXT *mbmi_ext) { + const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame); + const int is_comp_pred = ref_frame[1] > INTRA_FRAME; + const int single_mode = get_single_mode(this_mode, ref_idx, is_comp_pred); + assert(is_single_inter_mode(single_mode)); + if (single_mode == NEWMV) { + this_mv->as_int = INVALID_MV; + } else if (single_mode == GLOBALMV) { + *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]]; + } else { + assert(single_mode == NEARMV || single_mode == NEARESTMV); + const int ref_mv_offset = get_ref_mv_offset(single_mode, ref_mv_idx); + if (ref_mv_offset < mbmi_ext->ref_mv_count[ref_frame_type]) { + assert(ref_mv_offset >= 0); + if (ref_idx == 0) { + *this_mv = + mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_offset].this_mv; + } else { + *this_mv = + mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_offset].comp_mv; + } + } else { + *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]]; + } + } +} + +// This function update the non-new mv for the current prediction mode +static INLINE int build_cur_mv(int_mv *cur_mv, int this_mode, + const AV1_COMMON *cm, const MACROBLOCK *x) { + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + const int is_comp_pred = has_second_ref(mbmi); + int ret = 1; + for (int i = 0; i < is_comp_pred + 1; ++i) { + int_mv this_mv; + get_this_mv(&this_mv, this_mode, i, mbmi->ref_mv_idx, mbmi->ref_frame, + x->mbmi_ext); + const int single_mode = get_single_mode(this_mode, i, is_comp_pred); + if (single_mode == NEWMV) { + cur_mv[i] = this_mv; + } else { + ret &= clamp_and_check_mv(cur_mv + i, this_mv, cm, x); + } + } + return ret; +} + +static INLINE int get_drl_cost(const MB_MODE_INFO *mbmi, + const MB_MODE_INFO_EXT *mbmi_ext, + int (*drl_mode_cost0)[2], + int8_t ref_frame_type) { + int cost = 0; + if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) { + for (int idx = 0; idx < 2; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = + av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx); + cost += drl_mode_cost0[drl_ctx][mbmi->ref_mv_idx != idx]; + if (mbmi->ref_mv_idx == idx) return cost; + } + } + return cost; + } + + if (have_nearmv_in_inter_mode(mbmi->mode)) { + for (int idx = 1; idx < 3; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = + av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx); + cost += drl_mode_cost0[drl_ctx][mbmi->ref_mv_idx != (idx - 1)]; + if (mbmi->ref_mv_idx == (idx - 1)) return cost; + } + } + return cost; + } + return cost; +} + static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, - int *disable_skip, - int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME], -#if CONFIG_COMPOUND_SINGLEREF - int_mv 
(*mode_comp_mv)[TOTAL_REFS_PER_FRAME], -#endif // CONFIG_COMPOUND_SINGLEREF - int mi_row, int mi_col, - HandleInterModeArgs *args, - const int64_t ref_best_rd) { + int *disable_skip, int mi_row, int mi_col, + HandleInterModeArgs *args, int64_t ref_best_rd +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + , + int64_t *best_est_rd +#endif +) { const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); MACROBLOCKD *xd = &x->e_mbd; - MODE_INFO *mi = xd->mi[0]; - MB_MODE_INFO *mbmi = &mi->mbmi; + MB_MODE_INFO *mbmi = xd->mi[0]; MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; const int is_comp_pred = has_second_ref(mbmi); const int this_mode = mbmi->mode; -#if CONFIG_COMPOUND_SINGLEREF - const int is_singleref_comp_mode = is_inter_singleref_comp_mode(this_mode); -#endif // CONFIG_COMPOUND_SINGLEREF - int_mv *frame_mv = mode_mv[this_mode]; -#if CONFIG_COMPOUND_SINGLEREF - // The comp mv for the compound mode in single ref - int_mv *frame_comp_mv = mode_comp_mv[this_mode]; -#endif // CONFIG_COMPOUND_SINGLEREF int i; int refs[2] = { mbmi->ref_frame[0], (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) }; - int_mv cur_mv[2]; int rate_mv = 0; - int pred_exists = 1; -#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT || CONFIG_INTERINTRA const int bw = block_size_wide[bsize]; -#endif // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT - int_mv single_newmv[TOTAL_REFS_PER_FRAME]; -#if CONFIG_INTERINTRA - const int *const interintra_mode_cost = - x->interintra_mode_cost[size_group_lookup[bsize]]; -#endif // CONFIG_INTERINTRA - const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME); - uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); -#if CONFIG_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); -#else - DECLARE_ALIGNED(16, uint8_t, tmp_buf_[MAX_MB_PLANE * MAX_SB_SQUARE]); -#endif // CONFIG_HIGHBITDEPTH + DECLARE_ALIGNED(32, uint8_t, tmp_buf_[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); uint8_t *tmp_buf; - -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - int rate2_bmc_nocoeff; - MB_MODE_INFO best_bmc_mbmi; - int rate_mv_bmc; -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION int64_t rd = INT64_MAX; BUFFER_SET orig_dst, tmp_dst; - int rs = 0; int skip_txfm_sb = 0; int64_t skip_sse_sb = INT64_MAX; int16_t mode_ctx; -#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_MOTION_VAR - // dummy fillers - mbmi->ncobmc_mode[0] = NO_OVERLAP; - mbmi->ncobmc_mode[1] = NO_OVERLAP; -#endif -#if CONFIG_INTERINTRA - int compmode_interintra_cost = 0; - mbmi->use_wedge_interintra = 0; -#endif -#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT - int compmode_interinter_cost = 0; - mbmi->interinter_compound_type = COMPOUND_AVERAGE; -#endif -#if CONFIG_LGT_FROM_PRED - mbmi->use_lgt = 0; -#endif + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + mbmi->comp_group_idx = 0; + mbmi->compound_idx = 1; + if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME; -#if CONFIG_INTERINTRA - if (!cm->allow_interintra_compound && is_comp_interintra_pred) - return INT64_MAX; -#endif // CONFIG_INTERINTRA - - // is_comp_interintra_pred implies !is_comp_pred - assert(!is_comp_interintra_pred || (!is_comp_pred)); - // is_comp_interintra_pred implies is_interintra_allowed(mbmi->sb_type) - assert(!is_comp_interintra_pred || is_interintra_allowed(mbmi)); - -#if CONFIG_COMPOUND_SINGLEREF - if (is_comp_pred || is_singleref_comp_mode) -#else // !CONFIG_COMPOUND_SINGLEREF - if (is_comp_pred) -#endif // CONFIG_COMPOUND_SINGLEREF - mode_ctx = mbmi_ext->compound_mode_context[refs[0]]; - else - 
mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, - mbmi->ref_frame, bsize, -1); + mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf_); else -#endif // CONFIG_HIGHBITDEPTH tmp_buf = tmp_buf_; // Make sure that we didn't leave the plane destination buffers set // to tmp_buf at the end of the last iteration assert(xd->plane[0].dst.buf != tmp_buf); -#if CONFIG_WARPED_MOTION mbmi->num_proj_ref[0] = 0; mbmi->num_proj_ref[1] = 0; -#endif // CONFIG_WARPED_MOTION if (is_comp_pred) { - if (frame_mv[refs[0]].as_int == INVALID_MV || - frame_mv[refs[1]].as_int == INVALID_MV) - return INT64_MAX; -#if CONFIG_COMPOUND_SINGLEREF - } else if (is_singleref_comp_mode) { - if (frame_mv[refs[0]].as_int == INVALID_MV || - frame_comp_mv[refs[0]].as_int == INVALID_MV) - return INT64_MAX; -#endif // CONFIG_COMPOUND_SINGLEREF + for (int ref_idx = 0; ref_idx < is_comp_pred + 1; ++ref_idx) { + const int single_mode = get_single_mode(this_mode, ref_idx, is_comp_pred); + if (single_mode == NEWMV && + args->single_newmv[mbmi->ref_frame[ref_idx]].as_int == INVALID_MV) + return INT64_MAX; + } } mbmi->motion_mode = SIMPLE_TRANSLATION; - if (have_newmv_in_inter_mode(this_mode)) { - const int64_t ret_val = - handle_newmv(cpi, x, bsize, mode_mv, -#if CONFIG_COMPOUND_SINGLEREF - mode_comp_mv, -#endif // CONFIG_COMPOUND_SINGLEREF - mi_row, mi_col, &rate_mv, single_newmv, args); - if (ret_val != 0) - return ret_val; - else - rd_stats->rate += rate_mv; - } - for (i = 0; i < is_comp_pred + 1; ++i) { - cur_mv[i] = frame_mv[refs[i]]; - // Clip "next_nearest" so that it does not extend to far out of image - if (this_mode != NEWMV) clamp_mv2(&cur_mv[i].as_mv, xd); - if (mv_check_bounds(&x->mv_limits, &cur_mv[i].as_mv)) return INT64_MAX; - mbmi->mv[i].as_int = cur_mv[i].as_int; - } - -#if CONFIG_COMPOUND_SINGLEREF - if (!is_comp_pred && is_singleref_comp_mode) { - cur_mv[1] = frame_comp_mv[refs[0]]; - // Clip "next_nearest" so that it does not extend to far out of image - if (this_mode != NEWMV) clamp_mv2(&cur_mv[1].as_mv, xd); - if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX; - mbmi->mv[1].as_int = cur_mv[1].as_int; - } -#endif // CONFIG_COMPOUND_SINGLEREF + const int masked_compound_used = is_any_masked_compound_used(bsize) && + cm->seq_params.enable_masked_compound; + int64_t ret_val = INT64_MAX; + const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + rd_stats->rate += args->ref_frame_cost + args->single_comp_cost; + rd_stats->rate += + get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type); + const RD_STATS backup_rd_stats = *rd_stats; + const RD_STATS backup_rd_stats_y = *rd_stats_y; + const RD_STATS backup_rd_stats_uv = *rd_stats_uv; + const MB_MODE_INFO backup_mbmi = *mbmi; + INTERINTER_COMPOUND_DATA best_compound_data; + uint8_t tmp_best_mask_buf[2 * MAX_SB_SQUARE]; + RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv; + int64_t best_rd = INT64_MAX; + int64_t best_ret_val = INT64_MAX; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + MB_MODE_INFO best_mbmi = *mbmi; + int64_t early_terminate = 0; + int plane_rate[MAX_MB_PLANE] = { 0 }; + int64_t plane_sse[MAX_MB_PLANE] = { 0 }; + int64_t plane_dist[MAX_MB_PLANE] = { 0 }; + int64_t newmv_ret_val = INT64_MAX; + int_mv backup_mv[2] = { { 0 } }; + int backup_rate_mv = 0; + + int comp_idx; + const int search_jnt_comp = is_comp_pred & cm->seq_params.enable_jnt_comp & + 
(mbmi->mode != GLOBAL_GLOBALMV); + // If !search_jnt_comp, we need to force mbmi->compound_idx = 1. + for (comp_idx = 1; comp_idx >= !search_jnt_comp; --comp_idx) { + int rs = 0; + int compmode_interinter_cost = 0; + early_terminate = 0; + *rd_stats = backup_rd_stats; + *rd_stats_y = backup_rd_stats_y; + *rd_stats_uv = backup_rd_stats_uv; + *mbmi = backup_mbmi; + mbmi->compound_idx = comp_idx; - if (this_mode == NEAREST_NEARESTMV) { - if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) { - cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv; - cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv; + if (is_comp_pred && comp_idx == 0) { + mbmi->comp_group_idx = 0; + mbmi->compound_idx = 0; - for (i = 0; i < 2; ++i) { - clamp_mv2(&cur_mv[i].as_mv, xd); - if (mv_check_bounds(&x->mv_limits, &cur_mv[i].as_mv)) return INT64_MAX; - mbmi->mv[i].as_int = cur_mv[i].as_int; + const int comp_group_idx_ctx = get_comp_group_idx_context(xd); + const int comp_index_ctx = get_comp_index_context(cm, xd); + if (masked_compound_used) { + compmode_interinter_cost += + x->comp_group_idx_cost[comp_group_idx_ctx][0]; } + compmode_interinter_cost += x->comp_idx_cost[comp_index_ctx][0]; } - } - - if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) { -#if CONFIG_COMPOUND_SINGLEREF - if (this_mode == NEAREST_NEWMV || // this_mode == SR_NEAREST_NEWMV || - this_mode == SR_NEAREST_NEARMV) -#else // !CONFIG_COMPOUND_SINGLEREF - if (this_mode == NEAREST_NEWMV) -#endif // CONFIG_COMPOUND_SINGLEREF - { - cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv; -#if CONFIG_AMVR - lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv, - cm->cur_frame_mv_precision_level); -#else - lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv); -#endif - clamp_mv2(&cur_mv[0].as_mv, xd); - if (mv_check_bounds(&x->mv_limits, &cur_mv[0].as_mv)) return INT64_MAX; - mbmi->mv[0].as_int = cur_mv[0].as_int; + int_mv cur_mv[2]; + if (!build_cur_mv(cur_mv, this_mode, cm, x)) { + early_terminate = INT64_MAX; + continue; } + if (have_newmv_in_inter_mode(this_mode)) { + if (comp_idx == 0) { + cur_mv[0] = backup_mv[0]; + cur_mv[1] = backup_mv[1]; + rate_mv = backup_rate_mv; + } - if (this_mode == NEW_NEARESTMV) { - cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv; + // when jnt_comp_skip_mv_search flag is on, new mv will be searched once + if (!(search_jnt_comp && cpi->sf.jnt_comp_skip_mv_search && + comp_idx == 0)) { + newmv_ret_val = + handle_newmv(cpi, x, bsize, cur_mv, mi_row, mi_col, &rate_mv, args); -#if CONFIG_AMVR - lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv, - cm->cur_frame_mv_precision_level); -#else - lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv); -#endif - clamp_mv2(&cur_mv[1].as_mv, xd); - if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX; - mbmi->mv[1].as_int = cur_mv[1].as_int; - } - } - - if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) { - int ref_mv_idx = mbmi->ref_mv_idx + 1; - if (this_mode == NEAR_NEWMV || -#if CONFIG_COMPOUND_SINGLEREF - this_mode == SR_NEAR_NEWMV || -#endif // CONFIG_COMPOUND_SINGLEREF - this_mode == NEAR_NEARMV) { - cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; + // Store cur_mv and rate_mv so that they can be restored in the next + // iteration of the loop + backup_mv[0] = cur_mv[0]; + backup_mv[1] = cur_mv[1]; + backup_rate_mv = rate_mv; + } -#if CONFIG_AMVR - lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv, - cm->cur_frame_mv_precision_level); -#else - 
lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv); -#endif - clamp_mv2(&cur_mv[0].as_mv, xd); - if (mv_check_bounds(&x->mv_limits, &cur_mv[0].as_mv)) return INT64_MAX; - mbmi->mv[0].as_int = cur_mv[0].as_int; + if (newmv_ret_val != 0) { + early_terminate = INT64_MAX; + continue; + } else { + rd_stats->rate += rate_mv; + } + } + for (i = 0; i < is_comp_pred + 1; ++i) { + mbmi->mv[i].as_int = cur_mv[i].as_int; + } + + // Initialise tmp_dst and orig_dst buffers to prevent "may be used + // uninitialized" warnings in GCC when the stream is monochrome. + memset(tmp_dst.plane, 0, sizeof(tmp_dst.plane)); + memset(tmp_dst.stride, 0, sizeof(tmp_dst.stride)); + memset(orig_dst.plane, 0, sizeof(tmp_dst.plane)); + memset(orig_dst.stride, 0, sizeof(tmp_dst.stride)); + + // do first prediction into the destination buffer. Do the next + // prediction into a temporary buffer. Then keep track of which one + // of these currently holds the best predictor, and use the other + // one for future predictions. In the end, copy from tmp_buf to + // dst if necessary. + for (i = 0; i < num_planes; i++) { + tmp_dst.plane[i] = tmp_buf + i * MAX_SB_SQUARE; + tmp_dst.stride[i] = MAX_SB_SIZE; + } + for (i = 0; i < num_planes; i++) { + orig_dst.plane[i] = xd->plane[i].dst.buf; + orig_dst.stride[i] = xd->plane[i].dst.stride; + } + + const int ref_mv_cost = cost_mv_ref(x, this_mode, mode_ctx); +#if USE_DISCOUNT_NEWMV_TEST + // We don't include the cost of the second reference here, because there + // are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other + // words if you present them in that order, the second one is always known + // if the first is known. + // + // Under some circumstances we discount the cost of new mv mode to encourage + // initiation of a motion field. + if (discount_newmv_test(cpi, x, this_mode, mbmi->mv[0])) { + // discount_newmv_test only applies discount on NEWMV mode. + assert(this_mode == NEWMV); + rd_stats->rate += AOMMIN(cost_mv_ref(x, this_mode, mode_ctx), + cost_mv_ref(x, NEARESTMV, mode_ctx)); + } else { + rd_stats->rate += ref_mv_cost; } - - if (this_mode == NEW_NEARMV || -#if CONFIG_COMPOUND_SINGLEREF - this_mode == SR_NEAREST_NEARMV || -#endif // CONFIG_COMPOUND_SINGLEREF - this_mode == NEAR_NEARMV) { -#if CONFIG_COMPOUND_SINGLEREF - if (this_mode == SR_NEAREST_NEARMV) - cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; - else -#endif // CONFIG_COMPOUND_SINGLEREF - cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv; - -#if CONFIG_AMVR - lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv, - cm->cur_frame_mv_precision_level); #else - lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv); + rd_stats->rate += ref_mv_cost; #endif - clamp_mv2(&cur_mv[1].as_mv, xd); - if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX; - mbmi->mv[1].as_int = cur_mv[1].as_int; + + if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd && + mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) { + early_terminate = INT64_MAX; + continue; } - } - // do first prediction into the destination buffer. Do the next - // prediction into a temporary buffer. Then keep track of which one - // of these currently holds the best predictor, and use the other - // one for future predictions. In the end, copy from tmp_buf to - // dst if necessary. 
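// A minimal sketch, with illustrative types, of the two-buffer scheme the
// comment above describes: whichever buffer holds the best predictor so
// far keeps that role, the other is recycled for the next candidate, and
// at most one copy into the real destination is needed at the end.
#include <stdint.h>

typedef struct { uint8_t *plane[3]; int stride[3]; } PredSet;

static void keep_best_pred(PredSet *best, PredSet *scratch, int64_t *best_rd,
                           int64_t this_rd) {
  if (this_rd < *best_rd) {
    *best_rd = this_rd;
    PredSet tmp = *best;  // the buffer just predicted into becomes "best";
    *best = *scratch;     // the old best becomes scratch for the next
    *scratch = tmp;       // prediction.
  }
}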
- for (i = 0; i < MAX_MB_PLANE; i++) { - tmp_dst.plane[i] = tmp_buf + i * MAX_SB_SQUARE; - tmp_dst.stride[i] = MAX_SB_SIZE; - } - for (i = 0; i < MAX_MB_PLANE; i++) { - orig_dst.plane[i] = xd->plane[i].dst.buf; - orig_dst.stride[i] = xd->plane[i].dst.stride; - } + ret_val = interpolation_filter_search( + x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst, args->single_filter, + &rd, &rs, &skip_txfm_sb, &skip_sse_sb); + if (ret_val != 0) { + early_terminate = INT64_MAX; + restore_dst_buf(xd, orig_dst, num_planes); + continue; + } else if (cpi->sf.model_based_post_interp_filter_breakout && + ref_best_rd != INT64_MAX && (rd / 6) > ref_best_rd) { + early_terminate = INT64_MAX; + restore_dst_buf(xd, orig_dst, num_planes); + if ((rd >> 4) > ref_best_rd) break; + continue; + } - // We don't include the cost of the second reference here, because there - // are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other - // words if you present them in that order, the second one is always known - // if the first is known. - // - // Under some circumstances we discount the cost of new mv mode to encourage - // initiation of a motion field. - if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]], mode_mv, - refs[0])) { - rd_stats->rate += AOMMIN( - cost_mv_ref(x, this_mode, mode_ctx), - cost_mv_ref(x, is_comp_pred ? NEAREST_NEARESTMV : NEARESTMV, mode_ctx)); - } else { - rd_stats->rate += cost_mv_ref(x, this_mode, mode_ctx); - } + if (is_comp_pred && comp_idx) { + int rate_sum, rs2; + int64_t dist_sum; + int64_t best_rd_compound = INT64_MAX, best_rd_cur = INT64_MAX; + int_mv best_mv[2]; + int best_tmp_rate_mv = rate_mv; + int tmp_skip_txfm_sb; + int64_t tmp_skip_sse_sb; + DECLARE_ALIGNED(16, uint8_t, pred0[2 * MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, pred1[2 * MAX_SB_SQUARE]); + uint8_t *preds0[1] = { pred0 }; + uint8_t *preds1[1] = { pred1 }; + int strides[1] = { bw }; + int tmp_rate_mv; + const int num_pix = 1 << num_pels_log2_lookup[bsize]; + COMPOUND_TYPE cur_type; + int best_compmode_interinter_cost = 0; + int can_use_previous = cm->allow_warped_motion; + + best_mv[0].as_int = cur_mv[0].as_int; + best_mv[1].as_int = cur_mv[1].as_int; - if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd && - mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) - return INT64_MAX; + if (masked_compound_used) { + // get inter predictors to use for masked compound modes + av1_build_inter_predictors_for_planes_single_buf( + xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides, + can_use_previous); + av1_build_inter_predictors_for_planes_single_buf( + xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides, + can_use_previous); + } + + int best_comp_group_idx = 0; + int best_compound_idx = 1; + for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) { + if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break; + if (!is_interinter_compound_used(cur_type, bsize)) continue; + tmp_rate_mv = rate_mv; + best_rd_cur = INT64_MAX; + mbmi->interinter_comp.type = cur_type; + int masked_type_cost = 0; + + const int comp_group_idx_ctx = get_comp_group_idx_context(xd); + const int comp_index_ctx = get_comp_index_context(cm, xd); + if (masked_compound_used) { + if (cur_type == COMPOUND_AVERAGE) { + mbmi->comp_group_idx = 0; + mbmi->compound_idx = 1; + + masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][0]; + masked_type_cost += x->comp_idx_cost[comp_index_ctx][1]; + } else { + mbmi->comp_group_idx = 1; + mbmi->compound_idx = 1; - int64_t ret_val = 
interpolation_filter_search( - x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst, args->single_filter, - &rd, &rs, &skip_txfm_sb, &skip_sse_sb); - if (ret_val != 0) return ret_val; + masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][1]; + masked_type_cost += + x->compound_type_cost[bsize][mbmi->interinter_comp.type - 1]; + } + } else { + mbmi->comp_group_idx = 0; + mbmi->compound_idx = 1; -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - best_bmc_mbmi = *mbmi; - rate2_bmc_nocoeff = rd_stats->rate; - if (cm->interp_filter == SWITCHABLE) rate2_bmc_nocoeff += rs; - rate_mv_bmc = rate_mv; -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + masked_type_cost += x->comp_idx_cost[comp_index_ctx][1]; + } + rs2 = masked_type_cost; + + switch (cur_type) { + case COMPOUND_AVERAGE: + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst, + bsize); + av1_subtract_plane(x, bsize, 0); + rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, + INT64_MAX); + if (rd != INT64_MAX) + best_rd_cur = + RDCOST(x->rdmult, rs2 + rate_mv + rate_sum, dist_sum); + break; + case COMPOUND_WEDGE: + if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh && + best_rd_compound / 3 < ref_best_rd) { + best_rd_cur = build_and_cost_compound_type( + cpi, x, cur_mv, bsize, this_mode, &rs2, rate_mv, &orig_dst, + &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col); + } + break; + case COMPOUND_DIFFWTD: + if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh && + best_rd_compound / 3 < ref_best_rd) { + best_rd_cur = build_and_cost_compound_type( + cpi, x, cur_mv, bsize, this_mode, &rs2, rate_mv, &orig_dst, + &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col); + } + break; + default: assert(0); return INT64_MAX; + } -#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT -#if CONFIG_COMPOUND_SINGLEREF - if (is_comp_pred || is_singleref_comp_mode) -#else - if (is_comp_pred) -#endif // CONFIG_COMPOUND_SINGLEREF - { - int rate_sum, rs2; - int64_t dist_sum; - int64_t best_rd_compound = INT64_MAX, best_rd_cur = INT64_MAX; - INTERINTER_COMPOUND_DATA best_compound_data; - int_mv best_mv[2]; - int best_tmp_rate_mv = rate_mv; - int tmp_skip_txfm_sb; - int64_t tmp_skip_sse_sb; - DECLARE_ALIGNED(16, uint8_t, pred0[2 * MAX_SB_SQUARE]); - DECLARE_ALIGNED(16, uint8_t, pred1[2 * MAX_SB_SQUARE]); - uint8_t *preds0[1] = { pred0 }; - uint8_t *preds1[1] = { pred1 }; - int strides[1] = { bw }; - int tmp_rate_mv; - int masked_compound_used = is_any_masked_compound_used(bsize); -#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE - masked_compound_used = masked_compound_used && cm->allow_masked_compound; -#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE - COMPOUND_TYPE cur_type; - int best_compmode_interinter_cost = 0; - - best_mv[0].as_int = cur_mv[0].as_int; - best_mv[1].as_int = cur_mv[1].as_int; - memset(&best_compound_data, 0, sizeof(best_compound_data)); -#if CONFIG_COMPOUND_SEGMENT - uint8_t tmp_mask_buf[2 * MAX_SB_SQUARE]; - best_compound_data.seg_mask = tmp_mask_buf; -#endif // CONFIG_COMPOUND_SEGMENT - -#if CONFIG_COMPOUND_SINGLEREF - // TODO(zoeliu): To further check whether the following setups are needed. - // Single ref compound mode: Prepare the 2nd ref frame predictor the same as - // the 1st one. 
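// A minimal sketch of the per-pixel blend behind the masked compound modes
// (COMPOUND_WEDGE, COMPOUND_DIFFWTD) searched in this loop: AV1 combines
// the two single-reference predictors with 6-bit mask weights in [0, 64].
#include <stdint.h>

static void blend_a64(uint8_t *dst, const uint8_t *p0, const uint8_t *p1,
                      const uint8_t *mask, int n) {
  for (int i = 0; i < n; ++i) {
    // Rounded weighted average: (m * p0 + (64 - m) * p1 + 32) >> 6.
    dst[i] = (uint8_t)((mask[i] * p0[i] + (64 - mask[i]) * p1[i] + 32) >> 6);
  }
}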
- if (!is_comp_pred && is_singleref_comp_mode) { - xd->block_refs[1] = xd->block_refs[0]; - for (i = 0; i < MAX_MB_PLANE; i++) - xd->plane[i].pre[1] = xd->plane[i].pre[0]; - } -#endif // CONFIG_COMPOUND_SINGLEREF - - if (masked_compound_used) { - // get inter predictors to use for masked compound modes - av1_build_inter_predictors_for_planes_single_buf( - xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides); - av1_build_inter_predictors_for_planes_single_buf( - xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides); - } - - for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) { - if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break; - if (!is_interinter_compound_used(cur_type, bsize)) continue; - tmp_rate_mv = rate_mv; - best_rd_cur = INT64_MAX; - mbmi->interinter_compound_type = cur_type; - int masked_type_cost = 0; - if (masked_compound_used) { -#if CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT - if (!is_interinter_compound_used(COMPOUND_WEDGE, bsize)) - masked_type_cost += av1_cost_literal(1); - else -#endif // CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT - masked_type_cost += - x->compound_type_cost[bsize][mbmi->interinter_compound_type]; - } - rs2 = av1_cost_literal(get_interinter_compound_type_bits( - bsize, mbmi->interinter_compound_type)) + - masked_type_cost; - - switch (cur_type) { - case COMPOUND_AVERAGE: - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst, - bsize); - av1_subtract_plane(x, bsize, 0); - rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, - INT64_MAX); - if (rd != INT64_MAX) - best_rd_cur = RDCOST(x->rdmult, rs2 + rate_mv + rate_sum, dist_sum); + if (best_rd_cur < best_rd_compound) { + best_comp_group_idx = mbmi->comp_group_idx; + best_compound_idx = mbmi->compound_idx; best_rd_compound = best_rd_cur; - break; -#if CONFIG_WEDGE - case COMPOUND_WEDGE: - if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh && - best_rd_compound / 3 < ref_best_rd) { - best_rd_cur = build_and_cost_compound_type( - cpi, x, cur_mv, bsize, this_mode, rs2, rate_mv, &orig_dst, - &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col); - } - break; -#endif // CONFIG_WEDGE -#if CONFIG_COMPOUND_SEGMENT - case COMPOUND_SEG: - if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh && - best_rd_compound / 3 < ref_best_rd) { - best_rd_cur = build_and_cost_compound_type( - cpi, x, cur_mv, bsize, this_mode, rs2, rate_mv, &orig_dst, - &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col); - } - break; -#endif // CONFIG_COMPOUND_SEGMENT - default: assert(0); return 0; - } - - if (best_rd_cur < best_rd_compound) { - best_rd_compound = best_rd_cur; -#if CONFIG_WEDGE - best_compound_data.wedge_index = mbmi->wedge_index; - best_compound_data.wedge_sign = mbmi->wedge_sign; -#endif // CONFIG_WEDGE -#if CONFIG_COMPOUND_SEGMENT - best_compound_data.mask_type = mbmi->mask_type; - memcpy(best_compound_data.seg_mask, xd->seg_mask, - 2 * MAX_SB_SQUARE * sizeof(uint8_t)); -#endif // CONFIG_COMPOUND_SEGMENT - best_compound_data.interinter_compound_type = - mbmi->interinter_compound_type; - best_compmode_interinter_cost = rs2; - if (have_newmv_in_inter_mode(this_mode)) { - if (use_masked_motion_search(cur_type)) { - best_tmp_rate_mv = tmp_rate_mv; - best_mv[0].as_int = mbmi->mv[0].as_int; - best_mv[1].as_int = mbmi->mv[1].as_int; - } else { - best_mv[0].as_int = cur_mv[0].as_int; - best_mv[1].as_int = cur_mv[1].as_int; + best_compound_data = mbmi->interinter_comp; + memcpy(tmp_best_mask_buf, xd->seg_mask, + 2 * 
num_pix * sizeof(uint8_t)); + best_compmode_interinter_cost = rs2; + if (have_newmv_in_inter_mode(this_mode)) { + if (use_masked_motion_search(cur_type)) { + best_tmp_rate_mv = tmp_rate_mv; + best_mv[0].as_int = mbmi->mv[0].as_int; + best_mv[1].as_int = mbmi->mv[1].as_int; + } else { + best_mv[0].as_int = cur_mv[0].as_int; + best_mv[1].as_int = cur_mv[1].as_int; + } } } + // reset to original mvs for next iteration + mbmi->mv[0].as_int = cur_mv[0].as_int; + mbmi->mv[1].as_int = cur_mv[1].as_int; + } + mbmi->comp_group_idx = best_comp_group_idx; + mbmi->compound_idx = best_compound_idx; + mbmi->interinter_comp = best_compound_data; + assert(IMPLIES(mbmi->comp_group_idx == 1, + mbmi->interinter_comp.type != COMPOUND_AVERAGE)); + memcpy(xd->seg_mask, tmp_best_mask_buf, 2 * num_pix * sizeof(uint8_t)); + if (have_newmv_in_inter_mode(this_mode)) { + mbmi->mv[0].as_int = best_mv[0].as_int; + mbmi->mv[1].as_int = best_mv[1].as_int; + if (use_masked_motion_search(mbmi->interinter_comp.type)) { + rd_stats->rate += best_tmp_rate_mv - rate_mv; + rate_mv = best_tmp_rate_mv; + } } - // reset to original mvs for next iteration - mbmi->mv[0].as_int = cur_mv[0].as_int; - mbmi->mv[1].as_int = cur_mv[1].as_int; - } -#if CONFIG_WEDGE - mbmi->wedge_index = best_compound_data.wedge_index; - mbmi->wedge_sign = best_compound_data.wedge_sign; -#endif // CONFIG_WEDGE -#if CONFIG_COMPOUND_SEGMENT - mbmi->mask_type = best_compound_data.mask_type; - memcpy(xd->seg_mask, best_compound_data.seg_mask, - 2 * MAX_SB_SQUARE * sizeof(uint8_t)); -#endif // CONFIG_COMPOUND_SEGMENT - mbmi->interinter_compound_type = - best_compound_data.interinter_compound_type; - if (have_newmv_in_inter_mode(this_mode)) { - mbmi->mv[0].as_int = best_mv[0].as_int; - mbmi->mv[1].as_int = best_mv[1].as_int; - xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int; - xd->mi[0]->bmi[0].as_mv[1].as_int = mbmi->mv[1].as_int; - if (use_masked_motion_search(mbmi->interinter_compound_type)) { - rd_stats->rate += best_tmp_rate_mv - rate_mv; - rate_mv = best_tmp_rate_mv; - } - } - if (ref_best_rd < INT64_MAX && best_rd_compound / 3 > ref_best_rd) { - restore_dst_buf(xd, orig_dst); - return INT64_MAX; + if (ref_best_rd < INT64_MAX && best_rd_compound / 3 > ref_best_rd) { + restore_dst_buf(xd, orig_dst, num_planes); + early_terminate = INT64_MAX; + continue; + } + compmode_interinter_cost = best_compmode_interinter_cost; } - pred_exists = 0; - - compmode_interinter_cost = best_compmode_interinter_cost; - } -#endif // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT - -#if CONFIG_INTERINTRA - if (is_comp_interintra_pred) { - INTERINTRA_MODE best_interintra_mode = II_DC_PRED; - int64_t best_interintra_rd = INT64_MAX; - int rmode, rate_sum; - int64_t dist_sum; - int j; - int tmp_rate_mv = 0; - int tmp_skip_txfm_sb; - int64_t tmp_skip_sse_sb; - DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_SB_SQUARE]); - uint8_t *intrapred; - -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - intrapred = CONVERT_TO_BYTEPTR(intrapred_); - else -#endif // CONFIG_HIGHBITDEPTH - intrapred = intrapred_; - - mbmi->ref_frame[1] = NONE_FRAME; - for (j = 0; j < MAX_MB_PLANE; j++) { - xd->plane[j].dst.buf = tmp_buf + j * MAX_SB_SQUARE; - xd->plane[j].dst.stride = bw; + if (is_comp_pred) { + int tmp_rate; + int64_t tmp_dist; + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, &orig_dst, bsize); + model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, + &tmp_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate, + plane_sse, plane_dist); + rd = RDCOST(x->rdmult, 
rs + tmp_rate, tmp_dist); + } + + if (search_jnt_comp) { + // if 1/2 model rd is larger than best_rd in jnt_comp mode, + // use jnt_comp mode, save additional search + if ((rd >> 1) > best_rd) { + restore_dst_buf(xd, orig_dst, num_planes); + continue; + } } - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst, bsize); - restore_dst_buf(xd, orig_dst); - mbmi->ref_frame[1] = INTRA_FRAME; - mbmi->use_wedge_interintra = 0; - for (j = 0; j < INTERINTRA_MODES; ++j) { - mbmi->interintra_mode = (INTERINTRA_MODE)j; - rmode = interintra_mode_cost[mbmi->interintra_mode]; - av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, &orig_dst, - intrapred, bw); - av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); - model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb); - rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum); - if (rd < best_interintra_rd) { - best_interintra_rd = rd; - best_interintra_mode = mbmi->interintra_mode; - } - } - mbmi->interintra_mode = best_interintra_mode; - rmode = interintra_mode_cost[mbmi->interintra_mode]; - av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, &orig_dst, - intrapred, bw); - av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); - av1_subtract_plane(x, bsize, 0); - rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); - if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, rate_mv + rmode + rate_sum, dist_sum); - best_interintra_rd = rd; + if (!is_comp_pred) + args->single_filter[this_mode][refs[0]] = + av1_extract_interp_filter(mbmi->interp_filters, 0); - if (ref_best_rd < INT64_MAX && best_interintra_rd > 2 * ref_best_rd) { - // Don't need to call restore_dst_buf here - return INT64_MAX; - } -#if CONFIG_WEDGE - if (is_interintra_wedge_used(bsize)) { - int64_t best_interintra_rd_nowedge = INT64_MAX; - int64_t best_interintra_rd_wedge = INT64_MAX; - int_mv tmp_mv; - int rwedge = av1_cost_bit(cm->fc->wedge_interintra_prob[bsize], 0); - if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, rmode + rate_mv + rwedge + rate_sum, dist_sum); - best_interintra_rd_nowedge = best_interintra_rd; - - // Disable wedge search if source variance is small - if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) { - mbmi->use_wedge_interintra = 1; - - rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) + - av1_cost_bit(cm->fc->wedge_interintra_prob[bsize], 1); - - best_interintra_rd_wedge = - pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); - - best_interintra_rd_wedge += - RDCOST(x->rdmult, rmode + rate_mv + rwedge, 0); - // Refine motion vector. 
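// A minimal sketch of the early-termination pattern used repeatedly above
// ((rd >> 1) > best_rd, rd / 4 * 3 > mrd, rd / 2 > ref_best_rd): a cheap
// model estimate of this mode's cost is compared against the best cost so
// far with a safety margin, and the mode is dropped when even the
// optimistic estimate cannot win. The margin parameters are illustrative.
#include <stdint.h>

static int prune_by_model(int64_t model_rd, int64_t best_rd, int margin_num,
                          int margin_den) {
  if (best_rd == INT64_MAX) return 0;  // no reference cost to beat yet
  return model_rd / margin_den * margin_num > best_rd;
}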
- if (have_newmv_in_inter_mode(this_mode)) { - // get negative of mask - const uint8_t *mask = av1_get_contiguous_soft_mask( - mbmi->interintra_wedge_index, 1, bsize); - tmp_mv.as_int = x->mbmi_ext->ref_mvs[refs[0]][0].as_int; - compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row, - mi_col, intrapred, mask, bw, - &tmp_rate_mv, 0); - mbmi->mv[0].as_int = tmp_mv.as_int; - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst, - bsize); - model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb); - rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum, - dist_sum); - if (rd >= best_interintra_rd_wedge) { - tmp_mv.as_int = cur_mv[0].as_int; - tmp_rate_mv = rate_mv; - } - } else { - tmp_mv.as_int = cur_mv[0].as_int; - tmp_rate_mv = rate_mv; - av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); - } - // Evaluate closer to true rd - av1_subtract_plane(x, bsize, 0); - rd = - estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); - if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum, - dist_sum); - best_interintra_rd_wedge = rd; - if (best_interintra_rd_wedge < best_interintra_rd_nowedge) { - mbmi->use_wedge_interintra = 1; - mbmi->mv[0].as_int = tmp_mv.as_int; - rd_stats->rate += tmp_rate_mv - rate_mv; - rate_mv = tmp_rate_mv; - } else { - mbmi->use_wedge_interintra = 0; - mbmi->mv[0].as_int = cur_mv[0].as_int; + if (args->modelled_rd != NULL) { + if (is_comp_pred) { + const int mode0 = compound_ref0_mode(this_mode); + const int mode1 = compound_ref1_mode(this_mode); + const int64_t mrd = AOMMIN(args->modelled_rd[mode0][refs[0]], + args->modelled_rd[mode1][refs[1]]); + if (rd / 4 * 3 > mrd && ref_best_rd < INT64_MAX) { + restore_dst_buf(xd, orig_dst, num_planes); + early_terminate = INT64_MAX; + continue; } } else { - mbmi->use_wedge_interintra = 0; + args->modelled_rd[this_mode][refs[0]] = rd; } } -#endif // CONFIG_WEDGE - pred_exists = 0; - compmode_interintra_cost = - av1_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 1) + - interintra_mode_cost[mbmi->interintra_mode]; - if (is_interintra_wedge_used(bsize)) { - compmode_interintra_cost += av1_cost_bit( - cm->fc->wedge_interintra_prob[bsize], mbmi->use_wedge_interintra); - if (mbmi->use_wedge_interintra) { - compmode_interintra_cost += - av1_cost_literal(get_interintra_wedge_bits(bsize)); + if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) { + // if current pred_error modeled rd is substantially more than the best + // so far, do not bother doing full rd + if (rd / 2 > ref_best_rd) { + restore_dst_buf(xd, orig_dst, num_planes); + early_terminate = INT64_MAX; + continue; } } - } else if (is_interintra_allowed(mbmi)) { - compmode_interintra_cost = - av1_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 0); - } -#endif // CONFIG_INTERINTRA - if (pred_exists == 0) { - int tmp_rate; - int64_t tmp_dist; - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, &orig_dst, bsize); - model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, - &tmp_dist, &skip_txfm_sb, &skip_sse_sb); - rd = RDCOST(x->rdmult, rs + tmp_rate, tmp_dist); - } - - if (!is_comp_pred) - args->single_filter[this_mode][refs[0]] = - av1_extract_interp_filter(mbmi->interp_filters, 0); + rd_stats->rate += compmode_interinter_cost; - if (args->modelled_rd != NULL) { - if (is_comp_pred) { - const int mode0 = compound_ref0_mode(this_mode); - const int mode1 = 
compound_ref1_mode(this_mode); - const int64_t mrd = AOMMIN(args->modelled_rd[mode0][refs[0]], - args->modelled_rd[mode1][refs[1]]); - if (rd / 4 * 3 > mrd && ref_best_rd < INT64_MAX) { - restore_dst_buf(xd, orig_dst); - return INT64_MAX; + if (search_jnt_comp && cpi->sf.jnt_comp_fast_tx_search && comp_idx == 0) { + // TODO(chengchen): this speed feature introduces big loss. + // Need better estimation of rate distortion. + rd_stats->rate += rs; + rd_stats->rate += plane_rate[0] + plane_rate[1] + plane_rate[2]; + rd_stats_y->rate = plane_rate[0]; + rd_stats_uv->rate = plane_rate[1] + plane_rate[2]; + rd_stats->sse = plane_sse[0] + plane_sse[1] + plane_sse[2]; + rd_stats_y->sse = plane_sse[0]; + rd_stats_uv->sse = plane_sse[1] + plane_sse[2]; + rd_stats->dist = plane_dist[0] + plane_dist[1] + plane_dist[2]; + rd_stats_y->dist = plane_dist[0]; + rd_stats_uv->dist = plane_dist[1] + plane_dist[2]; + } else { +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, + disable_skip, mi_row, mi_col, args, ref_best_rd, + refs, rate_mv, &orig_dst, best_est_rd); +#else + ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, + disable_skip, mi_row, mi_col, args, ref_best_rd, + refs, rate_mv, &orig_dst); +#endif + } + if (ret_val != INT64_MAX) { + if (search_jnt_comp) { + int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (tmp_rd < best_rd) { + best_rd_stats = *rd_stats; + best_rd_stats_y = *rd_stats_y; + best_rd_stats_uv = *rd_stats_uv; + best_ret_val = ret_val; + best_rd = tmp_rd; + best_mbmi = *mbmi; + memcpy(best_blk_skip, x->blk_skip, + sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w); + } + if (tmp_rd < ref_best_rd) { + ref_best_rd = tmp_rd; + } } - } else if (!is_comp_interintra_pred) { - args->modelled_rd[this_mode][refs[0]] = rd; } - } - - if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) { - // if current pred_error modeled rd is substantially more than the best - // so far, do not bother doing full rd - if (rd / 2 > ref_best_rd) { - restore_dst_buf(xd, orig_dst); - return INT64_MAX; + if (!search_jnt_comp && ret_val != 0) { + restore_dst_buf(xd, orig_dst, num_planes); + return ret_val; } + restore_dst_buf(xd, orig_dst, num_planes); } -#if CONFIG_INTERINTRA - rd_stats->rate += compmode_interintra_cost; -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - rate2_bmc_nocoeff += compmode_interintra_cost; -#endif -#endif -#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT - rd_stats->rate += compmode_interinter_cost; -#endif - - ret_val = motion_mode_rd( - cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, disable_skip, mode_mv, - mi_row, mi_col, args, ref_best_rd, refs, rate_mv, -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - single_newmv, rate2_bmc_nocoeff, &best_bmc_mbmi, rate_mv_bmc, -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - rs, &skip_txfm_sb, &skip_sse_sb, &orig_dst); + // re-instate status of the best choice + if (is_comp_pred && best_ret_val != INT64_MAX) { + *rd_stats = best_rd_stats; + *rd_stats_y = best_rd_stats_y; + *rd_stats_uv = best_rd_stats_uv; + ret_val = best_ret_val; + *mbmi = best_mbmi; + assert(IMPLIES(mbmi->comp_group_idx == 1, + mbmi->interinter_comp.type != COMPOUND_AVERAGE)); + memcpy(x->blk_skip, best_blk_skip, + sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w); + } + if (early_terminate == INT64_MAX) return INT64_MAX; if (ret_val != 0) return ret_val; - - return 0; // The rate-distortion cost will be re-calculated by caller. 
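// A minimal sketch, with simplified fields and cost formula, of the
// bookkeeping in the comp_idx loop above: snapshot the candidate state
// whenever it beats the running best, then re-instate the snapshot after
// the loop so the winner is what this function returns.
#include <stdint.h>
#include <string.h>

typedef struct { int rate; int64_t dist; uint8_t blk_skip[64]; } CandState;

static void track_best(CandState *best, int64_t *best_rd,
                       const CandState *cur, int64_t rdmult) {
  const int64_t rd = rdmult * cur->rate + cur->dist;  // simplified RDCOST
  if (rd < *best_rd) {
    *best_rd = rd;
    memcpy(best, cur, sizeof(*best));  // snapshot the new winner
  }
}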
+ return RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); } -#if CONFIG_INTRABC static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, BLOCK_SIZE bsize, int64_t best_rd) { const AV1_COMMON *const cm = &cpi->common; - if (!av1_allow_intrabc(bsize, cm)) return INT64_MAX; + if (!av1_allow_intrabc(cm)) return INT64_MAX; + const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; const TileInfo *tile = &xd->tile; - MODE_INFO *const mi = xd->mi[0]; + MB_MODE_INFO *mbmi = xd->mi[0]; const int mi_row = -xd->mb_to_top_edge / (8 * MI_SIZE); const int mi_col = -xd->mb_to_left_edge / (8 * MI_SIZE); const int w = block_size_wide[bsize]; const int h = block_size_high[bsize]; - const int sb_row = mi_row / MAX_MIB_SIZE; - const int sb_col = mi_col / MAX_MIB_SIZE; + const int sb_row = mi_row >> cm->seq_params.mib_size_log2; + const int sb_col = mi_col >> cm->seq_params.mib_size_log2; MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; MV_REFERENCE_FRAME ref_frame = INTRA_FRAME; - int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame]; - av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame], - mbmi_ext->ref_mv_stack[ref_frame], - mbmi_ext->compound_mode_context, candidates, mi_row, mi_col, - NULL, NULL, mbmi_ext->mode_context); + av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, + mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row, + mi_col, mbmi_ext->mode_context); int_mv nearestmv, nearmv; - av1_find_best_ref_mvs(0, candidates, &nearestmv, &nearmv); + av1_find_best_ref_mvs_from_stack(0, mbmi_ext, ref_frame, &nearestmv, &nearmv, + 0); int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv; - if (dv_ref.as_int == 0) av1_find_ref_dv(&dv_ref, mi_row, mi_col); - mbmi_ext->ref_mvs[INTRA_FRAME][0] = dv_ref; + if (dv_ref.as_int == 0) + av1_find_ref_dv(&dv_ref, tile, cm->seq_params.mib_size, mi_row, mi_col); + // Ref DV should not have sub-pel. 
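// A minimal sketch of the displacement-vector reference selection above,
// with plain ints standing in for int_mv: prefer the nearest candidate,
// fall back to the "near" one when the nearest is the zero vector, and
// finally fall back to a position-derived default (what av1_find_ref_dv
// supplies) when both are zero.
static int pick_dv_ref(int nearest, int near, int position_default) {
  const int dv = (nearest != 0) ? nearest : near;
  return (dv != 0) ? dv : position_default;
}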
+ assert((dv_ref.as_mv.col & 7) == 0); + assert((dv_ref.as_mv.row & 7) == 0); + mbmi_ext->ref_mv_stack[INTRA_FRAME][0].this_mv = dv_ref; struct buf_2d yv12_mb[MAX_MB_PLANE]; - av1_setup_pred_block(xd, yv12_mb, xd->cur_buf, mi_row, mi_col, NULL, NULL); - for (int i = 0; i < MAX_MB_PLANE; ++i) { + av1_setup_pred_block(xd, yv12_mb, xd->cur_buf, mi_row, mi_col, NULL, NULL, + num_planes); + for (int i = 0; i < num_planes; ++i) { xd->plane[i].pre[0] = yv12_mb[i]; } @@ -9853,11 +8843,11 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, IBC_MOTION_DIRECTIONS }; - MB_MODE_INFO *mbmi = &mi->mbmi; MB_MODE_INFO best_mbmi = *mbmi; RD_STATS best_rdcost = *rd_cost; int best_skip = x->skip; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 }; for (enum IntrabcMotionDirection dir = IBC_MOTION_ABOVE; dir < IBC_MOTION_DIRECTIONS; ++dir) { const MvLimits tmp_mv_limits = x->mv_limits; @@ -9866,16 +8856,18 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, x->mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE; x->mv_limits.col_max = (tile->mi_col_end - mi_col) * MI_SIZE - w; x->mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE; - x->mv_limits.row_max = (sb_row * MAX_MIB_SIZE - mi_row) * MI_SIZE - h; + x->mv_limits.row_max = + (sb_row * cm->seq_params.mib_size - mi_row) * MI_SIZE - h; break; case IBC_MOTION_LEFT: x->mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE; - x->mv_limits.col_max = (sb_col * MAX_MIB_SIZE - mi_col) * MI_SIZE - w; + x->mv_limits.col_max = + (sb_col * cm->seq_params.mib_size - mi_col) * MI_SIZE - w; // TODO(aconverse@google.com): Minimize the overlap between above and // left areas. x->mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE; int bottom_coded_mi_edge = - AOMMIN((sb_row + 1) * MAX_MIB_SIZE, tile->mi_row_end); + AOMMIN((sb_row + 1) * cm->seq_params.mib_size, tile->mi_row_end); x->mv_limits.row_max = (bottom_coded_mi_edge - mi_row) * MI_SIZE - h; break; default: assert(0); @@ -9898,66 +8890,67 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, mvp_full.row >>= 3; int sadpb = x->sadperbit16; int cost_list[5]; -#if CONFIG_HASH_ME int bestsme = av1_full_pixel_search( cpi, x, bsize, &mvp_full, step_param, sadpb, cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1, (MI_SIZE * mi_col), (MI_SIZE * mi_row), 1); -#else - int bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, - sadpb, cond_cost_list(cpi, cost_list), - &dv_ref.as_mv, INT_MAX, 1); -#endif x->mv_limits = tmp_mv_limits; if (bestsme == INT_MAX) continue; mvp_full = x->best_mv.as_mv; - MV dv = {.row = mvp_full.row * 8, .col = mvp_full.col * 8 }; + MV dv = { .row = mvp_full.row * 8, .col = mvp_full.col * 8 }; if (mv_check_bounds(&x->mv_limits, &dv)) continue; - if (!is_dv_valid(dv, tile, mi_row, mi_col, bsize)) continue; + if (!av1_is_dv_valid(dv, cm, xd, mi_row, mi_col, bsize, + cm->seq_params.mib_size_log2)) + continue; + // DV should not have sub-pel. 
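// A minimal sketch of the full-pel constraint asserted here: block-copy
// DVs are stored in 1/8-pel units like ordinary motion vectors, so a
// legal DV has its three fractional bits clear, and the full-pel value is
// obtained with the same >> 3 used on mvp_full above.
static int dv_is_fullpel(int dv_component) { return (dv_component & 7) == 0; }
static int dv_to_fullpel(int dv_component) { return dv_component >> 3; }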
+ assert((dv.col & 7) == 0); + assert((dv.row & 7) == 0); memset(&mbmi->palette_mode_info, 0, sizeof(mbmi->palette_mode_info)); + mbmi->filter_intra_mode_info.use_filter_intra = 0; mbmi->use_intrabc = 1; mbmi->mode = DC_PRED; mbmi->uv_mode = UV_DC_PRED; + mbmi->motion_mode = SIMPLE_TRANSLATION; mbmi->mv[0].as_mv = dv; mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR); mbmi->skip = 0; x->skip = 0; av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); - assert(x->mvcost == x->mv_cost_stack[0]); + int *dvcost[2] = { (int *)&cpi->dv_cost[0][MV_MAX], + (int *)&cpi->dv_cost[1][MV_MAX] }; // TODO(aconverse@google.com): The full motion field defining discount // in MV_COST_WEIGHT is too large. Explore other values. - int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, x->nmvjointcost, - x->mvcost, MV_COST_WEIGHT_SUB); + int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, cpi->dv_joint_cost, + dvcost, MV_COST_WEIGHT_SUB); const int rate_mode = x->intrabc_cost[1]; RD_STATS rd_stats, rd_stats_uv; av1_subtract_plane(x, bsize, 0); - super_block_yrd(cpi, x, &rd_stats, bsize, INT64_MAX); - super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); - av1_merge_rd_stats(&rd_stats, &rd_stats_uv); + if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { + // Intrabc + select_tx_type_yrd(cpi, x, &rd_stats, bsize, mi_row, mi_col, INT64_MAX); + } else { + super_block_yrd(cpi, x, &rd_stats, bsize, INT64_MAX); + memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); + memset(x->blk_skip, rd_stats.skip, + sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + } + if (num_planes > 1) { + super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); + av1_merge_rd_stats(&rd_stats, &rd_stats_uv); + } #if CONFIG_RD_DEBUG mbmi->rd_stats = rd_stats; #endif -#if CONFIG_VAR_TX - // TODO(aconverse@google.com): Evaluate allowing VAR TX on intrabc blocks - const int width = block_size_wide[bsize] >> tx_size_wide_log2[0]; - const int height = block_size_high[bsize] >> tx_size_high_log2[0]; - int idx, idy; - for (idy = 0; idy < height; ++idy) - for (idx = 0; idx < width; ++idx) - mbmi->inter_tx_size[idy >> 1][idx >> 1] = mbmi->tx_size; - mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size); -#endif // CONFIG_VAR_TX - - const aom_prob skip_prob = av1_get_skip_prob(cm, xd); + const int skip_ctx = av1_get_skip_context(xd); RD_STATS rdc_noskip; av1_init_rd_stats(&rdc_noskip); rdc_noskip.rate = - rate_mode + rate_mv + rd_stats.rate + av1_cost_bit(skip_prob, 0); + rate_mode + rate_mv + rd_stats.rate + x->skip_cost[skip_ctx][0]; rdc_noskip.dist = rd_stats.dist; rdc_noskip.rdcost = RDCOST(x->rdmult, rdc_noskip.rate, rdc_noskip.dist); if (rdc_noskip.rdcost < best_rd) { @@ -9965,98 +8958,88 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, best_mbmi = *mbmi; best_skip = x->skip; best_rdcost = rdc_noskip; + memcpy(best_blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); } - x->skip = 1; - mbmi->skip = 1; - RD_STATS rdc_skip; - av1_init_rd_stats(&rdc_skip); - rdc_skip.rate = rate_mode + rate_mv + av1_cost_bit(skip_prob, 1); - rdc_skip.dist = rd_stats.sse; - rdc_skip.rdcost = RDCOST(x->rdmult, rdc_skip.rate, rdc_skip.dist); - if (rdc_skip.rdcost < best_rd) { - best_rd = rdc_skip.rdcost; - best_mbmi = *mbmi; - best_skip = x->skip; - best_rdcost = rdc_skip; + if (!xd->lossless[mbmi->segment_id]) { + x->skip = 1; + mbmi->skip = 1; + RD_STATS rdc_skip; + av1_init_rd_stats(&rdc_skip); + rdc_skip.rate = rate_mode + rate_mv + x->skip_cost[skip_ctx][1]; + 
rdc_skip.dist = rd_stats.sse; + rdc_skip.rdcost = RDCOST(x->rdmult, rdc_skip.rate, rdc_skip.dist); + if (rdc_skip.rdcost < best_rd) { + best_rd = rdc_skip.rdcost; + best_mbmi = *mbmi; + best_skip = x->skip; + best_rdcost = rdc_skip; + memcpy(best_blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + } } } *mbmi = best_mbmi; *rd_cost = best_rdcost; x->skip = best_skip; + memcpy(x->blk_skip, best_blk_skip, + sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); return best_rd; } -#endif // CONFIG_INTRABC -void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, - RD_STATS *rd_cost, BLOCK_SIZE bsize, +void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row, + int mi_col, RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - struct macroblockd_plane *const pd = xd->plane; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int num_planes = av1_num_planes(cm); int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0; int y_skip = 0, uv_skip = 0; int64_t dist_y = 0, dist_uv = 0; TX_SIZE max_uv_tx_size; - const int unify_bsize = CONFIG_CB4X4; ctx->skip = 0; mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE_FRAME; -#if CONFIG_INTRABC mbmi->use_intrabc = 0; mbmi->mv[0].as_int = 0; -#endif // CONFIG_INTRABC -#if CONFIG_LGT_FROM_PRED - mbmi->use_lgt = 0; -#endif const int64_t intra_yrd = - (bsize >= BLOCK_8X8 || unify_bsize) - ? rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y, - &y_skip, bsize, best_rd) - : rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly, - &dist_y, &y_skip, best_rd); + rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y, + &y_skip, bsize, best_rd, ctx); if (intra_yrd < best_rd) { -#if CONFIG_CFL -#if CONFIG_CB4X4 // Only store reconstructed luma when there's chroma RDO. When there's no // chroma RDO, the reconstructed luma will be stored in encode_superblock(). - xd->cfl->store_y = !x->skip_chroma_rd; -#else - xd->cfl->store_y = 1; -#endif // CONFIG_CB4X4 - if (xd->cfl->store_y) { - // Perform one extra call to txfm_rd_in_plane(), with the values chosen - // during luma RDO, so we can store reconstructed luma values - RD_STATS this_rd_stats; - txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, AOM_PLANE_Y, - mbmi->sb_type, mbmi->tx_size, - cpi->sf.use_fast_coef_costing); - xd->cfl->store_y = 0; - } -#endif // CONFIG_CFL - max_uv_tx_size = uv_txsize_lookup[bsize][mbmi->tx_size][pd[1].subsampling_x] - [pd[1].subsampling_y]; - init_sbuv_mode(mbmi); -#if CONFIG_CB4X4 - if (!x->skip_chroma_rd) - rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv, - &uv_skip, bsize, max_uv_tx_size); -#else - rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv, - &uv_skip, AOMMAX(BLOCK_8X8, bsize), max_uv_tx_size); -#endif // CONFIG_CB4X4 + xd->cfl.is_chroma_reference = is_chroma_reference( + mi_row, mi_col, bsize, cm->subsampling_x, cm->subsampling_y); + xd->cfl.store_y = store_cfl_required_rdo(cm, x); + if (xd->cfl.store_y) { + // Restore reconstructed luma values. 
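// A minimal sketch of chroma-from-luma (CfL) prediction, the reason the
// reconstructed luma values are kept for chroma RDO here: each chroma
// pixel is the chroma DC prediction plus a scaled, mean-subtracted luma
// sample. This sketch assumes luma_ac in pixel units, alpha signalled in
// eighths ("q3"), and 8-bit output; rounding is simplified.
#include <stdint.h>

static void cfl_predict(const int16_t *luma_ac, uint8_t *dst, int n,
                        int dc_pred, int alpha_q3) {
  for (int i = 0; i < n; ++i) {
    const int v = dc_pred + (alpha_q3 * luma_ac[i]) / 8;
    dst[i] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}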
+ memcpy(x->blk_skip, ctx->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + av1_encode_intra_block_plane(cpi, x, bsize, AOM_PLANE_Y, + cpi->optimize_seg_arr[mbmi->segment_id], + mi_row, mi_col); + xd->cfl.store_y = 0; + } + if (num_planes > 1) { + max_uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd); + init_sbuv_mode(mbmi); + if (!x->skip_chroma_rd) + rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv, + &uv_skip, bsize, max_uv_tx_size); + } if (y_skip && (uv_skip || x->skip_chroma_rd)) { rd_cost->rate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly + - av1_cost_bit(av1_get_skip_prob(cm, xd), 1); + x->skip_cost[av1_get_skip_context(xd)][1]; rd_cost->dist = dist_y + dist_uv; } else { rd_cost->rate = - rate_y + rate_uv + av1_cost_bit(av1_get_skip_prob(cm, xd), 0); + rate_y + rate_uv + x->skip_cost[av1_get_skip_context(xd)][0]; rd_cost->dist = dist_y + dist_uv; } rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist); @@ -10064,125 +9047,47 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, rd_cost->rate = INT_MAX; } -#if CONFIG_INTRABC if (rd_cost->rate != INT_MAX && rd_cost->rdcost < best_rd) best_rd = rd_cost->rdcost; if (rd_pick_intrabc_mode_sb(cpi, x, rd_cost, bsize, best_rd) < best_rd) { - ctx->skip = x->skip; // FIXME where is the proper place to set this?! + ctx->skip = x->skip; + memcpy(ctx->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); assert(rd_cost->rate != INT_MAX); - rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist); } -#endif if (rd_cost->rate == INT_MAX) return; ctx->mic = *xd->mi[0]; ctx->mbmi_ext = *x->mbmi_ext; } -// Do we have an internal image edge (e.g. formatting bars). -int av1_internal_image_edge(const AV1_COMP *cpi) { - return (cpi->oxcf.pass == 2) && - ((cpi->twopass.this_frame_stats.inactive_zone_rows > 0) || - (cpi->twopass.this_frame_stats.inactive_zone_cols > 0)); -} - -// Checks to see if a super block is on a horizontal image edge. -// In most cases this is the "real" edge unless there are formatting -// bars embedded in the stream. -int av1_active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) { - int top_edge = 0; - int bottom_edge = cpi->common.mi_rows; - int is_active_h_edge = 0; - - // For two pass account for any formatting bars detected. - if (cpi->oxcf.pass == 2) { - const TWO_PASS *const twopass = &cpi->twopass; - - // The inactive region is specified in MBs not mi units. - // The image edge is in the following MB row. - top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2); - - bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2); - bottom_edge = AOMMAX(top_edge, bottom_edge); - } - - if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) || - ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) { - is_active_h_edge = 1; - } - return is_active_h_edge; -} - -// Checks to see if a super block is on a vertical image edge. -// In most cases this is the "real" edge unless there are formatting -// bars embedded in the stream. -int av1_active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) { - int left_edge = 0; - int right_edge = cpi->common.mi_cols; - int is_active_v_edge = 0; - - // For two pass account for any formatting bars detected. - if (cpi->oxcf.pass == 2) { - const TWO_PASS *const twopass = &cpi->twopass; - - // The inactive region is specified in MBs not mi units. - // The image edge is in the following MB row. 
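// A minimal sketch of the interval test inside the active-edge helpers
// being removed here: the two-pass stats record formatting bars as
// inactive rows/cols (converted from MB to mi units above), and a
// superblock lies on an active edge when the active region's boundary
// falls inside the superblock's mi range.
static int edge_in_range(int edge_mi, int mi_pos, int mi_step) {
  return edge_mi >= mi_pos && edge_mi < mi_pos + mi_step;
}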
- left_edge += (int)(twopass->this_frame_stats.inactive_zone_cols * 2); - - right_edge -= (int)(twopass->this_frame_stats.inactive_zone_cols * 2); - right_edge = AOMMAX(left_edge, right_edge); - } - - if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) || - ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) { - is_active_v_edge = 1; - } - return is_active_v_edge; -} - -// Checks to see if a super block is at the edge of the active image. -// In most cases this is the "real" edge unless there are formatting -// bars embedded in the stream. -int av1_active_edge_sb(const AV1_COMP *cpi, int mi_row, int mi_col) { - return av1_active_h_edge(cpi, mi_row, cpi->common.mib_size) || - av1_active_v_edge(cpi, mi_col, cpi->common.mib_size); -} - static void restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) { MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; const BLOCK_SIZE bsize = mbmi->sb_type; - assert(bsize >= BLOCK_8X8); int src_stride = x->plane[1].src.stride; const uint8_t *const src_u = x->plane[1].src.buf; const uint8_t *const src_v = x->plane[2].src.buf; - float *const data = x->palette_buffer->kmeans_data_buf; - float centroids[2 * PALETTE_MAX_SIZE]; + int *const data = x->palette_buffer->kmeans_data_buf; + int centroids[2 * PALETTE_MAX_SIZE]; uint8_t *const color_map = xd->plane[1].color_index_map; int r, c; -#if CONFIG_HIGHBITDEPTH const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u); const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v); -#endif // CONFIG_HIGHBITDEPTH int plane_block_width, plane_block_height, rows, cols; av1_get_block_dimensions(bsize, 1, xd, &plane_block_width, &plane_block_height, &rows, &cols); - (void)cpi; for (r = 0; r < rows; ++r) { for (c = 0; c < cols; ++c) { -#if CONFIG_HIGHBITDEPTH if (cpi->common.use_highbitdepth) { data[(r * cols + c) * 2] = src_u16[r * src_stride + c]; data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c]; } else { -#endif // CONFIG_HIGHBITDEPTH data[(r * cols + c) * 2] = src_u[r * src_stride + c]; data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c]; -#if CONFIG_HIGHBITDEPTH } -#endif // CONFIG_HIGHBITDEPTH } } @@ -10198,451 +9103,361 @@ static void restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) { plane_block_height); } -#if CONFIG_FILTER_INTRA -static void pick_filter_intra_interframe( - const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, - int mi_col, int *rate_uv_intra, int *rate_uv_tokenonly, int64_t *dist_uv, - int *skip_uv, UV_PREDICTION_MODE *mode_uv, - FILTER_INTRA_MODE_INFO *filter_intra_mode_info_uv, -#if CONFIG_EXT_INTRA - int8_t *uv_angle_delta, -#endif // CONFIG_EXT_INTRA - PALETTE_MODE_INFO *pmi_uv, int palette_ctx, int skip_mask, - unsigned int *ref_costs_single, int64_t *best_rd, int64_t *best_intra_rd, - PREDICTION_MODE *best_intra_mode, int *best_mode_index, int *best_skip2, - int *best_mode_skippable, -#if CONFIG_SUPERTX - int *returnrate_nocoef, -#endif // CONFIG_SUPERTX - int64_t *best_pred_rd, MB_MODE_INFO *best_mbmode, RD_STATS *rd_cost) { +static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, + const MACROBLOCKD *xd, int mi_row, + int mi_col, const uint8_t *above, + int above_stride, const uint8_t *left, + int left_stride); + +static const int ref_frame_flag_list[REF_FRAMES] = { 0, + AOM_LAST_FLAG, + AOM_LAST2_FLAG, + AOM_LAST3_FLAG, + AOM_GOLD_FLAG, + AOM_BWD_FLAG, + 
AOM_ALT2_FLAG, + AOM_ALT_FLAG }; + +static void rd_pick_skip_mode(RD_STATS *rd_cost, + InterModeSearchState *search_state, + const AV1_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, int mi_row, int mi_col, + struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) { const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; - const int try_palette = - av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type); - int rate2 = 0, rate_y = INT_MAX, skippable = 0, rate_uv, rate_dummy, i; - int dc_mode_index; - const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]]; - int64_t distortion2 = 0, distortion_y = 0, this_rd = *best_rd; - int64_t distortion_uv, model_rd = INT64_MAX; - TX_SIZE uv_tx; + MB_MODE_INFO *const mbmi = xd->mi[0]; - for (i = 0; i < MAX_MODES; ++i) - if (av1_mode_order[i].mode == DC_PRED && - av1_mode_order[i].ref_frame[0] == INTRA_FRAME) - break; - dc_mode_index = i; - assert(i < MAX_MODES); + x->compound_idx = 1; // COMPOUND_AVERAGE + RD_STATS skip_mode_rd_stats; + av1_invalid_rd_stats(&skip_mode_rd_stats); - // TODO(huisu): use skip_mask for further speedup. - (void)skip_mask; - mbmi->mode = DC_PRED; + if (cm->ref_frame_idx_0 == INVALID_IDX || + cm->ref_frame_idx_1 == INVALID_IDX) { + return; + } + + const MV_REFERENCE_FRAME ref_frame = LAST_FRAME + cm->ref_frame_idx_0; + const MV_REFERENCE_FRAME second_ref_frame = LAST_FRAME + cm->ref_frame_idx_1; + const PREDICTION_MODE this_mode = NEAREST_NEARESTMV; + const int mode_index = + get_prediction_mode_idx(this_mode, ref_frame, second_ref_frame); + + if (mode_index == -1) { + return; + } + + mbmi->mode = this_mode; mbmi->uv_mode = UV_DC_PRED; - mbmi->ref_frame[0] = INTRA_FRAME; - mbmi->ref_frame[1] = NONE_FRAME; - if (!rd_pick_filter_intra_sby(cpi, x, &rate_dummy, &rate_y, &distortion_y, - &skippable, bsize, intra_mode_cost[mbmi->mode], - &this_rd, &model_rd, 0)) { + mbmi->ref_frame[0] = ref_frame; + mbmi->ref_frame[1] = second_ref_frame; + + assert(this_mode == NEAREST_NEARESTMV); + if (!build_cur_mv(mbmi->mv, this_mode, cm, x)) { return; } - if (rate_y == INT_MAX) return; - - uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][xd->plane[1].subsampling_x] - [xd->plane[1].subsampling_y]; - if (rate_uv_intra[uv_tx] == INT_MAX) { - choose_intra_uv_mode(cpi, x, bsize, uv_tx, &rate_uv_intra[uv_tx], - &rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx], - &skip_uv[uv_tx], &mode_uv[uv_tx]); - if (cm->allow_screen_content_tools) pmi_uv[uv_tx] = *pmi; - filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info; -#if CONFIG_EXT_INTRA - uv_angle_delta[uv_tx] = mbmi->angle_delta[1]; -#endif // CONFIG_EXT_INTRA - } - - rate_uv = rate_uv_tokenonly[uv_tx]; - distortion_uv = dist_uv[uv_tx]; - skippable = skippable && skip_uv[uv_tx]; - mbmi->uv_mode = mode_uv[uv_tx]; - if (cm->allow_screen_content_tools) { - pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1]; - memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, - pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE, - 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0])); - } -#if CONFIG_EXT_INTRA - mbmi->angle_delta[1] = uv_angle_delta[uv_tx]; -#endif // CONFIG_EXT_INTRA - mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = - filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]; - if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) { - mbmi->filter_intra_mode_info.filter_intra_mode[1] = - 
filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1]; - } - - rate2 = rate_y + intra_mode_cost[mbmi->mode] + rate_uv + - x->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode]; - if (try_palette && mbmi->mode == DC_PRED) - rate2 += av1_cost_bit( - av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], 0); - if (!xd->lossless[mbmi->segment_id]) { - // super_block_yrd above includes the cost of the tx_size in the - // tokenonly rate, but for intra blocks, tx_size is always coded - // (prediction granularity), so we account for it in the full rate, - // not the tokenonly rate. - rate_y -= tx_size_cost(cpi, x, bsize, mbmi->tx_size); - } - - rate2 += av1_cost_bit(cm->fc->filter_intra_probs[0], - mbmi->filter_intra_mode_info.use_filter_intra_mode[0]); - rate2 += write_uniform_cost( - FILTER_INTRA_MODES, mbmi->filter_intra_mode_info.filter_intra_mode[0]); -#if CONFIG_EXT_INTRA - if (av1_is_directional_mode(get_uv_mode(mbmi->uv_mode), bsize) && - av1_use_angle_delta(bsize)) { - rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, - MAX_ANGLE_DELTA + mbmi->angle_delta[1]); - } -#endif // CONFIG_EXT_INTRA - if (mbmi->mode == DC_PRED) { - rate2 += - av1_cost_bit(cpi->common.fc->filter_intra_probs[1], - mbmi->filter_intra_mode_info.use_filter_intra_mode[1]); - if (mbmi->filter_intra_mode_info.use_filter_intra_mode[1]) - rate2 += - write_uniform_cost(FILTER_INTRA_MODES, - mbmi->filter_intra_mode_info.filter_intra_mode[1]); - } - distortion2 = distortion_y + distortion_uv; - av1_encode_intra_block_plane((AV1_COMMON *)cm, x, bsize, 0, 0, mi_row, - mi_col); + mbmi->filter_intra_mode_info.use_filter_intra = 0; + mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1); + mbmi->comp_group_idx = 0; + mbmi->compound_idx = x->compound_idx; + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->ref_mv_idx = 0; + mbmi->skip_mode = mbmi->skip = 1; - rate2 += ref_costs_single[INTRA_FRAME]; + set_default_interp_filters(mbmi, cm->interp_filter); - if (skippable) { - rate2 -= (rate_y + rate_uv); - rate_y = 0; - rate_uv = 0; - rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); - } else { - rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + for (int i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; + xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; } - this_rd = RDCOST(x->rdmult, rate2, distortion2); - if (this_rd < *best_intra_rd) { - *best_intra_rd = this_rd; - *best_intra_mode = mbmi->mode; + BUFFER_SET orig_dst; + for (int i = 0; i < num_planes; i++) { + orig_dst.plane[i] = xd->plane[i].dst.buf; + orig_dst.stride[i] = xd->plane[i].dst.stride; } - for (i = 0; i < REFERENCE_MODES; ++i) - best_pred_rd[i] = AOMMIN(best_pred_rd[i], this_rd); - if (this_rd < *best_rd) { - *best_mode_index = dc_mode_index; - mbmi->mv[0].as_int = 0; - rd_cost->rate = rate2; -#if CONFIG_SUPERTX - if (x->skip) - *returnrate_nocoef = rate2; - else - *returnrate_nocoef = rate2 - rate_y - rate_uv; - *returnrate_nocoef -= av1_cost_bit(av1_get_skip_prob(cm, xd), skippable); - *returnrate_nocoef -= av1_cost_bit(av1_get_intra_inter_prob(cm, xd), - mbmi->ref_frame[0] != INTRA_FRAME); -#endif // CONFIG_SUPERTX - rd_cost->dist = distortion2; - rd_cost->rdcost = this_rd; - *best_rd = this_rd; - *best_mbmode = *mbmi; - *best_skip2 = 0; - *best_mode_skippable = skippable; + // Obtain the rdcost for skip_mode. 
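rd_pick_skip_mode() above pins the block to the one configuration skip_mode may signal: NEAREST_NEARESTMV on the reference pair designated by the frame header, averaged compound, simple translation, no residual. How the header indices map to that pair can be sketched standalone (the enum values are illustrative stand-ins for MV_REFERENCE_FRAME):

#include <stdio.h>

/* Illustrative stand-ins for the reference-frame enum. */
enum { NONE_FRAME = -1, INTRA_FRAME, LAST_FRAME, LAST2_FRAME, LAST3_FRAME,
       GOLDEN_FRAME, BWDREF_FRAME, ALTREF2_FRAME, ALTREF_FRAME };
#define INVALID_IDX (-1)

/* skip_mode is evaluated only when the frame header was able to pick a
 * reference pair; otherwise the early return above fires. */
static int skip_mode_ref_pair(int idx0, int idx1, int refs[2]) {
  if (idx0 == INVALID_IDX || idx1 == INVALID_IDX) return 0;
  refs[0] = LAST_FRAME + idx0;
  refs[1] = LAST_FRAME + idx1;
  return 1;
}

int main(void) {
  int refs[2];
  if (skip_mode_ref_pair(0, 4, refs)) /* LAST_FRAME and BWDREF_FRAME */
    printf("skip_mode refs: %d and %d\n", refs[0], refs[1]);
  return 0;
}

The skip_mode_rd() call that follows prices exactly this fixed configuration.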
+ skip_mode_rd(&skip_mode_rd_stats, cpi, x, bsize, mi_row, mi_col, &orig_dst); + + // Compare the use of skip_mode with the best intra/inter mode obtained. + const int skip_mode_ctx = av1_get_skip_mode_context(xd); + const int64_t best_intra_inter_mode_cost = + (rd_cost->dist < INT64_MAX && rd_cost->rate < INT32_MAX) + ? RDCOST(x->rdmult, + rd_cost->rate + x->skip_mode_cost[skip_mode_ctx][0], + rd_cost->dist) + : INT64_MAX; + + if (skip_mode_rd_stats.rdcost <= best_intra_inter_mode_cost) { + assert(mode_index != -1); + search_state->best_mbmode.skip_mode = 1; + search_state->best_mbmode = *mbmi; + + search_state->best_mbmode.skip_mode = search_state->best_mbmode.skip = 1; + search_state->best_mbmode.mode = NEAREST_NEARESTMV; + search_state->best_mbmode.ref_frame[0] = mbmi->ref_frame[0]; + search_state->best_mbmode.ref_frame[1] = mbmi->ref_frame[1]; + search_state->best_mbmode.mv[0].as_int = mbmi->mv[0].as_int; + search_state->best_mbmode.mv[1].as_int = mbmi->mv[1].as_int; + search_state->best_mbmode.ref_mv_idx = 0; + + // Set up tx_size related variables for skip-specific loop filtering. + search_state->best_mbmode.tx_size = + block_signals_txsize(bsize) ? tx_size_from_tx_mode(bsize, cm->tx_mode) + : max_txsize_rect_lookup[bsize]; + memset(search_state->best_mbmode.inter_tx_size, + search_state->best_mbmode.tx_size, + sizeof(search_state->best_mbmode.inter_tx_size)); + set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->n8_w, xd->n8_h, + search_state->best_mbmode.skip && is_inter_block(mbmi), xd); + + // Set up color-related variables for skip mode. + search_state->best_mbmode.uv_mode = UV_DC_PRED; + search_state->best_mbmode.palette_mode_info.palette_size[0] = 0; + search_state->best_mbmode.palette_mode_info.palette_size[1] = 0; + + search_state->best_mbmode.comp_group_idx = 0; + search_state->best_mbmode.compound_idx = x->compound_idx; + search_state->best_mbmode.interinter_comp.type = COMPOUND_AVERAGE; + search_state->best_mbmode.motion_mode = SIMPLE_TRANSLATION; + + search_state->best_mbmode.interintra_mode = + (INTERINTRA_MODE)(II_DC_PRED - 1); + search_state->best_mbmode.filter_intra_mode_info.use_filter_intra = 0; + + set_default_interp_filters(&search_state->best_mbmode, cm->interp_filter); + + search_state->best_mode_index = mode_index; + + // Update rd_cost + rd_cost->rate = skip_mode_rd_stats.rate; + rd_cost->dist = rd_cost->sse = skip_mode_rd_stats.dist; + rd_cost->rdcost = skip_mode_rd_stats.rdcost; + + search_state->best_rd = rd_cost->rdcost; + search_state->best_skip2 = 1; + search_state->best_mode_skippable = (skip_mode_rd_stats.sse == 0); + + x->skip = 1; + } +} + +// Speed feature: fast intra/inter transform type search. Used for speed >= 2. +// When this feature is on, only DCT is evaluated during the rd mode search. +// After the best mode is determined, this function is called to select the +// transform types and obtain an accurate rdcost.
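In miniature, that staging looks like the toy search below: stage one scores every mode with the transform pinned to DCT, stage two re-searches transform types for the single winner. The cost table is made up; sf_refine_fast_tx_type_search(), defined next, is the real refinement pass:

#include <stdio.h>

/* Toy rd scores for each (mode, tx_type) pair. */
static int cost(int mode, int tx_type) {
  static const int table[3][4] = {
    { 50, 48, 60, 55 },
    { 40, 52, 39, 47 }, /* wins with DCT; a different tx is better */
    { 45, 44, 46, 49 },
  };
  return table[mode][tx_type];
}

int main(void) {
  const int kDct = 0;
  /* Stage 1: mode decision with the transform fixed to DCT. */
  int best_mode = 0;
  for (int mode = 1; mode < 3; ++mode)
    if (cost(mode, kDct) < cost(best_mode, kDct)) best_mode = mode;
  /* Stage 2: transform-type search for the winning mode only. */
  int best_tx = kDct;
  for (int tx = 1; tx < 4; ++tx)
    if (cost(best_mode, tx) < cost(best_mode, best_tx)) best_tx = tx;
  printf("mode=%d tx_type=%d cost=%d\n", best_mode, best_tx,
         cost(best_mode, best_tx)); /* mode=1 tx_type=2 cost=39 */
  return 0;
}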
+static void sf_refine_fast_tx_type_search( + const AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, + RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, + int best_mode_index, MB_MODE_INFO *best_mbmode, + struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE], int best_rate_y, + int best_rate_uv, int *best_skip2) { const AV1_COMMON *const cm = &cpi->common; - const RD_OPT *const rd_opt = &cpi->rd; const SPEED_FEATURES *const sf = &cpi->sf; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const int try_palette = - av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type); - PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int num_planes = av1_num_planes(cm); + + if (xd->lossless[mbmi->segment_id] == 0 && best_mode_index >= 0 && + ((sf->tx_type_search.fast_inter_tx_type_search == 1 && + is_inter_mode(best_mbmode->mode)) || + (sf->tx_type_search.fast_intra_tx_type_search == 1 && + !is_inter_mode(best_mbmode->mode)))) { + int skip_blk = 0; + RD_STATS rd_stats_y, rd_stats_uv; + + x->use_default_inter_tx_type = 0; + x->use_default_intra_tx_type = 0; + + *mbmi = *best_mbmode; + + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + + // Select prediction reference frames. + for (int i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; + if (has_second_ref(mbmi)) + xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; + } + + if (is_inter_mode(mbmi->mode)) { + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); + if (mbmi->motion_mode == OBMC_CAUSAL) + av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col); + + av1_subtract_plane(x, bsize, 0); + if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { + // av1_rd_pick_inter_mode_sb + select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, + INT64_MAX); + assert(rd_stats_y.rate != INT_MAX); + } else { + super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); + memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); + memset(x->blk_skip, rd_stats_y.skip, + sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + } + if (num_planes > 1) { + inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX, FTXS_NONE); + } else { + av1_init_rd_stats(&rd_stats_uv); + } + } else { + super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); + if (num_planes > 1) { + super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); + } else { + av1_init_rd_stats(&rd_stats_uv); + } + } + + if (RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate, + (rd_stats_y.dist + rd_stats_uv.dist)) > + RDCOST(x->rdmult, 0, (rd_stats_y.sse + rd_stats_uv.sse))) { + skip_blk = 1; + rd_stats_y.rate = x->skip_cost[av1_get_skip_context(xd)][1]; + rd_stats_uv.rate = 0; + rd_stats_y.dist = rd_stats_y.sse; + rd_stats_uv.dist = rd_stats_uv.sse; + } else { + skip_blk = 0; + rd_stats_y.rate += x->skip_cost[av1_get_skip_context(xd)][0]; + } + + if (RDCOST(x->rdmult, best_rate_y + best_rate_uv, rd_cost->dist) > + RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate, + (rd_stats_y.dist + rd_stats_uv.dist))) { + best_mbmode->tx_size = mbmi->tx_size; + av1_copy(best_mbmode->inter_tx_size, mbmi->inter_tx_size); + memcpy(ctx->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + av1_copy(best_mbmode->txk_type, mbmi->txk_type); + rd_cost->rate += + (rd_stats_y.rate + rd_stats_uv.rate - best_rate_y - best_rate_uv); + rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist; + rd_cost->rdcost = 
RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist); + *best_skip2 = skip_blk; + } + } +} + +// Please add/modify parameter setting in this function, making it consistent +// and easy to read and maintain. +static void set_params_rd_pick_inter_mode( + const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args, + BLOCK_SIZE bsize, int mi_row, int mi_col, uint16_t ref_frame_skip_mask[2], + uint32_t mode_skip_mask[REF_FRAMES], + unsigned int ref_costs_single[REF_FRAMES], + unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES], + struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; const struct segmentation *const seg = &cm->seg; - PREDICTION_MODE this_mode; - MV_REFERENCE_FRAME ref_frame, second_ref_frame; + const SPEED_FEATURES *const sf = &cpi->sf; unsigned char segment_id = mbmi->segment_id; - int comp_pred, i, k; - int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME]; -#if CONFIG_COMPOUND_SINGLEREF - int_mv frame_comp_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME]; -#endif // CONFIG_COMPOUND_SINGLEREF - struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME][MAX_MB_PLANE]; - int_mv single_newmv[TOTAL_REFS_PER_FRAME] = { { 0 } }; - int single_newmv_rate[TOTAL_REFS_PER_FRAME] = { 0 }; - int64_t modelled_rd[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME]; - static const int flag_list[TOTAL_REFS_PER_FRAME] = { - 0, - AOM_LAST_FLAG, -#if CONFIG_EXT_REFS - AOM_LAST2_FLAG, - AOM_LAST3_FLAG, -#endif // CONFIG_EXT_REFS - AOM_GOLD_FLAG, -#if CONFIG_EXT_REFS - AOM_BWD_FLAG, - AOM_ALT2_FLAG, -#endif // CONFIG_EXT_REFS - AOM_ALT_FLAG - }; - int64_t best_rd = best_rd_so_far; - int best_rate_y = INT_MAX, best_rate_uv = INT_MAX; - int64_t best_pred_diff[REFERENCE_MODES]; - int64_t best_pred_rd[REFERENCE_MODES]; - MB_MODE_INFO best_mbmode; - int rate_skip0 = av1_cost_bit(av1_get_skip_prob(cm, xd), 0); - int rate_skip1 = av1_cost_bit(av1_get_skip_prob(cm, xd), 1); - int best_mode_skippable = 0; - int midx, best_mode_index = -1; - unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME]; -#if CONFIG_EXT_COMP_REFS - unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME][TOTAL_REFS_PER_FRAME]; -#else - unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME]; -#endif // CONFIG_EXT_COMP_REFS - aom_prob comp_mode_p; - int64_t best_intra_rd = INT64_MAX; - unsigned int best_pred_sse = UINT_MAX; - PREDICTION_MODE best_intra_mode = DC_PRED; - int rate_uv_intra[TX_SIZES_ALL], rate_uv_tokenonly[TX_SIZES_ALL]; - int64_t dist_uvs[TX_SIZES_ALL]; - int skip_uvs[TX_SIZES_ALL]; - UV_PREDICTION_MODE mode_uv[TX_SIZES_ALL]; - PALETTE_MODE_INFO pmi_uv[TX_SIZES_ALL]; -#if CONFIG_EXT_INTRA - int8_t uv_angle_delta[TX_SIZES_ALL]; - int is_directional_mode, angle_stats_ready = 0; - uint8_t directional_mode_skip_mask[INTRA_MODES]; -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - int8_t dc_skipped = 1; - FILTER_INTRA_MODE_INFO filter_intra_mode_info_uv[TX_SIZES_ALL]; -#endif // CONFIG_FILTER_INTRA - const int intra_cost_penalty = av1_get_intra_cost_penalty( - cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth); - const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]]; - int best_skip2 = 0; - uint16_t ref_frame_skip_mask[2] = { 0 }; - uint32_t mode_skip_mask[TOTAL_REFS_PER_FRAME] = { 0 }; -#if CONFIG_INTERINTRA - MV_REFERENCE_FRAME best_single_inter_ref = LAST_FRAME; - int64_t best_single_inter_rd = INT64_MAX; -#endif // CONFIG_INTERINTRA - int 
mode_skip_start = sf->mode_skip_start + 1; - const int *const rd_threshes = rd_opt->threshes[segment_id][bsize]; - const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; - int64_t mode_threshold[MAX_MODES]; - int *mode_map = tile_data->mode_map[bsize]; - const int mode_search_skip_flags = sf->mode_search_skip_flags; -#if CONFIG_PVQ - od_rollback_buffer pre_buf; -#endif // CONFIG_PVQ - - HandleInterModeArgs args = { -#if CONFIG_MOTION_VAR - { NULL }, - { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }, - { NULL }, - { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }, -#endif // CONFIG_MOTION_VAR - NULL, - NULL, - NULL, - { { 0 } }, - }; - - const int rows = block_size_high[bsize]; - const int cols = block_size_wide[bsize]; - int palette_ctx = 0; - const MODE_INFO *above_mi = xd->above_mi; - const MODE_INFO *left_mi = xd->left_mi; -#if CONFIG_MOTION_VAR int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, + MAX_SB_SIZE >> 1 }; + int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, + MAX_SB_SIZE >> 1 }; int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; -#if CONFIG_HIGHBITDEPTH + for (int i = 0; i < MB_MODE_COUNT; ++i) + for (int k = 0; k < REF_FRAMES; ++k) args->single_filter[i][k] = SWITCHABLE; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { int len = sizeof(uint16_t); - args.above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf); - args.above_pred_buf[1] = + args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf); + args->above_pred_buf[1] = + CONVERT_TO_BYTEPTR(x->above_pred_buf + (MAX_SB_SQUARE >> 1) * len); + args->above_pred_buf[2] = CONVERT_TO_BYTEPTR(x->above_pred_buf + MAX_SB_SQUARE * len); - args.above_pred_buf[2] = - CONVERT_TO_BYTEPTR(x->above_pred_buf + 2 * MAX_SB_SQUARE * len); - args.left_pred_buf[0] = CONVERT_TO_BYTEPTR(x->left_pred_buf); - args.left_pred_buf[1] = + args->left_pred_buf[0] = CONVERT_TO_BYTEPTR(x->left_pred_buf); + args->left_pred_buf[1] = + CONVERT_TO_BYTEPTR(x->left_pred_buf + (MAX_SB_SQUARE >> 1) * len); + args->left_pred_buf[2] = CONVERT_TO_BYTEPTR(x->left_pred_buf + MAX_SB_SQUARE * len); - args.left_pred_buf[2] = - CONVERT_TO_BYTEPTR(x->left_pred_buf + 2 * MAX_SB_SQUARE * len); } else { -#endif // CONFIG_HIGHBITDEPTH - args.above_pred_buf[0] = x->above_pred_buf; - args.above_pred_buf[1] = x->above_pred_buf + MAX_SB_SQUARE; - args.above_pred_buf[2] = x->above_pred_buf + 2 * MAX_SB_SQUARE; - args.left_pred_buf[0] = x->left_pred_buf; - args.left_pred_buf[1] = x->left_pred_buf + MAX_SB_SQUARE; - args.left_pred_buf[2] = x->left_pred_buf + 2 * MAX_SB_SQUARE; -#if CONFIG_HIGHBITDEPTH + args->above_pred_buf[0] = x->above_pred_buf; + args->above_pred_buf[1] = x->above_pred_buf + (MAX_SB_SQUARE >> 1); + args->above_pred_buf[2] = x->above_pred_buf + MAX_SB_SQUARE; + args->left_pred_buf[0] = x->left_pred_buf; + args->left_pred_buf[1] = x->left_pred_buf + (MAX_SB_SQUARE >> 1); + args->left_pred_buf[2] = x->left_pred_buf + MAX_SB_SQUARE; } -#endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_MOTION_VAR - - av1_zero(best_mbmode); - av1_zero(pmi_uv); - if (try_palette) { - if (above_mi) - palette_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0); - if (left_mi) - palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0); - } - - 
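The pointer arithmetic above packs the three OBMC prediction planes into one scratch allocation, with plane offsets 0, MAX_SB_SQUARE >> 1 and MAX_SB_SQUARE (previously 0, MAX_SB_SQUARE and 2 * MAX_SB_SQUARE); in the high-bit-depth branch each offset is additionally scaled by sizeof(uint16_t) before CONVERT_TO_BYTEPTR wraps it. A standalone sketch of that carving (CONVERT_TO_BYTEPTR is omitted; the offsets are the point):

#include <stdint.h>
#include <stdio.h>

#define MAX_SB_SQUARE (128 * 128)

/* Carve one backing buffer into three plane pointers using the same
 * offsets as above. High bit depth stores uint16_t samples, so the
 * element offsets are scaled to bytes. */
static void carve_planes(uint8_t *base, int highbd, uint8_t *plane[3]) {
  const int len = highbd ? (int)sizeof(uint16_t) : 1;
  plane[0] = base;
  plane[1] = base + (MAX_SB_SQUARE >> 1) * len;
  plane[2] = base + MAX_SB_SQUARE * len;
}

int main(void) {
  static uint8_t scratch[2 * MAX_SB_SQUARE * sizeof(uint16_t)];
  uint8_t *plane[3];
  carve_planes(scratch, /*highbd=*/1, plane);
  printf("byte offsets: %td %td %td\n", plane[0] - scratch,
         plane[1] - scratch, plane[2] - scratch); /* 0 16384 32768 */
  return 0;
}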
estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp, - &comp_mode_p); - - for (i = 0; i < REFERENCE_MODES; ++i) best_pred_rd[i] = INT64_MAX; - for (i = 0; i < TX_SIZES_ALL; i++) rate_uv_intra[i] = INT_MAX; - for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) x->pred_sse[i] = INT_MAX; - for (i = 0; i < MB_MODE_COUNT; ++i) { - for (k = 0; k < TOTAL_REFS_PER_FRAME; ++k) { - args.single_filter[i][k] = SWITCHABLE; - } - } + av1_collect_neighbors_ref_counts(xd); - rd_cost->rate = INT_MAX; -#if CONFIG_SUPERTX - *returnrate_nocoef = INT_MAX; -#endif // CONFIG_SUPERTX + estimate_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single, + ref_costs_comp); + MV_REFERENCE_FRAME ref_frame; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; x->mbmi_ext->mode_context[ref_frame] = 0; x->mbmi_ext->compound_mode_context[ref_frame] = 0; - if (cpi->ref_frame_flags & flag_list[ref_frame]) { + if (cpi->ref_frame_flags & ref_frame_flag_list[ref_frame]) { assert(get_ref_frame_buffer(cpi, ref_frame) != NULL); - setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, - frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); - } - frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; -#if CONFIG_GLOBAL_MOTION - frame_mv[ZEROMV][ref_frame].as_int = - gm_get_motion_vector(&cm->global_motion[ref_frame], - cm->allow_high_precision_mv, bsize, mi_col, mi_row, - 0 -#if CONFIG_AMVR - , - cm->cur_frame_mv_precision_level -#endif - ) - .as_int; -#else // CONFIG_GLOBAL_MOTION - frame_mv[ZEROMV][ref_frame].as_int = 0; -#endif // CONFIG_GLOBAL_MOTION - frame_mv[NEW_NEWMV][ref_frame].as_int = INVALID_MV; -#if CONFIG_COMPOUND_SINGLEREF - frame_mv[SR_NEW_NEWMV][ref_frame].as_int = INVALID_MV; - frame_comp_mv[SR_NEW_NEWMV][ref_frame].as_int = INVALID_MV; -#endif // CONFIG_COMPOUND_SINGLEREF -#if CONFIG_GLOBAL_MOTION - frame_mv[ZERO_ZEROMV][ref_frame].as_int = - gm_get_motion_vector(&cm->global_motion[ref_frame], - cm->allow_high_precision_mv, bsize, mi_col, mi_row, - 0 -#if CONFIG_AMVR - , - cm->cur_frame_mv_precision_level -#endif - ) - .as_int; -#else // CONFIG_GLOBAL_MOTION - frame_mv[ZERO_ZEROMV][ref_frame].as_int = 0; -#endif // CONFIG_GLOBAL_MOTION + setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, + yv12_mb); + } } + // TODO(zoeliu@google.com): To further optimize the obtaining of motion vector + // references for compound prediction, as not every pair of reference frames + // would be examined for the RD evaluation.
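Reference setup only touches frames the encoder actually has: availability is one bitmask test per reference, as in this standalone miniature (the ids and flag values are illustrative, standing in for ref_frame_flag_list[] above):

#include <stdio.h>

enum { LAST = 1, LAST2, LAST3, GOLDEN, BWDREF, ALTREF2, ALTREF, NUM_REFS };

int main(void) {
  const unsigned flag_of[NUM_REFS] = { 0,      1 << 0, 1 << 1, 1 << 2,
                                       1 << 3, 1 << 4, 1 << 5, 1 << 6 };
  /* Suppose only LAST, GOLDEN and ALTREF are available this frame. */
  const unsigned ref_frame_flags =
      flag_of[LAST] | flag_of[GOLDEN] | flag_of[ALTREF];
  for (int ref = LAST; ref < NUM_REFS; ++ref) {
    if (!(ref_frame_flags & flag_of[ref])) continue; /* unavailable */
    printf("set up buffers and mv references for ref %d\n", ref);
  }
  return 0;
}

The loop that follows extends the same gathering to compound reference pairs.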
for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) { - MODE_INFO *const mi = xd->mi[0]; - int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame]; x->mbmi_ext->mode_context[ref_frame] = 0; - av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame], - mbmi_ext->ref_mv_stack[ref_frame], - mbmi_ext->compound_mode_context, candidates, mi_row, - mi_col, NULL, NULL, mbmi_ext->mode_context); - if (mbmi_ext->ref_mv_count[ref_frame] < 2) { - MV_REFERENCE_FRAME rf[2]; - av1_set_ref_frame(rf, ref_frame); - if (mbmi_ext->ref_mvs[rf[0]][0].as_int != - frame_mv[ZEROMV][rf[0]].as_int || - mbmi_ext->ref_mvs[rf[0]][1].as_int != - frame_mv[ZEROMV][rf[0]].as_int || - mbmi_ext->ref_mvs[rf[1]][0].as_int != - frame_mv[ZEROMV][rf[1]].as_int || - mbmi_ext->ref_mvs[rf[1]][1].as_int != frame_mv[ZEROMV][rf[1]].as_int) - mbmi_ext->mode_context[ref_frame] &= ~(1 << ALL_ZERO_FLAG_OFFSET); - } - } - -#if CONFIG_MOTION_VAR + av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, + mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row, + mi_col, mbmi_ext->mode_context); + } + av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col); if (check_num_overlappable_neighbors(mbmi) && is_motion_variation_allowed_bsize(bsize)) { av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, - args.above_pred_buf, dst_width1, - dst_height1, args.above_pred_stride); + args->above_pred_buf, dst_width1, + dst_height1, args->above_pred_stride); av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, - args.left_pred_buf, dst_width2, - dst_height2, args.left_pred_stride); + args->left_pred_buf, dst_width2, + dst_height2, args->left_pred_stride); av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row, - mi_col); - calc_target_weighted_pred(cm, x, xd, mi_row, mi_col, args.above_pred_buf[0], - args.above_pred_stride[0], args.left_pred_buf[0], - args.left_pred_stride[0]); + mi_col, 0, num_planes); + calc_target_weighted_pred( + cm, x, xd, mi_row, mi_col, args->above_pred_buf[0], + args->above_pred_stride[0], args->left_pred_buf[0], + args->left_pred_stride[0]); } -#endif // CONFIG_MOTION_VAR + int min_pred_mv_sad = INT_MAX; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) + min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref_frame]); + + for (int i = 0; i < 2; ++i) { + ref_frame_skip_mask[i] = 0; + } + memset(mode_skip_mask, 0, REF_FRAMES * sizeof(*mode_skip_mask)); for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - if (!(cpi->ref_frame_flags & flag_list[ref_frame])) { + if (!(cpi->ref_frame_flags & ref_frame_flag_list[ref_frame])) { // Skip checking missing references in both single and compound reference // modes. Note that a mode will be skipped iff both reference frames // are masked out. ref_frame_skip_mask[0] |= (1 << ref_frame); ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; } else { - for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { - // Skip fixed mv modes for poor references - if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) { - mode_skip_mask[ref_frame] |= INTER_NEAREST_NEAR_ZERO; - break; - } + // Skip fixed mv modes for poor references + if ((x->pred_mv_sad[ref_frame] >> 2) > min_pred_mv_sad) { + mode_skip_mask[ref_frame] |= INTER_NEAREST_NEAR_ZERO; } } // If the segment reference frame feature is enabled.... @@ -10658,55 +9473,34 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, // segment level feature is enabled for this segment. 
This is to // prevent the possibility that we end up unable to pick any mode. if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { - // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, + // Only consider GLOBALMV/ALTREF_FRAME for alt ref frame, // unless ARNR filtering is enabled in which case we want // an unfiltered alternative. We allow near/nearest as well // because they may result in zero-zero MVs but be cheaper. if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { - int_mv zeromv; - ref_frame_skip_mask[0] = (1 << LAST_FRAME) | -#if CONFIG_EXT_REFS - (1 << LAST2_FRAME) | (1 << LAST3_FRAME) | - (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) | -#endif // CONFIG_EXT_REFS - (1 << GOLDEN_FRAME); + ref_frame_skip_mask[0] = (1 << LAST_FRAME) | (1 << LAST2_FRAME) | + (1 << LAST3_FRAME) | (1 << BWDREF_FRAME) | + (1 << ALTREF2_FRAME) | (1 << GOLDEN_FRAME); ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK; // TODO(zoeliu): To further explore whether following needs to be done for // BWDREF_FRAME as well. mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO; -#if CONFIG_GLOBAL_MOTION - zeromv.as_int = gm_get_motion_vector(&cm->global_motion[ALTREF_FRAME], - cm->allow_high_precision_mv, bsize, - mi_col, mi_row, 0 -#if CONFIG_AMVR - , - cm->cur_frame_mv_precision_level -#endif - ) - .as_int; -#else - zeromv.as_int = 0; -#endif // CONFIG_GLOBAL_MOTION - if (frame_mv[NEARMV][ALTREF_FRAME].as_int != zeromv.as_int) + const MV_REFERENCE_FRAME tmp_ref_frames[2] = { ALTREF_FRAME, NONE_FRAME }; + int_mv near_mv, nearest_mv, global_mv; + get_this_mv(&nearest_mv, NEARESTMV, 0, 0, tmp_ref_frames, x->mbmi_ext); + get_this_mv(&near_mv, NEARMV, 0, 0, tmp_ref_frames, x->mbmi_ext); + get_this_mv(&global_mv, GLOBALMV, 0, 0, tmp_ref_frames, x->mbmi_ext); + + if (near_mv.as_int != global_mv.as_int) mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV); - if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int) + if (nearest_mv.as_int != global_mv.as_int) mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV); - if (frame_mv[NEAREST_NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int) - mode_skip_mask[ALTREF_FRAME] |= (1 << NEAREST_NEARESTMV); - if (frame_mv[NEAR_NEARMV][ALTREF_FRAME].as_int != zeromv.as_int) - mode_skip_mask[ALTREF_FRAME] |= (1 << NEAR_NEARMV); -#if CONFIG_COMPOUND_SINGLEREF - if (frame_mv[SR_NEAREST_NEARMV][ALTREF_FRAME].as_int != zeromv.as_int || - frame_comp_mv[SR_NEAREST_NEARMV][ALTREF_FRAME].as_int != - zeromv.as_int) - mode_skip_mask[ALTREF_FRAME] |= (1 << SR_NEAREST_NEARMV); -#endif // CONFIG_COMPOUND_SINGLEREF } } if (cpi->rc.is_src_frame_alt_ref) { if (sf->alt_ref_search_fp) { - assert(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]); + assert(cpi->ref_frame_flags & ref_frame_flag_list[ALTREF_FRAME]); mode_skip_mask[ALTREF_FRAME] = 0; ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME); ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK; @@ -10733,24 +9527,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, mode_skip_mask[INTRA_FRAME] |= ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]); - for (i = 0; i <= LAST_NEW_MV_INDEX; ++i) mode_threshold[i] = 0; - for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i) - mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5; - - midx = sf->schedule_mode_search ? 
mode_skip_start : 0; - while (midx > 4) { - uint8_t end_pos = 0; - for (i = 5; i < midx; ++i) { - if (mode_threshold[mode_map[i - 1]] > mode_threshold[mode_map[i]]) { - uint8_t tmp = mode_map[i]; - mode_map[i] = mode_map[i - 1]; - mode_map[i - 1] = tmp; - end_pos = i; - } - } - midx = end_pos; - } - if (cpi->sf.tx_type_search.fast_intra_tx_type_search) x->use_default_intra_tx_type = 1; else @@ -10760,528 +9536,705 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, x->use_default_inter_tx_type = 1; else x->use_default_inter_tx_type = 0; -#if CONFIG_PVQ - od_encode_checkpoint(&x->daala_enc, &pre_buf); -#endif // CONFIG_PVQ - for (i = 0; i < MB_MODE_COUNT; ++i) - for (ref_frame = 0; ref_frame < TOTAL_REFS_PER_FRAME; ++ref_frame) - modelled_rd[i][ref_frame] = INT64_MAX; - - for (midx = 0; midx < MAX_MODES; ++midx) { - int mode_index; - int mode_excluded = 0; + if (cpi->sf.skip_repeat_interpolation_filter_search) { + x->interp_filter_stats_idx[0] = 0; + x->interp_filter_stats_idx[1] = 0; + } +} + +static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_cost, PICK_MODE_CONTEXT *ctx, + BLOCK_SIZE bsize, MB_MODE_INFO *const mbmi, + PALETTE_MODE_INFO *const pmi, + unsigned int *ref_costs_single, + InterModeSearchState *search_state) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + int rate2 = 0; + int64_t distortion2 = 0, best_rd_palette = search_state->best_rd, this_rd, + best_model_rd_palette = INT64_MAX; + int skippable = 0, rate_overhead_palette = 0; + RD_STATS rd_stats_y; + TX_SIZE uv_tx = TX_4X4; + uint8_t *const best_palette_color_map = + x->palette_buffer->best_palette_color_map; + uint8_t *const color_map = xd->plane[0].color_index_map; + MB_MODE_INFO best_mbmi_palette = *mbmi; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]]; + const int rows = block_size_high[bsize]; + const int cols = block_size_wide[bsize]; + + mbmi->mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; + mbmi->ref_frame[0] = INTRA_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; + rate_overhead_palette = rd_pick_palette_intra_sby( + cpi, x, bsize, intra_mode_cost[DC_PRED], &best_mbmi_palette, + best_palette_color_map, &best_rd_palette, &best_model_rd_palette, NULL, + NULL, NULL, NULL, ctx, best_blk_skip); + if (pmi->palette_size[0] == 0) return; + + memcpy(x->blk_skip, best_blk_skip, + sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize)); + + memcpy(color_map, best_palette_color_map, + rows * cols * sizeof(best_palette_color_map[0])); + super_block_yrd(cpi, x, &rd_stats_y, bsize, search_state->best_rd); + if (rd_stats_y.rate == INT_MAX) return; + + skippable = rd_stats_y.skip; + distortion2 = rd_stats_y.dist; + rate2 = rd_stats_y.rate + rate_overhead_palette; + rate2 += ref_costs_single[INTRA_FRAME]; + if (num_planes > 1) { + uv_tx = av1_get_tx_size(AOM_PLANE_U, xd); + if (search_state->rate_uv_intra[uv_tx] == INT_MAX) { + choose_intra_uv_mode( + cpi, x, bsize, uv_tx, &search_state->rate_uv_intra[uv_tx], + &search_state->rate_uv_tokenonly[uv_tx], + &search_state->dist_uvs[uv_tx], &search_state->skip_uvs[uv_tx], + &search_state->mode_uv[uv_tx]); + search_state->pmi_uv[uv_tx] = *pmi; + search_state->uv_angle_delta[uv_tx] = mbmi->angle_delta[PLANE_TYPE_UV]; + } + mbmi->uv_mode = search_state->mode_uv[uv_tx]; + pmi->palette_size[1] = search_state->pmi_uv[uv_tx].palette_size[1]; + if (pmi->palette_size[1] > 0) { + 
memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, + search_state->pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE, + 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0])); + } + mbmi->angle_delta[PLANE_TYPE_UV] = search_state->uv_angle_delta[uv_tx]; + skippable = skippable && search_state->skip_uvs[uv_tx]; + distortion2 += search_state->dist_uvs[uv_tx]; + rate2 += search_state->rate_uv_intra[uv_tx]; + } + + if (skippable) { + rate2 -= rd_stats_y.rate; + if (num_planes > 1) rate2 -= search_state->rate_uv_tokenonly[uv_tx]; + rate2 += x->skip_cost[av1_get_skip_context(xd)][1]; + } else { + rate2 += x->skip_cost[av1_get_skip_context(xd)][0]; + } + this_rd = RDCOST(x->rdmult, rate2, distortion2); + if (this_rd < search_state->best_rd) { + search_state->best_mode_index = 3; + mbmi->mv[0].as_int = 0; + rd_cost->rate = rate2; + rd_cost->dist = distortion2; + rd_cost->rdcost = this_rd; + search_state->best_rd = this_rd; + search_state->best_mbmode = *mbmi; + search_state->best_skip2 = 0; + search_state->best_mode_skippable = skippable; + memcpy(ctx->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + } +} + +static void init_inter_mode_search_state(InterModeSearchState *search_state, + const AV1_COMP *cpi, + const TileDataEnc *tile_data, + const MACROBLOCK *x, BLOCK_SIZE bsize, + int64_t best_rd_so_far) { + search_state->best_rd = best_rd_so_far; + + av1_zero(search_state->best_mbmode); + + search_state->best_rate_y = INT_MAX; + + search_state->best_rate_uv = INT_MAX; + + search_state->best_mode_skippable = 0; + + search_state->best_skip2 = 0; + + search_state->best_mode_index = -1; + + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const unsigned char segment_id = mbmi->segment_id; + + search_state->skip_intra_modes = 0; + + search_state->num_available_refs = 0; + memset(search_state->dist_refs, -1, sizeof(search_state->dist_refs)); + memset(search_state->dist_order_refs, -1, + sizeof(search_state->dist_order_refs)); + + for (int i = 0; i <= LAST_NEW_MV_INDEX; ++i) + search_state->mode_threshold[i] = 0; + const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize]; + for (int i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i) + search_state->mode_threshold[i] = + ((int64_t)rd_threshes[i] * tile_data->thresh_freq_fact[bsize][i]) >> 5; + + search_state->best_intra_mode = DC_PRED; + search_state->best_intra_rd = INT64_MAX; + + search_state->angle_stats_ready = 0; + + search_state->best_pred_sse = UINT_MAX; + + for (int i = 0; i < TX_SIZES_ALL; i++) + search_state->rate_uv_intra[i] = INT_MAX; + + av1_zero(search_state->pmi_uv); + + for (int i = 0; i < REFERENCE_MODES; ++i) + search_state->best_pred_rd[i] = INT64_MAX; + + av1_zero(search_state->single_newmv); + av1_zero(search_state->single_newmv_rate); + av1_zero(search_state->single_newmv_valid); + for (int i = 0; i < MB_MODE_COUNT; ++i) + for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) + search_state->modelled_rd[i][ref_frame] = INT64_MAX; +} + +static int inter_mode_search_order_independent_skip( + const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bsize, int mode_index, + int mi_row, int mi_col, uint32_t *mode_skip_mask, + uint16_t *ref_frame_skip_mask) { + const SPEED_FEATURES *const sf = &cpi->sf; + const AV1_COMMON *const cm = &cpi->common; + const struct segmentation *const seg = &cm->seg; + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const unsigned char segment_id = mbmi->segment_id; + const MV_REFERENCE_FRAME *ref_frame = 
av1_mode_order[mode_index].ref_frame; + const PREDICTION_MODE this_mode = av1_mode_order[mode_index].mode; + + if (cpi->sf.mode_pruning_based_on_two_pass_partition_search && + !x->cb_partition_scan) { + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + int found = 0; + // Search in the stats table to see if the ref frames have been used in the + // first pass of partition search. + for (int row = mi_row; row < mi_row + mi_width && !found; + row += FIRST_PARTITION_PASS_SAMPLE_REGION) { + for (int col = mi_col; col < mi_col + mi_height && !found; + col += FIRST_PARTITION_PASS_SAMPLE_REGION) { + const int index = av1_first_partition_pass_stats_index(row, col); + const FIRST_PARTITION_PASS_STATS *const stats = + &x->first_partition_pass_stats[index]; + if (stats->ref0_counts[ref_frame[0]] && + (ref_frame[1] < 0 || stats->ref1_counts[ref_frame[1]])) { + found = 1; + break; + } + } + } + if (!found) return 1; + } + + if (ref_frame[0] > INTRA_FRAME && ref_frame[1] == INTRA_FRAME) { + // Mode must be compatible + if (!is_interintra_allowed_mode(this_mode)) return 1; + if (!is_interintra_allowed_bsize(bsize)) return 1; + } + + // This is only used in motion vector unit test. + if (cpi->oxcf.motion_vector_unit_test && ref_frame[0] == INTRA_FRAME) + return 1; + + if (ref_frame[0] == INTRA_FRAME) { + if (this_mode != DC_PRED) { + // Disable intra modes other than DC_PRED for blocks with low variance + // Threshold for intra skipping based on source variance + // TODO(debargha): Specialize the threshold for super block sizes + const unsigned int skip_intra_var_thresh = 64; + if ((sf->mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) && + x->source_variance < skip_intra_var_thresh) + return 1; + } + } else { + if (!is_comp_ref_allowed(bsize) && ref_frame[1] > INTRA_FRAME) return 1; + } + + const int comp_pred = ref_frame[1] > INTRA_FRAME; + if (comp_pred) { + if (!cpi->allow_comp_inter_inter) return 1; + + // Skip compound inter modes if ARF is not available. + if (!(cpi->ref_frame_flags & ref_frame_flag_list[ref_frame[1]])) return 1; + + // Do not allow compound prediction if the segment level reference frame + // feature is in use as in this case there can only be one reference. + if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) return 1; + } + + if (sf->selective_ref_frame) { + if (sf->selective_ref_frame >= 2 || x->cb_partition_scan) { + if (ref_frame[0] == ALTREF2_FRAME || ref_frame[1] == ALTREF2_FRAME) + if (get_relative_dist( + cm, cm->cur_frame->ref_frame_offset[ALTREF2_FRAME - LAST_FRAME], + cm->frame_offset) < 0) + return 1; + if (ref_frame[0] == BWDREF_FRAME || ref_frame[1] == BWDREF_FRAME) + if (get_relative_dist( + cm, cm->cur_frame->ref_frame_offset[BWDREF_FRAME - LAST_FRAME], + cm->frame_offset) < 0) + return 1; + } + if (ref_frame[0] == LAST3_FRAME || ref_frame[1] == LAST3_FRAME) + if (get_relative_dist( + cm, cm->cur_frame->ref_frame_offset[LAST3_FRAME - LAST_FRAME], + cm->cur_frame->ref_frame_offset[GOLDEN_FRAME - LAST_FRAME]) <= 0) + return 1; + if (ref_frame[0] == LAST2_FRAME || ref_frame[1] == LAST2_FRAME) + if (get_relative_dist( + cm, cm->cur_frame->ref_frame_offset[LAST2_FRAME - LAST_FRAME], + cm->cur_frame->ref_frame_offset[GOLDEN_FRAME - LAST_FRAME]) <= 0) + return 1; + } + + // One-sided compound is used only when all reference frames are one-sided.
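The selective_ref_frame pruning above compares order hints with get_relative_dist() to decide which references lie in the past or future of the current frame, and the check that follows drops compound pairs whose two references sit on the same temporal side (unless every reference is one-sided anyway). A simplified standalone sketch of that classification, ignoring the real function's order-hint wraparound handling:

#include <stdio.h>

/* Simplified signed distance between two frame offsets. */
static int relative_dist(int a, int b) { return a - b; }

/* A compound pair is one-sided when both references are in the past,
 * or both in the future, of the current frame. */
static int pair_is_one_sided(int cur, int ref0, int ref1) {
  const int d0 = relative_dist(ref0, cur);
  const int d1 = relative_dist(ref1, cur);
  return (d0 <= 0 && d1 <= 0) || (d0 > 0 && d1 > 0);
}

int main(void) {
  /* Current frame at offset 10: (8, 12) brackets it, (6, 8) does not
   * and would be pruned. */
  printf("%d %d\n", pair_is_one_sided(10, 8, 12), pair_is_one_sided(10, 6, 8));
  return 0;
}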
+ if (sf->selective_ref_frame && comp_pred && !cpi->all_one_sided_refs) { + unsigned int ref_offsets[2]; + for (int i = 0; i < 2; ++i) { + const int buf_idx = cm->frame_refs[ref_frame[i] - LAST_FRAME].idx; + assert(buf_idx >= 0); + ref_offsets[i] = cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset; + } + if ((get_relative_dist(cm, ref_offsets[0], cm->frame_offset) <= 0 && + get_relative_dist(cm, ref_offsets[1], cm->frame_offset) <= 0) || + (get_relative_dist(cm, ref_offsets[0], cm->frame_offset) > 0 && + get_relative_dist(cm, ref_offsets[1], cm->frame_offset) > 0)) + return 1; + } + + if (mode_skip_mask[ref_frame[0]] & (1 << this_mode)) { + return 1; + } + + if ((ref_frame_skip_mask[0] & (1 << ref_frame[0])) && + (ref_frame_skip_mask[1] & (1 << AOMMAX(0, ref_frame[1])))) { + return 1; + } + + if (skip_repeated_mv(cm, x, this_mode, ref_frame)) { + return 1; + } + return 0; +} + +static INLINE void init_mbmi(MB_MODE_INFO *mbmi, int mode_index, + const AV1_COMMON *cm) { + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + PREDICTION_MODE this_mode = av1_mode_order[mode_index].mode; + mbmi->ref_mv_idx = 0; + mbmi->mode = this_mode; + mbmi->uv_mode = UV_DC_PRED; + mbmi->ref_frame[0] = av1_mode_order[mode_index].ref_frame[0]; + mbmi->ref_frame[1] = av1_mode_order[mode_index].ref_frame[1]; + pmi->palette_size[0] = 0; + pmi->palette_size[1] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1); + set_default_interp_filters(mbmi, cm->interp_filter); +} + +static int handle_intra_mode(InterModeSearchState *search_state, + const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int ref_frame_cost, + const PICK_MODE_CONTEXT *ctx, int disable_skip, + RD_STATS *rd_stats, RD_STATS *rd_stats_y, + RD_STATS *rd_stats_uv) { + const AV1_COMMON *cm = &cpi->common; + const SPEED_FEATURES *const sf = &cpi->sf; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(mbmi->ref_frame[0] == INTRA_FRAME); + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const int try_palette = + av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type); + const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]]; + const int intra_cost_penalty = av1_get_intra_cost_penalty( + cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth); + const int rows = block_size_high[bsize]; + const int cols = block_size_wide[bsize]; + const int num_planes = av1_num_planes(cm); + av1_init_rd_stats(rd_stats); + av1_init_rd_stats(rd_stats_y); + av1_init_rd_stats(rd_stats_uv); + TX_SIZE uv_tx; + int is_directional_mode = av1_is_directional_mode(mbmi->mode); + if (is_directional_mode && av1_use_angle_delta(bsize)) { + int rate_dummy; + int64_t model_rd = INT64_MAX; + if (!search_state->angle_stats_ready) { + const int src_stride = x->plane[0].src.stride; + const uint8_t *src = x->plane[0].src.buf; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + highbd_angle_estimation(src, src_stride, rows, cols, bsize, + search_state->directional_mode_skip_mask); + else + angle_estimation(src, src_stride, rows, cols, bsize, + search_state->directional_mode_skip_mask); + search_state->angle_stats_ready = 1; + } + if (search_state->directional_mode_skip_mask[mbmi->mode]) return 0; + rd_stats_y->rate = INT_MAX; + rd_pick_intra_angle_sby(cpi, x, &rate_dummy, rd_stats_y, bsize, + intra_mode_cost[mbmi->mode], search_state->best_rd, + &model_rd); + } 
else { + mbmi->angle_delta[PLANE_TYPE_Y] = 0; + super_block_yrd(cpi, x, rd_stats_y, bsize, search_state->best_rd); + } + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + memcpy(best_blk_skip, x->blk_skip, + sizeof(best_blk_skip[0]) * ctx->num_4x4_blk); + + if (mbmi->mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) { + RD_STATS rd_stats_y_fi; + int filter_intra_selected_flag = 0; + TX_SIZE best_tx_size = mbmi->tx_size; + TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN]; + memcpy(best_txk_type, mbmi->txk_type, + sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN); + FILTER_INTRA_MODE best_fi_mode = FILTER_DC_PRED; + int64_t best_rd_tmp = INT64_MAX; + if (rd_stats_y->rate != INT_MAX) { + best_rd_tmp = RDCOST(x->rdmult, + rd_stats_y->rate + x->filter_intra_cost[bsize][0] + + intra_mode_cost[mbmi->mode], + rd_stats_y->dist); + } + + mbmi->filter_intra_mode_info.use_filter_intra = 1; + for (FILTER_INTRA_MODE fi_mode = FILTER_DC_PRED; + fi_mode < FILTER_INTRA_MODES; ++fi_mode) { + int64_t this_rd_tmp; + mbmi->filter_intra_mode_info.filter_intra_mode = fi_mode; + + super_block_yrd(cpi, x, &rd_stats_y_fi, bsize, search_state->best_rd); + if (rd_stats_y_fi.rate == INT_MAX) { + continue; + } + const int this_rate_tmp = + rd_stats_y_fi.rate + + intra_mode_info_cost_y(cpi, x, mbmi, bsize, + intra_mode_cost[mbmi->mode]); + this_rd_tmp = RDCOST(x->rdmult, this_rate_tmp, rd_stats_y_fi.dist); + + if (this_rd_tmp < best_rd_tmp) { + best_tx_size = mbmi->tx_size; + memcpy(best_txk_type, mbmi->txk_type, + sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN); + memcpy(best_blk_skip, x->blk_skip, + sizeof(best_blk_skip[0]) * ctx->num_4x4_blk); + best_fi_mode = fi_mode; + *rd_stats_y = rd_stats_y_fi; + filter_intra_selected_flag = 1; + best_rd_tmp = this_rd_tmp; + } + } + + mbmi->tx_size = best_tx_size; + memcpy(mbmi->txk_type, best_txk_type, + sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN); + memcpy(x->blk_skip, best_blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + + if (filter_intra_selected_flag) { + mbmi->filter_intra_mode_info.use_filter_intra = 1; + mbmi->filter_intra_mode_info.filter_intra_mode = best_fi_mode; + } else { + mbmi->filter_intra_mode_info.use_filter_intra = 0; + } + } + + if (rd_stats_y->rate == INT_MAX) return 0; + + if (num_planes > 1) { + uv_tx = av1_get_tx_size(AOM_PLANE_U, xd); + if (search_state->rate_uv_intra[uv_tx] == INT_MAX) { + choose_intra_uv_mode( + cpi, x, bsize, uv_tx, &search_state->rate_uv_intra[uv_tx], + &search_state->rate_uv_tokenonly[uv_tx], + &search_state->dist_uvs[uv_tx], &search_state->skip_uvs[uv_tx], + &search_state->mode_uv[uv_tx]); + if (try_palette) search_state->pmi_uv[uv_tx] = *pmi; + search_state->uv_angle_delta[uv_tx] = mbmi->angle_delta[PLANE_TYPE_UV]; + } + + rd_stats_uv->rate = search_state->rate_uv_tokenonly[uv_tx]; + rd_stats_uv->dist = search_state->dist_uvs[uv_tx]; + rd_stats_uv->skip = search_state->skip_uvs[uv_tx]; + rd_stats->skip = rd_stats_y->skip && rd_stats_uv->skip; + mbmi->uv_mode = search_state->mode_uv[uv_tx]; + if (try_palette) { + pmi->palette_size[1] = search_state->pmi_uv[uv_tx].palette_size[1]; + memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, + search_state->pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE, + 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0])); + } + mbmi->angle_delta[PLANE_TYPE_UV] = search_state->uv_angle_delta[uv_tx]; + } + + rd_stats->rate = + rd_stats_y->rate + + intra_mode_info_cost_y(cpi, x, mbmi, bsize, intra_mode_cost[mbmi->mode]); + if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) { + // 
super_block_yrd above includes the cost of the tx_size in the + // tokenonly rate, but for intra blocks, tx_size is always coded + // (prediction granularity), so we account for it in the full rate, + // not the tokenonly rate. + rd_stats_y->rate -= tx_size_cost(cm, x, bsize, mbmi->tx_size); + } + if (num_planes > 1 && !x->skip_chroma_rd) { + const int uv_mode_cost = + x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mbmi->uv_mode]; + rd_stats->rate += + rd_stats_uv->rate + + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, uv_mode_cost); + } + if (mbmi->mode != DC_PRED && mbmi->mode != PAETH_PRED) + rd_stats->rate += intra_cost_penalty; + rd_stats->dist = rd_stats_y->dist + rd_stats_uv->dist; + + // Estimate the reference frame signaling cost and add it + // to the rolling cost variable. + rd_stats->rate += ref_frame_cost; + if (rd_stats->skip) { + // Back out the coefficient coding costs + rd_stats->rate -= (rd_stats_y->rate + rd_stats_uv->rate); + rd_stats_y->rate = 0; + rd_stats_uv->rate = 0; + // Cost the skip mb case + rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][1]; + } else { + // Add in the cost of the no skip flag. + rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][0]; + } + // Calculate the final RD estimate for this mode. + int64_t this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + + // Keep record of best intra rd + if (this_rd < search_state->best_intra_rd) { + search_state->best_intra_rd = this_rd; + search_state->best_intra_mode = mbmi->mode; + } + + if (sf->skip_intra_in_interframe) { + if (search_state->best_rd < (INT64_MAX / 2) && + this_rd > (search_state->best_rd + (search_state->best_rd >> 1))) + search_state->skip_intra_modes = 1; + } + + if (!disable_skip) { + for (int i = 0; i < REFERENCE_MODES; ++i) + search_state->best_pred_rd[i] = + AOMMIN(search_state->best_pred_rd[i], this_rd); + } + return 1; +} + +void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, + MACROBLOCK *x, int mi_row, int mi_col, + RD_STATS *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const SPEED_FEATURES *const sf = &cpi->sf; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int try_palette = + av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type); + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const struct segmentation *const seg = &cm->seg; + PREDICTION_MODE this_mode; + MV_REFERENCE_FRAME ref_frame, second_ref_frame; + unsigned char segment_id = mbmi->segment_id; + int i, k; + struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]; + unsigned int ref_costs_single[REF_FRAMES]; + unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES]; + int *comp_inter_cost = x->comp_inter_cost[av1_get_reference_mode_context(xd)]; + int *mode_map = tile_data->mode_map[bsize]; + uint32_t mode_skip_mask[REF_FRAMES]; + uint16_t ref_frame_skip_mask[2]; + + InterModeSearchState search_state; + init_inter_mode_search_state(&search_state, cpi, tile_data, x, bsize, + best_rd_so_far); + + HandleInterModeArgs args = { + { NULL }, { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }, + { NULL }, { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1 }, + NULL, NULL, + NULL, NULL, + { { 0 } }, INT_MAX, + INT_MAX + }; + for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX; + + av1_invalid_rd_stats(rd_cost); + + // init params, set frame modes, speed 
features + set_params_rd_pick_inter_mode(cpi, x, &args, bsize, mi_row, mi_col, + ref_frame_skip_mask, mode_skip_mask, + ref_costs_single, ref_costs_comp, yv12_mb); + +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + int64_t best_est_rd = INT64_MAX; +#endif + + for (int midx = 0; midx < MAX_MODES; ++midx) { + int mode_index = mode_map[midx]; int64_t this_rd = INT64_MAX; int disable_skip = 0; - int compmode_cost = 0; int rate2 = 0, rate_y = 0, rate_uv = 0; - int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0; + int64_t distortion2 = 0; int skippable = 0; int this_skip2 = 0; - int64_t total_sse = INT64_MAX; uint8_t ref_frame_type; -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &pre_buf); -#endif // CONFIG_PVQ - mode_index = mode_map[midx]; + this_mode = av1_mode_order[mode_index].mode; ref_frame = av1_mode_order[mode_index].ref_frame[0]; second_ref_frame = av1_mode_order[mode_index].ref_frame[1]; - mbmi->ref_mv_idx = 0; - - if (ref_frame > INTRA_FRAME && second_ref_frame == INTRA_FRAME) { - // Mode must by compatible - if (!is_interintra_allowed_mode(this_mode)) continue; - if (!is_interintra_allowed_bsize(bsize)) continue; - } - - if (is_inter_compound_mode(this_mode)) { - frame_mv[this_mode][ref_frame].as_int = - frame_mv[compound_ref0_mode(this_mode)][ref_frame].as_int; - frame_mv[this_mode][second_ref_frame].as_int = - frame_mv[compound_ref1_mode(this_mode)][second_ref_frame].as_int; -#if CONFIG_COMPOUND_SINGLEREF - } else if (is_inter_singleref_comp_mode(this_mode)) { - frame_mv[this_mode][ref_frame].as_int = - frame_mv[compound_ref0_mode(this_mode)][ref_frame].as_int; - frame_comp_mv[this_mode][ref_frame].as_int = - frame_mv[compound_ref1_mode(this_mode)][ref_frame].as_int; -#endif // CONFIG_COMPOUND_SINGLEREF - } - - // Look at the reference frame of the best mode so far and set the - // skip mask to look at a subset of the remaining modes. - if (midx == mode_skip_start && best_mode_index >= 0) { - switch (best_mbmode.ref_frame[0]) { - case INTRA_FRAME: break; - case LAST_FRAME: - ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK; - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; - break; -#if CONFIG_EXT_REFS - case LAST2_FRAME: - ref_frame_skip_mask[0] |= LAST2_FRAME_MODE_MASK; - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; - break; - case LAST3_FRAME: - ref_frame_skip_mask[0] |= LAST3_FRAME_MODE_MASK; - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; - break; -#endif // CONFIG_EXT_REFS - case GOLDEN_FRAME: - ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK; - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; - break; -#if CONFIG_EXT_REFS - case BWDREF_FRAME: - ref_frame_skip_mask[0] |= BWDREF_FRAME_MODE_MASK; - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; - break; - case ALTREF2_FRAME: - ref_frame_skip_mask[0] |= ALTREF2_FRAME_MODE_MASK; - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; - break; -#endif // CONFIG_EXT_REFS - case ALTREF_FRAME: ref_frame_skip_mask[0] |= ALTREF_FRAME_MODE_MASK; -#if CONFIG_EXT_REFS - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; -#endif // CONFIG_EXT_REFS - break; - case NONE_FRAME: - case TOTAL_REFS_PER_FRAME: - assert(0 && "Invalid Reference frame"); - break; - } - } - if ((ref_frame_skip_mask[0] & (1 << ref_frame)) && - (ref_frame_skip_mask[1] & (1 << AOMMAX(0, second_ref_frame)))) - continue; + init_mbmi(mbmi, mode_index, cm); -#if CONFIG_EXT_COMP_REFS -// TODO(zoeliu): Following toggle between #if 0/1 and the bug will manifest -// itself. 
-#if 0 - if (!(cpi->ref_frame_flags & flag_list[ref_frame]) || - (second_ref_frame > INTRA_FRAME && - (!(cpi->ref_frame_flags & flag_list[second_ref_frame])))) - printf("Frame=%d, bsize=%d, (mi_row,mi_col)=(%d,%d), ref_frame=%d, " - "second_ref_frame=%d\n", cm->current_video_frame, bsize, mi_row, - mi_col, ref_frame, second_ref_frame); - - if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; - if (second_ref_frame > INTRA_FRAME && - (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))) + x->skip = 0; + set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); + + if (inter_mode_search_order_independent_skip(cpi, x, bsize, mode_index, + mi_row, mi_col, mode_skip_mask, + ref_frame_skip_mask)) continue; -#endif // 0 -#if !USE_UNI_COMP_REFS - // NOTE(zoeliu): Temporarily disable uni-directional comp refs - if (second_ref_frame > INTRA_FRAME) { - if (!((ref_frame < BWDREF_FRAME) ^ (second_ref_frame < BWDREF_FRAME))) + if (ref_frame == INTRA_FRAME) { + if (sf->skip_intra_in_interframe && search_state.skip_intra_modes) continue; } - assert(second_ref_frame <= INTRA_FRAME || - ((ref_frame < BWDREF_FRAME) ^ (second_ref_frame < BWDREF_FRAME))); -#endif // !USE_UNI_COMP_REFS -#endif // CONFIG_EXT_COMP_REFS - if (mode_skip_mask[ref_frame] & (1 << this_mode)) continue; - - // Test best rd so far against threshold for trying this mode. - if (best_mode_skippable && sf->schedule_mode_search) - mode_threshold[mode_index] <<= 1; - - if (best_rd < mode_threshold[mode_index]) continue; - - // This is only used in motion vector unit test. - if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue; - -#if CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS // Changes LL bitstream -#if CONFIG_EXT_REFS - if (cpi->oxcf.pass == 0) { - // Complexity-compression trade-offs - // if (ref_frame == ALTREF_FRAME) continue; - // if (ref_frame == BWDREF_FRAME) continue; - if (second_ref_frame == ALTREF_FRAME) continue; - // if (second_ref_frame == BWDREF_FRAME) continue; + if (sf->drop_ref) { + if (ref_frame > INTRA_FRAME && second_ref_frame > INTRA_FRAME) { + if (search_state.num_available_refs > 2) { + if ((ref_frame == search_state.dist_order_refs[0] && + second_ref_frame == search_state.dist_order_refs[1]) || + (ref_frame == search_state.dist_order_refs[1] && + second_ref_frame == search_state.dist_order_refs[0])) + continue; + } + } } -#endif // CONFIG_EXT_REFS -#endif // CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS - comp_pred = second_ref_frame > INTRA_FRAME; - if (comp_pred) { - if (!cpi->allow_comp_inter_inter) continue; - // Skip compound inter modes if ARF is not available. - if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; + if (search_state.best_rd < search_state.mode_threshold[mode_index]) + continue; - // Do not allow compound prediction if the segment level reference frame - // feature is in use as in this case there can only be one reference. - if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue; + const int comp_pred = second_ref_frame > INTRA_FRAME; + const int ref_frame_cost = comp_pred + ? ref_costs_comp[ref_frame][second_ref_frame] + : ref_costs_single[ref_frame]; + const int compmode_cost = + is_comp_ref_allowed(mbmi->sb_type) ? comp_inter_cost[comp_pred] : 0; + const int real_compmode_cost = + cm->reference_mode == REFERENCE_MODE_SELECT ? 
compmode_cost : 0; - if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) && - best_mode_index >= 0 && best_mbmode.ref_frame[0] == INTRA_FRAME) + if (comp_pred) { + if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) && + search_state.best_mode_index >= 0 && + search_state.best_mbmode.ref_frame[0] == INTRA_FRAME) continue; - - mode_excluded = cm->reference_mode == SINGLE_REFERENCE; - } else { - if (ref_frame != INTRA_FRAME) - mode_excluded = cm->reference_mode == COMPOUND_REFERENCE; } if (ref_frame == INTRA_FRAME) { if (sf->adaptive_mode_search) - if ((x->source_variance << num_pels_log2_lookup[bsize]) > best_pred_sse) + if ((x->source_variance << num_pels_log2_lookup[bsize]) > + search_state.best_pred_sse) continue; if (this_mode != DC_PRED) { - // Disable intra modes other than DC_PRED for blocks with low variance - // Threshold for intra skipping based on source variance - // TODO(debargha): Specialize the threshold for super block sizes - const unsigned int skip_intra_var_thresh = 64; - if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) && - x->source_variance < skip_intra_var_thresh) - continue; // Only search the oblique modes if the best so far is // one of the neighboring directional modes - if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && - (this_mode >= D45_PRED && this_mode <= TM_PRED)) { - if (best_mode_index >= 0 && best_mbmode.ref_frame[0] > INTRA_FRAME) + if ((sf->mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && + (this_mode >= D45_PRED && this_mode <= PAETH_PRED)) { + if (search_state.best_mode_index >= 0 && + search_state.best_mbmode.ref_frame[0] > INTRA_FRAME) continue; } - if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { - if (conditional_skipintra(this_mode, best_intra_mode)) continue; + if (sf->mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { + if (conditional_skipintra(this_mode, search_state.best_intra_mode)) + continue; } } -#if CONFIG_GLOBAL_MOTION - } else if (cm->global_motion[ref_frame].wmtype == IDENTITY && - (!comp_pred || - cm->global_motion[second_ref_frame].wmtype == IDENTITY)) { -#else // CONFIG_GLOBAL_MOTION - } else { -#endif // CONFIG_GLOBAL_MOTION - const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, second_ref_frame }; - if (!check_best_zero_mv(cpi, x, mbmi_ext->mode_context, - mbmi_ext->compound_mode_context, frame_mv, - this_mode, ref_frames, bsize, -1, mi_row, mi_col)) - continue; } - mbmi->mode = this_mode; - mbmi->uv_mode = UV_DC_PRED; - mbmi->ref_frame[0] = ref_frame; - mbmi->ref_frame[1] = second_ref_frame; - pmi->palette_size[0] = 0; - pmi->palette_size[1] = 0; -#if CONFIG_FILTER_INTRA - mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; - mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; -#endif // CONFIG_FILTER_INTRA - // Evaluate all sub-pel filters irrespective of whether we can use - // them for this frame. - - set_default_interp_filters(mbmi, cm->interp_filter); - - mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0; - mbmi->motion_mode = SIMPLE_TRANSLATION; - - x->skip = 0; - set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); - // Select prediction reference frames. 
- for (i = 0; i < MAX_MB_PLANE; i++) { + for (i = 0; i < num_planes; i++) { xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; } -#if CONFIG_COMPOUND_SINGLEREF - // Single ref compound mode - if (!comp_pred && is_inter_singleref_comp_mode(mbmi->mode)) { - xd->block_refs[1] = xd->block_refs[0]; - for (i = 0; i < MAX_MB_PLANE; i++) - xd->plane[i].pre[1] = xd->plane[i].pre[0]; - } -#endif // CONFIG_COMPOUND_SINGLEREF - -#if CONFIG_INTERINTRA - mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1); -#endif // CONFIG_INTERINTRA - if (ref_frame == INTRA_FRAME) { - RD_STATS rd_stats_y; - TX_SIZE uv_tx; - struct macroblockd_plane *const pd = &xd->plane[1]; -#if CONFIG_EXT_INTRA - is_directional_mode = av1_is_directional_mode(mbmi->mode, bsize); - if (is_directional_mode && av1_use_angle_delta(bsize)) { - int rate_dummy; - int64_t model_rd = INT64_MAX; - if (!angle_stats_ready) { - const int src_stride = x->plane[0].src.stride; - const uint8_t *src = x->plane[0].src.buf; -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - highbd_angle_estimation(src, src_stride, rows, cols, bsize, - directional_mode_skip_mask); - else -#endif // CONFIG_HIGHBITDEPTH - angle_estimation(src, src_stride, rows, cols, bsize, - directional_mode_skip_mask); - angle_stats_ready = 1; - } - if (directional_mode_skip_mask[mbmi->mode]) continue; - rd_stats_y.rate = INT_MAX; - rd_pick_intra_angle_sby(cpi, x, &rate_dummy, &rd_stats_y, bsize, - intra_mode_cost[mbmi->mode], best_rd, - &model_rd); - } else { - mbmi->angle_delta[0] = 0; - super_block_yrd(cpi, x, &rd_stats_y, bsize, best_rd); - } -#else - super_block_yrd(cpi, x, &rd_stats_y, bsize, best_rd); -#endif // CONFIG_EXT_INTRA - rate_y = rd_stats_y.rate; - distortion_y = rd_stats_y.dist; - skippable = rd_stats_y.skip; - - if (rate_y == INT_MAX) continue; - -#if CONFIG_FILTER_INTRA - if (mbmi->mode == DC_PRED) dc_skipped = 0; -#endif // CONFIG_FILTER_INTRA - - uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][pd->subsampling_x] - [pd->subsampling_y]; - if (rate_uv_intra[uv_tx] == INT_MAX) { - choose_intra_uv_mode(cpi, x, bsize, uv_tx, &rate_uv_intra[uv_tx], - &rate_uv_tokenonly[uv_tx], &dist_uvs[uv_tx], - &skip_uvs[uv_tx], &mode_uv[uv_tx]); - if (try_palette) pmi_uv[uv_tx] = *pmi; - -#if CONFIG_EXT_INTRA - uv_angle_delta[uv_tx] = mbmi->angle_delta[1]; -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info; -#endif // CONFIG_FILTER_INTRA - } - - rate_uv = rate_uv_tokenonly[uv_tx]; - distortion_uv = dist_uvs[uv_tx]; - skippable = skippable && skip_uvs[uv_tx]; - mbmi->uv_mode = mode_uv[uv_tx]; - if (try_palette) { - pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1]; - memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, - pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE, - 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0])); - } - -#if CONFIG_EXT_INTRA - mbmi->angle_delta[1] = uv_angle_delta[uv_tx]; -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = - filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]; - if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) { - mbmi->filter_intra_mode_info.filter_intra_mode[1] = - filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1]; - } -#endif // CONFIG_FILTER_INTRA - -#if CONFIG_CB4X4 - rate2 = rate_y + intra_mode_cost[mbmi->mode]; - if (!x->skip_chroma_rd) - rate2 += rate_uv + 
x->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode]; -#else - rate2 = rate_y + intra_mode_cost[mbmi->mode] + rate_uv + - x->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode]; -#endif // CONFIG_CB4X4 - - if (try_palette && mbmi->mode == DC_PRED) { - rate2 += av1_cost_bit( - av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], 0); - } - - if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) { - // super_block_yrd above includes the cost of the tx_size in the - // tokenonly rate, but for intra blocks, tx_size is always coded - // (prediction granularity), so we account for it in the full rate, - // not the tokenonly rate. - rate_y -= tx_size_cost(cpi, x, bsize, mbmi->tx_size); - } -#if CONFIG_EXT_INTRA - if (is_directional_mode) { -#if CONFIG_INTRA_INTERP - const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd); - const int p_angle = - mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP; - if (av1_is_intra_filter_switchable(p_angle)) - rate2 += x->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter]; -#endif // CONFIG_INTRA_INTERP - if (av1_use_angle_delta(bsize)) { - rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, - MAX_ANGLE_DELTA + mbmi->angle_delta[0]); - } - } - if (av1_is_directional_mode(get_uv_mode(mbmi->uv_mode), bsize) && - av1_use_angle_delta(bsize)) { - rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, - MAX_ANGLE_DELTA + mbmi->angle_delta[1]); - } -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - if (mbmi->mode == DC_PRED) { - rate2 += - av1_cost_bit(cm->fc->filter_intra_probs[0], - mbmi->filter_intra_mode_info.use_filter_intra_mode[0]); - if (mbmi->filter_intra_mode_info.use_filter_intra_mode[0]) { - rate2 += write_uniform_cost( - FILTER_INTRA_MODES, - mbmi->filter_intra_mode_info.filter_intra_mode[0]); - } + RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv; + const int ret = handle_intra_mode( + &search_state, cpi, x, bsize, ref_frame_cost, ctx, disable_skip, + &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv); + if (!ret) { + continue; } - if (mbmi->uv_mode == UV_DC_PRED) { - rate2 += - av1_cost_bit(cpi->common.fc->filter_intra_probs[1], - mbmi->filter_intra_mode_info.use_filter_intra_mode[1]); - if (mbmi->filter_intra_mode_info.use_filter_intra_mode[1]) - rate2 += write_uniform_cost( - FILTER_INTRA_MODES, - mbmi->filter_intra_mode_info.filter_intra_mode[1]); - } -#endif // CONFIG_FILTER_INTRA - if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED) - rate2 += intra_cost_penalty; - distortion2 = distortion_y + distortion_uv; + rate2 = intra_rd_stats.rate; + distortion2 = intra_rd_stats.dist; + this_rd = RDCOST(x->rdmult, rate2, distortion2); + skippable = intra_rd_stats.skip; + rate_y = intra_rd_stats_y.rate; } else { - int_mv backup_ref_mv[2]; - - if (!is_comp_ref_allowed(bsize) && mbmi->ref_frame[1] > INTRA_FRAME) - continue; - - backup_ref_mv[0] = mbmi_ext->ref_mvs[ref_frame][0]; - if (comp_pred) backup_ref_mv[1] = mbmi_ext->ref_mvs[second_ref_frame][0]; -#if CONFIG_INTERINTRA - if (second_ref_frame == INTRA_FRAME) { - if (best_single_inter_ref != ref_frame) continue; - mbmi->interintra_mode = intra_to_interintra_mode[best_intra_mode]; -// TODO(debargha|geza.lore): -// Should we use ext_intra modes for interintra? 
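Every branch of the mode loop eventually funnels into the same Lagrangian comparison, this_rd = RDCOST(x->rdmult, rate, dist), i.e. J = lambda*R + D in fixed point. The sketch below shows the shape of the skip/no-skip decision that the rate bookkeeping implements; the scaling constants and all the stats are illustrative, not libaom's actual definitions:

#include <stdint.h>
#include <stdio.h>

/* Illustrative J = lambda*R + D in integer arithmetic; libaom's RDCOST
 * macro uses its own shifts and rounding. */
static int64_t rd(int64_t rdmult, int64_t rate, int64_t dist) {
  return ((rate * rdmult) >> 9) + (dist << 4);
}

int main(void) {
  const int64_t rdmult = 128;                          /* hypothetical */
  const int64_t rate_coefs = 900, dist_coefs = 5000;   /* code coeffs */
  const int64_t rate_skip = 40, dist_sse = 9000;       /* signal skip */
  const int64_t rd_code = rd(rdmult, rate_coefs, dist_coefs);
  const int64_t rd_skip = rd(rdmult, rate_skip, dist_sse);
  printf("code=%lld skip=%lld -> %s wins\n", (long long)rd_code,
         (long long)rd_skip, rd_code < rd_skip ? "coding" : "skipping");
  return 0;
}

When skipping wins, the encoder backs the coefficient rate out of the total and charges the skip flag instead, which is the rate -= (rate_y + rate_uv) adjustment seen in handle_intra_mode earlier in this hunk and in the inter path below.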
-#if CONFIG_EXT_INTRA - mbmi->angle_delta[0] = 0; - mbmi->angle_delta[1] = 0; -#if CONFIG_INTRA_INTERP - mbmi->intra_filter = INTRA_FILTER_LINEAR; -#endif // CONFIG_INTRA_INTERP -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; - mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; -#endif // CONFIG_FILTER_INTRA - } -#endif // CONFIG_INTERINTRA + mbmi->angle_delta[PLANE_TYPE_Y] = 0; + mbmi->angle_delta[PLANE_TYPE_UV] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; mbmi->ref_mv_idx = 0; ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); - - if (comp_pred) { - if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) { - int ref_mv_idx = 0; - // Special case: NEAR_NEWMV and NEW_NEARMV modes use - // 1 + mbmi->ref_mv_idx (like NEARMV) instead of - // mbmi->ref_mv_idx (like NEWMV) - if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) - ref_mv_idx = 1; - - if (compound_ref0_mode(mbmi->mode) == NEWMV) { - int_mv this_mv = - mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; - clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; - } - if (compound_ref1_mode(mbmi->mode) == NEWMV) { - int_mv this_mv = - mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv; - clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv; - } - } -#if CONFIG_COMPOUND_SINGLEREF - } else if (is_inter_singleref_comp_mode(mbmi->mode)) { - if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) { - // TODO(zoeliu): To further investigate which ref_mv_idx should be - // chosen for the mode of SR_NEAR_NEWMV. - int ref_mv_idx = 0; - // Special case: SR_NEAR_NEWMV mode use - // 1 + mbmi->ref_mv_idx (like NEARMV) instead of - // mbmi->ref_mv_idx (like NEWMV) - if (mbmi->mode == SR_NEAR_NEWMV) ref_mv_idx = 1; - - if (compound_ref0_mode(mbmi->mode) == NEWMV || - compound_ref1_mode(mbmi->mode) == NEWMV) { - int_mv this_mv = - mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; - clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; - } - } -#endif // CONFIG_COMPOUND_SINGLEREF - } else { - if (mbmi->mode == NEWMV && mbmi_ext->ref_mv_count[ref_frame_type] > 1) { - int ref; - for (ref = 0; ref < 1 + comp_pred; ++ref) { - int_mv this_mv = - (ref == 0) ? 
mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv - : mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv; - clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0] = this_mv; - } - } - } + int64_t ref_best_rd = search_state.best_rd; { RD_STATS rd_stats, rd_stats_y, rd_stats_uv; av1_init_rd_stats(&rd_stats); rd_stats.rate = rate2; // Point to variables that are maintained between loop iterations - args.single_newmv = single_newmv; - args.single_newmv_rate = single_newmv_rate; - args.modelled_rd = modelled_rd; + args.single_newmv = search_state.single_newmv[0]; + args.single_newmv_rate = search_state.single_newmv_rate[0]; + args.single_newmv_valid = search_state.single_newmv_valid[0]; + args.modelled_rd = search_state.modelled_rd; + args.single_comp_cost = real_compmode_cost; + args.ref_frame_cost = ref_frame_cost; +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y, + &rd_stats_uv, &disable_skip, mi_row, mi_col, + &args, ref_best_rd, &best_est_rd); +#else this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y, - &rd_stats_uv, &disable_skip, frame_mv, -#if CONFIG_COMPOUND_SINGLEREF - frame_comp_mv, -#endif // CONFIG_COMPOUND_SINGLEREF - mi_row, mi_col, &args, best_rd); + &rd_stats_uv, &disable_skip, mi_row, mi_col, + &args, ref_best_rd); +#endif + if (this_rd < ref_best_rd) { + ref_best_rd = this_rd; + } rate2 = rd_stats.rate; skippable = rd_stats.skip; distortion2 = rd_stats.dist; - total_sse = rd_stats.sse; rate_y = rd_stats_y.rate; rate_uv = rd_stats_uv.rate; } -// TODO(jingning): This needs some refactoring to improve code quality -// and reduce redundant steps. -#if CONFIG_COMPOUND_SINGLEREF - if ((have_nearmv_in_inter_mode(mbmi->mode) && - mbmi_ext->ref_mv_count[ref_frame_type] > 2) || - ((mbmi->mode == NEWMV || mbmi->mode == SR_NEW_NEWMV || - mbmi->mode == NEW_NEWMV) && - mbmi_ext->ref_mv_count[ref_frame_type] > 1)) -#else // !CONFIG_COMPOUND_SINGLEREF + // TODO(jingning): This needs some refactoring to improve code quality + // and reduce redundant steps. if ((have_nearmv_in_inter_mode(mbmi->mode) && mbmi_ext->ref_mv_count[ref_frame_type] > 2) || ((mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) && - mbmi_ext->ref_mv_count[ref_frame_type] > 1)) -#endif // CONFIG_COMPOUND_SINGLEREF - { - int_mv backup_mv = frame_mv[NEARMV][ref_frame]; + mbmi_ext->ref_mv_count[ref_frame_type] > 1)) { MB_MODE_INFO backup_mbmi = *mbmi; int backup_skip = x->skip; int64_t tmp_ref_rd = this_rd; @@ -11290,40 +10243,14 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, // TODO(jingning): This should be deprecated shortly. int idx_offset = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0; int ref_set = - AOMMIN(2, mbmi_ext->ref_mv_count[ref_frame_type] - 1 - idx_offset); - - uint8_t drl_ctx = - av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx_offset); - // Dummy - int_mv backup_fmv[2]; - backup_fmv[0] = frame_mv[NEWMV][ref_frame]; - if (comp_pred) backup_fmv[1] = frame_mv[NEWMV][second_ref_frame]; - - rate2 += (rate2 < INT_MAX ? 
x->drl_mode_cost0[drl_ctx][0] : 0); - - if (this_rd < INT64_MAX) { - if (RDCOST(x->rdmult, rate_y + rate_uv, distortion2) < - RDCOST(x->rdmult, 0, total_sse)) - tmp_ref_rd = RDCOST( - x->rdmult, rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 0), - distortion2); - else - tmp_ref_rd = - RDCOST(x->rdmult, - rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 1) - - rate_y - rate_uv, - total_sse); - } -#if CONFIG_VAR_TX - for (i = 0; i < MAX_MB_PLANE; ++i) - memcpy(x->blk_skip_drl[i], x->blk_skip[i], - sizeof(uint8_t) * ctx->num_4x4_blk); -#endif // CONFIG_VAR_TX + AOMMIN(MAX_REF_MV_SERCH - 1, + mbmi_ext->ref_mv_count[ref_frame_type] - 1 - idx_offset); + memcpy(x->blk_skip_drl, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); for (ref_idx = 0; ref_idx < ref_set; ++ref_idx) { int64_t tmp_alt_rd = INT64_MAX; int dummy_disable_skip = 0; - int ref; int_mv cur_mv; RD_STATS tmp_rd_stats, tmp_rd_stats_y, tmp_rd_stats_uv; @@ -11333,80 +10260,19 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, mbmi->ref_mv_idx = 1 + ref_idx; - if (comp_pred) { - int ref_mv_idx = mbmi->ref_mv_idx; - // Special case: NEAR_NEWMV and NEW_NEARMV modes use - // 1 + mbmi->ref_mv_idx (like NEARMV) instead of - // mbmi->ref_mv_idx (like NEWMV) - if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) - ref_mv_idx = 1 + mbmi->ref_mv_idx; - - if (compound_ref0_mode(mbmi->mode) == NEWMV) { - int_mv this_mv = - mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; - clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; - } else if (compound_ref0_mode(mbmi->mode) == NEARESTMV) { - int_mv this_mv = - mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv; - clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; - } - - if (compound_ref1_mode(mbmi->mode) == NEWMV) { - int_mv this_mv = - mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv; - clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv; - } else if (compound_ref1_mode(mbmi->mode) == NEARESTMV) { - int_mv this_mv = - mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv; - clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv; - } -#if CONFIG_COMPOUND_SINGLEREF - } else if (is_inter_singleref_comp_mode(mbmi->mode)) { - int ref_mv_idx = mbmi->ref_mv_idx; - // Special case: SR_NEAR_NEWMV mode use - // 1 + mbmi->ref_mv_idx (like NEARMV) instead of - // mbmi->ref_mv_idx (like NEWMV) - if (mbmi->mode == SR_NEAR_NEWMV) ref_mv_idx = 1 + mbmi->ref_mv_idx; - - // TODO(zoeliu): For the mode of SR_NEAREST_NEWMV, as it only runs - // the "if", not the "else if", - // mbmi_ext->ref_mvs[mbmi->ref_frame[0]] takes the - // value for "NEWMV", instead of "NEARESTMV". 
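The ref_set loop in this hunk re-evaluates the current inter mode once per additional motion-vector candidate in the dynamic reference list (DRL), charging each candidate its own signaling rate and keeping the cheapest result. Stripped of encoder state, the search has this shape (indices and costs hypothetical):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int ref_set = 3;            /* extra candidates beyond idx 0 */
  /* Hypothetical per-candidate RD totals and DRL signaling costs. */
  const int64_t rd_by_idx[4] = { 1000, 940, 980, 2000 };
  const int64_t drl_bit_cost[4] = { 0, 30, 60, 90 };
  int best_idx = 0;
  int64_t best_rd = rd_by_idx[0] + drl_bit_cost[0];
  for (int idx = 1; idx <= ref_set; ++idx) {
    const int64_t this_rd = rd_by_idx[idx] + drl_bit_cost[idx];
    if (this_rd < best_rd) { best_rd = this_rd; best_idx = idx; }
  }
  printf("best ref_mv_idx = %d (rd = %lld)\n", best_idx, (long long)best_rd);
  return 0;
}

The backup_mbmi/backup_skip copies around the real loop serve the same purpose as best_idx here: when a candidate loses, the encoder restores the winning state before trying the next one.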
- if (compound_ref0_mode(mbmi->mode) == NEWMV || - compound_ref1_mode(mbmi->mode) == NEWMV) { - int_mv this_mv = - mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; - clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; - } else if (compound_ref0_mode(mbmi->mode) == NEARESTMV || - compound_ref1_mode(mbmi->mode) == NEARESTMV) { - int_mv this_mv = - mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv; - clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; - } -#endif // CONFIG_COMPOUND_SINGLEREF - } else { - for (ref = 0; ref < 1 + comp_pred; ++ref) { - int_mv this_mv = - (ref == 0) - ? mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx] - .this_mv - : mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx] - .comp_mv; - clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, - xd->n8_h << MI_SIZE_LOG2, xd); - mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0] = this_mv; + if (cpi->sf.reduce_inter_modes) { + if (mbmi->ref_frame[0] == LAST2_FRAME || + mbmi->ref_frame[0] == LAST3_FRAME || + mbmi->ref_frame[1] == LAST2_FRAME || + mbmi->ref_frame[1] == LAST3_FRAME) { + if (mbmi_ext + ->ref_mv_stack[ref_frame_type] + [mbmi->ref_mv_idx + idx_offset] + .weight < REF_CAT_LEVEL) { + *mbmi = backup_mbmi; + x->skip = backup_skip; + continue; + } } } @@ -11416,69 +10282,31 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, clamp_mv2(&cur_mv.as_mv, xd); if (!mv_check_bounds(&x->mv_limits, &cur_mv.as_mv)) { - int_mv dummy_single_newmv[TOTAL_REFS_PER_FRAME] = { { 0 } }; - int dummy_single_newmv_rate[TOTAL_REFS_PER_FRAME] = { 0 }; - - frame_mv[NEARMV][ref_frame] = cur_mv; av1_init_rd_stats(&tmp_rd_stats); - // Point to variables that are not maintained between iterations - args.single_newmv = dummy_single_newmv; - args.single_newmv_rate = dummy_single_newmv_rate; args.modelled_rd = NULL; - tmp_alt_rd = handle_inter_mode(cpi, x, bsize, &tmp_rd_stats, - &tmp_rd_stats_y, &tmp_rd_stats_uv, - &dummy_disable_skip, frame_mv, -#if CONFIG_COMPOUND_SINGLEREF - frame_comp_mv, -#endif // CONFIG_COMPOUND_SINGLEREF - mi_row, mi_col, &args, best_rd); - // Prevent pointers from escaping local scope - args.single_newmv = NULL; - args.single_newmv_rate = NULL; - } - - for (i = 0; i < mbmi->ref_mv_idx; ++i) { - uint8_t drl1_ctx = 0; - drl1_ctx = av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], - i + idx_offset); - tmp_rd_stats.rate += - (tmp_rd_stats.rate < INT_MAX ? x->drl_mode_cost0[drl1_ctx][1] - : 0); - } - - if (mbmi_ext->ref_mv_count[ref_frame_type] > - mbmi->ref_mv_idx + idx_offset + 1 && - ref_idx < ref_set - 1) { - uint8_t drl1_ctx = - av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], - mbmi->ref_mv_idx + idx_offset); - tmp_rd_stats.rate += - (tmp_rd_stats.rate < INT_MAX ? 
x->drl_mode_cost0[drl1_ctx][0] - : 0); - } - - if (tmp_alt_rd < INT64_MAX) { -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + args.single_newmv = search_state.single_newmv[mbmi->ref_mv_idx]; + args.single_newmv_rate = + search_state.single_newmv_rate[mbmi->ref_mv_idx]; + args.single_newmv_valid = + search_state.single_newmv_valid[mbmi->ref_mv_idx]; + args.single_comp_cost = real_compmode_cost; + args.ref_frame_cost = ref_frame_cost; +#if CONFIG_COLLECT_INTER_MODE_RD_STATS tmp_alt_rd = - RDCOST(x->rdmult, tmp_rd_stats.rate, tmp_rd_stats.dist); + handle_inter_mode(cpi, x, bsize, &tmp_rd_stats, &tmp_rd_stats_y, + &tmp_rd_stats_uv, &dummy_disable_skip, mi_row, + mi_col, &args, ref_best_rd, &best_est_rd); #else - if (RDCOST(x->rdmult, tmp_rd_stats_y.rate + tmp_rd_stats_uv.rate, - tmp_rd_stats.dist) < - RDCOST(x->rdmult, 0, tmp_rd_stats.sse)) - tmp_alt_rd = - RDCOST(x->rdmult, - tmp_rd_stats.rate + - av1_cost_bit(av1_get_skip_prob(cm, xd), 0), - tmp_rd_stats.dist); - else - tmp_alt_rd = - RDCOST(x->rdmult, - tmp_rd_stats.rate + - av1_cost_bit(av1_get_skip_prob(cm, xd), 1) - - tmp_rd_stats_y.rate - tmp_rd_stats_uv.rate, - tmp_rd_stats.sse); -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + tmp_alt_rd = handle_inter_mode( + cpi, x, bsize, &tmp_rd_stats, &tmp_rd_stats_y, &tmp_rd_stats_uv, + &dummy_disable_skip, mi_row, mi_col, &args, ref_best_rd); +#endif + + // Prevent pointers from escaping local scope + args.single_newmv = search_state.single_newmv[0]; + args.single_newmv_rate = search_state.single_newmv_rate[0]; + args.single_newmv_valid = search_state.single_newmv_valid[0]; } if (tmp_ref_rd > tmp_alt_rd) { @@ -11488,192 +10316,61 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, skippable = tmp_rd_stats.skip; rate_y = tmp_rd_stats_y.rate; rate_uv = tmp_rd_stats_uv.rate; - total_sse = tmp_rd_stats.sse; this_rd = tmp_alt_rd; tmp_ref_rd = tmp_alt_rd; backup_mbmi = *mbmi; backup_skip = x->skip; -#if CONFIG_VAR_TX - for (i = 0; i < MAX_MB_PLANE; ++i) - memcpy(x->blk_skip_drl[i], x->blk_skip[i], - sizeof(uint8_t) * ctx->num_4x4_blk); -#endif // CONFIG_VAR_TX + memcpy(x->blk_skip_drl, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); } else { *mbmi = backup_mbmi; x->skip = backup_skip; } } - frame_mv[NEARMV][ref_frame] = backup_mv; - frame_mv[NEWMV][ref_frame] = backup_fmv[0]; - if (comp_pred) frame_mv[NEWMV][second_ref_frame] = backup_fmv[1]; -#if CONFIG_VAR_TX - for (i = 0; i < MAX_MB_PLANE; ++i) - memcpy(x->blk_skip[i], x->blk_skip_drl[i], - sizeof(uint8_t) * ctx->num_4x4_blk); -#endif // CONFIG_VAR_TX + memcpy(x->blk_skip, x->blk_skip_drl, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); } - mbmi_ext->ref_mvs[ref_frame][0] = backup_ref_mv[0]; - if (comp_pred) mbmi_ext->ref_mvs[second_ref_frame][0] = backup_ref_mv[1]; - if (this_rd == INT64_MAX) continue; - if (is_comp_ref_allowed(mbmi->sb_type)) - compmode_cost = av1_cost_bit(comp_mode_p, comp_pred); - - if (cm->reference_mode == REFERENCE_MODE_SELECT) rate2 += compmode_cost; - } - - // Estimate the reference frame signaling cost and add it - // to the rolling cost variable. 
- if (comp_pred) { -#if CONFIG_EXT_COMP_REFS - rate2 += ref_costs_comp[ref_frame][second_ref_frame]; -#else // !CONFIG_EXT_COMP_REFS - rate2 += ref_costs_comp[ref_frame]; -#if CONFIG_EXT_REFS - rate2 += ref_costs_comp[second_ref_frame]; -#endif // CONFIG_EXT_REFS -#endif // CONFIG_EXT_COMP_REFS - } else { - rate2 += ref_costs_single[ref_frame]; - } - -#if CONFIG_COMPOUND_SINGLEREF - // Add the cost to signal single/comp mode in single ref. - if (!comp_pred && cm->reference_mode != COMPOUND_REFERENCE) { - aom_prob singleref_comp_mode_p = av1_get_inter_mode_prob(cm, xd); - rate2 += av1_cost_bit(singleref_comp_mode_p, - is_inter_singleref_comp_mode(mbmi->mode)); - } -#endif // CONFIG_COMPOUND_SINGLEREF - -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - if (ref_frame == INTRA_FRAME) -#else - if (!disable_skip) -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - { - if (skippable) { - // Back out the coefficient coding costs - rate2 -= (rate_y + rate_uv); - rate_y = 0; - rate_uv = 0; - // Cost the skip mb case - rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); - } else if (ref_frame != INTRA_FRAME && !xd->lossless[mbmi->segment_id]) { - if (RDCOST(x->rdmult, rate_y + rate_uv + rate_skip0, distortion2) < - RDCOST(x->rdmult, rate_skip1, total_sse)) { - // Add in the cost of the no skip flag. - rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); - } else { - // FIXME(rbultje) make this work for splitmv also - rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); - distortion2 = total_sse; - assert(total_sse >= 0); - rate2 -= (rate_y + rate_uv); - this_skip2 = 1; - rate_y = 0; - rate_uv = 0; - } - } else { - // Add in the cost of the no skip flag. - rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); - } - - // Calculate the final RD estimate for this mode. - this_rd = RDCOST(x->rdmult, rate2, distortion2); -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - } else { this_skip2 = mbmi->skip; this_rd = RDCOST(x->rdmult, rate2, distortion2); if (this_skip2) { rate_y = 0; rate_uv = 0; } -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - } - - if (ref_frame == INTRA_FRAME) { - // Keep record of best intra rd - if (this_rd < best_intra_rd) { - best_intra_rd = this_rd; - best_intra_mode = mbmi->mode; - } -#if CONFIG_INTERINTRA - } else if (second_ref_frame == NONE_FRAME) { - if (this_rd < best_single_inter_rd) { - best_single_inter_rd = this_rd; - best_single_inter_ref = mbmi->ref_frame[0]; - } -#endif // CONFIG_INTERINTRA - } - - if (!disable_skip && ref_frame == INTRA_FRAME) { - for (i = 0; i < REFERENCE_MODES; ++i) - best_pred_rd[i] = AOMMIN(best_pred_rd[i], this_rd); } // Did this mode help.. i.e. 
is it the new best mode - if (this_rd < best_rd || x->skip) { + if (this_rd < search_state.best_rd || x->skip) { + int mode_excluded = 0; + if (comp_pred) { + mode_excluded = cm->reference_mode == SINGLE_REFERENCE; + } if (!mode_excluded) { // Note index of best mode so far - best_mode_index = mode_index; + search_state.best_mode_index = mode_index; if (ref_frame == INTRA_FRAME) { /* required for left and above block mv */ mbmi->mv[0].as_int = 0; } else { - best_pred_sse = x->pred_sse[ref_frame]; + search_state.best_pred_sse = x->pred_sse[ref_frame]; } rd_cost->rate = rate2; -#if CONFIG_SUPERTX - if (x->skip) - *returnrate_nocoef = rate2; - else - *returnrate_nocoef = rate2 - rate_y - rate_uv; - *returnrate_nocoef -= av1_cost_bit( - av1_get_skip_prob(cm, xd), disable_skip || skippable || this_skip2); - *returnrate_nocoef -= av1_cost_bit(av1_get_intra_inter_prob(cm, xd), - mbmi->ref_frame[0] != INTRA_FRAME); -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION -#if CONFIG_WARPED_MOTION - set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); -#endif -#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION - MODE_INFO *const mi = xd->mi[0]; - const MOTION_MODE motion_allowed = motion_mode_allowed( -#if CONFIG_GLOBAL_MOTION - 0, xd->global_motion, -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_WARPED_MOTION - xd, -#endif - mi); - if (motion_allowed == WARPED_CAUSAL) - *returnrate_nocoef -= x->motion_mode_cost[bsize][mbmi->motion_mode]; - else if (motion_allowed == OBMC_CAUSAL) - *returnrate_nocoef -= x->motion_mode_cost1[bsize][mbmi->motion_mode]; -#else - *returnrate_nocoef -= x->motion_mode_cost[bsize][mbmi->motion_mode]; -#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION -#endif // CONFIG_SUPERTX rd_cost->dist = distortion2; rd_cost->rdcost = this_rd; - best_rd = this_rd; - best_mbmode = *mbmi; - best_skip2 = this_skip2; - best_mode_skippable = skippable; - best_rate_y = rate_y + av1_cost_bit(av1_get_skip_prob(cm, xd), - this_skip2 || skippable); - best_rate_uv = rate_uv; -#if CONFIG_VAR_TX - for (i = 0; i < MAX_MB_PLANE; ++i) - memcpy(ctx->blk_skip[i], x->blk_skip[i], - sizeof(uint8_t) * ctx->num_4x4_blk); -#endif // CONFIG_VAR_TX + search_state.best_rd = this_rd; + search_state.best_mbmode = *mbmi; + search_state.best_skip2 = this_skip2; + search_state.best_mode_skippable = skippable; + search_state.best_rate_y = + rate_y + + x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable]; + search_state.best_rate_uv = rate_uv; + memcpy(ctx->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); } } @@ -11693,458 +10390,136 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, hybrid_rd = RDCOST(x->rdmult, hybrid_rate, distortion2); if (!comp_pred) { - if (single_rd < best_pred_rd[SINGLE_REFERENCE]) - best_pred_rd[SINGLE_REFERENCE] = single_rd; + if (single_rd < search_state.best_pred_rd[SINGLE_REFERENCE]) + search_state.best_pred_rd[SINGLE_REFERENCE] = single_rd; } else { - if (single_rd < best_pred_rd[COMPOUND_REFERENCE]) - best_pred_rd[COMPOUND_REFERENCE] = single_rd; + if (single_rd < search_state.best_pred_rd[COMPOUND_REFERENCE]) + search_state.best_pred_rd[COMPOUND_REFERENCE] = single_rd; } - if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT]) - best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd; + if (hybrid_rd < search_state.best_pred_rd[REFERENCE_MODE_SELECT]) + search_state.best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd; } - if (x->skip && !comp_pred) break; - } - - if 
(xd->lossless[mbmi->segment_id] == 0 && best_mode_index >= 0 && - ((sf->tx_type_search.fast_inter_tx_type_search == 1 && - is_inter_mode(best_mbmode.mode)) || - (sf->tx_type_search.fast_intra_tx_type_search == 1 && - !is_inter_mode(best_mbmode.mode)))) { - int skip_blk = 0; - RD_STATS rd_stats_y, rd_stats_uv; - - x->use_default_inter_tx_type = 0; - x->use_default_intra_tx_type = 0; - - *mbmi = best_mbmode; - - set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); - - // Select prediction reference frames. - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; - if (has_second_ref(mbmi)) - xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; - } + if (sf->drop_ref) { + if (second_ref_frame == NONE_FRAME) { + const int idx = ref_frame - LAST_FRAME; + if (idx && distortion2 > search_state.dist_refs[idx]) { + search_state.dist_refs[idx] = distortion2; + search_state.dist_order_refs[idx] = ref_frame; + } -#if CONFIG_COMPOUND_SINGLEREF - // Single ref compound mode - if (!has_second_ref(mbmi) && is_inter_singleref_comp_mode(mbmi->mode)) { - xd->block_refs[1] = xd->block_refs[0]; - for (i = 0; i < MAX_MB_PLANE; i++) - xd->plane[i].pre[1] = xd->plane[i].pre[0]; - } -#endif // CONFIG_COMPOUND_SINGLEREF + // Reach the last single ref prediction mode + if (ref_frame == ALTREF_FRAME && this_mode == GLOBALMV) { + // bubble sort dist_refs and the order index + for (i = 0; i < REF_FRAMES; ++i) { + for (k = i + 1; k < REF_FRAMES; ++k) { + if (search_state.dist_refs[i] < search_state.dist_refs[k]) { + int64_t tmp_dist = search_state.dist_refs[i]; + search_state.dist_refs[i] = search_state.dist_refs[k]; + search_state.dist_refs[k] = tmp_dist; + + int tmp_idx = search_state.dist_order_refs[i]; + search_state.dist_order_refs[i] = + search_state.dist_order_refs[k]; + search_state.dist_order_refs[k] = tmp_idx; + } + } + } - if (is_inter_mode(mbmi->mode)) { - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); -#if CONFIG_MOTION_VAR - if (mbmi->motion_mode == OBMC_CAUSAL) { - av1_build_obmc_inter_prediction( - cm, xd, mi_row, mi_col, args.above_pred_buf, args.above_pred_stride, - args.left_pred_buf, args.left_pred_stride); - } -#endif // CONFIG_MOTION_VAR - av1_subtract_plane(x, bsize, 0); -#if CONFIG_VAR_TX - if (cm->tx_mode == TX_MODE_SELECT || xd->lossless[mbmi->segment_id]) { - select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - assert(rd_stats_y.rate != INT_MAX); - } else { - int idx, idy; - super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - for (idy = 0; idy < xd->n8_h; ++idy) - for (idx = 0; idx < xd->n8_w; ++idx) - mbmi->inter_tx_size[idy][idx] = mbmi->tx_size; - memset(x->blk_skip[0], rd_stats_y.skip, - sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4); + for (i = 0; i < REF_FRAMES; ++i) { + if (search_state.dist_refs[i] == -1) break; + search_state.num_available_refs = i; + } + search_state.num_available_refs++; + } } - - inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); -#else - super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); -#endif // CONFIG_VAR_TX - } else { - super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); - } - - if (RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate, - (rd_stats_y.dist + rd_stats_uv.dist)) > - RDCOST(x->rdmult, 0, (rd_stats_y.sse + rd_stats_uv.sse))) { - skip_blk = 1; - rd_stats_y.rate = av1_cost_bit(av1_get_skip_prob(cm, xd), 1); - rd_stats_uv.rate = 0; - 
rd_stats_y.dist = rd_stats_y.sse; - rd_stats_uv.dist = rd_stats_uv.sse; - } else { - skip_blk = 0; - rd_stats_y.rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); } - if (RDCOST(x->rdmult, best_rate_y + best_rate_uv, rd_cost->dist) > - RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate, - (rd_stats_y.dist + rd_stats_uv.dist))) { -#if CONFIG_VAR_TX - int idx, idy; -#endif // CONFIG_VAR_TX - best_mbmode.tx_type = mbmi->tx_type; - best_mbmode.tx_size = mbmi->tx_size; -#if CONFIG_LGT_FROM_PRED - best_mbmode.use_lgt = mbmi->use_lgt; -#endif -#if CONFIG_VAR_TX - for (idy = 0; idy < xd->n8_h; ++idy) - for (idx = 0; idx < xd->n8_w; ++idx) - best_mbmode.inter_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx]; - - for (i = 0; i < MAX_MB_PLANE; ++i) - memcpy(ctx->blk_skip[i], x->blk_skip[i], - sizeof(uint8_t) * ctx->num_4x4_blk); - - best_mbmode.min_tx_size = mbmi->min_tx_size; -#endif // CONFIG_VAR_TX - rd_cost->rate += - (rd_stats_y.rate + rd_stats_uv.rate - best_rate_y - best_rate_uv); - rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist; - rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist); - best_skip2 = skip_blk; - } + if (x->skip && !comp_pred) break; } - // Only try palette mode when the best mode so far is an intra mode. - if (try_palette && !is_inter_mode(best_mbmode.mode)) { - int rate2 = 0; -#if CONFIG_SUPERTX - int best_rate_nocoef; -#endif // CONFIG_SUPERTX - int64_t distortion2 = 0, best_rd_palette = best_rd, this_rd, - best_model_rd_palette = INT64_MAX; - int skippable = 0, rate_overhead_palette = 0; - RD_STATS rd_stats_y; - TX_SIZE uv_tx; - uint8_t *const best_palette_color_map = - x->palette_buffer->best_palette_color_map; - uint8_t *const color_map = xd->plane[0].color_index_map; - MB_MODE_INFO best_mbmi_palette = best_mbmode; + // In effect only when speed >= 2. 
+ sf_refine_fast_tx_type_search( + cpi, x, mi_row, mi_col, rd_cost, bsize, ctx, search_state.best_mode_index, + &search_state.best_mbmode, yv12_mb, search_state.best_rate_y, + search_state.best_rate_uv, &search_state.best_skip2); - mbmi->mode = DC_PRED; - mbmi->uv_mode = UV_DC_PRED; - mbmi->ref_frame[0] = INTRA_FRAME; - mbmi->ref_frame[1] = NONE_FRAME; - rate_overhead_palette = rd_pick_palette_intra_sby( - cpi, x, bsize, palette_ctx, intra_mode_cost[DC_PRED], - &best_mbmi_palette, best_palette_color_map, &best_rd_palette, - &best_model_rd_palette, NULL, NULL, NULL, NULL); - if (pmi->palette_size[0] == 0) goto PALETTE_EXIT; - memcpy(color_map, best_palette_color_map, - rows * cols * sizeof(best_palette_color_map[0])); - super_block_yrd(cpi, x, &rd_stats_y, bsize, best_rd); - if (rd_stats_y.rate == INT_MAX) goto PALETTE_EXIT; - uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][xd->plane[1].subsampling_x] - [xd->plane[1].subsampling_y]; - if (rate_uv_intra[uv_tx] == INT_MAX) { - choose_intra_uv_mode(cpi, x, bsize, uv_tx, &rate_uv_intra[uv_tx], - &rate_uv_tokenonly[uv_tx], &dist_uvs[uv_tx], - &skip_uvs[uv_tx], &mode_uv[uv_tx]); - pmi_uv[uv_tx] = *pmi; -#if CONFIG_EXT_INTRA - uv_angle_delta[uv_tx] = mbmi->angle_delta[1]; -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info; -#endif // CONFIG_FILTER_INTRA - } - mbmi->uv_mode = mode_uv[uv_tx]; - pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1]; - if (pmi->palette_size[1] > 0) { - memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, - pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE, - 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0])); - } -#if CONFIG_EXT_INTRA - mbmi->angle_delta[1] = uv_angle_delta[uv_tx]; -#endif // CONFIG_EXT_INTRA -#if CONFIG_FILTER_INTRA - mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = - filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]; - if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) { - mbmi->filter_intra_mode_info.filter_intra_mode[1] = - filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1]; - } -#endif // CONFIG_FILTER_INTRA - skippable = rd_stats_y.skip && skip_uvs[uv_tx]; - distortion2 = rd_stats_y.dist + dist_uvs[uv_tx]; - rate2 = rd_stats_y.rate + rate_overhead_palette + rate_uv_intra[uv_tx]; - rate2 += ref_costs_single[INTRA_FRAME]; - - if (skippable) { - rate2 -= (rd_stats_y.rate + rate_uv_tokenonly[uv_tx]); -#if CONFIG_SUPERTX - best_rate_nocoef = rate2; -#endif // CONFIG_SUPERTX - rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); - } else { -#if CONFIG_SUPERTX - best_rate_nocoef = rate2 - (rd_stats_y.rate + rate_uv_tokenonly[uv_tx]); -#endif // CONFIG_SUPERTX - rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); - } - this_rd = RDCOST(x->rdmult, rate2, distortion2); - if (this_rd < best_rd) { - best_mode_index = 3; - mbmi->mv[0].as_int = 0; - rd_cost->rate = rate2; -#if CONFIG_SUPERTX - *returnrate_nocoef = best_rate_nocoef; -#endif // CONFIG_SUPERTX - rd_cost->dist = distortion2; - rd_cost->rdcost = this_rd; - best_rd = this_rd; - best_mbmode = *mbmi; - best_skip2 = 0; - best_mode_skippable = skippable; - } - } -PALETTE_EXIT: - -#if CONFIG_FILTER_INTRA - // TODO(huisu): filter-intra is turned off in lossless mode for now to - // avoid a unit test failure - if (!xd->lossless[mbmi->segment_id] && pmi->palette_size[0] == 0 && - !dc_skipped && best_mode_index >= 0 && - best_intra_rd < (best_rd + (best_rd >> 3))) { - pick_filter_intra_interframe( - cpi, x, bsize, mi_row, mi_col, rate_uv_intra, rate_uv_tokenonly, - 
dist_uvs, skip_uvs, mode_uv, filter_intra_mode_info_uv, -#if CONFIG_EXT_INTRA - uv_angle_delta, -#endif // CONFIG_EXT_INTRA - pmi_uv, palette_ctx, 0, ref_costs_single, &best_rd, &best_intra_rd, - &best_intra_mode, &best_mode_index, &best_skip2, &best_mode_skippable, -#if CONFIG_SUPERTX - returnrate_nocoef, -#endif // CONFIG_SUPERTX - best_pred_rd, &best_mbmode, rd_cost); - } -#endif // CONFIG_FILTER_INTRA - -// The inter modes' rate costs are not calculated precisely in some cases. -// Therefore, sometimes, NEWMV is chosen instead of NEARESTMV, NEARMV, and -// ZEROMV. Here, checks are added for those cases, and the mode decisions -// are corrected. -#if CONFIG_COMPOUND_SINGLEREF -// NOTE: For SR_NEW_NEWMV, no need to check as the two mvs from the same ref -// are surely different from each other. -#endif // CONFIG_COMPOUND_SINGLEREF - if (best_mbmode.mode == NEWMV || best_mbmode.mode == NEW_NEWMV) { - const MV_REFERENCE_FRAME refs[2] = { best_mbmode.ref_frame[0], - best_mbmode.ref_frame[1] }; - int comp_pred_mode = refs[1] > INTRA_FRAME; - int_mv zeromv[2]; - const uint8_t rf_type = av1_ref_frame_type(best_mbmode.ref_frame); -#if CONFIG_GLOBAL_MOTION - zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[refs[0]], - cm->allow_high_precision_mv, bsize, - mi_col, mi_row, 0 -#if CONFIG_AMVR - , - cm->cur_frame_mv_precision_level -#endif - ) - .as_int; - zeromv[1].as_int = - comp_pred_mode - ? gm_get_motion_vector(&cm->global_motion[refs[1]], - cm->allow_high_precision_mv, bsize, mi_col, - mi_row, 0 -#if CONFIG_AMVR - , - cm->cur_frame_mv_precision_level -#endif - ) - .as_int - : 0; -#else - zeromv[0].as_int = 0; - zeromv[1].as_int = 0; -#endif // CONFIG_GLOBAL_MOTION - if (!comp_pred_mode) { - int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2) - ? AOMMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2) - : INT_MAX; - - for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) { - int_mv cur_mv = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv; - if (cur_mv.as_int == best_mbmode.mv[0].as_int) { - best_mbmode.mode = NEARMV; - best_mbmode.ref_mv_idx = i; - } - } - - if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int) - best_mbmode.mode = NEARESTMV; - else if (best_mbmode.mv[0].as_int == zeromv[0].as_int) - best_mbmode.mode = ZEROMV; - } else { - int_mv nearestmv[2]; - int_mv nearmv[2]; - - if (mbmi_ext->ref_mv_count[rf_type] > 1) { - nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][1].this_mv; - nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][1].comp_mv; - } else { - nearmv[0] = frame_mv[NEARMV][refs[0]]; - nearmv[1] = frame_mv[NEARMV][refs[1]]; - } - if (mbmi_ext->ref_mv_count[rf_type] >= 1) { - nearestmv[0] = mbmi_ext->ref_mv_stack[rf_type][0].this_mv; - nearestmv[1] = mbmi_ext->ref_mv_stack[rf_type][0].comp_mv; - } else { - nearestmv[0] = frame_mv[NEARESTMV][refs[0]]; - nearestmv[1] = frame_mv[NEARESTMV][refs[1]]; - } - - if (nearestmv[0].as_int == best_mbmode.mv[0].as_int && - nearestmv[1].as_int == best_mbmode.mv[1].as_int) { - best_mbmode.mode = NEAREST_NEARESTMV; - } else { - int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2) - ? 
AOMMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2) - : INT_MAX; - - for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) { - nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv; - nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][i + 1].comp_mv; - - // Try switching to the NEAR_NEARMV mode - if (nearmv[0].as_int == best_mbmode.mv[0].as_int && - nearmv[1].as_int == best_mbmode.mv[1].as_int) { - best_mbmode.mode = NEAR_NEARMV; - best_mbmode.ref_mv_idx = i; - } - } + // Only try palette mode when the best mode so far is an intra mode. + if (try_palette && !is_inter_mode(search_state.best_mbmode.mode)) { + search_palette_mode(cpi, x, rd_cost, ctx, bsize, mbmi, pmi, + ref_costs_single, &search_state); + } - if (best_mbmode.mode == NEW_NEWMV && - best_mbmode.mv[0].as_int == zeromv[0].as_int && - best_mbmode.mv[1].as_int == zeromv[1].as_int) - best_mbmode.mode = ZERO_ZEROMV; - } - } + search_state.best_mbmode.skip_mode = 0; + if (cm->skip_mode_flag && + !segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && + is_comp_ref_allowed(bsize)) { + rd_pick_skip_mode(rd_cost, &search_state, cpi, x, bsize, mi_row, mi_col, + yv12_mb); } // Make sure that the ref_mv_idx is only nonzero when we're // using a mode which can support ref_mv_idx - if (best_mbmode.ref_mv_idx != 0 && -#if CONFIG_COMPOUND_SINGLEREF - !(best_mbmode.mode == NEWMV || best_mbmode.mode == SR_NEW_NEWMV || - best_mbmode.mode == NEW_NEWMV || - have_nearmv_in_inter_mode(best_mbmode.mode))) -#else // !CONFIG_COMPOUND_SINGLEREF - !(best_mbmode.mode == NEWMV || best_mbmode.mode == NEW_NEWMV || - have_nearmv_in_inter_mode(best_mbmode.mode))) -#endif // CONFIG_COMPOUND_SINGLEREF - { - best_mbmode.ref_mv_idx = 0; - } - - if (best_mbmode.ref_frame[0] > INTRA_FRAME && - best_mbmode.ref_frame[1] <= INTRA_FRAME) { - int8_t ref_frame_type = av1_ref_frame_type(best_mbmode.ref_frame); - int16_t mode_ctx = mbmi_ext->mode_context[ref_frame_type]; - if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET)) { - int_mv zeromv; -#if CONFIG_GLOBAL_MOTION - const MV_REFERENCE_FRAME ref = best_mbmode.ref_frame[0]; - zeromv.as_int = gm_get_motion_vector(&cm->global_motion[ref], - cm->allow_high_precision_mv, bsize, - mi_col, mi_row, 0 -#if CONFIG_AMVR - , - cm->cur_frame_mv_precision_level -#endif - ) - .as_int; -#else - zeromv.as_int = 0; -#endif // CONFIG_GLOBAL_MOTION - if (best_mbmode.mv[0].as_int == zeromv.as_int) { - best_mbmode.mode = ZEROMV; - } - } + if (search_state.best_mbmode.ref_mv_idx != 0 && + !(search_state.best_mbmode.mode == NEWMV || + search_state.best_mbmode.mode == NEW_NEWMV || + have_nearmv_in_inter_mode(search_state.best_mbmode.mode))) { + search_state.best_mbmode.ref_mv_idx = 0; } - if (best_mode_index < 0 || best_rd >= best_rd_so_far) { + if (search_state.best_mode_index < 0 || + search_state.best_rd >= best_rd_so_far) { rd_cost->rate = INT_MAX; rd_cost->rdcost = INT64_MAX; return; } - assert((cm->interp_filter == SWITCHABLE) || - (cm->interp_filter == - av1_extract_interp_filter(best_mbmode.interp_filters, 0)) || - !is_inter_block(&best_mbmode)); -#if CONFIG_DUAL_FILTER - assert((cm->interp_filter == SWITCHABLE) || - (cm->interp_filter == - av1_extract_interp_filter(best_mbmode.interp_filters, 1)) || - !is_inter_block(&best_mbmode)); -#endif // CONFIG_DUAL_FILTER + assert( + (cm->interp_filter == SWITCHABLE) || + (cm->interp_filter == + av1_extract_interp_filter(search_state.best_mbmode.interp_filters, 0)) || + !is_inter_block(&search_state.best_mbmode)); + assert( + (cm->interp_filter == SWITCHABLE) || + (cm->interp_filter == + 
av1_extract_interp_filter(search_state.best_mbmode.interp_filters, 1)) || + !is_inter_block(&search_state.best_mbmode)); if (!cpi->rc.is_src_frame_alt_ref) av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact, - sf->adaptive_rd_thresh, bsize, best_mode_index); + sf->adaptive_rd_thresh, bsize, + search_state.best_mode_index); // macroblock modes - *mbmi = best_mbmode; - x->skip |= best_skip2; - -// Note: this section is needed since the mode may have been forced to -// ZEROMV by the all-zero mode handling of ref-mv. -#if CONFIG_GLOBAL_MOTION - if (mbmi->mode == ZEROMV || mbmi->mode == ZERO_ZEROMV) { -#if CONFIG_WARPED_MOTION || CONFIG_MOTION_VAR - // Correct the motion mode for ZEROMV - const MOTION_MODE last_motion_mode_allowed = - motion_mode_allowed(0, xd->global_motion, -#if CONFIG_WARPED_MOTION - xd, -#endif - xd->mi[0]); - if (mbmi->motion_mode > last_motion_mode_allowed) - mbmi->motion_mode = last_motion_mode_allowed; -#endif // CONFIG_WARPED_MOTION || CONFIG_MOTION_VAR - - // Correct the interpolation filter for ZEROMV - if (is_nontrans_global_motion(xd)) { - mbmi->interp_filters = av1_broadcast_interp_filter( - av1_unswitchable_filter(cm->interp_filter)); + *mbmi = search_state.best_mbmode; + x->skip |= search_state.best_skip2; + + // Note: this section is needed since the mode may have been forced to + // GLOBALMV by the all-zero mode handling of ref-mv. + if (mbmi->mode == GLOBALMV || mbmi->mode == GLOBAL_GLOBALMV) { + // Correct the interp filters for GLOBALMV + if (is_nontrans_global_motion(xd, xd->mi[0])) { + assert(mbmi->interp_filters == + av1_broadcast_interp_filter( + av1_unswitchable_filter(cm->interp_filter))); } } -#endif // CONFIG_GLOBAL_MOTION - - for (i = 0; i < 1 + has_second_ref(mbmi); ++i) { - if (mbmi->mode != NEWMV) - mbmi->pred_mv[i].as_int = mbmi->mv[i].as_int; - else - mbmi->pred_mv[i].as_int = mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_int; - } for (i = 0; i < REFERENCE_MODES; ++i) { - if (best_pred_rd[i] == INT64_MAX) - best_pred_diff[i] = INT_MIN; + if (search_state.best_pred_rd[i] == INT64_MAX) + search_state.best_pred_diff[i] = INT_MIN; else - best_pred_diff[i] = best_rd - best_pred_rd[i]; + search_state.best_pred_diff[i] = + search_state.best_rd - search_state.best_pred_rd[i]; } - x->skip |= best_mode_skippable; + x->skip |= search_state.best_mode_skippable; - assert(best_mode_index >= 0); + assert(search_state.best_mode_index >= 0); - store_coding_context(x, ctx, best_mode_index, best_pred_diff, - best_mode_skippable); + store_coding_context(x, ctx, search_state.best_mode_index, + search_state.best_pred_diff, + search_state.best_mode_skippable); if (pmi->palette_size[1] > 0) { assert(try_palette); @@ -12160,18 +10535,14 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, int64_t best_rd_so_far) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; unsigned char segment_id = mbmi->segment_id; const int comp_pred = 0; int i; int64_t best_pred_diff[REFERENCE_MODES]; - unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME]; -#if CONFIG_EXT_COMP_REFS - unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME][TOTAL_REFS_PER_FRAME]; -#else - unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME]; -#endif // CONFIG_EXT_COMP_REFS - aom_prob comp_mode_p; + unsigned int ref_costs_single[REF_FRAMES]; + unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES]; + int *comp_inter_cost = x->comp_inter_cost[av1_get_reference_mode_context(xd)]; InterpFilter 
best_filter = SWITCHABLE; int64_t this_rd = INT64_MAX; int rate2 = 0; @@ -12179,12 +10550,13 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, (void)mi_row; (void)mi_col; - estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp, - &comp_mode_p); + av1_collect_neighbors_ref_counts(xd); - for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) x->pred_sse[i] = INT_MAX; - for (i = LAST_FRAME; i < TOTAL_REFS_PER_FRAME; ++i) - x->pred_mv_sad[i] = INT_MAX; + estimate_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single, + ref_costs_comp); + + for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX; + for (i = LAST_FRAME; i < REF_FRAMES; ++i) x->pred_mv_sad[i] = INT_MAX; rd_cost->rate = INT_MAX; @@ -12192,58 +10564,35 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, mbmi->palette_mode_info.palette_size[0] = 0; mbmi->palette_mode_info.palette_size[1] = 0; - -#if CONFIG_FILTER_INTRA - mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; - mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; -#endif // CONFIG_FILTER_INTRA - mbmi->mode = ZEROMV; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + mbmi->mode = GLOBALMV; mbmi->motion_mode = SIMPLE_TRANSLATION; mbmi->uv_mode = UV_DC_PRED; - mbmi->ref_frame[0] = LAST_FRAME; + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) + mbmi->ref_frame[0] = get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME); + else + mbmi->ref_frame[0] = LAST_FRAME; mbmi->ref_frame[1] = NONE_FRAME; -#if CONFIG_GLOBAL_MOTION mbmi->mv[0].as_int = gm_get_motion_vector(&cm->global_motion[mbmi->ref_frame[0]], - cm->allow_high_precision_mv, bsize, mi_col, mi_row, 0 -#if CONFIG_AMVR - , - cm->cur_frame_mv_precision_level -#endif - ) + cm->allow_high_precision_mv, bsize, mi_col, mi_row, + cm->cur_frame_force_integer_mv) .as_int; -#else // CONFIG_GLOBAL_MOTION - mbmi->mv[0].as_int = 0; -#endif // CONFIG_GLOBAL_MOTION mbmi->tx_size = max_txsize_lookup[bsize]; x->skip = 1; mbmi->ref_mv_idx = 0; - mbmi->pred_mv[0].as_int = 0; -#if CONFIG_LGT_FROM_PRED - mbmi->use_lgt = 0; -#endif mbmi->motion_mode = SIMPLE_TRANSLATION; -#if CONFIG_MOTION_VAR av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col); -#endif -#if CONFIG_WARPED_MOTION if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) { int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; -#if WARPED_MOTION_SORT_SAMPLES - int pts_mv[SAMPLES_ARRAY_SIZE]; - mbmi->num_proj_ref[0] = - findSamples(cm, xd, mi_row, mi_col, pts, pts_inref, pts_mv); - // Rank the samples by motion vector difference - if (mbmi->num_proj_ref[0] > 1) - mbmi->num_proj_ref[0] = sortSamples(pts_mv, &mbmi->mv[0].as_mv, pts, - pts_inref, mbmi->num_proj_ref[0]); -#else mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref); -#endif // WARPED_MOTION_SORT_SAMPLES + // Select the samples according to motion vector difference + if (mbmi->num_proj_ref[0] > 1) + mbmi->num_proj_ref[0] = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref, + mbmi->num_proj_ref[0], bsize); } -#endif set_default_interp_filters(mbmi, cm->interp_filter); @@ -12270,7 +10619,7 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, rate2 += av1_get_switchable_rate(cm, x, xd); if (cm->reference_mode == REFERENCE_MODE_SELECT) - rate2 += av1_cost_bit(comp_mode_p, comp_pred); + rate2 += comp_inter_cost[comp_pred]; // Estimate the reference frame signaling cost and add it // to the rolling cost variable. 
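In the seg-skip path above, the mode's rate is accumulated one syntax element at a time before a single RD evaluation at the end. A toy version of that rolling accumulation, with hypothetical costs in the encoder's rate units:

#include <stdio.h>

int main(void) {
  int rate2 = 0;
  rate2 += 55; /* switchable interp filter, cf. av1_get_switchable_rate() */
  rate2 += 12; /* single-vs-compound flag, cf. comp_inter_cost[comp_pred] */
  rate2 += 25; /* reference frame signaling, cf. ref_costs_single[...] */
  /* Final Lagrangian cost; scaling illustrative, not libaom's RDCOST. */
  const long long rdmult = 128, distortion2 = 4096;
  const long long this_rd =
      ((long long)rate2 * rdmult >> 9) + (distortion2 << 4);
  printf("rate = %d, rd = %lld\n", rate2, this_rd);
  return 0;
}

Because this path forces a single GLOBALMV prediction, no competing modes are searched; the accumulated rate and the prediction distortion feed one RDCOST call whose result is stored directly in rd_cost.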
@@ -12292,15 +10641,13 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, av1_extract_interp_filter(mbmi->interp_filters, 0))); av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact, - cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV); + cpi->sf.adaptive_rd_thresh, bsize, THR_GLOBALMV); av1_zero(best_pred_diff); - store_coding_context(x, ctx, THR_ZEROMV, best_pred_diff, 0); + store_coding_context(x, ctx, THR_GLOBALMV, best_pred_diff, 0); } -#if CONFIG_MOTION_VAR - struct calc_target_weighted_pred_ctxt { const MACROBLOCK *x; const uint8_t *tmp; @@ -12308,28 +10655,22 @@ struct calc_target_weighted_pred_ctxt { int overlap; }; -static INLINE void calc_target_weighted_pred_above(MACROBLOCKD *xd, - int rel_mi_col, - uint8_t nb_mi_width, - MODE_INFO *nb_mi, - void *fun_ctxt) { +static INLINE void calc_target_weighted_pred_above( + MACROBLOCKD *xd, int rel_mi_col, uint8_t nb_mi_width, MB_MODE_INFO *nb_mi, + void *fun_ctxt, const int num_planes) { (void)nb_mi; + (void)num_planes; struct calc_target_weighted_pred_ctxt *ctxt = (struct calc_target_weighted_pred_ctxt *)fun_ctxt; -#if CONFIG_HIGHBITDEPTH - const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; -#else - const int is_hbd = 0; -#endif // CONFIG_HIGHBITDEPTH - const int bw = xd->n8_w << MI_SIZE_LOG2; const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap); int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_col * MI_SIZE); int32_t *mask = ctxt->x->mask_buf + (rel_mi_col * MI_SIZE); const uint8_t *tmp = ctxt->tmp + rel_mi_col * MI_SIZE; + const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; if (!is_hbd) { for (int row = 0; row < ctxt->overlap; ++row) { @@ -12343,7 +10684,6 @@ static INLINE void calc_target_weighted_pred_above(MACROBLOCKD *xd, mask += bw; tmp += ctxt->tmp_stride; } -#if CONFIG_HIGHBITDEPTH } else { const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp); @@ -12358,32 +10698,25 @@ static INLINE void calc_target_weighted_pred_above(MACROBLOCKD *xd, mask += bw; tmp16 += ctxt->tmp_stride; } -#endif // CONFIG_HIGHBITDEPTH } } -static INLINE void calc_target_weighted_pred_left(MACROBLOCKD *xd, - int rel_mi_row, - uint8_t nb_mi_height, - MODE_INFO *nb_mi, - void *fun_ctxt) { +static INLINE void calc_target_weighted_pred_left( + MACROBLOCKD *xd, int rel_mi_row, uint8_t nb_mi_height, MB_MODE_INFO *nb_mi, + void *fun_ctxt, const int num_planes) { (void)nb_mi; + (void)num_planes; struct calc_target_weighted_pred_ctxt *ctxt = (struct calc_target_weighted_pred_ctxt *)fun_ctxt; -#if CONFIG_HIGHBITDEPTH - const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; -#else - const int is_hbd = 0; -#endif // CONFIG_HIGHBITDEPTH - const int bw = xd->n8_w << MI_SIZE_LOG2; const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap); int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_row * MI_SIZE * bw); int32_t *mask = ctxt->x->mask_buf + (rel_mi_row * MI_SIZE * bw); const uint8_t *tmp = ctxt->tmp + (rel_mi_row * MI_SIZE * ctxt->tmp_stride); + const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
1 : 0; if (!is_hbd) { for (int row = 0; row < nb_mi_height * MI_SIZE; ++row) { @@ -12398,7 +10731,6 @@ static INLINE void calc_target_weighted_pred_left(MACROBLOCKD *xd, mask += bw; tmp += ctxt->tmp_stride; } -#if CONFIG_HIGHBITDEPTH } else { const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp); @@ -12414,7 +10746,6 @@ static INLINE void calc_target_weighted_pred_left(MACROBLOCKD *xd, mask += bw; tmp16 += ctxt->tmp_stride; } -#endif // CONFIG_HIGHBITDEPTH } } @@ -12461,18 +10792,14 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, int mi_col, const uint8_t *above, int above_stride, const uint8_t *left, int left_stride) { - const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; const int bw = xd->n8_w << MI_SIZE_LOG2; const int bh = xd->n8_h << MI_SIZE_LOG2; int32_t *mask_buf = x->mask_buf; int32_t *wsrc_buf = x->wsrc_buf; - const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA; -#if CONFIG_HIGHBITDEPTH const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; -#else - const int is_hbd = 0; -#endif // CONFIG_HIGHBITDEPTH + const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA; // plane 0 should not be subsampled assert(xd->plane[0].subsampling_x == 0); @@ -12488,7 +10815,7 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, struct calc_target_weighted_pred_ctxt ctxt = { x, above, above_stride, overlap }; foreach_overlappable_nb_above(cm, (MACROBLOCKD *)xd, mi_col, - max_neighbor_obmc[b_width_log2_lookup[bsize]], + max_neighbor_obmc[mi_size_wide_log2[bsize]], calc_target_weighted_pred_above, &ctxt); } @@ -12504,7 +10831,7 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, struct calc_target_weighted_pred_ctxt ctxt = { x, left, left_stride, overlap }; foreach_overlappable_nb_left(cm, (MACROBLOCKD *)xd, mi_row, - max_neighbor_obmc[b_height_log2_lookup[bsize]], + max_neighbor_obmc[mi_size_high_log2[bsize]], calc_target_weighted_pred_left, &ctxt); } @@ -12518,7 +10845,6 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, wsrc_buf += bw; src += x->plane[0].src.stride; } -#if CONFIG_HIGHBITDEPTH } else { const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[0].src.buf); @@ -12529,462 +10855,5 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, wsrc_buf += bw; src += x->plane[0].src.stride; } -#endif // CONFIG_HIGHBITDEPTH - } -} - -#if CONFIG_NCOBMC -void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x, - int mi_row, int mi_col) { - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - MB_MODE_INFO backup_mbmi; - BLOCK_SIZE bsize = mbmi->sb_type; - int ref, skip_blk, backup_skip = x->skip; - int64_t rd_causal; - RD_STATS rd_stats_y, rd_stats_uv; - int rate_skip0 = av1_cost_bit(av1_get_skip_prob(cm, xd), 0); - int rate_skip1 = av1_cost_bit(av1_get_skip_prob(cm, xd), 1); - - // Recompute the best causal predictor and rd - mbmi->motion_mode = SIMPLE_TRANSLATION; - set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); - for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { - YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]); - assert(cfg != NULL); - av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col, - &xd->block_refs[ref]->sf); - } - av1_setup_dst_planes(x->e_mbd.plane, bsize, - get_frame_new_buffer(&cpi->common), mi_row, 
mi_col); - - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); - - av1_subtract_plane(x, bsize, 0); -#if CONFIG_VAR_TX - if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { - select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - } else { - int idx, idy; - super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - for (idy = 0; idy < xd->n8_h; ++idy) - for (idx = 0; idx < xd->n8_w; ++idx) - mbmi->inter_tx_size[idy][idx] = mbmi->tx_size; - memset(x->blk_skip[0], rd_stats_y.skip, - sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4); - } - inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); -#else - super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); -#endif - assert(rd_stats_y.rate != INT_MAX && rd_stats_uv.rate != INT_MAX); - if (rd_stats_y.skip && rd_stats_uv.skip) { - rd_stats_y.rate = rate_skip1; - rd_stats_uv.rate = 0; - rd_stats_y.dist = rd_stats_y.sse; - rd_stats_uv.dist = rd_stats_uv.sse; - skip_blk = 0; - } else if (RDCOST(x->rdmult, - (rd_stats_y.rate + rd_stats_uv.rate + rate_skip0), - (rd_stats_y.dist + rd_stats_uv.dist)) > - RDCOST(x->rdmult, rate_skip1, - (rd_stats_y.sse + rd_stats_uv.sse))) { - rd_stats_y.rate = rate_skip1; - rd_stats_uv.rate = 0; - rd_stats_y.dist = rd_stats_y.sse; - rd_stats_uv.dist = rd_stats_uv.sse; - skip_blk = 1; - } else { - rd_stats_y.rate += rate_skip0; - skip_blk = 0; - } - backup_skip = skip_blk; - backup_mbmi = *mbmi; - rd_causal = RDCOST(x->rdmult, (rd_stats_y.rate + rd_stats_uv.rate), - (rd_stats_y.dist + rd_stats_uv.dist)); - rd_causal += - RDCOST(x->rdmult, av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 0), 0); - - // Check non-causal mode - mbmi->motion_mode = OBMC_CAUSAL; - av1_build_ncobmc_inter_predictors_sb(cm, xd, mi_row, mi_col); - - av1_subtract_plane(x, bsize, 0); -#if CONFIG_VAR_TX - if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { - select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - } else { - int idx, idy; - super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - for (idy = 0; idy < xd->n8_h; ++idy) - for (idx = 0; idx < xd->n8_w; ++idx) - mbmi->inter_tx_size[idy][idx] = mbmi->tx_size; - memset(x->blk_skip[0], rd_stats_y.skip, - sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4); - } - inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); -#else - super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); -#endif - assert(rd_stats_y.rate != INT_MAX && rd_stats_uv.rate != INT_MAX); - if (rd_stats_y.skip && rd_stats_uv.skip) { - rd_stats_y.rate = rate_skip1; - rd_stats_uv.rate = 0; - rd_stats_y.dist = rd_stats_y.sse; - rd_stats_uv.dist = rd_stats_uv.sse; - skip_blk = 0; - } else if (RDCOST(x->rdmult, - (rd_stats_y.rate + rd_stats_uv.rate + rate_skip0), - (rd_stats_y.dist + rd_stats_uv.dist)) > - RDCOST(x->rdmult, rate_skip1, - (rd_stats_y.sse + rd_stats_uv.sse))) { - rd_stats_y.rate = rate_skip1; - rd_stats_uv.rate = 0; - rd_stats_y.dist = rd_stats_y.sse; - rd_stats_uv.dist = rd_stats_uv.sse; - skip_blk = 1; - } else { - rd_stats_y.rate += rate_skip0; - skip_blk = 0; - } - - if (rd_causal > - RDCOST(x->rdmult, - rd_stats_y.rate + rd_stats_uv.rate + - av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 1), - (rd_stats_y.dist + rd_stats_uv.dist))) { - x->skip = skip_blk; - } else { - *mbmi = backup_mbmi; - x->skip = backup_skip; - } -} -#endif // CONFIG_NCOBMC - -int64_t get_prediction_rd_cost(const struct AV1_COMP *cpi, struct macroblock *x, - int 
mi_row, int mi_col, int *skip_blk, - MB_MODE_INFO *backup_mbmi) { - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - BLOCK_SIZE bsize = mbmi->sb_type; -#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION - const MOTION_MODE motion_allowed = motion_mode_allowed( -#if CONFIG_GLOBAL_MOTION - 0, xd->global_motion, -#endif // CONFIG_GLOBAL_MOTION -#if CONFIG_WARPED_MOTION - xd, -#endif - xd->mi[0]); -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION - RD_STATS rd_stats_y, rd_stats_uv; - int rate_skip0 = av1_cost_bit(av1_get_skip_prob(cm, xd), 0); - int rate_skip1 = av1_cost_bit(av1_get_skip_prob(cm, xd), 1); - int64_t this_rd; - int ref; - -#if CONFIG_CB4X4 - x->skip_chroma_rd = - !is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, - xd->plane[1].subsampling_y); -#endif - - set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); - for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { - YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]); - assert(cfg != NULL); - av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col, - &xd->block_refs[ref]->sf); - } - av1_setup_dst_planes(x->e_mbd.plane, bsize, - get_frame_new_buffer(&cpi->common), mi_row, mi_col); - -#if CONFIG_NCOBMC_ADAPT_WEIGHT - if (mbmi->motion_mode != NCOBMC_ADAPT_WEIGHT) -#endif - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); - -#if CONFIG_MOTION_VAR - if (mbmi->motion_mode == OBMC_CAUSAL) { -#if CONFIG_NCOBMC - av1_build_ncobmc_inter_predictors_sb(cm, xd, mi_row, mi_col); -#else - av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col); -#endif - } -#endif // CONFIG_MOTION_VAR - -#if CONFIG_NCOBMC_ADAPT_WEIGHT - if (mbmi->motion_mode == NCOBMC_ADAPT_WEIGHT) - for (int plane = 0; plane < MAX_MB_PLANE; ++plane) - get_pred_from_intrpl_buf(xd, mi_row, mi_col, bsize, plane); -#endif - av1_subtract_plane(x, bsize, 0); - -#if CONFIG_VAR_TX - if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { - select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - } else { - int idx, idy; - super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - for (idy = 0; idy < xd->n8_h; ++idy) - for (idx = 0; idx < xd->n8_w; ++idx) - mbmi->inter_tx_size[idy][idx] = mbmi->tx_size; - memset(x->blk_skip[0], rd_stats_y.skip, - sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4); - } - inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); -#else - super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); -#endif - assert(rd_stats_y.rate != INT_MAX && rd_stats_uv.rate != INT_MAX); - - if (rd_stats_y.skip && rd_stats_uv.skip) { - rd_stats_y.rate = rate_skip1; - rd_stats_uv.rate = 0; - rd_stats_y.dist = rd_stats_y.sse; - rd_stats_uv.dist = rd_stats_uv.sse; - *skip_blk = 1; - } else if (RDCOST(x->rdmult, - (rd_stats_y.rate + rd_stats_uv.rate + rate_skip0), - (rd_stats_y.dist + rd_stats_uv.dist)) > - RDCOST(x->rdmult, rate_skip1, - (rd_stats_y.sse + rd_stats_uv.sse))) { - rd_stats_y.rate = rate_skip1; - rd_stats_uv.rate = 0; - rd_stats_y.dist = rd_stats_y.sse; - rd_stats_uv.dist = rd_stats_uv.sse; - *skip_blk = 1; - } else { - rd_stats_y.rate += rate_skip0; - *skip_blk = 0; - } - - if (backup_mbmi) *backup_mbmi = *mbmi; - - this_rd = RDCOST(x->rdmult, (rd_stats_y.rate + rd_stats_uv.rate), - (rd_stats_y.dist + rd_stats_uv.dist)); -#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION - if (motion_allowed == NCOBMC_ADAPT_WEIGHT) { - 
assert(mbmi->motion_mode <= NCOBMC_ADAPT_WEIGHT); - this_rd += - RDCOST(x->rdmult, x->motion_mode_cost2[bsize][mbmi->motion_mode], 0); - } else if (motion_allowed == OBMC_CAUSAL) { - assert(mbmi->motion_mode <= OBMC_CAUSAL); - this_rd += - RDCOST(x->rdmult, x->motion_mode_cost1[bsize][mbmi->motion_mode], 0); - } else { -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION - this_rd += - RDCOST(x->rdmult, x->motion_mode_cost[bsize][mbmi->motion_mode], 0); -#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION - } -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION - return this_rd; -} - -#if CONFIG_NCOBMC_ADAPT_WEIGHT -void av1_check_ncobmc_adapt_weight_rd(const struct AV1_COMP *cpi, - struct macroblock *x, int mi_row, - int mi_col) { - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - BLOCK_SIZE bsize = mbmi->sb_type; -#if CONFIG_VAR_TX - const int n4 = bsize_to_num_blk(bsize); - uint8_t st_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8]; - uint8_t obmc_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8]; - uint8_t ncobmc_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8]; -#endif - MB_MODE_INFO st_mbmi, obmc_mbmi, ncobmc_mbmi; - int st_skip, obmc_skip, ncobmc_skip; - int64_t st_rd, obmc_rd, ncobmc_rd; -#if CONFIG_WARPED_MOTION - const AV1_COMMON *const cm = &cpi->common; - const int is_warp_motion = mbmi->motion_mode == WARPED_CAUSAL; - const int rs = RDCOST(x->rdmult, av1_get_switchable_rate(cm, x, xd), 0); - MB_MODE_INFO warp_mbmi; - int64_t warp_rd; - int warp_skip; -#endif - - // Recompute the rd for the motion mode decided in rd loop - mbmi->motion_mode = SIMPLE_TRANSLATION; - st_rd = get_prediction_rd_cost(cpi, x, mi_row, mi_col, &st_skip, &st_mbmi); -#if CONFIG_WARPED_MOTION - st_rd += rs; -#endif -#if CONFIG_VAR_TX - memcpy(st_blk_skip, x->blk_skip[0], sizeof(st_blk_skip[0]) * n4); -#endif - - mbmi->motion_mode = OBMC_CAUSAL; - obmc_rd = - get_prediction_rd_cost(cpi, x, mi_row, mi_col, &obmc_skip, &obmc_mbmi); -#if CONFIG_WARPED_MOTION - obmc_rd += rs; -#endif -#if CONFIG_VAR_TX - memcpy(obmc_blk_skip, x->blk_skip[0], sizeof(obmc_blk_skip[0]) * n4); -#endif - - // Compute the rd cost for ncobmc adaptive weight - mbmi->motion_mode = NCOBMC_ADAPT_WEIGHT; - ncobmc_rd = get_prediction_rd_cost(cpi, x, mi_row, mi_col, &ncobmc_skip, - &ncobmc_mbmi); -#if CONFIG_WARPED_MOTION - ncobmc_rd += rs; -#endif - // Calculate the ncobmc mode costs - { - ADAPT_OVERLAP_BLOCK aob = adapt_overlap_block_lookup[bsize]; - ncobmc_rd += - RDCOST(x->rdmult, x->ncobmc_mode_cost[aob][mbmi->ncobmc_mode[0]], 0); - if (mi_size_wide[bsize] != mi_size_high[bsize]) - ncobmc_rd += - RDCOST(x->rdmult, x->ncobmc_mode_cost[aob][mbmi->ncobmc_mode[1]], 0); - } -#if CONFIG_VAR_TX - memcpy(ncobmc_blk_skip, x->blk_skip[0], sizeof(ncobmc_blk_skip[0]) * n4); -#endif - -#if CONFIG_WARPED_MOTION - if (is_warp_motion) { - mbmi->motion_mode = WARPED_CAUSAL; - warp_rd = - get_prediction_rd_cost(cpi, x, mi_row, mi_col, &warp_skip, &warp_mbmi); - } else { - warp_rd = INT64_MAX; - } -#endif - -#if CONFIG_WARPED_MOTION - if (AOMMIN(ncobmc_rd, warp_rd) < AOMMIN(st_rd, obmc_rd)) { - if (ncobmc_rd < warp_rd) { - x->skip = ncobmc_skip; - *mbmi = ncobmc_mbmi; -#if CONFIG_VAR_TX - memcpy(x->blk_skip[0], ncobmc_blk_skip, sizeof(ncobmc_blk_skip[0]) * n4); -#endif - } else { - x->skip = warp_skip; - *mbmi = warp_mbmi; - } -#else - if (ncobmc_rd < AOMMIN(st_rd, obmc_rd)) { - x->skip = ncobmc_skip; - *mbmi = ncobmc_mbmi; -#if CONFIG_VAR_TX - memcpy(x->blk_skip[0], ncobmc_blk_skip, sizeof(ncobmc_blk_skip[0]) * n4); 
-#endif -#endif // CONFIG_WARPED_MOTION - } else { - if (obmc_rd < st_rd) { - *mbmi = obmc_mbmi; - x->skip = obmc_skip; -#if CONFIG_VAR_TX - memcpy(x->blk_skip[0], obmc_blk_skip, sizeof(obmc_blk_skip[0]) * n4); -#endif - } else { - *mbmi = st_mbmi; - x->skip = st_skip; -#if CONFIG_VAR_TX - memcpy(x->blk_skip[0], st_blk_skip, sizeof(st_blk_skip[0]) * n4); -#endif - } - } -} - -int64_t get_ncobmc_error(MACROBLOCKD *xd, int pxl_row, int pxl_col, - BLOCK_SIZE bsize, int plane, struct buf_2d *src) { - const int wide = AOMMIN(mi_size_wide[bsize] * MI_SIZE, - (xd->sb_mi_bd.mi_col_end + 1) * MI_SIZE - pxl_col); - const int high = AOMMIN(mi_size_high[bsize] * MI_SIZE, - (xd->sb_mi_bd.mi_row_end + 1) * MI_SIZE - pxl_row); - const int ss_x = xd->plane[plane].subsampling_x; - const int ss_y = xd->plane[plane].subsampling_y; - int row_offset = (pxl_row - xd->sb_mi_bd.mi_row_begin * MI_SIZE) >> ss_y; - int col_offset = (pxl_col - xd->sb_mi_bd.mi_col_begin * MI_SIZE) >> ss_x; - int dst_stride = xd->ncobmc_pred_buf_stride[plane]; - int dst_offset = row_offset * dst_stride + col_offset; - int src_stride = src->stride; - - int r, c; - int64_t tmp, error = 0; - - for (r = 0; r < (high >> ss_y); ++r) { - for (c = 0; c < (wide >> ss_x); ++c) { - tmp = xd->ncobmc_pred_buf[plane][r * dst_stride + c + dst_offset] - - src->buf[r * src_stride + c]; - error += tmp * tmp; - } - } - return error; -} - -int get_ncobmc_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, - MACROBLOCKD *xd, int mi_row, int mi_col, int bsize) { - const AV1_COMMON *const cm = &cpi->common; - uint8_t *pred_buf[4][MAX_MB_PLANE]; - - // TODO(weitinglin): stride size needs to be fixed for high-bit depth - int pred_stride[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - - // target block in pxl - int pxl_row = mi_row << MI_SIZE_LOG2; - int pxl_col = mi_col << MI_SIZE_LOG2; - int64_t error, best_error = INT64_MAX; - int plane, tmp_mode, best_mode = 0; -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - int len = sizeof(uint16_t); - ASSIGN_ALIGNED_PTRS_HBD(pred_buf[0], cm->ncobmcaw_buf[0], MAX_SB_SQUARE, - len); - ASSIGN_ALIGNED_PTRS_HBD(pred_buf[1], cm->ncobmcaw_buf[1], MAX_SB_SQUARE, - len); - ASSIGN_ALIGNED_PTRS_HBD(pred_buf[2], cm->ncobmcaw_buf[2], MAX_SB_SQUARE, - len); - ASSIGN_ALIGNED_PTRS_HBD(pred_buf[3], cm->ncobmcaw_buf[3], MAX_SB_SQUARE, - len); - } else { -#endif // CONFIG_HIGHBITDEPTH - ASSIGN_ALIGNED_PTRS(pred_buf[0], cm->ncobmcaw_buf[0], MAX_SB_SQUARE); - ASSIGN_ALIGNED_PTRS(pred_buf[1], cm->ncobmcaw_buf[1], MAX_SB_SQUARE); - ASSIGN_ALIGNED_PTRS(pred_buf[2], cm->ncobmcaw_buf[2], MAX_SB_SQUARE); - ASSIGN_ALIGNED_PTRS(pred_buf[3], cm->ncobmcaw_buf[3], MAX_SB_SQUARE); -#if CONFIG_HIGHBITDEPTH - } -#endif - - av1_get_ext_blk_preds(cm, xd, bsize, mi_row, mi_col, pred_buf, pred_stride); - av1_get_ori_blk_pred(cm, xd, bsize, mi_row, mi_col, pred_buf[3], pred_stride); - - for (tmp_mode = 0; tmp_mode < MAX_NCOBMC_MODES; ++tmp_mode) { - error = 0; - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { - build_ncobmc_intrpl_pred(cm, xd, plane, pxl_row, pxl_col, bsize, pred_buf, - pred_stride, tmp_mode); - error += get_ncobmc_error(xd, pxl_row, pxl_col, bsize, plane, - &x->plane[plane].src); - } - if (error < best_error) { - best_mode = tmp_mode; - best_error = error; - } - } - - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { - build_ncobmc_intrpl_pred(cm, xd, plane, pxl_row, pxl_col, bsize, pred_buf, - pred_stride, best_mode); } - - return best_mode; } - -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT -#endif // 
CONFIG_MOTION_VAR diff --git a/third_party/aom/av1/encoder/rdopt.h b/third_party/aom/av1/encoder/rdopt.h index dbc7527fb..1fa3d68ce 100644 --- a/third_party/aom/av1/encoder/rdopt.h +++ b/third_party/aom/av1/encoder/rdopt.h @@ -13,16 +13,20 @@ #define AV1_ENCODER_RDOPT_H_ #include "av1/common/blockd.h" +#include "av1/common/txb_common.h" #include "av1/encoder/block.h" #include "av1/encoder/context_tree.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/encodetxb.h" #ifdef __cplusplus extern "C" { #endif +#define MAX_REF_MV_SERCH 3 + struct TileInfo; -struct AV1_COMP; struct macroblock; struct RD_STATS; @@ -35,7 +39,6 @@ static INLINE void av1_update_txb_coeff_cost(RD_STATS *rd_stats, int plane, (void)tx_size; rd_stats->txb_coeff_cost[plane] += txb_coeff_cost; -#if CONFIG_VAR_TX { const int txb_h = tx_size_high_unit[tx_size]; const int txb_w = tx_size_wide_unit[tx_size]; @@ -48,113 +51,86 @@ static INLINE void av1_update_txb_coeff_cost(RD_STATS *rd_stats, int plane, } assert(blk_row < TXB_COEFF_COST_MAP_SIZE); assert(blk_col < TXB_COEFF_COST_MAP_SIZE); -#endif } #endif -typedef enum OUTPUT_STATUS { - OUTPUT_HAS_PREDICTED_PIXELS, - OUTPUT_HAS_DECODED_PIXELS -} OUTPUT_STATUS; - // Returns the number of colors in 'src'. -int av1_count_colors(const uint8_t *src, int stride, int rows, int cols); -#if CONFIG_HIGHBITDEPTH +int av1_count_colors(const uint8_t *src, int stride, int rows, int cols, + int *val_count); // Same as av1_count_colors(), but for high-bitdepth mode. int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols, - int bit_depth); -#endif // CONFIG_HIGHBITDEPTH - -void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane, - BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col, - TX_SIZE tx_size, int64_t *out_dist, int64_t *out_sse, - OUTPUT_STATUS output_status); + int bit_depth, int *val_count); #if CONFIG_DIST_8X8 -int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x, +int64_t av1_dist_8x8(const struct AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, const BLOCK_SIZE tx_bsize, int bsw, int bsh, int visible_w, int visible_h, int qindex); #endif -#if !CONFIG_PVQ || CONFIG_VAR_TX -int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, - int blk_row, int blk_col, int block, TX_SIZE tx_size, - const SCAN_ORDER *scan_order, const ENTROPY_CONTEXT *a, - const ENTROPY_CONTEXT *l, int use_fast_coef_costing); +static INLINE int av1_cost_skip_txb(MACROBLOCK *x, const TXB_CTX *const txb_ctx, + int plane, TX_SIZE tx_size) { + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const PLANE_TYPE plane_type = get_plane_type(plane); + const LV_MAP_COEFF_COST *const coeff_costs = + &x->coeff_costs[txs_ctx][plane_type]; + return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1]; +} + +static INLINE int av1_cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, + int plane, int blk_row, int blk_col, + int block, TX_SIZE tx_size, + const TXB_CTX *const txb_ctx, + int use_fast_coef_costing) { +#if TXCOEFF_COST_TIMER + struct aom_usec_timer timer; + aom_usec_timer_start(&timer); +#endif + (void)use_fast_coef_costing; + const int cost = av1_cost_coeffs_txb(cm, x, plane, blk_row, blk_col, block, + tx_size, txb_ctx); +#if TXCOEFF_COST_TIMER + AV1_COMMON *tmp_cm = (AV1_COMMON *)&cpi->common; + aom_usec_timer_mark(&timer); + const int64_t elapsed_time = aom_usec_timer_elapsed(&timer); + tmp_cm->txcoeff_cost_timer += elapsed_time; + 
++tmp_cm->txcoeff_cost_count; #endif + return cost; +} + void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x, - struct RD_STATS *rd_cost, BLOCK_SIZE bsize, - PICK_MODE_CONTEXT *ctx, int64_t best_rd); + int mi_row, int mi_col, struct RD_STATS *rd_cost, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, + int64_t best_rd); -unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi, +unsigned int av1_get_sby_perpixel_variance(const struct AV1_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs); -#if CONFIG_HIGHBITDEPTH -unsigned int av1_high_get_sby_perpixel_variance(const AV1_COMP *cpi, +unsigned int av1_high_get_sby_perpixel_variance(const struct AV1_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs, int bd); -#endif void av1_rd_pick_inter_mode_sb(const struct AV1_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x, int mi_row, int mi_col, - struct RD_STATS *rd_cost, -#if CONFIG_SUPERTX - int *returnrate_nocoef, -#endif // CONFIG_SUPERTX - BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, - int64_t best_rd_so_far); + struct RD_STATS *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far); void av1_rd_pick_inter_mode_sb_seg_skip( const struct AV1_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x, int mi_row, int mi_col, struct RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far); -int av1_internal_image_edge(const struct AV1_COMP *cpi); -int av1_active_h_edge(const struct AV1_COMP *cpi, int mi_row, int mi_step); -int av1_active_v_edge(const struct AV1_COMP *cpi, int mi_col, int mi_step); -int av1_active_edge_sb(const struct AV1_COMP *cpi, int mi_row, int mi_col); - -#if CONFIG_MOTION_VAR && CONFIG_NCOBMC -void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x, - int mi_row, int mi_col); -#endif // CONFIG_MOTION_VAR && CONFIG_NCOBMC - -#if CONFIG_SUPERTX -#if CONFIG_VAR_TX -void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, - int blk_row, int blk_col, int plane, int block, - int plane_bsize, const ENTROPY_CONTEXT *a, - const ENTROPY_CONTEXT *l, RD_STATS *rd_stats); +#if CONFIG_COLLECT_INTER_MODE_RD_STATS +#define INTER_MODE_RD_TEST 0 +void av1_inter_mode_data_init(); +void av1_inter_mode_data_fit(int rdmult); +void av1_inter_mode_data_show(const AV1_COMMON *cm); #endif -void av1_txfm_rd_in_plane_supertx(MACROBLOCK *x, const AV1_COMP *cpi, int *rate, - int64_t *distortion, int *skippable, - int64_t *sse, int64_t ref_best_rd, int plane, - BLOCK_SIZE bsize, TX_SIZE tx_size, - int use_fast_coef_casting); -#endif // CONFIG_SUPERTX - #ifdef __cplusplus } // extern "C" #endif -int av1_tx_type_cost(const AV1_COMMON *cm, const MACROBLOCK *x, - const MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, - TX_SIZE tx_size, TX_TYPE tx_type); - -int64_t get_prediction_rd_cost(const struct AV1_COMP *cpi, struct macroblock *x, - int mi_row, int mi_col, int *skip_blk, - MB_MODE_INFO *backup_mbmi); - -#if CONFIG_NCOBMC_ADAPT_WEIGHT -void av1_check_ncobmc_adapt_weight_rd(const struct AV1_COMP *cpi, - struct macroblock *x, int mi_row, - int mi_col); -int get_ncobmc_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, - MACROBLOCKD *xd, int mi_row, int mi_col, int bsize); - -#endif - #endif // AV1_ENCODER_RDOPT_H_ diff --git a/third_party/aom/av1/encoder/segmentation.c b/third_party/aom/av1/encoder/segmentation.c index 4f01fbba4..2e9102745 100644 --- a/third_party/aom/av1/encoder/segmentation.c +++ b/third_party/aom/av1/encoder/segmentation.c @@ -18,26 +18,21 @@ #include 
"av1/encoder/cost.h" #include "av1/encoder/segmentation.h" -#include "av1/encoder/subexp.h" void av1_enable_segmentation(struct segmentation *seg) { seg->enabled = 1; seg->update_map = 1; seg->update_data = 1; + seg->temporal_update = 0; } void av1_disable_segmentation(struct segmentation *seg) { seg->enabled = 0; seg->update_map = 0; seg->update_data = 0; + seg->temporal_update = 0; } -void av1_set_segment_data(struct segmentation *seg, int8_t *feature_data, - unsigned char abs_delta) { - seg->abs_delta = abs_delta; - - memcpy(seg->feature_data, feature_data, sizeof(seg->feature_data)); -} void av1_disable_segfeature(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id) { seg->feature_mask[segment_id] &= ~(1 << feature_id); @@ -48,76 +43,8 @@ void av1_clear_segdata(struct segmentation *seg, int segment_id, seg->feature_data[segment_id][feature_id] = 0; } -// Based on set of segment counts calculate a probability tree -static void calc_segtree_probs(unsigned *segcounts, - aom_prob *segment_tree_probs, - const aom_prob *cur_tree_probs, - const int probwt) { - // Work out probabilities of each segment - const unsigned cc[4] = { segcounts[0] + segcounts[1], - segcounts[2] + segcounts[3], - segcounts[4] + segcounts[5], - segcounts[6] + segcounts[7] }; - const unsigned ccc[2] = { cc[0] + cc[1], cc[2] + cc[3] }; - int i; - - segment_tree_probs[0] = get_binary_prob(ccc[0], ccc[1]); - segment_tree_probs[1] = get_binary_prob(cc[0], cc[1]); - segment_tree_probs[2] = get_binary_prob(cc[2], cc[3]); - segment_tree_probs[3] = get_binary_prob(segcounts[0], segcounts[1]); - segment_tree_probs[4] = get_binary_prob(segcounts[2], segcounts[3]); - segment_tree_probs[5] = get_binary_prob(segcounts[4], segcounts[5]); - segment_tree_probs[6] = get_binary_prob(segcounts[6], segcounts[7]); - - for (i = 0; i < 7; i++) { - const unsigned *ct = - i == 0 ? ccc : i < 3 ? 
cc + (i & 2) : segcounts + (i - 3) * 2; - av1_prob_diff_update_savings_search(ct, cur_tree_probs[i], - &segment_tree_probs[i], - DIFF_UPDATE_PROB, probwt); - } -} - -// Based on set of segment counts and probabilities calculate a cost estimate -static int cost_segmap(unsigned *segcounts, aom_prob *probs) { - const int c01 = segcounts[0] + segcounts[1]; - const int c23 = segcounts[2] + segcounts[3]; - const int c45 = segcounts[4] + segcounts[5]; - const int c67 = segcounts[6] + segcounts[7]; - const int c0123 = c01 + c23; - const int c4567 = c45 + c67; - - // Cost the top node of the tree - int cost = c0123 * av1_cost_zero(probs[0]) + c4567 * av1_cost_one(probs[0]); - - // Cost subsequent levels - if (c0123 > 0) { - cost += c01 * av1_cost_zero(probs[1]) + c23 * av1_cost_one(probs[1]); - - if (c01 > 0) - cost += segcounts[0] * av1_cost_zero(probs[3]) + - segcounts[1] * av1_cost_one(probs[3]); - if (c23 > 0) - cost += segcounts[2] * av1_cost_zero(probs[4]) + - segcounts[3] * av1_cost_one(probs[4]); - } - - if (c4567 > 0) { - cost += c45 * av1_cost_zero(probs[2]) + c67 * av1_cost_one(probs[2]); - - if (c45 > 0) - cost += segcounts[4] * av1_cost_zero(probs[5]) + - segcounts[5] * av1_cost_one(probs[5]); - if (c67 > 0) - cost += segcounts[6] * av1_cost_zero(probs[6]) + - segcounts[7] * av1_cost_one(probs[6]); - } - - return cost; -} - static void count_segs(const AV1_COMMON *cm, MACROBLOCKD *xd, - const TileInfo *tile, MODE_INFO **mi, + const TileInfo *tile, MB_MODE_INFO **mi, unsigned *no_pred_segcounts, unsigned (*temporal_predictor_count)[2], unsigned *t_unpred_seg_counts, int bw, int bh, @@ -127,29 +54,27 @@ static void count_segs(const AV1_COMMON *cm, MACROBLOCKD *xd, if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; xd->mi = mi; - segment_id = xd->mi[0]->mbmi.segment_id; + segment_id = xd->mi[0]->segment_id; - set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, -#if CONFIG_DEPENDENT_HORZTILES - cm->dependent_horz_tiles, -#endif // CONFIG_DEPENDENT_HORZTILES - cm->mi_rows, cm->mi_cols); + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); // Count the number of hits on each segment with no prediction no_pred_segcounts[segment_id]++; // Temporal prediction not allowed on key frames if (cm->frame_type != KEY_FRAME) { - const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; // Test to see if the segment id matches the predicted value. const int pred_segment_id = - get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col); + cm->last_frame_seg_map + ? 
get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col) + : 0; const int pred_flag = pred_segment_id == segment_id; const int pred_context = av1_get_pred_context_seg_id(xd); // Store the prediction status for this mb and update counts // as appropriate - xd->mi[0]->mbmi.seg_id_predicted = pred_flag; + xd->mi[0]->seg_id_predicted = pred_flag; temporal_predictor_count[pred_context][pred_flag]++; // Update the "unpredicted" segment count @@ -158,21 +83,15 @@ static void count_segs(const AV1_COMMON *cm, MACROBLOCKD *xd, } static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, - const TileInfo *tile, MODE_INFO **mi, + const TileInfo *tile, MB_MODE_INFO **mi, unsigned *no_pred_segcounts, unsigned (*temporal_predictor_count)[2], unsigned *t_unpred_seg_counts, int mi_row, int mi_col, BLOCK_SIZE bsize) { const int mis = cm->mi_stride; const int bs = mi_size_wide[bsize], hbs = bs / 2; -#if CONFIG_EXT_PARTITION_TYPES PARTITION_TYPE partition; -#if CONFIG_EXT_PARTITION_TYPES_AB const int qbs = bs / 4; -#endif // CONFIG_EXT_PARTITION_TYPES_AB -#else - int bw, bh; -#endif // CONFIG_EXT_PARTITION_TYPES if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; @@ -181,7 +100,6 @@ static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, \ (cs_bw), (cs_bh), mi_row + (cs_rowoff), mi_col + (cs_coloff)); -#if CONFIG_EXT_PARTITION_TYPES if (bsize == BLOCK_8X8) partition = PARTITION_NONE; else @@ -196,28 +114,6 @@ static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, CSEGS(hbs, bs, 0, 0); CSEGS(hbs, bs, 0, hbs); break; -#if CONFIG_EXT_PARTITION_TYPES_AB - case PARTITION_HORZ_A: - CSEGS(bs, qbs, 0, 0); - CSEGS(bs, qbs, qbs, 0); - CSEGS(bs, hbs, hbs, 0); - break; - case PARTITION_HORZ_B: - CSEGS(bs, hbs, 0, 0); - CSEGS(bs, qbs, hbs, 0); - if (mi_row + 3 * qbs < cm->mi_rows) CSEGS(bs, qbs, 3 * qbs, 0); - break; - case PARTITION_VERT_A: - CSEGS(qbs, bs, 0, 0); - CSEGS(qbs, bs, 0, qbs); - CSEGS(hbs, bs, 0, hbs); - break; - case PARTITION_VERT_B: - CSEGS(hbs, bs, 0, 0); - CSEGS(qbs, bs, 0, hbs); - if (mi_col + 3 * qbs < cm->mi_cols) CSEGS(qbs, bs, 0, 3 * qbs); - break; -#else case PARTITION_HORZ_A: CSEGS(hbs, hbs, 0, 0); CSEGS(hbs, hbs, 0, hbs); @@ -238,14 +134,24 @@ static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, CSEGS(hbs, hbs, 0, hbs); CSEGS(hbs, hbs, hbs, hbs); break; -#endif + case PARTITION_HORZ_4: + CSEGS(bs, qbs, 0, 0); + CSEGS(bs, qbs, qbs, 0); + CSEGS(bs, qbs, 2 * qbs, 0); + if (mi_row + 3 * qbs < cm->mi_rows) CSEGS(bs, qbs, 3 * qbs, 0); + break; + + case PARTITION_VERT_4: + CSEGS(qbs, bs, 0, 0); + CSEGS(qbs, bs, 0, qbs); + CSEGS(qbs, bs, 0, 2 * qbs); + if (mi_col + 3 * qbs < cm->mi_cols) CSEGS(qbs, bs, 0, 3 * qbs); + break; + case PARTITION_SPLIT: { - const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize]; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); int n; - assert(num_8x8_blocks_wide_lookup[mi[0]->mbmi.sb_type] < bs && - num_8x8_blocks_high_lookup[mi[0]->mbmi.sb_type] < bs); - for (n = 0; n < 4; n++) { const int mi_dc = hbs * (n & 1); const int mi_dr = hbs * (n >> 1); @@ -257,34 +163,6 @@ static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, } break; default: assert(0); } -#else - bw = mi_size_wide[mi[0]->mbmi.sb_type]; - bh = mi_size_high[mi[0]->mbmi.sb_type]; - - if (bw == bs && bh == bs) { - CSEGS(bs, bs, 0, 0); - } else if (bw == bs && bh < bs) { - CSEGS(bs, hbs, 0, 0); - CSEGS(bs, hbs, hbs, 0); - } else if (bw < bs && bh == 
bs) { - CSEGS(hbs, bs, 0, 0); - CSEGS(hbs, bs, 0, hbs); - } else { - const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize]; - int n; - - assert(bw < bs && bh < bs); - - for (n = 0; n < 4; n++) { - const int mi_dc = hbs * (n & 1); - const int mi_dr = hbs * (n >> 1); - - count_segs_sb(cm, xd, tile, &mi[mi_dr * mis + mi_dc], no_pred_segcounts, - temporal_predictor_count, t_unpred_seg_counts, - mi_row + mi_dr, mi_col + mi_dc, subsize); - } - } -#endif // CONFIG_EXT_PARTITION_TYPES #undef CSEGS } @@ -292,83 +170,58 @@ static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd) { struct segmentation *seg = &cm->seg; struct segmentation_probs *segp = &cm->fc->seg; - int no_pred_cost; int t_pred_cost = INT_MAX; - int tile_col, tile_row, mi_row, mi_col; - const int probwt = cm->num_tg; - - unsigned(*temporal_predictor_count)[2] = cm->counts.seg.pred; - unsigned *no_pred_segcounts = cm->counts.seg.tree_total; - unsigned *t_unpred_seg_counts = cm->counts.seg.tree_mispred; - - aom_prob no_pred_tree[SEG_TREE_PROBS]; - aom_prob t_pred_tree[SEG_TREE_PROBS]; -#if !CONFIG_NEW_MULTISYMBOL - aom_prob t_nopred_prob[PREDICTION_PROBS]; -#endif - + unsigned temporal_predictor_count[SEG_TEMPORAL_PRED_CTXS][2] = { { 0 } }; + unsigned no_pred_segcounts[MAX_SEGMENTS] = { 0 }; + unsigned t_unpred_seg_counts[MAX_SEGMENTS] = { 0 }; (void)xd; - // We are about to recompute all the segment counts, so zero the accumulators. - av1_zero(cm->counts.seg); - // First of all generate stats regarding how well the last segment map // predicts this one for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) { TileInfo tile_info; av1_tile_set_row(&tile_info, cm, tile_row); for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) { - MODE_INFO **mi_ptr; + MB_MODE_INFO **mi_ptr; av1_tile_set_col(&tile_info, cm, tile_col); -#if CONFIG_DEPENDENT_HORZTILES - av1_tile_set_tg_boundary(&tile_info, cm, tile_row, tile_col); -#endif mi_ptr = cm->mi_grid_visible + tile_info.mi_row_start * cm->mi_stride + tile_info.mi_col_start; for (mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end; - mi_row += cm->mib_size, mi_ptr += cm->mib_size * cm->mi_stride) { - MODE_INFO **mi = mi_ptr; + mi_row += cm->seq_params.mib_size, + mi_ptr += cm->seq_params.mib_size * cm->mi_stride) { + MB_MODE_INFO **mi = mi_ptr; for (mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end; - mi_col += cm->mib_size, mi += cm->mib_size) { + mi_col += cm->seq_params.mib_size, mi += cm->seq_params.mib_size) { count_segs_sb(cm, xd, &tile_info, mi, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, mi_row, - mi_col, cm->sb_size); + mi_col, cm->seq_params.sb_size); } } } } - // Work out probability tree for coding segments without prediction - // and the cost. - calc_segtree_probs(no_pred_segcounts, no_pred_tree, segp->tree_probs, probwt); - no_pred_cost = cost_segmap(no_pred_segcounts, no_pred_tree); - - // Key frames cannot use temporal prediction - if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) { - // Work out probability tree for coding those segments not - // predicted using the temporal method and the cost. - calc_segtree_probs(t_unpred_seg_counts, t_pred_tree, segp->tree_probs, - probwt); - t_pred_cost = cost_segmap(t_unpred_seg_counts, t_pred_tree); -#if !CONFIG_NEW_MULTISYMBOL - // Add in the cost of the signaling for each prediction context. 
- int i; - for (i = 0; i < PREDICTION_PROBS; i++) { - const int count0 = temporal_predictor_count[i][0]; - const int count1 = temporal_predictor_count[i][1]; - - t_nopred_prob[i] = get_binary_prob(count0, count1); - av1_prob_diff_update_savings_search( - temporal_predictor_count[i], segp->pred_probs[i], &t_nopred_prob[i], - DIFF_UPDATE_PROB, probwt); - - // Add in the predictor signaling cost - t_pred_cost += count0 * av1_cost_zero(t_nopred_prob[i]) + - count1 * av1_cost_one(t_nopred_prob[i]); + int seg_id_cost[MAX_SEGMENTS]; + av1_cost_tokens_from_cdf(seg_id_cost, segp->tree_cdf, NULL); + no_pred_cost = 0; + for (int i = 0; i < MAX_SEGMENTS; ++i) + no_pred_cost += no_pred_segcounts[i] * seg_id_cost[i]; + + // Frames without past dependency cannot use temporal prediction + if (cm->primary_ref_frame != PRIMARY_REF_NONE) { + int pred_flag_cost[SEG_TEMPORAL_PRED_CTXS][2]; + for (int i = 0; i < SEG_TEMPORAL_PRED_CTXS; ++i) + av1_cost_tokens_from_cdf(pred_flag_cost[i], segp->pred_cdf[i], NULL); + t_pred_cost = 0; + // Cost for signaling the prediction flag. + for (int i = 0; i < SEG_TEMPORAL_PRED_CTXS; ++i) { + for (int j = 0; j < 2; ++j) + t_pred_cost += temporal_predictor_count[i][j] * pred_flag_cost[i][j]; } -#endif + // Cost for signaling the unpredicted segment id. + for (int i = 0; i < MAX_SEGMENTS; ++i) + t_pred_cost += t_unpred_seg_counts[i] * seg_id_cost[i]; } // Now choose which coding method to use. diff --git a/third_party/aom/av1/encoder/segmentation.h b/third_party/aom/av1/encoder/segmentation.h index 1d24ed1d1..a207b0f26 100644 --- a/third_party/aom/av1/encoder/segmentation.h +++ b/third_party/aom/av1/encoder/segmentation.h @@ -27,19 +27,6 @@ void av1_disable_segfeature(struct segmentation *seg, int segment_id, void av1_clear_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id); -// The values given for each segment can be either deltas (from the default -// value chosen for the frame) or absolute values. -// -// Valid range for abs values is (0-127 for MB_LVL_ALT_Q), (0-63 for -// SEGMENT_ALT_LF) -// Valid range for delta values are (+/-127 for MB_LVL_ALT_Q), (+/-63 for -// SEGMENT_ALT_LF) -// -// abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use -// the absolute values given). -void av1_set_segment_data(struct segmentation *seg, int8_t *feature_data, - unsigned char abs_delta); - void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd); void av1_reset_segment_features(AV1_COMMON *cm); diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c index 5608d031e..49740817c 100644 --- a/third_party/aom/av1/encoder/speed_features.c +++ b/third_party/aom/av1/encoder/speed_features.c @@ -17,6 +17,12 @@ #include "aom_dsp/aom_dsp_common.h" +// Setting this to 1 will disable trellis optimization completely. +// Setting this to 2 will disable trellis optimization within the +// transform search. Trellis optimization will still be applied +// in the final encode. 
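The comment above describes a three-way build flag; its values map directly onto the optimize_coefficients levels applied in av1_set_speed_features_framesize_independent further down in this diff. A condensed view of that mapping, where lossless stands for is_lossless_requested(&cpi->oxcf) (enum names are taken from the hunks below):

    #if DISABLE_TRELLISQ_SEARCH == 2
      /* Skip trellis during the transform search; still run it in the
       * final encode pass. */
      sf->optimize_coefficients =
          lossless ? NO_TRELLIS_OPT : FINAL_PASS_TRELLIS_OPT;
    #elif DISABLE_TRELLISQ_SEARCH == 1
      sf->optimize_coefficients = NO_TRELLIS_OPT;  /* trellis fully off */
    #else
      sf->optimize_coefficients =
          lossless ? NO_TRELLIS_OPT : FULL_TRELLIS_OPT;
    #endif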
+#define DISABLE_TRELLISQ_SEARCH 0 + #define MAX_MESH_SPEED 5 // Max speed setting for mesh motion method static MESH_PATTERN good_quality_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = { @@ -28,23 +34,21 @@ static MESH_PATTERN { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, }; static unsigned char good_quality_max_mesh_pct[MAX_MESH_SPEED + 1] = { - 50, 25, 15, 5, 1, 1 + 50, 50, 25, 15, 5, 1 }; -#if CONFIG_INTRABC -// TODO(aconverse@google.com): These settings are pretty relaxed, tune them for +// TODO(huisu@google.com): These settings are pretty relaxed, tune them for // each speed setting static MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = { { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } }, + { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } }, { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } }, { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } }, { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } }, { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } }, - { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } }, }; static uint8_t intrabc_max_mesh_pct[MAX_MESH_SPEED + 1] = { 100, 100, 100, 25, 25, 10 }; -#endif // Intra only frames, golden frames (except alt ref overlays) and // alt ref frames tend to be coded at a higher than ambient quality @@ -74,22 +78,18 @@ static BLOCK_SIZE set_partition_min_limit(AV1_COMMON *const cm) { } } +// Do we have an internal image edge (e.g. formatting bars). +static int has_internal_image_edge(const AV1_COMP *cpi) { + return (cpi->oxcf.pass == 2) && + ((cpi->twopass.this_frame_stats.inactive_zone_rows > 0) || + (cpi->twopass.this_frame_stats.inactive_zone_cols > 0)); +} + static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi, SPEED_FEATURES *sf, int speed) { AV1_COMMON *const cm = &cpi->common; - if (speed >= 1) { - if (AOMMIN(cm->width, cm->height) >= 720) { - sf->disable_split_mask = - cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; - sf->partition_search_breakout_dist_thr = (1 << 23); - } else { - sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; - sf->partition_search_breakout_dist_thr = (1 << 21); - } - } - if (speed >= 2) { if (AOMMIN(cm->width, cm->height) >= 720) { sf->disable_split_mask = @@ -121,11 +121,11 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi, } // If this is a two pass clip that fits the criteria for animated or - // graphics content then reset disable_split_mask for speeds 1-4. + // graphics content then reset disable_split_mask for speeds 2+. // Also if the image edge is internal to the coded area. - if ((speed >= 1) && (cpi->oxcf.pass == 2) && + if ((speed >= 2) && (cpi->oxcf.pass == 2) && ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) || - (av1_internal_image_edge(cpi)))) { + (has_internal_image_edge(cpi)))) { sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; } @@ -145,85 +145,83 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, AV1_COMMON *const cm = &cpi->common; const int boosted = frame_is_boosted(cpi); + // Speed 0 for all speed features that give neutral coding performance change. 
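The assignments that follow illustrate the idiom used throughout set_good_speed_features_framesize_independent: settings are cumulative, so a feature enabled at speed 0 stays on at higher speeds unless a later if (speed >= N) block explicitly overrides it (for example, speed 1 below turns model_based_prune_tx_search_level back off after speed 0 enabled it). A schematic of the pattern, with placeholder feature names rather than real speed features:

    static void set_features_sketch(SPEED_FEATURES *sf, int speed) {
      sf->neutral_shortcut = 1;             /* speed 0 baseline */
      if (speed >= 1) sf->prune_level = 1;  /* each level only adds...   */
      if (speed >= 2) sf->prune_level = 2;  /* ...or strengthens a knob, */
      if (speed >= 3) sf->neutral_shortcut = 0;  /* or rolls one back */
    }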
+ sf->reduce_inter_modes = 1; + sf->prune_ext_partition_types_search_level = 1; + sf->ml_prune_ab_partition = 1; + sf->adaptive_txb_search_level = 1; + sf->jnt_comp_skip_mv_search = 1; + sf->model_based_prune_tx_search_level = 1; + sf->model_based_post_interp_filter_breakout = 1; + sf->inter_mode_rd_model_estimation = 1; + if (speed >= 1) { - sf->tx_type_search.fast_intra_tx_type_search = 1; - sf->tx_type_search.fast_inter_tx_type_search = 1; + sf->gm_erroradv_type = GM_ERRORADV_TR_1; + sf->selective_ref_frame = 1; + sf->inter_tx_size_search_init_depth_rect = 1; + sf->inter_tx_size_search_init_depth_sqr = 1; + sf->intra_tx_size_search_init_depth_rect = 1; + sf->intra_tx_size_search_init_depth_sqr = 1; + sf->tx_size_search_lgr_block = 1; + sf->two_pass_partition_search = 1; + sf->mode_pruning_based_on_two_pass_partition_search = 1; + sf->prune_ext_partition_types_search_level = 2; + sf->use_fast_interpolation_filter_search = 1; + sf->skip_repeat_interpolation_filter_search = 1; + sf->tx_type_search.skip_tx_search = 1; + sf->tx_type_search.ml_tx_split_thresh = 40; + sf->model_based_prune_tx_search_level = 0; + sf->model_based_post_interp_filter_breakout = 0; + // TODO(angiebird): Re-evaluate the impact of inter_mode_rd_model_estimation + // on speed 1 + sf->inter_mode_rd_model_estimation = 0; + sf->adaptive_txb_search_level = 2; + sf->use_intra_txb_hash = 1; + sf->optimize_b_precheck = 1; + sf->dual_sgr_penalty_level = 1; } if (speed >= 2) { - if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) || - av1_internal_image_edge(cpi)) { - sf->use_square_partition_only = !frame_is_boosted(cpi); - } else { - sf->use_square_partition_only = !frame_is_intra_only(cm); - } + sf->gm_erroradv_type = GM_ERRORADV_TR_2; - sf->less_rectangular_check = 1; + sf->selective_ref_frame = 2; + sf->fast_cdef_search = 1; sf->use_rd_breakout = 1; - sf->adaptive_motion_search = 1; - sf->mv.auto_mv_step_size = 1; sf->adaptive_rd_thresh = 1; + sf->mv.auto_mv_step_size = 1; sf->mv.subpel_iters_per_step = 1; - sf->mode_skip_start = 10; - sf->adaptive_pred_interp_filter = 1; - - sf->recode_loop = ALLOW_RECODE_KFARFGF; -#if CONFIG_TX64X64 - sf->intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V; -#if CONFIG_CFL - sf->intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL; -#else - sf->intra_uv_mode_mask[TX_64X64] = INTRA_DC_H_V; -#endif // CONFIG_CFL -#endif // CONFIG_TX64X64 - sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; -#if CONFIG_CFL - sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL; -#else - sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; -#endif - sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; -#if CONFIG_CFL - sf->intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL; -#else - sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; -#endif + sf->disable_filter_search_var_thresh = 100; + sf->comp_inter_joint_search_thresh = BLOCK_SIZES_ALL; - sf->tx_size_search_breakout = 1; sf->partition_search_breakout_rate_thr = 80; - sf->tx_type_search.prune_mode = PRUNE_ONE; - // Use transform domain distortion. - // Note var-tx expt always uses pixel domain distortion. - sf->use_transform_domain_distortion = 1; + sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; + sf->allow_partition_search_skip = 1; sf->disable_wedge_search_var_thresh = 100; sf->fast_wedge_sign_estimate = 1; } if (speed >= 3) { - sf->tx_size_search_method = - frame_is_boosted(cpi) ? USE_FULL_RD : USE_LARGESTALL; - sf->mode_search_skip_flags = - (cm->frame_type == KEY_FRAME) - ? 
0 - : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | - FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR; - sf->disable_filter_search_var_thresh = 100; - sf->comp_inter_joint_search_thresh = BLOCK_SIZES_ALL; - sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; - sf->allow_partition_search_skip = 1; - sf->use_upsampled_references = 0; + sf->tx_size_search_method = boosted ? USE_FULL_RD : USE_LARGESTALL; + sf->less_rectangular_check = 1; + sf->mode_skip_start = 10; + sf->adaptive_pred_interp_filter = 1; + // adaptive_motion_search breaks encoder multi-thread tests. + // The values in x->pred_mv[] differ for single and multi-thread cases. + // See aomedia:1778. + // sf->adaptive_motion_search = 1; + sf->recode_loop = ALLOW_RECODE_KFARFGF; + sf->use_transform_domain_distortion = 1; + sf->use_accurate_subpel_search = 0; sf->adaptive_rd_thresh = 2; -#if CONFIG_EXT_TX - sf->tx_type_search.prune_mode = PRUNE_TWO; -#endif -#if CONFIG_GLOBAL_MOTION + sf->tx_type_search.prune_mode = PRUNE_2D_FAST; sf->gm_search_type = GM_DISABLE_SEARCH; -#endif // CONFIG_GLOBAL_MOTION } if (speed >= 4) { - sf->use_square_partition_only = !frame_is_intra_only(cm); + sf->tx_type_search.fast_intra_tx_type_search = 1; + sf->tx_type_search.fast_inter_tx_type_search = 1; + sf->use_square_partition_only = !boosted; sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL; sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED; @@ -232,52 +230,44 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->cb_partition_search = !boosted; sf->cb_pred_filter_search = 1; sf->alt_ref_search_fp = 1; - sf->recode_loop = ALLOW_RECODE_KFMAXBW; - sf->adaptive_rd_thresh = 3; sf->mode_skip_start = 6; -#if CONFIG_TX64X64 - sf->intra_y_mode_mask[TX_64X64] = INTRA_DC; -#if CONFIG_CFL - sf->intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_CFL; -#else - sf->intra_uv_mode_mask[TX_64X64] = INTRA_DC; -#endif // CONFIG_CFL -#endif // CONFIG_TX64X64 - sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; -#if CONFIG_CFL - sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_CFL; -#else - sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; -#endif // CONFIG_CFL sf->adaptive_interp_filter_search = 1; } if (speed >= 5) { + sf->recode_loop = ALLOW_RECODE_KFMAXBW; + sf->intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL; + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL; + sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL; sf->use_square_partition_only = 1; sf->tx_size_search_method = USE_LARGESTALL; sf->mv.search_method = BIGDIA; sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; sf->adaptive_rd_thresh = 4; - if (cm->frame_type != KEY_FRAME) - sf->mode_search_skip_flags |= FLAG_EARLY_TERMINATE; + sf->mode_search_skip_flags = + (cm->frame_type == KEY_FRAME) + ? 
0 + : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | + FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR | + FLAG_EARLY_TERMINATE; sf->disable_filter_search_var_thresh = 200; sf->use_fast_coef_updates = ONE_LOOP_REDUCED; sf->use_fast_coef_costing = 1; sf->partition_search_breakout_rate_thr = 300; + sf->use_transform_domain_distortion = 2; } if (speed >= 6) { int i; - sf->optimize_coefficients = 0; + sf->optimize_coefficients = NO_TRELLIS_OPT; sf->mv.search_method = HEX; sf->disable_filter_search_var_thresh = 500; for (i = 0; i < TX_SIZES; ++i) { sf->intra_y_mode_mask[i] = INTRA_DC; -#if CONFIG_CFL sf->intra_uv_mode_mask[i] = UV_INTRA_DC_CFL; -#else - sf->intra_uv_mode_mask[i] = INTRA_DC; -#endif // CONFIG_CFL } sf->partition_search_breakout_rate_thr = 500; sf->mv.reduce_first_step_size = 1; @@ -288,9 +278,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key; sf->default_max_partition_size = BLOCK_32X32; sf->default_min_partition_size = BLOCK_8X8; -#if CONFIG_TX64X64 sf->intra_y_mode_mask[TX_64X64] = INTRA_DC; -#endif // CONFIG_TX64X64 sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->frame_parameter_update = 0; sf->mv.search_method = FAST_HEX; @@ -298,13 +286,10 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST; sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST; sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST; -#if CONFIG_EXT_PARTITION sf->inter_mode_mask[BLOCK_64X128] = INTER_NEAREST; sf->inter_mode_mask[BLOCK_128X64] = INTER_NEAREST; sf->inter_mode_mask[BLOCK_128X128] = INTER_NEAREST; -#endif // CONFIG_EXT_PARTITION sf->partition_search_type = REFERENCE_PARTITION; - sf->default_min_partition_size = BLOCK_8X8; sf->reuse_inter_pred_sby = 1; sf->force_frame_boost = is_keyframe || @@ -324,31 +309,9 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) { SPEED_FEATURES *const sf = &cpi->sf; const AV1EncoderConfig *const oxcf = &cpi->oxcf; - AV1_COMMON *const cm = &cpi->common; RD_OPT *const rd = &cpi->rd; int i; -// Limit memory usage for high resolutions -#if CONFIG_EXT_REFS - // TODO(zoeliu): Temporary solution to resolve the insufficient RAM issue for - // ext-refs. Need to work with @yunqingwang to have a more - // effective solution. 
- if (AOMMIN(cm->width, cm->height) > 720) { - // Turn off the use of upsampled references for HD resolution - sf->use_upsampled_references = 0; - } else if ((AOMMIN(cm->width, cm->height) > 540) && - (oxcf->profile != PROFILE_0)) { - sf->use_upsampled_references = 0; - } -#else - if (AOMMIN(cm->width, cm->height) > 1080) { - sf->use_upsampled_references = 0; - } else if ((AOMMIN(cm->width, cm->height) > 720) && - (oxcf->profile != PROFILE_0)) { - sf->use_upsampled_references = 0; - } -#endif // CONFIG_EXT_REFS - if (oxcf->mode == GOOD) { set_good_speed_feature_framesize_dependent(cpi, sf, oxcf->speed); } @@ -371,6 +334,52 @@ void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) { cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv; } +static void set_dev_sf(AV1_COMP *cpi, SPEED_FEATURES *sf, int speed) { + AV1_COMMON *const cm = &cpi->common; + + if (speed & TXFM_CODING_SF) { + sf->inter_tx_size_search_init_depth_rect = 1; + sf->inter_tx_size_search_init_depth_sqr = 1; + sf->intra_tx_size_search_init_depth_rect = 1; + sf->intra_tx_size_search_init_depth_sqr = 1; + sf->tx_size_search_method = USE_FAST_RD; + sf->tx_type_search.fast_intra_tx_type_search = 1; + sf->tx_type_search.fast_inter_tx_type_search = 1; + } + + if (speed & INTER_PRED_SF) { + sf->selective_ref_frame = 2; + // sf->adaptive_motion_search = 1; + sf->mv.auto_mv_step_size = 1; + sf->adaptive_rd_thresh = 1; + sf->mv.subpel_iters_per_step = 1; + sf->adaptive_pred_interp_filter = 1; + } + + if (speed & INTRA_PRED_SF) { + sf->max_intra_bsize = BLOCK_32X32; + } + + if (speed & PARTITION_SF) { + if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) || + has_internal_image_edge(cpi)) { + sf->use_square_partition_only = !frame_is_boosted(cpi); + } else { + sf->use_square_partition_only = !frame_is_intra_only(cm); + } + sf->less_rectangular_check = 1; + sf->prune_ext_partition_types_search_level = 2; + } + + if (speed & LOOP_FILTER_SF) { + sf->fast_cdef_search = 1; + } + + if (speed & RD_SKIP_SF) { + sf->use_rd_breakout = 1; + } +} + void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; SPEED_FEATURES *const sf = &cpi->sf; @@ -378,7 +387,6 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { const AV1EncoderConfig *const oxcf = &cpi->oxcf; int i; - (void)cm; // best quality defaults sf->frame_parameter_update = 1; sf->mv.search_method = NSTEP; @@ -386,7 +394,19 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { sf->mv.subpel_search_method = SUBPEL_TREE; sf->mv.subpel_iters_per_step = 2; sf->mv.subpel_force_stop = 0; - sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf); +#if DISABLE_TRELLISQ_SEARCH == 2 + sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf) + ? 
FINAL_PASS_TRELLIS_OPT + : NO_TRELLIS_OPT; +#elif DISABLE_TRELLISQ_SEARCH == 1 + sf->optimize_coefficients = NO_TRELLIS_OPT; +#else + if (is_lossless_requested(&cpi->oxcf)) + sf->optimize_coefficients = NO_TRELLIS_OPT; + else + sf->optimize_coefficients = FULL_TRELLIS_OPT; +#endif // DISABLE_TRELLISQ_SEARCH + sf->gm_erroradv_type = GM_ERRORADV_TR_0; sf->mv.reduce_first_step_size = 0; sf->coeff_prob_appx_step = 1; sf->mv.auto_mv_step_size = 0; @@ -394,6 +414,15 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { sf->comp_inter_joint_search_thresh = BLOCK_4X4; sf->adaptive_rd_thresh = 0; sf->tx_size_search_method = USE_FULL_RD; + sf->inter_tx_size_search_init_depth_sqr = 0; + sf->inter_tx_size_search_init_depth_rect = 0; + sf->intra_tx_size_search_init_depth_rect = 0; + sf->intra_tx_size_search_init_depth_sqr = 0; + sf->tx_size_search_lgr_block = 0; + sf->model_based_prune_tx_search_level = 0; + sf->model_based_post_interp_filter_breakout = 0; + sf->reduce_inter_modes = 0; + sf->selective_ref_gm = 1; sf->adaptive_motion_search = 0; sf->adaptive_pred_interp_filter = 0; sf->adaptive_mode_search = 0; @@ -401,10 +430,13 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { sf->cb_partition_search = 0; sf->alt_ref_search_fp = 0; sf->partition_search_type = SEARCH_PARTITION; - sf->tx_type_search.prune_mode = NO_PRUNE; + sf->tx_type_search.prune_mode = PRUNE_2D_ACCURATE; + sf->tx_type_search.ml_tx_split_thresh = 30; sf->tx_type_search.use_skip_flag_prediction = 1; sf->tx_type_search.fast_intra_tx_type_search = 0; sf->tx_type_search.fast_inter_tx_type_search = 0; + sf->tx_type_search.skip_tx_search = 0; + sf->selective_ref_frame = 0; sf->less_rectangular_check = 0; sf->use_square_partition_only = 0; sf->auto_min_max_partition_size = NOT_IN_USE; @@ -420,17 +452,25 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { sf->disable_filter_search_var_thresh = 0; sf->adaptive_interp_filter_search = 0; sf->allow_partition_search_skip = 0; - sf->use_upsampled_references = 1; + sf->use_accurate_subpel_search = 1; sf->disable_wedge_search_var_thresh = 0; sf->fast_wedge_sign_estimate = 0; + sf->drop_ref = 0; + sf->skip_intra_in_interframe = 1; + sf->txb_split_cap = 1; + sf->adaptive_txb_search_level = 0; + sf->two_pass_partition_search = 0; + sf->mode_pruning_based_on_two_pass_partition_search = 0; + sf->use_intra_txb_hash = 0; + sf->use_inter_txb_hash = 1; + sf->use_mb_rd_hash = 1; + sf->optimize_b_precheck = 0; + sf->jnt_comp_fast_tx_search = 0; + sf->jnt_comp_skip_mv_search = 0; for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_ALL; -#if CONFIG_CFL sf->intra_uv_mode_mask[i] = UV_INTRA_ALL; -#else - sf->intra_uv_mode_mask[i] = INTRA_ALL; -#endif // CONFIG_CFL } sf->use_rd_breakout = 0; sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE; @@ -448,22 +488,28 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { // Recode loop tolerance %. 
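recode_tolerance, set just below, is the percentage band the recode loop uses when judging whether a frame's projected size landed close enough to its rate target to avoid a re-encode. A simplified sketch of that test, under the assumption that the check compares against symmetric bounds derived from the tolerance (the helper is illustrative, not the encoder's actual recode logic):

    /* Re-encode when the projected size misses the target by more than
     * tol percent in either direction. */
    static int needs_recode(int projected_size, int target_size, int tol) {
      const int low = target_size - (target_size * tol) / 100;
      const int high = target_size + (target_size * tol) / 100;
      return projected_size < low || projected_size > high;
    }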
sf->recode_tolerance = 25; sf->default_interp_filter = SWITCHABLE; - sf->tx_size_search_breakout = 0; sf->partition_search_breakout_dist_thr = 0; sf->partition_search_breakout_rate_thr = 0; sf->simple_model_rd_from_var = 0; + sf->prune_ext_partition_types_search_level = 0; + sf->ml_prune_ab_partition = 0; + sf->fast_cdef_search = 0; // Set this at the appropriate speed levels sf->use_transform_domain_distortion = 0; -#if CONFIG_GLOBAL_MOTION sf->gm_search_type = GM_FULL_SEARCH; -#endif // CONFIG_GLOBAL_MOTION + sf->use_fast_interpolation_filter_search = 0; + sf->skip_repeat_interpolation_filter_search = 0; + sf->use_hash_based_trellis = 0; + + // Set decoder side speed feature to use less dual sgr modes + sf->dual_sgr_penalty_level = 0; + + sf->inter_mode_rd_model_estimation = 0; - if (oxcf->mode == GOOD -#if CONFIG_XIPHRC - || oxcf->pass == 1 -#endif - ) + set_dev_sf(cpi, sf, oxcf->dev_sf); + + if (oxcf->mode == GOOD) set_good_speed_features_framesize_independent(cpi, sf, oxcf->speed); // sf->partition_search_breakout_dist_thr is set assuming max 64x64 @@ -472,7 +518,6 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { sf->partition_search_breakout_dist_thr <<= 2 * (MAX_SB_SIZE_LOG2 - 6); } - cpi->full_search_sad = av1_full_search_sad; cpi->diamond_search_sad = av1_diamond_search_sad; sf->allow_exhaustive_searches = 1; @@ -490,7 +535,6 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { sf->mesh_patterns[i].interval = good_quality_mesh_patterns[speed][i].interval; } -#if CONFIG_INTRABC if ((frame_is_intra_only(cm) && cm->allow_screen_content_tools) && (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION || cpi->oxcf.content == AOM_CONTENT_SCREEN)) { @@ -500,18 +544,15 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { } sf->max_exaustive_pct = intrabc_max_mesh_pct[speed]; } -#endif // CONFIG_INTRABC -#if !CONFIG_XIPHRC // Slow quant, dct and trellis not worthwhile for first pass // so make sure they are always turned off. - if (oxcf->pass == 1) sf->optimize_coefficients = 0; -#endif + if (oxcf->pass == 1) sf->optimize_coefficients = NO_TRELLIS_OPT; // No recode for 1 pass. if (oxcf->pass == 0) { sf->recode_loop = DISALLOW_RECODE; - sf->optimize_coefficients = 0; + sf->optimize_coefficients = NO_TRELLIS_OPT; } if (sf->mv.subpel_search_method == SUBPEL_TREE) { @@ -524,12 +565,11 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree_pruned_evenmore; } -#if !CONFIG_AOM_QM - x->optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1; -#else + cpi->optimize_speed_feature = + oxcf->pass != 1 ? 
sf->optimize_coefficients : NO_TRELLIS_OPT; // FIXME: trellis not very efficient for quantisation matrices - x->optimize = 0; -#endif + if (cm->using_qmatrix) cpi->optimize_speed_feature = NO_TRELLIS_OPT; + if (oxcf->disable_trellis_quant) cpi->optimize_speed_feature = NO_TRELLIS_OPT; x->min_partition_size = sf->default_min_partition_size; x->max_partition_size = sf->default_max_partition_size; @@ -543,4 +583,8 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { cpi->find_fractional_mv_step = av1_return_max_sub_pixel_mv; else if (cpi->oxcf.motion_vector_unit_test == 2) cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv; + +#if CONFIG_DIST_8X8 + if (sf->use_transform_domain_distortion > 0) cpi->oxcf.using_dist_8x8 = 0; +#endif // CONFIG_DIST_8X8 } diff --git a/third_party/aom/av1/encoder/speed_features.h b/third_party/aom/av1/encoder/speed_features.h index edd79cd16..59cb6be58 100644 --- a/third_party/aom/av1/encoder/speed_features.h +++ b/third_party/aom/av1/encoder/speed_features.h @@ -20,64 +20,51 @@ extern "C" { enum { INTRA_ALL = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << D45_PRED) | - (1 << D135_PRED) | (1 << D117_PRED) | (1 << D153_PRED) | - (1 << D207_PRED) | (1 << D63_PRED) | (1 << SMOOTH_PRED) | -#if CONFIG_SMOOTH_HV - (1 << SMOOTH_V_PRED) | (1 << SMOOTH_H_PRED) | -#endif // CONFIG_SMOOTH_HV - (1 << TM_PRED), -#if CONFIG_CFL - UV_INTRA_ALL = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED) | - (1 << UV_D45_PRED) | (1 << UV_D135_PRED) | - (1 << UV_D117_PRED) | (1 << UV_D153_PRED) | - (1 << UV_D207_PRED) | (1 << UV_D63_PRED) | - (1 << UV_SMOOTH_PRED) | -#if CONFIG_SMOOTH_HV - (1 << UV_SMOOTH_V_PRED) | (1 << UV_SMOOTH_H_PRED) | -#endif // CONFIG_SMOOTH_HV - (1 << UV_TM_PRED) | (1 << UV_CFL_PRED), + (1 << D135_PRED) | (1 << D113_PRED) | (1 << D157_PRED) | + (1 << D203_PRED) | (1 << D67_PRED) | (1 << SMOOTH_PRED) | + (1 << SMOOTH_V_PRED) | (1 << SMOOTH_H_PRED) | (1 << PAETH_PRED), + UV_INTRA_ALL = + (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED) | + (1 << UV_D45_PRED) | (1 << UV_D135_PRED) | (1 << UV_D113_PRED) | + (1 << UV_D157_PRED) | (1 << UV_D203_PRED) | (1 << UV_D67_PRED) | + (1 << UV_SMOOTH_PRED) | (1 << UV_SMOOTH_V_PRED) | + (1 << UV_SMOOTH_H_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_CFL_PRED), UV_INTRA_DC = (1 << UV_DC_PRED), UV_INTRA_DC_CFL = (1 << UV_DC_PRED) | (1 << UV_CFL_PRED), - UV_INTRA_DC_TM = (1 << UV_DC_PRED) | (1 << UV_TM_PRED), - UV_INTRA_DC_TM_CFL = - (1 << UV_DC_PRED) | (1 << UV_TM_PRED) | (1 << UV_CFL_PRED), + UV_INTRA_DC_TM = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED), + UV_INTRA_DC_PAETH_CFL = + (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_CFL_PRED), UV_INTRA_DC_H_V = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED), UV_INTRA_DC_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED) | (1 << UV_CFL_PRED), - UV_INTRA_DC_TM_H_V = (1 << UV_DC_PRED) | (1 << UV_TM_PRED) | - (1 << UV_V_PRED) | (1 << UV_H_PRED), - UV_INTRA_DC_TM_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_TM_PRED) | - (1 << UV_V_PRED) | (1 << UV_H_PRED) | - (1 << UV_CFL_PRED), -#endif // CONFIG_CFL + UV_INTRA_DC_PAETH_H_V = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | + (1 << UV_V_PRED) | (1 << UV_H_PRED), + UV_INTRA_DC_PAETH_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | + (1 << UV_V_PRED) | (1 << UV_H_PRED) | + (1 << UV_CFL_PRED), INTRA_DC = (1 << DC_PRED), - INTRA_DC_TM = (1 << DC_PRED) | (1 << TM_PRED), + INTRA_DC_TM = (1 << DC_PRED) | (1 << PAETH_PRED), INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED), - 
INTRA_DC_TM_H_V = - (1 << DC_PRED) | (1 << TM_PRED) | (1 << V_PRED) | (1 << H_PRED) + INTRA_DC_PAETH_H_V = + (1 << DC_PRED) | (1 << PAETH_PRED) | (1 << V_PRED) | (1 << H_PRED) }; enum { -#if CONFIG_COMPOUND_SINGLEREF -// TODO(zoeliu): To further consider following single ref comp modes: -// SR_NEAREST_NEARMV, SR_NEAREST_NEWMV, SR_NEAR_NEWMV, -// SR_ZERO_NEWMV, and SR_NEW_NEWMV. -#endif // CONFIG_COMPOUND_SINGLEREF - INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV) | - (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) | (1 << NEW_NEWMV) | - (1 << NEAREST_NEWMV) | (1 << NEAR_NEWMV) | (1 << NEW_NEARMV) | - (1 << NEW_NEARESTMV) | (1 << ZERO_ZEROMV), + INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) | + (1 << NEWMV) | (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) | + (1 << NEW_NEWMV) | (1 << NEAREST_NEWMV) | (1 << NEAR_NEWMV) | + (1 << NEW_NEARMV) | (1 << NEW_NEARESTMV) | (1 << GLOBAL_GLOBALMV), INTER_NEAREST = (1 << NEARESTMV) | (1 << NEAREST_NEARESTMV) | (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV), INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV) | (1 << NEAREST_NEARESTMV) | (1 << NEW_NEWMV) | (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) | (1 << NEW_NEARMV) | (1 << NEAR_NEWMV), - INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << ZEROMV) | - (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) | + INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << GLOBALMV) | + (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) | (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV), - INTER_NEAREST_NEW_ZERO = (1 << NEARESTMV) | (1 << ZEROMV) | (1 << NEWMV) | - (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) | + INTER_NEAREST_NEW_ZERO = (1 << NEARESTMV) | (1 << GLOBALMV) | (1 << NEWMV) | + (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) | (1 << NEW_NEWMV) | (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) | (1 << NEW_NEARMV) | (1 << NEAR_NEWMV), @@ -86,8 +73,8 @@ enum { (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) | (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) | (1 << NEAR_NEARMV), - INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | - (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) | + INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) | + (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) | (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) | (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) | (1 << NEAR_NEARMV), @@ -106,6 +93,17 @@ enum { }; typedef enum { + TXFM_CODING_SF = 1, + INTER_PRED_SF = 2, + INTRA_PRED_SF = 4, + PARTITION_SF = 8, + LOOP_FILTER_SF = 16, + RD_SKIP_SF = 32, + RESERVE_2_SF = 64, + RESERVE_3_SF = 128, +} DEV_SPEED_FEATURES; + +typedef enum { DIAMOND = 0, NSTEP = 1, HEX = 2, @@ -141,8 +139,8 @@ typedef enum { typedef enum { USE_FULL_RD = 0, + USE_FAST_RD, USE_LARGESTALL, - USE_TX_8X8 } TX_SIZE_SEARCH_METHOD; typedef enum { @@ -190,10 +188,13 @@ typedef enum { NO_PRUNE = 0, // eliminates one tx type in vertical and horizontal direction PRUNE_ONE = 1, -#if CONFIG_EXT_TX // eliminates two tx types in each direction PRUNE_TWO = 2, -#endif + // adaptively prunes the least perspective tx types out of all 16 + // (tuned to provide negligible quality loss) + PRUNE_2D_ACCURATE = 3, + // similar, but applies much more aggressive pruning to get better speed-up + PRUNE_2D_FAST = 4, } TX_TYPE_PRUNE_MODE; typedef struct { @@ -204,6 +205,13 @@ typedef struct { // Use a skip flag prediction model to detect blocks with skip = 1 early // and avoid doing full TX type search for such blocks. 
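Illustration (not part of the patch): the INTRA_* and UV_INTRA_* values above are per-mode bitmasks; the encoder evaluates a prediction mode only when its bit is set in sf->intra_y_mode_mask[] or sf->intra_uv_mode_mask[]. A small sketch of that test follows, with assumed mode indices (DC_PRED = 0 and so on; the real values live in the AV1 prediction-mode enum).

#include <stdio.h>

/* Assumed mode indices for illustration only. */
enum { DC_PRED = 0, V_PRED = 1, H_PRED = 2, PAETH_PRED = 12, MODE_COUNT = 13 };

int main(void) {
  /* Same construction as INTRA_DC_H_V in the enum above. */
  const int mask = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED);
  for (int mode = 0; mode < MODE_COUNT; ++mode) {
    /* The RD loop searches a mode only when its bit is set. */
    if (mask & (1 << mode)) printf("mode %d searched\n", mode);
  }
  return 0;
}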
int use_skip_flag_prediction; + + // Threshold used by the ML based method to predict TX block split decisions. + int ml_tx_split_thresh; + + // skip remaining transform type search when we found the rdcost of skip is + // better than applying transform + int skip_tx_search; } TX_TYPE_SEARCH; typedef enum { @@ -261,13 +269,29 @@ typedef struct MESH_PATTERN { int interval; } MESH_PATTERN; -#if CONFIG_GLOBAL_MOTION typedef enum { GM_FULL_SEARCH, GM_REDUCED_REF_SEARCH, GM_DISABLE_SEARCH } GM_SEARCH_TYPE; -#endif // CONFIG_GLOBAL_MOTION + +typedef enum { + GM_ERRORADV_TR_0, + GM_ERRORADV_TR_1, + GM_ERRORADV_TR_2, + GM_ERRORADV_TR_TYPES, +} GM_ERRORADV_TYPE; + +typedef enum { + NO_TRELLIS_OPT, // No trellis optimization + FULL_TRELLIS_OPT, // Trellis optimization in all stages + FINAL_PASS_TRELLIS_OPT // Trellis optimization in only the final encode pass +} TRELLIS_OPT_TYPE; + +typedef enum { + FULL_TXFM_RD, + LOW_TXFM_RD, +} TXFM_RD_MODEL; typedef struct SPEED_FEATURES { MV_SPEED_FEATURES mv; @@ -277,8 +301,11 @@ typedef struct SPEED_FEATURES { RECODE_LOOP_TYPE recode_loop; - // Trellis (dynamic programming) optimization of quantized values (+1, 0). - int optimize_coefficients; + // Trellis (dynamic programming) optimization of quantized values + TRELLIS_OPT_TYPE optimize_coefficients; + + // Global motion warp error threshold + GM_ERRORADV_TYPE gm_erroradv_type; // Always set to 0. If on it enables 0 cost background transmission // (except for the initial transmission of the segmentation). The feature is @@ -287,6 +314,14 @@ typedef struct SPEED_FEATURES { // adds overhead. int static_segmentation; + // Limit the inter mode tested in the RD loop + int reduce_inter_modes; + + // Do not compute the global motion parameters for a LAST2_FRAME or + // LAST3_FRAME if the GOLDEN_FRAME is closer and it has a non identity + // global model. + int selective_ref_gm; + // If 1 we iterate finding a best reference for 2 ref frames together - via // a log search that iterates 4 times (check around mv for last for best // error of combined predictor then check around mv for alt). If 0 we @@ -309,6 +344,17 @@ typedef struct SPEED_FEATURES { // for intra and model coefs for the rest. TX_SIZE_SEARCH_METHOD tx_size_search_method; + // Init search depth for square and rectangular transform partitions. + // Values: + // 0 - search full tree, 1: search 1 level, 2: search the highest level only + int inter_tx_size_search_init_depth_sqr; + int inter_tx_size_search_init_depth_rect; + int intra_tx_size_search_init_depth_sqr; + int intra_tx_size_search_init_depth_rect; + // If any dimension of a coding block size above 64, always search the + // largest transform only, since the largest transform block size is 64x64. + int tx_size_search_lgr_block; + // After looking at the first set of modes (set by index here), skip // checking modes for reference frames that don't match the reference frame // of the best so far. @@ -318,9 +364,51 @@ typedef struct SPEED_FEATURES { TX_TYPE_SEARCH tx_type_search; + // Skip split transform block partition when the collocated bigger block + // is selected as all zero coefficients. + int txb_split_cap; + + // Shortcut the transform block partition and type search when the target + // rdcost is relatively lower. 
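Illustration (not part of the patch): optimize_coefficients is no longer a boolean; the TRELLIS_OPT_TYPE enum below encodes a three-way policy, and the ladder applied in av1_set_speed_features_framesize_independent() (compile-time DISABLE_TRELLISQ_SEARCH, then lossless coding, the first pass, and quantization matrices all forcing trellis off) can be summarized as a pure function. This helper is an illustrative restatement, not encoder code.

typedef enum {
  NO_TRELLIS_OPT,        /* no trellis optimization */
  FULL_TRELLIS_OPT,      /* trellis in all stages */
  FINAL_PASS_TRELLIS_OPT /* trellis only in the final encode pass */
} TRELLIS_OPT_TYPE;

/* Summary of the selection logic, one condition per line of the patch. */
static TRELLIS_OPT_TYPE trellis_policy(int disable_trellisq_search,
                                       int lossless, int first_pass,
                                       int using_qmatrix) {
  if (lossless || first_pass || using_qmatrix) return NO_TRELLIS_OPT;
  if (disable_trellisq_search == 1) return NO_TRELLIS_OPT;
  if (disable_trellisq_search == 2) return FINAL_PASS_TRELLIS_OPT;
  return FULL_TRELLIS_OPT;
}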
+ // Values are 0 (not used) , or 1 - 2 with progressively increasing + // aggressiveness + int adaptive_txb_search_level; + + // Prune level for tx_size_type search for inter based on rd model + // 0: no pruning + // 1-2: progressively increasing aggressiveness of pruning + int model_based_prune_tx_search_level; + + // Model based breakout after interpolation filter search + // 0: no breakout + // 1: use model based rd breakout + int model_based_post_interp_filter_breakout; + // Used if partition_search_type = FIXED_SIZE_PARTITION BLOCK_SIZE always_this_block_size; + // Drop less likely to be picked reference frames in the RD search. + // Has three levels for now: 0, 1 and 2, where higher levels prune more + // aggressively than lower ones. (0 means no pruning). + int selective_ref_frame; + + // Prune extended partition types search + // Can take values 0 - 2, 0 referring to no pruning, and 1 - 2 increasing + // aggressiveness of pruning in order. + int prune_ext_partition_types_search_level; + + // Use a ML model to prune horz_a, horz_b, vert_a and vert_b partitions. + int ml_prune_ab_partition; + + int fast_cdef_search; + + // 2-pass coding block partition search + int two_pass_partition_search; + + // Use the mode decisions made in the initial partition search to prune mode + // candidates, e.g. ref frames. + int mode_pruning_based_on_two_pass_partition_search; + // Skip rectangular partition test when partition type none gives better // rd than partition type split. int less_rectangular_check; @@ -427,7 +515,7 @@ typedef struct SPEED_FEATURES { // by only looking at counts from 1/2 the bands. FAST_COEFF_UPDATE use_fast_coef_updates; - // A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV + // A binary mask indicating if NEARESTMV, NEARMV, GLOBALMV, NEWMV // modes are used in order from LSB to MSB for each BLOCK_SIZE. int inter_mode_mask[BLOCK_SIZES_ALL]; @@ -456,10 +544,6 @@ typedef struct SPEED_FEATURES { // default interp filter choice InterpFilter default_interp_filter; - // Early termination in transform size search, which only applies while - // tx_size_search_method is USE_FULL_RD. - int tx_size_search_breakout; - // adaptive interp_filter search to allow skip of certain filter types. int adaptive_interp_filter_search; @@ -476,16 +560,67 @@ typedef struct SPEED_FEATURES { // Fast approximation of av1_model_rd_from_var_lapndz int simple_model_rd_from_var; - // Do sub-pixel search in up-sampled reference frames - int use_upsampled_references; + // If true, sub-pixel search uses the exact convolve function used for final + // encoding and decoding; otherwise, it uses bilinear interpolation. + int use_accurate_subpel_search; // Whether to compute distortion in the image domain (slower but // more accurate), or in the transform domain (faster but less acurate). + // 0: use image domain + // 1: use transform domain in tx_type search, and use image domain for + // RD_STATS + // 2: use transform domain int use_transform_domain_distortion; -#if CONFIG_GLOBAL_MOTION GM_SEARCH_TYPE gm_search_type; -#endif // CONFIG_GLOBAL_MOTION + + // Do limited interpolation filter search for dual filters, since best choice + // usually includes EIGHTTAP_REGULAR. + int use_fast_interpolation_filter_search; + + // Save results of interpolation_filter_search for a block + // Check mv and ref_frames before search, if they are same with previous + // saved results, it can be skipped. 
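Illustration (not part of the patch): skip_repeat_interpolation_filter_search, described further below in this struct, memoizes the filter decision on the block's motion vector and reference frame. A sketch of that idea with a hypothetical one-entry cache (the real encoder stores its results elsewhere):

typedef struct {
  int valid;
  int mv_row, mv_col;
  int ref_frame;
  int best_filters; /* packed filter pair chosen by the last search */
} FilterSearchCache;

/* Return the cached decision when (mv, ref) repeats; otherwise run the
 * expensive search and remember its result. */
static int interp_filter_for_block(FilterSearchCache *c, int mv_row,
                                   int mv_col, int ref_frame,
                                   int (*full_search)(void)) {
  if (c->valid && c->mv_row == mv_row && c->mv_col == mv_col &&
      c->ref_frame == ref_frame) {
    return c->best_filters; /* hit: the repeated search is skipped */
  }
  c->best_filters = full_search(); /* miss: pay for the search once */
  c->valid = 1;
  c->mv_row = mv_row;
  c->mv_col = mv_col;
  c->ref_frame = ref_frame;
  return c->best_filters;
}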
+ int skip_repeat_interpolation_filter_search; + + // Use a hash table to store previously computed optimized qcoeffs from + // expensive calls to optimize_txb. + int use_hash_based_trellis; + + // flag to drop some ref frames in compound motion search + int drop_ref; + + // flag to allow skipping intra mode for inter frame prediction + int skip_intra_in_interframe; + + // Use hash table to store intra(keyframe only) txb transform search results + // to avoid repeated search on the same residue signal. + int use_intra_txb_hash; + + // Use hash table to store inter txb transform search results + // to avoid repeated search on the same residue signal. + int use_inter_txb_hash; + + // Use hash table to store macroblock RD search results + // to avoid repeated search on the same residue signal. + int use_mb_rd_hash; + + // Calculate RD cost before doing optimize_b, and skip if the cost is large. + int optimize_b_precheck; + + // Use model rd instead of transform search in jnt_comp + int jnt_comp_fast_tx_search; + + // Skip mv search in jnt_comp + int jnt_comp_skip_mv_search; + + // Decoder side speed feature to add penalty for use of dual-sgr filters. + // Takes values 0 - 10, 0 indicating no penalty and each additional level + // adding a penalty of 1% + int dual_sgr_penalty_level; + + // Dynamically estimate final rd from prediction error and mode cost + int inter_mode_rd_model_estimation; } SPEED_FEATURES; struct AV1_COMP; diff --git a/third_party/aom/av1/encoder/subexp.c b/third_party/aom/av1/encoder/subexp.c deleted file mode 100644 index dc96d712a..000000000 --- a/third_party/aom/av1/encoder/subexp.c +++ /dev/null @@ -1,164 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ -#include "aom_dsp/bitwriter.h" - -#include "av1/common/common.h" -#include "av1/common/entropy.h" -#include "av1/encoder/cost.h" -#include "av1/encoder/subexp.h" - -static const uint8_t update_bits[255] = { - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, - 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, - 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, - 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, - 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, - 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, - 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, - 11, 11, 11, 11, 11, 11, 11, 0, -}; -#define MIN_DELP_BITS 5 - -static int recenter_nonneg(int v, int m) { - if (v > (m << 1)) - return v; - else if (v >= m) - return ((v - m) << 1); - else - return ((m - v) << 1) - 1; -} - -static int remap_prob(int v, int m) { - int i; - static const uint8_t map_table[MAX_PROB - 1] = { - // generated by: - // map_table[j] = split_index(j, MAX_PROB - 1, MODULUS_PARAM); - 20, 21, 22, 23, 24, 25, 0, 26, 27, 28, 29, 30, 31, 32, 33, - 34, 35, 36, 37, 1, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 2, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, - 3, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 4, 74, - 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 5, 86, 87, 88, - 89, 90, 91, 92, 93, 94, 95, 96, 97, 6, 98, 99, 100, 101, 102, - 103, 104, 105, 106, 107, 108, 109, 7, 110, 111, 112, 113, 114, 115, 116, - 117, 118, 119, 120, 121, 8, 122, 123, 124, 125, 126, 127, 128, 129, 130, - 131, 132, 133, 9, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, - 145, 10, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11, - 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 12, 170, 171, - 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 13, 182, 183, 184, 185, - 186, 187, 188, 189, 190, 191, 192, 193, 14, 194, 195, 196, 197, 198, 199, - 200, 201, 202, 203, 204, 205, 15, 206, 207, 208, 209, 210, 211, 212, 213, - 214, 215, 216, 217, 16, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, - 228, 229, 17, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, - 18, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 19, - }; - v--; - m--; - if ((m << 1) <= MAX_PROB) - i = recenter_nonneg(v, m) - 1; - else - i = recenter_nonneg(MAX_PROB - 1 - v, MAX_PROB - 1 - m) - 1; - - i = map_table[i]; - return i; -} - -static int prob_diff_update_cost(aom_prob newp, aom_prob oldp) { - int delp = remap_prob(newp, oldp); - return update_bits[delp] << AV1_PROB_COST_SHIFT; -} - -static void encode_uniform(aom_writer *w, int v) { - const int l = 8; - const int m = (1 << l) - 190; - if (v < m) { - aom_write_literal(w, v, l - 1); - } else { - aom_write_literal(w, m + ((v - m) >> 1), l - 1); - aom_write_literal(w, (v - m) & 1, 1); - } -} - -static INLINE int write_bit_gte(aom_writer *w, int word, int test) { - aom_write_literal(w, word >= test, 1); - return word >= test; -} - -static void 
encode_term_subexp(aom_writer *w, int word) { - if (!write_bit_gte(w, word, 16)) { - aom_write_literal(w, word, 4); - } else if (!write_bit_gte(w, word, 32)) { - aom_write_literal(w, word - 16, 4); - } else if (!write_bit_gte(w, word, 64)) { - aom_write_literal(w, word - 32, 5); - } else { - encode_uniform(w, word - 64); - } -} - -void av1_write_prob_diff_update(aom_writer *w, aom_prob newp, aom_prob oldp) { - const int delp = remap_prob(newp, oldp); - encode_term_subexp(w, delp); -} - -int av1_prob_diff_update_savings_search(const unsigned int *ct, aom_prob oldp, - aom_prob *bestp, aom_prob upd, - int probwt) { - const uint32_t old_b = cost_branch256(ct, oldp); - int bestsavings = 0; - aom_prob newp, bestnewp = oldp; - const int step = *bestp > oldp ? -1 : 1; - const int upd_cost = av1_cost_one(upd) - av1_cost_zero(upd); - - if (old_b > (uint32_t)upd_cost + (MIN_DELP_BITS << AV1_PROB_COST_SHIFT)) { - for (newp = *bestp; newp != oldp; newp += step) { - const int new_b = cost_branch256(ct, newp); - const int update_b = prob_diff_update_cost(newp, oldp) + upd_cost; - const int savings = (int)((int64_t)old_b - new_b - update_b * probwt); - if (savings > bestsavings) { - bestsavings = savings; - bestnewp = newp; - } - } - } - *bestp = bestnewp; - return bestsavings; -} - -void av1_cond_prob_diff_update(aom_writer *w, aom_prob *oldp, - const unsigned int ct[2], int probwt) { - const aom_prob upd = DIFF_UPDATE_PROB; - aom_prob newp = get_binary_prob(ct[0], ct[1]); - const int savings = - av1_prob_diff_update_savings_search(ct, *oldp, &newp, upd, probwt); - assert(newp >= 1); - if (savings > 0) { - aom_write(w, 1, upd); - av1_write_prob_diff_update(w, newp, *oldp); - *oldp = newp; - } else { - aom_write(w, 0, upd); - } -} - -int av1_cond_prob_diff_update_savings(aom_prob *oldp, const unsigned int ct[2], - int probwt) { - const aom_prob upd = DIFF_UPDATE_PROB; - aom_prob newp = get_binary_prob(ct[0], ct[1]); - const int savings = - av1_prob_diff_update_savings_search(ct, *oldp, &newp, upd, probwt); - return savings; -} diff --git a/third_party/aom/av1/encoder/subexp.h b/third_party/aom/av1/encoder/subexp.h deleted file mode 100644 index 580edabdb..000000000 --- a/third_party/aom/av1/encoder/subexp.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AV1_ENCODER_SUBEXP_H_ -#define AV1_ENCODER_SUBEXP_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include "aom_dsp/bitwriter.h" -#include "aom_dsp/prob.h" - -void av1_write_prob_diff_update(aom_writer *w, aom_prob newp, aom_prob oldpm); - -void av1_cond_prob_diff_update(aom_writer *w, aom_prob *oldp, - const unsigned int ct[2], int probwt); - -int av1_prob_diff_update_savings_search(const unsigned int *ct, aom_prob oldp, - aom_prob *bestp, aom_prob upd, - int probwt); - -int av1_prob_diff_update_savings_search_model(const unsigned int *ct, - const aom_prob oldp, - aom_prob *bestp, aom_prob upd, - int stepsize, int probwt); - -int av1_cond_prob_diff_update_savings(aom_prob *oldp, const unsigned int ct[2], - int probwt); -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AV1_ENCODER_SUBEXP_H_ diff --git a/third_party/aom/av1/encoder/temporal_filter.c b/third_party/aom/av1/encoder/temporal_filter.c index daa647689..250feab81 100644 --- a/third_party/aom/av1/encoder/temporal_filter.c +++ b/third_party/aom/av1/encoder/temporal_filter.c @@ -12,7 +12,8 @@ #include <math.h> #include <limits.h> -#include "./aom_config.h" +#include "config/aom_config.h" + #include "av1/common/alloccommon.h" #include "av1/common/onyxc_int.h" #include "av1/common/quant_common.h" @@ -35,26 +36,17 @@ static void temporal_filter_predictors_mb_c( MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr, int stride, int uv_block_width, int uv_block_height, int mv_row, int mv_col, - uint8_t *pred, struct scale_factors *scale, int x, int y) { + uint8_t *pred, struct scale_factors *scale, int x, int y, + int can_use_previous) { const int which_mv = 0; const MV mv = { mv_row, mv_col }; enum mv_precision mv_precision_uv; int uv_stride; // TODO(angiebird): change plane setting accordingly - ConvolveParams conv_params = get_conv_params(which_mv, which_mv, 0); - -#if USE_TEMPORALFILTER_12TAP - const InterpFilters interp_filters = - av1_broadcast_interp_filter(TEMPORALFILTER_12TAP); - (void)xd; -#else - const InterpFilters interp_filters = xd->mi[0]->mbmi.interp_filters; -#endif // USE_TEMPORALFILTER_12TAP - -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + ConvolveParams conv_params = get_conv_params(which_mv, 0, 0, xd->bd); + const InterpFilters interp_filters = xd->mi[0]->interp_filters; WarpTypesAllowed warp_types; memset(&warp_types, 0, sizeof(WarpTypesAllowed)); -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION if (uv_block_width == 8) { uv_stride = (stride + 1) >> 1; @@ -64,55 +56,36 @@ static void temporal_filter_predictors_mb_c( mv_precision_uv = MV_PRECISION_Q3; } -#if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { av1_highbd_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, 16, 16, which_mv, interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - &warp_types, x, y, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - 0, MV_PRECISION_Q3, x, y, xd); - - av1_highbd_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256], - uv_block_width, &mv, scale, uv_block_width, - uv_block_height, which_mv, interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - &warp_types, x, y, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - 1, mv_precision_uv, x, y, xd); - - av1_highbd_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512], - uv_block_width, &mv, scale, uv_block_width, - uv_block_height, which_mv, interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - &warp_types, x, y, -#endif // 
CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - 2, mv_precision_uv, x, y, xd); + &warp_types, x, y, 0, MV_PRECISION_Q3, x, + y, xd, can_use_previous); + + av1_highbd_build_inter_predictor( + u_mb_ptr, uv_stride, &pred[256], uv_block_width, &mv, scale, + uv_block_width, uv_block_height, which_mv, interp_filters, &warp_types, + x, y, 1, mv_precision_uv, x, y, xd, can_use_previous); + + av1_highbd_build_inter_predictor( + v_mb_ptr, uv_stride, &pred[512], uv_block_width, &mv, scale, + uv_block_width, uv_block_height, which_mv, interp_filters, &warp_types, + x, y, 2, mv_precision_uv, x, y, xd, can_use_previous); return; } -#endif // CONFIG_HIGHBITDEPTH av1_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, 16, 16, - &conv_params, interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - &warp_types, x, y, 0, 0, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - MV_PRECISION_Q3, x, y, xd); + &conv_params, interp_filters, &warp_types, x, y, 0, + 0, MV_PRECISION_Q3, x, y, xd, can_use_previous); av1_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256], uv_block_width, &mv, scale, uv_block_width, uv_block_height, - &conv_params, interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - &warp_types, x, y, 1, 0, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - mv_precision_uv, x, y, xd); + &conv_params, interp_filters, &warp_types, x, y, 1, + 0, mv_precision_uv, x, y, xd, can_use_previous); av1_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512], uv_block_width, &mv, scale, uv_block_width, uv_block_height, - &conv_params, interp_filters, -#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - &warp_types, x, y, 2, 0, -#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - mv_precision_uv, x, y, xd); + &conv_params, interp_filters, &warp_types, x, y, 2, + 0, mv_precision_uv, x, y, xd, can_use_previous); } void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, @@ -176,7 +149,6 @@ void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, } } -#if CONFIG_HIGHBITDEPTH void av1_highbd_temporal_filter_apply_c( uint8_t *frame1_8, unsigned int stride, uint8_t *frame2_8, unsigned int block_width, unsigned int block_height, int strength, @@ -238,7 +210,6 @@ void av1_highbd_temporal_filter_apply_c( byte += stride - block_width; } } -#endif // CONFIG_HIGHBITDEPTH static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi, uint8_t *arf_frame_buf, @@ -255,7 +226,7 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi, int cost_list[5]; MvLimits tmp_mv_limits = x->mv_limits; - MV best_ref_mv1 = { 0, 0 }; + MV best_ref_mv1 = kZeroMv; MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ // Save input state @@ -276,8 +247,8 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi, av1_set_mv_search_range(&x->mv_limits, &best_ref_mv1); - x->mvcost = x->mv_cost_stack[0]; - x->nmvjointcost = x->nmv_vec_cost[0]; + x->mvcost = x->mv_cost_stack; + x->nmvjointcost = x->nmv_vec_cost; // Use mv costing from x->mvcost directly av1_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1, @@ -286,9 +257,8 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi, x->mv_limits = tmp_mv_limits; -// Ignore mv costing by sending NULL pointer instead of cost array -#if CONFIG_AMVR - if (cpi->common.cur_frame_mv_precision_level == 1) { + // Ignore mv costing by sending NULL pointer instead of cost array + if (cpi->common.cur_frame_force_integer_mv == 1) { const uint8_t *const src_address = x->plane[0].src.buf; const int src_stride 
= x->plane[0].src.stride; const uint8_t *const y = xd->plane[0].pre[0].buf; @@ -301,17 +271,15 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi, bestsme = cpi->fn_ptr[BLOCK_16X16].vf(y + offset, y_stride, src_address, src_stride, &sse); } else { -#endif bestsme = cpi->find_fractional_mv_step( - x, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, + x, &cpi->common, 0, 0, &best_ref_mv1, + cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], 0, mv_sf->subpel_iters_per_step, cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, NULL, 0, 0, 0, 0, 0); -#if CONFIG_AMVR } -#endif - x->e_mbd.mi[0]->bmi[0].as_mv[0] = x->best_mv; + x->e_mbd.mi[0]->mv[0] = x->best_mv; // Restore input state x->plane[0].src = src; @@ -321,13 +289,12 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi, } static void temporal_filter_iterate_c(AV1_COMP *cpi, -#if CONFIG_BGSPRITE - YV12_BUFFER_CONFIG *target, -#endif // CONFIG_BGSPRITE YV12_BUFFER_CONFIG **frames, int frame_count, int alt_ref_index, int strength, struct scale_factors *scale) { + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); int byte; int frame; int mb_col, mb_row; @@ -341,28 +308,22 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi, MACROBLOCKD *mbd = &cpi->td.mb.e_mbd; YV12_BUFFER_CONFIG *f = frames[alt_ref_index]; uint8_t *dst1, *dst2; -#if CONFIG_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, predictor16[16 * 16 * 3]); - DECLARE_ALIGNED(16, uint8_t, predictor8[16 * 16 * 3]); + DECLARE_ALIGNED(32, uint16_t, predictor16[16 * 16 * 3]); + DECLARE_ALIGNED(32, uint8_t, predictor8[16 * 16 * 3]); uint8_t *predictor; -#else - DECLARE_ALIGNED(16, uint8_t, predictor[16 * 16 * 3]); -#endif const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y; const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x; // Save input state uint8_t *input_buffer[MAX_MB_PLANE]; int i; -#if CONFIG_HIGHBITDEPTH if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { predictor = CONVERT_TO_BYTEPTR(predictor16); } else { predictor = predictor8; } -#endif - for (i = 0; i < MAX_MB_PLANE; i++) input_buffer[i] = mbd->plane[i].pre[0].buf; + for (i = 0; i < num_planes; i++) input_buffer[i] = mbd->plane[i].pre[0].buf; for (mb_row = 0; mb_row < mb_rows; mb_row++) { // Source frames are extended to 16 pixels. 
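Illustration (not part of the patch): av1_temporal_filter_apply_c() above accumulates weight * pixel into accumulator[] and weight into count[], and the AltRef output later divides with rounding, as in the OD_DIVU expressions further down. A simplified single-plane sketch, without the per-pixel strength modulation the real filter applies:

#include <stdint.h>

/* Each contributing frame adds its weighted pixels into the running sums. */
static void accumulate(const uint8_t *src, int n, int weight,
                       unsigned int *accumulator, uint16_t *count) {
  for (int i = 0; i < n; ++i) {
    accumulator[i] += (unsigned int)(weight * src[i]);
    count[i] += (uint16_t)weight;
  }
}

/* The filtered pixel is the rounded weighted average of all contributions. */
static void normalize(const unsigned int *accumulator, const uint16_t *count,
                      uint8_t *dst, int n) {
  for (int i = 0; i < n; ++i) {
    /* round-to-nearest division, like (acc + (cnt >> 1)) / cnt */
    dst[i] = (uint8_t)((accumulator[i] + (count[i] >> 1)) / count[i]);
  }
}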
This is different than @@ -399,8 +360,9 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi, if (frames[frame] == NULL) continue; - mbd->mi[0]->bmi[0].as_mv[0].as_mv.row = 0; - mbd->mi[0]->bmi[0].as_mv[0].as_mv.col = 0; + mbd->mi[0]->mv[0].as_mv.row = 0; + mbd->mi[0]->mv[0].as_mv.col = 0; + mbd->mi[0]->motion_mode = SIMPLE_TRANSLATION; if (frame == alt_ref_index) { filter_weight = 2; @@ -422,60 +384,51 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi, mbd, frames[frame]->y_buffer + mb_y_offset, frames[frame]->u_buffer + mb_uv_offset, frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride, - mb_uv_width, mb_uv_height, mbd->mi[0]->bmi[0].as_mv[0].as_mv.row, - mbd->mi[0]->bmi[0].as_mv[0].as_mv.col, predictor, scale, - mb_col * 16, mb_row * 16); + mb_uv_width, mb_uv_height, mbd->mi[0]->mv[0].as_mv.row, + mbd->mi[0]->mv[0].as_mv.col, predictor, scale, mb_col * 16, + mb_row * 16, cm->allow_warped_motion); -// Apply the filter (YUV) -#if CONFIG_HIGHBITDEPTH + // Apply the filter (YUV) if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { int adj_strength = strength + 2 * (mbd->bd - 8); av1_highbd_temporal_filter_apply( f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16, adj_strength, filter_weight, accumulator, count); - av1_highbd_temporal_filter_apply( - f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, - mb_uv_width, mb_uv_height, adj_strength, filter_weight, - accumulator + 256, count + 256); - av1_highbd_temporal_filter_apply( - f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, - mb_uv_width, mb_uv_height, adj_strength, filter_weight, - accumulator + 512, count + 512); + if (num_planes > 1) { + av1_highbd_temporal_filter_apply( + f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, + mb_uv_width, mb_uv_height, adj_strength, filter_weight, + accumulator + 256, count + 256); + av1_highbd_temporal_filter_apply( + f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, + mb_uv_width, mb_uv_height, adj_strength, filter_weight, + accumulator + 512, count + 512); + } } else { -#endif // CONFIG_HIGHBITDEPTH av1_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16, strength, filter_weight, accumulator, count); - av1_temporal_filter_apply_c( - f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, - mb_uv_width, mb_uv_height, strength, filter_weight, - accumulator + 256, count + 256); - av1_temporal_filter_apply_c( - f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, - mb_uv_width, mb_uv_height, strength, filter_weight, - accumulator + 512, count + 512); -#if CONFIG_HIGHBITDEPTH + if (num_planes > 1) { + av1_temporal_filter_apply_c( + f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, + mb_uv_width, mb_uv_height, strength, filter_weight, + accumulator + 256, count + 256); + av1_temporal_filter_apply_c( + f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, + mb_uv_width, mb_uv_height, strength, filter_weight, + accumulator + 512, count + 512); + } } -#endif // CONFIG_HIGHBITDEPTH } } -// Normalize filter output to produce AltRef frame -#if CONFIG_HIGHBITDEPTH + // Normalize filter output to produce AltRef frame if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { uint16_t *dst1_16; uint16_t *dst2_16; -#if CONFIG_BGSPRITE - dst1 = target->y_buffer; -#else dst1 = cpi->alt_ref_buffer.y_buffer; -#endif // CONFIG_BGSPRITE dst1_16 = CONVERT_TO_SHORTPTR(dst1); -#if CONFIG_BGSPRITE - stride = target->y_stride; -#else stride = cpi->alt_ref_buffer.y_stride; -#endif // CONFIG_BGSPRITE byte = mb_y_offset; for 
(i = 0, k = 0; i < 16; i++) { for (j = 0; j < 16; j++, k++) { @@ -488,40 +441,31 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi, byte += stride - 16; } - - dst1 = cpi->alt_ref_buffer.u_buffer; - dst2 = cpi->alt_ref_buffer.v_buffer; - dst1_16 = CONVERT_TO_SHORTPTR(dst1); - dst2_16 = CONVERT_TO_SHORTPTR(dst2); - stride = cpi->alt_ref_buffer.uv_stride; - byte = mb_uv_offset; - for (i = 0, k = 256; i < mb_uv_height; i++) { - for (j = 0; j < mb_uv_width; j++, k++) { - int m = k + 256; - - // U - dst1_16[byte] = - (uint16_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]); - - // V - dst2_16[byte] = - (uint16_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]); - - // move to next pixel - byte++; + if (num_planes > 1) { + dst1 = cpi->alt_ref_buffer.u_buffer; + dst2 = cpi->alt_ref_buffer.v_buffer; + dst1_16 = CONVERT_TO_SHORTPTR(dst1); + dst2_16 = CONVERT_TO_SHORTPTR(dst2); + stride = cpi->alt_ref_buffer.uv_stride; + byte = mb_uv_offset; + for (i = 0, k = 256; i < mb_uv_height; i++) { + for (j = 0; j < mb_uv_width; j++, k++) { + int m = k + 256; + // U + dst1_16[byte] = + (uint16_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]); + // V + dst2_16[byte] = + (uint16_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]); + // move to next pixel + byte++; + } + byte += stride - mb_uv_width; } - - byte += stride - mb_uv_width; } } else { -#endif // CONFIG_HIGHBITDEPTH -#if CONFIG_BGSPRITE - dst1 = target->y_buffer; - stride = target->y_stride; -#else - dst1 = cpi->alt_ref_buffer.y_buffer; - stride = cpi->alt_ref_buffer.y_stride; -#endif // CONFIG_BGSPRITE + dst1 = cpi->alt_ref_buffer.y_buffer; + stride = cpi->alt_ref_buffer.y_stride; byte = mb_y_offset; for (i = 0, k = 0; i < 16; i++) { for (j = 0; j < 16; j++, k++) { @@ -533,36 +477,27 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi, } byte += stride - 16; } -#if CONFIG_BGSPRITE - dst1 = target->u_buffer; - dst2 = target->v_buffer; - stride = target->uv_stride; -#else - dst1 = cpi->alt_ref_buffer.u_buffer; - dst2 = cpi->alt_ref_buffer.v_buffer; - stride = cpi->alt_ref_buffer.uv_stride; -#endif // CONFIG_BGSPRITE - byte = mb_uv_offset; - for (i = 0, k = 256; i < mb_uv_height; i++) { - for (j = 0; j < mb_uv_width; j++, k++) { - int m = k + 256; - - // U - dst1[byte] = - (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]); - - // V - dst2[byte] = - (uint8_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]); - - // move to next pixel - byte++; + if (num_planes > 1) { + dst1 = cpi->alt_ref_buffer.u_buffer; + dst2 = cpi->alt_ref_buffer.v_buffer; + stride = cpi->alt_ref_buffer.uv_stride; + byte = mb_uv_offset; + for (i = 0, k = 256; i < mb_uv_height; i++) { + for (j = 0; j < mb_uv_width; j++, k++) { + int m = k + 256; + // U + dst1[byte] = + (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]); + // V + dst2[byte] = + (uint8_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]); + // move to next pixel + byte++; + } + byte += stride - mb_uv_width; } - byte += stride - mb_uv_width; } -#if CONFIG_HIGHBITDEPTH } -#endif // CONFIG_HIGHBITDEPTH mb_y_offset += 16; mb_uv_offset += mb_uv_width; } @@ -571,7 +506,7 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi, } // Restore input state - for (i = 0; i < MAX_MB_PLANE; i++) mbd->plane[i].pre[0].buf = input_buffer[i]; + for (i = 0; i < num_planes; i++) mbd->plane[i].pre[0].buf = input_buffer[i]; } // Apply buffer limits and context specific adjustments to arnr filter. 
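Illustration (not part of the patch): the normalization loops above rely on a fixed predictor/accumulator layout, with Y in [0, 256), U in [256, 512), and V in [512, 768), which is why the chroma loop starts at k = 256 and reads the V plane at m = k + 256. A tiny index walk makes the layout explicit:

#include <stdio.h>

int main(void) {
  /* One 16x16 luma block plus two chroma planes, as in predictor16[]. */
  enum { PLANE_STRIDE = 256 };
  const int mb_uv_width = 8; /* 4:2:0 chroma of a 16x16 macroblock */
  /* Chroma sample at row 2, column 3: */
  const int k = PLANE_STRIDE + 2 * mb_uv_width + 3; /* U */
  const int m = k + PLANE_STRIDE;                   /* V */
  printf("U at accumulator[%d], V at accumulator[%d]\n", k, m);
  return 0;
}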
@@ -633,11 +568,7 @@ static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost, *arnr_strength = strength; } -void av1_temporal_filter(AV1_COMP *cpi, -#if CONFIG_BGSPRITE - YV12_BUFFER_CONFIG *bg, YV12_BUFFER_CONFIG *target, -#endif // CONFIG_BGSPRITE - int distance) { +void av1_temporal_filter(AV1_COMP *cpi, int distance) { RATE_CONTROL *const rc = &cpi->rc; int frame; int frames_to_blur; @@ -647,17 +578,14 @@ void av1_temporal_filter(AV1_COMP *cpi, int frames_to_blur_forward; struct scale_factors sf; YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL }; -#if CONFIG_EXT_REFS const GF_GROUP *const gf_group = &cpi->twopass.gf_group; -#endif // CONFIG_EXT_REFS // Apply context specific adjustments to the arnr filter parameters. adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength); -// TODO(weitinglin): Currently, we enforce the filtering strength on -// extra ARFs' to be zeros. We should investigate in which -// case it is more beneficial to use non-zero strength -// filtering. -#if CONFIG_EXT_REFS + // TODO(weitinglin): Currently, we enforce the filtering strength on + // extra ARFs' to be zeros. We should investigate in which + // case it is more beneficial to use non-zero strength + // filtering. if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) { strength = 0; frames_to_blur = 1; @@ -685,7 +613,7 @@ void av1_temporal_filter(AV1_COMP *cpi, cpi->is_arf_filter_off[which_arf] = 1; else cpi->is_arf_filter_off[which_arf] = 0; -#endif // CONFIG_EXT_REFS + cpi->common.showable_frame = cpi->is_arf_filter_off[which_arf]; frames_to_blur_backward = (frames_to_blur / 2); frames_to_blur_forward = ((frames_to_blur - 1) / 2); @@ -694,40 +622,20 @@ void av1_temporal_filter(AV1_COMP *cpi, // Setup frame pointers, NULL indicates frame not included in filter. for (frame = 0; frame < frames_to_blur; ++frame) { const int which_buffer = start_frame - frame; -#if CONFIG_BGSPRITE - if (frame == frames_to_blur_backward && bg != NULL) { - // Insert bg into frames at ARF index. - frames[frames_to_blur - 1 - frame] = bg; - } else { -#endif // CONFIG_BGSPRITE - struct lookahead_entry *buf = - av1_lookahead_peek(cpi->lookahead, which_buffer); - frames[frames_to_blur - 1 - frame] = &buf->img; -#if CONFIG_BGSPRITE - } -#endif // CONFIG_BGSPRITE + struct lookahead_entry *buf = + av1_lookahead_peek(cpi->lookahead, which_buffer); + frames[frames_to_blur - 1 - frame] = &buf->img; } if (frames_to_blur > 0) { -// Setup scaling factors. Scaling on each of the arnr frames is not -// supported. -// ARF is produced at the native frame size and resized when coded. -#if CONFIG_HIGHBITDEPTH - av1_setup_scale_factors_for_frame( - &sf, frames[0]->y_crop_width, frames[0]->y_crop_height, - frames[0]->y_crop_width, frames[0]->y_crop_height, - cpi->common.use_highbitdepth); -#else + // Setup scaling factors. Scaling on each of the arnr frames is not + // supported. + // ARF is produced at the native frame size and resized when coded. 
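Illustration (not part of the patch): the window arithmetic in av1_temporal_filter() places frames_to_blur / 2 frames behind the ARF and (frames_to_blur - 1) / 2 in front of it, so the ARF itself is always included and an even window puts its extra frame on the backward side. A quick check of that invariant:

#include <stdio.h>

int main(void) {
  for (int frames_to_blur = 1; frames_to_blur <= 6; ++frames_to_blur) {
    const int backward = frames_to_blur / 2;      /* frames before the ARF */
    const int forward = (frames_to_blur - 1) / 2; /* frames after the ARF */
    /* backward + forward + 1 (the ARF) always equals frames_to_blur. */
    printf("%d frames: %d back, %d forward\n", frames_to_blur, backward,
           forward);
  }
  return 0;
}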
av1_setup_scale_factors_for_frame( &sf, frames[0]->y_crop_width, frames[0]->y_crop_height, frames[0]->y_crop_width, frames[0]->y_crop_height); -#endif // CONFIG_HIGHBITDEPTH } - temporal_filter_iterate_c(cpi, -#if CONFIG_BGSPRITE - target, -#endif // CONFIG_BGSPRITE - frames, frames_to_blur, frames_to_blur_backward, - strength, &sf); + temporal_filter_iterate_c(cpi, frames, frames_to_blur, + frames_to_blur_backward, strength, &sf); } diff --git a/third_party/aom/av1/encoder/temporal_filter.h b/third_party/aom/av1/encoder/temporal_filter.h index 7dd9fad58..bc0863a63 100644 --- a/third_party/aom/av1/encoder/temporal_filter.h +++ b/third_party/aom/av1/encoder/temporal_filter.h @@ -16,11 +16,7 @@ extern "C" { #endif -void av1_temporal_filter(AV1_COMP *cpi, -#if CONFIG_BGSPRITE - YV12_BUFFER_CONFIG *bg, YV12_BUFFER_CONFIG *target, -#endif // CONFIG_BGSPRITE - int distance); +void av1_temporal_filter(AV1_COMP *cpi, int distance); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/encoder/tokenize.c b/third_party/aom/av1/encoder/tokenize.c index a2e24d66b..16a6a9a35 100644 --- a/third_party/aom/av1/encoder/tokenize.c +++ b/third_party/aom/av1/encoder/tokenize.c @@ -23,314 +23,13 @@ #include "av1/encoder/cost.h" #include "av1/encoder/encoder.h" -#if CONFIG_LV_MAP -#include "av1/encoder/encodetxb.c" -#endif +#include "av1/encoder/encodetxb.h" #include "av1/encoder/rdopt.h" #include "av1/encoder/tokenize.h" -static const TOKENVALUE dct_cat_lt_10_value_tokens[] = { - { 9, 63 }, { 9, 61 }, { 9, 59 }, { 9, 57 }, { 9, 55 }, { 9, 53 }, { 9, 51 }, - { 9, 49 }, { 9, 47 }, { 9, 45 }, { 9, 43 }, { 9, 41 }, { 9, 39 }, { 9, 37 }, - { 9, 35 }, { 9, 33 }, { 9, 31 }, { 9, 29 }, { 9, 27 }, { 9, 25 }, { 9, 23 }, - { 9, 21 }, { 9, 19 }, { 9, 17 }, { 9, 15 }, { 9, 13 }, { 9, 11 }, { 9, 9 }, - { 9, 7 }, { 9, 5 }, { 9, 3 }, { 9, 1 }, { 8, 31 }, { 8, 29 }, { 8, 27 }, - { 8, 25 }, { 8, 23 }, { 8, 21 }, { 8, 19 }, { 8, 17 }, { 8, 15 }, { 8, 13 }, - { 8, 11 }, { 8, 9 }, { 8, 7 }, { 8, 5 }, { 8, 3 }, { 8, 1 }, { 7, 15 }, - { 7, 13 }, { 7, 11 }, { 7, 9 }, { 7, 7 }, { 7, 5 }, { 7, 3 }, { 7, 1 }, - { 6, 7 }, { 6, 5 }, { 6, 3 }, { 6, 1 }, { 5, 3 }, { 5, 1 }, { 4, 1 }, - { 3, 1 }, { 2, 1 }, { 1, 1 }, { 0, 0 }, { 1, 0 }, { 2, 0 }, { 3, 0 }, - { 4, 0 }, { 5, 0 }, { 5, 2 }, { 6, 0 }, { 6, 2 }, { 6, 4 }, { 6, 6 }, - { 7, 0 }, { 7, 2 }, { 7, 4 }, { 7, 6 }, { 7, 8 }, { 7, 10 }, { 7, 12 }, - { 7, 14 }, { 8, 0 }, { 8, 2 }, { 8, 4 }, { 8, 6 }, { 8, 8 }, { 8, 10 }, - { 8, 12 }, { 8, 14 }, { 8, 16 }, { 8, 18 }, { 8, 20 }, { 8, 22 }, { 8, 24 }, - { 8, 26 }, { 8, 28 }, { 8, 30 }, { 9, 0 }, { 9, 2 }, { 9, 4 }, { 9, 6 }, - { 9, 8 }, { 9, 10 }, { 9, 12 }, { 9, 14 }, { 9, 16 }, { 9, 18 }, { 9, 20 }, - { 9, 22 }, { 9, 24 }, { 9, 26 }, { 9, 28 }, { 9, 30 }, { 9, 32 }, { 9, 34 }, - { 9, 36 }, { 9, 38 }, { 9, 40 }, { 9, 42 }, { 9, 44 }, { 9, 46 }, { 9, 48 }, - { 9, 50 }, { 9, 52 }, { 9, 54 }, { 9, 56 }, { 9, 58 }, { 9, 60 }, { 9, 62 } -}; -const TOKENVALUE *av1_dct_cat_lt_10_value_tokens = - dct_cat_lt_10_value_tokens + - (sizeof(dct_cat_lt_10_value_tokens) / sizeof(*dct_cat_lt_10_value_tokens)) / - 2; -// The corresponding costs of the extrabits for the tokens in the above table -// are stored in the table below. The values are obtained from looking up the -// entry for the specified extrabits in the table corresponding to the token -// (as defined in cost element av1_extra_bits) -// e.g. 
{9, 63} maps to cat5_cost[63 >> 1], {1, 1} maps to sign_cost[1 >> 1] -static const int dct_cat_lt_10_value_cost[] = { - 3773, 3750, 3704, 3681, 3623, 3600, 3554, 3531, 3432, 3409, 3363, 3340, 3282, - 3259, 3213, 3190, 3136, 3113, 3067, 3044, 2986, 2963, 2917, 2894, 2795, 2772, - 2726, 2703, 2645, 2622, 2576, 2553, 3197, 3116, 3058, 2977, 2881, 2800, 2742, - 2661, 2615, 2534, 2476, 2395, 2299, 2218, 2160, 2079, 2566, 2427, 2334, 2195, - 2023, 1884, 1791, 1652, 1893, 1696, 1453, 1256, 1229, 864, 512, 512, 512, - 512, 0, 512, 512, 512, 512, 864, 1229, 1256, 1453, 1696, 1893, 1652, - 1791, 1884, 2023, 2195, 2334, 2427, 2566, 2079, 2160, 2218, 2299, 2395, 2476, - 2534, 2615, 2661, 2742, 2800, 2881, 2977, 3058, 3116, 3197, 2553, 2576, 2622, - 2645, 2703, 2726, 2772, 2795, 2894, 2917, 2963, 2986, 3044, 3067, 3113, 3136, - 3190, 3213, 3259, 3282, 3340, 3363, 3409, 3432, 3531, 3554, 3600, 3623, 3681, - 3704, 3750, 3773, -}; -const int *av1_dct_cat_lt_10_value_cost = - dct_cat_lt_10_value_cost + - (sizeof(dct_cat_lt_10_value_cost) / sizeof(*dct_cat_lt_10_value_cost)) / 2; - -// Array indices are identical to previously-existing CONTEXT_NODE indices -/* clang-format off */ -const aom_tree_index av1_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = { - -EOB_TOKEN, 2, // 0 = EOB - -ZERO_TOKEN, 4, // 1 = ZERO - -ONE_TOKEN, 6, // 2 = ONE - 8, 12, // 3 = LOW_VAL - -TWO_TOKEN, 10, // 4 = TWO - -THREE_TOKEN, -FOUR_TOKEN, // 5 = THREE - 14, 16, // 6 = HIGH_LOW - -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, // 7 = CAT_ONE - 18, 20, // 8 = CAT_THREEFOUR - -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, // 9 = CAT_THREE - -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 10 = CAT_FIVE -}; -/* clang-format on */ - -static const int16_t zero_cost[] = { 0 }; -static const int16_t sign_cost[1] = { 512 }; -static const int16_t cat1_cost[1 << 1] = { 864, 1229 }; -static const int16_t cat2_cost[1 << 2] = { 1256, 1453, 1696, 1893 }; -static const int16_t cat3_cost[1 << 3] = { 1652, 1791, 1884, 2023, - 2195, 2334, 2427, 2566 }; -static const int16_t cat4_cost[1 << 4] = { 2079, 2160, 2218, 2299, 2395, 2476, - 2534, 2615, 2661, 2742, 2800, 2881, - 2977, 3058, 3116, 3197 }; -static const int16_t cat5_cost[1 << 5] = { - 2553, 2576, 2622, 2645, 2703, 2726, 2772, 2795, 2894, 2917, 2963, - 2986, 3044, 3067, 3113, 3136, 3190, 3213, 3259, 3282, 3340, 3363, - 3409, 3432, 3531, 3554, 3600, 3623, 3681, 3704, 3750, 3773 -}; -const int16_t av1_cat6_low_cost[256] = { - 3378, 3390, 3401, 3413, 3435, 3447, 3458, 3470, 3517, 3529, 3540, 3552, 3574, - 3586, 3597, 3609, 3671, 3683, 3694, 3706, 3728, 3740, 3751, 3763, 3810, 3822, - 3833, 3845, 3867, 3879, 3890, 3902, 3973, 3985, 3996, 4008, 4030, 4042, 4053, - 4065, 4112, 4124, 4135, 4147, 4169, 4181, 4192, 4204, 4266, 4278, 4289, 4301, - 4323, 4335, 4346, 4358, 4405, 4417, 4428, 4440, 4462, 4474, 4485, 4497, 4253, - 4265, 4276, 4288, 4310, 4322, 4333, 4345, 4392, 4404, 4415, 4427, 4449, 4461, - 4472, 4484, 4546, 4558, 4569, 4581, 4603, 4615, 4626, 4638, 4685, 4697, 4708, - 4720, 4742, 4754, 4765, 4777, 4848, 4860, 4871, 4883, 4905, 4917, 4928, 4940, - 4987, 4999, 5010, 5022, 5044, 5056, 5067, 5079, 5141, 5153, 5164, 5176, 5198, - 5210, 5221, 5233, 5280, 5292, 5303, 5315, 5337, 5349, 5360, 5372, 4988, 5000, - 5011, 5023, 5045, 5057, 5068, 5080, 5127, 5139, 5150, 5162, 5184, 5196, 5207, - 5219, 5281, 5293, 5304, 5316, 5338, 5350, 5361, 5373, 5420, 5432, 5443, 5455, - 5477, 5489, 5500, 5512, 5583, 5595, 5606, 5618, 5640, 5652, 5663, 5675, 5722, - 5734, 5745, 5757, 5779, 5791, 5802, 5814, 5876, 5888, 5899, 5911, 5933, 5945, - 5956, 
5968, 6015, 6027, 6038, 6050, 6072, 6084, 6095, 6107, 5863, 5875, 5886, - 5898, 5920, 5932, 5943, 5955, 6002, 6014, 6025, 6037, 6059, 6071, 6082, 6094, - 6156, 6168, 6179, 6191, 6213, 6225, 6236, 6248, 6295, 6307, 6318, 6330, 6352, - 6364, 6375, 6387, 6458, 6470, 6481, 6493, 6515, 6527, 6538, 6550, 6597, 6609, - 6620, 6632, 6654, 6666, 6677, 6689, 6751, 6763, 6774, 6786, 6808, 6820, 6831, - 6843, 6890, 6902, 6913, 6925, 6947, 6959, 6970, 6982 -}; -const int av1_cat6_high_cost[CAT6_HIGH_COST_ENTRIES] = { - 100, 2263, 2739, 4902, 3160, 5323, 5799, 7962, 3678, 5841, 6317, - 8480, 6738, 8901, 9377, 11540, 3678, 5841, 6317, 8480, 6738, 8901, - 9377, 11540, 7256, 9419, 9895, 12058, 10316, 12479, 12955, 15118, 3678, - 5841, 6317, 8480, 6738, 8901, 9377, 11540, 7256, 9419, 9895, 12058, - 10316, 12479, 12955, 15118, 7256, 9419, 9895, 12058, 10316, 12479, 12955, - 15118, 10834, 12997, 13473, 15636, 13894, 16057, 16533, 18696, -#if CONFIG_HIGHBITDEPTH - 4193, 6356, 6832, 8995, 7253, 9416, 9892, 12055, 7771, 9934, 10410, - 12573, 10831, 12994, 13470, 15633, 7771, 9934, 10410, 12573, 10831, 12994, - 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 7771, - 9934, 10410, 12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, - 14409, 16572, 17048, 19211, 11349, 13512, 13988, 16151, 14409, 16572, 17048, - 19211, 14927, 17090, 17566, 19729, 17987, 20150, 20626, 22789, 4193, 6356, - 6832, 8995, 7253, 9416, 9892, 12055, 7771, 9934, 10410, 12573, 10831, - 12994, 13470, 15633, 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633, - 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 7771, 9934, 10410, - 12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572, - 17048, 19211, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927, - 17090, 17566, 19729, 17987, 20150, 20626, 22789, 8286, 10449, 10925, 13088, - 11346, 13509, 13985, 16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563, - 19726, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, - 18081, 20244, 18502, 20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924, - 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, - 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659, - 23822, 22080, 24243, 24719, 26882, 4193, 6356, 6832, 8995, 7253, 9416, - 9892, 12055, 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633, 7771, - 9934, 10410, 12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, - 14409, 16572, 17048, 19211, 7771, 9934, 10410, 12573, 10831, 12994, 13470, - 15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 11349, 13512, - 13988, 16151, 14409, 16572, 17048, 19211, 14927, 17090, 17566, 19729, 17987, - 20150, 20626, 22789, 8286, 10449, 10925, 13088, 11346, 13509, 13985, 16148, - 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503, - 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, - 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, - 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244, - 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719, - 26882, 8286, 10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027, - 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924, - 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, - 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, - 20244, 18502, 20665, 21141, 
23304, 15442, 17605, 18081, 20244, 18502, 20665, - 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882, 12379, - 14542, 15018, 17181, 15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759, - 19017, 21180, 21656, 23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656, - 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120, - 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, - 24758, 25234, 27397, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, - 23113, 25276, 25752, 27915, 26173, 28336, 28812, 30975, 4193, 6356, 6832, - 8995, 7253, 9416, 9892, 12055, 7771, 9934, 10410, 12573, 10831, 12994, - 13470, 15633, 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633, 11349, - 13512, 13988, 16151, 14409, 16572, 17048, 19211, 7771, 9934, 10410, 12573, - 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048, - 19211, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927, 17090, - 17566, 19729, 17987, 20150, 20626, 22789, 8286, 10449, 10925, 13088, 11346, - 13509, 13985, 16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, - 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, - 20244, 18502, 20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087, - 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442, - 17605, 18081, 20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822, - 22080, 24243, 24719, 26882, 8286, 10449, 10925, 13088, 11346, 13509, 13985, - 16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027, - 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, - 20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, - 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081, - 20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243, - 24719, 26882, 12379, 14542, 15018, 17181, 15439, 17602, 18078, 20241, 15957, - 18120, 18596, 20759, 19017, 21180, 21656, 23819, 15957, 18120, 18596, 20759, - 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, - 27397, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698, - 22174, 24337, 22595, 24758, 25234, 27397, 19535, 21698, 22174, 24337, 22595, - 24758, 25234, 27397, 23113, 25276, 25752, 27915, 26173, 28336, 28812, 30975, - 8286, 10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027, 14503, - 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924, 17087, - 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 11864, - 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, - 18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244, 18502, 20665, 21141, - 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882, 12379, 14542, - 15018, 17181, 15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759, 19017, - 21180, 21656, 23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, - 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120, 18596, - 20759, 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, - 25234, 27397, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 23113, - 25276, 25752, 27915, 26173, 28336, 28812, 30975, 12379, 14542, 15018, 17181, - 15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759, 19017, 21180, 21656, - 23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698, - 22174, 24337, 22595, 24758, 25234, 27397, 
15957, 18120, 18596, 20759, 19017, - 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, - 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 23113, 25276, 25752, - 27915, 26173, 28336, 28812, 30975, 16472, 18635, 19111, 21274, 19532, 21695, - 22171, 24334, 20050, 22213, 22689, 24852, 23110, 25273, 25749, 27912, 20050, - 22213, 22689, 24852, 23110, 25273, 25749, 27912, 23628, 25791, 26267, 28430, - 26688, 28851, 29327, 31490, 20050, 22213, 22689, 24852, 23110, 25273, 25749, - 27912, 23628, 25791, 26267, 28430, 26688, 28851, 29327, 31490, 23628, 25791, - 26267, 28430, 26688, 28851, 29327, 31490, 27206, 29369, 29845, 32008, 30266, - 32429, 32905, 35068 -#endif -}; - -const uint8_t av1_cat6_skipped_bits_discount[8] = { - 0, 3, 6, 9, 12, 18, 24, 30 -}; - -#if CONFIG_NEW_MULTISYMBOL -const av1_extra_bit av1_extra_bits[ENTROPY_TOKENS] = { - { 0, 0, 0, zero_cost }, // ZERO_TOKEN - { 0, 0, 1, sign_cost }, // ONE_TOKEN - { 0, 0, 2, sign_cost }, // TWO_TOKEN - { 0, 0, 3, sign_cost }, // THREE_TOKEN - { 0, 0, 4, sign_cost }, // FOUR_TOKEN - { av1_cat1_cdf, 1, CAT1_MIN_VAL, cat1_cost }, // CATEGORY1_TOKEN - { av1_cat2_cdf, 2, CAT2_MIN_VAL, cat2_cost }, // CATEGORY2_TOKEN - { av1_cat3_cdf, 3, CAT3_MIN_VAL, cat3_cost }, // CATEGORY3_TOKEN - { av1_cat4_cdf, 4, CAT4_MIN_VAL, cat4_cost }, // CATEGORY4_TOKEN - { av1_cat5_cdf, 5, CAT5_MIN_VAL, cat5_cost }, // CATEGORY5_TOKEN - { av1_cat6_cdf, 18, CAT6_MIN_VAL, 0 }, // CATEGORY6_TOKEN - { 0, 0, 0, zero_cost } // EOB_TOKEN -}; -#else -const av1_extra_bit av1_extra_bits[ENTROPY_TOKENS] = { - { 0, 0, 0, zero_cost }, // ZERO_TOKEN - { 0, 0, 1, sign_cost }, // ONE_TOKEN - { 0, 0, 2, sign_cost }, // TWO_TOKEN - { 0, 0, 3, sign_cost }, // THREE_TOKEN - { 0, 0, 4, sign_cost }, // FOUR_TOKEN - { av1_cat1_prob, 1, CAT1_MIN_VAL, cat1_cost }, // CATEGORY1_TOKEN - { av1_cat2_prob, 2, CAT2_MIN_VAL, cat2_cost }, // CATEGORY2_TOKEN - { av1_cat3_prob, 3, CAT3_MIN_VAL, cat3_cost }, // CATEGORY3_TOKEN - { av1_cat4_prob, 4, CAT4_MIN_VAL, cat4_cost }, // CATEGORY4_TOKEN - { av1_cat5_prob, 5, CAT5_MIN_VAL, cat5_cost }, // CATEGORY5_TOKEN - { av1_cat6_prob, 18, CAT6_MIN_VAL, 0 }, // CATEGORY6_TOKEN - { 0, 0, 0, zero_cost } // EOB_TOKEN -}; -#endif - -#if !CONFIG_PVQ || CONFIG_VAR_TX -static void cost_coeffs_b(int plane, int block, int blk_row, int blk_col, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { - struct tokenize_b_args *const args = arg; - const AV1_COMP *const cpi = args->cpi; - const AV1_COMMON *cm = &cpi->common; - ThreadData *const td = args->td; - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - struct macroblock_plane *p = &x->plane[plane]; - struct macroblockd_plane *pd = &xd->plane[plane]; - const PLANE_TYPE type = pd->plane_type; - const TX_TYPE tx_type = - av1_get_tx_type(type, xd, blk_row, blk_col, block, tx_size); - const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi); - const int rate = av1_cost_coeffs( - cpi, x, plane, blk_row, blk_col, block, tx_size, scan_order, - pd->above_context + blk_col, pd->left_context + blk_row, 0); - args->this_rate += rate; - (void)plane_bsize; - av1_set_contexts(xd, pd, plane, tx_size, p->eobs[block] > 0, blk_col, - blk_row); -} - -static void set_entropy_context_b(int plane, int block, int blk_row, - int blk_col, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg) { - struct tokenize_b_args *const args = arg; - ThreadData *const td = args->td; - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = 
&x->e_mbd; - struct macroblock_plane *p = &x->plane[plane]; - struct macroblockd_plane *pd = &xd->plane[plane]; - (void)plane_bsize; - av1_set_contexts(xd, pd, plane, tx_size, p->eobs[block] > 0, blk_col, - blk_row); -} - -static INLINE void add_token(TOKENEXTRA **t, - aom_cdf_prob (*tail_cdf)[CDF_SIZE(ENTROPY_TOKENS)], - aom_cdf_prob (*head_cdf)[CDF_SIZE(ENTROPY_TOKENS)], - int eob_val, int first_val, int32_t extra, - uint8_t token) { - (*t)->token = token; - (*t)->extra = extra; - (*t)->tail_cdf = tail_cdf; - (*t)->head_cdf = head_cdf; - (*t)->eob_val = eob_val; - (*t)->first_val = first_val; - (*t)++; - - if (token == BLOCK_Z_TOKEN) { - update_cdf(*head_cdf, 0, HEAD_TOKENS + 1); - } else { - if (eob_val != LAST_EOB) { - const int symb = 2 * AOMMIN(token, TWO_TOKEN) - eob_val + first_val; - update_cdf(*head_cdf, symb, HEAD_TOKENS + first_val); - } - if (token > ONE_TOKEN) - update_cdf(*tail_cdf, token - TWO_TOKEN, TAIL_TOKENS); - } -} -#endif // !CONFIG_PVQ || CONFIG_VAR_TX - static int cost_and_tokenize_map(Av1ColorMapParam *param, TOKENEXTRA **t, - int calc_rate) { + int plane, int calc_rate, int allow_update_cdf, + FRAME_COUNTS *counts) { const uint8_t *const color_map = param->color_map; MapCdf map_cdf = param->map_cdf; ColorCost color_cost = param->color_cost; @@ -338,28 +37,37 @@ static int cost_and_tokenize_map(Av1ColorMapParam *param, TOKENEXTRA **t, const int rows = param->rows; const int cols = param->cols; const int n = param->n_colors; - + const int palette_size_idx = n - PALETTE_MIN_SIZE; int this_rate = 0; uint8_t color_order[PALETTE_MAX_SIZE]; -#if CONFIG_PALETTE_THROUGHPUT + + (void)plane; + (void)counts; + for (int k = 1; k < rows + cols - 1; ++k) { for (int j = AOMMIN(k, cols - 1); j >= AOMMAX(0, k - rows + 1); --j) { int i = k - j; -#else - for (int i = 0; i < rows; ++i) { - for (int j = (i == 0 ? 1 : 0); j < cols; ++j) { -#endif // CONFIG_PALETTE_THROUGHPUT int color_new_idx; const int color_ctx = av1_get_palette_color_index_context( color_map, plane_block_width, i, j, n, color_order, &color_new_idx); assert(color_new_idx >= 0 && color_new_idx < n); if (calc_rate) { - this_rate += - (*color_cost)[n - PALETTE_MIN_SIZE][color_ctx][color_new_idx]; + this_rate += (*color_cost)[palette_size_idx][color_ctx][color_new_idx]; } else { (*t)->token = color_new_idx; - (*t)->color_map_cdf = map_cdf[n - PALETTE_MIN_SIZE][color_ctx]; + (*t)->color_map_cdf = map_cdf[palette_size_idx][color_ctx]; ++(*t); + if (allow_update_cdf) + update_cdf(map_cdf[palette_size_idx][color_ctx], color_new_idx, n); +#if CONFIG_ENTROPY_STATS + if (plane) { + ++counts->palette_uv_color_index[palette_size_idx][color_ctx] + [color_new_idx]; + } else { + ++counts->palette_y_color_index[palette_size_idx][color_ctx] + [color_new_idx]; + } +#endif } } } @@ -370,7 +78,7 @@ static int cost_and_tokenize_map(Av1ColorMapParam *param, TOKENEXTRA **t, static void get_palette_params(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize, Av1ColorMapParam *params) { const MACROBLOCKD *const xd = &x->e_mbd; - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const MB_MODE_INFO *const mbmi = xd->mi[0]; const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; params->color_map = xd->plane[plane].color_index_map; params->map_cdf = plane ? 
xd->tile_ctx->palette_uv_color_index_cdf @@ -382,263 +90,62 @@ static void get_palette_params(const MACROBLOCK *const x, int plane, &params->rows, &params->cols); } -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK -static void get_mrc_params(const MACROBLOCK *const x, int block, - TX_SIZE tx_size, Av1ColorMapParam *params) { - memset(params, 0, sizeof(*params)); - const MACROBLOCKD *const xd = &x->e_mbd; - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const int is_inter = is_inter_block(mbmi); - params->color_map = BLOCK_OFFSET(xd->mrc_mask, block); - params->map_cdf = is_inter ? xd->tile_ctx->mrc_mask_inter_cdf - : xd->tile_ctx->mrc_mask_intra_cdf; - params->color_cost = - is_inter ? &x->mrc_mask_inter_cost : &x->mrc_mask_intra_cost; - params->n_colors = 2; - params->plane_width = tx_size_wide[tx_size]; - params->rows = tx_size_high[tx_size]; - params->cols = tx_size_wide[tx_size]; -} -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - static void get_color_map_params(const MACROBLOCK *const x, int plane, - int block, BLOCK_SIZE bsize, TX_SIZE tx_size, + BLOCK_SIZE bsize, TX_SIZE tx_size, COLOR_MAP_TYPE type, Av1ColorMapParam *params) { - (void)block; (void)tx_size; memset(params, 0, sizeof(*params)); switch (type) { case PALETTE_MAP: get_palette_params(x, plane, bsize, params); break; -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - case MRC_MAP: get_mrc_params(x, block, tx_size, params); break; -#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK default: assert(0 && "Invalid color map type"); return; } } -int av1_cost_color_map(const MACROBLOCK *const x, int plane, int block, - BLOCK_SIZE bsize, TX_SIZE tx_size, COLOR_MAP_TYPE type) { +int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize, + TX_SIZE tx_size, COLOR_MAP_TYPE type) { assert(plane == 0 || plane == 1); Av1ColorMapParam color_map_params; - get_color_map_params(x, plane, block, bsize, tx_size, type, - &color_map_params); - return cost_and_tokenize_map(&color_map_params, NULL, 1); + get_color_map_params(x, plane, bsize, tx_size, type, &color_map_params); + return cost_and_tokenize_map(&color_map_params, NULL, plane, 1, 0, NULL); } -void av1_tokenize_color_map(const MACROBLOCK *const x, int plane, int block, +void av1_tokenize_color_map(const MACROBLOCK *const x, int plane, TOKENEXTRA **t, BLOCK_SIZE bsize, TX_SIZE tx_size, - COLOR_MAP_TYPE type) { + COLOR_MAP_TYPE type, int allow_update_cdf, + FRAME_COUNTS *counts) { assert(plane == 0 || plane == 1); -#if CONFIG_MRC_TX - if (type == MRC_MAP) { - const int is_inter = is_inter_block(&x->e_mbd.mi[0]->mbmi); - if ((is_inter && !SIGNAL_MRC_MASK_INTER) || - (!is_inter && !SIGNAL_MRC_MASK_INTRA)) - return; - } -#endif // CONFIG_MRC_TX Av1ColorMapParam color_map_params; - get_color_map_params(x, plane, block, bsize, tx_size, type, - &color_map_params); + get_color_map_params(x, plane, bsize, tx_size, type, &color_map_params); // The first color index does not use context or entropy.
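// (Illustrative trace, not part of the patch: for an assumed 3x3 color map,
// the wavefront loop in cost_and_tokenize_map() visits the remaining (i, j)
// indices in anti-diagonal order,
//   k=1: (0,1), (1,0)
//   k=2: (0,2), (1,1), (2,0)
//   k=3: (1,2), (2,1)
//   k=4: (2,2)
// so each index is coded after its above, left, and above-left neighbors,
// which is exactly what av1_get_palette_color_index_context() requires; the
// context-free (0,0) entry is the one emitted directly below.)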
(*t)->token = color_map_params.color_map[0]; (*t)->color_map_cdf = NULL; ++(*t); - cost_and_tokenize_map(&color_map_params, t, 0); -} - -#if CONFIG_PVQ -static void add_pvq_block(AV1_COMMON *const cm, MACROBLOCK *const x, - PVQ_INFO *pvq) { - PVQ_QUEUE *q = x->pvq_q; - if (q->curr_pos >= q->buf_len) { - int new_buf_len = 2 * q->buf_len + 1; - PVQ_INFO *new_buf; - CHECK_MEM_ERROR(cm, new_buf, aom_malloc(new_buf_len * sizeof(PVQ_INFO))); - memcpy(new_buf, q->buf, q->buf_len * sizeof(PVQ_INFO)); - aom_free(q->buf); - q->buf = new_buf; - q->buf_len = new_buf_len; - } - OD_COPY(q->buf + q->curr_pos, pvq, 1); - ++q->curr_pos; -} - -// NOTE: This does not actually generate tokens; instead we store the encoding -// decisions made for PVQ in a queue that we will read from when -// actually writing the bitstream in write_modes_b -static void tokenize_pvq(int plane, int block, int blk_row, int blk_col, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { - struct tokenize_b_args *const args = arg; - const AV1_COMP *cpi = args->cpi; - const AV1_COMMON *const cm = &cpi->common; - ThreadData *const td = args->td; - MACROBLOCK *const x = &td->mb; - PVQ_INFO *pvq_info; - - (void)block; - (void)blk_row; - (void)blk_col; - (void)plane_bsize; - (void)tx_size; - - assert(block < MAX_PVQ_BLOCKS_IN_SB); - pvq_info = &x->pvq[block][plane]; - add_pvq_block((AV1_COMMON * const) cm, x, pvq_info); -} -#endif // CONFIG_PVQ - -static void tokenize_b(int plane, int block, int blk_row, int blk_col, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { -#if !CONFIG_PVQ - struct tokenize_b_args *const args = arg; - const AV1_COMP *cpi = args->cpi; - const AV1_COMMON *const cm = &cpi->common; - ThreadData *const td = args->td; - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - TOKENEXTRA **tp = args->tp; - uint8_t token_cache[MAX_TX_SQUARE]; - struct macroblock_plane *p = &x->plane[plane]; - struct macroblockd_plane *pd = &xd->plane[plane]; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - int pt; /* near block/prev token context index */ - int c; - TOKENEXTRA *t = *tp; /* store tokens starting here */ - const int eob = p->eobs[block]; - const PLANE_TYPE type = pd->plane_type; - const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); -#if CONFIG_SUPERTX - const int segment_id = AOMMIN(mbmi->segment_id, mbmi->segment_id_supertx); -#else - const int segment_id = mbmi->segment_id; -#endif // CONFIG_SUPERTX - const int16_t *scan, *nb; - const TX_TYPE tx_type = - av1_get_tx_type(type, xd, blk_row, blk_col, block, tx_size); - const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi); - const int ref = is_inter_block(mbmi); - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - aom_cdf_prob( - *const coef_head_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] = - ec_ctx->coef_head_cdfs[txsize_sqr_map[tx_size]][type][ref]; - aom_cdf_prob( - *const coef_tail_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] = - ec_ctx->coef_tail_cdfs[txsize_sqr_map[tx_size]][type][ref]; - int eob_val; - int first_val = 1; - const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size); - const uint8_t *const band = get_band_translate(tx_size); - int16_t token; - EXTRABIT extra; - (void)plane_bsize; - pt = get_entropy_context(tx_size, pd->above_context + blk_col, - pd->left_context + blk_row); - scan = scan_order->scan; - nb = scan_order->neighbors; - c = 0; - -#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - if (tx_type == MRC_DCT) - av1_tokenize_color_map(x, plane, block, &t, plane_bsize, tx_size, MRC_MAP); -#endif //
CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK - - if (eob == 0) - add_token(&t, &coef_tail_cdfs[band[c]][pt], &coef_head_cdfs[band[c]][pt], 1, - 1, 0, BLOCK_Z_TOKEN); - - while (c < eob) { - int v = qcoeff[scan[c]]; - first_val = (c == 0); - - if (!v) { - add_token(&t, &coef_tail_cdfs[band[c]][pt], &coef_head_cdfs[band[c]][pt], - 0, first_val, 0, ZERO_TOKEN); - token_cache[scan[c]] = 0; - } else { - eob_val = - (c + 1 == eob) ? (c + 1 == seg_eob ? LAST_EOB : EARLY_EOB) : NO_EOB; - av1_get_token_extra(v, &token, &extra); - add_token(&t, &coef_tail_cdfs[band[c]][pt], &coef_head_cdfs[band[c]][pt], - eob_val, first_val, extra, (uint8_t)token); - token_cache[scan[c]] = av1_pt_energy_class[token]; - } - ++c; - pt = get_coef_context(nb, token_cache, AOMMIN(c, eob - 1)); - } - -#if CONFIG_COEF_INTERLEAVE - t->token = EOSB_TOKEN; - t++; -#endif - - *tp = t; - -#if CONFIG_ADAPT_SCAN - // Since dqcoeff is not available here, we pass qcoeff into - // av1_update_scan_count_facade(). The update behavior should be the same - // because av1_update_scan_count_facade() only cares if coefficients are zero - // or not. - av1_update_scan_count_facade((AV1_COMMON *)cm, td->counts, tx_size, tx_type, - qcoeff, c); -#endif - - av1_set_contexts(xd, pd, plane, tx_size, c > 0, blk_col, blk_row); -#else // !CONFIG_PVQ - tokenize_pvq(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); -#endif // !CONFIG_PVQ -} - -struct is_skippable_args { - uint16_t *eobs; - int *skippable; -}; -static void is_skippable(int plane, int block, int blk_row, int blk_col, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *argv) { - struct is_skippable_args *args = argv; - (void)plane; - (void)plane_bsize; - (void)tx_size; - (void)blk_row; - (void)blk_col; - args->skippable[0] &= (!args->eobs[block]); + cost_and_tokenize_map(&color_map_params, t, plane, 0, allow_update_cdf, + counts); } -// TODO(yaowu): rewrite and optimize this function to remove the usage of -// av1_foreach_transform_block() and simplify is_skippable(). -int av1_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { - int result = 1; - struct is_skippable_args args = { x->plane[plane].eobs, &result }; - av1_foreach_transformed_block_in_plane(&x->e_mbd, bsize, plane, is_skippable, - &args); - return result; -} - -#if CONFIG_VAR_TX void tokenize_vartx(ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run, TX_SIZE tx_size, BLOCK_SIZE plane_bsize, int blk_row, int blk_col, int block, int plane, void *arg) { MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO *const mbmi = xd->mi[0]; const struct macroblockd_plane *const pd = &xd->plane[plane]; - const BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; - const int tx_row = blk_row >> (1 - pd->subsampling_y); - const int tx_col = blk_col >> (1 - pd->subsampling_x); const int max_blocks_high = max_block_high(xd, plane_bsize, plane); const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); - TX_SIZE plane_tx_size; if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; - plane_tx_size = - plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0] - : mbmi->inter_tx_size[tx_row][tx_col]; + const TX_SIZE plane_tx_size = + plane ? 
av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x, + pd->subsampling_y) + : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row, + blk_col)]; - if (tx_size == plane_tx_size) { - plane_bsize = get_plane_block_size(mbmi->sb_type, pd); -#if CONFIG_LV_MAP + if (tx_size == plane_tx_size || plane) { + plane_bsize = get_plane_block_size(mbmi->sb_type, pd->subsampling_x, + pd->subsampling_y); if (!dry_run) { av1_update_and_record_txb_context(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); @@ -649,120 +156,71 @@ void tokenize_vartx(ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run, printf("DRY_RUN_COSTCOEFFS is not supported yet\n"); assert(0); } -#else - if (!dry_run) - tokenize_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); - else if (dry_run == DRY_RUN_NORMAL) - set_entropy_context_b(plane, block, blk_row, blk_col, plane_bsize, - tx_size, arg); - else if (dry_run == DRY_RUN_COSTCOEFFS) - cost_coeffs_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); -#endif } else { -#if CONFIG_RECT_TX_EXT - int is_qttx = plane_tx_size == quarter_txsize_lookup[plane_bsize]; - const TX_SIZE sub_txs = is_qttx ? plane_tx_size : sub_tx_size_map[tx_size]; -#else // Half the block size in transform block unit. const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; -#endif - const int bsl = tx_size_wide_unit[sub_txs]; - int i; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + const int step = bsw * bsh; - assert(bsl > 0); + assert(bsw > 0 && bsh > 0); - for (i = 0; i < 4; ++i) { -#if CONFIG_RECT_TX_EXT - int is_wide_tx = tx_size_wide_unit[sub_txs] > tx_size_high_unit[sub_txs]; - const int offsetr = - is_qttx ? (is_wide_tx ? i * tx_size_high_unit[sub_txs] : 0) - : blk_row + ((i >> 1) * bsl); - const int offsetc = - is_qttx ? (is_wide_tx ? 
0 : i * tx_size_wide_unit[sub_txs]) - : blk_col + ((i & 0x01) * bsl); -#else - const int offsetr = blk_row + ((i >> 1) * bsl); - const int offsetc = blk_col + ((i & 0x01) * bsl); -#endif + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { + for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { + const int offsetr = blk_row + row; + const int offsetc = blk_col + col; - int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs]; + if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; - - tokenize_vartx(td, t, dry_run, sub_txs, plane_bsize, offsetr, offsetc, - block, plane, arg); - block += step; + tokenize_vartx(td, t, dry_run, sub_txs, plane_bsize, offsetr, offsetc, + block, plane, arg); + block += step; + } } } } void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row, int mi_col, - BLOCK_SIZE bsize, int *rate) { + BLOCK_SIZE bsize, int *rate, + uint8_t allow_update_cdf) { const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; -#if CONFIG_LV_MAP + MB_MODE_INFO *const mbmi = xd->mi[0]; (void)t; -#else - TOKENEXTRA *t_backup = *t; -#endif - const int ctx = av1_get_skip_context(xd); - const int skip_inc = - !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); - struct tokenize_b_args arg = { cpi, td, t, 0 }; - int plane; + struct tokenize_b_args arg = { cpi, td, t, 0, allow_update_cdf }; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; if (mbmi->skip) { - if (!dry_run) td->counts->skip[ctx][1] += skip_inc; - av1_reset_skip_context(xd, mi_row, mi_col, bsize); -#if !CONFIG_LV_MAP - if (dry_run) *t = t_backup; -#endif + av1_reset_skip_context(xd, mi_row, mi_col, bsize, num_planes); return; } - if (!dry_run) td->counts->skip[ctx][0] += skip_inc; -#if !CONFIG_LV_MAP - else - *t = t_backup; -#endif - - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { -#if CONFIG_CB4X4 + for (int plane = 0; plane < num_planes; ++plane) { if (!is_chroma_reference(mi_row, mi_col, bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y)) { -#if !CONFIG_PVQ && !CONFIG_LV_MAP - if (!dry_run) { - (*t)->token = EOSB_TOKEN; - (*t)++; - } -#endif continue; } -#endif const struct macroblockd_plane *const pd = &xd->plane[plane]; -#if CONFIG_CHROMA_SUB8X8 + const BLOCK_SIZE bsizec = + scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y); const BLOCK_SIZE plane_bsize = - AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); -#else - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); -#endif + get_plane_block_size(bsizec, pd->subsampling_x, pd->subsampling_y); const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; - const int mi_height = block_size_high[plane_bsize] >> tx_size_wide_log2[0]; - const TX_SIZE max_tx_size = get_vartx_max_txsize( - mbmi, plane_bsize, pd->subsampling_x || pd->subsampling_y); + const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0]; + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane); const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size]; int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0]; - int bh = block_size_high[txb_size] >> tx_size_wide_log2[0]; + int bh = block_size_high[txb_size] >> tx_size_high_log2[0]; int idx, idy; int block = 0; int step = 
tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; - const BLOCK_SIZE max_unit_bsize = get_plane_block_size(BLOCK_64X64, pd); + const BLOCK_SIZE max_unit_bsize = + get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y); int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0]; int mu_blocks_high = @@ -785,144 +243,6 @@ void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, } } } -#if !CONFIG_LV_MAP - if (!dry_run) { - (*t)->token = EOSB_TOKEN; - (*t)++; - } -#endif - } - if (rate) *rate += arg.this_rate; -} -#endif // CONFIG_VAR_TX - -void av1_tokenize_sb(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, - RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate, - const int mi_row, const int mi_col) { - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const int ctx = av1_get_skip_context(xd); - const int skip_inc = - !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); - struct tokenize_b_args arg = { cpi, td, t, 0 }; - if (mbmi->skip) { - if (!dry_run) td->counts->skip[ctx][1] += skip_inc; - av1_reset_skip_context(xd, mi_row, mi_col, bsize); - return; - } - - if (!dry_run) { -#if CONFIG_COEF_INTERLEAVE - td->counts->skip[ctx][0] += skip_inc; - av1_foreach_transformed_block_interleave(xd, bsize, tokenize_b, &arg); -#else - int plane; - - td->counts->skip[ctx][0] += skip_inc; - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { -#if CONFIG_CB4X4 - if (!is_chroma_reference(mi_row, mi_col, bsize, - xd->plane[plane].subsampling_x, - xd->plane[plane].subsampling_y)) { -#if !CONFIG_PVQ - (*t)->token = EOSB_TOKEN; - (*t)++; -#endif - continue; - } -#else - (void)mi_row; - (void)mi_col; -#endif - av1_foreach_transformed_block_in_plane(xd, bsize, plane, tokenize_b, - &arg); -#if !CONFIG_PVQ - (*t)->token = EOSB_TOKEN; - (*t)++; -#endif // !CONFIG_PVQ - } -#endif - } -#if !CONFIG_PVQ - else if (dry_run == DRY_RUN_NORMAL) { - int plane; - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { -#if CONFIG_CB4X4 - if (!is_chroma_reference(mi_row, mi_col, bsize, - xd->plane[plane].subsampling_x, - xd->plane[plane].subsampling_y)) - continue; -#else - (void)mi_row; - (void)mi_col; -#endif - av1_foreach_transformed_block_in_plane(xd, bsize, plane, - set_entropy_context_b, &arg); - } - } else if (dry_run == DRY_RUN_COSTCOEFFS) { - int plane; - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { -#if CONFIG_CB4X4 - if (!is_chroma_reference(mi_row, mi_col, bsize, - xd->plane[plane].subsampling_x, - xd->plane[plane].subsampling_y)) - continue; -#else - (void)mi_row; - (void)mi_col; -#endif - av1_foreach_transformed_block_in_plane(xd, bsize, plane, cost_coeffs_b, - &arg); - } - } -#endif // !CONFIG_PVQ - - if (rate) *rate += arg.this_rate; -} - -#if CONFIG_SUPERTX -void av1_tokenize_sb_supertx(const AV1_COMP *cpi, ThreadData *td, - TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row, - int mi_col, BLOCK_SIZE bsize, int *rate) { - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &td->mb.e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - TOKENEXTRA *t_backup = *t; - const int ctx = av1_get_skip_context(xd); - const int skip_inc = - !segfeature_active(&cm->seg, mbmi->segment_id_supertx, SEG_LVL_SKIP); - struct tokenize_b_args arg = { cpi, td, t, 0 }; - if (mbmi->skip) { - if (!dry_run) td->counts->skip[ctx][1] += skip_inc; - av1_reset_skip_context(xd, mi_row, mi_col, bsize); - if (dry_run) *t = t_backup; - return; - } 
- - if (!dry_run) { - int plane; - td->counts->skip[ctx][0] += skip_inc; - - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { - av1_foreach_transformed_block_in_plane(xd, bsize, plane, tokenize_b, - &arg); - (*t)->token = EOSB_TOKEN; - (*t)++; - } - } else if (dry_run == DRY_RUN_NORMAL) { - int plane; - for (plane = 0; plane < MAX_MB_PLANE; ++plane) - av1_foreach_transformed_block_in_plane(xd, bsize, plane, - set_entropy_context_b, &arg); - *t = t_backup; - } else if (dry_run == DRY_RUN_COSTCOEFFS) { - int plane; - for (plane = 0; plane < MAX_MB_PLANE; ++plane) - av1_foreach_transformed_block_in_plane(xd, bsize, plane, cost_coeffs_b, - &arg); } if (rate) *rate += arg.this_rate; } -#endif // CONFIG_SUPERTX diff --git a/third_party/aom/av1/encoder/tokenize.h b/third_party/aom/av1/encoder/tokenize.h index 20000e502..de1cbe99c 100644 --- a/third_party/aom/av1/encoder/tokenize.h +++ b/third_party/aom/av1/encoder/tokenize.h @@ -13,51 +13,29 @@ #define AV1_ENCODER_TOKENIZE_H_ #include "av1/common/entropy.h" - #include "av1/encoder/block.h" -#include "av1/encoder/treewriter.h" +#include "aom_dsp/bitwriter.h" #ifdef __cplusplus extern "C" { #endif -#define EOSB_TOKEN 127 // Not signalled, encoder only - -#if CONFIG_HIGHBITDEPTH -typedef int32_t EXTRABIT; -#else -typedef int16_t EXTRABIT; -#endif - -typedef struct { - int16_t token; - EXTRABIT extra; -} TOKENVALUE; - typedef struct { - aom_cdf_prob (*tail_cdf)[CDF_SIZE(ENTROPY_TOKENS)]; - aom_cdf_prob (*head_cdf)[CDF_SIZE(ENTROPY_TOKENS)]; aom_cdf_prob *color_map_cdf; - int eob_val; - int first_val; - const aom_prob *context_tree; - EXTRABIT extra; + // TODO(yaowu): use packed enum type if appropriate uint8_t token; } TOKENEXTRA; -extern const aom_tree_index av1_coef_tree[]; -extern const aom_tree_index av1_coef_con_tree[]; - -int av1_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane); - struct AV1_COMP; struct ThreadData; +struct FRAME_COUNTS; struct tokenize_b_args { const struct AV1_COMP *cpi; struct ThreadData *td; TOKENEXTRA **tp; int this_rate; + uint8_t allow_update_cdf; }; typedef enum { @@ -69,78 +47,22 @@ typedef enum { // Note: in all the tokenize functions, rate, if non-NULL, is incremented // with the coefficient token cost only if dry_run == DRY_RUN_COSTCOEFFS; // otherwise rate is not incremented.
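// A minimal caller sketch of that contract (illustrative only, not part of
// this header; cpi, td, tok, mi_row, mi_col, and bsize are assumed to come
// from the surrounding superblock encode loop, and 0 is assumed to be the
// output-enabled RUN_TYPE value, whose enum body is elided from this hunk):
//
//   TOKENEXTRA *tok = tok_buffer;  // assumed per-tile token buffer
//   // Real pass: emits tokens and, with allow_update_cdf == 1, adapts the
//   // tile's CDFs as it goes; rate stays untouched per the note above.
//   av1_tokenize_sb_vartx(cpi, td, &tok, /*dry_run=*/0, mi_row, mi_col,
//                         bsize, /*rate=*/NULL, /*allow_update_cdf=*/1);
//   // Rehearsal pass: DRY_RUN_NORMAL only refreshes the entropy contexts,
//   // so the CDFs are typically left frozen.
//   av1_tokenize_sb_vartx(cpi, td, &tok, DRY_RUN_NORMAL, mi_row, mi_col,
//                         bsize, NULL, /*allow_update_cdf=*/0);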
-#if CONFIG_VAR_TX void av1_tokenize_sb_vartx(const struct AV1_COMP *cpi, struct ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row, - int mi_col, BLOCK_SIZE bsize, int *rate); -#endif + int mi_col, BLOCK_SIZE bsize, int *rate, + uint8_t allow_update_cdf); -int av1_cost_color_map(const MACROBLOCK *const x, int plane, int block, - BLOCK_SIZE bsize, TX_SIZE tx_size, COLOR_MAP_TYPE type); +int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize, + TX_SIZE tx_size, COLOR_MAP_TYPE type); -void av1_tokenize_color_map(const MACROBLOCK *const x, int plane, int block, +void av1_tokenize_color_map(const MACROBLOCK *const x, int plane, TOKENEXTRA **t, BLOCK_SIZE bsize, TX_SIZE tx_size, - COLOR_MAP_TYPE type); - -void av1_tokenize_sb(const struct AV1_COMP *cpi, struct ThreadData *td, - TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize, - int *rate, const int mi_row, const int mi_col); -#if CONFIG_SUPERTX -void av1_tokenize_sb_supertx(const struct AV1_COMP *cpi, struct ThreadData *td, - TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row, - int mi_col, BLOCK_SIZE bsize, int *rate); -#endif - -extern const int16_t *av1_dct_value_cost_ptr; -/* TODO: The Token field should be broken out into a separate char array to - * improve cache locality, since it's needed for costing when the rest of the - * fields are not. - */ -extern const TOKENVALUE *av1_dct_value_tokens_ptr; -extern const TOKENVALUE *av1_dct_cat_lt_10_value_tokens; -extern const int *av1_dct_cat_lt_10_value_cost; -extern const int16_t av1_cat6_low_cost[256]; -#if CONFIG_HIGHBITDEPTH -#define CAT6_HIGH_COST_ENTRIES 1024 -#else -#define CAT6_HIGH_COST_ENTRIES 64 -#endif -extern const int av1_cat6_high_cost[CAT6_HIGH_COST_ENTRIES]; -extern const uint8_t av1_cat6_skipped_bits_discount[8]; - -static INLINE void av1_get_token_extra(int v, int16_t *token, EXTRABIT *extra) { - if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) { - *token = CATEGORY6_TOKEN; - if (v >= CAT6_MIN_VAL) - *extra = 2 * v - 2 * CAT6_MIN_VAL; - else - *extra = -2 * v - 2 * CAT6_MIN_VAL + 1; - return; - } - *token = av1_dct_cat_lt_10_value_tokens[v].token; - *extra = av1_dct_cat_lt_10_value_tokens[v].extra; -} -static INLINE int16_t av1_get_token(int v) { - if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) return 10; - return av1_dct_cat_lt_10_value_tokens[v].token; -} - -static INLINE int av1_get_token_cost(int v, int16_t *token, int cat6_bits) { - if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) { - EXTRABIT extrabits; - *token = CATEGORY6_TOKEN; - extrabits = abs(v) - CAT6_MIN_VAL; - return av1_cat6_low_cost[extrabits & 0xff] + - av1_cat6_high_cost[extrabits >> 8] - - av1_cat6_skipped_bits_discount[18 - cat6_bits]; - } - *token = av1_dct_cat_lt_10_value_tokens[v].token; - return av1_dct_cat_lt_10_value_cost[v]; -} + COLOR_MAP_TYPE type, int allow_update_cdf, + struct FRAME_COUNTS *counts); static INLINE int av1_get_tx_eob(const struct segmentation *seg, int segment_id, TX_SIZE tx_size) { - const int eob_max = tx_size_2d[tx_size]; + const int eob_max = av1_get_max_eob(tx_size); return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max; } diff --git a/third_party/aom/av1/encoder/treewriter.c b/third_party/aom/av1/encoder/treewriter.c deleted file mode 100644 index 50be72413..000000000 --- a/third_party/aom/av1/encoder/treewriter.c +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "av1/encoder/treewriter.h" - -static void tree2tok(struct av1_token *tokens, const aom_tree_index *tree, - int i, int v, int l) { - v += v; - ++l; - - do { - const aom_tree_index j = tree[i++]; - if (j <= 0) { - tokens[-j].value = v; - tokens[-j].len = l; - } else { - tree2tok(tokens, tree, j, v, l); - } - } while (++v & 1); -} - -void av1_tokens_from_tree(struct av1_token *tokens, - const aom_tree_index *tree) { - tree2tok(tokens, tree, 0, 0, 0); -} - -static unsigned int convert_distribution(unsigned int i, aom_tree tree, - unsigned int branch_ct[][2], - const unsigned int num_events[]) { - unsigned int left, right; - - if (tree[i] <= 0) - left = num_events[-tree[i]]; - else - left = convert_distribution(tree[i], tree, branch_ct, num_events); - - if (tree[i + 1] <= 0) - right = num_events[-tree[i + 1]]; - else - right = convert_distribution(tree[i + 1], tree, branch_ct, num_events); - - branch_ct[i >> 1][0] = left; - branch_ct[i >> 1][1] = right; - return left + right; -} - -void av1_tree_probs_from_distribution(aom_tree tree, - unsigned int branch_ct[/* n-1 */][2], - const unsigned int num_events[/* n */]) { - convert_distribution(0, tree, branch_ct, num_events); -} diff --git a/third_party/aom/av1/encoder/treewriter.h b/third_party/aom/av1/encoder/treewriter.h deleted file mode 100644 index 9a4cb86cb..000000000 --- a/third_party/aom/av1/encoder/treewriter.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AV1_ENCODER_TREEWRITER_H_ -#define AV1_ENCODER_TREEWRITER_H_ - -#include "aom_dsp/bitwriter.h" - -#ifdef __cplusplus -extern "C" { -#endif - -void av1_tree_probs_from_distribution(aom_tree tree, - unsigned int branch_ct[/* n - 1 */][2], - const unsigned int num_events[/* n */]); - -struct av1_token { - int value; - int len; -}; - -void av1_tokens_from_tree(struct av1_token *, const aom_tree_index *); - -static INLINE void av1_write_token(aom_writer *w, const aom_tree_index *tree, - const aom_prob *probs, - const struct av1_token *token) { - aom_write_tree(w, tree, probs, token->value, token->len, 0); -} - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AV1_ENCODER_TREEWRITER_H_ diff --git a/third_party/aom/av1/encoder/tx_prune_model_weights.h b/third_party/aom/av1/encoder/tx_prune_model_weights.h new file mode 100644 index 000000000..69063b801 --- /dev/null +++ b/third_party/aom/av1/encoder/tx_prune_model_weights.h @@ -0,0 +1,2086 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ +#define AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/ml.h" + +// Tx type model for 4x4 block. +static const float av1_tx_type_nn_weights_4x4_layer0[32] = { + 0.72406f, -0.40019f, 0.51795f, -0.43881f, -0.49746f, -0.41780f, -0.39409f, + -0.16183f, -1.00135f, -0.41733f, -0.96534f, 0.93272f, 1.06229f, 0.04188f, + 0.60919f, 0.92405f, -0.39359f, 0.70570f, 0.75375f, 1.11966f, -1.86360f, + -0.35421f, 0.18743f, 0.13346f, -0.21262f, 0.07050f, 0.10533f, -0.47402f, + 1.33417f, 1.72899f, 1.17983f, 0.10552f, +}; + +static const float av1_tx_type_nn_bias_4x4_layer0[8] = { + 1.96273f, -0.69845f, -0.10999f, -1.11311f, + 1.35101f, 0.43842f, -0.29264f, -1.15376f, +}; + +static const float av1_tx_type_nn_weights_4x4_layer1[32] = { + 0.79770f, 0.08520f, 0.23298f, 0.05285f, 0.87506f, -0.90784f, -0.06197f, + -1.00580f, 0.68639f, -0.34881f, 0.15366f, -1.64658f, 0.80755f, -0.26293f, + 0.10253f, -0.23915f, 1.14696f, -0.10928f, -1.61377f, 0.00863f, 0.98599f, + -0.43872f, 0.61196f, -0.03787f, 1.01060f, 0.17643f, -0.00208f, -0.15738f, + 0.06517f, 0.72885f, 0.24387f, 1.28535f, +}; + +static const float av1_tx_type_nn_bias_4x4_layer1[4] = { + 1.23769f, + 1.40308f, + 0.09871f, + 1.82070f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x4 = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_4x4_layer0, + av1_tx_type_nn_weights_4x4_layer1, + }, + { + av1_tx_type_nn_bias_4x4_layer0, + av1_tx_type_nn_bias_4x4_layer1, + }, +}; +/******************************************************************************/ + +// Tx type model for 4x8 block. 
+static const float av1_tx_type_nn_weights_4x8_hor_layer0[32] = { + 0.68355f, -0.06887f, 0.68525f, -0.86048f, -0.35906f, -0.28597f, -0.21108f, + 0.12591f, -1.13025f, -0.65695f, -0.25658f, 0.39155f, 0.89011f, 0.19258f, + 0.28316f, 0.61172f, 0.52587f, 0.99182f, 0.75704f, 0.66788f, -1.61814f, + -1.23483f, -0.62868f, -0.11902f, 0.33295f, 0.64796f, 0.92345f, -0.71821f, + 0.07575f, 0.34687f, 0.20518f, -0.87850f, +}; + +static const float av1_tx_type_nn_bias_4x8_hor_layer0[8] = { + 1.14049f, -0.18583f, 1.92114f, -0.72057f, + 1.32715f, 0.96713f, 1.09877f, -0.64345f, +}; + +static const float av1_tx_type_nn_weights_4x8_hor_layer1[32] = { + 0.71978f, 0.06896f, 1.48617f, 0.97124f, -0.02487f, -0.95359f, 0.68983f, + -0.16313f, 0.51324f, -0.33770f, 0.45938f, -1.08238f, 0.72938f, 0.42300f, + 0.85691f, -0.03783f, 1.12617f, -0.04034f, 0.36923f, 0.25638f, 1.10167f, + 0.41633f, 0.72602f, -0.14797f, 0.66888f, 0.11437f, -0.99797f, -0.20725f, + 1.01163f, 2.06308f, 1.23331f, -0.15481f, +}; + +static const float av1_tx_type_nn_bias_4x8_hor_layer1[4] = { + 2.14443f, + 1.98356f, + 0.74616f, + 2.58795f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x8_hor = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_4x8_hor_layer0, + av1_tx_type_nn_weights_4x8_hor_layer1, + }, + { + av1_tx_type_nn_bias_4x8_hor_layer0, + av1_tx_type_nn_bias_4x8_hor_layer1, + }, +}; + +static const float av1_tx_type_nn_weights_4x8_ver_layer0[128] = { + 0.88859f, 1.02796f, 1.15509f, 0.61719f, 0.85804f, 1.17581f, 0.93524f, + 0.06546f, 0.08018f, -0.78562f, -0.36614f, 0.14149f, -0.30069f, -0.52647f, + -0.82789f, 0.60527f, -1.74026f, -0.20271f, 0.09875f, 0.03708f, 0.09430f, + -0.24043f, -0.38433f, 1.21014f, 1.42443f, 0.69586f, 1.07812f, 1.21748f, + 1.10989f, 0.93122f, 1.04127f, 0.39424f, 0.95592f, 0.12904f, 0.46330f, + 0.49722f, 0.46303f, 0.36979f, 0.60227f, 0.39345f, -2.01632f, -0.05706f, + 0.07766f, -0.01271f, -0.16577f, -0.21957f, -0.14800f, 0.24898f, 0.27662f, + 0.42028f, 0.44748f, 1.14585f, 1.38805f, 0.46182f, -0.22982f, -0.07324f, + 0.29886f, -0.46959f, -0.04228f, -0.01064f, 0.24260f, -0.32282f, -0.23804f, + 1.44466f, -0.42190f, -0.36385f, 0.39746f, 0.38557f, -0.09624f, -0.21540f, + 0.57385f, -0.72878f, -0.39677f, -0.00717f, 0.60499f, 1.33849f, 1.05337f, + 1.11947f, 0.38487f, 0.86534f, -0.33970f, 0.71140f, 0.20772f, 0.61132f, + 0.06181f, -0.20027f, 0.13736f, -0.72321f, 0.64586f, -0.56740f, -0.90912f, + -0.20452f, 0.15381f, -0.84346f, 0.19550f, 0.63164f, 1.35441f, 0.63218f, + 0.82883f, 0.38803f, -0.23874f, -0.02962f, 0.23846f, -0.06822f, -0.40159f, + -0.17850f, -0.69524f, 1.12299f, -0.08286f, -0.14150f, -0.28456f, -0.41519f, + -0.12792f, -0.55286f, 0.51655f, 0.06636f, 0.73759f, 0.70072f, 0.12616f, + 0.31282f, 0.17130f, -1.34233f, 0.37221f, 0.95838f, 0.16286f, 1.04301f, + 0.73600f, -0.11233f, +}; + +static const float av1_tx_type_nn_bias_4x8_ver_layer0[16] = { + -0.89131f, 0.09124f, -0.71678f, -1.19929f, 0.98963f, 0.16896f, + -0.44943f, -0.97532f, -0.13997f, 1.07136f, -0.46362f, -0.45253f, + -0.63015f, -0.20008f, 1.24048f, -0.21265f, +}; + +static const float av1_tx_type_nn_weights_4x8_ver_layer1[64] = { + -0.79795f, 0.45973f, -0.54188f, -1.05095f, 0.64404f, -0.56470f, -0.57018f, + 0.61644f, 0.50229f, 1.14006f, 0.13805f, -0.42058f, -0.07468f, 0.66203f, + 0.93180f, -0.59662f, -0.25152f, 0.00336f, 1.09769f, -1.11921f, 0.15151f, + 0.58750f, -0.42480f, -0.95908f, -0.10980f, 1.31715f, 0.06665f, -0.52371f, + 0.37228f, -0.12364f, 0.54876f, -0.32698f, 0.39863f, 
-0.97669f, -1.06351f, + 1.82755f, 1.02851f, 0.10322f, -0.08322f, 0.08891f, -0.05715f, 0.93503f, + 0.02096f, -0.39506f, -0.99330f, -0.09407f, 0.75108f, -0.30104f, 1.78314f, + -0.01786f, -0.17392f, 0.00461f, 0.41394f, 0.92566f, 1.11251f, -0.71380f, + -0.04907f, 0.12736f, 0.00208f, 0.94451f, -0.31783f, -0.19655f, 0.64619f, + 0.50359f, +}; + +static const float av1_tx_type_nn_bias_4x8_ver_layer1[4] = { + 0.39274f, + 1.27276f, + 0.30322f, + 2.55238f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x8_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_4x8_ver_layer0, + av1_tx_type_nn_weights_4x8_ver_layer1, + }, + { + av1_tx_type_nn_bias_4x8_ver_layer0, + av1_tx_type_nn_bias_4x8_ver_layer1, + }, +}; +/******************************************************************************/ + +// Tx type model for 8x4 block. +static const float av1_tx_type_nn_weights_8x4_hor_layer0[128] = { + 0.64828f, 0.61618f, 0.98975f, -0.14562f, 0.26957f, 1.80872f, 0.58299f, + -0.06917f, 0.00937f, -0.74073f, -0.66045f, -0.04576f, -0.39802f, -0.76960f, + -0.85166f, 0.88799f, -0.70694f, -0.34366f, -0.54906f, -0.39502f, -0.29465f, + -0.49650f, -0.32171f, 1.37181f, 1.30432f, 0.71843f, 1.01916f, 1.01582f, + 0.90999f, 0.86334f, 1.04603f, 0.40734f, 0.96187f, 0.53742f, 0.07510f, + 0.44167f, 0.02049f, -0.02874f, 0.97191f, 1.03647f, -2.62751f, -0.01390f, + -0.09282f, -0.02522f, -0.30849f, -0.19386f, -0.51356f, 0.52778f, 0.77191f, + 0.75416f, 0.69067f, 0.93561f, 1.35982f, 0.76193f, 0.57869f, 0.00251f, + -0.87244f, -0.26922f, -0.06682f, 0.07176f, 0.51142f, 0.58948f, 0.13914f, + 0.71165f, -0.40329f, -0.33201f, 0.35293f, 0.33437f, -0.01812f, -0.24765f, + 0.26810f, -0.77088f, 1.35707f, 0.22243f, 0.78402f, 0.66191f, 0.79890f, + 1.90669f, 0.73189f, 0.24222f, -0.34682f, 0.66990f, 0.19554f, 0.58414f, + 0.05060f, -0.21271f, 0.11656f, -0.74907f, 0.68837f, -0.39147f, -1.78263f, + -0.69918f, -0.06838f, -0.26927f, 0.38502f, 0.08305f, 1.29848f, 0.67328f, + 0.67269f, 0.65805f, -0.47778f, -1.02617f, 0.16523f, 0.12223f, -0.35294f, + -0.15866f, -0.56224f, 1.25895f, -0.21422f, -0.33518f, -0.33519f, -0.37414f, + 0.55122f, 0.14806f, 0.44312f, -0.07865f, 0.75295f, 0.10766f, 0.59922f, + 0.48837f, -0.19099f, -2.07991f, 0.35755f, 0.87813f, 0.07559f, 1.00724f, + 0.25223f, -0.06761f, +}; + +static const float av1_tx_type_nn_bias_8x4_hor_layer0[16] = { + -0.54227f, 0.08599f, -0.77447f, -1.10920f, 0.89298f, 0.05454f, + -0.73681f, 0.21048f, -0.41041f, 1.25690f, -0.60918f, 0.14661f, + -0.65392f, -0.25881f, 1.67995f, -0.03550f, +}; + +static const float av1_tx_type_nn_weights_8x4_hor_layer1[64] = { + -0.22312f, 0.73552f, 0.48399f, -0.66996f, 0.36527f, -0.42228f, -1.10793f, + 0.31167f, 0.16177f, 1.69315f, -0.06287f, -0.35804f, -0.24889f, 0.80824f, + 1.08952f, -0.62838f, 0.30066f, -0.19043f, -0.00518f, -1.31005f, 0.65797f, + 1.07714f, -0.24253f, 0.49779f, 0.05848f, 1.08914f, 0.08015f, -0.38853f, + 0.35108f, -0.11026f, 0.64528f, -0.37615f, 0.39995f, -0.58117f, -1.29627f, + 1.74169f, 0.75558f, -0.04910f, 0.35020f, 0.04556f, 0.12634f, 1.27223f, + 0.02608f, -0.19687f, -0.78649f, -0.22746f, 1.02589f, -0.28411f, 1.42443f, + -0.42115f, -0.21153f, -0.01733f, 0.62001f, 0.87167f, 1.66008f, -0.39179f, + -0.06293f, 0.27012f, 0.16871f, 0.64597f, 0.67358f, -0.20053f, 0.95830f, + 0.44232f, +}; + +static const float av1_tx_type_nn_bias_8x4_hor_layer1[4] = { + 0.14889f, + 1.74197f, + 0.53696f, + 2.87574f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x4_hor = { + 8, // num_inputs + 4, 
// num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_8x4_hor_layer0, + av1_tx_type_nn_weights_8x4_hor_layer1, + }, + { + av1_tx_type_nn_bias_8x4_hor_layer0, + av1_tx_type_nn_bias_8x4_hor_layer1, + }, +}; + +static const float av1_tx_type_nn_weights_8x4_ver_layer0[32] = { + 0.81919f, 0.15527f, 0.60055f, -0.54617f, -0.35510f, -0.28223f, -0.20478f, + 0.15001f, -1.84806f, -0.30274f, -0.00865f, 0.33939f, 1.11970f, 0.44630f, + 0.32074f, 0.39637f, 0.08149f, 1.28070f, 0.86703f, 0.76503f, -1.83991f, + -1.13575f, -0.68605f, -0.23690f, 0.07099f, 0.64960f, 0.82543f, -0.72028f, + 0.08220f, 0.34338f, 0.20245f, -0.88920f, +}; + +static const float av1_tx_type_nn_bias_8x4_ver_layer0[8] = { + 1.14995f, -0.16021f, 2.38325f, -0.65179f, + 1.09624f, 1.07662f, 0.63837f, -0.64847f, +}; + +static const float av1_tx_type_nn_weights_8x4_ver_layer1[32] = { + 0.10278f, 0.06819f, 1.73885f, 1.29889f, -0.18482f, -1.06132f, 0.67003f, + -0.23280f, 0.50181f, -0.33890f, 0.43524f, -1.03147f, 1.09640f, 0.66332f, + 0.47652f, -0.02251f, 0.94245f, -0.03861f, 0.84776f, 0.28377f, 0.92044f, + 0.23572f, 0.52082f, -0.16266f, 0.45290f, 0.11342f, -0.50310f, -0.92633f, + 1.46345f, 1.84714f, 1.06804f, -0.13610f, +}; + +static const float av1_tx_type_nn_bias_8x4_ver_layer1[4] = { + 2.41028f, + 1.95675f, + 0.82387f, + 2.41923f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x4_ver = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_8x4_ver_layer0, + av1_tx_type_nn_weights_8x4_ver_layer1, + }, + { + av1_tx_type_nn_bias_8x4_ver_layer0, + av1_tx_type_nn_bias_8x4_ver_layer1, + }, +}; +/******************************************************************************/ + +// Tx type model for 8x8 block. 
+static const float av1_tx_type_nn_weights_8x8_layer0[128] = { + 0.98214f, 1.05643f, 0.91173f, 0.24165f, 0.39961f, 0.25736f, 0.68593f, + 0.10553f, 0.13353f, -0.49687f, -1.66413f, 1.16584f, 2.25147f, -0.72247f, + -2.65486f, -0.03628f, -1.47746f, -1.07644f, -1.25551f, -0.91260f, -1.26199f, + -1.06022f, -1.42138f, 1.10500f, 2.96552f, -0.40638f, 0.02258f, -0.23137f, + 0.34922f, -0.01454f, 0.41251f, 0.35944f, -1.56742f, 0.01406f, 0.88114f, + 1.42462f, 0.87243f, 0.02439f, 0.07035f, 0.34303f, -3.16843f, 0.25798f, + 0.07494f, 0.38926f, -0.12267f, 0.09049f, -0.36711f, 0.01551f, 1.41269f, + 1.33505f, 1.43627f, 1.41909f, 1.44605f, 1.43008f, 1.36721f, 0.19443f, + -0.08606f, 0.17285f, 0.63692f, 0.92092f, 0.61007f, 0.87100f, -0.33631f, + 1.98025f, -0.40686f, -0.33808f, 0.34919f, 0.33817f, -0.01807f, -0.25259f, + 0.26442f, -0.76979f, 1.07788f, -1.38747f, 1.34315f, 2.79947f, 2.02838f, + -0.25062f, 0.00174f, 1.25888f, 0.17344f, 0.20897f, 1.28765f, 1.95749f, + 1.62351f, 1.04556f, 0.43858f, 0.12463f, 1.66399f, 0.03971f, 0.36614f, + 0.56932f, 0.15982f, 0.11587f, 0.21402f, 1.89386f, -0.91267f, -0.79781f, + 1.79155f, 0.60147f, -0.90118f, -4.32718f, -0.58154f, -0.02181f, -0.40734f, + -0.11409f, -0.79470f, 0.69697f, -0.16588f, -0.16090f, -0.21236f, -0.52776f, + -0.64455f, 0.09173f, 0.80766f, 0.76097f, 0.20295f, -0.93467f, -0.43509f, + 0.59659f, 0.07788f, -3.79459f, 0.16268f, 0.47343f, 0.05106f, -0.24880f, + 1.18941f, 0.10346f, +}; + +static const float av1_tx_type_nn_bias_8x8_layer0[16] = { + 0.75780f, 0.25628f, 0.19911f, -0.41384f, 1.33909f, 0.31498f, + -1.37171f, -1.09561f, -0.44056f, 0.49001f, -0.65804f, -1.96031f, + 0.64806f, -0.52520f, 1.38838f, 0.15519f, +}; + +static const float av1_tx_type_nn_weights_8x8_layer1[64] = { + -0.63856f, -2.02670f, -0.92947f, 0.00216f, 1.47710f, -2.01099f, -2.11289f, + -0.92288f, 0.19296f, 1.37866f, -0.85975f, -0.78624f, -2.10392f, 0.13976f, + 1.06968f, -2.04120f, 0.57991f, -1.84941f, -0.81512f, -2.08254f, -0.47334f, + 0.12256f, -1.39594f, -1.02829f, 0.06134f, 2.25646f, -1.25196f, -2.65317f, + -1.94473f, 0.10989f, 0.55446f, -1.76557f, 0.33455f, -1.85556f, -3.01878f, + -0.25100f, 1.65520f, -1.61409f, 1.16336f, -1.15560f, 0.13631f, 1.50733f, + -1.07538f, -0.91200f, -1.93132f, 0.09271f, 0.24425f, -1.80655f, -0.01138f, + -1.36421f, -0.62970f, -0.84766f, -0.34714f, -0.50531f, 1.91005f, -1.60316f, + -0.02495f, 1.04938f, 0.28411f, -0.79809f, -1.48232f, 0.00766f, 0.94016f, + -1.11974f, +}; + +static const float av1_tx_type_nn_bias_8x8_layer1[4] = { + 0.53574f, + 1.57736f, + -0.13698f, + 2.64613f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x8 = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_8x8_layer0, + av1_tx_type_nn_weights_8x8_layer1, + }, + { + av1_tx_type_nn_bias_8x8_layer0, + av1_tx_type_nn_bias_8x8_layer1, + }, +}; +/******************************************************************************/ + +// Tx type model for 8x16 block. 
+static const float av1_tx_type_nn_weights_8x16_hor_layer0[128] = { + 1.36274f, 1.37313f, 1.26859f, 1.26459f, 1.37979f, 1.47217f, 1.29710f, + 0.15765f, 0.31552f, -0.05727f, 0.25562f, 0.47925f, -0.32913f, -0.55757f, + -0.98010f, 0.08568f, -0.62754f, 0.12834f, -0.03717f, 0.06286f, 0.26159f, + 0.26023f, -0.62605f, 1.34500f, 1.47720f, 0.47937f, 0.84793f, 0.87866f, + 0.81260f, 0.74761f, 0.84217f, 0.53321f, -0.78232f, 0.35321f, 0.41240f, + 0.45002f, 0.88973f, 0.51055f, 0.91115f, -0.45512f, -2.37418f, -0.25205f, + 0.05893f, -0.15685f, -0.25156f, -0.17104f, -0.12230f, 0.17802f, 0.18796f, + -0.05797f, 0.26484f, 1.23515f, 1.70393f, 0.46022f, -0.14354f, 0.08501f, + -0.84625f, -0.42578f, -0.29345f, -0.51797f, -0.56515f, -0.47465f, 0.23970f, + 1.59912f, -0.40332f, -0.33209f, 0.37274f, 0.36831f, -0.00248f, -0.24295f, + 0.29539f, -0.76136f, -0.22531f, 0.12371f, 0.37889f, 1.02639f, 1.73330f, + 1.09686f, 1.04111f, 0.69006f, -1.27157f, 0.94013f, 0.61621f, 0.62274f, + 0.48759f, 0.55672f, 0.62597f, -0.38846f, 1.72124f, 0.08214f, -0.06650f, + 0.32617f, 0.10958f, 0.24650f, 0.10740f, 1.16861f, 0.50701f, 0.45383f, + 0.90016f, -0.00695f, -0.11986f, -0.07834f, 0.20346f, 0.25863f, -0.40889f, + -0.11344f, -0.79108f, 0.76259f, -0.14562f, -0.15459f, -0.20954f, -0.51306f, + 0.02743f, -0.82456f, -0.00861f, -0.27274f, 0.28762f, 0.07282f, 0.26410f, + 0.53413f, -0.22208f, -0.85031f, -1.39129f, -0.74519f, 0.09771f, 0.80313f, + 1.07698f, 0.02531f, +}; + +static const float av1_tx_type_nn_bias_8x16_hor_layer0[16] = { + -1.30434f, -1.19259f, -0.43467f, -0.85386f, 0.96584f, 0.29276f, + -0.41990f, -0.96924f, -0.30933f, 0.95264f, -0.25330f, -1.19584f, + 1.46564f, -0.42959f, 1.55720f, 0.18479f, +}; + +static const float av1_tx_type_nn_weights_8x16_hor_layer1[64] = { + -1.72959f, -0.21670f, 0.10616f, -0.02006f, 0.15084f, -0.85303f, -0.27535f, + 0.58704f, 0.23683f, 1.19743f, 0.77971f, 0.49874f, 0.19508f, 0.19641f, + 1.47895f, -0.52173f, -0.56746f, -0.50761f, 0.15864f, -0.95168f, 0.48103f, + 0.91904f, -0.11700f, 0.62863f, 0.06526f, 1.63803f, -0.72325f, -1.80449f, + 0.66373f, 0.12831f, 0.27139f, -0.26346f, 1.50852f, 0.25079f, -0.54255f, + 1.78815f, 1.39691f, -0.44989f, -0.18511f, -1.52903f, 0.13983f, 1.06906f, + -0.30184f, 0.37566f, 0.46209f, 0.10440f, 0.64695f, -0.34002f, 1.96990f, + 0.21189f, -0.91248f, -0.11263f, 0.26708f, 1.27405f, 1.89776f, 0.02081f, + -0.06977f, -0.02584f, 0.47733f, 0.27117f, 1.33315f, -0.09175f, 0.48747f, + 1.16772f, +}; + +static const float av1_tx_type_nn_bias_8x16_hor_layer1[4] = { + 1.25783f, + 1.19452f, + 0.69964f, + 2.41982f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x16_hor = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_8x16_hor_layer0, + av1_tx_type_nn_weights_8x16_hor_layer1, + }, + { + av1_tx_type_nn_bias_8x16_hor_layer0, + av1_tx_type_nn_bias_8x16_hor_layer1, + }, +}; + +static const float av1_tx_type_nn_weights_8x16_ver_layer0[128] = { + 0.90888f, 0.86305f, 0.81674f, 0.75352f, 1.07834f, 0.99048f, 0.96355f, + 0.13836f, -0.51334f, 0.19906f, 1.84608f, 0.67828f, 0.45876f, 0.08325f, + 0.28190f, -0.01958f, -1.96553f, 0.27837f, -0.05929f, 0.13491f, 0.21036f, + 0.05797f, -0.01373f, 0.73765f, 1.39603f, -0.53767f, 0.10362f, 0.03420f, + 0.41909f, 0.09510f, 0.32284f, 0.83860f, 0.13954f, 0.48434f, 1.47762f, + 0.45891f, 0.23613f, 0.13013f, 0.82097f, -0.03251f, -1.89757f, 0.21589f, + -0.10370f, 0.02530f, -0.25659f, 0.01466f, -0.23661f, 0.22783f, 0.92100f, + 1.02915f, 1.20358f, 1.17251f, 0.97749f, 1.04696f, 0.91333f, 
0.54576f, + -0.52792f, 0.02217f, 0.25652f, 0.31405f, -0.18398f, 0.04572f, -0.81359f, + 1.82883f, -0.40047f, -0.33056f, 0.35255f, 0.34448f, -0.00339f, -0.23857f, + 0.28925f, -0.77175f, -0.24325f, -0.21420f, 1.11451f, 1.39553f, 0.51573f, + 0.05476f, 1.13791f, 0.94959f, -0.35710f, 0.67467f, 0.16722f, 0.61213f, + 0.07683f, -0.20613f, 0.13440f, -0.72131f, -0.15418f, -0.17688f, -0.16510f, + -0.19226f, 0.09270f, -2.43559f, -0.12669f, 0.05074f, 0.30414f, 0.00927f, + 0.60630f, 0.00801f, -1.07310f, -0.06227f, 2.10607f, 0.02382f, -0.39891f, + -0.09149f, -0.78596f, 0.83966f, -0.14802f, -0.14083f, -0.20831f, -0.55136f, + 0.08566f, -0.00647f, 0.07044f, 0.53408f, 0.85720f, -0.07393f, 0.24476f, + 0.43767f, 0.30519f, -1.89430f, 0.23252f, 1.63790f, 0.17316f, -0.03903f, + 0.25269f, 0.01562f, +}; + +static const float av1_tx_type_nn_bias_8x16_ver_layer0[16] = { + -0.83370f, -0.20704f, -0.60437f, -0.81664f, 1.16998f, 0.16745f, + -1.34680f, -1.07083f, -0.34649f, 0.65598f, -0.56278f, 0.22660f, + -0.25956f, -0.29608f, 1.24359f, -0.09167f, +}; + +static const float av1_tx_type_nn_weights_8x16_ver_layer1[64] = { + -0.71147f, -0.63964f, -0.69220f, 0.22326f, 0.67191f, -0.58894f, -0.98464f, + 0.23583f, 0.22824f, 1.39838f, 0.09920f, -0.59411f, -0.67101f, 0.19088f, + 0.83025f, -0.66991f, -0.42889f, -0.49969f, 1.39532f, -1.02000f, 0.62101f, + 0.57175f, -0.83226f, 0.01551f, 0.05604f, 1.23028f, 0.02030f, -0.55995f, + -0.42349f, 0.15375f, 0.52132f, -0.52421f, 0.89586f, -0.73778f, -0.10911f, + 0.22447f, 1.16858f, -0.48169f, 1.73890f, -0.69860f, 0.12504f, 1.10492f, + 0.04391f, -0.85670f, -0.49257f, 0.09616f, 0.76518f, -0.44854f, 1.50938f, + 0.62246f, -0.40366f, -0.11182f, -0.01680f, 0.59724f, 1.32170f, -1.09061f, + -0.04278f, -0.02449f, 0.25024f, 1.26239f, 0.42345f, -0.10031f, 0.80871f, + 0.44198f, +}; + +static const float av1_tx_type_nn_bias_8x16_ver_layer1[4] = { + 0.68329f, + 1.33555f, + 0.25943f, + 3.23439f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x16_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_8x16_ver_layer0, + av1_tx_type_nn_weights_8x16_ver_layer1, + }, + { + av1_tx_type_nn_bias_8x16_ver_layer0, + av1_tx_type_nn_bias_8x16_ver_layer1, + }, +}; +/******************************************************************************/ + +// Tx type model for 16x8 block. 
+static const float av1_tx_type_nn_weights_16x8_hor_layer0[128] = { + 0.89821f, 0.90804f, 1.13052f, 0.74855f, 1.02053f, 0.91260f, 0.97102f, + 0.16808f, -0.19982f, -0.33296f, -0.22490f, -0.22481f, -0.09332f, -2.44338f, + -0.12236f, -0.03158f, -1.43561f, 0.07794f, 0.16586f, 0.09731f, 0.12967f, + 0.09725f, -0.16826f, 1.26640f, 0.88004f, 0.27312f, -0.07993f, 0.33640f, + 0.11732f, 0.33384f, 0.97066f, -0.61744f, -0.48545f, 0.44622f, 0.73744f, + 0.32262f, -0.05713f, 0.42280f, 1.10378f, 0.18540f, -2.07906f, 0.11443f, + 0.37877f, 0.24136f, -0.12524f, -0.12434f, 0.02116f, 0.11716f, 1.28267f, + 1.01508f, 1.26184f, 1.22545f, 1.29582f, 1.18855f, 1.27564f, 0.42001f, + -0.41481f, 0.06725f, -0.13133f, -0.24801f, 0.16515f, 0.16228f, 0.35197f, + 0.53610f, -0.39805f, -0.32584f, 0.40096f, 0.38621f, -0.00030f, -0.23434f, + 0.29149f, -0.76542f, 0.04996f, -0.30036f, 1.48687f, 0.90852f, -0.03083f, + -0.15953f, 1.19259f, 0.87690f, -1.08977f, 0.78757f, 0.81149f, 0.54089f, + 0.35400f, 0.37919f, 0.84997f, -0.20449f, 0.39601f, -0.37596f, 0.64748f, + 0.26021f, 0.37354f, 0.23593f, 0.16335f, 1.70681f, 0.31800f, -0.00964f, + 0.82687f, -0.78372f, -1.47438f, 0.32410f, 1.37436f, 0.07476f, -0.40574f, + -0.10353f, -0.79300f, 0.74381f, -0.15601f, -0.14380f, -0.20961f, -0.52697f, + 0.04669f, -0.00870f, 0.05624f, -0.09036f, 0.25701f, 0.30336f, 0.24199f, + 0.45579f, 0.66330f, -1.81834f, 0.74965f, 1.22747f, 0.25072f, 0.25100f, + 0.43289f, -0.00362f, +}; + +static const float av1_tx_type_nn_bias_16x8_hor_layer0[16] = { + -0.87643f, 0.36754f, -0.86409f, 1.37761f, 1.22688f, 0.09074f, + -1.47139f, -1.06100f, -0.24087f, 1.10382f, -0.32837f, -1.39592f, + -0.14741f, -0.43954f, 1.72137f, -0.21704f, +}; + +static const float av1_tx_type_nn_weights_16x8_hor_layer1[64] = { + -0.81860f, -0.80745f, -0.43612f, 0.58656f, 0.37455f, -0.56519f, -1.71536f, + 0.23278f, 0.23951f, 1.09610f, 0.49986f, 0.43375f, -0.53182f, 0.17376f, + 1.05626f, -0.61743f, -0.71777f, -0.66943f, 1.40091f, 0.34426f, 1.14295f, + 0.45571f, -0.52504f, -0.00303f, 0.06044f, 0.66119f, -0.60340f, -1.14344f, + -0.28045f, 0.12742f, 0.61484f, -0.41016f, 1.36102f, -0.86969f, -0.52728f, + 1.01725f, 0.67083f, -0.10138f, 1.36406f, 0.34066f, 0.12498f, 0.86595f, + -0.39636f, -0.27888f, -0.40244f, 0.09847f, 0.81178f, -0.45313f, 1.39127f, + 0.99865f, -0.57908f, 0.55072f, 0.49638f, 1.11524f, 1.85504f, -0.28316f, + -0.05195f, -0.23284f, 0.26461f, -1.28120f, 0.60707f, -0.06110f, 0.74085f, + 0.63304f, +}; + +static const float av1_tx_type_nn_bias_16x8_hor_layer1[4] = { + 0.71765f, + 1.40400f, + 0.32221f, + 3.07234f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x8_hor = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_16x8_hor_layer0, + av1_tx_type_nn_weights_16x8_hor_layer1, + }, + { + av1_tx_type_nn_bias_16x8_hor_layer0, + av1_tx_type_nn_bias_16x8_hor_layer1, + }, +}; + +static const float av1_tx_type_nn_weights_16x8_ver_layer0[128] = { + 1.20497f, 1.23691f, 1.23738f, 1.07773f, 1.15264f, 1.31959f, 1.15365f, + 0.17179f, 0.68612f, 0.55636f, 0.57145f, 0.67022f, 0.19636f, -1.27420f, + -1.36428f, -0.16706f, -1.20934f, -0.87794f, -0.97146f, -0.74722f, -1.14493f, + -1.02689f, -0.88153f, 0.83857f, 1.53355f, 0.13601f, 0.35451f, 0.53750f, + 0.62381f, 0.32438f, 0.59405f, 0.33090f, -1.52948f, -0.46094f, 0.42634f, + 0.48763f, 0.30707f, 0.52553f, 0.71427f, -0.31287f, -2.37106f, -0.18756f, + 0.16561f, -0.00431f, -0.13747f, -0.09336f, -0.16511f, 0.13454f, 0.45010f, + -0.00317f, -0.06403f, 0.95442f, 1.59636f, 0.30602f, 
-0.05515f, 0.05467f, + -0.21758f, -0.19192f, -0.17935f, -0.00545f, 0.35409f, 0.26141f, -0.32174f, + 1.78129f, -0.40161f, -0.33158f, 0.38084f, 0.38081f, 0.01053f, -0.23567f, + 0.29239f, -0.76159f, -0.19373f, 0.13649f, 0.66949f, 1.19733f, 1.92557f, + 1.16691f, 0.94955f, 0.62324f, -0.85434f, -0.07699f, 0.87683f, 0.95911f, + 0.86106f, 0.57959f, 0.40146f, -0.35851f, 1.55427f, 0.15349f, -0.01582f, + 0.32517f, 0.03784f, 0.15916f, 0.09024f, 1.43187f, 0.56160f, 0.11521f, + 0.52476f, -0.26107f, -0.38167f, -0.31596f, 0.31304f, -0.65366f, -0.40680f, + -0.11082f, -0.78585f, 0.77906f, -0.13322f, -0.13747f, -0.21001f, -0.53204f, + -0.06752f, -0.84741f, -0.53442f, -0.16284f, 0.54027f, 0.13586f, -0.42001f, + 0.85388f, 0.08300f, -0.89325f, -1.73681f, -0.70473f, 0.23151f, 0.69549f, + 0.72124f, 0.12769f, +}; + +static const float av1_tx_type_nn_bias_16x8_ver_layer0[16] = { + -1.15644f, -0.31062f, 0.20697f, -0.60304f, -1.19498f, 0.21451f, + -0.42825f, -0.71800f, -0.25816f, 1.47408f, -0.24423f, -1.45773f, + -0.55834f, -0.36938f, 1.56759f, 0.07238f, +}; + +static const float av1_tx_type_nn_weights_16x8_ver_layer1[64] = { + -1.45227f, -0.67141f, 0.75237f, 0.32681f, -0.70528f, -0.76730f, -0.49777f, + 0.02418f, 0.25096f, 1.14840f, 0.23548f, 0.48755f, 0.33164f, 0.21050f, + 1.41651f, -0.28888f, -0.76668f, 0.04439f, 0.67538f, -1.06438f, 0.68128f, + 0.95824f, 0.08530f, -0.03635f, 0.06820f, 1.38621f, -0.50424f, -1.72992f, + -0.20949f, 0.13400f, 0.93366f, -0.05324f, 1.41593f, -0.75119f, -1.80912f, + 1.05440f, 0.62580f, -0.30867f, -0.07025f, -0.34654f, 0.13621f, 1.74426f, + -0.22417f, 0.47031f, -0.08142f, 0.10151f, 0.42498f, 0.06635f, 1.50623f, + 1.04130f, 0.85107f, 0.23382f, 0.69800f, 1.10856f, 1.18767f, -0.69395f, + -0.07985f, 0.50412f, 0.46019f, 0.49214f, 0.44219f, -0.09502f, 0.75745f, + 0.99208f, +}; + +static const float av1_tx_type_nn_bias_16x8_ver_layer1[4] = { + 0.68774f, + 0.88572f, + 0.77462f, + 3.05667f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x8_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_16x8_ver_layer0, + av1_tx_type_nn_weights_16x8_ver_layer1, + }, + { + av1_tx_type_nn_bias_16x8_ver_layer0, + av1_tx_type_nn_bias_16x8_ver_layer1, + }, +}; +/******************************************************************************/ + +// Tx type model for 16x16 block. 
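Each tx-type net emits four scores per direction — presumably one for each 1-D transform kernel (DCT, ADST, flipped ADST, identity). Below is a hedged sketch of how a horizontal and a vertical score vector could be combined into a ranking over the 16 2-D transform types; the plain outer product is this sketch's choice, and the encoder's actual pairing table and pruning thresholds live in the RD search code, not here.

// Combine per-direction kernel scores into one score per 2-D tx type.
static void rank_tx_types_sketch(const float hscores[4],
                                 const float vscores[4],
                                 float score_2d[16]) {
  for (int v = 0; v < 4; ++v)
    for (int h = 0; h < 4; ++h)
      score_2d[v * 4 + h] = hscores[h] * vscores[v];
}

Transform types whose combined score falls below a tuned cut-off can then be skipped during the rate-distortion search.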
+static const float av1_tx_type_nn_weights_16x16_layer0[128] = { + 1.26592f, 1.36313f, 1.30956f, 1.29926f, 1.48816f, 1.68851f, 1.32000f, + 0.13321f, -0.22477f, -0.88906f, -0.19622f, 1.69605f, 1.22180f, -1.57771f, + -1.15765f, 0.05710f, -1.13355f, -0.85486f, -0.99971f, -0.91571f, -1.06031f, + -0.77952f, -1.15723f, 1.17809f, 1.35602f, -0.05243f, -0.37596f, 0.26108f, + 0.17611f, -0.10323f, 0.77279f, -0.48911f, -0.79308f, 0.55112f, 0.43918f, + 0.27872f, 0.28714f, 0.45830f, 1.05689f, 0.03705f, -2.49975f, -0.01940f, + 0.05709f, 0.07942f, -0.13290f, -0.10359f, 0.00143f, 0.37303f, 0.96470f, + 0.53293f, 1.14459f, 0.89185f, 0.43378f, 0.47764f, 0.90924f, 0.15279f, + -0.15361f, 0.02949f, 0.42240f, 0.68143f, 0.89588f, 0.73754f, 0.10974f, + 1.57755f, -0.39870f, -0.32914f, 0.35638f, 0.34991f, -0.00003f, -0.23373f, + 0.29630f, -0.76699f, -0.01356f, 0.04234f, 0.84253f, 1.92078f, 0.93160f, + 0.71993f, 0.71604f, 0.76455f, -1.59782f, 0.32332f, 1.11628f, 0.33062f, + -0.03728f, -0.05710f, 0.80447f, -0.14719f, 1.34658f, -0.05718f, 0.64015f, + 0.21926f, 0.41653f, 0.12720f, 0.54092f, 1.39411f, 1.81819f, -0.24513f, + 0.00955f, 0.38011f, -0.57787f, -0.41759f, 0.68834f, -0.31783f, -0.40607f, + -0.10107f, -0.79374f, 0.75599f, -0.16282f, -0.14490f, -0.20783f, -0.55019f, + -0.13793f, -0.22293f, 0.18305f, 0.12445f, 0.56830f, 0.24567f, 0.09278f, + 0.70803f, 0.35803f, -1.52676f, -0.89624f, 0.77665f, 0.19877f, 0.77175f, + 0.50355f, 0.08592f, +}; + +static const float av1_tx_type_nn_bias_16x16_layer0[16] = { + -1.31834f, 0.14346f, -0.10062f, 0.84489f, 0.95617f, -0.06720f, + -0.68502f, -0.91442f, -0.31932f, 0.25276f, -0.15138f, -1.57661f, + -0.14062f, -0.42120f, 0.94573f, -0.09287f, +}; + +static const float av1_tx_type_nn_weights_16x16_layer1[64] = { + -1.80333f, -1.06353f, 0.55139f, 0.74644f, 0.13747f, -0.93018f, -0.10286f, + 0.67133f, 0.24460f, 1.44583f, 0.02173f, 0.26037f, -0.73687f, 0.19566f, + 0.61846f, -0.58601f, -1.03196f, -0.74415f, 0.30041f, -0.41967f, 1.08740f, + 0.96224f, -0.59139f, 0.03813f, 0.05403f, 1.33427f, -0.54375f, -1.92181f, + 0.54704f, 0.13608f, 0.22151f, -0.38076f, 1.18390f, -0.77508f, -1.84283f, + 1.00894f, 0.62318f, -0.15296f, 1.27600f, 0.22822f, 0.12751f, 0.93910f, + -0.28502f, 0.53912f, -0.96889f, 0.10182f, 0.81508f, -0.43028f, 2.67386f, + 0.52204f, 0.49820f, -0.41711f, 1.05038f, 1.12192f, 0.74349f, -0.75417f, + -0.03718f, -0.35769f, 0.89651f, 0.63236f, 0.54215f, -0.07894f, 0.48274f, + 1.08829f, +}; + +static const float av1_tx_type_nn_bias_16x16_layer1[4] = { + 0.81986f, + 1.26865f, + 0.11118f, + 2.48404f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x16 = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_16x16_layer0, + av1_tx_type_nn_weights_16x16_layer1, + }, + { + av1_tx_type_nn_bias_16x16_layer0, + av1_tx_type_nn_bias_16x16_layer1, + }, +}; +/******************************************************************************/ + +// Tx type model for 16x32 block. 
+static const float av1_tx_type_nn_weights_16x32_hor_layer0[128] = { + 0.89821f, 0.90804f, 1.13052f, 0.74855f, 1.02053f, 0.91260f, 0.97102f, + 0.16808f, -0.19982f, -0.33296f, -0.22490f, -0.22481f, -0.09332f, -2.44338f, + -0.12236f, -0.03158f, -1.43561f, 0.07794f, 0.16586f, 0.09731f, 0.12967f, + 0.09725f, -0.16826f, 1.26640f, 0.88004f, 0.27312f, -0.07993f, 0.33640f, + 0.11732f, 0.33384f, 0.97066f, -0.61744f, -0.48545f, 0.44622f, 0.73744f, + 0.32262f, -0.05713f, 0.42280f, 1.10378f, 0.18540f, -2.07906f, 0.11443f, + 0.37877f, 0.24136f, -0.12524f, -0.12434f, 0.02116f, 0.11716f, 1.28267f, + 1.01508f, 1.26184f, 1.22545f, 1.29582f, 1.18855f, 1.27564f, 0.42001f, + -0.41481f, 0.06725f, -0.13133f, -0.24801f, 0.16515f, 0.16228f, 0.35197f, + 0.53610f, -0.39805f, -0.32584f, 0.40096f, 0.38621f, -0.00030f, -0.23434f, + 0.29149f, -0.76542f, 0.04996f, -0.30036f, 1.48687f, 0.90852f, -0.03083f, + -0.15953f, 1.19259f, 0.87690f, -1.08977f, 0.78757f, 0.81149f, 0.54089f, + 0.35400f, 0.37919f, 0.84997f, -0.20449f, 0.39601f, -0.37596f, 0.64748f, + 0.26021f, 0.37354f, 0.23593f, 0.16335f, 1.70681f, 0.31800f, -0.00964f, + 0.82687f, -0.78372f, -1.47438f, 0.32410f, 1.37436f, 0.07476f, -0.40574f, + -0.10353f, -0.79300f, 0.74381f, -0.15601f, -0.14380f, -0.20961f, -0.52697f, + 0.04669f, -0.00870f, 0.05624f, -0.09036f, 0.25701f, 0.30336f, 0.24199f, + 0.45579f, 0.66330f, -1.81834f, 0.74965f, 1.22747f, 0.25072f, 0.25100f, + 0.43289f, -0.00362f, +}; + +static const float av1_tx_type_nn_bias_16x32_hor_layer0[16] = { + -0.87643f, 0.36754f, -0.86409f, 1.37761f, 1.22688f, 0.09074f, + -1.47139f, -1.06100f, -0.24087f, 1.10382f, -0.32837f, -1.39592f, + -0.14741f, -0.43954f, 1.72137f, -0.21704f, +}; + +static const float av1_tx_type_nn_weights_16x32_hor_layer1[64] = { + -0.81860f, -0.80745f, -0.43612f, 0.58656f, 0.37455f, -0.56519f, -1.71536f, + 0.23278f, 0.23951f, 1.09610f, 0.49986f, 0.43375f, -0.53182f, 0.17376f, + 1.05626f, -0.61743f, -0.71777f, -0.66943f, 1.40091f, 0.34426f, 1.14295f, + 0.45571f, -0.52504f, -0.00303f, 0.06044f, 0.66119f, -0.60340f, -1.14344f, + -0.28045f, 0.12742f, 0.61484f, -0.41016f, 1.36102f, -0.86969f, -0.52728f, + 1.01725f, 0.67083f, -0.10138f, 1.36406f, 0.34066f, 0.12498f, 0.86595f, + -0.39636f, -0.27888f, -0.40244f, 0.09847f, 0.81178f, -0.45313f, 1.39127f, + 0.99865f, -0.57908f, 0.55072f, 0.49638f, 1.11524f, 1.85504f, -0.28316f, + -0.05195f, -0.23284f, 0.26461f, -1.28120f, 0.60707f, -0.06110f, 0.74085f, + 0.63304f, +}; + +static const float av1_tx_type_nn_bias_16x32_hor_layer1[4] = { + 0.71765f, + 1.40400f, + 0.32221f, + 3.07234f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x32_hor = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_16x32_hor_layer0, + av1_tx_type_nn_weights_16x32_hor_layer1, + }, + { + av1_tx_type_nn_bias_16x32_hor_layer0, + av1_tx_type_nn_bias_16x32_hor_layer1, + }, +}; + +static const float av1_tx_type_nn_weights_16x32_ver_layer0[512] = { + -0.01219f, 0.51494f, 0.25450f, 0.45788f, -0.87277f, 0.32954f, -0.04851f, + -0.24321f, -0.40000f, 0.21915f, 0.14108f, 0.98268f, 0.18989f, 0.54298f, + 0.36349f, 0.38931f, 1.08124f, 0.87199f, 1.03553f, 1.14777f, 1.04254f, + 1.11336f, 0.92198f, 0.84715f, 1.89363f, 1.21587f, 0.72377f, 1.25097f, + 0.84231f, 0.95529f, 1.12346f, 0.19113f, -0.04559f, 0.56859f, 0.59747f, + 0.60176f, 0.82465f, 0.59009f, 0.67240f, 1.58674f, -0.92951f, -0.23449f, + 0.11923f, -0.19151f, -0.15914f, 0.03146f, -0.16541f, 0.17181f, -0.21834f, + 0.21906f, 0.96708f, 0.36085f, -0.42380f, -2.25681f, 
-0.48812f, 0.72875f, + 0.06585f, 0.18818f, -0.02109f, -0.10996f, 0.00187f, -0.02078f, 0.04484f, + -0.07171f, 0.94773f, -0.33466f, 0.28484f, 0.14791f, 0.30274f, 0.13377f, + 0.40970f, 0.45133f, 1.69265f, -0.36422f, -0.15889f, 0.07670f, 0.44675f, + -0.28665f, -0.07097f, 1.03803f, -0.83274f, -0.24571f, 0.08039f, -0.23790f, + -0.23276f, -0.28031f, 0.26451f, -0.18513f, -2.23336f, -0.62073f, 0.32495f, + -0.67644f, -0.08559f, -0.36382f, -0.24515f, -0.01899f, 0.09012f, 0.19723f, + 0.04017f, 0.31624f, 0.58369f, 0.30411f, -0.81165f, -2.58541f, -0.20491f, + 0.68089f, -0.14799f, 0.13925f, 0.12867f, 0.15229f, 0.06887f, -0.03784f, + 0.02288f, -0.28712f, 0.14107f, 0.29485f, -0.11662f, 0.25239f, 0.30311f, + -0.07377f, -0.10962f, 0.59856f, 0.47967f, 0.01847f, -0.27889f, 0.46786f, + 0.18118f, 0.09355f, -2.10076f, 0.38823f, 0.28202f, 0.29104f, 0.86977f, + 0.52377f, 0.21161f, 0.72888f, -0.00952f, 0.15982f, -0.14651f, 0.28763f, + -0.14155f, 0.00093f, 0.08351f, 0.34685f, -0.22066f, 0.20378f, 0.25416f, + 0.03423f, -0.11068f, -0.41612f, 0.56913f, -0.06697f, -0.12585f, -0.21033f, + -0.14513f, -0.04477f, -0.35778f, 0.03437f, 0.06956f, -0.25356f, -1.46010f, + -0.08142f, 0.11926f, -0.63551f, -0.13882f, 0.34164f, 0.10821f, 1.07323f, + -0.62435f, -0.27116f, 0.25971f, 0.11952f, -0.39480f, -0.05474f, -0.12582f, + 0.28289f, 0.13723f, 0.58369f, 0.41865f, 0.28574f, 1.01357f, 0.46661f, + 0.61717f, 0.85708f, -0.03930f, -0.38013f, -0.33888f, -0.20561f, -0.19087f, + -0.01041f, 0.12119f, -0.20786f, 0.55915f, 0.67511f, 0.55554f, 0.56540f, + 0.76647f, 0.54766f, 0.45166f, 0.61384f, 0.95407f, -0.06811f, -0.62132f, + 0.12713f, 0.63713f, 2.04090f, 1.17054f, 0.00469f, -0.93692f, -0.24136f, + -0.04281f, -0.15787f, 0.37956f, -0.09174f, -0.72494f, 0.55285f, -1.40996f, + -0.54077f, 0.38445f, -0.08258f, 0.64259f, -0.54058f, -0.49865f, 1.41371f, + 0.89014f, 0.78788f, 0.37919f, 0.87447f, -0.00760f, -0.00947f, 0.16323f, + -0.36632f, -1.38115f, -0.24619f, 0.40490f, -0.08871f, -0.25365f, -0.60842f, + 0.11128f, 0.18658f, -0.86001f, -0.28271f, 0.39572f, -0.29930f, -0.10110f, + 0.33706f, 0.21731f, 0.15383f, -0.01707f, 0.02812f, 0.31192f, 0.39742f, + 0.38260f, -0.48263f, 0.57385f, 0.53239f, -0.60013f, -0.63211f, -0.45140f, + -0.73520f, -0.95260f, -0.70633f, -0.96190f, 0.01747f, -0.05195f, -0.07138f, + -1.09535f, -0.63548f, -1.55700f, -0.35721f, -0.18923f, 0.77568f, 0.09419f, + 0.36919f, -0.32761f, -0.06597f, -0.38988f, -0.43674f, -0.24284f, 0.36906f, + 0.28414f, 0.19273f, -0.68516f, 0.09514f, -0.45381f, 0.19917f, -0.32377f, + 1.32549f, 0.08244f, -0.64405f, 0.13195f, 2.85307f, 0.47631f, -0.33408f, + 0.04168f, 0.18585f, -0.18029f, 0.07986f, -0.08816f, -0.00703f, -0.01515f, + -0.13164f, 0.00571f, 0.05676f, 1.51425f, 0.73360f, 0.43486f, -0.08223f, + -0.06183f, -0.57098f, -0.29948f, 0.05945f, 0.19238f, -0.47980f, -0.35902f, + -0.19931f, 0.43443f, 0.67436f, 0.78573f, 0.25703f, 1.01863f, 0.99047f, + 0.95228f, 1.02429f, 1.19264f, 0.29935f, -0.26583f, -0.98749f, -0.46167f, + -0.29727f, -0.10515f, -0.39790f, -0.59321f, -0.61925f, -0.95452f, 0.04292f, + -0.48273f, -0.91195f, -0.45971f, -0.46355f, -0.88319f, -0.51712f, -0.47682f, + -0.86110f, -0.59178f, -0.57163f, -0.94946f, 0.19627f, -0.18699f, 0.11037f, + 1.39110f, 0.05715f, 3.00762f, 1.52243f, 0.25028f, 0.12779f, -0.12871f, + 0.04764f, 0.08288f, -0.16572f, -0.06580f, 0.05845f, -0.01474f, 0.04886f, + -0.10000f, 0.12911f, -0.01416f, -0.12472f, 0.14358f, 0.16554f, 0.08853f, + 0.13418f, -0.05408f, -0.13871f, -0.00049f, 0.20725f, -0.05603f, 0.27885f, + -0.14277f, 0.29653f, -0.24739f, 0.10101f, -0.17068f, 
-2.43802f, 0.41834f, + 0.49784f, 0.34949f, 0.98487f, 0.16792f, 1.07355f, 0.32546f, 1.32377f, + -0.08584f, 0.85214f, -0.05721f, 0.90307f, 0.20167f, 0.52664f, -0.14478f, + 0.64997f, 0.06846f, 0.32475f, 0.64453f, 0.70143f, -0.03091f, -0.24958f, + -0.39021f, -0.57693f, -0.18319f, 0.11793f, -0.05948f, 0.36670f, -0.27932f, + 0.14800f, -0.55459f, -0.89673f, 0.65922f, 0.54308f, -0.16731f, -0.59731f, + -0.20705f, -0.18183f, -0.05645f, -0.06829f, -0.40210f, -0.27955f, 0.28054f, + 0.57665f, 0.14171f, 0.54693f, -0.22144f, -0.59664f, 0.13295f, 0.07057f, + -0.19698f, 0.03328f, -0.09687f, -0.32390f, -0.11506f, -0.40406f, -0.11473f, + 0.10399f, -0.29824f, 0.16028f, 0.00053f, 0.22699f, 0.04203f, -0.43880f, + -0.12654f, 0.12172f, 0.21087f, -0.46350f, -0.22081f, -0.06173f, -0.23287f, + 0.90314f, 0.04466f, -0.06149f, 0.32682f, 0.16609f, -0.58991f, -0.03786f, + -0.41329f, 0.02632f, 0.23411f, 0.25344f, 0.16468f, 0.31007f, 0.21845f, + 0.32462f, 0.33945f, 0.11527f, -0.35926f, -0.18584f, 0.29340f, 0.78199f, + 2.39287f, 0.53838f, -1.55085f, 0.02238f, -0.26153f, -0.42498f, -0.02460f, + 0.19261f, -0.10870f, -0.08453f, -0.39561f, 0.08600f, 0.36310f, 0.58439f, + -0.59526f, 0.13104f, -0.06703f, -0.17529f, -0.41431f, -0.23121f, -0.32394f, + -0.33324f, -0.21405f, -0.41702f, -0.29236f, -0.31766f, -0.33512f, -0.22679f, + -0.13680f, -0.00118f, -1.81744f, -2.34798f, -1.08048f, -0.29883f, -0.29123f, + -0.01752f, +}; + +static const float av1_tx_type_nn_bias_16x32_ver_layer0[32] = { + 1.02458f, -1.02185f, -0.18978f, 0.05981f, -0.94931f, 0.34544f, 0.04415f, + -0.60036f, -0.11368f, -0.14154f, 1.23438f, 0.51640f, -0.57587f, -0.91380f, + 0.95720f, 0.68298f, -0.06353f, -2.14960f, -0.11080f, 0.79380f, -0.94199f, + 0.43040f, 0.01358f, 0.07201f, -0.49689f, -0.14839f, -0.80132f, -0.13925f, + -0.11834f, -0.24998f, -0.33976f, 0.35497f, +}; + +static const float av1_tx_type_nn_weights_16x32_ver_layer1[128] = { + 0.87367f, -1.06469f, -0.50829f, -0.70540f, 1.14596f, -1.12346f, -0.94467f, + 0.01380f, -0.18911f, 0.07961f, -0.18626f, 0.61902f, -0.64423f, 1.21545f, + 1.01149f, 0.26309f, 1.50380f, 1.93940f, -0.64064f, 1.03987f, -1.88000f, + -0.44574f, -1.53303f, 1.36307f, 1.00292f, 0.37031f, 0.21594f, 0.16758f, + 0.02592f, -0.77431f, -0.31797f, -1.53826f, 1.14013f, -1.21957f, 0.04571f, + -0.22168f, 0.32299f, 0.25949f, -0.13306f, 0.17850f, 0.92494f, 0.19999f, + 0.07494f, -0.03362f, -0.53453f, 1.02970f, -0.22947f, 0.73964f, 1.08445f, + 0.16855f, -0.02686f, 0.25254f, 0.05952f, 0.02194f, 0.05649f, 0.39195f, + 0.14139f, 0.53843f, -0.06959f, -0.06993f, -0.14151f, -0.53147f, 0.17481f, + -1.21977f, 0.62932f, 1.07173f, 0.24049f, -0.51574f, 0.97492f, -0.28169f, + -0.15406f, -0.05441f, -0.25415f, 0.16583f, 0.43674f, -0.00593f, -0.09277f, + 0.61402f, 1.35562f, -0.03926f, 0.18967f, -0.29548f, -0.55509f, 0.23661f, + 0.05023f, 0.36226f, -0.83314f, 0.39357f, 0.19943f, -0.63431f, -0.03847f, + 0.12213f, 0.62024f, -0.11704f, -0.22483f, 0.96624f, 0.18518f, 0.09181f, + -0.63068f, 0.66797f, 0.74107f, 0.40624f, 0.70636f, -0.06921f, 0.34175f, + -0.15513f, 2.07844f, 0.22126f, 0.52919f, 0.26793f, -0.50018f, 1.10549f, + 0.10970f, 0.05831f, 0.82842f, -1.22975f, 1.78377f, 0.92679f, 2.01480f, + -1.19011f, -0.53381f, 0.38533f, 0.45579f, -0.10683f, -0.40828f, 0.31398f, + 0.14978f, 0.91325f, +}; + +static const float av1_tx_type_nn_bias_16x32_ver_layer1[4] = { + 1.03659f, + 1.80249f, + 1.25710f, + 1.32000f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x32_ver = { + 16, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + 
av1_tx_type_nn_weights_16x32_ver_layer0, + av1_tx_type_nn_weights_16x32_ver_layer1, + }, + { + av1_tx_type_nn_bias_16x32_ver_layer0, + av1_tx_type_nn_bias_16x32_ver_layer1, + }, +}; +/******************************************************************************/ + +// Tx type model for 32x16 block. +static const float av1_tx_type_nn_weights_32x16_hor_layer0[512] = { + -0.07289f, 0.30798f, 0.41881f, 0.33434f, -0.01599f, 0.85307f, -0.16060f, + -0.07922f, -0.04693f, 0.29186f, 0.44117f, 1.02417f, 0.12447f, 0.46321f, + 0.40060f, 0.50140f, 0.48338f, 0.47298f, 0.36585f, 0.42821f, 0.41289f, + 0.47534f, 0.42900f, 0.26061f, 0.45887f, 0.38163f, 0.17302f, 1.00888f, + 1.79910f, 1.36140f, 0.24471f, 0.04557f, 1.10823f, 0.74325f, 0.91210f, + 0.81387f, 0.98865f, -0.09874f, 0.55146f, 0.19385f, -0.50752f, -0.17249f, + 0.27261f, -0.02763f, -0.03286f, 0.09122f, 0.07015f, 0.20012f, 0.68983f, + -1.25345f, -0.00145f, 0.71567f, 0.54948f, -0.56154f, -0.28918f, 0.11997f, + -0.09907f, 0.09195f, 0.05768f, 0.15558f, 0.11284f, -0.35195f, -0.08723f, + -0.03571f, 0.94031f, 0.63737f, 0.98202f, 0.93826f, 0.87126f, 0.88530f, + 0.97697f, 0.55283f, 0.58670f, 0.86502f, 0.97008f, 0.99709f, 0.66214f, + 0.96660f, 0.99890f, 0.31945f, -1.00301f, 0.13215f, -0.03950f, 0.21148f, + 0.05128f, 0.10955f, 0.44839f, -0.33438f, -2.09773f, 0.13908f, 0.58669f, + 0.25268f, -0.24006f, 0.01286f, -0.05732f, 0.03401f, -0.06896f, 0.35397f, + 0.05133f, -0.21449f, -0.38437f, -0.32326f, -0.38731f, -0.44419f, 0.25968f, + -0.29422f, -0.12553f, -0.08896f, -0.16400f, -0.22309f, 0.21380f, -0.26912f, + 0.06866f, -0.25694f, 0.17632f, 0.32032f, -0.10666f, 0.26278f, 0.31877f, + -0.09338f, -0.14289f, 0.54232f, 0.46070f, 0.00059f, -0.27914f, 0.45177f, + 0.16274f, -0.08811f, -0.45791f, 0.53946f, -0.16794f, 0.16229f, 0.11840f, + -0.24435f, 0.26894f, -0.33180f, -0.47314f, 0.34061f, -0.13939f, 0.13321f, + -0.05208f, -0.18139f, -0.35234f, 1.37298f, -0.19360f, 0.21728f, 0.26088f, + 0.04045f, -0.10763f, -0.40470f, 0.50026f, -0.06726f, -0.12871f, -0.20963f, + -0.14583f, -0.04711f, -0.35988f, 0.03091f, 0.06491f, -0.31668f, -0.52190f, + 0.23397f, -0.13984f, -0.15207f, -0.49977f, 0.51205f, 0.12559f, -0.03631f, + 0.33447f, -0.36684f, 0.17533f, 0.15671f, -0.00096f, 0.06817f, 0.20922f, + 0.34006f, 0.71260f, 0.45024f, 0.53033f, 0.15645f, 0.76019f, 0.56870f, + 0.83066f, 0.63022f, 1.74436f, -0.24798f, 0.06795f, -0.00749f, 0.17795f, + 0.10371f, 0.06527f, 0.41054f, 0.49003f, 0.34630f, 0.02615f, 0.30320f, + -0.47133f, -0.49584f, 0.21775f, 0.27530f, -0.29977f, -0.64269f, 0.52627f, + -0.02492f, 0.08077f, 0.40786f, -0.36015f, -0.70714f, -1.98185f, -0.28187f, + 0.35018f, -0.06105f, -0.12710f, 0.06606f, -0.27805f, 0.44630f, -0.84731f, + -0.26699f, 0.25856f, 0.06194f, -0.18674f, -0.11560f, -0.43277f, 1.10579f, + 0.95876f, 0.17415f, 0.56386f, 0.68426f, 0.50180f, 0.24844f, 0.12347f, + 0.15281f, -0.19089f, 0.52279f, 0.41860f, -0.05270f, -0.17029f, -0.03542f, + 0.10621f, -0.25088f, 0.24070f, -0.08951f, 0.29950f, -0.36720f, 0.02151f, + 0.20129f, -0.70066f, -0.23144f, -0.20070f, -0.39262f, -0.01597f, -0.05591f, + 0.23814f, -0.25991f, 0.05812f, 0.60554f, -0.06106f, -0.58326f, 0.28762f, + -0.18747f, 0.08232f, -0.04243f, -0.03293f, 0.14722f, -0.13017f, -0.67263f, + 0.38698f, -0.18207f, -0.11496f, -0.27976f, -0.55345f, 1.42872f, 0.04684f, + 0.04214f, 0.00030f, 0.02410f, 0.19966f, -0.04246f, 0.00442f, 0.23121f, + 0.13364f, 0.21548f, -0.12748f, -0.14066f, -0.28354f, 0.59937f, -0.27553f, + 1.57503f, -0.01050f, -0.17724f, 0.44110f, -0.80334f, 0.72064f, 1.00501f, + -0.72638f, 0.02774f, 0.48540f, 
-0.72016f, -0.27721f, 0.31559f, 0.07322f, + 0.20279f, -0.19647f, 0.02352f, 0.12662f, 0.19743f, 0.30543f, 0.25712f, + 0.44702f, 0.16417f, 0.17888f, -2.58469f, 0.20555f, 0.57782f, -0.10892f, + 0.14527f, 0.82251f, 0.04200f, 0.44626f, 0.10818f, 0.71204f, 0.62903f, + 0.69178f, 0.73603f, 0.52717f, 0.83020f, 0.48824f, 1.03270f, -0.00152f, + 0.07958f, 0.24181f, -0.78839f, -0.74214f, -0.72998f, -1.58694f, 0.17735f, + 0.56318f, 0.32580f, -0.58503f, -0.33673f, -0.00838f, 0.48924f, 0.43362f, + 0.12750f, 0.00295f, 0.38624f, 0.17037f, 0.00729f, -0.26256f, -0.41669f, + 0.36847f, 0.22424f, 1.33334f, 0.18112f, 0.37682f, 0.49173f, -0.45240f, + -0.04857f, -0.35038f, -0.83099f, -0.01988f, 0.03497f, 0.38033f, 0.13685f, + 0.17597f, 0.28668f, 0.31193f, -0.43281f, 0.43267f, -0.50495f, 0.01969f, + 0.14131f, -0.09326f, -0.39425f, -0.62048f, -0.09119f, -0.28306f, -0.52671f, + -0.38584f, -0.10953f, 0.19669f, 0.34540f, -0.49941f, 0.04605f, -0.43535f, + 0.27519f, 0.03659f, -0.31961f, 0.13330f, 0.87009f, 0.20101f, -0.70392f, + -0.27883f, 0.33874f, -0.34308f, 0.67760f, 0.88195f, 0.55752f, -0.26563f, + 0.17875f, 0.06964f, 0.87607f, 1.47616f, 0.46747f, -0.56408f, -0.39352f, + -0.16427f, -0.41185f, 0.14187f, 0.19265f, -0.58613f, 0.56345f, -0.17729f, + -0.11320f, 0.08752f, -0.01329f, 1.20981f, 0.45170f, -0.20571f, -0.01150f, + 0.26476f, 0.13508f, 0.22020f, -0.42684f, -0.22499f, -1.51212f, 0.86648f, + 0.21776f, 0.24666f, 0.71339f, 0.42742f, -0.00952f, 0.14762f, 0.07693f, + -0.19599f, 0.03075f, -0.09703f, -0.32483f, -0.11616f, -0.40461f, -0.11693f, + 0.10038f, -0.30038f, 0.14686f, 0.00548f, 0.20350f, 0.00763f, -0.43756f, + -0.01997f, 0.00902f, 0.07470f, -0.41441f, -0.20605f, 0.07626f, -0.34973f, + 0.47455f, -0.15251f, -0.05325f, 0.04964f, 0.32477f, -0.54604f, 0.25273f, + -0.18461f, -0.30841f, 0.64908f, 0.60752f, 0.64148f, 0.72788f, 0.71232f, + 0.58597f, 0.73017f, 0.58857f, 0.71908f, 0.59860f, 0.61849f, 0.99398f, + 0.39572f, -0.36165f, -1.88646f, 0.14384f, -0.60541f, -0.21380f, -0.55498f, + -0.50960f, -0.08801f, 0.51892f, 0.19126f, 0.57879f, 1.19447f, 0.25673f, + -0.21631f, -0.43562f, -0.27497f, -0.02206f, -0.56169f, 0.58952f, -0.60983f, + -0.64088f, -0.69087f, -0.56261f, -0.74089f, -0.65063f, -0.66978f, -0.60836f, + -0.92770f, -0.77182f, -1.61031f, -0.70007f, -0.68402f, -0.42242f, -0.66722f, + -0.14533f, +}; + +static const float av1_tx_type_nn_bias_32x16_hor_layer0[32] = { + 1.53781f, -0.49320f, -0.31646f, 0.02826f, -1.05554f, 0.06559f, -0.12399f, + -0.61671f, -0.28956f, -0.15419f, 0.87189f, -0.43375f, -1.08477f, -0.66006f, + 0.36233f, 0.82678f, -0.92342f, -1.47101f, -0.02937f, -0.16497f, -0.75457f, + 0.50173f, -0.07560f, 0.71598f, 1.50795f, -0.04745f, -0.14008f, -0.18510f, + -0.14988f, -0.67044f, 0.79659f, 0.70610f, +}; + +static const float av1_tx_type_nn_weights_32x16_hor_layer1[128] = { + 0.84983f, -0.62530f, -0.82600f, -0.52563f, -0.11942f, -0.50279f, -0.13425f, + -0.02850f, 0.50767f, 0.10252f, 0.24540f, 0.67748f, -0.43483f, -0.22242f, + 0.23431f, 0.57287f, 0.69560f, 1.13814f, -0.47427f, -0.55858f, -1.47072f, + 0.26587f, -0.36335f, 0.83060f, 1.01645f, -0.52895f, -0.11614f, 0.17390f, + -0.13664f, -0.83098f, -0.07985f, -1.36820f, 0.47759f, -0.55612f, 0.46852f, + 0.07406f, -0.80467f, 0.23059f, 0.09992f, -0.06164f, 0.13541f, 0.06135f, + 0.83605f, -0.53224f, -0.13867f, 0.93838f, -0.61290f, 0.27732f, -0.46688f, + -0.41810f, 0.12885f, 0.13619f, -0.24612f, 0.07215f, 0.98866f, 0.10993f, + 1.05799f, -0.27146f, -0.00079f, -0.08585f, 0.08322f, -0.33809f, 0.67598f, + -1.06515f, 1.28866f, 0.61028f, -0.31704f, -0.59905f, 1.62151f, 
0.10969f, + 0.20671f, -0.17818f, 0.14170f, 0.19322f, 0.30602f, 0.93111f, 0.19011f, + -0.45609f, 0.82506f, 0.32936f, -0.07858f, -0.27106f, -0.31638f, 0.23299f, + 0.81491f, 0.32584f, -0.52093f, -0.32472f, 0.53643f, -0.42605f, 0.01641f, + 0.09002f, 0.15832f, -0.08790f, 0.05511f, 1.00730f, 0.46309f, 0.68166f, + -0.18835f, 0.64512f, -1.00540f, 0.86802f, 0.18981f, -0.06982f, -0.24514f, + -0.08027f, 0.61199f, -0.20830f, 0.72001f, 0.17477f, 0.06511f, 0.00801f, + -0.43590f, 0.37257f, 0.70323f, 0.60233f, 1.62541f, 0.74383f, -0.22254f, + -0.33892f, 0.22881f, 0.62817f, 0.68915f, -0.06417f, 0.00969f, 1.65869f, + 0.89060f, 0.75948f, +}; + +static const float av1_tx_type_nn_bias_32x16_hor_layer1[4] = { + 0.95359f, + 1.56043f, + 1.06017f, + 2.54520f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_32x16_hor = { + 16, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_32x16_hor_layer0, + av1_tx_type_nn_weights_32x16_hor_layer1, + }, + { + av1_tx_type_nn_bias_32x16_hor_layer0, + av1_tx_type_nn_bias_32x16_hor_layer1, + }, +}; + +static const float av1_tx_type_nn_weights_32x16_ver_layer0[128] = { + 1.30219f, 1.30548f, 1.33334f, 1.20560f, 1.01572f, 1.38100f, 1.37504f, + 0.12599f, -0.96957f, 0.19400f, 0.75734f, 0.11295f, -0.40447f, -1.53062f, + -0.82980f, 0.02168f, -1.11289f, -0.66861f, -0.83663f, -0.91455f, -0.78618f, + -0.87176f, -1.10711f, 0.71207f, 1.49689f, -0.12715f, 0.29357f, 0.35234f, + 0.61016f, 0.80708f, 0.83564f, 1.05961f, -0.99842f, 0.82004f, 0.02638f, + 0.44606f, 0.32298f, 0.21321f, 0.47290f, -0.71442f, -2.81050f, -0.02520f, + -0.08919f, 0.00369f, -0.05257f, -0.07011f, -0.16394f, 0.06290f, 0.80086f, + 0.32349f, 0.47411f, 1.36126f, 1.68162f, 0.91325f, -0.27495f, 0.00262f, + 0.06025f, 0.42832f, 0.36965f, 0.38063f, 0.32772f, 0.40914f, 0.44510f, + 3.02239f, -1.84077f, 0.49536f, -0.27340f, -0.10437f, -0.34293f, -0.08047f, + -0.29651f, -0.97111f, -0.34187f, 0.52869f, 1.27240f, 1.20306f, 1.19121f, + 1.28742f, 0.26393f, -0.62319f, 0.92285f, -0.08303f, -0.33118f, -0.13053f, + 0.24875f, -0.52089f, 0.44691f, -1.08908f, 1.20921f, 0.36538f, -0.46792f, + -0.18855f, -0.13443f, -0.28472f, -0.10353f, 0.06911f, 0.68519f, 0.08228f, + -0.49027f, -0.34381f, 0.04719f, -0.33298f, 0.72525f, 0.09538f, -0.29216f, + -0.07260f, -0.55827f, 0.54542f, -0.10144f, -0.09292f, -0.14427f, -0.38361f, + -0.41559f, 0.75338f, -0.04530f, 0.27944f, 0.06932f, -0.11537f, 0.29568f, + 1.92155f, -0.98996f, -0.08841f, 0.49386f, 0.15947f, 0.53290f, 1.46747f, + 0.59360f, 0.25468f, +}; + +static const float av1_tx_type_nn_bias_32x16_ver_layer0[16] = { + -1.19673f, 0.33043f, 0.24408f, 0.46221f, 2.00646f, 0.19031f, + -0.64944f, -0.43452f, 1.04400f, 1.47371f, 0.52460f, -1.39577f, + 0.83852f, -0.25536f, 1.33200f, -0.24444f, +}; + +static const float av1_tx_type_nn_weights_32x16_ver_layer1[64] = { + -1.31447f, -0.86455f, 0.85217f, 1.00048f, 0.37395f, -1.35713f, -0.54032f, + 0.82803f, 0.89606f, 1.57696f, 0.68067f, 0.42512f, -0.26250f, 0.14621f, + 0.93249f, -0.77690f, -0.93652f, -0.44488f, 0.68360f, -0.88178f, 1.89111f, + 0.67700f, -0.29310f, 0.91604f, -1.21881f, 1.11188f, 0.45045f, -0.86119f, + -0.09294f, 0.09360f, 0.80794f, 0.41027f, 1.80399f, -0.50362f, -1.44689f, + 0.85148f, 0.90707f, -0.18458f, 0.14165f, 1.17367f, 0.70869f, 1.57147f, + 0.24692f, 0.16626f, 0.56794f, 0.07313f, 0.14728f, -0.74296f, 1.74127f, + 1.26560f, 0.17753f, 1.10194f, 0.56435f, 1.73779f, 1.42841f, -1.16773f, + 0.24584f, 0.10813f, -0.60187f, 0.79802f, 0.75229f, -0.06112f, 1.77282f, + 1.01058f, +}; + +static 
const float av1_tx_type_nn_bias_32x16_ver_layer1[4] = { + 0.83082f, + 2.03845f, + 0.59627f, + 2.31341f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_32x16_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_32x16_ver_layer0, + av1_tx_type_nn_weights_32x16_ver_layer1, + }, + { + av1_tx_type_nn_bias_32x16_ver_layer0, + av1_tx_type_nn_bias_32x16_ver_layer1, + }, +}; +/******************************************************************************/ + +// Map tx_size to its corresponding neural net model for tx type prediction. +static const NN_CONFIG *av1_tx_type_nnconfig_map_hor[] = { + &av1_tx_type_nnconfig_4x4, // 4x4 + &av1_tx_type_nnconfig_8x8, // 8x8 + &av1_tx_type_nnconfig_16x16, // 16x16 + NULL, // 32x32 + NULL, // 64x64 + &av1_tx_type_nnconfig_4x8_hor, // 4x8 + &av1_tx_type_nnconfig_8x4_hor, // 8x4 + &av1_tx_type_nnconfig_8x16_hor, // 8x16 + &av1_tx_type_nnconfig_16x8_hor, // 16x8 + &av1_tx_type_nnconfig_16x32_hor, // 16x32 + &av1_tx_type_nnconfig_32x16_hor, // 32x16 + NULL, // 32x64 + NULL, // 64x32 + NULL, // 4x16 + NULL, // 16x4 + NULL, // 8x32 + NULL, // 32x8 + NULL, // 16x64 + NULL, // 64x16 +}; + +static const NN_CONFIG *av1_tx_type_nnconfig_map_ver[] = { + &av1_tx_type_nnconfig_4x4, // 4x4 transform + &av1_tx_type_nnconfig_8x8, // 8x8 transform + &av1_tx_type_nnconfig_16x16, // 16x16 transform + NULL, // 32x32 transform + NULL, // 64x64 transform + &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform + &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform + &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform + &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform + &av1_tx_type_nnconfig_16x32_ver, // 16x32 transform + &av1_tx_type_nnconfig_32x16_ver, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + NULL, // 4x16 transform + NULL, // 16x4 transform + NULL, // 8x32 transform + NULL, // 32x8 transform + NULL, // 16x64 transform + NULL, // 64x16 transform +}; + +// Tx split model for 4x8 block. 
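The two maps above are indexed by TX_SIZE in the order the trailing comments spell out; a NULL entry means no trained model exists for that geometry, so pruning would simply be skipped there. A minimal dispatch sketch — av1_nn_predict() is the evaluator declared in av1/encoder/ml.h, while the wrapper name and the skip-on-NULL return convention are this sketch's assumptions:

#include "av1/common/enums.h"  // TX_SIZE
#include "av1/encoder/ml.h"    // NN_CONFIG, av1_nn_predict()

// Returns 1 and fills the score vectors if a model exists for tx_size.
static int tx_type_scores_sketch(TX_SIZE tx_size, const float *hfeatures,
                                 const float *vfeatures, float hscores[4],
                                 float vscores[4]) {
  const NN_CONFIG *hor = av1_tx_type_nnconfig_map_hor[tx_size];
  const NN_CONFIG *ver = av1_tx_type_nnconfig_map_ver[tx_size];
  if (hor == NULL || ver == NULL) return 0;  // no model for this size
  av1_nn_predict(hfeatures, hor, hscores);
  av1_nn_predict(vfeatures, ver, vscores);
  return 1;
}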
+static const float av1_tx_split_nn_weights_4x8_layer0[8 * 16] = { + 0.068650f, -0.732073f, -0.040361f, 0.322550f, -0.021123f, 0.212518f, + -0.350546f, 0.435987f, -0.111756f, -0.401568f, 0.069548f, -0.313000f, + 0.073918f, -0.373805f, -0.775810f, -0.124753f, 0.181094f, -0.602641f, + -0.026219f, -0.350112f, 0.020599f, -0.311752f, -0.476482f, -0.669465f, + -0.310921f, 0.348869f, -0.115984f, 0.154250f, 0.200485f, -0.016689f, + 0.020392f, 0.413810f, 0.634064f, -0.627530f, 0.399178f, -0.012284f, + 0.472030f, 0.091087f, -0.706100f, -0.447944f, -0.274226f, 0.445656f, + 0.309339f, 0.505522f, 0.038496f, -0.152809f, 0.408684f, -0.068151f, + 0.271612f, 0.353233f, -0.150365f, 0.075212f, -0.035096f, 0.346615f, + 0.124382f, 0.477072f, 0.216288f, 0.070548f, -0.106362f, 0.681613f, + -0.145502f, -0.218631f, -0.099248f, -0.001983f, -0.196819f, -0.969045f, + 0.063009f, -0.123053f, 0.104875f, -0.137581f, -0.282933f, -0.003624f, + -0.315659f, -0.333523f, -0.503000f, -0.100063f, -0.536711f, -0.059978f, + -0.670248f, -0.353762f, 0.181109f, 0.289715f, -0.071206f, 0.261141f, + 0.052796f, -0.114554f, -0.139214f, -0.261380f, 0.075984f, -0.647925f, + -0.099528f, -0.677814f, 0.015712f, -0.389385f, -0.095622f, -0.165117f, + -0.109454f, -0.175240f, -0.393914f, 0.212330f, 0.037822f, 0.248280f, + 0.180197f, 0.110493f, -0.525727f, -0.092329f, -0.524029f, -0.407364f, + -0.542373f, -0.435626f, -0.912194f, 0.062794f, 0.160433f, 0.741485f, + -0.103659f, -0.119327f, -0.055275f, 0.334358f, 0.014713f, 0.046327f, + 0.831114f, -0.576682f, 0.354369f, -0.082088f, 0.452331f, 0.039730f, + -0.792429f, -0.385862f, +}; + +static const float av1_tx_split_nn_bias_4x8_layer0[16] = { + 0.238621f, 2.186830f, 1.383035f, -0.867139f, 1.257119f, -0.351571f, + -0.240650f, -0.971692f, 2.744843f, 1.116991f, 0.139062f, -0.165332f, + 0.262171f, -1.598153f, -1.427340f, -1.602306f, +}; + +static const float av1_tx_split_nn_weights_4x8_layer1[16] = { + -0.367134f, 1.373058f, -0.897039f, -0.326819f, -0.734030f, -0.290413f, + -0.501249f, 0.505321f, -0.537692f, -0.767893f, 0.268697f, 0.278987f, + 0.085082f, 0.614986f, 0.847904f, 0.637578f, +}; + +static const float av1_tx_split_nn_bias_4x8_layer1[1] = { + 0.20586078f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_4x8 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_4x8_layer0, + av1_tx_split_nn_weights_4x8_layer1, + }, + { + av1_tx_split_nn_bias_4x8_layer0, + av1_tx_split_nn_bias_4x8_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 8x8 block. 
+static const float av1_tx_split_nn_weights_8x8_layer0[144] = { + 0.177983f, -0.938386f, -0.074460f, -0.221843f, -0.073182f, -0.295155f, + -0.098202f, -0.279510f, 0.001054f, -0.119319f, -1.835282f, -0.581507f, + -1.222222f, -1.049006f, -0.807508f, -0.454252f, -0.774879f, -0.180607f, + -0.886976f, -0.231971f, -0.824677f, -0.351872f, -1.323819f, 0.235378f, + 0.015331f, -0.341818f, 0.145549f, -0.348362f, 0.147647f, -0.323400f, + 0.047558f, -0.553025f, -0.295485f, -0.330368f, -0.530605f, -0.407516f, + 0.447740f, 0.782381f, -0.179164f, -0.584675f, -0.052645f, 0.038656f, + -0.096783f, 0.038342f, -0.170762f, -0.405844f, -0.552665f, -0.509866f, + 0.757204f, -1.296465f, 0.631015f, 0.009265f, 0.646192f, 0.044523f, + 0.653161f, 0.033820f, 0.849639f, -0.068555f, -1.036085f, -0.511652f, + 0.104693f, -1.458690f, 0.286051f, -0.089800f, 0.381564f, -0.302640f, + 0.304465f, -0.268706f, 0.432603f, -0.117914f, -2.070031f, -0.565696f, + -0.073027f, -1.783570f, -0.318144f, -0.320990f, -0.343966f, -0.140996f, + -0.322977f, -0.232147f, -0.373210f, -0.158266f, -1.922305f, -0.634373f, + 0.101894f, -0.221847f, 0.018412f, -0.423887f, -0.266684f, -0.444930f, + -0.196237f, 0.106638f, -0.065834f, -0.538401f, -0.280772f, -0.620348f, + 1.089957f, -0.799928f, 0.504112f, -0.165763f, 0.578741f, -0.172653f, + 0.547316f, -0.143484f, 0.717220f, -0.297190f, -1.237854f, -0.074819f, + -0.977304f, -0.484092f, -0.646427f, -0.451443f, -0.612126f, -0.224475f, + -0.731608f, -0.257077f, -0.665857f, -0.346742f, -1.216372f, 0.227267f, + 0.231249f, -1.693073f, -0.035899f, 0.380845f, -0.058476f, 0.409405f, + -0.066679f, 0.406731f, -0.068501f, 0.396748f, 0.639462f, 0.150834f, + -0.418659f, -1.421931f, 0.101889f, 0.083573f, 0.129746f, 0.134460f, + 0.081185f, 0.127420f, 0.083664f, 0.051096f, 1.361688f, 0.386093f, +}; + +static const float av1_tx_split_nn_bias_8x8_layer0[12] = { + 4.280443f, 2.218902f, -0.256953f, 3.161431f, 2.082548f, 2.506052f, + 2.563224f, 1.421976f, -1.627813f, -1.436085f, 2.297265f, 1.500469f, +}; + +static const float av1_tx_split_nn_weights_8x8_layer1[12] = { + 1.178833f, -0.428527f, -0.078737f, 0.381434f, -0.466895f, -0.901745f, + -0.766968f, -0.356663f, 0.450146f, 0.509370f, -0.356604f, -0.443506f, +}; + +static const float av1_tx_split_nn_bias_8x8_layer1[1] = { + -0.156294f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_8x8 = { + 12, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 12, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_8x8_layer0, + av1_tx_split_nn_weights_8x8_layer1, + }, + { + av1_tx_split_nn_bias_8x8_layer0, + av1_tx_split_nn_bias_8x8_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 8x16 block. 
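Unlike the tx-type nets, the split nets have a single output: a raw score for whether splitting the transform block is worth evaluating. A hedged sketch of turning that score into a yes/no decision follows; the logistic squashing and the 0.5 cut-off are this sketch's choices, not the encoder's tuned thresholds.

#include <math.h>
#include "av1/encoder/ml.h"

// 1 = try the split partition, 0 = keep the block whole.
static int tx_split_decision_sketch(const float *features,
                                    const NN_CONFIG *split_cfg) {
  float score;
  av1_nn_predict(features, split_cfg, &score);
  const float p_split = 1.0f / (1.0f + expf(-score));  // map score to (0, 1)
  return p_split > 0.5f;
}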
+static const float av1_tx_split_nn_weights_8x16_layer0[8 * 64] = { + 0.374660f, 0.218905f, -0.139779f, 0.212141f, 0.056517f, 0.051114f, + 0.042860f, -0.273258f, -0.340809f, 0.138983f, -0.216996f, -0.241519f, + -0.123244f, 0.078577f, -0.472273f, -0.194201f, 0.125056f, 0.239761f, + -0.332782f, 0.174782f, -0.211400f, -0.129795f, 0.062195f, 0.113176f, + -0.008869f, 0.140764f, 0.059833f, 0.163826f, 0.359293f, -0.109797f, + -0.022091f, -0.059536f, -0.188226f, 0.179709f, 0.031386f, 0.164790f, + 0.214364f, 0.198555f, 0.152262f, -0.242980f, 0.319367f, -0.136902f, + 0.046524f, -0.043591f, 0.342178f, -0.011757f, -0.014286f, 0.072871f, + -0.278314f, -0.345303f, -0.252103f, -0.107154f, -0.235101f, -0.106739f, + -0.120865f, -0.160042f, 0.240028f, 0.112902f, -0.141587f, -0.703012f, + -0.136591f, 0.318993f, -0.154417f, -0.054668f, 0.192870f, 0.176166f, + -0.029965f, 0.266942f, -0.178384f, 0.038680f, 0.134403f, -0.002426f, + 0.534825f, -0.070923f, 0.413281f, 0.418148f, 0.093729f, 0.016454f, + 0.305358f, -0.040512f, 0.069904f, -0.227588f, -0.362220f, -0.031604f, + -0.394901f, 0.071506f, -0.342833f, -0.142550f, -0.164005f, 0.182600f, + 0.213062f, 0.076805f, 0.278758f, 0.125613f, -0.035552f, 0.040971f, + 0.182785f, -0.227961f, -0.105413f, -0.074949f, -0.084629f, -0.254767f, + 0.114657f, 0.047121f, 0.195902f, 0.264759f, 0.017799f, 0.210230f, + 0.150749f, -0.142142f, 0.182494f, -0.142415f, -0.259782f, -0.114830f, + -0.198826f, 0.000061f, -0.375668f, -0.276656f, -0.373202f, 0.210298f, + 0.422680f, 0.066960f, 0.351106f, -0.209034f, 0.367195f, -0.110274f, + 0.115573f, -0.066642f, -0.389673f, -0.260447f, 0.056949f, -0.180425f, + 0.069922f, -0.153506f, -0.097053f, -0.111757f, 0.094069f, 0.144837f, + -0.052984f, -0.506681f, -0.034474f, 0.279057f, -0.105025f, 0.006656f, + -0.125017f, -0.114096f, 0.103153f, -0.117402f, -0.359472f, 0.072534f, + 0.110291f, 0.003088f, -0.456897f, 0.038331f, -0.322298f, 0.113942f, + -0.119916f, -0.194392f, 0.093167f, 0.193459f, 0.074671f, 0.033602f, + 0.004440f, -0.179578f, -0.036637f, -0.216172f, -0.296530f, -0.318992f, + 0.319160f, -0.066218f, 0.291246f, 0.181292f, 0.089914f, 0.025273f, + 0.303128f, 0.019063f, 0.078545f, -0.396919f, 0.014065f, -0.122121f, + 0.037107f, -0.151886f, -0.299392f, -0.172207f, -0.124571f, -0.232553f, + 0.102970f, -0.225040f, 0.061059f, -0.258188f, -0.469871f, -0.099607f, + -0.061524f, -0.213700f, 0.070237f, -0.289134f, -0.238225f, 0.256403f, + -0.119344f, 0.067782f, -0.398983f, -0.123975f, -0.200205f, -0.047038f, + 0.026569f, 0.031037f, 0.094302f, -0.101239f, 0.433307f, -0.303612f, + 0.088537f, -0.164436f, 0.202471f, -0.048592f, -0.251904f, 0.122577f, + -0.309874f, -0.263405f, -0.292503f, 0.216589f, 0.035378f, 0.136599f, + -0.145844f, -0.018211f, 0.174084f, -0.449941f, -0.001428f, 0.064134f, + 0.039652f, 0.111083f, -0.246076f, -0.204733f, 0.056559f, -0.000123f, + 0.104049f, 0.138512f, -0.128309f, 0.087855f, 0.232784f, 0.247138f, + 0.162766f, 0.154829f, 0.313605f, -0.164115f, -0.050844f, 0.156549f, + 0.185279f, -0.238962f, -0.308281f, -0.179592f, -0.193262f, 0.201670f, + -0.203399f, -0.096831f, -0.127867f, 0.310674f, -0.008181f, 0.004078f, + -0.211038f, -0.193480f, -0.185639f, -0.150202f, -0.204858f, -0.240758f, + 0.114268f, -0.032535f, -0.052403f, -0.234333f, -0.064072f, -0.208444f, + -0.352853f, -0.224001f, -0.156330f, 0.215436f, 0.171846f, 0.291849f, + 0.108832f, 0.046991f, -0.127801f, 0.032485f, 0.141493f, 0.123319f, + -0.057250f, 0.315346f, -0.061317f, -0.465086f, -0.130179f, -0.217841f, + -0.239089f, -0.073251f, -0.327718f, 0.054905f, -0.283169f, 
-0.028900f, + 0.071450f, 0.270072f, 0.248891f, 0.088052f, 0.253319f, 0.122808f, + 0.175490f, -0.147805f, 0.089169f, -0.045457f, -0.330788f, 0.099791f, + -0.137376f, -0.195977f, -0.350942f, -0.284930f, -0.559037f, 0.030504f, + 0.162554f, -0.199100f, -0.050453f, -0.131320f, -0.077863f, -0.066253f, + -0.379723f, -0.424047f, -0.081182f, -0.252261f, -0.102815f, 0.058240f, + -0.182036f, 0.176772f, -0.070823f, 0.216054f, -0.211533f, -0.232992f, + 0.279346f, 0.117984f, 0.236674f, 0.126625f, -0.046220f, 0.044919f, + 0.278492f, 0.083944f, 0.180512f, 0.217994f, 0.401170f, -0.064417f, + 0.011636f, -0.139597f, -0.050020f, -0.268438f, -0.032803f, 0.024908f, + -0.085713f, -0.012984f, -0.055192f, -0.338657f, 0.045826f, -0.312849f, + -0.023393f, -0.168800f, -0.030886f, -0.131816f, -0.253542f, -0.104812f, + -0.354389f, 0.169464f, 0.094151f, -0.217122f, -0.456397f, 0.211478f, + 0.219232f, -0.155519f, -0.353700f, -0.264759f, -0.034709f, 0.034409f, + -0.148639f, -0.132850f, -0.216791f, -0.118492f, 0.173721f, -0.144181f, + 0.335028f, 0.176439f, 0.105980f, 0.169390f, 0.155615f, -0.040618f, + -0.176029f, 0.155569f, -0.184833f, -0.171099f, -0.178663f, -0.032051f, + -0.434334f, 0.092238f, -0.263103f, 0.061804f, -0.172957f, 0.005962f, + -0.100176f, 0.125898f, 0.048092f, -0.088141f, 0.247196f, -0.221601f, + -0.114474f, -0.124410f, -0.156393f, -0.181782f, -0.083562f, 0.034937f, + 0.403401f, -0.046200f, 0.322259f, 0.219678f, 0.109850f, 0.051837f, + 0.196861f, -0.019118f, 0.248818f, -0.137567f, 0.127862f, 0.052293f, + 0.298726f, 0.275788f, 0.015344f, 0.058714f, 0.283691f, -0.053794f, + -0.123270f, -0.227761f, -0.141744f, -0.268515f, -0.007189f, -0.242117f, + -0.252396f, -0.069017f, 0.034803f, -0.003388f, -0.262577f, 0.062115f, + -0.298393f, 0.215415f, -0.153615f, 0.289902f, 0.085886f, -0.504290f, + 0.077178f, 0.150861f, -0.228848f, -0.261020f, 0.198204f, 0.162113f, + 0.346418f, -0.286950f, 0.354756f, -0.226419f, 0.024720f, 0.208037f, + 0.107286f, -0.110849f, 0.104415f, -0.207725f, 0.063932f, -0.037748f, + -0.167037f, -0.068282f, 0.320815f, -0.051884f, 0.099989f, -0.078388f, + 0.127071f, 0.046675f, -0.336571f, -0.273080f, 0.264694f, -0.007352f, + -0.093828f, 0.094773f, -0.144434f, 0.091795f, -0.031615f, 0.056914f, + 0.064673f, -0.136669f, 0.344734f, 0.225926f, 0.283451f, -0.068354f, + 0.030572f, 0.180784f, -0.378047f, -0.092962f, -0.083291f, 0.038970f, + 0.052094f, -0.017932f, 0.216302f, -0.184396f, 0.079888f, 0.210406f, + -0.020627f, 0.244744f, 0.336972f, -0.182914f, -0.220976f, -0.304225f, + -0.330974f, -0.370868f, -0.084935f, -0.136489f, -0.210082f, -0.188088f, + -0.408768f, 0.184693f, +}; + +static const float av1_tx_split_nn_bias_8x16_layer0[64] = { + -0.274107f, 0.445751f, 0.234359f, 0.291593f, 0.163298f, 0.183707f, + -0.548839f, -0.190779f, -0.163346f, -0.669028f, 0.399209f, -0.354974f, + 0.000000f, -0.254630f, 0.220149f, 0.371104f, 0.789759f, 0.270300f, + 0.195126f, -0.206958f, 0.917708f, -0.256232f, 1.131933f, 1.178944f, + 0.461270f, 0.246169f, -0.818614f, -0.111986f, 0.759355f, 0.154889f, + 0.470299f, -1.025250f, 0.678678f, 0.959346f, -0.164105f, 0.544079f, + -0.448733f, 0.649221f, -0.536672f, 0.962758f, -0.256427f, 0.808664f, + -0.118694f, 0.684873f, -0.015635f, -0.046469f, 0.075481f, 0.412647f, + 0.454456f, -0.107169f, 0.775235f, -0.261629f, -1.194849f, 0.010093f, + -0.231289f, 0.658286f, -0.769320f, 0.564545f, 0.482962f, -0.131378f, + -0.255844f, -0.078400f, 0.476752f, 0.643001f, +}; + +static const float av1_tx_split_nn_weights_8x16_layer1[64] = { + -0.145065f, -0.145101f, 0.174786f, 0.196692f, 0.102025f, 
-0.087735f, + 0.386353f, -0.660539f, -0.183940f, 0.490045f, -0.276404f, -0.145669f, + 0.209846f, -0.085574f, -0.156821f, -0.377450f, -0.950010f, 0.450709f, + -0.108545f, -0.261181f, 1.435606f, -0.176621f, -1.158548f, 2.035680f, + 0.218069f, -0.138629f, 0.305958f, -0.277194f, -0.602468f, 0.203873f, + 0.120720f, 0.216095f, -0.434502f, -0.579746f, -0.239450f, 0.755529f, + 0.545643f, 0.232091f, 0.330169f, 0.988136f, -0.070465f, -0.345584f, + -0.162455f, -0.617064f, 0.123881f, -0.201098f, 0.222756f, 0.112932f, + 0.048647f, -0.147890f, 0.394584f, -0.262148f, 0.280564f, -0.195432f, + -0.047515f, 1.133410f, 0.255415f, -0.299032f, -0.397807f, -0.153246f, + -0.256734f, 0.177370f, 0.213522f, -0.530158f, +}; + +static const float av1_tx_split_nn_bias_8x16_layer1[1] = { + 0.14910713f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_8x16 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 64, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_8x16_layer0, + av1_tx_split_nn_weights_8x16_layer1, + }, + { + av1_tx_split_nn_bias_8x16_layer0, + av1_tx_split_nn_bias_8x16_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 16x16 block. +static const float av1_tx_split_nn_weights_16x16_layer0[12 * 24] = { + -0.177215f, -0.297166f, 0.299924f, 0.207878f, 0.216871f, 0.173264f, + 0.295464f, 0.048395f, 0.154731f, 0.305880f, 0.056787f, -0.166617f, + 0.115653f, -0.529477f, -0.073995f, -0.211746f, -0.018169f, 0.000788f, + -0.024940f, -0.007055f, 0.001392f, 0.021678f, -1.594600f, -0.099593f, + 0.332930f, 0.103574f, 0.158249f, 0.182601f, 0.332665f, 0.226207f, + -0.139566f, 0.185531f, 0.099074f, -0.185654f, -0.203121f, -0.285678f, + -0.313453f, -0.294452f, -0.143707f, -0.031265f, -0.453030f, -0.061874f, + -0.066150f, -0.099058f, -0.458879f, 0.127544f, 0.338314f, -0.161350f, + 0.030091f, -0.075528f, 0.004320f, 0.353690f, -0.013480f, -0.420402f, + -0.004659f, -0.329401f, -0.001745f, 0.227384f, -0.055183f, 0.121405f, + 0.160340f, 0.143603f, -0.221813f, 0.079107f, -0.657639f, -0.084348f, + -0.303414f, 0.046774f, -0.367679f, 0.060005f, 0.168645f, 0.084421f, + -0.133625f, 0.301375f, 0.079412f, -0.419303f, 0.017235f, 0.068637f, + 0.018384f, -0.428325f, -0.019753f, 0.149444f, -0.474836f, -0.287162f, + 0.198083f, 0.028292f, -0.299092f, -0.005849f, -0.256245f, 0.233277f, + -0.217561f, -0.264003f, 0.269411f, 0.207032f, -0.339411f, -0.198431f, + -0.028521f, 0.158076f, 0.177116f, 0.345702f, -0.145132f, 0.064623f, + -0.090867f, 0.288816f, -0.263198f, -0.071028f, -0.044546f, 0.380017f, + -0.014100f, -0.271192f, -0.318559f, 0.129015f, -0.050314f, -0.093355f, + -0.578498f, 0.099090f, -0.133080f, -0.029975f, -0.059828f, -0.157765f, + -0.321153f, -0.343671f, -0.242959f, 0.128304f, 0.017170f, 0.072787f, + -0.475838f, -0.003806f, -0.068615f, 0.150556f, -0.159903f, -0.416513f, + 0.218794f, -0.290456f, -0.084569f, -0.170014f, -0.044414f, -0.153069f, + -0.077329f, -0.089747f, -0.096526f, 0.537952f, 0.134725f, -0.006469f, + -0.323335f, -0.168183f, -0.107163f, -0.139954f, 0.011286f, -0.021712f, + -0.513992f, 0.259135f, -0.319808f, 0.077811f, 0.104613f, 0.370571f, + 0.185244f, 0.065530f, -0.091098f, -0.573741f, 0.111934f, 0.437417f, + -0.123691f, 0.220641f, -0.024783f, -0.149460f, -0.354185f, -0.134127f, + 0.038015f, -0.380596f, 0.250980f, 0.142208f, 0.135170f, -0.131129f, + -0.357556f, -0.530945f, 0.159672f, -0.147025f, -0.377829f, -0.504508f, + -0.492870f, 0.020753f, 0.142818f, 0.025172f, 0.086140f, 0.091283f, + 0.087491f, -0.186415f, 0.177785f, 
-0.195121f, -1.191148f, -0.477102f, + 0.023371f, 0.227004f, -0.023502f, -0.242913f, -0.074398f, -0.153480f, + 0.162900f, 0.415509f, -0.162565f, -0.131709f, -0.258852f, -0.252027f, + -0.080845f, -0.330274f, 0.021874f, 0.232398f, 0.069277f, 0.220567f, + -0.024237f, -0.366771f, 0.081673f, -0.429906f, -0.302170f, 0.061045f, + 0.352777f, -0.230376f, 0.408153f, 0.064758f, 0.142051f, 0.007219f, + 0.622878f, 0.212577f, 0.036489f, 0.081150f, -0.284767f, 0.107763f, + -0.529786f, -0.072190f, -0.300421f, -0.287959f, -0.568900f, 0.011547f, + -0.131696f, -0.356854f, -0.587962f, -0.026598f, 0.405829f, 0.057565f, + 0.414265f, -0.159155f, 0.221456f, 0.146314f, 0.265776f, -0.006516f, + 0.473978f, -0.186431f, 0.288672f, -0.060437f, 0.083380f, -0.205641f, + 0.360016f, 0.222041f, 0.420011f, 0.024579f, 0.377546f, 0.250380f, + -0.069900f, 0.296743f, 0.073532f, -0.243225f, -0.374987f, -0.387288f, + -0.237255f, -0.287013f, 0.417831f, -0.252988f, -0.257652f, -0.066775f, + -0.253926f, 0.057841f, 0.346133f, -0.157797f, -0.406028f, -0.286893f, + 0.274507f, -0.452561f, 0.143381f, -0.097755f, 0.021242f, 0.034561f, + 0.044115f, 0.004065f, 0.066729f, 0.043558f, 0.102991f, -0.477574f, +}; + +static const float av1_tx_split_nn_bias_16x16_layer0[24] = { + -0.479033f, 1.467402f, -0.366291f, 0.372511f, 0.715322f, -0.605500f, + 0.176848f, 0.032318f, 0.237429f, -0.046047f, 0.452082f, 0.451805f, + -0.822845f, 0.636762f, -0.057350f, 1.163978f, 0.728287f, 0.603654f, + -0.245519f, -0.893569f, -1.428185f, 0.808870f, -0.076159f, 1.231976f, +}; + +static const float av1_tx_split_nn_weights_16x16_layer1[24] = { + -0.176161f, 1.670188f, -0.180755f, -0.321326f, 0.249728f, -0.170504f, + -0.538432f, 0.033893f, 0.149842f, 0.404140f, -0.377812f, 0.338838f, + -0.176091f, 0.249844f, -0.362533f, 1.412460f, 0.196862f, 0.278194f, + -0.140444f, 0.297746f, 0.172533f, 0.116470f, -0.151656f, -0.603250f, +}; + +static const float av1_tx_split_nn_bias_16x16_layer1[1] = { + 0.184803f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_16x16 = { + 12, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 24, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_16x16_layer0, + av1_tx_split_nn_weights_16x16_layer1, + }, + { + av1_tx_split_nn_bias_16x16_layer0, + av1_tx_split_nn_bias_16x16_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 32x32 block. 
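Hand-maintained tables like these are easy to get out of sync with their NN_CONFIG. A hypothetical debug-only helper (not part of the aom sources) can assert the implied dimensions — for av1_tx_split_nnconfig_16x16 above, layer 0 must hold 12 * 24 weights and layer 1 must hold 24 * 1:

#include <assert.h>
#include "av1/encoder/ml.h"

// Verify a one-hidden-layer config against the lengths of its weight
// tables; pass sizeof(arr) / sizeof(arr[0]) for each table.
static void nn_config_check_sketch(const NN_CONFIG *cfg, int layer0_len,
                                   int layer1_len) {
  assert(cfg->num_hidden_layers == 1);
  assert(layer0_len == cfg->num_inputs * cfg->num_hidden_nodes[0]);
  assert(layer1_len == cfg->num_hidden_nodes[0] * cfg->num_outputs);
}

For example, nn_config_check_sketch(&av1_tx_split_nnconfig_16x16, 12 * 24, 24 * 1) should pass against the tables above.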
+static const float av1_tx_split_nn_weights_32x32_layer0[12 * 32] = { + -0.439303f, 0.004813f, -0.365052f, -0.116868f, -0.356716f, -0.196537f, + -0.196770f, -0.076096f, 0.357004f, -0.044909f, -0.112910f, -0.129081f, + 0.156725f, -0.386346f, 0.038971f, 0.160696f, 0.204923f, -0.384333f, + -0.319546f, 0.028179f, -0.250524f, -0.289669f, -0.284138f, -0.258963f, + -0.180854f, -0.000807f, -0.029620f, -0.353134f, 0.212408f, 0.141414f, + 0.303016f, 0.098066f, 0.482455f, 0.036069f, -0.166279f, 0.210119f, + -0.086337f, -0.023550f, -0.250796f, -0.183945f, -0.393856f, 0.170608f, + -0.306403f, 0.026318f, -0.277296f, 0.092684f, -0.033584f, -0.018371f, + -0.025043f, -0.257659f, -0.139163f, -0.206949f, -0.190105f, 0.028053f, + 0.361851f, -0.364726f, -0.096771f, -0.184166f, -0.433228f, -0.182191f, + -0.097051f, 0.259172f, 0.016432f, 0.259358f, 0.145059f, 0.037196f, + 0.091581f, -0.219644f, 0.140384f, -0.446837f, -0.234531f, 0.149508f, + -0.083429f, 0.186189f, -0.099890f, -0.111277f, 0.495214f, 0.085053f, + -0.266613f, -0.051366f, 0.148593f, 0.111875f, 0.077787f, -0.371653f, + -0.146157f, -0.229235f, 0.076203f, 0.488975f, 0.096771f, -0.009483f, + 0.192985f, 0.246273f, -0.192671f, -0.557890f, -0.292650f, -0.088907f, + -0.106892f, -0.329659f, 0.012105f, -0.359326f, 0.170723f, -0.004357f, + 0.171593f, -0.478768f, -0.236016f, -0.035077f, 0.133731f, 0.137962f, + -0.397926f, -0.155164f, -0.276709f, -0.186602f, -0.258301f, 0.036965f, + -0.649359f, 0.127605f, 0.097930f, 0.182775f, -0.313324f, 0.053349f, + 0.204203f, -0.222948f, -0.059008f, -0.049759f, -0.056848f, 0.087497f, + -0.039987f, -0.055042f, -0.041623f, -0.078424f, -0.317291f, -0.191398f, + 0.632147f, 0.221825f, 0.268394f, -0.096357f, 0.442545f, -0.007117f, + -0.036125f, 0.000525f, 0.088092f, -0.203653f, 0.086925f, 0.439141f, + 0.329889f, -0.370050f, -0.194306f, -0.207430f, 0.132779f, -0.217614f, + -0.039444f, -0.053019f, -0.260725f, -0.116563f, -0.271048f, 0.283737f, + -0.007300f, 0.062257f, -0.347865f, -0.296767f, -0.359123f, 0.230459f, + -0.189117f, -0.087622f, -0.561091f, 0.184182f, -0.044980f, 0.012643f, + 0.241672f, 0.050272f, -0.204851f, -0.159285f, -0.064081f, -0.118666f, + -0.269471f, 0.231668f, 0.135749f, -0.131162f, 0.062760f, 0.100949f, + 0.074967f, -0.056918f, 0.251707f, 0.034098f, 0.341290f, -0.105027f, + 0.313246f, -0.092679f, -0.014632f, -0.390967f, 0.136881f, -0.241554f, + 0.097674f, 0.110832f, -0.390245f, 0.017654f, -0.506222f, 0.065252f, + 0.244834f, -0.171352f, -0.331702f, 0.111043f, 0.125217f, -0.058116f, + -0.382595f, -0.052545f, 0.114261f, -0.493617f, 0.243984f, -0.171053f, + 0.165009f, -0.063020f, 0.096502f, 0.341339f, -0.013443f, 0.056372f, + 0.339284f, 0.398376f, 0.389409f, 0.257252f, 0.517368f, 0.078856f, + 0.087716f, -0.171092f, 0.227461f, 0.125307f, -0.054423f, -0.143161f, + 0.224041f, -0.086477f, -0.092548f, 0.072392f, -0.061608f, 0.258347f, + 0.147033f, -0.478244f, -0.204869f, 0.038552f, -0.144563f, 0.224087f, + -0.296705f, 0.153889f, -0.064624f, 0.085265f, -0.103826f, 0.127971f, + 0.019965f, 0.111937f, -0.074187f, -0.029518f, -0.127305f, -0.012210f, + 0.042714f, 0.070052f, -0.202360f, 0.348144f, -0.132097f, -0.209585f, + -0.248286f, -0.065774f, -0.089482f, -0.133226f, 0.325430f, -0.013468f, + -0.406090f, -0.144936f, 0.208620f, 0.343445f, -0.059639f, 0.114857f, + -0.069431f, -0.218725f, 0.190575f, -0.368101f, 0.030030f, 0.062815f, + -0.239369f, -0.537852f, 0.022487f, 0.023038f, 0.190788f, 0.040123f, + -0.004304f, 0.060749f, -0.108929f, 0.136796f, -0.542875f, -0.227074f, + -0.182244f, 0.082559f, 0.019149f, 0.178854f, 0.120284f, 
0.009070f, + 0.068268f, -0.544822f, 0.120536f, 0.354028f, -0.119890f, -0.122055f, + -0.405335f, 0.122341f, -0.304412f, 0.062405f, -0.302568f, -0.276505f, + -0.120915f, -0.221841f, 0.282007f, -0.253971f, 0.059517f, -0.144976f, + 0.149391f, -0.047355f, -0.167742f, -0.392333f, -0.041132f, 0.342135f, + 0.017485f, 0.021038f, -0.023728f, -0.192181f, -0.103996f, 0.092873f, + -0.114365f, -0.397732f, -0.065421f, 0.053084f, 0.035201f, 0.053019f, + -0.105377f, -0.039500f, 0.131904f, -0.123911f, -0.390328f, -0.125198f, + -0.000126f, 0.014864f, -0.220187f, 0.084056f, -0.492155f, -0.164979f, + 0.133592f, 0.121519f, -0.240813f, 0.186680f, 0.118673f, 0.235006f, + -0.239894f, -0.185759f, -0.336992f, 0.209620f, -0.298845f, 0.127803f, + -0.083992f, 0.194340f, -0.245378f, 0.212308f, 0.142512f, -0.163324f, + 0.383495f, 0.291065f, 0.286620f, -0.239957f, 0.225127f, -0.174424f, + 0.297231f, -0.045434f, 0.156444f, -0.184273f, -0.204567f, 0.202551f, + 0.370019f, -0.073910f, 0.344897f, 0.063100f, 0.338547f, -0.099145f, + 0.391863f, -0.214244f, -0.241734f, -0.281851f, -0.035133f, -0.153157f, +}; + +static const float av1_tx_split_nn_bias_32x32_layer0[32] = { + 0.143343f, -0.021982f, -0.314939f, 0.170867f, -0.081248f, 0.125758f, + -0.355762f, 0.279798f, 1.027712f, -0.434660f, 1.072005f, 0.668893f, + -0.031216f, -0.528650f, 0.328349f, 0.543645f, -0.188810f, 0.221110f, + -1.638637f, 0.058045f, -1.731105f, -0.444284f, 0.513693f, 0.890025f, + 0.160288f, 0.393312f, 0.332856f, -0.080767f, 0.299822f, 0.235876f, + 0.254942f, -0.017796f, +}; + +static const float av1_tx_split_nn_weights_32x32_layer1[32] = { + -0.090326f, -0.267553f, -0.026071f, 0.100912f, 0.279137f, 0.079064f, + -0.074885f, 0.053804f, 0.736810f, -0.031693f, -0.970514f, 0.174069f, + 0.095940f, -0.065047f, 0.052911f, 0.176728f, -0.058274f, 0.148364f, + -0.162210f, 0.093875f, -0.367663f, 0.020876f, 0.137280f, -1.099116f, + 0.146854f, 0.075590f, 0.228534f, 0.141993f, 0.072143f, 0.101421f, + -0.068547f, -0.154148f, +}; + +static const float av1_tx_split_nn_bias_32x32_layer1[1] = { + 0.316622f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_32x32 = { + 12, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_32x32_layer0, + av1_tx_split_nn_weights_32x32_layer1, + }, + { + av1_tx_split_nn_bias_32x32_layer0, + av1_tx_split_nn_bias_32x32_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 64x64 block. 
+static const float av1_tx_split_nn_weights_64x64_layer0[12 * 32] = { + -0.006828f, 0.149944f, -0.017614f, -0.044599f, -0.024517f, 0.507698f, + 0.001039f, 0.037164f, 0.015091f, -0.306620f, -0.162047f, -0.369440f, + 0.396310f, 0.087121f, 0.208609f, -0.083068f, 0.493774f, 0.217682f, + 0.377393f, 0.172879f, 0.397422f, 0.078919f, 0.741350f, 0.064169f, + -0.099989f, -0.192983f, -0.278230f, -0.310048f, -0.439965f, -0.226698f, + -0.436596f, -0.007551f, -0.396721f, 0.153570f, -0.190838f, -0.071869f, + 0.048799f, -0.301301f, -0.005015f, 0.500480f, -0.030622f, -0.559095f, + -0.032634f, -0.054160f, -0.056979f, -0.456545f, 0.306536f, -0.411323f, + -0.005366f, -0.069496f, 0.019990f, 0.327931f, -0.002516f, 0.393190f, + 0.001759f, 0.035093f, -0.030302f, -0.528984f, 0.174781f, 0.241462f, + -0.415427f, -0.164502f, 0.143065f, -0.122595f, 0.082049f, -0.143346f, + 0.055642f, -0.124701f, 0.004050f, -0.216235f, -2.681730f, 0.101658f, + 0.381239f, 0.465936f, 0.331154f, 0.301708f, -0.360171f, 0.054886f, + -0.118658f, 0.287921f, 0.277859f, 0.203784f, 0.247809f, 0.656924f, + -0.354628f, 0.315081f, 0.105108f, -0.510179f, 0.059267f, 0.061386f, + 0.076423f, 0.347119f, 0.100134f, 0.028402f, -0.118621f, -0.238689f, + 0.080141f, -0.138863f, 0.009009f, -0.100526f, -0.138875f, 0.066992f, + 0.005949f, 0.564336f, 0.046994f, 0.004655f, 0.366047f, 0.014695f, + -0.146928f, -0.024665f, -0.440357f, -0.109395f, 0.527231f, -0.020925f, + -0.227236f, -0.068141f, 0.282009f, 0.040192f, -0.267100f, 0.229228f, + 0.133861f, 0.338706f, -0.030178f, -0.040919f, -0.026343f, -0.330338f, + -0.066931f, -0.110580f, -0.072056f, 0.599457f, -0.020738f, 0.169200f, + 0.836240f, -0.157548f, 0.386273f, 0.002404f, 0.329410f, -0.007020f, + 0.351705f, -0.041259f, 0.388861f, 0.003899f, 0.582627f, 0.023572f, + 0.409912f, -0.158472f, 0.536383f, 0.525093f, 0.604247f, 0.439159f, + 0.692832f, 0.046272f, 0.590367f, -0.082166f, 0.262357f, 0.478671f, + 0.031935f, 0.042675f, 0.120002f, 0.398616f, -0.078967f, 0.227986f, + -0.044679f, 0.151061f, -0.085564f, 0.220205f, -0.265606f, -0.203623f, + 0.204719f, -0.125922f, 0.038544f, -0.269379f, 0.025866f, 0.109967f, + 0.019064f, -0.237297f, -0.309746f, -0.329118f, -0.278368f, -0.063859f, + 0.278496f, 0.018620f, 0.209971f, 0.296250f, 0.142850f, 0.288689f, + 0.137084f, 0.130517f, 0.128171f, -0.155396f, -0.008449f, -0.099845f, + 0.173455f, -0.059909f, -0.147318f, 0.102851f, -0.251389f, -0.001448f, + 0.103907f, 0.297273f, -0.027846f, 0.028260f, -0.382601f, 0.346695f, + -0.601641f, 0.162366f, -0.477495f, -0.042731f, -0.387871f, -0.051791f, + -0.401498f, -0.048446f, -0.456270f, -0.062287f, 0.493919f, 0.003008f, + 0.099917f, -0.358525f, -0.094903f, -0.022811f, -0.062259f, 0.019455f, + -0.050644f, 0.020041f, -0.132912f, -0.061578f, -3.083691f, -0.014961f, + -0.129115f, -0.710559f, 0.157213f, -0.844037f, -0.121991f, -0.943386f, + -0.231269f, -0.003462f, 0.331478f, -0.132703f, -1.285993f, -0.120957f, + -0.373755f, -0.322609f, 0.309059f, -0.131523f, -0.118334f, -0.063805f, + -0.104251f, 0.012166f, -0.094699f, -0.283753f, 0.128168f, -0.526929f, + -0.050331f, 0.186153f, 0.005913f, -0.221236f, 0.036363f, 0.160909f, + -0.001342f, -0.382749f, 0.037820f, 0.281689f, -0.024275f, 0.028854f, + 0.318291f, 0.318526f, 0.035778f, 0.034031f, 0.189663f, -0.293367f, + 0.082022f, 0.127923f, 0.078866f, -0.081361f, -0.268117f, 0.246675f, + 0.248605f, -0.215479f, -0.073084f, 0.496140f, -0.067327f, 0.396237f, + -0.120739f, 0.033752f, -0.044120f, -0.218941f, -0.028078f, 0.195132f, + -0.040400f, 0.281604f, -0.100471f, 0.415207f, -0.258503f, -0.429749f, + 
0.150569f, -0.010859f, 0.136448f, 0.026589f, 0.148466f, 0.110764f, + 0.380967f, 0.009177f, 0.103075f, 0.116417f, 0.226273f, -0.327746f, + 0.169346f, 0.284553f, -0.094986f, 0.312745f, -0.147840f, 0.025062f, + -0.494482f, 0.112388f, -0.213962f, 0.107050f, -0.433371f, -0.096276f, + -0.244835f, -0.003518f, -0.459148f, -0.145080f, 0.017150f, 0.042846f, + -0.237479f, 0.104746f, 0.158677f, 0.358937f, 0.099921f, 0.277109f, + 0.012410f, -0.062897f, 0.116130f, 0.255309f, 0.341628f, 0.145002f, + -0.429344f, -0.016433f, -0.068985f, 0.285194f, -0.286719f, -0.018298f, + -0.179369f, -0.194655f, -0.165380f, 0.026071f, -0.428268f, -0.379929f, + -0.727543f, 0.179610f, -0.963979f, -0.042026f, -0.616202f, 0.133401f, + -0.784966f, 0.061205f, -0.713357f, 0.129795f, 0.120512f, -0.339545f, + 0.353557f, 0.114906f, -0.329813f, -0.209987f, 0.085410f, 0.214313f, + -0.122082f, 0.335770f, -0.020937f, 0.202456f, 0.289023f, -0.421186f, + 0.337905f, 0.407663f, 0.132771f, 0.071734f, 0.213914f, 0.128595f, + 0.302659f, -0.209501f, 0.217756f, 0.253079f, -0.089505f, -0.205614f, +}; + +static const float av1_tx_split_nn_bias_64x64_layer0[32] = { + 0.296914f, -1.826816f, 0.346130f, 0.969520f, -0.528154f, 1.175862f, + -0.075985f, -0.097323f, -0.233059f, 0.004846f, 0.401279f, -2.272435f, + 0.086257f, 0.414162f, -0.194786f, -0.233887f, -0.113215f, -2.453546f, + 0.861214f, 0.298361f, 0.267397f, -0.158557f, -0.119911f, -0.098134f, + -0.339263f, 0.385871f, -0.678123f, 0.263218f, 0.251611f, -1.155773f, + -0.365437f, 0.229255f, +}; + +static const float av1_tx_split_nn_weights_64x64_layer1[32] = { + 0.502104f, -0.708023f, 0.419648f, 1.583418f, 0.419355f, -1.462981f, + -0.439623f, 0.405691f, 0.823257f, 0.061654f, 0.750875f, 0.775031f, + -0.387909f, 0.447385f, 0.284690f, 0.353262f, -0.224347f, 0.832864f, + -1.708491f, -1.042447f, -0.272829f, 0.540640f, 0.310509f, 0.723745f, + 0.245592f, -0.218417f, -0.597987f, -0.362301f, 0.702217f, -0.692614f, + 0.207812f, 0.513560f, +}; + +static const float av1_tx_split_nn_bias_64x64_layer1[1] = { -0.2307045f }; + +static const NN_CONFIG av1_tx_split_nnconfig_64x64 = { + 12, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_64x64_layer0, + av1_tx_split_nn_weights_64x64_layer1, + }, + { + av1_tx_split_nn_bias_64x64_layer0, + av1_tx_split_nn_bias_64x64_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 4x16 block. 
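A quick smoke test for any re-implementation of this forward pass: with an all-zero feature vector every hidden unit reduces to relu(bias), so the expected score is bias1[0] + sum_j w1[j] * max(bias0[j], 0), computable by hand from the 64x64 tables above. A hypothetical driver, assuming this header and av1/encoder/ml.c are built together:

#include <stdio.h>
#include "av1/encoder/ml.h"

int main(void) {
  const float x[12] = { 0.0f };  // av1_tx_split_nnconfig_64x64 takes 12 inputs
  float score;
  av1_nn_predict(x, &av1_tx_split_nnconfig_64x64, &score);
  printf("zero-input 64x64 split score: %f\n", score);
  return 0;
}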
+static const float av1_tx_split_nn_weights_4x16_layer0[8 * 16] = { + -1.344184f, -1.454625f, -0.703110f, -0.140570f, -0.841536f, -0.068131f, + -2.128968f, -0.655518f, 0.432180f, 0.879752f, -0.222211f, 0.061615f, + -0.230969f, 0.569496f, 1.424188f, 0.598063f, -0.436005f, -0.737606f, + -0.137875f, -0.085730f, -0.076512f, -0.583101f, -0.937377f, -0.203556f, + -0.215797f, -0.015361f, -0.124098f, -0.411917f, 0.340441f, -0.331752f, + -0.472607f, -0.097714f, -0.930572f, -1.354713f, -0.550724f, 0.176212f, + -0.636060f, 0.183271f, -0.610212f, 0.345895f, -1.100906f, -1.605713f, + 0.111888f, -0.140937f, 0.063013f, -0.013315f, -0.273472f, -0.255870f, + 1.200328f, 0.274002f, 1.005776f, 0.322392f, 1.222373f, 0.158227f, + 0.408810f, 0.145022f, 0.139842f, -1.249412f, 0.286672f, -0.635699f, + 0.312562f, -0.495606f, -1.117034f, -0.085107f, -0.097484f, -0.341521f, + -0.132199f, -0.863055f, 0.217579f, -1.161425f, -0.302087f, -1.357271f, + -0.520724f, -1.211069f, -1.048729f, -0.333087f, -1.171527f, -0.280824f, + -2.057684f, -0.228755f, 0.606278f, 0.101198f, -0.314847f, -1.303255f, + -0.294964f, 1.301923f, 0.041712f, 0.077593f, -1.152746f, 0.495315f, + -0.751566f, 0.230249f, -0.840661f, 0.100731f, 1.346269f, 0.649898f, + -1.432258f, -0.456710f, -1.018123f, -0.348559f, -1.225226f, -0.170717f, + -0.354072f, 0.068292f, -0.234168f, 0.277503f, 0.179134f, 0.907420f, + 0.354626f, -0.627210f, 0.905779f, 0.512612f, 0.161190f, -0.843177f, + 0.014953f, -0.354983f, 0.011116f, -0.429598f, -1.017138f, -0.211432f, + 0.941840f, -0.281747f, 0.957776f, -0.541914f, 1.041880f, -0.433580f, + -1.416451f, -0.166467f, +}; + +static const float av1_tx_split_nn_bias_4x16_layer0[16] = { + 3.086118f, -3.235095f, 4.830956f, -0.165706f, 0.955031f, 4.055783f, + -0.311489f, 4.660205f, -0.576277f, -0.248111f, -0.790519f, -1.686412f, + -1.191704f, -3.800073f, 4.121552f, -1.399397f, +}; + +static const float av1_tx_split_nn_weights_4x16_layer1[16] = { + -0.758677f, 0.388776f, 0.439906f, 0.011390f, -0.084319f, -0.667969f, + -0.467316f, -0.875491f, -0.160668f, 0.805292f, 0.114393f, -0.549682f, + 0.462109f, 0.343315f, 1.092593f, 0.483152f, +}; + +static const float av1_tx_split_nn_bias_4x16_layer1[1] = { + 0.8205083f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_4x16 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_4x16_layer0, + av1_tx_split_nn_weights_4x16_layer1, + }, + { + av1_tx_split_nn_bias_4x16_layer0, + av1_tx_split_nn_bias_4x16_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 16x32 block. 
+static const float av1_tx_split_nn_weights_16x32_layer0[8 * 32] = { + 0.180713f, 0.033211f, 0.607561f, 0.138642f, 0.637204f, -0.000940f, + 0.012630f, 0.358109f, 0.022238f, 0.190418f, 0.079088f, 0.065925f, + 0.038242f, 0.162380f, -0.122728f, 0.379382f, -0.303283f, -0.327550f, + 0.029120f, -0.284553f, 0.269588f, -0.309805f, -0.241036f, -0.161103f, + -0.304887f, 0.239843f, -0.149146f, 0.311234f, -0.073640f, -0.132718f, + 0.178901f, 0.474712f, 0.020280f, 0.063685f, -0.609170f, -0.013658f, + -0.338074f, 0.250429f, 0.082978f, -0.186315f, -0.788959f, 0.039859f, + -0.426461f, -0.001524f, -0.447211f, 0.378102f, 0.315617f, 0.017428f, + 0.745494f, -0.219024f, 0.512836f, 0.200522f, 0.680449f, 0.313686f, + -0.412569f, -0.132927f, 0.631120f, 0.042735f, 0.336153f, 0.044772f, + 0.432606f, 0.175681f, -0.634411f, -0.073509f, -0.040643f, -0.559260f, + -0.104034f, -0.570495f, -0.247365f, 0.063256f, -0.582021f, -0.492585f, + -0.194955f, -0.207934f, -0.506627f, 0.021743f, -0.416518f, 0.320876f, + 0.115889f, 0.149399f, -0.229376f, 0.095505f, 0.115191f, -0.471921f, + 0.113068f, 0.343684f, -0.036831f, 0.021240f, 0.295112f, 0.031166f, + 0.448201f, -0.132241f, 0.164032f, 0.355572f, 0.072154f, 0.017335f, + -0.046113f, 0.178719f, -0.026881f, -0.242590f, 0.055073f, -0.012958f, + 0.077904f, 0.351356f, 0.107655f, 0.260568f, -0.080052f, -0.197553f, + 0.085763f, 0.263416f, -0.327741f, 0.158855f, 0.056899f, -0.162121f, + 0.339518f, -0.571204f, 0.264966f, -0.252214f, -0.202560f, -0.134213f, + -0.330188f, 0.009470f, -0.468376f, -0.065240f, -0.307957f, 0.116479f, + -0.222238f, -0.458716f, 0.186493f, -0.391415f, 0.118649f, -0.104653f, + -0.259958f, -0.332081f, -0.403785f, -0.050147f, -0.573511f, 0.177117f, + -0.598358f, 0.164947f, -0.119694f, -0.058520f, 0.203829f, -0.267404f, + -0.048202f, -0.600006f, 0.181594f, -0.731805f, 0.146417f, -0.687148f, + -1.210525f, -0.450101f, -0.620635f, 0.208825f, -0.611357f, 0.112202f, + -0.309468f, -0.323545f, 0.357770f, 0.308061f, 0.553199f, 0.049012f, + 0.530093f, -0.208597f, 0.607882f, -0.058120f, -0.527634f, 0.018136f, + 0.060753f, 0.118894f, 0.175649f, 0.014731f, 0.428318f, -0.106465f, + -0.119077f, 0.080179f, 0.524997f, 0.368286f, 0.528286f, 0.213659f, + 0.639286f, 0.195079f, -0.049815f, -0.092008f, -0.302958f, 0.298149f, + -0.173870f, -0.145205f, -0.233589f, -0.303368f, 0.141275f, 0.325622f, + -0.115293f, 0.155188f, 0.047225f, 0.231050f, -0.167447f, 0.349754f, + 0.295544f, -0.319466f, 0.095144f, 0.174612f, -0.194652f, 0.305915f, + -0.239008f, -0.037453f, 0.280696f, 0.125850f, 0.749196f, -0.101919f, + 0.791808f, -0.236811f, 0.064157f, 0.032865f, -0.225911f, 0.350384f, + 0.723183f, -0.103992f, 0.483085f, -0.123992f, 0.602138f, 0.023895f, + -0.692601f, -0.118387f, 0.162527f, 0.145178f, -0.184702f, -0.017753f, + -0.159436f, 0.124105f, -0.131067f, 0.310275f, 0.151499f, 0.138924f, + 0.537459f, 0.263212f, 0.615896f, 0.281255f, 0.021293f, -0.473459f, + 0.210145f, -0.056682f, 0.063658f, 0.377254f, -0.314410f, -0.183487f, + 0.300384f, 0.328471f, 0.164694f, -0.159272f, -0.160942f, -0.502861f, + -0.129147f, 0.045916f, -0.606865f, -0.101378f, +}; + +static const float av1_tx_split_nn_bias_16x32_layer0[32] = { + 0.051664f, -0.212487f, -0.077596f, -0.818467f, 0.638475f, -0.759937f, + 0.157198f, 0.989640f, 1.586035f, 0.431144f, 0.041605f, 0.543085f, + 0.498379f, 0.320504f, 0.134233f, 0.670979f, -0.105562f, -1.574879f, + 1.261812f, -0.287530f, -1.610592f, 0.730899f, -0.894240f, -0.657790f, + 0.270806f, -0.181708f, 0.298578f, 0.817240f, -0.221508f, -0.201771f, + -0.294389f, 1.456413f, +}; + +static const 
float av1_tx_split_nn_weights_16x32_layer1[32] = { + 1.208914f, 0.324728f, 0.383352f, -0.874321f, 0.172565f, -0.580927f, + -0.432927f, 0.433698f, -0.801935f, 0.672028f, 0.563493f, 0.260077f, + -0.200557f, -0.121638f, 0.530735f, -0.525196f, 0.281799f, 0.624204f, + -0.662775f, -0.230887f, 0.980989f, 0.223437f, -0.790591f, 0.600724f, + -0.273445f, 0.427635f, -0.501641f, -0.878390f, 0.234731f, -0.172550f, + 0.418904f, 1.792187f, +}; + +static const float av1_tx_split_nn_bias_16x32_layer1[1] = { + -0.29233751f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_16x32 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_16x32_layer0, + av1_tx_split_nn_weights_16x32_layer1, + }, + { + av1_tx_split_nn_bias_16x32_layer0, + av1_tx_split_nn_bias_16x32_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 32x64 block. +static const float av1_tx_split_nn_weights_32x64_layer0[8 * 32] = { + 0.031614f, -0.110926f, 0.052418f, -0.702506f, 0.045708f, 0.238329f, + -0.021806f, -0.208128f, 0.509745f, -0.293891f, 0.277788f, 0.113937f, + 0.741576f, 0.062848f, 0.351878f, 0.212532f, 0.385842f, 0.081517f, + 0.398502f, -0.015156f, 0.242616f, 0.214619f, -0.182678f, -0.170546f, + 0.110605f, -0.236749f, -0.023831f, -0.285243f, 0.147156f, -0.257639f, + 0.341355f, -0.571641f, -0.721797f, 0.139588f, -0.518494f, -0.206526f, + -0.570560f, -0.184295f, 0.110271f, 0.210292f, -0.109132f, -0.001080f, + 0.129251f, -0.204230f, -0.396312f, -0.183024f, 0.421243f, -0.013154f, + 0.222627f, 0.169826f, 0.226037f, 0.218153f, -0.343528f, 0.274906f, + -0.156632f, 0.250261f, -0.484020f, 0.019909f, -0.349575f, -0.286643f, + -0.507396f, 0.202446f, -0.154110f, -0.292644f, 0.122666f, 0.306963f, + 0.424895f, 0.005579f, 0.494094f, -0.079551f, 0.473740f, 0.352414f, + -0.356917f, 0.264331f, -0.554487f, 0.119978f, 0.012291f, -0.141641f, + -0.254714f, -0.213723f, -0.116701f, -0.011267f, 0.190025f, -0.118501f, + 0.305151f, -0.316782f, -0.220801f, -0.308420f, -0.324285f, 0.421329f, + -0.177066f, -0.055114f, 0.229698f, -0.199523f, 0.054278f, 0.365020f, + -0.060586f, -0.300618f, 0.157563f, -0.064338f, -0.005711f, -0.176991f, + -0.424502f, -0.111914f, 0.092608f, 0.126621f, 0.078547f, 0.148008f, + 0.024221f, 0.124599f, 0.001343f, 0.059402f, 0.453753f, 0.047102f, + 0.242544f, 0.055735f, -0.067451f, -0.170061f, -0.170469f, -0.232173f, + 0.214908f, 0.248889f, 0.544348f, -0.084566f, 0.402478f, 0.298031f, + 0.099038f, -0.238019f, -0.475085f, -0.070042f, -0.754955f, -0.049095f, + -0.783801f, -0.099857f, -0.582008f, -0.055194f, -0.103655f, 0.143689f, + 0.100219f, 0.293934f, 0.099271f, -0.036320f, 0.356626f, -0.261445f, + 0.879544f, 0.000878f, 0.532920f, -0.093918f, 0.508867f, -0.040215f, + -0.789042f, -0.145380f, -0.090040f, -0.066636f, 0.015212f, 0.352989f, + -0.058831f, -0.164588f, 0.039890f, 0.122861f, 0.222508f, 0.061217f, + 0.466487f, 0.022666f, 0.423777f, -0.002200f, -0.656835f, -0.099760f, + -0.520606f, 0.303204f, -0.563620f, -0.160922f, -0.243203f, 0.313354f, + -0.336516f, -0.206764f, -0.236040f, 0.325899f, -0.418748f, 0.163205f, + -0.476242f, -0.121928f, 0.139178f, -0.157193f, -0.531766f, -0.180202f, + -0.485254f, 0.187703f, -0.440072f, 0.137854f, 0.029139f, 0.109530f, + -0.078475f, -0.360618f, -0.334672f, -0.350890f, -0.403976f, 0.180336f, + -0.304542f, 0.005123f, 0.413995f, 0.314639f, 0.342648f, -0.293264f, + 0.358135f, -0.180425f, -0.369530f, -0.048413f, 0.498366f, 0.121875f, + 0.270948f, -0.187966f, 
0.342503f, 0.174420f, -0.352105f, 0.088080f, + 0.008277f, 0.020275f, -0.002381f, 0.504389f, -0.018832f, -0.366047f, + -0.090947f, -0.168150f, 0.016184f, -0.328914f, 0.089579f, -0.017349f, + 0.005844f, -0.005010f, -1.857514f, -0.282426f, 0.010177f, -0.214727f, + -0.182529f, 0.156943f, -0.162032f, -0.472654f, 0.069432f, 0.016901f, + -0.767905f, 0.137129f, -0.411463f, 0.049056f, -0.431657f, -0.037641f, + 0.785500f, 0.046225f, 0.195831f, 0.245204f, 0.368614f, 0.212261f, + 0.440626f, -0.158048f, -0.461031f, -0.146280f, +}; + +static const float av1_tx_split_nn_bias_32x64_layer0[32] = { + 0.490777f, -1.894238f, 0.621333f, -0.076756f, 0.286298f, 0.286375f, + -0.126431f, -0.350034f, -1.017572f, 0.620125f, 0.408128f, 0.238756f, + -0.060728f, 0.210912f, 0.043124f, 0.445649f, 0.907025f, 0.360272f, + 1.083101f, -0.068952f, 1.062348f, 0.396354f, 0.280075f, 0.501732f, + 0.328422f, 0.066241f, 0.474697f, 0.126313f, 0.741206f, 0.314796f, + 0.552712f, 0.299410f, +}; + +static const float av1_tx_split_nn_weights_32x64_layer1[32] = { + 1.033823f, 0.603439f, 0.304591f, -0.279940f, -0.780909f, -0.132801f, + 0.154059f, 0.662014f, -0.718368f, 0.198733f, 0.039766f, -0.208516f, + -0.104909f, -0.394209f, 0.081617f, 0.365041f, -0.874960f, -0.063315f, + -1.189897f, 0.337225f, 0.410893f, 0.307519f, 0.221323f, 0.233895f, + 0.469536f, 0.438557f, 0.280144f, 0.422423f, -1.394513f, 0.781900f, + 0.352981f, 0.111265f, +}; + +static const float av1_tx_split_nn_bias_32x64_layer1[1] = { + -0.18160765f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_32x64 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_32x64_layer0, + av1_tx_split_nn_weights_32x64_layer1, + }, + { + av1_tx_split_nn_bias_32x64_layer0, + av1_tx_split_nn_bias_32x64_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 8x32 block. 
+static const float av1_tx_split_nn_weights_8x32_layer0[8 * 24] = { + -0.687846f, 0.121404f, -0.372905f, 0.126770f, -0.103298f, -0.101650f, + -0.148490f, -0.271740f, 0.682915f, -0.079765f, 0.634347f, -0.151503f, + 0.287692f, -0.079072f, -0.236948f, 0.065064f, 0.713383f, 0.397123f, + 0.553621f, 0.368529f, 0.767663f, -0.046601f, -0.392402f, -0.294822f, + -0.292325f, -0.010573f, -0.837945f, 0.050113f, -0.811360f, 0.199162f, + 0.150832f, 0.011602f, 0.369694f, -0.225876f, 0.234113f, -0.269808f, + 0.303805f, -0.190281f, -0.451136f, 0.209755f, -0.308894f, 0.326956f, + 0.313591f, 0.089923f, -0.095754f, 0.390981f, 0.467366f, 0.169670f, + 0.853322f, 0.054055f, 0.830319f, -0.121918f, 0.262019f, -0.093526f, + 0.385558f, 0.419174f, 0.040198f, -0.347030f, -0.450492f, -0.106764f, + 0.487502f, -0.204188f, 0.430374f, -0.116388f, 0.236407f, -0.157376f, + 0.732294f, -0.651387f, 0.347446f, 0.342575f, 0.048406f, 0.187657f, + 0.434899f, -0.447782f, 0.032728f, -0.071168f, -0.255327f, 0.104174f, + 0.095689f, -0.431743f, 0.725694f, 0.031797f, 0.523171f, 0.061801f, + 0.469804f, -0.071068f, -0.059024f, -0.211937f, 0.392134f, -0.321490f, + 0.366060f, -0.427798f, 0.166771f, 0.299652f, 0.044660f, 0.205142f, + 0.039133f, -0.051835f, -0.465475f, 0.216976f, -0.341156f, 0.095358f, + 0.230807f, 0.201674f, 0.279266f, -0.713534f, -0.091690f, -0.569708f, + -0.119001f, 0.252160f, -1.544578f, -0.284477f, 0.555348f, 0.226471f, + 0.347690f, 0.034365f, 0.770835f, -0.241859f, -0.130241f, 0.292936f, + 0.396622f, -0.417916f, 0.492224f, 0.125517f, 0.344824f, 0.232172f, + -0.432106f, -0.278745f, 0.035069f, -0.307247f, -0.120760f, 0.170950f, + 0.433601f, 0.044286f, 0.141463f, -0.041382f, 0.529346f, 0.010868f, + -0.323674f, 0.185205f, 0.623459f, 0.232842f, -0.406693f, -0.142944f, + 0.222988f, 0.343634f, 0.065401f, 0.002621f, 0.805335f, -0.426926f, + 0.279181f, 0.131364f, 0.192339f, -0.402391f, 0.544120f, -0.060618f, + 0.467780f, 0.165224f, -0.373131f, 0.002427f, 0.688064f, 0.322317f, + 0.259713f, 0.130583f, 0.185032f, -0.189111f, -0.067821f, 0.010875f, + 0.644724f, -0.179291f, 0.463222f, 0.155230f, 0.721384f, -0.046019f, + 0.438501f, 0.440027f, -0.462090f, -0.002039f, -0.468026f, -0.008890f, + -0.328530f, 0.370102f, 0.482531f, 0.043471f, -0.469732f, -0.532663f, + 0.122081f, -0.379659f, 0.037219f, -0.519913f, -0.128975f, -0.404365f, +}; + +static const float av1_tx_split_nn_bias_8x32_layer0[24] = { + -1.198965f, 0.395204f, -0.408627f, -0.021654f, -0.658355f, 0.154525f, + -0.288354f, 1.207574f, 0.411608f, 0.964678f, -1.176893f, 1.059006f, + -0.472969f, 2.087975f, 1.065536f, 0.595569f, 0.197907f, -0.349938f, + 1.013651f, -0.931093f, -0.973595f, -0.459094f, -1.253062f, 1.624782f, +}; + +static const float av1_tx_split_nn_weights_8x32_layer1[24] = { + 0.815787f, -0.393465f, -0.483427f, -0.565592f, 0.493494f, 0.430229f, + -0.507073f, -0.251379f, -0.353418f, -0.495445f, 0.820029f, 0.649146f, + -0.487383f, 1.844503f, 0.480324f, -0.982705f, -0.501446f, -0.220584f, + 0.334299f, 0.802238f, 0.805838f, -0.487848f, 0.300772f, -1.232857f, +}; + +static const float av1_tx_split_nn_bias_8x32_layer1[1] = { + 0.13435879f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_8x32 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 24, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_8x32_layer0, + av1_tx_split_nn_weights_8x32_layer1, + }, + { + av1_tx_split_nn_bias_8x32_layer0, + av1_tx_split_nn_bias_8x32_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 
16x64 block. +static const float av1_tx_split_nn_weights_16x64_layer0[8 * 16] = { + -0.378223f, -0.124216f, -0.514089f, -0.110117f, -0.585801f, -0.094838f, + -0.455385f, -0.220254f, -0.504568f, -0.082351f, -0.476420f, -0.253993f, + -0.454709f, -0.059461f, 0.210313f, -0.155683f, 0.192968f, -0.127804f, + 0.471996f, 0.253377f, 0.472625f, 0.485322f, 0.150560f, 0.164868f, + -0.475587f, 0.447559f, -0.455759f, -0.306665f, -0.194866f, -0.283716f, + -0.243897f, 0.293020f, -0.308298f, -0.191904f, -0.468568f, 0.014053f, + -0.618848f, 0.096273f, -0.444586f, 0.347750f, -0.280643f, -0.062872f, + 0.118661f, 0.540099f, 0.104141f, -0.279300f, -0.098721f, -0.173427f, + -0.984558f, -0.424559f, -0.411928f, -0.120875f, -0.488999f, -0.050716f, + -0.523103f, 0.093620f, -0.930396f, -0.431997f, -1.163297f, 0.190384f, + -0.422581f, -0.005354f, 0.450552f, 0.369210f, 0.562484f, 0.679922f, + 0.282099f, -0.039075f, 0.404196f, 0.006371f, 0.069679f, -0.196160f, + -0.213675f, 0.275187f, -0.104235f, -0.193090f, 0.003116f, -0.252454f, + -0.094591f, 0.210439f, -0.137070f, 0.145043f, 0.024558f, 0.121718f, + 0.010138f, 0.301651f, -0.377990f, 0.444414f, 0.001845f, -0.095334f, + 0.550259f, 0.087603f, 0.792492f, -0.044584f, 0.641706f, -0.328458f, + -0.447791f, 0.135376f, 0.356385f, 0.135748f, 0.310370f, 0.293757f, + -0.062000f, -0.056368f, 0.343930f, 0.312039f, 0.370763f, 0.452381f, + -0.023630f, -0.185909f, 0.422277f, -0.006306f, 0.045166f, 0.423359f, + -0.157735f, -0.084901f, 0.219527f, -0.209510f, 0.575057f, 0.249276f, + 0.069267f, 0.233898f, -0.229392f, 0.117197f, -0.038551f, 0.293976f, + 0.101996f, 0.120878f, +}; + +static const float av1_tx_split_nn_bias_16x64_layer0[16] = { + 1.036995f, 0.160249f, 0.100264f, 0.694881f, 0.694677f, 0.128379f, + -0.843405f, -0.405515f, 0.104139f, 0.182980f, -0.025472f, 0.901067f, + -0.299866f, -0.103079f, -0.190352f, -0.048121f, +}; + +static const float av1_tx_split_nn_weights_16x64_layer1[16] = { + -1.778868f, 0.174690f, 0.211991f, 0.712138f, 0.589352f, 0.466652f, + 1.029146f, -0.490044f, 0.483015f, 0.600215f, -0.577776f, -0.755546f, + 0.348337f, -0.205082f, 0.347129f, -0.322277f, +}; + +static const float av1_tx_split_nn_bias_16x64_layer1[1] = { + 0.04230947f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_16x64 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_16x64_layer0, + av1_tx_split_nn_weights_16x64_layer1, + }, + { + av1_tx_split_nn_bias_16x64_layer0, + av1_tx_split_nn_bias_16x64_layer1, + }, +}; +/******************************************************************************/ + +// Map transform size to its corresponding neural net model for tx split prediction. 
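The map that follows routes each entry of TX_SIZES_ALL to one of the configs above; each rectangular size shares a model with its transpose, and TX_4X4 gets NULL because a 4x4 transform cannot be split further. As a scalar sketch of what evaluating one of these one-hidden-layer models amounts to — the node-major weight layout and the ReLU hidden activation are assumptions here, to be checked against the real evaluator (av1_nn_predict() in av1/encoder/ml.c) rather than taken from this header:

/* Scalar sketch of a one-hidden-layer forward pass over NN_CONFIG-style
 * tables. Assumed (not stated in this header): layer-0 weights are stored
 * node-major (num_inputs consecutive floats per hidden node), the hidden
 * layer uses ReLU, and the single output is linear. */
static float tx_split_nn_score(const float *features, int num_inputs,
                               const float *w0, const float *b0,
                               int num_hidden, const float *w1, float b1) {
  float score = b1;
  for (int h = 0; h < num_hidden; ++h) {
    float acc = b0[h];
    for (int i = 0; i < num_inputs; ++i)
      acc += w0[h * num_inputs + i] * features[i];
    if (acc < 0.0f) acc = 0.0f; /* ReLU (assumed) */
    score += w1[h] * acc;
  }
  return score; /* caller thresholds this to pick split vs. no-split */
}

For the 64x64 tables this would run with num_inputs = 12 and num_hidden = 32, matching av1_tx_split_nnconfig_64x64; every other model in this header takes 8 inputs.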
+static const NN_CONFIG *av1_tx_split_nnconfig_map[TX_SIZES_ALL] = { + NULL, // TX_4X4, + &av1_tx_split_nnconfig_8x8, // TX_8X8, + &av1_tx_split_nnconfig_16x16, // TX_16X16, + &av1_tx_split_nnconfig_32x32, // TX_32X32, + &av1_tx_split_nnconfig_64x64, // TX_64X64, + &av1_tx_split_nnconfig_4x8, // TX_4X8, + &av1_tx_split_nnconfig_4x8, // TX_8X4, + &av1_tx_split_nnconfig_8x16, // TX_8X16, + &av1_tx_split_nnconfig_8x16, // TX_16X8, + &av1_tx_split_nnconfig_16x32, // TX_16X32, + &av1_tx_split_nnconfig_16x32, // TX_32X16, + &av1_tx_split_nnconfig_32x64, // TX_32X64, + &av1_tx_split_nnconfig_32x64, // TX_64X32, + &av1_tx_split_nnconfig_4x16, // TX_4X16, + &av1_tx_split_nnconfig_4x16, // TX_16X4, + &av1_tx_split_nnconfig_8x32, // TX_8X32, + &av1_tx_split_nnconfig_8x32, // TX_32X8, + &av1_tx_split_nnconfig_16x64, // TX_16X64, + &av1_tx_split_nnconfig_16x64, // TX_64X16, +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c new file mode 100644 index 000000000..84065d6de --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c @@ -0,0 +1,1205 @@ +#include "av1/encoder/x86/av1_txfm1d_sse4.h" + +void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, + int8_t cos_bit) { + __m128i buf0[32]; + __m128i buf1[32]; + const int32_t *cospi; + // stage 0 + // stage 1 + buf1[0] = _mm_add_epi32(input[0], input[31]); + buf1[31] = _mm_sub_epi32(input[0], input[31]); + buf1[1] = _mm_add_epi32(input[1], input[30]); + buf1[30] = _mm_sub_epi32(input[1], input[30]); + buf1[2] = _mm_add_epi32(input[2], input[29]); + buf1[29] = _mm_sub_epi32(input[2], input[29]); + buf1[3] = _mm_add_epi32(input[3], input[28]); + buf1[28] = _mm_sub_epi32(input[3], input[28]); + buf1[4] = _mm_add_epi32(input[4], input[27]); + buf1[27] = _mm_sub_epi32(input[4], input[27]); + buf1[5] = _mm_add_epi32(input[5], input[26]); + buf1[26] = _mm_sub_epi32(input[5], input[26]); + buf1[6] = _mm_add_epi32(input[6], input[25]); + buf1[25] = _mm_sub_epi32(input[6], input[25]); + buf1[7] = _mm_add_epi32(input[7], input[24]); + buf1[24] = _mm_sub_epi32(input[7], input[24]); + buf1[8] = _mm_add_epi32(input[8], input[23]); + buf1[23] = _mm_sub_epi32(input[8], input[23]); + buf1[9] = _mm_add_epi32(input[9], input[22]); + buf1[22] = _mm_sub_epi32(input[9], input[22]); + buf1[10] = _mm_add_epi32(input[10], input[21]); + buf1[21] = _mm_sub_epi32(input[10], input[21]); + buf1[11] = _mm_add_epi32(input[11], input[20]); + buf1[20] = _mm_sub_epi32(input[11], input[20]); + buf1[12] = _mm_add_epi32(input[12], input[19]); + buf1[19] = _mm_sub_epi32(input[12], input[19]); + buf1[13] = _mm_add_epi32(input[13], input[18]); + buf1[18] = _mm_sub_epi32(input[13], input[18]); + buf1[14] = _mm_add_epi32(input[14], input[17]); + buf1[17] = _mm_sub_epi32(input[14], input[17]); + buf1[15] = _mm_add_epi32(input[15], input[16]); + buf1[16] = _mm_sub_epi32(input[15], input[16]); + + // stage 2 + cospi = cospi_arr(cos_bit); + buf0[0] = _mm_add_epi32(buf1[0], buf1[15]); + buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]); + buf0[1] = _mm_add_epi32(buf1[1], buf1[14]); + buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]); + buf0[2] = _mm_add_epi32(buf1[2], buf1[13]); + buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]); + buf0[3] = _mm_add_epi32(buf1[3], buf1[12]); + buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]); + buf0[4] = _mm_add_epi32(buf1[4], buf1[11]); + buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]); + buf0[5] 
= _mm_add_epi32(buf1[5], buf1[10]); + buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]); + buf0[6] = _mm_add_epi32(buf1[6], buf1[9]); + buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]); + buf0[7] = _mm_add_epi32(buf1[7], buf1[8]); + buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + buf0[18] = buf1[18]; + buf0[19] = buf1[19]; + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23], + buf0[24], cos_bit); + buf0[28] = buf1[28]; + buf0[29] = buf1[29]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 3 + cospi = cospi_arr(cos_bit); + buf1[0] = _mm_add_epi32(buf0[0], buf0[7]); + buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]); + buf1[1] = _mm_add_epi32(buf0[1], buf0[6]); + buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]); + buf1[2] = _mm_add_epi32(buf0[2], buf0[5]); + buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]); + buf1[3] = _mm_add_epi32(buf0[3], buf0[4]); + buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]); + buf1[8] = buf0[8]; + buf1[9] = buf0[9]; + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11], + buf1[12], cos_bit); + buf1[14] = buf0[14]; + buf1[15] = buf0[15]; + buf1[16] = _mm_add_epi32(buf0[16], buf0[23]); + buf1[23] = _mm_sub_epi32(buf0[16], buf0[23]); + buf1[17] = _mm_add_epi32(buf0[17], buf0[22]); + buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]); + buf1[18] = _mm_add_epi32(buf0[18], buf0[21]); + buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]); + buf1[19] = _mm_add_epi32(buf0[19], buf0[20]); + buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]); + buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]); + buf1[31] = _mm_add_epi32(buf0[31], buf0[24]); + buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]); + buf1[30] = _mm_add_epi32(buf0[30], buf0[25]); + buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]); + buf1[29] = _mm_add_epi32(buf0[29], buf0[26]); + buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]); + buf1[28] = _mm_add_epi32(buf0[28], buf0[27]); + + // stage 4 + cospi = cospi_arr(cos_bit); + buf0[0] = _mm_add_epi32(buf1[0], buf1[3]); + buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]); + buf0[1] = _mm_add_epi32(buf1[1], buf1[2]); + buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]); + buf0[4] = buf1[4]; + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6], + cos_bit); + buf0[7] = buf1[7]; + buf0[8] = _mm_add_epi32(buf1[8], buf1[11]); + buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]); + buf0[9] = _mm_add_epi32(buf1[9], buf1[10]); + buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]); + buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]); + buf0[15] = _mm_add_epi32(buf1[15], buf1[12]); + buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]); + buf0[14] = _mm_add_epi32(buf1[14], buf1[13]); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19], + buf0[28], cos_bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + buf0[22] = buf1[22]; + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[25] = 
buf1[25]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 5 + cospi = cospi_arr(cos_bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1], + cos_bit); + btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3], + cos_bit); + buf1[4] = _mm_add_epi32(buf0[4], buf0[5]); + buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]); + buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]); + buf1[7] = _mm_add_epi32(buf0[7], buf0[6]); + buf1[8] = buf0[8]; + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], + buf1[14], cos_bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + buf1[11] = buf0[11]; + buf1[12] = buf0[12]; + buf1[15] = buf0[15]; + buf1[16] = _mm_add_epi32(buf0[16], buf0[19]); + buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]); + buf1[17] = _mm_add_epi32(buf0[17], buf0[18]); + buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]); + buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]); + buf1[23] = _mm_add_epi32(buf0[23], buf0[20]); + buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]); + buf1[22] = _mm_add_epi32(buf0[22], buf0[21]); + buf1[24] = _mm_add_epi32(buf0[24], buf0[27]); + buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]); + buf1[25] = _mm_add_epi32(buf0[25], buf0[26]); + buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]); + buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]); + buf1[31] = _mm_add_epi32(buf0[31], buf0[28]); + buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]); + buf1[30] = _mm_add_epi32(buf0[30], buf0[29]); + + // stage 6 + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7], + cos_bit); + btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6], + cos_bit); + buf0[8] = _mm_add_epi32(buf1[8], buf1[9]); + buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]); + buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]); + buf0[11] = _mm_add_epi32(buf1[11], buf1[10]); + buf0[12] = _mm_add_epi32(buf1[12], buf1[13]); + buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]); + buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]); + buf0[15] = _mm_add_epi32(buf1[15], buf1[14]); + buf0[16] = buf1[16]; + btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17], + buf0[30], cos_bit); + btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + buf0[19] = buf1[19]; + buf0[20] = buf1[20]; + btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[27] = buf1[27]; + buf0[28] = buf1[28]; + buf0[31] = buf1[31]; + + // stage 7 + cospi = cospi_arr(cos_bit); + buf1[0] = buf0[0]; + buf1[1] = buf0[1]; + buf1[2] = buf0[2]; + buf1[3] = buf0[3]; + buf1[4] = buf0[4]; + buf1[5] = buf0[5]; + buf1[6] = buf0[6]; + buf1[7] = buf0[7]; + btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15], + cos_bit); + btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9], + buf1[14], cos_bit); + btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11], + buf1[12], cos_bit); + buf1[16] = _mm_add_epi32(buf0[16], buf0[17]); + buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]); + buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]); + buf1[19] = _mm_add_epi32(buf0[19], 
buf0[18]); + buf1[20] = _mm_add_epi32(buf0[20], buf0[21]); + buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]); + buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]); + buf1[23] = _mm_add_epi32(buf0[23], buf0[22]); + buf1[24] = _mm_add_epi32(buf0[24], buf0[25]); + buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]); + buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]); + buf1[27] = _mm_add_epi32(buf0[27], buf0[26]); + buf1[28] = _mm_add_epi32(buf0[28], buf0[29]); + buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]); + buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]); + buf1[31] = _mm_add_epi32(buf0[31], buf0[30]); + + // stage 8 + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + buf0[4] = buf1[4]; + buf0[5] = buf1[5]; + buf0[6] = buf1[6]; + buf0[7] = buf1[7]; + buf0[8] = buf1[8]; + buf0[9] = buf1[9]; + buf0[10] = buf1[10]; + buf0[11] = buf1[11]; + buf0[12] = buf1[12]; + buf0[13] = buf1[13]; + buf0[14] = buf1[14]; + buf0[15] = buf1[15]; + btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16], + buf0[31], cos_bit); + btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17], + buf0[30], cos_bit); + btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19], + buf0[28], cos_bit); + btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23], + buf0[24], cos_bit); + + // stage 9 + output[0] = buf0[0]; + output[1] = buf0[16]; + output[2] = buf0[8]; + output[3] = buf0[24]; + output[4] = buf0[4]; + output[5] = buf0[20]; + output[6] = buf0[12]; + output[7] = buf0[28]; + output[8] = buf0[2]; + output[9] = buf0[18]; + output[10] = buf0[10]; + output[11] = buf0[26]; + output[12] = buf0[6]; + output[13] = buf0[22]; + output[14] = buf0[14]; + output[15] = buf0[30]; + output[16] = buf0[1]; + output[17] = buf0[17]; + output[18] = buf0[9]; + output[19] = buf0[25]; + output[20] = buf0[5]; + output[21] = buf0[21]; + output[22] = buf0[13]; + output[23] = buf0[29]; + output[24] = buf0[3]; + output[25] = buf0[19]; + output[26] = buf0[11]; + output[27] = buf0[27]; + output[28] = buf0[7]; + output[29] = buf0[23]; + output[30] = buf0[15]; + output[31] = buf0[31]; +} + +void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range) { + const int txfm_size = 4; + const int num_per_128 = 4; + const int32_t *cospi; + __m128i buf0[4]; + __m128i buf1[4]; + int col_num = txfm_size / num_per_128; + int col; + (void)stage_range; + for (col = 0; col < col_num; col++) { + // stage 0; + int32_t stage_idx = 0; + int j; + for (j = 0; j < 4; ++j) { + buf0[j] = input[j * col_num + col]; + } + + // stage 1 + stage_idx++; + buf1[0] = buf0[3]; + buf1[1] = buf0[0]; + buf1[2] = buf0[1]; + buf1[3] = buf0[2]; + + // stage 2 + stage_idx++; + + cospi = cospi_arr(cos_bit); + btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1], + cos_bit); + btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2], + buf0[3], cos_bit); + + // stage 3 + stage_idx++; + buf1[0] = _mm_add_epi32(buf0[0], buf0[2]); + buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]); + buf1[1] = _mm_add_epi32(buf0[1], buf0[3]); + buf1[3] = 
_mm_sub_epi32(buf0[1], buf0[3]); + + // stage 4 + stage_idx++; + + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], + buf0[3], cos_bit); + + // stage 5 + stage_idx++; + buf1[0] = buf0[0]; + buf1[1] = _mm_sub_epi32(_mm_setzero_si128(), buf0[2]); + buf1[2] = buf0[3]; + buf1[3] = _mm_sub_epi32(_mm_setzero_si128(), buf0[1]); + + for (j = 0; j < 4; ++j) { + output[j * col_num + col] = buf1[j]; + } + } +} + +void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32 = _mm_set1_epi32(-cospi[32]); + __m128i cospi_p32 = _mm_set1_epi32(cospi[32]); + __m128i cospi_m16 = _mm_set1_epi32(-cospi[16]); + __m128i cospi_p48 = _mm_set1_epi32(cospi[48]); + __m128i cospi_m48 = _mm_set1_epi32(-cospi[48]); + __m128i cospi_p16 = _mm_set1_epi32(cospi[16]); + __m128i cospi_m08 = _mm_set1_epi32(-cospi[8]); + __m128i cospi_p56 = _mm_set1_epi32(cospi[56]); + __m128i cospi_m56 = _mm_set1_epi32(-cospi[56]); + __m128i cospi_m40 = _mm_set1_epi32(-cospi[40]); + __m128i cospi_p24 = _mm_set1_epi32(cospi[24]); + __m128i cospi_m24 = _mm_set1_epi32(-cospi[24]); + __m128i cospi_p08 = _mm_set1_epi32(cospi[8]); + __m128i cospi_p40 = _mm_set1_epi32(cospi[40]); + __m128i cospi_p60 = _mm_set1_epi32(cospi[60]); + __m128i cospi_p04 = _mm_set1_epi32(cospi[4]); + __m128i cospi_p28 = _mm_set1_epi32(cospi[28]); + __m128i cospi_p36 = _mm_set1_epi32(cospi[36]); + __m128i cospi_p44 = _mm_set1_epi32(cospi[44]); + __m128i cospi_p20 = _mm_set1_epi32(cospi[20]); + __m128i cospi_p12 = _mm_set1_epi32(cospi[12]); + __m128i cospi_p52 = _mm_set1_epi32(cospi[52]); + __m128i cospi_m04 = _mm_set1_epi32(-cospi[4]); + __m128i cospi_m60 = _mm_set1_epi32(-cospi[60]); + __m128i cospi_m36 = _mm_set1_epi32(-cospi[36]); + __m128i cospi_m28 = _mm_set1_epi32(-cospi[28]); + __m128i cospi_m20 = _mm_set1_epi32(-cospi[20]); + __m128i cospi_m44 = _mm_set1_epi32(-cospi[44]); + __m128i cospi_m52 = _mm_set1_epi32(-cospi[52]); + __m128i cospi_m12 = _mm_set1_epi32(-cospi[12]); + __m128i cospi_p62 = _mm_set1_epi32(cospi[62]); + __m128i cospi_p02 = _mm_set1_epi32(cospi[2]); + __m128i cospi_p30 = _mm_set1_epi32(cospi[30]); + __m128i cospi_p34 = _mm_set1_epi32(cospi[34]); + __m128i cospi_p46 = _mm_set1_epi32(cospi[46]); + __m128i cospi_p18 = _mm_set1_epi32(cospi[18]); + __m128i cospi_p14 = _mm_set1_epi32(cospi[14]); + __m128i cospi_p50 = _mm_set1_epi32(cospi[50]); + __m128i cospi_p54 = _mm_set1_epi32(cospi[54]); + __m128i cospi_p10 = _mm_set1_epi32(cospi[10]); + __m128i cospi_p22 = _mm_set1_epi32(cospi[22]); + __m128i cospi_p42 = _mm_set1_epi32(cospi[42]); + __m128i cospi_p38 = _mm_set1_epi32(cospi[38]); + __m128i cospi_p26 = _mm_set1_epi32(cospi[26]); + __m128i cospi_p06 = _mm_set1_epi32(cospi[6]); + __m128i cospi_p58 = _mm_set1_epi32(cospi[58]); + __m128i cospi_p63 = _mm_set1_epi32(cospi[63]); + __m128i cospi_p01 = _mm_set1_epi32(cospi[1]); + __m128i cospi_p31 = _mm_set1_epi32(cospi[31]); + __m128i cospi_p33 = _mm_set1_epi32(cospi[33]); + __m128i cospi_p47 = _mm_set1_epi32(cospi[47]); + __m128i cospi_p17 = _mm_set1_epi32(cospi[17]); + __m128i cospi_p15 = _mm_set1_epi32(cospi[15]); + __m128i cospi_p49 = _mm_set1_epi32(cospi[49]); + __m128i cospi_p55 = _mm_set1_epi32(cospi[55]); + __m128i cospi_p09 = _mm_set1_epi32(cospi[9]); + __m128i cospi_p23 = _mm_set1_epi32(cospi[23]); + __m128i cospi_p41 = _mm_set1_epi32(cospi[41]); + __m128i 
cospi_p39 = _mm_set1_epi32(cospi[39]); + __m128i cospi_p25 = _mm_set1_epi32(cospi[25]); + __m128i cospi_p07 = _mm_set1_epi32(cospi[7]); + __m128i cospi_p57 = _mm_set1_epi32(cospi[57]); + __m128i cospi_p59 = _mm_set1_epi32(cospi[59]); + __m128i cospi_p05 = _mm_set1_epi32(cospi[5]); + __m128i cospi_p27 = _mm_set1_epi32(cospi[27]); + __m128i cospi_p37 = _mm_set1_epi32(cospi[37]); + __m128i cospi_p43 = _mm_set1_epi32(cospi[43]); + __m128i cospi_p21 = _mm_set1_epi32(cospi[21]); + __m128i cospi_p11 = _mm_set1_epi32(cospi[11]); + __m128i cospi_p53 = _mm_set1_epi32(cospi[53]); + __m128i cospi_p51 = _mm_set1_epi32(cospi[51]); + __m128i cospi_p13 = _mm_set1_epi32(cospi[13]); + __m128i cospi_p19 = _mm_set1_epi32(cospi[19]); + __m128i cospi_p45 = _mm_set1_epi32(cospi[45]); + __m128i cospi_p35 = _mm_set1_epi32(cospi[35]); + __m128i cospi_p29 = _mm_set1_epi32(cospi[29]); + __m128i cospi_p03 = _mm_set1_epi32(cospi[3]); + __m128i cospi_p61 = _mm_set1_epi32(cospi[61]); + + // stage 1 + __m128i x1[64]; + x1[0] = _mm_add_epi32(input[0], input[63]); + x1[63] = _mm_sub_epi32(input[0], input[63]); + x1[1] = _mm_add_epi32(input[1], input[62]); + x1[62] = _mm_sub_epi32(input[1], input[62]); + x1[2] = _mm_add_epi32(input[2], input[61]); + x1[61] = _mm_sub_epi32(input[2], input[61]); + x1[3] = _mm_add_epi32(input[3], input[60]); + x1[60] = _mm_sub_epi32(input[3], input[60]); + x1[4] = _mm_add_epi32(input[4], input[59]); + x1[59] = _mm_sub_epi32(input[4], input[59]); + x1[5] = _mm_add_epi32(input[5], input[58]); + x1[58] = _mm_sub_epi32(input[5], input[58]); + x1[6] = _mm_add_epi32(input[6], input[57]); + x1[57] = _mm_sub_epi32(input[6], input[57]); + x1[7] = _mm_add_epi32(input[7], input[56]); + x1[56] = _mm_sub_epi32(input[7], input[56]); + x1[8] = _mm_add_epi32(input[8], input[55]); + x1[55] = _mm_sub_epi32(input[8], input[55]); + x1[9] = _mm_add_epi32(input[9], input[54]); + x1[54] = _mm_sub_epi32(input[9], input[54]); + x1[10] = _mm_add_epi32(input[10], input[53]); + x1[53] = _mm_sub_epi32(input[10], input[53]); + x1[11] = _mm_add_epi32(input[11], input[52]); + x1[52] = _mm_sub_epi32(input[11], input[52]); + x1[12] = _mm_add_epi32(input[12], input[51]); + x1[51] = _mm_sub_epi32(input[12], input[51]); + x1[13] = _mm_add_epi32(input[13], input[50]); + x1[50] = _mm_sub_epi32(input[13], input[50]); + x1[14] = _mm_add_epi32(input[14], input[49]); + x1[49] = _mm_sub_epi32(input[14], input[49]); + x1[15] = _mm_add_epi32(input[15], input[48]); + x1[48] = _mm_sub_epi32(input[15], input[48]); + x1[16] = _mm_add_epi32(input[16], input[47]); + x1[47] = _mm_sub_epi32(input[16], input[47]); + x1[17] = _mm_add_epi32(input[17], input[46]); + x1[46] = _mm_sub_epi32(input[17], input[46]); + x1[18] = _mm_add_epi32(input[18], input[45]); + x1[45] = _mm_sub_epi32(input[18], input[45]); + x1[19] = _mm_add_epi32(input[19], input[44]); + x1[44] = _mm_sub_epi32(input[19], input[44]); + x1[20] = _mm_add_epi32(input[20], input[43]); + x1[43] = _mm_sub_epi32(input[20], input[43]); + x1[21] = _mm_add_epi32(input[21], input[42]); + x1[42] = _mm_sub_epi32(input[21], input[42]); + x1[22] = _mm_add_epi32(input[22], input[41]); + x1[41] = _mm_sub_epi32(input[22], input[41]); + x1[23] = _mm_add_epi32(input[23], input[40]); + x1[40] = _mm_sub_epi32(input[23], input[40]); + x1[24] = _mm_add_epi32(input[24], input[39]); + x1[39] = _mm_sub_epi32(input[24], input[39]); + x1[25] = _mm_add_epi32(input[25], input[38]); + x1[38] = _mm_sub_epi32(input[25], input[38]); + x1[26] = _mm_add_epi32(input[26], input[37]); + x1[37] = _mm_sub_epi32(input[26], 
input[37]); + x1[27] = _mm_add_epi32(input[27], input[36]); + x1[36] = _mm_sub_epi32(input[27], input[36]); + x1[28] = _mm_add_epi32(input[28], input[35]); + x1[35] = _mm_sub_epi32(input[28], input[35]); + x1[29] = _mm_add_epi32(input[29], input[34]); + x1[34] = _mm_sub_epi32(input[29], input[34]); + x1[30] = _mm_add_epi32(input[30], input[33]); + x1[33] = _mm_sub_epi32(input[30], input[33]); + x1[31] = _mm_add_epi32(input[31], input[32]); + x1[32] = _mm_sub_epi32(input[31], input[32]); + + // stage 2 + __m128i x2[64]; + x2[0] = _mm_add_epi32(x1[0], x1[31]); + x2[31] = _mm_sub_epi32(x1[0], x1[31]); + x2[1] = _mm_add_epi32(x1[1], x1[30]); + x2[30] = _mm_sub_epi32(x1[1], x1[30]); + x2[2] = _mm_add_epi32(x1[2], x1[29]); + x2[29] = _mm_sub_epi32(x1[2], x1[29]); + x2[3] = _mm_add_epi32(x1[3], x1[28]); + x2[28] = _mm_sub_epi32(x1[3], x1[28]); + x2[4] = _mm_add_epi32(x1[4], x1[27]); + x2[27] = _mm_sub_epi32(x1[4], x1[27]); + x2[5] = _mm_add_epi32(x1[5], x1[26]); + x2[26] = _mm_sub_epi32(x1[5], x1[26]); + x2[6] = _mm_add_epi32(x1[6], x1[25]); + x2[25] = _mm_sub_epi32(x1[6], x1[25]); + x2[7] = _mm_add_epi32(x1[7], x1[24]); + x2[24] = _mm_sub_epi32(x1[7], x1[24]); + x2[8] = _mm_add_epi32(x1[8], x1[23]); + x2[23] = _mm_sub_epi32(x1[8], x1[23]); + x2[9] = _mm_add_epi32(x1[9], x1[22]); + x2[22] = _mm_sub_epi32(x1[9], x1[22]); + x2[10] = _mm_add_epi32(x1[10], x1[21]); + x2[21] = _mm_sub_epi32(x1[10], x1[21]); + x2[11] = _mm_add_epi32(x1[11], x1[20]); + x2[20] = _mm_sub_epi32(x1[11], x1[20]); + x2[12] = _mm_add_epi32(x1[12], x1[19]); + x2[19] = _mm_sub_epi32(x1[12], x1[19]); + x2[13] = _mm_add_epi32(x1[13], x1[18]); + x2[18] = _mm_sub_epi32(x1[13], x1[18]); + x2[14] = _mm_add_epi32(x1[14], x1[17]); + x2[17] = _mm_sub_epi32(x1[14], x1[17]); + x2[15] = _mm_add_epi32(x1[15], x1[16]); + x2[16] = _mm_sub_epi32(x1[15], x1[16]); + x2[32] = x1[32]; + x2[33] = x1[33]; + x2[34] = x1[34]; + x2[35] = x1[35]; + x2[36] = x1[36]; + x2[37] = x1[37]; + x2[38] = x1[38]; + x2[39] = x1[39]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[40], x1[55], x2[40], x2[55], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[41], x1[54], x2[41], x2[54], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[42], x1[53], x2[42], x2[53], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[43], x1[52], x2[43], x2[52], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[44], x1[51], x2[44], x2[51], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[45], x1[50], x2[45], x2[50], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[46], x1[49], x2[46], x2[49], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[47], x1[48], x2[47], x2[48], + __rounding, cos_bit); + x2[56] = x1[56]; + x2[57] = x1[57]; + x2[58] = x1[58]; + x2[59] = x1[59]; + x2[60] = x1[60]; + x2[61] = x1[61]; + x2[62] = x1[62]; + x2[63] = x1[63]; + + // stage 3 + __m128i x3[64]; + x3[0] = _mm_add_epi32(x2[0], x2[15]); + x3[15] = _mm_sub_epi32(x2[0], x2[15]); + x3[1] = _mm_add_epi32(x2[1], x2[14]); + x3[14] = _mm_sub_epi32(x2[1], x2[14]); + x3[2] = _mm_add_epi32(x2[2], x2[13]); + x3[13] = _mm_sub_epi32(x2[2], x2[13]); + x3[3] = _mm_add_epi32(x2[3], x2[12]); + x3[12] = _mm_sub_epi32(x2[3], x2[12]); + x3[4] = _mm_add_epi32(x2[4], x2[11]); + x3[11] = _mm_sub_epi32(x2[4], x2[11]); + x3[5] = _mm_add_epi32(x2[5], x2[10]); + x3[10] = _mm_sub_epi32(x2[5], x2[10]); + x3[6] = _mm_add_epi32(x2[6], x2[9]); + x3[9] = 
_mm_sub_epi32(x2[6], x2[9]); + x3[7] = _mm_add_epi32(x2[7], x2[8]); + x3[8] = _mm_sub_epi32(x2[7], x2[8]); + x3[16] = x2[16]; + x3[17] = x2[17]; + x3[18] = x2[18]; + x3[19] = x2[19]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[20], x2[27], x3[20], x3[27], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[21], x2[26], x3[21], x3[26], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[22], x2[25], x3[22], x3[25], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[23], x2[24], x3[23], x3[24], + __rounding, cos_bit); + x3[28] = x2[28]; + x3[29] = x2[29]; + x3[30] = x2[30]; + x3[31] = x2[31]; + x3[32] = _mm_add_epi32(x2[32], x2[47]); + x3[47] = _mm_sub_epi32(x2[32], x2[47]); + x3[33] = _mm_add_epi32(x2[33], x2[46]); + x3[46] = _mm_sub_epi32(x2[33], x2[46]); + x3[34] = _mm_add_epi32(x2[34], x2[45]); + x3[45] = _mm_sub_epi32(x2[34], x2[45]); + x3[35] = _mm_add_epi32(x2[35], x2[44]); + x3[44] = _mm_sub_epi32(x2[35], x2[44]); + x3[36] = _mm_add_epi32(x2[36], x2[43]); + x3[43] = _mm_sub_epi32(x2[36], x2[43]); + x3[37] = _mm_add_epi32(x2[37], x2[42]); + x3[42] = _mm_sub_epi32(x2[37], x2[42]); + x3[38] = _mm_add_epi32(x2[38], x2[41]); + x3[41] = _mm_sub_epi32(x2[38], x2[41]); + x3[39] = _mm_add_epi32(x2[39], x2[40]); + x3[40] = _mm_sub_epi32(x2[39], x2[40]); + x3[48] = _mm_sub_epi32(x2[63], x2[48]); + x3[63] = _mm_add_epi32(x2[63], x2[48]); + x3[49] = _mm_sub_epi32(x2[62], x2[49]); + x3[62] = _mm_add_epi32(x2[62], x2[49]); + x3[50] = _mm_sub_epi32(x2[61], x2[50]); + x3[61] = _mm_add_epi32(x2[61], x2[50]); + x3[51] = _mm_sub_epi32(x2[60], x2[51]); + x3[60] = _mm_add_epi32(x2[60], x2[51]); + x3[52] = _mm_sub_epi32(x2[59], x2[52]); + x3[59] = _mm_add_epi32(x2[59], x2[52]); + x3[53] = _mm_sub_epi32(x2[58], x2[53]); + x3[58] = _mm_add_epi32(x2[58], x2[53]); + x3[54] = _mm_sub_epi32(x2[57], x2[54]); + x3[57] = _mm_add_epi32(x2[57], x2[54]); + x3[55] = _mm_sub_epi32(x2[56], x2[55]); + x3[56] = _mm_add_epi32(x2[56], x2[55]); + + // stage 4 + __m128i x4[64]; + x4[0] = _mm_add_epi32(x3[0], x3[7]); + x4[7] = _mm_sub_epi32(x3[0], x3[7]); + x4[1] = _mm_add_epi32(x3[1], x3[6]); + x4[6] = _mm_sub_epi32(x3[1], x3[6]); + x4[2] = _mm_add_epi32(x3[2], x3[5]); + x4[5] = _mm_sub_epi32(x3[2], x3[5]); + x4[3] = _mm_add_epi32(x3[3], x3[4]); + x4[4] = _mm_sub_epi32(x3[3], x3[4]); + x4[8] = x3[8]; + x4[9] = x3[9]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[10], x3[13], x4[10], x4[13], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[11], x3[12], x4[11], x4[12], + __rounding, cos_bit); + x4[14] = x3[14]; + x4[15] = x3[15]; + x4[16] = _mm_add_epi32(x3[16], x3[23]); + x4[23] = _mm_sub_epi32(x3[16], x3[23]); + x4[17] = _mm_add_epi32(x3[17], x3[22]); + x4[22] = _mm_sub_epi32(x3[17], x3[22]); + x4[18] = _mm_add_epi32(x3[18], x3[21]); + x4[21] = _mm_sub_epi32(x3[18], x3[21]); + x4[19] = _mm_add_epi32(x3[19], x3[20]); + x4[20] = _mm_sub_epi32(x3[19], x3[20]); + x4[24] = _mm_sub_epi32(x3[31], x3[24]); + x4[31] = _mm_add_epi32(x3[31], x3[24]); + x4[25] = _mm_sub_epi32(x3[30], x3[25]); + x4[30] = _mm_add_epi32(x3[30], x3[25]); + x4[26] = _mm_sub_epi32(x3[29], x3[26]); + x4[29] = _mm_add_epi32(x3[29], x3[26]); + x4[27] = _mm_sub_epi32(x3[28], x3[27]); + x4[28] = _mm_add_epi32(x3[28], x3[27]); + x4[32] = x3[32]; + x4[33] = x3[33]; + x4[34] = x3[34]; + x4[35] = x3[35]; + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[36], x3[59], x4[36], x4[59], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[37], 
x3[58], x4[37], x4[58], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[38], x3[57], x4[38], x4[57], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[39], x3[56], x4[39], x4[56], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[40], x3[55], x4[40], x4[55], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[41], x3[54], x4[41], x4[54], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[42], x3[53], x4[42], x4[53], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[43], x3[52], x4[43], x4[52], + __rounding, cos_bit); + x4[44] = x3[44]; + x4[45] = x3[45]; + x4[46] = x3[46]; + x4[47] = x3[47]; + x4[48] = x3[48]; + x4[49] = x3[49]; + x4[50] = x3[50]; + x4[51] = x3[51]; + x4[60] = x3[60]; + x4[61] = x3[61]; + x4[62] = x3[62]; + x4[63] = x3[63]; + + // stage 5 + __m128i x5[64]; + x5[0] = _mm_add_epi32(x4[0], x4[3]); + x5[3] = _mm_sub_epi32(x4[0], x4[3]); + x5[1] = _mm_add_epi32(x4[1], x4[2]); + x5[2] = _mm_sub_epi32(x4[1], x4[2]); + x5[4] = x4[4]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x4[5], x4[6], x5[5], x5[6], + __rounding, cos_bit); + x5[7] = x4[7]; + x5[8] = _mm_add_epi32(x4[8], x4[11]); + x5[11] = _mm_sub_epi32(x4[8], x4[11]); + x5[9] = _mm_add_epi32(x4[9], x4[10]); + x5[10] = _mm_sub_epi32(x4[9], x4[10]); + x5[12] = _mm_sub_epi32(x4[15], x4[12]); + x5[15] = _mm_add_epi32(x4[15], x4[12]); + x5[13] = _mm_sub_epi32(x4[14], x4[13]); + x5[14] = _mm_add_epi32(x4[14], x4[13]); + x5[16] = x4[16]; + x5[17] = x4[17]; + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[18], x4[29], x5[18], x5[29], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[19], x4[28], x5[19], x5[28], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[20], x4[27], x5[20], x5[27], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[21], x4[26], x5[21], x5[26], + __rounding, cos_bit); + x5[22] = x4[22]; + x5[23] = x4[23]; + x5[24] = x4[24]; + x5[25] = x4[25]; + x5[30] = x4[30]; + x5[31] = x4[31]; + x5[32] = _mm_add_epi32(x4[32], x4[39]); + x5[39] = _mm_sub_epi32(x4[32], x4[39]); + x5[33] = _mm_add_epi32(x4[33], x4[38]); + x5[38] = _mm_sub_epi32(x4[33], x4[38]); + x5[34] = _mm_add_epi32(x4[34], x4[37]); + x5[37] = _mm_sub_epi32(x4[34], x4[37]); + x5[35] = _mm_add_epi32(x4[35], x4[36]); + x5[36] = _mm_sub_epi32(x4[35], x4[36]); + x5[40] = _mm_sub_epi32(x4[47], x4[40]); + x5[47] = _mm_add_epi32(x4[47], x4[40]); + x5[41] = _mm_sub_epi32(x4[46], x4[41]); + x5[46] = _mm_add_epi32(x4[46], x4[41]); + x5[42] = _mm_sub_epi32(x4[45], x4[42]); + x5[45] = _mm_add_epi32(x4[45], x4[42]); + x5[43] = _mm_sub_epi32(x4[44], x4[43]); + x5[44] = _mm_add_epi32(x4[44], x4[43]); + x5[48] = _mm_add_epi32(x4[48], x4[55]); + x5[55] = _mm_sub_epi32(x4[48], x4[55]); + x5[49] = _mm_add_epi32(x4[49], x4[54]); + x5[54] = _mm_sub_epi32(x4[49], x4[54]); + x5[50] = _mm_add_epi32(x4[50], x4[53]); + x5[53] = _mm_sub_epi32(x4[50], x4[53]); + x5[51] = _mm_add_epi32(x4[51], x4[52]); + x5[52] = _mm_sub_epi32(x4[51], x4[52]); + x5[56] = _mm_sub_epi32(x4[63], x4[56]); + x5[63] = _mm_add_epi32(x4[63], x4[56]); + x5[57] = _mm_sub_epi32(x4[62], x4[57]); + x5[62] = _mm_add_epi32(x4[62], x4[57]); + x5[58] = _mm_sub_epi32(x4[61], x4[58]); + x5[61] = _mm_add_epi32(x4[61], x4[58]); + x5[59] = _mm_sub_epi32(x4[60], x4[59]); + x5[60] = _mm_add_epi32(x4[60], x4[59]); + + // stage 6 + __m128i x6[64]; + btf_32_type0_sse4_1_new(cospi_p32, cospi_p32, x5[0], 
x5[1], x6[0], x6[1], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p48, cospi_p16, x5[2], x5[3], x6[2], x6[3], + __rounding, cos_bit); + x6[4] = _mm_add_epi32(x5[4], x5[5]); + x6[5] = _mm_sub_epi32(x5[4], x5[5]); + x6[6] = _mm_sub_epi32(x5[7], x5[6]); + x6[7] = _mm_add_epi32(x5[7], x5[6]); + x6[8] = x5[8]; + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x5[9], x5[14], x6[9], x6[14], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x5[10], x5[13], x6[10], x6[13], + __rounding, cos_bit); + x6[11] = x5[11]; + x6[12] = x5[12]; + x6[15] = x5[15]; + x6[16] = _mm_add_epi32(x5[16], x5[19]); + x6[19] = _mm_sub_epi32(x5[16], x5[19]); + x6[17] = _mm_add_epi32(x5[17], x5[18]); + x6[18] = _mm_sub_epi32(x5[17], x5[18]); + x6[20] = _mm_sub_epi32(x5[23], x5[20]); + x6[23] = _mm_add_epi32(x5[23], x5[20]); + x6[21] = _mm_sub_epi32(x5[22], x5[21]); + x6[22] = _mm_add_epi32(x5[22], x5[21]); + x6[24] = _mm_add_epi32(x5[24], x5[27]); + x6[27] = _mm_sub_epi32(x5[24], x5[27]); + x6[25] = _mm_add_epi32(x5[25], x5[26]); + x6[26] = _mm_sub_epi32(x5[25], x5[26]); + x6[28] = _mm_sub_epi32(x5[31], x5[28]); + x6[31] = _mm_add_epi32(x5[31], x5[28]); + x6[29] = _mm_sub_epi32(x5[30], x5[29]); + x6[30] = _mm_add_epi32(x5[30], x5[29]); + x6[32] = x5[32]; + x6[33] = x5[33]; + btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[34], x5[61], x6[34], x6[61], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[35], x5[60], x6[35], x6[60], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[36], x5[59], x6[36], x6[59], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[37], x5[58], x6[37], x6[58], + __rounding, cos_bit); + x6[38] = x5[38]; + x6[39] = x5[39]; + x6[40] = x5[40]; + x6[41] = x5[41]; + btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[42], x5[53], x6[42], x6[53], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[43], x5[52], x6[43], x6[52], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[44], x5[51], x6[44], x6[51], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[45], x5[50], x6[45], x6[50], + __rounding, cos_bit); + x6[46] = x5[46]; + x6[47] = x5[47]; + x6[48] = x5[48]; + x6[49] = x5[49]; + x6[54] = x5[54]; + x6[55] = x5[55]; + x6[56] = x5[56]; + x6[57] = x5[57]; + x6[62] = x5[62]; + x6[63] = x5[63]; + + // stage 7 + __m128i x7[64]; + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + btf_32_type1_sse4_1_new(cospi_p56, cospi_p08, x6[4], x6[7], x7[4], x7[7], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p24, cospi_p40, x6[5], x6[6], x7[5], x7[6], + __rounding, cos_bit); + x7[8] = _mm_add_epi32(x6[8], x6[9]); + x7[9] = _mm_sub_epi32(x6[8], x6[9]); + x7[10] = _mm_sub_epi32(x6[11], x6[10]); + x7[11] = _mm_add_epi32(x6[11], x6[10]); + x7[12] = _mm_add_epi32(x6[12], x6[13]); + x7[13] = _mm_sub_epi32(x6[12], x6[13]); + x7[14] = _mm_sub_epi32(x6[15], x6[14]); + x7[15] = _mm_add_epi32(x6[15], x6[14]); + x7[16] = x6[16]; + btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x6[17], x6[30], x7[17], x7[30], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x6[18], x6[29], x7[18], x7[29], + __rounding, cos_bit); + x7[19] = x6[19]; + x7[20] = x6[20]; + btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x6[21], x6[26], x7[21], x7[26], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x6[22], x6[25], x7[22], x7[25], + __rounding, cos_bit); + x7[23] = x6[23]; + x7[24] = x6[24]; + x7[27] = x6[27]; + 
x7[28] = x6[28]; + x7[31] = x6[31]; + x7[32] = _mm_add_epi32(x6[32], x6[35]); + x7[35] = _mm_sub_epi32(x6[32], x6[35]); + x7[33] = _mm_add_epi32(x6[33], x6[34]); + x7[34] = _mm_sub_epi32(x6[33], x6[34]); + x7[36] = _mm_sub_epi32(x6[39], x6[36]); + x7[39] = _mm_add_epi32(x6[39], x6[36]); + x7[37] = _mm_sub_epi32(x6[38], x6[37]); + x7[38] = _mm_add_epi32(x6[38], x6[37]); + x7[40] = _mm_add_epi32(x6[40], x6[43]); + x7[43] = _mm_sub_epi32(x6[40], x6[43]); + x7[41] = _mm_add_epi32(x6[41], x6[42]); + x7[42] = _mm_sub_epi32(x6[41], x6[42]); + x7[44] = _mm_sub_epi32(x6[47], x6[44]); + x7[47] = _mm_add_epi32(x6[47], x6[44]); + x7[45] = _mm_sub_epi32(x6[46], x6[45]); + x7[46] = _mm_add_epi32(x6[46], x6[45]); + x7[48] = _mm_add_epi32(x6[48], x6[51]); + x7[51] = _mm_sub_epi32(x6[48], x6[51]); + x7[49] = _mm_add_epi32(x6[49], x6[50]); + x7[50] = _mm_sub_epi32(x6[49], x6[50]); + x7[52] = _mm_sub_epi32(x6[55], x6[52]); + x7[55] = _mm_add_epi32(x6[55], x6[52]); + x7[53] = _mm_sub_epi32(x6[54], x6[53]); + x7[54] = _mm_add_epi32(x6[54], x6[53]); + x7[56] = _mm_add_epi32(x6[56], x6[59]); + x7[59] = _mm_sub_epi32(x6[56], x6[59]); + x7[57] = _mm_add_epi32(x6[57], x6[58]); + x7[58] = _mm_sub_epi32(x6[57], x6[58]); + x7[60] = _mm_sub_epi32(x6[63], x6[60]); + x7[63] = _mm_add_epi32(x6[63], x6[60]); + x7[61] = _mm_sub_epi32(x6[62], x6[61]); + x7[62] = _mm_add_epi32(x6[62], x6[61]); + + // stage 8 + __m128i x8[64]; + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + btf_32_type1_sse4_1_new(cospi_p60, cospi_p04, x7[8], x7[15], x8[8], x8[15], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p28, cospi_p36, x7[9], x7[14], x8[9], x8[14], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p44, cospi_p20, x7[10], x7[13], x8[10], x8[13], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p12, cospi_p52, x7[11], x7[12], x8[11], x8[12], + __rounding, cos_bit); + x8[16] = _mm_add_epi32(x7[16], x7[17]); + x8[17] = _mm_sub_epi32(x7[16], x7[17]); + x8[18] = _mm_sub_epi32(x7[19], x7[18]); + x8[19] = _mm_add_epi32(x7[19], x7[18]); + x8[20] = _mm_add_epi32(x7[20], x7[21]); + x8[21] = _mm_sub_epi32(x7[20], x7[21]); + x8[22] = _mm_sub_epi32(x7[23], x7[22]); + x8[23] = _mm_add_epi32(x7[23], x7[22]); + x8[24] = _mm_add_epi32(x7[24], x7[25]); + x8[25] = _mm_sub_epi32(x7[24], x7[25]); + x8[26] = _mm_sub_epi32(x7[27], x7[26]); + x8[27] = _mm_add_epi32(x7[27], x7[26]); + x8[28] = _mm_add_epi32(x7[28], x7[29]); + x8[29] = _mm_sub_epi32(x7[28], x7[29]); + x8[30] = _mm_sub_epi32(x7[31], x7[30]); + x8[31] = _mm_add_epi32(x7[31], x7[30]); + x8[32] = x7[32]; + btf_32_type0_sse4_1_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61], + __rounding, cos_bit); + x8[35] = x7[35]; + x8[36] = x7[36]; + btf_32_type0_sse4_1_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57], + __rounding, cos_bit); + x8[39] = x7[39]; + x8[40] = x7[40]; + btf_32_type0_sse4_1_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53], + __rounding, cos_bit); + x8[43] = x7[43]; + x8[44] = x7[44]; + btf_32_type0_sse4_1_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m12, cospi_m52, x7[46], 
x7[49], x8[46], x8[49], + __rounding, cos_bit); + x8[47] = x7[47]; + x8[48] = x7[48]; + x8[51] = x7[51]; + x8[52] = x7[52]; + x8[55] = x7[55]; + x8[56] = x7[56]; + x8[59] = x7[59]; + x8[60] = x7[60]; + x8[63] = x7[63]; + + // stage 9 + __m128i x9[64]; + x9[0] = x8[0]; + x9[1] = x8[1]; + x9[2] = x8[2]; + x9[3] = x8[3]; + x9[4] = x8[4]; + x9[5] = x8[5]; + x9[6] = x8[6]; + x9[7] = x8[7]; + x9[8] = x8[8]; + x9[9] = x8[9]; + x9[10] = x8[10]; + x9[11] = x8[11]; + x9[12] = x8[12]; + x9[13] = x8[13]; + x9[14] = x8[14]; + x9[15] = x8[15]; + btf_32_type1_sse4_1_new(cospi_p62, cospi_p02, x8[16], x8[31], x9[16], x9[31], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p30, cospi_p34, x8[17], x8[30], x9[17], x9[30], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p46, cospi_p18, x8[18], x8[29], x9[18], x9[29], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p14, cospi_p50, x8[19], x8[28], x9[19], x9[28], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p54, cospi_p10, x8[20], x8[27], x9[20], x9[27], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p22, cospi_p42, x8[21], x8[26], x9[21], x9[26], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p38, cospi_p26, x8[22], x8[25], x9[22], x9[25], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p06, cospi_p58, x8[23], x8[24], x9[23], x9[24], + __rounding, cos_bit); + x9[32] = _mm_add_epi32(x8[32], x8[33]); + x9[33] = _mm_sub_epi32(x8[32], x8[33]); + x9[34] = _mm_sub_epi32(x8[35], x8[34]); + x9[35] = _mm_add_epi32(x8[35], x8[34]); + x9[36] = _mm_add_epi32(x8[36], x8[37]); + x9[37] = _mm_sub_epi32(x8[36], x8[37]); + x9[38] = _mm_sub_epi32(x8[39], x8[38]); + x9[39] = _mm_add_epi32(x8[39], x8[38]); + x9[40] = _mm_add_epi32(x8[40], x8[41]); + x9[41] = _mm_sub_epi32(x8[40], x8[41]); + x9[42] = _mm_sub_epi32(x8[43], x8[42]); + x9[43] = _mm_add_epi32(x8[43], x8[42]); + x9[44] = _mm_add_epi32(x8[44], x8[45]); + x9[45] = _mm_sub_epi32(x8[44], x8[45]); + x9[46] = _mm_sub_epi32(x8[47], x8[46]); + x9[47] = _mm_add_epi32(x8[47], x8[46]); + x9[48] = _mm_add_epi32(x8[48], x8[49]); + x9[49] = _mm_sub_epi32(x8[48], x8[49]); + x9[50] = _mm_sub_epi32(x8[51], x8[50]); + x9[51] = _mm_add_epi32(x8[51], x8[50]); + x9[52] = _mm_add_epi32(x8[52], x8[53]); + x9[53] = _mm_sub_epi32(x8[52], x8[53]); + x9[54] = _mm_sub_epi32(x8[55], x8[54]); + x9[55] = _mm_add_epi32(x8[55], x8[54]); + x9[56] = _mm_add_epi32(x8[56], x8[57]); + x9[57] = _mm_sub_epi32(x8[56], x8[57]); + x9[58] = _mm_sub_epi32(x8[59], x8[58]); + x9[59] = _mm_add_epi32(x8[59], x8[58]); + x9[60] = _mm_add_epi32(x8[60], x8[61]); + x9[61] = _mm_sub_epi32(x8[60], x8[61]); + x9[62] = _mm_sub_epi32(x8[63], x8[62]); + x9[63] = _mm_add_epi32(x8[63], x8[62]); + + // stage 10 + __m128i x10[64]; + x10[0] = x9[0]; + x10[1] = x9[1]; + x10[2] = x9[2]; + x10[3] = x9[3]; + x10[4] = x9[4]; + x10[5] = x9[5]; + x10[6] = x9[6]; + x10[7] = x9[7]; + x10[8] = x9[8]; + x10[9] = x9[9]; + x10[10] = x9[10]; + x10[11] = x9[11]; + x10[12] = x9[12]; + x10[13] = x9[13]; + x10[14] = x9[14]; + x10[15] = x9[15]; + x10[16] = x9[16]; + x10[17] = x9[17]; + x10[18] = x9[18]; + x10[19] = x9[19]; + x10[20] = x9[20]; + x10[21] = x9[21]; + x10[22] = x9[22]; + x10[23] = x9[23]; + x10[24] = x9[24]; + x10[25] = x9[25]; + x10[26] = x9[26]; + x10[27] = x9[27]; + x10[28] = x9[28]; + x10[29] = x9[29]; + x10[30] = x9[30]; + x10[31] = x9[31]; + btf_32_type1_sse4_1_new(cospi_p63, cospi_p01, x9[32], x9[63], x10[32], + x10[63], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p31, cospi_p33, x9[33], x9[62], x10[33], + 
x10[62], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p47, cospi_p17, x9[34], x9[61], x10[34], + x10[61], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p15, cospi_p49, x9[35], x9[60], x10[35], + x10[60], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p55, cospi_p09, x9[36], x9[59], x10[36], + x10[59], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p23, cospi_p41, x9[37], x9[58], x10[37], + x10[58], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p39, cospi_p25, x9[38], x9[57], x10[38], + x10[57], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p07, cospi_p57, x9[39], x9[56], x10[39], + x10[56], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p59, cospi_p05, x9[40], x9[55], x10[40], + x10[55], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p27, cospi_p37, x9[41], x9[54], x10[41], + x10[54], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p43, cospi_p21, x9[42], x9[53], x10[42], + x10[53], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p11, cospi_p53, x9[43], x9[52], x10[43], + x10[52], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p51, cospi_p13, x9[44], x9[51], x10[44], + x10[51], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p19, cospi_p45, x9[45], x9[50], x10[45], + x10[50], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p35, cospi_p29, x9[46], x9[49], x10[46], + x10[49], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p03, cospi_p61, x9[47], x9[48], x10[47], + x10[48], __rounding, cos_bit); + + // stage 11 + output[0] = x10[0]; + output[1] = x10[32]; + output[2] = x10[16]; + output[3] = x10[48]; + output[4] = x10[8]; + output[5] = x10[40]; + output[6] = x10[24]; + output[7] = x10[56]; + output[8] = x10[4]; + output[9] = x10[36]; + output[10] = x10[20]; + output[11] = x10[52]; + output[12] = x10[12]; + output[13] = x10[44]; + output[14] = x10[28]; + output[15] = x10[60]; + output[16] = x10[2]; + output[17] = x10[34]; + output[18] = x10[18]; + output[19] = x10[50]; + output[20] = x10[10]; + output[21] = x10[42]; + output[22] = x10[26]; + output[23] = x10[58]; + output[24] = x10[6]; + output[25] = x10[38]; + output[26] = x10[22]; + output[27] = x10[54]; + output[28] = x10[14]; + output[29] = x10[46]; + output[30] = x10[30]; + output[31] = x10[62]; + output[32] = x10[1]; + output[33] = x10[33]; + output[34] = x10[17]; + output[35] = x10[49]; + output[36] = x10[9]; + output[37] = x10[41]; + output[38] = x10[25]; + output[39] = x10[57]; + output[40] = x10[5]; + output[41] = x10[37]; + output[42] = x10[21]; + output[43] = x10[53]; + output[44] = x10[13]; + output[45] = x10[45]; + output[46] = x10[29]; + output[47] = x10[61]; + output[48] = x10[3]; + output[49] = x10[35]; + output[50] = x10[19]; + output[51] = x10[51]; + output[52] = x10[11]; + output[53] = x10[43]; + output[54] = x10[27]; + output[55] = x10[59]; + output[56] = x10[7]; + output[57] = x10[39]; + output[58] = x10[23]; + output[59] = x10[55]; + output[60] = x10[15]; + output[61] = x10[47]; + output[62] = x10[31]; + output[63] = x10[63]; +} diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c new file mode 100644 index 000000000..abb95f31e --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/av1_rtcd.h" + +#include "av1/common/enums.h" +#include "av1/common/av1_txfm.h" +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "av1/encoder/x86/av1_txfm1d_sse4.h" +#include "av1/encoder/x86/av1_fwd_txfm_sse2.h" + +static INLINE void int16_array_with_stride_to_int32_array_without_stride( + const int16_t *input, int stride, int32_t *output, int txfm1d_size) { + int r, c; + for (r = 0; r < txfm1d_size; r++) { + for (c = 0; c < txfm1d_size; c++) { + output[r * txfm1d_size + c] = (int32_t)input[r * stride + c]; + } + } +} + +typedef void (*TxfmFuncSSE2)(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); + +static void fdct32_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range) { + const int txfm_size = 32; + const int num_per_128 = 4; + __m128i buf0[32]; + __m128i buf1[32]; + int col_num = txfm_size / num_per_128; + int col; + (void)stage_range; + for (col = 0; col < col_num; col++) { + int j; + for (j = 0; j < 32; ++j) { + buf0[j] = input[j * col_num + col]; + } + av1_fdct32_new_sse4_1(buf0, buf1, cos_bit); + for (j = 0; j < 32; ++j) { + output[j * col_num + col] = buf1[j]; + } + } +} + +static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { + switch (txfm_type) { + case TXFM_TYPE_DCT32: return fdct32_new_sse4_1; + default: assert(0); + } + return NULL; +} + +static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output, + const int stride, + const TXFM_2D_FLIP_CFG *cfg, + int32_t *txfm_buf) { + // TODO(sarahparker): This does not currently support rectangular transforms + // and will break without splitting txfm_size out into row and col size. + // Rectangular transforms use C code only, so it should be OK for now. + // It will be corrected when there are SSE implementations for rectangular + // transforms.
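+ // The 2D pipeline below: widen the 16-bit input to 32 bits, round-shift by + // shift[0], run the column transform, round-shift by shift[1], transpose, + // run the row transform, round-shift by shift[2], and transpose back.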
+ assert(cfg->tx_size < TX_SIZES); + const int txfm_size = tx_size_wide[cfg->tx_size]; + const int8_t *shift = cfg->shift; + const int8_t *stage_range_col = cfg->stage_range_col; + const int8_t *stage_range_row = cfg->stage_range_row; + const int8_t cos_bit_col = cfg->cos_bit_col; + const int8_t cos_bit_row = cfg->cos_bit_row; + const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); + const TxfmFuncSSE2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row); + + __m128i *buf_128 = (__m128i *)txfm_buf; + __m128i *out_128 = (__m128i *)output; + int num_per_128 = 4; + int txfm2d_size_128 = txfm_size * txfm_size / num_per_128; + + int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf, + txfm_size); + av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]); + txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col); + av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]); + transpose_32(txfm_size, out_128, buf_128); + txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row); + av1_round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]); + transpose_32(txfm_size, buf_128, out_128); +} + +void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg); + (void)bd; + fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf); +} + +static INLINE void transpose_32_4x4x2(int stride, const __m128i *inputA, + const __m128i *inputB, __m128i *output) { + __m128i temp0 = _mm_unpacklo_epi32(inputA[0], inputA[2]); + __m128i temp1 = _mm_unpackhi_epi32(inputA[0], inputA[2]); + __m128i temp2 = _mm_unpacklo_epi32(inputA[1], inputA[3]); + __m128i temp3 = _mm_unpackhi_epi32(inputA[1], inputA[3]); + + output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2); + output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2); + output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3); + output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3); + + temp0 = _mm_unpacklo_epi32(inputB[0], inputB[2]); + temp1 = _mm_unpackhi_epi32(inputB[0], inputB[2]); + temp2 = _mm_unpacklo_epi32(inputB[1], inputB[3]); + temp3 = _mm_unpackhi_epi32(inputB[1], inputB[3]); + + output[4 * stride] = _mm_unpacklo_epi32(temp0, temp2); + output[5 * stride] = _mm_unpackhi_epi32(temp0, temp2); + output[6 * stride] = _mm_unpacklo_epi32(temp1, temp3); + output[7 * stride] = _mm_unpackhi_epi32(temp1, temp3); +} + +static void lowbd_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X64; + __m128i buf0[64], buf1[512]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = fdct8x64_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 
0; j < AOMMIN(4, height_div8); ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i bufA[64]; + __m128i bufB[64]; + __m128i *buf = buf1 + width * i; + for (int j = 0; j < width; ++j) { + bufA[j] = _mm_cvtepi16_epi32(buf[j]); + bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); + } + av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row); + av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row); + av1_round_shift_array_32_sse4_1(bufA, bufA, 32, -shift[2]); + av1_round_shift_array_32_sse4_1(bufB, bufB, 32, -shift[2]); + + int32_t *output8 = output + 8 * 32 * i; + for (int j = 0; j < width_div8; ++j) { + __m128i *out = (__m128i *)(output8 + 4 * j); + transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out); + } + } +} + +static void lowbd_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_64X32; + __m128i buf0[64], buf1[256]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(4, height_div8); ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + assert(tx_type == DCT_DCT); + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i bufA[64]; + __m128i bufB[64]; + __m128i *buf = buf1 + width * i; + for (int j = 0; j < width; ++j) { + bufA[j] = _mm_cvtepi16_epi32(buf[j]); + bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); + } + av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row); + av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row); + av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2]); + av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2]); + + int32_t *output8 = output + 8 * 32 * i; + for (int j = 0; j < width_div8; ++j) { + __m128i *out = (__m128i *)(output8 + 4 * j); + transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out); + } + } +} + +static void lowbd_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_32X64; + __m128i buf0[64], buf1[256]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = fdct8x64_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + 
round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(4, height_div8); ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i bufA[32]; + __m128i bufB[32]; + __m128i *buf = buf1 + width * i; + for (int j = 0; j < width; ++j) { + bufA[j] = _mm_cvtepi16_epi32(buf[j]); + bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); + } + av1_fdct32_new_sse4_1(bufA, bufA, cos_bit_row); + av1_fdct32_new_sse4_1(bufB, bufB, cos_bit_row); + av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2]); + av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2]); + + int32_t *output8 = output + 8 * 32 * i; + for (int j = 0; j < (32 / 4); ++j) { + __m128i *out = (__m128i *)(output8 + 4 * j); + transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out); + } + } +} + +static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { + av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform + av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform + av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform + av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform + lowbd_fwd_txfm2d_64x64_sse4_1, // 64x64 transform + av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform + av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform + av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform + av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform + av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform + av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform + lowbd_fwd_txfm2d_32x64_sse4_1, // 32x64 transform + lowbd_fwd_txfm2d_64x32_sse4_1, // 64x32 transform + av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform + av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform + av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform + av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform + av1_lowbd_fwd_txfm2d_16x64_sse2, // 16x64 transform + av1_lowbd_fwd_txfm2d_64x16_sse2, // 64x16 transform +}; + +void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size]; + if ((fwd_txfm2d_func == NULL) || + (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) { + av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); + } else { + fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); + } +} diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c new file mode 100644 index 000000000..6aae7ce1e --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c @@ -0,0 +1,2889 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "av1/encoder/x86/av1_fwd_txfm_sse2.h" + +// TODO(linfengz): refine fdct4x8 and fadst4x8 optimization (if possible). 
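+ +// Each fdct below is a chain of butterfly stages: paired 16-bit adds and +// subtracts, with rotations done by _mm_madd_epi16 against packed (cos, sin) +// constant pairs (directly or via the btf_16_sse2 helpers), rounded by +// (1 << (cos_bit - 1)) and arithmetically shifted right by cos_bit.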
+ +static void fdct4x4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + __m128i u[4], v[4]; + + u[0] = _mm_unpacklo_epi16(input[0], input[1]); + u[1] = _mm_unpacklo_epi16(input[3], input[2]); + + v[0] = _mm_add_epi16(u[0], u[1]); + v[1] = _mm_sub_epi16(u[0], u[1]); + + u[0] = _mm_madd_epi16(v[0], cospi_p32_p32); // 0 + u[1] = _mm_madd_epi16(v[0], cospi_p32_m32); // 2 + u[2] = _mm_madd_epi16(v[1], cospi_p16_p48); // 1 + u[3] = _mm_madd_epi16(v[1], cospi_p48_m16); // 3 + + v[0] = _mm_add_epi32(u[0], __rounding); + v[1] = _mm_add_epi32(u[1], __rounding); + v[2] = _mm_add_epi32(u[2], __rounding); + v[3] = _mm_add_epi32(u[3], __rounding); + u[0] = _mm_srai_epi32(v[0], cos_bit); + u[1] = _mm_srai_epi32(v[1], cos_bit); + u[2] = _mm_srai_epi32(v[2], cos_bit); + u[3] = _mm_srai_epi32(v[3], cos_bit); + + output[0] = _mm_packs_epi32(u[0], u[1]); + output[1] = _mm_packs_epi32(u[2], u[3]); + output[2] = _mm_srli_si128(output[0], 8); + output[3] = _mm_srli_si128(output[1], 8); +} + +static void fdct8x4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + + // stage 1 + __m128i x1[4]; + x1[0] = _mm_adds_epi16(input[0], input[3]); + x1[3] = _mm_subs_epi16(input[0], input[3]); + x1[1] = _mm_adds_epi16(input[1], input[2]); + x1[2] = _mm_subs_epi16(input[1], input[2]); + + // stage 2 + __m128i x2[4]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x2[0], x2[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x1[2], x1[3], x2[2], x2[3]); + + // stage 3 + output[0] = x2[0]; + output[1] = x2[2]; + output[2] = x2[1]; + output[3] = x2[3]; +} + +static void fdct4x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + + // stage 1 + __m128i x1[8]; + x1[0] = _mm_adds_epi16(input[0], input[7]); + x1[7] = _mm_subs_epi16(input[0], input[7]); + x1[1] = _mm_adds_epi16(input[1], input[6]); + x1[6] = _mm_subs_epi16(input[1], input[6]); + x1[2] = _mm_adds_epi16(input[2], input[5]); + x1[5] = _mm_subs_epi16(input[2], input[5]); + x1[3] = _mm_adds_epi16(input[3], input[4]); + x1[4] = _mm_subs_epi16(input[3], input[4]); + + // stage 2 + 
__m128i x2[8]; + x2[0] = _mm_adds_epi16(x1[0], x1[3]); + x2[3] = _mm_subs_epi16(x1[0], x1[3]); + x2[1] = _mm_adds_epi16(x1[1], x1[2]); + x2[2] = _mm_subs_epi16(x1[1], x1[2]); + x2[4] = x1[4]; + btf_16_w4_sse2(&cospi_m32_p32, &cospi_p32_p32, __rounding, cos_bit, &x1[5], + &x1[6], &x2[5], &x2[6]); + x2[7] = x1[7]; + + // stage 3 + __m128i x3[8]; + btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x2[0], + &x2[1], &x3[0], &x3[1]); + btf_16_w4_sse2(&cospi_p48_p16, &cospi_m16_p48, __rounding, cos_bit, &x2[2], + &x2[3], &x3[2], &x3[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[5]); + x3[5] = _mm_subs_epi16(x2[4], x2[5]); + x3[6] = _mm_subs_epi16(x2[7], x2[6]); + x3[7] = _mm_adds_epi16(x2[7], x2[6]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_w4_sse2(&cospi_p56_p08, &cospi_m08_p56, __rounding, cos_bit, &x3[4], + &x3[7], &x4[4], &x4[7]); + btf_16_w4_sse2(&cospi_p24_p40, &cospi_m40_p24, __rounding, cos_bit, &x3[5], + &x3[6], &x4[5], &x4[6]); + + // stage 5 + output[0] = x4[0]; + output[1] = x4[4]; + output[2] = x4[2]; + output[3] = x4[6]; + output[4] = x4[1]; + output[5] = x4[5]; + output[6] = x4[3]; + output[7] = x4[7]; +} + +static void fdct8x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + + // stage 1 + __m128i x1[8]; + x1[0] = _mm_adds_epi16(input[0], input[7]); + x1[7] = _mm_subs_epi16(input[0], input[7]); + x1[1] = _mm_adds_epi16(input[1], input[6]); + x1[6] = _mm_subs_epi16(input[1], input[6]); + x1[2] = _mm_adds_epi16(input[2], input[5]); + x1[5] = _mm_subs_epi16(input[2], input[5]); + x1[3] = _mm_adds_epi16(input[3], input[4]); + x1[4] = _mm_subs_epi16(input[3], input[4]); + + // stage 2 + __m128i x2[8]; + x2[0] = _mm_adds_epi16(x1[0], x1[3]); + x2[3] = _mm_subs_epi16(x1[0], x1[3]); + x2[1] = _mm_adds_epi16(x1[1], x1[2]); + x2[2] = _mm_subs_epi16(x1[1], x1[2]); + x2[4] = x1[4]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[5], x1[6], x2[5], x2[6]); + x2[7] = x1[7]; + + // stage 3 + __m128i x3[8]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x2[0], x2[1], x3[0], x3[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x2[2], x2[3], x3[2], x3[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[5]); + x3[5] = _mm_subs_epi16(x2[4], x2[5]); + x3[6] = _mm_subs_epi16(x2[7], x2[6]); + x3[7] = _mm_adds_epi16(x2[7], x2[6]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x3[4], x3[7], x4[4], x4[7]); + btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x3[5], x3[6], x4[5], x4[6]); + + // stage 5 + output[0] = x4[0]; + output[1] = x4[4]; + output[2] = x4[2]; + output[3] = x4[6]; + output[4] = x4[1]; + output[5] = x4[5]; + output[6] = x4[3]; + output[7] = x4[7]; +} + +static void fdct8x16_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { 
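+ // 16-point DCT: stage 1 folds input[i] against input[15 - i]; the even + // half then mirrors the 8-point transform above while the odd half is + // resolved with cospi rotations, and stage 7 emits results in bit-reversed + // index order.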
+ const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + + // stage 1 + __m128i x1[16]; + x1[0] = _mm_adds_epi16(input[0], input[15]); + x1[15] = _mm_subs_epi16(input[0], input[15]); + x1[1] = _mm_adds_epi16(input[1], input[14]); + x1[14] = _mm_subs_epi16(input[1], input[14]); + x1[2] = _mm_adds_epi16(input[2], input[13]); + x1[13] = _mm_subs_epi16(input[2], input[13]); + x1[3] = _mm_adds_epi16(input[3], input[12]); + x1[12] = _mm_subs_epi16(input[3], input[12]); + x1[4] = _mm_adds_epi16(input[4], input[11]); + x1[11] = _mm_subs_epi16(input[4], input[11]); + x1[5] = _mm_adds_epi16(input[5], input[10]); + x1[10] = _mm_subs_epi16(input[5], input[10]); + x1[6] = _mm_adds_epi16(input[6], input[9]); + x1[9] = _mm_subs_epi16(input[6], input[9]); + x1[7] = _mm_adds_epi16(input[7], input[8]); + x1[8] = _mm_subs_epi16(input[7], input[8]); + + // stage 2 + __m128i x2[16]; + x2[0] = _mm_adds_epi16(x1[0], x1[7]); + x2[7] = _mm_subs_epi16(x1[0], x1[7]); + x2[1] = _mm_adds_epi16(x1[1], x1[6]); + x2[6] = _mm_subs_epi16(x1[1], x1[6]); + x2[2] = _mm_adds_epi16(x1[2], x1[5]); + x2[5] = _mm_subs_epi16(x1[2], x1[5]); + x2[3] = _mm_adds_epi16(x1[3], x1[4]); + x2[4] = _mm_subs_epi16(x1[3], x1[4]); + x2[8] = x1[8]; + x2[9] = x1[9]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[10], x1[13], x2[10], x2[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[11], x1[12], x2[11], x2[12]); + x2[14] = x1[14]; + x2[15] = x1[15]; + + // stage 3 + __m128i x3[16]; + x3[0] = _mm_adds_epi16(x2[0], x2[3]); + x3[3] = _mm_subs_epi16(x2[0], x2[3]); + x3[1] = _mm_adds_epi16(x2[1], x2[2]); + x3[2] = _mm_subs_epi16(x2[1], x2[2]); + x3[4] = x2[4]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[5], x2[6], x3[5], x3[6]); + x3[7] = x2[7]; + x3[8] = _mm_adds_epi16(x2[8], x2[11]); + x3[11] = _mm_subs_epi16(x2[8], x2[11]); + x3[9] = _mm_adds_epi16(x2[9], x2[10]); + x3[10] = _mm_subs_epi16(x2[9], x2[10]); + x3[12] = _mm_subs_epi16(x2[15], x2[12]); + x3[15] = _mm_adds_epi16(x2[15], x2[12]); + x3[13] = _mm_subs_epi16(x2[14], x2[13]); + x3[14] = _mm_adds_epi16(x2[14], x2[13]); + + // stage 4 + __m128i x4[16]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x3[0], x3[1], x4[0], x4[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x3[2], x3[3], x4[2], x4[3]); + x4[4] = _mm_adds_epi16(x3[4], x3[5]); + x4[5] = _mm_subs_epi16(x3[4], x3[5]); + x4[6] = 
_mm_subs_epi16(x3[7], x3[6]); + x4[7] = _mm_adds_epi16(x3[7], x3[6]); + x4[8] = x3[8]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[9], x3[14], x4[9], x4[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[10], x3[13], x4[10], x4[13]); + x4[11] = x3[11]; + x4[12] = x3[12]; + x4[15] = x3[15]; + + // stage 5 + __m128i x5[16]; + x5[0] = x4[0]; + x5[1] = x4[1]; + x5[2] = x4[2]; + x5[3] = x4[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x4[4], x4[7], x5[4], x5[7]); + btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x4[5], x4[6], x5[5], x5[6]); + x5[8] = _mm_adds_epi16(x4[8], x4[9]); + x5[9] = _mm_subs_epi16(x4[8], x4[9]); + x5[10] = _mm_subs_epi16(x4[11], x4[10]); + x5[11] = _mm_adds_epi16(x4[11], x4[10]); + x5[12] = _mm_adds_epi16(x4[12], x4[13]); + x5[13] = _mm_subs_epi16(x4[12], x4[13]); + x5[14] = _mm_subs_epi16(x4[15], x4[14]); + x5[15] = _mm_adds_epi16(x4[15], x4[14]); + + // stage 6 + __m128i x6[16]; + x6[0] = x5[0]; + x6[1] = x5[1]; + x6[2] = x5[2]; + x6[3] = x5[3]; + x6[4] = x5[4]; + x6[5] = x5[5]; + x6[6] = x5[6]; + x6[7] = x5[7]; + btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x5[8], x5[15], x6[8], x6[15]); + btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x5[9], x5[14], x6[9], x6[14]); + btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x5[10], x5[13], x6[10], x6[13]); + btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x5[11], x5[12], x6[11], x6[12]); + + // stage 7 + output[0] = x6[0]; + output[1] = x6[8]; + output[2] = x6[4]; + output[3] = x6[12]; + output[4] = x6[2]; + output[5] = x6[10]; + output[6] = x6[6]; + output[7] = x6[14]; + output[8] = x6[1]; + output[9] = x6[9]; + output[10] = x6[5]; + output[11] = x6[13]; + output[12] = x6[3]; + output[13] = x6[11]; + output[14] = x6[7]; + output[15] = x6[15]; +} + +void fdct8x32_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); + __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]); + __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]); + __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]); + __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]); + __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]); + __m128i cospi_m18_p46 = 
pair_set_epi16(-cospi[18], cospi[46]); + __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]); + __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]); + __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]); + __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]); + __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]); + __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]); + __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]); + __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]); + __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]); + __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]); + + // stage 1 + __m128i x1[32]; + x1[0] = _mm_adds_epi16(input[0], input[31]); + x1[31] = _mm_subs_epi16(input[0], input[31]); + x1[1] = _mm_adds_epi16(input[1], input[30]); + x1[30] = _mm_subs_epi16(input[1], input[30]); + x1[2] = _mm_adds_epi16(input[2], input[29]); + x1[29] = _mm_subs_epi16(input[2], input[29]); + x1[3] = _mm_adds_epi16(input[3], input[28]); + x1[28] = _mm_subs_epi16(input[3], input[28]); + x1[4] = _mm_adds_epi16(input[4], input[27]); + x1[27] = _mm_subs_epi16(input[4], input[27]); + x1[5] = _mm_adds_epi16(input[5], input[26]); + x1[26] = _mm_subs_epi16(input[5], input[26]); + x1[6] = _mm_adds_epi16(input[6], input[25]); + x1[25] = _mm_subs_epi16(input[6], input[25]); + x1[7] = _mm_adds_epi16(input[7], input[24]); + x1[24] = _mm_subs_epi16(input[7], input[24]); + x1[8] = _mm_adds_epi16(input[8], input[23]); + x1[23] = _mm_subs_epi16(input[8], input[23]); + x1[9] = _mm_adds_epi16(input[9], input[22]); + x1[22] = _mm_subs_epi16(input[9], input[22]); + x1[10] = _mm_adds_epi16(input[10], input[21]); + x1[21] = _mm_subs_epi16(input[10], input[21]); + x1[11] = _mm_adds_epi16(input[11], input[20]); + x1[20] = _mm_subs_epi16(input[11], input[20]); + x1[12] = _mm_adds_epi16(input[12], input[19]); + x1[19] = _mm_subs_epi16(input[12], input[19]); + x1[13] = _mm_adds_epi16(input[13], input[18]); + x1[18] = _mm_subs_epi16(input[13], input[18]); + x1[14] = _mm_adds_epi16(input[14], input[17]); + x1[17] = _mm_subs_epi16(input[14], input[17]); + x1[15] = _mm_adds_epi16(input[15], input[16]); + x1[16] = _mm_subs_epi16(input[15], input[16]); + + // stage 2 + __m128i x2[32]; + x2[0] = _mm_adds_epi16(x1[0], x1[15]); + x2[15] = _mm_subs_epi16(x1[0], x1[15]); + x2[1] = _mm_adds_epi16(x1[1], x1[14]); + x2[14] = _mm_subs_epi16(x1[1], x1[14]); + x2[2] = _mm_adds_epi16(x1[2], x1[13]); + x2[13] = _mm_subs_epi16(x1[2], x1[13]); + x2[3] = _mm_adds_epi16(x1[3], x1[12]); + x2[12] = _mm_subs_epi16(x1[3], x1[12]); + x2[4] = _mm_adds_epi16(x1[4], x1[11]); + x2[11] = _mm_subs_epi16(x1[4], x1[11]); + x2[5] = _mm_adds_epi16(x1[5], x1[10]); + x2[10] = _mm_subs_epi16(x1[5], x1[10]); + x2[6] = _mm_adds_epi16(x1[6], x1[9]); + x2[9] = _mm_subs_epi16(x1[6], x1[9]); + x2[7] = _mm_adds_epi16(x1[7], x1[8]); + x2[8] = _mm_subs_epi16(x1[7], x1[8]); + x2[16] = x1[16]; + x2[17] = x1[17]; + x2[18] = x1[18]; + x2[19] = x1[19]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[20], x1[27], x2[20], x2[27]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[21], x1[26], x2[21], x2[26]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[22], x1[25], x2[22], x2[25]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[23], x1[24], x2[23], x2[24]); + x2[28] = x1[28]; + x2[29] = x1[29]; + x2[30] = x1[30]; + x2[31] = x1[31]; + + // stage 3 + __m128i x3[32]; + x3[0] = _mm_adds_epi16(x2[0], x2[7]); + x3[7] = _mm_subs_epi16(x2[0], x2[7]); + x3[1] = _mm_adds_epi16(x2[1], 
x2[6]); + x3[6] = _mm_subs_epi16(x2[1], x2[6]); + x3[2] = _mm_adds_epi16(x2[2], x2[5]); + x3[5] = _mm_subs_epi16(x2[2], x2[5]); + x3[3] = _mm_adds_epi16(x2[3], x2[4]); + x3[4] = _mm_subs_epi16(x2[3], x2[4]); + x3[8] = x2[8]; + x3[9] = x2[9]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[10], x2[13], x3[10], x3[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[11], x2[12], x3[11], x3[12]); + x3[14] = x2[14]; + x3[15] = x2[15]; + x3[16] = _mm_adds_epi16(x2[16], x2[23]); + x3[23] = _mm_subs_epi16(x2[16], x2[23]); + x3[17] = _mm_adds_epi16(x2[17], x2[22]); + x3[22] = _mm_subs_epi16(x2[17], x2[22]); + x3[18] = _mm_adds_epi16(x2[18], x2[21]); + x3[21] = _mm_subs_epi16(x2[18], x2[21]); + x3[19] = _mm_adds_epi16(x2[19], x2[20]); + x3[20] = _mm_subs_epi16(x2[19], x2[20]); + x3[24] = _mm_subs_epi16(x2[31], x2[24]); + x3[31] = _mm_adds_epi16(x2[31], x2[24]); + x3[25] = _mm_subs_epi16(x2[30], x2[25]); + x3[30] = _mm_adds_epi16(x2[30], x2[25]); + x3[26] = _mm_subs_epi16(x2[29], x2[26]); + x3[29] = _mm_adds_epi16(x2[29], x2[26]); + x3[27] = _mm_subs_epi16(x2[28], x2[27]); + x3[28] = _mm_adds_epi16(x2[28], x2[27]); + + // stage 4 + __m128i x4[32]; + x4[0] = _mm_adds_epi16(x3[0], x3[3]); + x4[3] = _mm_subs_epi16(x3[0], x3[3]); + x4[1] = _mm_adds_epi16(x3[1], x3[2]); + x4[2] = _mm_subs_epi16(x3[1], x3[2]); + x4[4] = x3[4]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[5], x3[6], x4[5], x4[6]); + x4[7] = x3[7]; + x4[8] = _mm_adds_epi16(x3[8], x3[11]); + x4[11] = _mm_subs_epi16(x3[8], x3[11]); + x4[9] = _mm_adds_epi16(x3[9], x3[10]); + x4[10] = _mm_subs_epi16(x3[9], x3[10]); + x4[12] = _mm_subs_epi16(x3[15], x3[12]); + x4[15] = _mm_adds_epi16(x3[15], x3[12]); + x4[13] = _mm_subs_epi16(x3[14], x3[13]); + x4[14] = _mm_adds_epi16(x3[14], x3[13]); + x4[16] = x3[16]; + x4[17] = x3[17]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[18], x3[29], x4[18], x4[29]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[19], x3[28], x4[19], x4[28]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[20], x3[27], x4[20], x4[27]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[21], x3[26], x4[21], x4[26]); + x4[22] = x3[22]; + x4[23] = x3[23]; + x4[24] = x3[24]; + x4[25] = x3[25]; + x4[30] = x3[30]; + x4[31] = x3[31]; + + // stage 5 + __m128i x5[32]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x4[0], x4[1], x5[0], x5[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x4[2], x4[3], x5[2], x5[3]); + x5[4] = _mm_adds_epi16(x4[4], x4[5]); + x5[5] = _mm_subs_epi16(x4[4], x4[5]); + x5[6] = _mm_subs_epi16(x4[7], x4[6]); + x5[7] = _mm_adds_epi16(x4[7], x4[6]); + x5[8] = x4[8]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[9], x4[14], x5[9], x5[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[10], x4[13], x5[10], x5[13]); + x5[11] = x4[11]; + x5[12] = x4[12]; + x5[15] = x4[15]; + x5[16] = _mm_adds_epi16(x4[16], x4[19]); + x5[19] = _mm_subs_epi16(x4[16], x4[19]); + x5[17] = _mm_adds_epi16(x4[17], x4[18]); + x5[18] = _mm_subs_epi16(x4[17], x4[18]); + x5[20] = _mm_subs_epi16(x4[23], x4[20]); + x5[23] = _mm_adds_epi16(x4[23], x4[20]); + x5[21] = _mm_subs_epi16(x4[22], x4[21]); + x5[22] = _mm_adds_epi16(x4[22], x4[21]); + x5[24] = _mm_adds_epi16(x4[24], x4[27]); + x5[27] = _mm_subs_epi16(x4[24], x4[27]); + x5[25] = _mm_adds_epi16(x4[25], x4[26]); + x5[26] = _mm_subs_epi16(x4[25], x4[26]); + x5[28] = _mm_subs_epi16(x4[31], x4[28]); + x5[31] = _mm_adds_epi16(x4[31], x4[28]); + x5[29] = _mm_subs_epi16(x4[30], x4[29]); + x5[30] = _mm_adds_epi16(x4[30], x4[29]); + + // stage 6 + __m128i x6[32]; + x6[0] = x5[0]; + x6[1] = x5[1]; + x6[2] = 
x5[2]; + x6[3] = x5[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x5[4], x5[7], x6[4], x6[7]); + btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x5[5], x5[6], x6[5], x6[6]); + x6[8] = _mm_adds_epi16(x5[8], x5[9]); + x6[9] = _mm_subs_epi16(x5[8], x5[9]); + x6[10] = _mm_subs_epi16(x5[11], x5[10]); + x6[11] = _mm_adds_epi16(x5[11], x5[10]); + x6[12] = _mm_adds_epi16(x5[12], x5[13]); + x6[13] = _mm_subs_epi16(x5[12], x5[13]); + x6[14] = _mm_subs_epi16(x5[15], x5[14]); + x6[15] = _mm_adds_epi16(x5[15], x5[14]); + x6[16] = x5[16]; + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[17], x5[30], x6[17], x6[30]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[18], x5[29], x6[18], x6[29]); + x6[19] = x5[19]; + x6[20] = x5[20]; + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[21], x5[26], x6[21], x6[26]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[22], x5[25], x6[22], x6[25]); + x6[23] = x5[23]; + x6[24] = x5[24]; + x6[27] = x5[27]; + x6[28] = x5[28]; + x6[31] = x5[31]; + + // stage 7 + __m128i x7[32]; + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + x7[4] = x6[4]; + x7[5] = x6[5]; + x7[6] = x6[6]; + x7[7] = x6[7]; + btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x6[8], x6[15], x7[8], x7[15]); + btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x6[9], x6[14], x7[9], x7[14]); + btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x6[10], x6[13], x7[10], x7[13]); + btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x6[11], x6[12], x7[11], x7[12]); + x7[16] = _mm_adds_epi16(x6[16], x6[17]); + x7[17] = _mm_subs_epi16(x6[16], x6[17]); + x7[18] = _mm_subs_epi16(x6[19], x6[18]); + x7[19] = _mm_adds_epi16(x6[19], x6[18]); + x7[20] = _mm_adds_epi16(x6[20], x6[21]); + x7[21] = _mm_subs_epi16(x6[20], x6[21]); + x7[22] = _mm_subs_epi16(x6[23], x6[22]); + x7[23] = _mm_adds_epi16(x6[23], x6[22]); + x7[24] = _mm_adds_epi16(x6[24], x6[25]); + x7[25] = _mm_subs_epi16(x6[24], x6[25]); + x7[26] = _mm_subs_epi16(x6[27], x6[26]); + x7[27] = _mm_adds_epi16(x6[27], x6[26]); + x7[28] = _mm_adds_epi16(x6[28], x6[29]); + x7[29] = _mm_subs_epi16(x6[28], x6[29]); + x7[30] = _mm_subs_epi16(x6[31], x6[30]); + x7[31] = _mm_adds_epi16(x6[31], x6[30]); + + // stage 8 + __m128i x8[32]; + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + x8[8] = x7[8]; + x8[9] = x7[9]; + x8[10] = x7[10]; + x8[11] = x7[11]; + x8[12] = x7[12]; + x8[13] = x7[13]; + x8[14] = x7[14]; + x8[15] = x7[15]; + btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x7[16], x7[31], x8[16], x8[31]); + btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x7[17], x7[30], x8[17], x8[30]); + btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x7[18], x7[29], x8[18], x8[29]); + btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x7[19], x7[28], x8[19], x8[28]); + btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x7[20], x7[27], x8[20], x8[27]); + btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x7[21], x7[26], x8[21], x8[26]); + btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x7[22], x7[25], x8[22], x8[25]); + btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x7[23], x7[24], x8[23], x8[24]); + + // stage 9 + output[0] = x8[0]; + output[1] = x8[16]; + output[2] = x8[8]; + output[3] = x8[24]; + output[4] = x8[4]; + output[5] = x8[20]; + output[6] = x8[12]; + output[7] = x8[28]; + output[8] = x8[2]; + output[9] = x8[18]; + output[10] = x8[10]; + output[11] = x8[26]; + output[12] = x8[6]; + output[13] = x8[22]; + output[14] = x8[14]; + output[15] = x8[30]; + output[16] = x8[1]; + output[17] = x8[17]; + output[18] = x8[9]; + output[19] = x8[25]; + output[20] = x8[5]; + output[21] 
= x8[21]; + output[22] = x8[13]; + output[23] = x8[29]; + output[24] = x8[3]; + output[25] = x8[19]; + output[26] = x8[11]; + output[27] = x8[27]; + output[28] = x8[7]; + output[29] = x8[23]; + output[30] = x8[15]; + output[31] = x8[31]; +} + +void fdct8x64_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]); + __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]); + __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]); + __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]); + __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]); + __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]); + __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]); + __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]); + __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]); + __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]); + __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]); + __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]); + __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]); + __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]); + __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]); + __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]); + __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]); + __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]); + __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]); + __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]); + __m128i cospi_p63_p01 = pair_set_epi16(cospi[63], cospi[1]); + __m128i cospi_m01_p63 = pair_set_epi16(-cospi[1], cospi[63]); + __m128i cospi_p31_p33 = pair_set_epi16(cospi[31], cospi[33]); + __m128i cospi_m33_p31 = pair_set_epi16(-cospi[33], cospi[31]); + __m128i cospi_p47_p17 = pair_set_epi16(cospi[47], cospi[17]); + __m128i cospi_m17_p47 = pair_set_epi16(-cospi[17], cospi[47]); + __m128i cospi_p15_p49 = pair_set_epi16(cospi[15], cospi[49]); + __m128i cospi_m49_p15 = pair_set_epi16(-cospi[49], cospi[15]); + __m128i cospi_p55_p09 
= pair_set_epi16(cospi[55], cospi[9]); + __m128i cospi_m09_p55 = pair_set_epi16(-cospi[9], cospi[55]); + __m128i cospi_p23_p41 = pair_set_epi16(cospi[23], cospi[41]); + __m128i cospi_m41_p23 = pair_set_epi16(-cospi[41], cospi[23]); + __m128i cospi_p39_p25 = pair_set_epi16(cospi[39], cospi[25]); + __m128i cospi_m25_p39 = pair_set_epi16(-cospi[25], cospi[39]); + __m128i cospi_p07_p57 = pair_set_epi16(cospi[7], cospi[57]); + __m128i cospi_m57_p07 = pair_set_epi16(-cospi[57], cospi[7]); + __m128i cospi_p59_p05 = pair_set_epi16(cospi[59], cospi[5]); + __m128i cospi_m05_p59 = pair_set_epi16(-cospi[5], cospi[59]); + __m128i cospi_p27_p37 = pair_set_epi16(cospi[27], cospi[37]); + __m128i cospi_m37_p27 = pair_set_epi16(-cospi[37], cospi[27]); + __m128i cospi_p43_p21 = pair_set_epi16(cospi[43], cospi[21]); + __m128i cospi_m21_p43 = pair_set_epi16(-cospi[21], cospi[43]); + __m128i cospi_p11_p53 = pair_set_epi16(cospi[11], cospi[53]); + __m128i cospi_m53_p11 = pair_set_epi16(-cospi[53], cospi[11]); + __m128i cospi_p51_p13 = pair_set_epi16(cospi[51], cospi[13]); + __m128i cospi_m13_p51 = pair_set_epi16(-cospi[13], cospi[51]); + __m128i cospi_p19_p45 = pair_set_epi16(cospi[19], cospi[45]); + __m128i cospi_m45_p19 = pair_set_epi16(-cospi[45], cospi[19]); + __m128i cospi_p35_p29 = pair_set_epi16(cospi[35], cospi[29]); + __m128i cospi_m29_p35 = pair_set_epi16(-cospi[29], cospi[35]); + __m128i cospi_p03_p61 = pair_set_epi16(cospi[3], cospi[61]); + __m128i cospi_m61_p03 = pair_set_epi16(-cospi[61], cospi[3]); + + // stage 1 + __m128i x1[64]; + x1[0] = _mm_adds_epi16(input[0], input[63]); + x1[63] = _mm_subs_epi16(input[0], input[63]); + x1[1] = _mm_adds_epi16(input[1], input[62]); + x1[62] = _mm_subs_epi16(input[1], input[62]); + x1[2] = _mm_adds_epi16(input[2], input[61]); + x1[61] = _mm_subs_epi16(input[2], input[61]); + x1[3] = _mm_adds_epi16(input[3], input[60]); + x1[60] = _mm_subs_epi16(input[3], input[60]); + x1[4] = _mm_adds_epi16(input[4], input[59]); + x1[59] = _mm_subs_epi16(input[4], input[59]); + x1[5] = _mm_adds_epi16(input[5], input[58]); + x1[58] = _mm_subs_epi16(input[5], input[58]); + x1[6] = _mm_adds_epi16(input[6], input[57]); + x1[57] = _mm_subs_epi16(input[6], input[57]); + x1[7] = _mm_adds_epi16(input[7], input[56]); + x1[56] = _mm_subs_epi16(input[7], input[56]); + x1[8] = _mm_adds_epi16(input[8], input[55]); + x1[55] = _mm_subs_epi16(input[8], input[55]); + x1[9] = _mm_adds_epi16(input[9], input[54]); + x1[54] = _mm_subs_epi16(input[9], input[54]); + x1[10] = _mm_adds_epi16(input[10], input[53]); + x1[53] = _mm_subs_epi16(input[10], input[53]); + x1[11] = _mm_adds_epi16(input[11], input[52]); + x1[52] = _mm_subs_epi16(input[11], input[52]); + x1[12] = _mm_adds_epi16(input[12], input[51]); + x1[51] = _mm_subs_epi16(input[12], input[51]); + x1[13] = _mm_adds_epi16(input[13], input[50]); + x1[50] = _mm_subs_epi16(input[13], input[50]); + x1[14] = _mm_adds_epi16(input[14], input[49]); + x1[49] = _mm_subs_epi16(input[14], input[49]); + x1[15] = _mm_adds_epi16(input[15], input[48]); + x1[48] = _mm_subs_epi16(input[15], input[48]); + x1[16] = _mm_adds_epi16(input[16], input[47]); + x1[47] = _mm_subs_epi16(input[16], input[47]); + x1[17] = _mm_adds_epi16(input[17], input[46]); + x1[46] = _mm_subs_epi16(input[17], input[46]); + x1[18] = _mm_adds_epi16(input[18], input[45]); + x1[45] = _mm_subs_epi16(input[18], input[45]); + x1[19] = _mm_adds_epi16(input[19], input[44]); + x1[44] = _mm_subs_epi16(input[19], input[44]); + x1[20] = _mm_adds_epi16(input[20], input[43]); + x1[43] = 
_mm_subs_epi16(input[20], input[43]); + x1[21] = _mm_adds_epi16(input[21], input[42]); + x1[42] = _mm_subs_epi16(input[21], input[42]); + x1[22] = _mm_adds_epi16(input[22], input[41]); + x1[41] = _mm_subs_epi16(input[22], input[41]); + x1[23] = _mm_adds_epi16(input[23], input[40]); + x1[40] = _mm_subs_epi16(input[23], input[40]); + x1[24] = _mm_adds_epi16(input[24], input[39]); + x1[39] = _mm_subs_epi16(input[24], input[39]); + x1[25] = _mm_adds_epi16(input[25], input[38]); + x1[38] = _mm_subs_epi16(input[25], input[38]); + x1[26] = _mm_adds_epi16(input[26], input[37]); + x1[37] = _mm_subs_epi16(input[26], input[37]); + x1[27] = _mm_adds_epi16(input[27], input[36]); + x1[36] = _mm_subs_epi16(input[27], input[36]); + x1[28] = _mm_adds_epi16(input[28], input[35]); + x1[35] = _mm_subs_epi16(input[28], input[35]); + x1[29] = _mm_adds_epi16(input[29], input[34]); + x1[34] = _mm_subs_epi16(input[29], input[34]); + x1[30] = _mm_adds_epi16(input[30], input[33]); + x1[33] = _mm_subs_epi16(input[30], input[33]); + x1[31] = _mm_adds_epi16(input[31], input[32]); + x1[32] = _mm_subs_epi16(input[31], input[32]); + + // stage 2 + __m128i x2[64]; + x2[0] = _mm_adds_epi16(x1[0], x1[31]); + x2[31] = _mm_subs_epi16(x1[0], x1[31]); + x2[1] = _mm_adds_epi16(x1[1], x1[30]); + x2[30] = _mm_subs_epi16(x1[1], x1[30]); + x2[2] = _mm_adds_epi16(x1[2], x1[29]); + x2[29] = _mm_subs_epi16(x1[2], x1[29]); + x2[3] = _mm_adds_epi16(x1[3], x1[28]); + x2[28] = _mm_subs_epi16(x1[3], x1[28]); + x2[4] = _mm_adds_epi16(x1[4], x1[27]); + x2[27] = _mm_subs_epi16(x1[4], x1[27]); + x2[5] = _mm_adds_epi16(x1[5], x1[26]); + x2[26] = _mm_subs_epi16(x1[5], x1[26]); + x2[6] = _mm_adds_epi16(x1[6], x1[25]); + x2[25] = _mm_subs_epi16(x1[6], x1[25]); + x2[7] = _mm_adds_epi16(x1[7], x1[24]); + x2[24] = _mm_subs_epi16(x1[7], x1[24]); + x2[8] = _mm_adds_epi16(x1[8], x1[23]); + x2[23] = _mm_subs_epi16(x1[8], x1[23]); + x2[9] = _mm_adds_epi16(x1[9], x1[22]); + x2[22] = _mm_subs_epi16(x1[9], x1[22]); + x2[10] = _mm_adds_epi16(x1[10], x1[21]); + x2[21] = _mm_subs_epi16(x1[10], x1[21]); + x2[11] = _mm_adds_epi16(x1[11], x1[20]); + x2[20] = _mm_subs_epi16(x1[11], x1[20]); + x2[12] = _mm_adds_epi16(x1[12], x1[19]); + x2[19] = _mm_subs_epi16(x1[12], x1[19]); + x2[13] = _mm_adds_epi16(x1[13], x1[18]); + x2[18] = _mm_subs_epi16(x1[13], x1[18]); + x2[14] = _mm_adds_epi16(x1[14], x1[17]); + x2[17] = _mm_subs_epi16(x1[14], x1[17]); + x2[15] = _mm_adds_epi16(x1[15], x1[16]); + x2[16] = _mm_subs_epi16(x1[15], x1[16]); + x2[32] = x1[32]; + x2[33] = x1[33]; + x2[34] = x1[34]; + x2[35] = x1[35]; + x2[36] = x1[36]; + x2[37] = x1[37]; + x2[38] = x1[38]; + x2[39] = x1[39]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[40], x1[55], x2[40], x2[55]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[41], x1[54], x2[41], x2[54]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[42], x1[53], x2[42], x2[53]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[43], x1[52], x2[43], x2[52]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[44], x1[51], x2[44], x2[51]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[45], x1[50], x2[45], x2[50]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[46], x1[49], x2[46], x2[49]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[47], x1[48], x2[47], x2[48]); + x2[56] = x1[56]; + x2[57] = x1[57]; + x2[58] = x1[58]; + x2[59] = x1[59]; + x2[60] = x1[60]; + x2[61] = x1[61]; + x2[62] = x1[62]; + x2[63] = x1[63]; + + // stage 3 + __m128i x3[64]; + x3[0] = _mm_adds_epi16(x2[0], x2[15]); + x3[15] = _mm_subs_epi16(x2[0], x2[15]); + x3[1] = 
_mm_adds_epi16(x2[1], x2[14]); + x3[14] = _mm_subs_epi16(x2[1], x2[14]); + x3[2] = _mm_adds_epi16(x2[2], x2[13]); + x3[13] = _mm_subs_epi16(x2[2], x2[13]); + x3[3] = _mm_adds_epi16(x2[3], x2[12]); + x3[12] = _mm_subs_epi16(x2[3], x2[12]); + x3[4] = _mm_adds_epi16(x2[4], x2[11]); + x3[11] = _mm_subs_epi16(x2[4], x2[11]); + x3[5] = _mm_adds_epi16(x2[5], x2[10]); + x3[10] = _mm_subs_epi16(x2[5], x2[10]); + x3[6] = _mm_adds_epi16(x2[6], x2[9]); + x3[9] = _mm_subs_epi16(x2[6], x2[9]); + x3[7] = _mm_adds_epi16(x2[7], x2[8]); + x3[8] = _mm_subs_epi16(x2[7], x2[8]); + x3[16] = x2[16]; + x3[17] = x2[17]; + x3[18] = x2[18]; + x3[19] = x2[19]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[20], x2[27], x3[20], x3[27]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[21], x2[26], x3[21], x3[26]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[22], x2[25], x3[22], x3[25]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[23], x2[24], x3[23], x3[24]); + x3[28] = x2[28]; + x3[29] = x2[29]; + x3[30] = x2[30]; + x3[31] = x2[31]; + x3[32] = _mm_adds_epi16(x2[32], x2[47]); + x3[47] = _mm_subs_epi16(x2[32], x2[47]); + x3[33] = _mm_adds_epi16(x2[33], x2[46]); + x3[46] = _mm_subs_epi16(x2[33], x2[46]); + x3[34] = _mm_adds_epi16(x2[34], x2[45]); + x3[45] = _mm_subs_epi16(x2[34], x2[45]); + x3[35] = _mm_adds_epi16(x2[35], x2[44]); + x3[44] = _mm_subs_epi16(x2[35], x2[44]); + x3[36] = _mm_adds_epi16(x2[36], x2[43]); + x3[43] = _mm_subs_epi16(x2[36], x2[43]); + x3[37] = _mm_adds_epi16(x2[37], x2[42]); + x3[42] = _mm_subs_epi16(x2[37], x2[42]); + x3[38] = _mm_adds_epi16(x2[38], x2[41]); + x3[41] = _mm_subs_epi16(x2[38], x2[41]); + x3[39] = _mm_adds_epi16(x2[39], x2[40]); + x3[40] = _mm_subs_epi16(x2[39], x2[40]); + x3[48] = _mm_subs_epi16(x2[63], x2[48]); + x3[63] = _mm_adds_epi16(x2[63], x2[48]); + x3[49] = _mm_subs_epi16(x2[62], x2[49]); + x3[62] = _mm_adds_epi16(x2[62], x2[49]); + x3[50] = _mm_subs_epi16(x2[61], x2[50]); + x3[61] = _mm_adds_epi16(x2[61], x2[50]); + x3[51] = _mm_subs_epi16(x2[60], x2[51]); + x3[60] = _mm_adds_epi16(x2[60], x2[51]); + x3[52] = _mm_subs_epi16(x2[59], x2[52]); + x3[59] = _mm_adds_epi16(x2[59], x2[52]); + x3[53] = _mm_subs_epi16(x2[58], x2[53]); + x3[58] = _mm_adds_epi16(x2[58], x2[53]); + x3[54] = _mm_subs_epi16(x2[57], x2[54]); + x3[57] = _mm_adds_epi16(x2[57], x2[54]); + x3[55] = _mm_subs_epi16(x2[56], x2[55]); + x3[56] = _mm_adds_epi16(x2[56], x2[55]); + + // stage 4 + __m128i x4[64]; + x4[0] = _mm_adds_epi16(x3[0], x3[7]); + x4[7] = _mm_subs_epi16(x3[0], x3[7]); + x4[1] = _mm_adds_epi16(x3[1], x3[6]); + x4[6] = _mm_subs_epi16(x3[1], x3[6]); + x4[2] = _mm_adds_epi16(x3[2], x3[5]); + x4[5] = _mm_subs_epi16(x3[2], x3[5]); + x4[3] = _mm_adds_epi16(x3[3], x3[4]); + x4[4] = _mm_subs_epi16(x3[3], x3[4]); + x4[8] = x3[8]; + x4[9] = x3[9]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[10], x3[13], x4[10], x4[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[11], x3[12], x4[11], x4[12]); + x4[14] = x3[14]; + x4[15] = x3[15]; + x4[16] = _mm_adds_epi16(x3[16], x3[23]); + x4[23] = _mm_subs_epi16(x3[16], x3[23]); + x4[17] = _mm_adds_epi16(x3[17], x3[22]); + x4[22] = _mm_subs_epi16(x3[17], x3[22]); + x4[18] = _mm_adds_epi16(x3[18], x3[21]); + x4[21] = _mm_subs_epi16(x3[18], x3[21]); + x4[19] = _mm_adds_epi16(x3[19], x3[20]); + x4[20] = _mm_subs_epi16(x3[19], x3[20]); + x4[24] = _mm_subs_epi16(x3[31], x3[24]); + x4[31] = _mm_adds_epi16(x3[31], x3[24]); + x4[25] = _mm_subs_epi16(x3[30], x3[25]); + x4[30] = _mm_adds_epi16(x3[30], x3[25]); + x4[26] = _mm_subs_epi16(x3[29], x3[26]); + x4[29] = 
_mm_adds_epi16(x3[29], x3[26]); + x4[27] = _mm_subs_epi16(x3[28], x3[27]); + x4[28] = _mm_adds_epi16(x3[28], x3[27]); + x4[32] = x3[32]; + x4[33] = x3[33]; + x4[34] = x3[34]; + x4[35] = x3[35]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[36], x3[59], x4[36], x4[59]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[37], x3[58], x4[37], x4[58]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[38], x3[57], x4[38], x4[57]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[39], x3[56], x4[39], x4[56]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[40], x3[55], x4[40], x4[55]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[41], x3[54], x4[41], x4[54]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[42], x3[53], x4[42], x4[53]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[43], x3[52], x4[43], x4[52]); + x4[44] = x3[44]; + x4[45] = x3[45]; + x4[46] = x3[46]; + x4[47] = x3[47]; + x4[48] = x3[48]; + x4[49] = x3[49]; + x4[50] = x3[50]; + x4[51] = x3[51]; + x4[60] = x3[60]; + x4[61] = x3[61]; + x4[62] = x3[62]; + x4[63] = x3[63]; + + // stage 5 + __m128i x5[64]; + x5[0] = _mm_adds_epi16(x4[0], x4[3]); + x5[3] = _mm_subs_epi16(x4[0], x4[3]); + x5[1] = _mm_adds_epi16(x4[1], x4[2]); + x5[2] = _mm_subs_epi16(x4[1], x4[2]); + x5[4] = x4[4]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x4[5], x4[6], x5[5], x5[6]); + x5[7] = x4[7]; + x5[8] = _mm_adds_epi16(x4[8], x4[11]); + x5[11] = _mm_subs_epi16(x4[8], x4[11]); + x5[9] = _mm_adds_epi16(x4[9], x4[10]); + x5[10] = _mm_subs_epi16(x4[9], x4[10]); + x5[12] = _mm_subs_epi16(x4[15], x4[12]); + x5[15] = _mm_adds_epi16(x4[15], x4[12]); + x5[13] = _mm_subs_epi16(x4[14], x4[13]); + x5[14] = _mm_adds_epi16(x4[14], x4[13]); + x5[16] = x4[16]; + x5[17] = x4[17]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[18], x4[29], x5[18], x5[29]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[19], x4[28], x5[19], x5[28]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[20], x4[27], x5[20], x5[27]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[21], x4[26], x5[21], x5[26]); + x5[22] = x4[22]; + x5[23] = x4[23]; + x5[24] = x4[24]; + x5[25] = x4[25]; + x5[30] = x4[30]; + x5[31] = x4[31]; + x5[32] = _mm_adds_epi16(x4[32], x4[39]); + x5[39] = _mm_subs_epi16(x4[32], x4[39]); + x5[33] = _mm_adds_epi16(x4[33], x4[38]); + x5[38] = _mm_subs_epi16(x4[33], x4[38]); + x5[34] = _mm_adds_epi16(x4[34], x4[37]); + x5[37] = _mm_subs_epi16(x4[34], x4[37]); + x5[35] = _mm_adds_epi16(x4[35], x4[36]); + x5[36] = _mm_subs_epi16(x4[35], x4[36]); + x5[40] = _mm_subs_epi16(x4[47], x4[40]); + x5[47] = _mm_adds_epi16(x4[47], x4[40]); + x5[41] = _mm_subs_epi16(x4[46], x4[41]); + x5[46] = _mm_adds_epi16(x4[46], x4[41]); + x5[42] = _mm_subs_epi16(x4[45], x4[42]); + x5[45] = _mm_adds_epi16(x4[45], x4[42]); + x5[43] = _mm_subs_epi16(x4[44], x4[43]); + x5[44] = _mm_adds_epi16(x4[44], x4[43]); + x5[48] = _mm_adds_epi16(x4[48], x4[55]); + x5[55] = _mm_subs_epi16(x4[48], x4[55]); + x5[49] = _mm_adds_epi16(x4[49], x4[54]); + x5[54] = _mm_subs_epi16(x4[49], x4[54]); + x5[50] = _mm_adds_epi16(x4[50], x4[53]); + x5[53] = _mm_subs_epi16(x4[50], x4[53]); + x5[51] = _mm_adds_epi16(x4[51], x4[52]); + x5[52] = _mm_subs_epi16(x4[51], x4[52]); + x5[56] = _mm_subs_epi16(x4[63], x4[56]); + x5[63] = _mm_adds_epi16(x4[63], x4[56]); + x5[57] = _mm_subs_epi16(x4[62], x4[57]); + x5[62] = _mm_adds_epi16(x4[62], x4[57]); + x5[58] = _mm_subs_epi16(x4[61], x4[58]); + x5[61] = _mm_adds_epi16(x4[61], x4[58]); + x5[59] = _mm_subs_epi16(x4[60], x4[59]); + x5[60] = _mm_adds_epi16(x4[60], x4[59]); + + // stage 6 + __m128i x6[64]; + 
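// Each btf_16_sse2(w0, w1, in0, in1, out0, out1) call below is a 16-bit
+  // butterfly rotation: in effect out0 = (in0 * w0.A + in1 * w0.B +
+  // rounding) >> cos_bit and out1 = (in0 * w1.A + in1 * w1.B + rounding) >>
+  // cos_bit, where (A, B) are the cosine pairs built with pair_set_epi16.
+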
btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x5[0], x5[1], x6[0], x6[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x5[2], x5[3], x6[2], x6[3]); + x6[4] = _mm_adds_epi16(x5[4], x5[5]); + x6[5] = _mm_subs_epi16(x5[4], x5[5]); + x6[6] = _mm_subs_epi16(x5[7], x5[6]); + x6[7] = _mm_adds_epi16(x5[7], x5[6]); + x6[8] = x5[8]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x5[9], x5[14], x6[9], x6[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x5[10], x5[13], x6[10], x6[13]); + x6[11] = x5[11]; + x6[12] = x5[12]; + x6[15] = x5[15]; + x6[16] = _mm_adds_epi16(x5[16], x5[19]); + x6[19] = _mm_subs_epi16(x5[16], x5[19]); + x6[17] = _mm_adds_epi16(x5[17], x5[18]); + x6[18] = _mm_subs_epi16(x5[17], x5[18]); + x6[20] = _mm_subs_epi16(x5[23], x5[20]); + x6[23] = _mm_adds_epi16(x5[23], x5[20]); + x6[21] = _mm_subs_epi16(x5[22], x5[21]); + x6[22] = _mm_adds_epi16(x5[22], x5[21]); + x6[24] = _mm_adds_epi16(x5[24], x5[27]); + x6[27] = _mm_subs_epi16(x5[24], x5[27]); + x6[25] = _mm_adds_epi16(x5[25], x5[26]); + x6[26] = _mm_subs_epi16(x5[25], x5[26]); + x6[28] = _mm_subs_epi16(x5[31], x5[28]); + x6[31] = _mm_adds_epi16(x5[31], x5[28]); + x6[29] = _mm_subs_epi16(x5[30], x5[29]); + x6[30] = _mm_adds_epi16(x5[30], x5[29]); + x6[32] = x5[32]; + x6[33] = x5[33]; + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[34], x5[61], x6[34], x6[61]); + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[35], x5[60], x6[35], x6[60]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[36], x5[59], x6[36], x6[59]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[37], x5[58], x6[37], x6[58]); + x6[38] = x5[38]; + x6[39] = x5[39]; + x6[40] = x5[40]; + x6[41] = x5[41]; + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[42], x5[53], x6[42], x6[53]); + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[43], x5[52], x6[43], x6[52]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[44], x5[51], x6[44], x6[51]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[45], x5[50], x6[45], x6[50]); + x6[46] = x5[46]; + x6[47] = x5[47]; + x6[48] = x5[48]; + x6[49] = x5[49]; + x6[54] = x5[54]; + x6[55] = x5[55]; + x6[56] = x5[56]; + x6[57] = x5[57]; + x6[62] = x5[62]; + x6[63] = x5[63]; + + // stage 7 + __m128i x7[64]; + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x6[4], x6[7], x7[4], x7[7]); + btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x6[5], x6[6], x7[5], x7[6]); + x7[8] = _mm_adds_epi16(x6[8], x6[9]); + x7[9] = _mm_subs_epi16(x6[8], x6[9]); + x7[10] = _mm_subs_epi16(x6[11], x6[10]); + x7[11] = _mm_adds_epi16(x6[11], x6[10]); + x7[12] = _mm_adds_epi16(x6[12], x6[13]); + x7[13] = _mm_subs_epi16(x6[12], x6[13]); + x7[14] = _mm_subs_epi16(x6[15], x6[14]); + x7[15] = _mm_adds_epi16(x6[15], x6[14]); + x7[16] = x6[16]; + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x6[17], x6[30], x7[17], x7[30]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x6[18], x6[29], x7[18], x7[29]); + x7[19] = x6[19]; + x7[20] = x6[20]; + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x6[21], x6[26], x7[21], x7[26]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x6[22], x6[25], x7[22], x7[25]); + x7[23] = x6[23]; + x7[24] = x6[24]; + x7[27] = x6[27]; + x7[28] = x6[28]; + x7[31] = x6[31]; + x7[32] = _mm_adds_epi16(x6[32], x6[35]); + x7[35] = _mm_subs_epi16(x6[32], x6[35]); + x7[33] = _mm_adds_epi16(x6[33], x6[34]); + x7[34] = _mm_subs_epi16(x6[33], x6[34]); + x7[36] = _mm_subs_epi16(x6[39], x6[36]); + x7[39] = _mm_adds_epi16(x6[39], x6[36]); + x7[37] = _mm_subs_epi16(x6[38], x6[37]); + x7[38] = _mm_adds_epi16(x6[38], x6[37]); + x7[40] = 
_mm_adds_epi16(x6[40], x6[43]); + x7[43] = _mm_subs_epi16(x6[40], x6[43]); + x7[41] = _mm_adds_epi16(x6[41], x6[42]); + x7[42] = _mm_subs_epi16(x6[41], x6[42]); + x7[44] = _mm_subs_epi16(x6[47], x6[44]); + x7[47] = _mm_adds_epi16(x6[47], x6[44]); + x7[45] = _mm_subs_epi16(x6[46], x6[45]); + x7[46] = _mm_adds_epi16(x6[46], x6[45]); + x7[48] = _mm_adds_epi16(x6[48], x6[51]); + x7[51] = _mm_subs_epi16(x6[48], x6[51]); + x7[49] = _mm_adds_epi16(x6[49], x6[50]); + x7[50] = _mm_subs_epi16(x6[49], x6[50]); + x7[52] = _mm_subs_epi16(x6[55], x6[52]); + x7[55] = _mm_adds_epi16(x6[55], x6[52]); + x7[53] = _mm_subs_epi16(x6[54], x6[53]); + x7[54] = _mm_adds_epi16(x6[54], x6[53]); + x7[56] = _mm_adds_epi16(x6[56], x6[59]); + x7[59] = _mm_subs_epi16(x6[56], x6[59]); + x7[57] = _mm_adds_epi16(x6[57], x6[58]); + x7[58] = _mm_subs_epi16(x6[57], x6[58]); + x7[60] = _mm_subs_epi16(x6[63], x6[60]); + x7[63] = _mm_adds_epi16(x6[63], x6[60]); + x7[61] = _mm_subs_epi16(x6[62], x6[61]); + x7[62] = _mm_adds_epi16(x6[62], x6[61]); + + // stage 8 + __m128i x8[64]; + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x7[8], x7[15], x8[8], x8[15]); + btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x7[9], x7[14], x8[9], x8[14]); + btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x7[10], x7[13], x8[10], x8[13]); + btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x7[11], x7[12], x8[11], x8[12]); + x8[16] = _mm_adds_epi16(x7[16], x7[17]); + x8[17] = _mm_subs_epi16(x7[16], x7[17]); + x8[18] = _mm_subs_epi16(x7[19], x7[18]); + x8[19] = _mm_adds_epi16(x7[19], x7[18]); + x8[20] = _mm_adds_epi16(x7[20], x7[21]); + x8[21] = _mm_subs_epi16(x7[20], x7[21]); + x8[22] = _mm_subs_epi16(x7[23], x7[22]); + x8[23] = _mm_adds_epi16(x7[23], x7[22]); + x8[24] = _mm_adds_epi16(x7[24], x7[25]); + x8[25] = _mm_subs_epi16(x7[24], x7[25]); + x8[26] = _mm_subs_epi16(x7[27], x7[26]); + x8[27] = _mm_adds_epi16(x7[27], x7[26]); + x8[28] = _mm_adds_epi16(x7[28], x7[29]); + x8[29] = _mm_subs_epi16(x7[28], x7[29]); + x8[30] = _mm_subs_epi16(x7[31], x7[30]); + x8[31] = _mm_adds_epi16(x7[31], x7[30]); + x8[32] = x7[32]; + btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x7[33], x7[62], x8[33], x8[62]); + btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x7[34], x7[61], x8[34], x8[61]); + x8[35] = x7[35]; + x8[36] = x7[36]; + btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x7[37], x7[58], x8[37], x8[58]); + btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x7[38], x7[57], x8[38], x8[57]); + x8[39] = x7[39]; + x8[40] = x7[40]; + btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x7[41], x7[54], x8[41], x8[54]); + btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x7[42], x7[53], x8[42], x8[53]); + x8[43] = x7[43]; + x8[44] = x7[44]; + btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x7[45], x7[50], x8[45], x8[50]); + btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x7[46], x7[49], x8[46], x8[49]); + x8[47] = x7[47]; + x8[48] = x7[48]; + x8[51] = x7[51]; + x8[52] = x7[52]; + x8[55] = x7[55]; + x8[56] = x7[56]; + x8[59] = x7[59]; + x8[60] = x7[60]; + x8[63] = x7[63]; + + // stage 9 + __m128i x9[64]; + x9[0] = x8[0]; + x9[1] = x8[1]; + x9[2] = x8[2]; + x9[3] = x8[3]; + x9[4] = x8[4]; + x9[5] = x8[5]; + x9[6] = x8[6]; + x9[7] = x8[7]; + x9[8] = x8[8]; + x9[9] = x8[9]; + x9[10] = x8[10]; + x9[11] = x8[11]; + x9[12] = x8[12]; + x9[13] = x8[13]; + x9[14] = x8[14]; + x9[15] = x8[15]; + btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x8[16], x8[31], x9[16], x9[31]); + btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x8[17], 
x8[30], x9[17], x9[30]); + btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x8[18], x8[29], x9[18], x9[29]); + btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x8[19], x8[28], x9[19], x9[28]); + btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x8[20], x8[27], x9[20], x9[27]); + btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x8[21], x8[26], x9[21], x9[26]); + btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x8[22], x8[25], x9[22], x9[25]); + btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x8[23], x8[24], x9[23], x9[24]); + x9[32] = _mm_adds_epi16(x8[32], x8[33]); + x9[33] = _mm_subs_epi16(x8[32], x8[33]); + x9[34] = _mm_subs_epi16(x8[35], x8[34]); + x9[35] = _mm_adds_epi16(x8[35], x8[34]); + x9[36] = _mm_adds_epi16(x8[36], x8[37]); + x9[37] = _mm_subs_epi16(x8[36], x8[37]); + x9[38] = _mm_subs_epi16(x8[39], x8[38]); + x9[39] = _mm_adds_epi16(x8[39], x8[38]); + x9[40] = _mm_adds_epi16(x8[40], x8[41]); + x9[41] = _mm_subs_epi16(x8[40], x8[41]); + x9[42] = _mm_subs_epi16(x8[43], x8[42]); + x9[43] = _mm_adds_epi16(x8[43], x8[42]); + x9[44] = _mm_adds_epi16(x8[44], x8[45]); + x9[45] = _mm_subs_epi16(x8[44], x8[45]); + x9[46] = _mm_subs_epi16(x8[47], x8[46]); + x9[47] = _mm_adds_epi16(x8[47], x8[46]); + x9[48] = _mm_adds_epi16(x8[48], x8[49]); + x9[49] = _mm_subs_epi16(x8[48], x8[49]); + x9[50] = _mm_subs_epi16(x8[51], x8[50]); + x9[51] = _mm_adds_epi16(x8[51], x8[50]); + x9[52] = _mm_adds_epi16(x8[52], x8[53]); + x9[53] = _mm_subs_epi16(x8[52], x8[53]); + x9[54] = _mm_subs_epi16(x8[55], x8[54]); + x9[55] = _mm_adds_epi16(x8[55], x8[54]); + x9[56] = _mm_adds_epi16(x8[56], x8[57]); + x9[57] = _mm_subs_epi16(x8[56], x8[57]); + x9[58] = _mm_subs_epi16(x8[59], x8[58]); + x9[59] = _mm_adds_epi16(x8[59], x8[58]); + x9[60] = _mm_adds_epi16(x8[60], x8[61]); + x9[61] = _mm_subs_epi16(x8[60], x8[61]); + x9[62] = _mm_subs_epi16(x8[63], x8[62]); + x9[63] = _mm_adds_epi16(x8[63], x8[62]); + + // stage 10 + __m128i x10[64]; + x10[0] = x9[0]; + x10[1] = x9[1]; + x10[2] = x9[2]; + x10[3] = x9[3]; + x10[4] = x9[4]; + x10[5] = x9[5]; + x10[6] = x9[6]; + x10[7] = x9[7]; + x10[8] = x9[8]; + x10[9] = x9[9]; + x10[10] = x9[10]; + x10[11] = x9[11]; + x10[12] = x9[12]; + x10[13] = x9[13]; + x10[14] = x9[14]; + x10[15] = x9[15]; + x10[16] = x9[16]; + x10[17] = x9[17]; + x10[18] = x9[18]; + x10[19] = x9[19]; + x10[20] = x9[20]; + x10[21] = x9[21]; + x10[22] = x9[22]; + x10[23] = x9[23]; + x10[24] = x9[24]; + x10[25] = x9[25]; + x10[26] = x9[26]; + x10[27] = x9[27]; + x10[28] = x9[28]; + x10[29] = x9[29]; + x10[30] = x9[30]; + x10[31] = x9[31]; + btf_16_sse2(cospi_p63_p01, cospi_m01_p63, x9[32], x9[63], x10[32], x10[63]); + btf_16_sse2(cospi_p31_p33, cospi_m33_p31, x9[33], x9[62], x10[33], x10[62]); + btf_16_sse2(cospi_p47_p17, cospi_m17_p47, x9[34], x9[61], x10[34], x10[61]); + btf_16_sse2(cospi_p15_p49, cospi_m49_p15, x9[35], x9[60], x10[35], x10[60]); + btf_16_sse2(cospi_p55_p09, cospi_m09_p55, x9[36], x9[59], x10[36], x10[59]); + btf_16_sse2(cospi_p23_p41, cospi_m41_p23, x9[37], x9[58], x10[37], x10[58]); + btf_16_sse2(cospi_p39_p25, cospi_m25_p39, x9[38], x9[57], x10[38], x10[57]); + btf_16_sse2(cospi_p07_p57, cospi_m57_p07, x9[39], x9[56], x10[39], x10[56]); + btf_16_sse2(cospi_p59_p05, cospi_m05_p59, x9[40], x9[55], x10[40], x10[55]); + btf_16_sse2(cospi_p27_p37, cospi_m37_p27, x9[41], x9[54], x10[41], x10[54]); + btf_16_sse2(cospi_p43_p21, cospi_m21_p43, x9[42], x9[53], x10[42], x10[53]); + btf_16_sse2(cospi_p11_p53, cospi_m53_p11, x9[43], x9[52], x10[43], x10[52]); + btf_16_sse2(cospi_p51_p13, cospi_m13_p51, x9[44], x9[51], x10[44], x10[51]); + 
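// The three rotations below finish stage 10; stage 11 then emits the 64
+  // results in bit-reversed index order (output[k] = x10[bitrev6(k)]), the
+  // natural output ordering of this decimation-in-frequency factorization.
+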
btf_16_sse2(cospi_p19_p45, cospi_m45_p19, x9[45], x9[50], x10[45], x10[50]); + btf_16_sse2(cospi_p35_p29, cospi_m29_p35, x9[46], x9[49], x10[46], x10[49]); + btf_16_sse2(cospi_p03_p61, cospi_m61_p03, x9[47], x9[48], x10[47], x10[48]); + + // stage 11 + output[0] = x10[0]; + output[1] = x10[32]; + output[2] = x10[16]; + output[3] = x10[48]; + output[4] = x10[8]; + output[5] = x10[40]; + output[6] = x10[24]; + output[7] = x10[56]; + output[8] = x10[4]; + output[9] = x10[36]; + output[10] = x10[20]; + output[11] = x10[52]; + output[12] = x10[12]; + output[13] = x10[44]; + output[14] = x10[28]; + output[15] = x10[60]; + output[16] = x10[2]; + output[17] = x10[34]; + output[18] = x10[18]; + output[19] = x10[50]; + output[20] = x10[10]; + output[21] = x10[42]; + output[22] = x10[26]; + output[23] = x10[58]; + output[24] = x10[6]; + output[25] = x10[38]; + output[26] = x10[22]; + output[27] = x10[54]; + output[28] = x10[14]; + output[29] = x10[46]; + output[30] = x10[30]; + output[31] = x10[62]; + output[32] = x10[1]; + output[33] = x10[33]; + output[34] = x10[17]; + output[35] = x10[49]; + output[36] = x10[9]; + output[37] = x10[41]; + output[38] = x10[25]; + output[39] = x10[57]; + output[40] = x10[5]; + output[41] = x10[37]; + output[42] = x10[21]; + output[43] = x10[53]; + output[44] = x10[13]; + output[45] = x10[45]; + output[46] = x10[29]; + output[47] = x10[61]; + output[48] = x10[3]; + output[49] = x10[35]; + output[50] = x10[19]; + output[51] = x10[51]; + output[52] = x10[11]; + output[53] = x10[43]; + output[54] = x10[27]; + output[55] = x10[59]; + output[56] = x10[7]; + output[57] = x10[39]; + output[58] = x10[23]; + output[59] = x10[55]; + output[60] = x10[15]; + output[61] = x10[47]; + output[62] = x10[31]; + output[63] = x10[63]; +} + +static void fadst4x4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *sinpi = sinpi_arr(cos_bit); + const __m128i sinpi_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]); + const __m128i sinpi_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]); + const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]); + const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]); + const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]); + const __m128i __zero = _mm_set1_epi16(0); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + const __m128i in7 = _mm_add_epi16(input[0], input[1]); + __m128i u[8], v[8]; + + u[0] = _mm_unpacklo_epi16(input[0], input[1]); + u[1] = _mm_unpacklo_epi16(input[2], input[3]); + u[2] = _mm_unpacklo_epi16(in7, __zero); + u[3] = _mm_unpacklo_epi16(input[2], __zero); + u[4] = _mm_unpacklo_epi16(input[3], __zero); + + v[0] = _mm_madd_epi16(u[0], sinpi_p01_p02); // s0 + s2 + v[1] = _mm_madd_epi16(u[1], sinpi_p03_p04); // s4 + s5 + v[2] = _mm_madd_epi16(u[2], sinpi_p03_p03); // x1 + v[3] = _mm_madd_epi16(u[0], sinpi_p04_m01); // s1 - s3 + v[4] = _mm_madd_epi16(u[1], sinpi_m03_p02); // -s4 + s6 + v[5] = _mm_madd_epi16(u[3], sinpi_p03_p03); // s4 + v[6] = _mm_madd_epi16(u[4], sinpi_p03_p03); + + u[0] = _mm_add_epi32(v[0], v[1]); + u[1] = _mm_sub_epi32(v[2], v[6]); + u[2] = _mm_add_epi32(v[3], v[4]); + u[3] = _mm_sub_epi32(u[2], u[0]); + u[4] = _mm_slli_epi32(v[5], 2); + u[5] = _mm_sub_epi32(u[4], v[5]); + u[6] = _mm_add_epi32(u[3], u[5]); + + v[0] = _mm_add_epi32(u[0], __rounding); + v[1] = _mm_add_epi32(u[1], __rounding); + v[2] = _mm_add_epi32(u[2], __rounding); + v[3] = _mm_add_epi32(u[6], __rounding); + + u[0] = _mm_srai_epi32(v[0], cos_bit); + u[1] = _mm_srai_epi32(v[1], 
cos_bit); + u[2] = _mm_srai_epi32(v[2], cos_bit); + u[3] = _mm_srai_epi32(v[3], cos_bit); + + output[0] = _mm_packs_epi32(u[0], u[2]); + output[1] = _mm_packs_epi32(u[1], u[3]); + output[2] = _mm_srli_si128(output[0], 8); + output[3] = _mm_srli_si128(output[1], 8); +} + +static void fadst4x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + + // stage 1 + __m128i x1[8]; + x1[0] = input[0]; + x1[1] = _mm_subs_epi16(__zero, input[7]); + x1[2] = _mm_subs_epi16(__zero, input[3]); + x1[3] = input[4]; + x1[4] = _mm_subs_epi16(__zero, input[1]); + x1[5] = input[6]; + x1[6] = input[2]; + x1[7] = _mm_subs_epi16(__zero, input[5]); + + // stage 2 + __m128i x2[8]; + x2[0] = x1[0]; + x2[1] = x1[1]; + btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[2], + &x1[3], &x2[2], &x2[3]); + x2[4] = x1[4]; + x2[5] = x1[5]; + btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[6], + &x1[7], &x2[6], &x2[7]); + + // stage 3 + __m128i x3[8]; + x3[0] = _mm_adds_epi16(x2[0], x2[2]); + x3[2] = _mm_subs_epi16(x2[0], x2[2]); + x3[1] = _mm_adds_epi16(x2[1], x2[3]); + x3[3] = _mm_subs_epi16(x2[1], x2[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[6]); + x3[6] = _mm_subs_epi16(x2[4], x2[6]); + x3[5] = _mm_adds_epi16(x2[5], x2[7]); + x3[7] = _mm_subs_epi16(x2[5], x2[7]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_w4_sse2(&cospi_p16_p48, &cospi_p48_m16, __rounding, cos_bit, &x3[4], + &x3[5], &x4[4], &x4[5]); + btf_16_w4_sse2(&cospi_m48_p16, &cospi_p16_p48, __rounding, cos_bit, &x3[6], + &x3[7], &x4[6], &x4[7]); + + // stage 5 + __m128i x5[8]; + x5[0] = _mm_adds_epi16(x4[0], x4[4]); + x5[4] = _mm_subs_epi16(x4[0], x4[4]); + x5[1] = _mm_adds_epi16(x4[1], x4[5]); + x5[5] = _mm_subs_epi16(x4[1], x4[5]); + x5[2] = _mm_adds_epi16(x4[2], x4[6]); + x5[6] = _mm_subs_epi16(x4[2], x4[6]); + x5[3] = _mm_adds_epi16(x4[3], x4[7]); + x5[7] = _mm_subs_epi16(x4[3], x4[7]); + + // stage 6 + __m128i x6[8]; + btf_16_w4_sse2(&cospi_p04_p60, &cospi_p60_m04, __rounding, cos_bit, &x5[0], + &x5[1], &x6[0], &x6[1]); + btf_16_w4_sse2(&cospi_p20_p44, &cospi_p44_m20, __rounding, cos_bit, &x5[2], + &x5[3], &x6[2], &x6[3]); + btf_16_w4_sse2(&cospi_p36_p28, &cospi_p28_m36, __rounding, cos_bit, &x5[4], + &x5[5], &x6[4], &x6[5]); + btf_16_w4_sse2(&cospi_p52_p12, &cospi_p12_m52, __rounding, cos_bit, &x5[6], + &x5[7], &x6[6], &x6[7]); + + // stage 7 + output[0] = x6[1]; + output[1] = x6[6]; + output[2] = x6[3]; + output[3] = x6[4]; + output[4] = x6[5]; + output[5] = 
x6[2]; + output[6] = x6[7]; + output[7] = x6[0]; +} + +static void fadst8x4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *sinpi = sinpi_arr(cos_bit); + const __m128i sinpi_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]); + const __m128i sinpi_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]); + const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]); + const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]); + const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]); + const __m128i __zero = _mm_set1_epi16(0); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + const __m128i in7 = _mm_add_epi16(input[0], input[1]); + __m128i u_lo[8], u_hi[8], v_lo[8], v_hi[8]; + + u_lo[0] = _mm_unpacklo_epi16(input[0], input[1]); + u_hi[0] = _mm_unpackhi_epi16(input[0], input[1]); + u_lo[1] = _mm_unpacklo_epi16(input[2], input[3]); + u_hi[1] = _mm_unpackhi_epi16(input[2], input[3]); + u_lo[2] = _mm_unpacklo_epi16(in7, __zero); + u_hi[2] = _mm_unpackhi_epi16(in7, __zero); + u_lo[3] = _mm_unpacklo_epi16(input[2], __zero); + u_hi[3] = _mm_unpackhi_epi16(input[2], __zero); + u_lo[4] = _mm_unpacklo_epi16(input[3], __zero); + u_hi[4] = _mm_unpackhi_epi16(input[3], __zero); + + v_lo[0] = _mm_madd_epi16(u_lo[0], sinpi_p01_p02); // s0 + s2 + v_hi[0] = _mm_madd_epi16(u_hi[0], sinpi_p01_p02); // s0 + s2 + v_lo[1] = _mm_madd_epi16(u_lo[1], sinpi_p03_p04); // s4 + s5 + v_hi[1] = _mm_madd_epi16(u_hi[1], sinpi_p03_p04); // s4 + s5 + v_lo[2] = _mm_madd_epi16(u_lo[2], sinpi_p03_p03); // x1 + v_hi[2] = _mm_madd_epi16(u_hi[2], sinpi_p03_p03); // x1 + v_lo[3] = _mm_madd_epi16(u_lo[0], sinpi_p04_m01); // s1 - s3 + v_hi[3] = _mm_madd_epi16(u_hi[0], sinpi_p04_m01); // s1 - s3 + v_lo[4] = _mm_madd_epi16(u_lo[1], sinpi_m03_p02); // -s4 + s6 + v_hi[4] = _mm_madd_epi16(u_hi[1], sinpi_m03_p02); // -s4 + s6 + v_lo[5] = _mm_madd_epi16(u_lo[3], sinpi_p03_p03); // s4 + v_hi[5] = _mm_madd_epi16(u_hi[3], sinpi_p03_p03); // s4 + v_lo[6] = _mm_madd_epi16(u_lo[4], sinpi_p03_p03); + v_hi[6] = _mm_madd_epi16(u_hi[4], sinpi_p03_p03); + + u_lo[0] = _mm_add_epi32(v_lo[0], v_lo[1]); + u_hi[0] = _mm_add_epi32(v_hi[0], v_hi[1]); + u_lo[1] = _mm_sub_epi32(v_lo[2], v_lo[6]); + u_hi[1] = _mm_sub_epi32(v_hi[2], v_hi[6]); + u_lo[2] = _mm_add_epi32(v_lo[3], v_lo[4]); + u_hi[2] = _mm_add_epi32(v_hi[3], v_hi[4]); + u_lo[3] = _mm_sub_epi32(u_lo[2], u_lo[0]); + u_hi[3] = _mm_sub_epi32(u_hi[2], u_hi[0]); + u_lo[4] = _mm_slli_epi32(v_lo[5], 2); + u_hi[4] = _mm_slli_epi32(v_hi[5], 2); + u_lo[5] = _mm_sub_epi32(u_lo[4], v_lo[5]); + u_hi[5] = _mm_sub_epi32(u_hi[4], v_hi[5]); + u_lo[6] = _mm_add_epi32(u_lo[3], u_lo[5]); + u_hi[6] = _mm_add_epi32(u_hi[3], u_hi[5]); + + v_lo[0] = _mm_add_epi32(u_lo[0], __rounding); + v_hi[0] = _mm_add_epi32(u_hi[0], __rounding); + v_lo[1] = _mm_add_epi32(u_lo[1], __rounding); + v_hi[1] = _mm_add_epi32(u_hi[1], __rounding); + v_lo[2] = _mm_add_epi32(u_lo[2], __rounding); + v_hi[2] = _mm_add_epi32(u_hi[2], __rounding); + v_lo[3] = _mm_add_epi32(u_lo[6], __rounding); + v_hi[3] = _mm_add_epi32(u_hi[6], __rounding); + + u_lo[0] = _mm_srai_epi32(v_lo[0], cos_bit); + u_hi[0] = _mm_srai_epi32(v_hi[0], cos_bit); + u_lo[1] = _mm_srai_epi32(v_lo[1], cos_bit); + u_hi[1] = _mm_srai_epi32(v_hi[1], cos_bit); + u_lo[2] = _mm_srai_epi32(v_lo[2], cos_bit); + u_hi[2] = _mm_srai_epi32(v_hi[2], cos_bit); + u_lo[3] = _mm_srai_epi32(v_lo[3], cos_bit); + u_hi[3] = _mm_srai_epi32(v_hi[3], cos_bit); + + output[0] = _mm_packs_epi32(u_lo[0], u_hi[0]); + output[1] = _mm_packs_epi32(u_lo[1], 
u_hi[1]); + output[2] = _mm_packs_epi32(u_lo[2], u_hi[2]); + output[3] = _mm_packs_epi32(u_lo[3], u_hi[3]); +} + +static void fadst8x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + + // stage 1 + __m128i x1[8]; + x1[0] = input[0]; + x1[1] = _mm_subs_epi16(__zero, input[7]); + x1[2] = _mm_subs_epi16(__zero, input[3]); + x1[3] = input[4]; + x1[4] = _mm_subs_epi16(__zero, input[1]); + x1[5] = input[6]; + x1[6] = input[2]; + x1[7] = _mm_subs_epi16(__zero, input[5]); + + // stage 2 + __m128i x2[8]; + x2[0] = x1[0]; + x2[1] = x1[1]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]); + x2[4] = x1[4]; + x2[5] = x1[5]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]); + + // stage 3 + __m128i x3[8]; + x3[0] = _mm_adds_epi16(x2[0], x2[2]); + x3[2] = _mm_subs_epi16(x2[0], x2[2]); + x3[1] = _mm_adds_epi16(x2[1], x2[3]); + x3[3] = _mm_subs_epi16(x2[1], x2[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[6]); + x3[6] = _mm_subs_epi16(x2[4], x2[6]); + x3[5] = _mm_adds_epi16(x2[5], x2[7]); + x3[7] = _mm_subs_epi16(x2[5], x2[7]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]); + + // stage 5 + __m128i x5[8]; + x5[0] = _mm_adds_epi16(x4[0], x4[4]); + x5[4] = _mm_subs_epi16(x4[0], x4[4]); + x5[1] = _mm_adds_epi16(x4[1], x4[5]); + x5[5] = _mm_subs_epi16(x4[1], x4[5]); + x5[2] = _mm_adds_epi16(x4[2], x4[6]); + x5[6] = _mm_subs_epi16(x4[2], x4[6]); + x5[3] = _mm_adds_epi16(x4[3], x4[7]); + x5[7] = _mm_subs_epi16(x4[3], x4[7]); + + // stage 6 + __m128i x6[8]; + btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x5[0], x5[1], x6[0], x6[1]); + btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x5[2], x5[3], x6[2], x6[3]); + btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x5[4], x5[5], x6[4], x6[5]); + btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x5[6], x5[7], x6[6], x6[7]); + + // stage 7 + output[0] = x6[1]; + output[1] = x6[6]; + output[2] = x6[3]; + output[3] = x6[4]; + output[4] = x6[5]; + output[5] = x6[2]; + output[6] = x6[7]; + output[7] = x6[0]; +} + +static void fadst8x16_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = 
pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); + __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); + __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); + __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); + __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); + __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); + + // stage 1 + __m128i x1[16]; + x1[0] = input[0]; + x1[1] = _mm_subs_epi16(__zero, input[15]); + x1[2] = _mm_subs_epi16(__zero, input[7]); + x1[3] = input[8]; + x1[4] = _mm_subs_epi16(__zero, input[3]); + x1[5] = input[12]; + x1[6] = input[4]; + x1[7] = _mm_subs_epi16(__zero, input[11]); + x1[8] = _mm_subs_epi16(__zero, input[1]); + x1[9] = input[14]; + x1[10] = input[6]; + x1[11] = _mm_subs_epi16(__zero, input[9]); + x1[12] = input[2]; + x1[13] = _mm_subs_epi16(__zero, input[13]); + x1[14] = _mm_subs_epi16(__zero, input[5]); + x1[15] = input[10]; + + // stage 2 + __m128i x2[16]; + x2[0] = x1[0]; + x2[1] = x1[1]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]); + x2[4] = x1[4]; + x2[5] = x1[5]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]); + x2[8] = x1[8]; + x2[9] = x1[9]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[10], x1[11], x2[10], x2[11]); + x2[12] = x1[12]; + x2[13] = x1[13]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[14], x1[15], x2[14], x2[15]); + + // stage 3 + __m128i x3[16]; + x3[0] = _mm_adds_epi16(x2[0], x2[2]); + x3[2] = _mm_subs_epi16(x2[0], x2[2]); + x3[1] = _mm_adds_epi16(x2[1], x2[3]); + x3[3] = _mm_subs_epi16(x2[1], x2[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[6]); + x3[6] = _mm_subs_epi16(x2[4], x2[6]); + x3[5] = _mm_adds_epi16(x2[5], x2[7]); + x3[7] = _mm_subs_epi16(x2[5], x2[7]); + x3[8] = _mm_adds_epi16(x2[8], x2[10]); + x3[10] = _mm_subs_epi16(x2[8], x2[10]); + x3[9] = _mm_adds_epi16(x2[9], x2[11]); + x3[11] = _mm_subs_epi16(x2[9], x2[11]); + x3[12] = _mm_adds_epi16(x2[12], x2[14]); + x3[14] = _mm_subs_epi16(x2[12], x2[14]); + x3[13] = _mm_adds_epi16(x2[13], x2[15]); + x3[15] = _mm_subs_epi16(x2[13], x2[15]); + + // stage 4 + __m128i x4[16]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]); + 
btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]); + x4[8] = x3[8]; + x4[9] = x3[9]; + x4[10] = x3[10]; + x4[11] = x3[11]; + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[12], x3[13], x4[12], x4[13]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[14], x3[15], x4[14], x4[15]); + + // stage 5 + __m128i x5[16]; + x5[0] = _mm_adds_epi16(x4[0], x4[4]); + x5[4] = _mm_subs_epi16(x4[0], x4[4]); + x5[1] = _mm_adds_epi16(x4[1], x4[5]); + x5[5] = _mm_subs_epi16(x4[1], x4[5]); + x5[2] = _mm_adds_epi16(x4[2], x4[6]); + x5[6] = _mm_subs_epi16(x4[2], x4[6]); + x5[3] = _mm_adds_epi16(x4[3], x4[7]); + x5[7] = _mm_subs_epi16(x4[3], x4[7]); + x5[8] = _mm_adds_epi16(x4[8], x4[12]); + x5[12] = _mm_subs_epi16(x4[8], x4[12]); + x5[9] = _mm_adds_epi16(x4[9], x4[13]); + x5[13] = _mm_subs_epi16(x4[9], x4[13]); + x5[10] = _mm_adds_epi16(x4[10], x4[14]); + x5[14] = _mm_subs_epi16(x4[10], x4[14]); + x5[11] = _mm_adds_epi16(x4[11], x4[15]); + x5[15] = _mm_subs_epi16(x4[11], x4[15]); + + // stage 6 + __m128i x6[16]; + x6[0] = x5[0]; + x6[1] = x5[1]; + x6[2] = x5[2]; + x6[3] = x5[3]; + x6[4] = x5[4]; + x6[5] = x5[5]; + x6[6] = x5[6]; + x6[7] = x5[7]; + btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x5[8], x5[9], x6[8], x6[9]); + btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x5[10], x5[11], x6[10], x6[11]); + btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x5[12], x5[13], x6[12], x6[13]); + btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x5[14], x5[15], x6[14], x6[15]); + + // stage 7 + __m128i x7[16]; + x7[0] = _mm_adds_epi16(x6[0], x6[8]); + x7[8] = _mm_subs_epi16(x6[0], x6[8]); + x7[1] = _mm_adds_epi16(x6[1], x6[9]); + x7[9] = _mm_subs_epi16(x6[1], x6[9]); + x7[2] = _mm_adds_epi16(x6[2], x6[10]); + x7[10] = _mm_subs_epi16(x6[2], x6[10]); + x7[3] = _mm_adds_epi16(x6[3], x6[11]); + x7[11] = _mm_subs_epi16(x6[3], x6[11]); + x7[4] = _mm_adds_epi16(x6[4], x6[12]); + x7[12] = _mm_subs_epi16(x6[4], x6[12]); + x7[5] = _mm_adds_epi16(x6[5], x6[13]); + x7[13] = _mm_subs_epi16(x6[5], x6[13]); + x7[6] = _mm_adds_epi16(x6[6], x6[14]); + x7[14] = _mm_subs_epi16(x6[6], x6[14]); + x7[7] = _mm_adds_epi16(x6[7], x6[15]); + x7[15] = _mm_subs_epi16(x6[7], x6[15]); + + // stage 8 + __m128i x8[16]; + btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x7[0], x7[1], x8[0], x8[1]); + btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x7[2], x7[3], x8[2], x8[3]); + btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x7[4], x7[5], x8[4], x8[5]); + btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x7[6], x7[7], x8[6], x8[7]); + btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x7[8], x7[9], x8[8], x8[9]); + btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x7[10], x7[11], x8[10], x8[11]); + btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x7[12], x7[13], x8[12], x8[13]); + btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x7[14], x7[15], x8[14], x8[15]); + + // stage 9 + output[0] = x8[1]; + output[1] = x8[14]; + output[2] = x8[3]; + output[3] = x8[12]; + output[4] = x8[5]; + output[5] = x8[10]; + output[6] = x8[7]; + output[7] = x8[8]; + output[8] = x8[9]; + output[9] = x8[6]; + output[10] = x8[11]; + output[11] = x8[4]; + output[12] = x8[13]; + output[13] = x8[2]; + output[14] = x8[15]; + output[15] = x8[0]; +} + +static const transform_1d_sse2 col_txfm4x4_arr[TX_TYPES] = { + fdct4x4_new_sse2, // DCT_DCT + fadst4x4_new_sse2, // ADST_DCT + fdct4x4_new_sse2, // DCT_ADST + fadst4x4_new_sse2, // ADST_ADST + fadst4x4_new_sse2, // FLIPADST_DCT + fdct4x4_new_sse2, // DCT_FLIPADST + fadst4x4_new_sse2, // FLIPADST_FLIPADST + fadst4x4_new_sse2, // ADST_FLIPADST + fadst4x4_new_sse2, // FLIPADST_ADST + 
fidentity4x4_new_sse2, // IDTX + fdct4x4_new_sse2, // V_DCT + fidentity4x4_new_sse2, // H_DCT + fadst4x4_new_sse2, // V_ADST + fidentity4x4_new_sse2, // H_ADST + fadst4x4_new_sse2, // V_FLIPADST + fidentity4x4_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm4x4_arr[TX_TYPES] = { + fdct4x4_new_sse2, // DCT_DCT + fdct4x4_new_sse2, // ADST_DCT + fadst4x4_new_sse2, // DCT_ADST + fadst4x4_new_sse2, // ADST_ADST + fdct4x4_new_sse2, // FLIPADST_DCT + fadst4x4_new_sse2, // DCT_FLIPADST + fadst4x4_new_sse2, // FLIPADST_FLIPADST + fadst4x4_new_sse2, // ADST_FLIPADST + fadst4x4_new_sse2, // FLIPADST_ADST + fidentity4x4_new_sse2, // IDTX + fidentity4x4_new_sse2, // V_DCT + fdct4x4_new_sse2, // H_DCT + fidentity4x4_new_sse2, // V_ADST + fadst4x4_new_sse2, // H_ADST + fidentity4x4_new_sse2, // V_FLIPADST + fadst4x4_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm4x8_arr[TX_TYPES] = { + fdct4x8_new_sse2, // DCT_DCT + fadst4x8_new_sse2, // ADST_DCT + fdct4x8_new_sse2, // DCT_ADST + fadst4x8_new_sse2, // ADST_ADST + fadst4x8_new_sse2, // FLIPADST_DCT + fdct4x8_new_sse2, // DCT_FLIPADST + fadst4x8_new_sse2, // FLIPADST_FLIPADST + fadst4x8_new_sse2, // ADST_FLIPADST + fadst4x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fdct4x8_new_sse2, // V_DCT + fidentity8x8_new_sse2, // H_DCT + fadst4x8_new_sse2, // V_ADST + fidentity8x8_new_sse2, // H_ADST + fadst4x8_new_sse2, // V_FLIPADST + fidentity8x8_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm8x4_arr[TX_TYPES] = { + fdct8x4_new_sse2, // DCT_DCT + fdct8x4_new_sse2, // ADST_DCT + fadst8x4_new_sse2, // DCT_ADST + fadst8x4_new_sse2, // ADST_ADST + fdct8x4_new_sse2, // FLIPADST_DCT + fadst8x4_new_sse2, // DCT_FLIPADST + fadst8x4_new_sse2, // FLIPADST_FLIPADST + fadst8x4_new_sse2, // ADST_FLIPADST + fadst8x4_new_sse2, // FLIPADST_ADST + fidentity8x4_new_sse2, // IDTX + fidentity8x4_new_sse2, // V_DCT + fdct8x4_new_sse2, // H_DCT + fidentity8x4_new_sse2, // V_ADST + fadst8x4_new_sse2, // H_ADST + fidentity8x4_new_sse2, // V_FLIPADST + fadst8x4_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm8x4_arr[TX_TYPES] = { + fdct8x4_new_sse2, // DCT_DCT + fadst8x4_new_sse2, // ADST_DCT + fdct8x4_new_sse2, // DCT_ADST + fadst8x4_new_sse2, // ADST_ADST + fadst8x4_new_sse2, // FLIPADST_DCT + fdct8x4_new_sse2, // DCT_FLIPADST + fadst8x4_new_sse2, // FLIPADST_FLIPADST + fadst8x4_new_sse2, // ADST_FLIPADST + fadst8x4_new_sse2, // FLIPADST_ADST + fidentity8x4_new_sse2, // IDTX + fdct8x4_new_sse2, // V_DCT + fidentity8x4_new_sse2, // H_DCT + fadst8x4_new_sse2, // V_ADST + fidentity8x4_new_sse2, // H_ADST + fadst8x4_new_sse2, // V_FLIPADST + fidentity8x4_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm4x8_arr[TX_TYPES] = { + fdct4x8_new_sse2, // DCT_DCT + fdct4x8_new_sse2, // ADST_DCT + fadst4x8_new_sse2, // DCT_ADST + fadst4x8_new_sse2, // ADST_ADST + fdct4x8_new_sse2, // FLIPADST_DCT + fadst4x8_new_sse2, // DCT_FLIPADST + fadst4x8_new_sse2, // FLIPADST_FLIPADST + fadst4x8_new_sse2, // ADST_FLIPADST + fadst4x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fidentity8x8_new_sse2, // V_DCT + fdct4x8_new_sse2, // H_DCT + fidentity8x8_new_sse2, // V_ADST + fadst4x8_new_sse2, // H_ADST + fidentity8x8_new_sse2, // V_FLIPADST + fadst4x8_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm8x8_arr[TX_TYPES] = { + fdct8x8_new_sse2, // DCT_DCT + fadst8x8_new_sse2, // ADST_DCT + fdct8x8_new_sse2, // DCT_ADST + fadst8x8_new_sse2, // ADST_ADST + 
fadst8x8_new_sse2,      // FLIPADST_DCT
+  fdct8x8_new_sse2,       // DCT_FLIPADST
+  fadst8x8_new_sse2,      // FLIPADST_FLIPADST
+  fadst8x8_new_sse2,      // ADST_FLIPADST
+  fadst8x8_new_sse2,      // FLIPADST_ADST
+  fidentity8x8_new_sse2,  // IDTX
+  fdct8x8_new_sse2,       // V_DCT
+  fidentity8x8_new_sse2,  // H_DCT
+  fadst8x8_new_sse2,      // V_ADST
+  fidentity8x8_new_sse2,  // H_ADST
+  fadst8x8_new_sse2,      // V_FLIPADST
+  fidentity8x8_new_sse2,  // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = {
+  fdct8x8_new_sse2,       // DCT_DCT
+  fdct8x8_new_sse2,       // ADST_DCT
+  fadst8x8_new_sse2,      // DCT_ADST
+  fadst8x8_new_sse2,      // ADST_ADST
+  fdct8x8_new_sse2,       // FLIPADST_DCT
+  fadst8x8_new_sse2,      // DCT_FLIPADST
+  fadst8x8_new_sse2,      // FLIPADST_FLIPADST
+  fadst8x8_new_sse2,      // ADST_FLIPADST
+  fadst8x8_new_sse2,      // FLIPADST_ADST
+  fidentity8x8_new_sse2,  // IDTX
+  fidentity8x8_new_sse2,  // V_DCT
+  fdct8x8_new_sse2,       // H_DCT
+  fidentity8x8_new_sse2,  // V_ADST
+  fadst8x8_new_sse2,      // H_ADST
+  fidentity8x8_new_sse2,  // V_FLIPADST
+  fadst8x8_new_sse2       // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = {
+  fdct8x16_new_sse2,       // DCT_DCT
+  fadst8x16_new_sse2,      // ADST_DCT
+  fdct8x16_new_sse2,       // DCT_ADST
+  fadst8x16_new_sse2,      // ADST_ADST
+  fadst8x16_new_sse2,      // FLIPADST_DCT
+  fdct8x16_new_sse2,       // DCT_FLIPADST
+  fadst8x16_new_sse2,      // FLIPADST_FLIPADST
+  fadst8x16_new_sse2,      // ADST_FLIPADST
+  fadst8x16_new_sse2,      // FLIPADST_ADST
+  fidentity8x16_new_sse2,  // IDTX
+  fdct8x16_new_sse2,       // V_DCT
+  fidentity8x16_new_sse2,  // H_DCT
+  fadst8x16_new_sse2,      // V_ADST
+  fidentity8x16_new_sse2,  // H_ADST
+  fadst8x16_new_sse2,      // V_FLIPADST
+  fidentity8x16_new_sse2   // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm8x16_arr[TX_TYPES] = {
+  fdct8x16_new_sse2,       // DCT_DCT
+  fdct8x16_new_sse2,       // ADST_DCT
+  fadst8x16_new_sse2,      // DCT_ADST
+  fadst8x16_new_sse2,      // ADST_ADST
+  fdct8x16_new_sse2,       // FLIPADST_DCT
+  fadst8x16_new_sse2,      // DCT_FLIPADST
+  fadst8x16_new_sse2,      // FLIPADST_FLIPADST
+  fadst8x16_new_sse2,      // ADST_FLIPADST
+  fadst8x16_new_sse2,      // FLIPADST_ADST
+  fidentity8x16_new_sse2,  // IDTX
+  fidentity8x16_new_sse2,  // V_DCT
+  fdct8x16_new_sse2,       // H_DCT
+  fidentity8x16_new_sse2,  // V_ADST
+  fadst8x16_new_sse2,      // H_ADST
+  fidentity8x16_new_sse2,  // V_FLIPADST
+  fadst8x16_new_sse2       // H_FLIPADST
+};
+
+// Only the 32-point DCT and identity kernels have SSE2 implementations here;
+// tx types that would need a 32-point ADST stay NULL, and the 2-D wrappers
+// below fall back to the C transform when they hit a NULL entry.
+static const transform_1d_sse2 col_txfm8x32_arr[TX_TYPES] = {
+  fdct8x32_new_sse2,       // DCT_DCT
+  NULL,                    // ADST_DCT
+  NULL,                    // DCT_ADST
+  NULL,                    // ADST_ADST
+  NULL,                    // FLIPADST_DCT
+  NULL,                    // DCT_FLIPADST
+  NULL,                    // FLIPADST_FLIPADST
+  NULL,                    // ADST_FLIPADST
+  NULL,                    // FLIPADST_ADST
+  fidentity8x32_new_sse2,  // IDTX
+  fdct8x32_new_sse2,       // V_DCT
+  fidentity8x32_new_sse2,  // H_DCT
+  NULL,                    // V_ADST
+  NULL,                    // H_ADST
+  NULL,                    // V_FLIPADST
+  NULL                     // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm8x32_arr[TX_TYPES] = {
+  fdct8x32_new_sse2,       // DCT_DCT
+  NULL,                    // ADST_DCT
+  NULL,                    // DCT_ADST
+  NULL,                    // ADST_ADST
+  NULL,                    // FLIPADST_DCT
+  NULL,                    // DCT_FLIPADST
+  NULL,                    // FLIPADST_FLIPADST
+  NULL,                    // ADST_FLIPADST
+  NULL,                    // FLIPADST_ADST
+  fidentity8x32_new_sse2,  // IDTX
+  fidentity8x32_new_sse2,  // V_DCT
+  fdct8x32_new_sse2,       // H_DCT
+  NULL,                    // V_ADST
+  NULL,                    // H_ADST
+  NULL,                    // V_FLIPADST
+  NULL                     // H_FLIPADST
+};
+
+void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
+                                   int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf0[4], buf1[4], *buf;
+  const int8_t *shift = fwd_txfm_shift_ls[TX_4X4];
+  const int txw_idx = get_txw_idx(TX_4X4);
+  const int txh_idx = get_txh_idx(TX_4X4);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 4;
+  const int height = 4;
+  const transform_1d_sse2 col_txfm = col_txfm4x4_arr[tx_type];
+  const transform_1d_sse2 row_txfm = row_txfm4x4_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
+
} else { + load_buffer_16bit_to_16bit_w4(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_4x4(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_4x4(buf, buf); + store_buffer_16bit_to_32bit_w4(buf, output, width, height); +} + +void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)stride; + (void)bd; + __m128i buf0[8], buf1[8], *buf; + const int8_t *shift = fwd_txfm_shift_ls[TX_4X8]; + const int txw_idx = get_txw_idx(TX_4X8); + const int txh_idx = get_txh_idx(TX_4X8); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 4; + const int height = 8; + const transform_1d_sse2 col_txfm = col_txfm4x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_w4(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_4x8(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x4(buf, buf); + store_rect_buffer_16bit_to_32bit_w4(buf, output, width, height); +} + +void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = fwd_txfm_shift_ls[TX_4X16]; + const int txw_idx = get_txw_idx(TX_4X16); + const int txh_idx = get_txh_idx(TX_4X16); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 4; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_w4(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_4x8(buf0, buf1); + transpose_16bit_4x8(buf0 + 8, buf1 + 8); + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + 8 * i, buf, width); + } else { + buf = buf1 + 8 * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x4(buf, buf); + store_buffer_16bit_to_32bit_w4(buf, output + 8 * width * i, width, 8); + } +} + +void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[8], buf1[8], *buf; + const int8_t *shift = fwd_txfm_shift_ls[TX_8X4]; + const int txw_idx = get_txw_idx(TX_8X4); + const int txh_idx = get_txh_idx(TX_8X4); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int 
cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 4; + const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm4x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + else + load_buffer_16bit_to_16bit(input, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height); +} + +void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[8], buf1[8], *buf; + const int8_t *shift = fwd_txfm_shift_ls[TX_8X8]; + const int txw_idx = get_txw_idx(TX_8X8); + const int txh_idx = get_txh_idx(TX_8X8); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 8; + const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + else + load_buffer_16bit_to_16bit(input, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output, width, height); +} + +void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = fwd_txfm_shift_ls[TX_8X16]; + const int txw_idx = get_txw_idx(TX_8X16); + const int txh_idx = get_txh_idx(TX_8X16); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + transpose_16bit_8x8(buf0 + 8, buf1 + 8); + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8); + } +} + +void 
av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[32]; + const int8_t *shift = fwd_txfm_shift_ls[TX_8X32]; + const int txw_idx = get_txw_idx(TX_8X32); + const int txh_idx = get_txh_idx(TX_8X32); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 32; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + transpose_16bit_8x8(buf0 + 8, buf1 + 8); + transpose_16bit_8x8(buf0 + 16, buf1 + 16); + transpose_16bit_8x8(buf0 + 24, buf1 + 24); + + for (int i = 0; i < 4; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8); + } +} + +void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = fwd_txfm_shift_ls[TX_16X4]; + const int txw_idx = get_txw_idx(TX_16X4); + const int txh_idx = get_txh_idx(TX_16X4); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 4; + const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + __m128i *buf; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x4(buf0, buf1 + 8 * i); + } + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_4x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output, width, height); + transpose_16bit_4x8(buf + 8, buf + 8); + store_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height); +} + +void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = fwd_txfm_shift_ls[TX_16X8]; + const int txw_idx = get_txw_idx(TX_16X8); + const int txh_idx = get_txh_idx(TX_16X8); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 8; + const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + __m128i *buf; + int ud_flip, lr_flip; 
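+  // Same pipeline as the other lowbd wrappers: load 8-wide column strips,
+  // run the column transform, transpose 8x8 tiles into buf1, run the row
+  // transform on the transposed data, then transpose back and store.
+  // ud_flip/lr_flip realize the FLIPADST variants by mirroring the load or
+  // the transposed buffer.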
+ + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1 + 8 * i); + } + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height); + transpose_16bit_8x8(buf + 8, buf + 8); + store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height); +} + +void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[32]; + const int8_t *shift = fwd_txfm_shift_ls[TX_16X16]; + const int txw_idx = get_txw_idx(TX_16X16); + const int txh_idx = get_txh_idx(TX_16X16); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i); + transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i); + } + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8); + transpose_16bit_8x8(buf + 8, buf + 8); + store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width, + 8); + } +} + +void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[64]; + const int8_t *shift = fwd_txfm_shift_ls[TX_16X32]; + const int txw_idx = get_txw_idx(TX_16X32); + const int txh_idx = get_txh_idx(TX_16X32); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 32; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + + if (col_txfm != NULL && row_txfm != NULL) { + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i); + 
transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i);
+      transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i);
+      transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i);
+    }
+
+    for (int i = 0; i < 4; i++) {
+      __m128i *buf;
+      if (lr_flip) {
+        buf = buf0;
+        flip_buf_sse2(buf1 + width * i, buf, width);
+      } else {
+        buf = buf1 + width * i;
+      }
+      row_txfm(buf, buf, cos_bit_row);
+      round_shift_16bit(buf, width, shift[2]);
+      transpose_16bit_8x8(buf, buf);
+      store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width,
+                                          8);
+      transpose_16bit_8x8(buf + 8, buf + 8);
+      store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8,
+                                          width, 8);
+    }
+  } else {
+    av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf0[32], buf1[32];
+  const int8_t *shift = fwd_txfm_shift_ls[TX_32X8];
+  const int txw_idx = get_txw_idx(TX_32X8);
+  const int txh_idx = get_txh_idx(TX_32X8);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 32;
+  const int height = 8;
+  const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
+  const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type];
+
+  if (col_txfm != NULL && row_txfm != NULL) {
+    int ud_flip, lr_flip;
+    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+    for (int i = 0; i < 4; i++) {
+      if (ud_flip) {
+        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+      } else {
+        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+      }
+      round_shift_16bit(buf0, height, shift[0]);
+      col_txfm(buf0, buf0, cos_bit_col);
+      round_shift_16bit(buf0, height, shift[1]);
+      transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
+    }
+
+    for (int i = 0; i < 1; i++) {
+      __m128i *buf;
+      if (lr_flip) {
+        buf = buf0;
+        flip_buf_sse2(buf1 + width * i, buf, width);
+      } else {
+        buf = buf1 + width * i;
+      }
+      row_txfm(buf, buf, cos_bit_row);
+      round_shift_16bit(buf, width, shift[2]);
+      transpose_16bit_8x8(buf, buf);
+      store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width,
+                                     height);
+      transpose_16bit_8x8(buf + 8, buf + 8);
+      store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width,
+                                     height);
+      transpose_16bit_8x8(buf + 16, buf + 16);
+      store_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16,
+                                     width, height);
+      transpose_16bit_8x8(buf + 24, buf + 24);
+      store_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24,
+                                     width, height);
+    }
+  } else {
+    av1_fwd_txfm2d_32x8_c(input, output, stride, tx_type, bd);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf0[32], buf1[64];
+  const int8_t *shift = fwd_txfm_shift_ls[TX_32X16];
+  const int txw_idx = get_txw_idx(TX_32X16);
+  const int txh_idx = get_txh_idx(TX_32X16);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 32;
+  const int height = 16;
+  const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+  const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type];
+
+  if (col_txfm != NULL && row_txfm != NULL) {
+    int ud_flip, lr_flip;
+    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+    for (int i = 0; i < 4; i++) {
+      if (ud_flip) {
+        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+
} else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i); + transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i); + } + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, + 8); + transpose_16bit_8x8(buf + 8, buf + 8); + store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, + width, 8); + transpose_16bit_8x8(buf + 16, buf + 16); + store_rect_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16, + width, 8); + transpose_16bit_8x8(buf + 24, buf + 24); + store_rect_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24, + width, 8); + } + } else { + av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd); + } +} + +void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[128]; + const int8_t *shift = fwd_txfm_shift_ls[TX_32X32]; + const int txw_idx = get_txw_idx(TX_32X32); + const int txh_idx = get_txh_idx(TX_32X32); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 32; + const int height = 32; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type]; + + if (col_txfm != NULL && row_txfm != NULL) { + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 4; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i); + transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i); + transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i); + transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i); + } + + for (int i = 0; i < 4; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8); + transpose_16bit_8x8(buf + 8, buf + 8); + store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width, + 8); + transpose_16bit_8x8(buf + 16, buf + 16); + store_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16, + width, 8); + transpose_16bit_8x8(buf + 24, buf + 24); + store_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24, + width, 8); + } + } else { + av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd); + } +} + +void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X16; + __m128i buf0[64], buf1[128]; + const int8_t 
*shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = fdct8x16_new_sse2; + const transform_1d_sse2 row_txfm = fdct8x64_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < height_div8; ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + + for (int i = 0; i < height_div8; i++) { + __m128i *buf = buf1 + width * i; + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + int32_t *output8 = output + 8 * 32 * i; + for (int j = 0; j < 4; ++j) { + __m128i *buf8 = buf + 8 * j; + transpose_16bit_8x8(buf8, buf8); + store_buffer_16bit_to_32bit_w8(buf8, output8 + 8 * j, 32, 8); + } + } +} + +void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_16X64; + __m128i buf0[64], buf1[128]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = fdct8x64_new_sse2; + const transform_1d_sse2 row_txfm = fdct8x16_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < height_div8; ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i *buf = buf1 + width * i; + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + int32_t *output8 = output + 8 * width * i; + for (int j = 0; j < width_div8; ++j) { + __m128i *buf8 = buf + 8 * j; + transpose_16bit_8x8(buf8, buf8); + store_buffer_16bit_to_32bit_w8(buf8, output8 + 8 * j, width, 8); + } + } + // Zero out the bottom 16x32 area. 
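// Editorial check of the memset below: the TX_16X64 output holds
// 16 * 64 = 1024 coefficients, but the row pass above only runs for
// AOMMIN(4, height_div8) = 4 row groups of 8, i.e. the top 16x32 half
// (entries 0..511). The memset therefore starts at offset 16 * 32 = 512 and
// clears 16 * 32 = 512 elements, equivalent to the scalar loop:
//
//   for (int k = 16 * 32; k < 16 * 64; ++k) output[k] = 0;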
+ memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output)); +} + +static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { + av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform + av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform + av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform + av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform + NULL, // 64x64 transform + av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform + av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform + av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform + av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform + av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform + av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform + av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform + av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform + av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform + av1_lowbd_fwd_txfm2d_16x64_sse2, // 16x64 transform + av1_lowbd_fwd_txfm2d_64x16_sse2, // 64x16 transform +}; + +void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size]; + + if ((fwd_txfm2d_func == NULL) || + (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) + av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); + else + fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h new file mode 100644 index 000000000..aa14d3ade --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
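Editorial note on the dispatcher just above: table slots left NULL (64x64,
32x64, 64x32) and lossless 4x4 fall through to av1_lowbd_fwd_txfm_c. A hedged
usage sketch follows; only the four TxfmParam fields the dispatcher actually
reads are shown, and buffer setup (src_diff, coeff, diff_stride) is assumed:

  TxfmParam param;
  param.tx_size = TX_16X16;  // any TX_SIZES_ALL entry
  param.tx_type = DCT_DCT;
  param.lossless = 0;        // lossless 4x4 would take the C path
  param.bd = 8;
  av1_lowbd_fwd_txfm_sse2(src_diff, coeff, diff_stride, &param);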
+ */ +#ifndef AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_ +#define AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_ + +#include <immintrin.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/transpose_sse2.h" +#include "aom_dsp/x86/txfm_common_sse2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void fdct8x32_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit); +void fdct8x64_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit); + +static INLINE void fidentity4x4_new_sse2(const __m128i *const input, + __m128i *const output, + const int8_t cos_bit) { + (void)cos_bit; + const __m128i one = _mm_set1_epi16(1); + + for (int i = 0; i < 4; ++i) { + const __m128i a = _mm_unpacklo_epi16(input[i], one); + const __m128i b = scale_round_sse2(a, NewSqrt2); + output[i] = _mm_packs_epi32(b, b); + } +} + +static INLINE void fidentity8x4_new_sse2(const __m128i *const input, + __m128i *const output, + const int8_t cos_bit) { + (void)cos_bit; + const __m128i one = _mm_set1_epi16(1); + + for (int i = 0; i < 4; ++i) { + const __m128i a_lo = _mm_unpacklo_epi16(input[i], one); + const __m128i a_hi = _mm_unpackhi_epi16(input[i], one); + const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2); + const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2); + output[i] = _mm_packs_epi32(b_lo, b_hi); + } +} + +static INLINE void fidentity8x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + + output[0] = _mm_adds_epi16(input[0], input[0]); + output[1] = _mm_adds_epi16(input[1], input[1]); + output[2] = _mm_adds_epi16(input[2], input[2]); + output[3] = _mm_adds_epi16(input[3], input[3]); + output[4] = _mm_adds_epi16(input[4], input[4]); + output[5] = _mm_adds_epi16(input[5], input[5]); + output[6] = _mm_adds_epi16(input[6], input[6]); + output[7] = _mm_adds_epi16(input[7], input[7]); +} + +static INLINE void fidentity8x16_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const __m128i one = _mm_set1_epi16(1); + + for (int i = 0; i < 16; ++i) { + const __m128i a_lo = _mm_unpacklo_epi16(input[i], one); + const __m128i a_hi = _mm_unpackhi_epi16(input[i], one); + const __m128i b_lo = scale_round_sse2(a_lo, 2 * NewSqrt2); + const __m128i b_hi = scale_round_sse2(a_hi, 2 * NewSqrt2); + output[i] = _mm_packs_epi32(b_lo, b_hi); + } +} + +static INLINE void fidentity8x32_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + for (int i = 0; i < 32; ++i) { + output[i] = _mm_slli_epi16(input[i], 2); + } +} + +static const transform_1d_sse2 col_txfm8x32_arr[TX_TYPES] = { + fdct8x32_new_sse2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity8x32_new_sse2, // IDTX + fdct8x32_new_sse2, // V_DCT + fidentity8x32_new_sse2, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +#ifdef __cplusplus +} +#endif + +#endif // AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_ diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c index c8d4ccb70..b58911fcb 100644 --- a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c +++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c @@ -11,7 +11,8 @@ #include <immintrin.h> -#include "./av1_rtcd.h" +#include "config/av1_rtcd.h" + #include "aom/aom_integer.h" 
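Editorial aside on the identity transforms in the header above (hedged): the
unpack-with-one plus _mm_madd_epi16 in scale_round_sse2 computes
x * scale + rounding in a single multiply-add. Assuming the usual fixed-point
constants from av1_txfm.h (NewSqrt2 = 5793, i.e. sqrt(2) in Q12, and
NewSqrt2Bits = 12), a scalar model is:

  static int32_t scale_round_scalar(int32_t x, int32_t scale) {
    // round-to-nearest Q12 multiply: y = round(x * scale / 4096)
    return (x * scale + (1 << (12 - 1))) >> 12;
  }

fidentity4x4 scales by sqrt(2), fidentity8x16 by 2 * sqrt(2), while the 8x8
and 8x32 variants reduce to exact doublings (x2 and x4), matching the
adds/shift forms used above.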
#include "aom_dsp/aom_dsp_common.h" @@ -32,7 +33,10 @@ static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *dequant_ptr, int log_scale, __m256i *qp) { __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); - round = _mm_srai_epi16(round, log_scale); + if (log_scale) { + const __m128i round_scale = _mm_set1_epi16(1 << (15 - log_scale)); + round = _mm_mulhrs_epi16(round, round_scale); + } const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr); const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr); @@ -45,8 +49,8 @@ static INLINE void quantize(const __m256i *qp, __m256i *c, const int16_t *iscan_ptr, int log_scale, tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) { - const __m256i abs = _mm256_abs_epi32(*c); - __m256i q = _mm256_add_epi32(abs, qp[0]); + const __m256i abs_coeff = _mm256_abs_epi32(*c); + __m256i q = _mm256_add_epi32(abs_coeff, qp[0]); __m256i q_lo = _mm256_mul_epi32(q, qp[1]); __m256i q_hi = _mm256_srli_epi64(q, 32); @@ -56,6 +60,9 @@ static INLINE void quantize(const __m256i *qp, __m256i *c, q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale); q_hi = _mm256_slli_epi64(q_hi, 32); q = _mm256_or_si256(q_lo, q_hi); + const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale); + const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s); + q = _mm256_andnot_si256(mask, q); __m256i dq = _mm256_mullo_epi32(q, qp[2]); dq = _mm256_srai_epi32(dq, log_scale); @@ -81,8 +88,8 @@ static INLINE void quantize(const __m256i *qp, __m256i *c, } void av1_highbd_quantize_fp_avx2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale) { @@ -90,14 +97,23 @@ void av1_highbd_quantize_fp_avx2( (void)zbin_ptr; (void)quant_shift_ptr; const unsigned int step = 8; + __m256i qp[3], coeff; - if (LIKELY(!skip_block)) { - __m256i qp[3], coeff; + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, qp); + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); - init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, qp); - coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + __m256i eob = _mm256_setzero_si256(); + quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; - __m256i eob = _mm256_setzero_si256(); + update_qp(qp); + while (n_coeffs > 0) { + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob); coeff_ptr += step; @@ -105,39 +121,17 @@ void av1_highbd_quantize_fp_avx2( dqcoeff_ptr += step; iscan += step; n_coeffs -= step; - - update_qp(qp); - while (n_coeffs > 0) { - coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); - quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob); - - coeff_ptr += step; - qcoeff_ptr += step; - dqcoeff_ptr += step; - iscan += step; - n_coeffs -= step; - } - { - __m256i eob_s; - eob_s = _mm256_shuffle_epi32(eob, 0xe); - eob = _mm256_max_epi16(eob, eob_s); - eob_s = _mm256_shufflelo_epi16(eob, 0xe); - eob = _mm256_max_epi16(eob, eob_s); - eob_s = _mm256_shufflelo_epi16(eob, 1); - eob = 
_mm256_max_epi16(eob, eob_s); - const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), - _mm256_extractf128_si256(eob, 1)); - *eob_ptr = _mm_extract_epi16(final_eob, 0); - } - } else { - do { - const __m256i zero = _mm256_setzero_si256(); - _mm256_storeu_si256((__m256i *)qcoeff_ptr, zero); - _mm256_storeu_si256((__m256i *)dqcoeff_ptr, zero); - qcoeff_ptr += step; - dqcoeff_ptr += step; - n_coeffs -= step; - } while (n_coeffs > 0); - *eob_ptr = 0; + } + { + __m256i eob_s; + eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); + const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), + _mm256_extractf128_si256(eob, 1)); + *eob_ptr = _mm_extract_epi16(final_eob, 0); } } diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c index 8d717a083..40b3b460b 100644 --- a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c +++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c @@ -12,8 +12,10 @@ #include <smmintrin.h> #include <stdint.h> -#include "./av1_rtcd.h" +#include "config/av1_rtcd.h" + #include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/x86/synonyms.h" // Coefficient quantization phase 1 // param[0-2] : rounding/quan/dequan constants @@ -36,6 +38,8 @@ static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param, qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift); dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]); dquan[0] = _mm_srli_epi64(dquan[0], scale); + const __m128i abs_s = _mm_slli_epi32(*coeff, 1 + scale); + qcoeff[2] = _mm_cmplt_epi32(abs_s, param[3]); } // Coefficient quantization phase 2 @@ -70,7 +74,8 @@ static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan, qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign); dquan[0] = _mm_sign_epi32(dquan[0], *sign); - + qcoeff[0] = _mm_andnot_si128(qcoeff[2], qcoeff[0]); + dquan[0] = _mm_andnot_si128(qcoeff[2], dquan[0]); _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]); _mm_storeu_si128((__m128i *)dqAddr, dquan[0]); } @@ -108,12 +113,12 @@ static INLINE uint16_t get_accumulated_eob(__m128i *eob) { } void av1_highbd_quantize_fp_sse4_1( - const tran_low_t *coeff_ptr, intptr_t count, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale) { - __m128i coeff[2], qcoeff[2], dequant[2], qparam[3], coeff_sign; + __m128i coeff[2], qcoeff[3], dequant[2], qparam[4], coeff_sign; __m128i eob = _mm_setzero_si128(); const tran_low_t *src = coeff_ptr; tran_low_t *quanAddr = qcoeff_ptr; @@ -121,7 +126,6 @@ void av1_highbd_quantize_fp_sse4_1( const int shift = 16 - log_scale; const int coeff_stride = 4; const int quan_stride = coeff_stride; - (void)skip_block; (void)zbin_ptr; (void)quant_shift_ptr; (void)scan; @@ -129,29 +133,54 @@ void av1_highbd_quantize_fp_sse4_1( memset(quanAddr, 0, count * sizeof(quanAddr[0])); memset(dquanAddr, 0, count * sizeof(dquanAddr[0])); - if (!skip_block) { - coeff[0] = _mm_loadu_si128((__m128i const *)src); + coeff[0] = _mm_loadu_si128((__m128i const *)src); + 
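// Editorial note on the setup below (hedged reading): qparam packs the DC
// constants in lane 0 and the AC constants in the remaining lanes, so one
// phase-1/phase-2 pair handles DC plus the first three ACs before the
// parameters are re-broadcast as AC-only. Per coefficient, in scalar form:
//
//   q  = ((abs(c) + round) * quant) >> (16 - log_scale);
//   dq = (q * dequant) >> log_scale;
//   if ((abs(c) << (1 + log_scale)) < dequant) q = dq = 0;  // dead zone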
const int round1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale); + const int round0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale); + + qparam[0] = _mm_set_epi32(round1, round1, round1, round0); + qparam[1] = xx_set_64_from_32i(quant_ptr[1], quant_ptr[0]); + qparam[2] = xx_set_64_from_32i(dequant_ptr[1], dequant_ptr[0]); + qparam[3] = _mm_set_epi32(dequant_ptr[1], dequant_ptr[1], dequant_ptr[1], + dequant_ptr[0]); + + // DC and first 3 AC + quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant, + &coeff_sign); + + // update round/quan/dquan for AC + qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]); + qparam[1] = xx_set1_64_from_32i(quant_ptr[1]); + qparam[2] = xx_set1_64_from_32i(dequant_ptr[1]); + qparam[3] = _mm_set1_epi32(dequant_ptr[1]); + quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale, + quanAddr, dquanAddr); + + // next 4 AC + coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); + quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant, + &coeff_sign); + quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale, + quanAddr + quan_stride, dquanAddr + quan_stride); + + find_eob(quanAddr, iscan, &eob); + + count -= 8; + + // loop for the rest of AC + while (count > 0) { + src += coeff_stride << 1; + quanAddr += quan_stride << 1; + dquanAddr += quan_stride << 1; + iscan += quan_stride << 1; - qparam[0] = - _mm_set_epi32(round_ptr[1] >> log_scale, round_ptr[1] >> log_scale, - round_ptr[1] >> log_scale, round_ptr[0] >> log_scale); - qparam[1] = _mm_set_epi32(0, quant_ptr[1], 0, quant_ptr[0]); - qparam[2] = _mm_set_epi32(0, dequant_ptr[1], 0, dequant_ptr[0]); + coeff[0] = _mm_loadu_si128((__m128i const *)src); + coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); - // DC and first 3 AC quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant, &coeff_sign); - - // update round/quan/dquan for AC - qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]); - qparam[1] = _mm_set_epi32(0, quant_ptr[1], 0, quant_ptr[1]); - qparam[2] = _mm_set_epi32(0, dequant_ptr[1], 0, dequant_ptr[1]); - quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale, quanAddr, dquanAddr); - // next 4 AC - coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant, &coeff_sign); quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, @@ -161,34 +190,6 @@ void av1_highbd_quantize_fp_sse4_1( find_eob(quanAddr, iscan, &eob); count -= 8; - - // loop for the rest of AC - while (count > 0) { - src += coeff_stride << 1; - quanAddr += quan_stride << 1; - dquanAddr += quan_stride << 1; - iscan += quan_stride << 1; - - coeff[0] = _mm_loadu_si128((__m128i const *)src); - coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); - - quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, - dequant, &coeff_sign); - quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, - log_scale, quanAddr, dquanAddr); - - quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, - dequant, &coeff_sign); - quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, - log_scale, quanAddr + quan_stride, - dquanAddr + quan_stride); - - find_eob(quanAddr, iscan, &eob); - - count -= 8; - } - *eob_ptr = get_accumulated_eob(&eob); - } else { - *eob_ptr = 0; } + *eob_ptr = get_accumulated_eob(&eob); } diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c 
b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c index 078a67510..df22aaba7 100644 --- a/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c +++ b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c @@ -11,7 +11,8 @@ #include <immintrin.h> -#include "./av1_rtcd.h" +#include "config/av1_rtcd.h" + #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" @@ -57,7 +58,7 @@ static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr, init_one_qp(&round, &qp[0]); init_one_qp(&quant, &qp[1]); - if (log_scale > 0) { + if (log_scale == 1) { qp[1] = _mm256_slli_epi16(qp[1], log_scale); } @@ -94,16 +95,25 @@ static INLINE void update_qp(int log_scale, __m256i *thr, __m256i *qp) { } \ } while (0) +static INLINE uint16_t quant_gather_eob(__m256i eob) { + const __m128i eob_lo = _mm256_castsi256_si128(eob); + const __m128i eob_hi = _mm256_extractf128_si256(eob, 1); + __m128i eob_s = _mm_max_epi16(eob_lo, eob_hi); + eob_s = _mm_subs_epu16(_mm_set1_epi16(INT16_MAX), eob_s); + eob_s = _mm_minpos_epu16(eob_s); + return INT16_MAX - _mm_extract_epi16(eob_s, 0); +} + static INLINE void quantize(const __m256i *thr, const __m256i *qp, __m256i *c, const int16_t *iscan_ptr, tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) { - const __m256i abs = _mm256_abs_epi16(*c); - __m256i mask = _mm256_cmpgt_epi16(abs, *thr); - mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs, *thr)); + const __m256i abs_coeff = _mm256_abs_epi16(*c); + __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr)); const int nzflag = _mm256_movemask_epi8(mask); if (nzflag) { - __m256i q = _mm256_adds_epi16(abs, qp[0]); + __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]); q = _mm256_mulhi_epi16(q, qp[1]); q = _mm256_sign_epi16(q, *c); const __m256i dq = _mm256_mullo_epi16(q, qp[2]); @@ -123,8 +133,8 @@ static INLINE void quantize(const __m256i *thr, const __m256i *qp, __m256i *c, } void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, @@ -134,15 +144,26 @@ void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, (void)quant_shift_ptr; const unsigned int step = 16; - if (LIKELY(!skip_block)) { - __m256i qp[3]; - __m256i coeff, thr; - const int log_scale = 0; + __m256i qp[3]; + __m256i coeff, thr; + const int log_scale = 0; - init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); - read_coeff(coeff_ptr, &coeff); + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); + read_coeff(coeff_ptr, &coeff); + + __m256i eob = _mm256_setzero_si256(); + quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + + update_qp(log_scale, &thr, qp); - __m256i eob = _mm256_setzero_si256(); + while (n_coeffs > 0) { + read_coeff(coeff_ptr, &coeff); quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); coeff_ptr += step; @@ -150,54 +171,21 @@ void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dqcoeff_ptr += step; iscan_ptr += step; n_coeffs -= step; - - update_qp(log_scale, &thr, qp); - - while (n_coeffs > 0) { - read_coeff(coeff_ptr, &coeff); - 
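// Editorial aside on quant_gather_eob above (hedged): SSE4.1 only offers a
// horizontal minimum (_mm_minpos_epu16), so the maximum eob is taken on the
// complement, max(v) = INT16_MAX - min(INT16_MAX - v); the saturating
// _mm_subs_epu16 keeps every lane non-negative first. Scalar model over the
// 16 lanes of the accumulated eob vector:
//
//   uint16_t m = 0;
//   for (int i = 0; i < 16; ++i) m = (v[i] > m) ? v[i] : m;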
quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); - - coeff_ptr += step; - qcoeff_ptr += step; - dqcoeff_ptr += step; - iscan_ptr += step; - n_coeffs -= step; - } - { - __m256i eob_s; - eob_s = _mm256_shuffle_epi32(eob, 0xe); - eob = _mm256_max_epi16(eob, eob_s); - eob_s = _mm256_shufflelo_epi16(eob, 0xe); - eob = _mm256_max_epi16(eob, eob_s); - eob_s = _mm256_shufflelo_epi16(eob, 1); - eob = _mm256_max_epi16(eob, eob_s); - const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), - _mm256_extractf128_si256(eob, 1)); - *eob_ptr = _mm_extract_epi16(final_eob, 0); - } - } else { - do { - write_zero(qcoeff_ptr); - write_zero(dqcoeff_ptr); - qcoeff_ptr += step; - dqcoeff_ptr += step; - n_coeffs -= step; - } while (n_coeffs > 0); - *eob_ptr = 0; } + *eob_ptr = quant_gather_eob(eob); } static INLINE void quantize_32x32(const __m256i *thr, const __m256i *qp, __m256i *c, const int16_t *iscan_ptr, tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) { - const __m256i abs = _mm256_abs_epi16(*c); - __m256i mask = _mm256_cmpgt_epi16(abs, *thr); - mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs, *thr)); + const __m256i abs_coeff = _mm256_abs_epi16(*c); + __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr)); const int nzflag = _mm256_movemask_epi8(mask); if (nzflag) { - __m256i q = _mm256_adds_epi16(abs, qp[0]); + __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]); q = _mm256_mulhi_epu16(q, qp[1]); __m256i dq = _mm256_mullo_epi16(q, qp[2]); @@ -221,8 +209,8 @@ static INLINE void quantize_32x32(const __m256i *thr, const __m256i *qp, } void av1_quantize_fp_32x32_avx2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { @@ -231,15 +219,26 @@ void av1_quantize_fp_32x32_avx2( (void)quant_shift_ptr; const unsigned int step = 16; - if (LIKELY(!skip_block)) { - __m256i qp[3]; - __m256i coeff, thr; - const int log_scale = 1; + __m256i qp[3]; + __m256i coeff, thr; + const int log_scale = 1; - init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); - read_coeff(coeff_ptr, &coeff); + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); + read_coeff(coeff_ptr, &coeff); - __m256i eob = _mm256_setzero_si256(); + __m256i eob = _mm256_setzero_si256(); + quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + + update_qp(log_scale, &thr, qp); + + while (n_coeffs > 0) { + read_coeff(coeff_ptr, &coeff); quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); coeff_ptr += step; @@ -247,40 +246,85 @@ void av1_quantize_fp_32x32_avx2( dqcoeff_ptr += step; iscan_ptr += step; n_coeffs -= step; + } + *eob_ptr = quant_gather_eob(eob); +} + +static INLINE void quantize_64x64(const __m256i *thr, const __m256i *qp, + __m256i *c, const int16_t *iscan_ptr, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + __m256i *eob) { + const __m256i abs_coeff = _mm256_abs_epi16(*c); + __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + mask = _mm256_or_si256(mask, 
_mm256_cmpeq_epi16(abs_coeff, *thr)); + const int nzflag = _mm256_movemask_epi8(mask); - update_qp(log_scale, &thr, qp); - - while (n_coeffs > 0) { - read_coeff(coeff_ptr, &coeff); - quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, - &eob); - - coeff_ptr += step; - qcoeff_ptr += step; - dqcoeff_ptr += step; - iscan_ptr += step; - n_coeffs -= step; - } - { - __m256i eob_s; - eob_s = _mm256_shuffle_epi32(eob, 0xe); - eob = _mm256_max_epi16(eob, eob_s); - eob_s = _mm256_shufflelo_epi16(eob, 0xe); - eob = _mm256_max_epi16(eob, eob_s); - eob_s = _mm256_shufflelo_epi16(eob, 1); - eob = _mm256_max_epi16(eob, eob_s); - const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), - _mm256_extractf128_si256(eob, 1)); - *eob_ptr = _mm_extract_epi16(final_eob, 0); - } + if (nzflag) { + __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]); + __m256i qh = _mm256_mulhi_epi16(q, qp[1]); + __m256i ql = _mm256_mullo_epi16(q, qp[1]); + qh = _mm256_slli_epi16(qh, 2); + ql = _mm256_srli_epi16(ql, 14); + q = _mm256_or_si256(qh, ql); + const __m256i dqh = _mm256_slli_epi16(_mm256_mulhi_epi16(q, qp[2]), 14); + const __m256i dql = _mm256_srli_epi16(_mm256_mullo_epi16(q, qp[2]), 2); + __m256i dq = _mm256_or_si256(dqh, dql); + + q = _mm256_sign_epi16(q, *c); + dq = _mm256_sign_epi16(dq, *c); + + store_two_quan(q, qcoeff, dq, dqcoeff); + const __m256i zero = _mm256_setzero_si256(); + const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr); + const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero); + const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero); + __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff); + cur_eob = _mm256_and_si256(cur_eob, nzero_coeff); + *eob = _mm256_max_epi16(*eob, cur_eob); } else { - do { - write_zero(qcoeff_ptr); - write_zero(dqcoeff_ptr); - qcoeff_ptr += step; - dqcoeff_ptr += step; - n_coeffs -= step; - } while (n_coeffs > 0); - *eob_ptr = 0; + write_zero(qcoeff); + write_zero(dqcoeff); + } +} + +void av1_quantize_fp_64x64_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + (void)scan_ptr; + (void)zbin_ptr; + (void)quant_shift_ptr; + const unsigned int step = 16; + + __m256i qp[3]; + __m256i coeff, thr; + const int log_scale = 2; + + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); + read_coeff(coeff_ptr, &coeff); + + __m256i eob = _mm256_setzero_si256(); + quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + + update_qp(log_scale, &thr, qp); + + while (n_coeffs > 0) { + read_coeff(coeff_ptr, &coeff); + quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; } + *eob_ptr = quant_gather_eob(eob); } diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c index 4f7c09546..b07e7717f 100644 --- a/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c +++ b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c @@ -12,7 +12,8 @@ #include <emmintrin.h> #include <xmmintrin.h> -#include "./av1_rtcd.h" +#include "config/av1_rtcd.h" + #include 
"aom/aom_integer.h" static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset, @@ -67,16 +68,80 @@ static INLINE void write_zero(tran_low_t *qcoeff, intptr_t offset) { } } +static INLINE void quantize(const int16_t *iscan_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const __m128i *round0, const __m128i *round1, + const __m128i *quant0, const __m128i *quant1, + const __m128i *dequant0, const __m128i *dequant1, + const __m128i *thr0, const __m128i *thr1, + __m128i *eob) { + __m128i coeff0, coeff1; + // Do DC and first 15 AC + read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1); + + // Poor man's sign extract + const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15); + const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15); + __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + const __m128i mask0 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff0, *thr0), + _mm_cmpeq_epi16(qcoeff0, *thr0)); + const __m128i mask1 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff1, *thr1), + _mm_cmpeq_epi16(qcoeff1, *thr1)); + const int16_t nzflag = _mm_movemask_epi8(mask0) | _mm_movemask_epi8(mask1); + + if (nzflag) { + qcoeff0 = _mm_adds_epi16(qcoeff0, *round0); + qcoeff1 = _mm_adds_epi16(qcoeff1, *round1); + const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0); + const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1); + + // Reinsert signs + qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs); + + coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0); + coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1); + + write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs); + + const __m128i zero = _mm_setzero_si128(); + // Scan for eob + const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + const __m128i iscan0 = + _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + const __m128i iscan1 = + _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0); + const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1); + const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0); + const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1); + const __m128i eob2 = _mm_max_epi16(eob0, eob1); + *eob = _mm_max_epi16(*eob, eob2); + } else { + write_zero(qcoeff_ptr, n_coeffs); + write_zero(dqcoeff_ptr, n_coeffs); + } +} + void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { - __m128i zero; - __m128i thr; - int16_t nzflag; (void)scan_ptr; (void)zbin_ptr; (void)quant_shift_ptr; @@ -86,167 +151,39 @@ void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, 
intptr_t n_coeffs, qcoeff_ptr += n_coeffs; dqcoeff_ptr += n_coeffs; n_coeffs = -n_coeffs; - zero = _mm_setzero_si128(); - - if (!skip_block) { - __m128i eob; - __m128i round, quant, dequant; - { - __m128i coeff0, coeff1; - - // Setup global values - { - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - } - - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - // Do DC and first 15 AC - read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs); - } - - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); - } - n_coeffs += 8 * 2; - } - - thr = _mm_srai_epi16(dequant, 1); - - // AC only loop - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - - read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | - _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); - - if (nzflag) { - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - write_qcoeff(&qcoeff0, 
&qcoeff1, qcoeff_ptr, n_coeffs); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs); - } else { - write_zero(qcoeff_ptr, n_coeffs); - write_zero(dqcoeff_ptr, n_coeffs); - } - } - - if (nzflag) { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); - eob = _mm_max_epi16(eob, eob0); - } - n_coeffs += 8 * 2; - } - - // Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); - } - } else { - do { - write_zero(dqcoeff_ptr, n_coeffs); - write_zero(qcoeff_ptr, n_coeffs); - n_coeffs += 8 * 2; - } while (n_coeffs < 0); - *eob_ptr = 0; + + const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr); + const __m128i round1 = _mm_unpackhi_epi64(round0, round0); + const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr); + const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0); + const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr); + const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0); + const __m128i thr0 = _mm_srai_epi16(dequant0, 1); + const __m128i thr1 = _mm_srai_epi16(dequant1, 1); + __m128i eob = _mm_setzero_si128(); + + quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0, + &round1, &quant0, &quant1, &dequant0, &dequant1, &thr0, &thr1, &eob); + + n_coeffs += 8 * 2; + + // AC only loop + while (n_coeffs < 0) { + quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1, + &round1, &quant1, &quant1, &dequant1, &dequant1, &thr1, &thr1, + &eob); + n_coeffs += 8 * 2; + } + + // Accumulate EOB + { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + *eob_ptr = _mm_extract_epi16(eob, 1); } } diff --git a/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm index dcc697ba3..faa2a232a 100644 --- a/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm +++ b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm @@ -47,6 +47,9 @@ paddd %1, xmm1 SUM_ACROSS_Q %1 %endmacro + +SECTION .text + ;void ssim_parms_sse2( ; unsigned char *s, ; int sp, diff --git a/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h new file mode 100644 index 000000000..0adefecdb --- /dev/null +++ 
b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_TXMF1D_SSE2_H_
+#define AV1_TXMF1D_SSE2_H_
+
+#include <smmintrin.h>
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm_sse4.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fdct4_new_sse4_1(const __m128i *input, __m128i *output,
+                          const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct8_new_sse4_1(const __m128i *input, __m128i *output,
+                          const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output,
+                           const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
+                           int8_t cos_bit);
+void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
+                           int8_t cos_bit);
+
+void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
+                           const int8_t cos_bit, const int8_t *stage_range);
+void av1_fadst8_new_sse4_1(const __m128i *input, __m128i *output,
+                           const int8_t cos_bit, const int8_t *stage_range);
+void av1_fadst16_new_sse4_1(const __m128i *input, __m128i *output,
+                            const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_idct4_new_sse4_1(const __m128i *input, __m128i *output,
+                          const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct8_new_sse4_1(const __m128i *input, __m128i *output,
+                          const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct16_new_sse4_1(const __m128i *input, __m128i *output,
+                           const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct32_new_sse4_1(const __m128i *input, __m128i *output,
+                           const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct64_new_sse4_1(const __m128i *input, __m128i *output,
+                           const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_iadst4_new_sse4_1(const __m128i *input, __m128i *output,
+                           const int8_t cos_bit, const int8_t *stage_range);
+void av1_iadst8_new_sse4_1(const __m128i *input, __m128i *output,
+                           const int8_t cos_bit, const int8_t *stage_range);
+void av1_iadst16_new_sse4_1(const __m128i *input, __m128i *output,
+                            const int8_t cos_bit, const int8_t *stage_range);
+static INLINE void transpose_32_4x4(int stride, const __m128i *input,
+                                    __m128i *output) {
+  __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]);
+  __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]);
+  __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]);
+  __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]);
+
+  output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);
+  output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);
+  output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);
+  output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);
+}
+
+// The entire input block can be represented as a grid of 4x4 sub-blocks,
+// and each 4x4 sub-block can be represented by 4 vertical __m128i vectors.
+// We first transpose each 4x4 sub-block internally,
+// then transpose the grid.
+static INLINE void transpose_32(int txfm_size, const __m128i
*input, + __m128i *output) { + const int num_per_128 = 4; + const int row_size = txfm_size; + const int col_size = txfm_size / num_per_128; + int r, c; + + // transpose each 4x4 block internally + for (r = 0; r < row_size; r += 4) { + for (c = 0; c < col_size; c++) { + transpose_32_4x4(col_size, &input[r * col_size + c], + &output[c * 4 * col_size + r / 4]); + } + } +} + +// out0 = in0*w0 + in1*w1 +// out1 = -in1*w0 + in0*w1 +#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \ + do { \ + const __m128i ww0 = _mm_set1_epi32(w0); \ + const __m128i ww1 = _mm_set1_epi32(w1); \ + const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0); \ + const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1); \ + out0 = _mm_add_epi32(in0_w0, in1_w1); \ + out0 = av1_round_shift_32_sse4_1(out0, bit); \ + const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1); \ + const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0); \ + out1 = _mm_sub_epi32(in0_w1, in1_w0); \ + out1 = av1_round_shift_32_sse4_1(out1, bit); \ + } while (0) + +// out0 = in0*w0 + in1*w1 +// out1 = in1*w0 - in0*w1 +#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \ + do { \ + btf_32_sse4_1_type0(w1, w0, in1, in0, out0, out1, bit); \ + } while (0) + +// out0 = in0*w0 + in1*w1 +// out1 = -in1*w0 + in0*w1 +#define btf_32_type0_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \ + do { \ + const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0); \ + const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1); \ + out0 = _mm_add_epi32(in0_w0, in1_w1); \ + out0 = _mm_add_epi32(out0, r); \ + out0 = _mm_srai_epi32(out0, bit); \ + const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1); \ + const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0); \ + out1 = _mm_sub_epi32(in0_w1, in1_w0); \ + out1 = _mm_add_epi32(out1, r); \ + out1 = _mm_srai_epi32(out1, bit); \ + } while (0) + +// out0 = in0*w0 + in1*w1 +// out1 = in1*w0 - in0*w1 +#define btf_32_type1_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \ + do { \ + btf_32_type0_sse4_1_new(ww1, ww0, in1, in0, out0, out1, r, bit); \ + } while (0) + +#ifdef __cplusplus +} +#endif + +#endif // AV1_TXMF1D_SSE2_H_ diff --git a/third_party/aom/av1/encoder/x86/corner_match_sse4.c b/third_party/aom/av1/encoder/x86/corner_match_sse4.c index 179da0d28..381f757da 100644 --- a/third_party/aom/av1/encoder/x86/corner_match_sse4.c +++ b/third_party/aom/av1/encoder/x86/corner_match_sse4.c @@ -5,7 +5,8 @@ #include <smmintrin.h> -#include "./av1_rtcd.h" +#include "config/av1_rtcd.h" + #include "aom_ports/mem.h" #include "av1/encoder/corner_match.h" diff --git a/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c b/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c deleted file mode 100644 index e5b19a44c..000000000 --- a/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c +++ /dev/null @@ -1,3483 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
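Editorial aside (hedged): the btf_32 macros above are fixed-point plane
rotations; with av1_round_shift_32_sse4_1 assumed to round to nearest by
`bit`, the type0 form computes, per 32-bit lane:

  out0 = round_shift(in0 * w0 + in1 * w1, bit);
  out1 = round_shift(in0 * w1 - in1 * w0, bit);

and type1 swaps the operands so that out1 = in1 * w0 - in0 * w1, the other
rotation direction.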
- */ - -#include <assert.h> -#include <emmintrin.h> // SSE2 - -#include "./aom_dsp_rtcd.h" -#include "./av1_rtcd.h" -#include "aom_dsp/txfm_common.h" -#include "aom_dsp/x86/fwd_txfm_sse2.h" -#include "aom_dsp/x86/synonyms.h" -#include "aom_dsp/x86/txfm_common_sse2.h" -#include "aom_ports/mem.h" - -static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in, - int stride, int flipud, int fliplr) { - const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); - const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); - __m128i mask; - - if (!flipud) { - in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); - in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); - } else { - in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); - in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); - in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - } - - if (fliplr) { - in[0] = _mm_shufflelo_epi16(in[0], 0x1b); - in[1] = _mm_shufflelo_epi16(in[1], 0x1b); - in[2] = _mm_shufflelo_epi16(in[2], 0x1b); - in[3] = _mm_shufflelo_epi16(in[3], 0x1b); - } - - in[0] = _mm_slli_epi16(in[0], 4); - in[1] = _mm_slli_epi16(in[1], 4); - in[2] = _mm_slli_epi16(in[2], 4); - in[3] = _mm_slli_epi16(in[3], 4); - - mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a); - in[0] = _mm_add_epi16(in[0], mask); - in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b); -} - -static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) { - const __m128i kOne = _mm_set1_epi16(1); - __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]); - __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]); - __m128i out01 = _mm_add_epi16(in01, kOne); - __m128i out23 = _mm_add_epi16(in23, kOne); - out01 = _mm_srai_epi16(out01, 2); - out23 = _mm_srai_epi16(out23, 2); - store_output(&out01, (output + 0 * 8)); - store_output(&out23, (output + 1 * 8)); -} - -static INLINE void transpose_4x4(__m128i *res) { - // Combine and transpose - // 00 01 02 03 20 21 22 23 - // 10 11 12 13 30 31 32 33 - const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); - const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); - - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); - res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); - - // 00 10 20 30 01 11 21 31 - // 02 12 22 32 03 13 23 33 - // only use the first 4 16-bit integers - res[1] = _mm_unpackhi_epi64(res[0], res[0]); - res[3] = _mm_unpackhi_epi64(res[2], res[2]); -} - -static void fdct4_sse2(__m128i *in) { - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i u[4], v[4]; - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpacklo_epi16(in[3], in[2]); - - v[0] = _mm_add_epi16(u[0], u[1]); - v[1] = _mm_sub_epi16(u[0], u[1]); - - u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0 - u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2 - u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1 - u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3 - - v[0] = 
_mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[1]); - in[1] = _mm_packs_epi32(u[2], u[3]); - transpose_4x4(in); -} - -static void fadst4_sse2(__m128i *in) { - const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9); - const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9); - const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); - const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); - const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); - const __m128i kZero = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8]; - __m128i in7 = _mm_add_epi16(in[0], in[1]); - - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpacklo_epi16(in[2], in[3]); - u[2] = _mm_unpacklo_epi16(in7, kZero); - u[3] = _mm_unpacklo_epi16(in[2], kZero); - u[4] = _mm_unpacklo_epi16(in[3], kZero); - - v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2 - v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5 - v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1 - v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3 - v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6 - v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4 - v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03); - - u[0] = _mm_add_epi32(v[0], v[1]); - u[1] = _mm_sub_epi32(v[2], v[6]); - u[2] = _mm_add_epi32(v[3], v[4]); - u[3] = _mm_sub_epi32(u[2], u[0]); - u[4] = _mm_slli_epi32(v[5], 2); - u[5] = _mm_sub_epi32(u[4], v[5]); - u[6] = _mm_add_epi32(u[3], u[5]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[2]); - in[1] = _mm_packs_epi32(u[1], u[3]); - transpose_4x4(in); -} - -#if CONFIG_EXT_TX -static void fidtx4_sse2(__m128i *in) { - const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0); - const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i v0, v1, v2, v3; - __m128i u0, u1, u2, u3; - - v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16); - v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16); - v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16); - v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16); - - u0 = _mm_madd_epi16(v0, k__sqrt2_epi16); - u1 = _mm_madd_epi16(v1, k__sqrt2_epi16); - u2 = _mm_madd_epi16(v2, k__sqrt2_epi16); - u3 = _mm_madd_epi16(v3, k__sqrt2_epi16); - - v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - - in[0] = 
_mm_packs_epi32(u0, u2); - in[1] = _mm_packs_epi32(u1, u3); - transpose_4x4(in); -} -#endif // CONFIG_EXT_TX - -void av1_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in[4]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - switch (tx_type) { - case DCT_DCT: aom_fdct4x4_sse2(input, output, stride); break; - case ADST_DCT: - load_buffer_4x4(input, in, stride, 0, 0); - fadst4_sse2(in); - fdct4_sse2(in); - write_buffer_4x4(output, in); - break; - case DCT_ADST: - load_buffer_4x4(input, in, stride, 0, 0); - fdct4_sse2(in); - fadst4_sse2(in); - write_buffer_4x4(output, in); - break; - case ADST_ADST: - load_buffer_4x4(input, in, stride, 0, 0); - fadst4_sse2(in); - fadst4_sse2(in); - write_buffer_4x4(output, in); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_4x4(input, in, stride, 1, 0); - fadst4_sse2(in); - fdct4_sse2(in); - write_buffer_4x4(output, in); - break; - case DCT_FLIPADST: - load_buffer_4x4(input, in, stride, 0, 1); - fdct4_sse2(in); - fadst4_sse2(in); - write_buffer_4x4(output, in); - break; - case FLIPADST_FLIPADST: - load_buffer_4x4(input, in, stride, 1, 1); - fadst4_sse2(in); - fadst4_sse2(in); - write_buffer_4x4(output, in); - break; - case ADST_FLIPADST: - load_buffer_4x4(input, in, stride, 0, 1); - fadst4_sse2(in); - fadst4_sse2(in); - write_buffer_4x4(output, in); - break; - case FLIPADST_ADST: - load_buffer_4x4(input, in, stride, 1, 0); - fadst4_sse2(in); - fadst4_sse2(in); - write_buffer_4x4(output, in); - break; - case IDTX: - load_buffer_4x4(input, in, stride, 0, 0); - fidtx4_sse2(in); - fidtx4_sse2(in); - write_buffer_4x4(output, in); - break; - case V_DCT: - load_buffer_4x4(input, in, stride, 0, 0); - fdct4_sse2(in); - fidtx4_sse2(in); - write_buffer_4x4(output, in); - break; - case H_DCT: - load_buffer_4x4(input, in, stride, 0, 0); - fidtx4_sse2(in); - fdct4_sse2(in); - write_buffer_4x4(output, in); - break; - case V_ADST: - load_buffer_4x4(input, in, stride, 0, 0); - fadst4_sse2(in); - fidtx4_sse2(in); - write_buffer_4x4(output, in); - break; - case H_ADST: - load_buffer_4x4(input, in, stride, 0, 0); - fidtx4_sse2(in); - fadst4_sse2(in); - write_buffer_4x4(output, in); - break; - case V_FLIPADST: - load_buffer_4x4(input, in, stride, 1, 0); - fadst4_sse2(in); - fidtx4_sse2(in); - write_buffer_4x4(output, in); - break; - case H_FLIPADST: - load_buffer_4x4(input, in, stride, 0, 1); - fidtx4_sse2(in); - fadst4_sse2(in); - write_buffer_4x4(output, in); - break; -#endif // CONFIG_EXT_TX - default: assert(0); - } -} - -// load 8x8 array -static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in, - int stride, int flipud, int fliplr) { - if (!flipud) { - in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); - in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); - in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); - in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); - in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); - in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); - in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); - in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); - } else { - in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride)); - in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride)); - in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride)); - in[3] = _mm_load_si128((const __m128i 
*)(input + 4 * stride)); - in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride)); - in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride)); - in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride)); - in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride)); - } - - if (fliplr) { - in[0] = mm_reverse_epi16(in[0]); - in[1] = mm_reverse_epi16(in[1]); - in[2] = mm_reverse_epi16(in[2]); - in[3] = mm_reverse_epi16(in[3]); - in[4] = mm_reverse_epi16(in[4]); - in[5] = mm_reverse_epi16(in[5]); - in[6] = mm_reverse_epi16(in[6]); - in[7] = mm_reverse_epi16(in[7]); - } - - in[0] = _mm_slli_epi16(in[0], 2); - in[1] = _mm_slli_epi16(in[1], 2); - in[2] = _mm_slli_epi16(in[2], 2); - in[3] = _mm_slli_epi16(in[3], 2); - in[4] = _mm_slli_epi16(in[4], 2); - in[5] = _mm_slli_epi16(in[5], 2); - in[6] = _mm_slli_epi16(in[6], 2); - in[7] = _mm_slli_epi16(in[7], 2); -} - -// right shift and rounding -static INLINE void right_shift_8x8(__m128i *res, const int bit) { - __m128i sign0 = _mm_srai_epi16(res[0], 15); - __m128i sign1 = _mm_srai_epi16(res[1], 15); - __m128i sign2 = _mm_srai_epi16(res[2], 15); - __m128i sign3 = _mm_srai_epi16(res[3], 15); - __m128i sign4 = _mm_srai_epi16(res[4], 15); - __m128i sign5 = _mm_srai_epi16(res[5], 15); - __m128i sign6 = _mm_srai_epi16(res[6], 15); - __m128i sign7 = _mm_srai_epi16(res[7], 15); - - if (bit == 2) { - const __m128i const_rounding = _mm_set1_epi16(1); - res[0] = _mm_adds_epi16(res[0], const_rounding); - res[1] = _mm_adds_epi16(res[1], const_rounding); - res[2] = _mm_adds_epi16(res[2], const_rounding); - res[3] = _mm_adds_epi16(res[3], const_rounding); - res[4] = _mm_adds_epi16(res[4], const_rounding); - res[5] = _mm_adds_epi16(res[5], const_rounding); - res[6] = _mm_adds_epi16(res[6], const_rounding); - res[7] = _mm_adds_epi16(res[7], const_rounding); - } - - res[0] = _mm_sub_epi16(res[0], sign0); - res[1] = _mm_sub_epi16(res[1], sign1); - res[2] = _mm_sub_epi16(res[2], sign2); - res[3] = _mm_sub_epi16(res[3], sign3); - res[4] = _mm_sub_epi16(res[4], sign4); - res[5] = _mm_sub_epi16(res[5], sign5); - res[6] = _mm_sub_epi16(res[6], sign6); - res[7] = _mm_sub_epi16(res[7], sign7); - - if (bit == 1) { - res[0] = _mm_srai_epi16(res[0], 1); - res[1] = _mm_srai_epi16(res[1], 1); - res[2] = _mm_srai_epi16(res[2], 1); - res[3] = _mm_srai_epi16(res[3], 1); - res[4] = _mm_srai_epi16(res[4], 1); - res[5] = _mm_srai_epi16(res[5], 1); - res[6] = _mm_srai_epi16(res[6], 1); - res[7] = _mm_srai_epi16(res[7], 1); - } else { - res[0] = _mm_srai_epi16(res[0], 2); - res[1] = _mm_srai_epi16(res[1], 2); - res[2] = _mm_srai_epi16(res[2], 2); - res[3] = _mm_srai_epi16(res[3], 2); - res[4] = _mm_srai_epi16(res[4], 2); - res[5] = _mm_srai_epi16(res[5], 2); - res[6] = _mm_srai_epi16(res[6], 2); - res[7] = _mm_srai_epi16(res[7], 2); - } -} - -// write 8x8 array -static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res, - int stride) { - store_output(&res[0], (output + 0 * stride)); - store_output(&res[1], (output + 1 * stride)); - store_output(&res[2], (output + 2 * stride)); - store_output(&res[3], (output + 3 * stride)); - store_output(&res[4], (output + 4 * stride)); - store_output(&res[5], (output + 5 * stride)); - store_output(&res[6], (output + 6 * stride)); - store_output(&res[7], (output + 7 * stride)); -} - -// perform in-place transpose -static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = 
_mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 44 54 45 55 46 56 47 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 25 35 - // 44 54 64 74 45 55 65 75 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 -} - -static void fdct8_sse2(__m128i *in) { - // constants - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u0, u1, u2, u3, u4, u5, u6, u7; - __m128i v0, v1, v2, v3, v4, v5, v6, v7; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - - // stage 1 - s0 = _mm_add_epi16(in[0], in[7]); - s1 = _mm_add_epi16(in[1], in[6]); - s2 = _mm_add_epi16(in[2], in[5]); - s3 = _mm_add_epi16(in[3], in[4]); - s4 = _mm_sub_epi16(in[3], in[4]); - s5 = _mm_sub_epi16(in[2], in[5]); - s6 = _mm_sub_epi16(in[1], in[6]); - s7 = _mm_sub_epi16(in[0], in[7]); - - u0 = _mm_add_epi16(s0, s3); - u1 = _mm_add_epi16(s1, s2); - u2 = _mm_sub_epi16(s1, s2); - u3 = _mm_sub_epi16(s0, s3); - // interleave and perform butterfly multiplication/addition - v0 = _mm_unpacklo_epi16(u0, u1); - v1 = _mm_unpackhi_epi16(u0, u1); - v2 = _mm_unpacklo_epi16(u2, u3); - v3 = _mm_unpackhi_epi16(u2, u3); - - u0 = _mm_madd_epi16(v0, k__cospi_p16_p16); - u1 = _mm_madd_epi16(v1, k__cospi_p16_p16); - u2 = _mm_madd_epi16(v0, k__cospi_p16_m16); - u3 = _mm_madd_epi16(v1, k__cospi_p16_m16); - u4 = _mm_madd_epi16(v2, 
k__cospi_p24_p08); - u5 = _mm_madd_epi16(v3, k__cospi_p24_p08); - u6 = _mm_madd_epi16(v2, k__cospi_m08_p24); - u7 = _mm_madd_epi16(v3, k__cospi_m08_p24); - - // shift and rounding - v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u0, u1); - in[2] = _mm_packs_epi32(u4, u5); - in[4] = _mm_packs_epi32(u2, u3); - in[6] = _mm_packs_epi32(u6, u7); - - // stage 2 - // interleave and perform butterfly multiplication/addition - u0 = _mm_unpacklo_epi16(s6, s5); - u1 = _mm_unpackhi_epi16(s6, s5); - v0 = _mm_madd_epi16(u0, k__cospi_p16_m16); - v1 = _mm_madd_epi16(u1, k__cospi_p16_m16); - v2 = _mm_madd_epi16(u0, k__cospi_p16_p16); - v3 = _mm_madd_epi16(u1, k__cospi_p16_p16); - - // shift and rounding - u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - - u0 = _mm_packs_epi32(v0, v1); - u1 = _mm_packs_epi32(v2, v3); - - // stage 3 - s0 = _mm_add_epi16(s4, u0); - s1 = _mm_sub_epi16(s4, u0); - s2 = _mm_sub_epi16(s7, u1); - s3 = _mm_add_epi16(s7, u1); - - // stage 4 - u0 = _mm_unpacklo_epi16(s0, s3); - u1 = _mm_unpackhi_epi16(s0, s3); - u2 = _mm_unpacklo_epi16(s1, s2); - u3 = _mm_unpackhi_epi16(s1, s2); - - v0 = _mm_madd_epi16(u0, k__cospi_p28_p04); - v1 = _mm_madd_epi16(u1, k__cospi_p28_p04); - v2 = _mm_madd_epi16(u2, k__cospi_p12_p20); - v3 = _mm_madd_epi16(u3, k__cospi_p12_p20); - v4 = _mm_madd_epi16(u2, k__cospi_m20_p12); - v5 = _mm_madd_epi16(u3, k__cospi_m20_p12); - v6 = _mm_madd_epi16(u0, k__cospi_m04_p28); - v7 = _mm_madd_epi16(u1, k__cospi_m04_p28); - - // shift and rounding - u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); - u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); - u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); - u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); - v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); - v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); - v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); - - in[1] = _mm_packs_epi32(v0, v1); - in[3] = _mm_packs_epi32(v4, v5); - in[5] = _mm_packs_epi32(v2, v3); - in[7] = _mm_packs_epi32(v6, v7); - - // transpose - array_transpose_8x8(in, in); -} - -static void fadst8_sse2(__m128i *in) { - // Constants - const __m128i k__cospi_p02_p30 = 
pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); - const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__const_0 = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; - __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; - __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - - // properly aligned for butterfly input - in0 = in[7]; - in1 = in[0]; - in2 = in[5]; - in3 = in[2]; - in4 = in[3]; - in5 = in[4]; - in6 = in[1]; - in7 = in[6]; - - // column transformation - // stage 1 - // interleave and multiply/add into 32-bit integer - s0 = _mm_unpacklo_epi16(in0, in1); - s1 = _mm_unpackhi_epi16(in0, in1); - s2 = _mm_unpacklo_epi16(in2, in3); - s3 = _mm_unpackhi_epi16(in2, in3); - s4 = _mm_unpacklo_epi16(in4, in5); - s5 = _mm_unpackhi_epi16(in4, in5); - s6 = _mm_unpacklo_epi16(in6, in7); - s7 = _mm_unpackhi_epi16(in6, in7); - - u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); - u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); - u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); - u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); - u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); - u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); - u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); - u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); - u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); - u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); - u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); - u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); - u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); - u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); - u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); - u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); - - // addition - w0 = _mm_add_epi32(u0, u8); - w1 = _mm_add_epi32(u1, u9); - w2 = _mm_add_epi32(u2, u10); - w3 = _mm_add_epi32(u3, u11); - w4 = _mm_add_epi32(u4, u12); - w5 = _mm_add_epi32(u5, u13); - w6 = _mm_add_epi32(u6, u14); - w7 = _mm_add_epi32(u7, u15); - w8 = _mm_sub_epi32(u0, u8); - w9 = _mm_sub_epi32(u1, u9); - w10 = _mm_sub_epi32(u2, u10); - w11 = _mm_sub_epi32(u3, u11); - w12 = _mm_sub_epi32(u4, u12); - w13 = _mm_sub_epi32(u5, u13); - w14 = _mm_sub_epi32(u6, u14); - w15 = _mm_sub_epi32(u7, u15); - - // shift and rounding - v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); - v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); - v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); - v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); - v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); - v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); - v14 = 
_mm_add_epi32(w14, k__DCT_CONST_ROUNDING); - v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); - - u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); - u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); - u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); - u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); - u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); - u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); - u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); - u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); - - // back to 16-bit and pack 8 integers into __m128i - v0 = _mm_add_epi32(w0, w4); - v1 = _mm_add_epi32(w1, w5); - v2 = _mm_add_epi32(w2, w6); - v3 = _mm_add_epi32(w3, w7); - v4 = _mm_sub_epi32(w0, w4); - v5 = _mm_sub_epi32(w1, w5); - v6 = _mm_sub_epi32(w2, w6); - v7 = _mm_sub_epi32(w3, w7); - - w0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - w1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - w2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - w3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - w4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); - w5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); - w6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); - w7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(w0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(w1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(w2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(w3, DCT_CONST_BITS); - v4 = _mm_srai_epi32(w4, DCT_CONST_BITS); - v5 = _mm_srai_epi32(w5, DCT_CONST_BITS); - v6 = _mm_srai_epi32(w6, DCT_CONST_BITS); - v7 = _mm_srai_epi32(w7, DCT_CONST_BITS); - - in[4] = _mm_packs_epi32(u8, u9); - in[5] = _mm_packs_epi32(u10, u11); - in[6] = _mm_packs_epi32(u12, u13); - in[7] = _mm_packs_epi32(u14, u15); - - // stage 2 - s0 = _mm_packs_epi32(v0, v1); - s1 = _mm_packs_epi32(v2, v3); - s2 = _mm_packs_epi32(v4, v5); - s3 = _mm_packs_epi32(v6, v7); - - u0 = _mm_unpacklo_epi16(in[4], in[5]); - u1 = _mm_unpackhi_epi16(in[4], in[5]); - u2 = _mm_unpacklo_epi16(in[6], in[7]); - u3 = _mm_unpackhi_epi16(in[6], in[7]); - - v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); - v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); - v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); - v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); - v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); - v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); - v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); - v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); - - w0 = _mm_add_epi32(v0, v4); - w1 = _mm_add_epi32(v1, v5); - w2 = _mm_add_epi32(v2, v6); - w3 = _mm_add_epi32(v3, v7); - w4 = _mm_sub_epi32(v0, v4); - w5 = _mm_sub_epi32(v1, v5); - w6 = _mm_sub_epi32(v2, v6); - w7 = _mm_sub_epi32(v3, v7); - - v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - - // back to 16-bit integers - s4 = _mm_packs_epi32(u0, u1); - s5 = _mm_packs_epi32(u2, u3); - s6 = _mm_packs_epi32(u4, u5); - s7 = _mm_packs_epi32(u6, u7); - - // stage 3 - u0 = _mm_unpacklo_epi16(s2, s3); - u1 = _mm_unpackhi_epi16(s2, s3); - u2 = 
_mm_unpacklo_epi16(s6, s7); - u3 = _mm_unpackhi_epi16(s6, s7); - - v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); - v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); - v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); - v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); - v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); - v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); - v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); - v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); - - u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); - u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); - u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); - u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); - v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); - v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); - v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); - - s2 = _mm_packs_epi32(v0, v1); - s3 = _mm_packs_epi32(v2, v3); - s6 = _mm_packs_epi32(v4, v5); - s7 = _mm_packs_epi32(v6, v7); - - // FIXME(jingning): do subtract using bit inversion? - in[0] = s0; - in[1] = _mm_sub_epi16(k__const_0, s4); - in[2] = s6; - in[3] = _mm_sub_epi16(k__const_0, s2); - in[4] = s3; - in[5] = _mm_sub_epi16(k__const_0, s7); - in[6] = s5; - in[7] = _mm_sub_epi16(k__const_0, s1); - - // transpose - array_transpose_8x8(in, in); -} - -#if CONFIG_EXT_TX -static void fidtx8_sse2(__m128i *in) { - in[0] = _mm_slli_epi16(in[0], 1); - in[1] = _mm_slli_epi16(in[1], 1); - in[2] = _mm_slli_epi16(in[2], 1); - in[3] = _mm_slli_epi16(in[3], 1); - in[4] = _mm_slli_epi16(in[4], 1); - in[5] = _mm_slli_epi16(in[5], 1); - in[6] = _mm_slli_epi16(in[6], 1); - in[7] = _mm_slli_epi16(in[7], 1); - - array_transpose_8x8(in, in); -} -#endif // CONFIG_EXT_TX - -void av1_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in[8]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - switch (tx_type) { - case DCT_DCT: aom_fdct8x8_sse2(input, output, stride); break; - case ADST_DCT: - load_buffer_8x8(input, in, stride, 0, 0); - fadst8_sse2(in); - fdct8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case DCT_ADST: - load_buffer_8x8(input, in, stride, 0, 0); - fdct8_sse2(in); - fadst8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case ADST_ADST: - load_buffer_8x8(input, in, stride, 0, 0); - fadst8_sse2(in); - fadst8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_8x8(input, in, stride, 1, 0); - fadst8_sse2(in); - fdct8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case DCT_FLIPADST: - load_buffer_8x8(input, in, stride, 0, 1); - fdct8_sse2(in); - fadst8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case FLIPADST_FLIPADST: - load_buffer_8x8(input, in, stride, 1, 1); - fadst8_sse2(in); - fadst8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case ADST_FLIPADST: - load_buffer_8x8(input, in, stride, 0, 1); - fadst8_sse2(in); - fadst8_sse2(in); - right_shift_8x8(in, 
1); - write_buffer_8x8(output, in, 8); - break; - case FLIPADST_ADST: - load_buffer_8x8(input, in, stride, 1, 0); - fadst8_sse2(in); - fadst8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case IDTX: - load_buffer_8x8(input, in, stride, 0, 0); - fidtx8_sse2(in); - fidtx8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case V_DCT: - load_buffer_8x8(input, in, stride, 0, 0); - fdct8_sse2(in); - fidtx8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case H_DCT: - load_buffer_8x8(input, in, stride, 0, 0); - fidtx8_sse2(in); - fdct8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case V_ADST: - load_buffer_8x8(input, in, stride, 0, 0); - fadst8_sse2(in); - fidtx8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case H_ADST: - load_buffer_8x8(input, in, stride, 0, 0); - fidtx8_sse2(in); - fadst8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case V_FLIPADST: - load_buffer_8x8(input, in, stride, 1, 0); - fadst8_sse2(in); - fidtx8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case H_FLIPADST: - load_buffer_8x8(input, in, stride, 0, 1); - fidtx8_sse2(in); - fadst8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; -#endif // CONFIG_EXT_TX - default: assert(0); - } -} - -static INLINE void load_buffer_16x16(const int16_t *input, __m128i *in0, - __m128i *in1, int stride, int flipud, - int fliplr) { - // Load 4 8x8 blocks - const int16_t *topL = input; - const int16_t *topR = input + 8; - const int16_t *botL = input + 8 * stride; - const int16_t *botR = input + 8 * stride + 8; - - const int16_t *tmp; - - if (flipud) { - // Swap left columns - tmp = topL; - topL = botL; - botL = tmp; - // Swap right columns - tmp = topR; - topR = botR; - botR = tmp; - } - - if (fliplr) { - // Swap top rows - tmp = topL; - topL = topR; - topR = tmp; - // Swap bottom rows - tmp = botL; - botL = botR; - botR = tmp; - } - - // load first 8 columns - load_buffer_8x8(topL, in0, stride, flipud, fliplr); - load_buffer_8x8(botL, in0 + 8, stride, flipud, fliplr); - - // load second 8 columns - load_buffer_8x8(topR, in1, stride, flipud, fliplr); - load_buffer_8x8(botR, in1 + 8, stride, flipud, fliplr); -} - -static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0, - __m128i *in1, int stride) { - // write first 8 columns - write_buffer_8x8(output, in0, stride); - write_buffer_8x8(output + 8 * stride, in0 + 8, stride); - // write second 8 columns - output += 8; - write_buffer_8x8(output, in1, stride); - write_buffer_8x8(output + 8 * stride, in1 + 8, stride); -} - -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; -} - -static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { - // perform rounding operations - right_shift_8x8(res0, 2); - right_shift_8x8(res0 + 8, 2); - right_shift_8x8(res1, 2); - right_shift_8x8(res1 + 8, 2); -} - -static void fdct16_8col(__m128i *in) { - // perform 16x16 1-D DCT for 8 columns - __m128i i[8], s[8], p[8], t[8], u[16], v[16]; - const __m128i 
k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); - const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); - const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); - const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); - const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); - const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); - const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); - const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - // stage 1 - i[0] = _mm_add_epi16(in[0], in[15]); - i[1] = _mm_add_epi16(in[1], in[14]); - i[2] = _mm_add_epi16(in[2], in[13]); - i[3] = _mm_add_epi16(in[3], in[12]); - i[4] = _mm_add_epi16(in[4], in[11]); - i[5] = _mm_add_epi16(in[5], in[10]); - i[6] = _mm_add_epi16(in[6], in[9]); - i[7] = _mm_add_epi16(in[7], in[8]); - - s[0] = _mm_sub_epi16(in[7], in[8]); - s[1] = _mm_sub_epi16(in[6], in[9]); - s[2] = _mm_sub_epi16(in[5], in[10]); - s[3] = _mm_sub_epi16(in[4], in[11]); - s[4] = _mm_sub_epi16(in[3], in[12]); - s[5] = _mm_sub_epi16(in[2], in[13]); - s[6] = _mm_sub_epi16(in[1], in[14]); - s[7] = _mm_sub_epi16(in[0], in[15]); - - p[0] = _mm_add_epi16(i[0], i[7]); - p[1] = _mm_add_epi16(i[1], i[6]); - p[2] = _mm_add_epi16(i[2], i[5]); - p[3] = _mm_add_epi16(i[3], i[4]); - p[4] = _mm_sub_epi16(i[3], i[4]); - p[5] = _mm_sub_epi16(i[2], i[5]); - p[6] = _mm_sub_epi16(i[1], i[6]); - p[7] = _mm_sub_epi16(i[0], i[7]); - - u[0] = _mm_add_epi16(p[0], p[3]); - u[1] = _mm_add_epi16(p[1], p[2]); - u[2] = _mm_sub_epi16(p[1], p[2]); - u[3] = _mm_sub_epi16(p[0], p[3]); - - v[0] = _mm_unpacklo_epi16(u[0], u[1]); - v[1] = _mm_unpackhi_epi16(u[0], u[1]); - v[2] = _mm_unpacklo_epi16(u[2], u[3]); - v[3] = _mm_unpackhi_epi16(u[2], u[3]); - - u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); - u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16); - u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16); - u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16); - u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08); - u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08); - u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24); - u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - 
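The madd/add/srai/packs sequence here is the fixed-point rounding idiom behind every butterfly in this file. A minimal scalar sketch of one output lane, assuming DCT_CONST_BITS == 14 and DCT_CONST_ROUNDING == 1 << 13 as in aom_dsp/txfm_common.h (the helper name and mirrored constants are illustrative, not from the diff):

#include <stdint.h>

/* One butterfly output: round((a*c0 + b*c1) / 2^14), saturated to 16 bits.
 * _mm_madd_epi16 does the two multiplies plus the add for each 32-bit lane,
 * _mm_add_epi32/_mm_srai_epi32 perform the rounding shift, and
 * _mm_packs_epi32 saturates the result back to 16 bits. */
enum { kDctConstBits = 14, kDctConstRounding = 1 << (kDctConstBits - 1) };

static int16_t butterfly_round(int16_t a, int16_t b, int32_t c0, int32_t c1) {
  const int32_t sum = a * c0 + b * c1;
  const int32_t rnd = (sum + kDctConstRounding) >> kDctConstBits;
  if (rnd > INT16_MAX) return INT16_MAX;
  if (rnd < INT16_MIN) return INT16_MIN;
  return (int16_t)rnd;
}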
u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[1]); - in[4] = _mm_packs_epi32(u[4], u[5]); - in[8] = _mm_packs_epi32(u[2], u[3]); - in[12] = _mm_packs_epi32(u[6], u[7]); - - u[0] = _mm_unpacklo_epi16(p[5], p[6]); - u[1] = _mm_unpackhi_epi16(p[5], p[6]); - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - - u[0] = _mm_packs_epi32(v[0], v[1]); - u[1] = _mm_packs_epi32(v[2], v[3]); - - t[0] = _mm_add_epi16(p[4], u[0]); - t[1] = _mm_sub_epi16(p[4], u[0]); - t[2] = _mm_sub_epi16(p[7], u[1]); - t[3] = _mm_add_epi16(p[7], u[1]); - - u[0] = _mm_unpacklo_epi16(t[0], t[3]); - u[1] = _mm_unpackhi_epi16(t[0], t[3]); - u[2] = _mm_unpacklo_epi16(t[1], t[2]); - u[3] = _mm_unpackhi_epi16(t[1], t[2]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04); - v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04); - v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20); - v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20); - v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12); - v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12); - v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28); - v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - in[2] = _mm_packs_epi32(v[0], v[1]); - in[6] = _mm_packs_epi32(v[4], v[5]); - in[10] = _mm_packs_epi32(v[2], v[3]); - in[14] = _mm_packs_epi32(v[6], v[7]); - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[2], s[5]); - u[1] = _mm_unpackhi_epi16(s[2], s[5]); - u[2] = _mm_unpacklo_epi16(s[3], s[4]); - u[3] = _mm_unpackhi_epi16(s[3], s[4]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = 
_mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - t[2] = _mm_packs_epi32(v[0], v[1]); - t[3] = _mm_packs_epi32(v[2], v[3]); - t[4] = _mm_packs_epi32(v[4], v[5]); - t[5] = _mm_packs_epi32(v[6], v[7]); - - // stage 3 - p[0] = _mm_add_epi16(s[0], t[3]); - p[1] = _mm_add_epi16(s[1], t[2]); - p[2] = _mm_sub_epi16(s[1], t[2]); - p[3] = _mm_sub_epi16(s[0], t[3]); - p[4] = _mm_sub_epi16(s[7], t[4]); - p[5] = _mm_sub_epi16(s[6], t[5]); - p[6] = _mm_add_epi16(s[6], t[5]); - p[7] = _mm_add_epi16(s[7], t[4]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(p[1], p[6]); - u[1] = _mm_unpackhi_epi16(p[1], p[6]); - u[2] = _mm_unpacklo_epi16(p[2], p[5]); - u[3] = _mm_unpackhi_epi16(p[2], p[5]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24); - v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24); - v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08); - v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08); - v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24); - v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24); - v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08); - v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - t[1] = _mm_packs_epi32(v[0], v[1]); - t[2] = _mm_packs_epi32(v[2], v[3]); - t[5] = _mm_packs_epi32(v[4], v[5]); - t[6] = _mm_packs_epi32(v[6], v[7]); - - // stage 5 - s[0] = _mm_add_epi16(p[0], t[1]); - s[1] = _mm_sub_epi16(p[0], t[1]); - s[2] = _mm_sub_epi16(p[3], t[2]); - s[3] = _mm_add_epi16(p[3], t[2]); - s[4] = _mm_add_epi16(p[4], t[5]); - s[5] = _mm_sub_epi16(p[4], t[5]); - s[6] = _mm_sub_epi16(p[7], t[6]); - s[7] = _mm_add_epi16(p[7], t[6]); - - // stage 6 - u[0] = _mm_unpacklo_epi16(s[0], s[7]); - u[1] = _mm_unpackhi_epi16(s[0], s[7]); - u[2] = _mm_unpacklo_epi16(s[1], s[6]); - u[3] = _mm_unpackhi_epi16(s[1], s[6]); - u[4] = _mm_unpacklo_epi16(s[2], s[5]); - u[5] = _mm_unpackhi_epi16(s[2], s[5]); - u[6] = _mm_unpacklo_epi16(s[3], s[4]); - u[7] = _mm_unpackhi_epi16(s[3], s[4]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02); - v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02); - v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18); - v[3] = _mm_madd_epi16(u[3], 
k__cospi_p14_p18); - v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10); - v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10); - v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26); - v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26); - v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06); - v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06); - v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22); - v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22); - v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14); - v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14); - v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30); - v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - in[1] = _mm_packs_epi32(v[0], v[1]); - in[9] = _mm_packs_epi32(v[2], v[3]); - in[5] = _mm_packs_epi32(v[4], v[5]); - in[13] = _mm_packs_epi32(v[6], v[7]); - in[3] = _mm_packs_epi32(v[8], v[9]); - in[11] = _mm_packs_epi32(v[10], v[11]); - in[7] = _mm_packs_epi32(v[12], v[13]); - in[15] = _mm_packs_epi32(v[14], v[15]); -} - -static void fadst16_8col(__m128i *in) { - // perform 16x16 1-D ADST for 8 columns - __m128i s[16], x[16], u[32], v[32]; - const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i 
k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); - const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); - const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); - - u[0] = _mm_unpacklo_epi16(in[15], in[0]); - u[1] = _mm_unpackhi_epi16(in[15], in[0]); - u[2] = _mm_unpacklo_epi16(in[13], in[2]); - u[3] = _mm_unpackhi_epi16(in[13], in[2]); - u[4] = _mm_unpacklo_epi16(in[11], in[4]); - u[5] = _mm_unpackhi_epi16(in[11], in[4]); - u[6] = _mm_unpacklo_epi16(in[9], in[6]); - u[7] = _mm_unpackhi_epi16(in[9], in[6]); - u[8] = _mm_unpacklo_epi16(in[7], in[8]); - u[9] = _mm_unpackhi_epi16(in[7], in[8]); - u[10] = _mm_unpacklo_epi16(in[5], in[10]); - u[11] = _mm_unpackhi_epi16(in[5], in[10]); - u[12] = _mm_unpacklo_epi16(in[3], in[12]); - u[13] = _mm_unpackhi_epi16(in[3], in[12]); - u[14] = _mm_unpacklo_epi16(in[1], in[14]); - u[15] = _mm_unpackhi_epi16(in[1], in[14]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); - v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); - v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); - v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); - v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); - v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); - v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); - v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); - v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); - v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); - v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); - v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); - v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); - v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); - v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); - v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); - v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); - v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); - v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); - v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); - v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); - v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); - v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); - v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); - v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); - v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); - v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); - v[27] = 
_mm_madd_epi16(u[13], k__cospi_p07_m25); - v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); - v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); - v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); - v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); - - u[0] = _mm_add_epi32(v[0], v[16]); - u[1] = _mm_add_epi32(v[1], v[17]); - u[2] = _mm_add_epi32(v[2], v[18]); - u[3] = _mm_add_epi32(v[3], v[19]); - u[4] = _mm_add_epi32(v[4], v[20]); - u[5] = _mm_add_epi32(v[5], v[21]); - u[6] = _mm_add_epi32(v[6], v[22]); - u[7] = _mm_add_epi32(v[7], v[23]); - u[8] = _mm_add_epi32(v[8], v[24]); - u[9] = _mm_add_epi32(v[9], v[25]); - u[10] = _mm_add_epi32(v[10], v[26]); - u[11] = _mm_add_epi32(v[11], v[27]); - u[12] = _mm_add_epi32(v[12], v[28]); - u[13] = _mm_add_epi32(v[13], v[29]); - u[14] = _mm_add_epi32(v[14], v[30]); - u[15] = _mm_add_epi32(v[15], v[31]); - u[16] = _mm_sub_epi32(v[0], v[16]); - u[17] = _mm_sub_epi32(v[1], v[17]); - u[18] = _mm_sub_epi32(v[2], v[18]); - u[19] = _mm_sub_epi32(v[3], v[19]); - u[20] = _mm_sub_epi32(v[4], v[20]); - u[21] = _mm_sub_epi32(v[5], v[21]); - u[22] = _mm_sub_epi32(v[6], v[22]); - u[23] = _mm_sub_epi32(v[7], v[23]); - u[24] = _mm_sub_epi32(v[8], v[24]); - u[25] = _mm_sub_epi32(v[9], v[25]); - u[26] = _mm_sub_epi32(v[10], v[26]); - u[27] = _mm_sub_epi32(v[11], v[27]); - u[28] = _mm_sub_epi32(v[12], v[28]); - u[29] = _mm_sub_epi32(v[13], v[29]); - u[30] = _mm_sub_epi32(v[14], v[30]); - u[31] = _mm_sub_epi32(v[15], v[31]); - - v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); - v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); - v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); - v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); - v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); - v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); - v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); - v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); - v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); - v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); - v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); - v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); - v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); - v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); - v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); - v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); - - u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); - u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); - u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); - u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); - u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); - u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); - u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); - u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); - u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); - u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); - u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); - u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); - u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); - u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); - u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); - u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); - - v[0] = _mm_add_epi32(u[0], u[8]); - v[1] = _mm_add_epi32(u[1], u[9]); - v[2] = _mm_add_epi32(u[2], u[10]); - v[3] = _mm_add_epi32(u[3], u[11]); - v[4] = _mm_add_epi32(u[4], u[12]); - v[5] = _mm_add_epi32(u[5], u[13]); - v[6] = _mm_add_epi32(u[6], u[14]); - v[7] = _mm_add_epi32(u[7], u[15]); - - v[16] = _mm_add_epi32(v[0], v[4]); - v[17] = _mm_add_epi32(v[1], v[5]); - v[18] = _mm_add_epi32(v[2], v[6]); - v[19] = 
_mm_add_epi32(v[3], v[7]); - v[20] = _mm_sub_epi32(v[0], v[4]); - v[21] = _mm_sub_epi32(v[1], v[5]); - v[22] = _mm_sub_epi32(v[2], v[6]); - v[23] = _mm_sub_epi32(v[3], v[7]); - v[16] = _mm_add_epi32(v[16], k__DCT_CONST_ROUNDING); - v[17] = _mm_add_epi32(v[17], k__DCT_CONST_ROUNDING); - v[18] = _mm_add_epi32(v[18], k__DCT_CONST_ROUNDING); - v[19] = _mm_add_epi32(v[19], k__DCT_CONST_ROUNDING); - v[20] = _mm_add_epi32(v[20], k__DCT_CONST_ROUNDING); - v[21] = _mm_add_epi32(v[21], k__DCT_CONST_ROUNDING); - v[22] = _mm_add_epi32(v[22], k__DCT_CONST_ROUNDING); - v[23] = _mm_add_epi32(v[23], k__DCT_CONST_ROUNDING); - v[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); - v[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); - v[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); - v[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); - v[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); - v[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); - v[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); - v[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); - s[0] = _mm_packs_epi32(v[16], v[17]); - s[1] = _mm_packs_epi32(v[18], v[19]); - s[2] = _mm_packs_epi32(v[20], v[21]); - s[3] = _mm_packs_epi32(v[22], v[23]); - - v[8] = _mm_sub_epi32(u[0], u[8]); - v[9] = _mm_sub_epi32(u[1], u[9]); - v[10] = _mm_sub_epi32(u[2], u[10]); - v[11] = _mm_sub_epi32(u[3], u[11]); - v[12] = _mm_sub_epi32(u[4], u[12]); - v[13] = _mm_sub_epi32(u[5], u[13]); - v[14] = _mm_sub_epi32(u[6], u[14]); - v[15] = _mm_sub_epi32(u[7], u[15]); - - v[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - - s[4] = _mm_packs_epi32(v[8], v[9]); - s[5] = _mm_packs_epi32(v[10], v[11]); - s[6] = _mm_packs_epi32(v[12], v[13]); - s[7] = _mm_packs_epi32(v[14], v[15]); - // - - s[8] = _mm_packs_epi32(u[16], u[17]); - s[9] = _mm_packs_epi32(u[18], u[19]); - s[10] = _mm_packs_epi32(u[20], u[21]); - s[11] = _mm_packs_epi32(u[22], u[23]); - s[12] = _mm_packs_epi32(u[24], u[25]); - s[13] = _mm_packs_epi32(u[26], u[27]); - s[14] = _mm_packs_epi32(u[28], u[29]); - s[15] = _mm_packs_epi32(u[30], u[31]); - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[8], s[9]); - u[1] = _mm_unpackhi_epi16(s[8], s[9]); - u[2] = _mm_unpacklo_epi16(s[10], s[11]); - u[3] = _mm_unpackhi_epi16(s[10], s[11]); - u[4] = _mm_unpacklo_epi16(s[12], s[13]); - u[5] = _mm_unpackhi_epi16(s[12], s[13]); - u[6] = _mm_unpacklo_epi16(s[14], s[15]); - u[7] = _mm_unpackhi_epi16(s[14], s[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); - v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); - v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); - v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); - v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); - v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); - v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); - v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); - v[8] = 
_mm_madd_epi16(u[4], k__cospi_m28_p04); - v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); - v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); - v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); - v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); - v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); - v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); - v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); - - u[0] = _mm_add_epi32(v[0], v[8]); - u[1] = _mm_add_epi32(v[1], v[9]); - u[2] = _mm_add_epi32(v[2], v[10]); - u[3] = _mm_add_epi32(v[3], v[11]); - u[4] = _mm_add_epi32(v[4], v[12]); - u[5] = _mm_add_epi32(v[5], v[13]); - u[6] = _mm_add_epi32(v[6], v[14]); - u[7] = _mm_add_epi32(v[7], v[15]); - u[8] = _mm_sub_epi32(v[0], v[8]); - u[9] = _mm_sub_epi32(v[1], v[9]); - u[10] = _mm_sub_epi32(v[2], v[10]); - u[11] = _mm_sub_epi32(v[3], v[11]); - u[12] = _mm_sub_epi32(v[4], v[12]); - u[13] = _mm_sub_epi32(v[5], v[13]); - u[14] = _mm_sub_epi32(v[6], v[14]); - u[15] = _mm_sub_epi32(v[7], v[15]); - - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - - v[8] = _mm_add_epi32(u[0], u[4]); - v[9] = _mm_add_epi32(u[1], u[5]); - v[10] = _mm_add_epi32(u[2], u[6]); - v[11] = _mm_add_epi32(u[3], u[7]); - v[12] = _mm_sub_epi32(u[0], u[4]); - v[13] = _mm_sub_epi32(u[1], u[5]); - v[14] = _mm_sub_epi32(u[2], u[6]); - v[15] = _mm_sub_epi32(u[3], u[7]); - - v[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - v[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - s[8] = _mm_packs_epi32(v[8], v[9]); - s[9] = _mm_packs_epi32(v[10], v[11]); - s[10] = _mm_packs_epi32(v[12], v[13]); - s[11] = _mm_packs_epi32(v[14], v[15]); - - x[12] = _mm_packs_epi32(u[8], u[9]); - x[13] = _mm_packs_epi32(u[10], u[11]); - x[14] = _mm_packs_epi32(u[12], u[13]); - x[15] = _mm_packs_epi32(u[14], u[15]); - - // stage 3 - u[0] = _mm_unpacklo_epi16(s[4], s[5]); - u[1] = _mm_unpackhi_epi16(s[4], s[5]); - u[2] = _mm_unpacklo_epi16(s[6], s[7]); - u[3] = _mm_unpackhi_epi16(s[6], s[7]); - u[4] = _mm_unpacklo_epi16(x[12], x[13]); - u[5] = _mm_unpackhi_epi16(x[12], x[13]); - u[6] = 
_mm_unpacklo_epi16(x[14], x[15]); - u[7] = _mm_unpackhi_epi16(x[14], x[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); - v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); - v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); - v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); - v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); - v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); - v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); - v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); - v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); - v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); - v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); - v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); - v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); - v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); - v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); - v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); - - u[0] = _mm_add_epi32(v[0], v[4]); - u[1] = _mm_add_epi32(v[1], v[5]); - u[2] = _mm_add_epi32(v[2], v[6]); - u[3] = _mm_add_epi32(v[3], v[7]); - u[4] = _mm_sub_epi32(v[0], v[4]); - u[5] = _mm_sub_epi32(v[1], v[5]); - u[6] = _mm_sub_epi32(v[2], v[6]); - u[7] = _mm_sub_epi32(v[3], v[7]); - u[8] = _mm_add_epi32(v[8], v[12]); - u[9] = _mm_add_epi32(v[9], v[13]); - u[10] = _mm_add_epi32(v[10], v[14]); - u[11] = _mm_add_epi32(v[11], v[15]); - u[12] = _mm_sub_epi32(v[8], v[12]); - u[13] = _mm_sub_epi32(v[9], v[13]); - u[14] = _mm_sub_epi32(v[10], v[14]); - u[15] = _mm_sub_epi32(v[11], v[15]); - - u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[4] = _mm_packs_epi32(v[0], v[1]); - s[5] = _mm_packs_epi32(v[2], v[3]); - s[6] = _mm_packs_epi32(v[4], v[5]); - s[7] = _mm_packs_epi32(v[6], v[7]); - - s[12] = _mm_packs_epi32(v[8], v[9]); - s[13] = _mm_packs_epi32(v[10], v[11]); - s[14] = _mm_packs_epi32(v[12], v[13]); - s[15] = _mm_packs_epi32(v[14], v[15]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(s[2], s[3]); - u[1] = _mm_unpackhi_epi16(s[2], s[3]); - u[2] = _mm_unpacklo_epi16(s[6], s[7]); - 
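Each stage above leans on the same interleave trick: unpacking two vectors into 16-bit pairs lets a single _mm_madd_epi16 against a pair_set_epi16(c0, c1) constant evaluate x*c0 + y*c1 across four lanes at once. A sketch for the low halves (helper name illustrative; the high halves go through _mm_unpackhi_epi16 identically):

#include <emmintrin.h>

/* Returns four 32-bit values x[i]*c0 + y[i]*c1 for lanes 0..3. */
static __m128i rotate_lo(__m128i x, __m128i y, int16_t c0, int16_t c1) {
  const __m128i k = _mm_setr_epi16(c0, c1, c0, c1, c0, c1, c0, c1);
  const __m128i xy = _mm_unpacklo_epi16(x, y); /* x0 y0 x1 y1 x2 y2 x3 y3 */
  return _mm_madd_epi16(xy, k);
}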
u[3] = _mm_unpackhi_epi16(s[6], s[7]); - u[4] = _mm_unpacklo_epi16(s[10], s[11]); - u[5] = _mm_unpackhi_epi16(s[10], s[11]); - u[6] = _mm_unpacklo_epi16(s[14], s[15]); - u[7] = _mm_unpackhi_epi16(s[14], s[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); - v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); - v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); - v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); - v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); - v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); - v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); - v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - in[0] = s[0]; - in[1] = _mm_sub_epi16(kZero, s[8]); - in[2] = s[12]; - in[3] = _mm_sub_epi16(kZero, s[4]); - in[4] = _mm_packs_epi32(v[4], v[5]); - in[5] = _mm_packs_epi32(v[12], v[13]); - in[6] = _mm_packs_epi32(v[8], v[9]); - in[7] = _mm_packs_epi32(v[0], v[1]); - in[8] = _mm_packs_epi32(v[2], v[3]); - in[9] = _mm_packs_epi32(v[10], v[11]); - in[10] = _mm_packs_epi32(v[14], v[15]); - in[11] = _mm_packs_epi32(v[6], v[7]); - in[12] = s[5]; - in[13] = _mm_sub_epi16(kZero, s[13]); - in[14] = s[9]; - in[15] = _mm_sub_epi16(kZero, s[1]); -} - -static void fdct16_sse2(__m128i *in0, __m128i *in1) { - fdct16_8col(in0); - fdct16_8col(in1); - array_transpose_16x16(in0, in1); -} - -static void fadst16_sse2(__m128i *in0, __m128i *in1) { - fadst16_8col(in0); - fadst16_8col(in1); - array_transpose_16x16(in0, in1); -} - -#if CONFIG_EXT_TX -static void fidtx16_sse2(__m128i *in0, __m128i *in1) { - idtx16_8col(in0); - 
idtx16_8col(in1); - array_transpose_16x16(in0, in1); -} -#endif // CONFIG_EXT_TX - -void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in0[16], in1[16]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - switch (tx_type) { - case DCT_DCT: - load_buffer_16x16(input, in0, in1, stride, 0, 0); - fdct16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fdct16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case ADST_DCT: - load_buffer_16x16(input, in0, in1, stride, 0, 0); - fadst16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fdct16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case DCT_ADST: - load_buffer_16x16(input, in0, in1, stride, 0, 0); - fdct16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fadst16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case ADST_ADST: - load_buffer_16x16(input, in0, in1, stride, 0, 0); - fadst16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fadst16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_16x16(input, in0, in1, stride, 1, 0); - fadst16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fdct16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case DCT_FLIPADST: - load_buffer_16x16(input, in0, in1, stride, 0, 1); - fdct16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fadst16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case FLIPADST_FLIPADST: - load_buffer_16x16(input, in0, in1, stride, 1, 1); - fadst16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fadst16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case ADST_FLIPADST: - load_buffer_16x16(input, in0, in1, stride, 0, 1); - fadst16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fadst16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case FLIPADST_ADST: - load_buffer_16x16(input, in0, in1, stride, 1, 0); - fadst16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fadst16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case IDTX: - load_buffer_16x16(input, in0, in1, stride, 0, 0); - fidtx16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fidtx16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case V_DCT: - load_buffer_16x16(input, in0, in1, stride, 0, 0); - fdct16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fidtx16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case H_DCT: - load_buffer_16x16(input, in0, in1, stride, 0, 0); - fidtx16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fdct16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case V_ADST: - load_buffer_16x16(input, in0, in1, stride, 0, 0); - fadst16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fidtx16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case H_ADST: - load_buffer_16x16(input, in0, in1, stride, 0, 0); - fidtx16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fadst16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case V_FLIPADST: - load_buffer_16x16(input, in0, in1, stride, 1, 0); - fadst16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fidtx16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case H_FLIPADST: - load_buffer_16x16(input, in0, in1, stride, 0, 1); - 
fidtx16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fadst16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; -#endif // CONFIG_EXT_TX - default: assert(0); break; - } -} - -static INLINE void prepare_4x8_row_first(__m128i *in) { - in[0] = _mm_unpacklo_epi64(in[0], in[2]); - in[1] = _mm_unpacklo_epi64(in[1], in[3]); - transpose_4x4(in); - in[4] = _mm_unpacklo_epi64(in[4], in[6]); - in[5] = _mm_unpacklo_epi64(in[5], in[7]); - transpose_4x4(in + 4); -} - -// Load input into the left-hand half of in (ie, into lanes 0..3 of -// each element of in). The right hand half (lanes 4..7) should be -// treated as being filled with "don't care" values. -static INLINE void load_buffer_4x8(const int16_t *input, __m128i *in, - int stride, int flipud, int fliplr) { - const int shift = 2; - if (!flipud) { - in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); - in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); - in[4] = _mm_loadl_epi64((const __m128i *)(input + 4 * stride)); - in[5] = _mm_loadl_epi64((const __m128i *)(input + 5 * stride)); - in[6] = _mm_loadl_epi64((const __m128i *)(input + 6 * stride)); - in[7] = _mm_loadl_epi64((const __m128i *)(input + 7 * stride)); - } else { - in[0] = _mm_loadl_epi64((const __m128i *)(input + 7 * stride)); - in[1] = _mm_loadl_epi64((const __m128i *)(input + 6 * stride)); - in[2] = _mm_loadl_epi64((const __m128i *)(input + 5 * stride)); - in[3] = _mm_loadl_epi64((const __m128i *)(input + 4 * stride)); - in[4] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); - in[5] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); - in[6] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in[7] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - } - - if (fliplr) { - in[0] = _mm_shufflelo_epi16(in[0], 0x1b); - in[1] = _mm_shufflelo_epi16(in[1], 0x1b); - in[2] = _mm_shufflelo_epi16(in[2], 0x1b); - in[3] = _mm_shufflelo_epi16(in[3], 0x1b); - in[4] = _mm_shufflelo_epi16(in[4], 0x1b); - in[5] = _mm_shufflelo_epi16(in[5], 0x1b); - in[6] = _mm_shufflelo_epi16(in[6], 0x1b); - in[7] = _mm_shufflelo_epi16(in[7], 0x1b); - } - - in[0] = _mm_slli_epi16(in[0], shift); - in[1] = _mm_slli_epi16(in[1], shift); - in[2] = _mm_slli_epi16(in[2], shift); - in[3] = _mm_slli_epi16(in[3], shift); - in[4] = _mm_slli_epi16(in[4], shift); - in[5] = _mm_slli_epi16(in[5], shift); - in[6] = _mm_slli_epi16(in[6], shift); - in[7] = _mm_slli_epi16(in[7], shift); - - scale_sqrt2_8x4(in); - scale_sqrt2_8x4(in + 4); - prepare_4x8_row_first(in); -} - -static INLINE void write_buffer_4x8(tran_low_t *output, __m128i *res) { - __m128i in01, in23, in45, in67, sign01, sign23, sign45, sign67; - const int shift = 1; - - // revert the 8x8 txfm's transpose - array_transpose_8x8(res, res); - - in01 = _mm_unpacklo_epi64(res[0], res[1]); - in23 = _mm_unpacklo_epi64(res[2], res[3]); - in45 = _mm_unpacklo_epi64(res[4], res[5]); - in67 = _mm_unpacklo_epi64(res[6], res[7]); - - sign01 = _mm_srai_epi16(in01, 15); - sign23 = _mm_srai_epi16(in23, 15); - sign45 = _mm_srai_epi16(in45, 15); - sign67 = _mm_srai_epi16(in67, 15); - - in01 = _mm_sub_epi16(in01, sign01); - in23 = _mm_sub_epi16(in23, sign23); - in45 = _mm_sub_epi16(in45, sign45); - in67 = _mm_sub_epi16(in67, sign67); - - in01 = _mm_srai_epi16(in01, shift); - in23 = _mm_srai_epi16(in23, shift); - in45 = _mm_srai_epi16(in45, shift); - in67 = _mm_srai_epi16(in67, shift); - - 
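The sign-subtract-and-shift sequence above divides each lane by two rounding toward zero: an arithmetic shift alone would round toward minus infinity, so the subtracted sign mask first adds one to negative lanes. A scalar equivalent (helper name illustrative):

#include <stdint.h>

// sign is 0 for x >= 0 and -1 for x < 0, so x - sign adds 1 to negative
// inputs before the arithmetic shift: -3 -> -1 and 3 -> 1.
static int16_t div2_toward_zero(int16_t x) {
  const int16_t sign = (int16_t)(x >> 15);
  return (int16_t)((x - sign) >> 1);
}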
store_output(&in01, (output + 0 * 8)); - store_output(&in23, (output + 1 * 8)); - store_output(&in45, (output + 2 * 8)); - store_output(&in67, (output + 3 * 8)); -} - -void av1_fht4x8_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in[8]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - switch (tx_type) { - case DCT_DCT: - load_buffer_4x8(input, in, stride, 0, 0); - fdct4_sse2(in); - fdct4_sse2(in + 4); - fdct8_sse2(in); - break; - case ADST_DCT: - load_buffer_4x8(input, in, stride, 0, 0); - fdct4_sse2(in); - fdct4_sse2(in + 4); - fadst8_sse2(in); - break; - case DCT_ADST: - load_buffer_4x8(input, in, stride, 0, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fdct8_sse2(in); - break; - case ADST_ADST: - load_buffer_4x8(input, in, stride, 0, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fadst8_sse2(in); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_4x8(input, in, stride, 1, 0); - fdct4_sse2(in); - fdct4_sse2(in + 4); - fadst8_sse2(in); - break; - case DCT_FLIPADST: - load_buffer_4x8(input, in, stride, 0, 1); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fdct8_sse2(in); - break; - case FLIPADST_FLIPADST: - load_buffer_4x8(input, in, stride, 1, 1); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fadst8_sse2(in); - break; - case ADST_FLIPADST: - load_buffer_4x8(input, in, stride, 0, 1); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fadst8_sse2(in); - break; - case FLIPADST_ADST: - load_buffer_4x8(input, in, stride, 1, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fadst8_sse2(in); - break; - case IDTX: - load_buffer_4x8(input, in, stride, 0, 0); - fidtx4_sse2(in); - fidtx4_sse2(in + 4); - fidtx8_sse2(in); - break; - case V_DCT: - load_buffer_4x8(input, in, stride, 0, 0); - fidtx4_sse2(in); - fidtx4_sse2(in + 4); - fdct8_sse2(in); - break; - case H_DCT: - load_buffer_4x8(input, in, stride, 0, 0); - fdct4_sse2(in); - fdct4_sse2(in + 4); - fidtx8_sse2(in); - break; - case V_ADST: - load_buffer_4x8(input, in, stride, 0, 0); - fidtx4_sse2(in); - fidtx4_sse2(in + 4); - fadst8_sse2(in); - break; - case H_ADST: - load_buffer_4x8(input, in, stride, 0, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fidtx8_sse2(in); - break; - case V_FLIPADST: - load_buffer_4x8(input, in, stride, 1, 0); - fidtx4_sse2(in); - fidtx4_sse2(in + 4); - fadst8_sse2(in); - break; - case H_FLIPADST: - load_buffer_4x8(input, in, stride, 0, 1); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fidtx8_sse2(in); - break; -#endif - default: assert(0); break; - } - write_buffer_4x8(output, in); -} - -// Load input into the left-hand half of in (ie, into lanes 0..3 of -// each element of in). The right hand half (lanes 4..7) should be -// treated as being filled with "don't care" values. -// The input is split horizontally into two 4x4 -// chunks 'l' and 'r'. Then 'l' is stored in the top-left 4x4 -// block of 'in' and 'r' is stored in the bottom-left block. -// This is to allow us to reuse 4x4 transforms. 
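The shuffle that realizes the split described above moves the upper 64 bits of each row into the lower half of a new register; a minimal sketch:

#include <emmintrin.h>

// _mm_shuffle_epi32(row, 0xe) selects 32-bit lanes {2, 3, 0, 0}: the high
// 64 bits (16-bit lanes 4..7, the 'r' chunk) land in the low half, where
// the 4x4 transforms read their input, and the upper half is "don't care".
static __m128i move_high_half_low(__m128i row) {
  return _mm_shuffle_epi32(row, 0xe);
}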
-static INLINE void load_buffer_8x4(const int16_t *input, __m128i *in, - int stride, int flipud, int fliplr) { - const int shift = 2; - if (!flipud) { - in[0] = _mm_loadu_si128((const __m128i *)(input + 0 * stride)); - in[1] = _mm_loadu_si128((const __m128i *)(input + 1 * stride)); - in[2] = _mm_loadu_si128((const __m128i *)(input + 2 * stride)); - in[3] = _mm_loadu_si128((const __m128i *)(input + 3 * stride)); - } else { - in[0] = _mm_loadu_si128((const __m128i *)(input + 3 * stride)); - in[1] = _mm_loadu_si128((const __m128i *)(input + 2 * stride)); - in[2] = _mm_loadu_si128((const __m128i *)(input + 1 * stride)); - in[3] = _mm_loadu_si128((const __m128i *)(input + 0 * stride)); - } - - if (fliplr) { - in[0] = mm_reverse_epi16(in[0]); - in[1] = mm_reverse_epi16(in[1]); - in[2] = mm_reverse_epi16(in[2]); - in[3] = mm_reverse_epi16(in[3]); - } - - in[0] = _mm_slli_epi16(in[0], shift); - in[1] = _mm_slli_epi16(in[1], shift); - in[2] = _mm_slli_epi16(in[2], shift); - in[3] = _mm_slli_epi16(in[3], shift); - - scale_sqrt2_8x4(in); - - in[4] = _mm_shuffle_epi32(in[0], 0xe); - in[5] = _mm_shuffle_epi32(in[1], 0xe); - in[6] = _mm_shuffle_epi32(in[2], 0xe); - in[7] = _mm_shuffle_epi32(in[3], 0xe); -} - -static INLINE void write_buffer_8x4(tran_low_t *output, __m128i *res) { - __m128i out0, out1, out2, out3, sign0, sign1, sign2, sign3; - const int shift = 1; - sign0 = _mm_srai_epi16(res[0], 15); - sign1 = _mm_srai_epi16(res[1], 15); - sign2 = _mm_srai_epi16(res[2], 15); - sign3 = _mm_srai_epi16(res[3], 15); - - out0 = _mm_sub_epi16(res[0], sign0); - out1 = _mm_sub_epi16(res[1], sign1); - out2 = _mm_sub_epi16(res[2], sign2); - out3 = _mm_sub_epi16(res[3], sign3); - - out0 = _mm_srai_epi16(out0, shift); - out1 = _mm_srai_epi16(out1, shift); - out2 = _mm_srai_epi16(out2, shift); - out3 = _mm_srai_epi16(out3, shift); - - store_output(&out0, (output + 0 * 8)); - store_output(&out1, (output + 1 * 8)); - store_output(&out2, (output + 2 * 8)); - store_output(&out3, (output + 3 * 8)); -} - -void av1_fht8x4_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in[8]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - switch (tx_type) { - case DCT_DCT: - load_buffer_8x4(input, in, stride, 0, 0); - fdct4_sse2(in); - fdct4_sse2(in + 4); - fdct8_sse2(in); - break; - case ADST_DCT: - load_buffer_8x4(input, in, stride, 0, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fdct8_sse2(in); - break; - case DCT_ADST: - load_buffer_8x4(input, in, stride, 0, 0); - fdct4_sse2(in); - fdct4_sse2(in + 4); - fadst8_sse2(in); - break; - case ADST_ADST: - load_buffer_8x4(input, in, stride, 0, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fadst8_sse2(in); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_8x4(input, in, stride, 1, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fdct8_sse2(in); - break; - case DCT_FLIPADST: - load_buffer_8x4(input, in, stride, 0, 1); - fdct4_sse2(in); - fdct4_sse2(in + 4); - fadst8_sse2(in); - break; - case FLIPADST_FLIPADST: - load_buffer_8x4(input, in, stride, 1, 1); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fadst8_sse2(in); - break; - case ADST_FLIPADST: - load_buffer_8x4(input, in, stride, 0, 1); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fadst8_sse2(in); - break; - case FLIPADST_ADST: - load_buffer_8x4(input, in, stride, 1, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fadst8_sse2(in); - break; - case IDTX: - load_buffer_8x4(input, in, stride, 
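Throughout these kernels the FLIPADST variants are not separate transforms: the flip is folded into the load (rows read bottom-up for flipud, lanes reversed with mm_reverse_epi16 for fliplr) and the plain ADST kernel is then reused. A scalar sketch of the addressing (names illustrative):

#include <stdint.h>

// Row addressing with the vertical flip folded in, as load_buffer_8x4
// does above; the transform kernels themselves stay flip-agnostic.
static const int16_t *flipped_row(const int16_t *input, int stride,
                                  int rows, int i, int flipud) {
  return input + (flipud ? (rows - 1 - i) : i) * stride;
}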
0, 0); - fidtx4_sse2(in); - fidtx4_sse2(in + 4); - fidtx8_sse2(in); - break; - case V_DCT: - load_buffer_8x4(input, in, stride, 0, 0); - fdct4_sse2(in); - fdct4_sse2(in + 4); - fidtx8_sse2(in); - break; - case H_DCT: - load_buffer_8x4(input, in, stride, 0, 0); - fidtx4_sse2(in); - fidtx4_sse2(in + 4); - fdct8_sse2(in); - break; - case V_ADST: - load_buffer_8x4(input, in, stride, 0, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fidtx8_sse2(in); - break; - case H_ADST: - load_buffer_8x4(input, in, stride, 0, 0); - fidtx4_sse2(in); - fidtx4_sse2(in + 4); - fadst8_sse2(in); - break; - case V_FLIPADST: - load_buffer_8x4(input, in, stride, 1, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fidtx8_sse2(in); - break; - case H_FLIPADST: - load_buffer_8x4(input, in, stride, 0, 1); - fidtx4_sse2(in); - fidtx4_sse2(in + 4); - fadst8_sse2(in); - break; -#endif - default: assert(0); break; - } - write_buffer_8x4(output, in); -} - -static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in, - int stride, int flipud, int fliplr) { - // Load 2 8x8 blocks - const int16_t *t = input; - const int16_t *b = input + 8 * stride; - - if (flipud) { - const int16_t *const tmp = t; - t = b; - b = tmp; - } - - load_buffer_8x8(t, in, stride, flipud, fliplr); - scale_sqrt2_8x8(in); - load_buffer_8x8(b, in + 8, stride, flipud, fliplr); - scale_sqrt2_8x8(in + 8); -} - -static INLINE void round_power_of_two_signed(__m128i *x, int n) { - const __m128i rounding = _mm_set1_epi16((1 << n) >> 1); - const __m128i sign = _mm_srai_epi16(*x, 15); - const __m128i res = _mm_add_epi16(_mm_add_epi16(*x, rounding), sign); - *x = _mm_srai_epi16(res, n); -} - -static void row_8x16_rounding(__m128i *in, int bits) { - int i; - for (i = 0; i < 16; i++) { - round_power_of_two_signed(&in[i], bits); - } -} - -void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in[16]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - __m128i *const t = in; // Alias to top 8x8 sub block - __m128i *const b = in + 8; // Alias to bottom 8x8 sub block - - switch (tx_type) { - case DCT_DCT: - load_buffer_8x16(input, in, stride, 0, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fdct8_sse2(t); - fdct8_sse2(b); - row_8x16_rounding(in, 2); - fdct16_8col(in); - break; - case ADST_DCT: - load_buffer_8x16(input, in, stride, 0, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fdct8_sse2(t); - fdct8_sse2(b); - row_8x16_rounding(in, 2); - fadst16_8col(in); - break; - case DCT_ADST: - load_buffer_8x16(input, in, stride, 0, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fadst8_sse2(t); - fadst8_sse2(b); - row_8x16_rounding(in, 2); - fdct16_8col(in); - break; - case ADST_ADST: - load_buffer_8x16(input, in, stride, 0, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fadst8_sse2(t); - fadst8_sse2(b); - row_8x16_rounding(in, 2); - fadst16_8col(in); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_8x16(input, in, stride, 1, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fdct8_sse2(t); - fdct8_sse2(b); - row_8x16_rounding(in, 2); - fadst16_8col(in); - break; - case DCT_FLIPADST: - load_buffer_8x16(input, in, stride, 0, 1); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fadst8_sse2(t); - fadst8_sse2(b); - row_8x16_rounding(in, 2); - fdct16_8col(in); - break; - case FLIPADST_FLIPADST: - load_buffer_8x16(input, in, 
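round_power_of_two_signed() above rounds to the nearest integer with ties away from zero; the added sign mask cancels the rounding offset's upward bias on negative lanes. A scalar equivalent (sketch):

#include <stdint.h>

// Add half of 2^n, add -1 for negative inputs, then shift: with n = 2,
// 6 -> 2, -6 -> -2, 5 -> 1, -5 -> -1.
static int16_t round_pow2_signed(int16_t x, int n) {
  const int16_t half = (int16_t)((1 << n) >> 1);
  const int16_t sign = (int16_t)(x >> 15);
  return (int16_t)((x + half + sign) >> n);
}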
stride, 1, 1); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fadst8_sse2(t); - fadst8_sse2(b); - row_8x16_rounding(in, 2); - fadst16_8col(in); - break; - case ADST_FLIPADST: - load_buffer_8x16(input, in, stride, 0, 1); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fadst8_sse2(t); - fadst8_sse2(b); - row_8x16_rounding(in, 2); - fadst16_8col(in); - break; - case FLIPADST_ADST: - load_buffer_8x16(input, in, stride, 1, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fadst8_sse2(t); - fadst8_sse2(b); - row_8x16_rounding(in, 2); - fadst16_8col(in); - break; - case IDTX: - load_buffer_8x16(input, in, stride, 0, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fidtx8_sse2(t); - fidtx8_sse2(b); - row_8x16_rounding(in, 2); - idtx16_8col(in); - break; - case V_DCT: - load_buffer_8x16(input, in, stride, 0, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fidtx8_sse2(t); - fidtx8_sse2(b); - row_8x16_rounding(in, 2); - fdct16_8col(in); - break; - case H_DCT: - load_buffer_8x16(input, in, stride, 0, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fdct8_sse2(t); - fdct8_sse2(b); - row_8x16_rounding(in, 2); - idtx16_8col(in); - break; - case V_ADST: - load_buffer_8x16(input, in, stride, 0, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fidtx8_sse2(t); - fidtx8_sse2(b); - row_8x16_rounding(in, 2); - fadst16_8col(in); - break; - case H_ADST: - load_buffer_8x16(input, in, stride, 0, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fadst8_sse2(t); - fadst8_sse2(b); - row_8x16_rounding(in, 2); - idtx16_8col(in); - break; - case V_FLIPADST: - load_buffer_8x16(input, in, stride, 1, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fidtx8_sse2(t); - fidtx8_sse2(b); - row_8x16_rounding(in, 2); - fadst16_8col(in); - break; - case H_FLIPADST: - load_buffer_8x16(input, in, stride, 0, 1); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fadst8_sse2(t); - fadst8_sse2(b); - row_8x16_rounding(in, 2); - idtx16_8col(in); - break; -#endif - default: assert(0); break; - } - write_buffer_8x8(output, t, 8); - write_buffer_8x8(output + 64, b, 8); -} - -static INLINE void load_buffer_16x8(const int16_t *input, __m128i *in, - int stride, int flipud, int fliplr) { - // Load 2 8x8 blocks - const int16_t *l = input; - const int16_t *r = input + 8; - - if (fliplr) { - const int16_t *const tmp = l; - l = r; - r = tmp; - } - - // load first 8 columns - load_buffer_8x8(l, in, stride, flipud, fliplr); - scale_sqrt2_8x8(in); - load_buffer_8x8(r, in + 8, stride, flipud, fliplr); - scale_sqrt2_8x8(in + 8); -} - -#define col_16x8_rounding row_8x16_rounding - -void av1_fht16x8_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in[16]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - __m128i *const l = in; // Alias to left 8x8 sub block - __m128i *const r = in + 8; // Alias to right 8x8 sub block, which we store - // in the second half of the array - - switch (tx_type) { - case DCT_DCT: - load_buffer_16x8(input, in, stride, 0, 0); - fdct8_sse2(l); - fdct8_sse2(r); - col_16x8_rounding(in, 2); - fdct16_8col(in); - break; - case ADST_DCT: - load_buffer_16x8(input, in, stride, 0, 0); - fadst8_sse2(l); - fadst8_sse2(r); - col_16x8_rounding(in, 2); - fdct16_8col(in); - break; - case DCT_ADST: - load_buffer_16x8(input, in, stride, 0, 0); - fdct8_sse2(l); - fdct8_sse2(r); - 
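Each rectangular load above pre-scales the input by sqrt(2) (the scale_sqrt2_* calls) so that the overall gain of an NxM transform with N != M remains a power of two, as in the scalar AV1 code. A fixed-point sketch of one such multiply (the Q12 constant is illustrative, not taken from this file):

#include <stdint.h>

// Multiply by sqrt(2) in Q12: 5793 / 4096 ~= 1.41431, rounded to nearest.
static int32_t mul_sqrt2_q12(int32_t x) {
  return (x * 5793 + (1 << 11)) >> 12;
}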
col_16x8_rounding(in, 2); - fadst16_8col(in); - break; - case ADST_ADST: - load_buffer_16x8(input, in, stride, 0, 0); - fadst8_sse2(l); - fadst8_sse2(r); - col_16x8_rounding(in, 2); - fadst16_8col(in); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_16x8(input, in, stride, 1, 0); - fadst8_sse2(l); - fadst8_sse2(r); - col_16x8_rounding(in, 2); - fdct16_8col(in); - break; - case DCT_FLIPADST: - load_buffer_16x8(input, in, stride, 0, 1); - fdct8_sse2(l); - fdct8_sse2(r); - col_16x8_rounding(in, 2); - fadst16_8col(in); - break; - case FLIPADST_FLIPADST: - load_buffer_16x8(input, in, stride, 1, 1); - fadst8_sse2(l); - fadst8_sse2(r); - col_16x8_rounding(in, 2); - fadst16_8col(in); - break; - case ADST_FLIPADST: - load_buffer_16x8(input, in, stride, 0, 1); - fadst8_sse2(l); - fadst8_sse2(r); - col_16x8_rounding(in, 2); - fadst16_8col(in); - break; - case FLIPADST_ADST: - load_buffer_16x8(input, in, stride, 1, 0); - fadst8_sse2(l); - fadst8_sse2(r); - col_16x8_rounding(in, 2); - fadst16_8col(in); - break; - case IDTX: - load_buffer_16x8(input, in, stride, 0, 0); - fidtx8_sse2(l); - fidtx8_sse2(r); - col_16x8_rounding(in, 2); - idtx16_8col(in); - break; - case V_DCT: - load_buffer_16x8(input, in, stride, 0, 0); - fdct8_sse2(l); - fdct8_sse2(r); - col_16x8_rounding(in, 2); - idtx16_8col(in); - break; - case H_DCT: - load_buffer_16x8(input, in, stride, 0, 0); - fidtx8_sse2(l); - fidtx8_sse2(r); - col_16x8_rounding(in, 2); - fdct16_8col(in); - break; - case V_ADST: - load_buffer_16x8(input, in, stride, 0, 0); - fadst8_sse2(l); - fadst8_sse2(r); - col_16x8_rounding(in, 2); - idtx16_8col(in); - break; - case H_ADST: - load_buffer_16x8(input, in, stride, 0, 0); - fidtx8_sse2(l); - fidtx8_sse2(r); - col_16x8_rounding(in, 2); - fadst16_8col(in); - break; - case V_FLIPADST: - load_buffer_16x8(input, in, stride, 1, 0); - fadst8_sse2(l); - fadst8_sse2(r); - col_16x8_rounding(in, 2); - idtx16_8col(in); - break; - case H_FLIPADST: - load_buffer_16x8(input, in, stride, 0, 1); - fidtx8_sse2(l); - fidtx8_sse2(r); - col_16x8_rounding(in, 2); - fadst16_8col(in); - break; -#endif - default: assert(0); break; - } - array_transpose_8x8(l, l); - array_transpose_8x8(r, r); - write_buffer_8x8(output, l, 16); - write_buffer_8x8(output + 8, r, 16); -} - -// Note: The 16-column 32-element transforms expect their input to be -// split up into a 2x2 grid of 8x16 blocks -static INLINE void fdct32_16col(__m128i *tl, __m128i *tr, __m128i *bl, - __m128i *br) { - fdct32_8col(tl, bl); - fdct32_8col(tr, br); - array_transpose_16x16(tl, tr); - array_transpose_16x16(bl, br); -} - -#if CONFIG_EXT_TX -static INLINE void fidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl, - __m128i *br) { - int i; - for (i = 0; i < 16; ++i) { - tl[i] = _mm_slli_epi16(tl[i], 2); - tr[i] = _mm_slli_epi16(tr[i], 2); - bl[i] = _mm_slli_epi16(bl[i], 2); - br[i] = _mm_slli_epi16(br[i], 2); - } - array_transpose_16x16(tl, tr); - array_transpose_16x16(bl, br); -} -#endif - -static INLINE void load_buffer_16x32(const int16_t *input, __m128i *intl, - __m128i *intr, __m128i *inbl, - __m128i *inbr, int stride, int flipud, - int fliplr) { - int i; - if (flipud) { - input = input + 31 * stride; - stride = -stride; - } - - for (i = 0; i < 16; ++i) { - intl[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2); - intr[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2); - inbl[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + (i + 16) * stride + 0)), 2); - inbr[i] = 
_mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + (i + 16) * stride + 8)), 2); - } - - if (fliplr) { - __m128i tmp; - for (i = 0; i < 16; ++i) { - tmp = intl[i]; - intl[i] = mm_reverse_epi16(intr[i]); - intr[i] = mm_reverse_epi16(tmp); - tmp = inbl[i]; - inbl[i] = mm_reverse_epi16(inbr[i]); - inbr[i] = mm_reverse_epi16(tmp); - } - } - - scale_sqrt2_8x16(intl); - scale_sqrt2_8x16(intr); - scale_sqrt2_8x16(inbl); - scale_sqrt2_8x16(inbr); -} - -static INLINE void write_buffer_16x32(tran_low_t *output, __m128i *restl, - __m128i *restr, __m128i *resbl, - __m128i *resbr) { - int i; - for (i = 0; i < 16; ++i) { - store_output(&restl[i], output + i * 16 + 0); - store_output(&restr[i], output + i * 16 + 8); - store_output(&resbl[i], output + (i + 16) * 16 + 0); - store_output(&resbr[i], output + (i + 16) * 16 + 8); - } -} - -static INLINE void round_signed_8x8(__m128i *in, const int bit) { - const __m128i rounding = _mm_set1_epi16((1 << bit) >> 1); - __m128i sign0 = _mm_srai_epi16(in[0], 15); - __m128i sign1 = _mm_srai_epi16(in[1], 15); - __m128i sign2 = _mm_srai_epi16(in[2], 15); - __m128i sign3 = _mm_srai_epi16(in[3], 15); - __m128i sign4 = _mm_srai_epi16(in[4], 15); - __m128i sign5 = _mm_srai_epi16(in[5], 15); - __m128i sign6 = _mm_srai_epi16(in[6], 15); - __m128i sign7 = _mm_srai_epi16(in[7], 15); - - in[0] = _mm_add_epi16(_mm_add_epi16(in[0], rounding), sign0); - in[1] = _mm_add_epi16(_mm_add_epi16(in[1], rounding), sign1); - in[2] = _mm_add_epi16(_mm_add_epi16(in[2], rounding), sign2); - in[3] = _mm_add_epi16(_mm_add_epi16(in[3], rounding), sign3); - in[4] = _mm_add_epi16(_mm_add_epi16(in[4], rounding), sign4); - in[5] = _mm_add_epi16(_mm_add_epi16(in[5], rounding), sign5); - in[6] = _mm_add_epi16(_mm_add_epi16(in[6], rounding), sign6); - in[7] = _mm_add_epi16(_mm_add_epi16(in[7], rounding), sign7); - - in[0] = _mm_srai_epi16(in[0], bit); - in[1] = _mm_srai_epi16(in[1], bit); - in[2] = _mm_srai_epi16(in[2], bit); - in[3] = _mm_srai_epi16(in[3], bit); - in[4] = _mm_srai_epi16(in[4], bit); - in[5] = _mm_srai_epi16(in[5], bit); - in[6] = _mm_srai_epi16(in[6], bit); - in[7] = _mm_srai_epi16(in[7], bit); -} - -static INLINE void round_signed_16x16(__m128i *in0, __m128i *in1) { - const int bit = 4; - round_signed_8x8(in0, bit); - round_signed_8x8(in0 + 8, bit); - round_signed_8x8(in1, bit); - round_signed_8x8(in1 + 8, bit); -} - -// Note: -// suffix "t" indicates the transpose operation comes first -static void fdct16t_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - fdct16_8col(in0); - fdct16_8col(in1); -} - -static void fadst16t_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - fadst16_8col(in0); - fadst16_8col(in1); -} - -static INLINE void fdct32t_16col(__m128i *tl, __m128i *tr, __m128i *bl, - __m128i *br) { - array_transpose_16x16(tl, tr); - array_transpose_16x16(bl, br); - fdct32_8col(tl, bl); - fdct32_8col(tr, br); -} - -typedef enum transpose_indicator_ { - transpose, - no_transpose, -} transpose_indicator; - -static INLINE void fhalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl, - __m128i *br, transpose_indicator t) { - __m128i tmpl[16], tmpr[16]; - int i; - - // Copy the bottom half of the input to temporary storage - for (i = 0; i < 16; ++i) { - tmpl[i] = bl[i]; - tmpr[i] = br[i]; - } - - // Generate the bottom half of the output - for (i = 0; i < 16; ++i) { - bl[i] = _mm_slli_epi16(tl[i], 2); - br[i] = _mm_slli_epi16(tr[i], 2); - } - array_transpose_16x16(bl, br); - - // Copy the temporary storage back to the top half of the 
input - for (i = 0; i < 16; ++i) { - tl[i] = tmpl[i]; - tr[i] = tmpr[i]; - } - - // Generate the top half of the output - scale_sqrt2_8x16(tl); - scale_sqrt2_8x16(tr); - if (t == transpose) - fdct16t_sse2(tl, tr); - else - fdct16_sse2(tl, tr); -} - -// Note on data layout, for both this and the 32x16 transforms: -// So that we can reuse the 16-element transforms easily, -// we want to split the input into 8x16 blocks. -// For 16x32, this means the input is a 2x2 grid of such blocks. -// For 32x16, it means the input is a 4x1 grid. -void av1_fht16x32_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i intl[16], intr[16], inbl[16], inbr[16]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - switch (tx_type) { - case DCT_DCT: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); - fdct16t_sse2(intl, intr); - fdct16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fdct32t_16col(intl, intr, inbl, inbr); - break; - case ADST_DCT: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); - fdct16t_sse2(intl, intr); - fdct16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fhalfright32_16col(intl, intr, inbl, inbr, transpose); - break; - case DCT_ADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); - fadst16t_sse2(intl, intr); - fadst16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fdct32t_16col(intl, intr, inbl, inbr); - break; - case ADST_ADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); - fadst16t_sse2(intl, intr); - fadst16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fhalfright32_16col(intl, intr, inbl, inbr, transpose); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0); - fdct16t_sse2(intl, intr); - fdct16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fhalfright32_16col(intl, intr, inbl, inbr, transpose); - break; - case DCT_FLIPADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1); - fadst16t_sse2(intl, intr); - fadst16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fdct32t_16col(intl, intr, inbl, inbr); - break; - case FLIPADST_FLIPADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 1); - fadst16t_sse2(intl, intr); - fadst16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fhalfright32_16col(intl, intr, inbl, inbr, transpose); - break; - case ADST_FLIPADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1); - fadst16t_sse2(intl, intr); - fadst16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fhalfright32_16col(intl, intr, inbl, inbr, transpose); - break; - case FLIPADST_ADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0); - fadst16t_sse2(intl, intr); - fadst16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fhalfright32_16col(intl, intr, inbl, inbr, transpose); - break; - case IDTX: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); - fidtx16_sse2(intl, intr); - fidtx16_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fidtx32_16col(intl, intr, 
inbl, inbr); - break; - case V_DCT: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); - fidtx16_sse2(intl, intr); - fidtx16_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fdct32t_16col(intl, intr, inbl, inbr); - break; - case H_DCT: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); - fdct16t_sse2(intl, intr); - fdct16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fidtx32_16col(intl, intr, inbl, inbr); - break; - case V_ADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); - fidtx16_sse2(intl, intr); - fidtx16_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fhalfright32_16col(intl, intr, inbl, inbr, transpose); - break; - case H_ADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); - fadst16t_sse2(intl, intr); - fadst16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fidtx32_16col(intl, intr, inbl, inbr); - break; - case V_FLIPADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0); - fidtx16_sse2(intl, intr); - fidtx16_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fhalfright32_16col(intl, intr, inbl, inbr, transpose); - break; - case H_FLIPADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1); - fadst16t_sse2(intl, intr); - fadst16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fidtx32_16col(intl, intr, inbl, inbr); - break; -#endif - default: assert(0); break; - } - write_buffer_16x32(output, intl, intr, inbl, inbr); -} - -static INLINE void load_buffer_32x16(const int16_t *input, __m128i *in0, - __m128i *in1, __m128i *in2, __m128i *in3, - int stride, int flipud, int fliplr) { - int i; - if (flipud) { - input += 15 * stride; - stride = -stride; - } - - for (i = 0; i < 16; ++i) { - in0[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2); - in1[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2); - in2[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 16)), 2); - in3[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 24)), 2); - } - - if (fliplr) { - for (i = 0; i < 16; ++i) { - __m128i tmp1 = in0[i]; - __m128i tmp2 = in1[i]; - in0[i] = mm_reverse_epi16(in3[i]); - in1[i] = mm_reverse_epi16(in2[i]); - in2[i] = mm_reverse_epi16(tmp2); - in3[i] = mm_reverse_epi16(tmp1); - } - } - - scale_sqrt2_8x16(in0); - scale_sqrt2_8x16(in1); - scale_sqrt2_8x16(in2); - scale_sqrt2_8x16(in3); -} - -static INLINE void write_buffer_32x16(tran_low_t *output, __m128i *res0, - __m128i *res1, __m128i *res2, - __m128i *res3) { - int i; - for (i = 0; i < 16; ++i) { - store_output(&res0[i], output + i * 32 + 0); - store_output(&res1[i], output + i * 32 + 8); - store_output(&res2[i], output + i * 32 + 16); - store_output(&res3[i], output + i * 32 + 24); - } -} - -void av1_fht32x16_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in0[16], in1[16], in2[16], in3[16]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0); - switch (tx_type) { - case DCT_DCT: - fdct16_sse2(in0, in1); - fdct16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - 
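fhalfright32_16col() above is the "half right" 32-point transform used for the ADST cases: the first 16 inputs pass through with a gain of 4 (the 2-bit left shift) and become the last 16 outputs, while the last 16 inputs are scaled by sqrt(2) and sent through fdct16 to produce the first 16 outputs. In outline (fdct16_scalar is a stand-in for the 16-point DCT applied above; the Q12 constant is illustrative):

#include <stdint.h>

static void fhalfright32_sketch(const int32_t in[32], int32_t out[32],
                                void (*fdct16_scalar)(const int32_t *,
                                                      int32_t *)) {
  int32_t tmp[16];
  for (int i = 0; i < 16; ++i) out[16 + i] = in[i] * 4;  // identity half
  for (int i = 0; i < 16; ++i)
    tmp[i] = (in[16 + i] * 5793 + (1 << 11)) >> 12;  // * sqrt(2), Q12
  fdct16_scalar(tmp, out);  // DCT half fills out[0..15]
}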
fdct32_16col(in0, in1, in2, in3); - break; - case ADST_DCT: - fadst16_sse2(in0, in1); - fadst16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fdct32_16col(in0, in1, in2, in3); - break; - case DCT_ADST: - fdct16_sse2(in0, in1); - fdct16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fhalfright32_16col(in0, in1, in2, in3, no_transpose); - break; - case ADST_ADST: - fadst16_sse2(in0, in1); - fadst16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fhalfright32_16col(in0, in1, in2, in3, no_transpose); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0); - fadst16_sse2(in0, in1); - fadst16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fdct32_16col(in0, in1, in2, in3); - break; - case DCT_FLIPADST: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1); - fdct16_sse2(in0, in1); - fdct16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fhalfright32_16col(in0, in1, in2, in3, no_transpose); - break; - case FLIPADST_FLIPADST: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 1); - fadst16_sse2(in0, in1); - fadst16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fhalfright32_16col(in0, in1, in2, in3, no_transpose); - break; - case ADST_FLIPADST: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1); - fadst16_sse2(in0, in1); - fadst16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fhalfright32_16col(in0, in1, in2, in3, no_transpose); - break; - case FLIPADST_ADST: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0); - fadst16_sse2(in0, in1); - fadst16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fhalfright32_16col(in0, in1, in2, in3, no_transpose); - break; - case IDTX: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0); - fidtx16_sse2(in0, in1); - fidtx16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fidtx32_16col(in0, in1, in2, in3); - break; - case V_DCT: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0); - fdct16_sse2(in0, in1); - fdct16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fidtx32_16col(in0, in1, in2, in3); - break; - case H_DCT: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0); - fidtx16_sse2(in0, in1); - fidtx16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fdct32_16col(in0, in1, in2, in3); - break; - case V_ADST: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0); - fadst16_sse2(in0, in1); - fadst16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fidtx32_16col(in0, in1, in2, in3); - break; - case H_ADST: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0); - fidtx16_sse2(in0, in1); - fidtx16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fhalfright32_16col(in0, in1, in2, in3, no_transpose); - break; - case V_FLIPADST: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0); - fadst16_sse2(in0, in1); - fadst16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fidtx32_16col(in0, in1, in2, in3); - break; - case H_FLIPADST: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1); - fidtx16_sse2(in0, in1); - fidtx16_sse2(in2, in3); - round_signed_16x16(in0, in1); 
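The V_* and H_* transform types are one-dimensional: each case pairs a real kernel in one direction with an identity stage in the other (e.g. V_DCT = fdct down the columns, fidtx across the rows). The identity stage itself is only a gain; for the 32-point case fidtx32_16col() reduces to a 2-bit left shift per sample plus the block transposes:

// 32-point identity stage, per sample (gain 4, wrapping on overflow just
// like the SIMD left shift above).
static int16_t idtx32_sample(int16_t x) { return (int16_t)(x * 4); }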
- round_signed_16x16(in2, in3); - fhalfright32_16col(in0, in1, in2, in3, no_transpose); - break; -#endif - default: assert(0); break; - } - write_buffer_32x16(output, in0, in1, in2, in3); -} - -// Note: -// 32x32 hybrid fwd txfm -// 4x2 grids of 8x16 block. Each block is represented by __m128i in[16] -static INLINE void load_buffer_32x32(const int16_t *input, - __m128i *in0 /*in0[32]*/, - __m128i *in1 /*in1[32]*/, - __m128i *in2 /*in2[32]*/, - __m128i *in3 /*in3[32]*/, int stride, - int flipud, int fliplr) { - if (flipud) { - input += 31 * stride; - stride = -stride; - } - - int i; - for (i = 0; i < 32; ++i) { - in0[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2); - in1[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2); - in2[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 16)), 2); - in3[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 24)), 2); - } - - if (fliplr) { - for (i = 0; i < 32; ++i) { - __m128i tmp1 = in0[i]; - __m128i tmp2 = in1[i]; - in0[i] = mm_reverse_epi16(in3[i]); - in1[i] = mm_reverse_epi16(in2[i]); - in2[i] = mm_reverse_epi16(tmp2); - in3[i] = mm_reverse_epi16(tmp1); - } - } -} - -static INLINE void swap_16x16(__m128i *b0l /*b0l[16]*/, - __m128i *b0r /*b0r[16]*/, - __m128i *b1l /*b1l[16]*/, - __m128i *b1r /*b1r[16]*/) { - int i; - for (i = 0; i < 16; ++i) { - __m128i tmp0 = b1l[i]; - __m128i tmp1 = b1r[i]; - b1l[i] = b0l[i]; - b1r[i] = b0r[i]; - b0l[i] = tmp0; - b0r[i] = tmp1; - } -} - -static INLINE void fdct32(__m128i *in0, __m128i *in1, __m128i *in2, - __m128i *in3) { - fdct32_8col(in0, &in0[16]); - fdct32_8col(in1, &in1[16]); - fdct32_8col(in2, &in2[16]); - fdct32_8col(in3, &in3[16]); - - array_transpose_16x16(in0, in1); - array_transpose_16x16(&in0[16], &in1[16]); - array_transpose_16x16(in2, in3); - array_transpose_16x16(&in2[16], &in3[16]); - - swap_16x16(&in0[16], &in1[16], in2, in3); -} - -static INLINE void fhalfright32(__m128i *in0, __m128i *in1, __m128i *in2, - __m128i *in3) { - fhalfright32_16col(in0, in1, &in0[16], &in1[16], no_transpose); - fhalfright32_16col(in2, in3, &in2[16], &in3[16], no_transpose); - swap_16x16(&in0[16], &in1[16], in2, in3); -} - -#if CONFIG_EXT_TX -static INLINE void fidtx32(__m128i *in0, __m128i *in1, __m128i *in2, - __m128i *in3) { - fidtx32_16col(in0, in1, &in0[16], &in1[16]); - fidtx32_16col(in2, in3, &in2[16], &in3[16]); - swap_16x16(&in0[16], &in1[16], in2, in3); -} -#endif - -static INLINE void round_signed_32x32(__m128i *in0, __m128i *in1, __m128i *in2, - __m128i *in3) { - round_signed_16x16(in0, in1); - round_signed_16x16(&in0[16], &in1[16]); - round_signed_16x16(in2, in3); - round_signed_16x16(&in2[16], &in3[16]); -} - -static INLINE void write_buffer_32x32(__m128i *in0, __m128i *in1, __m128i *in2, - __m128i *in3, tran_low_t *output) { - int i; - for (i = 0; i < 32; ++i) { - store_output(&in0[i], output + i * 32 + 0); - store_output(&in1[i], output + i * 32 + 8); - store_output(&in2[i], output + i * 32 + 16); - store_output(&in3[i], output + i * 32 + 24); - } -} - -void av1_fht32x32_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in0[32], in1[32], in2[32], in3[32]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "No 32x32 sse2 MRC_DCT implementation"); -#endif - - load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 0); - switch (tx_type) { - case DCT_DCT: - fdct32(in0, in1, in2, in3); - 
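fdct32(), fhalfright32() and fidtx32() above each finish with four in-place 16x16 transposes followed by swap_16x16() on the two off-diagonal quadrants, which together form a full 32x32 transpose out of 16x16 pieces:

// For quadrants  | A B |   a 32x32 transpose is:  | A' C' |
//                | C D |                          | B' D' |
// i.e. transpose each 16x16 quadrant in place, then swap B and C.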
round_signed_32x32(in0, in1, in2, in3); - fdct32(in0, in1, in2, in3); - break; - case ADST_DCT: - fhalfright32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fdct32(in0, in1, in2, in3); - break; - case DCT_ADST: - fdct32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fhalfright32(in0, in1, in2, in3); - break; - case ADST_ADST: - fhalfright32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fhalfright32(in0, in1, in2, in3); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0); - fhalfright32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fdct32(in0, in1, in2, in3); - break; - case DCT_FLIPADST: - load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1); - fdct32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fhalfright32(in0, in1, in2, in3); - break; - case FLIPADST_FLIPADST: - load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 1); - fhalfright32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fhalfright32(in0, in1, in2, in3); - break; - case ADST_FLIPADST: - load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1); - fhalfright32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fhalfright32(in0, in1, in2, in3); - break; - case FLIPADST_ADST: - load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0); - fhalfright32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fhalfright32(in0, in1, in2, in3); - break; - case IDTX: - fidtx32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fidtx32(in0, in1, in2, in3); - break; - case V_DCT: - fdct32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fidtx32(in0, in1, in2, in3); - break; - case H_DCT: - fidtx32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fdct32(in0, in1, in2, in3); - break; - case V_ADST: - fhalfright32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fidtx32(in0, in1, in2, in3); - break; - case H_ADST: - fidtx32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fhalfright32(in0, in1, in2, in3); - break; - case V_FLIPADST: - load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0); - fhalfright32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fidtx32(in0, in1, in2, in3); - break; - case H_FLIPADST: - load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1); - fidtx32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fhalfright32(in0, in1, in2, in3); - break; -#endif - default: assert(0); - } - write_buffer_32x32(in0, in1, in2, in3, output); -} diff --git a/third_party/aom/av1/encoder/x86/dct_sse2.asm b/third_party/aom/av1/encoder/x86/dct_sse2.asm index a99db3d6e..b18554818 100644 --- a/third_party/aom/av1/encoder/x86/dct_sse2.asm +++ b/third_party/aom/av1/encoder/x86/dct_sse2.asm @@ -63,7 +63,6 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride psllw m0, 2 psllw m1, 2 -%if CONFIG_HIGHBITDEPTH ; sign extension mova m2, m0 mova m3, m1 @@ -79,9 +78,5 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride mova [outputq + 16], m2 mova [outputq + 32], m1 mova [outputq + 48], m3 -%else - mova [outputq], m0 - mova [outputq + 16], m1 -%endif RET diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse2.c b/third_party/aom/av1/encoder/x86/encodetxb_sse2.c new file mode 100644 index 000000000..dedb4d02f --- /dev/null +++ b/third_party/aom/av1/encoder/x86/encodetxb_sse2.c @@ -0,0 +1,505 @@ +/* + * Copyright (c) 2017, Alliance for 
Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <emmintrin.h> // SSE2 + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/mem_sse2.h" +#include "av1/common/onyxc_int.h" +#include "av1/common/txb_common.h" + +static INLINE void load_levels_4x4x5_sse2(const uint8_t *const src, + const int stride, + const ptrdiff_t *const offsets, + __m128i *const level) { + level[0] = load_8bit_4x4_to_1_reg_sse2(src + 1, stride); + level[1] = load_8bit_4x4_to_1_reg_sse2(src + stride, stride); + level[2] = load_8bit_4x4_to_1_reg_sse2(src + offsets[0], stride); + level[3] = load_8bit_4x4_to_1_reg_sse2(src + offsets[1], stride); + level[4] = load_8bit_4x4_to_1_reg_sse2(src + offsets[2], stride); +} + +static INLINE void load_levels_8x2x5_sse2(const uint8_t *const src, + const int stride, + const ptrdiff_t *const offsets, + __m128i *const level) { + level[0] = load_8bit_8x2_to_1_reg_sse2(src + 1, stride); + level[1] = load_8bit_8x2_to_1_reg_sse2(src + stride, stride); + level[2] = load_8bit_8x2_to_1_reg_sse2(src + offsets[0], stride); + level[3] = load_8bit_8x2_to_1_reg_sse2(src + offsets[1], stride); + level[4] = load_8bit_8x2_to_1_reg_sse2(src + offsets[2], stride); +} + +static INLINE void load_levels_16x1x5_sse2(const uint8_t *const src, + const int stride, + const ptrdiff_t *const offsets, + __m128i *const level) { + level[0] = _mm_loadu_si128((__m128i *)(src + 1)); + level[1] = _mm_loadu_si128((__m128i *)(src + stride)); + level[2] = _mm_loadu_si128((__m128i *)(src + offsets[0])); + level[3] = _mm_loadu_si128((__m128i *)(src + offsets[1])); + level[4] = _mm_loadu_si128((__m128i *)(src + offsets[2])); +} + +static INLINE __m128i get_coeff_contexts_kernel_sse2(__m128i *const level) { + const __m128i const_3 = _mm_set1_epi8(3); + const __m128i const_4 = _mm_set1_epi8(4); + __m128i count; + + count = _mm_min_epu8(level[0], const_3); + level[1] = _mm_min_epu8(level[1], const_3); + level[2] = _mm_min_epu8(level[2], const_3); + level[3] = _mm_min_epu8(level[3], const_3); + level[4] = _mm_min_epu8(level[4], const_3); + count = _mm_add_epi8(count, level[1]); + count = _mm_add_epi8(count, level[2]); + count = _mm_add_epi8(count, level[3]); + count = _mm_add_epi8(count, level[4]); + count = _mm_avg_epu8(count, _mm_setzero_si128()); + count = _mm_min_epu8(count, const_4); + return count; +} + +static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *const coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const __m128i pos_to_offset_large = _mm_set1_epi8(21); + __m128i pos_to_offset = + (height == 4) + ? 
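get_coeff_contexts_kernel_sse2() above computes, for each coefficient, min(4, (sum of the five clamped neighbour levels + 1) >> 1); _mm_avg_epu8 against zero supplies the (x + 1) >> 1. A scalar equivalent (sketch):

#include <stdint.h>

static int clamp3(int v) { return v < 3 ? v : 3; }

// Context count from the five neighbouring level values: clamp each to 3,
// sum, halve rounding up, cap at 4 -- what the SIMD kernel does sixteen
// lanes at a time.
static int coeff_context_count(const uint8_t nb[5]) {
  int sum = 0;
  for (int i = 0; i < 5; ++i) sum += clamp3(nb[i]);
  const int count = (sum + 1) >> 1;
  return count < 4 ? count : 4;
}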
_mm_setr_epi8(0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21) + : _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21, + 21, 21); + __m128i count; + __m128i level[5]; + int8_t *cc = coeff_contexts; + int row = height; + + assert(!(height % 4)); + + do { + load_levels_4x4x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)cc, count); + pos_to_offset = pos_to_offset_large; + levels += 4 * stride; + cc += 16; + row -= 4; + } while (row); + + coeff_contexts[0] = 0; +} + +static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + __m128i count; + __m128i level[5]; + int row = height; + + assert(!(height % 4)); + + do { + load_levels_4x4x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + levels += 4 * stride; + coeff_contexts += 16; + row -= 4; + } while (row); +} + +static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10); + __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + __m128i count; + __m128i level[5]; + int row = height; + + assert(!(height % 4)); + + do { + load_levels_4x4x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 4 * stride; + coeff_contexts += 16; + row -= 4; + } while (row); +} + +static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + int8_t *cc = coeff_contexts; + int row = height; + __m128i count; + __m128i level[5]; + __m128i pos_to_offset[3]; + + assert(!(height % 2)); + + if (height == 8) { + pos_to_offset[0] = + _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21); + pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, + 21, 21, 21, 21, 21); + } else if (height < 8) { + pos_to_offset[0] = _mm_setr_epi8(0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21, + 21, 21, 21, 21); + pos_to_offset[1] = _mm_setr_epi8(16, 16, 21, 21, 21, 
21, 21, 21, 16, 16, 21, + 21, 21, 21, 21, 21); + } else { + pos_to_offset[0] = _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11); + pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, + 21, 21, 21, 21, 21); + } + pos_to_offset[2] = _mm_set1_epi8(21); + + do { + load_levels_8x2x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset[0]); + _mm_store_si128((__m128i *)cc, count); + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + levels += 2 * stride; + cc += 16; + row -= 2; + } while (row); + + coeff_contexts[0] = 0; +} + +static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + const __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + int row = height; + __m128i count; + __m128i level[5]; + + assert(!(height % 2)); + + do { + load_levels_8x2x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + levels += 2 * stride; + coeff_contexts += 16; + row -= 2; + } while (row); +} + +static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10); + __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5); + int row = height; + __m128i count; + __m128i level[5]; + + assert(!(height % 2)); + + do { + load_levels_8x2x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 2 * stride; + coeff_contexts += 16; + row -= 2; + } while (row); +} + +static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels, + const int real_width, + const int real_height, + const int width, const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = width + TX_PAD_HOR; + int8_t *cc = coeff_contexts; + int row = height; + __m128i pos_to_offset[5]; + __m128i pos_to_offset_large[3]; + __m128i count; + __m128i level[5]; + + assert(!(width % 16)); + + pos_to_offset_large[2] = _mm_set1_epi8(21); + if (real_width == real_height) { + pos_to_offset[0] = _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21); + pos_to_offset[1] = 
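// The pos_to_offset tables above pick the position-dependent context
// offset by block shape: roughly, square blocks taper 0, 1, 6, 6, 21...
// from the DC position, wide blocks (width > height) use the 16-based
// rows first, tall blocks (width < height) the 11-based rows, and every
// position past the tapered region takes the single large offset (21).
// The 2D variants finally overwrite coeff_contexts[0] with 0, since the
// DC coefficient always uses context 0.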
_mm_setr_epi8(1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] = + pos_to_offset_large[2]; + } else if (real_width > real_height) { + pos_to_offset[0] = _mm_setr_epi8(0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[1] = _mm_setr_epi8(16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] = _mm_setr_epi8( + 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21); + pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2]; + } else { // real_width < real_height + pos_to_offset[0] = pos_to_offset[1] = _mm_setr_epi8( + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11); + pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[4] = pos_to_offset_large[2]; + pos_to_offset_large[0] = pos_to_offset_large[1] = _mm_set1_epi8(11); + } + + do { + int w = width; + + do { + load_levels_16x1x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset[0]); + _mm_store_si128((__m128i *)cc, count); + levels += 16; + cc += 16; + w -= 16; + pos_to_offset[0] = pos_to_offset_large[0]; + } while (w); + + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + pos_to_offset[2] = pos_to_offset[3]; + pos_to_offset[3] = pos_to_offset[4]; + pos_to_offset_large[0] = pos_to_offset_large[1]; + pos_to_offset_large[1] = pos_to_offset_large[2]; + levels += TX_PAD_HOR; + } while (--row); + + coeff_contexts[0] = 0; +} + +static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels, + const int width, const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = width + TX_PAD_HOR; + const __m128i pos_to_offset_large = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + __m128i count; + __m128i level[5]; + int row = height; + + assert(!(width % 16)); + + do { + __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + int w = width; + + do { + load_levels_16x1x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels 
+= 16; + coeff_contexts += 16; + w -= 16; + } while (w); + + levels += TX_PAD_HOR; + } while (--row); +} + +static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels, + const int width, const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = width + TX_PAD_HOR; + __m128i pos_to_offset[3]; + __m128i count; + __m128i level[5]; + int row = height; + + assert(!(width % 16)); + + pos_to_offset[0] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 0); + pos_to_offset[1] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 5); + pos_to_offset[2] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10); + + do { + int w = width; + + do { + load_levels_16x1x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset[0]); + _mm_store_si128((__m128i *)coeff_contexts, count); + levels += 16; + coeff_contexts += 16; + w -= 16; + } while (w); + + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + levels += TX_PAD_HOR; + } while (--row); +} + +// Note: levels[] must be in the range [0, 127], inclusive. +void av1_get_nz_map_contexts_sse2(const uint8_t *const levels, + const int16_t *const scan, const uint16_t eob, + const TX_SIZE tx_size, + const TX_CLASS tx_class, + int8_t *const coeff_contexts) { + const int last_idx = eob - 1; + if (!last_idx) { + coeff_contexts[0] = 0; + return; + } + + const int real_width = tx_size_wide[tx_size]; + const int real_height = tx_size_high[tx_size]; + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + const int stride = width + TX_PAD_HOR; + ptrdiff_t offsets[3]; + + /* coeff_contexts must be 16 byte aligned. */ + assert(!((intptr_t)coeff_contexts & 0xf)); + + if (tx_class == TX_CLASS_2D) { + offsets[0] = 0 * stride + 2; + offsets[1] = 1 * stride + 1; + offsets[2] = 2 * stride + 0; + + if (width == 4) { + get_4_nz_map_contexts_2d(levels, height, offsets, coeff_contexts); + } else if (width == 8) { + get_8_coeff_contexts_2d(levels, height, offsets, coeff_contexts); + } else if (width == 16) { + get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height, + offsets, coeff_contexts); + } else { + get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height, + offsets, coeff_contexts); + } + } else if (tx_class == TX_CLASS_HORIZ) { + offsets[0] = 2; + offsets[1] = 3; + offsets[2] = 4; + if (width == 4) { + get_4_nz_map_contexts_hor(levels, height, offsets, coeff_contexts); + } else if (width == 8) { + get_8_coeff_contexts_hor(levels, height, offsets, coeff_contexts); + } else { + get_16n_coeff_contexts_hor(levels, width, height, offsets, + coeff_contexts); + } + } else { // TX_CLASS_VERT + offsets[0] = 2 * stride; + offsets[1] = 3 * stride; + offsets[2] = 4 * stride; + if (width == 4) { + get_4_nz_map_contexts_ver(levels, height, offsets, coeff_contexts); + } else if (width == 8) { + get_8_coeff_contexts_ver(levels, height, offsets, coeff_contexts); + } else { + get_16n_coeff_contexts_ver(levels, width, height, offsets, + coeff_contexts); + } + } + + const int bwl = get_txb_bwl(tx_size); + const int pos = scan[last_idx]; + if (last_idx <= (height << bwl) / 8) + coeff_contexts[pos] = 1; + else if (last_idx <= (height << bwl) / 4) + coeff_contexts[pos] = 2; + else + coeff_contexts[pos] = 3; +} diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse4.c b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c new file mode 100644 index 000000000..b3a879b0f --- /dev/null +++ 
b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include <smmintrin.h>  /* SSE4.1 */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+
+void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width,
+                                const int height, uint8_t *const levels) {
+  const int stride = width + TX_PAD_HOR;
+  memset(levels - TX_PAD_TOP * stride, 0,
+         sizeof(*levels) * TX_PAD_TOP * stride);
+  memset(levels + stride * height, 0,
+         sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
+
+  const __m128i zeros = _mm_setzero_si128();
+  int i = 0;
+  uint8_t *ls = levels;
+  const tran_low_t *cf = coeff;
+  if (width == 4) {
+    do {
+      const __m128i coeffA = _mm_load_si128((__m128i *)(cf));
+      const __m128i coeffB = _mm_load_si128((__m128i *)(cf + width));
+      const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+      const __m128i absAB = _mm_abs_epi16(coeffAB);
+      const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
+      const __m128i lsAB = _mm_unpacklo_epi32(absAB8, zeros);
+      _mm_storeu_si128((__m128i *)ls, lsAB);
+      ls += (stride << 1);
+      cf += (width << 1);
+      i += 2;
+    } while (i < height);
+  } else if (width == 8) {
+    do {
+      const __m128i coeffA = _mm_load_si128((__m128i *)(cf));
+      const __m128i coeffB = _mm_load_si128((__m128i *)(cf + 4));
+      const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+      const __m128i absAB = _mm_abs_epi16(coeffAB);
+      const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
+      _mm_storeu_si128((__m128i *)ls, absAB8);
+      ls += stride;
+      cf += width;
+      i += 1;
+    } while (i < height);
+  } else {
+    do {
+      int j = 0;
+      do {
+        const __m128i coeffA = _mm_load_si128((__m128i *)(cf));
+        const __m128i coeffB = _mm_load_si128((__m128i *)(cf + 4));
+        const __m128i coeffC = _mm_load_si128((__m128i *)(cf + 8));
+        const __m128i coeffD = _mm_load_si128((__m128i *)(cf + 12));
+        const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+        const __m128i coeffCD = _mm_packs_epi32(coeffC, coeffD);
+        const __m128i absAB = _mm_abs_epi16(coeffAB);
+        const __m128i absCD = _mm_abs_epi16(coeffCD);
+        const __m128i absABCD = _mm_packs_epi16(absAB, absCD);
+        _mm_storeu_si128((__m128i *)(ls + j), absABCD);
+        j += 16;
+        cf += 16;
+      } while (j < width);
+      *(int32_t *)(ls + width) = 0;
+      ls += stride;
+      i += 1;
+    } while (i < height);
+  }
+}
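
/*
 * A scalar model of what av1_txb_init_levels_sse4_1() above produces: zeroed
 * top, bottom, and right padding around a width x height grid of absolute
 * coefficient values, saturated to 127 by the signed packs. This is a sketch,
 * assuming tran_low_t is int32_t and the TX_PAD_* constants come from
 * av1/common/txb_common.h; the *_sketch name is illustrative.
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

static void txb_init_levels_sketch(const int32_t *coeff, int width, int height,
                                   uint8_t *levels) {
  const int stride = width + TX_PAD_HOR;
  memset(levels - TX_PAD_TOP * stride, 0,
         sizeof(*levels) * TX_PAD_TOP * stride);
  memset(levels + stride * height, 0,
         sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) {
      const int a = abs(coeff[i * width + j]);
      levels[i * stride + j] = (uint8_t)(a > 127 ? 127 : a);
    }
    memset(levels + i * stride + width, 0, TX_PAD_HOR); /* right padding */
  }
}

/*
 * The hunks below remove the SSE2 av1_block_error_fp assembly from
 * error_sse2.asm. For reference, a scalar model of what that assembly
 * computed, following its own comments: squared int16_t differences, each
 * fitting in 30 bits, accumulated in 64 bits.
 */
static int64_t block_error_fp_sketch(const int16_t *coeff,
                                     const int16_t *dqcoeff,
                                     intptr_t block_size) {
  int64_t error = 0;
  for (intptr_t i = 0; i < block_size; ++i) {
    const int diff = coeff[i] - dqcoeff[i];
    error += (int64_t)diff * diff;
  }
  return error;
}
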
diff --git a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
index 6599630d0..7d4f69585 100644
--- a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
+++ b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
@@ -11,7 +11,8 @@
 #include <immintrin.h>  // AVX2
 
-#include "./av1_rtcd.h"
+#include "config/av1_rtcd.h"
+
 #include "aom/aom_integer.h"
 
 static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
diff --git a/third_party/aom/av1/encoder/x86/error_sse2.asm b/third_party/aom/av1/encoder/x86/error_sse2.asm
index 4680f1fab..72e9e22b1 100644
--- a/third_party/aom/av1/encoder/x86/error_sse2.asm
+++ b/third_party/aom/av1/encoder/x86/error_sse2.asm
@@ -77,49 +77,3 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
   movd edx, m5
 %endif
   RET
-
-; Compute the sum of squared difference between two int16_t vectors.
-; int64_t av1_block_error_fp(int16_t *coeff, int16_t *dqcoeff,
-;                            intptr_t block_size)
-
-INIT_XMM sse2
-cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
-  pxor      m4, m4                 ; sse accumulator
-  pxor      m5, m5                 ; dedicated zero register
-  lea       uqcq, [uqcq+sizeq*2]
-  lea       dqcq, [dqcq+sizeq*2]
-  neg sizeq
-.loop:
-  mova      m2, [uqcq+sizeq*2]
-  mova      m0, [dqcq+sizeq*2]
-  mova      m3, [uqcq+sizeq*2+mmsize]
-  mova      m1, [dqcq+sizeq*2+mmsize]
-  psubw     m0, m2
-  psubw     m1, m3
-  ; individual errors are max. 15bit+sign, so squares are 30bit, and
-  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
-  pmaddwd   m0, m0
-  pmaddwd   m1, m1
-  ; accumulate in 64bit
-  punpckldq m3, m0, m5
-  punpckhdq m0, m5
-  paddq     m4, m3
-  punpckldq m3, m1, m5
-  paddq     m4, m0
-  punpckhdq m1, m5
-  paddq     m4, m3
-  paddq     m4, m1
-  add       sizeq, mmsize
-  jl .loop
-
-  ; accumulate horizontally and store in return value
-  movhlps   m5, m4
-  paddq     m4, m5
-%if ARCH_X86_64
-  movq rax, m4
-%else
-  pshufd    m5, m4, 0x1
-  movd      eax, m4
-  movd      edx, m5
-%endif
-  RET
diff --git a/third_party/aom/av1/encoder/x86/hash_sse42.c b/third_party/aom/av1/encoder/x86/hash_sse42.c
new file mode 100644
index 000000000..65fa46311
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/hash_sse42.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include <stdint.h> +#include <smmintrin.h> + +// Byte-boundary alignment issues +#define ALIGN_SIZE 8 +#define ALIGN_MASK (ALIGN_SIZE - 1) + +#define CALC_CRC(op, crc, type, buf, len) \ + while ((len) >= sizeof(type)) { \ + (crc) = op((crc), *(type *)(buf)); \ + (len) -= sizeof(type); \ + buf += sizeof(type); \ + } + +/** + * Calculates 32-bit CRC for the input buffer + * polynomial is 0x11EDC6F41 + * @return A 32-bit unsigned integer representing the CRC + */ +uint32_t av1_get_crc32c_value_sse4_2(void *crc_calculator, uint8_t *p, + size_t len) { + (void)crc_calculator; + const uint8_t *buf = p; + uint32_t crc = 0xFFFFFFFF; + + // Align the input to the word boundary + for (; (len > 0) && ((intptr_t)buf & ALIGN_MASK); len--, buf++) { + crc = _mm_crc32_u8(crc, *buf); + } + +#ifdef __x86_64__ + uint64_t crc64 = crc; + CALC_CRC(_mm_crc32_u64, crc64, uint64_t, buf, len); + crc = (uint32_t)crc64; +#endif + CALC_CRC(_mm_crc32_u32, crc, uint32_t, buf, len); + CALC_CRC(_mm_crc32_u16, crc, uint16_t, buf, len); + CALC_CRC(_mm_crc32_u8, crc, uint8_t, buf, len); + return (crc ^= 0xFFFFFFFF); +} diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c index b684f7a3a..4cd6371a6 100644 --- a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c +++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c @@ -11,11 +11,12 @@ #include <assert.h> #include <smmintrin.h> /* SSE4.1 */ -#include "./av1_rtcd.h" -#include "./aom_config.h" -#include "av1/common/av1_fwd_txfm1d_cfg.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + #include "av1/common/av1_txfm.h" #include "av1/common/x86/highbd_txfm_utility_sse4.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" #include "aom_dsp/txfm_common.h" #include "aom_dsp/x86/txfm_common_sse2.h" #include "aom_ports/mem.h" @@ -121,72 +122,57 @@ static INLINE void write_buffer_4x4(__m128i *res, int32_t *output) { } static void fadst4x4_sse4_1(__m128i *in, int bit) { - const int32_t *cospi = cospi_arr(bit); - const __m128i cospi8 = _mm_set1_epi32(cospi[8]); - const __m128i cospi56 = _mm_set1_epi32(cospi[56]); - const __m128i cospi40 = _mm_set1_epi32(cospi[40]); - const __m128i cospi24 = _mm_set1_epi32(cospi[24]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const int32_t *sinpi = sinpi_arr(bit); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const __m128i kZero = _mm_setzero_si128(); - __m128i s0, s1, s2, s3; + const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]); + const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]); + const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]); + const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]); + __m128i t; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + __m128i x0, x1, x2, x3; __m128i u0, u1, u2, u3; __m128i v0, v1, v2, v3; - // stage 0 - // stage 1 - // stage 2 - u0 = _mm_mullo_epi32(in[3], cospi8); - u1 = _mm_mullo_epi32(in[0], cospi56); - u2 = _mm_add_epi32(u0, u1); - s0 = _mm_add_epi32(u2, rnding); - s0 = _mm_srai_epi32(s0, bit); - - v0 = _mm_mullo_epi32(in[3], cospi56); - v1 = _mm_mullo_epi32(in[0], cospi8); - v2 = _mm_sub_epi32(v0, v1); - s1 = _mm_add_epi32(v2, rnding); - s1 = _mm_srai_epi32(s1, bit); - - u0 = _mm_mullo_epi32(in[1], cospi40); - u1 = _mm_mullo_epi32(in[2], cospi24); - u2 = _mm_add_epi32(u0, u1); - s2 = _mm_add_epi32(u2, rnding); - s2 = _mm_srai_epi32(s2, bit); - - v0 = _mm_mullo_epi32(in[1], cospi24); - v1 = _mm_mullo_epi32(in[2], cospi40); - v2 = _mm_sub_epi32(v0, v1); - s3 = _mm_add_epi32(v2, rnding); - s3 = 
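
/*
 * Usage sketch for av1_get_crc32c_value_sse4_2() above. The polynomial
 * 0x11EDC6F41 is CRC-32C (Castagnoli), the variant the SSE4.2 crc32
 * instruction implements in hardware. The crc_calculator argument is unused
 * by this path (note the (void) cast), so NULL is fine here; the buffer
 * contents below are illustrative.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

uint32_t av1_get_crc32c_value_sse4_2(void *crc_calculator, uint8_t *p,
                                     size_t len);

static void crc32c_demo(void) {
  uint8_t buf[] = { 'a', 'v', '1' };
  const uint32_t crc = av1_get_crc32c_value_sse4_2(NULL, buf, sizeof(buf));
  printf("crc32c = 0x%08x\n", crc);
}
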
_mm_srai_epi32(s3, bit); - - // stage 3 - u0 = _mm_add_epi32(s0, s2); - u2 = _mm_sub_epi32(s0, s2); - u1 = _mm_add_epi32(s1, s3); - u3 = _mm_sub_epi32(s1, s3); - - // stage 4 - v0 = _mm_mullo_epi32(u2, cospi32); - v1 = _mm_mullo_epi32(u3, cospi32); - v2 = _mm_add_epi32(v0, v1); - s2 = _mm_add_epi32(v2, rnding); - u2 = _mm_srai_epi32(s2, bit); + s0 = _mm_mullo_epi32(in[0], sinpi1); + s1 = _mm_mullo_epi32(in[0], sinpi4); + s2 = _mm_mullo_epi32(in[1], sinpi2); + s3 = _mm_mullo_epi32(in[1], sinpi1); + s4 = _mm_mullo_epi32(in[2], sinpi3); + s5 = _mm_mullo_epi32(in[3], sinpi4); + s6 = _mm_mullo_epi32(in[3], sinpi2); + t = _mm_add_epi32(in[0], in[1]); + s7 = _mm_sub_epi32(t, in[3]); + + t = _mm_add_epi32(s0, s2); + x0 = _mm_add_epi32(t, s5); + x1 = _mm_mullo_epi32(s7, sinpi3); + t = _mm_sub_epi32(s1, s3); + x2 = _mm_add_epi32(t, s6); + x3 = s4; + + s0 = _mm_add_epi32(x0, x3); + s1 = x1; + s2 = _mm_sub_epi32(x2, x3); + t = _mm_sub_epi32(x2, x0); + s3 = _mm_add_epi32(t, x3); + + u0 = _mm_add_epi32(s0, rnding); + u0 = _mm_srai_epi32(u0, bit); + + u1 = _mm_add_epi32(s1, rnding); + u1 = _mm_srai_epi32(u1, bit); + + u2 = _mm_add_epi32(s2, rnding); + u2 = _mm_srai_epi32(u2, bit); + + u3 = _mm_add_epi32(s3, rnding); + u3 = _mm_srai_epi32(u3, bit); - v2 = _mm_sub_epi32(v0, v1); - s3 = _mm_add_epi32(v2, rnding); - u3 = _mm_srai_epi32(s3, bit); - - // u0, u1, u2, u3 - u2 = _mm_sub_epi32(kZero, u2); - u1 = _mm_sub_epi32(kZero, u1); - - // u0, u2, u3, u1 - // Transpose 4x4 32-bit - v0 = _mm_unpacklo_epi32(u0, u2); - v1 = _mm_unpackhi_epi32(u0, u2); - v2 = _mm_unpacklo_epi32(u3, u1); - v3 = _mm_unpackhi_epi32(u3, u1); + v0 = _mm_unpacklo_epi32(u0, u1); + v1 = _mm_unpackhi_epi32(u0, u1); + v2 = _mm_unpacklo_epi32(u2, u3); + v3 = _mm_unpackhi_epi32(u2, u3); in[0] = _mm_unpacklo_epi64(v0, v2); in[1] = _mm_unpackhi_epi64(v0, v2); @@ -197,84 +183,65 @@ static void fadst4x4_sse4_1(__m128i *in, int bit) { void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff, int input_stride, TX_TYPE tx_type, int bd) { __m128i in[4]; - const TXFM_1D_CFG *row_cfg = NULL; - const TXFM_1D_CFG *col_cfg = NULL; + const int8_t *shift = fwd_txfm_shift_ls[TX_4X4]; + const int txw_idx = get_txw_idx(TX_4X4); + const int txh_idx = get_txh_idx(TX_4X4); switch (tx_type) { case DCT_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_4; - col_cfg = &fwd_txfm_1d_col_cfg_dct_4; - load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]); - fdct4x4_sse4_1(in, col_cfg->cos_bit[2]); - fdct4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fdct4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]); + fdct4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); write_buffer_4x4(in, coeff); break; case ADST_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_4; - col_cfg = &fwd_txfm_1d_col_cfg_adst_4; - load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]); - fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); - fdct4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]); + fdct4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); write_buffer_4x4(in, coeff); break; case DCT_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_4; - col_cfg = &fwd_txfm_1d_col_cfg_dct_4; - load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]); - fdct4x4_sse4_1(in, col_cfg->cos_bit[2]); - fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fdct4x4_sse4_1(in, 
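
/*
 * A scalar model of the sinpi-based 4-point forward ADST that the rewritten
 * fadst4x4_sse4_1() above implements (replacing the old cospi butterfly
 * chain), assuming sinpi_arr(bit) returns the four fixed-point sine
 * constants; the int64_t intermediates are only for clarity, the SIMD path
 * stays in 32 bits.
 */
#include <stdint.h>

static void fadst4_sketch(const int32_t *in, int32_t *out, int bit,
                          const int32_t *sinpi) {
  const int64_t s0 = (int64_t)sinpi[1] * in[0];
  const int64_t s1 = (int64_t)sinpi[4] * in[0];
  const int64_t s2 = (int64_t)sinpi[2] * in[1];
  const int64_t s3 = (int64_t)sinpi[1] * in[1];
  const int64_t s4 = (int64_t)sinpi[3] * in[2];
  const int64_t s5 = (int64_t)sinpi[4] * in[3];
  const int64_t s6 = (int64_t)sinpi[2] * in[3];
  const int64_t s7 = (int64_t)in[0] + in[1] - in[3];
  const int64_t x0 = s0 + s2 + s5;
  const int64_t x1 = (int64_t)sinpi[3] * s7;
  const int64_t x2 = s1 - s3 + s6;
  const int64_t x3 = s4;
  const int64_t r = (int64_t)1 << (bit - 1);
  out[0] = (int32_t)((x0 + x3 + r) >> bit);      /* s0 = x0 + x3 */
  out[1] = (int32_t)((x1 + r) >> bit);           /* s1 = x1 */
  out[2] = (int32_t)((x2 - x3 + r) >> bit);      /* s2 = x2 - x3 */
  out[3] = (int32_t)((x2 - x0 + x3 + r) >> bit); /* s3 = x2 - x0 + x3 */
}
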
fwd_cos_bit_col[txw_idx][txh_idx]); + fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); write_buffer_4x4(in, coeff); break; case ADST_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_4; - col_cfg = &fwd_txfm_1d_col_cfg_adst_4; - load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]); - fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); - fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); write_buffer_4x4(in, coeff); break; -#if CONFIG_EXT_TX case FLIPADST_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_4; - col_cfg = &fwd_txfm_1d_col_cfg_adst_4; - load_buffer_4x4(input, in, input_stride, 1, 0, row_cfg->shift[0]); - fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); - fdct4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]); + fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]); + fdct4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); write_buffer_4x4(in, coeff); break; case DCT_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_4; - col_cfg = &fwd_txfm_1d_col_cfg_dct_4; - load_buffer_4x4(input, in, input_stride, 0, 1, row_cfg->shift[0]); - fdct4x4_sse4_1(in, col_cfg->cos_bit[2]); - fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]); + fdct4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); write_buffer_4x4(in, coeff); break; case FLIPADST_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_4; - col_cfg = &fwd_txfm_1d_col_cfg_adst_4; - load_buffer_4x4(input, in, input_stride, 1, 1, row_cfg->shift[0]); - fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); - fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 1, 1, shift[0]); + fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); write_buffer_4x4(in, coeff); break; case ADST_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_4; - col_cfg = &fwd_txfm_1d_col_cfg_adst_4; - load_buffer_4x4(input, in, input_stride, 0, 1, row_cfg->shift[0]); - fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); - fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]); + fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); write_buffer_4x4(in, coeff); break; case FLIPADST_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_4; - col_cfg = &fwd_txfm_1d_col_cfg_adst_4; - load_buffer_4x4(input, in, input_stride, 1, 0, row_cfg->shift[0]); - fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); - fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]); + fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); write_buffer_4x4(in, coeff); break; -#endif default: assert(0); } (void)bd; @@ -624,415 +591,274 @@ static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) { static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) { const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospim4 = 
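
/*
 * The dispatch above swaps the per-type TXFM_1D_CFG pointers for shared
 * tables indexed by transform dimensions. A sketch of the index math,
 * assuming libaom's convention that TX_4X4 is the smallest size, so
 * get_txw_idx()/get_txh_idx() amount to log2(dimension) - 2 (the *_sketch
 * name is illustrative):
 */
static int tx_dim_idx_sketch(int dim) {
  int idx = 0;
  while (dim > 4) { /* 4 -> 0, 8 -> 1, 16 -> 2, 32 -> 3, 64 -> 4 */
    dim >>= 1;
    ++idx;
  }
  return idx;
}

/*
 * For TX_4X4 both indices are 0, so every case in the switch reads the same
 * shift[] triple from fwd_txfm_shift_ls[TX_4X4] and the same
 * fwd_cos_bit_col[0][0] / fwd_cos_bit_row[0][0] cos bits, which is what lets
 * the row_cfg/col_cfg locals disappear.
 */
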
_mm_set1_epi32(-cospi[4]); const __m128i cospi60 = _mm_set1_epi32(cospi[60]); const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); const __m128i cospi44 = _mm_set1_epi32(cospi[44]); - const __m128i cospi36 = _mm_set1_epi32(cospi[36]); const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); const __m128i cospi12 = _mm_set1_epi32(cospi[12]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const __m128i kZero = _mm_setzero_si128(); - __m128i u[8], v[8], x; - - // Even 8 points: 0, 2, ..., 14 - // stage 0 - // stage 1 - // stage 2 - // (1) - u[0] = _mm_mullo_epi32(in[14], cospi4); - x = _mm_mullo_epi32(in[0], cospi60); - u[0] = _mm_add_epi32(u[0], x); - u[0] = _mm_add_epi32(u[0], rnding); - u[0] = _mm_srai_epi32(u[0], bit); - - u[1] = _mm_mullo_epi32(in[14], cospi60); - x = _mm_mullo_epi32(in[0], cospi4); - u[1] = _mm_sub_epi32(u[1], x); - u[1] = _mm_add_epi32(u[1], rnding); - u[1] = _mm_srai_epi32(u[1], bit); - - // (2) - u[2] = _mm_mullo_epi32(in[10], cospi20); - x = _mm_mullo_epi32(in[4], cospi44); - u[2] = _mm_add_epi32(u[2], x); - u[2] = _mm_add_epi32(u[2], rnding); - u[2] = _mm_srai_epi32(u[2], bit); - - u[3] = _mm_mullo_epi32(in[10], cospi44); - x = _mm_mullo_epi32(in[4], cospi20); - u[3] = _mm_sub_epi32(u[3], x); - u[3] = _mm_add_epi32(u[3], rnding); - u[3] = _mm_srai_epi32(u[3], bit); - - // (3) - u[4] = _mm_mullo_epi32(in[6], cospi36); - x = _mm_mullo_epi32(in[8], cospi28); - u[4] = _mm_add_epi32(u[4], x); - u[4] = _mm_add_epi32(u[4], rnding); - u[4] = _mm_srai_epi32(u[4], bit); - - u[5] = _mm_mullo_epi32(in[6], cospi28); - x = _mm_mullo_epi32(in[8], cospi36); - u[5] = _mm_sub_epi32(u[5], x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); - - // (4) - u[6] = _mm_mullo_epi32(in[2], cospi52); - x = _mm_mullo_epi32(in[12], cospi12); - u[6] = _mm_add_epi32(u[6], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_mullo_epi32(in[2], cospi12); - x = _mm_mullo_epi32(in[12], cospi52); - u[7] = _mm_sub_epi32(u[7], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - // stage 3 - v[0] = _mm_add_epi32(u[0], u[4]); - v[4] = _mm_sub_epi32(u[0], u[4]); - v[1] = _mm_add_epi32(u[1], u[5]); - v[5] = _mm_sub_epi32(u[1], u[5]); - v[2] = _mm_add_epi32(u[2], u[6]); - v[6] = _mm_sub_epi32(u[2], u[6]); - v[3] = _mm_add_epi32(u[3], u[7]); - v[7] = _mm_sub_epi32(u[3], u[7]); - - // stage 4 - u[0] = v[0]; - u[1] = v[1]; - u[2] = v[2]; - u[3] = v[3]; - - u[4] = _mm_mullo_epi32(v[4], cospi16); - x = _mm_mullo_epi32(v[5], cospi48); - u[4] = _mm_add_epi32(u[4], x); - u[4] = _mm_add_epi32(u[4], rnding); - u[4] = _mm_srai_epi32(u[4], bit); - - u[5] = _mm_mullo_epi32(v[4], cospi48); - x = _mm_mullo_epi32(v[5], cospi16); - u[5] = _mm_sub_epi32(u[5], x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); - - u[6] = _mm_mullo_epi32(v[6], cospim48); - x = _mm_mullo_epi32(v[7], cospi16); - u[6] = _mm_add_epi32(u[6], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_mullo_epi32(v[6], 
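
/*
 * A note on the register layout the fadst8x8 loop above relies on, with a
 * load sketch for row-major int32_t storage (the helper is illustrative):
 * the 8x8 block of 32-bit coefficients lives in 16 __m128i values, where
 * in[2 * r] holds columns 0-3 of row r and in[2 * r + 1] holds columns 4-7.
 * The for loop therefore runs twice, with `col` picking the half, and
 * element r of the 8-point transform reads in[2 * r + col].
 */
#include <emmintrin.h>
#include <stdint.h>

static __m128i load_8x8_half_sketch(const int32_t *buf, int r, int half) {
  return _mm_loadu_si128((const __m128i *)(buf + r * 8 + half * 4));
}
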
cospi16); - x = _mm_mullo_epi32(v[7], cospim48); - u[7] = _mm_sub_epi32(u[7], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - // stage 5 - v[0] = _mm_add_epi32(u[0], u[2]); - v[2] = _mm_sub_epi32(u[0], u[2]); - v[1] = _mm_add_epi32(u[1], u[3]); - v[3] = _mm_sub_epi32(u[1], u[3]); - v[4] = _mm_add_epi32(u[4], u[6]); - v[6] = _mm_sub_epi32(u[4], u[6]); - v[5] = _mm_add_epi32(u[5], u[7]); - v[7] = _mm_sub_epi32(u[5], u[7]); - - // stage 6 - u[0] = v[0]; - u[1] = v[1]; - u[4] = v[4]; - u[5] = v[5]; - - v[0] = _mm_mullo_epi32(v[2], cospi32); - x = _mm_mullo_epi32(v[3], cospi32); - u[2] = _mm_add_epi32(v[0], x); - u[2] = _mm_add_epi32(u[2], rnding); - u[2] = _mm_srai_epi32(u[2], bit); + const __m128i zero = _mm_setzero_si128(); + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x, y; + int col; - u[3] = _mm_sub_epi32(v[0], x); - u[3] = _mm_add_epi32(u[3], rnding); - u[3] = _mm_srai_epi32(u[3], bit); + // Note: + // Even column: 0, 2, ..., 14 + // Odd column: 1, 3, ..., 15 + // one even column plus one odd column constructs one row (8 coeffs) + // total we have 8 rows (8x8). + for (col = 0; col < 2; ++col) { + // stage 0 + // stage 1 + u0 = in[2 * 0 + col]; + u1 = _mm_sub_epi32(zero, in[2 * 7 + col]); + u2 = _mm_sub_epi32(zero, in[2 * 3 + col]); + u3 = in[2 * 4 + col]; + u4 = _mm_sub_epi32(zero, in[2 * 1 + col]); + u5 = in[2 * 6 + col]; + u6 = in[2 * 2 + col]; + u7 = _mm_sub_epi32(zero, in[2 * 5 + col]); - v[0] = _mm_mullo_epi32(v[6], cospi32); - x = _mm_mullo_epi32(v[7], cospi32); - u[6] = _mm_add_epi32(v[0], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_sub_epi32(v[0], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - // stage 7 - out[0] = u[0]; - out[2] = _mm_sub_epi32(kZero, u[4]); - out[4] = u[6]; - out[6] = _mm_sub_epi32(kZero, u[2]); - out[8] = u[3]; - out[10] = _mm_sub_epi32(kZero, u[7]); - out[12] = u[5]; - out[14] = _mm_sub_epi32(kZero, u[1]); + // stage 2 + v0 = u0; + v1 = u1; - // Odd 8 points: 1, 3, ..., 15 - // stage 0 - // stage 1 - // stage 2 - // (1) - u[0] = _mm_mullo_epi32(in[15], cospi4); - x = _mm_mullo_epi32(in[1], cospi60); - u[0] = _mm_add_epi32(u[0], x); - u[0] = _mm_add_epi32(u[0], rnding); - u[0] = _mm_srai_epi32(u[0], bit); + x = _mm_mullo_epi32(u2, cospi32); + y = _mm_mullo_epi32(u3, cospi32); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); - u[1] = _mm_mullo_epi32(in[15], cospi60); - x = _mm_mullo_epi32(in[1], cospi4); - u[1] = _mm_sub_epi32(u[1], x); - u[1] = _mm_add_epi32(u[1], rnding); - u[1] = _mm_srai_epi32(u[1], bit); + v3 = _mm_sub_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); - // (2) - u[2] = _mm_mullo_epi32(in[11], cospi20); - x = _mm_mullo_epi32(in[5], cospi44); - u[2] = _mm_add_epi32(u[2], x); - u[2] = _mm_add_epi32(u[2], rnding); - u[2] = _mm_srai_epi32(u[2], bit); + v4 = u4; + v5 = u5; - u[3] = _mm_mullo_epi32(in[11], cospi44); - x = _mm_mullo_epi32(in[5], cospi20); - u[3] = _mm_sub_epi32(u[3], x); - u[3] = _mm_add_epi32(u[3], rnding); - u[3] = _mm_srai_epi32(u[3], bit); + x = _mm_mullo_epi32(u6, cospi32); + y = _mm_mullo_epi32(u7, cospi32); + v6 = _mm_add_epi32(x, y); + v6 = _mm_add_epi32(v6, rnding); + v6 = _mm_srai_epi32(v6, bit); - // (3) - u[4] = _mm_mullo_epi32(in[7], cospi36); - x = _mm_mullo_epi32(in[9], cospi28); - u[4] = _mm_add_epi32(u[4], x); - u[4] = _mm_add_epi32(u[4], rnding); - u[4] = _mm_srai_epi32(u[4], 
bit); - - u[5] = _mm_mullo_epi32(in[7], cospi28); - x = _mm_mullo_epi32(in[9], cospi36); - u[5] = _mm_sub_epi32(u[5], x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); - - // (4) - u[6] = _mm_mullo_epi32(in[3], cospi52); - x = _mm_mullo_epi32(in[13], cospi12); - u[6] = _mm_add_epi32(u[6], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_mullo_epi32(in[3], cospi12); - x = _mm_mullo_epi32(in[13], cospi52); - u[7] = _mm_sub_epi32(u[7], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); + v7 = _mm_sub_epi32(x, y); + v7 = _mm_add_epi32(v7, rnding); + v7 = _mm_srai_epi32(v7, bit); - // stage 3 - v[0] = _mm_add_epi32(u[0], u[4]); - v[4] = _mm_sub_epi32(u[0], u[4]); - v[1] = _mm_add_epi32(u[1], u[5]); - v[5] = _mm_sub_epi32(u[1], u[5]); - v[2] = _mm_add_epi32(u[2], u[6]); - v[6] = _mm_sub_epi32(u[2], u[6]); - v[3] = _mm_add_epi32(u[3], u[7]); - v[7] = _mm_sub_epi32(u[3], u[7]); + // stage 3 + u0 = _mm_add_epi32(v0, v2); + u1 = _mm_add_epi32(v1, v3); + u2 = _mm_sub_epi32(v0, v2); + u3 = _mm_sub_epi32(v1, v3); + u4 = _mm_add_epi32(v4, v6); + u5 = _mm_add_epi32(v5, v7); + u6 = _mm_sub_epi32(v4, v6); + u7 = _mm_sub_epi32(v5, v7); - // stage 4 - u[0] = v[0]; - u[1] = v[1]; - u[2] = v[2]; - u[3] = v[3]; - - u[4] = _mm_mullo_epi32(v[4], cospi16); - x = _mm_mullo_epi32(v[5], cospi48); - u[4] = _mm_add_epi32(u[4], x); - u[4] = _mm_add_epi32(u[4], rnding); - u[4] = _mm_srai_epi32(u[4], bit); - - u[5] = _mm_mullo_epi32(v[4], cospi48); - x = _mm_mullo_epi32(v[5], cospi16); - u[5] = _mm_sub_epi32(u[5], x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); - - u[6] = _mm_mullo_epi32(v[6], cospim48); - x = _mm_mullo_epi32(v[7], cospi16); - u[6] = _mm_add_epi32(u[6], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_mullo_epi32(v[6], cospi16); - x = _mm_mullo_epi32(v[7], cospim48); - u[7] = _mm_sub_epi32(u[7], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); + // stage 4 + v0 = u0; + v1 = u1; + v2 = u2; + v3 = u3; + + x = _mm_mullo_epi32(u4, cospi16); + y = _mm_mullo_epi32(u5, cospi48); + v4 = _mm_add_epi32(x, y); + v4 = _mm_add_epi32(v4, rnding); + v4 = _mm_srai_epi32(v4, bit); + + x = _mm_mullo_epi32(u4, cospi48); + y = _mm_mullo_epi32(u5, cospim16); + v5 = _mm_add_epi32(x, y); + v5 = _mm_add_epi32(v5, rnding); + v5 = _mm_srai_epi32(v5, bit); + + x = _mm_mullo_epi32(u6, cospim48); + y = _mm_mullo_epi32(u7, cospi16); + v6 = _mm_add_epi32(x, y); + v6 = _mm_add_epi32(v6, rnding); + v6 = _mm_srai_epi32(v6, bit); + + x = _mm_mullo_epi32(u6, cospi16); + y = _mm_mullo_epi32(u7, cospi48); + v7 = _mm_add_epi32(x, y); + v7 = _mm_add_epi32(v7, rnding); + v7 = _mm_srai_epi32(v7, bit); - // stage 5 - v[0] = _mm_add_epi32(u[0], u[2]); - v[2] = _mm_sub_epi32(u[0], u[2]); - v[1] = _mm_add_epi32(u[1], u[3]); - v[3] = _mm_sub_epi32(u[1], u[3]); - v[4] = _mm_add_epi32(u[4], u[6]); - v[6] = _mm_sub_epi32(u[4], u[6]); - v[5] = _mm_add_epi32(u[5], u[7]); - v[7] = _mm_sub_epi32(u[5], u[7]); - - // stage 6 - u[0] = v[0]; - u[1] = v[1]; - u[4] = v[4]; - u[5] = v[5]; - - v[0] = _mm_mullo_epi32(v[2], cospi32); - x = _mm_mullo_epi32(v[3], cospi32); - u[2] = _mm_add_epi32(v[0], x); - u[2] = _mm_add_epi32(u[2], rnding); - u[2] = _mm_srai_epi32(u[2], bit); + // stage 5 + u0 = _mm_add_epi32(v0, v4); + u1 = _mm_add_epi32(v1, v5); + u2 = _mm_add_epi32(v2, v6); + u3 = _mm_add_epi32(v3, v7); + u4 = _mm_sub_epi32(v0, v4); + u5 = _mm_sub_epi32(v1, v5); + u6 
= _mm_sub_epi32(v2, v6); + u7 = _mm_sub_epi32(v3, v7); - u[3] = _mm_sub_epi32(v[0], x); - u[3] = _mm_add_epi32(u[3], rnding); - u[3] = _mm_srai_epi32(u[3], bit); + // stage 6 + x = _mm_mullo_epi32(u0, cospi4); + y = _mm_mullo_epi32(u1, cospi60); + v0 = _mm_add_epi32(x, y); + v0 = _mm_add_epi32(v0, rnding); + v0 = _mm_srai_epi32(v0, bit); + + x = _mm_mullo_epi32(u0, cospi60); + y = _mm_mullo_epi32(u1, cospim4); + v1 = _mm_add_epi32(x, y); + v1 = _mm_add_epi32(v1, rnding); + v1 = _mm_srai_epi32(v1, bit); + + x = _mm_mullo_epi32(u2, cospi20); + y = _mm_mullo_epi32(u3, cospi44); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + x = _mm_mullo_epi32(u2, cospi44); + y = _mm_mullo_epi32(u3, cospim20); + v3 = _mm_add_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + x = _mm_mullo_epi32(u4, cospi36); + y = _mm_mullo_epi32(u5, cospi28); + v4 = _mm_add_epi32(x, y); + v4 = _mm_add_epi32(v4, rnding); + v4 = _mm_srai_epi32(v4, bit); + + x = _mm_mullo_epi32(u4, cospi28); + y = _mm_mullo_epi32(u5, cospim36); + v5 = _mm_add_epi32(x, y); + v5 = _mm_add_epi32(v5, rnding); + v5 = _mm_srai_epi32(v5, bit); + + x = _mm_mullo_epi32(u6, cospi52); + y = _mm_mullo_epi32(u7, cospi12); + v6 = _mm_add_epi32(x, y); + v6 = _mm_add_epi32(v6, rnding); + v6 = _mm_srai_epi32(v6, bit); + + x = _mm_mullo_epi32(u6, cospi12); + y = _mm_mullo_epi32(u7, cospim52); + v7 = _mm_add_epi32(x, y); + v7 = _mm_add_epi32(v7, rnding); + v7 = _mm_srai_epi32(v7, bit); - v[0] = _mm_mullo_epi32(v[6], cospi32); - x = _mm_mullo_epi32(v[7], cospi32); - u[6] = _mm_add_epi32(v[0], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_sub_epi32(v[0], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - // stage 7 - out[1] = u[0]; - out[3] = _mm_sub_epi32(kZero, u[4]); - out[5] = u[6]; - out[7] = _mm_sub_epi32(kZero, u[2]); - out[9] = u[3]; - out[11] = _mm_sub_epi32(kZero, u[7]); - out[13] = u[5]; - out[15] = _mm_sub_epi32(kZero, u[1]); + // stage 7 + out[2 * 0 + col] = v1; + out[2 * 1 + col] = v6; + out[2 * 2 + col] = v3; + out[2 * 3 + col] = v4; + out[2 * 4 + col] = v5; + out[2 * 5 + col] = v2; + out[2 * 6 + col] = v7; + out[2 * 7 + col] = v0; + } } void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { __m128i in[16], out[16]; - const TXFM_1D_CFG *row_cfg = NULL; - const TXFM_1D_CFG *col_cfg = NULL; + const int8_t *shift = fwd_txfm_shift_ls[TX_8X8]; + const int txw_idx = get_txw_idx(TX_8X8); + const int txh_idx = get_txh_idx(TX_8X8); switch (tx_type) { case DCT_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_8; - col_cfg = &fwd_txfm_1d_col_cfg_dct_8; - load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]); - fdct8x8_sse4_1(in, out, col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case ADST_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_8; - col_cfg = &fwd_txfm_1d_col_cfg_adst_8; - load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]); - fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 0, 0, 
shift[0]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case DCT_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_8; - col_cfg = &fwd_txfm_1d_col_cfg_dct_8; - load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]); - fdct8x8_sse4_1(in, out, col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case ADST_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_8; - col_cfg = &fwd_txfm_1d_col_cfg_adst_8; - load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]); - fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; -#if CONFIG_EXT_TX case FLIPADST_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_8; - col_cfg = &fwd_txfm_1d_col_cfg_adst_8; - load_buffer_8x8(input, in, stride, 1, 0, row_cfg->shift[0]); - fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 1, 0, shift[0]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case DCT_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_8; - col_cfg = &fwd_txfm_1d_col_cfg_dct_8; - load_buffer_8x8(input, in, stride, 0, 1, row_cfg->shift[0]); - fdct8x8_sse4_1(in, out, col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 0, 1, shift[0]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case FLIPADST_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_8; - col_cfg = &fwd_txfm_1d_col_cfg_adst_8; - load_buffer_8x8(input, in, stride, 1, 1, row_cfg->shift[0]); - fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 1, 1, shift[0]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case ADST_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_8; - col_cfg = &fwd_txfm_1d_col_cfg_adst_8; - load_buffer_8x8(input, in, stride, 0, 1, 
row_cfg->shift[0]); - fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 0, 1, shift[0]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case FLIPADST_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_8; - col_cfg = &fwd_txfm_1d_col_cfg_adst_8; - load_buffer_8x8(input, in, stride, 1, 0, row_cfg->shift[0]); - fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 1, 0, shift[0]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; -#endif // CONFIG_EXT_TX default: assert(0); } (void)bd; @@ -1402,230 +1228,174 @@ static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); const __m128i cospi2 = _mm_set1_epi32(cospi[2]); const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospim2 = _mm_set1_epi32(-cospi[2]); const __m128i cospi10 = _mm_set1_epi32(cospi[10]); const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospim10 = _mm_set1_epi32(-cospi[10]); const __m128i cospi18 = _mm_set1_epi32(cospi[18]); const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospim18 = _mm_set1_epi32(-cospi[18]); const __m128i cospi26 = _mm_set1_epi32(cospi[26]); const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospim26 = _mm_set1_epi32(-cospi[26]); const __m128i cospi34 = _mm_set1_epi32(cospi[34]); const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); const __m128i cospi42 = _mm_set1_epi32(cospi[42]); const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); const __m128i cospi50 = _mm_set1_epi32(cospi[50]); const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); const __m128i cospi58 = _mm_set1_epi32(cospi[58]); const __m128i cospi6 = _mm_set1_epi32(cospi[6]); - const __m128i cospi8 = _mm_set1_epi32(cospi[8]); - const __m128i cospi56 = _mm_set1_epi32(cospi[56]); - const __m128i cospi40 = _mm_set1_epi32(cospi[40]); - const __m128i cospi24 = _mm_set1_epi32(cospi[24]); - const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); - const __m128i cospim24 = 
_mm_set1_epi32(-cospi[24]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i zero = _mm_setzero_si128(); + __m128i u[16], v[16], x, y; - const int col_num = 4; int col; - // Calculate the column 0, 1, 2, 3 - for (col = 0; col < col_num; ++col) { + for (col = 0; col < 4; ++col) { // stage 0 // stage 1 - // stage 2 - v[0] = _mm_mullo_epi32(in[15 * col_num + col], cospi2); - x = _mm_mullo_epi32(in[0 * col_num + col], cospi62); - v[0] = _mm_add_epi32(v[0], x); - v[0] = _mm_add_epi32(v[0], rnding); - v[0] = _mm_srai_epi32(v[0], bit); + u[0] = in[0 * 4 + col]; + u[1] = _mm_sub_epi32(zero, in[15 * 4 + col]); + u[2] = _mm_sub_epi32(zero, in[7 * 4 + col]); + u[3] = in[8 * 4 + col]; + u[4] = _mm_sub_epi32(zero, in[3 * 4 + col]); + u[5] = in[12 * 4 + col]; + u[6] = in[4 * 4 + col]; + u[7] = _mm_sub_epi32(zero, in[11 * 4 + col]); + u[8] = _mm_sub_epi32(zero, in[1 * 4 + col]); + u[9] = in[14 * 4 + col]; + u[10] = in[6 * 4 + col]; + u[11] = _mm_sub_epi32(zero, in[9 * 4 + col]); + u[12] = in[2 * 4 + col]; + u[13] = _mm_sub_epi32(zero, in[13 * 4 + col]); + u[14] = _mm_sub_epi32(zero, in[5 * 4 + col]); + u[15] = in[10 * 4 + col]; - v[1] = _mm_mullo_epi32(in[15 * col_num + col], cospi62); - x = _mm_mullo_epi32(in[0 * col_num + col], cospi2); - v[1] = _mm_sub_epi32(v[1], x); - v[1] = _mm_add_epi32(v[1], rnding); - v[1] = _mm_srai_epi32(v[1], bit); + // stage 2 + v[0] = u[0]; + v[1] = u[1]; - v[2] = _mm_mullo_epi32(in[13 * col_num + col], cospi10); - x = _mm_mullo_epi32(in[2 * col_num + col], cospi54); - v[2] = _mm_add_epi32(v[2], x); + x = _mm_mullo_epi32(u[2], cospi32); + y = _mm_mullo_epi32(u[3], cospi32); + v[2] = _mm_add_epi32(x, y); v[2] = _mm_add_epi32(v[2], rnding); v[2] = _mm_srai_epi32(v[2], bit); - v[3] = _mm_mullo_epi32(in[13 * col_num + col], cospi54); - x = _mm_mullo_epi32(in[2 * col_num + col], cospi10); - v[3] = _mm_sub_epi32(v[3], x); + v[3] = _mm_sub_epi32(x, y); v[3] = _mm_add_epi32(v[3], rnding); v[3] = _mm_srai_epi32(v[3], bit); - v[4] = _mm_mullo_epi32(in[11 * col_num + col], cospi18); - x = _mm_mullo_epi32(in[4 * col_num + col], cospi46); - v[4] = _mm_add_epi32(v[4], x); - v[4] = _mm_add_epi32(v[4], rnding); - v[4] = _mm_srai_epi32(v[4], bit); - - v[5] = _mm_mullo_epi32(in[11 * col_num + col], cospi46); - x = _mm_mullo_epi32(in[4 * col_num + col], cospi18); - v[5] = _mm_sub_epi32(v[5], x); - v[5] = _mm_add_epi32(v[5], rnding); - v[5] = _mm_srai_epi32(v[5], bit); - - v[6] = _mm_mullo_epi32(in[9 * col_num + col], cospi26); - x = _mm_mullo_epi32(in[6 * col_num + col], cospi38); - v[6] = _mm_add_epi32(v[6], x); + v[4] = u[4]; + v[5] = u[5]; + + x = _mm_mullo_epi32(u[6], cospi32); + y = _mm_mullo_epi32(u[7], cospi32); + v[6] = _mm_add_epi32(x, y); v[6] = _mm_add_epi32(v[6], rnding); v[6] = _mm_srai_epi32(v[6], bit); - v[7] = _mm_mullo_epi32(in[9 * col_num + col], cospi38); - x = _mm_mullo_epi32(in[6 * col_num + col], cospi26); - v[7] = _mm_sub_epi32(v[7], x); + v[7] = _mm_sub_epi32(x, y); v[7] = _mm_add_epi32(v[7], rnding); v[7] = _mm_srai_epi32(v[7], bit); - v[8] = _mm_mullo_epi32(in[7 * col_num + col], cospi34); - x = _mm_mullo_epi32(in[8 * col_num + col], cospi30); - v[8] = _mm_add_epi32(v[8], x); - v[8] = _mm_add_epi32(v[8], rnding); - v[8] = _mm_srai_epi32(v[8], bit); - - v[9] = _mm_mullo_epi32(in[7 * 
col_num + col], cospi30); - x = _mm_mullo_epi32(in[8 * col_num + col], cospi34); - v[9] = _mm_sub_epi32(v[9], x); - v[9] = _mm_add_epi32(v[9], rnding); - v[9] = _mm_srai_epi32(v[9], bit); + v[8] = u[8]; + v[9] = u[9]; - v[10] = _mm_mullo_epi32(in[5 * col_num + col], cospi42); - x = _mm_mullo_epi32(in[10 * col_num + col], cospi22); - v[10] = _mm_add_epi32(v[10], x); + x = _mm_mullo_epi32(u[10], cospi32); + y = _mm_mullo_epi32(u[11], cospi32); + v[10] = _mm_add_epi32(x, y); v[10] = _mm_add_epi32(v[10], rnding); v[10] = _mm_srai_epi32(v[10], bit); - v[11] = _mm_mullo_epi32(in[5 * col_num + col], cospi22); - x = _mm_mullo_epi32(in[10 * col_num + col], cospi42); - v[11] = _mm_sub_epi32(v[11], x); + v[11] = _mm_sub_epi32(x, y); v[11] = _mm_add_epi32(v[11], rnding); v[11] = _mm_srai_epi32(v[11], bit); - v[12] = _mm_mullo_epi32(in[3 * col_num + col], cospi50); - x = _mm_mullo_epi32(in[12 * col_num + col], cospi14); - v[12] = _mm_add_epi32(v[12], x); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); - - v[13] = _mm_mullo_epi32(in[3 * col_num + col], cospi14); - x = _mm_mullo_epi32(in[12 * col_num + col], cospi50); - v[13] = _mm_sub_epi32(v[13], x); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); + v[12] = u[12]; + v[13] = u[13]; - v[14] = _mm_mullo_epi32(in[1 * col_num + col], cospi58); - x = _mm_mullo_epi32(in[14 * col_num + col], cospi6); - v[14] = _mm_add_epi32(v[14], x); + x = _mm_mullo_epi32(u[14], cospi32); + y = _mm_mullo_epi32(u[15], cospi32); + v[14] = _mm_add_epi32(x, y); v[14] = _mm_add_epi32(v[14], rnding); v[14] = _mm_srai_epi32(v[14], bit); - v[15] = _mm_mullo_epi32(in[1 * col_num + col], cospi6); - x = _mm_mullo_epi32(in[14 * col_num + col], cospi58); - v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_sub_epi32(x, y); v[15] = _mm_add_epi32(v[15], rnding); v[15] = _mm_srai_epi32(v[15], bit); // stage 3 - u[0] = _mm_add_epi32(v[0], v[8]); - u[8] = _mm_sub_epi32(v[0], v[8]); - u[1] = _mm_add_epi32(v[1], v[9]); - u[9] = _mm_sub_epi32(v[1], v[9]); - u[2] = _mm_add_epi32(v[2], v[10]); - u[10] = _mm_sub_epi32(v[2], v[10]); - u[3] = _mm_add_epi32(v[3], v[11]); - u[11] = _mm_sub_epi32(v[3], v[11]); - u[4] = _mm_add_epi32(v[4], v[12]); - u[12] = _mm_sub_epi32(v[4], v[12]); - u[5] = _mm_add_epi32(v[5], v[13]); - u[13] = _mm_sub_epi32(v[5], v[13]); - u[6] = _mm_add_epi32(v[6], v[14]); - u[14] = _mm_sub_epi32(v[6], v[14]); - u[7] = _mm_add_epi32(v[7], v[15]); - u[15] = _mm_sub_epi32(v[7], v[15]); + u[0] = _mm_add_epi32(v[0], v[2]); + u[1] = _mm_add_epi32(v[1], v[3]); + u[2] = _mm_sub_epi32(v[0], v[2]); + u[3] = _mm_sub_epi32(v[1], v[3]); + u[4] = _mm_add_epi32(v[4], v[6]); + u[5] = _mm_add_epi32(v[5], v[7]); + u[6] = _mm_sub_epi32(v[4], v[6]); + u[7] = _mm_sub_epi32(v[5], v[7]); + u[8] = _mm_add_epi32(v[8], v[10]); + u[9] = _mm_add_epi32(v[9], v[11]); + u[10] = _mm_sub_epi32(v[8], v[10]); + u[11] = _mm_sub_epi32(v[9], v[11]); + u[12] = _mm_add_epi32(v[12], v[14]); + u[13] = _mm_add_epi32(v[13], v[15]); + u[14] = _mm_sub_epi32(v[12], v[14]); + u[15] = _mm_sub_epi32(v[13], v[15]); // stage 4 v[0] = u[0]; v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; - v[4] = u[4]; - v[5] = u[5]; - v[6] = u[6]; - v[7] = u[7]; - - v[8] = _mm_mullo_epi32(u[8], cospi8); - x = _mm_mullo_epi32(u[9], cospi56); - v[8] = _mm_add_epi32(v[8], x); - v[8] = _mm_add_epi32(v[8], rnding); - v[8] = _mm_srai_epi32(v[8], bit); - - v[9] = _mm_mullo_epi32(u[8], cospi56); - x = _mm_mullo_epi32(u[9], cospi8); - v[9] = _mm_sub_epi32(v[9], x); - v[9] = _mm_add_epi32(v[9], rnding); - v[9] = 
_mm_srai_epi32(v[9], bit); - - v[10] = _mm_mullo_epi32(u[10], cospi40); - x = _mm_mullo_epi32(u[11], cospi24); - v[10] = _mm_add_epi32(v[10], x); - v[10] = _mm_add_epi32(v[10], rnding); - v[10] = _mm_srai_epi32(v[10], bit); - - v[11] = _mm_mullo_epi32(u[10], cospi24); - x = _mm_mullo_epi32(u[11], cospi40); - v[11] = _mm_sub_epi32(v[11], x); - v[11] = _mm_add_epi32(v[11], rnding); - v[11] = _mm_srai_epi32(v[11], bit); - - v[12] = _mm_mullo_epi32(u[12], cospim56); - x = _mm_mullo_epi32(u[13], cospi8); - v[12] = _mm_add_epi32(v[12], x); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); - - v[13] = _mm_mullo_epi32(u[12], cospi8); - x = _mm_mullo_epi32(u[13], cospim56); - v[13] = _mm_sub_epi32(v[13], x); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); - - v[14] = _mm_mullo_epi32(u[14], cospim24); - x = _mm_mullo_epi32(u[15], cospi40); - v[14] = _mm_add_epi32(v[14], x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); - - v[15] = _mm_mullo_epi32(u[14], cospi40); - x = _mm_mullo_epi32(u[15], cospim24); - v[15] = _mm_sub_epi32(v[15], x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); + v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit); + v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit); + v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit); + v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit); + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit); // stage 5 u[0] = _mm_add_epi32(v[0], v[4]); - u[4] = _mm_sub_epi32(v[0], v[4]); u[1] = _mm_add_epi32(v[1], v[5]); - u[5] = _mm_sub_epi32(v[1], v[5]); u[2] = _mm_add_epi32(v[2], v[6]); - u[6] = _mm_sub_epi32(v[2], v[6]); u[3] = _mm_add_epi32(v[3], v[7]); + u[4] = _mm_sub_epi32(v[0], v[4]); + u[5] = _mm_sub_epi32(v[1], v[5]); + u[6] = _mm_sub_epi32(v[2], v[6]); u[7] = _mm_sub_epi32(v[3], v[7]); u[8] = _mm_add_epi32(v[8], v[12]); - u[12] = _mm_sub_epi32(v[8], v[12]); u[9] = _mm_add_epi32(v[9], v[13]); - u[13] = _mm_sub_epi32(v[9], v[13]); u[10] = _mm_add_epi32(v[10], v[14]); - u[14] = _mm_sub_epi32(v[10], v[14]); u[11] = _mm_add_epi32(v[11], v[15]); + u[12] = _mm_sub_epi32(v[8], v[12]); + u[13] = _mm_sub_epi32(v[9], v[13]); + u[14] = _mm_sub_epi32(v[10], v[14]); u[15] = _mm_sub_epi32(v[11], v[15]); // stage 6 @@ -1633,148 +1403,72 @@ static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; - - v[4] = _mm_mullo_epi32(u[4], cospi16); - x = _mm_mullo_epi32(u[5], cospi48); - v[4] = _mm_add_epi32(v[4], x); - v[4] = _mm_add_epi32(v[4], rnding); - v[4] = _mm_srai_epi32(v[4], bit); - - v[5] = _mm_mullo_epi32(u[4], cospi48); - x = _mm_mullo_epi32(u[5], cospi16); - v[5] = _mm_sub_epi32(v[5], x); - v[5] = _mm_add_epi32(v[5], rnding); - v[5] = _mm_srai_epi32(v[5], bit); - - v[6] = _mm_mullo_epi32(u[6], cospim48); - x = _mm_mullo_epi32(u[7], cospi16); - v[6] = _mm_add_epi32(v[6], x); - v[6] = _mm_add_epi32(v[6], rnding); - v[6] = _mm_srai_epi32(v[6], bit); - - v[7] = _mm_mullo_epi32(u[6], cospi16); - x = _mm_mullo_epi32(u[7], cospim48); - v[7] = _mm_sub_epi32(v[7], x); - v[7] = _mm_add_epi32(v[7], rnding); - 
v[7] = _mm_srai_epi32(v[7], bit); - - v[8] = u[8]; - v[9] = u[9]; - v[10] = u[10]; - v[11] = u[11]; - - v[12] = _mm_mullo_epi32(u[12], cospi16); - x = _mm_mullo_epi32(u[13], cospi48); - v[12] = _mm_add_epi32(v[12], x); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); - - v[13] = _mm_mullo_epi32(u[12], cospi48); - x = _mm_mullo_epi32(u[13], cospi16); - v[13] = _mm_sub_epi32(v[13], x); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); - - v[14] = _mm_mullo_epi32(u[14], cospim48); - x = _mm_mullo_epi32(u[15], cospi16); - v[14] = _mm_add_epi32(v[14], x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); - - v[15] = _mm_mullo_epi32(u[14], cospi16); - x = _mm_mullo_epi32(u[15], cospim48); - v[15] = _mm_sub_epi32(v[15], x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); - - // stage 7 - u[0] = _mm_add_epi32(v[0], v[2]); - u[2] = _mm_sub_epi32(v[0], v[2]); - u[1] = _mm_add_epi32(v[1], v[3]); - u[3] = _mm_sub_epi32(v[1], v[3]); - u[4] = _mm_add_epi32(v[4], v[6]); - u[6] = _mm_sub_epi32(v[4], v[6]); - u[5] = _mm_add_epi32(v[5], v[7]); - u[7] = _mm_sub_epi32(v[5], v[7]); - u[8] = _mm_add_epi32(v[8], v[10]); - u[10] = _mm_sub_epi32(v[8], v[10]); - u[9] = _mm_add_epi32(v[9], v[11]); - u[11] = _mm_sub_epi32(v[9], v[11]); - u[12] = _mm_add_epi32(v[12], v[14]); - u[14] = _mm_sub_epi32(v[12], v[14]); - u[13] = _mm_add_epi32(v[13], v[15]); - u[15] = _mm_sub_epi32(v[13], v[15]); - - // stage 8 - v[0] = u[0]; - v[1] = u[1]; - - y = _mm_mullo_epi32(u[2], cospi32); - x = _mm_mullo_epi32(u[3], cospi32); - v[2] = _mm_add_epi32(y, x); - v[2] = _mm_add_epi32(v[2], rnding); - v[2] = _mm_srai_epi32(v[2], bit); - - v[3] = _mm_sub_epi32(y, x); - v[3] = _mm_add_epi32(v[3], rnding); - v[3] = _mm_srai_epi32(v[3], bit); - v[4] = u[4]; v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit); + v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit); - y = _mm_mullo_epi32(u[6], cospi32); - x = _mm_mullo_epi32(u[7], cospi32); - v[6] = _mm_add_epi32(y, x); - v[6] = _mm_add_epi32(v[6], rnding); - v[6] = _mm_srai_epi32(v[6], bit); - - v[7] = _mm_sub_epi32(y, x); - v[7] = _mm_add_epi32(v[7], rnding); - v[7] = _mm_srai_epi32(v[7], bit); - - v[8] = u[8]; - v[9] = u[9]; - - y = _mm_mullo_epi32(u[10], cospi32); - x = _mm_mullo_epi32(u[11], cospi32); - v[10] = _mm_add_epi32(y, x); - v[10] = _mm_add_epi32(v[10], rnding); - v[10] = _mm_srai_epi32(v[10], bit); - - v[11] = _mm_sub_epi32(y, x); - v[11] = _mm_add_epi32(v[11], rnding); - v[11] = _mm_srai_epi32(v[11], bit); - - v[12] = u[12]; - v[13] = u[13]; - - y = _mm_mullo_epi32(u[14], cospi32); - x = _mm_mullo_epi32(u[15], cospi32); - v[14] = _mm_add_epi32(y, x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); + // stage 7 + u[0] = _mm_add_epi32(v[0], v[8]); + u[1] = _mm_add_epi32(v[1], v[9]); + u[2] = _mm_add_epi32(v[2], v[10]); + u[3] = _mm_add_epi32(v[3], v[11]); + u[4] = _mm_add_epi32(v[4], v[12]); + u[5] = 
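
/*
 * Each half_btf_sse4_1() call introduced above is the four-lane version of
 * the same scalar butterfly, a minimal model of which (mirroring half_btf()
 * in libaom's C reference code) is:
 */
#include <stdint.h>

static int32_t half_btf_sketch(int32_t w0, int32_t in0, int32_t w1,
                               int32_t in1, int bit) {
  const int64_t result_64 = (int64_t)w0 * in0 + (int64_t)w1 * in1;
  return (int32_t)((result_64 + ((int64_t)1 << (bit - 1))) >> bit);
}

/*
 * So v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit)
 * computes cospi8 * u[8] + cospi56 * u[9], rounded and shifted right by
 * `bit`, once per 32-bit lane.
 */
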
_mm_add_epi32(v[5], v[13]); + u[6] = _mm_add_epi32(v[6], v[14]); + u[7] = _mm_add_epi32(v[7], v[15]); + u[8] = _mm_sub_epi32(v[0], v[8]); + u[9] = _mm_sub_epi32(v[1], v[9]); + u[10] = _mm_sub_epi32(v[2], v[10]); + u[11] = _mm_sub_epi32(v[3], v[11]); + u[12] = _mm_sub_epi32(v[4], v[12]); + u[13] = _mm_sub_epi32(v[5], v[13]); + u[14] = _mm_sub_epi32(v[6], v[14]); + u[15] = _mm_sub_epi32(v[7], v[15]); - v[15] = _mm_sub_epi32(y, x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); + // stage 8 + v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit); + v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit); + v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit); + v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit); + v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit); + v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit); + v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit); + v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit); + v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit); + v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit); // stage 9 - out[0 * col_num + col] = v[0]; - out[1 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[8]); - out[2 * col_num + col] = v[12]; - out[3 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[4]); - out[4 * col_num + col] = v[6]; - out[5 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[14]); - out[6 * col_num + col] = v[10]; - out[7 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[2]); - out[8 * col_num + col] = v[3]; - out[9 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[11]); - out[10 * col_num + col] = v[15]; - out[11 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[7]); - out[12 * col_num + col] = v[5]; - out[13 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[13]); - out[14 * col_num + col] = v[9]; - out[15 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[1]); + out[0 * 4 + col] = v[1]; + out[1 * 4 + col] = v[14]; + out[2 * 4 + col] = v[3]; + out[3 * 4 + col] = v[12]; + out[4 * 4 + col] = v[5]; + out[5 * 4 + col] = v[10]; + out[6 * 4 + col] = v[7]; + out[7 * 4 + col] = v[8]; + out[8 * 4 + col] = v[9]; + out[9 * 4 + col] = v[6]; + out[10 * 4 + col] = v[11]; + out[11 * 4 + col] = v[4]; + out[12 * 4 + col] = v[13]; + out[13 * 4 + col] = v[2]; + out[14 * 4 + col] = v[15]; + out[15 * 4 + col] = v[0]; } } @@ -1802,111 +1496,91 @@ static void write_buffer_16x16(const __m128i *in, int32_t *output) { void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { __m128i in[64], out[64]; - const TXFM_1D_CFG *row_cfg = NULL; - const TXFM_1D_CFG *col_cfg = NULL; - + const int8_t *shift = fwd_txfm_shift_ls[TX_16X16]; + const int txw_idx = get_txw_idx(TX_16X16); + const int txh_idx = get_txh_idx(TX_16X16); switch (tx_type) { case DCT_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_16; - col_cfg = 
&fwd_txfm_1d_col_cfg_dct_16; - load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]); - fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case ADST_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_16; - col_cfg = &fwd_txfm_1d_col_cfg_adst_16; - load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]); - fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case DCT_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_16; - col_cfg = &fwd_txfm_1d_col_cfg_dct_16; - load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]); - fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case ADST_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_16; - col_cfg = &fwd_txfm_1d_col_cfg_adst_16; - load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]); - fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; -#if CONFIG_EXT_TX case FLIPADST_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_16; - col_cfg = &fwd_txfm_1d_col_cfg_adst_16; - load_buffer_16x16(input, in, stride, 1, 0, row_cfg->shift[0]); - fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 1, 0, shift[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case DCT_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_16; - col_cfg = &fwd_txfm_1d_col_cfg_dct_16; - load_buffer_16x16(input, in, stride, 0, 1, row_cfg->shift[0]); - fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 0, 1, shift[0]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, 
-shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case FLIPADST_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_16; - col_cfg = &fwd_txfm_1d_col_cfg_adst_16; - load_buffer_16x16(input, in, stride, 1, 1, row_cfg->shift[0]); - fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 1, 1, shift[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case ADST_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_16; - col_cfg = &fwd_txfm_1d_col_cfg_adst_16; - load_buffer_16x16(input, in, stride, 0, 1, row_cfg->shift[0]); - fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 0, 1, shift[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case FLIPADST_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_16; - col_cfg = &fwd_txfm_1d_col_cfg_adst_16; - load_buffer_16x16(input, in, stride, 1, 0, row_cfg->shift[0]); - fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 1, 0, shift[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; -#endif // CONFIG_EXT_TX default: assert(0); } (void)bd; diff --git a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c deleted file mode 100644 index 88621c82b..000000000 --- a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c +++ /dev/null @@ -1,1627 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <immintrin.h> // avx2 - -#include "./av1_rtcd.h" -#include "./aom_dsp_rtcd.h" - -#include "aom_dsp/x86/fwd_txfm_avx2.h" -#include "aom_dsp/txfm_common.h" -#include "aom_dsp/x86/txfm_common_avx2.h" - -static INLINE void load_buffer_16x16(const int16_t *input, int stride, - int flipud, int fliplr, __m256i *in) { - if (!flipud) { - in[0] = _mm256_loadu_si256((const __m256i *)(input + 0 * stride)); - in[1] = _mm256_loadu_si256((const __m256i *)(input + 1 * stride)); - in[2] = _mm256_loadu_si256((const __m256i *)(input + 2 * stride)); - in[3] = _mm256_loadu_si256((const __m256i *)(input + 3 * stride)); - in[4] = _mm256_loadu_si256((const __m256i *)(input + 4 * stride)); - in[5] = _mm256_loadu_si256((const __m256i *)(input + 5 * stride)); - in[6] = _mm256_loadu_si256((const __m256i *)(input + 6 * stride)); - in[7] = _mm256_loadu_si256((const __m256i *)(input + 7 * stride)); - in[8] = _mm256_loadu_si256((const __m256i *)(input + 8 * stride)); - in[9] = _mm256_loadu_si256((const __m256i *)(input + 9 * stride)); - in[10] = _mm256_loadu_si256((const __m256i *)(input + 10 * stride)); - in[11] = _mm256_loadu_si256((const __m256i *)(input + 11 * stride)); - in[12] = _mm256_loadu_si256((const __m256i *)(input + 12 * stride)); - in[13] = _mm256_loadu_si256((const __m256i *)(input + 13 * stride)); - in[14] = _mm256_loadu_si256((const __m256i *)(input + 14 * stride)); - in[15] = _mm256_loadu_si256((const __m256i *)(input + 15 * stride)); - } else { - in[0] = _mm256_loadu_si256((const __m256i *)(input + 15 * stride)); - in[1] = _mm256_loadu_si256((const __m256i *)(input + 14 * stride)); - in[2] = _mm256_loadu_si256((const __m256i *)(input + 13 * stride)); - in[3] = _mm256_loadu_si256((const __m256i *)(input + 12 * stride)); - in[4] = _mm256_loadu_si256((const __m256i *)(input + 11 * stride)); - in[5] = _mm256_loadu_si256((const __m256i *)(input + 10 * stride)); - in[6] = _mm256_loadu_si256((const __m256i *)(input + 9 * stride)); - in[7] = _mm256_loadu_si256((const __m256i *)(input + 8 * stride)); - in[8] = _mm256_loadu_si256((const __m256i *)(input + 7 * stride)); - in[9] = _mm256_loadu_si256((const __m256i *)(input + 6 * stride)); - in[10] = _mm256_loadu_si256((const __m256i *)(input + 5 * stride)); - in[11] = _mm256_loadu_si256((const __m256i *)(input + 4 * stride)); - in[12] = _mm256_loadu_si256((const __m256i *)(input + 3 * stride)); - in[13] = _mm256_loadu_si256((const __m256i *)(input + 2 * stride)); - in[14] = _mm256_loadu_si256((const __m256i *)(input + 1 * stride)); - in[15] = _mm256_loadu_si256((const __m256i *)(input + 0 * stride)); - } - - if (fliplr) { - mm256_reverse_epi16(&in[0]); - mm256_reverse_epi16(&in[1]); - mm256_reverse_epi16(&in[2]); - mm256_reverse_epi16(&in[3]); - mm256_reverse_epi16(&in[4]); - mm256_reverse_epi16(&in[5]); - mm256_reverse_epi16(&in[6]); - mm256_reverse_epi16(&in[7]); - mm256_reverse_epi16(&in[8]); - mm256_reverse_epi16(&in[9]); - mm256_reverse_epi16(&in[10]); - mm256_reverse_epi16(&in[11]); - mm256_reverse_epi16(&in[12]); - mm256_reverse_epi16(&in[13]); - mm256_reverse_epi16(&in[14]); - mm256_reverse_epi16(&in[15]); - } - - in[0] = _mm256_slli_epi16(in[0], 2); - in[1] = _mm256_slli_epi16(in[1], 2); - in[2] = _mm256_slli_epi16(in[2], 2); - in[3] = _mm256_slli_epi16(in[3], 2); - in[4] = _mm256_slli_epi16(in[4], 2); - in[5] = _mm256_slli_epi16(in[5], 2); - in[6] = _mm256_slli_epi16(in[6], 2); - in[7] = _mm256_slli_epi16(in[7], 2); - in[8] = _mm256_slli_epi16(in[8], 2); - in[9] = _mm256_slli_epi16(in[9], 2); - in[10] = _mm256_slli_epi16(in[10], 2); - 
in[11] = _mm256_slli_epi16(in[11], 2); - in[12] = _mm256_slli_epi16(in[12], 2); - in[13] = _mm256_slli_epi16(in[13], 2); - in[14] = _mm256_slli_epi16(in[14], 2); - in[15] = _mm256_slli_epi16(in[15], 2); -} - -static INLINE void write_buffer_16x16(const __m256i *in, tran_low_t *output) { - int i; - for (i = 0; i < 16; ++i) { - storeu_output_avx2(&in[i], output + (i << 4)); - } -} - -static void right_shift_16x16(__m256i *in) { - const __m256i one = _mm256_set1_epi16(1); - __m256i s0 = _mm256_srai_epi16(in[0], 15); - __m256i s1 = _mm256_srai_epi16(in[1], 15); - __m256i s2 = _mm256_srai_epi16(in[2], 15); - __m256i s3 = _mm256_srai_epi16(in[3], 15); - __m256i s4 = _mm256_srai_epi16(in[4], 15); - __m256i s5 = _mm256_srai_epi16(in[5], 15); - __m256i s6 = _mm256_srai_epi16(in[6], 15); - __m256i s7 = _mm256_srai_epi16(in[7], 15); - __m256i s8 = _mm256_srai_epi16(in[8], 15); - __m256i s9 = _mm256_srai_epi16(in[9], 15); - __m256i s10 = _mm256_srai_epi16(in[10], 15); - __m256i s11 = _mm256_srai_epi16(in[11], 15); - __m256i s12 = _mm256_srai_epi16(in[12], 15); - __m256i s13 = _mm256_srai_epi16(in[13], 15); - __m256i s14 = _mm256_srai_epi16(in[14], 15); - __m256i s15 = _mm256_srai_epi16(in[15], 15); - - in[0] = _mm256_add_epi16(in[0], one); - in[1] = _mm256_add_epi16(in[1], one); - in[2] = _mm256_add_epi16(in[2], one); - in[3] = _mm256_add_epi16(in[3], one); - in[4] = _mm256_add_epi16(in[4], one); - in[5] = _mm256_add_epi16(in[5], one); - in[6] = _mm256_add_epi16(in[6], one); - in[7] = _mm256_add_epi16(in[7], one); - in[8] = _mm256_add_epi16(in[8], one); - in[9] = _mm256_add_epi16(in[9], one); - in[10] = _mm256_add_epi16(in[10], one); - in[11] = _mm256_add_epi16(in[11], one); - in[12] = _mm256_add_epi16(in[12], one); - in[13] = _mm256_add_epi16(in[13], one); - in[14] = _mm256_add_epi16(in[14], one); - in[15] = _mm256_add_epi16(in[15], one); - - in[0] = _mm256_sub_epi16(in[0], s0); - in[1] = _mm256_sub_epi16(in[1], s1); - in[2] = _mm256_sub_epi16(in[2], s2); - in[3] = _mm256_sub_epi16(in[3], s3); - in[4] = _mm256_sub_epi16(in[4], s4); - in[5] = _mm256_sub_epi16(in[5], s5); - in[6] = _mm256_sub_epi16(in[6], s6); - in[7] = _mm256_sub_epi16(in[7], s7); - in[8] = _mm256_sub_epi16(in[8], s8); - in[9] = _mm256_sub_epi16(in[9], s9); - in[10] = _mm256_sub_epi16(in[10], s10); - in[11] = _mm256_sub_epi16(in[11], s11); - in[12] = _mm256_sub_epi16(in[12], s12); - in[13] = _mm256_sub_epi16(in[13], s13); - in[14] = _mm256_sub_epi16(in[14], s14); - in[15] = _mm256_sub_epi16(in[15], s15); - - in[0] = _mm256_srai_epi16(in[0], 2); - in[1] = _mm256_srai_epi16(in[1], 2); - in[2] = _mm256_srai_epi16(in[2], 2); - in[3] = _mm256_srai_epi16(in[3], 2); - in[4] = _mm256_srai_epi16(in[4], 2); - in[5] = _mm256_srai_epi16(in[5], 2); - in[6] = _mm256_srai_epi16(in[6], 2); - in[7] = _mm256_srai_epi16(in[7], 2); - in[8] = _mm256_srai_epi16(in[8], 2); - in[9] = _mm256_srai_epi16(in[9], 2); - in[10] = _mm256_srai_epi16(in[10], 2); - in[11] = _mm256_srai_epi16(in[11], 2); - in[12] = _mm256_srai_epi16(in[12], 2); - in[13] = _mm256_srai_epi16(in[13], 2); - in[14] = _mm256_srai_epi16(in[14], 2); - in[15] = _mm256_srai_epi16(in[15], 2); -} - -static void fdct16_avx2(__m256i *in) { - // sequence: cospi_L_H = pairs(L, H) and L first - const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); - const __m256i cospi_p16_p16 = pair256_set_epi16(cospi_16_64, cospi_16_64); - const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64); - const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); - const 
__m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m256i cospi_p28_p04 = pair256_set_epi16(cospi_28_64, cospi_4_64); - const __m256i cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64); - const __m256i cospi_p12_p20 = pair256_set_epi16(cospi_12_64, cospi_20_64); - const __m256i cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64); - - const __m256i cospi_p30_p02 = pair256_set_epi16(cospi_30_64, cospi_2_64); - const __m256i cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64); - - const __m256i cospi_p14_p18 = pair256_set_epi16(cospi_14_64, cospi_18_64); - const __m256i cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64); - - const __m256i cospi_p22_p10 = pair256_set_epi16(cospi_22_64, cospi_10_64); - const __m256i cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64); - - const __m256i cospi_p06_p26 = pair256_set_epi16(cospi_6_64, cospi_26_64); - const __m256i cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64); - - __m256i u0, u1, u2, u3, u4, u5, u6, u7; - __m256i s0, s1, s2, s3, s4, s5, s6, s7; - __m256i t0, t1, t2, t3, t4, t5, t6, t7; - __m256i v0, v1, v2, v3; - __m256i x0, x1; - - // 0, 4, 8, 12 - u0 = _mm256_add_epi16(in[0], in[15]); - u1 = _mm256_add_epi16(in[1], in[14]); - u2 = _mm256_add_epi16(in[2], in[13]); - u3 = _mm256_add_epi16(in[3], in[12]); - u4 = _mm256_add_epi16(in[4], in[11]); - u5 = _mm256_add_epi16(in[5], in[10]); - u6 = _mm256_add_epi16(in[6], in[9]); - u7 = _mm256_add_epi16(in[7], in[8]); - - s0 = _mm256_add_epi16(u0, u7); - s1 = _mm256_add_epi16(u1, u6); - s2 = _mm256_add_epi16(u2, u5); - s3 = _mm256_add_epi16(u3, u4); - - // 0, 8 - v0 = _mm256_add_epi16(s0, s3); - v1 = _mm256_add_epi16(s1, s2); - - x0 = _mm256_unpacklo_epi16(v0, v1); - x1 = _mm256_unpackhi_epi16(v0, v1); - - t0 = butter_fly(&x0, &x1, &cospi_p16_p16); - t1 = butter_fly(&x0, &x1, &cospi_p16_m16); - - // 4, 12 - v0 = _mm256_sub_epi16(s1, s2); - v1 = _mm256_sub_epi16(s0, s3); - - x0 = _mm256_unpacklo_epi16(v0, v1); - x1 = _mm256_unpackhi_epi16(v0, v1); - - t2 = butter_fly(&x0, &x1, &cospi_p24_p08); - t3 = butter_fly(&x0, &x1, &cospi_m08_p24); - - // 2, 6, 10, 14 - s0 = _mm256_sub_epi16(u3, u4); - s1 = _mm256_sub_epi16(u2, u5); - s2 = _mm256_sub_epi16(u1, u6); - s3 = _mm256_sub_epi16(u0, u7); - - v0 = s0; // output[4] - v3 = s3; // output[7] - - x0 = _mm256_unpacklo_epi16(s2, s1); - x1 = _mm256_unpackhi_epi16(s2, s1); - - v2 = butter_fly(&x0, &x1, &cospi_p16_p16); // output[5] - v1 = butter_fly(&x0, &x1, &cospi_p16_m16); // output[6] - - s0 = _mm256_add_epi16(v0, v1); // step[4] - s1 = _mm256_sub_epi16(v0, v1); // step[5] - s2 = _mm256_sub_epi16(v3, v2); // step[6] - s3 = _mm256_add_epi16(v3, v2); // step[7] - - // 2, 14 - x0 = _mm256_unpacklo_epi16(s0, s3); - x1 = _mm256_unpackhi_epi16(s0, s3); - - t4 = butter_fly(&x0, &x1, &cospi_p28_p04); - t5 = butter_fly(&x0, &x1, &cospi_m04_p28); - - // 10, 6 - x0 = _mm256_unpacklo_epi16(s1, s2); - x1 = _mm256_unpackhi_epi16(s1, s2); - t6 = butter_fly(&x0, &x1, &cospi_p12_p20); - t7 = butter_fly(&x0, &x1, &cospi_m20_p12); - - // 1, 3, 5, 7, 9, 11, 13, 15 - s0 = _mm256_sub_epi16(in[7], in[8]); // step[8] - s1 = _mm256_sub_epi16(in[6], in[9]); // step[9] - u2 = _mm256_sub_epi16(in[5], in[10]); - u3 = _mm256_sub_epi16(in[4], in[11]); - u4 = _mm256_sub_epi16(in[3], in[12]); - u5 = _mm256_sub_epi16(in[2], in[13]); - s6 = _mm256_sub_epi16(in[1], in[14]); // step[14] - s7 = _mm256_sub_epi16(in[0], in[15]); // step[15] - - in[0] = t0; - in[8] = t1; - in[4] = t2; - in[12] = t3; - in[2] = t4; - 
in[14] = t5; - in[10] = t6; - in[6] = t7; - - x0 = _mm256_unpacklo_epi16(u5, u2); - x1 = _mm256_unpackhi_epi16(u5, u2); - - s2 = butter_fly(&x0, &x1, &cospi_p16_p16); // step[13] - s5 = butter_fly(&x0, &x1, &cospi_p16_m16); // step[10] - - x0 = _mm256_unpacklo_epi16(u4, u3); - x1 = _mm256_unpackhi_epi16(u4, u3); - - s3 = butter_fly(&x0, &x1, &cospi_p16_p16); // step[12] - s4 = butter_fly(&x0, &x1, &cospi_p16_m16); // step[11] - - u0 = _mm256_add_epi16(s0, s4); // output[8] - u1 = _mm256_add_epi16(s1, s5); - u2 = _mm256_sub_epi16(s1, s5); - u3 = _mm256_sub_epi16(s0, s4); - u4 = _mm256_sub_epi16(s7, s3); - u5 = _mm256_sub_epi16(s6, s2); - u6 = _mm256_add_epi16(s6, s2); - u7 = _mm256_add_epi16(s7, s3); - - // stage 4 - s0 = u0; - s3 = u3; - s4 = u4; - s7 = u7; - - x0 = _mm256_unpacklo_epi16(u1, u6); - x1 = _mm256_unpackhi_epi16(u1, u6); - - s1 = butter_fly(&x0, &x1, &cospi_m08_p24); - s6 = butter_fly(&x0, &x1, &cospi_p24_p08); - - x0 = _mm256_unpacklo_epi16(u2, u5); - x1 = _mm256_unpackhi_epi16(u2, u5); - - s2 = butter_fly(&x0, &x1, &cospi_m24_m08); - s5 = butter_fly(&x0, &x1, &cospi_m08_p24); - - // stage 5 - u0 = _mm256_add_epi16(s0, s1); - u1 = _mm256_sub_epi16(s0, s1); - u2 = _mm256_sub_epi16(s3, s2); - u3 = _mm256_add_epi16(s3, s2); - u4 = _mm256_add_epi16(s4, s5); - u5 = _mm256_sub_epi16(s4, s5); - u6 = _mm256_sub_epi16(s7, s6); - u7 = _mm256_add_epi16(s7, s6); - - // stage 6 - x0 = _mm256_unpacklo_epi16(u0, u7); - x1 = _mm256_unpackhi_epi16(u0, u7); - in[1] = butter_fly(&x0, &x1, &cospi_p30_p02); - in[15] = butter_fly(&x0, &x1, &cospi_m02_p30); - - x0 = _mm256_unpacklo_epi16(u1, u6); - x1 = _mm256_unpackhi_epi16(u1, u6); - in[9] = butter_fly(&x0, &x1, &cospi_p14_p18); - in[7] = butter_fly(&x0, &x1, &cospi_m18_p14); - - x0 = _mm256_unpacklo_epi16(u2, u5); - x1 = _mm256_unpackhi_epi16(u2, u5); - in[5] = butter_fly(&x0, &x1, &cospi_p22_p10); - in[11] = butter_fly(&x0, &x1, &cospi_m10_p22); - - x0 = _mm256_unpacklo_epi16(u3, u4); - x1 = _mm256_unpackhi_epi16(u3, u4); - in[13] = butter_fly(&x0, &x1, &cospi_p06_p26); - in[3] = butter_fly(&x0, &x1, &cospi_m26_p06); -} - -void fadst16_avx2(__m256i *in) { - const __m256i cospi_p01_p31 = pair256_set_epi16(cospi_1_64, cospi_31_64); - const __m256i cospi_p31_m01 = pair256_set_epi16(cospi_31_64, -cospi_1_64); - const __m256i cospi_p05_p27 = pair256_set_epi16(cospi_5_64, cospi_27_64); - const __m256i cospi_p27_m05 = pair256_set_epi16(cospi_27_64, -cospi_5_64); - const __m256i cospi_p09_p23 = pair256_set_epi16(cospi_9_64, cospi_23_64); - const __m256i cospi_p23_m09 = pair256_set_epi16(cospi_23_64, -cospi_9_64); - const __m256i cospi_p13_p19 = pair256_set_epi16(cospi_13_64, cospi_19_64); - const __m256i cospi_p19_m13 = pair256_set_epi16(cospi_19_64, -cospi_13_64); - const __m256i cospi_p17_p15 = pair256_set_epi16(cospi_17_64, cospi_15_64); - const __m256i cospi_p15_m17 = pair256_set_epi16(cospi_15_64, -cospi_17_64); - const __m256i cospi_p21_p11 = pair256_set_epi16(cospi_21_64, cospi_11_64); - const __m256i cospi_p11_m21 = pair256_set_epi16(cospi_11_64, -cospi_21_64); - const __m256i cospi_p25_p07 = pair256_set_epi16(cospi_25_64, cospi_7_64); - const __m256i cospi_p07_m25 = pair256_set_epi16(cospi_7_64, -cospi_25_64); - const __m256i cospi_p29_p03 = pair256_set_epi16(cospi_29_64, cospi_3_64); - const __m256i cospi_p03_m29 = pair256_set_epi16(cospi_3_64, -cospi_29_64); - const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64); - const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64); - const __m256i cospi_p20_p12 = 
pair256_set_epi16(cospi_20_64, cospi_12_64); - const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64); - const __m256i cospi_m28_p04 = pair256_set_epi16(-cospi_28_64, cospi_4_64); - const __m256i cospi_m12_p20 = pair256_set_epi16(-cospi_12_64, cospi_20_64); - const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64); - const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64); - const __m256i cospi_m24_p08 = pair256_set_epi16(-cospi_24_64, cospi_8_64); - const __m256i cospi_m16_m16 = _mm256_set1_epi16((int16_t)-cospi_16_64); - const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); - const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); - const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64); - const __m256i zero = _mm256_setzero_si256(); - const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); - __m256i s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; - __m256i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - __m256i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; - __m256i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; - __m256i y0, y1; - - // stage 1, s takes low 256 bits; x takes high 256 bits - y0 = _mm256_unpacklo_epi16(in[15], in[0]); - y1 = _mm256_unpackhi_epi16(in[15], in[0]); - s0 = _mm256_madd_epi16(y0, cospi_p01_p31); - x0 = _mm256_madd_epi16(y1, cospi_p01_p31); - s1 = _mm256_madd_epi16(y0, cospi_p31_m01); - x1 = _mm256_madd_epi16(y1, cospi_p31_m01); - - y0 = _mm256_unpacklo_epi16(in[13], in[2]); - y1 = _mm256_unpackhi_epi16(in[13], in[2]); - s2 = _mm256_madd_epi16(y0, cospi_p05_p27); - x2 = _mm256_madd_epi16(y1, cospi_p05_p27); - s3 = _mm256_madd_epi16(y0, cospi_p27_m05); - x3 = _mm256_madd_epi16(y1, cospi_p27_m05); - - y0 = _mm256_unpacklo_epi16(in[11], in[4]); - y1 = _mm256_unpackhi_epi16(in[11], in[4]); - s4 = _mm256_madd_epi16(y0, cospi_p09_p23); - x4 = _mm256_madd_epi16(y1, cospi_p09_p23); - s5 = _mm256_madd_epi16(y0, cospi_p23_m09); - x5 = _mm256_madd_epi16(y1, cospi_p23_m09); - - y0 = _mm256_unpacklo_epi16(in[9], in[6]); - y1 = _mm256_unpackhi_epi16(in[9], in[6]); - s6 = _mm256_madd_epi16(y0, cospi_p13_p19); - x6 = _mm256_madd_epi16(y1, cospi_p13_p19); - s7 = _mm256_madd_epi16(y0, cospi_p19_m13); - x7 = _mm256_madd_epi16(y1, cospi_p19_m13); - - y0 = _mm256_unpacklo_epi16(in[7], in[8]); - y1 = _mm256_unpackhi_epi16(in[7], in[8]); - s8 = _mm256_madd_epi16(y0, cospi_p17_p15); - x8 = _mm256_madd_epi16(y1, cospi_p17_p15); - s9 = _mm256_madd_epi16(y0, cospi_p15_m17); - x9 = _mm256_madd_epi16(y1, cospi_p15_m17); - - y0 = _mm256_unpacklo_epi16(in[5], in[10]); - y1 = _mm256_unpackhi_epi16(in[5], in[10]); - s10 = _mm256_madd_epi16(y0, cospi_p21_p11); - x10 = _mm256_madd_epi16(y1, cospi_p21_p11); - s11 = _mm256_madd_epi16(y0, cospi_p11_m21); - x11 = _mm256_madd_epi16(y1, cospi_p11_m21); - - y0 = _mm256_unpacklo_epi16(in[3], in[12]); - y1 = _mm256_unpackhi_epi16(in[3], in[12]); - s12 = _mm256_madd_epi16(y0, cospi_p25_p07); - x12 = _mm256_madd_epi16(y1, cospi_p25_p07); - s13 = _mm256_madd_epi16(y0, cospi_p07_m25); - x13 = _mm256_madd_epi16(y1, cospi_p07_m25); - - y0 = _mm256_unpacklo_epi16(in[1], in[14]); - y1 = _mm256_unpackhi_epi16(in[1], in[14]); - s14 = _mm256_madd_epi16(y0, cospi_p29_p03); - x14 = _mm256_madd_epi16(y1, cospi_p29_p03); - s15 = _mm256_madd_epi16(y0, cospi_p03_m29); - x15 = _mm256_madd_epi16(y1, cospi_p03_m29); - - // u takes low 256 bits; v takes high 256 
bits - u0 = _mm256_add_epi32(s0, s8); - u1 = _mm256_add_epi32(s1, s9); - u2 = _mm256_add_epi32(s2, s10); - u3 = _mm256_add_epi32(s3, s11); - u4 = _mm256_add_epi32(s4, s12); - u5 = _mm256_add_epi32(s5, s13); - u6 = _mm256_add_epi32(s6, s14); - u7 = _mm256_add_epi32(s7, s15); - - u8 = _mm256_sub_epi32(s0, s8); - u9 = _mm256_sub_epi32(s1, s9); - u10 = _mm256_sub_epi32(s2, s10); - u11 = _mm256_sub_epi32(s3, s11); - u12 = _mm256_sub_epi32(s4, s12); - u13 = _mm256_sub_epi32(s5, s13); - u14 = _mm256_sub_epi32(s6, s14); - u15 = _mm256_sub_epi32(s7, s15); - - v0 = _mm256_add_epi32(x0, x8); - v1 = _mm256_add_epi32(x1, x9); - v2 = _mm256_add_epi32(x2, x10); - v3 = _mm256_add_epi32(x3, x11); - v4 = _mm256_add_epi32(x4, x12); - v5 = _mm256_add_epi32(x5, x13); - v6 = _mm256_add_epi32(x6, x14); - v7 = _mm256_add_epi32(x7, x15); - - v8 = _mm256_sub_epi32(x0, x8); - v9 = _mm256_sub_epi32(x1, x9); - v10 = _mm256_sub_epi32(x2, x10); - v11 = _mm256_sub_epi32(x3, x11); - v12 = _mm256_sub_epi32(x4, x12); - v13 = _mm256_sub_epi32(x5, x13); - v14 = _mm256_sub_epi32(x6, x14); - v15 = _mm256_sub_epi32(x7, x15); - - // low 256 bits rounding - u8 = _mm256_add_epi32(u8, dct_rounding); - u9 = _mm256_add_epi32(u9, dct_rounding); - u10 = _mm256_add_epi32(u10, dct_rounding); - u11 = _mm256_add_epi32(u11, dct_rounding); - u12 = _mm256_add_epi32(u12, dct_rounding); - u13 = _mm256_add_epi32(u13, dct_rounding); - u14 = _mm256_add_epi32(u14, dct_rounding); - u15 = _mm256_add_epi32(u15, dct_rounding); - - u8 = _mm256_srai_epi32(u8, DCT_CONST_BITS); - u9 = _mm256_srai_epi32(u9, DCT_CONST_BITS); - u10 = _mm256_srai_epi32(u10, DCT_CONST_BITS); - u11 = _mm256_srai_epi32(u11, DCT_CONST_BITS); - u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS); - u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS); - u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS); - u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS); - - // high 256 bits rounding - v8 = _mm256_add_epi32(v8, dct_rounding); - v9 = _mm256_add_epi32(v9, dct_rounding); - v10 = _mm256_add_epi32(v10, dct_rounding); - v11 = _mm256_add_epi32(v11, dct_rounding); - v12 = _mm256_add_epi32(v12, dct_rounding); - v13 = _mm256_add_epi32(v13, dct_rounding); - v14 = _mm256_add_epi32(v14, dct_rounding); - v15 = _mm256_add_epi32(v15, dct_rounding); - - v8 = _mm256_srai_epi32(v8, DCT_CONST_BITS); - v9 = _mm256_srai_epi32(v9, DCT_CONST_BITS); - v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS); - v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS); - v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS); - v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS); - v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS); - v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS); - - // Saturation pack 32-bit to 16-bit - x8 = _mm256_packs_epi32(u8, v8); - x9 = _mm256_packs_epi32(u9, v9); - x10 = _mm256_packs_epi32(u10, v10); - x11 = _mm256_packs_epi32(u11, v11); - x12 = _mm256_packs_epi32(u12, v12); - x13 = _mm256_packs_epi32(u13, v13); - x14 = _mm256_packs_epi32(u14, v14); - x15 = _mm256_packs_epi32(u15, v15); - - // stage 2 - y0 = _mm256_unpacklo_epi16(x8, x9); - y1 = _mm256_unpackhi_epi16(x8, x9); - s8 = _mm256_madd_epi16(y0, cospi_p04_p28); - x8 = _mm256_madd_epi16(y1, cospi_p04_p28); - s9 = _mm256_madd_epi16(y0, cospi_p28_m04); - x9 = _mm256_madd_epi16(y1, cospi_p28_m04); - - y0 = _mm256_unpacklo_epi16(x10, x11); - y1 = _mm256_unpackhi_epi16(x10, x11); - s10 = _mm256_madd_epi16(y0, cospi_p20_p12); - x10 = _mm256_madd_epi16(y1, cospi_p20_p12); - s11 = _mm256_madd_epi16(y0, cospi_p12_m20); - x11 = _mm256_madd_epi16(y1, cospi_p12_m20); - - y0 = _mm256_unpacklo_epi16(x12, x13); 
- y1 = _mm256_unpackhi_epi16(x12, x13); - s12 = _mm256_madd_epi16(y0, cospi_m28_p04); - x12 = _mm256_madd_epi16(y1, cospi_m28_p04); - s13 = _mm256_madd_epi16(y0, cospi_p04_p28); - x13 = _mm256_madd_epi16(y1, cospi_p04_p28); - - y0 = _mm256_unpacklo_epi16(x14, x15); - y1 = _mm256_unpackhi_epi16(x14, x15); - s14 = _mm256_madd_epi16(y0, cospi_m12_p20); - x14 = _mm256_madd_epi16(y1, cospi_m12_p20); - s15 = _mm256_madd_epi16(y0, cospi_p20_p12); - x15 = _mm256_madd_epi16(y1, cospi_p20_p12); - - x0 = _mm256_add_epi32(u0, u4); - s0 = _mm256_add_epi32(v0, v4); - x1 = _mm256_add_epi32(u1, u5); - s1 = _mm256_add_epi32(v1, v5); - x2 = _mm256_add_epi32(u2, u6); - s2 = _mm256_add_epi32(v2, v6); - x3 = _mm256_add_epi32(u3, u7); - s3 = _mm256_add_epi32(v3, v7); - - v8 = _mm256_sub_epi32(u0, u4); - v9 = _mm256_sub_epi32(v0, v4); - v10 = _mm256_sub_epi32(u1, u5); - v11 = _mm256_sub_epi32(v1, v5); - v12 = _mm256_sub_epi32(u2, u6); - v13 = _mm256_sub_epi32(v2, v6); - v14 = _mm256_sub_epi32(u3, u7); - v15 = _mm256_sub_epi32(v3, v7); - - v8 = _mm256_add_epi32(v8, dct_rounding); - v9 = _mm256_add_epi32(v9, dct_rounding); - v10 = _mm256_add_epi32(v10, dct_rounding); - v11 = _mm256_add_epi32(v11, dct_rounding); - v12 = _mm256_add_epi32(v12, dct_rounding); - v13 = _mm256_add_epi32(v13, dct_rounding); - v14 = _mm256_add_epi32(v14, dct_rounding); - v15 = _mm256_add_epi32(v15, dct_rounding); - - v8 = _mm256_srai_epi32(v8, DCT_CONST_BITS); - v9 = _mm256_srai_epi32(v9, DCT_CONST_BITS); - v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS); - v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS); - v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS); - v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS); - v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS); - v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS); - - x4 = _mm256_packs_epi32(v8, v9); - x5 = _mm256_packs_epi32(v10, v11); - x6 = _mm256_packs_epi32(v12, v13); - x7 = _mm256_packs_epi32(v14, v15); - - u8 = _mm256_add_epi32(s8, s12); - u9 = _mm256_add_epi32(s9, s13); - u10 = _mm256_add_epi32(s10, s14); - u11 = _mm256_add_epi32(s11, s15); - u12 = _mm256_sub_epi32(s8, s12); - u13 = _mm256_sub_epi32(s9, s13); - u14 = _mm256_sub_epi32(s10, s14); - u15 = _mm256_sub_epi32(s11, s15); - - v8 = _mm256_add_epi32(x8, x12); - v9 = _mm256_add_epi32(x9, x13); - v10 = _mm256_add_epi32(x10, x14); - v11 = _mm256_add_epi32(x11, x15); - v12 = _mm256_sub_epi32(x8, x12); - v13 = _mm256_sub_epi32(x9, x13); - v14 = _mm256_sub_epi32(x10, x14); - v15 = _mm256_sub_epi32(x11, x15); - - u12 = _mm256_add_epi32(u12, dct_rounding); - u13 = _mm256_add_epi32(u13, dct_rounding); - u14 = _mm256_add_epi32(u14, dct_rounding); - u15 = _mm256_add_epi32(u15, dct_rounding); - - u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS); - u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS); - u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS); - u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS); - - v12 = _mm256_add_epi32(v12, dct_rounding); - v13 = _mm256_add_epi32(v13, dct_rounding); - v14 = _mm256_add_epi32(v14, dct_rounding); - v15 = _mm256_add_epi32(v15, dct_rounding); - - v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS); - v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS); - v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS); - v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS); - - x12 = _mm256_packs_epi32(u12, v12); - x13 = _mm256_packs_epi32(u13, v13); - x14 = _mm256_packs_epi32(u14, v14); - x15 = _mm256_packs_epi32(u15, v15); - - // stage 3 - y0 = _mm256_unpacklo_epi16(x4, x5); - y1 = _mm256_unpackhi_epi16(x4, x5); - s4 = _mm256_madd_epi16(y0, cospi_p08_p24); - x4 = _mm256_madd_epi16(y1, 
cospi_p08_p24); - s5 = _mm256_madd_epi16(y0, cospi_p24_m08); - x5 = _mm256_madd_epi16(y1, cospi_p24_m08); - - y0 = _mm256_unpacklo_epi16(x6, x7); - y1 = _mm256_unpackhi_epi16(x6, x7); - s6 = _mm256_madd_epi16(y0, cospi_m24_p08); - x6 = _mm256_madd_epi16(y1, cospi_m24_p08); - s7 = _mm256_madd_epi16(y0, cospi_p08_p24); - x7 = _mm256_madd_epi16(y1, cospi_p08_p24); - - y0 = _mm256_unpacklo_epi16(x12, x13); - y1 = _mm256_unpackhi_epi16(x12, x13); - s12 = _mm256_madd_epi16(y0, cospi_p08_p24); - x12 = _mm256_madd_epi16(y1, cospi_p08_p24); - s13 = _mm256_madd_epi16(y0, cospi_p24_m08); - x13 = _mm256_madd_epi16(y1, cospi_p24_m08); - - y0 = _mm256_unpacklo_epi16(x14, x15); - y1 = _mm256_unpackhi_epi16(x14, x15); - s14 = _mm256_madd_epi16(y0, cospi_m24_p08); - x14 = _mm256_madd_epi16(y1, cospi_m24_p08); - s15 = _mm256_madd_epi16(y0, cospi_p08_p24); - x15 = _mm256_madd_epi16(y1, cospi_p08_p24); - - u0 = _mm256_add_epi32(x0, x2); - v0 = _mm256_add_epi32(s0, s2); - u1 = _mm256_add_epi32(x1, x3); - v1 = _mm256_add_epi32(s1, s3); - u2 = _mm256_sub_epi32(x0, x2); - v2 = _mm256_sub_epi32(s0, s2); - u3 = _mm256_sub_epi32(x1, x3); - v3 = _mm256_sub_epi32(s1, s3); - - u0 = _mm256_add_epi32(u0, dct_rounding); - v0 = _mm256_add_epi32(v0, dct_rounding); - u1 = _mm256_add_epi32(u1, dct_rounding); - v1 = _mm256_add_epi32(v1, dct_rounding); - u2 = _mm256_add_epi32(u2, dct_rounding); - v2 = _mm256_add_epi32(v2, dct_rounding); - u3 = _mm256_add_epi32(u3, dct_rounding); - v3 = _mm256_add_epi32(v3, dct_rounding); - - u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS); - v0 = _mm256_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS); - v1 = _mm256_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS); - v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS); - v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS); - - in[0] = _mm256_packs_epi32(u0, v0); - x1 = _mm256_packs_epi32(u1, v1); - x2 = _mm256_packs_epi32(u2, v2); - x3 = _mm256_packs_epi32(u3, v3); - - // Rounding on s4 + s6, s5 + s7, s4 - s6, s5 - s7 - u4 = _mm256_add_epi32(s4, s6); - u5 = _mm256_add_epi32(s5, s7); - u6 = _mm256_sub_epi32(s4, s6); - u7 = _mm256_sub_epi32(s5, s7); - - v4 = _mm256_add_epi32(x4, x6); - v5 = _mm256_add_epi32(x5, x7); - v6 = _mm256_sub_epi32(x4, x6); - v7 = _mm256_sub_epi32(x5, x7); - - u4 = _mm256_add_epi32(u4, dct_rounding); - u5 = _mm256_add_epi32(u5, dct_rounding); - u6 = _mm256_add_epi32(u6, dct_rounding); - u7 = _mm256_add_epi32(u7, dct_rounding); - - u4 = _mm256_srai_epi32(u4, DCT_CONST_BITS); - u5 = _mm256_srai_epi32(u5, DCT_CONST_BITS); - u6 = _mm256_srai_epi32(u6, DCT_CONST_BITS); - u7 = _mm256_srai_epi32(u7, DCT_CONST_BITS); - - v4 = _mm256_add_epi32(v4, dct_rounding); - v5 = _mm256_add_epi32(v5, dct_rounding); - v6 = _mm256_add_epi32(v6, dct_rounding); - v7 = _mm256_add_epi32(v7, dct_rounding); - - v4 = _mm256_srai_epi32(v4, DCT_CONST_BITS); - v5 = _mm256_srai_epi32(v5, DCT_CONST_BITS); - v6 = _mm256_srai_epi32(v6, DCT_CONST_BITS); - v7 = _mm256_srai_epi32(v7, DCT_CONST_BITS); - - x4 = _mm256_packs_epi32(u4, v4); - in[12] = _mm256_packs_epi32(u5, v5); - x6 = _mm256_packs_epi32(u6, v6); - x7 = _mm256_packs_epi32(u7, v7); - - u0 = _mm256_add_epi32(u8, u10); - v0 = _mm256_add_epi32(v8, v10); - u1 = _mm256_add_epi32(u9, u11); - v1 = _mm256_add_epi32(v9, v11); - u2 = _mm256_sub_epi32(u8, u10); - v2 = _mm256_sub_epi32(v8, v10); - u3 = _mm256_sub_epi32(u9, u11); - v3 = _mm256_sub_epi32(v9, v11); - - u0 = _mm256_add_epi32(u0, dct_rounding); - v0 = 
_mm256_add_epi32(v0, dct_rounding); - u1 = _mm256_add_epi32(u1, dct_rounding); - v1 = _mm256_add_epi32(v1, dct_rounding); - u2 = _mm256_add_epi32(u2, dct_rounding); - v2 = _mm256_add_epi32(v2, dct_rounding); - u3 = _mm256_add_epi32(u3, dct_rounding); - v3 = _mm256_add_epi32(v3, dct_rounding); - - u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS); - v0 = _mm256_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS); - v1 = _mm256_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS); - v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS); - v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS); - - x8 = _mm256_packs_epi32(u0, v0); - in[14] = _mm256_packs_epi32(u1, v1); - x10 = _mm256_packs_epi32(u2, v2); - x11 = _mm256_packs_epi32(u3, v3); - - // Rounding on s12 + s14, s13 + s15, s12 - s14, s13 - s15 - u12 = _mm256_add_epi32(s12, s14); - u13 = _mm256_add_epi32(s13, s15); - u14 = _mm256_sub_epi32(s12, s14); - u15 = _mm256_sub_epi32(s13, s15); - - v12 = _mm256_add_epi32(x12, x14); - v13 = _mm256_add_epi32(x13, x15); - v14 = _mm256_sub_epi32(x12, x14); - v15 = _mm256_sub_epi32(x13, x15); - - u12 = _mm256_add_epi32(u12, dct_rounding); - u13 = _mm256_add_epi32(u13, dct_rounding); - u14 = _mm256_add_epi32(u14, dct_rounding); - u15 = _mm256_add_epi32(u15, dct_rounding); - - u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS); - u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS); - u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS); - u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS); - - v12 = _mm256_add_epi32(v12, dct_rounding); - v13 = _mm256_add_epi32(v13, dct_rounding); - v14 = _mm256_add_epi32(v14, dct_rounding); - v15 = _mm256_add_epi32(v15, dct_rounding); - - v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS); - v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS); - v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS); - v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS); - - x12 = _mm256_packs_epi32(u12, v12); - x13 = _mm256_packs_epi32(u13, v13); - x14 = _mm256_packs_epi32(u14, v14); - x15 = _mm256_packs_epi32(u15, v15); - in[2] = x12; - - // stage 4 - y0 = _mm256_unpacklo_epi16(x2, x3); - y1 = _mm256_unpackhi_epi16(x2, x3); - s2 = _mm256_madd_epi16(y0, cospi_m16_m16); - x2 = _mm256_madd_epi16(y1, cospi_m16_m16); - s3 = _mm256_madd_epi16(y0, cospi_p16_m16); - x3 = _mm256_madd_epi16(y1, cospi_p16_m16); - - y0 = _mm256_unpacklo_epi16(x6, x7); - y1 = _mm256_unpackhi_epi16(x6, x7); - s6 = _mm256_madd_epi16(y0, cospi_p16_p16); - x6 = _mm256_madd_epi16(y1, cospi_p16_p16); - s7 = _mm256_madd_epi16(y0, cospi_m16_p16); - x7 = _mm256_madd_epi16(y1, cospi_m16_p16); - - y0 = _mm256_unpacklo_epi16(x10, x11); - y1 = _mm256_unpackhi_epi16(x10, x11); - s10 = _mm256_madd_epi16(y0, cospi_p16_p16); - x10 = _mm256_madd_epi16(y1, cospi_p16_p16); - s11 = _mm256_madd_epi16(y0, cospi_m16_p16); - x11 = _mm256_madd_epi16(y1, cospi_m16_p16); - - y0 = _mm256_unpacklo_epi16(x14, x15); - y1 = _mm256_unpackhi_epi16(x14, x15); - s14 = _mm256_madd_epi16(y0, cospi_m16_m16); - x14 = _mm256_madd_epi16(y1, cospi_m16_m16); - s15 = _mm256_madd_epi16(y0, cospi_p16_m16); - x15 = _mm256_madd_epi16(y1, cospi_p16_m16); - - // Rounding - u2 = _mm256_add_epi32(s2, dct_rounding); - u3 = _mm256_add_epi32(s3, dct_rounding); - u6 = _mm256_add_epi32(s6, dct_rounding); - u7 = _mm256_add_epi32(s7, dct_rounding); - - u10 = _mm256_add_epi32(s10, dct_rounding); - u11 = _mm256_add_epi32(s11, dct_rounding); - u14 = _mm256_add_epi32(s14, dct_rounding); - u15 = _mm256_add_epi32(s15, dct_rounding); - - u2 = _mm256_srai_epi32(u2, 
DCT_CONST_BITS); - u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS); - u6 = _mm256_srai_epi32(u6, DCT_CONST_BITS); - u7 = _mm256_srai_epi32(u7, DCT_CONST_BITS); - - u10 = _mm256_srai_epi32(u10, DCT_CONST_BITS); - u11 = _mm256_srai_epi32(u11, DCT_CONST_BITS); - u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS); - u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS); - - v2 = _mm256_add_epi32(x2, dct_rounding); - v3 = _mm256_add_epi32(x3, dct_rounding); - v6 = _mm256_add_epi32(x6, dct_rounding); - v7 = _mm256_add_epi32(x7, dct_rounding); - - v10 = _mm256_add_epi32(x10, dct_rounding); - v11 = _mm256_add_epi32(x11, dct_rounding); - v14 = _mm256_add_epi32(x14, dct_rounding); - v15 = _mm256_add_epi32(x15, dct_rounding); - - v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS); - v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS); - v6 = _mm256_srai_epi32(v6, DCT_CONST_BITS); - v7 = _mm256_srai_epi32(v7, DCT_CONST_BITS); - - v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS); - v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS); - v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS); - v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS); - - in[7] = _mm256_packs_epi32(u2, v2); - in[8] = _mm256_packs_epi32(u3, v3); - - in[4] = _mm256_packs_epi32(u6, v6); - in[11] = _mm256_packs_epi32(u7, v7); - - in[6] = _mm256_packs_epi32(u10, v10); - in[9] = _mm256_packs_epi32(u11, v11); - - in[5] = _mm256_packs_epi32(u14, v14); - in[10] = _mm256_packs_epi32(u15, v15); - - in[1] = _mm256_sub_epi16(zero, x8); - in[3] = _mm256_sub_epi16(zero, x4); - in[13] = _mm256_sub_epi16(zero, x13); - in[15] = _mm256_sub_epi16(zero, x1); -} - -#if CONFIG_EXT_TX -static void fidtx16_avx2(__m256i *in) { - txfm_scaling16_avx2((int16_t)Sqrt2, in); -} -#endif - -void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m256i in[16]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - switch (tx_type) { - case DCT_DCT: - load_buffer_16x16(input, stride, 0, 0, in); - fdct16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fdct16_avx2(in); - break; - case ADST_DCT: - load_buffer_16x16(input, stride, 0, 0, in); - fadst16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fdct16_avx2(in); - break; - case DCT_ADST: - load_buffer_16x16(input, stride, 0, 0, in); - fdct16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fadst16_avx2(in); - break; - case ADST_ADST: - load_buffer_16x16(input, stride, 0, 0, in); - fadst16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fadst16_avx2(in); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_16x16(input, stride, 1, 0, in); - fadst16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fdct16_avx2(in); - break; - case DCT_FLIPADST: - load_buffer_16x16(input, stride, 0, 1, in); - fdct16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fadst16_avx2(in); - break; - case FLIPADST_FLIPADST: - load_buffer_16x16(input, stride, 1, 1, in); - fadst16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fadst16_avx2(in); - break; - case ADST_FLIPADST: - load_buffer_16x16(input, stride, 0, 1, in); - fadst16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fadst16_avx2(in); - break; - case FLIPADST_ADST: - load_buffer_16x16(input, stride, 1, 0, in); - fadst16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fadst16_avx2(in); - break; - case IDTX: - 
load_buffer_16x16(input, stride, 0, 0, in); - fidtx16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fidtx16_avx2(in); - break; - case V_DCT: - load_buffer_16x16(input, stride, 0, 0, in); - fdct16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fidtx16_avx2(in); - break; - case H_DCT: - load_buffer_16x16(input, stride, 0, 0, in); - fidtx16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fdct16_avx2(in); - break; - case V_ADST: - load_buffer_16x16(input, stride, 0, 0, in); - fadst16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fidtx16_avx2(in); - break; - case H_ADST: - load_buffer_16x16(input, stride, 0, 0, in); - fidtx16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fadst16_avx2(in); - break; - case V_FLIPADST: - load_buffer_16x16(input, stride, 1, 0, in); - fadst16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fidtx16_avx2(in); - break; - case H_FLIPADST: - load_buffer_16x16(input, stride, 0, 1, in); - fidtx16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fadst16_avx2(in); - break; -#endif // CONFIG_EXT_TX - default: assert(0); break; - } - mm256_transpose_16x16(in, in); - write_buffer_16x16(in, output); - _mm256_zeroupper(); -} - -static void mm256_vectors_swap(__m256i *a0, __m256i *a1, const int size) { - int i = 0; - __m256i temp; - while (i < size) { - temp = a0[i]; - a0[i] = a1[i]; - a1[i] = temp; - i++; - } -} - -static void mm256_transpose_32x32(__m256i *in0, __m256i *in1) { - mm256_transpose_16x16(in0, in0); - mm256_transpose_16x16(&in0[16], &in0[16]); - mm256_transpose_16x16(in1, in1); - mm256_transpose_16x16(&in1[16], &in1[16]); - mm256_vectors_swap(&in0[16], in1, 16); -} - -static void prepare_16x16_even(const __m256i *in, __m256i *even) { - even[0] = _mm256_add_epi16(in[0], in[31]); - even[1] = _mm256_add_epi16(in[1], in[30]); - even[2] = _mm256_add_epi16(in[2], in[29]); - even[3] = _mm256_add_epi16(in[3], in[28]); - even[4] = _mm256_add_epi16(in[4], in[27]); - even[5] = _mm256_add_epi16(in[5], in[26]); - even[6] = _mm256_add_epi16(in[6], in[25]); - even[7] = _mm256_add_epi16(in[7], in[24]); - even[8] = _mm256_add_epi16(in[8], in[23]); - even[9] = _mm256_add_epi16(in[9], in[22]); - even[10] = _mm256_add_epi16(in[10], in[21]); - even[11] = _mm256_add_epi16(in[11], in[20]); - even[12] = _mm256_add_epi16(in[12], in[19]); - even[13] = _mm256_add_epi16(in[13], in[18]); - even[14] = _mm256_add_epi16(in[14], in[17]); - even[15] = _mm256_add_epi16(in[15], in[16]); -} - -static void prepare_16x16_odd(const __m256i *in, __m256i *odd) { - odd[0] = _mm256_sub_epi16(in[15], in[16]); - odd[1] = _mm256_sub_epi16(in[14], in[17]); - odd[2] = _mm256_sub_epi16(in[13], in[18]); - odd[3] = _mm256_sub_epi16(in[12], in[19]); - odd[4] = _mm256_sub_epi16(in[11], in[20]); - odd[5] = _mm256_sub_epi16(in[10], in[21]); - odd[6] = _mm256_sub_epi16(in[9], in[22]); - odd[7] = _mm256_sub_epi16(in[8], in[23]); - odd[8] = _mm256_sub_epi16(in[7], in[24]); - odd[9] = _mm256_sub_epi16(in[6], in[25]); - odd[10] = _mm256_sub_epi16(in[5], in[26]); - odd[11] = _mm256_sub_epi16(in[4], in[27]); - odd[12] = _mm256_sub_epi16(in[3], in[28]); - odd[13] = _mm256_sub_epi16(in[2], in[29]); - odd[14] = _mm256_sub_epi16(in[1], in[30]); - odd[15] = _mm256_sub_epi16(in[0], in[31]); -} - -static void collect_16col(const __m256i *even, const __m256i *odd, - __m256i *out) { - // fdct16_avx2() already maps the output - out[0] = even[0]; - out[2] = even[1]; - out[4] = 
even[2]; - out[6] = even[3]; - out[8] = even[4]; - out[10] = even[5]; - out[12] = even[6]; - out[14] = even[7]; - out[16] = even[8]; - out[18] = even[9]; - out[20] = even[10]; - out[22] = even[11]; - out[24] = even[12]; - out[26] = even[13]; - out[28] = even[14]; - out[30] = even[15]; - - out[1] = odd[0]; - out[17] = odd[1]; - out[9] = odd[2]; - out[25] = odd[3]; - out[5] = odd[4]; - out[21] = odd[5]; - out[13] = odd[6]; - out[29] = odd[7]; - out[3] = odd[8]; - out[19] = odd[9]; - out[11] = odd[10]; - out[27] = odd[11]; - out[7] = odd[12]; - out[23] = odd[13]; - out[15] = odd[14]; - out[31] = odd[15]; -} - -static void collect_coeffs(const __m256i *first_16col_even, - const __m256i *first_16col_odd, - const __m256i *second_16col_even, - const __m256i *second_16col_odd, __m256i *in0, - __m256i *in1) { - collect_16col(first_16col_even, first_16col_odd, in0); - collect_16col(second_16col_even, second_16col_odd, in1); -} - -static void fdct16_odd_avx2(__m256i *in) { - // sequence: cospi_L_H = pairs(L, H) and L first - const __m256i cospi_p16_p16 = pair256_set_epi16(cospi_16_64, cospi_16_64); - const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64); - const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); - const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64); - const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); - const __m256i cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64); - const __m256i cospi_p28_p04 = pair256_set_epi16(cospi_28_64, cospi_4_64); - const __m256i cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64); - const __m256i cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64); - const __m256i cospi_p12_p20 = pair256_set_epi16(cospi_12_64, cospi_20_64); - const __m256i cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m256i cospi_p31_p01 = pair256_set_epi16(cospi_31_64, cospi_1_64); - const __m256i cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64); - const __m256i cospi_p15_p17 = pair256_set_epi16(cospi_15_64, cospi_17_64); - const __m256i cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64); - const __m256i cospi_p23_p09 = pair256_set_epi16(cospi_23_64, cospi_9_64); - const __m256i cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64); - const __m256i cospi_p07_p25 = pair256_set_epi16(cospi_7_64, cospi_25_64); - const __m256i cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64); - const __m256i cospi_p27_p05 = pair256_set_epi16(cospi_27_64, cospi_5_64); - const __m256i cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64); - const __m256i cospi_p11_p21 = pair256_set_epi16(cospi_11_64, cospi_21_64); - const __m256i cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64); - const __m256i cospi_p19_p13 = pair256_set_epi16(cospi_19_64, cospi_13_64); - const __m256i cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64); - const __m256i cospi_p03_p29 = pair256_set_epi16(cospi_3_64, cospi_29_64); - const __m256i cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64); - - __m256i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - __m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15; - __m256i u0, u1; - - // stage 1 is in prepare_16x16_odd() - - // stage 2 - y0 = in[0]; - y1 = in[1]; - y2 = in[2]; - y3 = in[3]; - - u0 = _mm256_unpacklo_epi16(in[4], in[11]); - u1 = _mm256_unpackhi_epi16(in[4], in[11]); - y4 = butter_fly(&u0, &u1, &cospi_m16_p16); - y11 = 
butter_fly(&u0, &u1, &cospi_p16_p16); - - u0 = _mm256_unpacklo_epi16(in[5], in[10]); - u1 = _mm256_unpackhi_epi16(in[5], in[10]); - y5 = butter_fly(&u0, &u1, &cospi_m16_p16); - y10 = butter_fly(&u0, &u1, &cospi_p16_p16); - - u0 = _mm256_unpacklo_epi16(in[6], in[9]); - u1 = _mm256_unpackhi_epi16(in[6], in[9]); - y6 = butter_fly(&u0, &u1, &cospi_m16_p16); - y9 = butter_fly(&u0, &u1, &cospi_p16_p16); - - u0 = _mm256_unpacklo_epi16(in[7], in[8]); - u1 = _mm256_unpackhi_epi16(in[7], in[8]); - y7 = butter_fly(&u0, &u1, &cospi_m16_p16); - y8 = butter_fly(&u0, &u1, &cospi_p16_p16); - - y12 = in[12]; - y13 = in[13]; - y14 = in[14]; - y15 = in[15]; - - // stage 3 - x0 = _mm256_add_epi16(y0, y7); - x1 = _mm256_add_epi16(y1, y6); - x2 = _mm256_add_epi16(y2, y5); - x3 = _mm256_add_epi16(y3, y4); - x4 = _mm256_sub_epi16(y3, y4); - x5 = _mm256_sub_epi16(y2, y5); - x6 = _mm256_sub_epi16(y1, y6); - x7 = _mm256_sub_epi16(y0, y7); - x8 = _mm256_sub_epi16(y15, y8); - x9 = _mm256_sub_epi16(y14, y9); - x10 = _mm256_sub_epi16(y13, y10); - x11 = _mm256_sub_epi16(y12, y11); - x12 = _mm256_add_epi16(y12, y11); - x13 = _mm256_add_epi16(y13, y10); - x14 = _mm256_add_epi16(y14, y9); - x15 = _mm256_add_epi16(y15, y8); - - // stage 4 - y0 = x0; - y1 = x1; - y6 = x6; - y7 = x7; - y8 = x8; - y9 = x9; - y14 = x14; - y15 = x15; - - u0 = _mm256_unpacklo_epi16(x2, x13); - u1 = _mm256_unpackhi_epi16(x2, x13); - y2 = butter_fly(&u0, &u1, &cospi_m08_p24); - y13 = butter_fly(&u0, &u1, &cospi_p24_p08); - - u0 = _mm256_unpacklo_epi16(x3, x12); - u1 = _mm256_unpackhi_epi16(x3, x12); - y3 = butter_fly(&u0, &u1, &cospi_m08_p24); - y12 = butter_fly(&u0, &u1, &cospi_p24_p08); - - u0 = _mm256_unpacklo_epi16(x4, x11); - u1 = _mm256_unpackhi_epi16(x4, x11); - y4 = butter_fly(&u0, &u1, &cospi_m24_m08); - y11 = butter_fly(&u0, &u1, &cospi_m08_p24); - - u0 = _mm256_unpacklo_epi16(x5, x10); - u1 = _mm256_unpackhi_epi16(x5, x10); - y5 = butter_fly(&u0, &u1, &cospi_m24_m08); - y10 = butter_fly(&u0, &u1, &cospi_m08_p24); - - // stage 5 - x0 = _mm256_add_epi16(y0, y3); - x1 = _mm256_add_epi16(y1, y2); - x2 = _mm256_sub_epi16(y1, y2); - x3 = _mm256_sub_epi16(y0, y3); - x4 = _mm256_sub_epi16(y7, y4); - x5 = _mm256_sub_epi16(y6, y5); - x6 = _mm256_add_epi16(y6, y5); - x7 = _mm256_add_epi16(y7, y4); - - x8 = _mm256_add_epi16(y8, y11); - x9 = _mm256_add_epi16(y9, y10); - x10 = _mm256_sub_epi16(y9, y10); - x11 = _mm256_sub_epi16(y8, y11); - x12 = _mm256_sub_epi16(y15, y12); - x13 = _mm256_sub_epi16(y14, y13); - x14 = _mm256_add_epi16(y14, y13); - x15 = _mm256_add_epi16(y15, y12); - - // stage 6 - y0 = x0; - y3 = x3; - y4 = x4; - y7 = x7; - y8 = x8; - y11 = x11; - y12 = x12; - y15 = x15; - - u0 = _mm256_unpacklo_epi16(x1, x14); - u1 = _mm256_unpackhi_epi16(x1, x14); - y1 = butter_fly(&u0, &u1, &cospi_m04_p28); - y14 = butter_fly(&u0, &u1, &cospi_p28_p04); - - u0 = _mm256_unpacklo_epi16(x2, x13); - u1 = _mm256_unpackhi_epi16(x2, x13); - y2 = butter_fly(&u0, &u1, &cospi_m28_m04); - y13 = butter_fly(&u0, &u1, &cospi_m04_p28); - - u0 = _mm256_unpacklo_epi16(x5, x10); - u1 = _mm256_unpackhi_epi16(x5, x10); - y5 = butter_fly(&u0, &u1, &cospi_m20_p12); - y10 = butter_fly(&u0, &u1, &cospi_p12_p20); - - u0 = _mm256_unpacklo_epi16(x6, x9); - u1 = _mm256_unpackhi_epi16(x6, x9); - y6 = butter_fly(&u0, &u1, &cospi_m12_m20); - y9 = butter_fly(&u0, &u1, &cospi_m20_p12); - - // stage 7 - x0 = _mm256_add_epi16(y0, y1); - x1 = _mm256_sub_epi16(y0, y1); - x2 = _mm256_sub_epi16(y3, y2); - x3 = _mm256_add_epi16(y3, y2); - x4 = _mm256_add_epi16(y4, y5); - x5 = 
_mm256_sub_epi16(y4, y5); - x6 = _mm256_sub_epi16(y7, y6); - x7 = _mm256_add_epi16(y7, y6); - - x8 = _mm256_add_epi16(y8, y9); - x9 = _mm256_sub_epi16(y8, y9); - x10 = _mm256_sub_epi16(y11, y10); - x11 = _mm256_add_epi16(y11, y10); - x12 = _mm256_add_epi16(y12, y13); - x13 = _mm256_sub_epi16(y12, y13); - x14 = _mm256_sub_epi16(y15, y14); - x15 = _mm256_add_epi16(y15, y14); - - // stage 8 - u0 = _mm256_unpacklo_epi16(x0, x15); - u1 = _mm256_unpackhi_epi16(x0, x15); - in[0] = butter_fly(&u0, &u1, &cospi_p31_p01); - in[15] = butter_fly(&u0, &u1, &cospi_m01_p31); - - u0 = _mm256_unpacklo_epi16(x1, x14); - u1 = _mm256_unpackhi_epi16(x1, x14); - in[1] = butter_fly(&u0, &u1, &cospi_p15_p17); - in[14] = butter_fly(&u0, &u1, &cospi_m17_p15); - - u0 = _mm256_unpacklo_epi16(x2, x13); - u1 = _mm256_unpackhi_epi16(x2, x13); - in[2] = butter_fly(&u0, &u1, &cospi_p23_p09); - in[13] = butter_fly(&u0, &u1, &cospi_m09_p23); - - u0 = _mm256_unpacklo_epi16(x3, x12); - u1 = _mm256_unpackhi_epi16(x3, x12); - in[3] = butter_fly(&u0, &u1, &cospi_p07_p25); - in[12] = butter_fly(&u0, &u1, &cospi_m25_p07); - - u0 = _mm256_unpacklo_epi16(x4, x11); - u1 = _mm256_unpackhi_epi16(x4, x11); - in[4] = butter_fly(&u0, &u1, &cospi_p27_p05); - in[11] = butter_fly(&u0, &u1, &cospi_m05_p27); - - u0 = _mm256_unpacklo_epi16(x5, x10); - u1 = _mm256_unpackhi_epi16(x5, x10); - in[5] = butter_fly(&u0, &u1, &cospi_p11_p21); - in[10] = butter_fly(&u0, &u1, &cospi_m21_p11); - - u0 = _mm256_unpacklo_epi16(x6, x9); - u1 = _mm256_unpackhi_epi16(x6, x9); - in[6] = butter_fly(&u0, &u1, &cospi_p19_p13); - in[9] = butter_fly(&u0, &u1, &cospi_m13_p19); - - u0 = _mm256_unpacklo_epi16(x7, x8); - u1 = _mm256_unpackhi_epi16(x7, x8); - in[7] = butter_fly(&u0, &u1, &cospi_p03_p29); - in[8] = butter_fly(&u0, &u1, &cospi_m29_p03); -} - -static void fdct32_avx2(__m256i *in0, __m256i *in1) { - __m256i even0[16], even1[16], odd0[16], odd1[16]; - prepare_16x16_even(in0, even0); - fdct16_avx2(even0); - - prepare_16x16_odd(in0, odd0); - fdct16_odd_avx2(odd0); - - prepare_16x16_even(in1, even1); - fdct16_avx2(even1); - - prepare_16x16_odd(in1, odd1); - fdct16_odd_avx2(odd1); - - collect_coeffs(even0, odd0, even1, odd1, in0, in1); - - mm256_transpose_32x32(in0, in1); -} - -static INLINE void write_buffer_32x32(const __m256i *in0, const __m256i *in1, - tran_low_t *output) { - int i = 0; - const int stride = 32; - tran_low_t *coeff = output; - while (i < 32) { - storeu_output_avx2(&in0[i], coeff); - storeu_output_avx2(&in1[i], coeff + 16); - coeff += stride; - i += 1; - } -} - -#if CONFIG_EXT_TX -static void fhalfright32_16col_avx2(__m256i *in) { - int i = 0; - const __m256i zero = _mm256_setzero_si256(); - const __m256i sqrt2 = _mm256_set1_epi16((int16_t)Sqrt2); - const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); - __m256i x0, x1; - - while (i < 16) { - in[i] = _mm256_slli_epi16(in[i], 2); - x0 = _mm256_unpacklo_epi16(in[i + 16], zero); - x1 = _mm256_unpackhi_epi16(in[i + 16], zero); - x0 = _mm256_madd_epi16(x0, sqrt2); - x1 = _mm256_madd_epi16(x1, sqrt2); - x0 = _mm256_add_epi32(x0, dct_rounding); - x1 = _mm256_add_epi32(x1, dct_rounding); - x0 = _mm256_srai_epi32(x0, DCT_CONST_BITS); - x1 = _mm256_srai_epi32(x1, DCT_CONST_BITS); - in[i + 16] = _mm256_packs_epi32(x0, x1); - i += 1; - } - fdct16_avx2(&in[16]); -} - -static void fhalfright32_avx2(__m256i *in0, __m256i *in1) { - fhalfright32_16col_avx2(in0); - fhalfright32_16col_avx2(in1); - mm256_vectors_swap(in0, &in0[16], 16); - mm256_vectors_swap(in1, &in1[16], 16); - 
-
-#if CONFIG_EXT_TX
-static void fhalfright32_16col_avx2(__m256i *in) {
-  int i = 0;
-  const __m256i zero = _mm256_setzero_si256();
-  const __m256i sqrt2 = _mm256_set1_epi16((int16_t)Sqrt2);
-  const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
-  __m256i x0, x1;
-
-  while (i < 16) {
-    in[i] = _mm256_slli_epi16(in[i], 2);
-    x0 = _mm256_unpacklo_epi16(in[i + 16], zero);
-    x1 = _mm256_unpackhi_epi16(in[i + 16], zero);
-    x0 = _mm256_madd_epi16(x0, sqrt2);
-    x1 = _mm256_madd_epi16(x1, sqrt2);
-    x0 = _mm256_add_epi32(x0, dct_rounding);
-    x1 = _mm256_add_epi32(x1, dct_rounding);
-    x0 = _mm256_srai_epi32(x0, DCT_CONST_BITS);
-    x1 = _mm256_srai_epi32(x1, DCT_CONST_BITS);
-    in[i + 16] = _mm256_packs_epi32(x0, x1);
-    i += 1;
-  }
-  fdct16_avx2(&in[16]);
-}
-
-static void fhalfright32_avx2(__m256i *in0, __m256i *in1) {
-  fhalfright32_16col_avx2(in0);
-  fhalfright32_16col_avx2(in1);
-  mm256_vectors_swap(in0, &in0[16], 16);
-  mm256_vectors_swap(in1, &in1[16], 16);
-  mm256_transpose_32x32(in0, in1);
-}
-#endif  // CONFIG_EXT_TX
-
-static INLINE void load_buffer_32x32(const int16_t *input, int stride,
-                                     int flipud, int fliplr, __m256i *in0,
-                                     __m256i *in1) {
-  // Load 4 16x16 blocks
-  const int16_t *topL = input;
-  const int16_t *topR = input + 16;
-  const int16_t *botL = input + 16 * stride;
-  const int16_t *botR = input + 16 * stride + 16;
-
-  const int16_t *tmp;
-
-  if (flipud) {
-    // Swap left columns
-    tmp = topL;
-    topL = botL;
-    botL = tmp;
-    // Swap right columns
-    tmp = topR;
-    topR = botR;
-    botR = tmp;
-  }
-
-  if (fliplr) {
-    // Swap top rows
-    tmp = topL;
-    topL = topR;
-    topR = tmp;
-    // Swap bottom rows
-    tmp = botL;
-    botL = botR;
-    botR = tmp;
-  }
-
-  // load first 16 columns
-  load_buffer_16x16(topL, stride, flipud, fliplr, in0);
-  load_buffer_16x16(botL, stride, flipud, fliplr, in0 + 16);
-
-  // load second 16 columns
-  load_buffer_16x16(topR, stride, flipud, fliplr, in1);
-  load_buffer_16x16(botR, stride, flipud, fliplr, in1 + 16);
-}
-
-static INLINE void right_shift_32x32_16col(int bit, __m256i *in) {
-  int i = 0;
-  const __m256i rounding = _mm256_set1_epi16((1 << bit) >> 1);
-  __m256i sign;
-  while (i < 32) {
-    sign = _mm256_srai_epi16(in[i], 15);
-    in[i] = _mm256_add_epi16(in[i], rounding);
-    in[i] = _mm256_add_epi16(in[i], sign);
-    in[i] = _mm256_srai_epi16(in[i], bit);
-    i += 1;
-  }
-}
-
-// Positive rounding
-static INLINE void right_shift_32x32(__m256i *in0, __m256i *in1) {
-  const int bit = 4;
-  right_shift_32x32_16col(bit, in0);
-  right_shift_32x32_16col(bit, in1);
-}
-
-#if CONFIG_EXT_TX
-static void fidtx32_avx2(__m256i *in0, __m256i *in1) {
-  int i = 0;
-  while (i < 32) {
-    in0[i] = _mm256_slli_epi16(in0[i], 2);
-    in1[i] = _mm256_slli_epi16(in1[i], 2);
-    i += 1;
-  }
-  mm256_transpose_32x32(in0, in1);
-}
-#endif
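The sign correction in right_shift_32x32_16col() above deserves a note: an arithmetic shift alone rounds toward minus infinity, while adding both the rounding constant and the sign bit makes the 4-bit downshift between the two transform passes round half away from zero, symmetrically for positive and negative residuals. A scalar sketch (hypothetical helper, not the aom API):

    #include <stdint.h>

    // Scalar model of the vector rounding above. x >> 15 on a 16-bit
    // lane is -1 for negative inputs and 0 otherwise, so the net effect
    // is (x + 8) >> 4 for x >= 0 and (x + 7) >> 4 for x < 0: round half
    // away from zero in both directions.
    static int16_t round_shift_sym(int16_t x, int bit) {
      const int rounding = (1 << bit) >> 1;
      const int sign = x < 0 ? -1 : 0;  // matches _mm256_srai_epi16(x, 15)
      return (int16_t)((x + rounding + sign) >> bit);
    }

For bit = 4, the shift used here, an input of -8 maps to -1 rather than 0, mirroring the positive side where +8 maps to +1.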
-
-void av1_fht32x32_avx2(const int16_t *input, tran_low_t *output, int stride,
-                       TxfmParam *txfm_param) {
-  __m256i in0[32];  // left 32 columns
-  __m256i in1[32];  // right 32 columns
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "No avx2 32x32 implementation of MRC_DCT");
-#endif
-
-  switch (tx_type) {
-    case DCT_DCT:
-      load_buffer_32x32(input, stride, 0, 0, in0, in1);
-      fdct32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fdct32_avx2(in0, in1);
-      break;
-#if CONFIG_EXT_TX
-    case ADST_DCT:
-      load_buffer_32x32(input, stride, 0, 0, in0, in1);
-      fhalfright32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fdct32_avx2(in0, in1);
-      break;
-    case DCT_ADST:
-      load_buffer_32x32(input, stride, 0, 0, in0, in1);
-      fdct32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fhalfright32_avx2(in0, in1);
-      break;
-    case ADST_ADST:
-      load_buffer_32x32(input, stride, 0, 0, in0, in1);
-      fhalfright32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fhalfright32_avx2(in0, in1);
-      break;
-    case FLIPADST_DCT:
-      load_buffer_32x32(input, stride, 1, 0, in0, in1);
-      fhalfright32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fdct32_avx2(in0, in1);
-      break;
-    case DCT_FLIPADST:
-      load_buffer_32x32(input, stride, 0, 1, in0, in1);
-      fdct32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fhalfright32_avx2(in0, in1);
-      break;
-    case FLIPADST_FLIPADST:
-      load_buffer_32x32(input, stride, 1, 1, in0, in1);
-      fhalfright32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fhalfright32_avx2(in0, in1);
-      break;
-    case ADST_FLIPADST:
-      load_buffer_32x32(input, stride, 0, 1, in0, in1);
-      fhalfright32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fhalfright32_avx2(in0, in1);
-      break;
-    case FLIPADST_ADST:
-      load_buffer_32x32(input, stride, 1, 0, in0, in1);
-      fhalfright32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fhalfright32_avx2(in0, in1);
-      break;
-    case IDTX:
-      load_buffer_32x32(input, stride, 0, 0, in0, in1);
-      fidtx32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fidtx32_avx2(in0, in1);
-      break;
-    case V_DCT:
-      load_buffer_32x32(input, stride, 0, 0, in0, in1);
-      fdct32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fidtx32_avx2(in0, in1);
-      break;
-    case H_DCT:
-      load_buffer_32x32(input, stride, 0, 0, in0, in1);
-      fidtx32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fdct32_avx2(in0, in1);
-      break;
-    case V_ADST:
-      load_buffer_32x32(input, stride, 0, 0, in0, in1);
-      fhalfright32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fidtx32_avx2(in0, in1);
-      break;
-    case H_ADST:
-      load_buffer_32x32(input, stride, 0, 0, in0, in1);
-      fidtx32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fhalfright32_avx2(in0, in1);
-      break;
-    case V_FLIPADST:
-      load_buffer_32x32(input, stride, 1, 0, in0, in1);
-      fhalfright32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fidtx32_avx2(in0, in1);
-      break;
-    case H_FLIPADST:
-      load_buffer_32x32(input, stride, 0, 1, in0, in1);
-      fidtx32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fhalfright32_avx2(in0, in1);
-      break;
-#endif  // CONFIG_EXT_TX
-    default: assert(0); break;
-  }
-  write_buffer_32x32(in0, in1, output);
-  _mm256_zeroupper();
-}
diff --git a/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm b/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm
index 7186b6b92..30983d1c1 100644
--- a/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm
+++ b/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm
@@ -14,6 +14,8 @@
 
 %include "aom_ports/x86_abi_support.asm"
 
+SECTION .text
+
 ; void av1_temporal_filter_apply_sse2 | arg
 ;  (unsigned char  *frame1,           |  0
 ;   unsigned int    stride,           |  1
diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c
index bf233ca4d..4d2e99f25 100644
--- a/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c
+++ b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c
@@ -31,7 +31,7 @@ uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d,
   uint64_t csse;
 
   const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
-  const __m128i v_zext_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+  const __m128i v_zext_q = xx_set1_64_from_32i(0xffffffff);
 
   __m128i v_acc0_q = _mm_setzero_si128();
 
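For reference, every case in the deleted av1_fht32x32_avx2() dispatch above follows one shape: load (with optional vertical/horizontal flips for the FLIPADST variants), a first 1-D transform over columns that finishes with a 32x32 transpose, the rounded 4-bit downshift, then the second 1-D transform over what are now rows. A hypothetical refactoring sketch, assuming the statics and types from the hunk above (this helper is not in the source):

    #include <immintrin.h>

    typedef void (*txfm_2d_fn)(__m256i *in0, __m256i *in1);

    // col/row are fdct32_avx2, fhalfright32_avx2 or fidtx32_avx2; the
    // flip flags select the FLIPADST variants at load time.
    static void fht32x32_two_pass(const int16_t *input, tran_low_t *output,
                                  int stride, int flipud, int fliplr,
                                  txfm_2d_fn col, txfm_2d_fn row) {
      __m256i in0[32], in1[32];  // left / right 32 columns
      load_buffer_32x32(input, stride, flipud, fliplr, in0, in1);
      col(in0, in1);                // first pass; ends in a 32x32 transpose
      right_shift_32x32(in0, in1);  // rounded >> 4 between the two passes
      row(in0, in1);                // second pass transposes back
      write_buffer_32x32(in0, in1, output);
      _mm256_zeroupper();
    }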
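The wedge_utils_sse2.c hunk swaps a hand-written constant for aom's xx_set1_64_from_32i() helper (declared in aom_dsp/x86/synonyms.h). Assuming that helper broadcasts its zero-extended 32-bit argument into each 64-bit lane, the two constructions are bit-identical: each lane holds 0x00000000ffffffff, the mask the surrounding code uses to zero-extend the low 32 bits of each 64-bit lane before a 64-bit accumulate. A standalone equivalence check:

    #include <assert.h>
    #include <emmintrin.h>
    #include <stdint.h>

    int main(void) {
      // The old construction, exactly as in the deleted line.
      const __m128i old_mask = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
      // The assumed behavior of xx_set1_64_from_32i(0xffffffff):
      // broadcast the zero-extended value into both 64-bit lanes.
      const __m128i new_mask = _mm_set1_epi64x((int64_t)0xffffffffu);
      uint64_t a[2], b[2];
      _mm_storeu_si128((__m128i *)a, old_mask);
      _mm_storeu_si128((__m128i *)b, new_mask);
      assert(a[0] == b[0] && a[1] == b[1]);  // both lanes 0x00000000ffffffff
      return 0;
    }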