diff options
author | trav90 <travawine@palemoon.org> | 2018-10-19 21:52:15 -0500 |
---|---|---|
committer | trav90 <travawine@palemoon.org> | 2018-10-19 21:52:20 -0500 |
commit | bbcc64772580c8a979288791afa02d30bc476d2e (patch) | |
tree | 437ce94c3fdd7497508e5b55de06c6d011678597 /third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c | |
parent | 14805f6ddbfb173c327768fff9f81f40ce5e81b0 (diff) | |
download | UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar.gz UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar.lz UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar.xz UXP-bbcc64772580c8a979288791afa02d30bc476d2e.zip |
Update aom to v1.0.0
Update aom to commit id d14c5bb4f336ef1842046089849dee4a301fbbf0.
Diffstat (limited to 'third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c')
-rw-r--r-- | third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c | 1957 |
1 files changed, 1957 insertions, 0 deletions
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c new file mode 100644 index 000000000..7415c58df --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c @@ -0,0 +1,1957 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_config.h" + +#include "config/av1_rtcd.h" + +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/common/x86/av1_inv_txfm_avx2.h" +#include "av1/common/x86/av1_inv_txfm_ssse3.h" + +static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(x1[0], x1[3]); + btf_16_adds_subs_avx2(x1[1], x1[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x1[5], x1[6], x1[5], x1[6]); + + btf_16_adds_subs_avx2(x1[8], x1[11]); + btf_16_adds_subs_avx2(x1[9], x1[10]); + btf_16_subs_adds_avx2(x1[15], x1[12]); + btf_16_subs_adds_avx2(x1[14], x1[13]); +} + +static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(x[0], x[7]); + btf_16_adds_subs_avx2(x[1], x[6]); + btf_16_adds_subs_avx2(x[2], x[5]); + btf_16_adds_subs_avx2(x[3], x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); +} + +static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) { + btf_16_adds_subs_out_avx2(output[0], output[15], x1[0], x1[15]); + btf_16_adds_subs_out_avx2(output[1], output[14], x1[1], x1[14]); + btf_16_adds_subs_out_avx2(output[2], output[13], x1[2], x1[13]); + btf_16_adds_subs_out_avx2(output[3], output[12], x1[3], x1[12]); + btf_16_adds_subs_out_avx2(output[4], output[11], x1[4], x1[11]); + btf_16_adds_subs_out_avx2(output[5], output[10], x1[5], x1[10]); + btf_16_adds_subs_out_avx2(output[6], output[9], x1[6], x1[9]); + btf_16_adds_subs_out_avx2(output[7], output[8], x1[7], x1[8]); +} + +static void idct16_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]); + __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]); + __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]); + __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]); + __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]); + __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]); + __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]); + __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]); + __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); + __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); + __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); + __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + + // stage 1 + __m256i x1[16]; + x1[0] = input[0]; + x1[1] = input[8]; + x1[2] = input[4]; + x1[3] = input[12]; + x1[4] = input[2]; + x1[5] = input[10]; + x1[6] = input[6]; + x1[7] = input[14]; + x1[8] = input[1]; + x1[9] = input[9]; + x1[10] = input[5]; + x1[11] = input[13]; + x1[12] = input[3]; + x1[13] = input[11]; + x1[14] = input[7]; + x1[15] = input[15]; + + // stage 2 + btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, x1[8], x1[15], x1[8], x1[15]); + btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, x1[9], x1[14], x1[9], x1[14]); + btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, x1[10], x1[13], x1[10], x1[13]); + btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, x1[11], x1[12], x1[11], x1[12]); + + // stage 3 + btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, x1[4], x1[7], x1[4], x1[7]); + btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, x1[5], x1[6], x1[5], x1[6]); + btf_16_adds_subs_avx2(x1[8], x1[9]); + btf_16_subs_adds_avx2(x1[11], x1[10]); + btf_16_adds_subs_avx2(x1[12], x1[13]); + btf_16_subs_adds_avx2(x1[15], x1[14]); + + // stage 4 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x1[0], x1[1]); + btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, x1[2], x1[3], x1[2], x1[3]); + btf_16_adds_subs_avx2(x1[4], x1[5]); + btf_16_subs_adds_avx2(x1[7], x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x1[9], x1[14], x1[9], x1[14]); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x1[10], x1[13], x1[10], x1[13]); + + idct16_stage5_avx2(x1, cospi, __rounding, cos_bit); + idct16_stage6_avx2(x1, cospi, __rounding, cos_bit); + idct16_stage7_avx2(output, x1); +} + +static void idct16_low8_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + + // stage 1 + __m256i x1[16]; + x1[0] = input[0]; + x1[2] = input[4]; + x1[4] = input[2]; + x1[6] = input[6]; + x1[8] = input[1]; + x1[10] = input[5]; + x1[12] = input[3]; + x1[14] = input[7]; + + // stage 2 + btf_16_w16_0_avx2(cospi[60], cospi[4], x1[8], x1[8], x1[15]); + btf_16_w16_0_avx2(-cospi[36], cospi[28], x1[14], x1[9], x1[14]); + btf_16_w16_0_avx2(cospi[44], cospi[20], x1[10], x1[10], x1[13]); + btf_16_w16_0_avx2(-cospi[52], cospi[12], x1[12], x1[11], x1[12]); + + // stage 3 + btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]); + btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]); + btf_16_adds_subs_avx2(x1[8], x1[9]); + btf_16_subs_adds_avx2(x1[11], x1[10]); + btf_16_adds_subs_avx2(x1[12], x1[13]); + btf_16_subs_adds_avx2(x1[15], x1[14]); + + // stage 4 + btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]); + btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]); + btf_16_adds_subs_avx2(x1[4], x1[5]); + btf_16_subs_adds_avx2(x1[7], x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x1[9], x1[14], x1[9], x1[14]); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x1[10], x1[13], x1[10], x1[13]); + + idct16_stage5_avx2(x1, cospi, __rounding, cos_bit); + idct16_stage6_avx2(x1, cospi, __rounding, cos_bit); + idct16_stage7_avx2(output, x1); +} + +static void idct16_low1_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m256i x1[2]; + x1[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]); + + // stage 5 + // stage 6 + output[0] = x1[0]; + output[1] = x1[1]; + output[2] = x1[1]; + output[3] = x1[0]; + output[4] = x1[0]; + output[5] = x1[1]; + output[6] = x1[1]; + output[7] = x1[0]; + output[8] = x1[0]; + output[9] = x1[1]; + output[10] = x1[1]; + output[11] = x1[0]; + output[12] = x1[0]; + output[13] = x1[1]; + output[14] = x1[1]; + output[15] = x1[0]; +} + +static INLINE void iadst16_stage3_avx2(__m256i *x) { + btf_16_adds_subs_avx2(x[0], x[8]); + btf_16_adds_subs_avx2(x[1], x[9]); + btf_16_adds_subs_avx2(x[2], x[10]); + btf_16_adds_subs_avx2(x[3], x[11]); + btf_16_adds_subs_avx2(x[4], x[12]); + btf_16_adds_subs_avx2(x[5], x[13]); + btf_16_adds_subs_avx2(x[6], x[14]); + btf_16_adds_subs_avx2(x[7], x[15]); +} + +static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); + const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); + const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); + const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); + const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]); + const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]); + btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); + btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]); + btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]); + btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]); +} + +static INLINE void iadst16_stage5_avx2(__m256i *x) { + btf_16_adds_subs_avx2(x[0], x[4]); + btf_16_adds_subs_avx2(x[1], x[5]); + btf_16_adds_subs_avx2(x[2], x[6]); + btf_16_adds_subs_avx2(x[3], x[7]); + btf_16_adds_subs_avx2(x[8], x[12]); + btf_16_adds_subs_avx2(x[9], x[13]); + btf_16_adds_subs_avx2(x[10], x[14]); + btf_16_adds_subs_avx2(x[11], x[15]); +} + +static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]); +} + +static INLINE void iadst16_stage7_avx2(__m256i *x) { + btf_16_adds_subs_avx2(x[0], x[2]); + btf_16_adds_subs_avx2(x[1], x[3]); + btf_16_adds_subs_avx2(x[4], x[6]); + btf_16_adds_subs_avx2(x[5], x[7]); + btf_16_adds_subs_avx2(x[8], x[10]); + btf_16_adds_subs_avx2(x[9], x[11]); + btf_16_adds_subs_avx2(x[12], x[14]); + btf_16_adds_subs_avx2(x[13], x[15]); +} + +static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x1[2], x1[3]); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x1[6], x1[7]); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[10], x1[11], x1[10], x1[11]); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[14], x1[15], x1[14], x1[15]); +} + +static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) { + const __m256i __zero = _mm256_setzero_si256(); + output[0] = x1[0]; + output[1] = _mm256_subs_epi16(__zero, x1[8]); + output[2] = x1[12]; + output[3] = _mm256_subs_epi16(__zero, x1[4]); + output[4] = x1[6]; + output[5] = _mm256_subs_epi16(__zero, x1[14]); + output[6] = x1[10]; + output[7] = _mm256_subs_epi16(__zero, x1[2]); + output[8] = x1[3]; + output[9] = _mm256_subs_epi16(__zero, x1[11]); + output[10] = x1[15]; + output[11] = _mm256_subs_epi16(__zero, x1[7]); + output[12] = x1[5]; + output[13] = _mm256_subs_epi16(__zero, x1[13]); + output[14] = x1[9]; + output[15] = _mm256_subs_epi16(__zero, x1[1]); +} + +static void iadst16_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); + __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); + __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]); + __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]); + __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]); + __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]); + __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]); + __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]); + __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]); + __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]); + __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]); + __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]); + __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]); + __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]); + __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]); + __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]); + + // stage 1 + __m256i x1[16]; + x1[0] = input[15]; + x1[1] = input[0]; + x1[2] = input[13]; + x1[3] = input[2]; + x1[4] = input[11]; + x1[5] = input[4]; + x1[6] = input[9]; + x1[7] = input[6]; + x1[8] = input[7]; + x1[9] = input[8]; + x1[10] = input[5]; + x1[11] = input[10]; + x1[12] = input[3]; + x1[13] = input[12]; + x1[14] = input[1]; + x1[15] = input[14]; + + // stage 2 + btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, x1[0], x1[1], x1[0], x1[1]); + btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, x1[2], x1[3], x1[2], x1[3]); + btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, x1[4], x1[5], x1[4], x1[5]); + btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, x1[6], x1[7], x1[6], x1[7]); + btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, x1[8], x1[9], x1[8], x1[9]); + btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, x1[10], x1[11], x1[10], x1[11]); + btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, x1[12], x1[13], x1[12], x1[13]); + btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, x1[14], x1[15], x1[14], x1[15]); + + iadst16_stage3_avx2(x1); + iadst16_stage4_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage5_avx2(x1); + iadst16_stage6_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage7_avx2(x1); + iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage9_avx2(output, x1); +} + +static void iadst16_low8_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m256i x1[16]; + x1[1] = input[0]; + x1[3] = input[2]; + x1[5] = input[4]; + x1[7] = input[6]; + x1[8] = input[7]; + x1[10] = input[5]; + x1[12] = input[3]; + x1[14] = input[1]; + + // stage 2 + btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]); + btf_16_w16_0_avx2(cospi[54], -cospi[10], x1[3], x1[2], x1[3]); + btf_16_w16_0_avx2(cospi[46], -cospi[18], x1[5], x1[4], x1[5]); + btf_16_w16_0_avx2(cospi[38], -cospi[26], x1[7], x1[6], x1[7]); + btf_16_w16_0_avx2(cospi[34], cospi[30], x1[8], x1[8], x1[9]); + btf_16_w16_0_avx2(cospi[42], cospi[22], x1[10], x1[10], x1[11]); + btf_16_w16_0_avx2(cospi[50], cospi[14], x1[12], x1[12], x1[13]); + btf_16_w16_0_avx2(cospi[58], cospi[06], x1[14], x1[14], x1[15]); + + iadst16_stage3_avx2(x1); + iadst16_stage4_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage5_avx2(x1); + iadst16_stage6_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage7_avx2(x1); + iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage9_avx2(output, x1); +} + +static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); + const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); + const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + + // stage 1 + __m256i x1[16]; + x1[1] = input[0]; + + // stage 2 + btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]); + + // stage 3 + x1[8] = x1[0]; + x1[9] = x1[1]; + + // stage 4 + btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, x1[8], x1[9], x1[8], x1[9]); + + // stage 5 + x1[4] = x1[0]; + x1[5] = x1[1]; + + x1[12] = x1[8]; + x1[13] = x1[9]; + + // stage 6 + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x1[4], x1[5], x1[4], x1[5]); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x1[12], x1[13], x1[12], x1[13]); + + // stage 7 + x1[2] = x1[0]; + x1[3] = x1[1]; + x1[6] = x1[4]; + x1[7] = x1[5]; + x1[10] = x1[8]; + x1[11] = x1[9]; + x1[14] = x1[12]; + x1[15] = x1[13]; + + iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage9_avx2(output, x1); +} + +static INLINE void idct32_high16_stage3_avx2(__m256i *x) { + btf_16_adds_subs_avx2(x[16], x[17]); + btf_16_subs_adds_avx2(x[19], x[18]); + btf_16_adds_subs_avx2(x[20], x[21]); + btf_16_subs_adds_avx2(x[23], x[22]); + btf_16_adds_subs_avx2(x[24], x[25]); + btf_16_subs_adds_avx2(x[27], x[26]); + btf_16_adds_subs_avx2(x[28], x[29]); + btf_16_subs_adds_avx2(x[31], x[30]); +} + +static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); +} + +static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + btf_16_adds_subs_avx2(x[16], x[19]); + btf_16_adds_subs_avx2(x[17], x[18]); + btf_16_subs_adds_avx2(x[23], x[20]); + btf_16_subs_adds_avx2(x[22], x[21]); + btf_16_adds_subs_avx2(x[24], x[27]); + btf_16_adds_subs_avx2(x[25], x[26]); + btf_16_subs_adds_avx2(x[31], x[28]); + btf_16_subs_adds_avx2(x[30], x[29]); +} + +static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_avx2(x[8], x[11]); + btf_16_adds_subs_avx2(x[9], x[10]); + btf_16_subs_adds_avx2(x[15], x[12]); + btf_16_subs_adds_avx2(x[14], x[13]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]); +} + +static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(x[0], x[7]); + btf_16_adds_subs_avx2(x[1], x[6]); + btf_16_adds_subs_avx2(x[2], x[5]); + btf_16_adds_subs_avx2(x[3], x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + btf_16_adds_subs_avx2(x[16], x[23]); + btf_16_adds_subs_avx2(x[17], x[22]); + btf_16_adds_subs_avx2(x[18], x[21]); + btf_16_adds_subs_avx2(x[19], x[20]); + btf_16_subs_adds_avx2(x[31], x[24]); + btf_16_subs_adds_avx2(x[30], x[25]); + btf_16_subs_adds_avx2(x[29], x[26]); + btf_16_subs_adds_avx2(x[28], x[27]); +} + +static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(x[0], x[15]); + btf_16_adds_subs_avx2(x[1], x[14]); + btf_16_adds_subs_avx2(x[2], x[13]); + btf_16_adds_subs_avx2(x[3], x[12]); + btf_16_adds_subs_avx2(x[4], x[11]); + btf_16_adds_subs_avx2(x[5], x[10]); + btf_16_adds_subs_avx2(x[6], x[9]); + btf_16_adds_subs_avx2(x[7], x[8]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]); +} + +static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) { + btf_16_adds_subs_out_avx2(output[0], output[31], x[0], x[31]); + btf_16_adds_subs_out_avx2(output[1], output[30], x[1], x[30]); + btf_16_adds_subs_out_avx2(output[2], output[29], x[2], x[29]); + btf_16_adds_subs_out_avx2(output[3], output[28], x[3], x[28]); + btf_16_adds_subs_out_avx2(output[4], output[27], x[4], x[27]); + btf_16_adds_subs_out_avx2(output[5], output[26], x[5], x[26]); + btf_16_adds_subs_out_avx2(output[6], output[25], x[6], x[25]); + btf_16_adds_subs_out_avx2(output[7], output[24], x[7], x[24]); + btf_16_adds_subs_out_avx2(output[8], output[23], x[8], x[23]); + btf_16_adds_subs_out_avx2(output[9], output[22], x[9], x[22]); + btf_16_adds_subs_out_avx2(output[10], output[21], x[10], x[21]); + btf_16_adds_subs_out_avx2(output[11], output[20], x[11], x[20]); + btf_16_adds_subs_out_avx2(output[12], output[19], x[12], x[19]); + btf_16_adds_subs_out_avx2(output[13], output[18], x[13], x[18]); + btf_16_adds_subs_out_avx2(output[14], output[17], x[14], x[17]); + btf_16_adds_subs_out_avx2(output[15], output[16], x[15], x[16]); +} + +static void idct32_low1_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m256i x[2]; + x[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + // stage 5 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 6 + // stage 7 + // stage 8 + // stage 9 + output[0] = x[0]; + output[31] = x[0]; + output[1] = x[1]; + output[30] = x[1]; + output[2] = x[1]; + output[29] = x[1]; + output[3] = x[0]; + output[28] = x[0]; + output[4] = x[0]; + output[27] = x[0]; + output[5] = x[1]; + output[26] = x[1]; + output[6] = x[1]; + output[25] = x[1]; + output[7] = x[0]; + output[24] = x[0]; + output[8] = x[0]; + output[23] = x[0]; + output[9] = x[1]; + output[22] = x[1]; + output[10] = x[1]; + output[21] = x[1]; + output[11] = x[0]; + output[20] = x[0]; + output[12] = x[0]; + output[19] = x[0]; + output[13] = x[1]; + output[18] = x[1]; + output[14] = x[1]; + output[17] = x[1]; + output[15] = x[0]; + output[16] = x[0]; +} + +static void idct32_low8_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m256i x[32]; + x[0] = input[0]; + x[4] = input[4]; + x[8] = input[2]; + x[12] = input[6]; + x[16] = input[1]; + x[20] = input[5]; + x[24] = input[3]; + x[28] = input[7]; + + // stage 2 + btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); + + // stage 3 + btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); + x[17] = x[16]; + x[18] = x[19]; + x[21] = x[20]; + x[22] = x[23]; + x[25] = x[24]; + x[26] = x[27]; + x[29] = x[28]; + x[30] = x[31]; + + // stage 4 + btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); + x[9] = x[8]; + x[10] = x[11]; + x[13] = x[12]; + x[14] = x[15]; + idct32_high16_stage4_avx2(x, cospi, __rounding, cos_bit); + + // stage 5 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + x[5] = x[4]; + x[6] = x[7]; + idct32_high24_stage5_avx2(x, cospi, __rounding, cos_bit); + // stage 6 + x[3] = x[0]; + x[2] = x[1]; + idct32_high28_stage6_avx2(x, cospi, __rounding, cos_bit); + + idct32_stage7_avx2(x, cospi, __rounding, cos_bit); + idct32_stage8_avx2(x, cospi, __rounding, cos_bit); + idct32_stage9_avx2(output, x); +} + +static void idct32_low16_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m256i x[32]; + x[0] = input[0]; + x[2] = input[8]; + x[4] = input[4]; + x[6] = input[12]; + x[8] = input[2]; + x[10] = input[10]; + x[12] = input[6]; + x[14] = input[14]; + x[16] = input[1]; + x[18] = input[9]; + x[20] = input[5]; + x[22] = input[13]; + x[24] = input[3]; + x[26] = input[11]; + x[28] = input[7]; + x[30] = input[15]; + + // stage 2 + btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]); + btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]); + btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]); + btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]); + btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); + + // stage 3 + btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]); + btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]); + btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); + idct32_high16_stage3_avx2(x); + + // stage 4 + btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); + btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]); + btf_16_adds_subs_avx2(x[8], x[9]); + btf_16_subs_adds_avx2(x[11], x[10]); + btf_16_adds_subs_avx2(x[12], x[13]); + btf_16_subs_adds_avx2(x[15], x[14]); + idct32_high16_stage4_avx2(x, cospi, __rounding, cos_bit); + + // stage 5 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]); + btf_16_adds_subs_avx2(x[4], x[5]); + btf_16_subs_adds_avx2(x[7], x[6]); + idct32_high24_stage5_avx2(x, cospi, __rounding, cos_bit); + + btf_16_adds_subs_avx2(x[0], x[3]); + btf_16_adds_subs_avx2(x[1], x[2]); + idct32_high28_stage6_avx2(x, cospi, __rounding, cos_bit); + + idct32_stage7_avx2(x, cospi, __rounding, cos_bit); + idct32_stage8_avx2(x, cospi, __rounding, cos_bit); + idct32_stage9_avx2(output, x); +} + +static void idct32_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); + __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); + __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]); + __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]); + __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]); + __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]); + __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]); + __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]); + __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]); + __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]); + __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]); + __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]); + __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]); + __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]); + __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]); + __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]); + __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]); + __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]); + __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]); + __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]); + __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]); + __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]); + __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]); + __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]); + __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); + __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); + __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); + __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + + // stage 1 + __m256i x1[32]; + x1[0] = input[0]; + x1[1] = input[16]; + x1[2] = input[8]; + x1[3] = input[24]; + x1[4] = input[4]; + x1[5] = input[20]; + x1[6] = input[12]; + x1[7] = input[28]; + x1[8] = input[2]; + x1[9] = input[18]; + x1[10] = input[10]; + x1[11] = input[26]; + x1[12] = input[6]; + x1[13] = input[22]; + x1[14] = input[14]; + x1[15] = input[30]; + x1[16] = input[1]; + x1[17] = input[17]; + x1[18] = input[9]; + x1[19] = input[25]; + x1[20] = input[5]; + x1[21] = input[21]; + x1[22] = input[13]; + x1[23] = input[29]; + x1[24] = input[3]; + x1[25] = input[19]; + x1[26] = input[11]; + x1[27] = input[27]; + x1[28] = input[7]; + x1[29] = input[23]; + x1[30] = input[15]; + x1[31] = input[31]; + + // stage 2 + btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, x1[16], x1[31], x1[16], x1[31]); + btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, x1[17], x1[30], x1[17], x1[30]); + btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, x1[18], x1[29], x1[18], x1[29]); + btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, x1[19], x1[28], x1[19], x1[28]); + btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, x1[20], x1[27], x1[20], x1[27]); + btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, x1[21], x1[26], x1[21], x1[26]); + btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, x1[22], x1[25], x1[22], x1[25]); + btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, x1[23], x1[24], x1[23], x1[24]); + + // stage 3 + btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, x1[8], x1[15], x1[8], x1[15]); + btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, x1[9], x1[14], x1[9], x1[14]); + btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, x1[10], x1[13], x1[10], x1[13]); + btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, x1[11], x1[12], x1[11], x1[12]); + idct32_high16_stage3_avx2(x1); + + // stage 4 + btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, x1[4], x1[7], x1[4], x1[7]); + btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, x1[5], x1[6], x1[5], x1[6]); + btf_16_adds_subs_avx2(x1[8], x1[9]); + btf_16_subs_adds_avx2(x1[11], x1[10]); + btf_16_adds_subs_avx2(x1[12], x1[13]); + btf_16_subs_adds_avx2(x1[15], x1[14]); + idct32_high16_stage4_avx2(x1, cospi, __rounding, cos_bit); + + // stage 5 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x1[0], x1[1]); + btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, x1[2], x1[3], x1[2], x1[3]); + btf_16_adds_subs_avx2(x1[4], x1[5]); + btf_16_subs_adds_avx2(x1[7], x1[6]); + idct32_high24_stage5_avx2(x1, cospi, __rounding, cos_bit); + + // stage 6 + btf_16_adds_subs_avx2(x1[0], x1[3]); + btf_16_adds_subs_avx2(x1[1], x1[2]); + idct32_high28_stage6_avx2(x1, cospi, __rounding, cos_bit); + + idct32_stage7_avx2(x1, cospi, __rounding, cos_bit); + idct32_stage8_avx2(x1, cospi, __rounding, cos_bit); + idct32_stage9_avx2(output, x1); +} + +static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + const __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]); + const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + const __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); + const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]); + const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + const __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]); + const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); + const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]); + btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]); + btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]); + btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]); + btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]); + btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]); + btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]); + btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]); + btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]); +} + +static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); + btf_16_adds_subs_avx2(x[32], x[35]); + btf_16_adds_subs_avx2(x[33], x[34]); + btf_16_subs_adds_avx2(x[39], x[36]); + btf_16_subs_adds_avx2(x[38], x[37]); + btf_16_adds_subs_avx2(x[40], x[43]); + btf_16_adds_subs_avx2(x[41], x[42]); + btf_16_subs_adds_avx2(x[47], x[44]); + btf_16_subs_adds_avx2(x[46], x[45]); + btf_16_adds_subs_avx2(x[48], x[51]); + btf_16_adds_subs_avx2(x[49], x[50]); + btf_16_subs_adds_avx2(x[55], x[52]); + btf_16_subs_adds_avx2(x[54], x[53]); + btf_16_adds_subs_avx2(x[56], x[59]); + btf_16_adds_subs_avx2(x[57], x[58]); + btf_16_subs_adds_avx2(x[63], x[60]); + btf_16_subs_adds_avx2(x[62], x[61]); +} + +static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]); +} + +static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + btf_16_adds_subs_avx2(x[16], x[19]); + btf_16_adds_subs_avx2(x[17], x[18]); + btf_16_subs_adds_avx2(x[23], x[20]); + btf_16_subs_adds_avx2(x[22], x[21]); + btf_16_adds_subs_avx2(x[24], x[27]); + btf_16_adds_subs_avx2(x[25], x[26]); + btf_16_subs_adds_avx2(x[31], x[28]); + btf_16_subs_adds_avx2(x[30], x[29]); + idct64_stage6_high32_avx2(x, cospi, __rounding, cos_bit); +} + +static INLINE void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]); + btf_16_adds_subs_avx2(x[32], x[39]); + btf_16_adds_subs_avx2(x[33], x[38]); + btf_16_adds_subs_avx2(x[34], x[37]); + btf_16_adds_subs_avx2(x[35], x[36]); + btf_16_subs_adds_avx2(x[47], x[40]); + btf_16_subs_adds_avx2(x[46], x[41]); + btf_16_subs_adds_avx2(x[45], x[42]); + btf_16_subs_adds_avx2(x[44], x[43]); + btf_16_adds_subs_avx2(x[48], x[55]); + btf_16_adds_subs_avx2(x[49], x[54]); + btf_16_adds_subs_avx2(x[50], x[53]); + btf_16_adds_subs_avx2(x[51], x[52]); + btf_16_subs_adds_avx2(x[63], x[56]); + btf_16_subs_adds_avx2(x[62], x[57]); + btf_16_subs_adds_avx2(x[61], x[58]); + btf_16_subs_adds_avx2(x[60], x[59]); +} + +static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + btf_16_adds_subs_avx2(x[16], x[23]); + btf_16_adds_subs_avx2(x[17], x[22]); + btf_16_adds_subs_avx2(x[18], x[21]); + btf_16_adds_subs_avx2(x[19], x[20]); + btf_16_subs_adds_avx2(x[31], x[24]); + btf_16_subs_adds_avx2(x[30], x[25]); + btf_16_subs_adds_avx2(x[29], x[26]); + btf_16_subs_adds_avx2(x[28], x[27]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]); +} + +static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(x[0], x[15]); + btf_16_adds_subs_avx2(x[1], x[14]); + btf_16_adds_subs_avx2(x[2], x[13]); + btf_16_adds_subs_avx2(x[3], x[12]); + btf_16_adds_subs_avx2(x[4], x[11]); + btf_16_adds_subs_avx2(x[5], x[10]); + btf_16_adds_subs_avx2(x[6], x[9]); + btf_16_adds_subs_avx2(x[7], x[8]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]); + btf_16_adds_subs_avx2(x[32], x[47]); + btf_16_adds_subs_avx2(x[33], x[46]); + btf_16_adds_subs_avx2(x[34], x[45]); + btf_16_adds_subs_avx2(x[35], x[44]); + btf_16_adds_subs_avx2(x[36], x[43]); + btf_16_adds_subs_avx2(x[37], x[42]); + btf_16_adds_subs_avx2(x[38], x[41]); + btf_16_adds_subs_avx2(x[39], x[40]); + btf_16_subs_adds_avx2(x[63], x[48]); + btf_16_subs_adds_avx2(x[62], x[49]); + btf_16_subs_adds_avx2(x[61], x[50]); + btf_16_subs_adds_avx2(x[60], x[51]); + btf_16_subs_adds_avx2(x[59], x[52]); + btf_16_subs_adds_avx2(x[58], x[53]); + btf_16_subs_adds_avx2(x[57], x[54]); + btf_16_subs_adds_avx2(x[56], x[55]); +} + +static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi, + const __m256i __rounding, + int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(x[0], x[31]); + btf_16_adds_subs_avx2(x[1], x[30]); + btf_16_adds_subs_avx2(x[2], x[29]); + btf_16_adds_subs_avx2(x[3], x[28]); + btf_16_adds_subs_avx2(x[4], x[27]); + btf_16_adds_subs_avx2(x[5], x[26]); + btf_16_adds_subs_avx2(x[6], x[25]); + btf_16_adds_subs_avx2(x[7], x[24]); + btf_16_adds_subs_avx2(x[8], x[23]); + btf_16_adds_subs_avx2(x[9], x[22]); + btf_16_adds_subs_avx2(x[10], x[21]); + btf_16_adds_subs_avx2(x[11], x[20]); + btf_16_adds_subs_avx2(x[12], x[19]); + btf_16_adds_subs_avx2(x[13], x[18]); + btf_16_adds_subs_avx2(x[14], x[17]); + btf_16_adds_subs_avx2(x[15], x[16]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]); +} + +static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) { + btf_16_adds_subs_out_avx2(output[0], output[63], x[0], x[63]); + btf_16_adds_subs_out_avx2(output[1], output[62], x[1], x[62]); + btf_16_adds_subs_out_avx2(output[2], output[61], x[2], x[61]); + btf_16_adds_subs_out_avx2(output[3], output[60], x[3], x[60]); + btf_16_adds_subs_out_avx2(output[4], output[59], x[4], x[59]); + btf_16_adds_subs_out_avx2(output[5], output[58], x[5], x[58]); + btf_16_adds_subs_out_avx2(output[6], output[57], x[6], x[57]); + btf_16_adds_subs_out_avx2(output[7], output[56], x[7], x[56]); + btf_16_adds_subs_out_avx2(output[8], output[55], x[8], x[55]); + btf_16_adds_subs_out_avx2(output[9], output[54], x[9], x[54]); + btf_16_adds_subs_out_avx2(output[10], output[53], x[10], x[53]); + btf_16_adds_subs_out_avx2(output[11], output[52], x[11], x[52]); + btf_16_adds_subs_out_avx2(output[12], output[51], x[12], x[51]); + btf_16_adds_subs_out_avx2(output[13], output[50], x[13], x[50]); + btf_16_adds_subs_out_avx2(output[14], output[49], x[14], x[49]); + btf_16_adds_subs_out_avx2(output[15], output[48], x[15], x[48]); + btf_16_adds_subs_out_avx2(output[16], output[47], x[16], x[47]); + btf_16_adds_subs_out_avx2(output[17], output[46], x[17], x[46]); + btf_16_adds_subs_out_avx2(output[18], output[45], x[18], x[45]); + btf_16_adds_subs_out_avx2(output[19], output[44], x[19], x[44]); + btf_16_adds_subs_out_avx2(output[20], output[43], x[20], x[43]); + btf_16_adds_subs_out_avx2(output[21], output[42], x[21], x[42]); + btf_16_adds_subs_out_avx2(output[22], output[41], x[22], x[41]); + btf_16_adds_subs_out_avx2(output[23], output[40], x[23], x[40]); + btf_16_adds_subs_out_avx2(output[24], output[39], x[24], x[39]); + btf_16_adds_subs_out_avx2(output[25], output[38], x[25], x[38]); + btf_16_adds_subs_out_avx2(output[26], output[37], x[26], x[37]); + btf_16_adds_subs_out_avx2(output[27], output[36], x[27], x[36]); + btf_16_adds_subs_out_avx2(output[28], output[35], x[28], x[35]); + btf_16_adds_subs_out_avx2(output[29], output[34], x[29], x[34]); + btf_16_adds_subs_out_avx2(output[30], output[33], x[30], x[33]); + btf_16_adds_subs_out_avx2(output[31], output[32], x[31], x[32]); +} + +static void idct64_low1_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m256i x[32]; + x[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + // stage 5 + // stage 6 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 7 + // stage 8 + // stage 9 + // stage 10 + // stage 11 + output[0] = x[0]; + output[63] = x[0]; + output[1] = x[1]; + output[62] = x[1]; + output[2] = x[1]; + output[61] = x[1]; + output[3] = x[0]; + output[60] = x[0]; + output[4] = x[0]; + output[59] = x[0]; + output[5] = x[1]; + output[58] = x[1]; + output[6] = x[1]; + output[57] = x[1]; + output[7] = x[0]; + output[56] = x[0]; + output[8] = x[0]; + output[55] = x[0]; + output[9] = x[1]; + output[54] = x[1]; + output[10] = x[1]; + output[53] = x[1]; + output[11] = x[0]; + output[52] = x[0]; + output[12] = x[0]; + output[51] = x[0]; + output[13] = x[1]; + output[50] = x[1]; + output[14] = x[1]; + output[49] = x[1]; + output[15] = x[0]; + output[48] = x[0]; + output[16] = x[0]; + output[47] = x[0]; + output[17] = x[1]; + output[46] = x[1]; + output[18] = x[1]; + output[45] = x[1]; + output[19] = x[0]; + output[44] = x[0]; + output[20] = x[0]; + output[43] = x[0]; + output[21] = x[1]; + output[42] = x[1]; + output[22] = x[1]; + output[41] = x[1]; + output[23] = x[0]; + output[40] = x[0]; + output[24] = x[0]; + output[39] = x[0]; + output[25] = x[1]; + output[38] = x[1]; + output[26] = x[1]; + output[37] = x[1]; + output[27] = x[0]; + output[36] = x[0]; + output[28] = x[0]; + output[35] = x[0]; + output[29] = x[1]; + output[34] = x[1]; + output[30] = x[1]; + output[33] = x[1]; + output[31] = x[0]; + output[32] = x[0]; +} + +static void idct64_low8_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]); + const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]); + const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m256i x[64]; + x[0] = input[0]; + x[8] = input[4]; + x[16] = input[2]; + x[24] = input[6]; + x[32] = input[1]; + x[40] = input[5]; + x[48] = input[3]; + x[56] = input[7]; + + // stage 2 + btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); + x[33] = x[32]; + x[38] = x[39]; + x[41] = x[40]; + x[46] = x[47]; + x[49] = x[48]; + x[54] = x[55]; + x[57] = x[56]; + x[62] = x[63]; + + // stage 4 + btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); + x[17] = x[16]; + x[22] = x[23]; + x[25] = x[24]; + x[30] = x[31]; + btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]); + btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]); + btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]); + btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]); + + // stage 5 + x[9] = x[8]; + x[14] = x[15]; + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); + x[35] = x[32]; + x[34] = x[33]; + x[36] = x[39]; + x[37] = x[38]; + x[43] = x[40]; + x[42] = x[41]; + x[44] = x[47]; + x[45] = x[46]; + x[51] = x[48]; + x[50] = x[49]; + x[52] = x[55]; + x[53] = x[54]; + x[59] = x[56]; + x[58] = x[57]; + x[60] = x[63]; + x[61] = x[62]; + + // stage 6 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + x[19] = x[16]; + x[18] = x[17]; + x[20] = x[23]; + x[21] = x[22]; + x[27] = x[24]; + x[26] = x[25]; + x[28] = x[31]; + x[29] = x[30]; + idct64_stage6_high32_avx2(x, cospi, __rounding, cos_bit); + + // stage 7 + x[3] = x[0]; + x[2] = x[1]; + x[11] = x[8]; + x[10] = x[9]; + x[12] = x[15]; + x[13] = x[14]; + idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit); + + // stage 8 + x[7] = x[0]; + x[6] = x[1]; + x[5] = x[2]; + x[4] = x[3]; + x[9] = x[9]; + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit); + + idct64_stage9_avx2(x, cospi, __rounding, cos_bit); + idct64_stage10_avx2(x, cospi, __rounding, cos_bit); + idct64_stage11_avx2(output, x); +} + +static void idct64_low16_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m256i x[64]; + x[0] = input[0]; + x[4] = input[8]; + x[8] = input[4]; + x[12] = input[12]; + x[16] = input[2]; + x[20] = input[10]; + x[24] = input[6]; + x[28] = input[14]; + x[32] = input[1]; + x[36] = input[9]; + x[40] = input[5]; + x[44] = input[13]; + x[48] = input[3]; + x[52] = input[11]; + x[56] = input[7]; + x[60] = input[15]; + + // stage 2 + btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]); + btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]); + btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]); + btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]); + btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); + x[33] = x[32]; + x[34] = x[35]; + x[37] = x[36]; + x[38] = x[39]; + x[41] = x[40]; + x[42] = x[43]; + x[45] = x[44]; + x[46] = x[47]; + x[49] = x[48]; + x[50] = x[51]; + x[53] = x[52]; + x[54] = x[55]; + x[57] = x[56]; + x[58] = x[59]; + x[61] = x[60]; + x[62] = x[63]; + + // stage 4 + btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); + x[17] = x[16]; + x[18] = x[19]; + x[21] = x[20]; + x[22] = x[23]; + x[25] = x[24]; + x[26] = x[27]; + x[29] = x[28]; + x[30] = x[31]; + idct64_stage4_high32_avx2(x, cospi, __rounding, cos_bit); + + // stage 5 + btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); + x[9] = x[8]; + x[10] = x[11]; + x[13] = x[12]; + x[14] = x[15]; + idct64_stage5_high48_avx2(x, cospi, __rounding, cos_bit); + + // stage 6 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + x[5] = x[4]; + x[6] = x[7]; + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + idct64_stage6_high48_avx2(x, cospi, __rounding, cos_bit); + + // stage 7 + x[3] = x[0]; + x[2] = x[1]; + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_avx2(x[8], x[11]); + btf_16_adds_subs_avx2(x[9], x[10]); + btf_16_subs_adds_avx2(x[15], x[12]); + btf_16_subs_adds_avx2(x[14], x[13]); + idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit); + + // stage 8 + btf_16_adds_subs_avx2(x[0], x[7]); + btf_16_adds_subs_avx2(x[1], x[6]); + btf_16_adds_subs_avx2(x[2], x[5]); + btf_16_adds_subs_avx2(x[3], x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit); + + idct64_stage9_avx2(x, cospi, __rounding, cos_bit); + idct64_stage10_avx2(x, cospi, __rounding, cos_bit); + idct64_stage11_avx2(output, x); +} + +static void idct64_low32_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m256i x[64]; + x[0] = input[0]; + x[2] = input[16]; + x[4] = input[8]; + x[6] = input[24]; + x[8] = input[4]; + x[10] = input[20]; + x[12] = input[12]; + x[14] = input[28]; + x[16] = input[2]; + x[18] = input[18]; + x[20] = input[10]; + x[22] = input[26]; + x[24] = input[6]; + x[26] = input[22]; + x[28] = input[14]; + x[30] = input[30]; + x[32] = input[1]; + x[34] = input[17]; + x[36] = input[9]; + x[38] = input[25]; + x[40] = input[5]; + x[42] = input[21]; + x[44] = input[13]; + x[46] = input[29]; + x[48] = input[3]; + x[50] = input[19]; + x[52] = input[11]; + x[54] = input[27]; + x[56] = input[7]; + x[58] = input[23]; + x[60] = input[15]; + x[62] = input[31]; + + // stage 2 + btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_w16_0_avx2(-cospi[33], cospi[31], x[62], x[33], x[62]); + btf_16_w16_0_avx2(cospi[47], cospi[17], x[34], x[34], x[61]); + btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]); + btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]); + btf_16_w16_0_avx2(-cospi[41], cospi[23], x[58], x[37], x[58]); + btf_16_w16_0_avx2(cospi[39], cospi[25], x[38], x[38], x[57]); + btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_w16_0_avx2(-cospi[37], cospi[27], x[54], x[41], x[54]); + btf_16_w16_0_avx2(cospi[43], cospi[21], x[42], x[42], x[53]); + btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]); + btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]); + btf_16_w16_0_avx2(-cospi[45], cospi[19], x[50], x[45], x[50]); + btf_16_w16_0_avx2(cospi[35], cospi[29], x[46], x[46], x[49]); + btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]); + btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]); + btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]); + btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]); + btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); + btf_16_adds_subs_avx2(x[32], x[33]); + btf_16_subs_adds_avx2(x[35], x[34]); + btf_16_adds_subs_avx2(x[36], x[37]); + btf_16_subs_adds_avx2(x[39], x[38]); + btf_16_adds_subs_avx2(x[40], x[41]); + btf_16_subs_adds_avx2(x[43], x[42]); + btf_16_adds_subs_avx2(x[44], x[45]); + btf_16_subs_adds_avx2(x[47], x[46]); + btf_16_adds_subs_avx2(x[48], x[49]); + btf_16_subs_adds_avx2(x[51], x[50]); + btf_16_adds_subs_avx2(x[52], x[53]); + btf_16_subs_adds_avx2(x[55], x[54]); + btf_16_adds_subs_avx2(x[56], x[57]); + btf_16_subs_adds_avx2(x[59], x[58]); + btf_16_adds_subs_avx2(x[60], x[61]); + btf_16_subs_adds_avx2(x[63], x[62]); + + // stage 4 + btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]); + btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]); + btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); + btf_16_adds_subs_avx2(x[16], x[17]); + btf_16_subs_adds_avx2(x[19], x[18]); + btf_16_adds_subs_avx2(x[20], x[21]); + btf_16_subs_adds_avx2(x[23], x[22]); + btf_16_adds_subs_avx2(x[24], x[25]); + btf_16_subs_adds_avx2(x[27], x[26]); + btf_16_adds_subs_avx2(x[28], x[29]); + btf_16_subs_adds_avx2(x[31], x[30]); + idct64_stage4_high32_avx2(x, cospi, __rounding, cos_bit); + + // stage 5 + btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); + btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]); + btf_16_adds_subs_avx2(x[8], x[9]); + btf_16_subs_adds_avx2(x[11], x[10]); + btf_16_adds_subs_avx2(x[12], x[13]); + btf_16_subs_adds_avx2(x[15], x[14]); + idct64_stage5_high48_avx2(x, cospi, __rounding, cos_bit); + + // stage 6 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]); + btf_16_adds_subs_avx2(x[4], x[5]); + btf_16_subs_adds_avx2(x[7], x[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + idct64_stage6_high48_avx2(x, cospi, __rounding, cos_bit); + + // stage 7 + btf_16_adds_subs_avx2(x[0], x[3]); + btf_16_adds_subs_avx2(x[1], x[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_avx2(x[8], x[11]); + btf_16_adds_subs_avx2(x[9], x[10]); + btf_16_subs_adds_avx2(x[15], x[12]); + btf_16_subs_adds_avx2(x[14], x[13]); + idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit); + + // stage 8 + btf_16_adds_subs_avx2(x[0], x[7]); + btf_16_adds_subs_avx2(x[1], x[6]); + btf_16_adds_subs_avx2(x[2], x[5]); + btf_16_adds_subs_avx2(x[3], x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit); + + // stage 9~11 + idct64_stage9_avx2(x, cospi, __rounding, cos_bit); + idct64_stage10_avx2(x, cospi, __rounding, cos_bit); + idct64_stage11_avx2(output, x); +} + +// 1D functions process 16 pixels at one time. +static const transform_1d_avx2 + lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { + { idct16_low1_new_avx2, idct16_low8_new_avx2, idct16_new_avx2, NULL }, + { iadst16_low1_new_avx2, iadst16_low8_new_avx2, iadst16_new_avx2, + NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct32_low1_new_avx2, idct32_low8_new_avx2, idct32_low16_new_avx2, + idct32_new_avx2 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { { idct64_low1_new_avx2, idct64_low8_new_avx2, idct64_low16_new_avx2, + idct64_low32_new_avx2 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; + +// only process w >= 16 h >= 16 +static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + __m256i buf1[64 * 16]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div16 = txfm_size_col >> 4; + const int buf_size_nonzero_w_div16 = (eobx + 16) >> 4; + const int buf_size_nonzero_h_div16 = (eoby + 16) >> 4; + const int input_stride = AOMMIN(32, txfm_size_col); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_avx2 row_txfm = + lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_avx2 col_txfm = + lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_nonzero_h_div16; i++) { + __m256i buf0[64]; + const int32_t *input_row = input + (i << 4) * input_stride; + for (int j = 0; j < buf_size_nonzero_w_div16; ++j) { + __m256i *buf0_cur = buf0 + j * 16; + const int32_t *input_cur = input_row + j * 16; + load_buffer_32bit_to_16bit_w16_avx2(input_cur, input_stride, buf0_cur, + 16); + transpose_16bit_16x16_avx2(buf0_cur, buf0_cur); + } + if (rect_type == 1 || rect_type == -1) { + round_shift_avx2(buf0, buf0, input_stride); // rect special code + } + row_txfm(buf0, buf0, cos_bit_row); + round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]); + + __m256i *buf1_cur = buf1 + (i << 4); + if (lr_flip) { + for (int j = 0; j < buf_size_w_div16; ++j) { + __m256i temp[16]; + flip_buf_av2(buf0 + 16 * j, temp, 16); + int offset = txfm_size_row * (buf_size_w_div16 - 1 - j); + transpose_16bit_16x16_avx2(temp, buf1_cur + offset); + } + } else { + for (int j = 0; j < buf_size_w_div16; ++j) { + transpose_16bit_16x16_avx2(buf0 + 16 * j, buf1_cur + txfm_size_row * j); + } + } + } + for (int i = 0; i < buf_size_w_div16; i++) { + __m256i *buf1_cur = buf1 + i * txfm_size_row; + col_txfm(buf1_cur, buf1_cur, cos_bit_col); + round_shift_16bit_w16_avx2(buf1_cur, txfm_size_row, shift[1]); + } + for (int i = 0; i < buf_size_w_div16; i++) { + lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i, + stride, ud_flip, txfm_size_row); + } +} + +static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input, + int stride, int shift, int height, + int txw_idx, int rect_type) { + const int32_t *input_row = input; + const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]); + const __m256i rounding = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) + + (1 << (NewSqrt2Bits - shift - 1))); + const __m256i one = _mm256_set1_epi16(1); + const __m256i scale_rounding = _mm256_unpacklo_epi16(scale, rounding); + if (rect_type != 1 && rect_type != -1) { + for (int i = 0; i < height; ++i) { + const __m256i src = load_32bit_to_16bit_w16_avx2(input_row); + input_row += stride; + __m256i lo = _mm256_unpacklo_epi16(src, one); + __m256i hi = _mm256_unpackhi_epi16(src, one); + lo = _mm256_madd_epi16(lo, scale_rounding); + hi = _mm256_madd_epi16(hi, scale_rounding); + lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift); + hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift); + out[i] = _mm256_packs_epi32(lo, hi); + } + } else { + const __m256i rect_scale = + _mm256_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits)); + for (int i = 0; i < height; ++i) { + __m256i src = load_32bit_to_16bit_w16_avx2(input_row); + src = _mm256_mulhrs_epi16(src, rect_scale); + input_row += stride; + __m256i lo = _mm256_unpacklo_epi16(src, one); + __m256i hi = _mm256_unpackhi_epi16(src, one); + lo = _mm256_madd_epi16(lo, scale_rounding); + hi = _mm256_madd_epi16(hi, scale_rounding); + lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift); + hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift); + out[i] = _mm256_packs_epi32(lo, hi); + } + } +} + +static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride, + __m256i *buf, int shift, int height, + int txh_idx) { + const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]); + const __m256i scale_rounding = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1)); + const __m256i shift_rounding = _mm256_set1_epi32(1 << (-shift - 1)); + const __m256i one = _mm256_set1_epi16(1); + const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale_rounding); + for (int h = 0; h < height; ++h) { + __m256i lo = _mm256_unpacklo_epi16(buf[h], one); + __m256i hi = _mm256_unpackhi_epi16(buf[h], one); + lo = _mm256_madd_epi16(lo, scale_coeff); + hi = _mm256_madd_epi16(hi, scale_coeff); + lo = _mm256_srai_epi32(lo, NewSqrt2Bits); + hi = _mm256_srai_epi32(hi, NewSqrt2Bits); + lo = _mm256_add_epi32(lo, shift_rounding); + hi = _mm256_add_epi32(hi, shift_rounding); + lo = _mm256_srai_epi32(lo, -shift); + hi = _mm256_srai_epi32(hi, -shift); + const __m256i x = _mm256_packs_epi32(lo, hi); + write_recon_w16_avx2(x, output); + output += stride; + } +} + +static INLINE void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input, + uint8_t *output, int stride, + TX_SIZE tx_size, + int32_t eob) { + (void)eob; + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int input_stride = AOMMIN(32, txfm_size_col); + const int row_max = AOMMIN(32, txfm_size_row); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + __m256i buf[32]; + for (int i = 0; i < input_stride; i += 16) { + iidentity_row_16xn_avx2(buf, input + i, input_stride, shift[0], row_max, + txw_idx, rect_type); + iidentity_col_16xn_avx2(output + i, stride, buf, shift[1], row_max, + txh_idx); + } +} + +static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + int eobx, eoby; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int txfm_size_col_notzero = AOMMIN(32, txfm_size_col); + const int input_stride = txfm_size_col_notzero; + const int buf_size_w_div16 = (eobx + 16) >> 4; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_avx2 col_txfm = + lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_w_div16; i++) { + __m256i buf0[64]; + iidentity_row_16xn_avx2(buf0, input + (i << 4), input_stride, shift[0], + eoby + 1, txw_idx, rect_type); + col_txfm(buf0, buf0, cos_bit_col); + __m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1])); + int k = ud_flip ? (txfm_size_row - 1) : 0; + const int step = ud_flip ? -1 : 1; + for (int j = 0; j < txfm_size_row; ++j, k += step) { + __m256i res = _mm256_mulhrs_epi16(buf0[k], mshift); + write_recon_w16_avx2(res, output + (i << 4) + j * stride); + } + } +} + +static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + __m256i buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div16 = txfm_size_col >> 4; + const int buf_size_h_div16 = (eoby + 16) >> 4; + const int input_stride = AOMMIN(32, txfm_size_col); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const transform_1d_avx2 row_txfm = + lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + + assert(row_txfm != NULL); + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_h_div16; i++) { + __m256i buf0[64]; + const int32_t *input_row = input + i * input_stride * 16; + for (int j = 0; j < AOMMIN(4, buf_size_w_div16); ++j) { + __m256i *buf0_cur = buf0 + j * 16; + load_buffer_32bit_to_16bit_w16_avx2(input_row + j * 16, input_stride, + buf0_cur, 16); + transpose_16bit_16x16_avx2(buf0_cur, buf0_cur); + } + if (rect_type == 1 || rect_type == -1) { + round_shift_avx2(buf0, buf0, input_stride); // rect special code + } + row_txfm(buf0, buf0, cos_bit_row); + round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]); + __m256i *_buf1 = buf1; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div16; ++j) { + __m256i temp[16]; + flip_buf_av2(buf0 + 16 * j, temp, 16); + transpose_16bit_16x16_avx2(temp, + _buf1 + 16 * (buf_size_w_div16 - 1 - j)); + } + } else { + for (int j = 0; j < buf_size_w_div16; ++j) { + transpose_16bit_16x16_avx2(buf0 + 16 * j, _buf1 + 16 * j); + } + } + for (int j = 0; j < buf_size_w_div16; ++j) { + iidentity_col_16xn_avx2(output + i * 16 * stride + j * 16, stride, + buf1 + j * 16, shift[1], 16, txh_idx); + } + } +} + +// for 32x32,32x64,64x32,64x64,16x32,32x16,64x16,16x64 +static INLINE void lowbd_inv_txfm2d_add_universe_avx2( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + (void)eob; + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: // ADST in vertical, DCT in horizontal + case DCT_ADST: // DCT in vertical, ADST in horizontal + case ADST_ADST: // ADST in both directions + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + lowbd_inv_txfm2d_add_no_identity_avx2(input, output, stride, tx_type, + tx_size, eob); + break; + case IDTX: + lowbd_inv_txfm2d_add_idtx_avx2(input, output, stride, tx_size, eob); + break; + case V_DCT: + case V_ADST: + case V_FLIPADST: + lowbd_inv_txfm2d_add_h_identity_avx2(input, output, stride, tx_type, + tx_size, eob); + break; + case H_DCT: + case H_ADST: + case H_FLIPADST: + lowbd_inv_txfm2d_add_v_identity_avx2(input, output, stride, tx_type, + tx_size, eob); + break; + default: + av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + } +} + +void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, TX_SIZE tx_size, + int eob) { + switch (tx_size) { + case TX_4X4: + case TX_8X8: + case TX_4X8: + case TX_8X4: + case TX_8X16: + case TX_16X8: + case TX_4X16: + case TX_16X4: + case TX_8X32: + case TX_32X8: + av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_16X16: + case TX_32X32: + case TX_64X64: + case TX_16X32: + case TX_32X16: + case TX_32X64: + case TX_64X32: + case TX_16X64: + case TX_64X16: + default: + lowbd_inv_txfm2d_add_universe_avx2(input, output, stride, tx_type, + tx_size, eob); + break; + } +} + +void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, + const TxfmParam *txfm_param) { + const TX_TYPE tx_type = txfm_param->tx_type; + if (!txfm_param->lossless) { + av1_lowbd_inv_txfm2d_add_avx2(dqcoeff, dst, stride, tx_type, + txfm_param->tx_size, txfm_param->eob); + } else { + av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); + } +} |