From b8df135c97a854c2ff9b4394b016649c601177fa Mon Sep 17 00:00:00 2001 From: trav90 Date: Fri, 19 Oct 2018 23:00:02 -0500 Subject: Update libaom to rev b25610052a1398032320008d69b51d2da94f5928 --- .../aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c | 11 + .../aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c | 2068 ++++++++++++++++++++ .../aom/av1/encoder/x86/av1_fwd_txfm_avx2.h | 103 + .../aom/av1/encoder/x86/corner_match_sse4.c | 11 + third_party/aom/av1/encoder/x86/wedge_utils_avx2.c | 215 ++ 5 files changed, 2408 insertions(+) create mode 100644 third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h create mode 100644 third_party/aom/av1/encoder/x86/wedge_utils_avx2.c (limited to 'third_party/aom/av1/encoder/x86') diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c index 84065d6de..c71f2e74c 100644 --- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + #include "av1/encoder/x86/av1_txfm1d_sse4.h" void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c new file mode 100644 index 000000000..592462e20 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c @@ -0,0 +1,2068 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/av1_rtcd.h" + +#include "av1/common/enums.h" +#include "av1/common/av1_txfm.h" +#include "av1/encoder/x86/av1_fwd_txfm_avx2.h" +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "av1/encoder/x86/av1_txfm1d_sse4.h" +#include "av1/encoder/x86/av1_fwd_txfm_sse2.h" +#include "aom_dsp/x86/txfm_common_avx2.h" + +static INLINE void fdct16x16_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); + __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); + __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + + // stage 1 + __m256i x1[16]; + btf_16_adds_subs_out_avx2(&x1[0], &x1[15], input[0], input[15]); + btf_16_adds_subs_out_avx2(&x1[1], &x1[14], input[1], input[14]); + btf_16_adds_subs_out_avx2(&x1[2], &x1[13], input[2], input[13]); + btf_16_adds_subs_out_avx2(&x1[3], &x1[12], input[3], input[12]); + btf_16_adds_subs_out_avx2(&x1[4], &x1[11], input[4], input[11]); + btf_16_adds_subs_out_avx2(&x1[5], &x1[10], input[5], input[10]); + btf_16_adds_subs_out_avx2(&x1[6], &x1[9], input[6], input[9]); + btf_16_adds_subs_out_avx2(&x1[7], &x1[8], input[7], input[8]); + + // stage 2 + btf_16_adds_subs_avx2(&x1[0], &x1[7]); + btf_16_adds_subs_avx2(&x1[1], &x1[6]); + btf_16_adds_subs_avx2(&x1[2], &x1[5]); + btf_16_adds_subs_avx2(&x1[3], &x1[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit); + + // stage 3 + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[11]); + btf_16_adds_subs_avx2(&x1[9], &x1[10]); + btf_16_adds_subs_avx2(&x1[15], &x1[12]); + btf_16_adds_subs_avx2(&x1[14], &x1[13]); + + // stage 4 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); + + // stage 5 + btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + + // stage 6 + btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit); + btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit); + + // stage 7 + output[0] = x1[0]; + output[1] = x1[8]; + output[2] = x1[4]; + output[3] = x1[12]; + output[4] = x1[2]; + output[5] = x1[10]; + output[6] = x1[6]; + output[7] = x1[14]; + output[8] = x1[1]; + output[9] = x1[9]; + output[10] = x1[5]; + output[11] = x1[13]; + output[12] = x1[3]; + output[13] = x1[11]; + output[14] = x1[7]; + output[15] = x1[15]; +} + +static INLINE void fdct16x32_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); + __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); + __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]); + __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]); + __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]); + __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]); + __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]); + __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]); + __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]); + __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]); + __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]); + __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]); + __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]); + __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]); + __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]); + __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]); + __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]); + __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]); + + // stage 1 + __m256i x1[32]; + btf_16_adds_subs_out_avx2(&x1[0], &x1[31], input[0], input[31]); + btf_16_adds_subs_out_avx2(&x1[1], &x1[30], input[1], input[30]); + btf_16_adds_subs_out_avx2(&x1[2], &x1[29], input[2], input[29]); + btf_16_adds_subs_out_avx2(&x1[3], &x1[28], input[3], input[28]); + btf_16_adds_subs_out_avx2(&x1[4], &x1[27], input[4], input[27]); + btf_16_adds_subs_out_avx2(&x1[5], &x1[26], input[5], input[26]); + btf_16_adds_subs_out_avx2(&x1[6], &x1[25], input[6], input[25]); + btf_16_adds_subs_out_avx2(&x1[7], &x1[24], input[7], input[24]); + btf_16_adds_subs_out_avx2(&x1[8], &x1[23], input[8], input[23]); + btf_16_adds_subs_out_avx2(&x1[9], &x1[22], input[9], input[22]); + btf_16_adds_subs_out_avx2(&x1[10], &x1[21], input[10], input[21]); + btf_16_adds_subs_out_avx2(&x1[11], &x1[20], input[11], input[20]); + btf_16_adds_subs_out_avx2(&x1[12], &x1[19], input[12], input[19]); + btf_16_adds_subs_out_avx2(&x1[13], &x1[18], input[13], input[18]); + btf_16_adds_subs_out_avx2(&x1[14], &x1[17], input[14], input[17]); + btf_16_adds_subs_out_avx2(&x1[15], &x1[16], input[15], input[16]); + + // stage 2 + btf_16_adds_subs_avx2(&x1[0], &x1[15]); + btf_16_adds_subs_avx2(&x1[1], &x1[14]); + btf_16_adds_subs_avx2(&x1[2], &x1[13]); + btf_16_adds_subs_avx2(&x1[3], &x1[12]); + btf_16_adds_subs_avx2(&x1[4], &x1[11]); + btf_16_adds_subs_avx2(&x1[5], &x1[10]); + btf_16_adds_subs_avx2(&x1[6], &x1[9]); + btf_16_adds_subs_avx2(&x1[7], &x1[8]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit); + + // stage 3 + btf_16_adds_subs_avx2(&x1[0], &x1[7]); + btf_16_adds_subs_avx2(&x1[1], &x1[6]); + btf_16_adds_subs_avx2(&x1[2], &x1[5]); + btf_16_adds_subs_avx2(&x1[3], &x1[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[23]); + btf_16_adds_subs_avx2(&x1[17], &x1[22]); + btf_16_adds_subs_avx2(&x1[18], &x1[21]); + btf_16_adds_subs_avx2(&x1[19], &x1[20]); + btf_16_adds_subs_avx2(&x1[31], &x1[24]); + btf_16_adds_subs_avx2(&x1[30], &x1[25]); + btf_16_adds_subs_avx2(&x1[29], &x1[26]); + btf_16_adds_subs_avx2(&x1[28], &x1[27]); + + // stage 4 + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[11]); + btf_16_adds_subs_avx2(&x1[9], &x1[10]); + btf_16_adds_subs_avx2(&x1[15], &x1[12]); + btf_16_adds_subs_avx2(&x1[14], &x1[13]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit); + + // stage 5 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[19]); + btf_16_adds_subs_avx2(&x1[17], &x1[18]); + btf_16_adds_subs_avx2(&x1[23], &x1[20]); + btf_16_adds_subs_avx2(&x1[22], &x1[21]); + btf_16_adds_subs_avx2(&x1[24], &x1[27]); + btf_16_adds_subs_avx2(&x1[25], &x1[26]); + btf_16_adds_subs_avx2(&x1[31], &x1[28]); + btf_16_adds_subs_avx2(&x1[30], &x1[29]); + + // stage 6 + btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit); + + // stage 7 + btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit); + btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[17]); + btf_16_adds_subs_avx2(&x1[19], &x1[18]); + btf_16_adds_subs_avx2(&x1[20], &x1[21]); + btf_16_adds_subs_avx2(&x1[23], &x1[22]); + btf_16_adds_subs_avx2(&x1[24], &x1[25]); + btf_16_adds_subs_avx2(&x1[27], &x1[26]); + btf_16_adds_subs_avx2(&x1[28], &x1[29]); + btf_16_adds_subs_avx2(&x1[31], &x1[30]); + + // stage 8 + btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit); + btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit); + btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit); + + // stage 9 + output[0] = x1[0]; + output[1] = x1[16]; + output[2] = x1[8]; + output[3] = x1[24]; + output[4] = x1[4]; + output[5] = x1[20]; + output[6] = x1[12]; + output[7] = x1[28]; + output[8] = x1[2]; + output[9] = x1[18]; + output[10] = x1[10]; + output[11] = x1[26]; + output[12] = x1[6]; + output[13] = x1[22]; + output[14] = x1[14]; + output[15] = x1[30]; + output[16] = x1[1]; + output[17] = x1[17]; + output[18] = x1[9]; + output[19] = x1[25]; + output[20] = x1[5]; + output[21] = x1[21]; + output[22] = x1[13]; + output[23] = x1[29]; + output[24] = x1[3]; + output[25] = x1[19]; + output[26] = x1[11]; + output[27] = x1[27]; + output[28] = x1[7]; + output[29] = x1[23]; + output[30] = x1[15]; + output[31] = x1[31]; +} + +static INLINE void fdct16x64_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); + __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); + __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]); + __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]); + __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]); + __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]); + __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]); + __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]); + __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]); + __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]); + __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]); + __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]); + __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]); + __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]); + __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]); + __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]); + __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]); + __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]); + __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]); + __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]); + __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]); + __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]); + __m256i cospi_p63_p01 = pair_set_w16_epi16(cospi[63], cospi[1]); + __m256i cospi_m01_p63 = pair_set_w16_epi16(-cospi[1], cospi[63]); + __m256i cospi_p31_p33 = pair_set_w16_epi16(cospi[31], cospi[33]); + __m256i cospi_m33_p31 = pair_set_w16_epi16(-cospi[33], cospi[31]); + __m256i cospi_p47_p17 = pair_set_w16_epi16(cospi[47], cospi[17]); + __m256i cospi_m17_p47 = pair_set_w16_epi16(-cospi[17], cospi[47]); + __m256i cospi_p15_p49 = pair_set_w16_epi16(cospi[15], cospi[49]); + __m256i cospi_m49_p15 = pair_set_w16_epi16(-cospi[49], cospi[15]); + __m256i cospi_p55_p09 = pair_set_w16_epi16(cospi[55], cospi[9]); + __m256i cospi_m09_p55 = pair_set_w16_epi16(-cospi[9], cospi[55]); + __m256i cospi_p23_p41 = pair_set_w16_epi16(cospi[23], cospi[41]); + __m256i cospi_m41_p23 = pair_set_w16_epi16(-cospi[41], cospi[23]); + __m256i cospi_p39_p25 = pair_set_w16_epi16(cospi[39], cospi[25]); + __m256i cospi_m25_p39 = pair_set_w16_epi16(-cospi[25], cospi[39]); + __m256i cospi_p07_p57 = pair_set_w16_epi16(cospi[7], cospi[57]); + __m256i cospi_m57_p07 = pair_set_w16_epi16(-cospi[57], cospi[7]); + __m256i cospi_p59_p05 = pair_set_w16_epi16(cospi[59], cospi[5]); + __m256i cospi_m05_p59 = pair_set_w16_epi16(-cospi[5], cospi[59]); + __m256i cospi_p27_p37 = pair_set_w16_epi16(cospi[27], cospi[37]); + __m256i cospi_m37_p27 = pair_set_w16_epi16(-cospi[37], cospi[27]); + __m256i cospi_p43_p21 = pair_set_w16_epi16(cospi[43], cospi[21]); + __m256i cospi_m21_p43 = pair_set_w16_epi16(-cospi[21], cospi[43]); + __m256i cospi_p11_p53 = pair_set_w16_epi16(cospi[11], cospi[53]); + __m256i cospi_m53_p11 = pair_set_w16_epi16(-cospi[53], cospi[11]); + __m256i cospi_p51_p13 = pair_set_w16_epi16(cospi[51], cospi[13]); + __m256i cospi_m13_p51 = pair_set_w16_epi16(-cospi[13], cospi[51]); + __m256i cospi_p19_p45 = pair_set_w16_epi16(cospi[19], cospi[45]); + __m256i cospi_m45_p19 = pair_set_w16_epi16(-cospi[45], cospi[19]); + __m256i cospi_p35_p29 = pair_set_w16_epi16(cospi[35], cospi[29]); + __m256i cospi_m29_p35 = pair_set_w16_epi16(-cospi[29], cospi[35]); + __m256i cospi_p03_p61 = pair_set_w16_epi16(cospi[3], cospi[61]); + __m256i cospi_m61_p03 = pair_set_w16_epi16(-cospi[61], cospi[3]); + + // stage 1 + __m256i x1[64]; + btf_16_adds_subs_out_avx2(&x1[0], &x1[63], input[0], input[63]); + btf_16_adds_subs_out_avx2(&x1[1], &x1[62], input[1], input[62]); + btf_16_adds_subs_out_avx2(&x1[2], &x1[61], input[2], input[61]); + btf_16_adds_subs_out_avx2(&x1[3], &x1[60], input[3], input[60]); + btf_16_adds_subs_out_avx2(&x1[4], &x1[59], input[4], input[59]); + btf_16_adds_subs_out_avx2(&x1[5], &x1[58], input[5], input[58]); + btf_16_adds_subs_out_avx2(&x1[6], &x1[57], input[6], input[57]); + btf_16_adds_subs_out_avx2(&x1[7], &x1[56], input[7], input[56]); + btf_16_adds_subs_out_avx2(&x1[8], &x1[55], input[8], input[55]); + btf_16_adds_subs_out_avx2(&x1[9], &x1[54], input[9], input[54]); + btf_16_adds_subs_out_avx2(&x1[10], &x1[53], input[10], input[53]); + btf_16_adds_subs_out_avx2(&x1[11], &x1[52], input[11], input[52]); + btf_16_adds_subs_out_avx2(&x1[12], &x1[51], input[12], input[51]); + btf_16_adds_subs_out_avx2(&x1[13], &x1[50], input[13], input[50]); + btf_16_adds_subs_out_avx2(&x1[14], &x1[49], input[14], input[49]); + btf_16_adds_subs_out_avx2(&x1[15], &x1[48], input[15], input[48]); + btf_16_adds_subs_out_avx2(&x1[16], &x1[47], input[16], input[47]); + btf_16_adds_subs_out_avx2(&x1[17], &x1[46], input[17], input[46]); + btf_16_adds_subs_out_avx2(&x1[18], &x1[45], input[18], input[45]); + btf_16_adds_subs_out_avx2(&x1[19], &x1[44], input[19], input[44]); + btf_16_adds_subs_out_avx2(&x1[20], &x1[43], input[20], input[43]); + btf_16_adds_subs_out_avx2(&x1[21], &x1[42], input[21], input[42]); + btf_16_adds_subs_out_avx2(&x1[22], &x1[41], input[22], input[41]); + btf_16_adds_subs_out_avx2(&x1[23], &x1[40], input[23], input[40]); + btf_16_adds_subs_out_avx2(&x1[24], &x1[39], input[24], input[39]); + btf_16_adds_subs_out_avx2(&x1[25], &x1[38], input[25], input[38]); + btf_16_adds_subs_out_avx2(&x1[26], &x1[37], input[26], input[37]); + btf_16_adds_subs_out_avx2(&x1[27], &x1[36], input[27], input[36]); + btf_16_adds_subs_out_avx2(&x1[28], &x1[35], input[28], input[35]); + btf_16_adds_subs_out_avx2(&x1[29], &x1[34], input[29], input[34]); + btf_16_adds_subs_out_avx2(&x1[30], &x1[33], input[30], input[33]); + btf_16_adds_subs_out_avx2(&x1[31], &x1[32], input[31], input[32]); + + // stage 2 + btf_16_adds_subs_avx2(&x1[0], &x1[31]); + btf_16_adds_subs_avx2(&x1[1], &x1[30]); + btf_16_adds_subs_avx2(&x1[2], &x1[29]); + btf_16_adds_subs_avx2(&x1[3], &x1[28]); + btf_16_adds_subs_avx2(&x1[4], &x1[27]); + btf_16_adds_subs_avx2(&x1[5], &x1[26]); + btf_16_adds_subs_avx2(&x1[6], &x1[25]); + btf_16_adds_subs_avx2(&x1[7], &x1[24]); + btf_16_adds_subs_avx2(&x1[8], &x1[23]); + btf_16_adds_subs_avx2(&x1[9], &x1[22]); + btf_16_adds_subs_avx2(&x1[10], &x1[21]); + btf_16_adds_subs_avx2(&x1[11], &x1[20]); + btf_16_adds_subs_avx2(&x1[12], &x1[19]); + btf_16_adds_subs_avx2(&x1[13], &x1[18]); + btf_16_adds_subs_avx2(&x1[14], &x1[17]); + btf_16_adds_subs_avx2(&x1[15], &x1[16]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[40], &x1[55], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[41], &x1[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[43], &x1[52], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[44], &x1[51], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[45], &x1[50], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[46], &x1[49], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[47], &x1[48], _r, cos_bit); + + // stage 3 + btf_16_adds_subs_avx2(&x1[0], &x1[15]); + btf_16_adds_subs_avx2(&x1[1], &x1[14]); + btf_16_adds_subs_avx2(&x1[2], &x1[13]); + btf_16_adds_subs_avx2(&x1[3], &x1[12]); + btf_16_adds_subs_avx2(&x1[4], &x1[11]); + btf_16_adds_subs_avx2(&x1[5], &x1[10]); + btf_16_adds_subs_avx2(&x1[6], &x1[9]); + btf_16_adds_subs_avx2(&x1[7], &x1[8]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[32], &x1[47]); + btf_16_adds_subs_avx2(&x1[33], &x1[46]); + btf_16_adds_subs_avx2(&x1[34], &x1[45]); + btf_16_adds_subs_avx2(&x1[35], &x1[44]); + btf_16_adds_subs_avx2(&x1[36], &x1[43]); + btf_16_adds_subs_avx2(&x1[37], &x1[42]); + btf_16_adds_subs_avx2(&x1[38], &x1[41]); + btf_16_adds_subs_avx2(&x1[39], &x1[40]); + btf_16_adds_subs_avx2(&x1[63], &x1[48]); + btf_16_adds_subs_avx2(&x1[62], &x1[49]); + btf_16_adds_subs_avx2(&x1[61], &x1[50]); + btf_16_adds_subs_avx2(&x1[60], &x1[51]); + btf_16_adds_subs_avx2(&x1[59], &x1[52]); + btf_16_adds_subs_avx2(&x1[58], &x1[53]); + btf_16_adds_subs_avx2(&x1[57], &x1[54]); + btf_16_adds_subs_avx2(&x1[56], &x1[55]); + + // stage 4 + btf_16_adds_subs_avx2(&x1[0], &x1[7]); + btf_16_adds_subs_avx2(&x1[1], &x1[6]); + btf_16_adds_subs_avx2(&x1[2], &x1[5]); + btf_16_adds_subs_avx2(&x1[3], &x1[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[23]); + btf_16_adds_subs_avx2(&x1[17], &x1[22]); + btf_16_adds_subs_avx2(&x1[18], &x1[21]); + btf_16_adds_subs_avx2(&x1[19], &x1[20]); + btf_16_adds_subs_avx2(&x1[31], &x1[24]); + btf_16_adds_subs_avx2(&x1[30], &x1[25]); + btf_16_adds_subs_avx2(&x1[29], &x1[26]); + btf_16_adds_subs_avx2(&x1[28], &x1[27]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[36], &x1[59], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[37], &x1[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[38], &x1[57], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[39], &x1[56], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[40], &x1[55], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[41], &x1[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[43], &x1[52], _r, cos_bit); + + // stage 5 + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[11]); + btf_16_adds_subs_avx2(&x1[9], &x1[10]); + btf_16_adds_subs_avx2(&x1[15], &x1[12]); + btf_16_adds_subs_avx2(&x1[14], &x1[13]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[32], &x1[39]); + btf_16_adds_subs_avx2(&x1[33], &x1[38]); + btf_16_adds_subs_avx2(&x1[34], &x1[37]); + btf_16_adds_subs_avx2(&x1[35], &x1[36]); + btf_16_adds_subs_avx2(&x1[47], &x1[40]); + btf_16_adds_subs_avx2(&x1[46], &x1[41]); + btf_16_adds_subs_avx2(&x1[45], &x1[42]); + btf_16_adds_subs_avx2(&x1[44], &x1[43]); + btf_16_adds_subs_avx2(&x1[48], &x1[55]); + btf_16_adds_subs_avx2(&x1[49], &x1[54]); + btf_16_adds_subs_avx2(&x1[50], &x1[53]); + btf_16_adds_subs_avx2(&x1[51], &x1[52]); + btf_16_adds_subs_avx2(&x1[63], &x1[56]); + btf_16_adds_subs_avx2(&x1[62], &x1[57]); + btf_16_adds_subs_avx2(&x1[61], &x1[58]); + btf_16_adds_subs_avx2(&x1[60], &x1[59]); + + // stage 6 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[19]); + btf_16_adds_subs_avx2(&x1[17], &x1[18]); + btf_16_adds_subs_avx2(&x1[23], &x1[20]); + btf_16_adds_subs_avx2(&x1[22], &x1[21]); + btf_16_adds_subs_avx2(&x1[24], &x1[27]); + btf_16_adds_subs_avx2(&x1[25], &x1[26]); + btf_16_adds_subs_avx2(&x1[31], &x1[28]); + btf_16_adds_subs_avx2(&x1[30], &x1[29]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[34], &x1[61], _r, cos_bit); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[35], &x1[60], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[36], &x1[59], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[37], &x1[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[43], &x1[52], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[44], &x1[51], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[45], &x1[50], _r, cos_bit); + + // stage 7 + btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[32], &x1[35]); + btf_16_adds_subs_avx2(&x1[33], &x1[34]); + btf_16_adds_subs_avx2(&x1[39], &x1[36]); + btf_16_adds_subs_avx2(&x1[38], &x1[37]); + btf_16_adds_subs_avx2(&x1[40], &x1[43]); + btf_16_adds_subs_avx2(&x1[41], &x1[42]); + btf_16_adds_subs_avx2(&x1[47], &x1[44]); + btf_16_adds_subs_avx2(&x1[46], &x1[45]); + btf_16_adds_subs_avx2(&x1[48], &x1[51]); + btf_16_adds_subs_avx2(&x1[49], &x1[50]); + btf_16_adds_subs_avx2(&x1[55], &x1[52]); + btf_16_adds_subs_avx2(&x1[54], &x1[53]); + btf_16_adds_subs_avx2(&x1[56], &x1[59]); + btf_16_adds_subs_avx2(&x1[57], &x1[58]); + btf_16_adds_subs_avx2(&x1[63], &x1[60]); + btf_16_adds_subs_avx2(&x1[62], &x1[61]); + + // stage 8 + btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit); + btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[17]); + btf_16_adds_subs_avx2(&x1[19], &x1[18]); + btf_16_adds_subs_avx2(&x1[20], &x1[21]); + btf_16_adds_subs_avx2(&x1[23], &x1[22]); + btf_16_adds_subs_avx2(&x1[24], &x1[25]); + btf_16_adds_subs_avx2(&x1[27], &x1[26]); + btf_16_adds_subs_avx2(&x1[28], &x1[29]); + btf_16_adds_subs_avx2(&x1[31], &x1[30]); + btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x1[33], &x1[62], _r, cos_bit); + btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x1[34], &x1[61], _r, cos_bit); + btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x1[37], &x1[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x1[38], &x1[57], _r, cos_bit); + btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x1[41], &x1[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x1[45], &x1[50], _r, cos_bit); + btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x1[46], &x1[49], _r, cos_bit); + + // stage 9 + btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit); + btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit); + btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[32], &x1[33]); + btf_16_adds_subs_avx2(&x1[35], &x1[34]); + btf_16_adds_subs_avx2(&x1[36], &x1[37]); + btf_16_adds_subs_avx2(&x1[39], &x1[38]); + btf_16_adds_subs_avx2(&x1[40], &x1[41]); + btf_16_adds_subs_avx2(&x1[43], &x1[42]); + btf_16_adds_subs_avx2(&x1[44], &x1[45]); + btf_16_adds_subs_avx2(&x1[47], &x1[46]); + btf_16_adds_subs_avx2(&x1[48], &x1[49]); + btf_16_adds_subs_avx2(&x1[51], &x1[50]); + btf_16_adds_subs_avx2(&x1[52], &x1[53]); + btf_16_adds_subs_avx2(&x1[55], &x1[54]); + btf_16_adds_subs_avx2(&x1[56], &x1[57]); + btf_16_adds_subs_avx2(&x1[59], &x1[58]); + btf_16_adds_subs_avx2(&x1[60], &x1[61]); + btf_16_adds_subs_avx2(&x1[63], &x1[62]); + + // stage 10 + btf_16_w16_avx2(cospi_p63_p01, cospi_m01_p63, &x1[32], &x1[63], _r, cos_bit); + btf_16_w16_avx2(cospi_p31_p33, cospi_m33_p31, &x1[33], &x1[62], _r, cos_bit); + btf_16_w16_avx2(cospi_p47_p17, cospi_m17_p47, &x1[34], &x1[61], _r, cos_bit); + btf_16_w16_avx2(cospi_p15_p49, cospi_m49_p15, &x1[35], &x1[60], _r, cos_bit); + btf_16_w16_avx2(cospi_p55_p09, cospi_m09_p55, &x1[36], &x1[59], _r, cos_bit); + btf_16_w16_avx2(cospi_p23_p41, cospi_m41_p23, &x1[37], &x1[58], _r, cos_bit); + btf_16_w16_avx2(cospi_p39_p25, cospi_m25_p39, &x1[38], &x1[57], _r, cos_bit); + btf_16_w16_avx2(cospi_p07_p57, cospi_m57_p07, &x1[39], &x1[56], _r, cos_bit); + btf_16_w16_avx2(cospi_p59_p05, cospi_m05_p59, &x1[40], &x1[55], _r, cos_bit); + btf_16_w16_avx2(cospi_p27_p37, cospi_m37_p27, &x1[41], &x1[54], _r, cos_bit); + btf_16_w16_avx2(cospi_p43_p21, cospi_m21_p43, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_p11_p53, cospi_m53_p11, &x1[43], &x1[52], _r, cos_bit); + btf_16_w16_avx2(cospi_p51_p13, cospi_m13_p51, &x1[44], &x1[51], _r, cos_bit); + btf_16_w16_avx2(cospi_p19_p45, cospi_m45_p19, &x1[45], &x1[50], _r, cos_bit); + btf_16_w16_avx2(cospi_p35_p29, cospi_m29_p35, &x1[46], &x1[49], _r, cos_bit); + btf_16_w16_avx2(cospi_p03_p61, cospi_m61_p03, &x1[47], &x1[48], _r, cos_bit); + + // stage 11 + output[0] = x1[0]; + output[1] = x1[32]; + output[2] = x1[16]; + output[3] = x1[48]; + output[4] = x1[8]; + output[5] = x1[40]; + output[6] = x1[24]; + output[7] = x1[56]; + output[8] = x1[4]; + output[9] = x1[36]; + output[10] = x1[20]; + output[11] = x1[52]; + output[12] = x1[12]; + output[13] = x1[44]; + output[14] = x1[28]; + output[15] = x1[60]; + output[16] = x1[2]; + output[17] = x1[34]; + output[18] = x1[18]; + output[19] = x1[50]; + output[20] = x1[10]; + output[21] = x1[42]; + output[22] = x1[26]; + output[23] = x1[58]; + output[24] = x1[6]; + output[25] = x1[38]; + output[26] = x1[22]; + output[27] = x1[54]; + output[28] = x1[14]; + output[29] = x1[46]; + output[30] = x1[30]; + output[31] = x1[62]; + output[32] = x1[1]; + output[33] = x1[33]; + output[34] = x1[17]; + output[35] = x1[49]; + output[36] = x1[9]; + output[37] = x1[41]; + output[38] = x1[25]; + output[39] = x1[57]; + output[40] = x1[5]; + output[41] = x1[37]; + output[42] = x1[21]; + output[43] = x1[53]; + output[44] = x1[13]; + output[45] = x1[45]; + output[46] = x1[29]; + output[47] = x1[61]; + output[48] = x1[3]; + output[49] = x1[35]; + output[50] = x1[19]; + output[51] = x1[51]; + output[52] = x1[11]; + output[53] = x1[43]; + output[54] = x1[27]; + output[55] = x1[59]; + output[56] = x1[7]; + output[57] = x1[39]; + output[58] = x1[23]; + output[59] = x1[55]; + output[60] = x1[15]; + output[61] = x1[47]; + output[62] = x1[31]; + output[63] = x1[63]; +} + +static INLINE void av1_fdct32_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + __m256i x1[32]; + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + // stage 0 + // stage 1 + btf_32_add_sub_out_avx2(&x1[0], &x1[31], input[0], input[31]); + btf_32_add_sub_out_avx2(&x1[1], &x1[30], input[1], input[30]); + btf_32_add_sub_out_avx2(&x1[2], &x1[29], input[2], input[29]); + btf_32_add_sub_out_avx2(&x1[3], &x1[28], input[3], input[28]); + btf_32_add_sub_out_avx2(&x1[4], &x1[27], input[4], input[27]); + btf_32_add_sub_out_avx2(&x1[5], &x1[26], input[5], input[26]); + btf_32_add_sub_out_avx2(&x1[6], &x1[25], input[6], input[25]); + btf_32_add_sub_out_avx2(&x1[7], &x1[24], input[7], input[24]); + btf_32_add_sub_out_avx2(&x1[8], &x1[23], input[8], input[23]); + btf_32_add_sub_out_avx2(&x1[9], &x1[22], input[9], input[22]); + btf_32_add_sub_out_avx2(&x1[10], &x1[21], input[10], input[21]); + btf_32_add_sub_out_avx2(&x1[11], &x1[20], input[11], input[20]); + btf_32_add_sub_out_avx2(&x1[12], &x1[19], input[12], input[19]); + btf_32_add_sub_out_avx2(&x1[13], &x1[18], input[13], input[18]); + btf_32_add_sub_out_avx2(&x1[14], &x1[17], input[14], input[17]); + btf_32_add_sub_out_avx2(&x1[15], &x1[16], input[15], input[16]); + + // stage 2 + btf_32_add_sub_avx2(&x1[0], &x1[15]); + btf_32_add_sub_avx2(&x1[1], &x1[14]); + btf_32_add_sub_avx2(&x1[2], &x1[13]); + btf_32_add_sub_avx2(&x1[3], &x1[12]); + btf_32_add_sub_avx2(&x1[4], &x1[11]); + btf_32_add_sub_avx2(&x1[5], &x1[10]); + btf_32_add_sub_avx2(&x1[6], &x1[9]); + btf_32_add_sub_avx2(&x1[7], &x1[8]); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[22], &x1[25], _r, cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[23], &x1[24], _r, cos_bit); + + // stage 3 + btf_32_add_sub_avx2(&x1[0], &x1[7]); + btf_32_add_sub_avx2(&x1[1], &x1[6]); + btf_32_add_sub_avx2(&x1[2], &x1[5]); + btf_32_add_sub_avx2(&x1[3], &x1[4]); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[10], &x1[13], _r, cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[11], &x1[12], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[23]); + btf_32_add_sub_avx2(&x1[17], &x1[22]); + btf_32_add_sub_avx2(&x1[18], &x1[21]); + btf_32_add_sub_avx2(&x1[19], &x1[20]); + btf_32_add_sub_avx2(&x1[31], &x1[24]); + btf_32_add_sub_avx2(&x1[30], &x1[25]); + btf_32_add_sub_avx2(&x1[29], &x1[26]); + btf_32_add_sub_avx2(&x1[28], &x1[27]); + + // stage 4 + btf_32_add_sub_avx2(&x1[0], &x1[3]); + btf_32_add_sub_avx2(&x1[1], &x1[2]); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[5], &x1[6], _r, cos_bit); + btf_32_add_sub_avx2(&x1[8], &x1[11]); + btf_32_add_sub_avx2(&x1[9], &x1[10]); + btf_32_add_sub_avx2(&x1[15], &x1[12]); + btf_32_add_sub_avx2(&x1[14], &x1[13]); + btf_32_avx2_type0(-cospi[16], cospi[48], &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type0(-cospi[16], cospi[48], &x1[19], &x1[28], _r, cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[21], &x1[26], _r, cos_bit); + + // stage 5 + btf_32_avx2_type0(cospi[32], cospi[32], &x1[0], &x1[1], _r, cos_bit); + btf_32_avx2_type1(cospi[48], cospi[16], &x1[2], &x1[3], _r, cos_bit); + btf_32_add_sub_avx2(&x1[4], &x1[5]); + btf_32_add_sub_avx2(&x1[7], &x1[6]); + btf_32_avx2_type0(-cospi[16], cospi[48], &x1[9], &x1[14], _r, cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[10], &x1[13], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[19]); + btf_32_add_sub_avx2(&x1[17], &x1[18]); + btf_32_add_sub_avx2(&x1[23], &x1[20]); + btf_32_add_sub_avx2(&x1[22], &x1[21]); + btf_32_add_sub_avx2(&x1[24], &x1[27]); + btf_32_add_sub_avx2(&x1[25], &x1[26]); + btf_32_add_sub_avx2(&x1[31], &x1[28]); + btf_32_add_sub_avx2(&x1[30], &x1[29]); + + // stage 6 + btf_32_avx2_type1(cospi[56], cospi[8], &x1[4], &x1[7], _r, cos_bit); + btf_32_avx2_type1(cospi[24], cospi[40], &x1[5], &x1[6], _r, cos_bit); + btf_32_add_sub_avx2(&x1[8], &x1[9]); + btf_32_add_sub_avx2(&x1[11], &x1[10]); + btf_32_add_sub_avx2(&x1[12], &x1[13]); + btf_32_add_sub_avx2(&x1[15], &x1[14]); + btf_32_avx2_type0(-cospi[8], cospi[56], &x1[17], &x1[30], _r, cos_bit); + btf_32_avx2_type0(-cospi[56], -cospi[8], &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type0(-cospi[40], cospi[24], &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type0(-cospi[24], -cospi[40], &x1[22], &x1[25], _r, cos_bit); + + // stage 7 + btf_32_avx2_type1(cospi[60], cospi[4], &x1[8], &x1[15], _r, cos_bit); + btf_32_avx2_type1(cospi[28], cospi[36], &x1[9], &x1[14], _r, cos_bit); + btf_32_avx2_type1(cospi[44], cospi[20], &x1[10], &x1[13], _r, cos_bit); + btf_32_avx2_type1(cospi[12], cospi[52], &x1[11], &x1[12], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[17]); + btf_32_add_sub_avx2(&x1[19], &x1[18]); + btf_32_add_sub_avx2(&x1[20], &x1[21]); + btf_32_add_sub_avx2(&x1[23], &x1[22]); + btf_32_add_sub_avx2(&x1[24], &x1[25]); + btf_32_add_sub_avx2(&x1[27], &x1[26]); + btf_32_add_sub_avx2(&x1[28], &x1[29]); + btf_32_add_sub_avx2(&x1[31], &x1[30]); + + // stage 8 + btf_32_avx2_type1(cospi[62], cospi[2], &x1[16], &x1[31], _r, cos_bit); + btf_32_avx2_type1(cospi[30], cospi[34], &x1[17], &x1[30], _r, cos_bit); + btf_32_avx2_type1(cospi[46], cospi[18], &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type1(cospi[14], cospi[50], &x1[19], &x1[28], _r, cos_bit); + btf_32_avx2_type1(cospi[54], cospi[10], &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type1(cospi[22], cospi[42], &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type1(cospi[38], cospi[26], &x1[22], &x1[25], _r, cos_bit); + btf_32_avx2_type1(cospi[6], cospi[58], &x1[23], &x1[24], _r, cos_bit); + + // stage 9 + output[0] = x1[0]; + output[1] = x1[16]; + output[2] = x1[8]; + output[3] = x1[24]; + output[4] = x1[4]; + output[5] = x1[20]; + output[6] = x1[12]; + output[7] = x1[28]; + output[8] = x1[2]; + output[9] = x1[18]; + output[10] = x1[10]; + output[11] = x1[26]; + output[12] = x1[6]; + output[13] = x1[22]; + output[14] = x1[14]; + output[15] = x1[30]; + output[16] = x1[1]; + output[17] = x1[17]; + output[18] = x1[9]; + output[19] = x1[25]; + output[20] = x1[5]; + output[21] = x1[21]; + output[22] = x1[13]; + output[23] = x1[29]; + output[24] = x1[3]; + output[25] = x1[19]; + output[26] = x1[11]; + output[27] = x1[27]; + output[28] = x1[7]; + output[29] = x1[23]; + output[30] = x1[15]; + output[31] = x1[31]; +} + +static INLINE void av1_fdct64_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]); + __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]); + __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]); + __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]); + __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]); + __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]); + __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]); + __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]); + __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]); + __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]); + __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]); + __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]); + __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]); + __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]); + __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]); + __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]); + __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]); + __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]); + __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]); + __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]); + __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]); + __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]); + __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]); + __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]); + __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]); + __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]); + __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]); + __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]); + __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]); + __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]); + __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]); + __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]); + __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]); + __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]); + __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]); + __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]); + __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]); + __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]); + __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]); + __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]); + __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]); + __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]); + __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]); + __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]); + __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]); + __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]); + __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]); + __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]); + __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]); + __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]); + __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]); + __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]); + __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]); + __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]); + __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]); + __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]); + __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]); + __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]); + __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]); + __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]); + __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]); + __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]); + __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]); + __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]); + __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]); + __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]); + __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]); + __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]); + __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]); + __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]); + __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]); + __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]); + __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]); + __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]); + __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]); + __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]); + __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]); + __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]); + + // stage 1 + __m256i x1[64]; + btf_32_add_sub_out_avx2(&x1[0], &x1[63], input[0], input[63]); + btf_32_add_sub_out_avx2(&x1[1], &x1[62], input[1], input[62]); + btf_32_add_sub_out_avx2(&x1[2], &x1[61], input[2], input[61]); + btf_32_add_sub_out_avx2(&x1[3], &x1[60], input[3], input[60]); + btf_32_add_sub_out_avx2(&x1[4], &x1[59], input[4], input[59]); + btf_32_add_sub_out_avx2(&x1[5], &x1[58], input[5], input[58]); + btf_32_add_sub_out_avx2(&x1[6], &x1[57], input[6], input[57]); + btf_32_add_sub_out_avx2(&x1[7], &x1[56], input[7], input[56]); + btf_32_add_sub_out_avx2(&x1[8], &x1[55], input[8], input[55]); + btf_32_add_sub_out_avx2(&x1[9], &x1[54], input[9], input[54]); + btf_32_add_sub_out_avx2(&x1[10], &x1[53], input[10], input[53]); + btf_32_add_sub_out_avx2(&x1[11], &x1[52], input[11], input[52]); + btf_32_add_sub_out_avx2(&x1[12], &x1[51], input[12], input[51]); + btf_32_add_sub_out_avx2(&x1[13], &x1[50], input[13], input[50]); + btf_32_add_sub_out_avx2(&x1[14], &x1[49], input[14], input[49]); + btf_32_add_sub_out_avx2(&x1[15], &x1[48], input[15], input[48]); + btf_32_add_sub_out_avx2(&x1[16], &x1[47], input[16], input[47]); + btf_32_add_sub_out_avx2(&x1[17], &x1[46], input[17], input[46]); + btf_32_add_sub_out_avx2(&x1[18], &x1[45], input[18], input[45]); + btf_32_add_sub_out_avx2(&x1[19], &x1[44], input[19], input[44]); + btf_32_add_sub_out_avx2(&x1[20], &x1[43], input[20], input[43]); + btf_32_add_sub_out_avx2(&x1[21], &x1[42], input[21], input[42]); + btf_32_add_sub_out_avx2(&x1[22], &x1[41], input[22], input[41]); + btf_32_add_sub_out_avx2(&x1[23], &x1[40], input[23], input[40]); + btf_32_add_sub_out_avx2(&x1[24], &x1[39], input[24], input[39]); + btf_32_add_sub_out_avx2(&x1[25], &x1[38], input[25], input[38]); + btf_32_add_sub_out_avx2(&x1[26], &x1[37], input[26], input[37]); + btf_32_add_sub_out_avx2(&x1[27], &x1[36], input[27], input[36]); + btf_32_add_sub_out_avx2(&x1[28], &x1[35], input[28], input[35]); + btf_32_add_sub_out_avx2(&x1[29], &x1[34], input[29], input[34]); + btf_32_add_sub_out_avx2(&x1[30], &x1[33], input[30], input[33]); + btf_32_add_sub_out_avx2(&x1[31], &x1[32], input[31], input[32]); + + // stage 2 + btf_32_add_sub_avx2(&x1[0], &x1[31]); + btf_32_add_sub_avx2(&x1[1], &x1[30]); + btf_32_add_sub_avx2(&x1[2], &x1[29]); + btf_32_add_sub_avx2(&x1[3], &x1[28]); + btf_32_add_sub_avx2(&x1[4], &x1[27]); + btf_32_add_sub_avx2(&x1[5], &x1[26]); + btf_32_add_sub_avx2(&x1[6], &x1[25]); + btf_32_add_sub_avx2(&x1[7], &x1[24]); + btf_32_add_sub_avx2(&x1[8], &x1[23]); + btf_32_add_sub_avx2(&x1[9], &x1[22]); + btf_32_add_sub_avx2(&x1[10], &x1[21]); + btf_32_add_sub_avx2(&x1[11], &x1[20]); + btf_32_add_sub_avx2(&x1[12], &x1[19]); + btf_32_add_sub_avx2(&x1[13], &x1[18]); + btf_32_add_sub_avx2(&x1[14], &x1[17]); + btf_32_add_sub_avx2(&x1[15], &x1[16]); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[40], &x1[55], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[41], &x1[54], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[43], &x1[52], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[44], &x1[51], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[45], &x1[50], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[46], &x1[49], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[47], &x1[48], _r, cos_bit); + + // stage 3 + btf_32_add_sub_avx2(&x1[0], &x1[15]); + btf_32_add_sub_avx2(&x1[1], &x1[14]); + btf_32_add_sub_avx2(&x1[2], &x1[13]); + btf_32_add_sub_avx2(&x1[3], &x1[12]); + btf_32_add_sub_avx2(&x1[4], &x1[11]); + btf_32_add_sub_avx2(&x1[5], &x1[10]); + btf_32_add_sub_avx2(&x1[6], &x1[9]); + btf_32_add_sub_avx2(&x1[7], &x1[8]); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[22], &x1[25], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[23], &x1[24], _r, cos_bit); + btf_32_add_sub_avx2(&x1[32], &x1[47]); + btf_32_add_sub_avx2(&x1[33], &x1[46]); + btf_32_add_sub_avx2(&x1[34], &x1[45]); + btf_32_add_sub_avx2(&x1[35], &x1[44]); + btf_32_add_sub_avx2(&x1[36], &x1[43]); + btf_32_add_sub_avx2(&x1[37], &x1[42]); + btf_32_add_sub_avx2(&x1[38], &x1[41]); + btf_32_add_sub_avx2(&x1[39], &x1[40]); + btf_32_add_sub_avx2(&x1[63], &x1[48]); + btf_32_add_sub_avx2(&x1[62], &x1[49]); + btf_32_add_sub_avx2(&x1[61], &x1[50]); + btf_32_add_sub_avx2(&x1[60], &x1[51]); + btf_32_add_sub_avx2(&x1[59], &x1[52]); + btf_32_add_sub_avx2(&x1[58], &x1[53]); + btf_32_add_sub_avx2(&x1[57], &x1[54]); + btf_32_add_sub_avx2(&x1[56], &x1[55]); + + // stage 4 + btf_32_add_sub_avx2(&x1[0], &x1[7]); + btf_32_add_sub_avx2(&x1[1], &x1[6]); + btf_32_add_sub_avx2(&x1[2], &x1[5]); + btf_32_add_sub_avx2(&x1[3], &x1[4]); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[10], &x1[13], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[11], &x1[12], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[23]); + btf_32_add_sub_avx2(&x1[17], &x1[22]); + btf_32_add_sub_avx2(&x1[18], &x1[21]); + btf_32_add_sub_avx2(&x1[19], &x1[20]); + btf_32_add_sub_avx2(&x1[31], &x1[24]); + btf_32_add_sub_avx2(&x1[30], &x1[25]); + btf_32_add_sub_avx2(&x1[29], &x1[26]); + btf_32_add_sub_avx2(&x1[28], &x1[27]); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[36], &x1[59], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[37], &x1[58], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[38], &x1[57], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[39], &x1[56], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[40], &x1[55], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[41], &x1[54], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[43], &x1[52], _r, cos_bit); + + // stage 5 + btf_32_add_sub_avx2(&x1[0], &x1[3]); + btf_32_add_sub_avx2(&x1[1], &x1[2]); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[5], &x1[6], _r, cos_bit); + btf_32_add_sub_avx2(&x1[8], &x1[11]); + btf_32_add_sub_avx2(&x1[9], &x1[10]); + btf_32_add_sub_avx2(&x1[15], &x1[12]); + btf_32_add_sub_avx2(&x1[14], &x1[13]); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[19], &x1[28], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[21], &x1[26], _r, cos_bit); + btf_32_add_sub_avx2(&x1[32], &x1[39]); + btf_32_add_sub_avx2(&x1[33], &x1[38]); + btf_32_add_sub_avx2(&x1[34], &x1[37]); + btf_32_add_sub_avx2(&x1[35], &x1[36]); + btf_32_add_sub_avx2(&x1[47], &x1[40]); + btf_32_add_sub_avx2(&x1[46], &x1[41]); + btf_32_add_sub_avx2(&x1[45], &x1[42]); + btf_32_add_sub_avx2(&x1[44], &x1[43]); + btf_32_add_sub_avx2(&x1[48], &x1[55]); + btf_32_add_sub_avx2(&x1[49], &x1[54]); + btf_32_add_sub_avx2(&x1[50], &x1[53]); + btf_32_add_sub_avx2(&x1[51], &x1[52]); + btf_32_add_sub_avx2(&x1[63], &x1[56]); + btf_32_add_sub_avx2(&x1[62], &x1[57]); + btf_32_add_sub_avx2(&x1[61], &x1[58]); + btf_32_add_sub_avx2(&x1[60], &x1[59]); + + // stage 6 + btf_32_avx2_type0_new(cospi_p32, cospi_p32, &x1[0], &x1[1], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p48, cospi_p16, &x1[2], &x1[3], _r, cos_bit); + btf_32_add_sub_avx2(&x1[4], &x1[5]); + btf_32_add_sub_avx2(&x1[7], &x1[6]); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[9], &x1[14], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[10], &x1[13], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[19]); + btf_32_add_sub_avx2(&x1[17], &x1[18]); + btf_32_add_sub_avx2(&x1[23], &x1[20]); + btf_32_add_sub_avx2(&x1[22], &x1[21]); + btf_32_add_sub_avx2(&x1[24], &x1[27]); + btf_32_add_sub_avx2(&x1[25], &x1[26]); + btf_32_add_sub_avx2(&x1[31], &x1[28]); + btf_32_add_sub_avx2(&x1[30], &x1[29]); + btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[34], &x1[61], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[35], &x1[60], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[36], &x1[59], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[37], &x1[58], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[43], &x1[52], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[44], &x1[51], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[45], &x1[50], _r, cos_bit); + + // stage 7 + btf_32_avx2_type1_new(cospi_p56, cospi_p08, &x1[4], &x1[7], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p24, cospi_p40, &x1[5], &x1[6], _r, cos_bit); + btf_32_add_sub_avx2(&x1[8], &x1[9]); + btf_32_add_sub_avx2(&x1[11], &x1[10]); + btf_32_add_sub_avx2(&x1[12], &x1[13]); + btf_32_add_sub_avx2(&x1[15], &x1[14]); + btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[17], &x1[30], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[22], &x1[25], _r, cos_bit); + btf_32_add_sub_avx2(&x1[32], &x1[35]); + btf_32_add_sub_avx2(&x1[33], &x1[34]); + btf_32_add_sub_avx2(&x1[39], &x1[36]); + btf_32_add_sub_avx2(&x1[38], &x1[37]); + btf_32_add_sub_avx2(&x1[40], &x1[43]); + btf_32_add_sub_avx2(&x1[41], &x1[42]); + btf_32_add_sub_avx2(&x1[47], &x1[44]); + btf_32_add_sub_avx2(&x1[46], &x1[45]); + btf_32_add_sub_avx2(&x1[48], &x1[51]); + btf_32_add_sub_avx2(&x1[49], &x1[50]); + btf_32_add_sub_avx2(&x1[55], &x1[52]); + btf_32_add_sub_avx2(&x1[54], &x1[53]); + btf_32_add_sub_avx2(&x1[56], &x1[59]); + btf_32_add_sub_avx2(&x1[57], &x1[58]); + btf_32_add_sub_avx2(&x1[63], &x1[60]); + btf_32_add_sub_avx2(&x1[62], &x1[61]); + + // stage 8 + btf_32_avx2_type1_new(cospi_p60, cospi_p04, &x1[8], &x1[15], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p28, cospi_p36, &x1[9], &x1[14], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p44, cospi_p20, &x1[10], &x1[13], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p12, cospi_p52, &x1[11], &x1[12], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[17]); + btf_32_add_sub_avx2(&x1[19], &x1[18]); + btf_32_add_sub_avx2(&x1[20], &x1[21]); + btf_32_add_sub_avx2(&x1[23], &x1[22]); + btf_32_add_sub_avx2(&x1[24], &x1[25]); + btf_32_add_sub_avx2(&x1[27], &x1[26]); + btf_32_add_sub_avx2(&x1[28], &x1[29]); + btf_32_add_sub_avx2(&x1[31], &x1[30]); + btf_32_avx2_type0_new(cospi_m04, cospi_p60, &x1[33], &x1[62], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m60, cospi_m04, &x1[34], &x1[61], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m36, cospi_p28, &x1[37], &x1[58], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m28, cospi_m36, &x1[38], &x1[57], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m20, cospi_p44, &x1[41], &x1[54], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m44, cospi_m20, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m52, cospi_p12, &x1[45], &x1[50], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m12, cospi_m52, &x1[46], &x1[49], _r, cos_bit); + + // stage 9 + btf_32_avx2_type1_new(cospi_p62, cospi_p02, &x1[16], &x1[31], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p30, cospi_p34, &x1[17], &x1[30], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p46, cospi_p18, &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p14, cospi_p50, &x1[19], &x1[28], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p54, cospi_p10, &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p22, cospi_p42, &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p38, cospi_p26, &x1[22], &x1[25], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p06, cospi_p58, &x1[23], &x1[24], _r, cos_bit); + btf_32_add_sub_avx2(&x1[32], &x1[33]); + btf_32_add_sub_avx2(&x1[35], &x1[34]); + btf_32_add_sub_avx2(&x1[36], &x1[37]); + btf_32_add_sub_avx2(&x1[39], &x1[38]); + btf_32_add_sub_avx2(&x1[40], &x1[41]); + btf_32_add_sub_avx2(&x1[43], &x1[42]); + btf_32_add_sub_avx2(&x1[44], &x1[45]); + btf_32_add_sub_avx2(&x1[47], &x1[46]); + btf_32_add_sub_avx2(&x1[48], &x1[49]); + btf_32_add_sub_avx2(&x1[51], &x1[50]); + btf_32_add_sub_avx2(&x1[52], &x1[53]); + btf_32_add_sub_avx2(&x1[55], &x1[54]); + btf_32_add_sub_avx2(&x1[56], &x1[57]); + btf_32_add_sub_avx2(&x1[59], &x1[58]); + btf_32_add_sub_avx2(&x1[60], &x1[61]); + btf_32_add_sub_avx2(&x1[63], &x1[62]); + + // stage 10 + btf_32_avx2_type1_new(cospi_p63, cospi_p01, &x1[32], &x1[63], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p31, cospi_p33, &x1[33], &x1[62], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p47, cospi_p17, &x1[34], &x1[61], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p15, cospi_p49, &x1[35], &x1[60], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p55, cospi_p09, &x1[36], &x1[59], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p23, cospi_p41, &x1[37], &x1[58], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p39, cospi_p25, &x1[38], &x1[57], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p07, cospi_p57, &x1[39], &x1[56], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p59, cospi_p05, &x1[40], &x1[55], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p27, cospi_p37, &x1[41], &x1[54], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p43, cospi_p21, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p11, cospi_p53, &x1[43], &x1[52], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p51, cospi_p13, &x1[44], &x1[51], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p19, cospi_p45, &x1[45], &x1[50], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p35, cospi_p29, &x1[46], &x1[49], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p03, cospi_p61, &x1[47], &x1[48], _r, cos_bit); + + // stage 11 + output[0] = x1[0]; + output[1] = x1[32]; + output[2] = x1[16]; + output[3] = x1[48]; + output[4] = x1[8]; + output[5] = x1[40]; + output[6] = x1[24]; + output[7] = x1[56]; + output[8] = x1[4]; + output[9] = x1[36]; + output[10] = x1[20]; + output[11] = x1[52]; + output[12] = x1[12]; + output[13] = x1[44]; + output[14] = x1[28]; + output[15] = x1[60]; + output[16] = x1[2]; + output[17] = x1[34]; + output[18] = x1[18]; + output[19] = x1[50]; + output[20] = x1[10]; + output[21] = x1[42]; + output[22] = x1[26]; + output[23] = x1[58]; + output[24] = x1[6]; + output[25] = x1[38]; + output[26] = x1[22]; + output[27] = x1[54]; + output[28] = x1[14]; + output[29] = x1[46]; + output[30] = x1[30]; + output[31] = x1[62]; + output[32] = x1[1]; + output[33] = x1[33]; + output[34] = x1[17]; + output[35] = x1[49]; + output[36] = x1[9]; + output[37] = x1[41]; + output[38] = x1[25]; + output[39] = x1[57]; + output[40] = x1[5]; + output[41] = x1[37]; + output[42] = x1[21]; + output[43] = x1[53]; + output[44] = x1[13]; + output[45] = x1[45]; + output[46] = x1[29]; + output[47] = x1[61]; + output[48] = x1[3]; + output[49] = x1[35]; + output[50] = x1[19]; + output[51] = x1[51]; + output[52] = x1[11]; + output[53] = x1[43]; + output[54] = x1[27]; + output[55] = x1[59]; + output[56] = x1[7]; + output[57] = x1[39]; + output[58] = x1[23]; + output[59] = x1[55]; + output[60] = x1[15]; + output[61] = x1[47]; + output[62] = x1[31]; + output[63] = x1[63]; +} + +static INLINE void fadst16x16_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __zero = _mm256_setzero_si256(); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]); + __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); + __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); + __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); + __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); + __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]); + __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]); + __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); + __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); + __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]); + __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]); + __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]); + __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]); + __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]); + __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]); + __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]); + __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]); + __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]); + __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]); + __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]); + __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]); + __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]); + __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]); + + // stage 1 + __m256i x1[16]; + x1[0] = input[0]; + x1[1] = _mm256_subs_epi16(__zero, input[15]); + x1[2] = _mm256_subs_epi16(__zero, input[7]); + x1[3] = input[8]; + x1[4] = _mm256_subs_epi16(__zero, input[3]); + x1[5] = input[12]; + x1[6] = input[4]; + x1[7] = _mm256_subs_epi16(__zero, input[11]); + x1[8] = _mm256_subs_epi16(__zero, input[1]); + x1[9] = input[14]; + x1[10] = input[6]; + x1[11] = _mm256_subs_epi16(__zero, input[9]); + x1[12] = input[2]; + x1[13] = _mm256_subs_epi16(__zero, input[13]); + x1[14] = _mm256_subs_epi16(__zero, input[5]); + x1[15] = input[10]; + + // stage 2 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit); + + // stage 3 + btf_16_adds_subs_avx2(&x1[0], &x1[2]); + btf_16_adds_subs_avx2(&x1[1], &x1[3]); + btf_16_adds_subs_avx2(&x1[4], &x1[6]); + btf_16_adds_subs_avx2(&x1[5], &x1[7]); + btf_16_adds_subs_avx2(&x1[8], &x1[10]); + btf_16_adds_subs_avx2(&x1[9], &x1[11]); + btf_16_adds_subs_avx2(&x1[12], &x1[14]); + btf_16_adds_subs_avx2(&x1[13], &x1[15]); + + // stage 4 + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[14], &x1[15], _r, cos_bit); + + // stage 5 + btf_16_adds_subs_avx2(&x1[0], &x1[4]); + btf_16_adds_subs_avx2(&x1[1], &x1[5]); + btf_16_adds_subs_avx2(&x1[2], &x1[6]); + btf_16_adds_subs_avx2(&x1[3], &x1[7]); + btf_16_adds_subs_avx2(&x1[8], &x1[12]); + btf_16_adds_subs_avx2(&x1[9], &x1[13]); + btf_16_adds_subs_avx2(&x1[10], &x1[14]); + btf_16_adds_subs_avx2(&x1[11], &x1[15]); + + // stage 6 + btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit); + btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x1[12], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x1[14], &x1[15], _r, cos_bit); + + // stage 7 + btf_16_adds_subs_avx2(&x1[0], &x1[8]); + btf_16_adds_subs_avx2(&x1[1], &x1[9]); + btf_16_adds_subs_avx2(&x1[2], &x1[10]); + btf_16_adds_subs_avx2(&x1[3], &x1[11]); + btf_16_adds_subs_avx2(&x1[4], &x1[12]); + btf_16_adds_subs_avx2(&x1[5], &x1[13]); + btf_16_adds_subs_avx2(&x1[6], &x1[14]); + btf_16_adds_subs_avx2(&x1[7], &x1[15]); + + // stage 8 + btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit); + btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit); + btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit); + btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit); + + // stage 9 + output[0] = x1[1]; + output[1] = x1[14]; + output[2] = x1[3]; + output[3] = x1[12]; + output[4] = x1[5]; + output[5] = x1[10]; + output[6] = x1[7]; + output[7] = x1[8]; + output[8] = x1[9]; + output[9] = x1[6]; + output[10] = x1[11]; + output[11] = x1[4]; + output[12] = x1[13]; + output[13] = x1[2]; + output[14] = x1[15]; + output[15] = x1[0]; +} + +static INLINE __m256i scale_round_avx2(const __m256i a, const int scale) { + const __m256i scale__r = pair_set_w16_epi16(scale, 1 << (NewSqrt2Bits - 1)); + const __m256i b = _mm256_madd_epi16(a, scale__r); + return _mm256_srai_epi32(b, NewSqrt2Bits); +} + +static INLINE void fidentity16x16_new_avx2(const __m256i *input, + __m256i *output, int8_t cos_bit) { + (void)cos_bit; + const __m256i one = _mm256_set1_epi16(1); + + for (int i = 0; i < 16; ++i) { + const __m256i a_lo = _mm256_unpacklo_epi16(input[i], one); + const __m256i a_hi = _mm256_unpackhi_epi16(input[i], one); + const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2); + const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2); + output[i] = _mm256_packs_epi32(b_lo, b_hi); + } +} + +static INLINE void fidentity16x32_new_avx2(const __m256i *input, + __m256i *output, int8_t cos_bit) { + (void)cos_bit; + for (int i = 0; i < 32; ++i) { + output[i] = _mm256_slli_epi16(input[i], 2); + } +} + +static INLINE void av1_round_shift_array_32_avx2(__m256i *input, + __m256i *output, + const int size, + const int bit) { + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + output[i] = av1_round_shift_32_avx2(input[i], bit); + } + } else { + int i; + for (i = 0; i < size; i++) { + output[i] = _mm256_slli_epi32(input[i], -bit); + } + } +} + +static INLINE void av1_round_shift_rect_array_32_avx2(__m256i *input, + __m256i *output, + const int size, + const int bit) { + const __m256i sqrt2 = _mm256_set1_epi32(NewSqrt2); + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + const __m256i r0 = av1_round_shift_32_avx2(input[i], bit); + const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0); + output[i] = av1_round_shift_32_avx2(r1, NewSqrt2Bits); + } + } else { + int i; + for (i = 0; i < size; i++) { + const __m256i r0 = _mm256_slli_epi32(input[i], -bit); + const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0); + output[i] = av1_round_shift_32_avx2(r1, NewSqrt2Bits); + } + } +} + +static INLINE void transpose_32_8x8_avx2(int stride, const __m256i *inputA, + __m256i *output) { + __m256i temp0 = _mm256_unpacklo_epi32(inputA[0], inputA[2]); + __m256i temp1 = _mm256_unpackhi_epi32(inputA[0], inputA[2]); + __m256i temp2 = _mm256_unpacklo_epi32(inputA[1], inputA[3]); + __m256i temp3 = _mm256_unpackhi_epi32(inputA[1], inputA[3]); + __m256i temp4 = _mm256_unpacklo_epi32(inputA[4], inputA[6]); + __m256i temp5 = _mm256_unpackhi_epi32(inputA[4], inputA[6]); + __m256i temp6 = _mm256_unpacklo_epi32(inputA[5], inputA[7]); + __m256i temp7 = _mm256_unpackhi_epi32(inputA[5], inputA[7]); + + __m256i t0 = _mm256_unpacklo_epi32(temp0, temp2); + __m256i t1 = _mm256_unpackhi_epi32(temp0, temp2); + __m256i t2 = _mm256_unpacklo_epi32(temp1, temp3); + __m256i t3 = _mm256_unpackhi_epi32(temp1, temp3); + __m256i t4 = _mm256_unpacklo_epi32(temp4, temp6); + __m256i t5 = _mm256_unpackhi_epi32(temp4, temp6); + __m256i t6 = _mm256_unpacklo_epi32(temp5, temp7); + __m256i t7 = _mm256_unpackhi_epi32(temp5, temp7); + + output[0 * stride] = _mm256_permute2x128_si256(t0, t4, 0x20); + output[1 * stride] = _mm256_permute2x128_si256(t1, t5, 0x20); + output[2 * stride] = _mm256_permute2x128_si256(t2, t6, 0x20); + output[3 * stride] = _mm256_permute2x128_si256(t3, t7, 0x20); + output[4 * stride] = _mm256_permute2x128_si256(t0, t4, 0x31); + output[5 * stride] = _mm256_permute2x128_si256(t1, t5, 0x31); + output[6 * stride] = _mm256_permute2x128_si256(t2, t6, 0x31); + output[7 * stride] = _mm256_permute2x128_si256(t3, t7, 0x31); +} + +// Store 8 16 bit values. Sign extend the values. +static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in, + int32_t *out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + _mm256_store_si256((__m256i *)(out), + _mm256_cvtepi16_epi32(_mm256_castsi256_si128(in[i]))); + _mm256_store_si256( + (__m256i *)(out + 8), + _mm256_cvtepi16_epi32(_mm256_extracti128_si256(in[i], 1))); + out += stride; + } +} + +static INLINE void store_rect_16bit_to_32bit_avx2(const __m256i a, + int32_t *const b) { + const __m256i one = _mm256_set1_epi16(1); + const __m256i a_reoder = _mm256_permute4x64_epi64(a, 0xd8); + const __m256i a_lo = _mm256_unpacklo_epi16(a_reoder, one); + const __m256i a_hi = _mm256_unpackhi_epi16(a_reoder, one); + const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2); + const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2); + _mm256_store_si256((__m256i *)b, b_lo); + _mm256_store_si256((__m256i *)(b + 8), b_hi); +} + +static INLINE void store_rect_buffer_16bit_to_32bit_w16_avx2( + const __m256i *const in, int32_t *const out, const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_rect_16bit_to_32bit_avx2(in[i], out + i * stride); + } +} + +static const transform_1d_avx2 col_txfm16x32_arr[TX_TYPES] = { + fdct16x32_new_avx2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity16x32_new_avx2, // IDTX + fdct16x32_new_avx2, // V_DCT + fidentity16x32_new_avx2, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +static const transform_1d_avx2 row_txfm16x32_arr[TX_TYPES] = { + fdct16x32_new_avx2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity16x32_new_avx2, // IDTX + fidentity16x32_new_avx2, // V_DCT + fdct16x32_new_avx2, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +static const transform_1d_avx2 col_txfm16x16_arr[TX_TYPES] = { + fdct16x16_new_avx2, // DCT_DCT + fadst16x16_new_avx2, // ADST_DCT + fdct16x16_new_avx2, // DCT_ADST + fadst16x16_new_avx2, // ADST_ADST + fadst16x16_new_avx2, // FLIPADST_DCT + fdct16x16_new_avx2, // DCT_FLIPADST + fadst16x16_new_avx2, // FLIPADST_FLIPADST + fadst16x16_new_avx2, // ADST_FLIPADST + fadst16x16_new_avx2, // FLIPADST_ADST + fidentity16x16_new_avx2, // IDTX + fdct16x16_new_avx2, // V_DCT + fidentity16x16_new_avx2, // H_DCT + fadst16x16_new_avx2, // V_ADST + fidentity16x16_new_avx2, // H_ADST + fadst16x16_new_avx2, // V_FLIPADST + fidentity16x16_new_avx2 // H_FLIPADST +}; + +static const transform_1d_avx2 row_txfm16x16_arr[TX_TYPES] = { + fdct16x16_new_avx2, // DCT_DCT + fdct16x16_new_avx2, // ADST_DCT + fadst16x16_new_avx2, // DCT_ADST + fadst16x16_new_avx2, // ADST_ADST + fdct16x16_new_avx2, // FLIPADST_DCT + fadst16x16_new_avx2, // DCT_FLIPADST + fadst16x16_new_avx2, // FLIPADST_FLIPADST + fadst16x16_new_avx2, // ADST_FLIPADST + fadst16x16_new_avx2, // FLIPADST_ADST + fidentity16x16_new_avx2, // IDTX + fidentity16x16_new_avx2, // V_DCT + fdct16x16_new_avx2, // H_DCT + fidentity16x16_new_avx2, // V_ADST + fadst16x16_new_avx2, // H_ADST + fidentity16x16_new_avx2, // V_FLIPADST + fadst16x16_new_avx2 // H_FLIPADST +}; + +static void lowbd_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_16X16; + __m256i buf0[16], buf1[16]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + const int32_t i = 0; + if (ud_flip) { + load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + } + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i); + + __m256i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_avx2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + transpose_16bit_16x16_avx2(buf, buf); + store_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * width * i, width, 16); +} + +static void lowbd_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_32X32; + __m256i buf0[32], buf1[128]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, + height); + } else { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + } + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + transpose_16bit_16x16_avx2(buf0 + 0 * 16, buf1 + 0 * width + 16 * i); + transpose_16bit_16x16_avx2(buf0 + 1 * 16, buf1 + 1 * width + 16 * i); + } + + for (int i = 0; i < 2; i++) { + __m256i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_avx2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + transpose_16bit_16x16_avx2(buf, buf); + store_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * width * i, width, + 16); + transpose_16bit_16x16_avx2(buf + 16, buf + 16); + store_buffer_16bit_to_32bit_w16_avx2(buf + 16, output + 16 * width * i + 16, + width, 16); + } +} + +static void lowbd_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X64; + __m256i buf0[64], buf1[256]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct16x64_new_avx2; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(2, height_div16); ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + + for (int i = 0; i < AOMMIN(2, height_div16); i++) { + __m256i bufA[64]; + __m256i bufB[64]; + __m128i *buf = (__m128i *)(buf1 + width * i); + for (int j = 0; j < width; ++j) { + bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]); + bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]); + } + av1_fdct64_new_avx2(bufA, bufA, cos_bit_row); + av1_fdct64_new_avx2(bufB, bufB, cos_bit_row); + av1_round_shift_array_32_avx2(bufA, bufA, 32, -shift[2]); + av1_round_shift_array_32_avx2(bufB, bufB, 32, -shift[2]); + + int32_t *output8 = output + 16 * 32 * i; + for (int j = 0; j < 4; ++j) { + __m256i *out = (__m256i *)(output8 + 8 * j); + transpose_32_8x8_avx2(4, bufA + 8 * j, out); + transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4); + } + } +} + +static void lowbd_fwd_txfm2d_16x32_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_16X32; + __m256i buf0[32], buf1[32]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + if (ud_flip) { + load_buffer_16bit_to_16bit_flip_avx2(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_avx2(input, stride, buf0, height); + } + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + transpose_16bit_16x16_avx2(buf0, buf1); + transpose_16bit_16x16_avx2(buf0 + 16, buf1 + 16); + + for (int i = 0; i < 2; i++) { + __m256i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_avx2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + transpose_16bit_16x16_avx2(buf, buf); + store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * width * i, + width, 16); + } +} + +static void lowbd_fwd_txfm2d_32x16_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m256i buf0[32], buf1[64]; + const int8_t *shift = fwd_txfm_shift_ls[TX_32X16]; + const int txw_idx = get_txw_idx(TX_32X16); + const int txh_idx = get_txh_idx(TX_32X16); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 32; + const int height = 16; + const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, + height); + } else { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + } + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i); + } + + __m256i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_avx2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + transpose_16bit_16x16_avx2(buf, buf); + store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output, width, 16); + + transpose_16bit_16x16_avx2(buf + 16, buf + 16); + store_rect_buffer_16bit_to_32bit_w16_avx2(buf + 16, output + 16, width, 16); +} + +static void lowbd_fwd_txfm2d_64x32_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_64X32; + __m256i buf0[64], buf1[256]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type]; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(4, height_div16); ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + assert(tx_type == DCT_DCT); + for (int i = 0; i < AOMMIN(2, height_div16); i++) { + __m256i bufA[64]; + __m256i bufB[64]; + __m128i *buf = (__m128i *)(buf1 + width * i); + for (int j = 0; j < width; ++j) { + bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]); + bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]); + } + av1_fdct64_new_avx2(bufA, bufA, cos_bit_row); + av1_fdct64_new_avx2(bufB, bufB, cos_bit_row); + av1_round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2]); + av1_round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2]); + + int32_t *output8 = output + 16 * 32 * i; + for (int j = 0; j < 4; ++j) { + __m256i *out = (__m256i *)(output8 + 8 * j); + transpose_32_8x8_avx2(4, bufA + 8 * j, out); + transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4); + } + } +} + +static void lowbd_fwd_txfm2d_32x64_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_32X64; + __m256i buf0[64], buf1[256]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct16x64_new_avx2; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(2, height_div16); ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + + for (int i = 0; i < AOMMIN(2, height_div16); i++) { + __m256i bufA[32]; + __m256i bufB[32]; + __m128i *buf = (__m128i *)(buf1 + width * i); + for (int j = 0; j < width; ++j) { + bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]); + bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]); + } + av1_fdct32_new_avx2(bufA, bufA, cos_bit_row); + av1_fdct32_new_avx2(bufB, bufB, cos_bit_row); + av1_round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2]); + av1_round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2]); + + int32_t *output8 = output + 16 * 32 * i; + for (int j = 0; j < 4; ++j) { + __m256i *out = (__m256i *)(output8 + 8 * j); + transpose_32_8x8_avx2(4, bufA + 8 * j, out); + transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4); + } + } +} + +static void lowbd_fwd_txfm2d_16x64_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_16X64; + __m256i buf0[64], buf1[64]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct16x64_new_avx2; + const transform_1d_avx2 row_txfm = fdct16x16_new_avx2; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < height_div16; ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + + for (int i = 0; i < AOMMIN(4, height_div16); i++) { + __m256i *buf = buf1 + width * i; + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + int32_t *output16 = output + 16 * width * i; + for (int j = 0; j < width_div16; ++j) { + __m256i *buf16 = buf + 16 * j; + transpose_16bit_16x16_avx2(buf16, buf16); + store_buffer_16bit_to_32bit_w16_avx2(buf16, output16 + 16 * j, width, 16); + } + } + // Zero out the bottom 16x32 area. + memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output)); +} + +static void lowbd_fwd_txfm2d_64x16_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X16; + __m256i buf0[64], buf1[64]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct16x16_new_avx2; + const transform_1d_avx2 row_txfm = fdct16x64_new_avx2; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < height_div16; ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + + for (int i = 0; i < height_div16; i++) { + __m256i *buf = buf1 + width * i; + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + int32_t *output16 = output + 16 * 32 * i; + for (int j = 0; j < 2; ++j) { + __m256i *buf16 = buf + 16 * j; + transpose_16bit_16x16_avx2(buf16, buf16); + store_buffer_16bit_to_32bit_w16_avx2(buf16, output16 + 16 * j, 32, 16); + } + } +} + +static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { + av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform + av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform + lowbd_fwd_txfm2d_16x16_avx2, // 16x16 transform + lowbd_fwd_txfm2d_32x32_avx2, // 32x32 transform + lowbd_fwd_txfm2d_64x64_avx2, // 64x64 transform + av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform + av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform + av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform + av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform + lowbd_fwd_txfm2d_16x32_avx2, // 16x32 transform + lowbd_fwd_txfm2d_32x16_avx2, // 32x16 transform + lowbd_fwd_txfm2d_32x64_avx2, // 32x64 transform + lowbd_fwd_txfm2d_64x32_avx2, // 64x32 transform + av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform + av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform + av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform + av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform + lowbd_fwd_txfm2d_16x64_avx2, // 16x64 transform + lowbd_fwd_txfm2d_64x16_avx2, // 64x16 transform +}; + +void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size]; + if ((fwd_txfm2d_func == NULL) || + (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) { + av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); + } else { + fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); + } +} diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h new file mode 100644 index 000000000..c582ca0e3 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_FWD_TXFM_AVX2_H_ +#define AV1_FWD_TXFM_AVX2_H_ +#include + +static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) { + __m256i tmp, round; + round = _mm256_set1_epi32(1 << (bit - 1)); + tmp = _mm256_add_epi32(vec, round); + return _mm256_srai_epi32(tmp, bit); +} + +// out0 = in0*w0 + in1*w1 +// out1 = -in1*w0 + in0*w1 +static INLINE void btf_32_avx2_type0(const int32_t w0, const int32_t w1, + __m256i *in0, __m256i *in1, + const __m256i _r, const int32_t cos_bit) { + __m256i _in0 = *in0; + __m256i _in1 = *in1; + const __m256i ww0 = _mm256_set1_epi32(w0); + const __m256i ww1 = _mm256_set1_epi32(w1); + const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0); + const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1); + __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1); + temp0 = _mm256_add_epi32(temp0, _r); + *in0 = _mm256_srai_epi32(temp0, cos_bit); + const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1); + const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0); + __m256i temp1 = _mm256_sub_epi32(in0_w1, in1_w0); + temp1 = _mm256_add_epi32(temp1, _r); + *in1 = _mm256_srai_epi32(temp1, cos_bit); +} + +static INLINE void btf_32_avx2_type1(const int32_t w0, const int32_t w1, + __m256i *in0, __m256i *in1, + const __m256i _r, const int32_t cos_bit) { + __m256i _in0 = *in0; + __m256i _in1 = *in1; + const __m256i ww0 = _mm256_set1_epi32(w0); + const __m256i ww1 = _mm256_set1_epi32(w1); + const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0); + const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1); + __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1); + temp0 = _mm256_add_epi32(temp0, _r); + *in0 = _mm256_srai_epi32(temp0, cos_bit); + const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1); + const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0); + __m256i temp1 = _mm256_sub_epi32(in1_w0, in0_w1); + temp1 = _mm256_add_epi32(temp1, _r); + *in1 = _mm256_srai_epi32(temp1, cos_bit); +} + +// out0 = in0*w0 + in1*w1 +// out1 = -in1*w0 + in0*w1 +static INLINE void btf_32_avx2_type0_new(const __m256i ww0, const __m256i ww1, + __m256i *in0, __m256i *in1, + const __m256i _r, + const int32_t cos_bit) { + __m256i _in0 = *in0; + __m256i _in1 = *in1; + const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0); + const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1); + __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1); + temp0 = _mm256_add_epi32(temp0, _r); + *in0 = _mm256_srai_epi32(temp0, cos_bit); + const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1); + const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0); + __m256i temp1 = _mm256_sub_epi32(in0_w1, in1_w0); + temp1 = _mm256_add_epi32(temp1, _r); + *in1 = _mm256_srai_epi32(temp1, cos_bit); +} + +// out0 = in0*w0 + in1*w1 +// out1 = in1*w0 - in0*w1 +static INLINE void btf_32_avx2_type1_new(const __m256i ww0, const __m256i ww1, + __m256i *in0, __m256i *in1, + const __m256i _r, + const int32_t cos_bit) { + __m256i _in0 = *in0; + __m256i _in1 = *in1; + const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0); + const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1); + __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1); + temp0 = _mm256_add_epi32(temp0, _r); + *in0 = _mm256_srai_epi32(temp0, cos_bit); + const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1); + const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0); + __m256i temp1 = _mm256_sub_epi32(in1_w0, in0_w1); + temp1 = _mm256_add_epi32(temp1, _r); + *in1 = _mm256_srai_epi32(temp1, cos_bit); +} + +#endif // AV1_FWD_TXFM_AVX2_H_ diff --git a/third_party/aom/av1/encoder/x86/corner_match_sse4.c b/third_party/aom/av1/encoder/x86/corner_match_sse4.c index 381f757da..93f37b71d 100644 --- a/third_party/aom/av1/encoder/x86/corner_match_sse4.c +++ b/third_party/aom/av1/encoder/x86/corner_match_sse4.c @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + #include #include #include diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c new file mode 100644 index 000000000..f776e84c7 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "aom_dsp/x86/synonyms.h" + +#include "aom/aom_integer.h" + +#include "av1/common/reconinter.h" + +#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS) + +/** + * See av1_wedge_sse_from_residuals_c + */ +uint64_t av1_wedge_sse_from_residuals_avx2(const int16_t *r1, const int16_t *d, + const uint8_t *m, int N) { + int n = -N; + + uint64_t csse; + + const __m256i v_mask_max_w = _mm256_set1_epi16(MAX_MASK_VALUE); + const __m256i v_zext_q = _mm256_set1_epi64x(0xffffffff); + + __m256i v_acc0_q = _mm256_setzero_si256(); + + assert(N % 64 == 0); + + r1 += N; + d += N; + m += N; + + do { + const __m256i v_r0_w = _mm256_lddqu_si256((__m256i *)(r1 + n)); + const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(d + n)); + const __m128i v_m01_b = _mm_lddqu_si128((__m128i *)(m + n)); + + const __m256i v_rd0l_w = _mm256_unpacklo_epi16(v_d0_w, v_r0_w); + const __m256i v_rd0h_w = _mm256_unpackhi_epi16(v_d0_w, v_r0_w); + const __m256i v_m0_w = _mm256_cvtepu8_epi16(v_m01_b); + + const __m256i v_m0l_w = _mm256_unpacklo_epi16(v_m0_w, v_mask_max_w); + const __m256i v_m0h_w = _mm256_unpackhi_epi16(v_m0_w, v_mask_max_w); + + const __m256i v_t0l_d = _mm256_madd_epi16(v_rd0l_w, v_m0l_w); + const __m256i v_t0h_d = _mm256_madd_epi16(v_rd0h_w, v_m0h_w); + + const __m256i v_t0_w = _mm256_packs_epi32(v_t0l_d, v_t0h_d); + + const __m256i v_sq0_d = _mm256_madd_epi16(v_t0_w, v_t0_w); + + const __m256i v_sum0_q = _mm256_add_epi64( + _mm256_and_si256(v_sq0_d, v_zext_q), _mm256_srli_epi64(v_sq0_d, 32)); + + v_acc0_q = _mm256_add_epi64(v_acc0_q, v_sum0_q); + + n += 16; + } while (n); + + v_acc0_q = _mm256_add_epi64(v_acc0_q, _mm256_srli_si256(v_acc0_q, 8)); + __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc0_q); + __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc0_q, 1); + v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1); +#if ARCH_X86_64 + csse = (uint64_t)_mm_extract_epi64(v_acc_q_0, 0); +#else + xx_storel_64(&csse, v_acc_q_0); +#endif + + return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS); +} + +/** + * See av1_wedge_sign_from_residuals_c + */ +int av1_wedge_sign_from_residuals_avx2(const int16_t *ds, const uint8_t *m, + int N, int64_t limit) { + int64_t acc; + __m256i v_acc0_d = _mm256_setzero_si256(); + + // Input size limited to 8192 by the use of 32 bit accumulators and m + // being between [0, 64]. Overflow might happen at larger sizes, + // though it is practically impossible on real video input. + assert(N < 8192); + assert(N % 64 == 0); + + do { + const __m256i v_m01_b = _mm256_lddqu_si256((__m256i *)(m)); + const __m256i v_m23_b = _mm256_lddqu_si256((__m256i *)(m + 32)); + + const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(ds)); + const __m256i v_d1_w = _mm256_lddqu_si256((__m256i *)(ds + 16)); + const __m256i v_d2_w = _mm256_lddqu_si256((__m256i *)(ds + 32)); + const __m256i v_d3_w = _mm256_lddqu_si256((__m256i *)(ds + 48)); + + const __m256i v_m0_w = + _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m01_b)); + const __m256i v_m1_w = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m01_b, 1)); + const __m256i v_m2_w = + _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m23_b)); + const __m256i v_m3_w = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m23_b, 1)); + + const __m256i v_p0_d = _mm256_madd_epi16(v_d0_w, v_m0_w); + const __m256i v_p1_d = _mm256_madd_epi16(v_d1_w, v_m1_w); + const __m256i v_p2_d = _mm256_madd_epi16(v_d2_w, v_m2_w); + const __m256i v_p3_d = _mm256_madd_epi16(v_d3_w, v_m3_w); + + const __m256i v_p01_d = _mm256_add_epi32(v_p0_d, v_p1_d); + const __m256i v_p23_d = _mm256_add_epi32(v_p2_d, v_p3_d); + + const __m256i v_p0123_d = _mm256_add_epi32(v_p01_d, v_p23_d); + + v_acc0_d = _mm256_add_epi32(v_acc0_d, v_p0123_d); + + ds += 64; + m += 64; + + N -= 64; + } while (N); + + __m256i v_sign_d = _mm256_srai_epi32(v_acc0_d, 31); + v_acc0_d = _mm256_add_epi64(_mm256_unpacklo_epi32(v_acc0_d, v_sign_d), + _mm256_unpackhi_epi32(v_acc0_d, v_sign_d)); + + __m256i v_acc_q = _mm256_add_epi64(v_acc0_d, _mm256_srli_si256(v_acc0_d, 8)); + + __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc_q); + __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc_q, 1); + v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1); + +#if ARCH_X86_64 + acc = (uint64_t)_mm_extract_epi64(v_acc_q_0, 0); +#else + xx_storel_64(&acc, v_acc_q_0); +#endif + + return acc > limit; +} + +/** + * av1_wedge_compute_delta_squares_c + */ +void av1_wedge_compute_delta_squares_avx2(int16_t *d, const int16_t *a, + const int16_t *b, int N) { + const __m256i v_neg_w = _mm256_set1_epi32(0xffff0001); + + assert(N % 64 == 0); + + do { + const __m256i v_a0_w = _mm256_lddqu_si256((__m256i *)(a)); + const __m256i v_b0_w = _mm256_lddqu_si256((__m256i *)(b)); + const __m256i v_a1_w = _mm256_lddqu_si256((__m256i *)(a + 16)); + const __m256i v_b1_w = _mm256_lddqu_si256((__m256i *)(b + 16)); + const __m256i v_a2_w = _mm256_lddqu_si256((__m256i *)(a + 32)); + const __m256i v_b2_w = _mm256_lddqu_si256((__m256i *)(b + 32)); + const __m256i v_a3_w = _mm256_lddqu_si256((__m256i *)(a + 48)); + const __m256i v_b3_w = _mm256_lddqu_si256((__m256i *)(b + 48)); + + const __m256i v_ab0l_w = _mm256_unpacklo_epi16(v_a0_w, v_b0_w); + const __m256i v_ab0h_w = _mm256_unpackhi_epi16(v_a0_w, v_b0_w); + const __m256i v_ab1l_w = _mm256_unpacklo_epi16(v_a1_w, v_b1_w); + const __m256i v_ab1h_w = _mm256_unpackhi_epi16(v_a1_w, v_b1_w); + const __m256i v_ab2l_w = _mm256_unpacklo_epi16(v_a2_w, v_b2_w); + const __m256i v_ab2h_w = _mm256_unpackhi_epi16(v_a2_w, v_b2_w); + const __m256i v_ab3l_w = _mm256_unpacklo_epi16(v_a3_w, v_b3_w); + const __m256i v_ab3h_w = _mm256_unpackhi_epi16(v_a3_w, v_b3_w); + + // Negate top word of pairs + const __m256i v_abl0n_w = _mm256_sign_epi16(v_ab0l_w, v_neg_w); + const __m256i v_abh0n_w = _mm256_sign_epi16(v_ab0h_w, v_neg_w); + const __m256i v_abl1n_w = _mm256_sign_epi16(v_ab1l_w, v_neg_w); + const __m256i v_abh1n_w = _mm256_sign_epi16(v_ab1h_w, v_neg_w); + const __m256i v_abl2n_w = _mm256_sign_epi16(v_ab2l_w, v_neg_w); + const __m256i v_abh2n_w = _mm256_sign_epi16(v_ab2h_w, v_neg_w); + const __m256i v_abl3n_w = _mm256_sign_epi16(v_ab3l_w, v_neg_w); + const __m256i v_abh3n_w = _mm256_sign_epi16(v_ab3h_w, v_neg_w); + + const __m256i v_r0l_w = _mm256_madd_epi16(v_ab0l_w, v_abl0n_w); + const __m256i v_r0h_w = _mm256_madd_epi16(v_ab0h_w, v_abh0n_w); + const __m256i v_r1l_w = _mm256_madd_epi16(v_ab1l_w, v_abl1n_w); + const __m256i v_r1h_w = _mm256_madd_epi16(v_ab1h_w, v_abh1n_w); + const __m256i v_r2l_w = _mm256_madd_epi16(v_ab2l_w, v_abl2n_w); + const __m256i v_r2h_w = _mm256_madd_epi16(v_ab2h_w, v_abh2n_w); + const __m256i v_r3l_w = _mm256_madd_epi16(v_ab3l_w, v_abl3n_w); + const __m256i v_r3h_w = _mm256_madd_epi16(v_ab3h_w, v_abh3n_w); + + const __m256i v_r0_w = _mm256_packs_epi32(v_r0l_w, v_r0h_w); + const __m256i v_r1_w = _mm256_packs_epi32(v_r1l_w, v_r1h_w); + const __m256i v_r2_w = _mm256_packs_epi32(v_r2l_w, v_r2h_w); + const __m256i v_r3_w = _mm256_packs_epi32(v_r3l_w, v_r3h_w); + + _mm256_store_si256((__m256i *)(d), v_r0_w); + _mm256_store_si256((__m256i *)(d + 16), v_r1_w); + _mm256_store_si256((__m256i *)(d + 32), v_r2_w); + _mm256_store_si256((__m256i *)(d + 48), v_r3_w); + + a += 64; + b += 64; + d += 64; + N -= 64; + } while (N); +} -- cgit v1.2.3