diff options
author | trav90 <travawine@palemoon.org> | 2018-10-19 23:00:02 -0500 |
---|---|---|
committer | trav90 <travawine@palemoon.org> | 2018-10-19 23:00:02 -0500 |
commit | b8df135c97a854c2ff9b4394b016649c601177fa (patch) | |
tree | 802b7de5ad245f1a12adbcef835ab0d0687c1bf8 /third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c | |
parent | a4d3c59dcac642f6b9557dc09b60eda40b517630 (diff) | |
download | UXP-b8df135c97a854c2ff9b4394b016649c601177fa.tar UXP-b8df135c97a854c2ff9b4394b016649c601177fa.tar.gz UXP-b8df135c97a854c2ff9b4394b016649c601177fa.tar.lz UXP-b8df135c97a854c2ff9b4394b016649c601177fa.tar.xz UXP-b8df135c97a854c2ff9b4394b016649c601177fa.zip |
Update libaom to rev b25610052a1398032320008d69b51d2da94f5928
Diffstat (limited to 'third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c')
-rw-r--r-- | third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c | 1058 |
1 files changed, 520 insertions, 538 deletions
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c index 7415c58df..ae331b40d 100644 --- a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c +++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c @@ -19,49 +19,47 @@ #include "av1/common/x86/av1_inv_txfm_ssse3.h" static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); - btf_16_adds_subs_avx2(x1[0], x1[3]); - btf_16_adds_subs_avx2(x1[1], x1[2]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x1[5], x1[6], x1[5], x1[6]); - - btf_16_adds_subs_avx2(x1[8], x1[11]); - btf_16_adds_subs_avx2(x1[9], x1[10]); - btf_16_subs_adds_avx2(x1[15], x1[12]); - btf_16_subs_adds_avx2(x1[14], x1[13]); + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); + + btf_16_adds_subs_avx2(&x1[8], &x1[11]); + btf_16_adds_subs_avx2(&x1[9], &x1[10]); + btf_16_adds_subs_avx2(&x1[15], &x1[12]); + btf_16_adds_subs_avx2(&x1[14], &x1[13]); } static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); - btf_16_adds_subs_avx2(x[0], x[7]); - btf_16_adds_subs_avx2(x[1], x[6]); - btf_16_adds_subs_avx2(x[2], x[5]); - btf_16_adds_subs_avx2(x[3], x[4]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + btf_16_adds_subs_avx2(&x[0], &x[7]); + btf_16_adds_subs_avx2(&x[1], &x[6]); + btf_16_adds_subs_avx2(&x[2], &x[5]); + btf_16_adds_subs_avx2(&x[3], &x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); } static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) { - btf_16_adds_subs_out_avx2(output[0], output[15], x1[0], x1[15]); - btf_16_adds_subs_out_avx2(output[1], output[14], x1[1], x1[14]); - btf_16_adds_subs_out_avx2(output[2], output[13], x1[2], x1[13]); - btf_16_adds_subs_out_avx2(output[3], output[12], x1[3], x1[12]); - btf_16_adds_subs_out_avx2(output[4], output[11], x1[4], x1[11]); - btf_16_adds_subs_out_avx2(output[5], output[10], x1[5], x1[10]); - btf_16_adds_subs_out_avx2(output[6], output[9], x1[6], x1[9]); - btf_16_adds_subs_out_avx2(output[7], output[8], x1[7], x1[8]); + btf_16_adds_subs_out_avx2(&output[0], &output[15], x1[0], x1[15]); + btf_16_adds_subs_out_avx2(&output[1], &output[14], x1[1], x1[14]); + btf_16_adds_subs_out_avx2(&output[2], &output[13], x1[2], x1[13]); + btf_16_adds_subs_out_avx2(&output[3], &output[12], x1[3], x1[12]); + btf_16_adds_subs_out_avx2(&output[4], &output[11], x1[4], x1[11]); + btf_16_adds_subs_out_avx2(&output[5], &output[10], x1[5], x1[10]); + btf_16_adds_subs_out_avx2(&output[6], &output[9], x1[6], x1[9]); + btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]); } static void idct16_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]); __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]); @@ -103,29 +101,29 @@ static void idct16_new_avx2(const __m256i *input, __m256i *output, x1[15] = input[15]; // stage 2 - btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, x1[8], x1[15], x1[8], x1[15]); - btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, x1[9], x1[14], x1[9], x1[14]); - btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, x1[10], x1[13], x1[10], x1[13]); - btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, x1[11], x1[12], x1[11], x1[12]); + btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit); + btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit); // stage 3 - btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, x1[4], x1[7], x1[4], x1[7]); - btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, x1[5], x1[6], x1[5], x1[6]); - btf_16_adds_subs_avx2(x1[8], x1[9]); - btf_16_subs_adds_avx2(x1[11], x1[10]); - btf_16_adds_subs_avx2(x1[12], x1[13]); - btf_16_subs_adds_avx2(x1[15], x1[14]); + btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); // stage 4 - btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x1[0], x1[1]); - btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, x1[2], x1[3], x1[2], x1[3]); - btf_16_adds_subs_avx2(x1[4], x1[5]); - btf_16_subs_adds_avx2(x1[7], x1[6]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x1[9], x1[14], x1[9], x1[14]); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x1[10], x1[13], x1[10], x1[13]); - - idct16_stage5_avx2(x1, cospi, __rounding, cos_bit); - idct16_stage6_avx2(x1, cospi, __rounding, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); + + idct16_stage5_avx2(x1, cospi, _r, cos_bit); + idct16_stage6_avx2(x1, cospi, _r, cos_bit); idct16_stage7_avx2(output, x1); } @@ -133,7 +131,7 @@ static void idct16_low8_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); @@ -159,21 +157,21 @@ static void idct16_low8_new_avx2(const __m256i *input, __m256i *output, // stage 3 btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]); btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]); - btf_16_adds_subs_avx2(x1[8], x1[9]); - btf_16_subs_adds_avx2(x1[11], x1[10]); - btf_16_adds_subs_avx2(x1[12], x1[13]); - btf_16_subs_adds_avx2(x1[15], x1[14]); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); // stage 4 btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]); btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]); - btf_16_adds_subs_avx2(x1[4], x1[5]); - btf_16_subs_adds_avx2(x1[7], x1[6]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x1[9], x1[14], x1[9], x1[14]); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x1[10], x1[13], x1[10], x1[13]); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); - idct16_stage5_avx2(x1, cospi, __rounding, cos_bit); - idct16_stage6_avx2(x1, cospi, __rounding, cos_bit); + idct16_stage5_avx2(x1, cospi, _r, cos_bit); + idct16_stage6_avx2(x1, cospi, _r, cos_bit); idct16_stage7_avx2(output, x1); } @@ -212,74 +210,71 @@ static void idct16_low1_new_avx2(const __m256i *input, __m256i *output, } static INLINE void iadst16_stage3_avx2(__m256i *x) { - btf_16_adds_subs_avx2(x[0], x[8]); - btf_16_adds_subs_avx2(x[1], x[9]); - btf_16_adds_subs_avx2(x[2], x[10]); - btf_16_adds_subs_avx2(x[3], x[11]); - btf_16_adds_subs_avx2(x[4], x[12]); - btf_16_adds_subs_avx2(x[5], x[13]); - btf_16_adds_subs_avx2(x[6], x[14]); - btf_16_adds_subs_avx2(x[7], x[15]); + btf_16_adds_subs_avx2(&x[0], &x[8]); + btf_16_adds_subs_avx2(&x[1], &x[9]); + btf_16_adds_subs_avx2(&x[2], &x[10]); + btf_16_adds_subs_avx2(&x[3], &x[11]); + btf_16_adds_subs_avx2(&x[4], &x[12]); + btf_16_adds_subs_avx2(&x[5], &x[13]); + btf_16_adds_subs_avx2(&x[6], &x[14]); + btf_16_adds_subs_avx2(&x[7], &x[15]); } static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]); const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]); - btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); - btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]); - btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]); - btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]); + btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x[8], &x[9], _r, cos_bit); + btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x[10], &x[11], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x[12], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x[14], &x[15], _r, cos_bit); } static INLINE void iadst16_stage5_avx2(__m256i *x) { - btf_16_adds_subs_avx2(x[0], x[4]); - btf_16_adds_subs_avx2(x[1], x[5]); - btf_16_adds_subs_avx2(x[2], x[6]); - btf_16_adds_subs_avx2(x[3], x[7]); - btf_16_adds_subs_avx2(x[8], x[12]); - btf_16_adds_subs_avx2(x[9], x[13]); - btf_16_adds_subs_avx2(x[10], x[14]); - btf_16_adds_subs_avx2(x[11], x[15]); + btf_16_adds_subs_avx2(&x[0], &x[4]); + btf_16_adds_subs_avx2(&x[1], &x[5]); + btf_16_adds_subs_avx2(&x[2], &x[6]); + btf_16_adds_subs_avx2(&x[3], &x[7]); + btf_16_adds_subs_avx2(&x[8], &x[12]); + btf_16_adds_subs_avx2(&x[9], &x[13]); + btf_16_adds_subs_avx2(&x[10], &x[14]); + btf_16_adds_subs_avx2(&x[11], &x[15]); } static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]); - btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); - btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); - btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); - btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[4], &x[5], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[6], &x[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[12], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[14], &x[15], _r, cos_bit); } static INLINE void iadst16_stage7_avx2(__m256i *x) { - btf_16_adds_subs_avx2(x[0], x[2]); - btf_16_adds_subs_avx2(x[1], x[3]); - btf_16_adds_subs_avx2(x[4], x[6]); - btf_16_adds_subs_avx2(x[5], x[7]); - btf_16_adds_subs_avx2(x[8], x[10]); - btf_16_adds_subs_avx2(x[9], x[11]); - btf_16_adds_subs_avx2(x[12], x[14]); - btf_16_adds_subs_avx2(x[13], x[15]); + btf_16_adds_subs_avx2(&x[0], &x[2]); + btf_16_adds_subs_avx2(&x[1], &x[3]); + btf_16_adds_subs_avx2(&x[4], &x[6]); + btf_16_adds_subs_avx2(&x[5], &x[7]); + btf_16_adds_subs_avx2(&x[8], &x[10]); + btf_16_adds_subs_avx2(&x[9], &x[11]); + btf_16_adds_subs_avx2(&x[12], &x[14]); + btf_16_adds_subs_avx2(&x[13], &x[15]); } static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); - btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x1[2], x1[3]); - btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x1[6], x1[7]); - btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[10], x1[11], x1[10], x1[11]); - btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[14], x1[15], x1[14], x1[15]); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit); } static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) { @@ -307,7 +302,7 @@ static void iadst16_new_avx2(const __m256i *input, __m256i *output, (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); @@ -346,21 +341,21 @@ static void iadst16_new_avx2(const __m256i *input, __m256i *output, x1[15] = input[14]; // stage 2 - btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, x1[0], x1[1], x1[0], x1[1]); - btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, x1[2], x1[3], x1[2], x1[3]); - btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, x1[4], x1[5], x1[4], x1[5]); - btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, x1[6], x1[7], x1[6], x1[7]); - btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, x1[8], x1[9], x1[8], x1[9]); - btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, x1[10], x1[11], x1[10], x1[11]); - btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, x1[12], x1[13], x1[12], x1[13]); - btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, x1[14], x1[15], x1[14], x1[15]); + btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit); + btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit); + btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit); + btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit); iadst16_stage3_avx2(x1); - iadst16_stage4_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage4_avx2(x1, cospi, _r, cos_bit); iadst16_stage5_avx2(x1); - iadst16_stage6_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage6_avx2(x1, cospi, _r, cos_bit); iadst16_stage7_avx2(x1); - iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage8_avx2(x1, cospi, _r, cos_bit); iadst16_stage9_avx2(output, x1); } @@ -368,7 +363,7 @@ static void iadst16_low8_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); // stage 1 __m256i x1[16]; @@ -392,11 +387,11 @@ static void iadst16_low8_new_avx2(const __m256i *input, __m256i *output, btf_16_w16_0_avx2(cospi[58], cospi[06], x1[14], x1[14], x1[15]); iadst16_stage3_avx2(x1); - iadst16_stage4_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage4_avx2(x1, cospi, _r, cos_bit); iadst16_stage5_avx2(x1); - iadst16_stage6_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage6_avx2(x1, cospi, _r, cos_bit); iadst16_stage7_avx2(x1); - iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage8_avx2(x1, cospi, _r, cos_bit); iadst16_stage9_avx2(output, x1); } @@ -404,7 +399,7 @@ static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); @@ -423,7 +418,7 @@ static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output, x1[9] = x1[1]; // stage 4 - btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, x1[8], x1[9], x1[8], x1[9]); + btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit); // stage 5 x1[4] = x1[0]; @@ -433,8 +428,8 @@ static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output, x1[13] = x1[9]; // stage 6 - btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x1[4], x1[5], x1[4], x1[5]); - btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x1[12], x1[13], x1[12], x1[13]); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit); // stage 7 x1[2] = x1[0]; @@ -446,130 +441,125 @@ static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output, x1[14] = x1[12]; x1[15] = x1[13]; - iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit); + iadst16_stage8_avx2(x1, cospi, _r, cos_bit); iadst16_stage9_avx2(output, x1); } static INLINE void idct32_high16_stage3_avx2(__m256i *x) { - btf_16_adds_subs_avx2(x[16], x[17]); - btf_16_subs_adds_avx2(x[19], x[18]); - btf_16_adds_subs_avx2(x[20], x[21]); - btf_16_subs_adds_avx2(x[23], x[22]); - btf_16_adds_subs_avx2(x[24], x[25]); - btf_16_subs_adds_avx2(x[27], x[26]); - btf_16_adds_subs_avx2(x[28], x[29]); - btf_16_subs_adds_avx2(x[31], x[30]); + btf_16_adds_subs_avx2(&x[16], &x[17]); + btf_16_adds_subs_avx2(&x[19], &x[18]); + btf_16_adds_subs_avx2(&x[20], &x[21]); + btf_16_adds_subs_avx2(&x[23], &x[22]); + btf_16_adds_subs_avx2(&x[24], &x[25]); + btf_16_adds_subs_avx2(&x[27], &x[26]); + btf_16_adds_subs_avx2(&x[28], &x[29]); + btf_16_adds_subs_avx2(&x[31], &x[30]); } static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); - btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); - btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]); - btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]); - btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit); } static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); - btf_16_adds_subs_avx2(x[16], x[19]); - btf_16_adds_subs_avx2(x[17], x[18]); - btf_16_subs_adds_avx2(x[23], x[20]); - btf_16_subs_adds_avx2(x[22], x[21]); - btf_16_adds_subs_avx2(x[24], x[27]); - btf_16_adds_subs_avx2(x[25], x[26]); - btf_16_subs_adds_avx2(x[31], x[28]); - btf_16_subs_adds_avx2(x[30], x[29]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit); + btf_16_adds_subs_avx2(&x[16], &x[19]); + btf_16_adds_subs_avx2(&x[17], &x[18]); + btf_16_adds_subs_avx2(&x[23], &x[20]); + btf_16_adds_subs_avx2(&x[22], &x[21]); + btf_16_adds_subs_avx2(&x[24], &x[27]); + btf_16_adds_subs_avx2(&x[25], &x[26]); + btf_16_adds_subs_avx2(&x[31], &x[28]); + btf_16_adds_subs_avx2(&x[30], &x[29]); } static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); - btf_16_adds_subs_avx2(x[8], x[11]); - btf_16_adds_subs_avx2(x[9], x[10]); - btf_16_subs_adds_avx2(x[15], x[12]); - btf_16_subs_adds_avx2(x[14], x[13]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x[8], &x[11]); + btf_16_adds_subs_avx2(&x[9], &x[10]); + btf_16_adds_subs_avx2(&x[15], &x[12]); + btf_16_adds_subs_avx2(&x[14], &x[13]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit); } static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); - btf_16_adds_subs_avx2(x[0], x[7]); - btf_16_adds_subs_avx2(x[1], x[6]); - btf_16_adds_subs_avx2(x[2], x[5]); - btf_16_adds_subs_avx2(x[3], x[4]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); - btf_16_adds_subs_avx2(x[16], x[23]); - btf_16_adds_subs_avx2(x[17], x[22]); - btf_16_adds_subs_avx2(x[18], x[21]); - btf_16_adds_subs_avx2(x[19], x[20]); - btf_16_subs_adds_avx2(x[31], x[24]); - btf_16_subs_adds_avx2(x[30], x[25]); - btf_16_subs_adds_avx2(x[29], x[26]); - btf_16_subs_adds_avx2(x[28], x[27]); + btf_16_adds_subs_avx2(&x[0], &x[7]); + btf_16_adds_subs_avx2(&x[1], &x[6]); + btf_16_adds_subs_avx2(&x[2], &x[5]); + btf_16_adds_subs_avx2(&x[3], &x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x[16], &x[23]); + btf_16_adds_subs_avx2(&x[17], &x[22]); + btf_16_adds_subs_avx2(&x[18], &x[21]); + btf_16_adds_subs_avx2(&x[19], &x[20]); + btf_16_adds_subs_avx2(&x[31], &x[24]); + btf_16_adds_subs_avx2(&x[30], &x[25]); + btf_16_adds_subs_avx2(&x[29], &x[26]); + btf_16_adds_subs_avx2(&x[28], &x[27]); } static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); - btf_16_adds_subs_avx2(x[0], x[15]); - btf_16_adds_subs_avx2(x[1], x[14]); - btf_16_adds_subs_avx2(x[2], x[13]); - btf_16_adds_subs_avx2(x[3], x[12]); - btf_16_adds_subs_avx2(x[4], x[11]); - btf_16_adds_subs_avx2(x[5], x[10]); - btf_16_adds_subs_avx2(x[6], x[9]); - btf_16_adds_subs_avx2(x[7], x[8]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]); + btf_16_adds_subs_avx2(&x[0], &x[15]); + btf_16_adds_subs_avx2(&x[1], &x[14]); + btf_16_adds_subs_avx2(&x[2], &x[13]); + btf_16_adds_subs_avx2(&x[3], &x[12]); + btf_16_adds_subs_avx2(&x[4], &x[11]); + btf_16_adds_subs_avx2(&x[5], &x[10]); + btf_16_adds_subs_avx2(&x[6], &x[9]); + btf_16_adds_subs_avx2(&x[7], &x[8]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit); } static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) { - btf_16_adds_subs_out_avx2(output[0], output[31], x[0], x[31]); - btf_16_adds_subs_out_avx2(output[1], output[30], x[1], x[30]); - btf_16_adds_subs_out_avx2(output[2], output[29], x[2], x[29]); - btf_16_adds_subs_out_avx2(output[3], output[28], x[3], x[28]); - btf_16_adds_subs_out_avx2(output[4], output[27], x[4], x[27]); - btf_16_adds_subs_out_avx2(output[5], output[26], x[5], x[26]); - btf_16_adds_subs_out_avx2(output[6], output[25], x[6], x[25]); - btf_16_adds_subs_out_avx2(output[7], output[24], x[7], x[24]); - btf_16_adds_subs_out_avx2(output[8], output[23], x[8], x[23]); - btf_16_adds_subs_out_avx2(output[9], output[22], x[9], x[22]); - btf_16_adds_subs_out_avx2(output[10], output[21], x[10], x[21]); - btf_16_adds_subs_out_avx2(output[11], output[20], x[11], x[20]); - btf_16_adds_subs_out_avx2(output[12], output[19], x[12], x[19]); - btf_16_adds_subs_out_avx2(output[13], output[18], x[13], x[18]); - btf_16_adds_subs_out_avx2(output[14], output[17], x[14], x[17]); - btf_16_adds_subs_out_avx2(output[15], output[16], x[15], x[16]); + btf_16_adds_subs_out_avx2(&output[0], &output[31], x[0], x[31]); + btf_16_adds_subs_out_avx2(&output[1], &output[30], x[1], x[30]); + btf_16_adds_subs_out_avx2(&output[2], &output[29], x[2], x[29]); + btf_16_adds_subs_out_avx2(&output[3], &output[28], x[3], x[28]); + btf_16_adds_subs_out_avx2(&output[4], &output[27], x[4], x[27]); + btf_16_adds_subs_out_avx2(&output[5], &output[26], x[5], x[26]); + btf_16_adds_subs_out_avx2(&output[6], &output[25], x[6], x[25]); + btf_16_adds_subs_out_avx2(&output[7], &output[24], x[7], x[24]); + btf_16_adds_subs_out_avx2(&output[8], &output[23], x[8], x[23]); + btf_16_adds_subs_out_avx2(&output[9], &output[22], x[9], x[22]); + btf_16_adds_subs_out_avx2(&output[10], &output[21], x[10], x[21]); + btf_16_adds_subs_out_avx2(&output[11], &output[20], x[11], x[20]); + btf_16_adds_subs_out_avx2(&output[12], &output[19], x[12], x[19]); + btf_16_adds_subs_out_avx2(&output[13], &output[18], x[13], x[18]); + btf_16_adds_subs_out_avx2(&output[14], &output[17], x[14], x[17]); + btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]); } static void idct32_low1_new_avx2(const __m256i *input, __m256i *output, @@ -629,7 +619,7 @@ static void idct32_low8_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); // stage 1 __m256i x[32]; @@ -666,20 +656,20 @@ static void idct32_low8_new_avx2(const __m256i *input, __m256i *output, x[10] = x[11]; x[13] = x[12]; x[14] = x[15]; - idct32_high16_stage4_avx2(x, cospi, __rounding, cos_bit); + idct32_high16_stage4_avx2(x, cospi, _r, cos_bit); // stage 5 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); x[5] = x[4]; x[6] = x[7]; - idct32_high24_stage5_avx2(x, cospi, __rounding, cos_bit); + idct32_high24_stage5_avx2(x, cospi, _r, cos_bit); // stage 6 x[3] = x[0]; x[2] = x[1]; - idct32_high28_stage6_avx2(x, cospi, __rounding, cos_bit); + idct32_high28_stage6_avx2(x, cospi, _r, cos_bit); - idct32_stage7_avx2(x, cospi, __rounding, cos_bit); - idct32_stage8_avx2(x, cospi, __rounding, cos_bit); + idct32_stage7_avx2(x, cospi, _r, cos_bit); + idct32_stage8_avx2(x, cospi, _r, cos_bit); idct32_stage9_avx2(output, x); } @@ -687,7 +677,7 @@ static void idct32_low16_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); // stage 1 __m256i x[32]; @@ -728,25 +718,25 @@ static void idct32_low16_new_avx2(const __m256i *input, __m256i *output, // stage 4 btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]); - btf_16_adds_subs_avx2(x[8], x[9]); - btf_16_subs_adds_avx2(x[11], x[10]); - btf_16_adds_subs_avx2(x[12], x[13]); - btf_16_subs_adds_avx2(x[15], x[14]); - idct32_high16_stage4_avx2(x, cospi, __rounding, cos_bit); + btf_16_adds_subs_avx2(&x[8], &x[9]); + btf_16_adds_subs_avx2(&x[11], &x[10]); + btf_16_adds_subs_avx2(&x[12], &x[13]); + btf_16_adds_subs_avx2(&x[15], &x[14]); + idct32_high16_stage4_avx2(x, cospi, _r, cos_bit); // stage 5 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]); - btf_16_adds_subs_avx2(x[4], x[5]); - btf_16_subs_adds_avx2(x[7], x[6]); - idct32_high24_stage5_avx2(x, cospi, __rounding, cos_bit); + btf_16_adds_subs_avx2(&x[4], &x[5]); + btf_16_adds_subs_avx2(&x[7], &x[6]); + idct32_high24_stage5_avx2(x, cospi, _r, cos_bit); - btf_16_adds_subs_avx2(x[0], x[3]); - btf_16_adds_subs_avx2(x[1], x[2]); - idct32_high28_stage6_avx2(x, cospi, __rounding, cos_bit); + btf_16_adds_subs_avx2(&x[0], &x[3]); + btf_16_adds_subs_avx2(&x[1], &x[2]); + idct32_high28_stage6_avx2(x, cospi, _r, cos_bit); - idct32_stage7_avx2(x, cospi, __rounding, cos_bit); - idct32_stage8_avx2(x, cospi, __rounding, cos_bit); + idct32_stage7_avx2(x, cospi, _r, cos_bit); + idct32_stage8_avx2(x, cospi, _r, cos_bit); idct32_stage9_avx2(output, x); } @@ -754,7 +744,7 @@ static void idct32_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); @@ -825,51 +815,50 @@ static void idct32_new_avx2(const __m256i *input, __m256i *output, x1[31] = input[31]; // stage 2 - btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, x1[16], x1[31], x1[16], x1[31]); - btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, x1[17], x1[30], x1[17], x1[30]); - btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, x1[18], x1[29], x1[18], x1[29]); - btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, x1[19], x1[28], x1[19], x1[28]); - btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, x1[20], x1[27], x1[20], x1[27]); - btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, x1[21], x1[26], x1[21], x1[26]); - btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, x1[22], x1[25], x1[22], x1[25]); - btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, x1[23], x1[24], x1[23], x1[24]); + btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r, cos_bit); + btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r, cos_bit); + btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r, cos_bit); // stage 3 - btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, x1[8], x1[15], x1[8], x1[15]); - btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, x1[9], x1[14], x1[9], x1[14]); - btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, x1[10], x1[13], x1[10], x1[13]); - btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, x1[11], x1[12], x1[11], x1[12]); + btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit); + btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit); idct32_high16_stage3_avx2(x1); // stage 4 - btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, x1[4], x1[7], x1[4], x1[7]); - btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, x1[5], x1[6], x1[5], x1[6]); - btf_16_adds_subs_avx2(x1[8], x1[9]); - btf_16_subs_adds_avx2(x1[11], x1[10]); - btf_16_adds_subs_avx2(x1[12], x1[13]); - btf_16_subs_adds_avx2(x1[15], x1[14]); - idct32_high16_stage4_avx2(x1, cospi, __rounding, cos_bit); + btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + idct32_high16_stage4_avx2(x1, cospi, _r, cos_bit); // stage 5 - btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x1[0], x1[1]); - btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, x1[2], x1[3], x1[2], x1[3]); - btf_16_adds_subs_avx2(x1[4], x1[5]); - btf_16_subs_adds_avx2(x1[7], x1[6]); - idct32_high24_stage5_avx2(x1, cospi, __rounding, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + idct32_high24_stage5_avx2(x1, cospi, _r, cos_bit); // stage 6 - btf_16_adds_subs_avx2(x1[0], x1[3]); - btf_16_adds_subs_avx2(x1[1], x1[2]); - idct32_high28_stage6_avx2(x1, cospi, __rounding, cos_bit); + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + idct32_high28_stage6_avx2(x1, cospi, _r, cos_bit); - idct32_stage7_avx2(x1, cospi, __rounding, cos_bit); - idct32_stage8_avx2(x1, cospi, __rounding, cos_bit); + idct32_stage7_avx2(x1, cospi, _r, cos_bit); + idct32_stage8_avx2(x1, cospi, _r, cos_bit); idct32_stage9_avx2(output, x1); } static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); @@ -883,19 +872,18 @@ static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi, const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]); - btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]); - btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]); - btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]); - btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]); - btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]); - btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]); - btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]); - btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]); + btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit); + btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x[34], &x[61], _r, cos_bit); + btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x[37], &x[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit); + btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x[42], &x[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x[45], &x[50], _r, cos_bit); + btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit); } static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); @@ -903,31 +891,30 @@ static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi, const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); - btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); - btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]); - btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]); - btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); - btf_16_adds_subs_avx2(x[32], x[35]); - btf_16_adds_subs_avx2(x[33], x[34]); - btf_16_subs_adds_avx2(x[39], x[36]); - btf_16_subs_adds_avx2(x[38], x[37]); - btf_16_adds_subs_avx2(x[40], x[43]); - btf_16_adds_subs_avx2(x[41], x[42]); - btf_16_subs_adds_avx2(x[47], x[44]); - btf_16_subs_adds_avx2(x[46], x[45]); - btf_16_adds_subs_avx2(x[48], x[51]); - btf_16_adds_subs_avx2(x[49], x[50]); - btf_16_subs_adds_avx2(x[55], x[52]); - btf_16_subs_adds_avx2(x[54], x[53]); - btf_16_adds_subs_avx2(x[56], x[59]); - btf_16_adds_subs_avx2(x[57], x[58]); - btf_16_subs_adds_avx2(x[63], x[60]); - btf_16_subs_adds_avx2(x[62], x[61]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit); + btf_16_adds_subs_avx2(&x[32], &x[35]); + btf_16_adds_subs_avx2(&x[33], &x[34]); + btf_16_adds_subs_avx2(&x[39], &x[36]); + btf_16_adds_subs_avx2(&x[38], &x[37]); + btf_16_adds_subs_avx2(&x[40], &x[43]); + btf_16_adds_subs_avx2(&x[41], &x[42]); + btf_16_adds_subs_avx2(&x[47], &x[44]); + btf_16_adds_subs_avx2(&x[46], &x[45]); + btf_16_adds_subs_avx2(&x[48], &x[51]); + btf_16_adds_subs_avx2(&x[49], &x[50]); + btf_16_adds_subs_avx2(&x[55], &x[52]); + btf_16_adds_subs_avx2(&x[54], &x[53]); + btf_16_adds_subs_avx2(&x[56], &x[59]); + btf_16_adds_subs_avx2(&x[57], &x[58]); + btf_16_adds_subs_avx2(&x[63], &x[60]); + btf_16_adds_subs_avx2(&x[62], &x[61]); } static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); @@ -935,185 +922,180 @@ static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi, const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); - btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]); - btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]); - btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]); - btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]); - btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]); - btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]); - btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]); - btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[34], &x[61], _r, cos_bit); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[35], &x[60], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[36], &x[59], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[37], &x[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[42], &x[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[43], &x[52], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[44], &x[51], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[45], &x[50], _r, cos_bit); } static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { - btf_16_adds_subs_avx2(x[16], x[19]); - btf_16_adds_subs_avx2(x[17], x[18]); - btf_16_subs_adds_avx2(x[23], x[20]); - btf_16_subs_adds_avx2(x[22], x[21]); - btf_16_adds_subs_avx2(x[24], x[27]); - btf_16_adds_subs_avx2(x[25], x[26]); - btf_16_subs_adds_avx2(x[31], x[28]); - btf_16_subs_adds_avx2(x[30], x[29]); - idct64_stage6_high32_avx2(x, cospi, __rounding, cos_bit); + const __m256i _r, int8_t cos_bit) { + btf_16_adds_subs_avx2(&x[16], &x[19]); + btf_16_adds_subs_avx2(&x[17], &x[18]); + btf_16_adds_subs_avx2(&x[23], &x[20]); + btf_16_adds_subs_avx2(&x[22], &x[21]); + btf_16_adds_subs_avx2(&x[24], &x[27]); + btf_16_adds_subs_avx2(&x[25], &x[26]); + btf_16_adds_subs_avx2(&x[31], &x[28]); + btf_16_adds_subs_avx2(&x[30], &x[29]); + idct64_stage6_high32_avx2(x, cospi, _r, cos_bit); } static INLINE void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]); - btf_16_adds_subs_avx2(x[32], x[39]); - btf_16_adds_subs_avx2(x[33], x[38]); - btf_16_adds_subs_avx2(x[34], x[37]); - btf_16_adds_subs_avx2(x[35], x[36]); - btf_16_subs_adds_avx2(x[47], x[40]); - btf_16_subs_adds_avx2(x[46], x[41]); - btf_16_subs_adds_avx2(x[45], x[42]); - btf_16_subs_adds_avx2(x[44], x[43]); - btf_16_adds_subs_avx2(x[48], x[55]); - btf_16_adds_subs_avx2(x[49], x[54]); - btf_16_adds_subs_avx2(x[50], x[53]); - btf_16_adds_subs_avx2(x[51], x[52]); - btf_16_subs_adds_avx2(x[63], x[56]); - btf_16_subs_adds_avx2(x[62], x[57]); - btf_16_subs_adds_avx2(x[61], x[58]); - btf_16_subs_adds_avx2(x[60], x[59]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit); + btf_16_adds_subs_avx2(&x[32], &x[39]); + btf_16_adds_subs_avx2(&x[33], &x[38]); + btf_16_adds_subs_avx2(&x[34], &x[37]); + btf_16_adds_subs_avx2(&x[35], &x[36]); + btf_16_adds_subs_avx2(&x[47], &x[40]); + btf_16_adds_subs_avx2(&x[46], &x[41]); + btf_16_adds_subs_avx2(&x[45], &x[42]); + btf_16_adds_subs_avx2(&x[44], &x[43]); + btf_16_adds_subs_avx2(&x[48], &x[55]); + btf_16_adds_subs_avx2(&x[49], &x[54]); + btf_16_adds_subs_avx2(&x[50], &x[53]); + btf_16_adds_subs_avx2(&x[51], &x[52]); + btf_16_adds_subs_avx2(&x[63], &x[56]); + btf_16_adds_subs_avx2(&x[62], &x[57]); + btf_16_adds_subs_avx2(&x[61], &x[58]); + btf_16_adds_subs_avx2(&x[60], &x[59]); } static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); - btf_16_adds_subs_avx2(x[16], x[23]); - btf_16_adds_subs_avx2(x[17], x[22]); - btf_16_adds_subs_avx2(x[18], x[21]); - btf_16_adds_subs_avx2(x[19], x[20]); - btf_16_subs_adds_avx2(x[31], x[24]); - btf_16_subs_adds_avx2(x[30], x[25]); - btf_16_subs_adds_avx2(x[29], x[26]); - btf_16_subs_adds_avx2(x[28], x[27]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]); + btf_16_adds_subs_avx2(&x[16], &x[23]); + btf_16_adds_subs_avx2(&x[17], &x[22]); + btf_16_adds_subs_avx2(&x[18], &x[21]); + btf_16_adds_subs_avx2(&x[19], &x[20]); + btf_16_adds_subs_avx2(&x[31], &x[24]); + btf_16_adds_subs_avx2(&x[30], &x[25]); + btf_16_adds_subs_avx2(&x[29], &x[26]); + btf_16_adds_subs_avx2(&x[28], &x[27]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[36], &x[59], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[37], &x[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[38], &x[57], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[39], &x[56], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[40], &x[55], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[41], &x[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[42], &x[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[43], &x[52], _r, cos_bit); } static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); - btf_16_adds_subs_avx2(x[0], x[15]); - btf_16_adds_subs_avx2(x[1], x[14]); - btf_16_adds_subs_avx2(x[2], x[13]); - btf_16_adds_subs_avx2(x[3], x[12]); - btf_16_adds_subs_avx2(x[4], x[11]); - btf_16_adds_subs_avx2(x[5], x[10]); - btf_16_adds_subs_avx2(x[6], x[9]); - btf_16_adds_subs_avx2(x[7], x[8]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]); - btf_16_adds_subs_avx2(x[32], x[47]); - btf_16_adds_subs_avx2(x[33], x[46]); - btf_16_adds_subs_avx2(x[34], x[45]); - btf_16_adds_subs_avx2(x[35], x[44]); - btf_16_adds_subs_avx2(x[36], x[43]); - btf_16_adds_subs_avx2(x[37], x[42]); - btf_16_adds_subs_avx2(x[38], x[41]); - btf_16_adds_subs_avx2(x[39], x[40]); - btf_16_subs_adds_avx2(x[63], x[48]); - btf_16_subs_adds_avx2(x[62], x[49]); - btf_16_subs_adds_avx2(x[61], x[50]); - btf_16_subs_adds_avx2(x[60], x[51]); - btf_16_subs_adds_avx2(x[59], x[52]); - btf_16_subs_adds_avx2(x[58], x[53]); - btf_16_subs_adds_avx2(x[57], x[54]); - btf_16_subs_adds_avx2(x[56], x[55]); + btf_16_adds_subs_avx2(&x[0], &x[15]); + btf_16_adds_subs_avx2(&x[1], &x[14]); + btf_16_adds_subs_avx2(&x[2], &x[13]); + btf_16_adds_subs_avx2(&x[3], &x[12]); + btf_16_adds_subs_avx2(&x[4], &x[11]); + btf_16_adds_subs_avx2(&x[5], &x[10]); + btf_16_adds_subs_avx2(&x[6], &x[9]); + btf_16_adds_subs_avx2(&x[7], &x[8]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit); + btf_16_adds_subs_avx2(&x[32], &x[47]); + btf_16_adds_subs_avx2(&x[33], &x[46]); + btf_16_adds_subs_avx2(&x[34], &x[45]); + btf_16_adds_subs_avx2(&x[35], &x[44]); + btf_16_adds_subs_avx2(&x[36], &x[43]); + btf_16_adds_subs_avx2(&x[37], &x[42]); + btf_16_adds_subs_avx2(&x[38], &x[41]); + btf_16_adds_subs_avx2(&x[39], &x[40]); + btf_16_adds_subs_avx2(&x[63], &x[48]); + btf_16_adds_subs_avx2(&x[62], &x[49]); + btf_16_adds_subs_avx2(&x[61], &x[50]); + btf_16_adds_subs_avx2(&x[60], &x[51]); + btf_16_adds_subs_avx2(&x[59], &x[52]); + btf_16_adds_subs_avx2(&x[58], &x[53]); + btf_16_adds_subs_avx2(&x[57], &x[54]); + btf_16_adds_subs_avx2(&x[56], &x[55]); } static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi, - const __m256i __rounding, - int8_t cos_bit) { + const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); - btf_16_adds_subs_avx2(x[0], x[31]); - btf_16_adds_subs_avx2(x[1], x[30]); - btf_16_adds_subs_avx2(x[2], x[29]); - btf_16_adds_subs_avx2(x[3], x[28]); - btf_16_adds_subs_avx2(x[4], x[27]); - btf_16_adds_subs_avx2(x[5], x[26]); - btf_16_adds_subs_avx2(x[6], x[25]); - btf_16_adds_subs_avx2(x[7], x[24]); - btf_16_adds_subs_avx2(x[8], x[23]); - btf_16_adds_subs_avx2(x[9], x[22]); - btf_16_adds_subs_avx2(x[10], x[21]); - btf_16_adds_subs_avx2(x[11], x[20]); - btf_16_adds_subs_avx2(x[12], x[19]); - btf_16_adds_subs_avx2(x[13], x[18]); - btf_16_adds_subs_avx2(x[14], x[17]); - btf_16_adds_subs_avx2(x[15], x[16]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]); + btf_16_adds_subs_avx2(&x[0], &x[31]); + btf_16_adds_subs_avx2(&x[1], &x[30]); + btf_16_adds_subs_avx2(&x[2], &x[29]); + btf_16_adds_subs_avx2(&x[3], &x[28]); + btf_16_adds_subs_avx2(&x[4], &x[27]); + btf_16_adds_subs_avx2(&x[5], &x[26]); + btf_16_adds_subs_avx2(&x[6], &x[25]); + btf_16_adds_subs_avx2(&x[7], &x[24]); + btf_16_adds_subs_avx2(&x[8], &x[23]); + btf_16_adds_subs_avx2(&x[9], &x[22]); + btf_16_adds_subs_avx2(&x[10], &x[21]); + btf_16_adds_subs_avx2(&x[11], &x[20]); + btf_16_adds_subs_avx2(&x[12], &x[19]); + btf_16_adds_subs_avx2(&x[13], &x[18]); + btf_16_adds_subs_avx2(&x[14], &x[17]); + btf_16_adds_subs_avx2(&x[15], &x[16]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[40], &x[55], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[41], &x[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[42], &x[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[43], &x[52], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[44], &x[51], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[45], &x[50], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[46], &x[49], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[47], &x[48], _r, cos_bit); } static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) { - btf_16_adds_subs_out_avx2(output[0], output[63], x[0], x[63]); - btf_16_adds_subs_out_avx2(output[1], output[62], x[1], x[62]); - btf_16_adds_subs_out_avx2(output[2], output[61], x[2], x[61]); - btf_16_adds_subs_out_avx2(output[3], output[60], x[3], x[60]); - btf_16_adds_subs_out_avx2(output[4], output[59], x[4], x[59]); - btf_16_adds_subs_out_avx2(output[5], output[58], x[5], x[58]); - btf_16_adds_subs_out_avx2(output[6], output[57], x[6], x[57]); - btf_16_adds_subs_out_avx2(output[7], output[56], x[7], x[56]); - btf_16_adds_subs_out_avx2(output[8], output[55], x[8], x[55]); - btf_16_adds_subs_out_avx2(output[9], output[54], x[9], x[54]); - btf_16_adds_subs_out_avx2(output[10], output[53], x[10], x[53]); - btf_16_adds_subs_out_avx2(output[11], output[52], x[11], x[52]); - btf_16_adds_subs_out_avx2(output[12], output[51], x[12], x[51]); - btf_16_adds_subs_out_avx2(output[13], output[50], x[13], x[50]); - btf_16_adds_subs_out_avx2(output[14], output[49], x[14], x[49]); - btf_16_adds_subs_out_avx2(output[15], output[48], x[15], x[48]); - btf_16_adds_subs_out_avx2(output[16], output[47], x[16], x[47]); - btf_16_adds_subs_out_avx2(output[17], output[46], x[17], x[46]); - btf_16_adds_subs_out_avx2(output[18], output[45], x[18], x[45]); - btf_16_adds_subs_out_avx2(output[19], output[44], x[19], x[44]); - btf_16_adds_subs_out_avx2(output[20], output[43], x[20], x[43]); - btf_16_adds_subs_out_avx2(output[21], output[42], x[21], x[42]); - btf_16_adds_subs_out_avx2(output[22], output[41], x[22], x[41]); - btf_16_adds_subs_out_avx2(output[23], output[40], x[23], x[40]); - btf_16_adds_subs_out_avx2(output[24], output[39], x[24], x[39]); - btf_16_adds_subs_out_avx2(output[25], output[38], x[25], x[38]); - btf_16_adds_subs_out_avx2(output[26], output[37], x[26], x[37]); - btf_16_adds_subs_out_avx2(output[27], output[36], x[27], x[36]); - btf_16_adds_subs_out_avx2(output[28], output[35], x[28], x[35]); - btf_16_adds_subs_out_avx2(output[29], output[34], x[29], x[34]); - btf_16_adds_subs_out_avx2(output[30], output[33], x[30], x[33]); - btf_16_adds_subs_out_avx2(output[31], output[32], x[31], x[32]); + btf_16_adds_subs_out_avx2(&output[0], &output[63], x[0], x[63]); + btf_16_adds_subs_out_avx2(&output[1], &output[62], x[1], x[62]); + btf_16_adds_subs_out_avx2(&output[2], &output[61], x[2], x[61]); + btf_16_adds_subs_out_avx2(&output[3], &output[60], x[3], x[60]); + btf_16_adds_subs_out_avx2(&output[4], &output[59], x[4], x[59]); + btf_16_adds_subs_out_avx2(&output[5], &output[58], x[5], x[58]); + btf_16_adds_subs_out_avx2(&output[6], &output[57], x[6], x[57]); + btf_16_adds_subs_out_avx2(&output[7], &output[56], x[7], x[56]); + btf_16_adds_subs_out_avx2(&output[8], &output[55], x[8], x[55]); + btf_16_adds_subs_out_avx2(&output[9], &output[54], x[9], x[54]); + btf_16_adds_subs_out_avx2(&output[10], &output[53], x[10], x[53]); + btf_16_adds_subs_out_avx2(&output[11], &output[52], x[11], x[52]); + btf_16_adds_subs_out_avx2(&output[12], &output[51], x[12], x[51]); + btf_16_adds_subs_out_avx2(&output[13], &output[50], x[13], x[50]); + btf_16_adds_subs_out_avx2(&output[14], &output[49], x[14], x[49]); + btf_16_adds_subs_out_avx2(&output[15], &output[48], x[15], x[48]); + btf_16_adds_subs_out_avx2(&output[16], &output[47], x[16], x[47]); + btf_16_adds_subs_out_avx2(&output[17], &output[46], x[17], x[46]); + btf_16_adds_subs_out_avx2(&output[18], &output[45], x[18], x[45]); + btf_16_adds_subs_out_avx2(&output[19], &output[44], x[19], x[44]); + btf_16_adds_subs_out_avx2(&output[20], &output[43], x[20], x[43]); + btf_16_adds_subs_out_avx2(&output[21], &output[42], x[21], x[42]); + btf_16_adds_subs_out_avx2(&output[22], &output[41], x[22], x[41]); + btf_16_adds_subs_out_avx2(&output[23], &output[40], x[23], x[40]); + btf_16_adds_subs_out_avx2(&output[24], &output[39], x[24], x[39]); + btf_16_adds_subs_out_avx2(&output[25], &output[38], x[25], x[38]); + btf_16_adds_subs_out_avx2(&output[26], &output[37], x[26], x[37]); + btf_16_adds_subs_out_avx2(&output[27], &output[36], x[27], x[36]); + btf_16_adds_subs_out_avx2(&output[28], &output[35], x[28], x[35]); + btf_16_adds_subs_out_avx2(&output[29], &output[34], x[29], x[34]); + btf_16_adds_subs_out_avx2(&output[30], &output[33], x[30], x[33]); + btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]); } static void idct64_low1_new_avx2(const __m256i *input, __m256i *output, @@ -1207,7 +1189,7 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); @@ -1260,16 +1242,16 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output, x[22] = x[23]; x[25] = x[24]; x[30] = x[31]; - btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]); - btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]); - btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]); - btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]); + btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit); + btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit); + btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit); // stage 5 x[9] = x[8]; x[14] = x[15]; - btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); - btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit); x[35] = x[32]; x[34] = x[33]; x[36] = x[39]; @@ -1289,7 +1271,7 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output, // stage 6 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit); x[19] = x[16]; x[18] = x[17]; x[20] = x[23]; @@ -1298,7 +1280,7 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output, x[26] = x[25]; x[28] = x[31]; x[29] = x[30]; - idct64_stage6_high32_avx2(x, cospi, __rounding, cos_bit); + idct64_stage6_high32_avx2(x, cospi, _r, cos_bit); // stage 7 x[3] = x[0]; @@ -1307,7 +1289,7 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output, x[10] = x[9]; x[12] = x[15]; x[13] = x[14]; - idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit); + idct64_stage7_high48_avx2(x, cospi, _r, cos_bit); // stage 8 x[7] = x[0]; @@ -1315,12 +1297,12 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output, x[5] = x[2]; x[4] = x[3]; x[9] = x[9]; - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); - idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); + idct64_stage8_high48_avx2(x, cospi, _r, cos_bit); - idct64_stage9_avx2(x, cospi, __rounding, cos_bit); - idct64_stage10_avx2(x, cospi, __rounding, cos_bit); + idct64_stage9_avx2(x, cospi, _r, cos_bit); + idct64_stage10_avx2(x, cospi, _r, cos_bit); idct64_stage11_avx2(output, x); } @@ -1328,7 +1310,7 @@ static void idct64_low16_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); @@ -1398,7 +1380,7 @@ static void idct64_low16_new_avx2(const __m256i *input, __m256i *output, x[26] = x[27]; x[29] = x[28]; x[30] = x[31]; - idct64_stage4_high32_avx2(x, cospi, __rounding, cos_bit); + idct64_stage4_high32_avx2(x, cospi, _r, cos_bit); // stage 5 btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); @@ -1406,37 +1388,37 @@ static void idct64_low16_new_avx2(const __m256i *input, __m256i *output, x[10] = x[11]; x[13] = x[12]; x[14] = x[15]; - idct64_stage5_high48_avx2(x, cospi, __rounding, cos_bit); + idct64_stage5_high48_avx2(x, cospi, _r, cos_bit); // stage 6 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); x[5] = x[4]; x[6] = x[7]; - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); - idct64_stage6_high48_avx2(x, cospi, __rounding, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit); + idct64_stage6_high48_avx2(x, cospi, _r, cos_bit); // stage 7 x[3] = x[0]; x[2] = x[1]; - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); - btf_16_adds_subs_avx2(x[8], x[11]); - btf_16_adds_subs_avx2(x[9], x[10]); - btf_16_subs_adds_avx2(x[15], x[12]); - btf_16_subs_adds_avx2(x[14], x[13]); - idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x[8], &x[11]); + btf_16_adds_subs_avx2(&x[9], &x[10]); + btf_16_adds_subs_avx2(&x[15], &x[12]); + btf_16_adds_subs_avx2(&x[14], &x[13]); + idct64_stage7_high48_avx2(x, cospi, _r, cos_bit); // stage 8 - btf_16_adds_subs_avx2(x[0], x[7]); - btf_16_adds_subs_avx2(x[1], x[6]); - btf_16_adds_subs_avx2(x[2], x[5]); - btf_16_adds_subs_avx2(x[3], x[4]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); - idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit); - - idct64_stage9_avx2(x, cospi, __rounding, cos_bit); - idct64_stage10_avx2(x, cospi, __rounding, cos_bit); + btf_16_adds_subs_avx2(&x[0], &x[7]); + btf_16_adds_subs_avx2(&x[1], &x[6]); + btf_16_adds_subs_avx2(&x[2], &x[5]); + btf_16_adds_subs_avx2(&x[3], &x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); + idct64_stage8_high48_avx2(x, cospi, _r, cos_bit); + + idct64_stage9_avx2(x, cospi, _r, cos_bit); + idct64_stage10_avx2(x, cospi, _r, cos_bit); idct64_stage11_avx2(output, x); } @@ -1444,7 +1426,7 @@ static void idct64_low32_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); @@ -1514,78 +1496,78 @@ static void idct64_low32_new_avx2(const __m256i *input, __m256i *output, btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]); btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]); btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); - btf_16_adds_subs_avx2(x[32], x[33]); - btf_16_subs_adds_avx2(x[35], x[34]); - btf_16_adds_subs_avx2(x[36], x[37]); - btf_16_subs_adds_avx2(x[39], x[38]); - btf_16_adds_subs_avx2(x[40], x[41]); - btf_16_subs_adds_avx2(x[43], x[42]); - btf_16_adds_subs_avx2(x[44], x[45]); - btf_16_subs_adds_avx2(x[47], x[46]); - btf_16_adds_subs_avx2(x[48], x[49]); - btf_16_subs_adds_avx2(x[51], x[50]); - btf_16_adds_subs_avx2(x[52], x[53]); - btf_16_subs_adds_avx2(x[55], x[54]); - btf_16_adds_subs_avx2(x[56], x[57]); - btf_16_subs_adds_avx2(x[59], x[58]); - btf_16_adds_subs_avx2(x[60], x[61]); - btf_16_subs_adds_avx2(x[63], x[62]); + btf_16_adds_subs_avx2(&x[32], &x[33]); + btf_16_adds_subs_avx2(&x[35], &x[34]); + btf_16_adds_subs_avx2(&x[36], &x[37]); + btf_16_adds_subs_avx2(&x[39], &x[38]); + btf_16_adds_subs_avx2(&x[40], &x[41]); + btf_16_adds_subs_avx2(&x[43], &x[42]); + btf_16_adds_subs_avx2(&x[44], &x[45]); + btf_16_adds_subs_avx2(&x[47], &x[46]); + btf_16_adds_subs_avx2(&x[48], &x[49]); + btf_16_adds_subs_avx2(&x[51], &x[50]); + btf_16_adds_subs_avx2(&x[52], &x[53]); + btf_16_adds_subs_avx2(&x[55], &x[54]); + btf_16_adds_subs_avx2(&x[56], &x[57]); + btf_16_adds_subs_avx2(&x[59], &x[58]); + btf_16_adds_subs_avx2(&x[60], &x[61]); + btf_16_adds_subs_avx2(&x[63], &x[62]); // stage 4 btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]); btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]); btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); - btf_16_adds_subs_avx2(x[16], x[17]); - btf_16_subs_adds_avx2(x[19], x[18]); - btf_16_adds_subs_avx2(x[20], x[21]); - btf_16_subs_adds_avx2(x[23], x[22]); - btf_16_adds_subs_avx2(x[24], x[25]); - btf_16_subs_adds_avx2(x[27], x[26]); - btf_16_adds_subs_avx2(x[28], x[29]); - btf_16_subs_adds_avx2(x[31], x[30]); - idct64_stage4_high32_avx2(x, cospi, __rounding, cos_bit); + btf_16_adds_subs_avx2(&x[16], &x[17]); + btf_16_adds_subs_avx2(&x[19], &x[18]); + btf_16_adds_subs_avx2(&x[20], &x[21]); + btf_16_adds_subs_avx2(&x[23], &x[22]); + btf_16_adds_subs_avx2(&x[24], &x[25]); + btf_16_adds_subs_avx2(&x[27], &x[26]); + btf_16_adds_subs_avx2(&x[28], &x[29]); + btf_16_adds_subs_avx2(&x[31], &x[30]); + idct64_stage4_high32_avx2(x, cospi, _r, cos_bit); // stage 5 btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]); - btf_16_adds_subs_avx2(x[8], x[9]); - btf_16_subs_adds_avx2(x[11], x[10]); - btf_16_adds_subs_avx2(x[12], x[13]); - btf_16_subs_adds_avx2(x[15], x[14]); - idct64_stage5_high48_avx2(x, cospi, __rounding, cos_bit); + btf_16_adds_subs_avx2(&x[8], &x[9]); + btf_16_adds_subs_avx2(&x[11], &x[10]); + btf_16_adds_subs_avx2(&x[12], &x[13]); + btf_16_adds_subs_avx2(&x[15], &x[14]); + idct64_stage5_high48_avx2(x, cospi, _r, cos_bit); // stage 6 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]); - btf_16_adds_subs_avx2(x[4], x[5]); - btf_16_subs_adds_avx2(x[7], x[6]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); - idct64_stage6_high48_avx2(x, cospi, __rounding, cos_bit); + btf_16_adds_subs_avx2(&x[4], &x[5]); + btf_16_adds_subs_avx2(&x[7], &x[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit); + idct64_stage6_high48_avx2(x, cospi, _r, cos_bit); // stage 7 - btf_16_adds_subs_avx2(x[0], x[3]); - btf_16_adds_subs_avx2(x[1], x[2]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); - btf_16_adds_subs_avx2(x[8], x[11]); - btf_16_adds_subs_avx2(x[9], x[10]); - btf_16_subs_adds_avx2(x[15], x[12]); - btf_16_subs_adds_avx2(x[14], x[13]); - idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit); + btf_16_adds_subs_avx2(&x[0], &x[3]); + btf_16_adds_subs_avx2(&x[1], &x[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x[8], &x[11]); + btf_16_adds_subs_avx2(&x[9], &x[10]); + btf_16_adds_subs_avx2(&x[15], &x[12]); + btf_16_adds_subs_avx2(&x[14], &x[13]); + idct64_stage7_high48_avx2(x, cospi, _r, cos_bit); // stage 8 - btf_16_adds_subs_avx2(x[0], x[7]); - btf_16_adds_subs_avx2(x[1], x[6]); - btf_16_adds_subs_avx2(x[2], x[5]); - btf_16_adds_subs_avx2(x[3], x[4]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); - idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit); + btf_16_adds_subs_avx2(&x[0], &x[7]); + btf_16_adds_subs_avx2(&x[1], &x[6]); + btf_16_adds_subs_avx2(&x[2], &x[5]); + btf_16_adds_subs_avx2(&x[3], &x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); + idct64_stage8_high48_avx2(x, cospi, _r, cos_bit); // stage 9~11 - idct64_stage9_avx2(x, cospi, __rounding, cos_bit); - idct64_stage10_avx2(x, cospi, __rounding, cos_bit); + idct64_stage9_avx2(x, cospi, _r, cos_bit); + idct64_stage10_avx2(x, cospi, _r, cos_bit); idct64_stage11_avx2(output, x); } @@ -1667,7 +1649,7 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2( if (lr_flip) { for (int j = 0; j < buf_size_w_div16; ++j) { __m256i temp[16]; - flip_buf_av2(buf0 + 16 * j, temp, 16); + flip_buf_avx2(buf0 + 16 * j, temp, 16); int offset = txfm_size_row * (buf_size_w_div16 - 1 - j); transpose_16bit_16x16_avx2(temp, buf1_cur + offset); } @@ -1693,18 +1675,18 @@ static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input, int txw_idx, int rect_type) { const int32_t *input_row = input; const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]); - const __m256i rounding = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) + - (1 << (NewSqrt2Bits - shift - 1))); + const __m256i _r = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) + + (1 << (NewSqrt2Bits - shift - 1))); const __m256i one = _mm256_set1_epi16(1); - const __m256i scale_rounding = _mm256_unpacklo_epi16(scale, rounding); + const __m256i scale__r = _mm256_unpacklo_epi16(scale, _r); if (rect_type != 1 && rect_type != -1) { for (int i = 0; i < height; ++i) { const __m256i src = load_32bit_to_16bit_w16_avx2(input_row); input_row += stride; __m256i lo = _mm256_unpacklo_epi16(src, one); __m256i hi = _mm256_unpackhi_epi16(src, one); - lo = _mm256_madd_epi16(lo, scale_rounding); - hi = _mm256_madd_epi16(hi, scale_rounding); + lo = _mm256_madd_epi16(lo, scale__r); + hi = _mm256_madd_epi16(hi, scale__r); lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift); hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift); out[i] = _mm256_packs_epi32(lo, hi); @@ -1718,8 +1700,8 @@ static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input, input_row += stride; __m256i lo = _mm256_unpacklo_epi16(src, one); __m256i hi = _mm256_unpackhi_epi16(src, one); - lo = _mm256_madd_epi16(lo, scale_rounding); - hi = _mm256_madd_epi16(hi, scale_rounding); + lo = _mm256_madd_epi16(lo, scale__r); + hi = _mm256_madd_epi16(hi, scale__r); lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift); hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift); out[i] = _mm256_packs_epi32(lo, hi); @@ -1731,10 +1713,10 @@ static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride, __m256i *buf, int shift, int height, int txh_idx) { const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]); - const __m256i scale_rounding = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1)); - const __m256i shift_rounding = _mm256_set1_epi32(1 << (-shift - 1)); + const __m256i scale__r = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1)); + const __m256i shift__r = _mm256_set1_epi32(1 << (-shift - 1)); const __m256i one = _mm256_set1_epi16(1); - const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale_rounding); + const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale__r); for (int h = 0; h < height; ++h) { __m256i lo = _mm256_unpacklo_epi16(buf[h], one); __m256i hi = _mm256_unpackhi_epi16(buf[h], one); @@ -1742,8 +1724,8 @@ static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride, hi = _mm256_madd_epi16(hi, scale_coeff); lo = _mm256_srai_epi32(lo, NewSqrt2Bits); hi = _mm256_srai_epi32(hi, NewSqrt2Bits); - lo = _mm256_add_epi32(lo, shift_rounding); - hi = _mm256_add_epi32(hi, shift_rounding); + lo = _mm256_add_epi32(lo, shift__r); + hi = _mm256_add_epi32(hi, shift__r); lo = _mm256_srai_epi32(lo, -shift); hi = _mm256_srai_epi32(hi, -shift); const __m256i x = _mm256_packs_epi32(lo, hi); @@ -1856,7 +1838,7 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2( if (lr_flip) { for (int j = 0; j < buf_size_w_div16; ++j) { __m256i temp[16]; - flip_buf_av2(buf0 + 16 * j, temp, 16); + flip_buf_avx2(buf0 + 16 * j, temp, 16); transpose_16bit_16x16_avx2(temp, _buf1 + 16 * (buf_size_w_div16 - 1 - j)); } |