From bbcc64772580c8a979288791afa02d30bc476d2e Mon Sep 17 00:00:00 2001 From: trav90 Date: Fri, 19 Oct 2018 21:52:15 -0500 Subject: Update aom to v1.0.0 Update aom to commit id d14c5bb4f336ef1842046089849dee4a301fbbf0. --- .../aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c | 1205 +++++++ .../aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c | 306 ++ .../aom/av1/encoder/x86/av1_fwd_txfm_sse2.c | 2889 ++++++++++++++++ .../aom/av1/encoder/x86/av1_fwd_txfm_sse2.h | 117 + .../aom/av1/encoder/x86/av1_highbd_quantize_avx2.c | 84 +- .../aom/av1/encoder/x86/av1_highbd_quantize_sse4.c | 103 +- .../aom/av1/encoder/x86/av1_quantize_avx2.c | 234 +- .../aom/av1/encoder/x86/av1_quantize_sse2.c | 273 +- .../aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm | 3 + third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h | 141 + .../aom/av1/encoder/x86/corner_match_sse4.c | 3 +- third_party/aom/av1/encoder/x86/dct_intrin_sse2.c | 3483 -------------------- third_party/aom/av1/encoder/x86/dct_sse2.asm | 5 - third_party/aom/av1/encoder/x86/encodetxb_sse2.c | 505 +++ third_party/aom/av1/encoder/x86/encodetxb_sse4.c | 80 + .../aom/av1/encoder/x86/error_intrin_avx2.c | 3 +- third_party/aom/av1/encoder/x86/error_sse2.asm | 46 - third_party/aom/av1/encoder/x86/hash_sse42.c | 51 + .../aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c | 1276 +++---- .../aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c | 1627 --------- .../av1/encoder/x86/temporal_filter_apply_sse2.asm | 2 + third_party/aom/av1/encoder/x86/wedge_utils_sse2.c | 2 +- 22 files changed, 6114 insertions(+), 6324 deletions(-) create mode 100644 third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c create mode 100644 third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c create mode 100644 third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h create mode 100644 third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h delete mode 100644 third_party/aom/av1/encoder/x86/dct_intrin_sse2.c create mode 100644 
third_party/aom/av1/encoder/x86/encodetxb_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/encodetxb_sse4.c create mode 100644 third_party/aom/av1/encoder/x86/hash_sse42.c delete mode 100644 third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c (limited to 'third_party/aom/av1/encoder/x86') diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c new file mode 100644 index 000000000..84065d6de --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c @@ -0,0 +1,1205 @@ +#include "av1/encoder/x86/av1_txfm1d_sse4.h" + +/* av1_fdct32_new_sse4_1: 32-input forward transform over __m128i vectors (presumably the 32-point forward DCT, per the name - confirm). Reads input[0..31] and writes output[0..31]; all additions and subtractions use _mm_add_epi32/_mm_sub_epi32, so each vector is handled as four independent 32-bit lanes. Stages 1-8 ping-pong between the local scratch arrays buf1 and buf0 (stage 1 fills buf1 from input, stage 8 leaves its result in buf0); stage 9 copies buf0 into output in 5-bit bit-reversed index order (output[1] = buf0[16], output[2] = buf0[8], ...). cos_bit selects the coefficient table through cospi_arr() and is forwarded to the btf_32_sse4_1_type0/type1 helpers, which are defined outside this chunk (presumably rotation butterflies with a cos_bit rounding shift - confirm in av1_txfm1d_sse4.h). cospi is re-fetched with the same cos_bit at the top of each of stages 2-8. */ +void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, + int8_t cos_bit) { + __m128i buf0[32]; + __m128i buf1[32]; + const int32_t *cospi; + // stage 0 + // stage 1 + buf1[0] = _mm_add_epi32(input[0], input[31]); + buf1[31] = _mm_sub_epi32(input[0], input[31]); + buf1[1] = _mm_add_epi32(input[1], input[30]); + buf1[30] = _mm_sub_epi32(input[1], input[30]); + buf1[2] = _mm_add_epi32(input[2], input[29]); + buf1[29] = _mm_sub_epi32(input[2], input[29]); + buf1[3] = _mm_add_epi32(input[3], input[28]); + buf1[28] = _mm_sub_epi32(input[3], input[28]); + buf1[4] = _mm_add_epi32(input[4], input[27]); + buf1[27] = _mm_sub_epi32(input[4], input[27]); + buf1[5] = _mm_add_epi32(input[5], input[26]); + buf1[26] = _mm_sub_epi32(input[5], input[26]); + buf1[6] = _mm_add_epi32(input[6], input[25]); + buf1[25] = _mm_sub_epi32(input[6], input[25]); + buf1[7] = _mm_add_epi32(input[7], input[24]); + buf1[24] = _mm_sub_epi32(input[7], input[24]); + buf1[8] = _mm_add_epi32(input[8], input[23]); + buf1[23] = _mm_sub_epi32(input[8], input[23]); + buf1[9] = _mm_add_epi32(input[9], input[22]); + buf1[22] = _mm_sub_epi32(input[9], input[22]); + buf1[10] = _mm_add_epi32(input[10], input[21]); + buf1[21] = _mm_sub_epi32(input[10], input[21]); + buf1[11] = _mm_add_epi32(input[11], input[20]); + buf1[20] = _mm_sub_epi32(input[11], input[20]); + buf1[12] = _mm_add_epi32(input[12], 
input[19]); + buf1[19] = _mm_sub_epi32(input[12], input[19]); + buf1[13] = _mm_add_epi32(input[13], input[18]); + buf1[18] = _mm_sub_epi32(input[13], input[18]); + buf1[14] = _mm_add_epi32(input[14], input[17]); + buf1[17] = _mm_sub_epi32(input[14], input[17]); + buf1[15] = _mm_add_epi32(input[15], input[16]); + buf1[16] = _mm_sub_epi32(input[15], input[16]); + + // stage 2 + cospi = cospi_arr(cos_bit); + buf0[0] = _mm_add_epi32(buf1[0], buf1[15]); + buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]); + buf0[1] = _mm_add_epi32(buf1[1], buf1[14]); + buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]); + buf0[2] = _mm_add_epi32(buf1[2], buf1[13]); + buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]); + buf0[3] = _mm_add_epi32(buf1[3], buf1[12]); + buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]); + buf0[4] = _mm_add_epi32(buf1[4], buf1[11]); + buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]); + buf0[5] = _mm_add_epi32(buf1[5], buf1[10]); + buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]); + buf0[6] = _mm_add_epi32(buf1[6], buf1[9]); + buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]); + buf0[7] = _mm_add_epi32(buf1[7], buf1[8]); + buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + buf0[18] = buf1[18]; + buf0[19] = buf1[19]; + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23], + buf0[24], cos_bit); + buf0[28] = buf1[28]; + buf0[29] = buf1[29]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 3 + cospi = cospi_arr(cos_bit); + buf1[0] = _mm_add_epi32(buf0[0], buf0[7]); + buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]); + buf1[1] = _mm_add_epi32(buf0[1], buf0[6]); + buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]); + buf1[2] = _mm_add_epi32(buf0[2], buf0[5]); + buf1[5] = 
_mm_sub_epi32(buf0[2], buf0[5]); + buf1[3] = _mm_add_epi32(buf0[3], buf0[4]); + buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]); + buf1[8] = buf0[8]; + buf1[9] = buf0[9]; + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11], + buf1[12], cos_bit); + buf1[14] = buf0[14]; + buf1[15] = buf0[15]; + buf1[16] = _mm_add_epi32(buf0[16], buf0[23]); + buf1[23] = _mm_sub_epi32(buf0[16], buf0[23]); + buf1[17] = _mm_add_epi32(buf0[17], buf0[22]); + buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]); + buf1[18] = _mm_add_epi32(buf0[18], buf0[21]); + buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]); + buf1[19] = _mm_add_epi32(buf0[19], buf0[20]); + buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]); + buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]); + buf1[31] = _mm_add_epi32(buf0[31], buf0[24]); + buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]); + buf1[30] = _mm_add_epi32(buf0[30], buf0[25]); + buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]); + buf1[29] = _mm_add_epi32(buf0[29], buf0[26]); + buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]); + buf1[28] = _mm_add_epi32(buf0[28], buf0[27]); + + // stage 4 + cospi = cospi_arr(cos_bit); + buf0[0] = _mm_add_epi32(buf1[0], buf1[3]); + buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]); + buf0[1] = _mm_add_epi32(buf1[1], buf1[2]); + buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]); + buf0[4] = buf1[4]; + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6], + cos_bit); + buf0[7] = buf1[7]; + buf0[8] = _mm_add_epi32(buf1[8], buf1[11]); + buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]); + buf0[9] = _mm_add_epi32(buf1[9], buf1[10]); + buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]); + buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]); + buf0[15] = _mm_add_epi32(buf1[15], buf1[12]); + buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]); + buf0[14] = _mm_add_epi32(buf1[14], buf1[13]); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + btf_32_sse4_1_type0(-cospi[16], 
cospi[48], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19], + buf0[28], cos_bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + buf0[22] = buf1[22]; + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[25] = buf1[25]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 5 + cospi = cospi_arr(cos_bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1], + cos_bit); + btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3], + cos_bit); + buf1[4] = _mm_add_epi32(buf0[4], buf0[5]); + buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]); + buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]); + buf1[7] = _mm_add_epi32(buf0[7], buf0[6]); + buf1[8] = buf0[8]; + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], + buf1[14], cos_bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + buf1[11] = buf0[11]; + buf1[12] = buf0[12]; + buf1[15] = buf0[15]; + buf1[16] = _mm_add_epi32(buf0[16], buf0[19]); + buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]); + buf1[17] = _mm_add_epi32(buf0[17], buf0[18]); + buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]); + buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]); + buf1[23] = _mm_add_epi32(buf0[23], buf0[20]); + buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]); + buf1[22] = _mm_add_epi32(buf0[22], buf0[21]); + buf1[24] = _mm_add_epi32(buf0[24], buf0[27]); + buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]); + buf1[25] = _mm_add_epi32(buf0[25], buf0[26]); + buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]); + buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]); + buf1[31] = _mm_add_epi32(buf0[31], buf0[28]); + buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]); + buf1[30] = _mm_add_epi32(buf0[30], buf0[29]); + + // stage 6 + cospi = cospi_arr(cos_bit); 
+ buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7], + cos_bit); + btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6], + cos_bit); + buf0[8] = _mm_add_epi32(buf1[8], buf1[9]); + buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]); + buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]); + buf0[11] = _mm_add_epi32(buf1[11], buf1[10]); + buf0[12] = _mm_add_epi32(buf1[12], buf1[13]); + buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]); + buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]); + buf0[15] = _mm_add_epi32(buf1[15], buf1[14]); + buf0[16] = buf1[16]; + btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17], + buf0[30], cos_bit); + btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + buf0[19] = buf1[19]; + buf0[20] = buf1[20]; + btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[27] = buf1[27]; + buf0[28] = buf1[28]; + buf0[31] = buf1[31]; + + // stage 7 + cospi = cospi_arr(cos_bit); + buf1[0] = buf0[0]; + buf1[1] = buf0[1]; + buf1[2] = buf0[2]; + buf1[3] = buf0[3]; + buf1[4] = buf0[4]; + buf1[5] = buf0[5]; + buf1[6] = buf0[6]; + buf1[7] = buf0[7]; + btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15], + cos_bit); + btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9], + buf1[14], cos_bit); + btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11], + buf1[12], cos_bit); + buf1[16] = _mm_add_epi32(buf0[16], buf0[17]); + buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]); + buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]); + buf1[19] = _mm_add_epi32(buf0[19], 
buf0[18]); + buf1[20] = _mm_add_epi32(buf0[20], buf0[21]); + buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]); + buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]); + buf1[23] = _mm_add_epi32(buf0[23], buf0[22]); + buf1[24] = _mm_add_epi32(buf0[24], buf0[25]); + buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]); + buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]); + buf1[27] = _mm_add_epi32(buf0[27], buf0[26]); + buf1[28] = _mm_add_epi32(buf0[28], buf0[29]); + buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]); + buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]); + buf1[31] = _mm_add_epi32(buf0[31], buf0[30]); + + // stage 8 + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + buf0[4] = buf1[4]; + buf0[5] = buf1[5]; + buf0[6] = buf1[6]; + buf0[7] = buf1[7]; + buf0[8] = buf1[8]; + buf0[9] = buf1[9]; + buf0[10] = buf1[10]; + buf0[11] = buf1[11]; + buf0[12] = buf1[12]; + buf0[13] = buf1[13]; + buf0[14] = buf1[14]; + buf0[15] = buf1[15]; + btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16], + buf0[31], cos_bit); + btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17], + buf0[30], cos_bit); + btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19], + buf0[28], cos_bit); + btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23], + buf0[24], cos_bit); + + // stage 9 + output[0] = buf0[0]; + output[1] = buf0[16]; + output[2] = buf0[8]; + output[3] = buf0[24]; + output[4] = buf0[4]; + output[5] = buf0[20]; + output[6] = buf0[12]; + output[7] = buf0[28]; + output[8] = buf0[2]; + output[9] = buf0[18]; + 
output[10] = buf0[10]; + output[11] = buf0[26]; + output[12] = buf0[6]; + output[13] = buf0[22]; + output[14] = buf0[14]; + output[15] = buf0[30]; + output[16] = buf0[1]; + output[17] = buf0[17]; + output[18] = buf0[9]; + output[19] = buf0[25]; + output[20] = buf0[5]; + output[21] = buf0[21]; + output[22] = buf0[13]; + output[23] = buf0[29]; + output[24] = buf0[3]; + output[25] = buf0[19]; + output[26] = buf0[11]; + output[27] = buf0[27]; + output[28] = buf0[7]; + output[29] = buf0[23]; + output[30] = buf0[15]; + output[31] = buf0[31]; +} + +/* av1_fadst4_new_sse4_1: 4-input forward transform over __m128i vectors of 32-bit lanes (presumably the 4-point forward ADST, per the name - confirm). Reads input[0..3] and writes output[0..3]. stage_range is accepted but unused. Stage 1 rotates the inputs to {in3, in0, in1, in2}; stages 2 and 4 call btf_32_sse4_1_type0 (defined outside this chunk) with cospi[8]/cospi[56], cospi[40]/cospi[24] and cospi[32]/cospi[32]; stage 3 is two add/sub pairs; stage 5 reorders the terms and negates two of them by subtracting from a zero vector. */ +void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range) { + const int txfm_size = 4; + const int num_per_128 = 4; + const int32_t *cospi; + __m128i buf0[4]; + __m128i buf1[4]; + int col_num = txfm_size / num_per_128; /* 4 / 4 == 1, so the column loop below runs once with col == 0 */ + int col; + (void)stage_range; + for (col = 0; col < col_num; col++) { + // stage 0; + int32_t stage_idx = 0; /* only ever incremented below; its value is never read */ + int j; + for (j = 0; j < 4; ++j) { + buf0[j] = input[j * col_num + col]; + } + + // stage 1 + stage_idx++; + buf1[0] = buf0[3]; + buf1[1] = buf0[0]; + buf1[2] = buf0[1]; + buf1[3] = buf0[2]; + + // stage 2 + stage_idx++; + + cospi = cospi_arr(cos_bit); + btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1], + cos_bit); + btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2], + buf0[3], cos_bit); + + // stage 3 + stage_idx++; + buf1[0] = _mm_add_epi32(buf0[0], buf0[2]); + buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]); + buf1[1] = _mm_add_epi32(buf0[1], buf0[3]); + buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]); + + // stage 4 + stage_idx++; + + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], + buf0[3], cos_bit); + + // stage 5 + stage_idx++; + buf1[0] = buf0[0]; + buf1[1] = _mm_sub_epi32(_mm_setzero_si128(), buf0[2]); + buf1[2] = buf0[3]; + buf1[3] = _mm_sub_epi32(_mm_setzero_si128(), buf0[1]); + + for (j = 0; j < 4; ++j) { + 
output[j * col_num + col] = buf1[j]; + } + } +} + +void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32 = _mm_set1_epi32(-cospi[32]); + __m128i cospi_p32 = _mm_set1_epi32(cospi[32]); + __m128i cospi_m16 = _mm_set1_epi32(-cospi[16]); + __m128i cospi_p48 = _mm_set1_epi32(cospi[48]); + __m128i cospi_m48 = _mm_set1_epi32(-cospi[48]); + __m128i cospi_p16 = _mm_set1_epi32(cospi[16]); + __m128i cospi_m08 = _mm_set1_epi32(-cospi[8]); + __m128i cospi_p56 = _mm_set1_epi32(cospi[56]); + __m128i cospi_m56 = _mm_set1_epi32(-cospi[56]); + __m128i cospi_m40 = _mm_set1_epi32(-cospi[40]); + __m128i cospi_p24 = _mm_set1_epi32(cospi[24]); + __m128i cospi_m24 = _mm_set1_epi32(-cospi[24]); + __m128i cospi_p08 = _mm_set1_epi32(cospi[8]); + __m128i cospi_p40 = _mm_set1_epi32(cospi[40]); + __m128i cospi_p60 = _mm_set1_epi32(cospi[60]); + __m128i cospi_p04 = _mm_set1_epi32(cospi[4]); + __m128i cospi_p28 = _mm_set1_epi32(cospi[28]); + __m128i cospi_p36 = _mm_set1_epi32(cospi[36]); + __m128i cospi_p44 = _mm_set1_epi32(cospi[44]); + __m128i cospi_p20 = _mm_set1_epi32(cospi[20]); + __m128i cospi_p12 = _mm_set1_epi32(cospi[12]); + __m128i cospi_p52 = _mm_set1_epi32(cospi[52]); + __m128i cospi_m04 = _mm_set1_epi32(-cospi[4]); + __m128i cospi_m60 = _mm_set1_epi32(-cospi[60]); + __m128i cospi_m36 = _mm_set1_epi32(-cospi[36]); + __m128i cospi_m28 = _mm_set1_epi32(-cospi[28]); + __m128i cospi_m20 = _mm_set1_epi32(-cospi[20]); + __m128i cospi_m44 = _mm_set1_epi32(-cospi[44]); + __m128i cospi_m52 = _mm_set1_epi32(-cospi[52]); + __m128i cospi_m12 = _mm_set1_epi32(-cospi[12]); + __m128i cospi_p62 = _mm_set1_epi32(cospi[62]); + __m128i cospi_p02 = _mm_set1_epi32(cospi[2]); + __m128i cospi_p30 = _mm_set1_epi32(cospi[30]); + __m128i cospi_p34 = _mm_set1_epi32(cospi[34]); + __m128i cospi_p46 = _mm_set1_epi32(cospi[46]); + __m128i cospi_p18 = 
_mm_set1_epi32(cospi[18]); + __m128i cospi_p14 = _mm_set1_epi32(cospi[14]); + __m128i cospi_p50 = _mm_set1_epi32(cospi[50]); + __m128i cospi_p54 = _mm_set1_epi32(cospi[54]); + __m128i cospi_p10 = _mm_set1_epi32(cospi[10]); + __m128i cospi_p22 = _mm_set1_epi32(cospi[22]); + __m128i cospi_p42 = _mm_set1_epi32(cospi[42]); + __m128i cospi_p38 = _mm_set1_epi32(cospi[38]); + __m128i cospi_p26 = _mm_set1_epi32(cospi[26]); + __m128i cospi_p06 = _mm_set1_epi32(cospi[6]); + __m128i cospi_p58 = _mm_set1_epi32(cospi[58]); + __m128i cospi_p63 = _mm_set1_epi32(cospi[63]); + __m128i cospi_p01 = _mm_set1_epi32(cospi[1]); + __m128i cospi_p31 = _mm_set1_epi32(cospi[31]); + __m128i cospi_p33 = _mm_set1_epi32(cospi[33]); + __m128i cospi_p47 = _mm_set1_epi32(cospi[47]); + __m128i cospi_p17 = _mm_set1_epi32(cospi[17]); + __m128i cospi_p15 = _mm_set1_epi32(cospi[15]); + __m128i cospi_p49 = _mm_set1_epi32(cospi[49]); + __m128i cospi_p55 = _mm_set1_epi32(cospi[55]); + __m128i cospi_p09 = _mm_set1_epi32(cospi[9]); + __m128i cospi_p23 = _mm_set1_epi32(cospi[23]); + __m128i cospi_p41 = _mm_set1_epi32(cospi[41]); + __m128i cospi_p39 = _mm_set1_epi32(cospi[39]); + __m128i cospi_p25 = _mm_set1_epi32(cospi[25]); + __m128i cospi_p07 = _mm_set1_epi32(cospi[7]); + __m128i cospi_p57 = _mm_set1_epi32(cospi[57]); + __m128i cospi_p59 = _mm_set1_epi32(cospi[59]); + __m128i cospi_p05 = _mm_set1_epi32(cospi[5]); + __m128i cospi_p27 = _mm_set1_epi32(cospi[27]); + __m128i cospi_p37 = _mm_set1_epi32(cospi[37]); + __m128i cospi_p43 = _mm_set1_epi32(cospi[43]); + __m128i cospi_p21 = _mm_set1_epi32(cospi[21]); + __m128i cospi_p11 = _mm_set1_epi32(cospi[11]); + __m128i cospi_p53 = _mm_set1_epi32(cospi[53]); + __m128i cospi_p51 = _mm_set1_epi32(cospi[51]); + __m128i cospi_p13 = _mm_set1_epi32(cospi[13]); + __m128i cospi_p19 = _mm_set1_epi32(cospi[19]); + __m128i cospi_p45 = _mm_set1_epi32(cospi[45]); + __m128i cospi_p35 = _mm_set1_epi32(cospi[35]); + __m128i cospi_p29 = _mm_set1_epi32(cospi[29]); + __m128i 
cospi_p03 = _mm_set1_epi32(cospi[3]); + __m128i cospi_p61 = _mm_set1_epi32(cospi[61]); + + // stage 1 + __m128i x1[64]; + x1[0] = _mm_add_epi32(input[0], input[63]); + x1[63] = _mm_sub_epi32(input[0], input[63]); + x1[1] = _mm_add_epi32(input[1], input[62]); + x1[62] = _mm_sub_epi32(input[1], input[62]); + x1[2] = _mm_add_epi32(input[2], input[61]); + x1[61] = _mm_sub_epi32(input[2], input[61]); + x1[3] = _mm_add_epi32(input[3], input[60]); + x1[60] = _mm_sub_epi32(input[3], input[60]); + x1[4] = _mm_add_epi32(input[4], input[59]); + x1[59] = _mm_sub_epi32(input[4], input[59]); + x1[5] = _mm_add_epi32(input[5], input[58]); + x1[58] = _mm_sub_epi32(input[5], input[58]); + x1[6] = _mm_add_epi32(input[6], input[57]); + x1[57] = _mm_sub_epi32(input[6], input[57]); + x1[7] = _mm_add_epi32(input[7], input[56]); + x1[56] = _mm_sub_epi32(input[7], input[56]); + x1[8] = _mm_add_epi32(input[8], input[55]); + x1[55] = _mm_sub_epi32(input[8], input[55]); + x1[9] = _mm_add_epi32(input[9], input[54]); + x1[54] = _mm_sub_epi32(input[9], input[54]); + x1[10] = _mm_add_epi32(input[10], input[53]); + x1[53] = _mm_sub_epi32(input[10], input[53]); + x1[11] = _mm_add_epi32(input[11], input[52]); + x1[52] = _mm_sub_epi32(input[11], input[52]); + x1[12] = _mm_add_epi32(input[12], input[51]); + x1[51] = _mm_sub_epi32(input[12], input[51]); + x1[13] = _mm_add_epi32(input[13], input[50]); + x1[50] = _mm_sub_epi32(input[13], input[50]); + x1[14] = _mm_add_epi32(input[14], input[49]); + x1[49] = _mm_sub_epi32(input[14], input[49]); + x1[15] = _mm_add_epi32(input[15], input[48]); + x1[48] = _mm_sub_epi32(input[15], input[48]); + x1[16] = _mm_add_epi32(input[16], input[47]); + x1[47] = _mm_sub_epi32(input[16], input[47]); + x1[17] = _mm_add_epi32(input[17], input[46]); + x1[46] = _mm_sub_epi32(input[17], input[46]); + x1[18] = _mm_add_epi32(input[18], input[45]); + x1[45] = _mm_sub_epi32(input[18], input[45]); + x1[19] = _mm_add_epi32(input[19], input[44]); + x1[44] = _mm_sub_epi32(input[19], 
input[44]); + x1[20] = _mm_add_epi32(input[20], input[43]); + x1[43] = _mm_sub_epi32(input[20], input[43]); + x1[21] = _mm_add_epi32(input[21], input[42]); + x1[42] = _mm_sub_epi32(input[21], input[42]); + x1[22] = _mm_add_epi32(input[22], input[41]); + x1[41] = _mm_sub_epi32(input[22], input[41]); + x1[23] = _mm_add_epi32(input[23], input[40]); + x1[40] = _mm_sub_epi32(input[23], input[40]); + x1[24] = _mm_add_epi32(input[24], input[39]); + x1[39] = _mm_sub_epi32(input[24], input[39]); + x1[25] = _mm_add_epi32(input[25], input[38]); + x1[38] = _mm_sub_epi32(input[25], input[38]); + x1[26] = _mm_add_epi32(input[26], input[37]); + x1[37] = _mm_sub_epi32(input[26], input[37]); + x1[27] = _mm_add_epi32(input[27], input[36]); + x1[36] = _mm_sub_epi32(input[27], input[36]); + x1[28] = _mm_add_epi32(input[28], input[35]); + x1[35] = _mm_sub_epi32(input[28], input[35]); + x1[29] = _mm_add_epi32(input[29], input[34]); + x1[34] = _mm_sub_epi32(input[29], input[34]); + x1[30] = _mm_add_epi32(input[30], input[33]); + x1[33] = _mm_sub_epi32(input[30], input[33]); + x1[31] = _mm_add_epi32(input[31], input[32]); + x1[32] = _mm_sub_epi32(input[31], input[32]); + + // stage 2 + __m128i x2[64]; + x2[0] = _mm_add_epi32(x1[0], x1[31]); + x2[31] = _mm_sub_epi32(x1[0], x1[31]); + x2[1] = _mm_add_epi32(x1[1], x1[30]); + x2[30] = _mm_sub_epi32(x1[1], x1[30]); + x2[2] = _mm_add_epi32(x1[2], x1[29]); + x2[29] = _mm_sub_epi32(x1[2], x1[29]); + x2[3] = _mm_add_epi32(x1[3], x1[28]); + x2[28] = _mm_sub_epi32(x1[3], x1[28]); + x2[4] = _mm_add_epi32(x1[4], x1[27]); + x2[27] = _mm_sub_epi32(x1[4], x1[27]); + x2[5] = _mm_add_epi32(x1[5], x1[26]); + x2[26] = _mm_sub_epi32(x1[5], x1[26]); + x2[6] = _mm_add_epi32(x1[6], x1[25]); + x2[25] = _mm_sub_epi32(x1[6], x1[25]); + x2[7] = _mm_add_epi32(x1[7], x1[24]); + x2[24] = _mm_sub_epi32(x1[7], x1[24]); + x2[8] = _mm_add_epi32(x1[8], x1[23]); + x2[23] = _mm_sub_epi32(x1[8], x1[23]); + x2[9] = _mm_add_epi32(x1[9], x1[22]); + x2[22] = _mm_sub_epi32(x1[9], 
x1[22]); + x2[10] = _mm_add_epi32(x1[10], x1[21]); + x2[21] = _mm_sub_epi32(x1[10], x1[21]); + x2[11] = _mm_add_epi32(x1[11], x1[20]); + x2[20] = _mm_sub_epi32(x1[11], x1[20]); + x2[12] = _mm_add_epi32(x1[12], x1[19]); + x2[19] = _mm_sub_epi32(x1[12], x1[19]); + x2[13] = _mm_add_epi32(x1[13], x1[18]); + x2[18] = _mm_sub_epi32(x1[13], x1[18]); + x2[14] = _mm_add_epi32(x1[14], x1[17]); + x2[17] = _mm_sub_epi32(x1[14], x1[17]); + x2[15] = _mm_add_epi32(x1[15], x1[16]); + x2[16] = _mm_sub_epi32(x1[15], x1[16]); + x2[32] = x1[32]; + x2[33] = x1[33]; + x2[34] = x1[34]; + x2[35] = x1[35]; + x2[36] = x1[36]; + x2[37] = x1[37]; + x2[38] = x1[38]; + x2[39] = x1[39]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[40], x1[55], x2[40], x2[55], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[41], x1[54], x2[41], x2[54], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[42], x1[53], x2[42], x2[53], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[43], x1[52], x2[43], x2[52], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[44], x1[51], x2[44], x2[51], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[45], x1[50], x2[45], x2[50], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[46], x1[49], x2[46], x2[49], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[47], x1[48], x2[47], x2[48], + __rounding, cos_bit); + x2[56] = x1[56]; + x2[57] = x1[57]; + x2[58] = x1[58]; + x2[59] = x1[59]; + x2[60] = x1[60]; + x2[61] = x1[61]; + x2[62] = x1[62]; + x2[63] = x1[63]; + + // stage 3 + __m128i x3[64]; + x3[0] = _mm_add_epi32(x2[0], x2[15]); + x3[15] = _mm_sub_epi32(x2[0], x2[15]); + x3[1] = _mm_add_epi32(x2[1], x2[14]); + x3[14] = _mm_sub_epi32(x2[1], x2[14]); + x3[2] = _mm_add_epi32(x2[2], x2[13]); + x3[13] = _mm_sub_epi32(x2[2], x2[13]); + x3[3] = _mm_add_epi32(x2[3], x2[12]); + x3[12] = _mm_sub_epi32(x2[3], 
x2[12]); + x3[4] = _mm_add_epi32(x2[4], x2[11]); + x3[11] = _mm_sub_epi32(x2[4], x2[11]); + x3[5] = _mm_add_epi32(x2[5], x2[10]); + x3[10] = _mm_sub_epi32(x2[5], x2[10]); + x3[6] = _mm_add_epi32(x2[6], x2[9]); + x3[9] = _mm_sub_epi32(x2[6], x2[9]); + x3[7] = _mm_add_epi32(x2[7], x2[8]); + x3[8] = _mm_sub_epi32(x2[7], x2[8]); + x3[16] = x2[16]; + x3[17] = x2[17]; + x3[18] = x2[18]; + x3[19] = x2[19]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[20], x2[27], x3[20], x3[27], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[21], x2[26], x3[21], x3[26], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[22], x2[25], x3[22], x3[25], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[23], x2[24], x3[23], x3[24], + __rounding, cos_bit); + x3[28] = x2[28]; + x3[29] = x2[29]; + x3[30] = x2[30]; + x3[31] = x2[31]; + x3[32] = _mm_add_epi32(x2[32], x2[47]); + x3[47] = _mm_sub_epi32(x2[32], x2[47]); + x3[33] = _mm_add_epi32(x2[33], x2[46]); + x3[46] = _mm_sub_epi32(x2[33], x2[46]); + x3[34] = _mm_add_epi32(x2[34], x2[45]); + x3[45] = _mm_sub_epi32(x2[34], x2[45]); + x3[35] = _mm_add_epi32(x2[35], x2[44]); + x3[44] = _mm_sub_epi32(x2[35], x2[44]); + x3[36] = _mm_add_epi32(x2[36], x2[43]); + x3[43] = _mm_sub_epi32(x2[36], x2[43]); + x3[37] = _mm_add_epi32(x2[37], x2[42]); + x3[42] = _mm_sub_epi32(x2[37], x2[42]); + x3[38] = _mm_add_epi32(x2[38], x2[41]); + x3[41] = _mm_sub_epi32(x2[38], x2[41]); + x3[39] = _mm_add_epi32(x2[39], x2[40]); + x3[40] = _mm_sub_epi32(x2[39], x2[40]); + x3[48] = _mm_sub_epi32(x2[63], x2[48]); + x3[63] = _mm_add_epi32(x2[63], x2[48]); + x3[49] = _mm_sub_epi32(x2[62], x2[49]); + x3[62] = _mm_add_epi32(x2[62], x2[49]); + x3[50] = _mm_sub_epi32(x2[61], x2[50]); + x3[61] = _mm_add_epi32(x2[61], x2[50]); + x3[51] = _mm_sub_epi32(x2[60], x2[51]); + x3[60] = _mm_add_epi32(x2[60], x2[51]); + x3[52] = _mm_sub_epi32(x2[59], x2[52]); + x3[59] = _mm_add_epi32(x2[59], x2[52]); + x3[53] = 
_mm_sub_epi32(x2[58], x2[53]); + x3[58] = _mm_add_epi32(x2[58], x2[53]); + x3[54] = _mm_sub_epi32(x2[57], x2[54]); + x3[57] = _mm_add_epi32(x2[57], x2[54]); + x3[55] = _mm_sub_epi32(x2[56], x2[55]); + x3[56] = _mm_add_epi32(x2[56], x2[55]); + + // stage 4 + __m128i x4[64]; + x4[0] = _mm_add_epi32(x3[0], x3[7]); + x4[7] = _mm_sub_epi32(x3[0], x3[7]); + x4[1] = _mm_add_epi32(x3[1], x3[6]); + x4[6] = _mm_sub_epi32(x3[1], x3[6]); + x4[2] = _mm_add_epi32(x3[2], x3[5]); + x4[5] = _mm_sub_epi32(x3[2], x3[5]); + x4[3] = _mm_add_epi32(x3[3], x3[4]); + x4[4] = _mm_sub_epi32(x3[3], x3[4]); + x4[8] = x3[8]; + x4[9] = x3[9]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[10], x3[13], x4[10], x4[13], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[11], x3[12], x4[11], x4[12], + __rounding, cos_bit); + x4[14] = x3[14]; + x4[15] = x3[15]; + x4[16] = _mm_add_epi32(x3[16], x3[23]); + x4[23] = _mm_sub_epi32(x3[16], x3[23]); + x4[17] = _mm_add_epi32(x3[17], x3[22]); + x4[22] = _mm_sub_epi32(x3[17], x3[22]); + x4[18] = _mm_add_epi32(x3[18], x3[21]); + x4[21] = _mm_sub_epi32(x3[18], x3[21]); + x4[19] = _mm_add_epi32(x3[19], x3[20]); + x4[20] = _mm_sub_epi32(x3[19], x3[20]); + x4[24] = _mm_sub_epi32(x3[31], x3[24]); + x4[31] = _mm_add_epi32(x3[31], x3[24]); + x4[25] = _mm_sub_epi32(x3[30], x3[25]); + x4[30] = _mm_add_epi32(x3[30], x3[25]); + x4[26] = _mm_sub_epi32(x3[29], x3[26]); + x4[29] = _mm_add_epi32(x3[29], x3[26]); + x4[27] = _mm_sub_epi32(x3[28], x3[27]); + x4[28] = _mm_add_epi32(x3[28], x3[27]); + x4[32] = x3[32]; + x4[33] = x3[33]; + x4[34] = x3[34]; + x4[35] = x3[35]; + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[36], x3[59], x4[36], x4[59], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[37], x3[58], x4[37], x4[58], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[38], x3[57], x4[38], x4[57], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[39], x3[56], x4[39], 
x4[56], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[40], x3[55], x4[40], x4[55], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[41], x3[54], x4[41], x4[54], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[42], x3[53], x4[42], x4[53], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[43], x3[52], x4[43], x4[52], + __rounding, cos_bit); + x4[44] = x3[44]; + x4[45] = x3[45]; + x4[46] = x3[46]; + x4[47] = x3[47]; + x4[48] = x3[48]; + x4[49] = x3[49]; + x4[50] = x3[50]; + x4[51] = x3[51]; + x4[60] = x3[60]; + x4[61] = x3[61]; + x4[62] = x3[62]; + x4[63] = x3[63]; + + // stage 5 + __m128i x5[64]; + x5[0] = _mm_add_epi32(x4[0], x4[3]); + x5[3] = _mm_sub_epi32(x4[0], x4[3]); + x5[1] = _mm_add_epi32(x4[1], x4[2]); + x5[2] = _mm_sub_epi32(x4[1], x4[2]); + x5[4] = x4[4]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x4[5], x4[6], x5[5], x5[6], + __rounding, cos_bit); + x5[7] = x4[7]; + x5[8] = _mm_add_epi32(x4[8], x4[11]); + x5[11] = _mm_sub_epi32(x4[8], x4[11]); + x5[9] = _mm_add_epi32(x4[9], x4[10]); + x5[10] = _mm_sub_epi32(x4[9], x4[10]); + x5[12] = _mm_sub_epi32(x4[15], x4[12]); + x5[15] = _mm_add_epi32(x4[15], x4[12]); + x5[13] = _mm_sub_epi32(x4[14], x4[13]); + x5[14] = _mm_add_epi32(x4[14], x4[13]); + x5[16] = x4[16]; + x5[17] = x4[17]; + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[18], x4[29], x5[18], x5[29], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[19], x4[28], x5[19], x5[28], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[20], x4[27], x5[20], x5[27], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[21], x4[26], x5[21], x5[26], + __rounding, cos_bit); + x5[22] = x4[22]; + x5[23] = x4[23]; + x5[24] = x4[24]; + x5[25] = x4[25]; + x5[30] = x4[30]; + x5[31] = x4[31]; + x5[32] = _mm_add_epi32(x4[32], x4[39]); + x5[39] = _mm_sub_epi32(x4[32], x4[39]); + x5[33] = 
_mm_add_epi32(x4[33], x4[38]); + x5[38] = _mm_sub_epi32(x4[33], x4[38]); + x5[34] = _mm_add_epi32(x4[34], x4[37]); + x5[37] = _mm_sub_epi32(x4[34], x4[37]); + x5[35] = _mm_add_epi32(x4[35], x4[36]); + x5[36] = _mm_sub_epi32(x4[35], x4[36]); + x5[40] = _mm_sub_epi32(x4[47], x4[40]); + x5[47] = _mm_add_epi32(x4[47], x4[40]); + x5[41] = _mm_sub_epi32(x4[46], x4[41]); + x5[46] = _mm_add_epi32(x4[46], x4[41]); + x5[42] = _mm_sub_epi32(x4[45], x4[42]); + x5[45] = _mm_add_epi32(x4[45], x4[42]); + x5[43] = _mm_sub_epi32(x4[44], x4[43]); + x5[44] = _mm_add_epi32(x4[44], x4[43]); + x5[48] = _mm_add_epi32(x4[48], x4[55]); + x5[55] = _mm_sub_epi32(x4[48], x4[55]); + x5[49] = _mm_add_epi32(x4[49], x4[54]); + x5[54] = _mm_sub_epi32(x4[49], x4[54]); + x5[50] = _mm_add_epi32(x4[50], x4[53]); + x5[53] = _mm_sub_epi32(x4[50], x4[53]); + x5[51] = _mm_add_epi32(x4[51], x4[52]); + x5[52] = _mm_sub_epi32(x4[51], x4[52]); + x5[56] = _mm_sub_epi32(x4[63], x4[56]); + x5[63] = _mm_add_epi32(x4[63], x4[56]); + x5[57] = _mm_sub_epi32(x4[62], x4[57]); + x5[62] = _mm_add_epi32(x4[62], x4[57]); + x5[58] = _mm_sub_epi32(x4[61], x4[58]); + x5[61] = _mm_add_epi32(x4[61], x4[58]); + x5[59] = _mm_sub_epi32(x4[60], x4[59]); + x5[60] = _mm_add_epi32(x4[60], x4[59]); + + // stage 6 + __m128i x6[64]; + btf_32_type0_sse4_1_new(cospi_p32, cospi_p32, x5[0], x5[1], x6[0], x6[1], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p48, cospi_p16, x5[2], x5[3], x6[2], x6[3], + __rounding, cos_bit); + x6[4] = _mm_add_epi32(x5[4], x5[5]); + x6[5] = _mm_sub_epi32(x5[4], x5[5]); + x6[6] = _mm_sub_epi32(x5[7], x5[6]); + x6[7] = _mm_add_epi32(x5[7], x5[6]); + x6[8] = x5[8]; + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x5[9], x5[14], x6[9], x6[14], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x5[10], x5[13], x6[10], x6[13], + __rounding, cos_bit); + x6[11] = x5[11]; + x6[12] = x5[12]; + x6[15] = x5[15]; + x6[16] = _mm_add_epi32(x5[16], x5[19]); + x6[19] = _mm_sub_epi32(x5[16], 
x5[19]); + x6[17] = _mm_add_epi32(x5[17], x5[18]); + x6[18] = _mm_sub_epi32(x5[17], x5[18]); + x6[20] = _mm_sub_epi32(x5[23], x5[20]); + x6[23] = _mm_add_epi32(x5[23], x5[20]); + x6[21] = _mm_sub_epi32(x5[22], x5[21]); + x6[22] = _mm_add_epi32(x5[22], x5[21]); + x6[24] = _mm_add_epi32(x5[24], x5[27]); + x6[27] = _mm_sub_epi32(x5[24], x5[27]); + x6[25] = _mm_add_epi32(x5[25], x5[26]); + x6[26] = _mm_sub_epi32(x5[25], x5[26]); + x6[28] = _mm_sub_epi32(x5[31], x5[28]); + x6[31] = _mm_add_epi32(x5[31], x5[28]); + x6[29] = _mm_sub_epi32(x5[30], x5[29]); + x6[30] = _mm_add_epi32(x5[30], x5[29]); + x6[32] = x5[32]; + x6[33] = x5[33]; + btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[34], x5[61], x6[34], x6[61], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[35], x5[60], x6[35], x6[60], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[36], x5[59], x6[36], x6[59], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[37], x5[58], x6[37], x6[58], + __rounding, cos_bit); + x6[38] = x5[38]; + x6[39] = x5[39]; + x6[40] = x5[40]; + x6[41] = x5[41]; + btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[42], x5[53], x6[42], x6[53], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[43], x5[52], x6[43], x6[52], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[44], x5[51], x6[44], x6[51], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[45], x5[50], x6[45], x6[50], + __rounding, cos_bit); + x6[46] = x5[46]; + x6[47] = x5[47]; + x6[48] = x5[48]; + x6[49] = x5[49]; + x6[54] = x5[54]; + x6[55] = x5[55]; + x6[56] = x5[56]; + x6[57] = x5[57]; + x6[62] = x5[62]; + x6[63] = x5[63]; + + // stage 7 + __m128i x7[64]; + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + btf_32_type1_sse4_1_new(cospi_p56, cospi_p08, x6[4], x6[7], x7[4], x7[7], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p24, cospi_p40, x6[5], x6[6], 
x7[5], x7[6], + __rounding, cos_bit); + x7[8] = _mm_add_epi32(x6[8], x6[9]); + x7[9] = _mm_sub_epi32(x6[8], x6[9]); + x7[10] = _mm_sub_epi32(x6[11], x6[10]); + x7[11] = _mm_add_epi32(x6[11], x6[10]); + x7[12] = _mm_add_epi32(x6[12], x6[13]); + x7[13] = _mm_sub_epi32(x6[12], x6[13]); + x7[14] = _mm_sub_epi32(x6[15], x6[14]); + x7[15] = _mm_add_epi32(x6[15], x6[14]); + x7[16] = x6[16]; + btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x6[17], x6[30], x7[17], x7[30], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x6[18], x6[29], x7[18], x7[29], + __rounding, cos_bit); + x7[19] = x6[19]; + x7[20] = x6[20]; + btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x6[21], x6[26], x7[21], x7[26], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x6[22], x6[25], x7[22], x7[25], + __rounding, cos_bit); + x7[23] = x6[23]; + x7[24] = x6[24]; + x7[27] = x6[27]; + x7[28] = x6[28]; + x7[31] = x6[31]; + x7[32] = _mm_add_epi32(x6[32], x6[35]); + x7[35] = _mm_sub_epi32(x6[32], x6[35]); + x7[33] = _mm_add_epi32(x6[33], x6[34]); + x7[34] = _mm_sub_epi32(x6[33], x6[34]); + x7[36] = _mm_sub_epi32(x6[39], x6[36]); + x7[39] = _mm_add_epi32(x6[39], x6[36]); + x7[37] = _mm_sub_epi32(x6[38], x6[37]); + x7[38] = _mm_add_epi32(x6[38], x6[37]); + x7[40] = _mm_add_epi32(x6[40], x6[43]); + x7[43] = _mm_sub_epi32(x6[40], x6[43]); + x7[41] = _mm_add_epi32(x6[41], x6[42]); + x7[42] = _mm_sub_epi32(x6[41], x6[42]); + x7[44] = _mm_sub_epi32(x6[47], x6[44]); + x7[47] = _mm_add_epi32(x6[47], x6[44]); + x7[45] = _mm_sub_epi32(x6[46], x6[45]); + x7[46] = _mm_add_epi32(x6[46], x6[45]); + x7[48] = _mm_add_epi32(x6[48], x6[51]); + x7[51] = _mm_sub_epi32(x6[48], x6[51]); + x7[49] = _mm_add_epi32(x6[49], x6[50]); + x7[50] = _mm_sub_epi32(x6[49], x6[50]); + x7[52] = _mm_sub_epi32(x6[55], x6[52]); + x7[55] = _mm_add_epi32(x6[55], x6[52]); + x7[53] = _mm_sub_epi32(x6[54], x6[53]); + x7[54] = _mm_add_epi32(x6[54], x6[53]); + x7[56] = _mm_add_epi32(x6[56], x6[59]); + x7[59] = 
_mm_sub_epi32(x6[56], x6[59]); + x7[57] = _mm_add_epi32(x6[57], x6[58]); + x7[58] = _mm_sub_epi32(x6[57], x6[58]); + x7[60] = _mm_sub_epi32(x6[63], x6[60]); + x7[63] = _mm_add_epi32(x6[63], x6[60]); + x7[61] = _mm_sub_epi32(x6[62], x6[61]); + x7[62] = _mm_add_epi32(x6[62], x6[61]); + + // stage 8 + __m128i x8[64]; + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + btf_32_type1_sse4_1_new(cospi_p60, cospi_p04, x7[8], x7[15], x8[8], x8[15], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p28, cospi_p36, x7[9], x7[14], x8[9], x8[14], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p44, cospi_p20, x7[10], x7[13], x8[10], x8[13], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p12, cospi_p52, x7[11], x7[12], x8[11], x8[12], + __rounding, cos_bit); + x8[16] = _mm_add_epi32(x7[16], x7[17]); + x8[17] = _mm_sub_epi32(x7[16], x7[17]); + x8[18] = _mm_sub_epi32(x7[19], x7[18]); + x8[19] = _mm_add_epi32(x7[19], x7[18]); + x8[20] = _mm_add_epi32(x7[20], x7[21]); + x8[21] = _mm_sub_epi32(x7[20], x7[21]); + x8[22] = _mm_sub_epi32(x7[23], x7[22]); + x8[23] = _mm_add_epi32(x7[23], x7[22]); + x8[24] = _mm_add_epi32(x7[24], x7[25]); + x8[25] = _mm_sub_epi32(x7[24], x7[25]); + x8[26] = _mm_sub_epi32(x7[27], x7[26]); + x8[27] = _mm_add_epi32(x7[27], x7[26]); + x8[28] = _mm_add_epi32(x7[28], x7[29]); + x8[29] = _mm_sub_epi32(x7[28], x7[29]); + x8[30] = _mm_sub_epi32(x7[31], x7[30]); + x8[31] = _mm_add_epi32(x7[31], x7[30]); + x8[32] = x7[32]; + btf_32_type0_sse4_1_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61], + __rounding, cos_bit); + x8[35] = x7[35]; + x8[36] = x7[36]; + btf_32_type0_sse4_1_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57], + __rounding, 
cos_bit); + x8[39] = x7[39]; + x8[40] = x7[40]; + btf_32_type0_sse4_1_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53], + __rounding, cos_bit); + x8[43] = x7[43]; + x8[44] = x7[44]; + btf_32_type0_sse4_1_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49], + __rounding, cos_bit); + x8[47] = x7[47]; + x8[48] = x7[48]; + x8[51] = x7[51]; + x8[52] = x7[52]; + x8[55] = x7[55]; + x8[56] = x7[56]; + x8[59] = x7[59]; + x8[60] = x7[60]; + x8[63] = x7[63]; + + // stage 9 + __m128i x9[64]; + x9[0] = x8[0]; + x9[1] = x8[1]; + x9[2] = x8[2]; + x9[3] = x8[3]; + x9[4] = x8[4]; + x9[5] = x8[5]; + x9[6] = x8[6]; + x9[7] = x8[7]; + x9[8] = x8[8]; + x9[9] = x8[9]; + x9[10] = x8[10]; + x9[11] = x8[11]; + x9[12] = x8[12]; + x9[13] = x8[13]; + x9[14] = x8[14]; + x9[15] = x8[15]; + btf_32_type1_sse4_1_new(cospi_p62, cospi_p02, x8[16], x8[31], x9[16], x9[31], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p30, cospi_p34, x8[17], x8[30], x9[17], x9[30], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p46, cospi_p18, x8[18], x8[29], x9[18], x9[29], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p14, cospi_p50, x8[19], x8[28], x9[19], x9[28], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p54, cospi_p10, x8[20], x8[27], x9[20], x9[27], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p22, cospi_p42, x8[21], x8[26], x9[21], x9[26], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p38, cospi_p26, x8[22], x8[25], x9[22], x9[25], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p06, cospi_p58, x8[23], x8[24], x9[23], x9[24], + __rounding, cos_bit); + x9[32] = _mm_add_epi32(x8[32], x8[33]); + x9[33] = _mm_sub_epi32(x8[32], x8[33]); + x9[34] = _mm_sub_epi32(x8[35], x8[34]); + x9[35] = _mm_add_epi32(x8[35], x8[34]); + x9[36] 
= _mm_add_epi32(x8[36], x8[37]); + x9[37] = _mm_sub_epi32(x8[36], x8[37]); + x9[38] = _mm_sub_epi32(x8[39], x8[38]); + x9[39] = _mm_add_epi32(x8[39], x8[38]); + x9[40] = _mm_add_epi32(x8[40], x8[41]); + x9[41] = _mm_sub_epi32(x8[40], x8[41]); + x9[42] = _mm_sub_epi32(x8[43], x8[42]); + x9[43] = _mm_add_epi32(x8[43], x8[42]); + x9[44] = _mm_add_epi32(x8[44], x8[45]); + x9[45] = _mm_sub_epi32(x8[44], x8[45]); + x9[46] = _mm_sub_epi32(x8[47], x8[46]); + x9[47] = _mm_add_epi32(x8[47], x8[46]); + x9[48] = _mm_add_epi32(x8[48], x8[49]); + x9[49] = _mm_sub_epi32(x8[48], x8[49]); + x9[50] = _mm_sub_epi32(x8[51], x8[50]); + x9[51] = _mm_add_epi32(x8[51], x8[50]); + x9[52] = _mm_add_epi32(x8[52], x8[53]); + x9[53] = _mm_sub_epi32(x8[52], x8[53]); + x9[54] = _mm_sub_epi32(x8[55], x8[54]); + x9[55] = _mm_add_epi32(x8[55], x8[54]); + x9[56] = _mm_add_epi32(x8[56], x8[57]); + x9[57] = _mm_sub_epi32(x8[56], x8[57]); + x9[58] = _mm_sub_epi32(x8[59], x8[58]); + x9[59] = _mm_add_epi32(x8[59], x8[58]); + x9[60] = _mm_add_epi32(x8[60], x8[61]); + x9[61] = _mm_sub_epi32(x8[60], x8[61]); + x9[62] = _mm_sub_epi32(x8[63], x8[62]); + x9[63] = _mm_add_epi32(x8[63], x8[62]); + + // stage 10 + __m128i x10[64]; + x10[0] = x9[0]; + x10[1] = x9[1]; + x10[2] = x9[2]; + x10[3] = x9[3]; + x10[4] = x9[4]; + x10[5] = x9[5]; + x10[6] = x9[6]; + x10[7] = x9[7]; + x10[8] = x9[8]; + x10[9] = x9[9]; + x10[10] = x9[10]; + x10[11] = x9[11]; + x10[12] = x9[12]; + x10[13] = x9[13]; + x10[14] = x9[14]; + x10[15] = x9[15]; + x10[16] = x9[16]; + x10[17] = x9[17]; + x10[18] = x9[18]; + x10[19] = x9[19]; + x10[20] = x9[20]; + x10[21] = x9[21]; + x10[22] = x9[22]; + x10[23] = x9[23]; + x10[24] = x9[24]; + x10[25] = x9[25]; + x10[26] = x9[26]; + x10[27] = x9[27]; + x10[28] = x9[28]; + x10[29] = x9[29]; + x10[30] = x9[30]; + x10[31] = x9[31]; + btf_32_type1_sse4_1_new(cospi_p63, cospi_p01, x9[32], x9[63], x10[32], + x10[63], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p31, cospi_p33, x9[33], x9[62], 
x10[33], + x10[62], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p47, cospi_p17, x9[34], x9[61], x10[34], + x10[61], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p15, cospi_p49, x9[35], x9[60], x10[35], + x10[60], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p55, cospi_p09, x9[36], x9[59], x10[36], + x10[59], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p23, cospi_p41, x9[37], x9[58], x10[37], + x10[58], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p39, cospi_p25, x9[38], x9[57], x10[38], + x10[57], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p07, cospi_p57, x9[39], x9[56], x10[39], + x10[56], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p59, cospi_p05, x9[40], x9[55], x10[40], + x10[55], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p27, cospi_p37, x9[41], x9[54], x10[41], + x10[54], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p43, cospi_p21, x9[42], x9[53], x10[42], + x10[53], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p11, cospi_p53, x9[43], x9[52], x10[43], + x10[52], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p51, cospi_p13, x9[44], x9[51], x10[44], + x10[51], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p19, cospi_p45, x9[45], x9[50], x10[45], + x10[50], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p35, cospi_p29, x9[46], x9[49], x10[46], + x10[49], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p03, cospi_p61, x9[47], x9[48], x10[47], + x10[48], __rounding, cos_bit); + + // stage 11 + output[0] = x10[0]; + output[1] = x10[32]; + output[2] = x10[16]; + output[3] = x10[48]; + output[4] = x10[8]; + output[5] = x10[40]; + output[6] = x10[24]; + output[7] = x10[56]; + output[8] = x10[4]; + output[9] = x10[36]; + output[10] = x10[20]; + output[11] = x10[52]; + output[12] = x10[12]; + output[13] = x10[44]; + output[14] = x10[28]; + output[15] = x10[60]; + output[16] = x10[2]; + output[17] = x10[34]; + output[18] = x10[18]; + output[19] = 
x10[50]; + output[20] = x10[10]; + output[21] = x10[42]; + output[22] = x10[26]; + output[23] = x10[58]; + output[24] = x10[6]; + output[25] = x10[38]; + output[26] = x10[22]; + output[27] = x10[54]; + output[28] = x10[14]; + output[29] = x10[46]; + output[30] = x10[30]; + output[31] = x10[62]; + output[32] = x10[1]; + output[33] = x10[33]; + output[34] = x10[17]; + output[35] = x10[49]; + output[36] = x10[9]; + output[37] = x10[41]; + output[38] = x10[25]; + output[39] = x10[57]; + output[40] = x10[5]; + output[41] = x10[37]; + output[42] = x10[21]; + output[43] = x10[53]; + output[44] = x10[13]; + output[45] = x10[45]; + output[46] = x10[29]; + output[47] = x10[61]; + output[48] = x10[3]; + output[49] = x10[35]; + output[50] = x10[19]; + output[51] = x10[51]; + output[52] = x10[11]; + output[53] = x10[43]; + output[54] = x10[27]; + output[55] = x10[59]; + output[56] = x10[7]; + output[57] = x10[39]; + output[58] = x10[23]; + output[59] = x10[55]; + output[60] = x10[15]; + output[61] = x10[47]; + output[62] = x10[31]; + output[63] = x10[63]; +} diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c new file mode 100644 index 000000000..abb95f31e --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "config/av1_rtcd.h" + +#include "av1/common/enums.h" +#include "av1/common/av1_txfm.h" +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "av1/encoder/x86/av1_txfm1d_sse4.h" +#include "av1/encoder/x86/av1_fwd_txfm_sse2.h" + +static INLINE void int16_array_with_stride_to_int32_array_without_stride( + const int16_t *input, int stride, int32_t *output, int txfm1d_size) { + int r, c; + for (r = 0; r < txfm1d_size; r++) { + for (c = 0; c < txfm1d_size; c++) { + output[r * txfm1d_size + c] = (int32_t)input[r * stride + c]; + } + } +} + +typedef void (*TxfmFuncSSE2)(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); + +static void fdct32_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range) { + const int txfm_size = 32; + const int num_per_128 = 4; + __m128i buf0[32]; + __m128i buf1[32]; + int col_num = txfm_size / num_per_128; + int col; + (void)stage_range; + for (col = 0; col < col_num; col++) { + int j; + for (j = 0; j < 32; ++j) { + buf0[j] = input[j * col_num + col]; + } + av1_fdct32_new_sse4_1(buf0, buf1, cos_bit); + for (j = 0; j < 32; ++j) { + output[j * col_num + col] = buf1[j]; + } + } +} + +static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { + switch (txfm_type) { + case TXFM_TYPE_DCT32: return fdct32_new_sse4_1; break; + default: assert(0); + } + return NULL; +} + +static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output, + const int stride, + const TXFM_2D_FLIP_CFG *cfg, + int32_t *txfm_buf) { + // TODO(sarahparker) This does not currently support rectangular transforms + // and will break without splitting txfm_size out into row and col size. + // Rectangular transforms use c code only, so it should be ok for now. + // It will be corrected when there are sse implementations for rectangular + // transforms. 
+ assert(cfg->tx_size < TX_SIZES); + const int txfm_size = tx_size_wide[cfg->tx_size]; + const int8_t *shift = cfg->shift; + const int8_t *stage_range_col = cfg->stage_range_col; + const int8_t *stage_range_row = cfg->stage_range_row; + const int8_t cos_bit_col = cfg->cos_bit_col; + const int8_t cos_bit_row = cfg->cos_bit_row; + const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); + const TxfmFuncSSE2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row); + + __m128i *buf_128 = (__m128i *)txfm_buf; + __m128i *out_128 = (__m128i *)output; + int num_per_128 = 4; + int txfm2d_size_128 = txfm_size * txfm_size / num_per_128; + + int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf, + txfm_size); + av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]); + txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col); + av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]); + transpose_32(txfm_size, out_128, buf_128); + txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row); + av1_round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]); + transpose_32(txfm_size, buf_128, out_128); +} + +void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg); + (void)bd; + fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf); +} + +static INLINE void transpose_32_4x4x2(int stride, const __m128i *inputA, + const __m128i *inputB, __m128i *output) { + __m128i temp0 = _mm_unpacklo_epi32(inputA[0], inputA[2]); + __m128i temp1 = _mm_unpackhi_epi32(inputA[0], inputA[2]); + __m128i temp2 = _mm_unpacklo_epi32(inputA[1], inputA[3]); + __m128i temp3 = _mm_unpackhi_epi32(inputA[1], inputA[3]); + + output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2); + output[1 * stride] = _mm_unpackhi_epi32(temp0, 
temp2); + output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3); + output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3); + + temp0 = _mm_unpacklo_epi32(inputB[0], inputB[2]); + temp1 = _mm_unpackhi_epi32(inputB[0], inputB[2]); + temp2 = _mm_unpacklo_epi32(inputB[1], inputB[3]); + temp3 = _mm_unpackhi_epi32(inputB[1], inputB[3]); + + output[4 * stride] = _mm_unpacklo_epi32(temp0, temp2); + output[5 * stride] = _mm_unpackhi_epi32(temp0, temp2); + output[6 * stride] = _mm_unpacklo_epi32(temp1, temp3); + output[7 * stride] = _mm_unpackhi_epi32(temp1, temp3); +} + +static void lowbd_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X64; + __m128i buf0[64], buf1[512]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = fdct8x64_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(4, height_div8); ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i bufA[64]; + __m128i bufB[64]; + __m128i *buf = buf1 + width * i; + for (int j = 0; j < width; ++j) { + bufA[j] = _mm_cvtepi16_epi32(buf[j]); + bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); + } + av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row); + 
av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row); + av1_round_shift_array_32_sse4_1(bufA, bufA, 32, -shift[2]); + av1_round_shift_array_32_sse4_1(bufB, bufB, 32, -shift[2]); + + int32_t *output8 = output + 8 * 32 * i; + for (int j = 0; j < width_div8; ++j) { + __m128i *out = (__m128i *)(output8 + 4 * j); + transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out); + } + } +} + +static void lowbd_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_64X32; + __m128i buf0[64], buf1[256]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(4, height_div8); ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + assert(tx_type == DCT_DCT); + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i bufA[64]; + __m128i bufB[64]; + __m128i *buf = buf1 + width * i; + for (int j = 0; j < width; ++j) { + bufA[j] = _mm_cvtepi16_epi32(buf[j]); + bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); + } + av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row); + av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row); + av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2]); + av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2]); + + int32_t *output8 = 
output + 8 * 32 * i; + for (int j = 0; j < width_div8; ++j) { + __m128i *out = (__m128i *)(output8 + 4 * j); + transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out); + } + } +} + +static void lowbd_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_32X64; + __m128i buf0[64], buf1[256]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = fdct8x64_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(4, height_div8); ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i bufA[32]; + __m128i bufB[32]; + __m128i *buf = buf1 + width * i; + for (int j = 0; j < width; ++j) { + bufA[j] = _mm_cvtepi16_epi32(buf[j]); + bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); + } + av1_fdct32_new_sse4_1(bufA, bufA, cos_bit_row); + av1_fdct32_new_sse4_1(bufB, bufB, cos_bit_row); + av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2]); + av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2]); + + int32_t *output8 = output + 8 * 32 * i; + for (int j = 0; j < (32 / 4); ++j) { + __m128i *out = (__m128i *)(output8 + 4 * j); + transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out); + } + } +} + +static 
FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { + av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform + av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform + av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform + av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform + lowbd_fwd_txfm2d_64x64_sse4_1, // 64x64 transform + av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform + av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform + av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform + av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform + av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform + av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform + lowbd_fwd_txfm2d_32x64_sse4_1, // 32x64 transform + lowbd_fwd_txfm2d_64x32_sse4_1, // 64x32 transform + av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform + av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform + av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform + av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform + av1_lowbd_fwd_txfm2d_16x64_sse2, // 16x64 transform + av1_lowbd_fwd_txfm2d_64x16_sse2, // 64x16 transform +}; + +void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size]; + if ((fwd_txfm2d_func == NULL) || + (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) { + av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); + } else { + fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); + } +} diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c new file mode 100644 index 000000000..6aae7ce1e --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c @@ -0,0 +1,2889 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "av1/encoder/x86/av1_fwd_txfm_sse2.h" + +// TODO(linfengz): refine fdct4x8 and fadst4x8 optimization (if possible). + +static void fdct4x4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + __m128i u[4], v[4]; + + u[0] = _mm_unpacklo_epi16(input[0], input[1]); + u[1] = _mm_unpacklo_epi16(input[3], input[2]); + + v[0] = _mm_add_epi16(u[0], u[1]); + v[1] = _mm_sub_epi16(u[0], u[1]); + + u[0] = _mm_madd_epi16(v[0], cospi_p32_p32); // 0 + u[1] = _mm_madd_epi16(v[0], cospi_p32_m32); // 2 + u[2] = _mm_madd_epi16(v[1], cospi_p16_p48); // 1 + u[3] = _mm_madd_epi16(v[1], cospi_p48_m16); // 3 + + v[0] = _mm_add_epi32(u[0], __rounding); + v[1] = _mm_add_epi32(u[1], __rounding); + v[2] = _mm_add_epi32(u[2], __rounding); + v[3] = _mm_add_epi32(u[3], __rounding); + u[0] = _mm_srai_epi32(v[0], cos_bit); + u[1] = _mm_srai_epi32(v[1], cos_bit); + u[2] = _mm_srai_epi32(v[2], cos_bit); + u[3] = _mm_srai_epi32(v[3], cos_bit); + + output[0] = _mm_packs_epi32(u[0], u[1]); + output[1] = _mm_packs_epi32(u[2], u[3]); + output[2] = _mm_srli_si128(output[0], 8); + output[3] = _mm_srli_si128(output[1], 8); +} + +static void fdct8x4_new_sse2(const __m128i *input, 
__m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + + // stage 1 + __m128i x1[4]; + x1[0] = _mm_adds_epi16(input[0], input[3]); + x1[3] = _mm_subs_epi16(input[0], input[3]); + x1[1] = _mm_adds_epi16(input[1], input[2]); + x1[2] = _mm_subs_epi16(input[1], input[2]); + + // stage 2 + __m128i x2[4]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x2[0], x2[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x1[2], x1[3], x2[2], x2[3]); + + // stage 3 + output[0] = x2[0]; + output[1] = x2[2]; + output[2] = x2[1]; + output[3] = x2[3]; +} + +static void fdct4x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + + // stage 1 + __m128i x1[8]; + x1[0] = _mm_adds_epi16(input[0], input[7]); + x1[7] = _mm_subs_epi16(input[0], input[7]); + x1[1] = _mm_adds_epi16(input[1], input[6]); + x1[6] = _mm_subs_epi16(input[1], input[6]); + x1[2] = _mm_adds_epi16(input[2], input[5]); + x1[5] = _mm_subs_epi16(input[2], input[5]); + 
x1[3] = _mm_adds_epi16(input[3], input[4]); + x1[4] = _mm_subs_epi16(input[3], input[4]); + + // stage 2 + __m128i x2[8]; + x2[0] = _mm_adds_epi16(x1[0], x1[3]); + x2[3] = _mm_subs_epi16(x1[0], x1[3]); + x2[1] = _mm_adds_epi16(x1[1], x1[2]); + x2[2] = _mm_subs_epi16(x1[1], x1[2]); + x2[4] = x1[4]; + btf_16_w4_sse2(&cospi_m32_p32, &cospi_p32_p32, __rounding, cos_bit, &x1[5], + &x1[6], &x2[5], &x2[6]); + x2[7] = x1[7]; + + // stage 3 + __m128i x3[8]; + btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x2[0], + &x2[1], &x3[0], &x3[1]); + btf_16_w4_sse2(&cospi_p48_p16, &cospi_m16_p48, __rounding, cos_bit, &x2[2], + &x2[3], &x3[2], &x3[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[5]); + x3[5] = _mm_subs_epi16(x2[4], x2[5]); + x3[6] = _mm_subs_epi16(x2[7], x2[6]); + x3[7] = _mm_adds_epi16(x2[7], x2[6]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_w4_sse2(&cospi_p56_p08, &cospi_m08_p56, __rounding, cos_bit, &x3[4], + &x3[7], &x4[4], &x4[7]); + btf_16_w4_sse2(&cospi_p24_p40, &cospi_m40_p24, __rounding, cos_bit, &x3[5], + &x3[6], &x4[5], &x4[6]); + + // stage 5 + output[0] = x4[0]; + output[1] = x4[4]; + output[2] = x4[2]; + output[3] = x4[6]; + output[4] = x4[1]; + output[5] = x4[5]; + output[6] = x4[3]; + output[7] = x4[7]; +} + +static void fdct8x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = 
pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + + // stage 1 + __m128i x1[8]; + x1[0] = _mm_adds_epi16(input[0], input[7]); + x1[7] = _mm_subs_epi16(input[0], input[7]); + x1[1] = _mm_adds_epi16(input[1], input[6]); + x1[6] = _mm_subs_epi16(input[1], input[6]); + x1[2] = _mm_adds_epi16(input[2], input[5]); + x1[5] = _mm_subs_epi16(input[2], input[5]); + x1[3] = _mm_adds_epi16(input[3], input[4]); + x1[4] = _mm_subs_epi16(input[3], input[4]); + + // stage 2 + __m128i x2[8]; + x2[0] = _mm_adds_epi16(x1[0], x1[3]); + x2[3] = _mm_subs_epi16(x1[0], x1[3]); + x2[1] = _mm_adds_epi16(x1[1], x1[2]); + x2[2] = _mm_subs_epi16(x1[1], x1[2]); + x2[4] = x1[4]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[5], x1[6], x2[5], x2[6]); + x2[7] = x1[7]; + + // stage 3 + __m128i x3[8]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x2[0], x2[1], x3[0], x3[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x2[2], x2[3], x3[2], x3[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[5]); + x3[5] = _mm_subs_epi16(x2[4], x2[5]); + x3[6] = _mm_subs_epi16(x2[7], x2[6]); + x3[7] = _mm_adds_epi16(x2[7], x2[6]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x3[4], x3[7], x4[4], x4[7]); + btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x3[5], x3[6], x4[5], x4[6]); + + // stage 5 + output[0] = x4[0]; + output[1] = x4[4]; + output[2] = x4[2]; + output[3] = x4[6]; + output[4] = x4[1]; + output[5] = x4[5]; + output[6] = x4[3]; + output[7] = x4[7]; +} + +/* fdct8x16_new_sse2: runs six butterfly stages (saturating add/sub pairs plus btf_16_sse2 rotations) over sixteen __m128i vectors, then copies the stage-6 results into output (stage 7). input and output are arrays of 16 __m128i; the _mm_adds_epi16 and _mm_subs_epi16 calls operate on them as packed 16-bit lanes with saturation. cos_bit selects the cosine table through cospi_arr(cos_bit) and sets the rounding term 1 << (cos_bit - 1). Stage 7 writes output[i] from x6[] at the 4-bit bit-reversed index of i (0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15). NOTE(review): going by the name this is presumably a forward 16-point DCT applied to 8 lanes at once -- confirm against the callers. NOTE(review): __rounding is never named explicitly in this body; presumably btf_16_sse2 is a macro that reads __rounding and cos_bit from the enclosing scope -- confirm in the header before renaming or removing it. */ +static void fdct8x16_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 =
pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + + // stage 1 + __m128i x1[16]; + x1[0] = _mm_adds_epi16(input[0], input[15]); + x1[15] = _mm_subs_epi16(input[0], input[15]); + x1[1] = _mm_adds_epi16(input[1], input[14]); + x1[14] = _mm_subs_epi16(input[1], input[14]); + x1[2] = _mm_adds_epi16(input[2], input[13]); + x1[13] = _mm_subs_epi16(input[2], input[13]); + x1[3] = _mm_adds_epi16(input[3], input[12]); + x1[12] = _mm_subs_epi16(input[3], input[12]); + x1[4] = _mm_adds_epi16(input[4], input[11]); + x1[11] = _mm_subs_epi16(input[4], input[11]); + x1[5] = _mm_adds_epi16(input[5], input[10]); + x1[10] = _mm_subs_epi16(input[5], input[10]); + x1[6] = _mm_adds_epi16(input[6], input[9]); + x1[9] = _mm_subs_epi16(input[6], input[9]); + x1[7] = _mm_adds_epi16(input[7], input[8]); + x1[8] = _mm_subs_epi16(input[7], input[8]); + + // stage 2 + __m128i x2[16]; + x2[0] = _mm_adds_epi16(x1[0], x1[7]); + x2[7] = _mm_subs_epi16(x1[0], x1[7]); + x2[1] = _mm_adds_epi16(x1[1], x1[6]); + x2[6] = _mm_subs_epi16(x1[1], x1[6]); + x2[2] = _mm_adds_epi16(x1[2], x1[5]); + x2[5] = _mm_subs_epi16(x1[2], x1[5]);
+ x2[3] = _mm_adds_epi16(x1[3], x1[4]); + x2[4] = _mm_subs_epi16(x1[3], x1[4]); + x2[8] = x1[8]; + x2[9] = x1[9]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[10], x1[13], x2[10], x2[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[11], x1[12], x2[11], x2[12]); + x2[14] = x1[14]; + x2[15] = x1[15]; + + // stage 3 + __m128i x3[16]; + x3[0] = _mm_adds_epi16(x2[0], x2[3]); + x3[3] = _mm_subs_epi16(x2[0], x2[3]); + x3[1] = _mm_adds_epi16(x2[1], x2[2]); + x3[2] = _mm_subs_epi16(x2[1], x2[2]); + x3[4] = x2[4]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[5], x2[6], x3[5], x3[6]); + x3[7] = x2[7]; + x3[8] = _mm_adds_epi16(x2[8], x2[11]); + x3[11] = _mm_subs_epi16(x2[8], x2[11]); + x3[9] = _mm_adds_epi16(x2[9], x2[10]); + x3[10] = _mm_subs_epi16(x2[9], x2[10]); + x3[12] = _mm_subs_epi16(x2[15], x2[12]); + x3[15] = _mm_adds_epi16(x2[15], x2[12]); + x3[13] = _mm_subs_epi16(x2[14], x2[13]); + x3[14] = _mm_adds_epi16(x2[14], x2[13]); + + // stage 4 + __m128i x4[16]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x3[0], x3[1], x4[0], x4[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x3[2], x3[3], x4[2], x4[3]); + x4[4] = _mm_adds_epi16(x3[4], x3[5]); + x4[5] = _mm_subs_epi16(x3[4], x3[5]); + x4[6] = _mm_subs_epi16(x3[7], x3[6]); + x4[7] = _mm_adds_epi16(x3[7], x3[6]); + x4[8] = x3[8]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[9], x3[14], x4[9], x4[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[10], x3[13], x4[10], x4[13]); + x4[11] = x3[11]; + x4[12] = x3[12]; + x4[15] = x3[15]; + + // stage 5 + __m128i x5[16]; + x5[0] = x4[0]; + x5[1] = x4[1]; + x5[2] = x4[2]; + x5[3] = x4[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x4[4], x4[7], x5[4], x5[7]); + btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x4[5], x4[6], x5[5], x5[6]); + x5[8] = _mm_adds_epi16(x4[8], x4[9]); + x5[9] = _mm_subs_epi16(x4[8], x4[9]); + x5[10] = _mm_subs_epi16(x4[11], x4[10]); + x5[11] = _mm_adds_epi16(x4[11], x4[10]); + x5[12] = _mm_adds_epi16(x4[12], x4[13]); + x5[13] = _mm_subs_epi16(x4[12],
x4[13]); + x5[14] = _mm_subs_epi16(x4[15], x4[14]); + x5[15] = _mm_adds_epi16(x4[15], x4[14]); + + // stage 6 + __m128i x6[16]; + x6[0] = x5[0]; + x6[1] = x5[1]; + x6[2] = x5[2]; + x6[3] = x5[3]; + x6[4] = x5[4]; + x6[5] = x5[5]; + x6[6] = x5[6]; + x6[7] = x5[7]; + btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x5[8], x5[15], x6[8], x6[15]); + btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x5[9], x5[14], x6[9], x6[14]); + btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x5[10], x5[13], x6[10], x6[13]); + btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x5[11], x5[12], x6[11], x6[12]); + + // stage 7 + output[0] = x6[0]; + output[1] = x6[8]; + output[2] = x6[4]; + output[3] = x6[12]; + output[4] = x6[2]; + output[5] = x6[10]; + output[6] = x6[6]; + output[7] = x6[14]; + output[8] = x6[1]; + output[9] = x6[9]; + output[10] = x6[5]; + output[11] = x6[13]; + output[12] = x6[3]; + output[13] = x6[11]; + output[14] = x6[7]; + output[15] = x6[15]; +} + +/* fdct8x32_new_sse2: runs eight butterfly stages (saturating add/sub pairs plus btf_16_sse2 rotations) over thirty-two __m128i vectors, then copies the stage-8 results into output (stage 9). input and output are arrays of 32 __m128i; the _mm_adds_epi16 and _mm_subs_epi16 calls operate on them as packed 16-bit lanes with saturation. cos_bit selects the cosine table through cospi_arr(cos_bit) and sets the rounding term 1 << (cos_bit - 1). Stage 9 writes output[i] from x8[] at the 5-bit bit-reversed index of i (0, 16, 8, 24, 4, 20, 12, 28, ...). This function is defined without static, so it has external linkage. NOTE(review): going by the name this is presumably a forward 32-point DCT applied to 8 lanes at once -- confirm against the callers. NOTE(review): __rounding is never named explicitly in this body; presumably btf_16_sse2 is a macro that reads __rounding and cos_bit from the enclosing scope -- confirm in the header before renaming or removing it. */ +void fdct8x32_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); + __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + __m128i cospi_m04_p60 =
pair_set_epi16(-cospi[4], cospi[60]); + __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]); + __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]); + __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]); + __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]); + __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]); + __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]); + __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]); + __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]); + __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]); + __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]); + __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]); + __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]); + __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]); + __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]); + __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]); + __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]); + + // stage 1 + __m128i x1[32]; + x1[0] = _mm_adds_epi16(input[0], input[31]); + x1[31] = _mm_subs_epi16(input[0], input[31]); + x1[1] = _mm_adds_epi16(input[1], input[30]); + x1[30] = _mm_subs_epi16(input[1], input[30]); + x1[2] = _mm_adds_epi16(input[2], input[29]); + x1[29] = _mm_subs_epi16(input[2], input[29]); + x1[3] = _mm_adds_epi16(input[3], input[28]); + x1[28] = _mm_subs_epi16(input[3], input[28]); + x1[4] = _mm_adds_epi16(input[4], input[27]); + x1[27] = _mm_subs_epi16(input[4], input[27]); + x1[5] = _mm_adds_epi16(input[5],
input[26]); + x1[26] = _mm_subs_epi16(input[5], input[26]); + x1[6] = _mm_adds_epi16(input[6], input[25]); + x1[25] = _mm_subs_epi16(input[6], input[25]); + x1[7] = _mm_adds_epi16(input[7], input[24]); + x1[24] = _mm_subs_epi16(input[7], input[24]); + x1[8] = _mm_adds_epi16(input[8], input[23]); + x1[23] = _mm_subs_epi16(input[8], input[23]); + x1[9] = _mm_adds_epi16(input[9], input[22]); + x1[22] = _mm_subs_epi16(input[9], input[22]); + x1[10] = _mm_adds_epi16(input[10], input[21]); + x1[21] = _mm_subs_epi16(input[10], input[21]); + x1[11] = _mm_adds_epi16(input[11], input[20]); + x1[20] = _mm_subs_epi16(input[11], input[20]); + x1[12] = _mm_adds_epi16(input[12], input[19]); + x1[19] = _mm_subs_epi16(input[12], input[19]); + x1[13] = _mm_adds_epi16(input[13], input[18]); + x1[18] = _mm_subs_epi16(input[13], input[18]); + x1[14] = _mm_adds_epi16(input[14], input[17]); + x1[17] = _mm_subs_epi16(input[14], input[17]); + x1[15] = _mm_adds_epi16(input[15], input[16]); + x1[16] = _mm_subs_epi16(input[15], input[16]); + + // stage 2 + __m128i x2[32]; + x2[0] = _mm_adds_epi16(x1[0], x1[15]); + x2[15] = _mm_subs_epi16(x1[0], x1[15]); + x2[1] = _mm_adds_epi16(x1[1], x1[14]); + x2[14] = _mm_subs_epi16(x1[1], x1[14]); + x2[2] = _mm_adds_epi16(x1[2], x1[13]); + x2[13] = _mm_subs_epi16(x1[2], x1[13]); + x2[3] = _mm_adds_epi16(x1[3], x1[12]); + x2[12] = _mm_subs_epi16(x1[3], x1[12]); + x2[4] = _mm_adds_epi16(x1[4], x1[11]); + x2[11] = _mm_subs_epi16(x1[4], x1[11]); + x2[5] = _mm_adds_epi16(x1[5], x1[10]); + x2[10] = _mm_subs_epi16(x1[5], x1[10]); + x2[6] = _mm_adds_epi16(x1[6], x1[9]); + x2[9] = _mm_subs_epi16(x1[6], x1[9]); + x2[7] = _mm_adds_epi16(x1[7], x1[8]); + x2[8] = _mm_subs_epi16(x1[7], x1[8]); + x2[16] = x1[16]; + x2[17] = x1[17]; + x2[18] = x1[18]; + x2[19] = x1[19]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[20], x1[27], x2[20], x2[27]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[21], x1[26], x2[21], x2[26]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32,
x1[22], x1[25], x2[22], x2[25]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[23], x1[24], x2[23], x2[24]); + x2[28] = x1[28]; + x2[29] = x1[29]; + x2[30] = x1[30]; + x2[31] = x1[31]; + + // stage 3 + __m128i x3[32]; + x3[0] = _mm_adds_epi16(x2[0], x2[7]); + x3[7] = _mm_subs_epi16(x2[0], x2[7]); + x3[1] = _mm_adds_epi16(x2[1], x2[6]); + x3[6] = _mm_subs_epi16(x2[1], x2[6]); + x3[2] = _mm_adds_epi16(x2[2], x2[5]); + x3[5] = _mm_subs_epi16(x2[2], x2[5]); + x3[3] = _mm_adds_epi16(x2[3], x2[4]); + x3[4] = _mm_subs_epi16(x2[3], x2[4]); + x3[8] = x2[8]; + x3[9] = x2[9]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[10], x2[13], x3[10], x3[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[11], x2[12], x3[11], x3[12]); + x3[14] = x2[14]; + x3[15] = x2[15]; + x3[16] = _mm_adds_epi16(x2[16], x2[23]); + x3[23] = _mm_subs_epi16(x2[16], x2[23]); + x3[17] = _mm_adds_epi16(x2[17], x2[22]); + x3[22] = _mm_subs_epi16(x2[17], x2[22]); + x3[18] = _mm_adds_epi16(x2[18], x2[21]); + x3[21] = _mm_subs_epi16(x2[18], x2[21]); + x3[19] = _mm_adds_epi16(x2[19], x2[20]); + x3[20] = _mm_subs_epi16(x2[19], x2[20]); + x3[24] = _mm_subs_epi16(x2[31], x2[24]); + x3[31] = _mm_adds_epi16(x2[31], x2[24]); + x3[25] = _mm_subs_epi16(x2[30], x2[25]); + x3[30] = _mm_adds_epi16(x2[30], x2[25]); + x3[26] = _mm_subs_epi16(x2[29], x2[26]); + x3[29] = _mm_adds_epi16(x2[29], x2[26]); + x3[27] = _mm_subs_epi16(x2[28], x2[27]); + x3[28] = _mm_adds_epi16(x2[28], x2[27]); + + // stage 4 + __m128i x4[32]; + x4[0] = _mm_adds_epi16(x3[0], x3[3]); + x4[3] = _mm_subs_epi16(x3[0], x3[3]); + x4[1] = _mm_adds_epi16(x3[1], x3[2]); + x4[2] = _mm_subs_epi16(x3[1], x3[2]); + x4[4] = x3[4]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[5], x3[6], x4[5], x4[6]); + x4[7] = x3[7]; + x4[8] = _mm_adds_epi16(x3[8], x3[11]); + x4[11] = _mm_subs_epi16(x3[8], x3[11]); + x4[9] = _mm_adds_epi16(x3[9], x3[10]); + x4[10] = _mm_subs_epi16(x3[9], x3[10]); + x4[12] = _mm_subs_epi16(x3[15], x3[12]); + x4[15] = _mm_adds_epi16(x3[15],
x3[12]); + x4[13] = _mm_subs_epi16(x3[14], x3[13]); + x4[14] = _mm_adds_epi16(x3[14], x3[13]); + x4[16] = x3[16]; + x4[17] = x3[17]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[18], x3[29], x4[18], x4[29]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[19], x3[28], x4[19], x4[28]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[20], x3[27], x4[20], x4[27]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[21], x3[26], x4[21], x4[26]); + x4[22] = x3[22]; + x4[23] = x3[23]; + x4[24] = x3[24]; + x4[25] = x3[25]; + x4[30] = x3[30]; + x4[31] = x3[31]; + + // stage 5 + __m128i x5[32]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x4[0], x4[1], x5[0], x5[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x4[2], x4[3], x5[2], x5[3]); + x5[4] = _mm_adds_epi16(x4[4], x4[5]); + x5[5] = _mm_subs_epi16(x4[4], x4[5]); + x5[6] = _mm_subs_epi16(x4[7], x4[6]); + x5[7] = _mm_adds_epi16(x4[7], x4[6]); + x5[8] = x4[8]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[9], x4[14], x5[9], x5[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[10], x4[13], x5[10], x5[13]); + x5[11] = x4[11]; + x5[12] = x4[12]; + x5[15] = x4[15]; + x5[16] = _mm_adds_epi16(x4[16], x4[19]); + x5[19] = _mm_subs_epi16(x4[16], x4[19]); + x5[17] = _mm_adds_epi16(x4[17], x4[18]); + x5[18] = _mm_subs_epi16(x4[17], x4[18]); + x5[20] = _mm_subs_epi16(x4[23], x4[20]); + x5[23] = _mm_adds_epi16(x4[23], x4[20]); + x5[21] = _mm_subs_epi16(x4[22], x4[21]); + x5[22] = _mm_adds_epi16(x4[22], x4[21]); + x5[24] = _mm_adds_epi16(x4[24], x4[27]); + x5[27] = _mm_subs_epi16(x4[24], x4[27]); + x5[25] = _mm_adds_epi16(x4[25], x4[26]); + x5[26] = _mm_subs_epi16(x4[25], x4[26]); + x5[28] = _mm_subs_epi16(x4[31], x4[28]); + x5[31] = _mm_adds_epi16(x4[31], x4[28]); + x5[29] = _mm_subs_epi16(x4[30], x4[29]); + x5[30] = _mm_adds_epi16(x4[30], x4[29]); + + // stage 6 + __m128i x6[32]; + x6[0] = x5[0]; + x6[1] = x5[1]; + x6[2] = x5[2]; + x6[3] = x5[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x5[4], x5[7], x6[4], x6[7]); +
btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x5[5], x5[6], x6[5], x6[6]); + x6[8] = _mm_adds_epi16(x5[8], x5[9]); + x6[9] = _mm_subs_epi16(x5[8], x5[9]); + x6[10] = _mm_subs_epi16(x5[11], x5[10]); + x6[11] = _mm_adds_epi16(x5[11], x5[10]); + x6[12] = _mm_adds_epi16(x5[12], x5[13]); + x6[13] = _mm_subs_epi16(x5[12], x5[13]); + x6[14] = _mm_subs_epi16(x5[15], x5[14]); + x6[15] = _mm_adds_epi16(x5[15], x5[14]); + x6[16] = x5[16]; + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[17], x5[30], x6[17], x6[30]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[18], x5[29], x6[18], x6[29]); + x6[19] = x5[19]; + x6[20] = x5[20]; + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[21], x5[26], x6[21], x6[26]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[22], x5[25], x6[22], x6[25]); + x6[23] = x5[23]; + x6[24] = x5[24]; + x6[27] = x5[27]; + x6[28] = x5[28]; + x6[31] = x5[31]; + + // stage 7 + __m128i x7[32]; + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + x7[4] = x6[4]; + x7[5] = x6[5]; + x7[6] = x6[6]; + x7[7] = x6[7]; + btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x6[8], x6[15], x7[8], x7[15]); + btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x6[9], x6[14], x7[9], x7[14]); + btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x6[10], x6[13], x7[10], x7[13]); + btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x6[11], x6[12], x7[11], x7[12]); + x7[16] = _mm_adds_epi16(x6[16], x6[17]); + x7[17] = _mm_subs_epi16(x6[16], x6[17]); + x7[18] = _mm_subs_epi16(x6[19], x6[18]); + x7[19] = _mm_adds_epi16(x6[19], x6[18]); + x7[20] = _mm_adds_epi16(x6[20], x6[21]); + x7[21] = _mm_subs_epi16(x6[20], x6[21]); + x7[22] = _mm_subs_epi16(x6[23], x6[22]); + x7[23] = _mm_adds_epi16(x6[23], x6[22]); + x7[24] = _mm_adds_epi16(x6[24], x6[25]); + x7[25] = _mm_subs_epi16(x6[24], x6[25]); + x7[26] = _mm_subs_epi16(x6[27], x6[26]); + x7[27] = _mm_adds_epi16(x6[27], x6[26]); + x7[28] = _mm_adds_epi16(x6[28], x6[29]); + x7[29] = _mm_subs_epi16(x6[28], x6[29]); + x7[30] = _mm_subs_epi16(x6[31], x6[30]); + x7[31] =
_mm_adds_epi16(x6[31], x6[30]); + + // stage 8 + __m128i x8[32]; + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + x8[8] = x7[8]; + x8[9] = x7[9]; + x8[10] = x7[10]; + x8[11] = x7[11]; + x8[12] = x7[12]; + x8[13] = x7[13]; + x8[14] = x7[14]; + x8[15] = x7[15]; + btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x7[16], x7[31], x8[16], x8[31]); + btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x7[17], x7[30], x8[17], x8[30]); + btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x7[18], x7[29], x8[18], x8[29]); + btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x7[19], x7[28], x8[19], x8[28]); + btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x7[20], x7[27], x8[20], x8[27]); + btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x7[21], x7[26], x8[21], x8[26]); + btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x7[22], x7[25], x8[22], x8[25]); + btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x7[23], x7[24], x8[23], x8[24]); + + // stage 9 + output[0] = x8[0]; + output[1] = x8[16]; + output[2] = x8[8]; + output[3] = x8[24]; + output[4] = x8[4]; + output[5] = x8[20]; + output[6] = x8[12]; + output[7] = x8[28]; + output[8] = x8[2]; + output[9] = x8[18]; + output[10] = x8[10]; + output[11] = x8[26]; + output[12] = x8[6]; + output[13] = x8[22]; + output[14] = x8[14]; + output[15] = x8[30]; + output[16] = x8[1]; + output[17] = x8[17]; + output[18] = x8[9]; + output[19] = x8[25]; + output[20] = x8[5]; + output[21] = x8[21]; + output[22] = x8[13]; + output[23] = x8[29]; + output[24] = x8[3]; + output[25] = x8[19]; + output[26] = x8[11]; + output[27] = x8[27]; + output[28] = x8[7]; + output[29] = x8[23]; + output[30] = x8[15]; + output[31] = x8[31]; +} + +void fdct8x64_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = 
pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]); + __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]); + __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]); + __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]); + __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]); + __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]); + __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]); + __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]); + __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]); + __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]); + __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]); + __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]); + __m128i cospi_p54_p10 
= pair_set_epi16(cospi[54], cospi[10]); + __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]); + __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]); + __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]); + __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]); + __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]); + __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]); + __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]); + __m128i cospi_p63_p01 = pair_set_epi16(cospi[63], cospi[1]); + __m128i cospi_m01_p63 = pair_set_epi16(-cospi[1], cospi[63]); + __m128i cospi_p31_p33 = pair_set_epi16(cospi[31], cospi[33]); + __m128i cospi_m33_p31 = pair_set_epi16(-cospi[33], cospi[31]); + __m128i cospi_p47_p17 = pair_set_epi16(cospi[47], cospi[17]); + __m128i cospi_m17_p47 = pair_set_epi16(-cospi[17], cospi[47]); + __m128i cospi_p15_p49 = pair_set_epi16(cospi[15], cospi[49]); + __m128i cospi_m49_p15 = pair_set_epi16(-cospi[49], cospi[15]); + __m128i cospi_p55_p09 = pair_set_epi16(cospi[55], cospi[9]); + __m128i cospi_m09_p55 = pair_set_epi16(-cospi[9], cospi[55]); + __m128i cospi_p23_p41 = pair_set_epi16(cospi[23], cospi[41]); + __m128i cospi_m41_p23 = pair_set_epi16(-cospi[41], cospi[23]); + __m128i cospi_p39_p25 = pair_set_epi16(cospi[39], cospi[25]); + __m128i cospi_m25_p39 = pair_set_epi16(-cospi[25], cospi[39]); + __m128i cospi_p07_p57 = pair_set_epi16(cospi[7], cospi[57]); + __m128i cospi_m57_p07 = pair_set_epi16(-cospi[57], cospi[7]); + __m128i cospi_p59_p05 = pair_set_epi16(cospi[59], cospi[5]); + __m128i cospi_m05_p59 = pair_set_epi16(-cospi[5], cospi[59]); + __m128i cospi_p27_p37 = pair_set_epi16(cospi[27], cospi[37]); + __m128i cospi_m37_p27 = pair_set_epi16(-cospi[37], cospi[27]); + __m128i cospi_p43_p21 = pair_set_epi16(cospi[43], cospi[21]); + __m128i cospi_m21_p43 = pair_set_epi16(-cospi[21], cospi[43]); + __m128i cospi_p11_p53 = pair_set_epi16(cospi[11], cospi[53]); + __m128i cospi_m53_p11 = 
pair_set_epi16(-cospi[53], cospi[11]); + __m128i cospi_p51_p13 = pair_set_epi16(cospi[51], cospi[13]); + __m128i cospi_m13_p51 = pair_set_epi16(-cospi[13], cospi[51]); + __m128i cospi_p19_p45 = pair_set_epi16(cospi[19], cospi[45]); + __m128i cospi_m45_p19 = pair_set_epi16(-cospi[45], cospi[19]); + __m128i cospi_p35_p29 = pair_set_epi16(cospi[35], cospi[29]); + __m128i cospi_m29_p35 = pair_set_epi16(-cospi[29], cospi[35]); + __m128i cospi_p03_p61 = pair_set_epi16(cospi[3], cospi[61]); + __m128i cospi_m61_p03 = pair_set_epi16(-cospi[61], cospi[3]); + + // stage 1 + __m128i x1[64]; + x1[0] = _mm_adds_epi16(input[0], input[63]); + x1[63] = _mm_subs_epi16(input[0], input[63]); + x1[1] = _mm_adds_epi16(input[1], input[62]); + x1[62] = _mm_subs_epi16(input[1], input[62]); + x1[2] = _mm_adds_epi16(input[2], input[61]); + x1[61] = _mm_subs_epi16(input[2], input[61]); + x1[3] = _mm_adds_epi16(input[3], input[60]); + x1[60] = _mm_subs_epi16(input[3], input[60]); + x1[4] = _mm_adds_epi16(input[4], input[59]); + x1[59] = _mm_subs_epi16(input[4], input[59]); + x1[5] = _mm_adds_epi16(input[5], input[58]); + x1[58] = _mm_subs_epi16(input[5], input[58]); + x1[6] = _mm_adds_epi16(input[6], input[57]); + x1[57] = _mm_subs_epi16(input[6], input[57]); + x1[7] = _mm_adds_epi16(input[7], input[56]); + x1[56] = _mm_subs_epi16(input[7], input[56]); + x1[8] = _mm_adds_epi16(input[8], input[55]); + x1[55] = _mm_subs_epi16(input[8], input[55]); + x1[9] = _mm_adds_epi16(input[9], input[54]); + x1[54] = _mm_subs_epi16(input[9], input[54]); + x1[10] = _mm_adds_epi16(input[10], input[53]); + x1[53] = _mm_subs_epi16(input[10], input[53]); + x1[11] = _mm_adds_epi16(input[11], input[52]); + x1[52] = _mm_subs_epi16(input[11], input[52]); + x1[12] = _mm_adds_epi16(input[12], input[51]); + x1[51] = _mm_subs_epi16(input[12], input[51]); + x1[13] = _mm_adds_epi16(input[13], input[50]); + x1[50] = _mm_subs_epi16(input[13], input[50]); + x1[14] = _mm_adds_epi16(input[14], input[49]); + x1[49] = 
_mm_subs_epi16(input[14], input[49]); + x1[15] = _mm_adds_epi16(input[15], input[48]); + x1[48] = _mm_subs_epi16(input[15], input[48]); + x1[16] = _mm_adds_epi16(input[16], input[47]); + x1[47] = _mm_subs_epi16(input[16], input[47]); + x1[17] = _mm_adds_epi16(input[17], input[46]); + x1[46] = _mm_subs_epi16(input[17], input[46]); + x1[18] = _mm_adds_epi16(input[18], input[45]); + x1[45] = _mm_subs_epi16(input[18], input[45]); + x1[19] = _mm_adds_epi16(input[19], input[44]); + x1[44] = _mm_subs_epi16(input[19], input[44]); + x1[20] = _mm_adds_epi16(input[20], input[43]); + x1[43] = _mm_subs_epi16(input[20], input[43]); + x1[21] = _mm_adds_epi16(input[21], input[42]); + x1[42] = _mm_subs_epi16(input[21], input[42]); + x1[22] = _mm_adds_epi16(input[22], input[41]); + x1[41] = _mm_subs_epi16(input[22], input[41]); + x1[23] = _mm_adds_epi16(input[23], input[40]); + x1[40] = _mm_subs_epi16(input[23], input[40]); + x1[24] = _mm_adds_epi16(input[24], input[39]); + x1[39] = _mm_subs_epi16(input[24], input[39]); + x1[25] = _mm_adds_epi16(input[25], input[38]); + x1[38] = _mm_subs_epi16(input[25], input[38]); + x1[26] = _mm_adds_epi16(input[26], input[37]); + x1[37] = _mm_subs_epi16(input[26], input[37]); + x1[27] = _mm_adds_epi16(input[27], input[36]); + x1[36] = _mm_subs_epi16(input[27], input[36]); + x1[28] = _mm_adds_epi16(input[28], input[35]); + x1[35] = _mm_subs_epi16(input[28], input[35]); + x1[29] = _mm_adds_epi16(input[29], input[34]); + x1[34] = _mm_subs_epi16(input[29], input[34]); + x1[30] = _mm_adds_epi16(input[30], input[33]); + x1[33] = _mm_subs_epi16(input[30], input[33]); + x1[31] = _mm_adds_epi16(input[31], input[32]); + x1[32] = _mm_subs_epi16(input[31], input[32]); + + // stage 2 + __m128i x2[64]; + x2[0] = _mm_adds_epi16(x1[0], x1[31]); + x2[31] = _mm_subs_epi16(x1[0], x1[31]); + x2[1] = _mm_adds_epi16(x1[1], x1[30]); + x2[30] = _mm_subs_epi16(x1[1], x1[30]); + x2[2] = _mm_adds_epi16(x1[2], x1[29]); + x2[29] = _mm_subs_epi16(x1[2], x1[29]); + x2[3] = 
_mm_adds_epi16(x1[3], x1[28]); + x2[28] = _mm_subs_epi16(x1[3], x1[28]); + x2[4] = _mm_adds_epi16(x1[4], x1[27]); + x2[27] = _mm_subs_epi16(x1[4], x1[27]); + x2[5] = _mm_adds_epi16(x1[5], x1[26]); + x2[26] = _mm_subs_epi16(x1[5], x1[26]); + x2[6] = _mm_adds_epi16(x1[6], x1[25]); + x2[25] = _mm_subs_epi16(x1[6], x1[25]); + x2[7] = _mm_adds_epi16(x1[7], x1[24]); + x2[24] = _mm_subs_epi16(x1[7], x1[24]); + x2[8] = _mm_adds_epi16(x1[8], x1[23]); + x2[23] = _mm_subs_epi16(x1[8], x1[23]); + x2[9] = _mm_adds_epi16(x1[9], x1[22]); + x2[22] = _mm_subs_epi16(x1[9], x1[22]); + x2[10] = _mm_adds_epi16(x1[10], x1[21]); + x2[21] = _mm_subs_epi16(x1[10], x1[21]); + x2[11] = _mm_adds_epi16(x1[11], x1[20]); + x2[20] = _mm_subs_epi16(x1[11], x1[20]); + x2[12] = _mm_adds_epi16(x1[12], x1[19]); + x2[19] = _mm_subs_epi16(x1[12], x1[19]); + x2[13] = _mm_adds_epi16(x1[13], x1[18]); + x2[18] = _mm_subs_epi16(x1[13], x1[18]); + x2[14] = _mm_adds_epi16(x1[14], x1[17]); + x2[17] = _mm_subs_epi16(x1[14], x1[17]); + x2[15] = _mm_adds_epi16(x1[15], x1[16]); + x2[16] = _mm_subs_epi16(x1[15], x1[16]); + x2[32] = x1[32]; + x2[33] = x1[33]; + x2[34] = x1[34]; + x2[35] = x1[35]; + x2[36] = x1[36]; + x2[37] = x1[37]; + x2[38] = x1[38]; + x2[39] = x1[39]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[40], x1[55], x2[40], x2[55]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[41], x1[54], x2[41], x2[54]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[42], x1[53], x2[42], x2[53]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[43], x1[52], x2[43], x2[52]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[44], x1[51], x2[44], x2[51]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[45], x1[50], x2[45], x2[50]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[46], x1[49], x2[46], x2[49]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[47], x1[48], x2[47], x2[48]); + x2[56] = x1[56]; + x2[57] = x1[57]; + x2[58] = x1[58]; + x2[59] = x1[59]; + x2[60] = x1[60]; + x2[61] = x1[61]; + x2[62] = x1[62]; + x2[63] = 
x1[63]; + + // stage 3 + __m128i x3[64]; + x3[0] = _mm_adds_epi16(x2[0], x2[15]); + x3[15] = _mm_subs_epi16(x2[0], x2[15]); + x3[1] = _mm_adds_epi16(x2[1], x2[14]); + x3[14] = _mm_subs_epi16(x2[1], x2[14]); + x3[2] = _mm_adds_epi16(x2[2], x2[13]); + x3[13] = _mm_subs_epi16(x2[2], x2[13]); + x3[3] = _mm_adds_epi16(x2[3], x2[12]); + x3[12] = _mm_subs_epi16(x2[3], x2[12]); + x3[4] = _mm_adds_epi16(x2[4], x2[11]); + x3[11] = _mm_subs_epi16(x2[4], x2[11]); + x3[5] = _mm_adds_epi16(x2[5], x2[10]); + x3[10] = _mm_subs_epi16(x2[5], x2[10]); + x3[6] = _mm_adds_epi16(x2[6], x2[9]); + x3[9] = _mm_subs_epi16(x2[6], x2[9]); + x3[7] = _mm_adds_epi16(x2[7], x2[8]); + x3[8] = _mm_subs_epi16(x2[7], x2[8]); + x3[16] = x2[16]; + x3[17] = x2[17]; + x3[18] = x2[18]; + x3[19] = x2[19]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[20], x2[27], x3[20], x3[27]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[21], x2[26], x3[21], x3[26]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[22], x2[25], x3[22], x3[25]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[23], x2[24], x3[23], x3[24]); + x3[28] = x2[28]; + x3[29] = x2[29]; + x3[30] = x2[30]; + x3[31] = x2[31]; + x3[32] = _mm_adds_epi16(x2[32], x2[47]); + x3[47] = _mm_subs_epi16(x2[32], x2[47]); + x3[33] = _mm_adds_epi16(x2[33], x2[46]); + x3[46] = _mm_subs_epi16(x2[33], x2[46]); + x3[34] = _mm_adds_epi16(x2[34], x2[45]); + x3[45] = _mm_subs_epi16(x2[34], x2[45]); + x3[35] = _mm_adds_epi16(x2[35], x2[44]); + x3[44] = _mm_subs_epi16(x2[35], x2[44]); + x3[36] = _mm_adds_epi16(x2[36], x2[43]); + x3[43] = _mm_subs_epi16(x2[36], x2[43]); + x3[37] = _mm_adds_epi16(x2[37], x2[42]); + x3[42] = _mm_subs_epi16(x2[37], x2[42]); + x3[38] = _mm_adds_epi16(x2[38], x2[41]); + x3[41] = _mm_subs_epi16(x2[38], x2[41]); + x3[39] = _mm_adds_epi16(x2[39], x2[40]); + x3[40] = _mm_subs_epi16(x2[39], x2[40]); + x3[48] = _mm_subs_epi16(x2[63], x2[48]); + x3[63] = _mm_adds_epi16(x2[63], x2[48]); + x3[49] = _mm_subs_epi16(x2[62], x2[49]); + x3[62] = 
_mm_adds_epi16(x2[62], x2[49]); + x3[50] = _mm_subs_epi16(x2[61], x2[50]); + x3[61] = _mm_adds_epi16(x2[61], x2[50]); + x3[51] = _mm_subs_epi16(x2[60], x2[51]); + x3[60] = _mm_adds_epi16(x2[60], x2[51]); + x3[52] = _mm_subs_epi16(x2[59], x2[52]); + x3[59] = _mm_adds_epi16(x2[59], x2[52]); + x3[53] = _mm_subs_epi16(x2[58], x2[53]); + x3[58] = _mm_adds_epi16(x2[58], x2[53]); + x3[54] = _mm_subs_epi16(x2[57], x2[54]); + x3[57] = _mm_adds_epi16(x2[57], x2[54]); + x3[55] = _mm_subs_epi16(x2[56], x2[55]); + x3[56] = _mm_adds_epi16(x2[56], x2[55]); + + // stage 4 + __m128i x4[64]; + x4[0] = _mm_adds_epi16(x3[0], x3[7]); + x4[7] = _mm_subs_epi16(x3[0], x3[7]); + x4[1] = _mm_adds_epi16(x3[1], x3[6]); + x4[6] = _mm_subs_epi16(x3[1], x3[6]); + x4[2] = _mm_adds_epi16(x3[2], x3[5]); + x4[5] = _mm_subs_epi16(x3[2], x3[5]); + x4[3] = _mm_adds_epi16(x3[3], x3[4]); + x4[4] = _mm_subs_epi16(x3[3], x3[4]); + x4[8] = x3[8]; + x4[9] = x3[9]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[10], x3[13], x4[10], x4[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[11], x3[12], x4[11], x4[12]); + x4[14] = x3[14]; + x4[15] = x3[15]; + x4[16] = _mm_adds_epi16(x3[16], x3[23]); + x4[23] = _mm_subs_epi16(x3[16], x3[23]); + x4[17] = _mm_adds_epi16(x3[17], x3[22]); + x4[22] = _mm_subs_epi16(x3[17], x3[22]); + x4[18] = _mm_adds_epi16(x3[18], x3[21]); + x4[21] = _mm_subs_epi16(x3[18], x3[21]); + x4[19] = _mm_adds_epi16(x3[19], x3[20]); + x4[20] = _mm_subs_epi16(x3[19], x3[20]); + x4[24] = _mm_subs_epi16(x3[31], x3[24]); + x4[31] = _mm_adds_epi16(x3[31], x3[24]); + x4[25] = _mm_subs_epi16(x3[30], x3[25]); + x4[30] = _mm_adds_epi16(x3[30], x3[25]); + x4[26] = _mm_subs_epi16(x3[29], x3[26]); + x4[29] = _mm_adds_epi16(x3[29], x3[26]); + x4[27] = _mm_subs_epi16(x3[28], x3[27]); + x4[28] = _mm_adds_epi16(x3[28], x3[27]); + x4[32] = x3[32]; + x4[33] = x3[33]; + x4[34] = x3[34]; + x4[35] = x3[35]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[36], x3[59], x4[36], x4[59]); + btf_16_sse2(cospi_m16_p48, 
cospi_p48_p16, x3[37], x3[58], x4[37], x4[58]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[38], x3[57], x4[38], x4[57]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[39], x3[56], x4[39], x4[56]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[40], x3[55], x4[40], x4[55]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[41], x3[54], x4[41], x4[54]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[42], x3[53], x4[42], x4[53]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[43], x3[52], x4[43], x4[52]); + x4[44] = x3[44]; + x4[45] = x3[45]; + x4[46] = x3[46]; + x4[47] = x3[47]; + x4[48] = x3[48]; + x4[49] = x3[49]; + x4[50] = x3[50]; + x4[51] = x3[51]; + x4[60] = x3[60]; + x4[61] = x3[61]; + x4[62] = x3[62]; + x4[63] = x3[63]; + + // stage 5 + __m128i x5[64]; + x5[0] = _mm_adds_epi16(x4[0], x4[3]); + x5[3] = _mm_subs_epi16(x4[0], x4[3]); + x5[1] = _mm_adds_epi16(x4[1], x4[2]); + x5[2] = _mm_subs_epi16(x4[1], x4[2]); + x5[4] = x4[4]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x4[5], x4[6], x5[5], x5[6]); + x5[7] = x4[7]; + x5[8] = _mm_adds_epi16(x4[8], x4[11]); + x5[11] = _mm_subs_epi16(x4[8], x4[11]); + x5[9] = _mm_adds_epi16(x4[9], x4[10]); + x5[10] = _mm_subs_epi16(x4[9], x4[10]); + x5[12] = _mm_subs_epi16(x4[15], x4[12]); + x5[15] = _mm_adds_epi16(x4[15], x4[12]); + x5[13] = _mm_subs_epi16(x4[14], x4[13]); + x5[14] = _mm_adds_epi16(x4[14], x4[13]); + x5[16] = x4[16]; + x5[17] = x4[17]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[18], x4[29], x5[18], x5[29]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[19], x4[28], x5[19], x5[28]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[20], x4[27], x5[20], x5[27]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[21], x4[26], x5[21], x5[26]); + x5[22] = x4[22]; + x5[23] = x4[23]; + x5[24] = x4[24]; + x5[25] = x4[25]; + x5[30] = x4[30]; + x5[31] = x4[31]; + x5[32] = _mm_adds_epi16(x4[32], x4[39]); + x5[39] = _mm_subs_epi16(x4[32], x4[39]); + x5[33] = _mm_adds_epi16(x4[33], x4[38]); + x5[38] = _mm_subs_epi16(x4[33], 
x4[38]); + x5[34] = _mm_adds_epi16(x4[34], x4[37]); + x5[37] = _mm_subs_epi16(x4[34], x4[37]); + x5[35] = _mm_adds_epi16(x4[35], x4[36]); + x5[36] = _mm_subs_epi16(x4[35], x4[36]); + x5[40] = _mm_subs_epi16(x4[47], x4[40]); + x5[47] = _mm_adds_epi16(x4[47], x4[40]); + x5[41] = _mm_subs_epi16(x4[46], x4[41]); + x5[46] = _mm_adds_epi16(x4[46], x4[41]); + x5[42] = _mm_subs_epi16(x4[45], x4[42]); + x5[45] = _mm_adds_epi16(x4[45], x4[42]); + x5[43] = _mm_subs_epi16(x4[44], x4[43]); + x5[44] = _mm_adds_epi16(x4[44], x4[43]); + x5[48] = _mm_adds_epi16(x4[48], x4[55]); + x5[55] = _mm_subs_epi16(x4[48], x4[55]); + x5[49] = _mm_adds_epi16(x4[49], x4[54]); + x5[54] = _mm_subs_epi16(x4[49], x4[54]); + x5[50] = _mm_adds_epi16(x4[50], x4[53]); + x5[53] = _mm_subs_epi16(x4[50], x4[53]); + x5[51] = _mm_adds_epi16(x4[51], x4[52]); + x5[52] = _mm_subs_epi16(x4[51], x4[52]); + x5[56] = _mm_subs_epi16(x4[63], x4[56]); + x5[63] = _mm_adds_epi16(x4[63], x4[56]); + x5[57] = _mm_subs_epi16(x4[62], x4[57]); + x5[62] = _mm_adds_epi16(x4[62], x4[57]); + x5[58] = _mm_subs_epi16(x4[61], x4[58]); + x5[61] = _mm_adds_epi16(x4[61], x4[58]); + x5[59] = _mm_subs_epi16(x4[60], x4[59]); + x5[60] = _mm_adds_epi16(x4[60], x4[59]); + + // stage 6 + __m128i x6[64]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x5[0], x5[1], x6[0], x6[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x5[2], x5[3], x6[2], x6[3]); + x6[4] = _mm_adds_epi16(x5[4], x5[5]); + x6[5] = _mm_subs_epi16(x5[4], x5[5]); + x6[6] = _mm_subs_epi16(x5[7], x5[6]); + x6[7] = _mm_adds_epi16(x5[7], x5[6]); + x6[8] = x5[8]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x5[9], x5[14], x6[9], x6[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x5[10], x5[13], x6[10], x6[13]); + x6[11] = x5[11]; + x6[12] = x5[12]; + x6[15] = x5[15]; + x6[16] = _mm_adds_epi16(x5[16], x5[19]); + x6[19] = _mm_subs_epi16(x5[16], x5[19]); + x6[17] = _mm_adds_epi16(x5[17], x5[18]); + x6[18] = _mm_subs_epi16(x5[17], x5[18]); + x6[20] = _mm_subs_epi16(x5[23], x5[20]); + 
x6[23] = _mm_adds_epi16(x5[23], x5[20]); + x6[21] = _mm_subs_epi16(x5[22], x5[21]); + x6[22] = _mm_adds_epi16(x5[22], x5[21]); + x6[24] = _mm_adds_epi16(x5[24], x5[27]); + x6[27] = _mm_subs_epi16(x5[24], x5[27]); + x6[25] = _mm_adds_epi16(x5[25], x5[26]); + x6[26] = _mm_subs_epi16(x5[25], x5[26]); + x6[28] = _mm_subs_epi16(x5[31], x5[28]); + x6[31] = _mm_adds_epi16(x5[31], x5[28]); + x6[29] = _mm_subs_epi16(x5[30], x5[29]); + x6[30] = _mm_adds_epi16(x5[30], x5[29]); + x6[32] = x5[32]; + x6[33] = x5[33]; + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[34], x5[61], x6[34], x6[61]); + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[35], x5[60], x6[35], x6[60]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[36], x5[59], x6[36], x6[59]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[37], x5[58], x6[37], x6[58]); + x6[38] = x5[38]; + x6[39] = x5[39]; + x6[40] = x5[40]; + x6[41] = x5[41]; + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[42], x5[53], x6[42], x6[53]); + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[43], x5[52], x6[43], x6[52]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[44], x5[51], x6[44], x6[51]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[45], x5[50], x6[45], x6[50]); + x6[46] = x5[46]; + x6[47] = x5[47]; + x6[48] = x5[48]; + x6[49] = x5[49]; + x6[54] = x5[54]; + x6[55] = x5[55]; + x6[56] = x5[56]; + x6[57] = x5[57]; + x6[62] = x5[62]; + x6[63] = x5[63]; + + // stage 7 + __m128i x7[64]; + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x6[4], x6[7], x7[4], x7[7]); + btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x6[5], x6[6], x7[5], x7[6]); + x7[8] = _mm_adds_epi16(x6[8], x6[9]); + x7[9] = _mm_subs_epi16(x6[8], x6[9]); + x7[10] = _mm_subs_epi16(x6[11], x6[10]); + x7[11] = _mm_adds_epi16(x6[11], x6[10]); + x7[12] = _mm_adds_epi16(x6[12], x6[13]); + x7[13] = _mm_subs_epi16(x6[12], x6[13]); + x7[14] = _mm_subs_epi16(x6[15], x6[14]); + x7[15] = _mm_adds_epi16(x6[15], x6[14]); + x7[16] = x6[16]; + 
btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x6[17], x6[30], x7[17], x7[30]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x6[18], x6[29], x7[18], x7[29]); + x7[19] = x6[19]; + x7[20] = x6[20]; + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x6[21], x6[26], x7[21], x7[26]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x6[22], x6[25], x7[22], x7[25]); + x7[23] = x6[23]; + x7[24] = x6[24]; + x7[27] = x6[27]; + x7[28] = x6[28]; + x7[31] = x6[31]; + x7[32] = _mm_adds_epi16(x6[32], x6[35]); + x7[35] = _mm_subs_epi16(x6[32], x6[35]); + x7[33] = _mm_adds_epi16(x6[33], x6[34]); + x7[34] = _mm_subs_epi16(x6[33], x6[34]); + x7[36] = _mm_subs_epi16(x6[39], x6[36]); + x7[39] = _mm_adds_epi16(x6[39], x6[36]); + x7[37] = _mm_subs_epi16(x6[38], x6[37]); + x7[38] = _mm_adds_epi16(x6[38], x6[37]); + x7[40] = _mm_adds_epi16(x6[40], x6[43]); + x7[43] = _mm_subs_epi16(x6[40], x6[43]); + x7[41] = _mm_adds_epi16(x6[41], x6[42]); + x7[42] = _mm_subs_epi16(x6[41], x6[42]); + x7[44] = _mm_subs_epi16(x6[47], x6[44]); + x7[47] = _mm_adds_epi16(x6[47], x6[44]); + x7[45] = _mm_subs_epi16(x6[46], x6[45]); + x7[46] = _mm_adds_epi16(x6[46], x6[45]); + x7[48] = _mm_adds_epi16(x6[48], x6[51]); + x7[51] = _mm_subs_epi16(x6[48], x6[51]); + x7[49] = _mm_adds_epi16(x6[49], x6[50]); + x7[50] = _mm_subs_epi16(x6[49], x6[50]); + x7[52] = _mm_subs_epi16(x6[55], x6[52]); + x7[55] = _mm_adds_epi16(x6[55], x6[52]); + x7[53] = _mm_subs_epi16(x6[54], x6[53]); + x7[54] = _mm_adds_epi16(x6[54], x6[53]); + x7[56] = _mm_adds_epi16(x6[56], x6[59]); + x7[59] = _mm_subs_epi16(x6[56], x6[59]); + x7[57] = _mm_adds_epi16(x6[57], x6[58]); + x7[58] = _mm_subs_epi16(x6[57], x6[58]); + x7[60] = _mm_subs_epi16(x6[63], x6[60]); + x7[63] = _mm_adds_epi16(x6[63], x6[60]); + x7[61] = _mm_subs_epi16(x6[62], x6[61]); + x7[62] = _mm_adds_epi16(x6[62], x6[61]); + + // stage 8 + __m128i x8[64]; + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + 
btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x7[8], x7[15], x8[8], x8[15]); + btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x7[9], x7[14], x8[9], x8[14]); + btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x7[10], x7[13], x8[10], x8[13]); + btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x7[11], x7[12], x8[11], x8[12]); + x8[16] = _mm_adds_epi16(x7[16], x7[17]); + x8[17] = _mm_subs_epi16(x7[16], x7[17]); + x8[18] = _mm_subs_epi16(x7[19], x7[18]); + x8[19] = _mm_adds_epi16(x7[19], x7[18]); + x8[20] = _mm_adds_epi16(x7[20], x7[21]); + x8[21] = _mm_subs_epi16(x7[20], x7[21]); + x8[22] = _mm_subs_epi16(x7[23], x7[22]); + x8[23] = _mm_adds_epi16(x7[23], x7[22]); + x8[24] = _mm_adds_epi16(x7[24], x7[25]); + x8[25] = _mm_subs_epi16(x7[24], x7[25]); + x8[26] = _mm_subs_epi16(x7[27], x7[26]); + x8[27] = _mm_adds_epi16(x7[27], x7[26]); + x8[28] = _mm_adds_epi16(x7[28], x7[29]); + x8[29] = _mm_subs_epi16(x7[28], x7[29]); + x8[30] = _mm_subs_epi16(x7[31], x7[30]); + x8[31] = _mm_adds_epi16(x7[31], x7[30]); + x8[32] = x7[32]; + btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x7[33], x7[62], x8[33], x8[62]); + btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x7[34], x7[61], x8[34], x8[61]); + x8[35] = x7[35]; + x8[36] = x7[36]; + btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x7[37], x7[58], x8[37], x8[58]); + btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x7[38], x7[57], x8[38], x8[57]); + x8[39] = x7[39]; + x8[40] = x7[40]; + btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x7[41], x7[54], x8[41], x8[54]); + btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x7[42], x7[53], x8[42], x8[53]); + x8[43] = x7[43]; + x8[44] = x7[44]; + btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x7[45], x7[50], x8[45], x8[50]); + btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x7[46], x7[49], x8[46], x8[49]); + x8[47] = x7[47]; + x8[48] = x7[48]; + x8[51] = x7[51]; + x8[52] = x7[52]; + x8[55] = x7[55]; + x8[56] = x7[56]; + x8[59] = x7[59]; + x8[60] = x7[60]; + x8[63] = x7[63]; + + // stage 9 + __m128i x9[64]; + x9[0] = x8[0]; + x9[1] = x8[1]; + x9[2] = x8[2]; + 
x9[3] = x8[3]; + x9[4] = x8[4]; + x9[5] = x8[5]; + x9[6] = x8[6]; + x9[7] = x8[7]; + x9[8] = x8[8]; + x9[9] = x8[9]; + x9[10] = x8[10]; + x9[11] = x8[11]; + x9[12] = x8[12]; + x9[13] = x8[13]; + x9[14] = x8[14]; + x9[15] = x8[15]; + btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x8[16], x8[31], x9[16], x9[31]); + btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x8[17], x8[30], x9[17], x9[30]); + btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x8[18], x8[29], x9[18], x9[29]); + btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x8[19], x8[28], x9[19], x9[28]); + btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x8[20], x8[27], x9[20], x9[27]); + btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x8[21], x8[26], x9[21], x9[26]); + btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x8[22], x8[25], x9[22], x9[25]); + btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x8[23], x8[24], x9[23], x9[24]); + x9[32] = _mm_adds_epi16(x8[32], x8[33]); + x9[33] = _mm_subs_epi16(x8[32], x8[33]); + x9[34] = _mm_subs_epi16(x8[35], x8[34]); + x9[35] = _mm_adds_epi16(x8[35], x8[34]); + x9[36] = _mm_adds_epi16(x8[36], x8[37]); + x9[37] = _mm_subs_epi16(x8[36], x8[37]); + x9[38] = _mm_subs_epi16(x8[39], x8[38]); + x9[39] = _mm_adds_epi16(x8[39], x8[38]); + x9[40] = _mm_adds_epi16(x8[40], x8[41]); + x9[41] = _mm_subs_epi16(x8[40], x8[41]); + x9[42] = _mm_subs_epi16(x8[43], x8[42]); + x9[43] = _mm_adds_epi16(x8[43], x8[42]); + x9[44] = _mm_adds_epi16(x8[44], x8[45]); + x9[45] = _mm_subs_epi16(x8[44], x8[45]); + x9[46] = _mm_subs_epi16(x8[47], x8[46]); + x9[47] = _mm_adds_epi16(x8[47], x8[46]); + x9[48] = _mm_adds_epi16(x8[48], x8[49]); + x9[49] = _mm_subs_epi16(x8[48], x8[49]); + x9[50] = _mm_subs_epi16(x8[51], x8[50]); + x9[51] = _mm_adds_epi16(x8[51], x8[50]); + x9[52] = _mm_adds_epi16(x8[52], x8[53]); + x9[53] = _mm_subs_epi16(x8[52], x8[53]); + x9[54] = _mm_subs_epi16(x8[55], x8[54]); + x9[55] = _mm_adds_epi16(x8[55], x8[54]); + x9[56] = _mm_adds_epi16(x8[56], x8[57]); + x9[57] = _mm_subs_epi16(x8[56], x8[57]); + x9[58] = _mm_subs_epi16(x8[59], 
x8[58]); + x9[59] = _mm_adds_epi16(x8[59], x8[58]); + x9[60] = _mm_adds_epi16(x8[60], x8[61]); + x9[61] = _mm_subs_epi16(x8[60], x8[61]); + x9[62] = _mm_subs_epi16(x8[63], x8[62]); + x9[63] = _mm_adds_epi16(x8[63], x8[62]); + + // stage 10 + __m128i x10[64]; + x10[0] = x9[0]; + x10[1] = x9[1]; + x10[2] = x9[2]; + x10[3] = x9[3]; + x10[4] = x9[4]; + x10[5] = x9[5]; + x10[6] = x9[6]; + x10[7] = x9[7]; + x10[8] = x9[8]; + x10[9] = x9[9]; + x10[10] = x9[10]; + x10[11] = x9[11]; + x10[12] = x9[12]; + x10[13] = x9[13]; + x10[14] = x9[14]; + x10[15] = x9[15]; + x10[16] = x9[16]; + x10[17] = x9[17]; + x10[18] = x9[18]; + x10[19] = x9[19]; + x10[20] = x9[20]; + x10[21] = x9[21]; + x10[22] = x9[22]; + x10[23] = x9[23]; + x10[24] = x9[24]; + x10[25] = x9[25]; + x10[26] = x9[26]; + x10[27] = x9[27]; + x10[28] = x9[28]; + x10[29] = x9[29]; + x10[30] = x9[30]; + x10[31] = x9[31]; + btf_16_sse2(cospi_p63_p01, cospi_m01_p63, x9[32], x9[63], x10[32], x10[63]); + btf_16_sse2(cospi_p31_p33, cospi_m33_p31, x9[33], x9[62], x10[33], x10[62]); + btf_16_sse2(cospi_p47_p17, cospi_m17_p47, x9[34], x9[61], x10[34], x10[61]); + btf_16_sse2(cospi_p15_p49, cospi_m49_p15, x9[35], x9[60], x10[35], x10[60]); + btf_16_sse2(cospi_p55_p09, cospi_m09_p55, x9[36], x9[59], x10[36], x10[59]); + btf_16_sse2(cospi_p23_p41, cospi_m41_p23, x9[37], x9[58], x10[37], x10[58]); + btf_16_sse2(cospi_p39_p25, cospi_m25_p39, x9[38], x9[57], x10[38], x10[57]); + btf_16_sse2(cospi_p07_p57, cospi_m57_p07, x9[39], x9[56], x10[39], x10[56]); + btf_16_sse2(cospi_p59_p05, cospi_m05_p59, x9[40], x9[55], x10[40], x10[55]); + btf_16_sse2(cospi_p27_p37, cospi_m37_p27, x9[41], x9[54], x10[41], x10[54]); + btf_16_sse2(cospi_p43_p21, cospi_m21_p43, x9[42], x9[53], x10[42], x10[53]); + btf_16_sse2(cospi_p11_p53, cospi_m53_p11, x9[43], x9[52], x10[43], x10[52]); + btf_16_sse2(cospi_p51_p13, cospi_m13_p51, x9[44], x9[51], x10[44], x10[51]); + btf_16_sse2(cospi_p19_p45, cospi_m45_p19, x9[45], x9[50], x10[45], x10[50]); + 
btf_16_sse2(cospi_p35_p29, cospi_m29_p35, x9[46], x9[49], x10[46], x10[49]); + btf_16_sse2(cospi_p03_p61, cospi_m61_p03, x9[47], x9[48], x10[47], x10[48]); + + // stage 11 + output[0] = x10[0]; + output[1] = x10[32]; + output[2] = x10[16]; + output[3] = x10[48]; + output[4] = x10[8]; + output[5] = x10[40]; + output[6] = x10[24]; + output[7] = x10[56]; + output[8] = x10[4]; + output[9] = x10[36]; + output[10] = x10[20]; + output[11] = x10[52]; + output[12] = x10[12]; + output[13] = x10[44]; + output[14] = x10[28]; + output[15] = x10[60]; + output[16] = x10[2]; + output[17] = x10[34]; + output[18] = x10[18]; + output[19] = x10[50]; + output[20] = x10[10]; + output[21] = x10[42]; + output[22] = x10[26]; + output[23] = x10[58]; + output[24] = x10[6]; + output[25] = x10[38]; + output[26] = x10[22]; + output[27] = x10[54]; + output[28] = x10[14]; + output[29] = x10[46]; + output[30] = x10[30]; + output[31] = x10[62]; + output[32] = x10[1]; + output[33] = x10[33]; + output[34] = x10[17]; + output[35] = x10[49]; + output[36] = x10[9]; + output[37] = x10[41]; + output[38] = x10[25]; + output[39] = x10[57]; + output[40] = x10[5]; + output[41] = x10[37]; + output[42] = x10[21]; + output[43] = x10[53]; + output[44] = x10[13]; + output[45] = x10[45]; + output[46] = x10[29]; + output[47] = x10[61]; + output[48] = x10[3]; + output[49] = x10[35]; + output[50] = x10[19]; + output[51] = x10[51]; + output[52] = x10[11]; + output[53] = x10[43]; + output[54] = x10[27]; + output[55] = x10[59]; + output[56] = x10[7]; + output[57] = x10[39]; + output[58] = x10[23]; + output[59] = x10[55]; + output[60] = x10[15]; + output[61] = x10[47]; + output[62] = x10[31]; + output[63] = x10[63]; +} + +/* Forward ADST kernel (per its name) over input[0..3]. Only the low four 16-bit lanes of each input register are read (unpacklo). Products with the sinpi_arr(cos_bit) constants are accumulated in 32 bits, rounded with 1 << (cos_bit - 1), shifted right by cos_bit and packed back to 16 bits with saturation. Each of output[0..3] carries its result in the low four lanes. Assumes cos_bit >= 1, otherwise the rounding shift count is negative. */ static void fadst4x4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *sinpi = sinpi_arr(cos_bit); + const __m128i sinpi_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]); + const __m128i sinpi_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]); + const __m128i sinpi_p03_p04 =
pair_set_epi16(sinpi[3], sinpi[4]); + const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]); + const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]); + const __m128i __zero = _mm_set1_epi16(0); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + const __m128i in7 = _mm_add_epi16(input[0], input[1]); + __m128i u[8], v[8]; + + u[0] = _mm_unpacklo_epi16(input[0], input[1]); + u[1] = _mm_unpacklo_epi16(input[2], input[3]); + u[2] = _mm_unpacklo_epi16(in7, __zero); + u[3] = _mm_unpacklo_epi16(input[2], __zero); + u[4] = _mm_unpacklo_epi16(input[3], __zero); + + v[0] = _mm_madd_epi16(u[0], sinpi_p01_p02); // s0 + s2 + v[1] = _mm_madd_epi16(u[1], sinpi_p03_p04); // s4 + s5 + v[2] = _mm_madd_epi16(u[2], sinpi_p03_p03); // x1 + v[3] = _mm_madd_epi16(u[0], sinpi_p04_m01); // s1 - s3 + v[4] = _mm_madd_epi16(u[1], sinpi_m03_p02); // -s4 + s6 + v[5] = _mm_madd_epi16(u[3], sinpi_p03_p03); // s4 + v[6] = _mm_madd_epi16(u[4], sinpi_p03_p03); // input[3] * sinpi[3] (the zero-interleaved lanes contribute nothing) + + u[0] = _mm_add_epi32(v[0], v[1]); + u[1] = _mm_sub_epi32(v[2], v[6]); + u[2] = _mm_add_epi32(v[3], v[4]); + u[3] = _mm_sub_epi32(u[2], u[0]); + u[4] = _mm_slli_epi32(v[5], 2); + u[5] = _mm_sub_epi32(u[4], v[5]); // 4 * v[5] - v[5], i.e. 3 * v[5] + u[6] = _mm_add_epi32(u[3], u[5]); + + v[0] = _mm_add_epi32(u[0], __rounding); + v[1] = _mm_add_epi32(u[1], __rounding); + v[2] = _mm_add_epi32(u[2], __rounding); + v[3] = _mm_add_epi32(u[6], __rounding); + + u[0] = _mm_srai_epi32(v[0], cos_bit); + u[1] = _mm_srai_epi32(v[1], cos_bit); + u[2] = _mm_srai_epi32(v[2], cos_bit); + u[3] = _mm_srai_epi32(v[3], cos_bit); + + output[0] = _mm_packs_epi32(u[0], u[2]); + output[1] = _mm_packs_epi32(u[1], u[3]); + output[2] = _mm_srli_si128(output[0], 8); // bring the u[2] half packed into output[0] down to the low lanes + output[3] = _mm_srli_si128(output[1], 8); // likewise for the u[3] half of output[1] +} + +/* Forward ADST kernel (per its name) over input[0..7], written as seven stages. Stage 1 reorders the inputs and negates four of them by saturating subtraction from zero. Stages 2, 4 and 6 apply btf_16_w4_sse2 butterflies with cospi_arr(cos_bit) constant pairs. Stages 3 and 5 are saturating 16-bit add/sub pairs. Stage 7 stores the results to output[0..7] in permuted order. NOTE(review): btf_16_w4_sse2 is defined outside this chunk; presumably it handles only the low four lanes - confirm. */ static void fadst4x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i
cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + + // stage 1 + __m128i x1[8]; + x1[0] = input[0]; + x1[1] = _mm_subs_epi16(__zero, input[7]); + x1[2] = _mm_subs_epi16(__zero, input[3]); + x1[3] = input[4]; + x1[4] = _mm_subs_epi16(__zero, input[1]); + x1[5] = input[6]; + x1[6] = input[2]; + x1[7] = _mm_subs_epi16(__zero, input[5]); + + // stage 2 + __m128i x2[8]; + x2[0] = x1[0]; + x2[1] = x1[1]; + btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[2], + &x1[3], &x2[2], &x2[3]); + x2[4] = x1[4]; + x2[5] = x1[5]; + btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[6], + &x1[7], &x2[6], &x2[7]); + + // stage 3 + __m128i x3[8]; + x3[0] = _mm_adds_epi16(x2[0], x2[2]); + x3[2] = _mm_subs_epi16(x2[0], x2[2]); + x3[1] = _mm_adds_epi16(x2[1], x2[3]); + x3[3] = _mm_subs_epi16(x2[1], x2[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[6]); + x3[6] = _mm_subs_epi16(x2[4], x2[6]); + x3[5] = _mm_adds_epi16(x2[5], x2[7]); + x3[7] = _mm_subs_epi16(x2[5], x2[7]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_w4_sse2(&cospi_p16_p48, &cospi_p48_m16, __rounding, cos_bit, &x3[4], + &x3[5], &x4[4], &x4[5]); + 
btf_16_w4_sse2(&cospi_m48_p16, &cospi_p16_p48, __rounding, cos_bit, &x3[6], + &x3[7], &x4[6], &x4[7]); + + // stage 5 + __m128i x5[8]; + x5[0] = _mm_adds_epi16(x4[0], x4[4]); + x5[4] = _mm_subs_epi16(x4[0], x4[4]); + x5[1] = _mm_adds_epi16(x4[1], x4[5]); + x5[5] = _mm_subs_epi16(x4[1], x4[5]); + x5[2] = _mm_adds_epi16(x4[2], x4[6]); + x5[6] = _mm_subs_epi16(x4[2], x4[6]); + x5[3] = _mm_adds_epi16(x4[3], x4[7]); + x5[7] = _mm_subs_epi16(x4[3], x4[7]); + + // stage 6 + __m128i x6[8]; + btf_16_w4_sse2(&cospi_p04_p60, &cospi_p60_m04, __rounding, cos_bit, &x5[0], + &x5[1], &x6[0], &x6[1]); + btf_16_w4_sse2(&cospi_p20_p44, &cospi_p44_m20, __rounding, cos_bit, &x5[2], + &x5[3], &x6[2], &x6[3]); + btf_16_w4_sse2(&cospi_p36_p28, &cospi_p28_m36, __rounding, cos_bit, &x5[4], + &x5[5], &x6[4], &x6[5]); + btf_16_w4_sse2(&cospi_p52_p12, &cospi_p12_m52, __rounding, cos_bit, &x5[6], + &x5[7], &x6[6], &x6[7]); + + // stage 7 + output[0] = x6[1]; + output[1] = x6[6]; + output[2] = x6[3]; + output[3] = x6[4]; + output[4] = x6[5]; + output[5] = x6[2]; + output[6] = x6[7]; + output[7] = x6[0]; +} + +/* Forward ADST kernel (per its name) over input[0..3], using all eight 16-bit lanes of each register. It performs the same arithmetic as fadst4x4_new_sse2, but every step is done twice: once on the unpacklo half (u_lo/v_lo) and once on the unpackhi half (u_hi/v_hi). Sums are kept in 32 bits, rounded with 1 << (cos_bit - 1), shifted right by cos_bit, and each output[i] packs the lo and hi halves of result i with saturation. Assumes cos_bit >= 1. */ static void fadst8x4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *sinpi = sinpi_arr(cos_bit); + const __m128i sinpi_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]); + const __m128i sinpi_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]); + const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]); + const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]); + const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]); + const __m128i __zero = _mm_set1_epi16(0); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + const __m128i in7 = _mm_add_epi16(input[0], input[1]); + __m128i u_lo[8], u_hi[8], v_lo[8], v_hi[8]; + + u_lo[0] = _mm_unpacklo_epi16(input[0], input[1]); + u_hi[0] = _mm_unpackhi_epi16(input[0], input[1]); + u_lo[1] = _mm_unpacklo_epi16(input[2], input[3]); + u_hi[1] = _mm_unpackhi_epi16(input[2], input[3]); +
u_lo[2] = _mm_unpacklo_epi16(in7, __zero); + u_hi[2] = _mm_unpackhi_epi16(in7, __zero); + u_lo[3] = _mm_unpacklo_epi16(input[2], __zero); + u_hi[3] = _mm_unpackhi_epi16(input[2], __zero); + u_lo[4] = _mm_unpacklo_epi16(input[3], __zero); + u_hi[4] = _mm_unpackhi_epi16(input[3], __zero); + + v_lo[0] = _mm_madd_epi16(u_lo[0], sinpi_p01_p02); // s0 + s2 + v_hi[0] = _mm_madd_epi16(u_hi[0], sinpi_p01_p02); // s0 + s2 + v_lo[1] = _mm_madd_epi16(u_lo[1], sinpi_p03_p04); // s4 + s5 + v_hi[1] = _mm_madd_epi16(u_hi[1], sinpi_p03_p04); // s4 + s5 + v_lo[2] = _mm_madd_epi16(u_lo[2], sinpi_p03_p03); // x1 + v_hi[2] = _mm_madd_epi16(u_hi[2], sinpi_p03_p03); // x1 + v_lo[3] = _mm_madd_epi16(u_lo[0], sinpi_p04_m01); // s1 - s3 + v_hi[3] = _mm_madd_epi16(u_hi[0], sinpi_p04_m01); // s1 - s3 + v_lo[4] = _mm_madd_epi16(u_lo[1], sinpi_m03_p02); // -s4 + s6 + v_hi[4] = _mm_madd_epi16(u_hi[1], sinpi_m03_p02); // -s4 + s6 + v_lo[5] = _mm_madd_epi16(u_lo[3], sinpi_p03_p03); // s4 + v_hi[5] = _mm_madd_epi16(u_hi[3], sinpi_p03_p03); // s4 + v_lo[6] = _mm_madd_epi16(u_lo[4], sinpi_p03_p03); + v_hi[6] = _mm_madd_epi16(u_hi[4], sinpi_p03_p03); + + u_lo[0] = _mm_add_epi32(v_lo[0], v_lo[1]); + u_hi[0] = _mm_add_epi32(v_hi[0], v_hi[1]); + u_lo[1] = _mm_sub_epi32(v_lo[2], v_lo[6]); + u_hi[1] = _mm_sub_epi32(v_hi[2], v_hi[6]); + u_lo[2] = _mm_add_epi32(v_lo[3], v_lo[4]); + u_hi[2] = _mm_add_epi32(v_hi[3], v_hi[4]); + u_lo[3] = _mm_sub_epi32(u_lo[2], u_lo[0]); + u_hi[3] = _mm_sub_epi32(u_hi[2], u_hi[0]); + u_lo[4] = _mm_slli_epi32(v_lo[5], 2); + u_hi[4] = _mm_slli_epi32(v_hi[5], 2); + u_lo[5] = _mm_sub_epi32(u_lo[4], v_lo[5]); + u_hi[5] = _mm_sub_epi32(u_hi[4], v_hi[5]); + u_lo[6] = _mm_add_epi32(u_lo[3], u_lo[5]); + u_hi[6] = _mm_add_epi32(u_hi[3], u_hi[5]); + + v_lo[0] = _mm_add_epi32(u_lo[0], __rounding); + v_hi[0] = _mm_add_epi32(u_hi[0], __rounding); + v_lo[1] = _mm_add_epi32(u_lo[1], __rounding); + v_hi[1] = _mm_add_epi32(u_hi[1], __rounding); + v_lo[2] = _mm_add_epi32(u_lo[2], __rounding); +
v_hi[2] = _mm_add_epi32(u_hi[2], __rounding); + v_lo[3] = _mm_add_epi32(u_lo[6], __rounding); + v_hi[3] = _mm_add_epi32(u_hi[6], __rounding); + + u_lo[0] = _mm_srai_epi32(v_lo[0], cos_bit); + u_hi[0] = _mm_srai_epi32(v_hi[0], cos_bit); + u_lo[1] = _mm_srai_epi32(v_lo[1], cos_bit); + u_hi[1] = _mm_srai_epi32(v_hi[1], cos_bit); + u_lo[2] = _mm_srai_epi32(v_lo[2], cos_bit); + u_hi[2] = _mm_srai_epi32(v_hi[2], cos_bit); + u_lo[3] = _mm_srai_epi32(v_lo[3], cos_bit); + u_hi[3] = _mm_srai_epi32(v_hi[3], cos_bit); + + output[0] = _mm_packs_epi32(u_lo[0], u_hi[0]); + output[1] = _mm_packs_epi32(u_lo[1], u_hi[1]); + output[2] = _mm_packs_epi32(u_lo[2], u_hi[2]); + output[3] = _mm_packs_epi32(u_lo[3], u_hi[3]); +} + +/* Forward ADST kernel (per its name) over input[0..7], written as seven stages with the same stage layout and constant pairs as fadst4x8_new_sse2, but the butterflies go through btf_16_sse2. Stage 1 reorders the inputs and negates four of them, stages 2, 4 and 6 are butterflies, stages 3 and 5 are saturating 16-bit add/sub pairs, and stage 7 stores to output[0..7] in permuted order. NOTE(review): __rounding and cos_bit are never passed explicitly here; presumably btf_16_sse2 is a macro that picks them up by name - confirm against its definition. */ static void fadst8x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + + // stage 1 + __m128i x1[8]; + x1[0] = input[0]; + x1[1] = _mm_subs_epi16(__zero, input[7]); + x1[2] = _mm_subs_epi16(__zero, input[3]); + x1[3] = input[4]; + x1[4] = _mm_subs_epi16(__zero,
input[1]); + x1[5] = input[6]; + x1[6] = input[2]; + x1[7] = _mm_subs_epi16(__zero, input[5]); + + // stage 2 + __m128i x2[8]; + x2[0] = x1[0]; + x2[1] = x1[1]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]); + x2[4] = x1[4]; + x2[5] = x1[5]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]); + + // stage 3 + __m128i x3[8]; + x3[0] = _mm_adds_epi16(x2[0], x2[2]); + x3[2] = _mm_subs_epi16(x2[0], x2[2]); + x3[1] = _mm_adds_epi16(x2[1], x2[3]); + x3[3] = _mm_subs_epi16(x2[1], x2[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[6]); + x3[6] = _mm_subs_epi16(x2[4], x2[6]); + x3[5] = _mm_adds_epi16(x2[5], x2[7]); + x3[7] = _mm_subs_epi16(x2[5], x2[7]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]); + + // stage 5 + __m128i x5[8]; + x5[0] = _mm_adds_epi16(x4[0], x4[4]); + x5[4] = _mm_subs_epi16(x4[0], x4[4]); + x5[1] = _mm_adds_epi16(x4[1], x4[5]); + x5[5] = _mm_subs_epi16(x4[1], x4[5]); + x5[2] = _mm_adds_epi16(x4[2], x4[6]); + x5[6] = _mm_subs_epi16(x4[2], x4[6]); + x5[3] = _mm_adds_epi16(x4[3], x4[7]); + x5[7] = _mm_subs_epi16(x4[3], x4[7]); + + // stage 6 + __m128i x6[8]; + btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x5[0], x5[1], x6[0], x6[1]); + btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x5[2], x5[3], x6[2], x6[3]); + btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x5[4], x5[5], x6[4], x6[5]); + btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x5[6], x5[7], x6[6], x6[7]); + + // stage 7 + output[0] = x6[1]; + output[1] = x6[6]; + output[2] = x6[3]; + output[3] = x6[4]; + output[4] = x6[5]; + output[5] = x6[2]; + output[6] = x6[7]; + output[7] = x6[0]; +} + +/* Forward ADST kernel (per its name) over input[0..15], written as nine stages. Stage 1 reorders the inputs and negates half of them by saturating subtraction from zero. Stages 2, 4, 6 and 8 apply btf_16_sse2 butterflies with cospi_arr(cos_bit) constant pairs. Stages 3, 5 and 7 are saturating 16-bit add/sub pairs. Stage 9 stores to output[0..15] in permuted order. NOTE(review): as in fadst8x8_new_sse2, __rounding and cos_bit are presumably consumed implicitly by the btf_16_sse2 macro - confirm. */ static void fadst8x16_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __zero = _mm_setzero_si128(); + const
__m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); + __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); + __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); + __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); + __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); + __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); + + // stage 1 + __m128i x1[16]; + x1[0] = input[0]; + x1[1] = _mm_subs_epi16(__zero, input[15]); + x1[2] = _mm_subs_epi16(__zero, input[7]); + x1[3] = input[8]; + x1[4] = _mm_subs_epi16(__zero, 
input[3]); + x1[5] = input[12]; + x1[6] = input[4]; + x1[7] = _mm_subs_epi16(__zero, input[11]); + x1[8] = _mm_subs_epi16(__zero, input[1]); + x1[9] = input[14]; + x1[10] = input[6]; + x1[11] = _mm_subs_epi16(__zero, input[9]); + x1[12] = input[2]; + x1[13] = _mm_subs_epi16(__zero, input[13]); + x1[14] = _mm_subs_epi16(__zero, input[5]); + x1[15] = input[10]; + + // stage 2 + __m128i x2[16]; + x2[0] = x1[0]; + x2[1] = x1[1]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]); + x2[4] = x1[4]; + x2[5] = x1[5]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]); + x2[8] = x1[8]; + x2[9] = x1[9]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[10], x1[11], x2[10], x2[11]); + x2[12] = x1[12]; + x2[13] = x1[13]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[14], x1[15], x2[14], x2[15]); + + // stage 3 + __m128i x3[16]; + x3[0] = _mm_adds_epi16(x2[0], x2[2]); + x3[2] = _mm_subs_epi16(x2[0], x2[2]); + x3[1] = _mm_adds_epi16(x2[1], x2[3]); + x3[3] = _mm_subs_epi16(x2[1], x2[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[6]); + x3[6] = _mm_subs_epi16(x2[4], x2[6]); + x3[5] = _mm_adds_epi16(x2[5], x2[7]); + x3[7] = _mm_subs_epi16(x2[5], x2[7]); + x3[8] = _mm_adds_epi16(x2[8], x2[10]); + x3[10] = _mm_subs_epi16(x2[8], x2[10]); + x3[9] = _mm_adds_epi16(x2[9], x2[11]); + x3[11] = _mm_subs_epi16(x2[9], x2[11]); + x3[12] = _mm_adds_epi16(x2[12], x2[14]); + x3[14] = _mm_subs_epi16(x2[12], x2[14]); + x3[13] = _mm_adds_epi16(x2[13], x2[15]); + x3[15] = _mm_subs_epi16(x2[13], x2[15]); + + // stage 4 + __m128i x4[16]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]); + x4[8] = x3[8]; + x4[9] = x3[9]; + x4[10] = x3[10]; + x4[11] = x3[11]; + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[12], x3[13], x4[12], x4[13]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[14], x3[15], x4[14], x4[15]); 
+ + // stage 5 + __m128i x5[16]; + x5[0] = _mm_adds_epi16(x4[0], x4[4]); + x5[4] = _mm_subs_epi16(x4[0], x4[4]); + x5[1] = _mm_adds_epi16(x4[1], x4[5]); + x5[5] = _mm_subs_epi16(x4[1], x4[5]); + x5[2] = _mm_adds_epi16(x4[2], x4[6]); + x5[6] = _mm_subs_epi16(x4[2], x4[6]); + x5[3] = _mm_adds_epi16(x4[3], x4[7]); + x5[7] = _mm_subs_epi16(x4[3], x4[7]); + x5[8] = _mm_adds_epi16(x4[8], x4[12]); + x5[12] = _mm_subs_epi16(x4[8], x4[12]); + x5[9] = _mm_adds_epi16(x4[9], x4[13]); + x5[13] = _mm_subs_epi16(x4[9], x4[13]); + x5[10] = _mm_adds_epi16(x4[10], x4[14]); + x5[14] = _mm_subs_epi16(x4[10], x4[14]); + x5[11] = _mm_adds_epi16(x4[11], x4[15]); + x5[15] = _mm_subs_epi16(x4[11], x4[15]); + + // stage 6 + __m128i x6[16]; + x6[0] = x5[0]; + x6[1] = x5[1]; + x6[2] = x5[2]; + x6[3] = x5[3]; + x6[4] = x5[4]; + x6[5] = x5[5]; + x6[6] = x5[6]; + x6[7] = x5[7]; + btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x5[8], x5[9], x6[8], x6[9]); + btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x5[10], x5[11], x6[10], x6[11]); + btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x5[12], x5[13], x6[12], x6[13]); + btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x5[14], x5[15], x6[14], x6[15]); + + // stage 7 + __m128i x7[16]; + x7[0] = _mm_adds_epi16(x6[0], x6[8]); + x7[8] = _mm_subs_epi16(x6[0], x6[8]); + x7[1] = _mm_adds_epi16(x6[1], x6[9]); + x7[9] = _mm_subs_epi16(x6[1], x6[9]); + x7[2] = _mm_adds_epi16(x6[2], x6[10]); + x7[10] = _mm_subs_epi16(x6[2], x6[10]); + x7[3] = _mm_adds_epi16(x6[3], x6[11]); + x7[11] = _mm_subs_epi16(x6[3], x6[11]); + x7[4] = _mm_adds_epi16(x6[4], x6[12]); + x7[12] = _mm_subs_epi16(x6[4], x6[12]); + x7[5] = _mm_adds_epi16(x6[5], x6[13]); + x7[13] = _mm_subs_epi16(x6[5], x6[13]); + x7[6] = _mm_adds_epi16(x6[6], x6[14]); + x7[14] = _mm_subs_epi16(x6[6], x6[14]); + x7[7] = _mm_adds_epi16(x6[7], x6[15]); + x7[15] = _mm_subs_epi16(x6[7], x6[15]); + + // stage 8 + __m128i x8[16]; + btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x7[0], x7[1], x8[0], x8[1]); + btf_16_sse2(cospi_p10_p54, 
cospi_p54_m10, x7[2], x7[3], x8[2], x8[3]); + btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x7[4], x7[5], x8[4], x8[5]); + btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x7[6], x7[7], x8[6], x8[7]); + btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x7[8], x7[9], x8[8], x8[9]); + btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x7[10], x7[11], x8[10], x8[11]); + btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x7[12], x7[13], x8[12], x8[13]); + btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x7[14], x7[15], x8[14], x8[15]); + + // stage 9 + output[0] = x8[1]; + output[1] = x8[14]; + output[2] = x8[3]; + output[3] = x8[12]; + output[4] = x8[5]; + output[5] = x8[10]; + output[6] = x8[7]; + output[7] = x8[8]; + output[8] = x8[9]; + output[9] = x8[6]; + output[10] = x8[11]; + output[11] = x8[4]; + output[12] = x8[13]; + output[13] = x8[2]; + output[14] = x8[15]; + output[15] = x8[0]; +} + +/* 1-D kernel table for the 4x4 column pass (per the col_ prefix), with TX_TYPES entries; the comment on each entry names the transform type that slot serves. For two-part type names the kernel here follows the first part (ADST_DCT selects fadst). */ static const transform_1d_sse2 col_txfm4x4_arr[TX_TYPES] = { + fdct4x4_new_sse2, // DCT_DCT + fadst4x4_new_sse2, // ADST_DCT + fdct4x4_new_sse2, // DCT_ADST + fadst4x4_new_sse2, // ADST_ADST + fadst4x4_new_sse2, // FLIPADST_DCT + fdct4x4_new_sse2, // DCT_FLIPADST + fadst4x4_new_sse2, // FLIPADST_FLIPADST + fadst4x4_new_sse2, // ADST_FLIPADST + fadst4x4_new_sse2, // FLIPADST_ADST + fidentity4x4_new_sse2, // IDTX + fdct4x4_new_sse2, // V_DCT + fidentity4x4_new_sse2, // H_DCT + fadst4x4_new_sse2, // V_ADST + fidentity4x4_new_sse2, // H_ADST + fadst4x4_new_sse2, // V_FLIPADST + fidentity4x4_new_sse2 // H_FLIPADST +}; + +/* 1-D kernel table for the 4x4 row pass (per the row_ prefix), with TX_TYPES entries. For two-part type names the kernel here follows the second part (ADST_DCT selects fdct). */ static const transform_1d_sse2 row_txfm4x4_arr[TX_TYPES] = { + fdct4x4_new_sse2, // DCT_DCT + fdct4x4_new_sse2, // ADST_DCT + fadst4x4_new_sse2, // DCT_ADST + fadst4x4_new_sse2, // ADST_ADST + fdct4x4_new_sse2, // FLIPADST_DCT + fadst4x4_new_sse2, // DCT_FLIPADST + fadst4x4_new_sse2, // FLIPADST_FLIPADST + fadst4x4_new_sse2, // ADST_FLIPADST + fadst4x4_new_sse2, // FLIPADST_ADST + fidentity4x4_new_sse2, // IDTX + fidentity4x4_new_sse2, // V_DCT + fdct4x4_new_sse2, // H_DCT + fidentity4x4_new_sse2, // V_ADST
+ fadst4x4_new_sse2, // H_ADST + fidentity4x4_new_sse2, // V_FLIPADST + fadst4x4_new_sse2 // H_FLIPADST +}; + +/* Column-pass kernel table using the 4x8 kernels, with TX_TYPES entries. Its identity slots use fidentity8x8_new_sse2; no 4x8-specific identity kernel is referenced in this table. */ static const transform_1d_sse2 col_txfm4x8_arr[TX_TYPES] = { + fdct4x8_new_sse2, // DCT_DCT + fadst4x8_new_sse2, // ADST_DCT + fdct4x8_new_sse2, // DCT_ADST + fadst4x8_new_sse2, // ADST_ADST + fadst4x8_new_sse2, // FLIPADST_DCT + fdct4x8_new_sse2, // DCT_FLIPADST + fadst4x8_new_sse2, // FLIPADST_FLIPADST + fadst4x8_new_sse2, // ADST_FLIPADST + fadst4x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fdct4x8_new_sse2, // V_DCT + fidentity8x8_new_sse2, // H_DCT + fadst4x8_new_sse2, // V_ADST + fidentity8x8_new_sse2, // H_ADST + fadst4x8_new_sse2, // V_FLIPADST + fidentity8x8_new_sse2 // H_FLIPADST +}; + +/* Row-pass kernel table using the 8x4 kernels, with TX_TYPES entries; the kernel follows the second part of each two-part type name. */ static const transform_1d_sse2 row_txfm8x4_arr[TX_TYPES] = { + fdct8x4_new_sse2, // DCT_DCT + fdct8x4_new_sse2, // ADST_DCT + fadst8x4_new_sse2, // DCT_ADST + fadst8x4_new_sse2, // ADST_ADST + fdct8x4_new_sse2, // FLIPADST_DCT + fadst8x4_new_sse2, // DCT_FLIPADST + fadst8x4_new_sse2, // FLIPADST_FLIPADST + fadst8x4_new_sse2, // ADST_FLIPADST + fadst8x4_new_sse2, // FLIPADST_ADST + fidentity8x4_new_sse2, // IDTX + fidentity8x4_new_sse2, // V_DCT + fdct8x4_new_sse2, // H_DCT + fidentity8x4_new_sse2, // V_ADST + fadst8x4_new_sse2, // H_ADST + fidentity8x4_new_sse2, // V_FLIPADST + fadst8x4_new_sse2 // H_FLIPADST +}; + +/* Column-pass kernel table using the 8x4 kernels, with TX_TYPES entries; the kernel follows the first part of each two-part type name. */ static const transform_1d_sse2 col_txfm8x4_arr[TX_TYPES] = { + fdct8x4_new_sse2, // DCT_DCT + fadst8x4_new_sse2, // ADST_DCT + fdct8x4_new_sse2, // DCT_ADST + fadst8x4_new_sse2, // ADST_ADST + fadst8x4_new_sse2, // FLIPADST_DCT + fdct8x4_new_sse2, // DCT_FLIPADST + fadst8x4_new_sse2, // FLIPADST_FLIPADST + fadst8x4_new_sse2, // ADST_FLIPADST + fadst8x4_new_sse2, // FLIPADST_ADST + fidentity8x4_new_sse2, // IDTX + fdct8x4_new_sse2, // V_DCT + fidentity8x4_new_sse2, // H_DCT + fadst8x4_new_sse2, // V_ADST + fidentity8x4_new_sse2, // H_ADST + fadst8x4_new_sse2, // V_FLIPADST + fidentity8x4_new_sse2 // H_FLIPADST +}; + +/* Row-pass kernel table using the 4x8 kernels, with TX_TYPES entries. As in col_txfm4x8_arr, the identity slots use fidentity8x8_new_sse2. */ static const
transform_1d_sse2 row_txfm4x8_arr[TX_TYPES] = { + fdct4x8_new_sse2, // DCT_DCT + fdct4x8_new_sse2, // ADST_DCT + fadst4x8_new_sse2, // DCT_ADST + fadst4x8_new_sse2, // ADST_ADST + fdct4x8_new_sse2, // FLIPADST_DCT + fadst4x8_new_sse2, // DCT_FLIPADST + fadst4x8_new_sse2, // FLIPADST_FLIPADST + fadst4x8_new_sse2, // ADST_FLIPADST + fadst4x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fidentity8x8_new_sse2, // V_DCT + fdct4x8_new_sse2, // H_DCT + fidentity8x8_new_sse2, // V_ADST + fadst4x8_new_sse2, // H_ADST + fidentity8x8_new_sse2, // V_FLIPADST + fadst4x8_new_sse2 // H_FLIPADST +}; + +/* Column-pass kernel table using the 8x8 kernels, with TX_TYPES entries; the kernel follows the first part of each two-part type name. */ static const transform_1d_sse2 col_txfm8x8_arr[TX_TYPES] = { + fdct8x8_new_sse2, // DCT_DCT + fadst8x8_new_sse2, // ADST_DCT + fdct8x8_new_sse2, // DCT_ADST + fadst8x8_new_sse2, // ADST_ADST + fadst8x8_new_sse2, // FLIPADST_DCT + fdct8x8_new_sse2, // DCT_FLIPADST + fadst8x8_new_sse2, // FLIPADST_FLIPADST + fadst8x8_new_sse2, // ADST_FLIPADST + fadst8x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fdct8x8_new_sse2, // V_DCT + fidentity8x8_new_sse2, // H_DCT + fadst8x8_new_sse2, // V_ADST + fidentity8x8_new_sse2, // H_ADST + fadst8x8_new_sse2, // V_FLIPADST + fidentity8x8_new_sse2, // H_FLIPADST +}; + +/* Row-pass kernel table using the 8x8 kernels, with TX_TYPES entries; the kernel follows the second part of each two-part type name. */ static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = { + fdct8x8_new_sse2, // DCT_DCT + fdct8x8_new_sse2, // ADST_DCT + fadst8x8_new_sse2, // DCT_ADST + fadst8x8_new_sse2, // ADST_ADST + fdct8x8_new_sse2, // FLIPADST_DCT + fadst8x8_new_sse2, // DCT_FLIPADST + fadst8x8_new_sse2, // FLIPADST_FLIPADST + fadst8x8_new_sse2, // ADST_FLIPADST + fadst8x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fidentity8x8_new_sse2, // V_DCT + fdct8x8_new_sse2, // H_DCT + fidentity8x8_new_sse2, // V_ADST + fadst8x8_new_sse2, // H_ADST + fidentity8x8_new_sse2, // V_FLIPADST + fadst8x8_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = { + fdct8x16_new_sse2, // DCT_DCT + fadst8x16_new_sse2, // ADST_DCT +
fdct8x16_new_sse2, // DCT_ADST + fadst8x16_new_sse2, // ADST_ADST + fadst8x16_new_sse2, // FLIPADST_DCT + fdct8x16_new_sse2, // DCT_FLIPADST + fadst8x16_new_sse2, // FLIPADST_FLIPADST + fadst8x16_new_sse2, // ADST_FLIPADST + fadst8x16_new_sse2, // FLIPADST_ADST + fidentity8x16_new_sse2, // IDTX + fdct8x16_new_sse2, // V_DCT + fidentity8x16_new_sse2, // H_DCT + fadst8x16_new_sse2, // V_ADST + fidentity8x16_new_sse2, // H_ADST + fadst8x16_new_sse2, // V_FLIPADST + fidentity8x16_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm8x16_arr[TX_TYPES] = { + fdct8x16_new_sse2, // DCT_DCT + fdct8x16_new_sse2, // ADST_DCT + fadst8x16_new_sse2, // DCT_ADST + fadst8x16_new_sse2, // ADST_ADST + fdct8x16_new_sse2, // FLIPADST_DCT + fadst8x16_new_sse2, // DCT_FLIPADST + fadst8x16_new_sse2, // FLIPADST_FLIPADST + fadst8x16_new_sse2, // ADST_FLIPADST + fadst8x16_new_sse2, // FLIPADST_ADST + fidentity8x16_new_sse2, // IDTX + fidentity8x16_new_sse2, // V_DCT + fdct8x16_new_sse2, // H_DCT + fidentity8x16_new_sse2, // V_ADST + fadst8x16_new_sse2, // H_ADST + fidentity8x16_new_sse2, // V_FLIPADST + fadst8x16_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm8x32_arr[TX_TYPES] = { + fdct8x32_new_sse2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity8x32_new_sse2, // IDTX + fidentity8x32_new_sse2, // V_DCT + fdct8x32_new_sse2, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[4], buf1[4], *buf; + const int8_t *shift = fwd_txfm_shift_ls[TX_4X4]; + const int txw_idx = get_txw_idx(TX_4X4); + const int txh_idx = get_txh_idx(TX_4X4); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int 
cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 4; + const int height = 4; + const transform_1d_sse2 col_txfm = col_txfm4x4_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm4x4_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_w4(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_4x4(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_4x4(buf, buf); + store_buffer_16bit_to_32bit_w4(buf, output, width, height); +} + +void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)stride; + (void)bd; + __m128i buf0[8], buf1[8], *buf; + const int8_t *shift = fwd_txfm_shift_ls[TX_4X8]; + const int txw_idx = get_txw_idx(TX_4X8); + const int txh_idx = get_txh_idx(TX_4X8); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 4; + const int height = 8; + const transform_1d_sse2 col_txfm = col_txfm4x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_w4(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_4x8(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + 
row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x4(buf, buf); + store_rect_buffer_16bit_to_32bit_w4(buf, output, width, height); +} + +void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = fwd_txfm_shift_ls[TX_4X16]; + const int txw_idx = get_txw_idx(TX_4X16); + const int txh_idx = get_txh_idx(TX_4X16); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 4; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_w4(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_4x8(buf0, buf1); + transpose_16bit_4x8(buf0 + 8, buf1 + 8); + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + 8 * i, buf, width); + } else { + buf = buf1 + 8 * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x4(buf, buf); + store_buffer_16bit_to_32bit_w4(buf, output + 8 * width * i, width, 8); + } +} + +void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[8], buf1[8], *buf; + const int8_t *shift = fwd_txfm_shift_ls[TX_8X4]; + const int txw_idx = get_txw_idx(TX_8X4); + const int txh_idx = get_txh_idx(TX_8X4); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const 
int width = 8; + const int height = 4; + const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm4x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + else + load_buffer_16bit_to_16bit(input, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height); +} + +void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[8], buf1[8], *buf; + const int8_t *shift = fwd_txfm_shift_ls[TX_8X8]; + const int txw_idx = get_txw_idx(TX_8X8); + const int txh_idx = get_txh_idx(TX_8X8); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 8; + const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + else + load_buffer_16bit_to_16bit(input, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, 
buf); + store_buffer_16bit_to_32bit_w8(buf, output, width, height); +} + +void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = fwd_txfm_shift_ls[TX_8X16]; + const int txw_idx = get_txw_idx(TX_8X16); + const int txh_idx = get_txh_idx(TX_8X16); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + transpose_16bit_8x8(buf0 + 8, buf1 + 8); + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8); + } +} + +void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[32]; + const int8_t *shift = fwd_txfm_shift_ls[TX_8X32]; + const int txw_idx = get_txw_idx(TX_8X32); + const int txh_idx = get_txh_idx(TX_8X32); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 32; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; 
+ const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + transpose_16bit_8x8(buf0 + 8, buf1 + 8); + transpose_16bit_8x8(buf0 + 16, buf1 + 16); + transpose_16bit_8x8(buf0 + 24, buf1 + 24); + + for (int i = 0; i < 4; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8); + } +} + +void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = fwd_txfm_shift_ls[TX_16X4]; + const int txw_idx = get_txw_idx(TX_16X4); + const int txh_idx = get_txh_idx(TX_16X4); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 4; + const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + __m128i *buf; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x4(buf0, buf1 + 8 * 
i); + } + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_4x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output, width, height); + transpose_16bit_4x8(buf + 8, buf + 8); + store_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height); +} + +void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = fwd_txfm_shift_ls[TX_16X8]; + const int txw_idx = get_txw_idx(TX_16X8); + const int txh_idx = get_txh_idx(TX_16X8); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 8; + const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + __m128i *buf; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1 + 8 * i); + } + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height); + transpose_16bit_8x8(buf + 8, buf + 8); + store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height); +} + +void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], 
buf1[32]; + const int8_t *shift = fwd_txfm_shift_ls[TX_16X16]; + const int txw_idx = get_txw_idx(TX_16X16); + const int txh_idx = get_txh_idx(TX_16X16); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i); + transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i); + } + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8); + transpose_16bit_8x8(buf + 8, buf + 8); + store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width, + 8); + } +} + +void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[64]; + const int8_t *shift = fwd_txfm_shift_ls[TX_16X32]; + const int txw_idx = get_txw_idx(TX_16X32); + const int txh_idx = get_txh_idx(TX_16X32); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 32; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; + 
const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + + if (col_txfm != NULL && row_txfm != NULL) { + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i); + transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i); + transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i); + transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i); + } + + for (int i = 0; i < 4; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, + 8); + transpose_16bit_8x8(buf + 8, buf + 8); + store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, + width, 8); + } + } else { + av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd); + } +} + +void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[32]; + const int8_t *shift = fwd_txfm_shift_ls[TX_32X8]; + const int txw_idx = get_txw_idx(TX_32X8); + const int txh_idx = get_txh_idx(TX_32X8); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 32; + const int height = 8; + const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type]; + + if (col_txfm != NULL && row_txfm != NULL) { + int 
ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 4; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i); + } + + for (int i = 0; i < 1; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, + height); + transpose_16bit_8x8(buf + 8, buf + 8); + store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width, + height); + transpose_16bit_8x8(buf + 16, buf + 16); + store_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16, + width, height); + transpose_16bit_8x8(buf + 24, buf + 24); + store_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24, + width, height); + } + } else { + av1_fwd_txfm2d_32x8_c(input, output, stride, tx_type, bd); // C fallback must be the 32x8 variant + } +} + +void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[64]; + const int8_t *shift = fwd_txfm_shift_ls[TX_32X16]; + const int txw_idx = get_txw_idx(TX_32X16); + const int txh_idx = get_txh_idx(TX_32X16); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 32; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type]; + + if (col_txfm != NULL && row_txfm != NULL) { + int ud_flip, lr_flip; + get_flip_cfg(tx_type,
&ud_flip, &lr_flip); + + for (int i = 0; i < 4; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i); + transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i); + } + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, + 8); + transpose_16bit_8x8(buf + 8, buf + 8); + store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, + width, 8); + transpose_16bit_8x8(buf + 16, buf + 16); + store_rect_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16, + width, 8); + transpose_16bit_8x8(buf + 24, buf + 24); + store_rect_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24, + width, 8); + } + } else { + av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd); + } +} + +void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[128]; + const int8_t *shift = fwd_txfm_shift_ls[TX_32X32]; + const int txw_idx = get_txw_idx(TX_32X32); + const int txh_idx = get_txh_idx(TX_32X32); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 32; + const int height = 32; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type]; + + if (col_txfm != NULL && row_txfm != NULL) { + int ud_flip, lr_flip; + 
get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 4; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i); + transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i); + transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i); + transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i); + } + + for (int i = 0; i < 4; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8); + transpose_16bit_8x8(buf + 8, buf + 8); + store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width, + 8); + transpose_16bit_8x8(buf + 16, buf + 16); + store_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16, + width, 8); + transpose_16bit_8x8(buf + 24, buf + 24); + store_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24, + width, 8); + } + } else { + av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd); + } +} + +void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X16; + __m128i buf0[64], buf1[128]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; 
+ const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = fdct8x16_new_sse2; + const transform_1d_sse2 row_txfm = fdct8x64_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < height_div8; ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + + for (int i = 0; i < height_div8; i++) { + __m128i *buf = buf1 + width * i; + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + int32_t *output8 = output + 8 * 32 * i; + for (int j = 0; j < 4; ++j) { + __m128i *buf8 = buf + 8 * j; + transpose_16bit_8x8(buf8, buf8); + store_buffer_16bit_to_32bit_w8(buf8, output8 + 8 * j, 32, 8); + } + } +} + +void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_16X64; + __m128i buf0[64], buf1[128]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = fdct8x64_new_sse2; + const transform_1d_sse2 row_txfm = fdct8x16_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; 
j < height_div8; ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i *buf = buf1 + width * i; + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + int32_t *output8 = output + 8 * width * i; + for (int j = 0; j < width_div8; ++j) { + __m128i *buf8 = buf + 8 * j; + transpose_16bit_8x8(buf8, buf8); + store_buffer_16bit_to_32bit_w8(buf8, output8 + 8 * j, width, 8); + } + } + // Zero out the bottom 16x32 area. + memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output)); +} + +static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { + av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform + av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform + av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform + av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform + NULL, // 64x64 transform + av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform + av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform + av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform + av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform + av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform + av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform + av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform + av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform + av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform + av1_lowbd_fwd_txfm2d_16x64_sse2, // 16x64 transform + av1_lowbd_fwd_txfm2d_64x16_sse2, // 64x16 transform +}; + +void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size]; + + if ((fwd_txfm2d_func == NULL) || + (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) + av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); + else + fwd_txfm2d_func(src_diff, coeff, diff_stride, 
txfm_param->tx_type, + txfm_param->bd); +} diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h new file mode 100644 index 000000000..aa14d3ade --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_ +#define AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_ + +#include <immintrin.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/transpose_sse2.h" +#include "aom_dsp/x86/txfm_common_sse2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void fdct8x32_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit); +void fdct8x64_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit); + +static INLINE void fidentity4x4_new_sse2(const __m128i *const input, + __m128i *const output, + const int8_t cos_bit) { + (void)cos_bit; + const __m128i one = _mm_set1_epi16(1); + + for (int i = 0; i < 4; ++i) { + const __m128i a = _mm_unpacklo_epi16(input[i], one); + const __m128i b = scale_round_sse2(a, NewSqrt2); + output[i] = _mm_packs_epi32(b, b); + } +} + +static INLINE void fidentity8x4_new_sse2(const __m128i *const input, + __m128i *const output, + const int8_t cos_bit) { + (void)cos_bit; + const __m128i one = _mm_set1_epi16(1); + + for (int i = 0; i < 4; ++i) { + const __m128i a_lo = _mm_unpacklo_epi16(input[i], one); + const __m128i a_hi =
_mm_unpackhi_epi16(input[i], one); + const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2); + const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2); + output[i] = _mm_packs_epi32(b_lo, b_hi); + } +} + +static INLINE void fidentity8x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + + output[0] = _mm_adds_epi16(input[0], input[0]); + output[1] = _mm_adds_epi16(input[1], input[1]); + output[2] = _mm_adds_epi16(input[2], input[2]); + output[3] = _mm_adds_epi16(input[3], input[3]); + output[4] = _mm_adds_epi16(input[4], input[4]); + output[5] = _mm_adds_epi16(input[5], input[5]); + output[6] = _mm_adds_epi16(input[6], input[6]); + output[7] = _mm_adds_epi16(input[7], input[7]); +} + +static INLINE void fidentity8x16_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const __m128i one = _mm_set1_epi16(1); + + for (int i = 0; i < 16; ++i) { + const __m128i a_lo = _mm_unpacklo_epi16(input[i], one); + const __m128i a_hi = _mm_unpackhi_epi16(input[i], one); + const __m128i b_lo = scale_round_sse2(a_lo, 2 * NewSqrt2); + const __m128i b_hi = scale_round_sse2(a_hi, 2 * NewSqrt2); + output[i] = _mm_packs_epi32(b_lo, b_hi); + } +} + +static INLINE void fidentity8x32_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + for (int i = 0; i < 32; ++i) { + output[i] = _mm_slli_epi16(input[i], 2); + } +} + +static const transform_1d_sse2 col_txfm8x32_arr[TX_TYPES] = { + fdct8x32_new_sse2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity8x32_new_sse2, // IDTX + fdct8x32_new_sse2, // V_DCT + fidentity8x32_new_sse2, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +#ifdef __cplusplus +} +#endif + +#endif // AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_ diff --git 
a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c index c8d4ccb70..b58911fcb 100644 --- a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c +++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c @@ -11,7 +11,8 @@ #include <immintrin.h> -#include "./av1_rtcd.h" +#include "config/av1_rtcd.h" + #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" @@ -32,7 +33,10 @@ static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *dequant_ptr, int log_scale, __m256i *qp) { __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); - round = _mm_srai_epi16(round, log_scale); + if (log_scale) { + const __m128i round_scale = _mm_set1_epi16(1 << (15 - log_scale)); + round = _mm_mulhrs_epi16(round, round_scale); + } const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr); const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr); @@ -45,8 +49,8 @@ static INLINE void quantize(const __m256i *qp, __m256i *c, const int16_t *iscan_ptr, int log_scale, tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) { - const __m256i abs = _mm256_abs_epi32(*c); - __m256i q = _mm256_add_epi32(abs, qp[0]); + const __m256i abs_coeff = _mm256_abs_epi32(*c); + __m256i q = _mm256_add_epi32(abs_coeff, qp[0]); __m256i q_lo = _mm256_mul_epi32(q, qp[1]); __m256i q_hi = _mm256_srli_epi64(q, 32); @@ -56,6 +60,9 @@ static INLINE void quantize(const __m256i *qp, __m256i *c, q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale); q_hi = _mm256_slli_epi64(q_hi, 32); q = _mm256_or_si256(q_lo, q_hi); + const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale); + const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s); + q = _mm256_andnot_si256(mask, q); __m256i dq = _mm256_mullo_epi32(q, qp[2]); dq = _mm256_srai_epi32(dq, log_scale); @@ -81,8 +88,8 @@ static INLINE void quantize(const __m256i *qp, __m256i *c, } void av1_highbd_quantize_fp_avx2( - const tran_low_t 
*coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale) { @@ -90,14 +97,23 @@ void av1_highbd_quantize_fp_avx2( (void)zbin_ptr; (void)quant_shift_ptr; const unsigned int step = 8; + __m256i qp[3], coeff; - if (LIKELY(!skip_block)) { - __m256i qp[3], coeff; + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, qp); + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); - init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, qp); - coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + __m256i eob = _mm256_setzero_si256(); + quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; - __m256i eob = _mm256_setzero_si256(); + update_qp(qp); + while (n_coeffs > 0) { + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob); coeff_ptr += step; @@ -105,39 +121,17 @@ void av1_highbd_quantize_fp_avx2( dqcoeff_ptr += step; iscan += step; n_coeffs -= step; - - update_qp(qp); - while (n_coeffs > 0) { - coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); - quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob); - - coeff_ptr += step; - qcoeff_ptr += step; - dqcoeff_ptr += step; - iscan += step; - n_coeffs -= step; - } - { - __m256i eob_s; - eob_s = _mm256_shuffle_epi32(eob, 0xe); - eob = _mm256_max_epi16(eob, eob_s); - eob_s = _mm256_shufflelo_epi16(eob, 0xe); - eob = _mm256_max_epi16(eob, eob_s); - eob_s = _mm256_shufflelo_epi16(eob, 1); - eob = _mm256_max_epi16(eob, eob_s); - const 
__m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), - _mm256_extractf128_si256(eob, 1)); - *eob_ptr = _mm_extract_epi16(final_eob, 0); - } - } else { - do { - const __m256i zero = _mm256_setzero_si256(); - _mm256_storeu_si256((__m256i *)qcoeff_ptr, zero); - _mm256_storeu_si256((__m256i *)dqcoeff_ptr, zero); - qcoeff_ptr += step; - dqcoeff_ptr += step; - n_coeffs -= step; - } while (n_coeffs > 0); - *eob_ptr = 0; + } + { + __m256i eob_s; + eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); + const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), + _mm256_extractf128_si256(eob, 1)); + *eob_ptr = _mm_extract_epi16(final_eob, 0); } } diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c index 8d717a083..40b3b460b 100644 --- a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c +++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c @@ -12,8 +12,10 @@ #include <smmintrin.h> #include <stdint.h> -#include "./av1_rtcd.h" +#include "config/av1_rtcd.h" + #include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/x86/synonyms.h" // Coefficient quantization phase 1 // param[0-2] : rounding/quan/dequan constants @@ -36,6 +38,8 @@ static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param, qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift); dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]); dquan[0] = _mm_srli_epi64(dquan[0], scale); + const __m128i abs_s = _mm_slli_epi32(*coeff, 1 + scale); + qcoeff[2] = _mm_cmplt_epi32(abs_s, param[3]); } // Coefficient quantization phase 2 @@ -70,7 +74,8 @@ static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan, qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign); dquan[0] = _mm_sign_epi32(dquan[0], *sign); - + qcoeff[0] = 
_mm_andnot_si128(qcoeff[2], qcoeff[0]); + dquan[0] = _mm_andnot_si128(qcoeff[2], dquan[0]); _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]); _mm_storeu_si128((__m128i *)dqAddr, dquan[0]); } @@ -108,12 +113,12 @@ static INLINE uint16_t get_accumulated_eob(__m128i *eob) { } void av1_highbd_quantize_fp_sse4_1( - const tran_low_t *coeff_ptr, intptr_t count, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale) { - __m128i coeff[2], qcoeff[2], dequant[2], qparam[3], coeff_sign; + __m128i coeff[2], qcoeff[3], dequant[2], qparam[4], coeff_sign; __m128i eob = _mm_setzero_si128(); const tran_low_t *src = coeff_ptr; tran_low_t *quanAddr = qcoeff_ptr; @@ -121,7 +126,6 @@ void av1_highbd_quantize_fp_sse4_1( const int shift = 16 - log_scale; const int coeff_stride = 4; const int quan_stride = coeff_stride; - (void)skip_block; (void)zbin_ptr; (void)quant_shift_ptr; (void)scan; @@ -129,29 +133,54 @@ void av1_highbd_quantize_fp_sse4_1( memset(quanAddr, 0, count * sizeof(quanAddr[0])); memset(dquanAddr, 0, count * sizeof(dquanAddr[0])); - if (!skip_block) { - coeff[0] = _mm_loadu_si128((__m128i const *)src); + coeff[0] = _mm_loadu_si128((__m128i const *)src); + const int round1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale); + const int round0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale); + + qparam[0] = _mm_set_epi32(round1, round1, round1, round0); + qparam[1] = xx_set_64_from_32i(quant_ptr[1], quant_ptr[0]); + qparam[2] = xx_set_64_from_32i(dequant_ptr[1], dequant_ptr[0]); + qparam[3] = _mm_set_epi32(dequant_ptr[1], dequant_ptr[1], dequant_ptr[1], + dequant_ptr[0]); + + // DC and first 3 AC + quantize_coeff_phase1(&coeff[0], qparam, 
shift, log_scale, qcoeff, dequant, + &coeff_sign); + + // update round/quan/dquan for AC + qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]); + qparam[1] = xx_set1_64_from_32i(quant_ptr[1]); + qparam[2] = xx_set1_64_from_32i(dequant_ptr[1]); + qparam[3] = _mm_set1_epi32(dequant_ptr[1]); + quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale, + quanAddr, dquanAddr); + + // next 4 AC + coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); + quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant, + &coeff_sign); + quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale, + quanAddr + quan_stride, dquanAddr + quan_stride); + + find_eob(quanAddr, iscan, &eob); + + count -= 8; + + // loop for the rest of AC + while (count > 0) { + src += coeff_stride << 1; + quanAddr += quan_stride << 1; + dquanAddr += quan_stride << 1; + iscan += quan_stride << 1; - qparam[0] = - _mm_set_epi32(round_ptr[1] >> log_scale, round_ptr[1] >> log_scale, - round_ptr[1] >> log_scale, round_ptr[0] >> log_scale); - qparam[1] = _mm_set_epi32(0, quant_ptr[1], 0, quant_ptr[0]); - qparam[2] = _mm_set_epi32(0, dequant_ptr[1], 0, dequant_ptr[0]); + coeff[0] = _mm_loadu_si128((__m128i const *)src); + coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); - // DC and first 3 AC quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant, &coeff_sign); - - // update round/quan/dquan for AC - qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]); - qparam[1] = _mm_set_epi32(0, quant_ptr[1], 0, quant_ptr[1]); - qparam[2] = _mm_set_epi32(0, dequant_ptr[1], 0, dequant_ptr[1]); - quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale, quanAddr, dquanAddr); - // next 4 AC - coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant, &coeff_sign); quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, 
shift, @@ -161,34 +190,6 @@ void av1_highbd_quantize_fp_sse4_1( find_eob(quanAddr, iscan, &eob); count -= 8; - - // loop for the rest of AC - while (count > 0) { - src += coeff_stride << 1; - quanAddr += quan_stride << 1; - dquanAddr += quan_stride << 1; - iscan += quan_stride << 1; - - coeff[0] = _mm_loadu_si128((__m128i const *)src); - coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); - - quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, - dequant, &coeff_sign); - quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, - log_scale, quanAddr, dquanAddr); - - quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, - dequant, &coeff_sign); - quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, - log_scale, quanAddr + quan_stride, - dquanAddr + quan_stride); - - find_eob(quanAddr, iscan, &eob); - - count -= 8; - } - *eob_ptr = get_accumulated_eob(&eob); - } else { - *eob_ptr = 0; } + *eob_ptr = get_accumulated_eob(&eob); } diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c index 078a67510..df22aaba7 100644 --- a/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c +++ b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c @@ -11,7 +11,8 @@ #include <immintrin.h> -#include "./av1_rtcd.h" +#include "config/av1_rtcd.h" + #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" @@ -57,7 +58,7 @@ static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr, init_one_qp(&round, &qp[0]); init_one_qp(&quant, &qp[1]); - if (log_scale > 0) { + if (log_scale == 1) { qp[1] = _mm256_slli_epi16(qp[1], log_scale); } @@ -94,16 +95,25 @@ static INLINE void update_qp(int log_scale, __m256i *thr, __m256i *qp) { } \ } while (0) +static INLINE uint16_t quant_gather_eob(__m256i eob) { + const __m128i eob_lo = _mm256_castsi256_si128(eob); + const __m128i eob_hi = _mm256_extractf128_si256(eob, 1); + __m128i eob_s = _mm_max_epi16(eob_lo, eob_hi); 
+ eob_s = _mm_subs_epu16(_mm_set1_epi16(INT16_MAX), eob_s); + eob_s = _mm_minpos_epu16(eob_s); + return INT16_MAX - _mm_extract_epi16(eob_s, 0); +} + static INLINE void quantize(const __m256i *thr, const __m256i *qp, __m256i *c, const int16_t *iscan_ptr, tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) { - const __m256i abs = _mm256_abs_epi16(*c); - __m256i mask = _mm256_cmpgt_epi16(abs, *thr); - mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs, *thr)); + const __m256i abs_coeff = _mm256_abs_epi16(*c); + __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr)); const int nzflag = _mm256_movemask_epi8(mask); if (nzflag) { - __m256i q = _mm256_adds_epi16(abs, qp[0]); + __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]); q = _mm256_mulhi_epi16(q, qp[1]); q = _mm256_sign_epi16(q, *c); const __m256i dq = _mm256_mullo_epi16(q, qp[2]); @@ -123,8 +133,8 @@ static INLINE void quantize(const __m256i *thr, const __m256i *qp, __m256i *c, } void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, @@ -134,15 +144,26 @@ void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, (void)quant_shift_ptr; const unsigned int step = 16; - if (LIKELY(!skip_block)) { - __m256i qp[3]; - __m256i coeff, thr; - const int log_scale = 0; + __m256i qp[3]; + __m256i coeff, thr; + const int log_scale = 0; - init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); - read_coeff(coeff_ptr, &coeff); + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); + read_coeff(coeff_ptr, &coeff); + + __m256i eob = _mm256_setzero_si256(); + quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, 
dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + + update_qp(log_scale, &thr, qp); - __m256i eob = _mm256_setzero_si256(); + while (n_coeffs > 0) { + read_coeff(coeff_ptr, &coeff); quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); coeff_ptr += step; @@ -150,54 +171,21 @@ void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dqcoeff_ptr += step; iscan_ptr += step; n_coeffs -= step; - - update_qp(log_scale, &thr, qp); - - while (n_coeffs > 0) { - read_coeff(coeff_ptr, &coeff); - quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); - - coeff_ptr += step; - qcoeff_ptr += step; - dqcoeff_ptr += step; - iscan_ptr += step; - n_coeffs -= step; - } - { - __m256i eob_s; - eob_s = _mm256_shuffle_epi32(eob, 0xe); - eob = _mm256_max_epi16(eob, eob_s); - eob_s = _mm256_shufflelo_epi16(eob, 0xe); - eob = _mm256_max_epi16(eob, eob_s); - eob_s = _mm256_shufflelo_epi16(eob, 1); - eob = _mm256_max_epi16(eob, eob_s); - const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), - _mm256_extractf128_si256(eob, 1)); - *eob_ptr = _mm_extract_epi16(final_eob, 0); - } - } else { - do { - write_zero(qcoeff_ptr); - write_zero(dqcoeff_ptr); - qcoeff_ptr += step; - dqcoeff_ptr += step; - n_coeffs -= step; - } while (n_coeffs > 0); - *eob_ptr = 0; } + *eob_ptr = quant_gather_eob(eob); } static INLINE void quantize_32x32(const __m256i *thr, const __m256i *qp, __m256i *c, const int16_t *iscan_ptr, tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) { - const __m256i abs = _mm256_abs_epi16(*c); - __m256i mask = _mm256_cmpgt_epi16(abs, *thr); - mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs, *thr)); + const __m256i abs_coeff = _mm256_abs_epi16(*c); + __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr)); const int nzflag = _mm256_movemask_epi8(mask); if (nzflag) { - __m256i q = 
_mm256_adds_epi16(abs, qp[0]); + __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]); q = _mm256_mulhi_epu16(q, qp[1]); __m256i dq = _mm256_mullo_epi16(q, qp[2]); @@ -221,8 +209,8 @@ static INLINE void quantize_32x32(const __m256i *thr, const __m256i *qp, } void av1_quantize_fp_32x32_avx2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { @@ -231,15 +219,26 @@ void av1_quantize_fp_32x32_avx2( (void)quant_shift_ptr; const unsigned int step = 16; - if (LIKELY(!skip_block)) { - __m256i qp[3]; - __m256i coeff, thr; - const int log_scale = 1; + __m256i qp[3]; + __m256i coeff, thr; + const int log_scale = 1; - init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); - read_coeff(coeff_ptr, &coeff); + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); + read_coeff(coeff_ptr, &coeff); - __m256i eob = _mm256_setzero_si256(); + __m256i eob = _mm256_setzero_si256(); + quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + + update_qp(log_scale, &thr, qp); + + while (n_coeffs > 0) { + read_coeff(coeff_ptr, &coeff); quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); coeff_ptr += step; @@ -247,40 +246,85 @@ void av1_quantize_fp_32x32_avx2( dqcoeff_ptr += step; iscan_ptr += step; n_coeffs -= step; + } + *eob_ptr = quant_gather_eob(eob); +} + +static INLINE void quantize_64x64(const __m256i *thr, const __m256i *qp, + __m256i *c, const int16_t *iscan_ptr, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + __m256i *eob) 
{ + const __m256i abs_coeff = _mm256_abs_epi16(*c); + __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr)); + const int nzflag = _mm256_movemask_epi8(mask); - update_qp(log_scale, &thr, qp); - - while (n_coeffs > 0) { - read_coeff(coeff_ptr, &coeff); - quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, - &eob); - - coeff_ptr += step; - qcoeff_ptr += step; - dqcoeff_ptr += step; - iscan_ptr += step; - n_coeffs -= step; - } - { - __m256i eob_s; - eob_s = _mm256_shuffle_epi32(eob, 0xe); - eob = _mm256_max_epi16(eob, eob_s); - eob_s = _mm256_shufflelo_epi16(eob, 0xe); - eob = _mm256_max_epi16(eob, eob_s); - eob_s = _mm256_shufflelo_epi16(eob, 1); - eob = _mm256_max_epi16(eob, eob_s); - const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), - _mm256_extractf128_si256(eob, 1)); - *eob_ptr = _mm_extract_epi16(final_eob, 0); - } + if (nzflag) { + __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]); + __m256i qh = _mm256_mulhi_epi16(q, qp[1]); + __m256i ql = _mm256_mullo_epi16(q, qp[1]); + qh = _mm256_slli_epi16(qh, 2); + ql = _mm256_srli_epi16(ql, 14); + q = _mm256_or_si256(qh, ql); + const __m256i dqh = _mm256_slli_epi16(_mm256_mulhi_epi16(q, qp[2]), 14); + const __m256i dql = _mm256_srli_epi16(_mm256_mullo_epi16(q, qp[2]), 2); + __m256i dq = _mm256_or_si256(dqh, dql); + + q = _mm256_sign_epi16(q, *c); + dq = _mm256_sign_epi16(dq, *c); + + store_two_quan(q, qcoeff, dq, dqcoeff); + const __m256i zero = _mm256_setzero_si256(); + const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr); + const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero); + const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero); + __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff); + cur_eob = _mm256_and_si256(cur_eob, nzero_coeff); + *eob = _mm256_max_epi16(*eob, cur_eob); } else { - do { - write_zero(qcoeff_ptr); - write_zero(dqcoeff_ptr); - qcoeff_ptr += step; - dqcoeff_ptr += 
step; - n_coeffs -= step; - } while (n_coeffs > 0); - *eob_ptr = 0; + write_zero(qcoeff); + write_zero(dqcoeff); + } +} + +void av1_quantize_fp_64x64_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + (void)scan_ptr; + (void)zbin_ptr; + (void)quant_shift_ptr; + const unsigned int step = 16; + + __m256i qp[3]; + __m256i coeff, thr; + const int log_scale = 2; + + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); + read_coeff(coeff_ptr, &coeff); + + __m256i eob = _mm256_setzero_si256(); + quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + + update_qp(log_scale, &thr, qp); + + while (n_coeffs > 0) { + read_coeff(coeff_ptr, &coeff); + quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; } + *eob_ptr = quant_gather_eob(eob); } diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c index 4f7c09546..b07e7717f 100644 --- a/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c +++ b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c @@ -12,7 +12,8 @@ #include <emmintrin.h> #include <xmmintrin.h> -#include "./av1_rtcd.h" +#include "config/av1_rtcd.h" + #include "aom/aom_integer.h" static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset, @@ -67,16 +68,80 @@ static INLINE void write_zero(tran_low_t *qcoeff, intptr_t offset) { } } +static INLINE void quantize(const int16_t *iscan_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, 
+ const __m128i *round0, const __m128i *round1, + const __m128i *quant0, const __m128i *quant1, + const __m128i *dequant0, const __m128i *dequant1, + const __m128i *thr0, const __m128i *thr1, + __m128i *eob) { + __m128i coeff0, coeff1; + // Do DC and first 15 AC + read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1); + + // Poor man's sign extract + const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15); + const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15); + __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + const __m128i mask0 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff0, *thr0), + _mm_cmpeq_epi16(qcoeff0, *thr0)); + const __m128i mask1 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff1, *thr1), + _mm_cmpeq_epi16(qcoeff1, *thr1)); + const int16_t nzflag = _mm_movemask_epi8(mask0) | _mm_movemask_epi8(mask1); + + if (nzflag) { + qcoeff0 = _mm_adds_epi16(qcoeff0, *round0); + qcoeff1 = _mm_adds_epi16(qcoeff1, *round1); + const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0); + const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1); + + // Reinsert signs + qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs); + + coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0); + coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1); + + write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs); + + const __m128i zero = _mm_setzero_si128(); + // Scan for eob + const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + const __m128i iscan0 = + _mm_load_si128((const __m128i 
*)(iscan_ptr + n_coeffs)); + const __m128i iscan1 = + _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0); + const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1); + const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0); + const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1); + const __m128i eob2 = _mm_max_epi16(eob0, eob1); + *eob = _mm_max_epi16(*eob, eob2); + } else { + write_zero(qcoeff_ptr, n_coeffs); + write_zero(dqcoeff_ptr, n_coeffs); + } +} + void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { - __m128i zero; - __m128i thr; - int16_t nzflag; (void)scan_ptr; (void)zbin_ptr; (void)quant_shift_ptr; @@ -86,167 +151,39 @@ void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, qcoeff_ptr += n_coeffs; dqcoeff_ptr += n_coeffs; n_coeffs = -n_coeffs; - zero = _mm_setzero_si128(); - - if (!skip_block) { - __m128i eob; - __m128i round, quant, dequant; - { - __m128i coeff0, coeff1; - - // Setup global values - { - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - } - - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - // Do DC and first 15 AC - read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = 
_mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs); - } - - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); - } - n_coeffs += 8 * 2; - } - - thr = _mm_srai_epi16(dequant, 1); - - // AC only loop - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - - read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - 
coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | - _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); - - if (nzflag) { - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs); - } else { - write_zero(qcoeff_ptr, n_coeffs); - write_zero(dqcoeff_ptr, n_coeffs); - } - } - - if (nzflag) { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); - eob = _mm_max_epi16(eob, eob0); - } - n_coeffs += 8 * 2; - } - - // Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = 
_mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); - } - } else { - do { - write_zero(dqcoeff_ptr, n_coeffs); - write_zero(qcoeff_ptr, n_coeffs); - n_coeffs += 8 * 2; - } while (n_coeffs < 0); - *eob_ptr = 0; + + const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr); + const __m128i round1 = _mm_unpackhi_epi64(round0, round0); + const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr); + const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0); + const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr); + const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0); + const __m128i thr0 = _mm_srai_epi16(dequant0, 1); + const __m128i thr1 = _mm_srai_epi16(dequant1, 1); + __m128i eob = _mm_setzero_si128(); + + quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0, + &round1, &quant0, &quant1, &dequant0, &dequant1, &thr0, &thr1, &eob); + + n_coeffs += 8 * 2; + + // AC only loop + while (n_coeffs < 0) { + quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1, + &round1, &quant1, &quant1, &dequant1, &dequant1, &thr1, &thr1, + &eob); + n_coeffs += 8 * 2; + } + + // Accumulate EOB + { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + *eob_ptr = _mm_extract_epi16(eob, 1); } } diff --git a/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm index dcc697ba3..faa2a232a 100644 --- a/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm +++ b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm @@ 
-47,6 +47,9 @@ paddd %1, xmm1 SUM_ACROSS_Q %1 %endmacro + +SECTION .text + ;void ssim_parms_sse2( ; unsigned char *s, ; int sp, diff --git a/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h new file mode 100644 index 000000000..0adefecdb --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_TXMF1D_SSE2_H_ +#define AV1_TXMF1D_SSE2_H_ + +#include <smmintrin.h> +#include "av1/common/av1_txfm.h" +#include "av1/common/x86/av1_txfm_sse4.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_fdct4_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_fdct8_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, + int8_t cos_bit); +void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output, + int8_t cos_bit); + +void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_fadst8_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_fadst16_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +
+void av1_idct4_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_idct8_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_idct16_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_idct32_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_idct64_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); + +void av1_iadst4_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_iadst8_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_iadst16_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +static INLINE void transpose_32_4x4(int stride, const __m128i *input, + __m128i *output) { + __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]); + __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]); + __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]); + __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]); + + output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2); + output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2); + output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3); + output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3); +} + +// the entire input block can be represented by a grid of 4x4 blocks +// each 4x4 block can be represented by 4 vertical __m128i +// we first transpose each 4x4 block internally +// then transpose the grid +static INLINE void transpose_32(int txfm_size, const __m128i *input, + __m128i *output) { + const int num_per_128 = 4; + const int row_size = txfm_size; + const int col_size = txfm_size / num_per_128; + int r,
c; + + // transpose each 4x4 block internally + for (r = 0; r < row_size; r += 4) { + for (c = 0; c < col_size; c++) { + transpose_32_4x4(col_size, &input[r * col_size + c], + &output[c * 4 * col_size + r / 4]); + } + } +} + +// out0 = in0*w0 + in1*w1 +// out1 = -in1*w0 + in0*w1 +#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \ + do { \ + const __m128i ww0 = _mm_set1_epi32(w0); \ + const __m128i ww1 = _mm_set1_epi32(w1); \ + const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0); \ + const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1); \ + out0 = _mm_add_epi32(in0_w0, in1_w1); \ + out0 = av1_round_shift_32_sse4_1(out0, bit); \ + const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1); \ + const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0); \ + out1 = _mm_sub_epi32(in0_w1, in1_w0); \ + out1 = av1_round_shift_32_sse4_1(out1, bit); \ + } while (0) + +// out0 = in0*w0 + in1*w1 +// out1 = in1*w0 - in0*w1 +#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \ + do { \ + btf_32_sse4_1_type0(w1, w0, in1, in0, out0, out1, bit); \ + } while (0) + +// out0 = in0*w0 + in1*w1 +// out1 = -in1*w0 + in0*w1 +#define btf_32_type0_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \ + do { \ + const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0); \ + const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1); \ + out0 = _mm_add_epi32(in0_w0, in1_w1); \ + out0 = _mm_add_epi32(out0, r); \ + out0 = _mm_srai_epi32(out0, bit); \ + const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1); \ + const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0); \ + out1 = _mm_sub_epi32(in0_w1, in1_w0); \ + out1 = _mm_add_epi32(out1, r); \ + out1 = _mm_srai_epi32(out1, bit); \ + } while (0) + +// out0 = in0*w0 + in1*w1 +// out1 = in1*w0 - in0*w1 +#define btf_32_type1_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \ + do { \ + btf_32_type0_sse4_1_new(ww1, ww0, in1, in0, out0, out1, r, bit); \ + } while (0) + +#ifdef __cplusplus +} +#endif + +#endif // AV1_TXMF1D_SSE2_H_ diff --git 
a/third_party/aom/av1/encoder/x86/corner_match_sse4.c b/third_party/aom/av1/encoder/x86/corner_match_sse4.c index 179da0d28..381f757da 100644 --- a/third_party/aom/av1/encoder/x86/corner_match_sse4.c +++ b/third_party/aom/av1/encoder/x86/corner_match_sse4.c @@ -5,7 +5,8 @@ #include <smmintrin.h> -#include "./av1_rtcd.h" +#include "config/av1_rtcd.h" + #include "aom_ports/mem.h" #include "av1/encoder/corner_match.h" diff --git a/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c b/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c deleted file mode 100644 index e5b19a44c..000000000 --- a/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c +++ /dev/null @@ -1,3483 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */ - -#include <assert.h> -#include <emmintrin.h> // SSE2 - -#include "./aom_dsp_rtcd.h" -#include "./av1_rtcd.h" -#include "aom_dsp/txfm_common.h" -#include "aom_dsp/x86/fwd_txfm_sse2.h" -#include "aom_dsp/x86/synonyms.h" -#include "aom_dsp/x86/txfm_common_sse2.h" -#include "aom_ports/mem.h" - -static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in, - int stride, int flipud, int fliplr) { - const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); - const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); - __m128i mask; - - if (!flipud) { - in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); - in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); - } else { - in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); - in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); - in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - } - - if (fliplr) { - in[0] = _mm_shufflelo_epi16(in[0], 0x1b); - in[1] = _mm_shufflelo_epi16(in[1], 0x1b); - in[2] = _mm_shufflelo_epi16(in[2], 0x1b); - in[3] = _mm_shufflelo_epi16(in[3], 0x1b); - } - - in[0] = _mm_slli_epi16(in[0], 4); - in[1] = _mm_slli_epi16(in[1], 4); - in[2] = _mm_slli_epi16(in[2], 4); - in[3] = _mm_slli_epi16(in[3], 4); - - mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a); - in[0] = _mm_add_epi16(in[0], mask); - in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b); -} - -static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) { - const __m128i kOne = _mm_set1_epi16(1); - __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]); - __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]); - __m128i out01 = _mm_add_epi16(in01, kOne); - __m128i out23 = _mm_add_epi16(in23, kOne); - out01 = _mm_srai_epi16(out01, 2); - out23 = _mm_srai_epi16(out23, 2); -
store_output(&out01, (output + 0 * 8)); - store_output(&out23, (output + 1 * 8)); -} - -static INLINE void transpose_4x4(__m128i *res) { - // Combine and transpose - // 00 01 02 03 20 21 22 23 - // 10 11 12 13 30 31 32 33 - const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); - const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); - - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); - res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); - - // 00 10 20 30 01 11 21 31 - // 02 12 22 32 03 13 23 33 - // only use the first 4 16-bit integers - res[1] = _mm_unpackhi_epi64(res[0], res[0]); - res[3] = _mm_unpackhi_epi64(res[2], res[2]); -} - -static void fdct4_sse2(__m128i *in) { - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i u[4], v[4]; - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpacklo_epi16(in[3], in[2]); - - v[0] = _mm_add_epi16(u[0], u[1]); - v[1] = _mm_sub_epi16(u[0], u[1]); - - u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0 - u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2 - u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1 - u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3 - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[1]); - in[1] = _mm_packs_epi32(u[2], u[3]); - 
transpose_4x4(in); -} - -static void fadst4_sse2(__m128i *in) { - const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9); - const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9); - const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); - const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); - const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); - const __m128i kZero = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8]; - __m128i in7 = _mm_add_epi16(in[0], in[1]); - - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpacklo_epi16(in[2], in[3]); - u[2] = _mm_unpacklo_epi16(in7, kZero); - u[3] = _mm_unpacklo_epi16(in[2], kZero); - u[4] = _mm_unpacklo_epi16(in[3], kZero); - - v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2 - v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5 - v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1 - v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3 - v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6 - v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4 - v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03); - - u[0] = _mm_add_epi32(v[0], v[1]); - u[1] = _mm_sub_epi32(v[2], v[6]); - u[2] = _mm_add_epi32(v[3], v[4]); - u[3] = _mm_sub_epi32(u[2], u[0]); - u[4] = _mm_slli_epi32(v[5], 2); - u[5] = _mm_sub_epi32(u[4], v[5]); - u[6] = _mm_add_epi32(u[3], u[5]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[2]); - in[1] = _mm_packs_epi32(u[1], u[3]); - transpose_4x4(in); -} - -#if 
CONFIG_EXT_TX -static void fidtx4_sse2(__m128i *in) { - const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0); - const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i v0, v1, v2, v3; - __m128i u0, u1, u2, u3; - - v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16); - v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16); - v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16); - v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16); - - u0 = _mm_madd_epi16(v0, k__sqrt2_epi16); - u1 = _mm_madd_epi16(v1, k__sqrt2_epi16); - u2 = _mm_madd_epi16(v2, k__sqrt2_epi16); - u3 = _mm_madd_epi16(v3, k__sqrt2_epi16); - - v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u0, u2); - in[1] = _mm_packs_epi32(u1, u3); - transpose_4x4(in); -} -#endif // CONFIG_EXT_TX - -void av1_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in[4]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - switch (tx_type) { - case DCT_DCT: aom_fdct4x4_sse2(input, output, stride); break; - case ADST_DCT: - load_buffer_4x4(input, in, stride, 0, 0); - fadst4_sse2(in); - fdct4_sse2(in); - write_buffer_4x4(output, in); - break; - case DCT_ADST: - load_buffer_4x4(input, in, stride, 0, 0); - fdct4_sse2(in); - fadst4_sse2(in); - write_buffer_4x4(output, in); - break; - case ADST_ADST: - load_buffer_4x4(input, in, stride, 0, 0); - fadst4_sse2(in); - fadst4_sse2(in); - write_buffer_4x4(output, in); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - 
load_buffer_4x4(input, in, stride, 1, 0); - fadst4_sse2(in); - fdct4_sse2(in); - write_buffer_4x4(output, in); - break; - case DCT_FLIPADST: - load_buffer_4x4(input, in, stride, 0, 1); - fdct4_sse2(in); - fadst4_sse2(in); - write_buffer_4x4(output, in); - break; - case FLIPADST_FLIPADST: - load_buffer_4x4(input, in, stride, 1, 1); - fadst4_sse2(in); - fadst4_sse2(in); - write_buffer_4x4(output, in); - break; - case ADST_FLIPADST: - load_buffer_4x4(input, in, stride, 0, 1); - fadst4_sse2(in); - fadst4_sse2(in); - write_buffer_4x4(output, in); - break; - case FLIPADST_ADST: - load_buffer_4x4(input, in, stride, 1, 0); - fadst4_sse2(in); - fadst4_sse2(in); - write_buffer_4x4(output, in); - break; - case IDTX: - load_buffer_4x4(input, in, stride, 0, 0); - fidtx4_sse2(in); - fidtx4_sse2(in); - write_buffer_4x4(output, in); - break; - case V_DCT: - load_buffer_4x4(input, in, stride, 0, 0); - fdct4_sse2(in); - fidtx4_sse2(in); - write_buffer_4x4(output, in); - break; - case H_DCT: - load_buffer_4x4(input, in, stride, 0, 0); - fidtx4_sse2(in); - fdct4_sse2(in); - write_buffer_4x4(output, in); - break; - case V_ADST: - load_buffer_4x4(input, in, stride, 0, 0); - fadst4_sse2(in); - fidtx4_sse2(in); - write_buffer_4x4(output, in); - break; - case H_ADST: - load_buffer_4x4(input, in, stride, 0, 0); - fidtx4_sse2(in); - fadst4_sse2(in); - write_buffer_4x4(output, in); - break; - case V_FLIPADST: - load_buffer_4x4(input, in, stride, 1, 0); - fadst4_sse2(in); - fidtx4_sse2(in); - write_buffer_4x4(output, in); - break; - case H_FLIPADST: - load_buffer_4x4(input, in, stride, 0, 1); - fidtx4_sse2(in); - fadst4_sse2(in); - write_buffer_4x4(output, in); - break; -#endif // CONFIG_EXT_TX - default: assert(0); - } -} - -// load 8x8 array -static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in, - int stride, int flipud, int fliplr) { - if (!flipud) { - in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); - in[1] = _mm_load_si128((const __m128i *)(input + 1 * 
stride)); - in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); - in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); - in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); - in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); - in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); - in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); - } else { - in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride)); - in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride)); - in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride)); - in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride)); - in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride)); - in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride)); - in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride)); - in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride)); - } - - if (fliplr) { - in[0] = mm_reverse_epi16(in[0]); - in[1] = mm_reverse_epi16(in[1]); - in[2] = mm_reverse_epi16(in[2]); - in[3] = mm_reverse_epi16(in[3]); - in[4] = mm_reverse_epi16(in[4]); - in[5] = mm_reverse_epi16(in[5]); - in[6] = mm_reverse_epi16(in[6]); - in[7] = mm_reverse_epi16(in[7]); - } - - in[0] = _mm_slli_epi16(in[0], 2); - in[1] = _mm_slli_epi16(in[1], 2); - in[2] = _mm_slli_epi16(in[2], 2); - in[3] = _mm_slli_epi16(in[3], 2); - in[4] = _mm_slli_epi16(in[4], 2); - in[5] = _mm_slli_epi16(in[5], 2); - in[6] = _mm_slli_epi16(in[6], 2); - in[7] = _mm_slli_epi16(in[7], 2); -} - -// right shift and rounding -static INLINE void right_shift_8x8(__m128i *res, const int bit) { - __m128i sign0 = _mm_srai_epi16(res[0], 15); - __m128i sign1 = _mm_srai_epi16(res[1], 15); - __m128i sign2 = _mm_srai_epi16(res[2], 15); - __m128i sign3 = _mm_srai_epi16(res[3], 15); - __m128i sign4 = _mm_srai_epi16(res[4], 15); - __m128i sign5 = _mm_srai_epi16(res[5], 15); - __m128i sign6 = _mm_srai_epi16(res[6], 15); - __m128i sign7 = _mm_srai_epi16(res[7], 15); - - 
if (bit == 2) { - const __m128i const_rounding = _mm_set1_epi16(1); - res[0] = _mm_adds_epi16(res[0], const_rounding); - res[1] = _mm_adds_epi16(res[1], const_rounding); - res[2] = _mm_adds_epi16(res[2], const_rounding); - res[3] = _mm_adds_epi16(res[3], const_rounding); - res[4] = _mm_adds_epi16(res[4], const_rounding); - res[5] = _mm_adds_epi16(res[5], const_rounding); - res[6] = _mm_adds_epi16(res[6], const_rounding); - res[7] = _mm_adds_epi16(res[7], const_rounding); - } - - res[0] = _mm_sub_epi16(res[0], sign0); - res[1] = _mm_sub_epi16(res[1], sign1); - res[2] = _mm_sub_epi16(res[2], sign2); - res[3] = _mm_sub_epi16(res[3], sign3); - res[4] = _mm_sub_epi16(res[4], sign4); - res[5] = _mm_sub_epi16(res[5], sign5); - res[6] = _mm_sub_epi16(res[6], sign6); - res[7] = _mm_sub_epi16(res[7], sign7); - - if (bit == 1) { - res[0] = _mm_srai_epi16(res[0], 1); - res[1] = _mm_srai_epi16(res[1], 1); - res[2] = _mm_srai_epi16(res[2], 1); - res[3] = _mm_srai_epi16(res[3], 1); - res[4] = _mm_srai_epi16(res[4], 1); - res[5] = _mm_srai_epi16(res[5], 1); - res[6] = _mm_srai_epi16(res[6], 1); - res[7] = _mm_srai_epi16(res[7], 1); - } else { - res[0] = _mm_srai_epi16(res[0], 2); - res[1] = _mm_srai_epi16(res[1], 2); - res[2] = _mm_srai_epi16(res[2], 2); - res[3] = _mm_srai_epi16(res[3], 2); - res[4] = _mm_srai_epi16(res[4], 2); - res[5] = _mm_srai_epi16(res[5], 2); - res[6] = _mm_srai_epi16(res[6], 2); - res[7] = _mm_srai_epi16(res[7], 2); - } -} - -// write 8x8 array -static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res, - int stride) { - store_output(&res[0], (output + 0 * stride)); - store_output(&res[1], (output + 1 * stride)); - store_output(&res[2], (output + 2 * stride)); - store_output(&res[3], (output + 3 * stride)); - store_output(&res[4], (output + 4 * stride)); - store_output(&res[5], (output + 5 * stride)); - store_output(&res[6], (output + 6 * stride)); - store_output(&res[7], (output + 7 * stride)); -} - -// perform in-place transpose -static INLINE 
void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 44 54 45 55 46 56 47 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 25 35 - // 44 54 64 74 45 55 65 75 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 
56 66 76 - // 07 17 27 37 47 57 67 77 -} - -static void fdct8_sse2(__m128i *in) { - // constants - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u0, u1, u2, u3, u4, u5, u6, u7; - __m128i v0, v1, v2, v3, v4, v5, v6, v7; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - - // stage 1 - s0 = _mm_add_epi16(in[0], in[7]); - s1 = _mm_add_epi16(in[1], in[6]); - s2 = _mm_add_epi16(in[2], in[5]); - s3 = _mm_add_epi16(in[3], in[4]); - s4 = _mm_sub_epi16(in[3], in[4]); - s5 = _mm_sub_epi16(in[2], in[5]); - s6 = _mm_sub_epi16(in[1], in[6]); - s7 = _mm_sub_epi16(in[0], in[7]); - - u0 = _mm_add_epi16(s0, s3); - u1 = _mm_add_epi16(s1, s2); - u2 = _mm_sub_epi16(s1, s2); - u3 = _mm_sub_epi16(s0, s3); - // interleave and perform butterfly multiplication/addition - v0 = _mm_unpacklo_epi16(u0, u1); - v1 = _mm_unpackhi_epi16(u0, u1); - v2 = _mm_unpacklo_epi16(u2, u3); - v3 = _mm_unpackhi_epi16(u2, u3); - - u0 = _mm_madd_epi16(v0, k__cospi_p16_p16); - u1 = _mm_madd_epi16(v1, k__cospi_p16_p16); - u2 = _mm_madd_epi16(v0, k__cospi_p16_m16); - u3 = _mm_madd_epi16(v1, k__cospi_p16_m16); - u4 = _mm_madd_epi16(v2, k__cospi_p24_p08); - u5 = _mm_madd_epi16(v3, k__cospi_p24_p08); - u6 = _mm_madd_epi16(v2, k__cospi_m08_p24); - u7 = _mm_madd_epi16(v3, k__cospi_m08_p24); - - // shift and rounding - v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(u1, 
k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u0, u1); - in[2] = _mm_packs_epi32(u4, u5); - in[4] = _mm_packs_epi32(u2, u3); - in[6] = _mm_packs_epi32(u6, u7); - - // stage 2 - // interleave and perform butterfly multiplication/addition - u0 = _mm_unpacklo_epi16(s6, s5); - u1 = _mm_unpackhi_epi16(s6, s5); - v0 = _mm_madd_epi16(u0, k__cospi_p16_m16); - v1 = _mm_madd_epi16(u1, k__cospi_p16_m16); - v2 = _mm_madd_epi16(u0, k__cospi_p16_p16); - v3 = _mm_madd_epi16(u1, k__cospi_p16_p16); - - // shift and rounding - u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - - u0 = _mm_packs_epi32(v0, v1); - u1 = _mm_packs_epi32(v2, v3); - - // stage 3 - s0 = _mm_add_epi16(s4, u0); - s1 = _mm_sub_epi16(s4, u0); - s2 = _mm_sub_epi16(s7, u1); - s3 = _mm_add_epi16(s7, u1); - - // stage 4 - u0 = _mm_unpacklo_epi16(s0, s3); - u1 = _mm_unpackhi_epi16(s0, s3); - u2 = _mm_unpacklo_epi16(s1, s2); - u3 = _mm_unpackhi_epi16(s1, s2); - - v0 = _mm_madd_epi16(u0, k__cospi_p28_p04); - v1 = _mm_madd_epi16(u1, k__cospi_p28_p04); - v2 = 
_mm_madd_epi16(u2, k__cospi_p12_p20); - v3 = _mm_madd_epi16(u3, k__cospi_p12_p20); - v4 = _mm_madd_epi16(u2, k__cospi_m20_p12); - v5 = _mm_madd_epi16(u3, k__cospi_m20_p12); - v6 = _mm_madd_epi16(u0, k__cospi_m04_p28); - v7 = _mm_madd_epi16(u1, k__cospi_m04_p28); - - // shift and rounding - u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); - u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); - u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); - u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); - v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); - v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); - v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); - - in[1] = _mm_packs_epi32(v0, v1); - in[3] = _mm_packs_epi32(v4, v5); - in[5] = _mm_packs_epi32(v2, v3); - in[7] = _mm_packs_epi32(v6, v7); - - // transpose - array_transpose_8x8(in, in); -} - -static void fadst8_sse2(__m128i *in) { - // Constants - const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); - const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i 
k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__const_0 = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; - __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; - __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - - // properly aligned for butterfly input - in0 = in[7]; - in1 = in[0]; - in2 = in[5]; - in3 = in[2]; - in4 = in[3]; - in5 = in[4]; - in6 = in[1]; - in7 = in[6]; - - // column transformation - // stage 1 - // interleave and multiply/add into 32-bit integer - s0 = _mm_unpacklo_epi16(in0, in1); - s1 = _mm_unpackhi_epi16(in0, in1); - s2 = _mm_unpacklo_epi16(in2, in3); - s3 = _mm_unpackhi_epi16(in2, in3); - s4 = _mm_unpacklo_epi16(in4, in5); - s5 = _mm_unpackhi_epi16(in4, in5); - s6 = _mm_unpacklo_epi16(in6, in7); - s7 = _mm_unpackhi_epi16(in6, in7); - - u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); - u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); - u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); - u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); - u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); - u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); - u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); - u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); - u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); - u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); - u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); - u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); - u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); - u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); - u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); - u15 = 
_mm_madd_epi16(s7, k__cospi_p06_m26); - - // addition - w0 = _mm_add_epi32(u0, u8); - w1 = _mm_add_epi32(u1, u9); - w2 = _mm_add_epi32(u2, u10); - w3 = _mm_add_epi32(u3, u11); - w4 = _mm_add_epi32(u4, u12); - w5 = _mm_add_epi32(u5, u13); - w6 = _mm_add_epi32(u6, u14); - w7 = _mm_add_epi32(u7, u15); - w8 = _mm_sub_epi32(u0, u8); - w9 = _mm_sub_epi32(u1, u9); - w10 = _mm_sub_epi32(u2, u10); - w11 = _mm_sub_epi32(u3, u11); - w12 = _mm_sub_epi32(u4, u12); - w13 = _mm_sub_epi32(u5, u13); - w14 = _mm_sub_epi32(u6, u14); - w15 = _mm_sub_epi32(u7, u15); - - // shift and rounding - v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); - v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); - v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); - v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); - v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); - v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); - v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); - v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); - - u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); - u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); - u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); - u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); - u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); - u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); - u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); - u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); - - // back to 16-bit and pack 8 integers into __m128i - v0 = _mm_add_epi32(w0, w4); - v1 = _mm_add_epi32(w1, w5); - v2 = _mm_add_epi32(w2, w6); - v3 = _mm_add_epi32(w3, w7); - v4 = _mm_sub_epi32(w0, w4); - v5 = _mm_sub_epi32(w1, w5); - v6 = _mm_sub_epi32(w2, w6); - v7 = _mm_sub_epi32(w3, w7); - - w0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - w1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - w2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - w3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - w4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); - w5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); - w6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); - w7 = 
_mm_add_epi32(v7, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(w0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(w1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(w2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(w3, DCT_CONST_BITS); - v4 = _mm_srai_epi32(w4, DCT_CONST_BITS); - v5 = _mm_srai_epi32(w5, DCT_CONST_BITS); - v6 = _mm_srai_epi32(w6, DCT_CONST_BITS); - v7 = _mm_srai_epi32(w7, DCT_CONST_BITS); - - in[4] = _mm_packs_epi32(u8, u9); - in[5] = _mm_packs_epi32(u10, u11); - in[6] = _mm_packs_epi32(u12, u13); - in[7] = _mm_packs_epi32(u14, u15); - - // stage 2 - s0 = _mm_packs_epi32(v0, v1); - s1 = _mm_packs_epi32(v2, v3); - s2 = _mm_packs_epi32(v4, v5); - s3 = _mm_packs_epi32(v6, v7); - - u0 = _mm_unpacklo_epi16(in[4], in[5]); - u1 = _mm_unpackhi_epi16(in[4], in[5]); - u2 = _mm_unpacklo_epi16(in[6], in[7]); - u3 = _mm_unpackhi_epi16(in[6], in[7]); - - v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); - v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); - v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); - v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); - v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); - v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); - v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); - v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); - - w0 = _mm_add_epi32(v0, v4); - w1 = _mm_add_epi32(v1, v5); - w2 = _mm_add_epi32(v2, v6); - w3 = _mm_add_epi32(v3, v7); - w4 = _mm_sub_epi32(v0, v4); - w5 = _mm_sub_epi32(v1, v5); - w6 = _mm_sub_epi32(v2, v6); - w7 = _mm_sub_epi32(v3, v7); - - v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, 
DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - - // back to 16-bit intergers - s4 = _mm_packs_epi32(u0, u1); - s5 = _mm_packs_epi32(u2, u3); - s6 = _mm_packs_epi32(u4, u5); - s7 = _mm_packs_epi32(u6, u7); - - // stage 3 - u0 = _mm_unpacklo_epi16(s2, s3); - u1 = _mm_unpackhi_epi16(s2, s3); - u2 = _mm_unpacklo_epi16(s6, s7); - u3 = _mm_unpackhi_epi16(s6, s7); - - v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); - v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); - v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); - v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); - v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); - v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); - v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); - v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); - - u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); - u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); - u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); - u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); - v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); - v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); - v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); - - s2 = _mm_packs_epi32(v0, v1); - s3 = _mm_packs_epi32(v2, v3); - s6 = _mm_packs_epi32(v4, v5); - s7 = _mm_packs_epi32(v6, v7); - - // FIXME(jingning): do subtract using bit inversion? 
- in[0] = s0; - in[1] = _mm_sub_epi16(k__const_0, s4); - in[2] = s6; - in[3] = _mm_sub_epi16(k__const_0, s2); - in[4] = s3; - in[5] = _mm_sub_epi16(k__const_0, s7); - in[6] = s5; - in[7] = _mm_sub_epi16(k__const_0, s1); - - // transpose - array_transpose_8x8(in, in); -} - -#if CONFIG_EXT_TX -static void fidtx8_sse2(__m128i *in) { - in[0] = _mm_slli_epi16(in[0], 1); - in[1] = _mm_slli_epi16(in[1], 1); - in[2] = _mm_slli_epi16(in[2], 1); - in[3] = _mm_slli_epi16(in[3], 1); - in[4] = _mm_slli_epi16(in[4], 1); - in[5] = _mm_slli_epi16(in[5], 1); - in[6] = _mm_slli_epi16(in[6], 1); - in[7] = _mm_slli_epi16(in[7], 1); - - array_transpose_8x8(in, in); -} -#endif // CONFIG_EXT_TX - -void av1_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in[8]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - switch (tx_type) { - case DCT_DCT: aom_fdct8x8_sse2(input, output, stride); break; - case ADST_DCT: - load_buffer_8x8(input, in, stride, 0, 0); - fadst8_sse2(in); - fdct8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case DCT_ADST: - load_buffer_8x8(input, in, stride, 0, 0); - fdct8_sse2(in); - fadst8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case ADST_ADST: - load_buffer_8x8(input, in, stride, 0, 0); - fadst8_sse2(in); - fadst8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_8x8(input, in, stride, 1, 0); - fadst8_sse2(in); - fdct8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case DCT_FLIPADST: - load_buffer_8x8(input, in, stride, 0, 1); - fdct8_sse2(in); - fadst8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case FLIPADST_FLIPADST: - load_buffer_8x8(input, in, stride, 1, 1); - fadst8_sse2(in); - 
fadst8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case ADST_FLIPADST: - load_buffer_8x8(input, in, stride, 0, 1); - fadst8_sse2(in); - fadst8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case FLIPADST_ADST: - load_buffer_8x8(input, in, stride, 1, 0); - fadst8_sse2(in); - fadst8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case IDTX: - load_buffer_8x8(input, in, stride, 0, 0); - fidtx8_sse2(in); - fidtx8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case V_DCT: - load_buffer_8x8(input, in, stride, 0, 0); - fdct8_sse2(in); - fidtx8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case H_DCT: - load_buffer_8x8(input, in, stride, 0, 0); - fidtx8_sse2(in); - fdct8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case V_ADST: - load_buffer_8x8(input, in, stride, 0, 0); - fadst8_sse2(in); - fidtx8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case H_ADST: - load_buffer_8x8(input, in, stride, 0, 0); - fidtx8_sse2(in); - fadst8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case V_FLIPADST: - load_buffer_8x8(input, in, stride, 1, 0); - fadst8_sse2(in); - fidtx8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; - case H_FLIPADST: - load_buffer_8x8(input, in, stride, 0, 1); - fidtx8_sse2(in); - fadst8_sse2(in); - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); - break; -#endif // CONFIG_EXT_TX - default: assert(0); - } -} - -static INLINE void load_buffer_16x16(const int16_t *input, __m128i *in0, - __m128i *in1, int stride, int flipud, - int fliplr) { - // Load 4 8x8 blocks - const int16_t *topL = input; - const int16_t *topR = input + 8; - const int16_t *botL = input + 8 * stride; - const int16_t *botR = input + 8 * stride + 8; - - const int16_t *tmp; - 
- if (flipud) { - // Swap left columns - tmp = topL; - topL = botL; - botL = tmp; - // Swap right columns - tmp = topR; - topR = botR; - botR = tmp; - } - - if (fliplr) { - // Swap top rows - tmp = topL; - topL = topR; - topR = tmp; - // Swap bottom rows - tmp = botL; - botL = botR; - botR = tmp; - } - - // load first 8 columns - load_buffer_8x8(topL, in0, stride, flipud, fliplr); - load_buffer_8x8(botL, in0 + 8, stride, flipud, fliplr); - - // load second 8 columns - load_buffer_8x8(topR, in1, stride, flipud, fliplr); - load_buffer_8x8(botR, in1 + 8, stride, flipud, fliplr); -} - -static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0, - __m128i *in1, int stride) { - // write first 8 columns - write_buffer_8x8(output, in0, stride); - write_buffer_8x8(output + 8 * stride, in0 + 8, stride); - // write second 8 columns - output += 8; - write_buffer_8x8(output, in1, stride); - write_buffer_8x8(output + 8 * stride, in1 + 8, stride); -} - -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; -} - -static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { - // perform rounding operations - right_shift_8x8(res0, 2); - right_shift_8x8(res0 + 8, 2); - right_shift_8x8(res1, 2); - right_shift_8x8(res1 + 8, 2); -} - -static void fdct16_8col(__m128i *in) { - // perform 16x16 1-D DCT for 8 columns - __m128i i[8], s[8], p[8], t[8], u[16], v[16]; - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i 
k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); - const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); - const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); - const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); - const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); - const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); - const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); - const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - // stage 1 - i[0] = _mm_add_epi16(in[0], in[15]); - i[1] = _mm_add_epi16(in[1], in[14]); - i[2] = _mm_add_epi16(in[2], in[13]); - i[3] = _mm_add_epi16(in[3], in[12]); - i[4] = _mm_add_epi16(in[4], in[11]); - i[5] = _mm_add_epi16(in[5], in[10]); - i[6] = _mm_add_epi16(in[6], in[9]); - i[7] = _mm_add_epi16(in[7], in[8]); - - s[0] = _mm_sub_epi16(in[7], in[8]); - s[1] = _mm_sub_epi16(in[6], in[9]); - s[2] = _mm_sub_epi16(in[5], in[10]); - s[3] = _mm_sub_epi16(in[4], in[11]); - s[4] = _mm_sub_epi16(in[3], in[12]); - s[5] = _mm_sub_epi16(in[2], in[13]); - s[6] = _mm_sub_epi16(in[1], in[14]); - s[7] = _mm_sub_epi16(in[0], in[15]); - - p[0] = _mm_add_epi16(i[0], i[7]); - p[1] = _mm_add_epi16(i[1], i[6]); - p[2] = _mm_add_epi16(i[2], i[5]); - p[3] = _mm_add_epi16(i[3], i[4]); - 
p[4] = _mm_sub_epi16(i[3], i[4]); - p[5] = _mm_sub_epi16(i[2], i[5]); - p[6] = _mm_sub_epi16(i[1], i[6]); - p[7] = _mm_sub_epi16(i[0], i[7]); - - u[0] = _mm_add_epi16(p[0], p[3]); - u[1] = _mm_add_epi16(p[1], p[2]); - u[2] = _mm_sub_epi16(p[1], p[2]); - u[3] = _mm_sub_epi16(p[0], p[3]); - - v[0] = _mm_unpacklo_epi16(u[0], u[1]); - v[1] = _mm_unpackhi_epi16(u[0], u[1]); - v[2] = _mm_unpacklo_epi16(u[2], u[3]); - v[3] = _mm_unpackhi_epi16(u[2], u[3]); - - u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); - u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16); - u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16); - u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16); - u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08); - u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08); - u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24); - u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[1]); - in[4] = _mm_packs_epi32(u[4], u[5]); - in[8] = _mm_packs_epi32(u[2], u[3]); - in[12] = _mm_packs_epi32(u[6], u[7]); - - u[0] = _mm_unpacklo_epi16(p[5], p[6]); - u[1] = _mm_unpackhi_epi16(p[5], p[6]); - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] 
= _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - - u[0] = _mm_packs_epi32(v[0], v[1]); - u[1] = _mm_packs_epi32(v[2], v[3]); - - t[0] = _mm_add_epi16(p[4], u[0]); - t[1] = _mm_sub_epi16(p[4], u[0]); - t[2] = _mm_sub_epi16(p[7], u[1]); - t[3] = _mm_add_epi16(p[7], u[1]); - - u[0] = _mm_unpacklo_epi16(t[0], t[3]); - u[1] = _mm_unpackhi_epi16(t[0], t[3]); - u[2] = _mm_unpacklo_epi16(t[1], t[2]); - u[3] = _mm_unpackhi_epi16(t[1], t[2]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04); - v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04); - v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20); - v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20); - v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12); - v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12); - v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28); - v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = 
_mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - in[2] = _mm_packs_epi32(v[0], v[1]); - in[6] = _mm_packs_epi32(v[4], v[5]); - in[10] = _mm_packs_epi32(v[2], v[3]); - in[14] = _mm_packs_epi32(v[6], v[7]); - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[2], s[5]); - u[1] = _mm_unpackhi_epi16(s[2], s[5]); - u[2] = _mm_unpacklo_epi16(s[3], s[4]); - u[3] = _mm_unpackhi_epi16(s[3], s[4]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - t[2] = _mm_packs_epi32(v[0], v[1]); - t[3] = _mm_packs_epi32(v[2], v[3]); - t[4] = _mm_packs_epi32(v[4], v[5]); - t[5] = _mm_packs_epi32(v[6], v[7]); - - // stage 3 - p[0] = _mm_add_epi16(s[0], t[3]); - p[1] = _mm_add_epi16(s[1], t[2]); - p[2] = _mm_sub_epi16(s[1], t[2]); - p[3] = _mm_sub_epi16(s[0], t[3]); - p[4] = _mm_sub_epi16(s[7], t[4]); - p[5] = _mm_sub_epi16(s[6], 
t[5]); - p[6] = _mm_add_epi16(s[6], t[5]); - p[7] = _mm_add_epi16(s[7], t[4]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(p[1], p[6]); - u[1] = _mm_unpackhi_epi16(p[1], p[6]); - u[2] = _mm_unpacklo_epi16(p[2], p[5]); - u[3] = _mm_unpackhi_epi16(p[2], p[5]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24); - v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24); - v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08); - v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08); - v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24); - v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24); - v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08); - v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - t[1] = _mm_packs_epi32(v[0], v[1]); - t[2] = _mm_packs_epi32(v[2], v[3]); - t[5] = _mm_packs_epi32(v[4], v[5]); - t[6] = _mm_packs_epi32(v[6], v[7]); - - // stage 5 - s[0] = _mm_add_epi16(p[0], t[1]); - s[1] = _mm_sub_epi16(p[0], t[1]); - s[2] = _mm_sub_epi16(p[3], t[2]); - s[3] = _mm_add_epi16(p[3], t[2]); - s[4] = _mm_add_epi16(p[4], t[5]); - s[5] = _mm_sub_epi16(p[4], t[5]); - s[6] = _mm_sub_epi16(p[7], t[6]); - s[7] = _mm_add_epi16(p[7], t[6]); - - // stage 6 - u[0] = _mm_unpacklo_epi16(s[0], s[7]); - u[1] = 
_mm_unpackhi_epi16(s[0], s[7]); - u[2] = _mm_unpacklo_epi16(s[1], s[6]); - u[3] = _mm_unpackhi_epi16(s[1], s[6]); - u[4] = _mm_unpacklo_epi16(s[2], s[5]); - u[5] = _mm_unpackhi_epi16(s[2], s[5]); - u[6] = _mm_unpacklo_epi16(s[3], s[4]); - u[7] = _mm_unpackhi_epi16(s[3], s[4]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02); - v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02); - v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18); - v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18); - v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10); - v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10); - v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26); - v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26); - v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06); - v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06); - v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22); - v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22); - v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14); - v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14); - v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30); - v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = 
_mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - in[1] = _mm_packs_epi32(v[0], v[1]); - in[9] = _mm_packs_epi32(v[2], v[3]); - in[5] = _mm_packs_epi32(v[4], v[5]); - in[13] = _mm_packs_epi32(v[6], v[7]); - in[3] = _mm_packs_epi32(v[8], v[9]); - in[11] = _mm_packs_epi32(v[10], v[11]); - in[7] = _mm_packs_epi32(v[12], v[13]); - in[15] = _mm_packs_epi32(v[14], v[15]); -} - -static void fadst16_8col(__m128i *in) { - // perform 16x16 1-D ADST for 8 columns - __m128i s[16], x[16], u[32], v[32]; - const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, 
cospi_11_64); - const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); - const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); - const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); - - u[0] = _mm_unpacklo_epi16(in[15], in[0]); - u[1] = _mm_unpackhi_epi16(in[15], in[0]); - u[2] = _mm_unpacklo_epi16(in[13], in[2]); - u[3] = _mm_unpackhi_epi16(in[13], in[2]); - u[4] = _mm_unpacklo_epi16(in[11], in[4]); - u[5] = _mm_unpackhi_epi16(in[11], in[4]); - u[6] = _mm_unpacklo_epi16(in[9], in[6]); - u[7] = _mm_unpackhi_epi16(in[9], in[6]); - u[8] = _mm_unpacklo_epi16(in[7], in[8]); - u[9] = _mm_unpackhi_epi16(in[7], in[8]); - u[10] = _mm_unpacklo_epi16(in[5], in[10]); - 
u[11] = _mm_unpackhi_epi16(in[5], in[10]); - u[12] = _mm_unpacklo_epi16(in[3], in[12]); - u[13] = _mm_unpackhi_epi16(in[3], in[12]); - u[14] = _mm_unpacklo_epi16(in[1], in[14]); - u[15] = _mm_unpackhi_epi16(in[1], in[14]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); - v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); - v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); - v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); - v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); - v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); - v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); - v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); - v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); - v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); - v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); - v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); - v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); - v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); - v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); - v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); - v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); - v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); - v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); - v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); - v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); - v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); - v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); - v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); - v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); - v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); - v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); - v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); - v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); - v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); - v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); - v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); - - u[0] = _mm_add_epi32(v[0], v[16]); - u[1] = _mm_add_epi32(v[1], v[17]); - u[2] = _mm_add_epi32(v[2], v[18]); - u[3] = _mm_add_epi32(v[3], v[19]); - u[4] = 
_mm_add_epi32(v[4], v[20]); - u[5] = _mm_add_epi32(v[5], v[21]); - u[6] = _mm_add_epi32(v[6], v[22]); - u[7] = _mm_add_epi32(v[7], v[23]); - u[8] = _mm_add_epi32(v[8], v[24]); - u[9] = _mm_add_epi32(v[9], v[25]); - u[10] = _mm_add_epi32(v[10], v[26]); - u[11] = _mm_add_epi32(v[11], v[27]); - u[12] = _mm_add_epi32(v[12], v[28]); - u[13] = _mm_add_epi32(v[13], v[29]); - u[14] = _mm_add_epi32(v[14], v[30]); - u[15] = _mm_add_epi32(v[15], v[31]); - u[16] = _mm_sub_epi32(v[0], v[16]); - u[17] = _mm_sub_epi32(v[1], v[17]); - u[18] = _mm_sub_epi32(v[2], v[18]); - u[19] = _mm_sub_epi32(v[3], v[19]); - u[20] = _mm_sub_epi32(v[4], v[20]); - u[21] = _mm_sub_epi32(v[5], v[21]); - u[22] = _mm_sub_epi32(v[6], v[22]); - u[23] = _mm_sub_epi32(v[7], v[23]); - u[24] = _mm_sub_epi32(v[8], v[24]); - u[25] = _mm_sub_epi32(v[9], v[25]); - u[26] = _mm_sub_epi32(v[10], v[26]); - u[27] = _mm_sub_epi32(v[11], v[27]); - u[28] = _mm_sub_epi32(v[12], v[28]); - u[29] = _mm_sub_epi32(v[13], v[29]); - u[30] = _mm_sub_epi32(v[14], v[30]); - u[31] = _mm_sub_epi32(v[15], v[31]); - - v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); - v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); - v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); - v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); - v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); - v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); - v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); - v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); - v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); - v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); - v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); - v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); - v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); - v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); - v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); - v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); - - u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); - 
u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); - u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); - u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); - u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); - u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); - u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); - u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); - u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); - u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); - u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); - u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); - u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); - u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); - u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); - u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); - - v[0] = _mm_add_epi32(u[0], u[8]); - v[1] = _mm_add_epi32(u[1], u[9]); - v[2] = _mm_add_epi32(u[2], u[10]); - v[3] = _mm_add_epi32(u[3], u[11]); - v[4] = _mm_add_epi32(u[4], u[12]); - v[5] = _mm_add_epi32(u[5], u[13]); - v[6] = _mm_add_epi32(u[6], u[14]); - v[7] = _mm_add_epi32(u[7], u[15]); - - v[16] = _mm_add_epi32(v[0], v[4]); - v[17] = _mm_add_epi32(v[1], v[5]); - v[18] = _mm_add_epi32(v[2], v[6]); - v[19] = _mm_add_epi32(v[3], v[7]); - v[20] = _mm_sub_epi32(v[0], v[4]); - v[21] = _mm_sub_epi32(v[1], v[5]); - v[22] = _mm_sub_epi32(v[2], v[6]); - v[23] = _mm_sub_epi32(v[3], v[7]); - v[16] = _mm_add_epi32(v[16], k__DCT_CONST_ROUNDING); - v[17] = _mm_add_epi32(v[17], k__DCT_CONST_ROUNDING); - v[18] = _mm_add_epi32(v[18], k__DCT_CONST_ROUNDING); - v[19] = _mm_add_epi32(v[19], k__DCT_CONST_ROUNDING); - v[20] = _mm_add_epi32(v[20], k__DCT_CONST_ROUNDING); - v[21] = _mm_add_epi32(v[21], k__DCT_CONST_ROUNDING); - v[22] = _mm_add_epi32(v[22], k__DCT_CONST_ROUNDING); - v[23] = _mm_add_epi32(v[23], k__DCT_CONST_ROUNDING); - v[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); - v[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); - v[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); - v[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); - v[20] = _mm_srai_epi32(v[20], 
DCT_CONST_BITS); - v[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); - v[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); - v[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); - s[0] = _mm_packs_epi32(v[16], v[17]); - s[1] = _mm_packs_epi32(v[18], v[19]); - s[2] = _mm_packs_epi32(v[20], v[21]); - s[3] = _mm_packs_epi32(v[22], v[23]); - - v[8] = _mm_sub_epi32(u[0], u[8]); - v[9] = _mm_sub_epi32(u[1], u[9]); - v[10] = _mm_sub_epi32(u[2], u[10]); - v[11] = _mm_sub_epi32(u[3], u[11]); - v[12] = _mm_sub_epi32(u[4], u[12]); - v[13] = _mm_sub_epi32(u[5], u[13]); - v[14] = _mm_sub_epi32(u[6], u[14]); - v[15] = _mm_sub_epi32(u[7], u[15]); - - v[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - - s[4] = _mm_packs_epi32(v[8], v[9]); - s[5] = _mm_packs_epi32(v[10], v[11]); - s[6] = _mm_packs_epi32(v[12], v[13]); - s[7] = _mm_packs_epi32(v[14], v[15]); - // - - s[8] = _mm_packs_epi32(u[16], u[17]); - s[9] = _mm_packs_epi32(u[18], u[19]); - s[10] = _mm_packs_epi32(u[20], u[21]); - s[11] = _mm_packs_epi32(u[22], u[23]); - s[12] = _mm_packs_epi32(u[24], u[25]); - s[13] = _mm_packs_epi32(u[26], u[27]); - s[14] = _mm_packs_epi32(u[28], u[29]); - s[15] = _mm_packs_epi32(u[30], u[31]); - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[8], 
s[9]); - u[1] = _mm_unpackhi_epi16(s[8], s[9]); - u[2] = _mm_unpacklo_epi16(s[10], s[11]); - u[3] = _mm_unpackhi_epi16(s[10], s[11]); - u[4] = _mm_unpacklo_epi16(s[12], s[13]); - u[5] = _mm_unpackhi_epi16(s[12], s[13]); - u[6] = _mm_unpacklo_epi16(s[14], s[15]); - u[7] = _mm_unpackhi_epi16(s[14], s[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); - v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); - v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); - v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); - v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); - v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); - v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); - v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); - v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); - v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); - v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); - v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); - v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); - v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); - v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); - v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); - - u[0] = _mm_add_epi32(v[0], v[8]); - u[1] = _mm_add_epi32(v[1], v[9]); - u[2] = _mm_add_epi32(v[2], v[10]); - u[3] = _mm_add_epi32(v[3], v[11]); - u[4] = _mm_add_epi32(v[4], v[12]); - u[5] = _mm_add_epi32(v[5], v[13]); - u[6] = _mm_add_epi32(v[6], v[14]); - u[7] = _mm_add_epi32(v[7], v[15]); - u[8] = _mm_sub_epi32(v[0], v[8]); - u[9] = _mm_sub_epi32(v[1], v[9]); - u[10] = _mm_sub_epi32(v[2], v[10]); - u[11] = _mm_sub_epi32(v[3], v[11]); - u[12] = _mm_sub_epi32(v[4], v[12]); - u[13] = _mm_sub_epi32(v[5], v[13]); - u[14] = _mm_sub_epi32(v[6], v[14]); - u[15] = _mm_sub_epi32(v[7], v[15]); - - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], 
k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - - v[8] = _mm_add_epi32(u[0], u[4]); - v[9] = _mm_add_epi32(u[1], u[5]); - v[10] = _mm_add_epi32(u[2], u[6]); - v[11] = _mm_add_epi32(u[3], u[7]); - v[12] = _mm_sub_epi32(u[0], u[4]); - v[13] = _mm_sub_epi32(u[1], u[5]); - v[14] = _mm_sub_epi32(u[2], u[6]); - v[15] = _mm_sub_epi32(u[3], u[7]); - - v[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - v[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - s[8] = _mm_packs_epi32(v[8], v[9]); - s[9] = _mm_packs_epi32(v[10], v[11]); - s[10] = _mm_packs_epi32(v[12], v[13]); - s[11] = _mm_packs_epi32(v[14], v[15]); - - x[12] = _mm_packs_epi32(u[8], u[9]); - x[13] = _mm_packs_epi32(u[10], u[11]); - x[14] = _mm_packs_epi32(u[12], u[13]); - x[15] = _mm_packs_epi32(u[14], u[15]); - - // stage 3 - u[0] = 
_mm_unpacklo_epi16(s[4], s[5]); - u[1] = _mm_unpackhi_epi16(s[4], s[5]); - u[2] = _mm_unpacklo_epi16(s[6], s[7]); - u[3] = _mm_unpackhi_epi16(s[6], s[7]); - u[4] = _mm_unpacklo_epi16(x[12], x[13]); - u[5] = _mm_unpackhi_epi16(x[12], x[13]); - u[6] = _mm_unpacklo_epi16(x[14], x[15]); - u[7] = _mm_unpackhi_epi16(x[14], x[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); - v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); - v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); - v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); - v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); - v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); - v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); - v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); - v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); - v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); - v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); - v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); - v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); - v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); - v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); - v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); - - u[0] = _mm_add_epi32(v[0], v[4]); - u[1] = _mm_add_epi32(v[1], v[5]); - u[2] = _mm_add_epi32(v[2], v[6]); - u[3] = _mm_add_epi32(v[3], v[7]); - u[4] = _mm_sub_epi32(v[0], v[4]); - u[5] = _mm_sub_epi32(v[1], v[5]); - u[6] = _mm_sub_epi32(v[2], v[6]); - u[7] = _mm_sub_epi32(v[3], v[7]); - u[8] = _mm_add_epi32(v[8], v[12]); - u[9] = _mm_add_epi32(v[9], v[13]); - u[10] = _mm_add_epi32(v[10], v[14]); - u[11] = _mm_add_epi32(v[11], v[15]); - u[12] = _mm_sub_epi32(v[8], v[12]); - u[13] = _mm_sub_epi32(v[9], v[13]); - u[14] = _mm_sub_epi32(v[10], v[14]); - u[15] = _mm_sub_epi32(v[11], v[15]); - - u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - u[5] = 
_mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[4] = _mm_packs_epi32(v[0], v[1]); - s[5] = _mm_packs_epi32(v[2], v[3]); - s[6] = _mm_packs_epi32(v[4], v[5]); - s[7] = _mm_packs_epi32(v[6], v[7]); - - s[12] = _mm_packs_epi32(v[8], v[9]); - s[13] = _mm_packs_epi32(v[10], v[11]); - s[14] = _mm_packs_epi32(v[12], v[13]); - s[15] = _mm_packs_epi32(v[14], v[15]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(s[2], s[3]); - u[1] = _mm_unpackhi_epi16(s[2], s[3]); - u[2] = _mm_unpacklo_epi16(s[6], s[7]); - u[3] = _mm_unpackhi_epi16(s[6], s[7]); - u[4] = _mm_unpacklo_epi16(s[10], s[11]); - u[5] = _mm_unpackhi_epi16(s[10], s[11]); - u[6] = _mm_unpacklo_epi16(s[14], s[15]); - u[7] = 
_mm_unpackhi_epi16(s[14], s[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); - v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); - v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); - v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); - v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); - v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); - v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); - v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = 
_mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - in[0] = s[0]; - in[1] = _mm_sub_epi16(kZero, s[8]); - in[2] = s[12]; - in[3] = _mm_sub_epi16(kZero, s[4]); - in[4] = _mm_packs_epi32(v[4], v[5]); - in[5] = _mm_packs_epi32(v[12], v[13]); - in[6] = _mm_packs_epi32(v[8], v[9]); - in[7] = _mm_packs_epi32(v[0], v[1]); - in[8] = _mm_packs_epi32(v[2], v[3]); - in[9] = _mm_packs_epi32(v[10], v[11]); - in[10] = _mm_packs_epi32(v[14], v[15]); - in[11] = _mm_packs_epi32(v[6], v[7]); - in[12] = s[5]; - in[13] = _mm_sub_epi16(kZero, s[13]); - in[14] = s[9]; - in[15] = _mm_sub_epi16(kZero, s[1]); -} - -static void fdct16_sse2(__m128i *in0, __m128i *in1) { - fdct16_8col(in0); - fdct16_8col(in1); - array_transpose_16x16(in0, in1); -} - -static void fadst16_sse2(__m128i *in0, __m128i *in1) { - fadst16_8col(in0); - fadst16_8col(in1); - array_transpose_16x16(in0, in1); -} - -#if CONFIG_EXT_TX -static void fidtx16_sse2(__m128i *in0, __m128i *in1) { - idtx16_8col(in0); - idtx16_8col(in1); - array_transpose_16x16(in0, in1); -} -#endif // CONFIG_EXT_TX - -void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in0[16], in1[16]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - switch (tx_type) { - case DCT_DCT: - load_buffer_16x16(input, in0, in1, stride, 0, 0); - fdct16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fdct16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case ADST_DCT: - 
load_buffer_16x16(input, in0, in1, stride, 0, 0); - fadst16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fdct16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case DCT_ADST: - load_buffer_16x16(input, in0, in1, stride, 0, 0); - fdct16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fadst16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case ADST_ADST: - load_buffer_16x16(input, in0, in1, stride, 0, 0); - fadst16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fadst16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_16x16(input, in0, in1, stride, 1, 0); - fadst16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fdct16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case DCT_FLIPADST: - load_buffer_16x16(input, in0, in1, stride, 0, 1); - fdct16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fadst16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case FLIPADST_FLIPADST: - load_buffer_16x16(input, in0, in1, stride, 1, 1); - fadst16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fadst16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case ADST_FLIPADST: - load_buffer_16x16(input, in0, in1, stride, 0, 1); - fadst16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fadst16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case FLIPADST_ADST: - load_buffer_16x16(input, in0, in1, stride, 1, 0); - fadst16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fadst16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case IDTX: - load_buffer_16x16(input, in0, in1, stride, 0, 0); - fidtx16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fidtx16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case V_DCT: - load_buffer_16x16(input, in0, in1, stride, 0, 0); - fdct16_sse2(in0, in1); - right_shift_16x16(in0, in1); - 
fidtx16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case H_DCT: - load_buffer_16x16(input, in0, in1, stride, 0, 0); - fidtx16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fdct16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case V_ADST: - load_buffer_16x16(input, in0, in1, stride, 0, 0); - fadst16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fidtx16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case H_ADST: - load_buffer_16x16(input, in0, in1, stride, 0, 0); - fidtx16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fadst16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case V_FLIPADST: - load_buffer_16x16(input, in0, in1, stride, 1, 0); - fadst16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fidtx16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; - case H_FLIPADST: - load_buffer_16x16(input, in0, in1, stride, 0, 1); - fidtx16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fadst16_sse2(in0, in1); - write_buffer_16x16(output, in0, in1, 16); - break; -#endif // CONFIG_EXT_TX - default: assert(0); break; - } -} - -static INLINE void prepare_4x8_row_first(__m128i *in) { - in[0] = _mm_unpacklo_epi64(in[0], in[2]); - in[1] = _mm_unpacklo_epi64(in[1], in[3]); - transpose_4x4(in); - in[4] = _mm_unpacklo_epi64(in[4], in[6]); - in[5] = _mm_unpacklo_epi64(in[5], in[7]); - transpose_4x4(in + 4); -} - -// Load input into the left-hand half of in (ie, into lanes 0..3 of -// each element of in). The right hand half (lanes 4..7) should be -// treated as being filled with "don't care" values. 
-static INLINE void load_buffer_4x8(const int16_t *input, __m128i *in, - int stride, int flipud, int fliplr) { - const int shift = 2; - if (!flipud) { - in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); - in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); - in[4] = _mm_loadl_epi64((const __m128i *)(input + 4 * stride)); - in[5] = _mm_loadl_epi64((const __m128i *)(input + 5 * stride)); - in[6] = _mm_loadl_epi64((const __m128i *)(input + 6 * stride)); - in[7] = _mm_loadl_epi64((const __m128i *)(input + 7 * stride)); - } else { - in[0] = _mm_loadl_epi64((const __m128i *)(input + 7 * stride)); - in[1] = _mm_loadl_epi64((const __m128i *)(input + 6 * stride)); - in[2] = _mm_loadl_epi64((const __m128i *)(input + 5 * stride)); - in[3] = _mm_loadl_epi64((const __m128i *)(input + 4 * stride)); - in[4] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); - in[5] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); - in[6] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in[7] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - } - - if (fliplr) { - in[0] = _mm_shufflelo_epi16(in[0], 0x1b); - in[1] = _mm_shufflelo_epi16(in[1], 0x1b); - in[2] = _mm_shufflelo_epi16(in[2], 0x1b); - in[3] = _mm_shufflelo_epi16(in[3], 0x1b); - in[4] = _mm_shufflelo_epi16(in[4], 0x1b); - in[5] = _mm_shufflelo_epi16(in[5], 0x1b); - in[6] = _mm_shufflelo_epi16(in[6], 0x1b); - in[7] = _mm_shufflelo_epi16(in[7], 0x1b); - } - - in[0] = _mm_slli_epi16(in[0], shift); - in[1] = _mm_slli_epi16(in[1], shift); - in[2] = _mm_slli_epi16(in[2], shift); - in[3] = _mm_slli_epi16(in[3], shift); - in[4] = _mm_slli_epi16(in[4], shift); - in[5] = _mm_slli_epi16(in[5], shift); - in[6] = _mm_slli_epi16(in[6], shift); - in[7] = _mm_slli_epi16(in[7], shift); - - scale_sqrt2_8x4(in); - scale_sqrt2_8x4(in + 4); - prepare_4x8_row_first(in); 
-} - -static INLINE void write_buffer_4x8(tran_low_t *output, __m128i *res) { - __m128i in01, in23, in45, in67, sign01, sign23, sign45, sign67; - const int shift = 1; - - // revert the 8x8 txfm's transpose - array_transpose_8x8(res, res); - - in01 = _mm_unpacklo_epi64(res[0], res[1]); - in23 = _mm_unpacklo_epi64(res[2], res[3]); - in45 = _mm_unpacklo_epi64(res[4], res[5]); - in67 = _mm_unpacklo_epi64(res[6], res[7]); - - sign01 = _mm_srai_epi16(in01, 15); - sign23 = _mm_srai_epi16(in23, 15); - sign45 = _mm_srai_epi16(in45, 15); - sign67 = _mm_srai_epi16(in67, 15); - - in01 = _mm_sub_epi16(in01, sign01); - in23 = _mm_sub_epi16(in23, sign23); - in45 = _mm_sub_epi16(in45, sign45); - in67 = _mm_sub_epi16(in67, sign67); - - in01 = _mm_srai_epi16(in01, shift); - in23 = _mm_srai_epi16(in23, shift); - in45 = _mm_srai_epi16(in45, shift); - in67 = _mm_srai_epi16(in67, shift); - - store_output(&in01, (output + 0 * 8)); - store_output(&in23, (output + 1 * 8)); - store_output(&in45, (output + 2 * 8)); - store_output(&in67, (output + 3 * 8)); -} - -void av1_fht4x8_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in[8]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - switch (tx_type) { - case DCT_DCT: - load_buffer_4x8(input, in, stride, 0, 0); - fdct4_sse2(in); - fdct4_sse2(in + 4); - fdct8_sse2(in); - break; - case ADST_DCT: - load_buffer_4x8(input, in, stride, 0, 0); - fdct4_sse2(in); - fdct4_sse2(in + 4); - fadst8_sse2(in); - break; - case DCT_ADST: - load_buffer_4x8(input, in, stride, 0, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fdct8_sse2(in); - break; - case ADST_ADST: - load_buffer_4x8(input, in, stride, 0, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fadst8_sse2(in); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_4x8(input, in, stride, 1, 0); - fdct4_sse2(in); - fdct4_sse2(in + 4); - fadst8_sse2(in); - break; 
- case DCT_FLIPADST: - load_buffer_4x8(input, in, stride, 0, 1); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fdct8_sse2(in); - break; - case FLIPADST_FLIPADST: - load_buffer_4x8(input, in, stride, 1, 1); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fadst8_sse2(in); - break; - case ADST_FLIPADST: - load_buffer_4x8(input, in, stride, 0, 1); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fadst8_sse2(in); - break; - case FLIPADST_ADST: - load_buffer_4x8(input, in, stride, 1, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fadst8_sse2(in); - break; - case IDTX: - load_buffer_4x8(input, in, stride, 0, 0); - fidtx4_sse2(in); - fidtx4_sse2(in + 4); - fidtx8_sse2(in); - break; - case V_DCT: - load_buffer_4x8(input, in, stride, 0, 0); - fidtx4_sse2(in); - fidtx4_sse2(in + 4); - fdct8_sse2(in); - break; - case H_DCT: - load_buffer_4x8(input, in, stride, 0, 0); - fdct4_sse2(in); - fdct4_sse2(in + 4); - fidtx8_sse2(in); - break; - case V_ADST: - load_buffer_4x8(input, in, stride, 0, 0); - fidtx4_sse2(in); - fidtx4_sse2(in + 4); - fadst8_sse2(in); - break; - case H_ADST: - load_buffer_4x8(input, in, stride, 0, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fidtx8_sse2(in); - break; - case V_FLIPADST: - load_buffer_4x8(input, in, stride, 1, 0); - fidtx4_sse2(in); - fidtx4_sse2(in + 4); - fadst8_sse2(in); - break; - case H_FLIPADST: - load_buffer_4x8(input, in, stride, 0, 1); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fidtx8_sse2(in); - break; -#endif - default: assert(0); break; - } - write_buffer_4x8(output, in); -} - -// Load input into the left-hand half of in (ie, into lanes 0..3 of -// each element of in). The right hand half (lanes 4..7) should be -// treated as being filled with "don't care" values. -// The input is split horizontally into two 4x4 -// chunks 'l' and 'r'. Then 'l' is stored in the top-left 4x4 -// block of 'in' and 'r' is stored in the bottom-left block. -// This is to allow us to reuse 4x4 transforms. 
-static INLINE void load_buffer_8x4(const int16_t *input, __m128i *in, - int stride, int flipud, int fliplr) { - const int shift = 2; - if (!flipud) { - in[0] = _mm_loadu_si128((const __m128i *)(input + 0 * stride)); - in[1] = _mm_loadu_si128((const __m128i *)(input + 1 * stride)); - in[2] = _mm_loadu_si128((const __m128i *)(input + 2 * stride)); - in[3] = _mm_loadu_si128((const __m128i *)(input + 3 * stride)); - } else { - in[0] = _mm_loadu_si128((const __m128i *)(input + 3 * stride)); - in[1] = _mm_loadu_si128((const __m128i *)(input + 2 * stride)); - in[2] = _mm_loadu_si128((const __m128i *)(input + 1 * stride)); - in[3] = _mm_loadu_si128((const __m128i *)(input + 0 * stride)); - } - - if (fliplr) { - in[0] = mm_reverse_epi16(in[0]); - in[1] = mm_reverse_epi16(in[1]); - in[2] = mm_reverse_epi16(in[2]); - in[3] = mm_reverse_epi16(in[3]); - } - - in[0] = _mm_slli_epi16(in[0], shift); - in[1] = _mm_slli_epi16(in[1], shift); - in[2] = _mm_slli_epi16(in[2], shift); - in[3] = _mm_slli_epi16(in[3], shift); - - scale_sqrt2_8x4(in); - - in[4] = _mm_shuffle_epi32(in[0], 0xe); - in[5] = _mm_shuffle_epi32(in[1], 0xe); - in[6] = _mm_shuffle_epi32(in[2], 0xe); - in[7] = _mm_shuffle_epi32(in[3], 0xe); -} - -static INLINE void write_buffer_8x4(tran_low_t *output, __m128i *res) { - __m128i out0, out1, out2, out3, sign0, sign1, sign2, sign3; - const int shift = 1; - sign0 = _mm_srai_epi16(res[0], 15); - sign1 = _mm_srai_epi16(res[1], 15); - sign2 = _mm_srai_epi16(res[2], 15); - sign3 = _mm_srai_epi16(res[3], 15); - - out0 = _mm_sub_epi16(res[0], sign0); - out1 = _mm_sub_epi16(res[1], sign1); - out2 = _mm_sub_epi16(res[2], sign2); - out3 = _mm_sub_epi16(res[3], sign3); - - out0 = _mm_srai_epi16(out0, shift); - out1 = _mm_srai_epi16(out1, shift); - out2 = _mm_srai_epi16(out2, shift); - out3 = _mm_srai_epi16(out3, shift); - - store_output(&out0, (output + 0 * 8)); - store_output(&out1, (output + 1 * 8)); - store_output(&out2, (output + 2 * 8)); - store_output(&out3, (output + 3 * 
8)); -} - -void av1_fht8x4_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in[8]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - switch (tx_type) { - case DCT_DCT: - load_buffer_8x4(input, in, stride, 0, 0); - fdct4_sse2(in); - fdct4_sse2(in + 4); - fdct8_sse2(in); - break; - case ADST_DCT: - load_buffer_8x4(input, in, stride, 0, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fdct8_sse2(in); - break; - case DCT_ADST: - load_buffer_8x4(input, in, stride, 0, 0); - fdct4_sse2(in); - fdct4_sse2(in + 4); - fadst8_sse2(in); - break; - case ADST_ADST: - load_buffer_8x4(input, in, stride, 0, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fadst8_sse2(in); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_8x4(input, in, stride, 1, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fdct8_sse2(in); - break; - case DCT_FLIPADST: - load_buffer_8x4(input, in, stride, 0, 1); - fdct4_sse2(in); - fdct4_sse2(in + 4); - fadst8_sse2(in); - break; - case FLIPADST_FLIPADST: - load_buffer_8x4(input, in, stride, 1, 1); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fadst8_sse2(in); - break; - case ADST_FLIPADST: - load_buffer_8x4(input, in, stride, 0, 1); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fadst8_sse2(in); - break; - case FLIPADST_ADST: - load_buffer_8x4(input, in, stride, 1, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fadst8_sse2(in); - break; - case IDTX: - load_buffer_8x4(input, in, stride, 0, 0); - fidtx4_sse2(in); - fidtx4_sse2(in + 4); - fidtx8_sse2(in); - break; - case V_DCT: - load_buffer_8x4(input, in, stride, 0, 0); - fdct4_sse2(in); - fdct4_sse2(in + 4); - fidtx8_sse2(in); - break; - case H_DCT: - load_buffer_8x4(input, in, stride, 0, 0); - fidtx4_sse2(in); - fidtx4_sse2(in + 4); - fdct8_sse2(in); - break; - case V_ADST: - load_buffer_8x4(input, in, stride, 0, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - 
fidtx8_sse2(in); - break; - case H_ADST: - load_buffer_8x4(input, in, stride, 0, 0); - fidtx4_sse2(in); - fidtx4_sse2(in + 4); - fadst8_sse2(in); - break; - case V_FLIPADST: - load_buffer_8x4(input, in, stride, 1, 0); - fadst4_sse2(in); - fadst4_sse2(in + 4); - fidtx8_sse2(in); - break; - case H_FLIPADST: - load_buffer_8x4(input, in, stride, 0, 1); - fidtx4_sse2(in); - fidtx4_sse2(in + 4); - fadst8_sse2(in); - break; -#endif - default: assert(0); break; - } - write_buffer_8x4(output, in); -} - -static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in, - int stride, int flipud, int fliplr) { - // Load 2 8x8 blocks - const int16_t *t = input; - const int16_t *b = input + 8 * stride; - - if (flipud) { - const int16_t *const tmp = t; - t = b; - b = tmp; - } - - load_buffer_8x8(t, in, stride, flipud, fliplr); - scale_sqrt2_8x8(in); - load_buffer_8x8(b, in + 8, stride, flipud, fliplr); - scale_sqrt2_8x8(in + 8); -} - -static INLINE void round_power_of_two_signed(__m128i *x, int n) { - const __m128i rounding = _mm_set1_epi16((1 << n) >> 1); - const __m128i sign = _mm_srai_epi16(*x, 15); - const __m128i res = _mm_add_epi16(_mm_add_epi16(*x, rounding), sign); - *x = _mm_srai_epi16(res, n); -} - -static void row_8x16_rounding(__m128i *in, int bits) { - int i; - for (i = 0; i < 16; i++) { - round_power_of_two_signed(&in[i], bits); - } -} - -void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in[16]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - __m128i *const t = in; // Alias to top 8x8 sub block - __m128i *const b = in + 8; // Alias to bottom 8x8 sub block - - switch (tx_type) { - case DCT_DCT: - load_buffer_8x16(input, in, stride, 0, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fdct8_sse2(t); - fdct8_sse2(b); - row_8x16_rounding(in, 2); - fdct16_8col(in); - break; - case ADST_DCT: - 
load_buffer_8x16(input, in, stride, 0, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fdct8_sse2(t); - fdct8_sse2(b); - row_8x16_rounding(in, 2); - fadst16_8col(in); - break; - case DCT_ADST: - load_buffer_8x16(input, in, stride, 0, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fadst8_sse2(t); - fadst8_sse2(b); - row_8x16_rounding(in, 2); - fdct16_8col(in); - break; - case ADST_ADST: - load_buffer_8x16(input, in, stride, 0, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fadst8_sse2(t); - fadst8_sse2(b); - row_8x16_rounding(in, 2); - fadst16_8col(in); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_8x16(input, in, stride, 1, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fdct8_sse2(t); - fdct8_sse2(b); - row_8x16_rounding(in, 2); - fadst16_8col(in); - break; - case DCT_FLIPADST: - load_buffer_8x16(input, in, stride, 0, 1); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fadst8_sse2(t); - fadst8_sse2(b); - row_8x16_rounding(in, 2); - fdct16_8col(in); - break; - case FLIPADST_FLIPADST: - load_buffer_8x16(input, in, stride, 1, 1); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fadst8_sse2(t); - fadst8_sse2(b); - row_8x16_rounding(in, 2); - fadst16_8col(in); - break; - case ADST_FLIPADST: - load_buffer_8x16(input, in, stride, 0, 1); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fadst8_sse2(t); - fadst8_sse2(b); - row_8x16_rounding(in, 2); - fadst16_8col(in); - break; - case FLIPADST_ADST: - load_buffer_8x16(input, in, stride, 1, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fadst8_sse2(t); - fadst8_sse2(b); - row_8x16_rounding(in, 2); - fadst16_8col(in); - break; - case IDTX: - load_buffer_8x16(input, in, stride, 0, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fidtx8_sse2(t); - fidtx8_sse2(b); - row_8x16_rounding(in, 2); - idtx16_8col(in); - break; - case V_DCT: - load_buffer_8x16(input, in, stride, 0, 0); - 
array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fidtx8_sse2(t); - fidtx8_sse2(b); - row_8x16_rounding(in, 2); - fdct16_8col(in); - break; - case H_DCT: - load_buffer_8x16(input, in, stride, 0, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fdct8_sse2(t); - fdct8_sse2(b); - row_8x16_rounding(in, 2); - idtx16_8col(in); - break; - case V_ADST: - load_buffer_8x16(input, in, stride, 0, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fidtx8_sse2(t); - fidtx8_sse2(b); - row_8x16_rounding(in, 2); - fadst16_8col(in); - break; - case H_ADST: - load_buffer_8x16(input, in, stride, 0, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fadst8_sse2(t); - fadst8_sse2(b); - row_8x16_rounding(in, 2); - idtx16_8col(in); - break; - case V_FLIPADST: - load_buffer_8x16(input, in, stride, 1, 0); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fidtx8_sse2(t); - fidtx8_sse2(b); - row_8x16_rounding(in, 2); - fadst16_8col(in); - break; - case H_FLIPADST: - load_buffer_8x16(input, in, stride, 0, 1); - array_transpose_8x8(t, t); - array_transpose_8x8(b, b); - fadst8_sse2(t); - fadst8_sse2(b); - row_8x16_rounding(in, 2); - idtx16_8col(in); - break; -#endif - default: assert(0); break; - } - write_buffer_8x8(output, t, 8); - write_buffer_8x8(output + 64, b, 8); -} - -static INLINE void load_buffer_16x8(const int16_t *input, __m128i *in, - int stride, int flipud, int fliplr) { - // Load 2 8x8 blocks - const int16_t *l = input; - const int16_t *r = input + 8; - - if (fliplr) { - const int16_t *const tmp = l; - l = r; - r = tmp; - } - - // load first 8 columns - load_buffer_8x8(l, in, stride, flipud, fliplr); - scale_sqrt2_8x8(in); - load_buffer_8x8(r, in + 8, stride, flipud, fliplr); - scale_sqrt2_8x8(in + 8); -} - -#define col_16x8_rounding row_8x16_rounding - -void av1_fht16x8_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in[16]; - const TX_TYPE tx_type = txfm_param->tx_type; 
-#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - __m128i *const l = in; // Alias to left 8x8 sub block - __m128i *const r = in + 8; // Alias to right 8x8 sub block, which we store - // in the second half of the array - - switch (tx_type) { - case DCT_DCT: - load_buffer_16x8(input, in, stride, 0, 0); - fdct8_sse2(l); - fdct8_sse2(r); - col_16x8_rounding(in, 2); - fdct16_8col(in); - break; - case ADST_DCT: - load_buffer_16x8(input, in, stride, 0, 0); - fadst8_sse2(l); - fadst8_sse2(r); - col_16x8_rounding(in, 2); - fdct16_8col(in); - break; - case DCT_ADST: - load_buffer_16x8(input, in, stride, 0, 0); - fdct8_sse2(l); - fdct8_sse2(r); - col_16x8_rounding(in, 2); - fadst16_8col(in); - break; - case ADST_ADST: - load_buffer_16x8(input, in, stride, 0, 0); - fadst8_sse2(l); - fadst8_sse2(r); - col_16x8_rounding(in, 2); - fadst16_8col(in); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_16x8(input, in, stride, 1, 0); - fadst8_sse2(l); - fadst8_sse2(r); - col_16x8_rounding(in, 2); - fdct16_8col(in); - break; - case DCT_FLIPADST: - load_buffer_16x8(input, in, stride, 0, 1); - fdct8_sse2(l); - fdct8_sse2(r); - col_16x8_rounding(in, 2); - fadst16_8col(in); - break; - case FLIPADST_FLIPADST: - load_buffer_16x8(input, in, stride, 1, 1); - fadst8_sse2(l); - fadst8_sse2(r); - col_16x8_rounding(in, 2); - fadst16_8col(in); - break; - case ADST_FLIPADST: - load_buffer_16x8(input, in, stride, 0, 1); - fadst8_sse2(l); - fadst8_sse2(r); - col_16x8_rounding(in, 2); - fadst16_8col(in); - break; - case FLIPADST_ADST: - load_buffer_16x8(input, in, stride, 1, 0); - fadst8_sse2(l); - fadst8_sse2(r); - col_16x8_rounding(in, 2); - fadst16_8col(in); - break; - case IDTX: - load_buffer_16x8(input, in, stride, 0, 0); - fidtx8_sse2(l); - fidtx8_sse2(r); - col_16x8_rounding(in, 2); - idtx16_8col(in); - break; - case V_DCT: - load_buffer_16x8(input, in, stride, 0, 0); - fdct8_sse2(l); - fdct8_sse2(r); - col_16x8_rounding(in, 2); - 
idtx16_8col(in); - break; - case H_DCT: - load_buffer_16x8(input, in, stride, 0, 0); - fidtx8_sse2(l); - fidtx8_sse2(r); - col_16x8_rounding(in, 2); - fdct16_8col(in); - break; - case V_ADST: - load_buffer_16x8(input, in, stride, 0, 0); - fadst8_sse2(l); - fadst8_sse2(r); - col_16x8_rounding(in, 2); - idtx16_8col(in); - break; - case H_ADST: - load_buffer_16x8(input, in, stride, 0, 0); - fidtx8_sse2(l); - fidtx8_sse2(r); - col_16x8_rounding(in, 2); - fadst16_8col(in); - break; - case V_FLIPADST: - load_buffer_16x8(input, in, stride, 1, 0); - fadst8_sse2(l); - fadst8_sse2(r); - col_16x8_rounding(in, 2); - idtx16_8col(in); - break; - case H_FLIPADST: - load_buffer_16x8(input, in, stride, 0, 1); - fidtx8_sse2(l); - fidtx8_sse2(r); - col_16x8_rounding(in, 2); - fadst16_8col(in); - break; -#endif - default: assert(0); break; - } - array_transpose_8x8(l, l); - array_transpose_8x8(r, r); - write_buffer_8x8(output, l, 16); - write_buffer_8x8(output + 8, r, 16); -} - -// Note: The 16-column 32-element transforms expect their input to be -// split up into a 2x2 grid of 8x16 blocks -static INLINE void fdct32_16col(__m128i *tl, __m128i *tr, __m128i *bl, - __m128i *br) { - fdct32_8col(tl, bl); - fdct32_8col(tr, br); - array_transpose_16x16(tl, tr); - array_transpose_16x16(bl, br); -} - -#if CONFIG_EXT_TX -static INLINE void fidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl, - __m128i *br) { - int i; - for (i = 0; i < 16; ++i) { - tl[i] = _mm_slli_epi16(tl[i], 2); - tr[i] = _mm_slli_epi16(tr[i], 2); - bl[i] = _mm_slli_epi16(bl[i], 2); - br[i] = _mm_slli_epi16(br[i], 2); - } - array_transpose_16x16(tl, tr); - array_transpose_16x16(bl, br); -} -#endif - -static INLINE void load_buffer_16x32(const int16_t *input, __m128i *intl, - __m128i *intr, __m128i *inbl, - __m128i *inbr, int stride, int flipud, - int fliplr) { - int i; - if (flipud) { - input = input + 31 * stride; - stride = -stride; - } - - for (i = 0; i < 16; ++i) { - intl[i] = _mm_slli_epi16( - _mm_load_si128((const 
__m128i *)(input + i * stride + 0)), 2); - intr[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2); - inbl[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + (i + 16) * stride + 0)), 2); - inbr[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + (i + 16) * stride + 8)), 2); - } - - if (fliplr) { - __m128i tmp; - for (i = 0; i < 16; ++i) { - tmp = intl[i]; - intl[i] = mm_reverse_epi16(intr[i]); - intr[i] = mm_reverse_epi16(tmp); - tmp = inbl[i]; - inbl[i] = mm_reverse_epi16(inbr[i]); - inbr[i] = mm_reverse_epi16(tmp); - } - } - - scale_sqrt2_8x16(intl); - scale_sqrt2_8x16(intr); - scale_sqrt2_8x16(inbl); - scale_sqrt2_8x16(inbr); -} - -static INLINE void write_buffer_16x32(tran_low_t *output, __m128i *restl, - __m128i *restr, __m128i *resbl, - __m128i *resbr) { - int i; - for (i = 0; i < 16; ++i) { - store_output(&restl[i], output + i * 16 + 0); - store_output(&restr[i], output + i * 16 + 8); - store_output(&resbl[i], output + (i + 16) * 16 + 0); - store_output(&resbr[i], output + (i + 16) * 16 + 8); - } -} - -static INLINE void round_signed_8x8(__m128i *in, const int bit) { - const __m128i rounding = _mm_set1_epi16((1 << bit) >> 1); - __m128i sign0 = _mm_srai_epi16(in[0], 15); - __m128i sign1 = _mm_srai_epi16(in[1], 15); - __m128i sign2 = _mm_srai_epi16(in[2], 15); - __m128i sign3 = _mm_srai_epi16(in[3], 15); - __m128i sign4 = _mm_srai_epi16(in[4], 15); - __m128i sign5 = _mm_srai_epi16(in[5], 15); - __m128i sign6 = _mm_srai_epi16(in[6], 15); - __m128i sign7 = _mm_srai_epi16(in[7], 15); - - in[0] = _mm_add_epi16(_mm_add_epi16(in[0], rounding), sign0); - in[1] = _mm_add_epi16(_mm_add_epi16(in[1], rounding), sign1); - in[2] = _mm_add_epi16(_mm_add_epi16(in[2], rounding), sign2); - in[3] = _mm_add_epi16(_mm_add_epi16(in[3], rounding), sign3); - in[4] = _mm_add_epi16(_mm_add_epi16(in[4], rounding), sign4); - in[5] = _mm_add_epi16(_mm_add_epi16(in[5], rounding), sign5); - in[6] = 
_mm_add_epi16(_mm_add_epi16(in[6], rounding), sign6); - in[7] = _mm_add_epi16(_mm_add_epi16(in[7], rounding), sign7); - - in[0] = _mm_srai_epi16(in[0], bit); - in[1] = _mm_srai_epi16(in[1], bit); - in[2] = _mm_srai_epi16(in[2], bit); - in[3] = _mm_srai_epi16(in[3], bit); - in[4] = _mm_srai_epi16(in[4], bit); - in[5] = _mm_srai_epi16(in[5], bit); - in[6] = _mm_srai_epi16(in[6], bit); - in[7] = _mm_srai_epi16(in[7], bit); -} - -static INLINE void round_signed_16x16(__m128i *in0, __m128i *in1) { - const int bit = 4; - round_signed_8x8(in0, bit); - round_signed_8x8(in0 + 8, bit); - round_signed_8x8(in1, bit); - round_signed_8x8(in1 + 8, bit); -} - -// Note: -// suffix "t" indicates the transpose operation comes first -static void fdct16t_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - fdct16_8col(in0); - fdct16_8col(in1); -} - -static void fadst16t_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - fadst16_8col(in0); - fadst16_8col(in1); -} - -static INLINE void fdct32t_16col(__m128i *tl, __m128i *tr, __m128i *bl, - __m128i *br) { - array_transpose_16x16(tl, tr); - array_transpose_16x16(bl, br); - fdct32_8col(tl, bl); - fdct32_8col(tr, br); -} - -typedef enum transpose_indicator_ { - transpose, - no_transpose, -} transpose_indicator; - -static INLINE void fhalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl, - __m128i *br, transpose_indicator t) { - __m128i tmpl[16], tmpr[16]; - int i; - - // Copy the bottom half of the input to temporary storage - for (i = 0; i < 16; ++i) { - tmpl[i] = bl[i]; - tmpr[i] = br[i]; - } - - // Generate the bottom half of the output - for (i = 0; i < 16; ++i) { - bl[i] = _mm_slli_epi16(tl[i], 2); - br[i] = _mm_slli_epi16(tr[i], 2); - } - array_transpose_16x16(bl, br); - - // Copy the temporary storage back to the top half of the input - for (i = 0; i < 16; ++i) { - tl[i] = tmpl[i]; - tr[i] = tmpr[i]; - } - - // Generate the top half of the output - scale_sqrt2_8x16(tl); - 
scale_sqrt2_8x16(tr); - if (t == transpose) - fdct16t_sse2(tl, tr); - else - fdct16_sse2(tl, tr); -} - -// Note on data layout, for both this and the 32x16 transforms: -// So that we can reuse the 16-element transforms easily, -// we want to split the input into 8x16 blocks. -// For 16x32, this means the input is a 2x2 grid of such blocks. -// For 32x16, it means the input is a 4x1 grid. -void av1_fht16x32_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i intl[16], intr[16], inbl[16], inbr[16]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - switch (tx_type) { - case DCT_DCT: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); - fdct16t_sse2(intl, intr); - fdct16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fdct32t_16col(intl, intr, inbl, inbr); - break; - case ADST_DCT: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); - fdct16t_sse2(intl, intr); - fdct16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fhalfright32_16col(intl, intr, inbl, inbr, transpose); - break; - case DCT_ADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); - fadst16t_sse2(intl, intr); - fadst16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fdct32t_16col(intl, intr, inbl, inbr); - break; - case ADST_ADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); - fadst16t_sse2(intl, intr); - fadst16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fhalfright32_16col(intl, intr, inbl, inbr, transpose); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0); - fdct16t_sse2(intl, intr); - fdct16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); 
- fhalfright32_16col(intl, intr, inbl, inbr, transpose); - break; - case DCT_FLIPADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1); - fadst16t_sse2(intl, intr); - fadst16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fdct32t_16col(intl, intr, inbl, inbr); - break; - case FLIPADST_FLIPADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 1); - fadst16t_sse2(intl, intr); - fadst16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fhalfright32_16col(intl, intr, inbl, inbr, transpose); - break; - case ADST_FLIPADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1); - fadst16t_sse2(intl, intr); - fadst16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fhalfright32_16col(intl, intr, inbl, inbr, transpose); - break; - case FLIPADST_ADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0); - fadst16t_sse2(intl, intr); - fadst16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fhalfright32_16col(intl, intr, inbl, inbr, transpose); - break; - case IDTX: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); - fidtx16_sse2(intl, intr); - fidtx16_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fidtx32_16col(intl, intr, inbl, inbr); - break; - case V_DCT: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); - fidtx16_sse2(intl, intr); - fidtx16_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fdct32t_16col(intl, intr, inbl, inbr); - break; - case H_DCT: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); - fdct16t_sse2(intl, intr); - fdct16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fidtx32_16col(intl, intr, inbl, inbr); - break; - case V_ADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, 
stride, 0, 0); - fidtx16_sse2(intl, intr); - fidtx16_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fhalfright32_16col(intl, intr, inbl, inbr, transpose); - break; - case H_ADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); - fadst16t_sse2(intl, intr); - fadst16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fidtx32_16col(intl, intr, inbl, inbr); - break; - case V_FLIPADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0); - fidtx16_sse2(intl, intr); - fidtx16_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fhalfright32_16col(intl, intr, inbl, inbr, transpose); - break; - case H_FLIPADST: - load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1); - fadst16t_sse2(intl, intr); - fadst16t_sse2(inbl, inbr); - round_signed_16x16(intl, intr); - round_signed_16x16(inbl, inbr); - fidtx32_16col(intl, intr, inbl, inbr); - break; -#endif - default: assert(0); break; - } - write_buffer_16x32(output, intl, intr, inbl, inbr); -} - -static INLINE void load_buffer_32x16(const int16_t *input, __m128i *in0, - __m128i *in1, __m128i *in2, __m128i *in3, - int stride, int flipud, int fliplr) { - int i; - if (flipud) { - input += 15 * stride; - stride = -stride; - } - - for (i = 0; i < 16; ++i) { - in0[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2); - in1[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2); - in2[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 16)), 2); - in3[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 24)), 2); - } - - if (fliplr) { - for (i = 0; i < 16; ++i) { - __m128i tmp1 = in0[i]; - __m128i tmp2 = in1[i]; - in0[i] = mm_reverse_epi16(in3[i]); - in1[i] = mm_reverse_epi16(in2[i]); - in2[i] = mm_reverse_epi16(tmp2); - in3[i] = mm_reverse_epi16(tmp1); - } - } - - 
scale_sqrt2_8x16(in0); - scale_sqrt2_8x16(in1); - scale_sqrt2_8x16(in2); - scale_sqrt2_8x16(in3); -} - -static INLINE void write_buffer_32x16(tran_low_t *output, __m128i *res0, - __m128i *res1, __m128i *res2, - __m128i *res3) { - int i; - for (i = 0; i < 16; ++i) { - store_output(&res0[i], output + i * 32 + 0); - store_output(&res1[i], output + i * 32 + 8); - store_output(&res2[i], output + i * 32 + 16); - store_output(&res3[i], output + i * 32 + 24); - } -} - -void av1_fht32x16_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in0[16], in1[16], in2[16], in3[16]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); -#endif - - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0); - switch (tx_type) { - case DCT_DCT: - fdct16_sse2(in0, in1); - fdct16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fdct32_16col(in0, in1, in2, in3); - break; - case ADST_DCT: - fadst16_sse2(in0, in1); - fadst16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fdct32_16col(in0, in1, in2, in3); - break; - case DCT_ADST: - fdct16_sse2(in0, in1); - fdct16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fhalfright32_16col(in0, in1, in2, in3, no_transpose); - break; - case ADST_ADST: - fadst16_sse2(in0, in1); - fadst16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fhalfright32_16col(in0, in1, in2, in3, no_transpose); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0); - fadst16_sse2(in0, in1); - fadst16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fdct32_16col(in0, in1, in2, in3); - break; - case DCT_FLIPADST: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1); - fdct16_sse2(in0, in1); - fdct16_sse2(in2, in3); - 
round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fhalfright32_16col(in0, in1, in2, in3, no_transpose); - break; - case FLIPADST_FLIPADST: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 1); - fadst16_sse2(in0, in1); - fadst16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fhalfright32_16col(in0, in1, in2, in3, no_transpose); - break; - case ADST_FLIPADST: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1); - fadst16_sse2(in0, in1); - fadst16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fhalfright32_16col(in0, in1, in2, in3, no_transpose); - break; - case FLIPADST_ADST: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0); - fadst16_sse2(in0, in1); - fadst16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fhalfright32_16col(in0, in1, in2, in3, no_transpose); - break; - case IDTX: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0); - fidtx16_sse2(in0, in1); - fidtx16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fidtx32_16col(in0, in1, in2, in3); - break; - case V_DCT: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0); - fdct16_sse2(in0, in1); - fdct16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fidtx32_16col(in0, in1, in2, in3); - break; - case H_DCT: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0); - fidtx16_sse2(in0, in1); - fidtx16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fdct32_16col(in0, in1, in2, in3); - break; - case V_ADST: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0); - fadst16_sse2(in0, in1); - fadst16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fidtx32_16col(in0, in1, in2, in3); - break; - case H_ADST: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0); - fidtx16_sse2(in0, in1); - fidtx16_sse2(in2, 
in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fhalfright32_16col(in0, in1, in2, in3, no_transpose); - break; - case V_FLIPADST: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0); - fadst16_sse2(in0, in1); - fadst16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fidtx32_16col(in0, in1, in2, in3); - break; - case H_FLIPADST: - load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1); - fidtx16_sse2(in0, in1); - fidtx16_sse2(in2, in3); - round_signed_16x16(in0, in1); - round_signed_16x16(in2, in3); - fhalfright32_16col(in0, in1, in2, in3, no_transpose); - break; -#endif - default: assert(0); break; - } - write_buffer_32x16(output, in0, in1, in2, in3); -} - -// Note: -// 32x32 hybrid fwd txfm -// 4x2 grids of 8x16 block. Each block is represented by __m128i in[16] -static INLINE void load_buffer_32x32(const int16_t *input, - __m128i *in0 /*in0[32]*/, - __m128i *in1 /*in1[32]*/, - __m128i *in2 /*in2[32]*/, - __m128i *in3 /*in3[32]*/, int stride, - int flipud, int fliplr) { - if (flipud) { - input += 31 * stride; - stride = -stride; - } - - int i; - for (i = 0; i < 32; ++i) { - in0[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2); - in1[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2); - in2[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 16)), 2); - in3[i] = _mm_slli_epi16( - _mm_load_si128((const __m128i *)(input + i * stride + 24)), 2); - } - - if (fliplr) { - for (i = 0; i < 32; ++i) { - __m128i tmp1 = in0[i]; - __m128i tmp2 = in1[i]; - in0[i] = mm_reverse_epi16(in3[i]); - in1[i] = mm_reverse_epi16(in2[i]); - in2[i] = mm_reverse_epi16(tmp2); - in3[i] = mm_reverse_epi16(tmp1); - } - } -} - -static INLINE void swap_16x16(__m128i *b0l /*b0l[16]*/, - __m128i *b0r /*b0r[16]*/, - __m128i *b1l /*b1l[16]*/, - __m128i *b1r /*b1r[16]*/) { - int i; - for (i = 0; i < 16; ++i) { - __m128i tmp0 = b1l[i]; - 
__m128i tmp1 = b1r[i]; - b1l[i] = b0l[i]; - b1r[i] = b0r[i]; - b0l[i] = tmp0; - b0r[i] = tmp1; - } -} - -static INLINE void fdct32(__m128i *in0, __m128i *in1, __m128i *in2, - __m128i *in3) { - fdct32_8col(in0, &in0[16]); - fdct32_8col(in1, &in1[16]); - fdct32_8col(in2, &in2[16]); - fdct32_8col(in3, &in3[16]); - - array_transpose_16x16(in0, in1); - array_transpose_16x16(&in0[16], &in1[16]); - array_transpose_16x16(in2, in3); - array_transpose_16x16(&in2[16], &in3[16]); - - swap_16x16(&in0[16], &in1[16], in2, in3); -} - -static INLINE void fhalfright32(__m128i *in0, __m128i *in1, __m128i *in2, - __m128i *in3) { - fhalfright32_16col(in0, in1, &in0[16], &in1[16], no_transpose); - fhalfright32_16col(in2, in3, &in2[16], &in3[16], no_transpose); - swap_16x16(&in0[16], &in1[16], in2, in3); -} - -#if CONFIG_EXT_TX -static INLINE void fidtx32(__m128i *in0, __m128i *in1, __m128i *in2, - __m128i *in3) { - fidtx32_16col(in0, in1, &in0[16], &in1[16]); - fidtx32_16col(in2, in3, &in2[16], &in3[16]); - swap_16x16(&in0[16], &in1[16], in2, in3); -} -#endif - -static INLINE void round_signed_32x32(__m128i *in0, __m128i *in1, __m128i *in2, - __m128i *in3) { - round_signed_16x16(in0, in1); - round_signed_16x16(&in0[16], &in1[16]); - round_signed_16x16(in2, in3); - round_signed_16x16(&in2[16], &in3[16]); -} - -static INLINE void write_buffer_32x32(__m128i *in0, __m128i *in1, __m128i *in2, - __m128i *in3, tran_low_t *output) { - int i; - for (i = 0; i < 32; ++i) { - store_output(&in0[i], output + i * 32 + 0); - store_output(&in1[i], output + i * 32 + 8); - store_output(&in2[i], output + i * 32 + 16); - store_output(&in3[i], output + i * 32 + 24); - } -} - -void av1_fht32x32_sse2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m128i in0[32], in1[32], in2[32], in3[32]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "No 32x32 sse2 MRC_DCT implementation"); -#endif - - load_buffer_32x32(input, in0, in1, 
in2, in3, stride, 0, 0); - switch (tx_type) { - case DCT_DCT: - fdct32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fdct32(in0, in1, in2, in3); - break; - case ADST_DCT: - fhalfright32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fdct32(in0, in1, in2, in3); - break; - case DCT_ADST: - fdct32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fhalfright32(in0, in1, in2, in3); - break; - case ADST_ADST: - fhalfright32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fhalfright32(in0, in1, in2, in3); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0); - fhalfright32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fdct32(in0, in1, in2, in3); - break; - case DCT_FLIPADST: - load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1); - fdct32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fhalfright32(in0, in1, in2, in3); - break; - case FLIPADST_FLIPADST: - load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 1); - fhalfright32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fhalfright32(in0, in1, in2, in3); - break; - case ADST_FLIPADST: - load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1); - fhalfright32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fhalfright32(in0, in1, in2, in3); - break; - case FLIPADST_ADST: - load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0); - fhalfright32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fhalfright32(in0, in1, in2, in3); - break; - case IDTX: - fidtx32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fidtx32(in0, in1, in2, in3); - break; - case V_DCT: - fdct32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fidtx32(in0, in1, in2, in3); - break; - case H_DCT: - fidtx32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fdct32(in0, in1, in2, in3); - 
break; - case V_ADST: - fhalfright32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fidtx32(in0, in1, in2, in3); - break; - case H_ADST: - fidtx32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fhalfright32(in0, in1, in2, in3); - break; - case V_FLIPADST: - load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0); - fhalfright32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fidtx32(in0, in1, in2, in3); - break; - case H_FLIPADST: - load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1); - fidtx32(in0, in1, in2, in3); - round_signed_32x32(in0, in1, in2, in3); - fhalfright32(in0, in1, in2, in3); - break; -#endif - default: assert(0); - } - write_buffer_32x32(in0, in1, in2, in3, output); -} diff --git a/third_party/aom/av1/encoder/x86/dct_sse2.asm b/third_party/aom/av1/encoder/x86/dct_sse2.asm index a99db3d6e..b18554818 100644 --- a/third_party/aom/av1/encoder/x86/dct_sse2.asm +++ b/third_party/aom/av1/encoder/x86/dct_sse2.asm @@ -63,7 +63,6 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride psllw m0, 2 psllw m1, 2 -%if CONFIG_HIGHBITDEPTH ; sign extension mova m2, m0 mova m3, m1 @@ -79,9 +78,5 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride mova [outputq + 16], m2 mova [outputq + 32], m1 mova [outputq + 48], m3 -%else - mova [outputq], m0 - mova [outputq + 16], m1 -%endif RET diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse2.c b/third_party/aom/av1/encoder/x86/encodetxb_sse2.c new file mode 100644 index 000000000..dedb4d02f --- /dev/null +++ b/third_party/aom/av1/encoder/x86/encodetxb_sse2.c @@ -0,0 +1,505 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include // SSE2 + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/mem_sse2.h" +#include "av1/common/onyxc_int.h" +#include "av1/common/txb_common.h" + +static INLINE void load_levels_4x4x5_sse2(const uint8_t *const src, + const int stride, + const ptrdiff_t *const offsets, + __m128i *const level) { + level[0] = load_8bit_4x4_to_1_reg_sse2(src + 1, stride); + level[1] = load_8bit_4x4_to_1_reg_sse2(src + stride, stride); + level[2] = load_8bit_4x4_to_1_reg_sse2(src + offsets[0], stride); + level[3] = load_8bit_4x4_to_1_reg_sse2(src + offsets[1], stride); + level[4] = load_8bit_4x4_to_1_reg_sse2(src + offsets[2], stride); +} + +static INLINE void load_levels_8x2x5_sse2(const uint8_t *const src, + const int stride, + const ptrdiff_t *const offsets, + __m128i *const level) { + level[0] = load_8bit_8x2_to_1_reg_sse2(src + 1, stride); + level[1] = load_8bit_8x2_to_1_reg_sse2(src + stride, stride); + level[2] = load_8bit_8x2_to_1_reg_sse2(src + offsets[0], stride); + level[3] = load_8bit_8x2_to_1_reg_sse2(src + offsets[1], stride); + level[4] = load_8bit_8x2_to_1_reg_sse2(src + offsets[2], stride); +} + +static INLINE void load_levels_16x1x5_sse2(const uint8_t *const src, + const int stride, + const ptrdiff_t *const offsets, + __m128i *const level) { + level[0] = _mm_loadu_si128((__m128i *)(src + 1)); + level[1] = _mm_loadu_si128((__m128i *)(src + stride)); + level[2] = _mm_loadu_si128((__m128i *)(src + offsets[0])); + level[3] = _mm_loadu_si128((__m128i *)(src + offsets[1])); + level[4] = _mm_loadu_si128((__m128i *)(src + offsets[2])); +} + +static INLINE __m128i get_coeff_contexts_kernel_sse2(__m128i *const level) { + const __m128i const_3 = _mm_set1_epi8(3); + const __m128i const_4 = _mm_set1_epi8(4); + __m128i count; + + count = _mm_min_epu8(level[0], const_3); + level[1] = 
_mm_min_epu8(level[1], const_3); + level[2] = _mm_min_epu8(level[2], const_3); + level[3] = _mm_min_epu8(level[3], const_3); + level[4] = _mm_min_epu8(level[4], const_3); + count = _mm_add_epi8(count, level[1]); + count = _mm_add_epi8(count, level[2]); + count = _mm_add_epi8(count, level[3]); + count = _mm_add_epi8(count, level[4]); + count = _mm_avg_epu8(count, _mm_setzero_si128()); + count = _mm_min_epu8(count, const_4); + return count; +} + +static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *const coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const __m128i pos_to_offset_large = _mm_set1_epi8(21); + __m128i pos_to_offset = + (height == 4) + ? _mm_setr_epi8(0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21) + : _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21, + 21, 21); + __m128i count; + __m128i level[5]; + int8_t *cc = coeff_contexts; + int row = height; + + assert(!(height % 4)); + + do { + load_levels_4x4x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)cc, count); + pos_to_offset = pos_to_offset_large; + levels += 4 * stride; + cc += 16; + row -= 4; + } while (row); + + coeff_contexts[0] = 0; +} + +static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, 
SIG_COEF_CONTEXTS_2D + 10); + __m128i count; + __m128i level[5]; + int row = height; + + assert(!(height % 4)); + + do { + load_levels_4x4x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + levels += 4 * stride; + coeff_contexts += 16; + row -= 4; + } while (row); +} + +static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10); + __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + __m128i count; + __m128i level[5]; + int row = height; + + assert(!(height % 4)); + + do { + load_levels_4x4x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 4 * stride; + coeff_contexts += 16; + row -= 4; + } while (row); +} + +static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + int8_t *cc = coeff_contexts; + int row = height; + __m128i count; + __m128i level[5]; + __m128i pos_to_offset[3]; + + assert(!(height % 2)); + + if (height == 8) { + pos_to_offset[0] = + _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 1, 
6, 6, 21, 21, 21, 21, 21); + pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, + 21, 21, 21, 21, 21); + } else if (height < 8) { + pos_to_offset[0] = _mm_setr_epi8(0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21, + 21, 21, 21, 21); + pos_to_offset[1] = _mm_setr_epi8(16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21, + 21, 21, 21, 21, 21); + } else { + pos_to_offset[0] = _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11); + pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, + 21, 21, 21, 21, 21); + } + pos_to_offset[2] = _mm_set1_epi8(21); + + do { + load_levels_8x2x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset[0]); + _mm_store_si128((__m128i *)cc, count); + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + levels += 2 * stride; + cc += 16; + row -= 2; + } while (row); + + coeff_contexts[0] = 0; +} + +static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + const __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + int row = height; + __m128i count; + __m128i level[5]; + + assert(!(height % 2)); + + do { + load_levels_8x2x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + levels += 2 * stride; + 
coeff_contexts += 16; + row -= 2; + } while (row); +}
+
+// Writes coefficient contexts for an 8-wide block of the vertical class.
+// Each loop iteration handles two rows: 16 context bytes are stored and
+// `levels` advances by two strides. `height` must be even (asserted).
+// On the first iteration the first 8 lanes get SIG_COEF_CONTEXTS_2D + 0 and
+// the last 8 lanes SIG_COEF_CONTEXTS_2D + 5; every later iteration adds
+// SIG_COEF_CONTEXTS_2D + 10 to all lanes.
+// NOTE(review): load_levels_8x2x5_sse2() and get_coeff_contexts_kernel_sse2()
+// are defined outside this excerpt; presumably they gather the five neighbour
+// levels selected by `offsets` and reduce them to a per-lane count - confirm.
+static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels,
+                                            const int height,
+                                            const ptrdiff_t *const offsets,
+                                            int8_t *coeff_contexts) {
+  const int stride = 8 + TX_PAD_HOR;
+  const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+  __m128i pos_to_offset =
+      _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+                    SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+                    SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+                    SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+                    SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+                    SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+                    SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+                    SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5);
+  int row = height;
+  __m128i count;
+  __m128i level[5];
+
+  assert(!(height % 2));
+
+  do {
+    load_levels_8x2x5_sse2(levels, stride, offsets, level);
+    count = get_coeff_contexts_kernel_sse2(level);
+    count = _mm_add_epi8(count, pos_to_offset);
+    // Aligned store: coeff_contexts must stay 16-byte aligned.
+    _mm_store_si128((__m128i *)coeff_contexts, count);
+    // Rows 2 and beyond all use the "large" offset.
+    pos_to_offset = pos_to_offset_large;
+    levels += 2 * stride;
+    coeff_contexts += 16;
+    row -= 2;
+  } while (row);
+}
+
+// Writes coefficient contexts for a 2D-class block whose width is a multiple
+// of 16 (asserted), one row at a time in 16-column chunks.
+// The per-position offset tables are chosen from the relation between
+// real_width and real_height (square, wider than tall, taller than wide).
+// Within a row, the first 16-column chunk uses pos_to_offset[0] and all
+// following chunks use pos_to_offset_large[0]. After each row both tables
+// are shifted up by one entry, so rows past the last table entry keep
+// reusing it. The context of position 0 is forced to 0 at the end.
+static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels,
+                                             const int real_width,
+                                             const int real_height,
+                                             const int width, const int height,
+                                             const ptrdiff_t *const offsets,
+                                             int8_t *coeff_contexts) {
+  const int stride = width + TX_PAD_HOR;
+  int8_t *cc = coeff_contexts;
+  int row = height;
+  __m128i pos_to_offset[5];
+  __m128i pos_to_offset_large[3];
+  __m128i count;
+  __m128i level[5];
+
+  assert(!(width % 16));
+
+  pos_to_offset_large[2] = _mm_set1_epi8(21);
+  if (real_width == real_height) {
+    pos_to_offset[0] = _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+                                     21, 21, 21, 21);
+    pos_to_offset[1] = _mm_setr_epi8(1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+                                     21, 21, 21, 21, 21);
+    pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+                                     21, 21, 21, 21, 21);
+    pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+                                     21, 21, 21, 21, 21);
+    pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] =
+        pos_to_offset_large[2];
+  } else if (real_width > real_height) {
+    pos_to_offset[0] = _mm_setr_epi8(0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21,
+                                     21, 21, 21, 21, 21);
+    pos_to_offset[1] = _mm_setr_epi8(16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+                                     21, 21, 21, 21, 21);
+    pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] = _mm_setr_epi8(
+        16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
+    pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
+  } else {  // real_width < real_height
+    pos_to_offset[0] = pos_to_offset[1] = _mm_setr_epi8(
+        11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11);
+    pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+                                     21, 21, 21, 21, 21);
+    pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+                                     21, 21, 21, 21, 21);
+    pos_to_offset[4] = pos_to_offset_large[2];
+    pos_to_offset_large[0] = pos_to_offset_large[1] = _mm_set1_epi8(11);
+  }
+
+  do {
+    int w = width;
+
+    do {
+      load_levels_16x1x5_sse2(levels, stride, offsets, level);
+      count = get_coeff_contexts_kernel_sse2(level);
+      count = _mm_add_epi8(count, pos_to_offset[0]);
+      _mm_store_si128((__m128i *)cc, count);
+      levels += 16;
+      cc += 16;
+      w -= 16;
+      // Columns 16 and beyond of this row use the "large" table.
+      pos_to_offset[0] = pos_to_offset_large[0];
+    } while (w);
+
+    // Shift both tables up by one entry for the next row. Slot 0 was
+    // overwritten inside the chunk loop, so it is reloaded from slot 1.
+    pos_to_offset[0] = pos_to_offset[1];
+    pos_to_offset[1] = pos_to_offset[2];
+    pos_to_offset[2] = pos_to_offset[3];
+    pos_to_offset[3] = pos_to_offset[4];
+    pos_to_offset_large[0] = pos_to_offset_large[1];
+    pos_to_offset_large[1] = pos_to_offset_large[2];
+    // Skip the horizontal padding at the end of the levels row.
+    levels += TX_PAD_HOR;
+  } while (--row);
+
+  coeff_contexts[0] = 0;
+}
+
+// Writes coefficient contexts for a horizontal-class block whose width is a
+// multiple of 16 (asserted). Every row restarts with the same offsets:
+// column 0 gets SIG_COEF_CONTEXTS_2D + 0, column 1 gets + 5 and all remaining
+// columns (including later 16-column chunks) get + 10.
+static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels,
+                                              const int width, const int height,
+                                              const ptrdiff_t *const offsets,
+                                              int8_t *coeff_contexts) {
+  const int stride = width + TX_PAD_HOR;
+  const __m128i pos_to_offset_large =
+      _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+  __m128i count;
+  __m128i level[5];
+  int row = height;
+
+  assert(!(width % 16));
+
+  do {
+    // Offsets for the first 16 columns of the row.
+    __m128i pos_to_offset =
+        _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+                      SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                      SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                      SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                      SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                      SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                      SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                      SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+    int w = width;
+
+    do {
+      load_levels_16x1x5_sse2(levels, stride, offsets, level);
+      count = get_coeff_contexts_kernel_sse2(level);
+      count = _mm_add_epi8(count, pos_to_offset);
+      _mm_store_si128((__m128i *)coeff_contexts, count);
+      pos_to_offset = pos_to_offset_large;
+      levels += 16;
+      coeff_contexts += 16;
+      w -= 16;
+    } while (w);
+
+    // Skip the horizontal padding at the end of the levels row.
+    levels += TX_PAD_HOR;
+  } while (--row);
+}
+
+// Writes coefficient contexts for a vertical-class block whose width is a
+// multiple of 16 (asserted). The offset is uniform across a row:
+// row 0 adds SIG_COEF_CONTEXTS_2D + 0, row 1 adds + 5, rows 2 and beyond + 10.
+static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels,
+                                              const int width, const int height,
+                                              const ptrdiff_t *const offsets,
+                                              int8_t *coeff_contexts) {
+  const int stride = width + TX_PAD_HOR;
+  __m128i pos_to_offset[3];
+  __m128i count;
+  __m128i level[5];
+  int row = height;
+
+  assert(!(width % 16));
+
+  pos_to_offset[0] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 0);
+  pos_to_offset[1] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 5);
+  pos_to_offset[2] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+
+  do {
+    int w = width;
+
+    do {
+      load_levels_16x1x5_sse2(levels, stride, offsets, level);
+      count = get_coeff_contexts_kernel_sse2(level);
+      count = _mm_add_epi8(count, pos_to_offset[0]);
+      _mm_store_si128((__m128i *)coeff_contexts, count);
+      levels += 16;
+      coeff_contexts += 16;
+      w -= 16;
+    } while (w);
+
+    // Advance to the next row's offset; slot 2 is never overwritten, so it
+    // is what every row from the third onwards ends up using.
+    pos_to_offset[0] = pos_to_offset[1];
+    pos_to_offset[1] = pos_to_offset[2];
+    levels += TX_PAD_HOR;
+  } while (--row);
+}
+
+// Computes the coefficient context for every position of a transform block
+// and stores the result in coeff_contexts[].
+//   levels          padded level buffer (row stride is width + TX_PAD_HOR)
+//   scan            scan order; only scan[eob - 1] is read here
+//   eob             end-of-block count; with eob == 1 only coeff_contexts[0]
+//                   is written (as 0)
+//   tx_size         selects block dimensions via tx_size_wide/high and
+//                   get_txb_wide/high/bwl
+//   tx_class        TX_CLASS_2D, TX_CLASS_HORIZ, or (any other value)
+//                   treated as TX_CLASS_VERT
+//   coeff_contexts  output, must be 16-byte aligned (asserted)
+// After the per-class kernel has run, the context at the last scan position
+// is overwritten with 1, 2 or 3 depending on how last_idx compares with 1/8
+// and 1/4 of (height << bwl).
+// NOTE(review): eob == 0 is not handled (last_idx would be -1 and
+// scan[last_idx] out of bounds); callers presumably guarantee eob >= 1.
+// Note: levels[] must be in the range [0, 127], inclusive.
+void av1_get_nz_map_contexts_sse2(const uint8_t *const levels,
+                                  const int16_t *const scan, const uint16_t eob,
+                                  const TX_SIZE tx_size,
+                                  const TX_CLASS tx_class,
+                                  int8_t *const coeff_contexts) {
+  const int last_idx = eob - 1;
+  if (!last_idx) {
+    coeff_contexts[0] = 0;
+    return;
+  }
+
+  const int real_width = tx_size_wide[tx_size];
+  const int real_height = tx_size_high[tx_size];
+  const int width = get_txb_wide(tx_size);
+  const int height = get_txb_high(tx_size);
+  const int stride = width + TX_PAD_HOR;
+  ptrdiff_t offsets[3];
+
+  /* coeff_contexts must be 16 byte aligned. */
+  assert(!((intptr_t)coeff_contexts & 0xf));
+
+  if (tx_class == TX_CLASS_2D) {
+    // Neighbour offsets passed to the level loaders: (row 0, col 2),
+    // (row 1, col 1) and (row 2, col 0).
+    offsets[0] = 0 * stride + 2;
+    offsets[1] = 1 * stride + 1;
+    offsets[2] = 2 * stride + 0;
+
+    if (width == 4) {
+      get_4_nz_map_contexts_2d(levels, height, offsets, coeff_contexts);
+    } else if (width == 8) {
+      get_8_coeff_contexts_2d(levels, height, offsets, coeff_contexts);
+    } else if (width == 16) {
+      // NOTE(review): this branch and the else branch make the same call;
+      // presumably kept apart so the inlined helper can be specialised for
+      // width == 16 - confirm before merging them.
+      get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
+                                offsets, coeff_contexts);
+    } else {
+      get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
+                                offsets, coeff_contexts);
+    }
+  } else if (tx_class == TX_CLASS_HORIZ) {
+    // Neighbour offsets 2, 3 and 4 columns to the right in the same row.
+    offsets[0] = 2;
+    offsets[1] = 3;
+    offsets[2] = 4;
+    if (width == 4) {
+      get_4_nz_map_contexts_hor(levels, height, offsets, coeff_contexts);
+    } else if (width == 8) {
+      get_8_coeff_contexts_hor(levels, height, offsets, coeff_contexts);
+    } else {
+      get_16n_coeff_contexts_hor(levels, width, height, offsets,
+                                 coeff_contexts);
+    }
+  } else {  // TX_CLASS_VERT
+    // Neighbour offsets 2, 3 and 4 rows below in the same column.
+    offsets[0] = 2 * stride;
+    offsets[1] = 3 * stride;
+    offsets[2] = 4 * stride;
+    if (width == 4) {
+      get_4_nz_map_contexts_ver(levels, height, offsets, coeff_contexts);
+    } else if (width == 8) {
+      get_8_coeff_contexts_ver(levels, height, offsets, coeff_contexts);
+    } else {
+      get_16n_coeff_contexts_ver(levels, width, height, offsets,
+                                 coeff_contexts);
+    }
+  }
+
+  // The last coefficient in scan order gets its own context (1..3), chosen
+  // by how far into the block (height << bwl) the end of block lies.
+  const int bwl = get_txb_bwl(tx_size);
+  const int pos = scan[last_idx];
+  if (last_idx <= (height << bwl) / 8)
+    coeff_contexts[pos] = 1;
+  else if (last_idx <= (height << bwl) / 4)
+    coeff_contexts[pos] = 2;
+  else
+    coeff_contexts[pos] = 3;
+}
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse4.c b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c new file mode 100644 index 000000000..b3a879b0f --- /dev/null +++ b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2017, Alliance for Open Media.
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// NOTE(review): the system header names below were stripped from this copy
+// of the patch; they are restored from the trailing comments and from what
+// the code uses (memset, SSE2/SSE4.1 intrinsics) - confirm against upstream.
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include <smmintrin.h>  /* SSE4.1 */
+#include <string.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+
+// Fills the padded `levels` buffer with the magnitudes of `coeff`.
+//   coeff   width * height coefficients, row-major; read with aligned
+//           16-byte loads, so it must be 16-byte aligned
+//   width   block width in coefficients (4, 8, or a multiple of 16)
+//   height  block height in rows (must be even when width == 4, because two
+//           rows are processed per iteration)
+//   levels  points at the first level of the block inside a buffer that has
+//           TX_PAD_TOP rows before it and TX_PAD_BOTTOM rows plus TX_PAD_END
+//           bytes after it; the row stride is width + TX_PAD_HOR
+// Each coefficient is narrowed 32 -> 16 bits with signed saturation, made
+// absolute, then narrowed 16 -> 8 bits with signed saturation. The top and
+// bottom padding and the horizontal padding of every row are set to zero.
+void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width,
+                                const int height, uint8_t *const levels) {
+  const int stride = width + TX_PAD_HOR;
+  memset(levels - TX_PAD_TOP * stride, 0,
+         sizeof(*levels) * TX_PAD_TOP * stride);
+  memset(levels + stride * height, 0,
+         sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
+
+  const __m128i zeros = _mm_setzero_si128();
+  int i = 0;
+  uint8_t *ls = levels;
+  const tran_low_t *cf = coeff;
+  if (width == 4) {
+    // Two rows per iteration. After the final unpack the register holds
+    // [row A levels, 4 zero bytes, row B levels, 4 zero bytes], which is
+    // exactly two output rows only if stride == 8.
+    // NOTE(review): assumes TX_PAD_HOR == 4 - confirm in txb_common.h.
+    do {
+      const __m128i coeffA = _mm_load_si128((const __m128i *)(cf));
+      const __m128i coeffB = _mm_load_si128((const __m128i *)(cf + width));
+      const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+      const __m128i absAB = _mm_abs_epi16(coeffAB);
+      const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
+      const __m128i lsAB = _mm_unpacklo_epi32(absAB8, zeros);
+      _mm_storeu_si128((__m128i *)ls, lsAB);
+      ls += (stride << 1);
+      cf += (width << 1);
+      i += 2;
+    } while (i < height);
+  } else if (width == 8) {
+    // One row per iteration: 8 level bytes followed by 8 zero bytes. The
+    // zeros cover the row padding; any bytes that spill into the next row
+    // are rewritten by the next iteration (or are bottom padding, which is
+    // zero anyway).
+    do {
+      const __m128i coeffA = _mm_load_si128((const __m128i *)(cf));
+      const __m128i coeffB = _mm_load_si128((const __m128i *)(cf + 4));
+      const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+      const __m128i absAB = _mm_abs_epi16(coeffAB);
+      const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
+      _mm_storeu_si128((__m128i *)ls, absAB8);
+      ls += stride;
+      cf += width;
+      i += 1;
+    } while (i < height);
+  } else {
+    // Widths that are a multiple of 16: 16 coefficients per inner iteration.
+    do {
+      int j = 0;
+      do {
+        const __m128i coeffA = _mm_load_si128((const __m128i *)(cf));
+        const __m128i coeffB = _mm_load_si128((const __m128i *)(cf + 4));
+        const __m128i coeffC = _mm_load_si128((const __m128i *)(cf + 8));
+        const __m128i coeffD = _mm_load_si128((const __m128i *)(cf + 12));
+        const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+        const __m128i coeffCD = _mm_packs_epi32(coeffC, coeffD);
+        const __m128i absAB = _mm_abs_epi16(coeffAB);
+        const __m128i absCD = _mm_abs_epi16(coeffCD);
+        const __m128i absABCD = _mm_packs_epi16(absAB, absCD);
+        _mm_storeu_si128((__m128i *)(ls + j), absABCD);
+        j += 16;
+        cf += 16;
+      } while (j < width);
+      // Zero the four padding bytes after the row. memset replaces the
+      // former *(int32_t *) store, which wrote through an incompatible
+      // pointer type at a possibly misaligned address.
+      memset(ls + width, 0, sizeof(int32_t));
+      ls += stride;
+      i += 1;
+    } while (i < height);
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c index 6599630d0..7d4f69585 100644 --- a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c +++ b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c @@ -11,7 +11,8 @@ #include // AVX2 -#include "./av1_rtcd.h" +#include "config/av1_rtcd.h" + #include "aom/aom_integer.h" static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset, diff --git a/third_party/aom/av1/encoder/x86/error_sse2.asm b/third_party/aom/av1/encoder/x86/error_sse2.asm index 4680f1fab..72e9e22b1 100644 --- a/third_party/aom/av1/encoder/x86/error_sse2.asm +++ b/third_party/aom/av1/encoder/x86/error_sse2.asm @@ -77,49 +77,3 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz movd edx, m5 %endif RET - -; Compute the sum of squared difference between two int16_t vectors.
-; int64_t av1_block_error_fp(int16_t *coeff, int16_t *dqcoeff, -; intptr_t block_size) - -INIT_XMM sse2 -cglobal block_error_fp, 3, 3, 6, uqc, dqc, size - pxor m4, m4 ; sse accumulator - pxor m5, m5 ; dedicated zero register - lea uqcq, [uqcq+sizeq*2] - lea dqcq, [dqcq+sizeq*2] - neg sizeq -.loop: - mova m2, [uqcq+sizeq*2] - mova m0, [dqcq+sizeq*2] - mova m3, [uqcq+sizeq*2+mmsize] - mova m1, [dqcq+sizeq*2+mmsize] - psubw m0, m2 - psubw m1, m3 - ; individual errors are max. 15bit+sign, so squares are 30bit, and - ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) - pmaddwd m0, m0 - pmaddwd m1, m1 - ; accumulate in 64bit - punpckldq m3, m0, m5 - punpckhdq m0, m5 - paddq m4, m3 - punpckldq m3, m1, m5 - paddq m4, m0 - punpckhdq m1, m5 - paddq m4, m3 - paddq m4, m1 - add sizeq, mmsize - jl .loop - - ; accumulate horizontally and store in return value - movhlps m5, m4 - paddq m4, m5 -%if ARCH_X86_64 - movq rax, m4 -%else - pshufd m5, m4, 0x1 - movd eax, m4 - movd edx, m5 -%endif - RET diff --git a/third_party/aom/av1/encoder/x86/hash_sse42.c b/third_party/aom/av1/encoder/x86/hash_sse42.c new file mode 100644 index 000000000..65fa46311 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/hash_sse42.c @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+// NOTE(review): the header names were stripped from this copy of the patch;
+// restored from usage (fixed-width integers, memcpy, SSE4.2 CRC intrinsics)
+// - confirm against upstream.
+#include <stdint.h>
+#include <string.h>
+#include <smmintrin.h>
+
+// Byte-boundary alignment issues
+#define ALIGN_SIZE 8
+#define ALIGN_MASK (ALIGN_SIZE - 1)
+
+// Consumes `len` in units of sizeof(type): feeds each unit to the CRC
+// instruction `op`, advances `buf` and decrements `len`. The unit is fetched
+// with memcpy so no const-discarding or type-punning pointer cast is needed.
+// `crc`, `buf` and `len` must be modifiable lvalues.
+#define CALC_CRC(op, crc, type, buf, len)     \
+  do {                                        \
+    while ((len) >= sizeof(type)) {           \
+      type crc_unit_;                         \
+      memcpy(&crc_unit_, (buf), sizeof(type)); \
+      (crc) = op((crc), crc_unit_);           \
+      (len) -= sizeof(type);                  \
+      (buf) += sizeof(type);                  \
+    }                                         \
+  } while (0)
+
+/**
+ * Calculates 32-bit CRC for the input buffer
+ * polynomial is 0x11EDC6F41
+ * @param crc_calculator unused
+ * @param p start of the input buffer (only read)
+ * @param len number of bytes to process; 0 yields a CRC of 0
+ * @return A 32-bit unsigned integer representing the CRC
+ */
+uint32_t av1_get_crc32c_value_sse4_2(void *crc_calculator, uint8_t *p,
+                                     size_t len) {
+  (void)crc_calculator;
+  const uint8_t *buf = p;
+  uint32_t crc = 0xFFFFFFFF;
+
+  // Align the input to the word boundary
+  for (; (len > 0) && ((uintptr_t)buf & ALIGN_MASK); len--, buf++) {
+    crc = _mm_crc32_u8(crc, *buf);
+  }
+
+  // Widest units first; each narrower step mops up what is left over.
+#ifdef __x86_64__
+  uint64_t crc64 = crc;
+  CALC_CRC(_mm_crc32_u64, crc64, uint64_t, buf, len);
+  crc = (uint32_t)crc64;
+#endif
+  CALC_CRC(_mm_crc32_u32, crc, uint32_t, buf, len);
+  CALC_CRC(_mm_crc32_u16, crc, uint16_t, buf, len);
+  CALC_CRC(_mm_crc32_u8, crc, uint8_t, buf, len);
+  return crc ^ 0xFFFFFFFF;
+}
diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c index b684f7a3a..4cd6371a6 100644 --- a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c +++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c @@ -11,11 +11,12 @@ #include #include /* SSE4.1 */ -#include "./av1_rtcd.h" -#include "./aom_config.h" -#include "av1/common/av1_fwd_txfm1d_cfg.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + #include "av1/common/av1_txfm.h" #include "av1/common/x86/highbd_txfm_utility_sse4.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" #include "aom_dsp/txfm_common.h" #include "aom_dsp/x86/txfm_common_sse2.h" #include "aom_ports/mem.h" @@ -121,72 +122,57 @@ static INLINE void write_buffer_4x4(__m128i *res, int32_t *output) { } static void fadst4x4_sse4_1(__m128i *in, int bit) { - const int32_t *cospi = cospi_arr(bit); -
const __m128i cospi8 = _mm_set1_epi32(cospi[8]); - const __m128i cospi56 = _mm_set1_epi32(cospi[56]); - const __m128i cospi40 = _mm_set1_epi32(cospi[40]); - const __m128i cospi24 = _mm_set1_epi32(cospi[24]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const int32_t *sinpi = sinpi_arr(bit); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const __m128i kZero = _mm_setzero_si128(); - __m128i s0, s1, s2, s3; + const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]); + const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]); + const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]); + const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]); + __m128i t; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + __m128i x0, x1, x2, x3; __m128i u0, u1, u2, u3; __m128i v0, v1, v2, v3; - // stage 0 - // stage 1 - // stage 2 - u0 = _mm_mullo_epi32(in[3], cospi8); - u1 = _mm_mullo_epi32(in[0], cospi56); - u2 = _mm_add_epi32(u0, u1); - s0 = _mm_add_epi32(u2, rnding); - s0 = _mm_srai_epi32(s0, bit); - - v0 = _mm_mullo_epi32(in[3], cospi56); - v1 = _mm_mullo_epi32(in[0], cospi8); - v2 = _mm_sub_epi32(v0, v1); - s1 = _mm_add_epi32(v2, rnding); - s1 = _mm_srai_epi32(s1, bit); - - u0 = _mm_mullo_epi32(in[1], cospi40); - u1 = _mm_mullo_epi32(in[2], cospi24); - u2 = _mm_add_epi32(u0, u1); - s2 = _mm_add_epi32(u2, rnding); - s2 = _mm_srai_epi32(s2, bit); - - v0 = _mm_mullo_epi32(in[1], cospi24); - v1 = _mm_mullo_epi32(in[2], cospi40); - v2 = _mm_sub_epi32(v0, v1); - s3 = _mm_add_epi32(v2, rnding); - s3 = _mm_srai_epi32(s3, bit); - - // stage 3 - u0 = _mm_add_epi32(s0, s2); - u2 = _mm_sub_epi32(s0, s2); - u1 = _mm_add_epi32(s1, s3); - u3 = _mm_sub_epi32(s1, s3); - - // stage 4 - v0 = _mm_mullo_epi32(u2, cospi32); - v1 = _mm_mullo_epi32(u3, cospi32); - v2 = _mm_add_epi32(v0, v1); - s2 = _mm_add_epi32(v2, rnding); - u2 = _mm_srai_epi32(s2, bit); + s0 = _mm_mullo_epi32(in[0], sinpi1); + s1 = _mm_mullo_epi32(in[0], sinpi4); + s2 = _mm_mullo_epi32(in[1], sinpi2); + s3 = _mm_mullo_epi32(in[1], 
sinpi1); + s4 = _mm_mullo_epi32(in[2], sinpi3); + s5 = _mm_mullo_epi32(in[3], sinpi4); + s6 = _mm_mullo_epi32(in[3], sinpi2); + t = _mm_add_epi32(in[0], in[1]); + s7 = _mm_sub_epi32(t, in[3]); + + t = _mm_add_epi32(s0, s2); + x0 = _mm_add_epi32(t, s5); + x1 = _mm_mullo_epi32(s7, sinpi3); + t = _mm_sub_epi32(s1, s3); + x2 = _mm_add_epi32(t, s6); + x3 = s4; + + s0 = _mm_add_epi32(x0, x3); + s1 = x1; + s2 = _mm_sub_epi32(x2, x3); + t = _mm_sub_epi32(x2, x0); + s3 = _mm_add_epi32(t, x3); + + u0 = _mm_add_epi32(s0, rnding); + u0 = _mm_srai_epi32(u0, bit); + + u1 = _mm_add_epi32(s1, rnding); + u1 = _mm_srai_epi32(u1, bit); + + u2 = _mm_add_epi32(s2, rnding); + u2 = _mm_srai_epi32(u2, bit); + + u3 = _mm_add_epi32(s3, rnding); + u3 = _mm_srai_epi32(u3, bit); - v2 = _mm_sub_epi32(v0, v1); - s3 = _mm_add_epi32(v2, rnding); - u3 = _mm_srai_epi32(s3, bit); - - // u0, u1, u2, u3 - u2 = _mm_sub_epi32(kZero, u2); - u1 = _mm_sub_epi32(kZero, u1); - - // u0, u2, u3, u1 - // Transpose 4x4 32-bit - v0 = _mm_unpacklo_epi32(u0, u2); - v1 = _mm_unpackhi_epi32(u0, u2); - v2 = _mm_unpacklo_epi32(u3, u1); - v3 = _mm_unpackhi_epi32(u3, u1); + v0 = _mm_unpacklo_epi32(u0, u1); + v1 = _mm_unpackhi_epi32(u0, u1); + v2 = _mm_unpacklo_epi32(u2, u3); + v3 = _mm_unpackhi_epi32(u2, u3); in[0] = _mm_unpacklo_epi64(v0, v2); in[1] = _mm_unpackhi_epi64(v0, v2); @@ -197,84 +183,65 @@ static void fadst4x4_sse4_1(__m128i *in, int bit) { void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff, int input_stride, TX_TYPE tx_type, int bd) { __m128i in[4]; - const TXFM_1D_CFG *row_cfg = NULL; - const TXFM_1D_CFG *col_cfg = NULL; + const int8_t *shift = fwd_txfm_shift_ls[TX_4X4]; + const int txw_idx = get_txw_idx(TX_4X4); + const int txh_idx = get_txh_idx(TX_4X4); switch (tx_type) { case DCT_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_4; - col_cfg = &fwd_txfm_1d_col_cfg_dct_4; - load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]); - fdct4x4_sse4_1(in, col_cfg->cos_bit[2]); - 
fdct4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fdct4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]); + fdct4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); write_buffer_4x4(in, coeff); break; case ADST_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_4; - col_cfg = &fwd_txfm_1d_col_cfg_adst_4; - load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]); - fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); - fdct4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]); + fdct4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); write_buffer_4x4(in, coeff); break; case DCT_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_4; - col_cfg = &fwd_txfm_1d_col_cfg_dct_4; - load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]); - fdct4x4_sse4_1(in, col_cfg->cos_bit[2]); - fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fdct4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); write_buffer_4x4(in, coeff); break; case ADST_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_4; - col_cfg = &fwd_txfm_1d_col_cfg_adst_4; - load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]); - fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); - fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); write_buffer_4x4(in, coeff); break; -#if CONFIG_EXT_TX case FLIPADST_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_4; - col_cfg = &fwd_txfm_1d_col_cfg_adst_4; - load_buffer_4x4(input, in, input_stride, 1, 0, row_cfg->shift[0]); - fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); - fdct4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]); + 
fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]); + fdct4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); write_buffer_4x4(in, coeff); break; case DCT_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_4; - col_cfg = &fwd_txfm_1d_col_cfg_dct_4; - load_buffer_4x4(input, in, input_stride, 0, 1, row_cfg->shift[0]); - fdct4x4_sse4_1(in, col_cfg->cos_bit[2]); - fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]); + fdct4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); write_buffer_4x4(in, coeff); break; case FLIPADST_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_4; - col_cfg = &fwd_txfm_1d_col_cfg_adst_4; - load_buffer_4x4(input, in, input_stride, 1, 1, row_cfg->shift[0]); - fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); - fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 1, 1, shift[0]); + fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); write_buffer_4x4(in, coeff); break; case ADST_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_4; - col_cfg = &fwd_txfm_1d_col_cfg_adst_4; - load_buffer_4x4(input, in, input_stride, 0, 1, row_cfg->shift[0]); - fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); - fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]); + fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); write_buffer_4x4(in, coeff); break; case FLIPADST_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_4; - col_cfg = &fwd_txfm_1d_col_cfg_adst_4; - load_buffer_4x4(input, in, input_stride, 1, 0, row_cfg->shift[0]); - fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); - fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]); + fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]); 
write_buffer_4x4(in, coeff); break; -#endif default: assert(0); } (void)bd; @@ -624,415 +591,274 @@ static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) { static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) { const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); const __m128i cospi60 = _mm_set1_epi32(cospi[60]); const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); const __m128i cospi44 = _mm_set1_epi32(cospi[44]); - const __m128i cospi36 = _mm_set1_epi32(cospi[36]); const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); const __m128i cospi12 = _mm_set1_epi32(cospi[12]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const __m128i kZero = _mm_setzero_si128(); - __m128i u[8], v[8], x; - - // Even 8 points: 0, 2, ..., 14 - // stage 0 - // stage 1 - // stage 2 - // (1) - u[0] = _mm_mullo_epi32(in[14], cospi4); - x = _mm_mullo_epi32(in[0], cospi60); - u[0] = _mm_add_epi32(u[0], x); - u[0] = _mm_add_epi32(u[0], rnding); - u[0] = _mm_srai_epi32(u[0], bit); - - u[1] = _mm_mullo_epi32(in[14], cospi60); - x = _mm_mullo_epi32(in[0], cospi4); - u[1] = _mm_sub_epi32(u[1], x); - u[1] = _mm_add_epi32(u[1], rnding); - u[1] = 
_mm_srai_epi32(u[1], bit); - - // (2) - u[2] = _mm_mullo_epi32(in[10], cospi20); - x = _mm_mullo_epi32(in[4], cospi44); - u[2] = _mm_add_epi32(u[2], x); - u[2] = _mm_add_epi32(u[2], rnding); - u[2] = _mm_srai_epi32(u[2], bit); - - u[3] = _mm_mullo_epi32(in[10], cospi44); - x = _mm_mullo_epi32(in[4], cospi20); - u[3] = _mm_sub_epi32(u[3], x); - u[3] = _mm_add_epi32(u[3], rnding); - u[3] = _mm_srai_epi32(u[3], bit); - - // (3) - u[4] = _mm_mullo_epi32(in[6], cospi36); - x = _mm_mullo_epi32(in[8], cospi28); - u[4] = _mm_add_epi32(u[4], x); - u[4] = _mm_add_epi32(u[4], rnding); - u[4] = _mm_srai_epi32(u[4], bit); - - u[5] = _mm_mullo_epi32(in[6], cospi28); - x = _mm_mullo_epi32(in[8], cospi36); - u[5] = _mm_sub_epi32(u[5], x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); - - // (4) - u[6] = _mm_mullo_epi32(in[2], cospi52); - x = _mm_mullo_epi32(in[12], cospi12); - u[6] = _mm_add_epi32(u[6], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_mullo_epi32(in[2], cospi12); - x = _mm_mullo_epi32(in[12], cospi52); - u[7] = _mm_sub_epi32(u[7], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - // stage 3 - v[0] = _mm_add_epi32(u[0], u[4]); - v[4] = _mm_sub_epi32(u[0], u[4]); - v[1] = _mm_add_epi32(u[1], u[5]); - v[5] = _mm_sub_epi32(u[1], u[5]); - v[2] = _mm_add_epi32(u[2], u[6]); - v[6] = _mm_sub_epi32(u[2], u[6]); - v[3] = _mm_add_epi32(u[3], u[7]); - v[7] = _mm_sub_epi32(u[3], u[7]); - - // stage 4 - u[0] = v[0]; - u[1] = v[1]; - u[2] = v[2]; - u[3] = v[3]; - - u[4] = _mm_mullo_epi32(v[4], cospi16); - x = _mm_mullo_epi32(v[5], cospi48); - u[4] = _mm_add_epi32(u[4], x); - u[4] = _mm_add_epi32(u[4], rnding); - u[4] = _mm_srai_epi32(u[4], bit); - - u[5] = _mm_mullo_epi32(v[4], cospi48); - x = _mm_mullo_epi32(v[5], cospi16); - u[5] = _mm_sub_epi32(u[5], x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); - - u[6] = _mm_mullo_epi32(v[6], cospim48); - x 
= _mm_mullo_epi32(v[7], cospi16); - u[6] = _mm_add_epi32(u[6], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_mullo_epi32(v[6], cospi16); - x = _mm_mullo_epi32(v[7], cospim48); - u[7] = _mm_sub_epi32(u[7], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - // stage 5 - v[0] = _mm_add_epi32(u[0], u[2]); - v[2] = _mm_sub_epi32(u[0], u[2]); - v[1] = _mm_add_epi32(u[1], u[3]); - v[3] = _mm_sub_epi32(u[1], u[3]); - v[4] = _mm_add_epi32(u[4], u[6]); - v[6] = _mm_sub_epi32(u[4], u[6]); - v[5] = _mm_add_epi32(u[5], u[7]); - v[7] = _mm_sub_epi32(u[5], u[7]); - - // stage 6 - u[0] = v[0]; - u[1] = v[1]; - u[4] = v[4]; - u[5] = v[5]; - - v[0] = _mm_mullo_epi32(v[2], cospi32); - x = _mm_mullo_epi32(v[3], cospi32); - u[2] = _mm_add_epi32(v[0], x); - u[2] = _mm_add_epi32(u[2], rnding); - u[2] = _mm_srai_epi32(u[2], bit); + const __m128i zero = _mm_setzero_si128(); + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x, y; + int col; - u[3] = _mm_sub_epi32(v[0], x); - u[3] = _mm_add_epi32(u[3], rnding); - u[3] = _mm_srai_epi32(u[3], bit); + // Note: + // Even column: 0, 2, ..., 14 + // Odd column: 1, 3, ..., 15 + // one even column plus one odd column constructs one row (8 coeffs) + // total we have 8 rows (8x8). 
+ for (col = 0; col < 2; ++col) { + // stage 0 + // stage 1 + u0 = in[2 * 0 + col]; + u1 = _mm_sub_epi32(zero, in[2 * 7 + col]); + u2 = _mm_sub_epi32(zero, in[2 * 3 + col]); + u3 = in[2 * 4 + col]; + u4 = _mm_sub_epi32(zero, in[2 * 1 + col]); + u5 = in[2 * 6 + col]; + u6 = in[2 * 2 + col]; + u7 = _mm_sub_epi32(zero, in[2 * 5 + col]); - v[0] = _mm_mullo_epi32(v[6], cospi32); - x = _mm_mullo_epi32(v[7], cospi32); - u[6] = _mm_add_epi32(v[0], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_sub_epi32(v[0], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - // stage 7 - out[0] = u[0]; - out[2] = _mm_sub_epi32(kZero, u[4]); - out[4] = u[6]; - out[6] = _mm_sub_epi32(kZero, u[2]); - out[8] = u[3]; - out[10] = _mm_sub_epi32(kZero, u[7]); - out[12] = u[5]; - out[14] = _mm_sub_epi32(kZero, u[1]); + // stage 2 + v0 = u0; + v1 = u1; - // Odd 8 points: 1, 3, ..., 15 - // stage 0 - // stage 1 - // stage 2 - // (1) - u[0] = _mm_mullo_epi32(in[15], cospi4); - x = _mm_mullo_epi32(in[1], cospi60); - u[0] = _mm_add_epi32(u[0], x); - u[0] = _mm_add_epi32(u[0], rnding); - u[0] = _mm_srai_epi32(u[0], bit); + x = _mm_mullo_epi32(u2, cospi32); + y = _mm_mullo_epi32(u3, cospi32); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); - u[1] = _mm_mullo_epi32(in[15], cospi60); - x = _mm_mullo_epi32(in[1], cospi4); - u[1] = _mm_sub_epi32(u[1], x); - u[1] = _mm_add_epi32(u[1], rnding); - u[1] = _mm_srai_epi32(u[1], bit); + v3 = _mm_sub_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); - // (2) - u[2] = _mm_mullo_epi32(in[11], cospi20); - x = _mm_mullo_epi32(in[5], cospi44); - u[2] = _mm_add_epi32(u[2], x); - u[2] = _mm_add_epi32(u[2], rnding); - u[2] = _mm_srai_epi32(u[2], bit); + v4 = u4; + v5 = u5; - u[3] = _mm_mullo_epi32(in[11], cospi44); - x = _mm_mullo_epi32(in[5], cospi20); - u[3] = _mm_sub_epi32(u[3], x); - u[3] = _mm_add_epi32(u[3], rnding); - 
u[3] = _mm_srai_epi32(u[3], bit); + x = _mm_mullo_epi32(u6, cospi32); + y = _mm_mullo_epi32(u7, cospi32); + v6 = _mm_add_epi32(x, y); + v6 = _mm_add_epi32(v6, rnding); + v6 = _mm_srai_epi32(v6, bit); - // (3) - u[4] = _mm_mullo_epi32(in[7], cospi36); - x = _mm_mullo_epi32(in[9], cospi28); - u[4] = _mm_add_epi32(u[4], x); - u[4] = _mm_add_epi32(u[4], rnding); - u[4] = _mm_srai_epi32(u[4], bit); - - u[5] = _mm_mullo_epi32(in[7], cospi28); - x = _mm_mullo_epi32(in[9], cospi36); - u[5] = _mm_sub_epi32(u[5], x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); - - // (4) - u[6] = _mm_mullo_epi32(in[3], cospi52); - x = _mm_mullo_epi32(in[13], cospi12); - u[6] = _mm_add_epi32(u[6], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_mullo_epi32(in[3], cospi12); - x = _mm_mullo_epi32(in[13], cospi52); - u[7] = _mm_sub_epi32(u[7], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); + v7 = _mm_sub_epi32(x, y); + v7 = _mm_add_epi32(v7, rnding); + v7 = _mm_srai_epi32(v7, bit); - // stage 3 - v[0] = _mm_add_epi32(u[0], u[4]); - v[4] = _mm_sub_epi32(u[0], u[4]); - v[1] = _mm_add_epi32(u[1], u[5]); - v[5] = _mm_sub_epi32(u[1], u[5]); - v[2] = _mm_add_epi32(u[2], u[6]); - v[6] = _mm_sub_epi32(u[2], u[6]); - v[3] = _mm_add_epi32(u[3], u[7]); - v[7] = _mm_sub_epi32(u[3], u[7]); + // stage 3 + u0 = _mm_add_epi32(v0, v2); + u1 = _mm_add_epi32(v1, v3); + u2 = _mm_sub_epi32(v0, v2); + u3 = _mm_sub_epi32(v1, v3); + u4 = _mm_add_epi32(v4, v6); + u5 = _mm_add_epi32(v5, v7); + u6 = _mm_sub_epi32(v4, v6); + u7 = _mm_sub_epi32(v5, v7); - // stage 4 - u[0] = v[0]; - u[1] = v[1]; - u[2] = v[2]; - u[3] = v[3]; - - u[4] = _mm_mullo_epi32(v[4], cospi16); - x = _mm_mullo_epi32(v[5], cospi48); - u[4] = _mm_add_epi32(u[4], x); - u[4] = _mm_add_epi32(u[4], rnding); - u[4] = _mm_srai_epi32(u[4], bit); - - u[5] = _mm_mullo_epi32(v[4], cospi48); - x = _mm_mullo_epi32(v[5], cospi16); - u[5] = _mm_sub_epi32(u[5], 
x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); - - u[6] = _mm_mullo_epi32(v[6], cospim48); - x = _mm_mullo_epi32(v[7], cospi16); - u[6] = _mm_add_epi32(u[6], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_mullo_epi32(v[6], cospi16); - x = _mm_mullo_epi32(v[7], cospim48); - u[7] = _mm_sub_epi32(u[7], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); + // stage 4 + v0 = u0; + v1 = u1; + v2 = u2; + v3 = u3; + + x = _mm_mullo_epi32(u4, cospi16); + y = _mm_mullo_epi32(u5, cospi48); + v4 = _mm_add_epi32(x, y); + v4 = _mm_add_epi32(v4, rnding); + v4 = _mm_srai_epi32(v4, bit); + + x = _mm_mullo_epi32(u4, cospi48); + y = _mm_mullo_epi32(u5, cospim16); + v5 = _mm_add_epi32(x, y); + v5 = _mm_add_epi32(v5, rnding); + v5 = _mm_srai_epi32(v5, bit); + + x = _mm_mullo_epi32(u6, cospim48); + y = _mm_mullo_epi32(u7, cospi16); + v6 = _mm_add_epi32(x, y); + v6 = _mm_add_epi32(v6, rnding); + v6 = _mm_srai_epi32(v6, bit); + + x = _mm_mullo_epi32(u6, cospi16); + y = _mm_mullo_epi32(u7, cospi48); + v7 = _mm_add_epi32(x, y); + v7 = _mm_add_epi32(v7, rnding); + v7 = _mm_srai_epi32(v7, bit); - // stage 5 - v[0] = _mm_add_epi32(u[0], u[2]); - v[2] = _mm_sub_epi32(u[0], u[2]); - v[1] = _mm_add_epi32(u[1], u[3]); - v[3] = _mm_sub_epi32(u[1], u[3]); - v[4] = _mm_add_epi32(u[4], u[6]); - v[6] = _mm_sub_epi32(u[4], u[6]); - v[5] = _mm_add_epi32(u[5], u[7]); - v[7] = _mm_sub_epi32(u[5], u[7]); - - // stage 6 - u[0] = v[0]; - u[1] = v[1]; - u[4] = v[4]; - u[5] = v[5]; - - v[0] = _mm_mullo_epi32(v[2], cospi32); - x = _mm_mullo_epi32(v[3], cospi32); - u[2] = _mm_add_epi32(v[0], x); - u[2] = _mm_add_epi32(u[2], rnding); - u[2] = _mm_srai_epi32(u[2], bit); + // stage 5 + u0 = _mm_add_epi32(v0, v4); + u1 = _mm_add_epi32(v1, v5); + u2 = _mm_add_epi32(v2, v6); + u3 = _mm_add_epi32(v3, v7); + u4 = _mm_sub_epi32(v0, v4); + u5 = _mm_sub_epi32(v1, v5); + u6 = _mm_sub_epi32(v2, v6); + u7 = _mm_sub_epi32(v3, 
v7); - u[3] = _mm_sub_epi32(v[0], x); - u[3] = _mm_add_epi32(u[3], rnding); - u[3] = _mm_srai_epi32(u[3], bit); + // stage 6 + x = _mm_mullo_epi32(u0, cospi4); + y = _mm_mullo_epi32(u1, cospi60); + v0 = _mm_add_epi32(x, y); + v0 = _mm_add_epi32(v0, rnding); + v0 = _mm_srai_epi32(v0, bit); + + x = _mm_mullo_epi32(u0, cospi60); + y = _mm_mullo_epi32(u1, cospim4); + v1 = _mm_add_epi32(x, y); + v1 = _mm_add_epi32(v1, rnding); + v1 = _mm_srai_epi32(v1, bit); + + x = _mm_mullo_epi32(u2, cospi20); + y = _mm_mullo_epi32(u3, cospi44); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + x = _mm_mullo_epi32(u2, cospi44); + y = _mm_mullo_epi32(u3, cospim20); + v3 = _mm_add_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + x = _mm_mullo_epi32(u4, cospi36); + y = _mm_mullo_epi32(u5, cospi28); + v4 = _mm_add_epi32(x, y); + v4 = _mm_add_epi32(v4, rnding); + v4 = _mm_srai_epi32(v4, bit); + + x = _mm_mullo_epi32(u4, cospi28); + y = _mm_mullo_epi32(u5, cospim36); + v5 = _mm_add_epi32(x, y); + v5 = _mm_add_epi32(v5, rnding); + v5 = _mm_srai_epi32(v5, bit); + + x = _mm_mullo_epi32(u6, cospi52); + y = _mm_mullo_epi32(u7, cospi12); + v6 = _mm_add_epi32(x, y); + v6 = _mm_add_epi32(v6, rnding); + v6 = _mm_srai_epi32(v6, bit); + + x = _mm_mullo_epi32(u6, cospi12); + y = _mm_mullo_epi32(u7, cospim52); + v7 = _mm_add_epi32(x, y); + v7 = _mm_add_epi32(v7, rnding); + v7 = _mm_srai_epi32(v7, bit); - v[0] = _mm_mullo_epi32(v[6], cospi32); - x = _mm_mullo_epi32(v[7], cospi32); - u[6] = _mm_add_epi32(v[0], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_sub_epi32(v[0], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - // stage 7 - out[1] = u[0]; - out[3] = _mm_sub_epi32(kZero, u[4]); - out[5] = u[6]; - out[7] = _mm_sub_epi32(kZero, u[2]); - out[9] = u[3]; - out[11] = _mm_sub_epi32(kZero, u[7]); - out[13] = u[5]; - out[15] = _mm_sub_epi32(kZero, 
u[1]); + // stage 7 + out[2 * 0 + col] = v1; + out[2 * 1 + col] = v6; + out[2 * 2 + col] = v3; + out[2 * 3 + col] = v4; + out[2 * 4 + col] = v5; + out[2 * 5 + col] = v2; + out[2 * 6 + col] = v7; + out[2 * 7 + col] = v0; + } } void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { __m128i in[16], out[16]; - const TXFM_1D_CFG *row_cfg = NULL; - const TXFM_1D_CFG *col_cfg = NULL; + const int8_t *shift = fwd_txfm_shift_ls[TX_8X8]; + const int txw_idx = get_txw_idx(TX_8X8); + const int txh_idx = get_txh_idx(TX_8X8); switch (tx_type) { case DCT_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_8; - col_cfg = &fwd_txfm_1d_col_cfg_dct_8; - load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]); - fdct8x8_sse4_1(in, out, col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case ADST_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_8; - col_cfg = &fwd_txfm_1d_col_cfg_adst_8; - load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]); - fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case DCT_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_8; - col_cfg = &fwd_txfm_1d_col_cfg_dct_8; - load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]); - fdct8x8_sse4_1(in, out, 
col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case ADST_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_8; - col_cfg = &fwd_txfm_1d_col_cfg_adst_8; - load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]); - fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; -#if CONFIG_EXT_TX case FLIPADST_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_8; - col_cfg = &fwd_txfm_1d_col_cfg_adst_8; - load_buffer_8x8(input, in, stride, 1, 0, row_cfg->shift[0]); - fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 1, 0, shift[0]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case DCT_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_8; - col_cfg = &fwd_txfm_1d_col_cfg_dct_8; - load_buffer_8x8(input, in, stride, 0, 1, row_cfg->shift[0]); - fdct8x8_sse4_1(in, out, col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 0, 1, shift[0]); + 
fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case FLIPADST_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_8; - col_cfg = &fwd_txfm_1d_col_cfg_adst_8; - load_buffer_8x8(input, in, stride, 1, 1, row_cfg->shift[0]); - fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 1, 1, shift[0]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case ADST_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_8; - col_cfg = &fwd_txfm_1d_col_cfg_adst_8; - load_buffer_8x8(input, in, stride, 0, 1, row_cfg->shift[0]); - fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 0, 1, shift[0]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case FLIPADST_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_8; - col_cfg = &fwd_txfm_1d_col_cfg_adst_8; - load_buffer_8x8(input, in, stride, 1, 0, row_cfg->shift[0]); - fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); - col_txfm_8x8_rounding(out, -row_cfg->shift[1]); + load_buffer_8x8(input, in, stride, 1, 0, shift[0]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - 
fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; -#endif // CONFIG_EXT_TX default: assert(0); } (void)bd; @@ -1402,230 +1228,174 @@ static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); const __m128i cospi2 = _mm_set1_epi32(cospi[2]); const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospim2 = _mm_set1_epi32(-cospi[2]); const __m128i cospi10 = _mm_set1_epi32(cospi[10]); const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospim10 = _mm_set1_epi32(-cospi[10]); const __m128i cospi18 = _mm_set1_epi32(cospi[18]); const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospim18 = _mm_set1_epi32(-cospi[18]); const __m128i cospi26 = _mm_set1_epi32(cospi[26]); const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospim26 = _mm_set1_epi32(-cospi[26]); const __m128i cospi34 = _mm_set1_epi32(cospi[34]); const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); const __m128i cospi42 = _mm_set1_epi32(cospi[42]); const __m128i cospi22 = 
_mm_set1_epi32(cospi[22]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); const __m128i cospi50 = _mm_set1_epi32(cospi[50]); const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); const __m128i cospi58 = _mm_set1_epi32(cospi[58]); const __m128i cospi6 = _mm_set1_epi32(cospi[6]); - const __m128i cospi8 = _mm_set1_epi32(cospi[8]); - const __m128i cospi56 = _mm_set1_epi32(cospi[56]); - const __m128i cospi40 = _mm_set1_epi32(cospi[40]); - const __m128i cospi24 = _mm_set1_epi32(cospi[24]); - const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); - const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i zero = _mm_setzero_si128(); + __m128i u[16], v[16], x, y; - const int col_num = 4; int col; - // Calculate the column 0, 1, 2, 3 - for (col = 0; col < col_num; ++col) { + for (col = 0; col < 4; ++col) { // stage 0 // stage 1 - // stage 2 - v[0] = _mm_mullo_epi32(in[15 * col_num + col], cospi2); - x = _mm_mullo_epi32(in[0 * col_num + col], cospi62); - v[0] = _mm_add_epi32(v[0], x); - v[0] = _mm_add_epi32(v[0], rnding); - v[0] = _mm_srai_epi32(v[0], bit); + u[0] = in[0 * 4 + col]; + u[1] = _mm_sub_epi32(zero, in[15 * 4 + col]); + u[2] = _mm_sub_epi32(zero, in[7 * 4 + col]); + u[3] = in[8 * 4 + col]; + u[4] = _mm_sub_epi32(zero, in[3 * 4 + col]); + u[5] = in[12 * 4 + col]; + u[6] = in[4 * 4 + col]; + u[7] = _mm_sub_epi32(zero, in[11 * 4 + col]); + u[8] = _mm_sub_epi32(zero, in[1 * 4 + col]); + u[9] = in[14 * 4 + col]; + u[10] = in[6 * 4 + col]; + u[11] = _mm_sub_epi32(zero, in[9 * 4 + col]); + u[12] = in[2 * 4 + col]; + u[13] = _mm_sub_epi32(zero, in[13 * 4 + col]); + u[14] = 
_mm_sub_epi32(zero, in[5 * 4 + col]); + u[15] = in[10 * 4 + col]; - v[1] = _mm_mullo_epi32(in[15 * col_num + col], cospi62); - x = _mm_mullo_epi32(in[0 * col_num + col], cospi2); - v[1] = _mm_sub_epi32(v[1], x); - v[1] = _mm_add_epi32(v[1], rnding); - v[1] = _mm_srai_epi32(v[1], bit); + // stage 2 + v[0] = u[0]; + v[1] = u[1]; - v[2] = _mm_mullo_epi32(in[13 * col_num + col], cospi10); - x = _mm_mullo_epi32(in[2 * col_num + col], cospi54); - v[2] = _mm_add_epi32(v[2], x); + x = _mm_mullo_epi32(u[2], cospi32); + y = _mm_mullo_epi32(u[3], cospi32); + v[2] = _mm_add_epi32(x, y); v[2] = _mm_add_epi32(v[2], rnding); v[2] = _mm_srai_epi32(v[2], bit); - v[3] = _mm_mullo_epi32(in[13 * col_num + col], cospi54); - x = _mm_mullo_epi32(in[2 * col_num + col], cospi10); - v[3] = _mm_sub_epi32(v[3], x); + v[3] = _mm_sub_epi32(x, y); v[3] = _mm_add_epi32(v[3], rnding); v[3] = _mm_srai_epi32(v[3], bit); - v[4] = _mm_mullo_epi32(in[11 * col_num + col], cospi18); - x = _mm_mullo_epi32(in[4 * col_num + col], cospi46); - v[4] = _mm_add_epi32(v[4], x); - v[4] = _mm_add_epi32(v[4], rnding); - v[4] = _mm_srai_epi32(v[4], bit); - - v[5] = _mm_mullo_epi32(in[11 * col_num + col], cospi46); - x = _mm_mullo_epi32(in[4 * col_num + col], cospi18); - v[5] = _mm_sub_epi32(v[5], x); - v[5] = _mm_add_epi32(v[5], rnding); - v[5] = _mm_srai_epi32(v[5], bit); - - v[6] = _mm_mullo_epi32(in[9 * col_num + col], cospi26); - x = _mm_mullo_epi32(in[6 * col_num + col], cospi38); - v[6] = _mm_add_epi32(v[6], x); + v[4] = u[4]; + v[5] = u[5]; + + x = _mm_mullo_epi32(u[6], cospi32); + y = _mm_mullo_epi32(u[7], cospi32); + v[6] = _mm_add_epi32(x, y); v[6] = _mm_add_epi32(v[6], rnding); v[6] = _mm_srai_epi32(v[6], bit); - v[7] = _mm_mullo_epi32(in[9 * col_num + col], cospi38); - x = _mm_mullo_epi32(in[6 * col_num + col], cospi26); - v[7] = _mm_sub_epi32(v[7], x); + v[7] = _mm_sub_epi32(x, y); v[7] = _mm_add_epi32(v[7], rnding); v[7] = _mm_srai_epi32(v[7], bit); - v[8] = _mm_mullo_epi32(in[7 * col_num + col], 
cospi34); - x = _mm_mullo_epi32(in[8 * col_num + col], cospi30); - v[8] = _mm_add_epi32(v[8], x); - v[8] = _mm_add_epi32(v[8], rnding); - v[8] = _mm_srai_epi32(v[8], bit); - - v[9] = _mm_mullo_epi32(in[7 * col_num + col], cospi30); - x = _mm_mullo_epi32(in[8 * col_num + col], cospi34); - v[9] = _mm_sub_epi32(v[9], x); - v[9] = _mm_add_epi32(v[9], rnding); - v[9] = _mm_srai_epi32(v[9], bit); + v[8] = u[8]; + v[9] = u[9]; - v[10] = _mm_mullo_epi32(in[5 * col_num + col], cospi42); - x = _mm_mullo_epi32(in[10 * col_num + col], cospi22); - v[10] = _mm_add_epi32(v[10], x); + x = _mm_mullo_epi32(u[10], cospi32); + y = _mm_mullo_epi32(u[11], cospi32); + v[10] = _mm_add_epi32(x, y); v[10] = _mm_add_epi32(v[10], rnding); v[10] = _mm_srai_epi32(v[10], bit); - v[11] = _mm_mullo_epi32(in[5 * col_num + col], cospi22); - x = _mm_mullo_epi32(in[10 * col_num + col], cospi42); - v[11] = _mm_sub_epi32(v[11], x); + v[11] = _mm_sub_epi32(x, y); v[11] = _mm_add_epi32(v[11], rnding); v[11] = _mm_srai_epi32(v[11], bit); - v[12] = _mm_mullo_epi32(in[3 * col_num + col], cospi50); - x = _mm_mullo_epi32(in[12 * col_num + col], cospi14); - v[12] = _mm_add_epi32(v[12], x); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); - - v[13] = _mm_mullo_epi32(in[3 * col_num + col], cospi14); - x = _mm_mullo_epi32(in[12 * col_num + col], cospi50); - v[13] = _mm_sub_epi32(v[13], x); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); + v[12] = u[12]; + v[13] = u[13]; - v[14] = _mm_mullo_epi32(in[1 * col_num + col], cospi58); - x = _mm_mullo_epi32(in[14 * col_num + col], cospi6); - v[14] = _mm_add_epi32(v[14], x); + x = _mm_mullo_epi32(u[14], cospi32); + y = _mm_mullo_epi32(u[15], cospi32); + v[14] = _mm_add_epi32(x, y); v[14] = _mm_add_epi32(v[14], rnding); v[14] = _mm_srai_epi32(v[14], bit); - v[15] = _mm_mullo_epi32(in[1 * col_num + col], cospi6); - x = _mm_mullo_epi32(in[14 * col_num + col], cospi58); - v[15] = _mm_sub_epi32(v[15], x); + v[15] = 
_mm_sub_epi32(x, y); v[15] = _mm_add_epi32(v[15], rnding); v[15] = _mm_srai_epi32(v[15], bit); // stage 3 - u[0] = _mm_add_epi32(v[0], v[8]); - u[8] = _mm_sub_epi32(v[0], v[8]); - u[1] = _mm_add_epi32(v[1], v[9]); - u[9] = _mm_sub_epi32(v[1], v[9]); - u[2] = _mm_add_epi32(v[2], v[10]); - u[10] = _mm_sub_epi32(v[2], v[10]); - u[3] = _mm_add_epi32(v[3], v[11]); - u[11] = _mm_sub_epi32(v[3], v[11]); - u[4] = _mm_add_epi32(v[4], v[12]); - u[12] = _mm_sub_epi32(v[4], v[12]); - u[5] = _mm_add_epi32(v[5], v[13]); - u[13] = _mm_sub_epi32(v[5], v[13]); - u[6] = _mm_add_epi32(v[6], v[14]); - u[14] = _mm_sub_epi32(v[6], v[14]); - u[7] = _mm_add_epi32(v[7], v[15]); - u[15] = _mm_sub_epi32(v[7], v[15]); + u[0] = _mm_add_epi32(v[0], v[2]); + u[1] = _mm_add_epi32(v[1], v[3]); + u[2] = _mm_sub_epi32(v[0], v[2]); + u[3] = _mm_sub_epi32(v[1], v[3]); + u[4] = _mm_add_epi32(v[4], v[6]); + u[5] = _mm_add_epi32(v[5], v[7]); + u[6] = _mm_sub_epi32(v[4], v[6]); + u[7] = _mm_sub_epi32(v[5], v[7]); + u[8] = _mm_add_epi32(v[8], v[10]); + u[9] = _mm_add_epi32(v[9], v[11]); + u[10] = _mm_sub_epi32(v[8], v[10]); + u[11] = _mm_sub_epi32(v[9], v[11]); + u[12] = _mm_add_epi32(v[12], v[14]); + u[13] = _mm_add_epi32(v[13], v[15]); + u[14] = _mm_sub_epi32(v[12], v[14]); + u[15] = _mm_sub_epi32(v[13], v[15]); // stage 4 v[0] = u[0]; v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; - v[4] = u[4]; - v[5] = u[5]; - v[6] = u[6]; - v[7] = u[7]; - - v[8] = _mm_mullo_epi32(u[8], cospi8); - x = _mm_mullo_epi32(u[9], cospi56); - v[8] = _mm_add_epi32(v[8], x); - v[8] = _mm_add_epi32(v[8], rnding); - v[8] = _mm_srai_epi32(v[8], bit); - - v[9] = _mm_mullo_epi32(u[8], cospi56); - x = _mm_mullo_epi32(u[9], cospi8); - v[9] = _mm_sub_epi32(v[9], x); - v[9] = _mm_add_epi32(v[9], rnding); - v[9] = _mm_srai_epi32(v[9], bit); - - v[10] = _mm_mullo_epi32(u[10], cospi40); - x = _mm_mullo_epi32(u[11], cospi24); - v[10] = _mm_add_epi32(v[10], x); - v[10] = _mm_add_epi32(v[10], rnding); - v[10] = _mm_srai_epi32(v[10], bit); - - v[11] = 
_mm_mullo_epi32(u[10], cospi24); - x = _mm_mullo_epi32(u[11], cospi40); - v[11] = _mm_sub_epi32(v[11], x); - v[11] = _mm_add_epi32(v[11], rnding); - v[11] = _mm_srai_epi32(v[11], bit); - - v[12] = _mm_mullo_epi32(u[12], cospim56); - x = _mm_mullo_epi32(u[13], cospi8); - v[12] = _mm_add_epi32(v[12], x); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); - - v[13] = _mm_mullo_epi32(u[12], cospi8); - x = _mm_mullo_epi32(u[13], cospim56); - v[13] = _mm_sub_epi32(v[13], x); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); - - v[14] = _mm_mullo_epi32(u[14], cospim24); - x = _mm_mullo_epi32(u[15], cospi40); - v[14] = _mm_add_epi32(v[14], x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); - - v[15] = _mm_mullo_epi32(u[14], cospi40); - x = _mm_mullo_epi32(u[15], cospim24); - v[15] = _mm_sub_epi32(v[15], x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); + v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit); + v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit); + v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit); + v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit); + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit); // stage 5 u[0] = _mm_add_epi32(v[0], v[4]); - u[4] = _mm_sub_epi32(v[0], v[4]); u[1] = _mm_add_epi32(v[1], v[5]); - u[5] = _mm_sub_epi32(v[1], v[5]); u[2] = _mm_add_epi32(v[2], v[6]); - u[6] = _mm_sub_epi32(v[2], v[6]); u[3] = _mm_add_epi32(v[3], v[7]); + u[4] = _mm_sub_epi32(v[0], v[4]); + u[5] = _mm_sub_epi32(v[1], v[5]); + u[6] = 
_mm_sub_epi32(v[2], v[6]); u[7] = _mm_sub_epi32(v[3], v[7]); u[8] = _mm_add_epi32(v[8], v[12]); - u[12] = _mm_sub_epi32(v[8], v[12]); u[9] = _mm_add_epi32(v[9], v[13]); - u[13] = _mm_sub_epi32(v[9], v[13]); u[10] = _mm_add_epi32(v[10], v[14]); - u[14] = _mm_sub_epi32(v[10], v[14]); u[11] = _mm_add_epi32(v[11], v[15]); + u[12] = _mm_sub_epi32(v[8], v[12]); + u[13] = _mm_sub_epi32(v[9], v[13]); + u[14] = _mm_sub_epi32(v[10], v[14]); u[15] = _mm_sub_epi32(v[11], v[15]); // stage 6 @@ -1633,148 +1403,72 @@ static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; - - v[4] = _mm_mullo_epi32(u[4], cospi16); - x = _mm_mullo_epi32(u[5], cospi48); - v[4] = _mm_add_epi32(v[4], x); - v[4] = _mm_add_epi32(v[4], rnding); - v[4] = _mm_srai_epi32(v[4], bit); - - v[5] = _mm_mullo_epi32(u[4], cospi48); - x = _mm_mullo_epi32(u[5], cospi16); - v[5] = _mm_sub_epi32(v[5], x); - v[5] = _mm_add_epi32(v[5], rnding); - v[5] = _mm_srai_epi32(v[5], bit); - - v[6] = _mm_mullo_epi32(u[6], cospim48); - x = _mm_mullo_epi32(u[7], cospi16); - v[6] = _mm_add_epi32(v[6], x); - v[6] = _mm_add_epi32(v[6], rnding); - v[6] = _mm_srai_epi32(v[6], bit); - - v[7] = _mm_mullo_epi32(u[6], cospi16); - x = _mm_mullo_epi32(u[7], cospim48); - v[7] = _mm_sub_epi32(v[7], x); - v[7] = _mm_add_epi32(v[7], rnding); - v[7] = _mm_srai_epi32(v[7], bit); - - v[8] = u[8]; - v[9] = u[9]; - v[10] = u[10]; - v[11] = u[11]; - - v[12] = _mm_mullo_epi32(u[12], cospi16); - x = _mm_mullo_epi32(u[13], cospi48); - v[12] = _mm_add_epi32(v[12], x); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); - - v[13] = _mm_mullo_epi32(u[12], cospi48); - x = _mm_mullo_epi32(u[13], cospi16); - v[13] = _mm_sub_epi32(v[13], x); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); - - v[14] = _mm_mullo_epi32(u[14], cospim48); - x = _mm_mullo_epi32(u[15], cospi16); - v[14] = _mm_add_epi32(v[14], x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = 
_mm_srai_epi32(v[14], bit); - - v[15] = _mm_mullo_epi32(u[14], cospi16); - x = _mm_mullo_epi32(u[15], cospim48); - v[15] = _mm_sub_epi32(v[15], x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); - - // stage 7 - u[0] = _mm_add_epi32(v[0], v[2]); - u[2] = _mm_sub_epi32(v[0], v[2]); - u[1] = _mm_add_epi32(v[1], v[3]); - u[3] = _mm_sub_epi32(v[1], v[3]); - u[4] = _mm_add_epi32(v[4], v[6]); - u[6] = _mm_sub_epi32(v[4], v[6]); - u[5] = _mm_add_epi32(v[5], v[7]); - u[7] = _mm_sub_epi32(v[5], v[7]); - u[8] = _mm_add_epi32(v[8], v[10]); - u[10] = _mm_sub_epi32(v[8], v[10]); - u[9] = _mm_add_epi32(v[9], v[11]); - u[11] = _mm_sub_epi32(v[9], v[11]); - u[12] = _mm_add_epi32(v[12], v[14]); - u[14] = _mm_sub_epi32(v[12], v[14]); - u[13] = _mm_add_epi32(v[13], v[15]); - u[15] = _mm_sub_epi32(v[13], v[15]); - - // stage 8 - v[0] = u[0]; - v[1] = u[1]; - - y = _mm_mullo_epi32(u[2], cospi32); - x = _mm_mullo_epi32(u[3], cospi32); - v[2] = _mm_add_epi32(y, x); - v[2] = _mm_add_epi32(v[2], rnding); - v[2] = _mm_srai_epi32(v[2], bit); - - v[3] = _mm_sub_epi32(y, x); - v[3] = _mm_add_epi32(v[3], rnding); - v[3] = _mm_srai_epi32(v[3], bit); - v[4] = u[4]; v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit); + v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit); - y = _mm_mullo_epi32(u[6], cospi32); - x = _mm_mullo_epi32(u[7], cospi32); - v[6] = _mm_add_epi32(y, x); - v[6] = _mm_add_epi32(v[6], rnding); - v[6] = 
_mm_srai_epi32(v[6], bit); - - v[7] = _mm_sub_epi32(y, x); - v[7] = _mm_add_epi32(v[7], rnding); - v[7] = _mm_srai_epi32(v[7], bit); - - v[8] = u[8]; - v[9] = u[9]; - - y = _mm_mullo_epi32(u[10], cospi32); - x = _mm_mullo_epi32(u[11], cospi32); - v[10] = _mm_add_epi32(y, x); - v[10] = _mm_add_epi32(v[10], rnding); - v[10] = _mm_srai_epi32(v[10], bit); - - v[11] = _mm_sub_epi32(y, x); - v[11] = _mm_add_epi32(v[11], rnding); - v[11] = _mm_srai_epi32(v[11], bit); - - v[12] = u[12]; - v[13] = u[13]; - - y = _mm_mullo_epi32(u[14], cospi32); - x = _mm_mullo_epi32(u[15], cospi32); - v[14] = _mm_add_epi32(y, x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); + // stage 7 + u[0] = _mm_add_epi32(v[0], v[8]); + u[1] = _mm_add_epi32(v[1], v[9]); + u[2] = _mm_add_epi32(v[2], v[10]); + u[3] = _mm_add_epi32(v[3], v[11]); + u[4] = _mm_add_epi32(v[4], v[12]); + u[5] = _mm_add_epi32(v[5], v[13]); + u[6] = _mm_add_epi32(v[6], v[14]); + u[7] = _mm_add_epi32(v[7], v[15]); + u[8] = _mm_sub_epi32(v[0], v[8]); + u[9] = _mm_sub_epi32(v[1], v[9]); + u[10] = _mm_sub_epi32(v[2], v[10]); + u[11] = _mm_sub_epi32(v[3], v[11]); + u[12] = _mm_sub_epi32(v[4], v[12]); + u[13] = _mm_sub_epi32(v[5], v[13]); + u[14] = _mm_sub_epi32(v[6], v[14]); + u[15] = _mm_sub_epi32(v[7], v[15]); - v[15] = _mm_sub_epi32(y, x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); + // stage 8 + v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit); + v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit); + v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit); + v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit); + v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit); + v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit); + v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit); + v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], 
&rnding, bit); + v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit); + v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit); // stage 9 - out[0 * col_num + col] = v[0]; - out[1 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[8]); - out[2 * col_num + col] = v[12]; - out[3 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[4]); - out[4 * col_num + col] = v[6]; - out[5 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[14]); - out[6 * col_num + col] = v[10]; - out[7 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[2]); - out[8 * col_num + col] = v[3]; - out[9 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[11]); - out[10 * col_num + col] = v[15]; - out[11 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[7]); - out[12 * col_num + col] = v[5]; - out[13 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[13]); - out[14 * col_num + col] = v[9]; - out[15 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[1]); + out[0 * 4 + col] = v[1]; + out[1 * 4 + col] = v[14]; + out[2 * 4 + col] = v[3]; + out[3 * 4 + col] = v[12]; + out[4 * 4 + col] = v[5]; + out[5 * 4 + col] = v[10]; + out[6 * 4 + col] = v[7]; + out[7 * 4 + col] = v[8]; + out[8 * 4 + col] = v[9]; + out[9 * 4 + col] = v[6]; + out[10 * 4 + col] = v[11]; + out[11 * 4 + col] = v[4]; + out[12 * 4 + col] = v[13]; + out[13 * 4 + col] = v[2]; + out[14 * 4 + col] = v[15]; + out[15 * 4 + col] = v[0]; } } @@ -1802,111 +1496,91 @@ static void write_buffer_16x16(const __m128i *in, int32_t *output) { void 
av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { __m128i in[64], out[64]; - const TXFM_1D_CFG *row_cfg = NULL; - const TXFM_1D_CFG *col_cfg = NULL; - + const int8_t *shift = fwd_txfm_shift_ls[TX_16X16]; + const int txw_idx = get_txw_idx(TX_16X16); + const int txh_idx = get_txh_idx(TX_16X16); switch (tx_type) { case DCT_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_16; - col_cfg = &fwd_txfm_1d_col_cfg_dct_16; - load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]); - fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case ADST_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_16; - col_cfg = &fwd_txfm_1d_col_cfg_adst_16; - load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]); - fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case DCT_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_16; - col_cfg = &fwd_txfm_1d_col_cfg_dct_16; - load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]); - fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fdct16x16_sse4_1(in, out, 
fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case ADST_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_16; - col_cfg = &fwd_txfm_1d_col_cfg_adst_16; - load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]); - fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; -#if CONFIG_EXT_TX case FLIPADST_DCT: - row_cfg = &fwd_txfm_1d_row_cfg_dct_16; - col_cfg = &fwd_txfm_1d_col_cfg_adst_16; - load_buffer_16x16(input, in, stride, 1, 0, row_cfg->shift[0]); - fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 1, 0, shift[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case DCT_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_16; - col_cfg = &fwd_txfm_1d_col_cfg_dct_16; - load_buffer_16x16(input, in, stride, 0, 1, row_cfg->shift[0]); - fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 0, 1, shift[0]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, 
-shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case FLIPADST_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_16; - col_cfg = &fwd_txfm_1d_col_cfg_adst_16; - load_buffer_16x16(input, in, stride, 1, 1, row_cfg->shift[0]); - fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 1, 1, shift[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case ADST_FLIPADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_16; - col_cfg = &fwd_txfm_1d_col_cfg_adst_16; - load_buffer_16x16(input, in, stride, 0, 1, row_cfg->shift[0]); - fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 0, 1, shift[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case FLIPADST_ADST: - row_cfg = &fwd_txfm_1d_row_cfg_adst_16; - col_cfg = &fwd_txfm_1d_col_cfg_adst_16; - load_buffer_16x16(input, in, stride, 1, 0, row_cfg->shift[0]); - fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); - col_txfm_16x16_rounding(out, -row_cfg->shift[1]); + load_buffer_16x16(input, in, stride, 1, 0, shift[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, 
row_cfg->cos_bit[0]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; -#endif // CONFIG_EXT_TX default: assert(0); } (void)bd; diff --git a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c deleted file mode 100644 index 88621c82b..000000000 --- a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c +++ /dev/null @@ -1,1627 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include // avx2 - -#include "./av1_rtcd.h" -#include "./aom_dsp_rtcd.h" - -#include "aom_dsp/x86/fwd_txfm_avx2.h" -#include "aom_dsp/txfm_common.h" -#include "aom_dsp/x86/txfm_common_avx2.h" - -static INLINE void load_buffer_16x16(const int16_t *input, int stride, - int flipud, int fliplr, __m256i *in) { - if (!flipud) { - in[0] = _mm256_loadu_si256((const __m256i *)(input + 0 * stride)); - in[1] = _mm256_loadu_si256((const __m256i *)(input + 1 * stride)); - in[2] = _mm256_loadu_si256((const __m256i *)(input + 2 * stride)); - in[3] = _mm256_loadu_si256((const __m256i *)(input + 3 * stride)); - in[4] = _mm256_loadu_si256((const __m256i *)(input + 4 * stride)); - in[5] = _mm256_loadu_si256((const __m256i *)(input + 5 * stride)); - in[6] = _mm256_loadu_si256((const __m256i *)(input + 6 * stride)); - in[7] = _mm256_loadu_si256((const __m256i *)(input + 7 * stride)); - in[8] = _mm256_loadu_si256((const __m256i *)(input + 8 * stride)); - in[9] = _mm256_loadu_si256((const __m256i *)(input + 9 * stride)); - in[10] = _mm256_loadu_si256((const __m256i *)(input + 10 * stride)); - in[11] = _mm256_loadu_si256((const __m256i *)(input + 11 * stride)); - in[12] = _mm256_loadu_si256((const __m256i *)(input + 12 * stride)); - in[13] = _mm256_loadu_si256((const __m256i *)(input + 13 * stride)); - in[14] = _mm256_loadu_si256((const __m256i *)(input + 14 * stride)); - in[15] = _mm256_loadu_si256((const __m256i *)(input + 15 * stride)); - } else { - in[0] = _mm256_loadu_si256((const __m256i *)(input + 15 * stride)); - in[1] = _mm256_loadu_si256((const __m256i *)(input + 14 * stride)); - in[2] = _mm256_loadu_si256((const __m256i *)(input + 13 * stride)); - in[3] = _mm256_loadu_si256((const __m256i *)(input + 12 * stride)); - in[4] = _mm256_loadu_si256((const __m256i *)(input + 11 * stride)); - in[5] = _mm256_loadu_si256((const __m256i *)(input + 10 * stride)); - in[6] = _mm256_loadu_si256((const __m256i *)(input + 9 * stride)); - in[7] = _mm256_loadu_si256((const __m256i 
*)(input + 8 * stride)); - in[8] = _mm256_loadu_si256((const __m256i *)(input + 7 * stride)); - in[9] = _mm256_loadu_si256((const __m256i *)(input + 6 * stride)); - in[10] = _mm256_loadu_si256((const __m256i *)(input + 5 * stride)); - in[11] = _mm256_loadu_si256((const __m256i *)(input + 4 * stride)); - in[12] = _mm256_loadu_si256((const __m256i *)(input + 3 * stride)); - in[13] = _mm256_loadu_si256((const __m256i *)(input + 2 * stride)); - in[14] = _mm256_loadu_si256((const __m256i *)(input + 1 * stride)); - in[15] = _mm256_loadu_si256((const __m256i *)(input + 0 * stride)); - } - - if (fliplr) { - mm256_reverse_epi16(&in[0]); - mm256_reverse_epi16(&in[1]); - mm256_reverse_epi16(&in[2]); - mm256_reverse_epi16(&in[3]); - mm256_reverse_epi16(&in[4]); - mm256_reverse_epi16(&in[5]); - mm256_reverse_epi16(&in[6]); - mm256_reverse_epi16(&in[7]); - mm256_reverse_epi16(&in[8]); - mm256_reverse_epi16(&in[9]); - mm256_reverse_epi16(&in[10]); - mm256_reverse_epi16(&in[11]); - mm256_reverse_epi16(&in[12]); - mm256_reverse_epi16(&in[13]); - mm256_reverse_epi16(&in[14]); - mm256_reverse_epi16(&in[15]); - } - - in[0] = _mm256_slli_epi16(in[0], 2); - in[1] = _mm256_slli_epi16(in[1], 2); - in[2] = _mm256_slli_epi16(in[2], 2); - in[3] = _mm256_slli_epi16(in[3], 2); - in[4] = _mm256_slli_epi16(in[4], 2); - in[5] = _mm256_slli_epi16(in[5], 2); - in[6] = _mm256_slli_epi16(in[6], 2); - in[7] = _mm256_slli_epi16(in[7], 2); - in[8] = _mm256_slli_epi16(in[8], 2); - in[9] = _mm256_slli_epi16(in[9], 2); - in[10] = _mm256_slli_epi16(in[10], 2); - in[11] = _mm256_slli_epi16(in[11], 2); - in[12] = _mm256_slli_epi16(in[12], 2); - in[13] = _mm256_slli_epi16(in[13], 2); - in[14] = _mm256_slli_epi16(in[14], 2); - in[15] = _mm256_slli_epi16(in[15], 2); -} - -static INLINE void write_buffer_16x16(const __m256i *in, tran_low_t *output) { - int i; - for (i = 0; i < 16; ++i) { - storeu_output_avx2(&in[i], output + (i << 4)); - } -} - -static void right_shift_16x16(__m256i *in) { - const __m256i one = 
_mm256_set1_epi16(1); - __m256i s0 = _mm256_srai_epi16(in[0], 15); - __m256i s1 = _mm256_srai_epi16(in[1], 15); - __m256i s2 = _mm256_srai_epi16(in[2], 15); - __m256i s3 = _mm256_srai_epi16(in[3], 15); - __m256i s4 = _mm256_srai_epi16(in[4], 15); - __m256i s5 = _mm256_srai_epi16(in[5], 15); - __m256i s6 = _mm256_srai_epi16(in[6], 15); - __m256i s7 = _mm256_srai_epi16(in[7], 15); - __m256i s8 = _mm256_srai_epi16(in[8], 15); - __m256i s9 = _mm256_srai_epi16(in[9], 15); - __m256i s10 = _mm256_srai_epi16(in[10], 15); - __m256i s11 = _mm256_srai_epi16(in[11], 15); - __m256i s12 = _mm256_srai_epi16(in[12], 15); - __m256i s13 = _mm256_srai_epi16(in[13], 15); - __m256i s14 = _mm256_srai_epi16(in[14], 15); - __m256i s15 = _mm256_srai_epi16(in[15], 15); - - in[0] = _mm256_add_epi16(in[0], one); - in[1] = _mm256_add_epi16(in[1], one); - in[2] = _mm256_add_epi16(in[2], one); - in[3] = _mm256_add_epi16(in[3], one); - in[4] = _mm256_add_epi16(in[4], one); - in[5] = _mm256_add_epi16(in[5], one); - in[6] = _mm256_add_epi16(in[6], one); - in[7] = _mm256_add_epi16(in[7], one); - in[8] = _mm256_add_epi16(in[8], one); - in[9] = _mm256_add_epi16(in[9], one); - in[10] = _mm256_add_epi16(in[10], one); - in[11] = _mm256_add_epi16(in[11], one); - in[12] = _mm256_add_epi16(in[12], one); - in[13] = _mm256_add_epi16(in[13], one); - in[14] = _mm256_add_epi16(in[14], one); - in[15] = _mm256_add_epi16(in[15], one); - - in[0] = _mm256_sub_epi16(in[0], s0); - in[1] = _mm256_sub_epi16(in[1], s1); - in[2] = _mm256_sub_epi16(in[2], s2); - in[3] = _mm256_sub_epi16(in[3], s3); - in[4] = _mm256_sub_epi16(in[4], s4); - in[5] = _mm256_sub_epi16(in[5], s5); - in[6] = _mm256_sub_epi16(in[6], s6); - in[7] = _mm256_sub_epi16(in[7], s7); - in[8] = _mm256_sub_epi16(in[8], s8); - in[9] = _mm256_sub_epi16(in[9], s9); - in[10] = _mm256_sub_epi16(in[10], s10); - in[11] = _mm256_sub_epi16(in[11], s11); - in[12] = _mm256_sub_epi16(in[12], s12); - in[13] = _mm256_sub_epi16(in[13], s13); - in[14] = 
_mm256_sub_epi16(in[14], s14); - in[15] = _mm256_sub_epi16(in[15], s15); - - in[0] = _mm256_srai_epi16(in[0], 2); - in[1] = _mm256_srai_epi16(in[1], 2); - in[2] = _mm256_srai_epi16(in[2], 2); - in[3] = _mm256_srai_epi16(in[3], 2); - in[4] = _mm256_srai_epi16(in[4], 2); - in[5] = _mm256_srai_epi16(in[5], 2); - in[6] = _mm256_srai_epi16(in[6], 2); - in[7] = _mm256_srai_epi16(in[7], 2); - in[8] = _mm256_srai_epi16(in[8], 2); - in[9] = _mm256_srai_epi16(in[9], 2); - in[10] = _mm256_srai_epi16(in[10], 2); - in[11] = _mm256_srai_epi16(in[11], 2); - in[12] = _mm256_srai_epi16(in[12], 2); - in[13] = _mm256_srai_epi16(in[13], 2); - in[14] = _mm256_srai_epi16(in[14], 2); - in[15] = _mm256_srai_epi16(in[15], 2); -} - -static void fdct16_avx2(__m256i *in) { - // sequence: cospi_L_H = pairs(L, H) and L first - const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); - const __m256i cospi_p16_p16 = pair256_set_epi16(cospi_16_64, cospi_16_64); - const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64); - const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); - const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m256i cospi_p28_p04 = pair256_set_epi16(cospi_28_64, cospi_4_64); - const __m256i cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64); - const __m256i cospi_p12_p20 = pair256_set_epi16(cospi_12_64, cospi_20_64); - const __m256i cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64); - - const __m256i cospi_p30_p02 = pair256_set_epi16(cospi_30_64, cospi_2_64); - const __m256i cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64); - - const __m256i cospi_p14_p18 = pair256_set_epi16(cospi_14_64, cospi_18_64); - const __m256i cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64); - - const __m256i cospi_p22_p10 = pair256_set_epi16(cospi_22_64, cospi_10_64); - const __m256i cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64); - - const __m256i 
cospi_p06_p26 = pair256_set_epi16(cospi_6_64, cospi_26_64); - const __m256i cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64); - - __m256i u0, u1, u2, u3, u4, u5, u6, u7; - __m256i s0, s1, s2, s3, s4, s5, s6, s7; - __m256i t0, t1, t2, t3, t4, t5, t6, t7; - __m256i v0, v1, v2, v3; - __m256i x0, x1; - - // 0, 4, 8, 12 - u0 = _mm256_add_epi16(in[0], in[15]); - u1 = _mm256_add_epi16(in[1], in[14]); - u2 = _mm256_add_epi16(in[2], in[13]); - u3 = _mm256_add_epi16(in[3], in[12]); - u4 = _mm256_add_epi16(in[4], in[11]); - u5 = _mm256_add_epi16(in[5], in[10]); - u6 = _mm256_add_epi16(in[6], in[9]); - u7 = _mm256_add_epi16(in[7], in[8]); - - s0 = _mm256_add_epi16(u0, u7); - s1 = _mm256_add_epi16(u1, u6); - s2 = _mm256_add_epi16(u2, u5); - s3 = _mm256_add_epi16(u3, u4); - - // 0, 8 - v0 = _mm256_add_epi16(s0, s3); - v1 = _mm256_add_epi16(s1, s2); - - x0 = _mm256_unpacklo_epi16(v0, v1); - x1 = _mm256_unpackhi_epi16(v0, v1); - - t0 = butter_fly(&x0, &x1, &cospi_p16_p16); - t1 = butter_fly(&x0, &x1, &cospi_p16_m16); - - // 4, 12 - v0 = _mm256_sub_epi16(s1, s2); - v1 = _mm256_sub_epi16(s0, s3); - - x0 = _mm256_unpacklo_epi16(v0, v1); - x1 = _mm256_unpackhi_epi16(v0, v1); - - t2 = butter_fly(&x0, &x1, &cospi_p24_p08); - t3 = butter_fly(&x0, &x1, &cospi_m08_p24); - - // 2, 6, 10, 14 - s0 = _mm256_sub_epi16(u3, u4); - s1 = _mm256_sub_epi16(u2, u5); - s2 = _mm256_sub_epi16(u1, u6); - s3 = _mm256_sub_epi16(u0, u7); - - v0 = s0; // output[4] - v3 = s3; // output[7] - - x0 = _mm256_unpacklo_epi16(s2, s1); - x1 = _mm256_unpackhi_epi16(s2, s1); - - v2 = butter_fly(&x0, &x1, &cospi_p16_p16); // output[5] - v1 = butter_fly(&x0, &x1, &cospi_p16_m16); // output[6] - - s0 = _mm256_add_epi16(v0, v1); // step[4] - s1 = _mm256_sub_epi16(v0, v1); // step[5] - s2 = _mm256_sub_epi16(v3, v2); // step[6] - s3 = _mm256_add_epi16(v3, v2); // step[7] - - // 2, 14 - x0 = _mm256_unpacklo_epi16(s0, s3); - x1 = _mm256_unpackhi_epi16(s0, s3); - - t4 = butter_fly(&x0, &x1, &cospi_p28_p04); - t5 = 
butter_fly(&x0, &x1, &cospi_m04_p28); - - // 10, 6 - x0 = _mm256_unpacklo_epi16(s1, s2); - x1 = _mm256_unpackhi_epi16(s1, s2); - t6 = butter_fly(&x0, &x1, &cospi_p12_p20); - t7 = butter_fly(&x0, &x1, &cospi_m20_p12); - - // 1, 3, 5, 7, 9, 11, 13, 15 - s0 = _mm256_sub_epi16(in[7], in[8]); // step[8] - s1 = _mm256_sub_epi16(in[6], in[9]); // step[9] - u2 = _mm256_sub_epi16(in[5], in[10]); - u3 = _mm256_sub_epi16(in[4], in[11]); - u4 = _mm256_sub_epi16(in[3], in[12]); - u5 = _mm256_sub_epi16(in[2], in[13]); - s6 = _mm256_sub_epi16(in[1], in[14]); // step[14] - s7 = _mm256_sub_epi16(in[0], in[15]); // step[15] - - in[0] = t0; - in[8] = t1; - in[4] = t2; - in[12] = t3; - in[2] = t4; - in[14] = t5; - in[10] = t6; - in[6] = t7; - - x0 = _mm256_unpacklo_epi16(u5, u2); - x1 = _mm256_unpackhi_epi16(u5, u2); - - s2 = butter_fly(&x0, &x1, &cospi_p16_p16); // step[13] - s5 = butter_fly(&x0, &x1, &cospi_p16_m16); // step[10] - - x0 = _mm256_unpacklo_epi16(u4, u3); - x1 = _mm256_unpackhi_epi16(u4, u3); - - s3 = butter_fly(&x0, &x1, &cospi_p16_p16); // step[12] - s4 = butter_fly(&x0, &x1, &cospi_p16_m16); // step[11] - - u0 = _mm256_add_epi16(s0, s4); // output[8] - u1 = _mm256_add_epi16(s1, s5); - u2 = _mm256_sub_epi16(s1, s5); - u3 = _mm256_sub_epi16(s0, s4); - u4 = _mm256_sub_epi16(s7, s3); - u5 = _mm256_sub_epi16(s6, s2); - u6 = _mm256_add_epi16(s6, s2); - u7 = _mm256_add_epi16(s7, s3); - - // stage 4 - s0 = u0; - s3 = u3; - s4 = u4; - s7 = u7; - - x0 = _mm256_unpacklo_epi16(u1, u6); - x1 = _mm256_unpackhi_epi16(u1, u6); - - s1 = butter_fly(&x0, &x1, &cospi_m08_p24); - s6 = butter_fly(&x0, &x1, &cospi_p24_p08); - - x0 = _mm256_unpacklo_epi16(u2, u5); - x1 = _mm256_unpackhi_epi16(u2, u5); - - s2 = butter_fly(&x0, &x1, &cospi_m24_m08); - s5 = butter_fly(&x0, &x1, &cospi_m08_p24); - - // stage 5 - u0 = _mm256_add_epi16(s0, s1); - u1 = _mm256_sub_epi16(s0, s1); - u2 = _mm256_sub_epi16(s3, s2); - u3 = _mm256_add_epi16(s3, s2); - u4 = _mm256_add_epi16(s4, s5); - u5 = 
_mm256_sub_epi16(s4, s5); - u6 = _mm256_sub_epi16(s7, s6); - u7 = _mm256_add_epi16(s7, s6); - - // stage 6 - x0 = _mm256_unpacklo_epi16(u0, u7); - x1 = _mm256_unpackhi_epi16(u0, u7); - in[1] = butter_fly(&x0, &x1, &cospi_p30_p02); - in[15] = butter_fly(&x0, &x1, &cospi_m02_p30); - - x0 = _mm256_unpacklo_epi16(u1, u6); - x1 = _mm256_unpackhi_epi16(u1, u6); - in[9] = butter_fly(&x0, &x1, &cospi_p14_p18); - in[7] = butter_fly(&x0, &x1, &cospi_m18_p14); - - x0 = _mm256_unpacklo_epi16(u2, u5); - x1 = _mm256_unpackhi_epi16(u2, u5); - in[5] = butter_fly(&x0, &x1, &cospi_p22_p10); - in[11] = butter_fly(&x0, &x1, &cospi_m10_p22); - - x0 = _mm256_unpacklo_epi16(u3, u4); - x1 = _mm256_unpackhi_epi16(u3, u4); - in[13] = butter_fly(&x0, &x1, &cospi_p06_p26); - in[3] = butter_fly(&x0, &x1, &cospi_m26_p06); -} - -void fadst16_avx2(__m256i *in) { - const __m256i cospi_p01_p31 = pair256_set_epi16(cospi_1_64, cospi_31_64); - const __m256i cospi_p31_m01 = pair256_set_epi16(cospi_31_64, -cospi_1_64); - const __m256i cospi_p05_p27 = pair256_set_epi16(cospi_5_64, cospi_27_64); - const __m256i cospi_p27_m05 = pair256_set_epi16(cospi_27_64, -cospi_5_64); - const __m256i cospi_p09_p23 = pair256_set_epi16(cospi_9_64, cospi_23_64); - const __m256i cospi_p23_m09 = pair256_set_epi16(cospi_23_64, -cospi_9_64); - const __m256i cospi_p13_p19 = pair256_set_epi16(cospi_13_64, cospi_19_64); - const __m256i cospi_p19_m13 = pair256_set_epi16(cospi_19_64, -cospi_13_64); - const __m256i cospi_p17_p15 = pair256_set_epi16(cospi_17_64, cospi_15_64); - const __m256i cospi_p15_m17 = pair256_set_epi16(cospi_15_64, -cospi_17_64); - const __m256i cospi_p21_p11 = pair256_set_epi16(cospi_21_64, cospi_11_64); - const __m256i cospi_p11_m21 = pair256_set_epi16(cospi_11_64, -cospi_21_64); - const __m256i cospi_p25_p07 = pair256_set_epi16(cospi_25_64, cospi_7_64); - const __m256i cospi_p07_m25 = pair256_set_epi16(cospi_7_64, -cospi_25_64); - const __m256i cospi_p29_p03 = pair256_set_epi16(cospi_29_64, cospi_3_64); - 
const __m256i cospi_p03_m29 = pair256_set_epi16(cospi_3_64, -cospi_29_64); - const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64); - const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64); - const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64); - const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64); - const __m256i cospi_m28_p04 = pair256_set_epi16(-cospi_28_64, cospi_4_64); - const __m256i cospi_m12_p20 = pair256_set_epi16(-cospi_12_64, cospi_20_64); - const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64); - const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64); - const __m256i cospi_m24_p08 = pair256_set_epi16(-cospi_24_64, cospi_8_64); - const __m256i cospi_m16_m16 = _mm256_set1_epi16((int16_t)-cospi_16_64); - const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); - const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); - const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64); - const __m256i zero = _mm256_setzero_si256(); - const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); - __m256i s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; - __m256i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - __m256i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; - __m256i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; - __m256i y0, y1; - - // stage 1, s takes low 256 bits; x takes high 256 bits - y0 = _mm256_unpacklo_epi16(in[15], in[0]); - y1 = _mm256_unpackhi_epi16(in[15], in[0]); - s0 = _mm256_madd_epi16(y0, cospi_p01_p31); - x0 = _mm256_madd_epi16(y1, cospi_p01_p31); - s1 = _mm256_madd_epi16(y0, cospi_p31_m01); - x1 = _mm256_madd_epi16(y1, cospi_p31_m01); - - y0 = _mm256_unpacklo_epi16(in[13], in[2]); - y1 = _mm256_unpackhi_epi16(in[13], in[2]); - s2 = _mm256_madd_epi16(y0, cospi_p05_p27); - x2 = 
_mm256_madd_epi16(y1, cospi_p05_p27); - s3 = _mm256_madd_epi16(y0, cospi_p27_m05); - x3 = _mm256_madd_epi16(y1, cospi_p27_m05); - - y0 = _mm256_unpacklo_epi16(in[11], in[4]); - y1 = _mm256_unpackhi_epi16(in[11], in[4]); - s4 = _mm256_madd_epi16(y0, cospi_p09_p23); - x4 = _mm256_madd_epi16(y1, cospi_p09_p23); - s5 = _mm256_madd_epi16(y0, cospi_p23_m09); - x5 = _mm256_madd_epi16(y1, cospi_p23_m09); - - y0 = _mm256_unpacklo_epi16(in[9], in[6]); - y1 = _mm256_unpackhi_epi16(in[9], in[6]); - s6 = _mm256_madd_epi16(y0, cospi_p13_p19); - x6 = _mm256_madd_epi16(y1, cospi_p13_p19); - s7 = _mm256_madd_epi16(y0, cospi_p19_m13); - x7 = _mm256_madd_epi16(y1, cospi_p19_m13); - - y0 = _mm256_unpacklo_epi16(in[7], in[8]); - y1 = _mm256_unpackhi_epi16(in[7], in[8]); - s8 = _mm256_madd_epi16(y0, cospi_p17_p15); - x8 = _mm256_madd_epi16(y1, cospi_p17_p15); - s9 = _mm256_madd_epi16(y0, cospi_p15_m17); - x9 = _mm256_madd_epi16(y1, cospi_p15_m17); - - y0 = _mm256_unpacklo_epi16(in[5], in[10]); - y1 = _mm256_unpackhi_epi16(in[5], in[10]); - s10 = _mm256_madd_epi16(y0, cospi_p21_p11); - x10 = _mm256_madd_epi16(y1, cospi_p21_p11); - s11 = _mm256_madd_epi16(y0, cospi_p11_m21); - x11 = _mm256_madd_epi16(y1, cospi_p11_m21); - - y0 = _mm256_unpacklo_epi16(in[3], in[12]); - y1 = _mm256_unpackhi_epi16(in[3], in[12]); - s12 = _mm256_madd_epi16(y0, cospi_p25_p07); - x12 = _mm256_madd_epi16(y1, cospi_p25_p07); - s13 = _mm256_madd_epi16(y0, cospi_p07_m25); - x13 = _mm256_madd_epi16(y1, cospi_p07_m25); - - y0 = _mm256_unpacklo_epi16(in[1], in[14]); - y1 = _mm256_unpackhi_epi16(in[1], in[14]); - s14 = _mm256_madd_epi16(y0, cospi_p29_p03); - x14 = _mm256_madd_epi16(y1, cospi_p29_p03); - s15 = _mm256_madd_epi16(y0, cospi_p03_m29); - x15 = _mm256_madd_epi16(y1, cospi_p03_m29); - - // u takes low 256 bits; v takes high 256 bits - u0 = _mm256_add_epi32(s0, s8); - u1 = _mm256_add_epi32(s1, s9); - u2 = _mm256_add_epi32(s2, s10); - u3 = _mm256_add_epi32(s3, s11); - u4 = _mm256_add_epi32(s4, s12); - u5 = 
_mm256_add_epi32(s5, s13); - u6 = _mm256_add_epi32(s6, s14); - u7 = _mm256_add_epi32(s7, s15); - - u8 = _mm256_sub_epi32(s0, s8); - u9 = _mm256_sub_epi32(s1, s9); - u10 = _mm256_sub_epi32(s2, s10); - u11 = _mm256_sub_epi32(s3, s11); - u12 = _mm256_sub_epi32(s4, s12); - u13 = _mm256_sub_epi32(s5, s13); - u14 = _mm256_sub_epi32(s6, s14); - u15 = _mm256_sub_epi32(s7, s15); - - v0 = _mm256_add_epi32(x0, x8); - v1 = _mm256_add_epi32(x1, x9); - v2 = _mm256_add_epi32(x2, x10); - v3 = _mm256_add_epi32(x3, x11); - v4 = _mm256_add_epi32(x4, x12); - v5 = _mm256_add_epi32(x5, x13); - v6 = _mm256_add_epi32(x6, x14); - v7 = _mm256_add_epi32(x7, x15); - - v8 = _mm256_sub_epi32(x0, x8); - v9 = _mm256_sub_epi32(x1, x9); - v10 = _mm256_sub_epi32(x2, x10); - v11 = _mm256_sub_epi32(x3, x11); - v12 = _mm256_sub_epi32(x4, x12); - v13 = _mm256_sub_epi32(x5, x13); - v14 = _mm256_sub_epi32(x6, x14); - v15 = _mm256_sub_epi32(x7, x15); - - // low 256 bits rounding - u8 = _mm256_add_epi32(u8, dct_rounding); - u9 = _mm256_add_epi32(u9, dct_rounding); - u10 = _mm256_add_epi32(u10, dct_rounding); - u11 = _mm256_add_epi32(u11, dct_rounding); - u12 = _mm256_add_epi32(u12, dct_rounding); - u13 = _mm256_add_epi32(u13, dct_rounding); - u14 = _mm256_add_epi32(u14, dct_rounding); - u15 = _mm256_add_epi32(u15, dct_rounding); - - u8 = _mm256_srai_epi32(u8, DCT_CONST_BITS); - u9 = _mm256_srai_epi32(u9, DCT_CONST_BITS); - u10 = _mm256_srai_epi32(u10, DCT_CONST_BITS); - u11 = _mm256_srai_epi32(u11, DCT_CONST_BITS); - u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS); - u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS); - u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS); - u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS); - - // high 256 bits rounding - v8 = _mm256_add_epi32(v8, dct_rounding); - v9 = _mm256_add_epi32(v9, dct_rounding); - v10 = _mm256_add_epi32(v10, dct_rounding); - v11 = _mm256_add_epi32(v11, dct_rounding); - v12 = _mm256_add_epi32(v12, dct_rounding); - v13 = _mm256_add_epi32(v13, dct_rounding); - v14 = 
_mm256_add_epi32(v14, dct_rounding); - v15 = _mm256_add_epi32(v15, dct_rounding); - - v8 = _mm256_srai_epi32(v8, DCT_CONST_BITS); - v9 = _mm256_srai_epi32(v9, DCT_CONST_BITS); - v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS); - v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS); - v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS); - v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS); - v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS); - v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS); - - // Saturation pack 32-bit to 16-bit - x8 = _mm256_packs_epi32(u8, v8); - x9 = _mm256_packs_epi32(u9, v9); - x10 = _mm256_packs_epi32(u10, v10); - x11 = _mm256_packs_epi32(u11, v11); - x12 = _mm256_packs_epi32(u12, v12); - x13 = _mm256_packs_epi32(u13, v13); - x14 = _mm256_packs_epi32(u14, v14); - x15 = _mm256_packs_epi32(u15, v15); - - // stage 2 - y0 = _mm256_unpacklo_epi16(x8, x9); - y1 = _mm256_unpackhi_epi16(x8, x9); - s8 = _mm256_madd_epi16(y0, cospi_p04_p28); - x8 = _mm256_madd_epi16(y1, cospi_p04_p28); - s9 = _mm256_madd_epi16(y0, cospi_p28_m04); - x9 = _mm256_madd_epi16(y1, cospi_p28_m04); - - y0 = _mm256_unpacklo_epi16(x10, x11); - y1 = _mm256_unpackhi_epi16(x10, x11); - s10 = _mm256_madd_epi16(y0, cospi_p20_p12); - x10 = _mm256_madd_epi16(y1, cospi_p20_p12); - s11 = _mm256_madd_epi16(y0, cospi_p12_m20); - x11 = _mm256_madd_epi16(y1, cospi_p12_m20); - - y0 = _mm256_unpacklo_epi16(x12, x13); - y1 = _mm256_unpackhi_epi16(x12, x13); - s12 = _mm256_madd_epi16(y0, cospi_m28_p04); - x12 = _mm256_madd_epi16(y1, cospi_m28_p04); - s13 = _mm256_madd_epi16(y0, cospi_p04_p28); - x13 = _mm256_madd_epi16(y1, cospi_p04_p28); - - y0 = _mm256_unpacklo_epi16(x14, x15); - y1 = _mm256_unpackhi_epi16(x14, x15); - s14 = _mm256_madd_epi16(y0, cospi_m12_p20); - x14 = _mm256_madd_epi16(y1, cospi_m12_p20); - s15 = _mm256_madd_epi16(y0, cospi_p20_p12); - x15 = _mm256_madd_epi16(y1, cospi_p20_p12); - - x0 = _mm256_add_epi32(u0, u4); - s0 = _mm256_add_epi32(v0, v4); - x1 = _mm256_add_epi32(u1, u5); - s1 = _mm256_add_epi32(v1, 
v5); - x2 = _mm256_add_epi32(u2, u6); - s2 = _mm256_add_epi32(v2, v6); - x3 = _mm256_add_epi32(u3, u7); - s3 = _mm256_add_epi32(v3, v7); - - v8 = _mm256_sub_epi32(u0, u4); - v9 = _mm256_sub_epi32(v0, v4); - v10 = _mm256_sub_epi32(u1, u5); - v11 = _mm256_sub_epi32(v1, v5); - v12 = _mm256_sub_epi32(u2, u6); - v13 = _mm256_sub_epi32(v2, v6); - v14 = _mm256_sub_epi32(u3, u7); - v15 = _mm256_sub_epi32(v3, v7); - - v8 = _mm256_add_epi32(v8, dct_rounding); - v9 = _mm256_add_epi32(v9, dct_rounding); - v10 = _mm256_add_epi32(v10, dct_rounding); - v11 = _mm256_add_epi32(v11, dct_rounding); - v12 = _mm256_add_epi32(v12, dct_rounding); - v13 = _mm256_add_epi32(v13, dct_rounding); - v14 = _mm256_add_epi32(v14, dct_rounding); - v15 = _mm256_add_epi32(v15, dct_rounding); - - v8 = _mm256_srai_epi32(v8, DCT_CONST_BITS); - v9 = _mm256_srai_epi32(v9, DCT_CONST_BITS); - v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS); - v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS); - v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS); - v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS); - v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS); - v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS); - - x4 = _mm256_packs_epi32(v8, v9); - x5 = _mm256_packs_epi32(v10, v11); - x6 = _mm256_packs_epi32(v12, v13); - x7 = _mm256_packs_epi32(v14, v15); - - u8 = _mm256_add_epi32(s8, s12); - u9 = _mm256_add_epi32(s9, s13); - u10 = _mm256_add_epi32(s10, s14); - u11 = _mm256_add_epi32(s11, s15); - u12 = _mm256_sub_epi32(s8, s12); - u13 = _mm256_sub_epi32(s9, s13); - u14 = _mm256_sub_epi32(s10, s14); - u15 = _mm256_sub_epi32(s11, s15); - - v8 = _mm256_add_epi32(x8, x12); - v9 = _mm256_add_epi32(x9, x13); - v10 = _mm256_add_epi32(x10, x14); - v11 = _mm256_add_epi32(x11, x15); - v12 = _mm256_sub_epi32(x8, x12); - v13 = _mm256_sub_epi32(x9, x13); - v14 = _mm256_sub_epi32(x10, x14); - v15 = _mm256_sub_epi32(x11, x15); - - u12 = _mm256_add_epi32(u12, dct_rounding); - u13 = _mm256_add_epi32(u13, dct_rounding); - u14 = _mm256_add_epi32(u14, 
dct_rounding); - u15 = _mm256_add_epi32(u15, dct_rounding); - - u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS); - u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS); - u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS); - u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS); - - v12 = _mm256_add_epi32(v12, dct_rounding); - v13 = _mm256_add_epi32(v13, dct_rounding); - v14 = _mm256_add_epi32(v14, dct_rounding); - v15 = _mm256_add_epi32(v15, dct_rounding); - - v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS); - v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS); - v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS); - v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS); - - x12 = _mm256_packs_epi32(u12, v12); - x13 = _mm256_packs_epi32(u13, v13); - x14 = _mm256_packs_epi32(u14, v14); - x15 = _mm256_packs_epi32(u15, v15); - - // stage 3 - y0 = _mm256_unpacklo_epi16(x4, x5); - y1 = _mm256_unpackhi_epi16(x4, x5); - s4 = _mm256_madd_epi16(y0, cospi_p08_p24); - x4 = _mm256_madd_epi16(y1, cospi_p08_p24); - s5 = _mm256_madd_epi16(y0, cospi_p24_m08); - x5 = _mm256_madd_epi16(y1, cospi_p24_m08); - - y0 = _mm256_unpacklo_epi16(x6, x7); - y1 = _mm256_unpackhi_epi16(x6, x7); - s6 = _mm256_madd_epi16(y0, cospi_m24_p08); - x6 = _mm256_madd_epi16(y1, cospi_m24_p08); - s7 = _mm256_madd_epi16(y0, cospi_p08_p24); - x7 = _mm256_madd_epi16(y1, cospi_p08_p24); - - y0 = _mm256_unpacklo_epi16(x12, x13); - y1 = _mm256_unpackhi_epi16(x12, x13); - s12 = _mm256_madd_epi16(y0, cospi_p08_p24); - x12 = _mm256_madd_epi16(y1, cospi_p08_p24); - s13 = _mm256_madd_epi16(y0, cospi_p24_m08); - x13 = _mm256_madd_epi16(y1, cospi_p24_m08); - - y0 = _mm256_unpacklo_epi16(x14, x15); - y1 = _mm256_unpackhi_epi16(x14, x15); - s14 = _mm256_madd_epi16(y0, cospi_m24_p08); - x14 = _mm256_madd_epi16(y1, cospi_m24_p08); - s15 = _mm256_madd_epi16(y0, cospi_p08_p24); - x15 = _mm256_madd_epi16(y1, cospi_p08_p24); - - u0 = _mm256_add_epi32(x0, x2); - v0 = _mm256_add_epi32(s0, s2); - u1 = _mm256_add_epi32(x1, x3); - v1 = _mm256_add_epi32(s1, s3); - u2 = 
_mm256_sub_epi32(x0, x2); - v2 = _mm256_sub_epi32(s0, s2); - u3 = _mm256_sub_epi32(x1, x3); - v3 = _mm256_sub_epi32(s1, s3); - - u0 = _mm256_add_epi32(u0, dct_rounding); - v0 = _mm256_add_epi32(v0, dct_rounding); - u1 = _mm256_add_epi32(u1, dct_rounding); - v1 = _mm256_add_epi32(v1, dct_rounding); - u2 = _mm256_add_epi32(u2, dct_rounding); - v2 = _mm256_add_epi32(v2, dct_rounding); - u3 = _mm256_add_epi32(u3, dct_rounding); - v3 = _mm256_add_epi32(v3, dct_rounding); - - u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS); - v0 = _mm256_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS); - v1 = _mm256_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS); - v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS); - v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS); - - in[0] = _mm256_packs_epi32(u0, v0); - x1 = _mm256_packs_epi32(u1, v1); - x2 = _mm256_packs_epi32(u2, v2); - x3 = _mm256_packs_epi32(u3, v3); - - // Rounding on s4 + s6, s5 + s7, s4 - s6, s5 - s7 - u4 = _mm256_add_epi32(s4, s6); - u5 = _mm256_add_epi32(s5, s7); - u6 = _mm256_sub_epi32(s4, s6); - u7 = _mm256_sub_epi32(s5, s7); - - v4 = _mm256_add_epi32(x4, x6); - v5 = _mm256_add_epi32(x5, x7); - v6 = _mm256_sub_epi32(x4, x6); - v7 = _mm256_sub_epi32(x5, x7); - - u4 = _mm256_add_epi32(u4, dct_rounding); - u5 = _mm256_add_epi32(u5, dct_rounding); - u6 = _mm256_add_epi32(u6, dct_rounding); - u7 = _mm256_add_epi32(u7, dct_rounding); - - u4 = _mm256_srai_epi32(u4, DCT_CONST_BITS); - u5 = _mm256_srai_epi32(u5, DCT_CONST_BITS); - u6 = _mm256_srai_epi32(u6, DCT_CONST_BITS); - u7 = _mm256_srai_epi32(u7, DCT_CONST_BITS); - - v4 = _mm256_add_epi32(v4, dct_rounding); - v5 = _mm256_add_epi32(v5, dct_rounding); - v6 = _mm256_add_epi32(v6, dct_rounding); - v7 = _mm256_add_epi32(v7, dct_rounding); - - v4 = _mm256_srai_epi32(v4, DCT_CONST_BITS); - v5 = _mm256_srai_epi32(v5, DCT_CONST_BITS); - v6 = _mm256_srai_epi32(v6, DCT_CONST_BITS); - v7 = 
_mm256_srai_epi32(v7, DCT_CONST_BITS); - - x4 = _mm256_packs_epi32(u4, v4); - in[12] = _mm256_packs_epi32(u5, v5); - x6 = _mm256_packs_epi32(u6, v6); - x7 = _mm256_packs_epi32(u7, v7); - - u0 = _mm256_add_epi32(u8, u10); - v0 = _mm256_add_epi32(v8, v10); - u1 = _mm256_add_epi32(u9, u11); - v1 = _mm256_add_epi32(v9, v11); - u2 = _mm256_sub_epi32(u8, u10); - v2 = _mm256_sub_epi32(v8, v10); - u3 = _mm256_sub_epi32(u9, u11); - v3 = _mm256_sub_epi32(v9, v11); - - u0 = _mm256_add_epi32(u0, dct_rounding); - v0 = _mm256_add_epi32(v0, dct_rounding); - u1 = _mm256_add_epi32(u1, dct_rounding); - v1 = _mm256_add_epi32(v1, dct_rounding); - u2 = _mm256_add_epi32(u2, dct_rounding); - v2 = _mm256_add_epi32(v2, dct_rounding); - u3 = _mm256_add_epi32(u3, dct_rounding); - v3 = _mm256_add_epi32(v3, dct_rounding); - - u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS); - v0 = _mm256_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS); - v1 = _mm256_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS); - v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS); - v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS); - - x8 = _mm256_packs_epi32(u0, v0); - in[14] = _mm256_packs_epi32(u1, v1); - x10 = _mm256_packs_epi32(u2, v2); - x11 = _mm256_packs_epi32(u3, v3); - - // Rounding on s12 + s14, s13 + s15, s12 - s14, s13 - s15 - u12 = _mm256_add_epi32(s12, s14); - u13 = _mm256_add_epi32(s13, s15); - u14 = _mm256_sub_epi32(s12, s14); - u15 = _mm256_sub_epi32(s13, s15); - - v12 = _mm256_add_epi32(x12, x14); - v13 = _mm256_add_epi32(x13, x15); - v14 = _mm256_sub_epi32(x12, x14); - v15 = _mm256_sub_epi32(x13, x15); - - u12 = _mm256_add_epi32(u12, dct_rounding); - u13 = _mm256_add_epi32(u13, dct_rounding); - u14 = _mm256_add_epi32(u14, dct_rounding); - u15 = _mm256_add_epi32(u15, dct_rounding); - - u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS); - u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS); - u14 = _mm256_srai_epi32(u14, 
DCT_CONST_BITS); - u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS); - - v12 = _mm256_add_epi32(v12, dct_rounding); - v13 = _mm256_add_epi32(v13, dct_rounding); - v14 = _mm256_add_epi32(v14, dct_rounding); - v15 = _mm256_add_epi32(v15, dct_rounding); - - v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS); - v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS); - v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS); - v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS); - - x12 = _mm256_packs_epi32(u12, v12); - x13 = _mm256_packs_epi32(u13, v13); - x14 = _mm256_packs_epi32(u14, v14); - x15 = _mm256_packs_epi32(u15, v15); - in[2] = x12; - - // stage 4 - y0 = _mm256_unpacklo_epi16(x2, x3); - y1 = _mm256_unpackhi_epi16(x2, x3); - s2 = _mm256_madd_epi16(y0, cospi_m16_m16); - x2 = _mm256_madd_epi16(y1, cospi_m16_m16); - s3 = _mm256_madd_epi16(y0, cospi_p16_m16); - x3 = _mm256_madd_epi16(y1, cospi_p16_m16); - - y0 = _mm256_unpacklo_epi16(x6, x7); - y1 = _mm256_unpackhi_epi16(x6, x7); - s6 = _mm256_madd_epi16(y0, cospi_p16_p16); - x6 = _mm256_madd_epi16(y1, cospi_p16_p16); - s7 = _mm256_madd_epi16(y0, cospi_m16_p16); - x7 = _mm256_madd_epi16(y1, cospi_m16_p16); - - y0 = _mm256_unpacklo_epi16(x10, x11); - y1 = _mm256_unpackhi_epi16(x10, x11); - s10 = _mm256_madd_epi16(y0, cospi_p16_p16); - x10 = _mm256_madd_epi16(y1, cospi_p16_p16); - s11 = _mm256_madd_epi16(y0, cospi_m16_p16); - x11 = _mm256_madd_epi16(y1, cospi_m16_p16); - - y0 = _mm256_unpacklo_epi16(x14, x15); - y1 = _mm256_unpackhi_epi16(x14, x15); - s14 = _mm256_madd_epi16(y0, cospi_m16_m16); - x14 = _mm256_madd_epi16(y1, cospi_m16_m16); - s15 = _mm256_madd_epi16(y0, cospi_p16_m16); - x15 = _mm256_madd_epi16(y1, cospi_p16_m16); - - // Rounding - u2 = _mm256_add_epi32(s2, dct_rounding); - u3 = _mm256_add_epi32(s3, dct_rounding); - u6 = _mm256_add_epi32(s6, dct_rounding); - u7 = _mm256_add_epi32(s7, dct_rounding); - - u10 = _mm256_add_epi32(s10, dct_rounding); - u11 = _mm256_add_epi32(s11, dct_rounding); - u14 = _mm256_add_epi32(s14, dct_rounding); - 
u15 = _mm256_add_epi32(s15, dct_rounding); - - u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS); - u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS); - u6 = _mm256_srai_epi32(u6, DCT_CONST_BITS); - u7 = _mm256_srai_epi32(u7, DCT_CONST_BITS); - - u10 = _mm256_srai_epi32(u10, DCT_CONST_BITS); - u11 = _mm256_srai_epi32(u11, DCT_CONST_BITS); - u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS); - u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS); - - v2 = _mm256_add_epi32(x2, dct_rounding); - v3 = _mm256_add_epi32(x3, dct_rounding); - v6 = _mm256_add_epi32(x6, dct_rounding); - v7 = _mm256_add_epi32(x7, dct_rounding); - - v10 = _mm256_add_epi32(x10, dct_rounding); - v11 = _mm256_add_epi32(x11, dct_rounding); - v14 = _mm256_add_epi32(x14, dct_rounding); - v15 = _mm256_add_epi32(x15, dct_rounding); - - v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS); - v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS); - v6 = _mm256_srai_epi32(v6, DCT_CONST_BITS); - v7 = _mm256_srai_epi32(v7, DCT_CONST_BITS); - - v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS); - v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS); - v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS); - v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS); - - in[7] = _mm256_packs_epi32(u2, v2); - in[8] = _mm256_packs_epi32(u3, v3); - - in[4] = _mm256_packs_epi32(u6, v6); - in[11] = _mm256_packs_epi32(u7, v7); - - in[6] = _mm256_packs_epi32(u10, v10); - in[9] = _mm256_packs_epi32(u11, v11); - - in[5] = _mm256_packs_epi32(u14, v14); - in[10] = _mm256_packs_epi32(u15, v15); - - in[1] = _mm256_sub_epi16(zero, x8); - in[3] = _mm256_sub_epi16(zero, x4); - in[13] = _mm256_sub_epi16(zero, x13); - in[15] = _mm256_sub_epi16(zero, x1); -} - -#if CONFIG_EXT_TX -static void fidtx16_avx2(__m256i *in) { - txfm_scaling16_avx2((int16_t)Sqrt2, in); -} -#endif - -void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m256i in[16]; - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "Invalid tx type for 
tx size"); -#endif - - switch (tx_type) { - case DCT_DCT: - load_buffer_16x16(input, stride, 0, 0, in); - fdct16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fdct16_avx2(in); - break; - case ADST_DCT: - load_buffer_16x16(input, stride, 0, 0, in); - fadst16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fdct16_avx2(in); - break; - case DCT_ADST: - load_buffer_16x16(input, stride, 0, 0, in); - fdct16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fadst16_avx2(in); - break; - case ADST_ADST: - load_buffer_16x16(input, stride, 0, 0, in); - fadst16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fadst16_avx2(in); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - load_buffer_16x16(input, stride, 1, 0, in); - fadst16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fdct16_avx2(in); - break; - case DCT_FLIPADST: - load_buffer_16x16(input, stride, 0, 1, in); - fdct16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fadst16_avx2(in); - break; - case FLIPADST_FLIPADST: - load_buffer_16x16(input, stride, 1, 1, in); - fadst16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fadst16_avx2(in); - break; - case ADST_FLIPADST: - load_buffer_16x16(input, stride, 0, 1, in); - fadst16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fadst16_avx2(in); - break; - case FLIPADST_ADST: - load_buffer_16x16(input, stride, 1, 0, in); - fadst16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fadst16_avx2(in); - break; - case IDTX: - load_buffer_16x16(input, stride, 0, 0, in); - fidtx16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fidtx16_avx2(in); - break; - case V_DCT: - load_buffer_16x16(input, stride, 0, 0, in); - fdct16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fidtx16_avx2(in); - break; - case H_DCT: - load_buffer_16x16(input, stride, 0, 0, 
in); - fidtx16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fdct16_avx2(in); - break; - case V_ADST: - load_buffer_16x16(input, stride, 0, 0, in); - fadst16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fidtx16_avx2(in); - break; - case H_ADST: - load_buffer_16x16(input, stride, 0, 0, in); - fidtx16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fadst16_avx2(in); - break; - case V_FLIPADST: - load_buffer_16x16(input, stride, 1, 0, in); - fadst16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fidtx16_avx2(in); - break; - case H_FLIPADST: - load_buffer_16x16(input, stride, 0, 1, in); - fidtx16_avx2(in); - mm256_transpose_16x16(in, in); - right_shift_16x16(in); - fadst16_avx2(in); - break; -#endif // CONFIG_EXT_TX - default: assert(0); break; - } - mm256_transpose_16x16(in, in); - write_buffer_16x16(in, output); - _mm256_zeroupper(); -} - -static void mm256_vectors_swap(__m256i *a0, __m256i *a1, const int size) { - int i = 0; - __m256i temp; - while (i < size) { - temp = a0[i]; - a0[i] = a1[i]; - a1[i] = temp; - i++; - } -} - -static void mm256_transpose_32x32(__m256i *in0, __m256i *in1) { - mm256_transpose_16x16(in0, in0); - mm256_transpose_16x16(&in0[16], &in0[16]); - mm256_transpose_16x16(in1, in1); - mm256_transpose_16x16(&in1[16], &in1[16]); - mm256_vectors_swap(&in0[16], in1, 16); -} - -static void prepare_16x16_even(const __m256i *in, __m256i *even) { - even[0] = _mm256_add_epi16(in[0], in[31]); - even[1] = _mm256_add_epi16(in[1], in[30]); - even[2] = _mm256_add_epi16(in[2], in[29]); - even[3] = _mm256_add_epi16(in[3], in[28]); - even[4] = _mm256_add_epi16(in[4], in[27]); - even[5] = _mm256_add_epi16(in[5], in[26]); - even[6] = _mm256_add_epi16(in[6], in[25]); - even[7] = _mm256_add_epi16(in[7], in[24]); - even[8] = _mm256_add_epi16(in[8], in[23]); - even[9] = _mm256_add_epi16(in[9], in[22]); - even[10] = _mm256_add_epi16(in[10], in[21]); - even[11] = 
_mm256_add_epi16(in[11], in[20]); - even[12] = _mm256_add_epi16(in[12], in[19]); - even[13] = _mm256_add_epi16(in[13], in[18]); - even[14] = _mm256_add_epi16(in[14], in[17]); - even[15] = _mm256_add_epi16(in[15], in[16]); -} - -static void prepare_16x16_odd(const __m256i *in, __m256i *odd) { - odd[0] = _mm256_sub_epi16(in[15], in[16]); - odd[1] = _mm256_sub_epi16(in[14], in[17]); - odd[2] = _mm256_sub_epi16(in[13], in[18]); - odd[3] = _mm256_sub_epi16(in[12], in[19]); - odd[4] = _mm256_sub_epi16(in[11], in[20]); - odd[5] = _mm256_sub_epi16(in[10], in[21]); - odd[6] = _mm256_sub_epi16(in[9], in[22]); - odd[7] = _mm256_sub_epi16(in[8], in[23]); - odd[8] = _mm256_sub_epi16(in[7], in[24]); - odd[9] = _mm256_sub_epi16(in[6], in[25]); - odd[10] = _mm256_sub_epi16(in[5], in[26]); - odd[11] = _mm256_sub_epi16(in[4], in[27]); - odd[12] = _mm256_sub_epi16(in[3], in[28]); - odd[13] = _mm256_sub_epi16(in[2], in[29]); - odd[14] = _mm256_sub_epi16(in[1], in[30]); - odd[15] = _mm256_sub_epi16(in[0], in[31]); -} - -static void collect_16col(const __m256i *even, const __m256i *odd, - __m256i *out) { - // fdct16_avx2() already maps the output - out[0] = even[0]; - out[2] = even[1]; - out[4] = even[2]; - out[6] = even[3]; - out[8] = even[4]; - out[10] = even[5]; - out[12] = even[6]; - out[14] = even[7]; - out[16] = even[8]; - out[18] = even[9]; - out[20] = even[10]; - out[22] = even[11]; - out[24] = even[12]; - out[26] = even[13]; - out[28] = even[14]; - out[30] = even[15]; - - out[1] = odd[0]; - out[17] = odd[1]; - out[9] = odd[2]; - out[25] = odd[3]; - out[5] = odd[4]; - out[21] = odd[5]; - out[13] = odd[6]; - out[29] = odd[7]; - out[3] = odd[8]; - out[19] = odd[9]; - out[11] = odd[10]; - out[27] = odd[11]; - out[7] = odd[12]; - out[23] = odd[13]; - out[15] = odd[14]; - out[31] = odd[15]; -} - -static void collect_coeffs(const __m256i *first_16col_even, - const __m256i *first_16col_odd, - const __m256i *second_16col_even, - const __m256i *second_16col_odd, __m256i *in0, - __m256i 
*in1) { - collect_16col(first_16col_even, first_16col_odd, in0); - collect_16col(second_16col_even, second_16col_odd, in1); -} - -static void fdct16_odd_avx2(__m256i *in) { - // sequence: cospi_L_H = pairs(L, H) and L first - const __m256i cospi_p16_p16 = pair256_set_epi16(cospi_16_64, cospi_16_64); - const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64); - const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); - const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64); - const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); - const __m256i cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64); - const __m256i cospi_p28_p04 = pair256_set_epi16(cospi_28_64, cospi_4_64); - const __m256i cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64); - const __m256i cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64); - const __m256i cospi_p12_p20 = pair256_set_epi16(cospi_12_64, cospi_20_64); - const __m256i cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m256i cospi_p31_p01 = pair256_set_epi16(cospi_31_64, cospi_1_64); - const __m256i cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64); - const __m256i cospi_p15_p17 = pair256_set_epi16(cospi_15_64, cospi_17_64); - const __m256i cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64); - const __m256i cospi_p23_p09 = pair256_set_epi16(cospi_23_64, cospi_9_64); - const __m256i cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64); - const __m256i cospi_p07_p25 = pair256_set_epi16(cospi_7_64, cospi_25_64); - const __m256i cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64); - const __m256i cospi_p27_p05 = pair256_set_epi16(cospi_27_64, cospi_5_64); - const __m256i cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64); - const __m256i cospi_p11_p21 = pair256_set_epi16(cospi_11_64, cospi_21_64); - const __m256i cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64); 
- const __m256i cospi_p19_p13 = pair256_set_epi16(cospi_19_64, cospi_13_64); - const __m256i cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64); - const __m256i cospi_p03_p29 = pair256_set_epi16(cospi_3_64, cospi_29_64); - const __m256i cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64); - - __m256i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - __m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15; - __m256i u0, u1; - - // stage 1 is in prepare_16x16_odd() - - // stage 2 - y0 = in[0]; - y1 = in[1]; - y2 = in[2]; - y3 = in[3]; - - u0 = _mm256_unpacklo_epi16(in[4], in[11]); - u1 = _mm256_unpackhi_epi16(in[4], in[11]); - y4 = butter_fly(&u0, &u1, &cospi_m16_p16); - y11 = butter_fly(&u0, &u1, &cospi_p16_p16); - - u0 = _mm256_unpacklo_epi16(in[5], in[10]); - u1 = _mm256_unpackhi_epi16(in[5], in[10]); - y5 = butter_fly(&u0, &u1, &cospi_m16_p16); - y10 = butter_fly(&u0, &u1, &cospi_p16_p16); - - u0 = _mm256_unpacklo_epi16(in[6], in[9]); - u1 = _mm256_unpackhi_epi16(in[6], in[9]); - y6 = butter_fly(&u0, &u1, &cospi_m16_p16); - y9 = butter_fly(&u0, &u1, &cospi_p16_p16); - - u0 = _mm256_unpacklo_epi16(in[7], in[8]); - u1 = _mm256_unpackhi_epi16(in[7], in[8]); - y7 = butter_fly(&u0, &u1, &cospi_m16_p16); - y8 = butter_fly(&u0, &u1, &cospi_p16_p16); - - y12 = in[12]; - y13 = in[13]; - y14 = in[14]; - y15 = in[15]; - - // stage 3 - x0 = _mm256_add_epi16(y0, y7); - x1 = _mm256_add_epi16(y1, y6); - x2 = _mm256_add_epi16(y2, y5); - x3 = _mm256_add_epi16(y3, y4); - x4 = _mm256_sub_epi16(y3, y4); - x5 = _mm256_sub_epi16(y2, y5); - x6 = _mm256_sub_epi16(y1, y6); - x7 = _mm256_sub_epi16(y0, y7); - x8 = _mm256_sub_epi16(y15, y8); - x9 = _mm256_sub_epi16(y14, y9); - x10 = _mm256_sub_epi16(y13, y10); - x11 = _mm256_sub_epi16(y12, y11); - x12 = _mm256_add_epi16(y12, y11); - x13 = _mm256_add_epi16(y13, y10); - x14 = _mm256_add_epi16(y14, y9); - x15 = _mm256_add_epi16(y15, y8); - - // stage 4 - y0 = x0; - y1 = x1; - y6 = 
x6; - y7 = x7; - y8 = x8; - y9 = x9; - y14 = x14; - y15 = x15; - - u0 = _mm256_unpacklo_epi16(x2, x13); - u1 = _mm256_unpackhi_epi16(x2, x13); - y2 = butter_fly(&u0, &u1, &cospi_m08_p24); - y13 = butter_fly(&u0, &u1, &cospi_p24_p08); - - u0 = _mm256_unpacklo_epi16(x3, x12); - u1 = _mm256_unpackhi_epi16(x3, x12); - y3 = butter_fly(&u0, &u1, &cospi_m08_p24); - y12 = butter_fly(&u0, &u1, &cospi_p24_p08); - - u0 = _mm256_unpacklo_epi16(x4, x11); - u1 = _mm256_unpackhi_epi16(x4, x11); - y4 = butter_fly(&u0, &u1, &cospi_m24_m08); - y11 = butter_fly(&u0, &u1, &cospi_m08_p24); - - u0 = _mm256_unpacklo_epi16(x5, x10); - u1 = _mm256_unpackhi_epi16(x5, x10); - y5 = butter_fly(&u0, &u1, &cospi_m24_m08); - y10 = butter_fly(&u0, &u1, &cospi_m08_p24); - - // stage 5 - x0 = _mm256_add_epi16(y0, y3); - x1 = _mm256_add_epi16(y1, y2); - x2 = _mm256_sub_epi16(y1, y2); - x3 = _mm256_sub_epi16(y0, y3); - x4 = _mm256_sub_epi16(y7, y4); - x5 = _mm256_sub_epi16(y6, y5); - x6 = _mm256_add_epi16(y6, y5); - x7 = _mm256_add_epi16(y7, y4); - - x8 = _mm256_add_epi16(y8, y11); - x9 = _mm256_add_epi16(y9, y10); - x10 = _mm256_sub_epi16(y9, y10); - x11 = _mm256_sub_epi16(y8, y11); - x12 = _mm256_sub_epi16(y15, y12); - x13 = _mm256_sub_epi16(y14, y13); - x14 = _mm256_add_epi16(y14, y13); - x15 = _mm256_add_epi16(y15, y12); - - // stage 6 - y0 = x0; - y3 = x3; - y4 = x4; - y7 = x7; - y8 = x8; - y11 = x11; - y12 = x12; - y15 = x15; - - u0 = _mm256_unpacklo_epi16(x1, x14); - u1 = _mm256_unpackhi_epi16(x1, x14); - y1 = butter_fly(&u0, &u1, &cospi_m04_p28); - y14 = butter_fly(&u0, &u1, &cospi_p28_p04); - - u0 = _mm256_unpacklo_epi16(x2, x13); - u1 = _mm256_unpackhi_epi16(x2, x13); - y2 = butter_fly(&u0, &u1, &cospi_m28_m04); - y13 = butter_fly(&u0, &u1, &cospi_m04_p28); - - u0 = _mm256_unpacklo_epi16(x5, x10); - u1 = _mm256_unpackhi_epi16(x5, x10); - y5 = butter_fly(&u0, &u1, &cospi_m20_p12); - y10 = butter_fly(&u0, &u1, &cospi_p12_p20); - - u0 = _mm256_unpacklo_epi16(x6, x9); - u1 = 
_mm256_unpackhi_epi16(x6, x9); - y6 = butter_fly(&u0, &u1, &cospi_m12_m20); - y9 = butter_fly(&u0, &u1, &cospi_m20_p12); - - // stage 7 - x0 = _mm256_add_epi16(y0, y1); - x1 = _mm256_sub_epi16(y0, y1); - x2 = _mm256_sub_epi16(y3, y2); - x3 = _mm256_add_epi16(y3, y2); - x4 = _mm256_add_epi16(y4, y5); - x5 = _mm256_sub_epi16(y4, y5); - x6 = _mm256_sub_epi16(y7, y6); - x7 = _mm256_add_epi16(y7, y6); - - x8 = _mm256_add_epi16(y8, y9); - x9 = _mm256_sub_epi16(y8, y9); - x10 = _mm256_sub_epi16(y11, y10); - x11 = _mm256_add_epi16(y11, y10); - x12 = _mm256_add_epi16(y12, y13); - x13 = _mm256_sub_epi16(y12, y13); - x14 = _mm256_sub_epi16(y15, y14); - x15 = _mm256_add_epi16(y15, y14); - - // stage 8 - u0 = _mm256_unpacklo_epi16(x0, x15); - u1 = _mm256_unpackhi_epi16(x0, x15); - in[0] = butter_fly(&u0, &u1, &cospi_p31_p01); - in[15] = butter_fly(&u0, &u1, &cospi_m01_p31); - - u0 = _mm256_unpacklo_epi16(x1, x14); - u1 = _mm256_unpackhi_epi16(x1, x14); - in[1] = butter_fly(&u0, &u1, &cospi_p15_p17); - in[14] = butter_fly(&u0, &u1, &cospi_m17_p15); - - u0 = _mm256_unpacklo_epi16(x2, x13); - u1 = _mm256_unpackhi_epi16(x2, x13); - in[2] = butter_fly(&u0, &u1, &cospi_p23_p09); - in[13] = butter_fly(&u0, &u1, &cospi_m09_p23); - - u0 = _mm256_unpacklo_epi16(x3, x12); - u1 = _mm256_unpackhi_epi16(x3, x12); - in[3] = butter_fly(&u0, &u1, &cospi_p07_p25); - in[12] = butter_fly(&u0, &u1, &cospi_m25_p07); - - u0 = _mm256_unpacklo_epi16(x4, x11); - u1 = _mm256_unpackhi_epi16(x4, x11); - in[4] = butter_fly(&u0, &u1, &cospi_p27_p05); - in[11] = butter_fly(&u0, &u1, &cospi_m05_p27); - - u0 = _mm256_unpacklo_epi16(x5, x10); - u1 = _mm256_unpackhi_epi16(x5, x10); - in[5] = butter_fly(&u0, &u1, &cospi_p11_p21); - in[10] = butter_fly(&u0, &u1, &cospi_m21_p11); - - u0 = _mm256_unpacklo_epi16(x6, x9); - u1 = _mm256_unpackhi_epi16(x6, x9); - in[6] = butter_fly(&u0, &u1, &cospi_p19_p13); - in[9] = butter_fly(&u0, &u1, &cospi_m13_p19); - - u0 = _mm256_unpacklo_epi16(x7, x8); - u1 = 
_mm256_unpackhi_epi16(x7, x8); - in[7] = butter_fly(&u0, &u1, &cospi_p03_p29); - in[8] = butter_fly(&u0, &u1, &cospi_m29_p03); -} - -static void fdct32_avx2(__m256i *in0, __m256i *in1) { - __m256i even0[16], even1[16], odd0[16], odd1[16]; - prepare_16x16_even(in0, even0); - fdct16_avx2(even0); - - prepare_16x16_odd(in0, odd0); - fdct16_odd_avx2(odd0); - - prepare_16x16_even(in1, even1); - fdct16_avx2(even1); - - prepare_16x16_odd(in1, odd1); - fdct16_odd_avx2(odd1); - - collect_coeffs(even0, odd0, even1, odd1, in0, in1); - - mm256_transpose_32x32(in0, in1); -} - -static INLINE void write_buffer_32x32(const __m256i *in0, const __m256i *in1, - tran_low_t *output) { - int i = 0; - const int stride = 32; - tran_low_t *coeff = output; - while (i < 32) { - storeu_output_avx2(&in0[i], coeff); - storeu_output_avx2(&in1[i], coeff + 16); - coeff += stride; - i += 1; - } -} - -#if CONFIG_EXT_TX -static void fhalfright32_16col_avx2(__m256i *in) { - int i = 0; - const __m256i zero = _mm256_setzero_si256(); - const __m256i sqrt2 = _mm256_set1_epi16((int16_t)Sqrt2); - const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); - __m256i x0, x1; - - while (i < 16) { - in[i] = _mm256_slli_epi16(in[i], 2); - x0 = _mm256_unpacklo_epi16(in[i + 16], zero); - x1 = _mm256_unpackhi_epi16(in[i + 16], zero); - x0 = _mm256_madd_epi16(x0, sqrt2); - x1 = _mm256_madd_epi16(x1, sqrt2); - x0 = _mm256_add_epi32(x0, dct_rounding); - x1 = _mm256_add_epi32(x1, dct_rounding); - x0 = _mm256_srai_epi32(x0, DCT_CONST_BITS); - x1 = _mm256_srai_epi32(x1, DCT_CONST_BITS); - in[i + 16] = _mm256_packs_epi32(x0, x1); - i += 1; - } - fdct16_avx2(&in[16]); -} - -static void fhalfright32_avx2(__m256i *in0, __m256i *in1) { - fhalfright32_16col_avx2(in0); - fhalfright32_16col_avx2(in1); - mm256_vectors_swap(in0, &in0[16], 16); - mm256_vectors_swap(in1, &in1[16], 16); - mm256_transpose_32x32(in0, in1); -} -#endif // CONFIG_EXT_TX - -static INLINE void load_buffer_32x32(const int16_t *input, int stride, - int 
flipud, int fliplr, __m256i *in0, - __m256i *in1) { - // Load 4 16x16 blocks - const int16_t *topL = input; - const int16_t *topR = input + 16; - const int16_t *botL = input + 16 * stride; - const int16_t *botR = input + 16 * stride + 16; - - const int16_t *tmp; - - if (flipud) { - // Swap left columns - tmp = topL; - topL = botL; - botL = tmp; - // Swap right columns - tmp = topR; - topR = botR; - botR = tmp; - } - - if (fliplr) { - // Swap top rows - tmp = topL; - topL = topR; - topR = tmp; - // Swap bottom rows - tmp = botL; - botL = botR; - botR = tmp; - } - - // load first 16 columns - load_buffer_16x16(topL, stride, flipud, fliplr, in0); - load_buffer_16x16(botL, stride, flipud, fliplr, in0 + 16); - - // load second 16 columns - load_buffer_16x16(topR, stride, flipud, fliplr, in1); - load_buffer_16x16(botR, stride, flipud, fliplr, in1 + 16); -} - -static INLINE void right_shift_32x32_16col(int bit, __m256i *in) { - int i = 0; - const __m256i rounding = _mm256_set1_epi16((1 << bit) >> 1); - __m256i sign; - while (i < 32) { - sign = _mm256_srai_epi16(in[i], 15); - in[i] = _mm256_add_epi16(in[i], rounding); - in[i] = _mm256_add_epi16(in[i], sign); - in[i] = _mm256_srai_epi16(in[i], bit); - i += 1; - } -} - -// Positive rounding -static INLINE void right_shift_32x32(__m256i *in0, __m256i *in1) { - const int bit = 4; - right_shift_32x32_16col(bit, in0); - right_shift_32x32_16col(bit, in1); -} - -#if CONFIG_EXT_TX -static void fidtx32_avx2(__m256i *in0, __m256i *in1) { - int i = 0; - while (i < 32) { - in0[i] = _mm256_slli_epi16(in0[i], 2); - in1[i] = _mm256_slli_epi16(in1[i], 2); - i += 1; - } - mm256_transpose_32x32(in0, in1); -} -#endif - -void av1_fht32x32_avx2(const int16_t *input, tran_low_t *output, int stride, - TxfmParam *txfm_param) { - __m256i in0[32]; // left 32 columns - __m256i in1[32]; // right 32 columns - const TX_TYPE tx_type = txfm_param->tx_type; -#if CONFIG_MRC_TX - assert(tx_type != MRC_DCT && "No avx2 32x32 implementation of MRC_DCT"); 
-#endif - - switch (tx_type) { - case DCT_DCT: - load_buffer_32x32(input, stride, 0, 0, in0, in1); - fdct32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fdct32_avx2(in0, in1); - break; -#if CONFIG_EXT_TX - case ADST_DCT: - load_buffer_32x32(input, stride, 0, 0, in0, in1); - fhalfright32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fdct32_avx2(in0, in1); - break; - case DCT_ADST: - load_buffer_32x32(input, stride, 0, 0, in0, in1); - fdct32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fhalfright32_avx2(in0, in1); - break; - case ADST_ADST: - load_buffer_32x32(input, stride, 0, 0, in0, in1); - fhalfright32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fhalfright32_avx2(in0, in1); - break; - case FLIPADST_DCT: - load_buffer_32x32(input, stride, 1, 0, in0, in1); - fhalfright32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fdct32_avx2(in0, in1); - break; - case DCT_FLIPADST: - load_buffer_32x32(input, stride, 0, 1, in0, in1); - fdct32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fhalfright32_avx2(in0, in1); - break; - case FLIPADST_FLIPADST: - load_buffer_32x32(input, stride, 1, 1, in0, in1); - fhalfright32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fhalfright32_avx2(in0, in1); - break; - case ADST_FLIPADST: - load_buffer_32x32(input, stride, 0, 1, in0, in1); - fhalfright32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fhalfright32_avx2(in0, in1); - break; - case FLIPADST_ADST: - load_buffer_32x32(input, stride, 1, 0, in0, in1); - fhalfright32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fhalfright32_avx2(in0, in1); - break; - case IDTX: - load_buffer_32x32(input, stride, 0, 0, in0, in1); - fidtx32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fidtx32_avx2(in0, in1); - break; - case V_DCT: - load_buffer_32x32(input, stride, 0, 0, in0, in1); - fdct32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fidtx32_avx2(in0, in1); - break; - case H_DCT: - load_buffer_32x32(input, stride, 0, 0, in0, in1); - fidtx32_avx2(in0, in1); - 
right_shift_32x32(in0, in1); - fdct32_avx2(in0, in1); - break; - case V_ADST: - load_buffer_32x32(input, stride, 0, 0, in0, in1); - fhalfright32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fidtx32_avx2(in0, in1); - break; - case H_ADST: - load_buffer_32x32(input, stride, 0, 0, in0, in1); - fidtx32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fhalfright32_avx2(in0, in1); - break; - case V_FLIPADST: - load_buffer_32x32(input, stride, 1, 0, in0, in1); - fhalfright32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fidtx32_avx2(in0, in1); - break; - case H_FLIPADST: - load_buffer_32x32(input, stride, 0, 1, in0, in1); - fidtx32_avx2(in0, in1); - right_shift_32x32(in0, in1); - fhalfright32_avx2(in0, in1); - break; -#endif // CONFIG_EXT_TX - default: assert(0); break; - } - write_buffer_32x32(in0, in1, output); - _mm256_zeroupper(); -} diff --git a/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm b/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm index 7186b6b92..30983d1c1 100644 --- a/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm +++ b/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm @@ -14,6 +14,8 @@ %include "aom_ports/x86_abi_support.asm" +SECTION .text + ; void av1_temporal_filter_apply_sse2 | arg ; (unsigned char *frame1, | 0 ; unsigned int stride, | 1 diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c index bf233ca4d..4d2e99f25 100644 --- a/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c +++ b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c @@ -31,7 +31,7 @@ uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d, uint64_t csse; const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE); - const __m128i v_zext_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); + const __m128i v_zext_q = xx_set1_64_from_32i(0xffffffff); __m128i v_acc0_q = _mm_setzero_si128(); -- cgit v1.2.3