summaryrefslogtreecommitdiffstats
path: root/third_party/aom/av1/encoder/x86
diff options
context:
space:
mode:
authortrav90 <travawine@palemoon.org>2018-10-19 21:52:15 -0500
committertrav90 <travawine@palemoon.org>2018-10-19 21:52:20 -0500
commitbbcc64772580c8a979288791afa02d30bc476d2e (patch)
tree437ce94c3fdd7497508e5b55de06c6d011678597 /third_party/aom/av1/encoder/x86
parent14805f6ddbfb173c327768fff9f81f40ce5e81b0 (diff)
downloadUXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar
UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar.gz
UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar.lz
UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar.xz
UXP-bbcc64772580c8a979288791afa02d30bc476d2e.zip
Update aom to v1.0.0
Update aom to commit id d14c5bb4f336ef1842046089849dee4a301fbbf0.
Diffstat (limited to 'third_party/aom/av1/encoder/x86')
-rw-r--r--third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c1205
-rw-r--r--third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c306
-rw-r--r--third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c2889
-rw-r--r--third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h117
-rw-r--r--third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c84
-rw-r--r--third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c103
-rw-r--r--third_party/aom/av1/encoder/x86/av1_quantize_avx2.c234
-rw-r--r--third_party/aom/av1/encoder/x86/av1_quantize_sse2.c273
-rw-r--r--third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm3
-rw-r--r--third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h141
-rw-r--r--third_party/aom/av1/encoder/x86/corner_match_sse4.c3
-rw-r--r--third_party/aom/av1/encoder/x86/dct_intrin_sse2.c3483
-rw-r--r--third_party/aom/av1/encoder/x86/dct_sse2.asm5
-rw-r--r--third_party/aom/av1/encoder/x86/encodetxb_sse2.c505
-rw-r--r--third_party/aom/av1/encoder/x86/encodetxb_sse4.c80
-rw-r--r--third_party/aom/av1/encoder/x86/error_intrin_avx2.c3
-rw-r--r--third_party/aom/av1/encoder/x86/error_sse2.asm46
-rw-r--r--third_party/aom/av1/encoder/x86/hash_sse42.c51
-rw-r--r--third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c1276
-rw-r--r--third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c1627
-rw-r--r--third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm2
-rw-r--r--third_party/aom/av1/encoder/x86/wedge_utils_sse2.c2
22 files changed, 6114 insertions, 6324 deletions
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
new file mode 100644
index 000000000..84065d6de
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
@@ -0,0 +1,1205 @@
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+
+void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ __m128i buf0[32];
+ __m128i buf1[32];
+ const int32_t *cospi;
+ // stage 0
+ // stage 1
+ buf1[0] = _mm_add_epi32(input[0], input[31]);
+ buf1[31] = _mm_sub_epi32(input[0], input[31]);
+ buf1[1] = _mm_add_epi32(input[1], input[30]);
+ buf1[30] = _mm_sub_epi32(input[1], input[30]);
+ buf1[2] = _mm_add_epi32(input[2], input[29]);
+ buf1[29] = _mm_sub_epi32(input[2], input[29]);
+ buf1[3] = _mm_add_epi32(input[3], input[28]);
+ buf1[28] = _mm_sub_epi32(input[3], input[28]);
+ buf1[4] = _mm_add_epi32(input[4], input[27]);
+ buf1[27] = _mm_sub_epi32(input[4], input[27]);
+ buf1[5] = _mm_add_epi32(input[5], input[26]);
+ buf1[26] = _mm_sub_epi32(input[5], input[26]);
+ buf1[6] = _mm_add_epi32(input[6], input[25]);
+ buf1[25] = _mm_sub_epi32(input[6], input[25]);
+ buf1[7] = _mm_add_epi32(input[7], input[24]);
+ buf1[24] = _mm_sub_epi32(input[7], input[24]);
+ buf1[8] = _mm_add_epi32(input[8], input[23]);
+ buf1[23] = _mm_sub_epi32(input[8], input[23]);
+ buf1[9] = _mm_add_epi32(input[9], input[22]);
+ buf1[22] = _mm_sub_epi32(input[9], input[22]);
+ buf1[10] = _mm_add_epi32(input[10], input[21]);
+ buf1[21] = _mm_sub_epi32(input[10], input[21]);
+ buf1[11] = _mm_add_epi32(input[11], input[20]);
+ buf1[20] = _mm_sub_epi32(input[11], input[20]);
+ buf1[12] = _mm_add_epi32(input[12], input[19]);
+ buf1[19] = _mm_sub_epi32(input[12], input[19]);
+ buf1[13] = _mm_add_epi32(input[13], input[18]);
+ buf1[18] = _mm_sub_epi32(input[13], input[18]);
+ buf1[14] = _mm_add_epi32(input[14], input[17]);
+ buf1[17] = _mm_sub_epi32(input[14], input[17]);
+ buf1[15] = _mm_add_epi32(input[15], input[16]);
+ buf1[16] = _mm_sub_epi32(input[15], input[16]);
+
+ // stage 2
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = _mm_add_epi32(buf1[0], buf1[15]);
+ buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]);
+ buf0[1] = _mm_add_epi32(buf1[1], buf1[14]);
+ buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]);
+ buf0[2] = _mm_add_epi32(buf1[2], buf1[13]);
+ buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]);
+ buf0[3] = _mm_add_epi32(buf1[3], buf1[12]);
+ buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]);
+ buf0[4] = _mm_add_epi32(buf1[4], buf1[11]);
+ buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]);
+ buf0[5] = _mm_add_epi32(buf1[5], buf1[10]);
+ buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]);
+ buf0[6] = _mm_add_epi32(buf1[6], buf1[9]);
+ buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]);
+ buf0[7] = _mm_add_epi32(buf1[7], buf1[8]);
+ buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ buf0[18] = buf1[18];
+ buf0[19] = buf1[19];
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
+ buf0[24], cos_bit);
+ buf0[28] = buf1[28];
+ buf0[29] = buf1[29];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 3
+ cospi = cospi_arr(cos_bit);
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[7]);
+ buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[6]);
+ buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]);
+ buf1[2] = _mm_add_epi32(buf0[2], buf0[5]);
+ buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]);
+ buf1[3] = _mm_add_epi32(buf0[3], buf0[4]);
+ buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]);
+ buf1[8] = buf0[8];
+ buf1[9] = buf0[9];
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
+ buf1[12], cos_bit);
+ buf1[14] = buf0[14];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[23]);
+ buf1[23] = _mm_sub_epi32(buf0[16], buf0[23]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[22]);
+ buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]);
+ buf1[18] = _mm_add_epi32(buf0[18], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]);
+ buf1[19] = _mm_add_epi32(buf0[19], buf0[20]);
+ buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]);
+ buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]);
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[24]);
+ buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]);
+ buf1[30] = _mm_add_epi32(buf0[30], buf0[25]);
+ buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]);
+ buf1[29] = _mm_add_epi32(buf0[29], buf0[26]);
+ buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]);
+ buf1[28] = _mm_add_epi32(buf0[28], buf0[27]);
+
+ // stage 4
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = _mm_add_epi32(buf1[0], buf1[3]);
+ buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]);
+ buf0[1] = _mm_add_epi32(buf1[1], buf1[2]);
+ buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]);
+ buf0[4] = buf1[4];
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
+ cos_bit);
+ buf0[7] = buf1[7];
+ buf0[8] = _mm_add_epi32(buf1[8], buf1[11]);
+ buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]);
+ buf0[9] = _mm_add_epi32(buf1[9], buf1[10]);
+ buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]);
+ buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]);
+ buf0[15] = _mm_add_epi32(buf1[15], buf1[12]);
+ buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]);
+ buf0[14] = _mm_add_epi32(buf1[14], buf1[13]);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
+ buf0[28], cos_bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ buf0[22] = buf1[22];
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[25] = buf1[25];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 5
+ cospi = cospi_arr(cos_bit);
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
+ cos_bit);
+ btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3],
+ cos_bit);
+ buf1[4] = _mm_add_epi32(buf0[4], buf0[5]);
+ buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]);
+ buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]);
+ buf1[7] = _mm_add_epi32(buf0[7], buf0[6]);
+ buf1[8] = buf0[8];
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9],
+ buf1[14], cos_bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ buf1[11] = buf0[11];
+ buf1[12] = buf0[12];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[19]);
+ buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[18]);
+ buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]);
+ buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]);
+ buf1[23] = _mm_add_epi32(buf0[23], buf0[20]);
+ buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]);
+ buf1[22] = _mm_add_epi32(buf0[22], buf0[21]);
+ buf1[24] = _mm_add_epi32(buf0[24], buf0[27]);
+ buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]);
+ buf1[25] = _mm_add_epi32(buf0[25], buf0[26]);
+ buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]);
+ buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]);
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[28]);
+ buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]);
+ buf1[30] = _mm_add_epi32(buf0[30], buf0[29]);
+
+ // stage 6
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
+ cos_bit);
+ btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6],
+ cos_bit);
+ buf0[8] = _mm_add_epi32(buf1[8], buf1[9]);
+ buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]);
+ buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]);
+ buf0[11] = _mm_add_epi32(buf1[11], buf1[10]);
+ buf0[12] = _mm_add_epi32(buf1[12], buf1[13]);
+ buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]);
+ buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]);
+ buf0[15] = _mm_add_epi32(buf1[15], buf1[14]);
+ buf0[16] = buf1[16];
+ btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
+ buf0[30], cos_bit);
+ btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ buf0[19] = buf1[19];
+ buf0[20] = buf1[20];
+ btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[27] = buf1[27];
+ buf0[28] = buf1[28];
+ buf0[31] = buf1[31];
+
+ // stage 7
+ cospi = cospi_arr(cos_bit);
+ buf1[0] = buf0[0];
+ buf1[1] = buf0[1];
+ buf1[2] = buf0[2];
+ buf1[3] = buf0[3];
+ buf1[4] = buf0[4];
+ buf1[5] = buf0[5];
+ buf1[6] = buf0[6];
+ buf1[7] = buf0[7];
+ btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15],
+ cos_bit);
+ btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9],
+ buf1[14], cos_bit);
+ btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
+ buf1[12], cos_bit);
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[17]);
+ buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]);
+ buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]);
+ buf1[19] = _mm_add_epi32(buf0[19], buf0[18]);
+ buf1[20] = _mm_add_epi32(buf0[20], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]);
+ buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]);
+ buf1[23] = _mm_add_epi32(buf0[23], buf0[22]);
+ buf1[24] = _mm_add_epi32(buf0[24], buf0[25]);
+ buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]);
+ buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]);
+ buf1[27] = _mm_add_epi32(buf0[27], buf0[26]);
+ buf1[28] = _mm_add_epi32(buf0[28], buf0[29]);
+ buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]);
+ buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]);
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[30]);
+
+ // stage 8
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ buf0[6] = buf1[6];
+ buf0[7] = buf1[7];
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ buf0[10] = buf1[10];
+ buf0[11] = buf1[11];
+ buf0[12] = buf1[12];
+ buf0[13] = buf1[13];
+ buf0[14] = buf1[14];
+ buf0[15] = buf1[15];
+ btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16],
+ buf0[31], cos_bit);
+ btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
+ buf0[30], cos_bit);
+ btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
+ buf0[28], cos_bit);
+ btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23],
+ buf0[24], cos_bit);
+
+ // stage 9
+ output[0] = buf0[0];
+ output[1] = buf0[16];
+ output[2] = buf0[8];
+ output[3] = buf0[24];
+ output[4] = buf0[4];
+ output[5] = buf0[20];
+ output[6] = buf0[12];
+ output[7] = buf0[28];
+ output[8] = buf0[2];
+ output[9] = buf0[18];
+ output[10] = buf0[10];
+ output[11] = buf0[26];
+ output[12] = buf0[6];
+ output[13] = buf0[22];
+ output[14] = buf0[14];
+ output[15] = buf0[30];
+ output[16] = buf0[1];
+ output[17] = buf0[17];
+ output[18] = buf0[9];
+ output[19] = buf0[25];
+ output[20] = buf0[5];
+ output[21] = buf0[21];
+ output[22] = buf0[13];
+ output[23] = buf0[29];
+ output[24] = buf0[3];
+ output[25] = buf0[19];
+ output[26] = buf0[11];
+ output[27] = buf0[27];
+ output[28] = buf0[7];
+ output[29] = buf0[23];
+ output[30] = buf0[15];
+ output[31] = buf0[31];
+}
+
+void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range) {
+ const int txfm_size = 4;
+ const int num_per_128 = 4;
+ const int32_t *cospi;
+ __m128i buf0[4];
+ __m128i buf1[4];
+ int col_num = txfm_size / num_per_128;
+ int col;
+ (void)stage_range;
+ for (col = 0; col < col_num; col++) {
+ // stage 0;
+ int32_t stage_idx = 0;
+ int j;
+ for (j = 0; j < 4; ++j) {
+ buf0[j] = input[j * col_num + col];
+ }
+
+ // stage 1
+ stage_idx++;
+ buf1[0] = buf0[3];
+ buf1[1] = buf0[0];
+ buf1[2] = buf0[1];
+ buf1[3] = buf0[2];
+
+ // stage 2
+ stage_idx++;
+
+ cospi = cospi_arr(cos_bit);
+ btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1],
+ cos_bit);
+ btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2],
+ buf0[3], cos_bit);
+
+ // stage 3
+ stage_idx++;
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
+ buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
+ buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
+
+ // stage 4
+ stage_idx++;
+
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+ buf0[3], cos_bit);
+
+ // stage 5
+ stage_idx++;
+ buf1[0] = buf0[0];
+ buf1[1] = _mm_sub_epi32(_mm_setzero_si128(), buf0[2]);
+ buf1[2] = buf0[3];
+ buf1[3] = _mm_sub_epi32(_mm_setzero_si128(), buf0[1]);
+
+ for (j = 0; j < 4; ++j) {
+ output[j * col_num + col] = buf1[j];
+ }
+ }
+}
+
+void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32 = _mm_set1_epi32(-cospi[32]);
+ __m128i cospi_p32 = _mm_set1_epi32(cospi[32]);
+ __m128i cospi_m16 = _mm_set1_epi32(-cospi[16]);
+ __m128i cospi_p48 = _mm_set1_epi32(cospi[48]);
+ __m128i cospi_m48 = _mm_set1_epi32(-cospi[48]);
+ __m128i cospi_p16 = _mm_set1_epi32(cospi[16]);
+ __m128i cospi_m08 = _mm_set1_epi32(-cospi[8]);
+ __m128i cospi_p56 = _mm_set1_epi32(cospi[56]);
+ __m128i cospi_m56 = _mm_set1_epi32(-cospi[56]);
+ __m128i cospi_m40 = _mm_set1_epi32(-cospi[40]);
+ __m128i cospi_p24 = _mm_set1_epi32(cospi[24]);
+ __m128i cospi_m24 = _mm_set1_epi32(-cospi[24]);
+ __m128i cospi_p08 = _mm_set1_epi32(cospi[8]);
+ __m128i cospi_p40 = _mm_set1_epi32(cospi[40]);
+ __m128i cospi_p60 = _mm_set1_epi32(cospi[60]);
+ __m128i cospi_p04 = _mm_set1_epi32(cospi[4]);
+ __m128i cospi_p28 = _mm_set1_epi32(cospi[28]);
+ __m128i cospi_p36 = _mm_set1_epi32(cospi[36]);
+ __m128i cospi_p44 = _mm_set1_epi32(cospi[44]);
+ __m128i cospi_p20 = _mm_set1_epi32(cospi[20]);
+ __m128i cospi_p12 = _mm_set1_epi32(cospi[12]);
+ __m128i cospi_p52 = _mm_set1_epi32(cospi[52]);
+ __m128i cospi_m04 = _mm_set1_epi32(-cospi[4]);
+ __m128i cospi_m60 = _mm_set1_epi32(-cospi[60]);
+ __m128i cospi_m36 = _mm_set1_epi32(-cospi[36]);
+ __m128i cospi_m28 = _mm_set1_epi32(-cospi[28]);
+ __m128i cospi_m20 = _mm_set1_epi32(-cospi[20]);
+ __m128i cospi_m44 = _mm_set1_epi32(-cospi[44]);
+ __m128i cospi_m52 = _mm_set1_epi32(-cospi[52]);
+ __m128i cospi_m12 = _mm_set1_epi32(-cospi[12]);
+ __m128i cospi_p62 = _mm_set1_epi32(cospi[62]);
+ __m128i cospi_p02 = _mm_set1_epi32(cospi[2]);
+ __m128i cospi_p30 = _mm_set1_epi32(cospi[30]);
+ __m128i cospi_p34 = _mm_set1_epi32(cospi[34]);
+ __m128i cospi_p46 = _mm_set1_epi32(cospi[46]);
+ __m128i cospi_p18 = _mm_set1_epi32(cospi[18]);
+ __m128i cospi_p14 = _mm_set1_epi32(cospi[14]);
+ __m128i cospi_p50 = _mm_set1_epi32(cospi[50]);
+ __m128i cospi_p54 = _mm_set1_epi32(cospi[54]);
+ __m128i cospi_p10 = _mm_set1_epi32(cospi[10]);
+ __m128i cospi_p22 = _mm_set1_epi32(cospi[22]);
+ __m128i cospi_p42 = _mm_set1_epi32(cospi[42]);
+ __m128i cospi_p38 = _mm_set1_epi32(cospi[38]);
+ __m128i cospi_p26 = _mm_set1_epi32(cospi[26]);
+ __m128i cospi_p06 = _mm_set1_epi32(cospi[6]);
+ __m128i cospi_p58 = _mm_set1_epi32(cospi[58]);
+ __m128i cospi_p63 = _mm_set1_epi32(cospi[63]);
+ __m128i cospi_p01 = _mm_set1_epi32(cospi[1]);
+ __m128i cospi_p31 = _mm_set1_epi32(cospi[31]);
+ __m128i cospi_p33 = _mm_set1_epi32(cospi[33]);
+ __m128i cospi_p47 = _mm_set1_epi32(cospi[47]);
+ __m128i cospi_p17 = _mm_set1_epi32(cospi[17]);
+ __m128i cospi_p15 = _mm_set1_epi32(cospi[15]);
+ __m128i cospi_p49 = _mm_set1_epi32(cospi[49]);
+ __m128i cospi_p55 = _mm_set1_epi32(cospi[55]);
+ __m128i cospi_p09 = _mm_set1_epi32(cospi[9]);
+ __m128i cospi_p23 = _mm_set1_epi32(cospi[23]);
+ __m128i cospi_p41 = _mm_set1_epi32(cospi[41]);
+ __m128i cospi_p39 = _mm_set1_epi32(cospi[39]);
+ __m128i cospi_p25 = _mm_set1_epi32(cospi[25]);
+ __m128i cospi_p07 = _mm_set1_epi32(cospi[7]);
+ __m128i cospi_p57 = _mm_set1_epi32(cospi[57]);
+ __m128i cospi_p59 = _mm_set1_epi32(cospi[59]);
+ __m128i cospi_p05 = _mm_set1_epi32(cospi[5]);
+ __m128i cospi_p27 = _mm_set1_epi32(cospi[27]);
+ __m128i cospi_p37 = _mm_set1_epi32(cospi[37]);
+ __m128i cospi_p43 = _mm_set1_epi32(cospi[43]);
+ __m128i cospi_p21 = _mm_set1_epi32(cospi[21]);
+ __m128i cospi_p11 = _mm_set1_epi32(cospi[11]);
+ __m128i cospi_p53 = _mm_set1_epi32(cospi[53]);
+ __m128i cospi_p51 = _mm_set1_epi32(cospi[51]);
+ __m128i cospi_p13 = _mm_set1_epi32(cospi[13]);
+ __m128i cospi_p19 = _mm_set1_epi32(cospi[19]);
+ __m128i cospi_p45 = _mm_set1_epi32(cospi[45]);
+ __m128i cospi_p35 = _mm_set1_epi32(cospi[35]);
+ __m128i cospi_p29 = _mm_set1_epi32(cospi[29]);
+ __m128i cospi_p03 = _mm_set1_epi32(cospi[3]);
+ __m128i cospi_p61 = _mm_set1_epi32(cospi[61]);
+
+ // stage 1
+ __m128i x1[64];
+ x1[0] = _mm_add_epi32(input[0], input[63]);
+ x1[63] = _mm_sub_epi32(input[0], input[63]);
+ x1[1] = _mm_add_epi32(input[1], input[62]);
+ x1[62] = _mm_sub_epi32(input[1], input[62]);
+ x1[2] = _mm_add_epi32(input[2], input[61]);
+ x1[61] = _mm_sub_epi32(input[2], input[61]);
+ x1[3] = _mm_add_epi32(input[3], input[60]);
+ x1[60] = _mm_sub_epi32(input[3], input[60]);
+ x1[4] = _mm_add_epi32(input[4], input[59]);
+ x1[59] = _mm_sub_epi32(input[4], input[59]);
+ x1[5] = _mm_add_epi32(input[5], input[58]);
+ x1[58] = _mm_sub_epi32(input[5], input[58]);
+ x1[6] = _mm_add_epi32(input[6], input[57]);
+ x1[57] = _mm_sub_epi32(input[6], input[57]);
+ x1[7] = _mm_add_epi32(input[7], input[56]);
+ x1[56] = _mm_sub_epi32(input[7], input[56]);
+ x1[8] = _mm_add_epi32(input[8], input[55]);
+ x1[55] = _mm_sub_epi32(input[8], input[55]);
+ x1[9] = _mm_add_epi32(input[9], input[54]);
+ x1[54] = _mm_sub_epi32(input[9], input[54]);
+ x1[10] = _mm_add_epi32(input[10], input[53]);
+ x1[53] = _mm_sub_epi32(input[10], input[53]);
+ x1[11] = _mm_add_epi32(input[11], input[52]);
+ x1[52] = _mm_sub_epi32(input[11], input[52]);
+ x1[12] = _mm_add_epi32(input[12], input[51]);
+ x1[51] = _mm_sub_epi32(input[12], input[51]);
+ x1[13] = _mm_add_epi32(input[13], input[50]);
+ x1[50] = _mm_sub_epi32(input[13], input[50]);
+ x1[14] = _mm_add_epi32(input[14], input[49]);
+ x1[49] = _mm_sub_epi32(input[14], input[49]);
+ x1[15] = _mm_add_epi32(input[15], input[48]);
+ x1[48] = _mm_sub_epi32(input[15], input[48]);
+ x1[16] = _mm_add_epi32(input[16], input[47]);
+ x1[47] = _mm_sub_epi32(input[16], input[47]);
+ x1[17] = _mm_add_epi32(input[17], input[46]);
+ x1[46] = _mm_sub_epi32(input[17], input[46]);
+ x1[18] = _mm_add_epi32(input[18], input[45]);
+ x1[45] = _mm_sub_epi32(input[18], input[45]);
+ x1[19] = _mm_add_epi32(input[19], input[44]);
+ x1[44] = _mm_sub_epi32(input[19], input[44]);
+ x1[20] = _mm_add_epi32(input[20], input[43]);
+ x1[43] = _mm_sub_epi32(input[20], input[43]);
+ x1[21] = _mm_add_epi32(input[21], input[42]);
+ x1[42] = _mm_sub_epi32(input[21], input[42]);
+ x1[22] = _mm_add_epi32(input[22], input[41]);
+ x1[41] = _mm_sub_epi32(input[22], input[41]);
+ x1[23] = _mm_add_epi32(input[23], input[40]);
+ x1[40] = _mm_sub_epi32(input[23], input[40]);
+ x1[24] = _mm_add_epi32(input[24], input[39]);
+ x1[39] = _mm_sub_epi32(input[24], input[39]);
+ x1[25] = _mm_add_epi32(input[25], input[38]);
+ x1[38] = _mm_sub_epi32(input[25], input[38]);
+ x1[26] = _mm_add_epi32(input[26], input[37]);
+ x1[37] = _mm_sub_epi32(input[26], input[37]);
+ x1[27] = _mm_add_epi32(input[27], input[36]);
+ x1[36] = _mm_sub_epi32(input[27], input[36]);
+ x1[28] = _mm_add_epi32(input[28], input[35]);
+ x1[35] = _mm_sub_epi32(input[28], input[35]);
+ x1[29] = _mm_add_epi32(input[29], input[34]);
+ x1[34] = _mm_sub_epi32(input[29], input[34]);
+ x1[30] = _mm_add_epi32(input[30], input[33]);
+ x1[33] = _mm_sub_epi32(input[30], input[33]);
+ x1[31] = _mm_add_epi32(input[31], input[32]);
+ x1[32] = _mm_sub_epi32(input[31], input[32]);
+
+ // stage 2
+ __m128i x2[64];
+ x2[0] = _mm_add_epi32(x1[0], x1[31]);
+ x2[31] = _mm_sub_epi32(x1[0], x1[31]);
+ x2[1] = _mm_add_epi32(x1[1], x1[30]);
+ x2[30] = _mm_sub_epi32(x1[1], x1[30]);
+ x2[2] = _mm_add_epi32(x1[2], x1[29]);
+ x2[29] = _mm_sub_epi32(x1[2], x1[29]);
+ x2[3] = _mm_add_epi32(x1[3], x1[28]);
+ x2[28] = _mm_sub_epi32(x1[3], x1[28]);
+ x2[4] = _mm_add_epi32(x1[4], x1[27]);
+ x2[27] = _mm_sub_epi32(x1[4], x1[27]);
+ x2[5] = _mm_add_epi32(x1[5], x1[26]);
+ x2[26] = _mm_sub_epi32(x1[5], x1[26]);
+ x2[6] = _mm_add_epi32(x1[6], x1[25]);
+ x2[25] = _mm_sub_epi32(x1[6], x1[25]);
+ x2[7] = _mm_add_epi32(x1[7], x1[24]);
+ x2[24] = _mm_sub_epi32(x1[7], x1[24]);
+ x2[8] = _mm_add_epi32(x1[8], x1[23]);
+ x2[23] = _mm_sub_epi32(x1[8], x1[23]);
+ x2[9] = _mm_add_epi32(x1[9], x1[22]);
+ x2[22] = _mm_sub_epi32(x1[9], x1[22]);
+ x2[10] = _mm_add_epi32(x1[10], x1[21]);
+ x2[21] = _mm_sub_epi32(x1[10], x1[21]);
+ x2[11] = _mm_add_epi32(x1[11], x1[20]);
+ x2[20] = _mm_sub_epi32(x1[11], x1[20]);
+ x2[12] = _mm_add_epi32(x1[12], x1[19]);
+ x2[19] = _mm_sub_epi32(x1[12], x1[19]);
+ x2[13] = _mm_add_epi32(x1[13], x1[18]);
+ x2[18] = _mm_sub_epi32(x1[13], x1[18]);
+ x2[14] = _mm_add_epi32(x1[14], x1[17]);
+ x2[17] = _mm_sub_epi32(x1[14], x1[17]);
+ x2[15] = _mm_add_epi32(x1[15], x1[16]);
+ x2[16] = _mm_sub_epi32(x1[15], x1[16]);
+ x2[32] = x1[32];
+ x2[33] = x1[33];
+ x2[34] = x1[34];
+ x2[35] = x1[35];
+ x2[36] = x1[36];
+ x2[37] = x1[37];
+ x2[38] = x1[38];
+ x2[39] = x1[39];
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[40], x1[55], x2[40], x2[55],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[41], x1[54], x2[41], x2[54],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[42], x1[53], x2[42], x2[53],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[43], x1[52], x2[43], x2[52],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[44], x1[51], x2[44], x2[51],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[45], x1[50], x2[45], x2[50],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[46], x1[49], x2[46], x2[49],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[47], x1[48], x2[47], x2[48],
+ __rounding, cos_bit);
+ x2[56] = x1[56];
+ x2[57] = x1[57];
+ x2[58] = x1[58];
+ x2[59] = x1[59];
+ x2[60] = x1[60];
+ x2[61] = x1[61];
+ x2[62] = x1[62];
+ x2[63] = x1[63];
+
+ // stage 3
+ __m128i x3[64];
+ x3[0] = _mm_add_epi32(x2[0], x2[15]);
+ x3[15] = _mm_sub_epi32(x2[0], x2[15]);
+ x3[1] = _mm_add_epi32(x2[1], x2[14]);
+ x3[14] = _mm_sub_epi32(x2[1], x2[14]);
+ x3[2] = _mm_add_epi32(x2[2], x2[13]);
+ x3[13] = _mm_sub_epi32(x2[2], x2[13]);
+ x3[3] = _mm_add_epi32(x2[3], x2[12]);
+ x3[12] = _mm_sub_epi32(x2[3], x2[12]);
+ x3[4] = _mm_add_epi32(x2[4], x2[11]);
+ x3[11] = _mm_sub_epi32(x2[4], x2[11]);
+ x3[5] = _mm_add_epi32(x2[5], x2[10]);
+ x3[10] = _mm_sub_epi32(x2[5], x2[10]);
+ x3[6] = _mm_add_epi32(x2[6], x2[9]);
+ x3[9] = _mm_sub_epi32(x2[6], x2[9]);
+ x3[7] = _mm_add_epi32(x2[7], x2[8]);
+ x3[8] = _mm_sub_epi32(x2[7], x2[8]);
+ x3[16] = x2[16];
+ x3[17] = x2[17];
+ x3[18] = x2[18];
+ x3[19] = x2[19];
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[20], x2[27], x3[20], x3[27],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[21], x2[26], x3[21], x3[26],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[22], x2[25], x3[22], x3[25],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[23], x2[24], x3[23], x3[24],
+ __rounding, cos_bit);
+ x3[28] = x2[28];
+ x3[29] = x2[29];
+ x3[30] = x2[30];
+ x3[31] = x2[31];
+ x3[32] = _mm_add_epi32(x2[32], x2[47]);
+ x3[47] = _mm_sub_epi32(x2[32], x2[47]);
+ x3[33] = _mm_add_epi32(x2[33], x2[46]);
+ x3[46] = _mm_sub_epi32(x2[33], x2[46]);
+ x3[34] = _mm_add_epi32(x2[34], x2[45]);
+ x3[45] = _mm_sub_epi32(x2[34], x2[45]);
+ x3[35] = _mm_add_epi32(x2[35], x2[44]);
+ x3[44] = _mm_sub_epi32(x2[35], x2[44]);
+ x3[36] = _mm_add_epi32(x2[36], x2[43]);
+ x3[43] = _mm_sub_epi32(x2[36], x2[43]);
+ x3[37] = _mm_add_epi32(x2[37], x2[42]);
+ x3[42] = _mm_sub_epi32(x2[37], x2[42]);
+ x3[38] = _mm_add_epi32(x2[38], x2[41]);
+ x3[41] = _mm_sub_epi32(x2[38], x2[41]);
+ x3[39] = _mm_add_epi32(x2[39], x2[40]);
+ x3[40] = _mm_sub_epi32(x2[39], x2[40]);
+ x3[48] = _mm_sub_epi32(x2[63], x2[48]);
+ x3[63] = _mm_add_epi32(x2[63], x2[48]);
+ x3[49] = _mm_sub_epi32(x2[62], x2[49]);
+ x3[62] = _mm_add_epi32(x2[62], x2[49]);
+ x3[50] = _mm_sub_epi32(x2[61], x2[50]);
+ x3[61] = _mm_add_epi32(x2[61], x2[50]);
+ x3[51] = _mm_sub_epi32(x2[60], x2[51]);
+ x3[60] = _mm_add_epi32(x2[60], x2[51]);
+ x3[52] = _mm_sub_epi32(x2[59], x2[52]);
+ x3[59] = _mm_add_epi32(x2[59], x2[52]);
+ x3[53] = _mm_sub_epi32(x2[58], x2[53]);
+ x3[58] = _mm_add_epi32(x2[58], x2[53]);
+ x3[54] = _mm_sub_epi32(x2[57], x2[54]);
+ x3[57] = _mm_add_epi32(x2[57], x2[54]);
+ x3[55] = _mm_sub_epi32(x2[56], x2[55]);
+ x3[56] = _mm_add_epi32(x2[56], x2[55]);
+
+ // stage 4
+ __m128i x4[64];
+ x4[0] = _mm_add_epi32(x3[0], x3[7]);
+ x4[7] = _mm_sub_epi32(x3[0], x3[7]);
+ x4[1] = _mm_add_epi32(x3[1], x3[6]);
+ x4[6] = _mm_sub_epi32(x3[1], x3[6]);
+ x4[2] = _mm_add_epi32(x3[2], x3[5]);
+ x4[5] = _mm_sub_epi32(x3[2], x3[5]);
+ x4[3] = _mm_add_epi32(x3[3], x3[4]);
+ x4[4] = _mm_sub_epi32(x3[3], x3[4]);
+ x4[8] = x3[8];
+ x4[9] = x3[9];
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[10], x3[13], x4[10], x4[13],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[11], x3[12], x4[11], x4[12],
+ __rounding, cos_bit);
+ x4[14] = x3[14];
+ x4[15] = x3[15];
+ x4[16] = _mm_add_epi32(x3[16], x3[23]);
+ x4[23] = _mm_sub_epi32(x3[16], x3[23]);
+ x4[17] = _mm_add_epi32(x3[17], x3[22]);
+ x4[22] = _mm_sub_epi32(x3[17], x3[22]);
+ x4[18] = _mm_add_epi32(x3[18], x3[21]);
+ x4[21] = _mm_sub_epi32(x3[18], x3[21]);
+ x4[19] = _mm_add_epi32(x3[19], x3[20]);
+ x4[20] = _mm_sub_epi32(x3[19], x3[20]);
+ x4[24] = _mm_sub_epi32(x3[31], x3[24]);
+ x4[31] = _mm_add_epi32(x3[31], x3[24]);
+ x4[25] = _mm_sub_epi32(x3[30], x3[25]);
+ x4[30] = _mm_add_epi32(x3[30], x3[25]);
+ x4[26] = _mm_sub_epi32(x3[29], x3[26]);
+ x4[29] = _mm_add_epi32(x3[29], x3[26]);
+ x4[27] = _mm_sub_epi32(x3[28], x3[27]);
+ x4[28] = _mm_add_epi32(x3[28], x3[27]);
+ x4[32] = x3[32];
+ x4[33] = x3[33];
+ x4[34] = x3[34];
+ x4[35] = x3[35];
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[36], x3[59], x4[36], x4[59],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[37], x3[58], x4[37], x4[58],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[38], x3[57], x4[38], x4[57],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[39], x3[56], x4[39], x4[56],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[40], x3[55], x4[40], x4[55],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[41], x3[54], x4[41], x4[54],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[42], x3[53], x4[42], x4[53],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[43], x3[52], x4[43], x4[52],
+ __rounding, cos_bit);
+ x4[44] = x3[44];
+ x4[45] = x3[45];
+ x4[46] = x3[46];
+ x4[47] = x3[47];
+ x4[48] = x3[48];
+ x4[49] = x3[49];
+ x4[50] = x3[50];
+ x4[51] = x3[51];
+ x4[60] = x3[60];
+ x4[61] = x3[61];
+ x4[62] = x3[62];
+ x4[63] = x3[63];
+
+ // stage 5
+ __m128i x5[64];
+ x5[0] = _mm_add_epi32(x4[0], x4[3]);
+ x5[3] = _mm_sub_epi32(x4[0], x4[3]);
+ x5[1] = _mm_add_epi32(x4[1], x4[2]);
+ x5[2] = _mm_sub_epi32(x4[1], x4[2]);
+ x5[4] = x4[4];
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x4[5], x4[6], x5[5], x5[6],
+ __rounding, cos_bit);
+ x5[7] = x4[7];
+ x5[8] = _mm_add_epi32(x4[8], x4[11]);
+ x5[11] = _mm_sub_epi32(x4[8], x4[11]);
+ x5[9] = _mm_add_epi32(x4[9], x4[10]);
+ x5[10] = _mm_sub_epi32(x4[9], x4[10]);
+ x5[12] = _mm_sub_epi32(x4[15], x4[12]);
+ x5[15] = _mm_add_epi32(x4[15], x4[12]);
+ x5[13] = _mm_sub_epi32(x4[14], x4[13]);
+ x5[14] = _mm_add_epi32(x4[14], x4[13]);
+ x5[16] = x4[16];
+ x5[17] = x4[17];
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[18], x4[29], x5[18], x5[29],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[19], x4[28], x5[19], x5[28],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[20], x4[27], x5[20], x5[27],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[21], x4[26], x5[21], x5[26],
+ __rounding, cos_bit);
+ x5[22] = x4[22];
+ x5[23] = x4[23];
+ x5[24] = x4[24];
+ x5[25] = x4[25];
+ x5[30] = x4[30];
+ x5[31] = x4[31];
+ x5[32] = _mm_add_epi32(x4[32], x4[39]);
+ x5[39] = _mm_sub_epi32(x4[32], x4[39]);
+ x5[33] = _mm_add_epi32(x4[33], x4[38]);
+ x5[38] = _mm_sub_epi32(x4[33], x4[38]);
+ x5[34] = _mm_add_epi32(x4[34], x4[37]);
+ x5[37] = _mm_sub_epi32(x4[34], x4[37]);
+ x5[35] = _mm_add_epi32(x4[35], x4[36]);
+ x5[36] = _mm_sub_epi32(x4[35], x4[36]);
+ x5[40] = _mm_sub_epi32(x4[47], x4[40]);
+ x5[47] = _mm_add_epi32(x4[47], x4[40]);
+ x5[41] = _mm_sub_epi32(x4[46], x4[41]);
+ x5[46] = _mm_add_epi32(x4[46], x4[41]);
+ x5[42] = _mm_sub_epi32(x4[45], x4[42]);
+ x5[45] = _mm_add_epi32(x4[45], x4[42]);
+ x5[43] = _mm_sub_epi32(x4[44], x4[43]);
+ x5[44] = _mm_add_epi32(x4[44], x4[43]);
+ x5[48] = _mm_add_epi32(x4[48], x4[55]);
+ x5[55] = _mm_sub_epi32(x4[48], x4[55]);
+ x5[49] = _mm_add_epi32(x4[49], x4[54]);
+ x5[54] = _mm_sub_epi32(x4[49], x4[54]);
+ x5[50] = _mm_add_epi32(x4[50], x4[53]);
+ x5[53] = _mm_sub_epi32(x4[50], x4[53]);
+ x5[51] = _mm_add_epi32(x4[51], x4[52]);
+ x5[52] = _mm_sub_epi32(x4[51], x4[52]);
+ x5[56] = _mm_sub_epi32(x4[63], x4[56]);
+ x5[63] = _mm_add_epi32(x4[63], x4[56]);
+ x5[57] = _mm_sub_epi32(x4[62], x4[57]);
+ x5[62] = _mm_add_epi32(x4[62], x4[57]);
+ x5[58] = _mm_sub_epi32(x4[61], x4[58]);
+ x5[61] = _mm_add_epi32(x4[61], x4[58]);
+ x5[59] = _mm_sub_epi32(x4[60], x4[59]);
+ x5[60] = _mm_add_epi32(x4[60], x4[59]);
+
+ // stage 6
+ __m128i x6[64];
+ btf_32_type0_sse4_1_new(cospi_p32, cospi_p32, x5[0], x5[1], x6[0], x6[1],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p48, cospi_p16, x5[2], x5[3], x6[2], x6[3],
+ __rounding, cos_bit);
+ x6[4] = _mm_add_epi32(x5[4], x5[5]);
+ x6[5] = _mm_sub_epi32(x5[4], x5[5]);
+ x6[6] = _mm_sub_epi32(x5[7], x5[6]);
+ x6[7] = _mm_add_epi32(x5[7], x5[6]);
+ x6[8] = x5[8];
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x5[9], x5[14], x6[9], x6[14],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x5[10], x5[13], x6[10], x6[13],
+ __rounding, cos_bit);
+ x6[11] = x5[11];
+ x6[12] = x5[12];
+ x6[15] = x5[15];
+ x6[16] = _mm_add_epi32(x5[16], x5[19]);
+ x6[19] = _mm_sub_epi32(x5[16], x5[19]);
+ x6[17] = _mm_add_epi32(x5[17], x5[18]);
+ x6[18] = _mm_sub_epi32(x5[17], x5[18]);
+ x6[20] = _mm_sub_epi32(x5[23], x5[20]);
+ x6[23] = _mm_add_epi32(x5[23], x5[20]);
+ x6[21] = _mm_sub_epi32(x5[22], x5[21]);
+ x6[22] = _mm_add_epi32(x5[22], x5[21]);
+ x6[24] = _mm_add_epi32(x5[24], x5[27]);
+ x6[27] = _mm_sub_epi32(x5[24], x5[27]);
+ x6[25] = _mm_add_epi32(x5[25], x5[26]);
+ x6[26] = _mm_sub_epi32(x5[25], x5[26]);
+ x6[28] = _mm_sub_epi32(x5[31], x5[28]);
+ x6[31] = _mm_add_epi32(x5[31], x5[28]);
+ x6[29] = _mm_sub_epi32(x5[30], x5[29]);
+ x6[30] = _mm_add_epi32(x5[30], x5[29]);
+ x6[32] = x5[32];
+ x6[33] = x5[33];
+ btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[34], x5[61], x6[34], x6[61],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[35], x5[60], x6[35], x6[60],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[36], x5[59], x6[36], x6[59],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[37], x5[58], x6[37], x6[58],
+ __rounding, cos_bit);
+ x6[38] = x5[38];
+ x6[39] = x5[39];
+ x6[40] = x5[40];
+ x6[41] = x5[41];
+ btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[42], x5[53], x6[42], x6[53],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[43], x5[52], x6[43], x6[52],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[44], x5[51], x6[44], x6[51],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[45], x5[50], x6[45], x6[50],
+ __rounding, cos_bit);
+ x6[46] = x5[46];
+ x6[47] = x5[47];
+ x6[48] = x5[48];
+ x6[49] = x5[49];
+ x6[54] = x5[54];
+ x6[55] = x5[55];
+ x6[56] = x5[56];
+ x6[57] = x5[57];
+ x6[62] = x5[62];
+ x6[63] = x5[63];
+
+ // stage 7
+ __m128i x7[64];
+ x7[0] = x6[0];
+ x7[1] = x6[1];
+ x7[2] = x6[2];
+ x7[3] = x6[3];
+ btf_32_type1_sse4_1_new(cospi_p56, cospi_p08, x6[4], x6[7], x7[4], x7[7],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p24, cospi_p40, x6[5], x6[6], x7[5], x7[6],
+ __rounding, cos_bit);
+ x7[8] = _mm_add_epi32(x6[8], x6[9]);
+ x7[9] = _mm_sub_epi32(x6[8], x6[9]);
+ x7[10] = _mm_sub_epi32(x6[11], x6[10]);
+ x7[11] = _mm_add_epi32(x6[11], x6[10]);
+ x7[12] = _mm_add_epi32(x6[12], x6[13]);
+ x7[13] = _mm_sub_epi32(x6[12], x6[13]);
+ x7[14] = _mm_sub_epi32(x6[15], x6[14]);
+ x7[15] = _mm_add_epi32(x6[15], x6[14]);
+ x7[16] = x6[16];
+ btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x6[17], x6[30], x7[17], x7[30],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x6[18], x6[29], x7[18], x7[29],
+ __rounding, cos_bit);
+ x7[19] = x6[19];
+ x7[20] = x6[20];
+ btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x6[21], x6[26], x7[21], x7[26],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x6[22], x6[25], x7[22], x7[25],
+ __rounding, cos_bit);
+ x7[23] = x6[23];
+ x7[24] = x6[24];
+ x7[27] = x6[27];
+ x7[28] = x6[28];
+ x7[31] = x6[31];
+ x7[32] = _mm_add_epi32(x6[32], x6[35]);
+ x7[35] = _mm_sub_epi32(x6[32], x6[35]);
+ x7[33] = _mm_add_epi32(x6[33], x6[34]);
+ x7[34] = _mm_sub_epi32(x6[33], x6[34]);
+ x7[36] = _mm_sub_epi32(x6[39], x6[36]);
+ x7[39] = _mm_add_epi32(x6[39], x6[36]);
+ x7[37] = _mm_sub_epi32(x6[38], x6[37]);
+ x7[38] = _mm_add_epi32(x6[38], x6[37]);
+ x7[40] = _mm_add_epi32(x6[40], x6[43]);
+ x7[43] = _mm_sub_epi32(x6[40], x6[43]);
+ x7[41] = _mm_add_epi32(x6[41], x6[42]);
+ x7[42] = _mm_sub_epi32(x6[41], x6[42]);
+ x7[44] = _mm_sub_epi32(x6[47], x6[44]);
+ x7[47] = _mm_add_epi32(x6[47], x6[44]);
+ x7[45] = _mm_sub_epi32(x6[46], x6[45]);
+ x7[46] = _mm_add_epi32(x6[46], x6[45]);
+ x7[48] = _mm_add_epi32(x6[48], x6[51]);
+ x7[51] = _mm_sub_epi32(x6[48], x6[51]);
+ x7[49] = _mm_add_epi32(x6[49], x6[50]);
+ x7[50] = _mm_sub_epi32(x6[49], x6[50]);
+ x7[52] = _mm_sub_epi32(x6[55], x6[52]);
+ x7[55] = _mm_add_epi32(x6[55], x6[52]);
+ x7[53] = _mm_sub_epi32(x6[54], x6[53]);
+ x7[54] = _mm_add_epi32(x6[54], x6[53]);
+ x7[56] = _mm_add_epi32(x6[56], x6[59]);
+ x7[59] = _mm_sub_epi32(x6[56], x6[59]);
+ x7[57] = _mm_add_epi32(x6[57], x6[58]);
+ x7[58] = _mm_sub_epi32(x6[57], x6[58]);
+ x7[60] = _mm_sub_epi32(x6[63], x6[60]);
+ x7[63] = _mm_add_epi32(x6[63], x6[60]);
+ x7[61] = _mm_sub_epi32(x6[62], x6[61]);
+ x7[62] = _mm_add_epi32(x6[62], x6[61]);
+
+ // stage 8
+ __m128i x8[64];
+ x8[0] = x7[0];
+ x8[1] = x7[1];
+ x8[2] = x7[2];
+ x8[3] = x7[3];
+ x8[4] = x7[4];
+ x8[5] = x7[5];
+ x8[6] = x7[6];
+ x8[7] = x7[7];
+ btf_32_type1_sse4_1_new(cospi_p60, cospi_p04, x7[8], x7[15], x8[8], x8[15],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p28, cospi_p36, x7[9], x7[14], x8[9], x8[14],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p44, cospi_p20, x7[10], x7[13], x8[10], x8[13],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p12, cospi_p52, x7[11], x7[12], x8[11], x8[12],
+ __rounding, cos_bit);
+ x8[16] = _mm_add_epi32(x7[16], x7[17]);
+ x8[17] = _mm_sub_epi32(x7[16], x7[17]);
+ x8[18] = _mm_sub_epi32(x7[19], x7[18]);
+ x8[19] = _mm_add_epi32(x7[19], x7[18]);
+ x8[20] = _mm_add_epi32(x7[20], x7[21]);
+ x8[21] = _mm_sub_epi32(x7[20], x7[21]);
+ x8[22] = _mm_sub_epi32(x7[23], x7[22]);
+ x8[23] = _mm_add_epi32(x7[23], x7[22]);
+ x8[24] = _mm_add_epi32(x7[24], x7[25]);
+ x8[25] = _mm_sub_epi32(x7[24], x7[25]);
+ x8[26] = _mm_sub_epi32(x7[27], x7[26]);
+ x8[27] = _mm_add_epi32(x7[27], x7[26]);
+ x8[28] = _mm_add_epi32(x7[28], x7[29]);
+ x8[29] = _mm_sub_epi32(x7[28], x7[29]);
+ x8[30] = _mm_sub_epi32(x7[31], x7[30]);
+ x8[31] = _mm_add_epi32(x7[31], x7[30]);
+ x8[32] = x7[32];
+ btf_32_type0_sse4_1_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61],
+ __rounding, cos_bit);
+ x8[35] = x7[35];
+ x8[36] = x7[36];
+ btf_32_type0_sse4_1_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57],
+ __rounding, cos_bit);
+ x8[39] = x7[39];
+ x8[40] = x7[40];
+ btf_32_type0_sse4_1_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53],
+ __rounding, cos_bit);
+ x8[43] = x7[43];
+ x8[44] = x7[44];
+ btf_32_type0_sse4_1_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49],
+ __rounding, cos_bit);
+ x8[47] = x7[47];
+ x8[48] = x7[48];
+ x8[51] = x7[51];
+ x8[52] = x7[52];
+ x8[55] = x7[55];
+ x8[56] = x7[56];
+ x8[59] = x7[59];
+ x8[60] = x7[60];
+ x8[63] = x7[63];
+
+ // stage 9
+ __m128i x9[64];
+ x9[0] = x8[0];
+ x9[1] = x8[1];
+ x9[2] = x8[2];
+ x9[3] = x8[3];
+ x9[4] = x8[4];
+ x9[5] = x8[5];
+ x9[6] = x8[6];
+ x9[7] = x8[7];
+ x9[8] = x8[8];
+ x9[9] = x8[9];
+ x9[10] = x8[10];
+ x9[11] = x8[11];
+ x9[12] = x8[12];
+ x9[13] = x8[13];
+ x9[14] = x8[14];
+ x9[15] = x8[15];
+ btf_32_type1_sse4_1_new(cospi_p62, cospi_p02, x8[16], x8[31], x9[16], x9[31],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p30, cospi_p34, x8[17], x8[30], x9[17], x9[30],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p46, cospi_p18, x8[18], x8[29], x9[18], x9[29],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p14, cospi_p50, x8[19], x8[28], x9[19], x9[28],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p54, cospi_p10, x8[20], x8[27], x9[20], x9[27],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p22, cospi_p42, x8[21], x8[26], x9[21], x9[26],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p38, cospi_p26, x8[22], x8[25], x9[22], x9[25],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p06, cospi_p58, x8[23], x8[24], x9[23], x9[24],
+ __rounding, cos_bit);
+ x9[32] = _mm_add_epi32(x8[32], x8[33]);
+ x9[33] = _mm_sub_epi32(x8[32], x8[33]);
+ x9[34] = _mm_sub_epi32(x8[35], x8[34]);
+ x9[35] = _mm_add_epi32(x8[35], x8[34]);
+ x9[36] = _mm_add_epi32(x8[36], x8[37]);
+ x9[37] = _mm_sub_epi32(x8[36], x8[37]);
+ x9[38] = _mm_sub_epi32(x8[39], x8[38]);
+ x9[39] = _mm_add_epi32(x8[39], x8[38]);
+ x9[40] = _mm_add_epi32(x8[40], x8[41]);
+ x9[41] = _mm_sub_epi32(x8[40], x8[41]);
+ x9[42] = _mm_sub_epi32(x8[43], x8[42]);
+ x9[43] = _mm_add_epi32(x8[43], x8[42]);
+ x9[44] = _mm_add_epi32(x8[44], x8[45]);
+ x9[45] = _mm_sub_epi32(x8[44], x8[45]);
+ x9[46] = _mm_sub_epi32(x8[47], x8[46]);
+ x9[47] = _mm_add_epi32(x8[47], x8[46]);
+ x9[48] = _mm_add_epi32(x8[48], x8[49]);
+ x9[49] = _mm_sub_epi32(x8[48], x8[49]);
+ x9[50] = _mm_sub_epi32(x8[51], x8[50]);
+ x9[51] = _mm_add_epi32(x8[51], x8[50]);
+ x9[52] = _mm_add_epi32(x8[52], x8[53]);
+ x9[53] = _mm_sub_epi32(x8[52], x8[53]);
+ x9[54] = _mm_sub_epi32(x8[55], x8[54]);
+ x9[55] = _mm_add_epi32(x8[55], x8[54]);
+ x9[56] = _mm_add_epi32(x8[56], x8[57]);
+ x9[57] = _mm_sub_epi32(x8[56], x8[57]);
+ x9[58] = _mm_sub_epi32(x8[59], x8[58]);
+ x9[59] = _mm_add_epi32(x8[59], x8[58]);
+ x9[60] = _mm_add_epi32(x8[60], x8[61]);
+ x9[61] = _mm_sub_epi32(x8[60], x8[61]);
+ x9[62] = _mm_sub_epi32(x8[63], x8[62]);
+ x9[63] = _mm_add_epi32(x8[63], x8[62]);
+
+ // stage 10
+ __m128i x10[64];
+ x10[0] = x9[0];
+ x10[1] = x9[1];
+ x10[2] = x9[2];
+ x10[3] = x9[3];
+ x10[4] = x9[4];
+ x10[5] = x9[5];
+ x10[6] = x9[6];
+ x10[7] = x9[7];
+ x10[8] = x9[8];
+ x10[9] = x9[9];
+ x10[10] = x9[10];
+ x10[11] = x9[11];
+ x10[12] = x9[12];
+ x10[13] = x9[13];
+ x10[14] = x9[14];
+ x10[15] = x9[15];
+ x10[16] = x9[16];
+ x10[17] = x9[17];
+ x10[18] = x9[18];
+ x10[19] = x9[19];
+ x10[20] = x9[20];
+ x10[21] = x9[21];
+ x10[22] = x9[22];
+ x10[23] = x9[23];
+ x10[24] = x9[24];
+ x10[25] = x9[25];
+ x10[26] = x9[26];
+ x10[27] = x9[27];
+ x10[28] = x9[28];
+ x10[29] = x9[29];
+ x10[30] = x9[30];
+ x10[31] = x9[31];
+ btf_32_type1_sse4_1_new(cospi_p63, cospi_p01, x9[32], x9[63], x10[32],
+ x10[63], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p31, cospi_p33, x9[33], x9[62], x10[33],
+ x10[62], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p47, cospi_p17, x9[34], x9[61], x10[34],
+ x10[61], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p15, cospi_p49, x9[35], x9[60], x10[35],
+ x10[60], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p55, cospi_p09, x9[36], x9[59], x10[36],
+ x10[59], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p23, cospi_p41, x9[37], x9[58], x10[37],
+ x10[58], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p39, cospi_p25, x9[38], x9[57], x10[38],
+ x10[57], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p07, cospi_p57, x9[39], x9[56], x10[39],
+ x10[56], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p59, cospi_p05, x9[40], x9[55], x10[40],
+ x10[55], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p27, cospi_p37, x9[41], x9[54], x10[41],
+ x10[54], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p43, cospi_p21, x9[42], x9[53], x10[42],
+ x10[53], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p11, cospi_p53, x9[43], x9[52], x10[43],
+ x10[52], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p51, cospi_p13, x9[44], x9[51], x10[44],
+ x10[51], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p19, cospi_p45, x9[45], x9[50], x10[45],
+ x10[50], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p35, cospi_p29, x9[46], x9[49], x10[46],
+ x10[49], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p03, cospi_p61, x9[47], x9[48], x10[47],
+ x10[48], __rounding, cos_bit);
+
+ // stage 11
+ output[0] = x10[0];
+ output[1] = x10[32];
+ output[2] = x10[16];
+ output[3] = x10[48];
+ output[4] = x10[8];
+ output[5] = x10[40];
+ output[6] = x10[24];
+ output[7] = x10[56];
+ output[8] = x10[4];
+ output[9] = x10[36];
+ output[10] = x10[20];
+ output[11] = x10[52];
+ output[12] = x10[12];
+ output[13] = x10[44];
+ output[14] = x10[28];
+ output[15] = x10[60];
+ output[16] = x10[2];
+ output[17] = x10[34];
+ output[18] = x10[18];
+ output[19] = x10[50];
+ output[20] = x10[10];
+ output[21] = x10[42];
+ output[22] = x10[26];
+ output[23] = x10[58];
+ output[24] = x10[6];
+ output[25] = x10[38];
+ output[26] = x10[22];
+ output[27] = x10[54];
+ output[28] = x10[14];
+ output[29] = x10[46];
+ output[30] = x10[30];
+ output[31] = x10[62];
+ output[32] = x10[1];
+ output[33] = x10[33];
+ output[34] = x10[17];
+ output[35] = x10[49];
+ output[36] = x10[9];
+ output[37] = x10[41];
+ output[38] = x10[25];
+ output[39] = x10[57];
+ output[40] = x10[5];
+ output[41] = x10[37];
+ output[42] = x10[21];
+ output[43] = x10[53];
+ output[44] = x10[13];
+ output[45] = x10[45];
+ output[46] = x10[29];
+ output[47] = x10[61];
+ output[48] = x10[3];
+ output[49] = x10[35];
+ output[50] = x10[19];
+ output[51] = x10[51];
+ output[52] = x10[11];
+ output[53] = x10[43];
+ output[54] = x10[27];
+ output[55] = x10[59];
+ output[56] = x10[7];
+ output[57] = x10[39];
+ output[58] = x10[23];
+ output[59] = x10[55];
+ output[60] = x10[15];
+ output[61] = x10[47];
+ output[62] = x10[31];
+ output[63] = x10[63];
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
new file mode 100644
index 000000000..abb95f31e
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
+
+static INLINE void int16_array_with_stride_to_int32_array_without_stride(
+ const int16_t *input, int stride, int32_t *output, int txfm1d_size) {
+ int r, c;
+ for (r = 0; r < txfm1d_size; r++) {
+ for (c = 0; c < txfm1d_size; c++) {
+ output[r * txfm1d_size + c] = (int32_t)input[r * stride + c];
+ }
+ }
+}
+
+typedef void (*TxfmFuncSSE2)(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+
+static void fdct32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range) {
+ const int txfm_size = 32;
+ const int num_per_128 = 4;
+ __m128i buf0[32];
+ __m128i buf1[32];
+ int col_num = txfm_size / num_per_128;
+ int col;
+ (void)stage_range;
+ for (col = 0; col < col_num; col++) {
+ int j;
+ for (j = 0; j < 32; ++j) {
+ buf0[j] = input[j * col_num + col];
+ }
+ av1_fdct32_new_sse4_1(buf0, buf1, cos_bit);
+ for (j = 0; j < 32; ++j) {
+ output[j * col_num + col] = buf1[j];
+ }
+ }
+}
+
+static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
+ switch (txfm_type) {
+ case TXFM_TYPE_DCT32: return fdct32_new_sse4_1; break;
+ default: assert(0);
+ }
+ return NULL;
+}
+
+static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
+ const int stride,
+ const TXFM_2D_FLIP_CFG *cfg,
+ int32_t *txfm_buf) {
+ // TODO(sarahparker) This does not currently support rectangular transforms
+ // and will break without splitting txfm_size out into row and col size.
+ // Rectangular transforms use c code only, so it should be ok for now.
+ // It will be corrected when there are sse implementations for rectangular
+ // transforms.
+ assert(cfg->tx_size < TX_SIZES);
+ const int txfm_size = tx_size_wide[cfg->tx_size];
+ const int8_t *shift = cfg->shift;
+ const int8_t *stage_range_col = cfg->stage_range_col;
+ const int8_t *stage_range_row = cfg->stage_range_row;
+ const int8_t cos_bit_col = cfg->cos_bit_col;
+ const int8_t cos_bit_row = cfg->cos_bit_row;
+ const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+ const TxfmFuncSSE2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
+
+ __m128i *buf_128 = (__m128i *)txfm_buf;
+ __m128i *out_128 = (__m128i *)output;
+ int num_per_128 = 4;
+ int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
+
+ int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf,
+ txfm_size);
+ av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]);
+ txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
+ av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
+ transpose_32(txfm_size, out_128, buf_128);
+ txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row);
+ av1_round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]);
+ transpose_32(txfm_size, buf_128, out_128);
+}
+
+void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg);
+ (void)bd;
+ fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf);
+}
+
+static INLINE void transpose_32_4x4x2(int stride, const __m128i *inputA,
+ const __m128i *inputB, __m128i *output) {
+ __m128i temp0 = _mm_unpacklo_epi32(inputA[0], inputA[2]);
+ __m128i temp1 = _mm_unpackhi_epi32(inputA[0], inputA[2]);
+ __m128i temp2 = _mm_unpacklo_epi32(inputA[1], inputA[3]);
+ __m128i temp3 = _mm_unpackhi_epi32(inputA[1], inputA[3]);
+
+ output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);
+ output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);
+ output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);
+ output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);
+
+ temp0 = _mm_unpacklo_epi32(inputB[0], inputB[2]);
+ temp1 = _mm_unpackhi_epi32(inputB[0], inputB[2]);
+ temp2 = _mm_unpacklo_epi32(inputB[1], inputB[3]);
+ temp3 = _mm_unpackhi_epi32(inputB[1], inputB[3]);
+
+ output[4 * stride] = _mm_unpacklo_epi32(temp0, temp2);
+ output[5 * stride] = _mm_unpackhi_epi32(temp0, temp2);
+ output[6 * stride] = _mm_unpacklo_epi32(temp1, temp3);
+ output[7 * stride] = _mm_unpackhi_epi32(temp1, temp3);
+}
+
+static void lowbd_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X64;
+ __m128i buf0[64], buf1[512];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_sse2 col_txfm = fdct8x64_new_sse2;
+ const int width_div8 = (width >> 3);
+ const int height_div8 = (height >> 3);
+
+ for (int i = 0; i < width_div8; i++) {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+ transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ }
+ }
+ for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+ __m128i bufA[64];
+ __m128i bufB[64];
+ __m128i *buf = buf1 + width * i;
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm_cvtepi16_epi32(buf[j]);
+ bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
+ }
+ av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row);
+ av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row);
+ av1_round_shift_array_32_sse4_1(bufA, bufA, 32, -shift[2]);
+ av1_round_shift_array_32_sse4_1(bufB, bufB, 32, -shift[2]);
+
+ int32_t *output8 = output + 8 * 32 * i;
+ for (int j = 0; j < width_div8; ++j) {
+ __m128i *out = (__m128i *)(output8 + 4 * j);
+ transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out);
+ }
+ }
+}
+
+static void lowbd_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const TX_SIZE tx_size = TX_64X32;
+ __m128i buf0[64], buf1[256];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
+ const int width_div8 = (width >> 3);
+ const int height_div8 = (height >> 3);
+
+ for (int i = 0; i < width_div8; i++) {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+ transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ }
+ }
+ assert(tx_type == DCT_DCT);
+ for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+ __m128i bufA[64];
+ __m128i bufB[64];
+ __m128i *buf = buf1 + width * i;
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm_cvtepi16_epi32(buf[j]);
+ bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
+ }
+ av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row);
+ av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row);
+ av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2]);
+ av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2]);
+
+ int32_t *output8 = output + 8 * 32 * i;
+ for (int j = 0; j < width_div8; ++j) {
+ __m128i *out = (__m128i *)(output8 + 4 * j);
+ transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out);
+ }
+ }
+}
+
+static void lowbd_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_32X64;
+ __m128i buf0[64], buf1[256];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_sse2 col_txfm = fdct8x64_new_sse2;
+ const int width_div8 = (width >> 3);
+ const int height_div8 = (height >> 3);
+
+ for (int i = 0; i < width_div8; i++) {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+ transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ }
+ }
+
+ for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+ __m128i bufA[32];
+ __m128i bufB[32];
+ __m128i *buf = buf1 + width * i;
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm_cvtepi16_epi32(buf[j]);
+ bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
+ }
+ av1_fdct32_new_sse4_1(bufA, bufA, cos_bit_row);
+ av1_fdct32_new_sse4_1(bufB, bufB, cos_bit_row);
+ av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2]);
+ av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2]);
+
+ int32_t *output8 = output + 8 * 32 * i;
+ for (int j = 0; j < (32 / 4); ++j) {
+ __m128i *out = (__m128i *)(output8 + 4 * j);
+ transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out);
+ }
+ }
+}
+
+static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
+ av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform
+ av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform
+ av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform
+ av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform
+ lowbd_fwd_txfm2d_64x64_sse4_1, // 64x64 transform
+ av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform
+ av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform
+ av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform
+ av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform
+ av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform
+ av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform
+ lowbd_fwd_txfm2d_32x64_sse4_1, // 32x64 transform
+ lowbd_fwd_txfm2d_64x32_sse4_1, // 64x32 transform
+ av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform
+ av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform
+ av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform
+ av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform
+ av1_lowbd_fwd_txfm2d_16x64_sse2, // 16x64 transform
+ av1_lowbd_fwd_txfm2d_64x16_sse2, // 64x16 transform
+};
+
+void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size];
+ if ((fwd_txfm2d_func == NULL) ||
+ (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) {
+ av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+ } else {
+ fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c
new file mode 100644
index 000000000..6aae7ce1e
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c
@@ -0,0 +1,2889 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
+
+// TODO(linfengz): refine fdct4x8 and fadst4x8 optimization (if possible).
+
+static void fdct4x4_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+ __m128i u[4], v[4];
+
+ u[0] = _mm_unpacklo_epi16(input[0], input[1]);
+ u[1] = _mm_unpacklo_epi16(input[3], input[2]);
+
+ v[0] = _mm_add_epi16(u[0], u[1]);
+ v[1] = _mm_sub_epi16(u[0], u[1]);
+
+ u[0] = _mm_madd_epi16(v[0], cospi_p32_p32); // 0
+ u[1] = _mm_madd_epi16(v[0], cospi_p32_m32); // 2
+ u[2] = _mm_madd_epi16(v[1], cospi_p16_p48); // 1
+ u[3] = _mm_madd_epi16(v[1], cospi_p48_m16); // 3
+
+ v[0] = _mm_add_epi32(u[0], __rounding);
+ v[1] = _mm_add_epi32(u[1], __rounding);
+ v[2] = _mm_add_epi32(u[2], __rounding);
+ v[3] = _mm_add_epi32(u[3], __rounding);
+ u[0] = _mm_srai_epi32(v[0], cos_bit);
+ u[1] = _mm_srai_epi32(v[1], cos_bit);
+ u[2] = _mm_srai_epi32(v[2], cos_bit);
+ u[3] = _mm_srai_epi32(v[3], cos_bit);
+
+ output[0] = _mm_packs_epi32(u[0], u[1]);
+ output[1] = _mm_packs_epi32(u[2], u[3]);
+ output[2] = _mm_srli_si128(output[0], 8);
+ output[3] = _mm_srli_si128(output[1], 8);
+}
+
+static void fdct8x4_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+
+ // stage 1
+ __m128i x1[4];
+ x1[0] = _mm_adds_epi16(input[0], input[3]);
+ x1[3] = _mm_subs_epi16(input[0], input[3]);
+ x1[1] = _mm_adds_epi16(input[1], input[2]);
+ x1[2] = _mm_subs_epi16(input[1], input[2]);
+
+ // stage 2
+ __m128i x2[4];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x2[0], x2[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x1[2], x1[3], x2[2], x2[3]);
+
+ // stage 3
+ output[0] = x2[0];
+ output[1] = x2[2];
+ output[2] = x2[1];
+ output[3] = x2[3];
+}
+
+static void fdct4x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+
+ // stage 1
+ __m128i x1[8];
+ x1[0] = _mm_adds_epi16(input[0], input[7]);
+ x1[7] = _mm_subs_epi16(input[0], input[7]);
+ x1[1] = _mm_adds_epi16(input[1], input[6]);
+ x1[6] = _mm_subs_epi16(input[1], input[6]);
+ x1[2] = _mm_adds_epi16(input[2], input[5]);
+ x1[5] = _mm_subs_epi16(input[2], input[5]);
+ x1[3] = _mm_adds_epi16(input[3], input[4]);
+ x1[4] = _mm_subs_epi16(input[3], input[4]);
+
+ // stage 2
+ __m128i x2[8];
+ x2[0] = _mm_adds_epi16(x1[0], x1[3]);
+ x2[3] = _mm_subs_epi16(x1[0], x1[3]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[2]);
+ x2[2] = _mm_subs_epi16(x1[1], x1[2]);
+ x2[4] = x1[4];
+ btf_16_w4_sse2(&cospi_m32_p32, &cospi_p32_p32, __rounding, cos_bit, &x1[5],
+ &x1[6], &x2[5], &x2[6]);
+ x2[7] = x1[7];
+
+ // stage 3
+ __m128i x3[8];
+ btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x2[0],
+ &x2[1], &x3[0], &x3[1]);
+ btf_16_w4_sse2(&cospi_p48_p16, &cospi_m16_p48, __rounding, cos_bit, &x2[2],
+ &x2[3], &x3[2], &x3[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[5]);
+ x3[5] = _mm_subs_epi16(x2[4], x2[5]);
+ x3[6] = _mm_subs_epi16(x2[7], x2[6]);
+ x3[7] = _mm_adds_epi16(x2[7], x2[6]);
+
+ // stage 4
+ __m128i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_w4_sse2(&cospi_p56_p08, &cospi_m08_p56, __rounding, cos_bit, &x3[4],
+ &x3[7], &x4[4], &x4[7]);
+ btf_16_w4_sse2(&cospi_p24_p40, &cospi_m40_p24, __rounding, cos_bit, &x3[5],
+ &x3[6], &x4[5], &x4[6]);
+
+ // stage 5
+ output[0] = x4[0];
+ output[1] = x4[4];
+ output[2] = x4[2];
+ output[3] = x4[6];
+ output[4] = x4[1];
+ output[5] = x4[5];
+ output[6] = x4[3];
+ output[7] = x4[7];
+}
+
+static void fdct8x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+
+ // stage 1
+ __m128i x1[8];
+ x1[0] = _mm_adds_epi16(input[0], input[7]);
+ x1[7] = _mm_subs_epi16(input[0], input[7]);
+ x1[1] = _mm_adds_epi16(input[1], input[6]);
+ x1[6] = _mm_subs_epi16(input[1], input[6]);
+ x1[2] = _mm_adds_epi16(input[2], input[5]);
+ x1[5] = _mm_subs_epi16(input[2], input[5]);
+ x1[3] = _mm_adds_epi16(input[3], input[4]);
+ x1[4] = _mm_subs_epi16(input[3], input[4]);
+
+ // stage 2
+ __m128i x2[8];
+ x2[0] = _mm_adds_epi16(x1[0], x1[3]);
+ x2[3] = _mm_subs_epi16(x1[0], x1[3]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[2]);
+ x2[2] = _mm_subs_epi16(x1[1], x1[2]);
+ x2[4] = x1[4];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[5], x1[6], x2[5], x2[6]);
+ x2[7] = x1[7];
+
+ // stage 3
+ __m128i x3[8];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x2[0], x2[1], x3[0], x3[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x2[2], x2[3], x3[2], x3[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[5]);
+ x3[5] = _mm_subs_epi16(x2[4], x2[5]);
+ x3[6] = _mm_subs_epi16(x2[7], x2[6]);
+ x3[7] = _mm_adds_epi16(x2[7], x2[6]);
+
+ // stage 4
+ __m128i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x3[4], x3[7], x4[4], x4[7]);
+ btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x3[5], x3[6], x4[5], x4[6]);
+
+ // stage 5
+ output[0] = x4[0];
+ output[1] = x4[4];
+ output[2] = x4[2];
+ output[3] = x4[6];
+ output[4] = x4[1];
+ output[5] = x4[5];
+ output[6] = x4[3];
+ output[7] = x4[7];
+}
+
+static void fdct8x16_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+ __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+ __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+
+ // stage 1
+ __m128i x1[16];
+ x1[0] = _mm_adds_epi16(input[0], input[15]);
+ x1[15] = _mm_subs_epi16(input[0], input[15]);
+ x1[1] = _mm_adds_epi16(input[1], input[14]);
+ x1[14] = _mm_subs_epi16(input[1], input[14]);
+ x1[2] = _mm_adds_epi16(input[2], input[13]);
+ x1[13] = _mm_subs_epi16(input[2], input[13]);
+ x1[3] = _mm_adds_epi16(input[3], input[12]);
+ x1[12] = _mm_subs_epi16(input[3], input[12]);
+ x1[4] = _mm_adds_epi16(input[4], input[11]);
+ x1[11] = _mm_subs_epi16(input[4], input[11]);
+ x1[5] = _mm_adds_epi16(input[5], input[10]);
+ x1[10] = _mm_subs_epi16(input[5], input[10]);
+ x1[6] = _mm_adds_epi16(input[6], input[9]);
+ x1[9] = _mm_subs_epi16(input[6], input[9]);
+ x1[7] = _mm_adds_epi16(input[7], input[8]);
+ x1[8] = _mm_subs_epi16(input[7], input[8]);
+
+ // stage 2
+ __m128i x2[16];
+ x2[0] = _mm_adds_epi16(x1[0], x1[7]);
+ x2[7] = _mm_subs_epi16(x1[0], x1[7]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[6]);
+ x2[6] = _mm_subs_epi16(x1[1], x1[6]);
+ x2[2] = _mm_adds_epi16(x1[2], x1[5]);
+ x2[5] = _mm_subs_epi16(x1[2], x1[5]);
+ x2[3] = _mm_adds_epi16(x1[3], x1[4]);
+ x2[4] = _mm_subs_epi16(x1[3], x1[4]);
+ x2[8] = x1[8];
+ x2[9] = x1[9];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[10], x1[13], x2[10], x2[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[11], x1[12], x2[11], x2[12]);
+ x2[14] = x1[14];
+ x2[15] = x1[15];
+
+ // stage 3
+ __m128i x3[16];
+ x3[0] = _mm_adds_epi16(x2[0], x2[3]);
+ x3[3] = _mm_subs_epi16(x2[0], x2[3]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[2]);
+ x3[2] = _mm_subs_epi16(x2[1], x2[2]);
+ x3[4] = x2[4];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[5], x2[6], x3[5], x3[6]);
+ x3[7] = x2[7];
+ x3[8] = _mm_adds_epi16(x2[8], x2[11]);
+ x3[11] = _mm_subs_epi16(x2[8], x2[11]);
+ x3[9] = _mm_adds_epi16(x2[9], x2[10]);
+ x3[10] = _mm_subs_epi16(x2[9], x2[10]);
+ x3[12] = _mm_subs_epi16(x2[15], x2[12]);
+ x3[15] = _mm_adds_epi16(x2[15], x2[12]);
+ x3[13] = _mm_subs_epi16(x2[14], x2[13]);
+ x3[14] = _mm_adds_epi16(x2[14], x2[13]);
+
+ // stage 4
+ __m128i x4[16];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x3[0], x3[1], x4[0], x4[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x3[2], x3[3], x4[2], x4[3]);
+ x4[4] = _mm_adds_epi16(x3[4], x3[5]);
+ x4[5] = _mm_subs_epi16(x3[4], x3[5]);
+ x4[6] = _mm_subs_epi16(x3[7], x3[6]);
+ x4[7] = _mm_adds_epi16(x3[7], x3[6]);
+ x4[8] = x3[8];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[9], x3[14], x4[9], x4[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[10], x3[13], x4[10], x4[13]);
+ x4[11] = x3[11];
+ x4[12] = x3[12];
+ x4[15] = x3[15];
+
+ // stage 5
+ __m128i x5[16];
+ x5[0] = x4[0];
+ x5[1] = x4[1];
+ x5[2] = x4[2];
+ x5[3] = x4[3];
+ btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x4[4], x4[7], x5[4], x5[7]);
+ btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x4[5], x4[6], x5[5], x5[6]);
+ x5[8] = _mm_adds_epi16(x4[8], x4[9]);
+ x5[9] = _mm_subs_epi16(x4[8], x4[9]);
+ x5[10] = _mm_subs_epi16(x4[11], x4[10]);
+ x5[11] = _mm_adds_epi16(x4[11], x4[10]);
+ x5[12] = _mm_adds_epi16(x4[12], x4[13]);
+ x5[13] = _mm_subs_epi16(x4[12], x4[13]);
+ x5[14] = _mm_subs_epi16(x4[15], x4[14]);
+ x5[15] = _mm_adds_epi16(x4[15], x4[14]);
+
+ // stage 6
+ __m128i x6[16];
+ x6[0] = x5[0];
+ x6[1] = x5[1];
+ x6[2] = x5[2];
+ x6[3] = x5[3];
+ x6[4] = x5[4];
+ x6[5] = x5[5];
+ x6[6] = x5[6];
+ x6[7] = x5[7];
+ btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x5[8], x5[15], x6[8], x6[15]);
+ btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x5[9], x5[14], x6[9], x6[14]);
+ btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x5[10], x5[13], x6[10], x6[13]);
+ btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x5[11], x5[12], x6[11], x6[12]);
+
+ // stage 7
+ output[0] = x6[0];
+ output[1] = x6[8];
+ output[2] = x6[4];
+ output[3] = x6[12];
+ output[4] = x6[2];
+ output[5] = x6[10];
+ output[6] = x6[6];
+ output[7] = x6[14];
+ output[8] = x6[1];
+ output[9] = x6[9];
+ output[10] = x6[5];
+ output[11] = x6[13];
+ output[12] = x6[3];
+ output[13] = x6[11];
+ output[14] = x6[7];
+ output[15] = x6[15];
+}
+
+void fdct8x32_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+ __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+ __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+ __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+ __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]);
+ __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]);
+ __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]);
+ __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]);
+ __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]);
+ __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]);
+ __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]);
+ __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]);
+ __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]);
+ __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]);
+ __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]);
+ __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]);
+ __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]);
+ __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]);
+ __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]);
+ __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]);
+
+ // stage 1
+ __m128i x1[32];
+ x1[0] = _mm_adds_epi16(input[0], input[31]);
+ x1[31] = _mm_subs_epi16(input[0], input[31]);
+ x1[1] = _mm_adds_epi16(input[1], input[30]);
+ x1[30] = _mm_subs_epi16(input[1], input[30]);
+ x1[2] = _mm_adds_epi16(input[2], input[29]);
+ x1[29] = _mm_subs_epi16(input[2], input[29]);
+ x1[3] = _mm_adds_epi16(input[3], input[28]);
+ x1[28] = _mm_subs_epi16(input[3], input[28]);
+ x1[4] = _mm_adds_epi16(input[4], input[27]);
+ x1[27] = _mm_subs_epi16(input[4], input[27]);
+ x1[5] = _mm_adds_epi16(input[5], input[26]);
+ x1[26] = _mm_subs_epi16(input[5], input[26]);
+ x1[6] = _mm_adds_epi16(input[6], input[25]);
+ x1[25] = _mm_subs_epi16(input[6], input[25]);
+ x1[7] = _mm_adds_epi16(input[7], input[24]);
+ x1[24] = _mm_subs_epi16(input[7], input[24]);
+ x1[8] = _mm_adds_epi16(input[8], input[23]);
+ x1[23] = _mm_subs_epi16(input[8], input[23]);
+ x1[9] = _mm_adds_epi16(input[9], input[22]);
+ x1[22] = _mm_subs_epi16(input[9], input[22]);
+ x1[10] = _mm_adds_epi16(input[10], input[21]);
+ x1[21] = _mm_subs_epi16(input[10], input[21]);
+ x1[11] = _mm_adds_epi16(input[11], input[20]);
+ x1[20] = _mm_subs_epi16(input[11], input[20]);
+ x1[12] = _mm_adds_epi16(input[12], input[19]);
+ x1[19] = _mm_subs_epi16(input[12], input[19]);
+ x1[13] = _mm_adds_epi16(input[13], input[18]);
+ x1[18] = _mm_subs_epi16(input[13], input[18]);
+ x1[14] = _mm_adds_epi16(input[14], input[17]);
+ x1[17] = _mm_subs_epi16(input[14], input[17]);
+ x1[15] = _mm_adds_epi16(input[15], input[16]);
+ x1[16] = _mm_subs_epi16(input[15], input[16]);
+
+ // stage 2
+ __m128i x2[32];
+ x2[0] = _mm_adds_epi16(x1[0], x1[15]);
+ x2[15] = _mm_subs_epi16(x1[0], x1[15]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[14]);
+ x2[14] = _mm_subs_epi16(x1[1], x1[14]);
+ x2[2] = _mm_adds_epi16(x1[2], x1[13]);
+ x2[13] = _mm_subs_epi16(x1[2], x1[13]);
+ x2[3] = _mm_adds_epi16(x1[3], x1[12]);
+ x2[12] = _mm_subs_epi16(x1[3], x1[12]);
+ x2[4] = _mm_adds_epi16(x1[4], x1[11]);
+ x2[11] = _mm_subs_epi16(x1[4], x1[11]);
+ x2[5] = _mm_adds_epi16(x1[5], x1[10]);
+ x2[10] = _mm_subs_epi16(x1[5], x1[10]);
+ x2[6] = _mm_adds_epi16(x1[6], x1[9]);
+ x2[9] = _mm_subs_epi16(x1[6], x1[9]);
+ x2[7] = _mm_adds_epi16(x1[7], x1[8]);
+ x2[8] = _mm_subs_epi16(x1[7], x1[8]);
+ x2[16] = x1[16];
+ x2[17] = x1[17];
+ x2[18] = x1[18];
+ x2[19] = x1[19];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[20], x1[27], x2[20], x2[27]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[21], x1[26], x2[21], x2[26]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[22], x1[25], x2[22], x2[25]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[23], x1[24], x2[23], x2[24]);
+ x2[28] = x1[28];
+ x2[29] = x1[29];
+ x2[30] = x1[30];
+ x2[31] = x1[31];
+
+ // stage 3
+ __m128i x3[32];
+ x3[0] = _mm_adds_epi16(x2[0], x2[7]);
+ x3[7] = _mm_subs_epi16(x2[0], x2[7]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[6]);
+ x3[6] = _mm_subs_epi16(x2[1], x2[6]);
+ x3[2] = _mm_adds_epi16(x2[2], x2[5]);
+ x3[5] = _mm_subs_epi16(x2[2], x2[5]);
+ x3[3] = _mm_adds_epi16(x2[3], x2[4]);
+ x3[4] = _mm_subs_epi16(x2[3], x2[4]);
+ x3[8] = x2[8];
+ x3[9] = x2[9];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[10], x2[13], x3[10], x3[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[11], x2[12], x3[11], x3[12]);
+ x3[14] = x2[14];
+ x3[15] = x2[15];
+ x3[16] = _mm_adds_epi16(x2[16], x2[23]);
+ x3[23] = _mm_subs_epi16(x2[16], x2[23]);
+ x3[17] = _mm_adds_epi16(x2[17], x2[22]);
+ x3[22] = _mm_subs_epi16(x2[17], x2[22]);
+ x3[18] = _mm_adds_epi16(x2[18], x2[21]);
+ x3[21] = _mm_subs_epi16(x2[18], x2[21]);
+ x3[19] = _mm_adds_epi16(x2[19], x2[20]);
+ x3[20] = _mm_subs_epi16(x2[19], x2[20]);
+ x3[24] = _mm_subs_epi16(x2[31], x2[24]);
+ x3[31] = _mm_adds_epi16(x2[31], x2[24]);
+ x3[25] = _mm_subs_epi16(x2[30], x2[25]);
+ x3[30] = _mm_adds_epi16(x2[30], x2[25]);
+ x3[26] = _mm_subs_epi16(x2[29], x2[26]);
+ x3[29] = _mm_adds_epi16(x2[29], x2[26]);
+ x3[27] = _mm_subs_epi16(x2[28], x2[27]);
+ x3[28] = _mm_adds_epi16(x2[28], x2[27]);
+
+ // stage 4
+ __m128i x4[32];
+ x4[0] = _mm_adds_epi16(x3[0], x3[3]);
+ x4[3] = _mm_subs_epi16(x3[0], x3[3]);
+ x4[1] = _mm_adds_epi16(x3[1], x3[2]);
+ x4[2] = _mm_subs_epi16(x3[1], x3[2]);
+ x4[4] = x3[4];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[5], x3[6], x4[5], x4[6]);
+ x4[7] = x3[7];
+ x4[8] = _mm_adds_epi16(x3[8], x3[11]);
+ x4[11] = _mm_subs_epi16(x3[8], x3[11]);
+ x4[9] = _mm_adds_epi16(x3[9], x3[10]);
+ x4[10] = _mm_subs_epi16(x3[9], x3[10]);
+ x4[12] = _mm_subs_epi16(x3[15], x3[12]);
+ x4[15] = _mm_adds_epi16(x3[15], x3[12]);
+ x4[13] = _mm_subs_epi16(x3[14], x3[13]);
+ x4[14] = _mm_adds_epi16(x3[14], x3[13]);
+ x4[16] = x3[16];
+ x4[17] = x3[17];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[18], x3[29], x4[18], x4[29]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[19], x3[28], x4[19], x4[28]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[20], x3[27], x4[20], x4[27]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[21], x3[26], x4[21], x4[26]);
+ x4[22] = x3[22];
+ x4[23] = x3[23];
+ x4[24] = x3[24];
+ x4[25] = x3[25];
+ x4[30] = x3[30];
+ x4[31] = x3[31];
+
+ // stage 5
+ __m128i x5[32];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x4[0], x4[1], x5[0], x5[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x4[2], x4[3], x5[2], x5[3]);
+ x5[4] = _mm_adds_epi16(x4[4], x4[5]);
+ x5[5] = _mm_subs_epi16(x4[4], x4[5]);
+ x5[6] = _mm_subs_epi16(x4[7], x4[6]);
+ x5[7] = _mm_adds_epi16(x4[7], x4[6]);
+ x5[8] = x4[8];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[9], x4[14], x5[9], x5[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[10], x4[13], x5[10], x5[13]);
+ x5[11] = x4[11];
+ x5[12] = x4[12];
+ x5[15] = x4[15];
+ x5[16] = _mm_adds_epi16(x4[16], x4[19]);
+ x5[19] = _mm_subs_epi16(x4[16], x4[19]);
+ x5[17] = _mm_adds_epi16(x4[17], x4[18]);
+ x5[18] = _mm_subs_epi16(x4[17], x4[18]);
+ x5[20] = _mm_subs_epi16(x4[23], x4[20]);
+ x5[23] = _mm_adds_epi16(x4[23], x4[20]);
+ x5[21] = _mm_subs_epi16(x4[22], x4[21]);
+ x5[22] = _mm_adds_epi16(x4[22], x4[21]);
+ x5[24] = _mm_adds_epi16(x4[24], x4[27]);
+ x5[27] = _mm_subs_epi16(x4[24], x4[27]);
+ x5[25] = _mm_adds_epi16(x4[25], x4[26]);
+ x5[26] = _mm_subs_epi16(x4[25], x4[26]);
+ x5[28] = _mm_subs_epi16(x4[31], x4[28]);
+ x5[31] = _mm_adds_epi16(x4[31], x4[28]);
+ x5[29] = _mm_subs_epi16(x4[30], x4[29]);
+ x5[30] = _mm_adds_epi16(x4[30], x4[29]);
+
+ // stage 6
+ __m128i x6[32];
+ x6[0] = x5[0];
+ x6[1] = x5[1];
+ x6[2] = x5[2];
+ x6[3] = x5[3];
+ btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x5[4], x5[7], x6[4], x6[7]);
+ btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x5[5], x5[6], x6[5], x6[6]);
+ x6[8] = _mm_adds_epi16(x5[8], x5[9]);
+ x6[9] = _mm_subs_epi16(x5[8], x5[9]);
+ x6[10] = _mm_subs_epi16(x5[11], x5[10]);
+ x6[11] = _mm_adds_epi16(x5[11], x5[10]);
+ x6[12] = _mm_adds_epi16(x5[12], x5[13]);
+ x6[13] = _mm_subs_epi16(x5[12], x5[13]);
+ x6[14] = _mm_subs_epi16(x5[15], x5[14]);
+ x6[15] = _mm_adds_epi16(x5[15], x5[14]);
+ x6[16] = x5[16];
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[17], x5[30], x6[17], x6[30]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[18], x5[29], x6[18], x6[29]);
+ x6[19] = x5[19];
+ x6[20] = x5[20];
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[21], x5[26], x6[21], x6[26]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[22], x5[25], x6[22], x6[25]);
+ x6[23] = x5[23];
+ x6[24] = x5[24];
+ x6[27] = x5[27];
+ x6[28] = x5[28];
+ x6[31] = x5[31];
+
+ // stage 7
+ __m128i x7[32];
+ x7[0] = x6[0];
+ x7[1] = x6[1];
+ x7[2] = x6[2];
+ x7[3] = x6[3];
+ x7[4] = x6[4];
+ x7[5] = x6[5];
+ x7[6] = x6[6];
+ x7[7] = x6[7];
+ btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x6[8], x6[15], x7[8], x7[15]);
+ btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x6[9], x6[14], x7[9], x7[14]);
+ btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x6[10], x6[13], x7[10], x7[13]);
+ btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x6[11], x6[12], x7[11], x7[12]);
+ x7[16] = _mm_adds_epi16(x6[16], x6[17]);
+ x7[17] = _mm_subs_epi16(x6[16], x6[17]);
+ x7[18] = _mm_subs_epi16(x6[19], x6[18]);
+ x7[19] = _mm_adds_epi16(x6[19], x6[18]);
+ x7[20] = _mm_adds_epi16(x6[20], x6[21]);
+ x7[21] = _mm_subs_epi16(x6[20], x6[21]);
+ x7[22] = _mm_subs_epi16(x6[23], x6[22]);
+ x7[23] = _mm_adds_epi16(x6[23], x6[22]);
+ x7[24] = _mm_adds_epi16(x6[24], x6[25]);
+ x7[25] = _mm_subs_epi16(x6[24], x6[25]);
+ x7[26] = _mm_subs_epi16(x6[27], x6[26]);
+ x7[27] = _mm_adds_epi16(x6[27], x6[26]);
+ x7[28] = _mm_adds_epi16(x6[28], x6[29]);
+ x7[29] = _mm_subs_epi16(x6[28], x6[29]);
+ x7[30] = _mm_subs_epi16(x6[31], x6[30]);
+ x7[31] = _mm_adds_epi16(x6[31], x6[30]);
+
+ // stage 8
+ __m128i x8[32];
+ x8[0] = x7[0];
+ x8[1] = x7[1];
+ x8[2] = x7[2];
+ x8[3] = x7[3];
+ x8[4] = x7[4];
+ x8[5] = x7[5];
+ x8[6] = x7[6];
+ x8[7] = x7[7];
+ x8[8] = x7[8];
+ x8[9] = x7[9];
+ x8[10] = x7[10];
+ x8[11] = x7[11];
+ x8[12] = x7[12];
+ x8[13] = x7[13];
+ x8[14] = x7[14];
+ x8[15] = x7[15];
+ btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x7[16], x7[31], x8[16], x8[31]);
+ btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x7[17], x7[30], x8[17], x8[30]);
+ btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x7[18], x7[29], x8[18], x8[29]);
+ btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x7[19], x7[28], x8[19], x8[28]);
+ btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x7[20], x7[27], x8[20], x8[27]);
+ btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x7[21], x7[26], x8[21], x8[26]);
+ btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x7[22], x7[25], x8[22], x8[25]);
+ btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x7[23], x7[24], x8[23], x8[24]);
+
+ // stage 9
+ output[0] = x8[0];
+ output[1] = x8[16];
+ output[2] = x8[8];
+ output[3] = x8[24];
+ output[4] = x8[4];
+ output[5] = x8[20];
+ output[6] = x8[12];
+ output[7] = x8[28];
+ output[8] = x8[2];
+ output[9] = x8[18];
+ output[10] = x8[10];
+ output[11] = x8[26];
+ output[12] = x8[6];
+ output[13] = x8[22];
+ output[14] = x8[14];
+ output[15] = x8[30];
+ output[16] = x8[1];
+ output[17] = x8[17];
+ output[18] = x8[9];
+ output[19] = x8[25];
+ output[20] = x8[5];
+ output[21] = x8[21];
+ output[22] = x8[13];
+ output[23] = x8[29];
+ output[24] = x8[3];
+ output[25] = x8[19];
+ output[26] = x8[11];
+ output[27] = x8[27];
+ output[28] = x8[7];
+ output[29] = x8[23];
+ output[30] = x8[15];
+ output[31] = x8[31];
+}
+
+void fdct8x64_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+ __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+ __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+ __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]);
+ __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
+ __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]);
+ __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
+ __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]);
+ __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]);
+ __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]);
+ __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]);
+ __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]);
+ __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]);
+ __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]);
+ __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]);
+ __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]);
+ __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]);
+ __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]);
+ __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]);
+ __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]);
+ __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]);
+ __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]);
+ __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]);
+ __m128i cospi_p63_p01 = pair_set_epi16(cospi[63], cospi[1]);
+ __m128i cospi_m01_p63 = pair_set_epi16(-cospi[1], cospi[63]);
+ __m128i cospi_p31_p33 = pair_set_epi16(cospi[31], cospi[33]);
+ __m128i cospi_m33_p31 = pair_set_epi16(-cospi[33], cospi[31]);
+ __m128i cospi_p47_p17 = pair_set_epi16(cospi[47], cospi[17]);
+ __m128i cospi_m17_p47 = pair_set_epi16(-cospi[17], cospi[47]);
+ __m128i cospi_p15_p49 = pair_set_epi16(cospi[15], cospi[49]);
+ __m128i cospi_m49_p15 = pair_set_epi16(-cospi[49], cospi[15]);
+ __m128i cospi_p55_p09 = pair_set_epi16(cospi[55], cospi[9]);
+ __m128i cospi_m09_p55 = pair_set_epi16(-cospi[9], cospi[55]);
+ __m128i cospi_p23_p41 = pair_set_epi16(cospi[23], cospi[41]);
+ __m128i cospi_m41_p23 = pair_set_epi16(-cospi[41], cospi[23]);
+ __m128i cospi_p39_p25 = pair_set_epi16(cospi[39], cospi[25]);
+ __m128i cospi_m25_p39 = pair_set_epi16(-cospi[25], cospi[39]);
+ __m128i cospi_p07_p57 = pair_set_epi16(cospi[7], cospi[57]);
+ __m128i cospi_m57_p07 = pair_set_epi16(-cospi[57], cospi[7]);
+ __m128i cospi_p59_p05 = pair_set_epi16(cospi[59], cospi[5]);
+ __m128i cospi_m05_p59 = pair_set_epi16(-cospi[5], cospi[59]);
+ __m128i cospi_p27_p37 = pair_set_epi16(cospi[27], cospi[37]);
+ __m128i cospi_m37_p27 = pair_set_epi16(-cospi[37], cospi[27]);
+ __m128i cospi_p43_p21 = pair_set_epi16(cospi[43], cospi[21]);
+ __m128i cospi_m21_p43 = pair_set_epi16(-cospi[21], cospi[43]);
+ __m128i cospi_p11_p53 = pair_set_epi16(cospi[11], cospi[53]);
+ __m128i cospi_m53_p11 = pair_set_epi16(-cospi[53], cospi[11]);
+ __m128i cospi_p51_p13 = pair_set_epi16(cospi[51], cospi[13]);
+ __m128i cospi_m13_p51 = pair_set_epi16(-cospi[13], cospi[51]);
+ __m128i cospi_p19_p45 = pair_set_epi16(cospi[19], cospi[45]);
+ __m128i cospi_m45_p19 = pair_set_epi16(-cospi[45], cospi[19]);
+ __m128i cospi_p35_p29 = pair_set_epi16(cospi[35], cospi[29]);
+ __m128i cospi_m29_p35 = pair_set_epi16(-cospi[29], cospi[35]);
+ __m128i cospi_p03_p61 = pair_set_epi16(cospi[3], cospi[61]);
+ __m128i cospi_m61_p03 = pair_set_epi16(-cospi[61], cospi[3]);
+
+ // stage 1
+ __m128i x1[64];
+ x1[0] = _mm_adds_epi16(input[0], input[63]);
+ x1[63] = _mm_subs_epi16(input[0], input[63]);
+ x1[1] = _mm_adds_epi16(input[1], input[62]);
+ x1[62] = _mm_subs_epi16(input[1], input[62]);
+ x1[2] = _mm_adds_epi16(input[2], input[61]);
+ x1[61] = _mm_subs_epi16(input[2], input[61]);
+ x1[3] = _mm_adds_epi16(input[3], input[60]);
+ x1[60] = _mm_subs_epi16(input[3], input[60]);
+ x1[4] = _mm_adds_epi16(input[4], input[59]);
+ x1[59] = _mm_subs_epi16(input[4], input[59]);
+ x1[5] = _mm_adds_epi16(input[5], input[58]);
+ x1[58] = _mm_subs_epi16(input[5], input[58]);
+ x1[6] = _mm_adds_epi16(input[6], input[57]);
+ x1[57] = _mm_subs_epi16(input[6], input[57]);
+ x1[7] = _mm_adds_epi16(input[7], input[56]);
+ x1[56] = _mm_subs_epi16(input[7], input[56]);
+ x1[8] = _mm_adds_epi16(input[8], input[55]);
+ x1[55] = _mm_subs_epi16(input[8], input[55]);
+ x1[9] = _mm_adds_epi16(input[9], input[54]);
+ x1[54] = _mm_subs_epi16(input[9], input[54]);
+ x1[10] = _mm_adds_epi16(input[10], input[53]);
+ x1[53] = _mm_subs_epi16(input[10], input[53]);
+ x1[11] = _mm_adds_epi16(input[11], input[52]);
+ x1[52] = _mm_subs_epi16(input[11], input[52]);
+ x1[12] = _mm_adds_epi16(input[12], input[51]);
+ x1[51] = _mm_subs_epi16(input[12], input[51]);
+ x1[13] = _mm_adds_epi16(input[13], input[50]);
+ x1[50] = _mm_subs_epi16(input[13], input[50]);
+ x1[14] = _mm_adds_epi16(input[14], input[49]);
+ x1[49] = _mm_subs_epi16(input[14], input[49]);
+ x1[15] = _mm_adds_epi16(input[15], input[48]);
+ x1[48] = _mm_subs_epi16(input[15], input[48]);
+ x1[16] = _mm_adds_epi16(input[16], input[47]);
+ x1[47] = _mm_subs_epi16(input[16], input[47]);
+ x1[17] = _mm_adds_epi16(input[17], input[46]);
+ x1[46] = _mm_subs_epi16(input[17], input[46]);
+ x1[18] = _mm_adds_epi16(input[18], input[45]);
+ x1[45] = _mm_subs_epi16(input[18], input[45]);
+ x1[19] = _mm_adds_epi16(input[19], input[44]);
+ x1[44] = _mm_subs_epi16(input[19], input[44]);
+ x1[20] = _mm_adds_epi16(input[20], input[43]);
+ x1[43] = _mm_subs_epi16(input[20], input[43]);
+ x1[21] = _mm_adds_epi16(input[21], input[42]);
+ x1[42] = _mm_subs_epi16(input[21], input[42]);
+ x1[22] = _mm_adds_epi16(input[22], input[41]);
+ x1[41] = _mm_subs_epi16(input[22], input[41]);
+ x1[23] = _mm_adds_epi16(input[23], input[40]);
+ x1[40] = _mm_subs_epi16(input[23], input[40]);
+ x1[24] = _mm_adds_epi16(input[24], input[39]);
+ x1[39] = _mm_subs_epi16(input[24], input[39]);
+ x1[25] = _mm_adds_epi16(input[25], input[38]);
+ x1[38] = _mm_subs_epi16(input[25], input[38]);
+ x1[26] = _mm_adds_epi16(input[26], input[37]);
+ x1[37] = _mm_subs_epi16(input[26], input[37]);
+ x1[27] = _mm_adds_epi16(input[27], input[36]);
+ x1[36] = _mm_subs_epi16(input[27], input[36]);
+ x1[28] = _mm_adds_epi16(input[28], input[35]);
+ x1[35] = _mm_subs_epi16(input[28], input[35]);
+ x1[29] = _mm_adds_epi16(input[29], input[34]);
+ x1[34] = _mm_subs_epi16(input[29], input[34]);
+ x1[30] = _mm_adds_epi16(input[30], input[33]);
+ x1[33] = _mm_subs_epi16(input[30], input[33]);
+ x1[31] = _mm_adds_epi16(input[31], input[32]);
+ x1[32] = _mm_subs_epi16(input[31], input[32]);
+
+ // stage 2
+ __m128i x2[64];
+ x2[0] = _mm_adds_epi16(x1[0], x1[31]);
+ x2[31] = _mm_subs_epi16(x1[0], x1[31]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[30]);
+ x2[30] = _mm_subs_epi16(x1[1], x1[30]);
+ x2[2] = _mm_adds_epi16(x1[2], x1[29]);
+ x2[29] = _mm_subs_epi16(x1[2], x1[29]);
+ x2[3] = _mm_adds_epi16(x1[3], x1[28]);
+ x2[28] = _mm_subs_epi16(x1[3], x1[28]);
+ x2[4] = _mm_adds_epi16(x1[4], x1[27]);
+ x2[27] = _mm_subs_epi16(x1[4], x1[27]);
+ x2[5] = _mm_adds_epi16(x1[5], x1[26]);
+ x2[26] = _mm_subs_epi16(x1[5], x1[26]);
+ x2[6] = _mm_adds_epi16(x1[6], x1[25]);
+ x2[25] = _mm_subs_epi16(x1[6], x1[25]);
+ x2[7] = _mm_adds_epi16(x1[7], x1[24]);
+ x2[24] = _mm_subs_epi16(x1[7], x1[24]);
+ x2[8] = _mm_adds_epi16(x1[8], x1[23]);
+ x2[23] = _mm_subs_epi16(x1[8], x1[23]);
+ x2[9] = _mm_adds_epi16(x1[9], x1[22]);
+ x2[22] = _mm_subs_epi16(x1[9], x1[22]);
+ x2[10] = _mm_adds_epi16(x1[10], x1[21]);
+ x2[21] = _mm_subs_epi16(x1[10], x1[21]);
+ x2[11] = _mm_adds_epi16(x1[11], x1[20]);
+ x2[20] = _mm_subs_epi16(x1[11], x1[20]);
+ x2[12] = _mm_adds_epi16(x1[12], x1[19]);
+ x2[19] = _mm_subs_epi16(x1[12], x1[19]);
+ x2[13] = _mm_adds_epi16(x1[13], x1[18]);
+ x2[18] = _mm_subs_epi16(x1[13], x1[18]);
+ x2[14] = _mm_adds_epi16(x1[14], x1[17]);
+ x2[17] = _mm_subs_epi16(x1[14], x1[17]);
+ x2[15] = _mm_adds_epi16(x1[15], x1[16]);
+ x2[16] = _mm_subs_epi16(x1[15], x1[16]);
+ x2[32] = x1[32];
+ x2[33] = x1[33];
+ x2[34] = x1[34];
+ x2[35] = x1[35];
+ x2[36] = x1[36];
+ x2[37] = x1[37];
+ x2[38] = x1[38];
+ x2[39] = x1[39];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[40], x1[55], x2[40], x2[55]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[41], x1[54], x2[41], x2[54]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[42], x1[53], x2[42], x2[53]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[43], x1[52], x2[43], x2[52]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[44], x1[51], x2[44], x2[51]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[45], x1[50], x2[45], x2[50]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[46], x1[49], x2[46], x2[49]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[47], x1[48], x2[47], x2[48]);
+ x2[56] = x1[56];
+ x2[57] = x1[57];
+ x2[58] = x1[58];
+ x2[59] = x1[59];
+ x2[60] = x1[60];
+ x2[61] = x1[61];
+ x2[62] = x1[62];
+ x2[63] = x1[63];
+
+ // stage 3
+ __m128i x3[64];
+ x3[0] = _mm_adds_epi16(x2[0], x2[15]);
+ x3[15] = _mm_subs_epi16(x2[0], x2[15]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[14]);
+ x3[14] = _mm_subs_epi16(x2[1], x2[14]);
+ x3[2] = _mm_adds_epi16(x2[2], x2[13]);
+ x3[13] = _mm_subs_epi16(x2[2], x2[13]);
+ x3[3] = _mm_adds_epi16(x2[3], x2[12]);
+ x3[12] = _mm_subs_epi16(x2[3], x2[12]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[11]);
+ x3[11] = _mm_subs_epi16(x2[4], x2[11]);
+ x3[5] = _mm_adds_epi16(x2[5], x2[10]);
+ x3[10] = _mm_subs_epi16(x2[5], x2[10]);
+ x3[6] = _mm_adds_epi16(x2[6], x2[9]);
+ x3[9] = _mm_subs_epi16(x2[6], x2[9]);
+ x3[7] = _mm_adds_epi16(x2[7], x2[8]);
+ x3[8] = _mm_subs_epi16(x2[7], x2[8]);
+ x3[16] = x2[16];
+ x3[17] = x2[17];
+ x3[18] = x2[18];
+ x3[19] = x2[19];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[20], x2[27], x3[20], x3[27]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[21], x2[26], x3[21], x3[26]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[22], x2[25], x3[22], x3[25]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[23], x2[24], x3[23], x3[24]);
+ x3[28] = x2[28];
+ x3[29] = x2[29];
+ x3[30] = x2[30];
+ x3[31] = x2[31];
+ x3[32] = _mm_adds_epi16(x2[32], x2[47]);
+ x3[47] = _mm_subs_epi16(x2[32], x2[47]);
+ x3[33] = _mm_adds_epi16(x2[33], x2[46]);
+ x3[46] = _mm_subs_epi16(x2[33], x2[46]);
+ x3[34] = _mm_adds_epi16(x2[34], x2[45]);
+ x3[45] = _mm_subs_epi16(x2[34], x2[45]);
+ x3[35] = _mm_adds_epi16(x2[35], x2[44]);
+ x3[44] = _mm_subs_epi16(x2[35], x2[44]);
+ x3[36] = _mm_adds_epi16(x2[36], x2[43]);
+ x3[43] = _mm_subs_epi16(x2[36], x2[43]);
+ x3[37] = _mm_adds_epi16(x2[37], x2[42]);
+ x3[42] = _mm_subs_epi16(x2[37], x2[42]);
+ x3[38] = _mm_adds_epi16(x2[38], x2[41]);
+ x3[41] = _mm_subs_epi16(x2[38], x2[41]);
+ x3[39] = _mm_adds_epi16(x2[39], x2[40]);
+ x3[40] = _mm_subs_epi16(x2[39], x2[40]);
+ x3[48] = _mm_subs_epi16(x2[63], x2[48]);
+ x3[63] = _mm_adds_epi16(x2[63], x2[48]);
+ x3[49] = _mm_subs_epi16(x2[62], x2[49]);
+ x3[62] = _mm_adds_epi16(x2[62], x2[49]);
+ x3[50] = _mm_subs_epi16(x2[61], x2[50]);
+ x3[61] = _mm_adds_epi16(x2[61], x2[50]);
+ x3[51] = _mm_subs_epi16(x2[60], x2[51]);
+ x3[60] = _mm_adds_epi16(x2[60], x2[51]);
+ x3[52] = _mm_subs_epi16(x2[59], x2[52]);
+ x3[59] = _mm_adds_epi16(x2[59], x2[52]);
+ x3[53] = _mm_subs_epi16(x2[58], x2[53]);
+ x3[58] = _mm_adds_epi16(x2[58], x2[53]);
+ x3[54] = _mm_subs_epi16(x2[57], x2[54]);
+ x3[57] = _mm_adds_epi16(x2[57], x2[54]);
+ x3[55] = _mm_subs_epi16(x2[56], x2[55]);
+ x3[56] = _mm_adds_epi16(x2[56], x2[55]);
+
+ // stage 4
+ __m128i x4[64];
+ x4[0] = _mm_adds_epi16(x3[0], x3[7]);
+ x4[7] = _mm_subs_epi16(x3[0], x3[7]);
+ x4[1] = _mm_adds_epi16(x3[1], x3[6]);
+ x4[6] = _mm_subs_epi16(x3[1], x3[6]);
+ x4[2] = _mm_adds_epi16(x3[2], x3[5]);
+ x4[5] = _mm_subs_epi16(x3[2], x3[5]);
+ x4[3] = _mm_adds_epi16(x3[3], x3[4]);
+ x4[4] = _mm_subs_epi16(x3[3], x3[4]);
+ x4[8] = x3[8];
+ x4[9] = x3[9];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[10], x3[13], x4[10], x4[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[11], x3[12], x4[11], x4[12]);
+ x4[14] = x3[14];
+ x4[15] = x3[15];
+ x4[16] = _mm_adds_epi16(x3[16], x3[23]);
+ x4[23] = _mm_subs_epi16(x3[16], x3[23]);
+ x4[17] = _mm_adds_epi16(x3[17], x3[22]);
+ x4[22] = _mm_subs_epi16(x3[17], x3[22]);
+ x4[18] = _mm_adds_epi16(x3[18], x3[21]);
+ x4[21] = _mm_subs_epi16(x3[18], x3[21]);
+ x4[19] = _mm_adds_epi16(x3[19], x3[20]);
+ x4[20] = _mm_subs_epi16(x3[19], x3[20]);
+ x4[24] = _mm_subs_epi16(x3[31], x3[24]);
+ x4[31] = _mm_adds_epi16(x3[31], x3[24]);
+ x4[25] = _mm_subs_epi16(x3[30], x3[25]);
+ x4[30] = _mm_adds_epi16(x3[30], x3[25]);
+ x4[26] = _mm_subs_epi16(x3[29], x3[26]);
+ x4[29] = _mm_adds_epi16(x3[29], x3[26]);
+ x4[27] = _mm_subs_epi16(x3[28], x3[27]);
+ x4[28] = _mm_adds_epi16(x3[28], x3[27]);
+ x4[32] = x3[32];
+ x4[33] = x3[33];
+ x4[34] = x3[34];
+ x4[35] = x3[35];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[36], x3[59], x4[36], x4[59]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[37], x3[58], x4[37], x4[58]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[38], x3[57], x4[38], x4[57]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[39], x3[56], x4[39], x4[56]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[40], x3[55], x4[40], x4[55]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[41], x3[54], x4[41], x4[54]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[42], x3[53], x4[42], x4[53]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[43], x3[52], x4[43], x4[52]);
+ x4[44] = x3[44];
+ x4[45] = x3[45];
+ x4[46] = x3[46];
+ x4[47] = x3[47];
+ x4[48] = x3[48];
+ x4[49] = x3[49];
+ x4[50] = x3[50];
+ x4[51] = x3[51];
+ x4[60] = x3[60];
+ x4[61] = x3[61];
+ x4[62] = x3[62];
+ x4[63] = x3[63];
+
+ // stage 5
+ __m128i x5[64];
+ x5[0] = _mm_adds_epi16(x4[0], x4[3]);
+ x5[3] = _mm_subs_epi16(x4[0], x4[3]);
+ x5[1] = _mm_adds_epi16(x4[1], x4[2]);
+ x5[2] = _mm_subs_epi16(x4[1], x4[2]);
+ x5[4] = x4[4];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x4[5], x4[6], x5[5], x5[6]);
+ x5[7] = x4[7];
+ x5[8] = _mm_adds_epi16(x4[8], x4[11]);
+ x5[11] = _mm_subs_epi16(x4[8], x4[11]);
+ x5[9] = _mm_adds_epi16(x4[9], x4[10]);
+ x5[10] = _mm_subs_epi16(x4[9], x4[10]);
+ x5[12] = _mm_subs_epi16(x4[15], x4[12]);
+ x5[15] = _mm_adds_epi16(x4[15], x4[12]);
+ x5[13] = _mm_subs_epi16(x4[14], x4[13]);
+ x5[14] = _mm_adds_epi16(x4[14], x4[13]);
+ x5[16] = x4[16];
+ x5[17] = x4[17];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[18], x4[29], x5[18], x5[29]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[19], x4[28], x5[19], x5[28]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[20], x4[27], x5[20], x5[27]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[21], x4[26], x5[21], x5[26]);
+ x5[22] = x4[22];
+ x5[23] = x4[23];
+ x5[24] = x4[24];
+ x5[25] = x4[25];
+ x5[30] = x4[30];
+ x5[31] = x4[31];
+ x5[32] = _mm_adds_epi16(x4[32], x4[39]);
+ x5[39] = _mm_subs_epi16(x4[32], x4[39]);
+ x5[33] = _mm_adds_epi16(x4[33], x4[38]);
+ x5[38] = _mm_subs_epi16(x4[33], x4[38]);
+ x5[34] = _mm_adds_epi16(x4[34], x4[37]);
+ x5[37] = _mm_subs_epi16(x4[34], x4[37]);
+ x5[35] = _mm_adds_epi16(x4[35], x4[36]);
+ x5[36] = _mm_subs_epi16(x4[35], x4[36]);
+ x5[40] = _mm_subs_epi16(x4[47], x4[40]);
+ x5[47] = _mm_adds_epi16(x4[47], x4[40]);
+ x5[41] = _mm_subs_epi16(x4[46], x4[41]);
+ x5[46] = _mm_adds_epi16(x4[46], x4[41]);
+ x5[42] = _mm_subs_epi16(x4[45], x4[42]);
+ x5[45] = _mm_adds_epi16(x4[45], x4[42]);
+ x5[43] = _mm_subs_epi16(x4[44], x4[43]);
+ x5[44] = _mm_adds_epi16(x4[44], x4[43]);
+ x5[48] = _mm_adds_epi16(x4[48], x4[55]);
+ x5[55] = _mm_subs_epi16(x4[48], x4[55]);
+ x5[49] = _mm_adds_epi16(x4[49], x4[54]);
+ x5[54] = _mm_subs_epi16(x4[49], x4[54]);
+ x5[50] = _mm_adds_epi16(x4[50], x4[53]);
+ x5[53] = _mm_subs_epi16(x4[50], x4[53]);
+ x5[51] = _mm_adds_epi16(x4[51], x4[52]);
+ x5[52] = _mm_subs_epi16(x4[51], x4[52]);
+ x5[56] = _mm_subs_epi16(x4[63], x4[56]);
+ x5[63] = _mm_adds_epi16(x4[63], x4[56]);
+ x5[57] = _mm_subs_epi16(x4[62], x4[57]);
+ x5[62] = _mm_adds_epi16(x4[62], x4[57]);
+ x5[58] = _mm_subs_epi16(x4[61], x4[58]);
+ x5[61] = _mm_adds_epi16(x4[61], x4[58]);
+ x5[59] = _mm_subs_epi16(x4[60], x4[59]);
+ x5[60] = _mm_adds_epi16(x4[60], x4[59]);
+
+ // stage 6
+ __m128i x6[64];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x5[0], x5[1], x6[0], x6[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x5[2], x5[3], x6[2], x6[3]);
+ x6[4] = _mm_adds_epi16(x5[4], x5[5]);
+ x6[5] = _mm_subs_epi16(x5[4], x5[5]);
+ x6[6] = _mm_subs_epi16(x5[7], x5[6]);
+ x6[7] = _mm_adds_epi16(x5[7], x5[6]);
+ x6[8] = x5[8];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x5[9], x5[14], x6[9], x6[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x5[10], x5[13], x6[10], x6[13]);
+ x6[11] = x5[11];
+ x6[12] = x5[12];
+ x6[15] = x5[15];
+ x6[16] = _mm_adds_epi16(x5[16], x5[19]);
+ x6[19] = _mm_subs_epi16(x5[16], x5[19]);
+ x6[17] = _mm_adds_epi16(x5[17], x5[18]);
+ x6[18] = _mm_subs_epi16(x5[17], x5[18]);
+ x6[20] = _mm_subs_epi16(x5[23], x5[20]);
+ x6[23] = _mm_adds_epi16(x5[23], x5[20]);
+ x6[21] = _mm_subs_epi16(x5[22], x5[21]);
+ x6[22] = _mm_adds_epi16(x5[22], x5[21]);
+ x6[24] = _mm_adds_epi16(x5[24], x5[27]);
+ x6[27] = _mm_subs_epi16(x5[24], x5[27]);
+ x6[25] = _mm_adds_epi16(x5[25], x5[26]);
+ x6[26] = _mm_subs_epi16(x5[25], x5[26]);
+ x6[28] = _mm_subs_epi16(x5[31], x5[28]);
+ x6[31] = _mm_adds_epi16(x5[31], x5[28]);
+ x6[29] = _mm_subs_epi16(x5[30], x5[29]);
+ x6[30] = _mm_adds_epi16(x5[30], x5[29]);
+ x6[32] = x5[32];
+ x6[33] = x5[33];
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[34], x5[61], x6[34], x6[61]);
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[35], x5[60], x6[35], x6[60]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[36], x5[59], x6[36], x6[59]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[37], x5[58], x6[37], x6[58]);
+ x6[38] = x5[38];
+ x6[39] = x5[39];
+ x6[40] = x5[40];
+ x6[41] = x5[41];
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[42], x5[53], x6[42], x6[53]);
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[43], x5[52], x6[43], x6[52]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[44], x5[51], x6[44], x6[51]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[45], x5[50], x6[45], x6[50]);
+ x6[46] = x5[46];
+ x6[47] = x5[47];
+ x6[48] = x5[48];
+ x6[49] = x5[49];
+ x6[54] = x5[54];
+ x6[55] = x5[55];
+ x6[56] = x5[56];
+ x6[57] = x5[57];
+ x6[62] = x5[62];
+ x6[63] = x5[63];
+
+ // stage 7
+ __m128i x7[64];
+ x7[0] = x6[0];
+ x7[1] = x6[1];
+ x7[2] = x6[2];
+ x7[3] = x6[3];
+ btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x6[4], x6[7], x7[4], x7[7]);
+ btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x6[5], x6[6], x7[5], x7[6]);
+ x7[8] = _mm_adds_epi16(x6[8], x6[9]);
+ x7[9] = _mm_subs_epi16(x6[8], x6[9]);
+ x7[10] = _mm_subs_epi16(x6[11], x6[10]);
+ x7[11] = _mm_adds_epi16(x6[11], x6[10]);
+ x7[12] = _mm_adds_epi16(x6[12], x6[13]);
+ x7[13] = _mm_subs_epi16(x6[12], x6[13]);
+ x7[14] = _mm_subs_epi16(x6[15], x6[14]);
+ x7[15] = _mm_adds_epi16(x6[15], x6[14]);
+ x7[16] = x6[16];
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x6[17], x6[30], x7[17], x7[30]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x6[18], x6[29], x7[18], x7[29]);
+ x7[19] = x6[19];
+ x7[20] = x6[20];
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x6[21], x6[26], x7[21], x7[26]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x6[22], x6[25], x7[22], x7[25]);
+ x7[23] = x6[23];
+ x7[24] = x6[24];
+ x7[27] = x6[27];
+ x7[28] = x6[28];
+ x7[31] = x6[31];
+ x7[32] = _mm_adds_epi16(x6[32], x6[35]);
+ x7[35] = _mm_subs_epi16(x6[32], x6[35]);
+ x7[33] = _mm_adds_epi16(x6[33], x6[34]);
+ x7[34] = _mm_subs_epi16(x6[33], x6[34]);
+ x7[36] = _mm_subs_epi16(x6[39], x6[36]);
+ x7[39] = _mm_adds_epi16(x6[39], x6[36]);
+ x7[37] = _mm_subs_epi16(x6[38], x6[37]);
+ x7[38] = _mm_adds_epi16(x6[38], x6[37]);
+ x7[40] = _mm_adds_epi16(x6[40], x6[43]);
+ x7[43] = _mm_subs_epi16(x6[40], x6[43]);
+ x7[41] = _mm_adds_epi16(x6[41], x6[42]);
+ x7[42] = _mm_subs_epi16(x6[41], x6[42]);
+ x7[44] = _mm_subs_epi16(x6[47], x6[44]);
+ x7[47] = _mm_adds_epi16(x6[47], x6[44]);
+ x7[45] = _mm_subs_epi16(x6[46], x6[45]);
+ x7[46] = _mm_adds_epi16(x6[46], x6[45]);
+ x7[48] = _mm_adds_epi16(x6[48], x6[51]);
+ x7[51] = _mm_subs_epi16(x6[48], x6[51]);
+ x7[49] = _mm_adds_epi16(x6[49], x6[50]);
+ x7[50] = _mm_subs_epi16(x6[49], x6[50]);
+ x7[52] = _mm_subs_epi16(x6[55], x6[52]);
+ x7[55] = _mm_adds_epi16(x6[55], x6[52]);
+ x7[53] = _mm_subs_epi16(x6[54], x6[53]);
+ x7[54] = _mm_adds_epi16(x6[54], x6[53]);
+ x7[56] = _mm_adds_epi16(x6[56], x6[59]);
+ x7[59] = _mm_subs_epi16(x6[56], x6[59]);
+ x7[57] = _mm_adds_epi16(x6[57], x6[58]);
+ x7[58] = _mm_subs_epi16(x6[57], x6[58]);
+ x7[60] = _mm_subs_epi16(x6[63], x6[60]);
+ x7[63] = _mm_adds_epi16(x6[63], x6[60]);
+ x7[61] = _mm_subs_epi16(x6[62], x6[61]);
+ x7[62] = _mm_adds_epi16(x6[62], x6[61]);
+
+ // stage 8
+ __m128i x8[64];
+ x8[0] = x7[0];
+ x8[1] = x7[1];
+ x8[2] = x7[2];
+ x8[3] = x7[3];
+ x8[4] = x7[4];
+ x8[5] = x7[5];
+ x8[6] = x7[6];
+ x8[7] = x7[7];
+ btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x7[8], x7[15], x8[8], x8[15]);
+ btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x7[9], x7[14], x8[9], x8[14]);
+ btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x7[10], x7[13], x8[10], x8[13]);
+ btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x7[11], x7[12], x8[11], x8[12]);
+ x8[16] = _mm_adds_epi16(x7[16], x7[17]);
+ x8[17] = _mm_subs_epi16(x7[16], x7[17]);
+ x8[18] = _mm_subs_epi16(x7[19], x7[18]);
+ x8[19] = _mm_adds_epi16(x7[19], x7[18]);
+ x8[20] = _mm_adds_epi16(x7[20], x7[21]);
+ x8[21] = _mm_subs_epi16(x7[20], x7[21]);
+ x8[22] = _mm_subs_epi16(x7[23], x7[22]);
+ x8[23] = _mm_adds_epi16(x7[23], x7[22]);
+ x8[24] = _mm_adds_epi16(x7[24], x7[25]);
+ x8[25] = _mm_subs_epi16(x7[24], x7[25]);
+ x8[26] = _mm_subs_epi16(x7[27], x7[26]);
+ x8[27] = _mm_adds_epi16(x7[27], x7[26]);
+ x8[28] = _mm_adds_epi16(x7[28], x7[29]);
+ x8[29] = _mm_subs_epi16(x7[28], x7[29]);
+ x8[30] = _mm_subs_epi16(x7[31], x7[30]);
+ x8[31] = _mm_adds_epi16(x7[31], x7[30]);
+ x8[32] = x7[32];
+ btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x7[33], x7[62], x8[33], x8[62]);
+ btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x7[34], x7[61], x8[34], x8[61]);
+ x8[35] = x7[35];
+ x8[36] = x7[36];
+ btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x7[37], x7[58], x8[37], x8[58]);
+ btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x7[38], x7[57], x8[38], x8[57]);
+ x8[39] = x7[39];
+ x8[40] = x7[40];
+ btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x7[41], x7[54], x8[41], x8[54]);
+ btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x7[42], x7[53], x8[42], x8[53]);
+ x8[43] = x7[43];
+ x8[44] = x7[44];
+ btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x7[45], x7[50], x8[45], x8[50]);
+ btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x7[46], x7[49], x8[46], x8[49]);
+ x8[47] = x7[47];
+ x8[48] = x7[48];
+ x8[51] = x7[51];
+ x8[52] = x7[52];
+ x8[55] = x7[55];
+ x8[56] = x7[56];
+ x8[59] = x7[59];
+ x8[60] = x7[60];
+ x8[63] = x7[63];
+
+ // stage 9
+ __m128i x9[64];
+ x9[0] = x8[0];
+ x9[1] = x8[1];
+ x9[2] = x8[2];
+ x9[3] = x8[3];
+ x9[4] = x8[4];
+ x9[5] = x8[5];
+ x9[6] = x8[6];
+ x9[7] = x8[7];
+ x9[8] = x8[8];
+ x9[9] = x8[9];
+ x9[10] = x8[10];
+ x9[11] = x8[11];
+ x9[12] = x8[12];
+ x9[13] = x8[13];
+ x9[14] = x8[14];
+ x9[15] = x8[15];
+ btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x8[16], x8[31], x9[16], x9[31]);
+ btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x8[17], x8[30], x9[17], x9[30]);
+ btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x8[18], x8[29], x9[18], x9[29]);
+ btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x8[19], x8[28], x9[19], x9[28]);
+ btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x8[20], x8[27], x9[20], x9[27]);
+ btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x8[21], x8[26], x9[21], x9[26]);
+ btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x8[22], x8[25], x9[22], x9[25]);
+ btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x8[23], x8[24], x9[23], x9[24]);
+ x9[32] = _mm_adds_epi16(x8[32], x8[33]);
+ x9[33] = _mm_subs_epi16(x8[32], x8[33]);
+ x9[34] = _mm_subs_epi16(x8[35], x8[34]);
+ x9[35] = _mm_adds_epi16(x8[35], x8[34]);
+ x9[36] = _mm_adds_epi16(x8[36], x8[37]);
+ x9[37] = _mm_subs_epi16(x8[36], x8[37]);
+ x9[38] = _mm_subs_epi16(x8[39], x8[38]);
+ x9[39] = _mm_adds_epi16(x8[39], x8[38]);
+ x9[40] = _mm_adds_epi16(x8[40], x8[41]);
+ x9[41] = _mm_subs_epi16(x8[40], x8[41]);
+ x9[42] = _mm_subs_epi16(x8[43], x8[42]);
+ x9[43] = _mm_adds_epi16(x8[43], x8[42]);
+ x9[44] = _mm_adds_epi16(x8[44], x8[45]);
+ x9[45] = _mm_subs_epi16(x8[44], x8[45]);
+ x9[46] = _mm_subs_epi16(x8[47], x8[46]);
+ x9[47] = _mm_adds_epi16(x8[47], x8[46]);
+ x9[48] = _mm_adds_epi16(x8[48], x8[49]);
+ x9[49] = _mm_subs_epi16(x8[48], x8[49]);
+ x9[50] = _mm_subs_epi16(x8[51], x8[50]);
+ x9[51] = _mm_adds_epi16(x8[51], x8[50]);
+ x9[52] = _mm_adds_epi16(x8[52], x8[53]);
+ x9[53] = _mm_subs_epi16(x8[52], x8[53]);
+ x9[54] = _mm_subs_epi16(x8[55], x8[54]);
+ x9[55] = _mm_adds_epi16(x8[55], x8[54]);
+ x9[56] = _mm_adds_epi16(x8[56], x8[57]);
+ x9[57] = _mm_subs_epi16(x8[56], x8[57]);
+ x9[58] = _mm_subs_epi16(x8[59], x8[58]);
+ x9[59] = _mm_adds_epi16(x8[59], x8[58]);
+ x9[60] = _mm_adds_epi16(x8[60], x8[61]);
+ x9[61] = _mm_subs_epi16(x8[60], x8[61]);
+ x9[62] = _mm_subs_epi16(x8[63], x8[62]);
+ x9[63] = _mm_adds_epi16(x8[63], x8[62]);
+
+ // stage 10
+ __m128i x10[64];
+ x10[0] = x9[0];
+ x10[1] = x9[1];
+ x10[2] = x9[2];
+ x10[3] = x9[3];
+ x10[4] = x9[4];
+ x10[5] = x9[5];
+ x10[6] = x9[6];
+ x10[7] = x9[7];
+ x10[8] = x9[8];
+ x10[9] = x9[9];
+ x10[10] = x9[10];
+ x10[11] = x9[11];
+ x10[12] = x9[12];
+ x10[13] = x9[13];
+ x10[14] = x9[14];
+ x10[15] = x9[15];
+ x10[16] = x9[16];
+ x10[17] = x9[17];
+ x10[18] = x9[18];
+ x10[19] = x9[19];
+ x10[20] = x9[20];
+ x10[21] = x9[21];
+ x10[22] = x9[22];
+ x10[23] = x9[23];
+ x10[24] = x9[24];
+ x10[25] = x9[25];
+ x10[26] = x9[26];
+ x10[27] = x9[27];
+ x10[28] = x9[28];
+ x10[29] = x9[29];
+ x10[30] = x9[30];
+ x10[31] = x9[31];
+ btf_16_sse2(cospi_p63_p01, cospi_m01_p63, x9[32], x9[63], x10[32], x10[63]);
+ btf_16_sse2(cospi_p31_p33, cospi_m33_p31, x9[33], x9[62], x10[33], x10[62]);
+ btf_16_sse2(cospi_p47_p17, cospi_m17_p47, x9[34], x9[61], x10[34], x10[61]);
+ btf_16_sse2(cospi_p15_p49, cospi_m49_p15, x9[35], x9[60], x10[35], x10[60]);
+ btf_16_sse2(cospi_p55_p09, cospi_m09_p55, x9[36], x9[59], x10[36], x10[59]);
+ btf_16_sse2(cospi_p23_p41, cospi_m41_p23, x9[37], x9[58], x10[37], x10[58]);
+ btf_16_sse2(cospi_p39_p25, cospi_m25_p39, x9[38], x9[57], x10[38], x10[57]);
+ btf_16_sse2(cospi_p07_p57, cospi_m57_p07, x9[39], x9[56], x10[39], x10[56]);
+ btf_16_sse2(cospi_p59_p05, cospi_m05_p59, x9[40], x9[55], x10[40], x10[55]);
+ btf_16_sse2(cospi_p27_p37, cospi_m37_p27, x9[41], x9[54], x10[41], x10[54]);
+ btf_16_sse2(cospi_p43_p21, cospi_m21_p43, x9[42], x9[53], x10[42], x10[53]);
+ btf_16_sse2(cospi_p11_p53, cospi_m53_p11, x9[43], x9[52], x10[43], x10[52]);
+ btf_16_sse2(cospi_p51_p13, cospi_m13_p51, x9[44], x9[51], x10[44], x10[51]);
+ btf_16_sse2(cospi_p19_p45, cospi_m45_p19, x9[45], x9[50], x10[45], x10[50]);
+ btf_16_sse2(cospi_p35_p29, cospi_m29_p35, x9[46], x9[49], x10[46], x10[49]);
+ btf_16_sse2(cospi_p03_p61, cospi_m61_p03, x9[47], x9[48], x10[47], x10[48]);
+
+ // stage 11
+ output[0] = x10[0];
+ output[1] = x10[32];
+ output[2] = x10[16];
+ output[3] = x10[48];
+ output[4] = x10[8];
+ output[5] = x10[40];
+ output[6] = x10[24];
+ output[7] = x10[56];
+ output[8] = x10[4];
+ output[9] = x10[36];
+ output[10] = x10[20];
+ output[11] = x10[52];
+ output[12] = x10[12];
+ output[13] = x10[44];
+ output[14] = x10[28];
+ output[15] = x10[60];
+ output[16] = x10[2];
+ output[17] = x10[34];
+ output[18] = x10[18];
+ output[19] = x10[50];
+ output[20] = x10[10];
+ output[21] = x10[42];
+ output[22] = x10[26];
+ output[23] = x10[58];
+ output[24] = x10[6];
+ output[25] = x10[38];
+ output[26] = x10[22];
+ output[27] = x10[54];
+ output[28] = x10[14];
+ output[29] = x10[46];
+ output[30] = x10[30];
+ output[31] = x10[62];
+ output[32] = x10[1];
+ output[33] = x10[33];
+ output[34] = x10[17];
+ output[35] = x10[49];
+ output[36] = x10[9];
+ output[37] = x10[41];
+ output[38] = x10[25];
+ output[39] = x10[57];
+ output[40] = x10[5];
+ output[41] = x10[37];
+ output[42] = x10[21];
+ output[43] = x10[53];
+ output[44] = x10[13];
+ output[45] = x10[45];
+ output[46] = x10[29];
+ output[47] = x10[61];
+ output[48] = x10[3];
+ output[49] = x10[35];
+ output[50] = x10[19];
+ output[51] = x10[51];
+ output[52] = x10[11];
+ output[53] = x10[43];
+ output[54] = x10[27];
+ output[55] = x10[59];
+ output[56] = x10[7];
+ output[57] = x10[39];
+ output[58] = x10[23];
+ output[59] = x10[55];
+ output[60] = x10[15];
+ output[61] = x10[47];
+ output[62] = x10[31];
+ output[63] = x10[63];
+}
+
+static void fadst4x4_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *sinpi = sinpi_arr(cos_bit);
+ const __m128i sinpi_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]);
+ const __m128i sinpi_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]);
+ const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]);
+ const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]);
+ const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]);
+ const __m128i __zero = _mm_set1_epi16(0);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+ const __m128i in7 = _mm_add_epi16(input[0], input[1]);
+ __m128i u[8], v[8];
+
+ u[0] = _mm_unpacklo_epi16(input[0], input[1]);
+ u[1] = _mm_unpacklo_epi16(input[2], input[3]);
+ u[2] = _mm_unpacklo_epi16(in7, __zero);
+ u[3] = _mm_unpacklo_epi16(input[2], __zero);
+ u[4] = _mm_unpacklo_epi16(input[3], __zero);
+
+ v[0] = _mm_madd_epi16(u[0], sinpi_p01_p02); // s0 + s2
+ v[1] = _mm_madd_epi16(u[1], sinpi_p03_p04); // s4 + s5
+ v[2] = _mm_madd_epi16(u[2], sinpi_p03_p03); // x1
+ v[3] = _mm_madd_epi16(u[0], sinpi_p04_m01); // s1 - s3
+ v[4] = _mm_madd_epi16(u[1], sinpi_m03_p02); // -s4 + s6
+ v[5] = _mm_madd_epi16(u[3], sinpi_p03_p03); // s4
+ v[6] = _mm_madd_epi16(u[4], sinpi_p03_p03);
+
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[1] = _mm_sub_epi32(v[2], v[6]);
+ u[2] = _mm_add_epi32(v[3], v[4]);
+ u[3] = _mm_sub_epi32(u[2], u[0]);
+ u[4] = _mm_slli_epi32(v[5], 2);
+ u[5] = _mm_sub_epi32(u[4], v[5]);
+ u[6] = _mm_add_epi32(u[3], u[5]);
+
+ v[0] = _mm_add_epi32(u[0], __rounding);
+ v[1] = _mm_add_epi32(u[1], __rounding);
+ v[2] = _mm_add_epi32(u[2], __rounding);
+ v[3] = _mm_add_epi32(u[6], __rounding);
+
+ u[0] = _mm_srai_epi32(v[0], cos_bit);
+ u[1] = _mm_srai_epi32(v[1], cos_bit);
+ u[2] = _mm_srai_epi32(v[2], cos_bit);
+ u[3] = _mm_srai_epi32(v[3], cos_bit);
+
+ output[0] = _mm_packs_epi32(u[0], u[2]);
+ output[1] = _mm_packs_epi32(u[1], u[3]);
+ output[2] = _mm_srli_si128(output[0], 8);
+ output[3] = _mm_srli_si128(output[1], 8);
+}
+
+static void fadst4x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+
+ // stage 1
+ __m128i x1[8];
+ x1[0] = input[0];
+ x1[1] = _mm_subs_epi16(__zero, input[7]);
+ x1[2] = _mm_subs_epi16(__zero, input[3]);
+ x1[3] = input[4];
+ x1[4] = _mm_subs_epi16(__zero, input[1]);
+ x1[5] = input[6];
+ x1[6] = input[2];
+ x1[7] = _mm_subs_epi16(__zero, input[5]);
+
+ // stage 2
+ __m128i x2[8];
+ x2[0] = x1[0];
+ x2[1] = x1[1];
+ btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[2],
+ &x1[3], &x2[2], &x2[3]);
+ x2[4] = x1[4];
+ x2[5] = x1[5];
+ btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[6],
+ &x1[7], &x2[6], &x2[7]);
+
+ // stage 3
+ __m128i x3[8];
+ x3[0] = _mm_adds_epi16(x2[0], x2[2]);
+ x3[2] = _mm_subs_epi16(x2[0], x2[2]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[3]);
+ x3[3] = _mm_subs_epi16(x2[1], x2[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[6]);
+ x3[6] = _mm_subs_epi16(x2[4], x2[6]);
+ x3[5] = _mm_adds_epi16(x2[5], x2[7]);
+ x3[7] = _mm_subs_epi16(x2[5], x2[7]);
+
+ // stage 4
+ __m128i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_w4_sse2(&cospi_p16_p48, &cospi_p48_m16, __rounding, cos_bit, &x3[4],
+ &x3[5], &x4[4], &x4[5]);
+ btf_16_w4_sse2(&cospi_m48_p16, &cospi_p16_p48, __rounding, cos_bit, &x3[6],
+ &x3[7], &x4[6], &x4[7]);
+
+ // stage 5
+ __m128i x5[8];
+ x5[0] = _mm_adds_epi16(x4[0], x4[4]);
+ x5[4] = _mm_subs_epi16(x4[0], x4[4]);
+ x5[1] = _mm_adds_epi16(x4[1], x4[5]);
+ x5[5] = _mm_subs_epi16(x4[1], x4[5]);
+ x5[2] = _mm_adds_epi16(x4[2], x4[6]);
+ x5[6] = _mm_subs_epi16(x4[2], x4[6]);
+ x5[3] = _mm_adds_epi16(x4[3], x4[7]);
+ x5[7] = _mm_subs_epi16(x4[3], x4[7]);
+
+ // stage 6
+ __m128i x6[8];
+ btf_16_w4_sse2(&cospi_p04_p60, &cospi_p60_m04, __rounding, cos_bit, &x5[0],
+ &x5[1], &x6[0], &x6[1]);
+ btf_16_w4_sse2(&cospi_p20_p44, &cospi_p44_m20, __rounding, cos_bit, &x5[2],
+ &x5[3], &x6[2], &x6[3]);
+ btf_16_w4_sse2(&cospi_p36_p28, &cospi_p28_m36, __rounding, cos_bit, &x5[4],
+ &x5[5], &x6[4], &x6[5]);
+ btf_16_w4_sse2(&cospi_p52_p12, &cospi_p12_m52, __rounding, cos_bit, &x5[6],
+ &x5[7], &x6[6], &x6[7]);
+
+ // stage 7
+ output[0] = x6[1];
+ output[1] = x6[6];
+ output[2] = x6[3];
+ output[3] = x6[4];
+ output[4] = x6[5];
+ output[5] = x6[2];
+ output[6] = x6[7];
+ output[7] = x6[0];
+}
+
+static void fadst8x4_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *sinpi = sinpi_arr(cos_bit);
+ const __m128i sinpi_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]);
+ const __m128i sinpi_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]);
+ const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]);
+ const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]);
+ const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]);
+ const __m128i __zero = _mm_set1_epi16(0);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+ const __m128i in7 = _mm_add_epi16(input[0], input[1]);
+ __m128i u_lo[8], u_hi[8], v_lo[8], v_hi[8];
+
+ u_lo[0] = _mm_unpacklo_epi16(input[0], input[1]);
+ u_hi[0] = _mm_unpackhi_epi16(input[0], input[1]);
+ u_lo[1] = _mm_unpacklo_epi16(input[2], input[3]);
+ u_hi[1] = _mm_unpackhi_epi16(input[2], input[3]);
+ u_lo[2] = _mm_unpacklo_epi16(in7, __zero);
+ u_hi[2] = _mm_unpackhi_epi16(in7, __zero);
+ u_lo[3] = _mm_unpacklo_epi16(input[2], __zero);
+ u_hi[3] = _mm_unpackhi_epi16(input[2], __zero);
+ u_lo[4] = _mm_unpacklo_epi16(input[3], __zero);
+ u_hi[4] = _mm_unpackhi_epi16(input[3], __zero);
+
+ v_lo[0] = _mm_madd_epi16(u_lo[0], sinpi_p01_p02); // s0 + s2
+ v_hi[0] = _mm_madd_epi16(u_hi[0], sinpi_p01_p02); // s0 + s2
+ v_lo[1] = _mm_madd_epi16(u_lo[1], sinpi_p03_p04); // s4 + s5
+ v_hi[1] = _mm_madd_epi16(u_hi[1], sinpi_p03_p04); // s4 + s5
+ v_lo[2] = _mm_madd_epi16(u_lo[2], sinpi_p03_p03); // x1
+ v_hi[2] = _mm_madd_epi16(u_hi[2], sinpi_p03_p03); // x1
+ v_lo[3] = _mm_madd_epi16(u_lo[0], sinpi_p04_m01); // s1 - s3
+ v_hi[3] = _mm_madd_epi16(u_hi[0], sinpi_p04_m01); // s1 - s3
+ v_lo[4] = _mm_madd_epi16(u_lo[1], sinpi_m03_p02); // -s4 + s6
+ v_hi[4] = _mm_madd_epi16(u_hi[1], sinpi_m03_p02); // -s4 + s6
+ v_lo[5] = _mm_madd_epi16(u_lo[3], sinpi_p03_p03); // s4
+ v_hi[5] = _mm_madd_epi16(u_hi[3], sinpi_p03_p03); // s4
+ v_lo[6] = _mm_madd_epi16(u_lo[4], sinpi_p03_p03);
+ v_hi[6] = _mm_madd_epi16(u_hi[4], sinpi_p03_p03);
+
+ u_lo[0] = _mm_add_epi32(v_lo[0], v_lo[1]);
+ u_hi[0] = _mm_add_epi32(v_hi[0], v_hi[1]);
+ u_lo[1] = _mm_sub_epi32(v_lo[2], v_lo[6]);
+ u_hi[1] = _mm_sub_epi32(v_hi[2], v_hi[6]);
+ u_lo[2] = _mm_add_epi32(v_lo[3], v_lo[4]);
+ u_hi[2] = _mm_add_epi32(v_hi[3], v_hi[4]);
+ u_lo[3] = _mm_sub_epi32(u_lo[2], u_lo[0]);
+ u_hi[3] = _mm_sub_epi32(u_hi[2], u_hi[0]);
+ u_lo[4] = _mm_slli_epi32(v_lo[5], 2);
+ u_hi[4] = _mm_slli_epi32(v_hi[5], 2);
+ u_lo[5] = _mm_sub_epi32(u_lo[4], v_lo[5]);
+ u_hi[5] = _mm_sub_epi32(u_hi[4], v_hi[5]);
+ u_lo[6] = _mm_add_epi32(u_lo[3], u_lo[5]);
+ u_hi[6] = _mm_add_epi32(u_hi[3], u_hi[5]);
+
+ v_lo[0] = _mm_add_epi32(u_lo[0], __rounding);
+ v_hi[0] = _mm_add_epi32(u_hi[0], __rounding);
+ v_lo[1] = _mm_add_epi32(u_lo[1], __rounding);
+ v_hi[1] = _mm_add_epi32(u_hi[1], __rounding);
+ v_lo[2] = _mm_add_epi32(u_lo[2], __rounding);
+ v_hi[2] = _mm_add_epi32(u_hi[2], __rounding);
+ v_lo[3] = _mm_add_epi32(u_lo[6], __rounding);
+ v_hi[3] = _mm_add_epi32(u_hi[6], __rounding);
+
+ u_lo[0] = _mm_srai_epi32(v_lo[0], cos_bit);
+ u_hi[0] = _mm_srai_epi32(v_hi[0], cos_bit);
+ u_lo[1] = _mm_srai_epi32(v_lo[1], cos_bit);
+ u_hi[1] = _mm_srai_epi32(v_hi[1], cos_bit);
+ u_lo[2] = _mm_srai_epi32(v_lo[2], cos_bit);
+ u_hi[2] = _mm_srai_epi32(v_hi[2], cos_bit);
+ u_lo[3] = _mm_srai_epi32(v_lo[3], cos_bit);
+ u_hi[3] = _mm_srai_epi32(v_hi[3], cos_bit);
+
+ output[0] = _mm_packs_epi32(u_lo[0], u_hi[0]);
+ output[1] = _mm_packs_epi32(u_lo[1], u_hi[1]);
+ output[2] = _mm_packs_epi32(u_lo[2], u_hi[2]);
+ output[3] = _mm_packs_epi32(u_lo[3], u_hi[3]);
+}
+
+static void fadst8x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+
+ // stage 1
+ __m128i x1[8];
+ x1[0] = input[0];
+ x1[1] = _mm_subs_epi16(__zero, input[7]);
+ x1[2] = _mm_subs_epi16(__zero, input[3]);
+ x1[3] = input[4];
+ x1[4] = _mm_subs_epi16(__zero, input[1]);
+ x1[5] = input[6];
+ x1[6] = input[2];
+ x1[7] = _mm_subs_epi16(__zero, input[5]);
+
+ // stage 2
+ __m128i x2[8];
+ x2[0] = x1[0];
+ x2[1] = x1[1];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]);
+ x2[4] = x1[4];
+ x2[5] = x1[5];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]);
+
+ // stage 3
+ __m128i x3[8];
+ x3[0] = _mm_adds_epi16(x2[0], x2[2]);
+ x3[2] = _mm_subs_epi16(x2[0], x2[2]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[3]);
+ x3[3] = _mm_subs_epi16(x2[1], x2[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[6]);
+ x3[6] = _mm_subs_epi16(x2[4], x2[6]);
+ x3[5] = _mm_adds_epi16(x2[5], x2[7]);
+ x3[7] = _mm_subs_epi16(x2[5], x2[7]);
+
+ // stage 4
+ __m128i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]);
+
+ // stage 5
+ __m128i x5[8];
+ x5[0] = _mm_adds_epi16(x4[0], x4[4]);
+ x5[4] = _mm_subs_epi16(x4[0], x4[4]);
+ x5[1] = _mm_adds_epi16(x4[1], x4[5]);
+ x5[5] = _mm_subs_epi16(x4[1], x4[5]);
+ x5[2] = _mm_adds_epi16(x4[2], x4[6]);
+ x5[6] = _mm_subs_epi16(x4[2], x4[6]);
+ x5[3] = _mm_adds_epi16(x4[3], x4[7]);
+ x5[7] = _mm_subs_epi16(x4[3], x4[7]);
+
+ // stage 6
+ __m128i x6[8];
+ btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x5[0], x5[1], x6[0], x6[1]);
+ btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x5[2], x5[3], x6[2], x6[3]);
+ btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x5[4], x5[5], x6[4], x6[5]);
+ btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x5[6], x5[7], x6[6], x6[7]);
+
+ // stage 7
+ output[0] = x6[1];
+ output[1] = x6[6];
+ output[2] = x6[3];
+ output[3] = x6[4];
+ output[4] = x6[5];
+ output[5] = x6[2];
+ output[6] = x6[7];
+ output[7] = x6[0];
+}
+
+static void fadst8x16_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
+ __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
+ __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+ __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+ __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+ __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+ __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+ __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+ __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+ __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+ __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+ __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+ __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+ __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+ __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+ __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+ __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+ __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+
+ // stage 1
+ __m128i x1[16];
+ x1[0] = input[0];
+ x1[1] = _mm_subs_epi16(__zero, input[15]);
+ x1[2] = _mm_subs_epi16(__zero, input[7]);
+ x1[3] = input[8];
+ x1[4] = _mm_subs_epi16(__zero, input[3]);
+ x1[5] = input[12];
+ x1[6] = input[4];
+ x1[7] = _mm_subs_epi16(__zero, input[11]);
+ x1[8] = _mm_subs_epi16(__zero, input[1]);
+ x1[9] = input[14];
+ x1[10] = input[6];
+ x1[11] = _mm_subs_epi16(__zero, input[9]);
+ x1[12] = input[2];
+ x1[13] = _mm_subs_epi16(__zero, input[13]);
+ x1[14] = _mm_subs_epi16(__zero, input[5]);
+ x1[15] = input[10];
+
+ // stage 2
+ __m128i x2[16];
+ x2[0] = x1[0];
+ x2[1] = x1[1];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]);
+ x2[4] = x1[4];
+ x2[5] = x1[5];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]);
+ x2[8] = x1[8];
+ x2[9] = x1[9];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[10], x1[11], x2[10], x2[11]);
+ x2[12] = x1[12];
+ x2[13] = x1[13];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[14], x1[15], x2[14], x2[15]);
+
+ // stage 3
+ __m128i x3[16];
+ x3[0] = _mm_adds_epi16(x2[0], x2[2]);
+ x3[2] = _mm_subs_epi16(x2[0], x2[2]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[3]);
+ x3[3] = _mm_subs_epi16(x2[1], x2[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[6]);
+ x3[6] = _mm_subs_epi16(x2[4], x2[6]);
+ x3[5] = _mm_adds_epi16(x2[5], x2[7]);
+ x3[7] = _mm_subs_epi16(x2[5], x2[7]);
+ x3[8] = _mm_adds_epi16(x2[8], x2[10]);
+ x3[10] = _mm_subs_epi16(x2[8], x2[10]);
+ x3[9] = _mm_adds_epi16(x2[9], x2[11]);
+ x3[11] = _mm_subs_epi16(x2[9], x2[11]);
+ x3[12] = _mm_adds_epi16(x2[12], x2[14]);
+ x3[14] = _mm_subs_epi16(x2[12], x2[14]);
+ x3[13] = _mm_adds_epi16(x2[13], x2[15]);
+ x3[15] = _mm_subs_epi16(x2[13], x2[15]);
+
+ // stage 4
+ __m128i x4[16];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]);
+ x4[8] = x3[8];
+ x4[9] = x3[9];
+ x4[10] = x3[10];
+ x4[11] = x3[11];
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[12], x3[13], x4[12], x4[13]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[14], x3[15], x4[14], x4[15]);
+
+ // stage 5
+ __m128i x5[16];
+ x5[0] = _mm_adds_epi16(x4[0], x4[4]);
+ x5[4] = _mm_subs_epi16(x4[0], x4[4]);
+ x5[1] = _mm_adds_epi16(x4[1], x4[5]);
+ x5[5] = _mm_subs_epi16(x4[1], x4[5]);
+ x5[2] = _mm_adds_epi16(x4[2], x4[6]);
+ x5[6] = _mm_subs_epi16(x4[2], x4[6]);
+ x5[3] = _mm_adds_epi16(x4[3], x4[7]);
+ x5[7] = _mm_subs_epi16(x4[3], x4[7]);
+ x5[8] = _mm_adds_epi16(x4[8], x4[12]);
+ x5[12] = _mm_subs_epi16(x4[8], x4[12]);
+ x5[9] = _mm_adds_epi16(x4[9], x4[13]);
+ x5[13] = _mm_subs_epi16(x4[9], x4[13]);
+ x5[10] = _mm_adds_epi16(x4[10], x4[14]);
+ x5[14] = _mm_subs_epi16(x4[10], x4[14]);
+ x5[11] = _mm_adds_epi16(x4[11], x4[15]);
+ x5[15] = _mm_subs_epi16(x4[11], x4[15]);
+
+ // stage 6
+ __m128i x6[16];
+ x6[0] = x5[0];
+ x6[1] = x5[1];
+ x6[2] = x5[2];
+ x6[3] = x5[3];
+ x6[4] = x5[4];
+ x6[5] = x5[5];
+ x6[6] = x5[6];
+ x6[7] = x5[7];
+ btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x5[8], x5[9], x6[8], x6[9]);
+ btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x5[10], x5[11], x6[10], x6[11]);
+ btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x5[12], x5[13], x6[12], x6[13]);
+ btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x5[14], x5[15], x6[14], x6[15]);
+
+ // stage 7
+ __m128i x7[16];
+ x7[0] = _mm_adds_epi16(x6[0], x6[8]);
+ x7[8] = _mm_subs_epi16(x6[0], x6[8]);
+ x7[1] = _mm_adds_epi16(x6[1], x6[9]);
+ x7[9] = _mm_subs_epi16(x6[1], x6[9]);
+ x7[2] = _mm_adds_epi16(x6[2], x6[10]);
+ x7[10] = _mm_subs_epi16(x6[2], x6[10]);
+ x7[3] = _mm_adds_epi16(x6[3], x6[11]);
+ x7[11] = _mm_subs_epi16(x6[3], x6[11]);
+ x7[4] = _mm_adds_epi16(x6[4], x6[12]);
+ x7[12] = _mm_subs_epi16(x6[4], x6[12]);
+ x7[5] = _mm_adds_epi16(x6[5], x6[13]);
+ x7[13] = _mm_subs_epi16(x6[5], x6[13]);
+ x7[6] = _mm_adds_epi16(x6[6], x6[14]);
+ x7[14] = _mm_subs_epi16(x6[6], x6[14]);
+ x7[7] = _mm_adds_epi16(x6[7], x6[15]);
+ x7[15] = _mm_subs_epi16(x6[7], x6[15]);
+
+ // stage 8
+ __m128i x8[16];
+ btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x7[0], x7[1], x8[0], x8[1]);
+ btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x7[2], x7[3], x8[2], x8[3]);
+ btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x7[4], x7[5], x8[4], x8[5]);
+ btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x7[6], x7[7], x8[6], x8[7]);
+ btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x7[8], x7[9], x8[8], x8[9]);
+ btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x7[10], x7[11], x8[10], x8[11]);
+ btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x7[12], x7[13], x8[12], x8[13]);
+ btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x7[14], x7[15], x8[14], x8[15]);
+
+ // stage 9
+ output[0] = x8[1];
+ output[1] = x8[14];
+ output[2] = x8[3];
+ output[3] = x8[12];
+ output[4] = x8[5];
+ output[5] = x8[10];
+ output[6] = x8[7];
+ output[7] = x8[8];
+ output[8] = x8[9];
+ output[9] = x8[6];
+ output[10] = x8[11];
+ output[11] = x8[4];
+ output[12] = x8[13];
+ output[13] = x8[2];
+ output[14] = x8[15];
+ output[15] = x8[0];
+}
+
+static const transform_1d_sse2 col_txfm4x4_arr[TX_TYPES] = {
+ fdct4x4_new_sse2, // DCT_DCT
+ fadst4x4_new_sse2, // ADST_DCT
+ fdct4x4_new_sse2, // DCT_ADST
+ fadst4x4_new_sse2, // ADST_ADST
+ fadst4x4_new_sse2, // FLIPADST_DCT
+ fdct4x4_new_sse2, // DCT_FLIPADST
+ fadst4x4_new_sse2, // FLIPADST_FLIPADST
+ fadst4x4_new_sse2, // ADST_FLIPADST
+ fadst4x4_new_sse2, // FLIPADST_ADST
+ fidentity4x4_new_sse2, // IDTX
+ fdct4x4_new_sse2, // V_DCT
+ fidentity4x4_new_sse2, // H_DCT
+ fadst4x4_new_sse2, // V_ADST
+ fidentity4x4_new_sse2, // H_ADST
+ fadst4x4_new_sse2, // V_FLIPADST
+ fidentity4x4_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm4x4_arr[TX_TYPES] = {
+ fdct4x4_new_sse2, // DCT_DCT
+ fdct4x4_new_sse2, // ADST_DCT
+ fadst4x4_new_sse2, // DCT_ADST
+ fadst4x4_new_sse2, // ADST_ADST
+ fdct4x4_new_sse2, // FLIPADST_DCT
+ fadst4x4_new_sse2, // DCT_FLIPADST
+ fadst4x4_new_sse2, // FLIPADST_FLIPADST
+ fadst4x4_new_sse2, // ADST_FLIPADST
+ fadst4x4_new_sse2, // FLIPADST_ADST
+ fidentity4x4_new_sse2, // IDTX
+ fidentity4x4_new_sse2, // V_DCT
+ fdct4x4_new_sse2, // H_DCT
+ fidentity4x4_new_sse2, // V_ADST
+ fadst4x4_new_sse2, // H_ADST
+ fidentity4x4_new_sse2, // V_FLIPADST
+ fadst4x4_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm4x8_arr[TX_TYPES] = {
+ fdct4x8_new_sse2, // DCT_DCT
+ fadst4x8_new_sse2, // ADST_DCT
+ fdct4x8_new_sse2, // DCT_ADST
+ fadst4x8_new_sse2, // ADST_ADST
+ fadst4x8_new_sse2, // FLIPADST_DCT
+ fdct4x8_new_sse2, // DCT_FLIPADST
+ fadst4x8_new_sse2, // FLIPADST_FLIPADST
+ fadst4x8_new_sse2, // ADST_FLIPADST
+ fadst4x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fdct4x8_new_sse2, // V_DCT
+ fidentity8x8_new_sse2, // H_DCT
+ fadst4x8_new_sse2, // V_ADST
+ fidentity8x8_new_sse2, // H_ADST
+ fadst4x8_new_sse2, // V_FLIPADST
+ fidentity8x8_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm8x4_arr[TX_TYPES] = {
+ fdct8x4_new_sse2, // DCT_DCT
+ fdct8x4_new_sse2, // ADST_DCT
+ fadst8x4_new_sse2, // DCT_ADST
+ fadst8x4_new_sse2, // ADST_ADST
+ fdct8x4_new_sse2, // FLIPADST_DCT
+ fadst8x4_new_sse2, // DCT_FLIPADST
+ fadst8x4_new_sse2, // FLIPADST_FLIPADST
+ fadst8x4_new_sse2, // ADST_FLIPADST
+ fadst8x4_new_sse2, // FLIPADST_ADST
+ fidentity8x4_new_sse2, // IDTX
+ fidentity8x4_new_sse2, // V_DCT
+ fdct8x4_new_sse2, // H_DCT
+ fidentity8x4_new_sse2, // V_ADST
+ fadst8x4_new_sse2, // H_ADST
+ fidentity8x4_new_sse2, // V_FLIPADST
+ fadst8x4_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x4_arr[TX_TYPES] = {
+ fdct8x4_new_sse2, // DCT_DCT
+ fadst8x4_new_sse2, // ADST_DCT
+ fdct8x4_new_sse2, // DCT_ADST
+ fadst8x4_new_sse2, // ADST_ADST
+ fadst8x4_new_sse2, // FLIPADST_DCT
+ fdct8x4_new_sse2, // DCT_FLIPADST
+ fadst8x4_new_sse2, // FLIPADST_FLIPADST
+ fadst8x4_new_sse2, // ADST_FLIPADST
+ fadst8x4_new_sse2, // FLIPADST_ADST
+ fidentity8x4_new_sse2, // IDTX
+ fdct8x4_new_sse2, // V_DCT
+ fidentity8x4_new_sse2, // H_DCT
+ fadst8x4_new_sse2, // V_ADST
+ fidentity8x4_new_sse2, // H_ADST
+ fadst8x4_new_sse2, // V_FLIPADST
+ fidentity8x4_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm4x8_arr[TX_TYPES] = {
+ fdct4x8_new_sse2, // DCT_DCT
+ fdct4x8_new_sse2, // ADST_DCT
+ fadst4x8_new_sse2, // DCT_ADST
+ fadst4x8_new_sse2, // ADST_ADST
+ fdct4x8_new_sse2, // FLIPADST_DCT
+ fadst4x8_new_sse2, // DCT_FLIPADST
+ fadst4x8_new_sse2, // FLIPADST_FLIPADST
+ fadst4x8_new_sse2, // ADST_FLIPADST
+ fadst4x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fidentity8x8_new_sse2, // V_DCT
+ fdct4x8_new_sse2, // H_DCT
+ fidentity8x8_new_sse2, // V_ADST
+ fadst4x8_new_sse2, // H_ADST
+ fidentity8x8_new_sse2, // V_FLIPADST
+ fadst4x8_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_new_sse2, // DCT_DCT
+ fadst8x8_new_sse2, // ADST_DCT
+ fdct8x8_new_sse2, // DCT_ADST
+ fadst8x8_new_sse2, // ADST_ADST
+ fadst8x8_new_sse2, // FLIPADST_DCT
+ fdct8x8_new_sse2, // DCT_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_FLIPADST
+ fadst8x8_new_sse2, // ADST_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fdct8x8_new_sse2, // V_DCT
+ fidentity8x8_new_sse2, // H_DCT
+ fadst8x8_new_sse2, // V_ADST
+ fidentity8x8_new_sse2, // H_ADST
+ fadst8x8_new_sse2, // V_FLIPADST
+ fidentity8x8_new_sse2, // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_new_sse2, // DCT_DCT
+ fdct8x8_new_sse2, // ADST_DCT
+ fadst8x8_new_sse2, // DCT_ADST
+ fadst8x8_new_sse2, // ADST_ADST
+ fdct8x8_new_sse2, // FLIPADST_DCT
+ fadst8x8_new_sse2, // DCT_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_FLIPADST
+ fadst8x8_new_sse2, // ADST_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fidentity8x8_new_sse2, // V_DCT
+ fdct8x8_new_sse2, // H_DCT
+ fidentity8x8_new_sse2, // V_ADST
+ fadst8x8_new_sse2, // H_ADST
+ fidentity8x8_new_sse2, // V_FLIPADST
+ fadst8x8_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = {
+ fdct8x16_new_sse2, // DCT_DCT
+ fadst8x16_new_sse2, // ADST_DCT
+ fdct8x16_new_sse2, // DCT_ADST
+ fadst8x16_new_sse2, // ADST_ADST
+ fadst8x16_new_sse2, // FLIPADST_DCT
+ fdct8x16_new_sse2, // DCT_FLIPADST
+ fadst8x16_new_sse2, // FLIPADST_FLIPADST
+ fadst8x16_new_sse2, // ADST_FLIPADST
+ fadst8x16_new_sse2, // FLIPADST_ADST
+ fidentity8x16_new_sse2, // IDTX
+ fdct8x16_new_sse2, // V_DCT
+ fidentity8x16_new_sse2, // H_DCT
+ fadst8x16_new_sse2, // V_ADST
+ fidentity8x16_new_sse2, // H_ADST
+ fadst8x16_new_sse2, // V_FLIPADST
+ fidentity8x16_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm8x16_arr[TX_TYPES] = {
+ fdct8x16_new_sse2, // DCT_DCT
+ fdct8x16_new_sse2, // ADST_DCT
+ fadst8x16_new_sse2, // DCT_ADST
+ fadst8x16_new_sse2, // ADST_ADST
+ fdct8x16_new_sse2, // FLIPADST_DCT
+ fadst8x16_new_sse2, // DCT_FLIPADST
+ fadst8x16_new_sse2, // FLIPADST_FLIPADST
+ fadst8x16_new_sse2, // ADST_FLIPADST
+ fadst8x16_new_sse2, // FLIPADST_ADST
+ fidentity8x16_new_sse2, // IDTX
+ fidentity8x16_new_sse2, // V_DCT
+ fdct8x16_new_sse2, // H_DCT
+ fidentity8x16_new_sse2, // V_ADST
+ fadst8x16_new_sse2, // H_ADST
+ fidentity8x16_new_sse2, // V_FLIPADST
+ fadst8x16_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm8x32_arr[TX_TYPES] = {
+ fdct8x32_new_sse2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity8x32_new_sse2, // IDTX
+ fidentity8x32_new_sse2, // V_DCT
+ fdct8x32_new_sse2, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[4], buf1[4], *buf;
+ const int8_t *shift = fwd_txfm_shift_ls[TX_4X4];
+ const int txw_idx = get_txw_idx(TX_4X4);
+ const int txh_idx = get_txh_idx(TX_4X4);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 4;
+ const int height = 4;
+ const transform_1d_sse2 col_txfm = col_txfm4x4_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm4x4_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_4x4(buf0, buf1);
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_4x4(buf, buf);
+ store_buffer_16bit_to_32bit_w4(buf, output, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)stride;
+ (void)bd;
+ __m128i buf0[8], buf1[8], *buf;
+ const int8_t *shift = fwd_txfm_shift_ls[TX_4X8];
+ const int txw_idx = get_txw_idx(TX_4X8);
+ const int txh_idx = get_txh_idx(TX_4X8);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 4;
+ const int height = 8;
+ const transform_1d_sse2 col_txfm = col_txfm4x8_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_4x8(buf0, buf1);
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_8x4(buf, buf);
+ store_rect_buffer_16bit_to_32bit_w4(buf, output, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[16], buf1[16];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_4X16];
+ const int txw_idx = get_txw_idx(TX_4X16);
+ const int txh_idx = get_txh_idx(TX_4X16);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 4;
+ const int height = 16;
+ const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_4x8(buf0, buf1);
+ transpose_16bit_4x8(buf0 + 8, buf1 + 8);
+
+ for (int i = 0; i < 2; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + 8 * i, buf, width);
+ } else {
+ buf = buf1 + 8 * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_8x4(buf, buf);
+ store_buffer_16bit_to_32bit_w4(buf, output + 8 * width * i, width, 8);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[8], buf1[8], *buf;
+ const int8_t *shift = fwd_txfm_shift_ls[TX_8X4];
+ const int txw_idx = get_txw_idx(TX_8X4);
+ const int txh_idx = get_txh_idx(TX_8X4);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 8;
+ const int height = 4;
+ const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm4x8_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip)
+ load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+ else
+ load_buffer_16bit_to_16bit(input, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1);
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_8x8(buf, buf);
+ store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[8], buf1[8], *buf;
+ const int8_t *shift = fwd_txfm_shift_ls[TX_8X8];
+ const int txw_idx = get_txw_idx(TX_8X8);
+ const int txh_idx = get_txh_idx(TX_8X8);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 8;
+ const int height = 8;
+ const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip)
+ load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+ else
+ load_buffer_16bit_to_16bit(input, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1);
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_8x8(buf, buf);
+ store_buffer_16bit_to_32bit_w8(buf, output, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[16], buf1[16];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_8X16];
+ const int txw_idx = get_txw_idx(TX_8X16);
+ const int txh_idx = get_txh_idx(TX_8X16);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 8;
+ const int height = 16;
+ const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1);
+ transpose_16bit_8x8(buf0 + 8, buf1 + 8);
+
+ for (int i = 0; i < 2; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_8x8(buf, buf);
+ store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[32], buf1[32];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_8X32];
+ const int txw_idx = get_txw_idx(TX_8X32);
+ const int txh_idx = get_txh_idx(TX_8X32);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 8;
+ const int height = 32;
+ const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1);
+ transpose_16bit_8x8(buf0 + 8, buf1 + 8);
+ transpose_16bit_8x8(buf0 + 16, buf1 + 16);
+ transpose_16bit_8x8(buf0 + 24, buf1 + 24);
+
+ for (int i = 0; i < 4; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_8x8(buf, buf);
+ store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[16], buf1[16];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_16X4];
+ const int txw_idx = get_txw_idx(TX_16X4);
+ const int txh_idx = get_txh_idx(TX_16X4);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 16;
+ const int height = 4;
+ const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type];
+ __m128i *buf;
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < 2; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x4(buf0, buf1 + 8 * i);
+ }
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_4x8(buf, buf);
+ store_buffer_16bit_to_32bit_w8(buf, output, width, height);
+ transpose_16bit_4x8(buf + 8, buf + 8);
+ store_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[16], buf1[16];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_16X8];
+ const int txw_idx = get_txw_idx(TX_16X8);
+ const int txh_idx = get_txh_idx(TX_16X8);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 16;
+ const int height = 8;
+ const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type];
+ __m128i *buf;
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < 2; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1 + 8 * i);
+ }
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_8x8(buf, buf);
+ store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height);
+ transpose_16bit_8x8(buf + 8, buf + 8);
+ store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[16], buf1[32];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_16X16];
+ const int txw_idx = get_txw_idx(TX_16X16);
+ const int txh_idx = get_txh_idx(TX_16X16);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 16;
+ const int height = 16;
+ const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < 2; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i);
+ }
+
+ for (int i = 0; i < 2; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_8x8(buf, buf);
+ store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
+ transpose_16bit_8x8(buf + 8, buf + 8);
+ store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width,
+ 8);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[32], buf1[64];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_16X32];
+ const int txw_idx = get_txw_idx(TX_16X32);
+ const int txh_idx = get_txh_idx(TX_16X32);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 16;
+ const int height = 32;
+ const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type];
+
+ if (col_txfm != NULL && row_txfm != NULL) {
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 2; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i);
+ }
+
+ for (int i = 0; i < 4; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_8x8(buf, buf);
+ store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width,
+ 8);
+ transpose_16bit_8x8(buf + 8, buf + 8);
+ store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8,
+ width, 8);
+ }
+ } else {
+ av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[32], buf1[32];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_32X8];
+ const int txw_idx = get_txw_idx(TX_32X8);
+ const int txh_idx = get_txh_idx(TX_32X8);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 32;
+ const int height = 8;
+ const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type];
+
+ if (col_txfm != NULL && row_txfm != NULL) {
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 4; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
+ }
+
+ for (int i = 0; i < 1; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_8x8(buf, buf);
+ store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width,
+ height);
+ transpose_16bit_8x8(buf + 8, buf + 8);
+ store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width,
+ height);
+ transpose_16bit_8x8(buf + 16, buf + 16);
+ store_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16,
+ width, height);
+ transpose_16bit_8x8(buf + 24, buf + 24);
+ store_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24,
+ width, height);
+ }
+ } else {
+ av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[32], buf1[64];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_32X16];
+ const int txw_idx = get_txw_idx(TX_32X16);
+ const int txh_idx = get_txh_idx(TX_32X16);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 32;
+ const int height = 16;
+ const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type];
+
+ if (col_txfm != NULL && row_txfm != NULL) {
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 4; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i);
+ }
+
+ for (int i = 0; i < 2; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_8x8(buf, buf);
+ store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width,
+ 8);
+ transpose_16bit_8x8(buf + 8, buf + 8);
+ store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8,
+ width, 8);
+ transpose_16bit_8x8(buf + 16, buf + 16);
+ store_rect_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16,
+ width, 8);
+ transpose_16bit_8x8(buf + 24, buf + 24);
+ store_rect_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24,
+ width, 8);
+ }
+ } else {
+ av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[32], buf1[128];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_32X32];
+ const int txw_idx = get_txw_idx(TX_32X32);
+ const int txh_idx = get_txh_idx(TX_32X32);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 32;
+ const int height = 32;
+ const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type];
+
+ if (col_txfm != NULL && row_txfm != NULL) {
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 4; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i);
+ }
+
+ for (int i = 0; i < 4; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_8x8(buf, buf);
+ store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
+ transpose_16bit_8x8(buf + 8, buf + 8);
+ store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width,
+ 8);
+ transpose_16bit_8x8(buf + 16, buf + 16);
+ store_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16,
+ width, 8);
+ transpose_16bit_8x8(buf + 24, buf + 24);
+ store_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24,
+ width, 8);
+ }
+ } else {
+ av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X16;
+ __m128i buf0[64], buf1[128];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_sse2 col_txfm = fdct8x16_new_sse2;
+ const transform_1d_sse2 row_txfm = fdct8x64_new_sse2;
+ const int width_div8 = (width >> 3);
+ const int height_div8 = (height >> 3);
+
+ for (int i = 0; i < width_div8; i++) {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ for (int j = 0; j < height_div8; ++j) {
+ transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ }
+ }
+
+ for (int i = 0; i < height_div8; i++) {
+ __m128i *buf = buf1 + width * i;
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ int32_t *output8 = output + 8 * 32 * i;
+ for (int j = 0; j < 4; ++j) {
+ __m128i *buf8 = buf + 8 * j;
+ transpose_16bit_8x8(buf8, buf8);
+ store_buffer_16bit_to_32bit_w8(buf8, output8 + 8 * j, 32, 8);
+ }
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_16X64;
+ __m128i buf0[64], buf1[128];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_sse2 col_txfm = fdct8x64_new_sse2;
+ const transform_1d_sse2 row_txfm = fdct8x16_new_sse2;
+ const int width_div8 = (width >> 3);
+ const int height_div8 = (height >> 3);
+
+ for (int i = 0; i < width_div8; i++) {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ for (int j = 0; j < height_div8; ++j) {
+ transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ }
+ }
+
+ for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+ __m128i *buf = buf1 + width * i;
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ int32_t *output8 = output + 8 * width * i;
+ for (int j = 0; j < width_div8; ++j) {
+ __m128i *buf8 = buf + 8 * j;
+ transpose_16bit_8x8(buf8, buf8);
+ store_buffer_16bit_to_32bit_w8(buf8, output8 + 8 * j, width, 8);
+ }
+ }
+ // Zero out the bottom 16x32 area.
+ memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
+}
+
+static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
+ av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform
+ av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform
+ av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform
+ av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform
+ NULL, // 64x64 transform
+ av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform
+ av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform
+ av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform
+ av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform
+ av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform
+ av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform
+ NULL, // 32x64 transform
+ NULL, // 64x32 transform
+ av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform
+ av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform
+ av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform
+ av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform
+ av1_lowbd_fwd_txfm2d_16x64_sse2, // 16x64 transform
+ av1_lowbd_fwd_txfm2d_64x16_sse2, // 64x16 transform
+};
+
+void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size];
+
+ if ((fwd_txfm2d_func == NULL) ||
+ (txfm_param->lossless && txfm_param->tx_size == TX_4X4))
+ av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+ else
+ fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h
new file mode 100644
index 000000000..aa14d3ade
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_
+#define AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void fdct8x32_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit);
+void fdct8x64_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit);
+
+static INLINE void fidentity4x4_new_sse2(const __m128i *const input,
+ __m128i *const output,
+ const int8_t cos_bit) {
+ (void)cos_bit;
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int i = 0; i < 4; ++i) {
+ const __m128i a = _mm_unpacklo_epi16(input[i], one);
+ const __m128i b = scale_round_sse2(a, NewSqrt2);
+ output[i] = _mm_packs_epi32(b, b);
+ }
+}
+
+static INLINE void fidentity8x4_new_sse2(const __m128i *const input,
+ __m128i *const output,
+ const int8_t cos_bit) {
+ (void)cos_bit;
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int i = 0; i < 4; ++i) {
+ const __m128i a_lo = _mm_unpacklo_epi16(input[i], one);
+ const __m128i a_hi = _mm_unpackhi_epi16(input[i], one);
+ const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
+ const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2);
+ output[i] = _mm_packs_epi32(b_lo, b_hi);
+ }
+}
+
+static INLINE void fidentity8x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+
+ output[0] = _mm_adds_epi16(input[0], input[0]);
+ output[1] = _mm_adds_epi16(input[1], input[1]);
+ output[2] = _mm_adds_epi16(input[2], input[2]);
+ output[3] = _mm_adds_epi16(input[3], input[3]);
+ output[4] = _mm_adds_epi16(input[4], input[4]);
+ output[5] = _mm_adds_epi16(input[5], input[5]);
+ output[6] = _mm_adds_epi16(input[6], input[6]);
+ output[7] = _mm_adds_epi16(input[7], input[7]);
+}
+
+static INLINE void fidentity8x16_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int i = 0; i < 16; ++i) {
+ const __m128i a_lo = _mm_unpacklo_epi16(input[i], one);
+ const __m128i a_hi = _mm_unpackhi_epi16(input[i], one);
+ const __m128i b_lo = scale_round_sse2(a_lo, 2 * NewSqrt2);
+ const __m128i b_hi = scale_round_sse2(a_hi, 2 * NewSqrt2);
+ output[i] = _mm_packs_epi32(b_lo, b_hi);
+ }
+}
+
+static INLINE void fidentity8x32_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ for (int i = 0; i < 32; ++i) {
+ output[i] = _mm_slli_epi16(input[i], 2);
+ }
+}
+
+static const transform_1d_sse2 col_txfm8x32_arr[TX_TYPES] = {
+ fdct8x32_new_sse2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity8x32_new_sse2, // IDTX
+ fdct8x32_new_sse2, // V_DCT
+ fidentity8x32_new_sse2, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_
diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c
index c8d4ccb70..b58911fcb 100644
--- a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c
+++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c
@@ -11,7 +11,8 @@
#include <immintrin.h>
-#include "./av1_rtcd.h"
+#include "config/av1_rtcd.h"
+
#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
@@ -32,7 +33,10 @@ static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *dequant_ptr, int log_scale,
__m256i *qp) {
__m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
- round = _mm_srai_epi16(round, log_scale);
+ if (log_scale) {
+ const __m128i round_scale = _mm_set1_epi16(1 << (15 - log_scale));
+ round = _mm_mulhrs_epi16(round, round_scale);
+ }
const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
@@ -45,8 +49,8 @@ static INLINE void quantize(const __m256i *qp, __m256i *c,
const int16_t *iscan_ptr, int log_scale,
tran_low_t *qcoeff, tran_low_t *dqcoeff,
__m256i *eob) {
- const __m256i abs = _mm256_abs_epi32(*c);
- __m256i q = _mm256_add_epi32(abs, qp[0]);
+ const __m256i abs_coeff = _mm256_abs_epi32(*c);
+ __m256i q = _mm256_add_epi32(abs_coeff, qp[0]);
__m256i q_lo = _mm256_mul_epi32(q, qp[1]);
__m256i q_hi = _mm256_srli_epi64(q, 32);
@@ -56,6 +60,9 @@ static INLINE void quantize(const __m256i *qp, __m256i *c,
q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale);
q_hi = _mm256_slli_epi64(q_hi, 32);
q = _mm256_or_si256(q_lo, q_hi);
+ const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale);
+ const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s);
+ q = _mm256_andnot_si256(mask, q);
__m256i dq = _mm256_mullo_epi32(q, qp[2]);
dq = _mm256_srai_epi32(dq, log_scale);
@@ -81,8 +88,8 @@ static INLINE void quantize(const __m256i *qp, __m256i *c,
}
void av1_highbd_quantize_fp_avx2(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
- const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan, int log_scale) {
@@ -90,14 +97,23 @@ void av1_highbd_quantize_fp_avx2(
(void)zbin_ptr;
(void)quant_shift_ptr;
const unsigned int step = 8;
+ __m256i qp[3], coeff;
- if (LIKELY(!skip_block)) {
- __m256i qp[3], coeff;
+ init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, qp);
+ coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
- init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, qp);
- coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ __m256i eob = _mm256_setzero_si256();
+ quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
- __m256i eob = _mm256_setzero_si256();
+ update_qp(qp);
+ while (n_coeffs > 0) {
+ coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);
coeff_ptr += step;
@@ -105,39 +121,17 @@ void av1_highbd_quantize_fp_avx2(
dqcoeff_ptr += step;
iscan += step;
n_coeffs -= step;
-
- update_qp(qp);
- while (n_coeffs > 0) {
- coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
- quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);
-
- coeff_ptr += step;
- qcoeff_ptr += step;
- dqcoeff_ptr += step;
- iscan += step;
- n_coeffs -= step;
- }
- {
- __m256i eob_s;
- eob_s = _mm256_shuffle_epi32(eob, 0xe);
- eob = _mm256_max_epi16(eob, eob_s);
- eob_s = _mm256_shufflelo_epi16(eob, 0xe);
- eob = _mm256_max_epi16(eob, eob_s);
- eob_s = _mm256_shufflelo_epi16(eob, 1);
- eob = _mm256_max_epi16(eob, eob_s);
- const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
- _mm256_extractf128_si256(eob, 1));
- *eob_ptr = _mm_extract_epi16(final_eob, 0);
- }
- } else {
- do {
- const __m256i zero = _mm256_setzero_si256();
- _mm256_storeu_si256((__m256i *)qcoeff_ptr, zero);
- _mm256_storeu_si256((__m256i *)dqcoeff_ptr, zero);
- qcoeff_ptr += step;
- dqcoeff_ptr += step;
- n_coeffs -= step;
- } while (n_coeffs > 0);
- *eob_ptr = 0;
+ }
+ {
+ __m256i eob_s;
+ eob_s = _mm256_shuffle_epi32(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 1);
+ eob = _mm256_max_epi16(eob, eob_s);
+ const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
+ _mm256_extractf128_si256(eob, 1));
+ *eob_ptr = _mm_extract_epi16(final_eob, 0);
}
}
diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
index 8d717a083..40b3b460b 100644
--- a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
+++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
@@ -12,8 +12,10 @@
#include <smmintrin.h>
#include <stdint.h>
-#include "./av1_rtcd.h"
+#include "config/av1_rtcd.h"
+
#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/synonyms.h"
// Coefficient quantization phase 1
// param[0-2] : rounding/quan/dequan constants
@@ -36,6 +38,8 @@ static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param,
qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift);
dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]);
dquan[0] = _mm_srli_epi64(dquan[0], scale);
+ const __m128i abs_s = _mm_slli_epi32(*coeff, 1 + scale);
+ qcoeff[2] = _mm_cmplt_epi32(abs_s, param[3]);
}
// Coefficient quantization phase 2
@@ -70,7 +74,8 @@ static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan,
qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign);
dquan[0] = _mm_sign_epi32(dquan[0], *sign);
-
+ qcoeff[0] = _mm_andnot_si128(qcoeff[2], qcoeff[0]);
+ dquan[0] = _mm_andnot_si128(qcoeff[2], dquan[0]);
_mm_storeu_si128((__m128i *)qAddr, qcoeff[0]);
_mm_storeu_si128((__m128i *)dqAddr, dquan[0]);
}
@@ -108,12 +113,12 @@ static INLINE uint16_t get_accumulated_eob(__m128i *eob) {
}
void av1_highbd_quantize_fp_sse4_1(
- const tran_low_t *coeff_ptr, intptr_t count, int skip_block,
- const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan, int log_scale) {
- __m128i coeff[2], qcoeff[2], dequant[2], qparam[3], coeff_sign;
+ __m128i coeff[2], qcoeff[3], dequant[2], qparam[4], coeff_sign;
__m128i eob = _mm_setzero_si128();
const tran_low_t *src = coeff_ptr;
tran_low_t *quanAddr = qcoeff_ptr;
@@ -121,7 +126,6 @@ void av1_highbd_quantize_fp_sse4_1(
const int shift = 16 - log_scale;
const int coeff_stride = 4;
const int quan_stride = coeff_stride;
- (void)skip_block;
(void)zbin_ptr;
(void)quant_shift_ptr;
(void)scan;
@@ -129,29 +133,54 @@ void av1_highbd_quantize_fp_sse4_1(
memset(quanAddr, 0, count * sizeof(quanAddr[0]));
memset(dquanAddr, 0, count * sizeof(dquanAddr[0]));
- if (!skip_block) {
- coeff[0] = _mm_loadu_si128((__m128i const *)src);
+ coeff[0] = _mm_loadu_si128((__m128i const *)src);
+ const int round1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale);
+ const int round0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
+
+ qparam[0] = _mm_set_epi32(round1, round1, round1, round0);
+ qparam[1] = xx_set_64_from_32i(quant_ptr[1], quant_ptr[0]);
+ qparam[2] = xx_set_64_from_32i(dequant_ptr[1], dequant_ptr[0]);
+ qparam[3] = _mm_set_epi32(dequant_ptr[1], dequant_ptr[1], dequant_ptr[1],
+ dequant_ptr[0]);
+
+ // DC and first 3 AC
+ quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+
+ // update round/quan/dquan for AC
+ qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]);
+ qparam[1] = xx_set1_64_from_32i(quant_ptr[1]);
+ qparam[2] = xx_set1_64_from_32i(dequant_ptr[1]);
+ qparam[3] = _mm_set1_epi32(dequant_ptr[1]);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale,
+ quanAddr, dquanAddr);
+
+ // next 4 AC
+ coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
+ quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale,
+ quanAddr + quan_stride, dquanAddr + quan_stride);
+
+ find_eob(quanAddr, iscan, &eob);
+
+ count -= 8;
+
+ // loop for the rest of AC
+ while (count > 0) {
+ src += coeff_stride << 1;
+ quanAddr += quan_stride << 1;
+ dquanAddr += quan_stride << 1;
+ iscan += quan_stride << 1;
- qparam[0] =
- _mm_set_epi32(round_ptr[1] >> log_scale, round_ptr[1] >> log_scale,
- round_ptr[1] >> log_scale, round_ptr[0] >> log_scale);
- qparam[1] = _mm_set_epi32(0, quant_ptr[1], 0, quant_ptr[0]);
- qparam[2] = _mm_set_epi32(0, dequant_ptr[1], 0, dequant_ptr[0]);
+ coeff[0] = _mm_loadu_si128((__m128i const *)src);
+ coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
- // DC and first 3 AC
quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
&coeff_sign);
-
- // update round/quan/dquan for AC
- qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]);
- qparam[1] = _mm_set_epi32(0, quant_ptr[1], 0, quant_ptr[1]);
- qparam[2] = _mm_set_epi32(0, dequant_ptr[1], 0, dequant_ptr[1]);
-
quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
log_scale, quanAddr, dquanAddr);
- // next 4 AC
- coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
&coeff_sign);
quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
@@ -161,34 +190,6 @@ void av1_highbd_quantize_fp_sse4_1(
find_eob(quanAddr, iscan, &eob);
count -= 8;
-
- // loop for the rest of AC
- while (count > 0) {
- src += coeff_stride << 1;
- quanAddr += quan_stride << 1;
- dquanAddr += quan_stride << 1;
- iscan += quan_stride << 1;
-
- coeff[0] = _mm_loadu_si128((__m128i const *)src);
- coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
-
- quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff,
- dequant, &coeff_sign);
- quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
- log_scale, quanAddr, dquanAddr);
-
- quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff,
- dequant, &coeff_sign);
- quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
- log_scale, quanAddr + quan_stride,
- dquanAddr + quan_stride);
-
- find_eob(quanAddr, iscan, &eob);
-
- count -= 8;
- }
- *eob_ptr = get_accumulated_eob(&eob);
- } else {
- *eob_ptr = 0;
}
+ *eob_ptr = get_accumulated_eob(&eob);
}
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
index 078a67510..df22aaba7 100644
--- a/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
@@ -11,7 +11,8 @@
#include <immintrin.h>
-#include "./av1_rtcd.h"
+#include "config/av1_rtcd.h"
+
#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
@@ -57,7 +58,7 @@ static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
init_one_qp(&round, &qp[0]);
init_one_qp(&quant, &qp[1]);
- if (log_scale > 0) {
+ if (log_scale == 1) {
qp[1] = _mm256_slli_epi16(qp[1], log_scale);
}
@@ -94,16 +95,25 @@ static INLINE void update_qp(int log_scale, __m256i *thr, __m256i *qp) {
} \
} while (0)
+static INLINE uint16_t quant_gather_eob(__m256i eob) {
+ const __m128i eob_lo = _mm256_castsi256_si128(eob);
+ const __m128i eob_hi = _mm256_extractf128_si256(eob, 1);
+ __m128i eob_s = _mm_max_epi16(eob_lo, eob_hi);
+ eob_s = _mm_subs_epu16(_mm_set1_epi16(INT16_MAX), eob_s);
+ eob_s = _mm_minpos_epu16(eob_s);
+ return INT16_MAX - _mm_extract_epi16(eob_s, 0);
+}
+
static INLINE void quantize(const __m256i *thr, const __m256i *qp, __m256i *c,
const int16_t *iscan_ptr, tran_low_t *qcoeff,
tran_low_t *dqcoeff, __m256i *eob) {
- const __m256i abs = _mm256_abs_epi16(*c);
- __m256i mask = _mm256_cmpgt_epi16(abs, *thr);
- mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs, *thr));
+ const __m256i abs_coeff = _mm256_abs_epi16(*c);
+ __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
+ mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr));
const int nzflag = _mm256_movemask_epi8(mask);
if (nzflag) {
- __m256i q = _mm256_adds_epi16(abs, qp[0]);
+ __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]);
q = _mm256_mulhi_epi16(q, qp[1]);
q = _mm256_sign_epi16(q, *c);
const __m256i dq = _mm256_mullo_epi16(q, qp[2]);
@@ -123,8 +133,8 @@ static INLINE void quantize(const __m256i *thr, const __m256i *qp, __m256i *c,
}
void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
@@ -134,15 +144,26 @@ void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
(void)quant_shift_ptr;
const unsigned int step = 16;
- if (LIKELY(!skip_block)) {
- __m256i qp[3];
- __m256i coeff, thr;
- const int log_scale = 0;
+ __m256i qp[3];
+ __m256i coeff, thr;
+ const int log_scale = 0;
- init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
- read_coeff(coeff_ptr, &coeff);
+ init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+ read_coeff(coeff_ptr, &coeff);
+
+ __m256i eob = _mm256_setzero_si256();
+ quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+
+ update_qp(log_scale, &thr, qp);
- __m256i eob = _mm256_setzero_si256();
+ while (n_coeffs > 0) {
+ read_coeff(coeff_ptr, &coeff);
quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
coeff_ptr += step;
@@ -150,54 +171,21 @@ void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dqcoeff_ptr += step;
iscan_ptr += step;
n_coeffs -= step;
-
- update_qp(log_scale, &thr, qp);
-
- while (n_coeffs > 0) {
- read_coeff(coeff_ptr, &coeff);
- quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
-
- coeff_ptr += step;
- qcoeff_ptr += step;
- dqcoeff_ptr += step;
- iscan_ptr += step;
- n_coeffs -= step;
- }
- {
- __m256i eob_s;
- eob_s = _mm256_shuffle_epi32(eob, 0xe);
- eob = _mm256_max_epi16(eob, eob_s);
- eob_s = _mm256_shufflelo_epi16(eob, 0xe);
- eob = _mm256_max_epi16(eob, eob_s);
- eob_s = _mm256_shufflelo_epi16(eob, 1);
- eob = _mm256_max_epi16(eob, eob_s);
- const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
- _mm256_extractf128_si256(eob, 1));
- *eob_ptr = _mm_extract_epi16(final_eob, 0);
- }
- } else {
- do {
- write_zero(qcoeff_ptr);
- write_zero(dqcoeff_ptr);
- qcoeff_ptr += step;
- dqcoeff_ptr += step;
- n_coeffs -= step;
- } while (n_coeffs > 0);
- *eob_ptr = 0;
}
+ *eob_ptr = quant_gather_eob(eob);
}
static INLINE void quantize_32x32(const __m256i *thr, const __m256i *qp,
__m256i *c, const int16_t *iscan_ptr,
tran_low_t *qcoeff, tran_low_t *dqcoeff,
__m256i *eob) {
- const __m256i abs = _mm256_abs_epi16(*c);
- __m256i mask = _mm256_cmpgt_epi16(abs, *thr);
- mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs, *thr));
+ const __m256i abs_coeff = _mm256_abs_epi16(*c);
+ __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
+ mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr));
const int nzflag = _mm256_movemask_epi8(mask);
if (nzflag) {
- __m256i q = _mm256_adds_epi16(abs, qp[0]);
+ __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]);
q = _mm256_mulhi_epu16(q, qp[1]);
__m256i dq = _mm256_mullo_epi16(q, qp[2]);
@@ -221,8 +209,8 @@ static INLINE void quantize_32x32(const __m256i *thr, const __m256i *qp,
}
void av1_quantize_fp_32x32_avx2(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
- const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan_ptr, const int16_t *iscan_ptr) {
@@ -231,15 +219,26 @@ void av1_quantize_fp_32x32_avx2(
(void)quant_shift_ptr;
const unsigned int step = 16;
- if (LIKELY(!skip_block)) {
- __m256i qp[3];
- __m256i coeff, thr;
- const int log_scale = 1;
+ __m256i qp[3];
+ __m256i coeff, thr;
+ const int log_scale = 1;
- init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
- read_coeff(coeff_ptr, &coeff);
+ init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+ read_coeff(coeff_ptr, &coeff);
- __m256i eob = _mm256_setzero_si256();
+ __m256i eob = _mm256_setzero_si256();
+ quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+
+ update_qp(log_scale, &thr, qp);
+
+ while (n_coeffs > 0) {
+ read_coeff(coeff_ptr, &coeff);
quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
coeff_ptr += step;
@@ -247,40 +246,85 @@ void av1_quantize_fp_32x32_avx2(
dqcoeff_ptr += step;
iscan_ptr += step;
n_coeffs -= step;
+ }
+ *eob_ptr = quant_gather_eob(eob);
+}
+
+static INLINE void quantize_64x64(const __m256i *thr, const __m256i *qp,
+ __m256i *c, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ __m256i *eob) {
+ const __m256i abs_coeff = _mm256_abs_epi16(*c);
+ __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
+ mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr));
+ const int nzflag = _mm256_movemask_epi8(mask);
- update_qp(log_scale, &thr, qp);
-
- while (n_coeffs > 0) {
- read_coeff(coeff_ptr, &coeff);
- quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
- &eob);
-
- coeff_ptr += step;
- qcoeff_ptr += step;
- dqcoeff_ptr += step;
- iscan_ptr += step;
- n_coeffs -= step;
- }
- {
- __m256i eob_s;
- eob_s = _mm256_shuffle_epi32(eob, 0xe);
- eob = _mm256_max_epi16(eob, eob_s);
- eob_s = _mm256_shufflelo_epi16(eob, 0xe);
- eob = _mm256_max_epi16(eob, eob_s);
- eob_s = _mm256_shufflelo_epi16(eob, 1);
- eob = _mm256_max_epi16(eob, eob_s);
- const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
- _mm256_extractf128_si256(eob, 1));
- *eob_ptr = _mm_extract_epi16(final_eob, 0);
- }
+ if (nzflag) {
+ __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]);
+ __m256i qh = _mm256_mulhi_epi16(q, qp[1]);
+ __m256i ql = _mm256_mullo_epi16(q, qp[1]);
+ qh = _mm256_slli_epi16(qh, 2);
+ ql = _mm256_srli_epi16(ql, 14);
+ q = _mm256_or_si256(qh, ql);
+ const __m256i dqh = _mm256_slli_epi16(_mm256_mulhi_epi16(q, qp[2]), 14);
+ const __m256i dql = _mm256_srli_epi16(_mm256_mullo_epi16(q, qp[2]), 2);
+ __m256i dq = _mm256_or_si256(dqh, dql);
+
+ q = _mm256_sign_epi16(q, *c);
+ dq = _mm256_sign_epi16(dq, *c);
+
+ store_two_quan(q, qcoeff, dq, dqcoeff);
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
+ const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero);
+ const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero);
+ __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff);
+ cur_eob = _mm256_and_si256(cur_eob, nzero_coeff);
+ *eob = _mm256_max_epi16(*eob, cur_eob);
} else {
- do {
- write_zero(qcoeff_ptr);
- write_zero(dqcoeff_ptr);
- qcoeff_ptr += step;
- dqcoeff_ptr += step;
- n_coeffs -= step;
- } while (n_coeffs > 0);
- *eob_ptr = 0;
+ write_zero(qcoeff);
+ write_zero(dqcoeff);
+ }
+}
+
+void av1_quantize_fp_64x64_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+ (void)scan_ptr;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ const unsigned int step = 16;
+
+ __m256i qp[3];
+ __m256i coeff, thr;
+ const int log_scale = 2;
+
+ init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+ read_coeff(coeff_ptr, &coeff);
+
+ __m256i eob = _mm256_setzero_si256();
+ quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+
+ update_qp(log_scale, &thr, qp);
+
+ while (n_coeffs > 0) {
+ read_coeff(coeff_ptr, &coeff);
+ quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
}
+ *eob_ptr = quant_gather_eob(eob);
}
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c
index 4f7c09546..b07e7717f 100644
--- a/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c
@@ -12,7 +12,8 @@
#include <emmintrin.h>
#include <xmmintrin.h>
-#include "./av1_rtcd.h"
+#include "config/av1_rtcd.h"
+
#include "aom/aom_integer.h"
static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
@@ -67,16 +68,80 @@ static INLINE void write_zero(tran_low_t *qcoeff, intptr_t offset) {
}
}
+static INLINE void quantize(const int16_t *iscan_ptr,
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const __m128i *round0, const __m128i *round1,
+ const __m128i *quant0, const __m128i *quant1,
+ const __m128i *dequant0, const __m128i *dequant1,
+ const __m128i *thr0, const __m128i *thr1,
+ __m128i *eob) {
+ __m128i coeff0, coeff1;
+ // Do DC and first 15 AC
+ read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1);
+
+ // Poor man's sign extract
+ const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+ const __m128i mask0 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff0, *thr0),
+ _mm_cmpeq_epi16(qcoeff0, *thr0));
+ const __m128i mask1 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff1, *thr1),
+ _mm_cmpeq_epi16(qcoeff1, *thr1));
+ const int16_t nzflag = _mm_movemask_epi8(mask0) | _mm_movemask_epi8(mask1);
+
+ if (nzflag) {
+ qcoeff0 = _mm_adds_epi16(qcoeff0, *round0);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, *round1);
+ const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0);
+ const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0);
+ coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1);
+
+ write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs);
+
+ const __m128i zero = _mm_setzero_si128();
+ // Scan for eob
+ const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ const __m128i iscan0 =
+ _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ const __m128i iscan1 =
+ _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0);
+ const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1);
+ const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0);
+ const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1);
+ const __m128i eob2 = _mm_max_epi16(eob0, eob1);
+ *eob = _mm_max_epi16(*eob, eob2);
+ } else {
+ write_zero(qcoeff_ptr, n_coeffs);
+ write_zero(dqcoeff_ptr, n_coeffs);
+ }
+}
+
void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan_ptr, const int16_t *iscan_ptr) {
- __m128i zero;
- __m128i thr;
- int16_t nzflag;
(void)scan_ptr;
(void)zbin_ptr;
(void)quant_shift_ptr;
@@ -86,167 +151,39 @@ void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
qcoeff_ptr += n_coeffs;
dqcoeff_ptr += n_coeffs;
n_coeffs = -n_coeffs;
- zero = _mm_setzero_si128();
-
- if (!skip_block) {
- __m128i eob;
- __m128i round, quant, dequant;
- {
- __m128i coeff0, coeff1;
-
- // Setup global values
- {
- round = _mm_load_si128((const __m128i *)round_ptr);
- quant = _mm_load_si128((const __m128i *)quant_ptr);
- dequant = _mm_load_si128((const __m128i *)dequant_ptr);
- }
-
- {
- __m128i coeff0_sign, coeff1_sign;
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
- // Do DC and first 15 AC
- read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1);
-
- // Poor man's sign extract
- coeff0_sign = _mm_srai_epi16(coeff0, 15);
- coeff1_sign = _mm_srai_epi16(coeff1, 15);
- qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- round = _mm_unpackhi_epi64(round, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- quant = _mm_unpackhi_epi64(quant, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs);
-
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- dequant = _mm_unpackhi_epi64(dequant, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
- write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs);
- }
-
- {
- // Scan for eob
- __m128i zero_coeff0, zero_coeff1;
- __m128i nzero_coeff0, nzero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob1;
- zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
- zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
- nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
- nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
- // Add one to convert from indices to counts
- iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
- iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
- eob = _mm_and_si128(iscan0, nzero_coeff0);
- eob1 = _mm_and_si128(iscan1, nzero_coeff1);
- eob = _mm_max_epi16(eob, eob1);
- }
- n_coeffs += 8 * 2;
- }
-
- thr = _mm_srai_epi16(dequant, 1);
-
- // AC only loop
- while (n_coeffs < 0) {
- __m128i coeff0, coeff1;
- {
- __m128i coeff0_sign, coeff1_sign;
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
-
- read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1);
-
- // Poor man's sign extract
- coeff0_sign = _mm_srai_epi16(coeff0, 15);
- coeff1_sign = _mm_srai_epi16(coeff1, 15);
- qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
- _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
-
- if (nzflag) {
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs);
-
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
- write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs);
- } else {
- write_zero(qcoeff_ptr, n_coeffs);
- write_zero(dqcoeff_ptr, n_coeffs);
- }
- }
-
- if (nzflag) {
- // Scan for eob
- __m128i zero_coeff0, zero_coeff1;
- __m128i nzero_coeff0, nzero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob0, eob1;
- zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
- zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
- nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
- nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
- // Add one to convert from indices to counts
- iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
- iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
- eob0 = _mm_and_si128(iscan0, nzero_coeff0);
- eob1 = _mm_and_si128(iscan1, nzero_coeff1);
- eob0 = _mm_max_epi16(eob0, eob1);
- eob = _mm_max_epi16(eob, eob0);
- }
- n_coeffs += 8 * 2;
- }
-
- // Accumulate EOB
- {
- __m128i eob_shuffled;
- eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
- eob = _mm_max_epi16(eob, eob_shuffled);
- eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
- eob = _mm_max_epi16(eob, eob_shuffled);
- eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
- eob = _mm_max_epi16(eob, eob_shuffled);
- *eob_ptr = _mm_extract_epi16(eob, 1);
- }
- } else {
- do {
- write_zero(dqcoeff_ptr, n_coeffs);
- write_zero(qcoeff_ptr, n_coeffs);
- n_coeffs += 8 * 2;
- } while (n_coeffs < 0);
- *eob_ptr = 0;
+
+ const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr);
+ const __m128i round1 = _mm_unpackhi_epi64(round0, round0);
+ const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr);
+ const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0);
+ const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr);
+ const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0);
+ const __m128i thr0 = _mm_srai_epi16(dequant0, 1);
+ const __m128i thr1 = _mm_srai_epi16(dequant1, 1);
+ __m128i eob = _mm_setzero_si128();
+
+ quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0,
+ &round1, &quant0, &quant1, &dequant0, &dequant1, &thr0, &thr1, &eob);
+
+ n_coeffs += 8 * 2;
+
+ // AC only loop
+ while (n_coeffs < 0) {
+ quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1,
+ &round1, &quant1, &quant1, &dequant1, &dequant1, &thr1, &thr1,
+ &eob);
+ n_coeffs += 8 * 2;
+ }
+
+ // Accumulate EOB
+ {
+ __m128i eob_shuffled;
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ *eob_ptr = _mm_extract_epi16(eob, 1);
}
}
diff --git a/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm
index dcc697ba3..faa2a232a 100644
--- a/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm
+++ b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm
@@ -47,6 +47,9 @@
paddd %1, xmm1
SUM_ACROSS_Q %1
%endmacro
+
+SECTION .text
+
;void ssim_parms_sse2(
; unsigned char *s,
; int sp,
diff --git a/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h
new file mode 100644
index 000000000..0adefecdb
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_TXMF1D_SSE2_H_
+#define AV1_TXMF1D_SSE2_H_
+
+#include <smmintrin.h>
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm_sse4.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fdct4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
+ int8_t cos_bit);
+void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
+ int8_t cos_bit);
+
+void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fadst8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fadst16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_idct4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct64_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_iadst4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_iadst8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_iadst16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+static INLINE void transpose_32_4x4(int stride, const __m128i *input,
+ __m128i *output) {
+ __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]);
+ __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]);
+ __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]);
+ __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]);
+
+ output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);
+ output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);
+ output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);
+ output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);
+}
+
+// the entire input block can be represent by a grid of 4x4 blocks
+// each 4x4 blocks can be represent by 4 vertical __m128i
+// we first transpose each 4x4 block internally
+// then transpose the grid
+static INLINE void transpose_32(int txfm_size, const __m128i *input,
+ __m128i *output) {
+ const int num_per_128 = 4;
+ const int row_size = txfm_size;
+ const int col_size = txfm_size / num_per_128;
+ int r, c;
+
+ // transpose each 4x4 block internally
+ for (r = 0; r < row_size; r += 4) {
+ for (c = 0; c < col_size; c++) {
+ transpose_32_4x4(col_size, &input[r * col_size + c],
+ &output[c * 4 * col_size + r / 4]);
+ }
+ }
+}
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \
+ do { \
+ const __m128i ww0 = _mm_set1_epi32(w0); \
+ const __m128i ww1 = _mm_set1_epi32(w1); \
+ const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0); \
+ const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1); \
+ out0 = _mm_add_epi32(in0_w0, in1_w1); \
+ out0 = av1_round_shift_32_sse4_1(out0, bit); \
+ const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1); \
+ const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0); \
+ out1 = _mm_sub_epi32(in0_w1, in1_w0); \
+ out1 = av1_round_shift_32_sse4_1(out1, bit); \
+ } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \
+ do { \
+ btf_32_sse4_1_type0(w1, w0, in1, in0, out0, out1, bit); \
+ } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+#define btf_32_type0_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
+ do { \
+ const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0); \
+ const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1); \
+ out0 = _mm_add_epi32(in0_w0, in1_w1); \
+ out0 = _mm_add_epi32(out0, r); \
+ out0 = _mm_srai_epi32(out0, bit); \
+ const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1); \
+ const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0); \
+ out1 = _mm_sub_epi32(in0_w1, in1_w0); \
+ out1 = _mm_add_epi32(out1, r); \
+ out1 = _mm_srai_epi32(out1, bit); \
+ } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+#define btf_32_type1_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
+ do { \
+ btf_32_type0_sse4_1_new(ww1, ww0, in1, in0, out0, out1, r, bit); \
+ } while (0)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AV1_TXMF1D_SSE2_H_
diff --git a/third_party/aom/av1/encoder/x86/corner_match_sse4.c b/third_party/aom/av1/encoder/x86/corner_match_sse4.c
index 179da0d28..381f757da 100644
--- a/third_party/aom/av1/encoder/x86/corner_match_sse4.c
+++ b/third_party/aom/av1/encoder/x86/corner_match_sse4.c
@@ -5,7 +5,8 @@
#include <smmintrin.h>
-#include "./av1_rtcd.h"
+#include "config/av1_rtcd.h"
+
#include "aom_ports/mem.h"
#include "av1/encoder/corner_match.h"
diff --git a/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c b/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c
deleted file mode 100644
index e5b19a44c..000000000
--- a/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c
+++ /dev/null
@@ -1,3483 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <emmintrin.h> // SSE2
-
-#include "./aom_dsp_rtcd.h"
-#include "./av1_rtcd.h"
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/fwd_txfm_sse2.h"
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-#include "aom_ports/mem.h"
-
-static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
- int stride, int flipud, int fliplr) {
- const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
- const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
- __m128i mask;
-
- if (!flipud) {
- in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
- in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
- in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
- in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
- } else {
- in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
- in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
- in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
- in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
- }
-
- if (fliplr) {
- in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
- in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
- in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
- in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
- }
-
- in[0] = _mm_slli_epi16(in[0], 4);
- in[1] = _mm_slli_epi16(in[1], 4);
- in[2] = _mm_slli_epi16(in[2], 4);
- in[3] = _mm_slli_epi16(in[3], 4);
-
- mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
- in[0] = _mm_add_epi16(in[0], mask);
- in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
-}
-
-static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) {
- const __m128i kOne = _mm_set1_epi16(1);
- __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
- __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
- __m128i out01 = _mm_add_epi16(in01, kOne);
- __m128i out23 = _mm_add_epi16(in23, kOne);
- out01 = _mm_srai_epi16(out01, 2);
- out23 = _mm_srai_epi16(out23, 2);
- store_output(&out01, (output + 0 * 8));
- store_output(&out23, (output + 1 * 8));
-}
-
-static INLINE void transpose_4x4(__m128i *res) {
- // Combine and transpose
- // 00 01 02 03 20 21 22 23
- // 10 11 12 13 30 31 32 33
- const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
- const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
-
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
- res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
-
- // 00 10 20 30 01 11 21 31
- // 02 12 22 32 03 13 23 33
- // only use the first 4 16-bit integers
- res[1] = _mm_unpackhi_epi64(res[0], res[0]);
- res[3] = _mm_unpackhi_epi64(res[2], res[2]);
-}
-
-static void fdct4_sse2(__m128i *in) {
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
- __m128i u[4], v[4];
- u[0] = _mm_unpacklo_epi16(in[0], in[1]);
- u[1] = _mm_unpacklo_epi16(in[3], in[2]);
-
- v[0] = _mm_add_epi16(u[0], u[1]);
- v[1] = _mm_sub_epi16(u[0], u[1]);
-
- u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0
- u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2
- u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1
- u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-
- in[0] = _mm_packs_epi32(u[0], u[1]);
- in[1] = _mm_packs_epi32(u[2], u[3]);
- transpose_4x4(in);
-}
-
-static void fadst4_sse2(__m128i *in) {
- const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
- const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
- const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
- const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
- const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
- const __m128i kZero = _mm_set1_epi16(0);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i u[8], v[8];
- __m128i in7 = _mm_add_epi16(in[0], in[1]);
-
- u[0] = _mm_unpacklo_epi16(in[0], in[1]);
- u[1] = _mm_unpacklo_epi16(in[2], in[3]);
- u[2] = _mm_unpacklo_epi16(in7, kZero);
- u[3] = _mm_unpacklo_epi16(in[2], kZero);
- u[4] = _mm_unpacklo_epi16(in[3], kZero);
-
- v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2
- v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5
- v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1
- v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3
- v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6
- v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4
- v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);
-
- u[0] = _mm_add_epi32(v[0], v[1]);
- u[1] = _mm_sub_epi32(v[2], v[6]);
- u[2] = _mm_add_epi32(v[3], v[4]);
- u[3] = _mm_sub_epi32(u[2], u[0]);
- u[4] = _mm_slli_epi32(v[5], 2);
- u[5] = _mm_sub_epi32(u[4], v[5]);
- u[6] = _mm_add_epi32(u[3], u[5]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-
- in[0] = _mm_packs_epi32(u[0], u[2]);
- in[1] = _mm_packs_epi32(u[1], u[3]);
- transpose_4x4(in);
-}
-
-#if CONFIG_EXT_TX
-static void fidtx4_sse2(__m128i *in) {
- const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
- const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
- __m128i v0, v1, v2, v3;
- __m128i u0, u1, u2, u3;
-
- v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16);
- v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16);
- v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16);
- v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16);
-
- u0 = _mm_madd_epi16(v0, k__sqrt2_epi16);
- u1 = _mm_madd_epi16(v1, k__sqrt2_epi16);
- u2 = _mm_madd_epi16(v2, k__sqrt2_epi16);
- u3 = _mm_madd_epi16(v3, k__sqrt2_epi16);
-
- v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-
- u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-
- in[0] = _mm_packs_epi32(u0, u2);
- in[1] = _mm_packs_epi32(u1, u3);
- transpose_4x4(in);
-}
-#endif // CONFIG_EXT_TX
-
-void av1_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
- TxfmParam *txfm_param) {
- __m128i in[4];
- const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
- assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif
-
- switch (tx_type) {
- case DCT_DCT: aom_fdct4x4_sse2(input, output, stride); break;
- case ADST_DCT:
- load_buffer_4x4(input, in, stride, 0, 0);
- fadst4_sse2(in);
- fdct4_sse2(in);
- write_buffer_4x4(output, in);
- break;
- case DCT_ADST:
- load_buffer_4x4(input, in, stride, 0, 0);
- fdct4_sse2(in);
- fadst4_sse2(in);
- write_buffer_4x4(output, in);
- break;
- case ADST_ADST:
- load_buffer_4x4(input, in, stride, 0, 0);
- fadst4_sse2(in);
- fadst4_sse2(in);
- write_buffer_4x4(output, in);
- break;
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- load_buffer_4x4(input, in, stride, 1, 0);
- fadst4_sse2(in);
- fdct4_sse2(in);
- write_buffer_4x4(output, in);
- break;
- case DCT_FLIPADST:
- load_buffer_4x4(input, in, stride, 0, 1);
- fdct4_sse2(in);
- fadst4_sse2(in);
- write_buffer_4x4(output, in);
- break;
- case FLIPADST_FLIPADST:
- load_buffer_4x4(input, in, stride, 1, 1);
- fadst4_sse2(in);
- fadst4_sse2(in);
- write_buffer_4x4(output, in);
- break;
- case ADST_FLIPADST:
- load_buffer_4x4(input, in, stride, 0, 1);
- fadst4_sse2(in);
- fadst4_sse2(in);
- write_buffer_4x4(output, in);
- break;
- case FLIPADST_ADST:
- load_buffer_4x4(input, in, stride, 1, 0);
- fadst4_sse2(in);
- fadst4_sse2(in);
- write_buffer_4x4(output, in);
- break;
- case IDTX:
- load_buffer_4x4(input, in, stride, 0, 0);
- fidtx4_sse2(in);
- fidtx4_sse2(in);
- write_buffer_4x4(output, in);
- break;
- case V_DCT:
- load_buffer_4x4(input, in, stride, 0, 0);
- fdct4_sse2(in);
- fidtx4_sse2(in);
- write_buffer_4x4(output, in);
- break;
- case H_DCT:
- load_buffer_4x4(input, in, stride, 0, 0);
- fidtx4_sse2(in);
- fdct4_sse2(in);
- write_buffer_4x4(output, in);
- break;
- case V_ADST:
- load_buffer_4x4(input, in, stride, 0, 0);
- fadst4_sse2(in);
- fidtx4_sse2(in);
- write_buffer_4x4(output, in);
- break;
- case H_ADST:
- load_buffer_4x4(input, in, stride, 0, 0);
- fidtx4_sse2(in);
- fadst4_sse2(in);
- write_buffer_4x4(output, in);
- break;
- case V_FLIPADST:
- load_buffer_4x4(input, in, stride, 1, 0);
- fadst4_sse2(in);
- fidtx4_sse2(in);
- write_buffer_4x4(output, in);
- break;
- case H_FLIPADST:
- load_buffer_4x4(input, in, stride, 0, 1);
- fidtx4_sse2(in);
- fadst4_sse2(in);
- write_buffer_4x4(output, in);
- break;
-#endif // CONFIG_EXT_TX
- default: assert(0);
- }
-}
-
-// load 8x8 array
-static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
- int stride, int flipud, int fliplr) {
- if (!flipud) {
- in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
- in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
- in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
- in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
- in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
- in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
- in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
- in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
- } else {
- in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
- in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
- in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
- in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
- in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
- in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
- in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
- in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
- }
-
- if (fliplr) {
- in[0] = mm_reverse_epi16(in[0]);
- in[1] = mm_reverse_epi16(in[1]);
- in[2] = mm_reverse_epi16(in[2]);
- in[3] = mm_reverse_epi16(in[3]);
- in[4] = mm_reverse_epi16(in[4]);
- in[5] = mm_reverse_epi16(in[5]);
- in[6] = mm_reverse_epi16(in[6]);
- in[7] = mm_reverse_epi16(in[7]);
- }
-
- in[0] = _mm_slli_epi16(in[0], 2);
- in[1] = _mm_slli_epi16(in[1], 2);
- in[2] = _mm_slli_epi16(in[2], 2);
- in[3] = _mm_slli_epi16(in[3], 2);
- in[4] = _mm_slli_epi16(in[4], 2);
- in[5] = _mm_slli_epi16(in[5], 2);
- in[6] = _mm_slli_epi16(in[6], 2);
- in[7] = _mm_slli_epi16(in[7], 2);
-}
-
-// right shift and rounding
-static INLINE void right_shift_8x8(__m128i *res, const int bit) {
- __m128i sign0 = _mm_srai_epi16(res[0], 15);
- __m128i sign1 = _mm_srai_epi16(res[1], 15);
- __m128i sign2 = _mm_srai_epi16(res[2], 15);
- __m128i sign3 = _mm_srai_epi16(res[3], 15);
- __m128i sign4 = _mm_srai_epi16(res[4], 15);
- __m128i sign5 = _mm_srai_epi16(res[5], 15);
- __m128i sign6 = _mm_srai_epi16(res[6], 15);
- __m128i sign7 = _mm_srai_epi16(res[7], 15);
-
- if (bit == 2) {
- const __m128i const_rounding = _mm_set1_epi16(1);
- res[0] = _mm_adds_epi16(res[0], const_rounding);
- res[1] = _mm_adds_epi16(res[1], const_rounding);
- res[2] = _mm_adds_epi16(res[2], const_rounding);
- res[3] = _mm_adds_epi16(res[3], const_rounding);
- res[4] = _mm_adds_epi16(res[4], const_rounding);
- res[5] = _mm_adds_epi16(res[5], const_rounding);
- res[6] = _mm_adds_epi16(res[6], const_rounding);
- res[7] = _mm_adds_epi16(res[7], const_rounding);
- }
-
- res[0] = _mm_sub_epi16(res[0], sign0);
- res[1] = _mm_sub_epi16(res[1], sign1);
- res[2] = _mm_sub_epi16(res[2], sign2);
- res[3] = _mm_sub_epi16(res[3], sign3);
- res[4] = _mm_sub_epi16(res[4], sign4);
- res[5] = _mm_sub_epi16(res[5], sign5);
- res[6] = _mm_sub_epi16(res[6], sign6);
- res[7] = _mm_sub_epi16(res[7], sign7);
-
- if (bit == 1) {
- res[0] = _mm_srai_epi16(res[0], 1);
- res[1] = _mm_srai_epi16(res[1], 1);
- res[2] = _mm_srai_epi16(res[2], 1);
- res[3] = _mm_srai_epi16(res[3], 1);
- res[4] = _mm_srai_epi16(res[4], 1);
- res[5] = _mm_srai_epi16(res[5], 1);
- res[6] = _mm_srai_epi16(res[6], 1);
- res[7] = _mm_srai_epi16(res[7], 1);
- } else {
- res[0] = _mm_srai_epi16(res[0], 2);
- res[1] = _mm_srai_epi16(res[1], 2);
- res[2] = _mm_srai_epi16(res[2], 2);
- res[3] = _mm_srai_epi16(res[3], 2);
- res[4] = _mm_srai_epi16(res[4], 2);
- res[5] = _mm_srai_epi16(res[5], 2);
- res[6] = _mm_srai_epi16(res[6], 2);
- res[7] = _mm_srai_epi16(res[7], 2);
- }
-}
-
-// write 8x8 array
-static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
- int stride) {
- store_output(&res[0], (output + 0 * stride));
- store_output(&res[1], (output + 1 * stride));
- store_output(&res[2], (output + 2 * stride));
- store_output(&res[3], (output + 3 * stride));
- store_output(&res[4], (output + 4 * stride));
- store_output(&res[5], (output + 5 * stride));
- store_output(&res[6], (output + 6 * stride));
- store_output(&res[7], (output + 7 * stride));
-}
-
-// perform in-place transpose
-static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
- const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
- const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
- const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
- const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
- const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
- const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
- // 44 54 45 55 46 56 47 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
- // 04 14 24 34 05 15 25 35
- // 44 54 64 74 45 55 65 75
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
- res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
- res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
- res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
- res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
- res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
- res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
- res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
-}
-
-static void fdct8_sse2(__m128i *in) {
- // constants
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i u0, u1, u2, u3, u4, u5, u6, u7;
- __m128i v0, v1, v2, v3, v4, v5, v6, v7;
- __m128i s0, s1, s2, s3, s4, s5, s6, s7;
-
- // stage 1
- s0 = _mm_add_epi16(in[0], in[7]);
- s1 = _mm_add_epi16(in[1], in[6]);
- s2 = _mm_add_epi16(in[2], in[5]);
- s3 = _mm_add_epi16(in[3], in[4]);
- s4 = _mm_sub_epi16(in[3], in[4]);
- s5 = _mm_sub_epi16(in[2], in[5]);
- s6 = _mm_sub_epi16(in[1], in[6]);
- s7 = _mm_sub_epi16(in[0], in[7]);
-
- u0 = _mm_add_epi16(s0, s3);
- u1 = _mm_add_epi16(s1, s2);
- u2 = _mm_sub_epi16(s1, s2);
- u3 = _mm_sub_epi16(s0, s3);
- // interleave and perform butterfly multiplication/addition
- v0 = _mm_unpacklo_epi16(u0, u1);
- v1 = _mm_unpackhi_epi16(u0, u1);
- v2 = _mm_unpacklo_epi16(u2, u3);
- v3 = _mm_unpackhi_epi16(u2, u3);
-
- u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
- u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
- u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
- u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
- u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
- u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
- u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
- u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);
-
- // shift and rounding
- v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-
- u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
- in[0] = _mm_packs_epi32(u0, u1);
- in[2] = _mm_packs_epi32(u4, u5);
- in[4] = _mm_packs_epi32(u2, u3);
- in[6] = _mm_packs_epi32(u6, u7);
-
- // stage 2
- // interleave and perform butterfly multiplication/addition
- u0 = _mm_unpacklo_epi16(s6, s5);
- u1 = _mm_unpackhi_epi16(s6, s5);
- v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
- v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
- v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
- v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);
-
- // shift and rounding
- u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
- u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
- u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
- u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-
- v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
- v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
- v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
- v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-
- u0 = _mm_packs_epi32(v0, v1);
- u1 = _mm_packs_epi32(v2, v3);
-
- // stage 3
- s0 = _mm_add_epi16(s4, u0);
- s1 = _mm_sub_epi16(s4, u0);
- s2 = _mm_sub_epi16(s7, u1);
- s3 = _mm_add_epi16(s7, u1);
-
- // stage 4
- u0 = _mm_unpacklo_epi16(s0, s3);
- u1 = _mm_unpackhi_epi16(s0, s3);
- u2 = _mm_unpacklo_epi16(s1, s2);
- u3 = _mm_unpackhi_epi16(s1, s2);
-
- v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
- v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
- v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
- v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
- v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
- v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
- v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
- v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);
-
- // shift and rounding
- u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
- u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
- u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
- u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
- u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
- u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
- u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
- u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
- v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
- v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
- v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
- v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
- v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
- v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
- v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
- v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
- in[1] = _mm_packs_epi32(v0, v1);
- in[3] = _mm_packs_epi32(v4, v5);
- in[5] = _mm_packs_epi32(v2, v3);
- in[7] = _mm_packs_epi32(v6, v7);
-
- // transpose
- array_transpose_8x8(in, in);
-}
-
-static void fadst8_sse2(__m128i *in) {
- // Constants
- const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
- const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__const_0 = _mm_set1_epi16(0);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
- __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
- __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
- __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
- __m128i s0, s1, s2, s3, s4, s5, s6, s7;
- __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-
- // properly aligned for butterfly input
- in0 = in[7];
- in1 = in[0];
- in2 = in[5];
- in3 = in[2];
- in4 = in[3];
- in5 = in[4];
- in6 = in[1];
- in7 = in[6];
-
- // column transformation
- // stage 1
- // interleave and multiply/add into 32-bit integer
- s0 = _mm_unpacklo_epi16(in0, in1);
- s1 = _mm_unpackhi_epi16(in0, in1);
- s2 = _mm_unpacklo_epi16(in2, in3);
- s3 = _mm_unpackhi_epi16(in2, in3);
- s4 = _mm_unpacklo_epi16(in4, in5);
- s5 = _mm_unpackhi_epi16(in4, in5);
- s6 = _mm_unpacklo_epi16(in6, in7);
- s7 = _mm_unpackhi_epi16(in6, in7);
-
- u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
- u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
- u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
- u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
- u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
- u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
- u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
- u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
- u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
- u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
- u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
- u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
- u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
- u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
- u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
- u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
-
- // addition
- w0 = _mm_add_epi32(u0, u8);
- w1 = _mm_add_epi32(u1, u9);
- w2 = _mm_add_epi32(u2, u10);
- w3 = _mm_add_epi32(u3, u11);
- w4 = _mm_add_epi32(u4, u12);
- w5 = _mm_add_epi32(u5, u13);
- w6 = _mm_add_epi32(u6, u14);
- w7 = _mm_add_epi32(u7, u15);
- w8 = _mm_sub_epi32(u0, u8);
- w9 = _mm_sub_epi32(u1, u9);
- w10 = _mm_sub_epi32(u2, u10);
- w11 = _mm_sub_epi32(u3, u11);
- w12 = _mm_sub_epi32(u4, u12);
- w13 = _mm_sub_epi32(u5, u13);
- w14 = _mm_sub_epi32(u6, u14);
- w15 = _mm_sub_epi32(u7, u15);
-
- // shift and rounding
- v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
- v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
- v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
- v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
- v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
- v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
- v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
- v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
-
- u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
- u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
- u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
- u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
- u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
- u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
- u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
- u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
-
- // back to 16-bit and pack 8 integers into __m128i
- v0 = _mm_add_epi32(w0, w4);
- v1 = _mm_add_epi32(w1, w5);
- v2 = _mm_add_epi32(w2, w6);
- v3 = _mm_add_epi32(w3, w7);
- v4 = _mm_sub_epi32(w0, w4);
- v5 = _mm_sub_epi32(w1, w5);
- v6 = _mm_sub_epi32(w2, w6);
- v7 = _mm_sub_epi32(w3, w7);
-
- w0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
- w1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
- w2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
- w3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
- w4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
- w5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
- w6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
- w7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
- v0 = _mm_srai_epi32(w0, DCT_CONST_BITS);
- v1 = _mm_srai_epi32(w1, DCT_CONST_BITS);
- v2 = _mm_srai_epi32(w2, DCT_CONST_BITS);
- v3 = _mm_srai_epi32(w3, DCT_CONST_BITS);
- v4 = _mm_srai_epi32(w4, DCT_CONST_BITS);
- v5 = _mm_srai_epi32(w5, DCT_CONST_BITS);
- v6 = _mm_srai_epi32(w6, DCT_CONST_BITS);
- v7 = _mm_srai_epi32(w7, DCT_CONST_BITS);
-
- in[4] = _mm_packs_epi32(u8, u9);
- in[5] = _mm_packs_epi32(u10, u11);
- in[6] = _mm_packs_epi32(u12, u13);
- in[7] = _mm_packs_epi32(u14, u15);
-
- // stage 2
- s0 = _mm_packs_epi32(v0, v1);
- s1 = _mm_packs_epi32(v2, v3);
- s2 = _mm_packs_epi32(v4, v5);
- s3 = _mm_packs_epi32(v6, v7);
-
- u0 = _mm_unpacklo_epi16(in[4], in[5]);
- u1 = _mm_unpackhi_epi16(in[4], in[5]);
- u2 = _mm_unpacklo_epi16(in[6], in[7]);
- u3 = _mm_unpackhi_epi16(in[6], in[7]);
-
- v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
- v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
- v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
- v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
- v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
- v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
- v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
- v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
-
- w0 = _mm_add_epi32(v0, v4);
- w1 = _mm_add_epi32(v1, v5);
- w2 = _mm_add_epi32(v2, v6);
- w3 = _mm_add_epi32(v3, v7);
- w4 = _mm_sub_epi32(v0, v4);
- w5 = _mm_sub_epi32(v1, v5);
- w6 = _mm_sub_epi32(v2, v6);
- w7 = _mm_sub_epi32(v3, v7);
-
- v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
- v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
- v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
- v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
- v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
- v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
- v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
- v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-
- u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
- // back to 16-bit intergers
- s4 = _mm_packs_epi32(u0, u1);
- s5 = _mm_packs_epi32(u2, u3);
- s6 = _mm_packs_epi32(u4, u5);
- s7 = _mm_packs_epi32(u6, u7);
-
- // stage 3
- u0 = _mm_unpacklo_epi16(s2, s3);
- u1 = _mm_unpackhi_epi16(s2, s3);
- u2 = _mm_unpacklo_epi16(s6, s7);
- u3 = _mm_unpackhi_epi16(s6, s7);
-
- v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
- v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
- v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
- v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
- v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
- v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
- v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
- v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
-
- u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
- u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
- u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
- u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
- u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
- u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
- u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
- u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
- v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
- v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
- v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
- v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
- v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
- v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
- v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
- v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
- s2 = _mm_packs_epi32(v0, v1);
- s3 = _mm_packs_epi32(v2, v3);
- s6 = _mm_packs_epi32(v4, v5);
- s7 = _mm_packs_epi32(v6, v7);
-
- // FIXME(jingning): do subtract using bit inversion?
- in[0] = s0;
- in[1] = _mm_sub_epi16(k__const_0, s4);
- in[2] = s6;
- in[3] = _mm_sub_epi16(k__const_0, s2);
- in[4] = s3;
- in[5] = _mm_sub_epi16(k__const_0, s7);
- in[6] = s5;
- in[7] = _mm_sub_epi16(k__const_0, s1);
-
- // transpose
- array_transpose_8x8(in, in);
-}
-
-#if CONFIG_EXT_TX
-static void fidtx8_sse2(__m128i *in) {
- in[0] = _mm_slli_epi16(in[0], 1);
- in[1] = _mm_slli_epi16(in[1], 1);
- in[2] = _mm_slli_epi16(in[2], 1);
- in[3] = _mm_slli_epi16(in[3], 1);
- in[4] = _mm_slli_epi16(in[4], 1);
- in[5] = _mm_slli_epi16(in[5], 1);
- in[6] = _mm_slli_epi16(in[6], 1);
- in[7] = _mm_slli_epi16(in[7], 1);
-
- array_transpose_8x8(in, in);
-}
-#endif // CONFIG_EXT_TX
-
-void av1_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
- TxfmParam *txfm_param) {
- __m128i in[8];
- const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
- assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif
-
- switch (tx_type) {
- case DCT_DCT: aom_fdct8x8_sse2(input, output, stride); break;
- case ADST_DCT:
- load_buffer_8x8(input, in, stride, 0, 0);
- fadst8_sse2(in);
- fdct8_sse2(in);
- right_shift_8x8(in, 1);
- write_buffer_8x8(output, in, 8);
- break;
- case DCT_ADST:
- load_buffer_8x8(input, in, stride, 0, 0);
- fdct8_sse2(in);
- fadst8_sse2(in);
- right_shift_8x8(in, 1);
- write_buffer_8x8(output, in, 8);
- break;
- case ADST_ADST:
- load_buffer_8x8(input, in, stride, 0, 0);
- fadst8_sse2(in);
- fadst8_sse2(in);
- right_shift_8x8(in, 1);
- write_buffer_8x8(output, in, 8);
- break;
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- load_buffer_8x8(input, in, stride, 1, 0);
- fadst8_sse2(in);
- fdct8_sse2(in);
- right_shift_8x8(in, 1);
- write_buffer_8x8(output, in, 8);
- break;
- case DCT_FLIPADST:
- load_buffer_8x8(input, in, stride, 0, 1);
- fdct8_sse2(in);
- fadst8_sse2(in);
- right_shift_8x8(in, 1);
- write_buffer_8x8(output, in, 8);
- break;
- case FLIPADST_FLIPADST:
- load_buffer_8x8(input, in, stride, 1, 1);
- fadst8_sse2(in);
- fadst8_sse2(in);
- right_shift_8x8(in, 1);
- write_buffer_8x8(output, in, 8);
- break;
- case ADST_FLIPADST:
- load_buffer_8x8(input, in, stride, 0, 1);
- fadst8_sse2(in);
- fadst8_sse2(in);
- right_shift_8x8(in, 1);
- write_buffer_8x8(output, in, 8);
- break;
- case FLIPADST_ADST:
- load_buffer_8x8(input, in, stride, 1, 0);
- fadst8_sse2(in);
- fadst8_sse2(in);
- right_shift_8x8(in, 1);
- write_buffer_8x8(output, in, 8);
- break;
- case IDTX:
- load_buffer_8x8(input, in, stride, 0, 0);
- fidtx8_sse2(in);
- fidtx8_sse2(in);
- right_shift_8x8(in, 1);
- write_buffer_8x8(output, in, 8);
- break;
- case V_DCT:
- load_buffer_8x8(input, in, stride, 0, 0);
- fdct8_sse2(in);
- fidtx8_sse2(in);
- right_shift_8x8(in, 1);
- write_buffer_8x8(output, in, 8);
- break;
- case H_DCT:
- load_buffer_8x8(input, in, stride, 0, 0);
- fidtx8_sse2(in);
- fdct8_sse2(in);
- right_shift_8x8(in, 1);
- write_buffer_8x8(output, in, 8);
- break;
- case V_ADST:
- load_buffer_8x8(input, in, stride, 0, 0);
- fadst8_sse2(in);
- fidtx8_sse2(in);
- right_shift_8x8(in, 1);
- write_buffer_8x8(output, in, 8);
- break;
- case H_ADST:
- load_buffer_8x8(input, in, stride, 0, 0);
- fidtx8_sse2(in);
- fadst8_sse2(in);
- right_shift_8x8(in, 1);
- write_buffer_8x8(output, in, 8);
- break;
- case V_FLIPADST:
- load_buffer_8x8(input, in, stride, 1, 0);
- fadst8_sse2(in);
- fidtx8_sse2(in);
- right_shift_8x8(in, 1);
- write_buffer_8x8(output, in, 8);
- break;
- case H_FLIPADST:
- load_buffer_8x8(input, in, stride, 0, 1);
- fidtx8_sse2(in);
- fadst8_sse2(in);
- right_shift_8x8(in, 1);
- write_buffer_8x8(output, in, 8);
- break;
-#endif // CONFIG_EXT_TX
- default: assert(0);
- }
-}
-
-static INLINE void load_buffer_16x16(const int16_t *input, __m128i *in0,
- __m128i *in1, int stride, int flipud,
- int fliplr) {
- // Load 4 8x8 blocks
- const int16_t *topL = input;
- const int16_t *topR = input + 8;
- const int16_t *botL = input + 8 * stride;
- const int16_t *botR = input + 8 * stride + 8;
-
- const int16_t *tmp;
-
- if (flipud) {
- // Swap left columns
- tmp = topL;
- topL = botL;
- botL = tmp;
- // Swap right columns
- tmp = topR;
- topR = botR;
- botR = tmp;
- }
-
- if (fliplr) {
- // Swap top rows
- tmp = topL;
- topL = topR;
- topR = tmp;
- // Swap bottom rows
- tmp = botL;
- botL = botR;
- botR = tmp;
- }
-
- // load first 8 columns
- load_buffer_8x8(topL, in0, stride, flipud, fliplr);
- load_buffer_8x8(botL, in0 + 8, stride, flipud, fliplr);
-
- // load second 8 columns
- load_buffer_8x8(topR, in1, stride, flipud, fliplr);
- load_buffer_8x8(botR, in1 + 8, stride, flipud, fliplr);
-}
-
-static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
- __m128i *in1, int stride) {
- // write first 8 columns
- write_buffer_8x8(output, in0, stride);
- write_buffer_8x8(output + 8 * stride, in0 + 8, stride);
- // write second 8 columns
- output += 8;
- write_buffer_8x8(output, in1, stride);
- write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
-}
-
-static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
- __m128i tbuf[8];
- array_transpose_8x8(res0, res0);
- array_transpose_8x8(res1, tbuf);
- array_transpose_8x8(res0 + 8, res1);
- array_transpose_8x8(res1 + 8, res1 + 8);
-
- res0[8] = tbuf[0];
- res0[9] = tbuf[1];
- res0[10] = tbuf[2];
- res0[11] = tbuf[3];
- res0[12] = tbuf[4];
- res0[13] = tbuf[5];
- res0[14] = tbuf[6];
- res0[15] = tbuf[7];
-}
-
-static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
- // perform rounding operations
- right_shift_8x8(res0, 2);
- right_shift_8x8(res0 + 8, 2);
- right_shift_8x8(res1, 2);
- right_shift_8x8(res1 + 8, 2);
-}
-
-static void fdct16_8col(__m128i *in) {
- // perform 16x16 1-D DCT for 8 columns
- __m128i i[8], s[8], p[8], t[8], u[16], v[16];
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
- const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
- const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
- const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
- const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
- const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
- const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
- // stage 1
- i[0] = _mm_add_epi16(in[0], in[15]);
- i[1] = _mm_add_epi16(in[1], in[14]);
- i[2] = _mm_add_epi16(in[2], in[13]);
- i[3] = _mm_add_epi16(in[3], in[12]);
- i[4] = _mm_add_epi16(in[4], in[11]);
- i[5] = _mm_add_epi16(in[5], in[10]);
- i[6] = _mm_add_epi16(in[6], in[9]);
- i[7] = _mm_add_epi16(in[7], in[8]);
-
- s[0] = _mm_sub_epi16(in[7], in[8]);
- s[1] = _mm_sub_epi16(in[6], in[9]);
- s[2] = _mm_sub_epi16(in[5], in[10]);
- s[3] = _mm_sub_epi16(in[4], in[11]);
- s[4] = _mm_sub_epi16(in[3], in[12]);
- s[5] = _mm_sub_epi16(in[2], in[13]);
- s[6] = _mm_sub_epi16(in[1], in[14]);
- s[7] = _mm_sub_epi16(in[0], in[15]);
-
- p[0] = _mm_add_epi16(i[0], i[7]);
- p[1] = _mm_add_epi16(i[1], i[6]);
- p[2] = _mm_add_epi16(i[2], i[5]);
- p[3] = _mm_add_epi16(i[3], i[4]);
- p[4] = _mm_sub_epi16(i[3], i[4]);
- p[5] = _mm_sub_epi16(i[2], i[5]);
- p[6] = _mm_sub_epi16(i[1], i[6]);
- p[7] = _mm_sub_epi16(i[0], i[7]);
-
- u[0] = _mm_add_epi16(p[0], p[3]);
- u[1] = _mm_add_epi16(p[1], p[2]);
- u[2] = _mm_sub_epi16(p[1], p[2]);
- u[3] = _mm_sub_epi16(p[0], p[3]);
-
- v[0] = _mm_unpacklo_epi16(u[0], u[1]);
- v[1] = _mm_unpackhi_epi16(u[0], u[1]);
- v[2] = _mm_unpacklo_epi16(u[2], u[3]);
- v[3] = _mm_unpackhi_epi16(u[2], u[3]);
-
- u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
- u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
- u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
- u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
- u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
- u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
- u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
- u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-
- in[0] = _mm_packs_epi32(u[0], u[1]);
- in[4] = _mm_packs_epi32(u[4], u[5]);
- in[8] = _mm_packs_epi32(u[2], u[3]);
- in[12] = _mm_packs_epi32(u[6], u[7]);
-
- u[0] = _mm_unpacklo_epi16(p[5], p[6]);
- u[1] = _mm_unpackhi_epi16(p[5], p[6]);
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-
- u[0] = _mm_packs_epi32(v[0], v[1]);
- u[1] = _mm_packs_epi32(v[2], v[3]);
-
- t[0] = _mm_add_epi16(p[4], u[0]);
- t[1] = _mm_sub_epi16(p[4], u[0]);
- t[2] = _mm_sub_epi16(p[7], u[1]);
- t[3] = _mm_add_epi16(p[7], u[1]);
-
- u[0] = _mm_unpacklo_epi16(t[0], t[3]);
- u[1] = _mm_unpackhi_epi16(t[0], t[3]);
- u[2] = _mm_unpacklo_epi16(t[1], t[2]);
- u[3] = _mm_unpackhi_epi16(t[1], t[2]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
- v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
- v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
- v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
- v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
- v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
- v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- in[2] = _mm_packs_epi32(v[0], v[1]);
- in[6] = _mm_packs_epi32(v[4], v[5]);
- in[10] = _mm_packs_epi32(v[2], v[3]);
- in[14] = _mm_packs_epi32(v[6], v[7]);
-
- // stage 2
- u[0] = _mm_unpacklo_epi16(s[2], s[5]);
- u[1] = _mm_unpackhi_epi16(s[2], s[5]);
- u[2] = _mm_unpacklo_epi16(s[3], s[4]);
- u[3] = _mm_unpackhi_epi16(s[3], s[4]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
- v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
- v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
- v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- t[2] = _mm_packs_epi32(v[0], v[1]);
- t[3] = _mm_packs_epi32(v[2], v[3]);
- t[4] = _mm_packs_epi32(v[4], v[5]);
- t[5] = _mm_packs_epi32(v[6], v[7]);
-
- // stage 3
- p[0] = _mm_add_epi16(s[0], t[3]);
- p[1] = _mm_add_epi16(s[1], t[2]);
- p[2] = _mm_sub_epi16(s[1], t[2]);
- p[3] = _mm_sub_epi16(s[0], t[3]);
- p[4] = _mm_sub_epi16(s[7], t[4]);
- p[5] = _mm_sub_epi16(s[6], t[5]);
- p[6] = _mm_add_epi16(s[6], t[5]);
- p[7] = _mm_add_epi16(s[7], t[4]);
-
- // stage 4
- u[0] = _mm_unpacklo_epi16(p[1], p[6]);
- u[1] = _mm_unpackhi_epi16(p[1], p[6]);
- u[2] = _mm_unpacklo_epi16(p[2], p[5]);
- u[3] = _mm_unpackhi_epi16(p[2], p[5]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
- v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
- v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
- v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
- v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
- v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
- v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- t[1] = _mm_packs_epi32(v[0], v[1]);
- t[2] = _mm_packs_epi32(v[2], v[3]);
- t[5] = _mm_packs_epi32(v[4], v[5]);
- t[6] = _mm_packs_epi32(v[6], v[7]);
-
- // stage 5
- s[0] = _mm_add_epi16(p[0], t[1]);
- s[1] = _mm_sub_epi16(p[0], t[1]);
- s[2] = _mm_sub_epi16(p[3], t[2]);
- s[3] = _mm_add_epi16(p[3], t[2]);
- s[4] = _mm_add_epi16(p[4], t[5]);
- s[5] = _mm_sub_epi16(p[4], t[5]);
- s[6] = _mm_sub_epi16(p[7], t[6]);
- s[7] = _mm_add_epi16(p[7], t[6]);
-
- // stage 6
- u[0] = _mm_unpacklo_epi16(s[0], s[7]);
- u[1] = _mm_unpackhi_epi16(s[0], s[7]);
- u[2] = _mm_unpacklo_epi16(s[1], s[6]);
- u[3] = _mm_unpackhi_epi16(s[1], s[6]);
- u[4] = _mm_unpacklo_epi16(s[2], s[5]);
- u[5] = _mm_unpackhi_epi16(s[2], s[5]);
- u[6] = _mm_unpacklo_epi16(s[3], s[4]);
- u[7] = _mm_unpackhi_epi16(s[3], s[4]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
- v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
- v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
- v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
- v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
- v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
- v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
- v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
- v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
- v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
- v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
- v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
- v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
- v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
- v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- in[1] = _mm_packs_epi32(v[0], v[1]);
- in[9] = _mm_packs_epi32(v[2], v[3]);
- in[5] = _mm_packs_epi32(v[4], v[5]);
- in[13] = _mm_packs_epi32(v[6], v[7]);
- in[3] = _mm_packs_epi32(v[8], v[9]);
- in[11] = _mm_packs_epi32(v[10], v[11]);
- in[7] = _mm_packs_epi32(v[12], v[13]);
- in[15] = _mm_packs_epi32(v[14], v[15]);
-}
-
-static void fadst16_8col(__m128i *in) {
- // perform 16x16 1-D ADST for 8 columns
- __m128i s[16], x[16], u[32], v[32];
- const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
- const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
- const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
- const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
- const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
- const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
- const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
- const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
- const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
- const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
- const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
- const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
- const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
- const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
- const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
- const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
- const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i kZero = _mm_set1_epi16(0);
-
- u[0] = _mm_unpacklo_epi16(in[15], in[0]);
- u[1] = _mm_unpackhi_epi16(in[15], in[0]);
- u[2] = _mm_unpacklo_epi16(in[13], in[2]);
- u[3] = _mm_unpackhi_epi16(in[13], in[2]);
- u[4] = _mm_unpacklo_epi16(in[11], in[4]);
- u[5] = _mm_unpackhi_epi16(in[11], in[4]);
- u[6] = _mm_unpacklo_epi16(in[9], in[6]);
- u[7] = _mm_unpackhi_epi16(in[9], in[6]);
- u[8] = _mm_unpacklo_epi16(in[7], in[8]);
- u[9] = _mm_unpackhi_epi16(in[7], in[8]);
- u[10] = _mm_unpacklo_epi16(in[5], in[10]);
- u[11] = _mm_unpackhi_epi16(in[5], in[10]);
- u[12] = _mm_unpacklo_epi16(in[3], in[12]);
- u[13] = _mm_unpackhi_epi16(in[3], in[12]);
- u[14] = _mm_unpacklo_epi16(in[1], in[14]);
- u[15] = _mm_unpackhi_epi16(in[1], in[14]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
- v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
- v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
- v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
- v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
- v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
- v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
- v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
- v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
- v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
- v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
- v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
- v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
- v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
- v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
- v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
- v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
- v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
- v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
-
- u[0] = _mm_add_epi32(v[0], v[16]);
- u[1] = _mm_add_epi32(v[1], v[17]);
- u[2] = _mm_add_epi32(v[2], v[18]);
- u[3] = _mm_add_epi32(v[3], v[19]);
- u[4] = _mm_add_epi32(v[4], v[20]);
- u[5] = _mm_add_epi32(v[5], v[21]);
- u[6] = _mm_add_epi32(v[6], v[22]);
- u[7] = _mm_add_epi32(v[7], v[23]);
- u[8] = _mm_add_epi32(v[8], v[24]);
- u[9] = _mm_add_epi32(v[9], v[25]);
- u[10] = _mm_add_epi32(v[10], v[26]);
- u[11] = _mm_add_epi32(v[11], v[27]);
- u[12] = _mm_add_epi32(v[12], v[28]);
- u[13] = _mm_add_epi32(v[13], v[29]);
- u[14] = _mm_add_epi32(v[14], v[30]);
- u[15] = _mm_add_epi32(v[15], v[31]);
- u[16] = _mm_sub_epi32(v[0], v[16]);
- u[17] = _mm_sub_epi32(v[1], v[17]);
- u[18] = _mm_sub_epi32(v[2], v[18]);
- u[19] = _mm_sub_epi32(v[3], v[19]);
- u[20] = _mm_sub_epi32(v[4], v[20]);
- u[21] = _mm_sub_epi32(v[5], v[21]);
- u[22] = _mm_sub_epi32(v[6], v[22]);
- u[23] = _mm_sub_epi32(v[7], v[23]);
- u[24] = _mm_sub_epi32(v[8], v[24]);
- u[25] = _mm_sub_epi32(v[9], v[25]);
- u[26] = _mm_sub_epi32(v[10], v[26]);
- u[27] = _mm_sub_epi32(v[11], v[27]);
- u[28] = _mm_sub_epi32(v[12], v[28]);
- u[29] = _mm_sub_epi32(v[13], v[29]);
- u[30] = _mm_sub_epi32(v[14], v[30]);
- u[31] = _mm_sub_epi32(v[15], v[31]);
-
- v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
- v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
- v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
- v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
- v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
- v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
- v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
- v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
- v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
- v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
- v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
- v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
- v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
- v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
- v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
- v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
-
- u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
- u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
- u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
- u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
- u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
- u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
- u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
- u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
- u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
- u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
- u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
- u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
- u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
- u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
- u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
- u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
-
- v[0] = _mm_add_epi32(u[0], u[8]);
- v[1] = _mm_add_epi32(u[1], u[9]);
- v[2] = _mm_add_epi32(u[2], u[10]);
- v[3] = _mm_add_epi32(u[3], u[11]);
- v[4] = _mm_add_epi32(u[4], u[12]);
- v[5] = _mm_add_epi32(u[5], u[13]);
- v[6] = _mm_add_epi32(u[6], u[14]);
- v[7] = _mm_add_epi32(u[7], u[15]);
-
- v[16] = _mm_add_epi32(v[0], v[4]);
- v[17] = _mm_add_epi32(v[1], v[5]);
- v[18] = _mm_add_epi32(v[2], v[6]);
- v[19] = _mm_add_epi32(v[3], v[7]);
- v[20] = _mm_sub_epi32(v[0], v[4]);
- v[21] = _mm_sub_epi32(v[1], v[5]);
- v[22] = _mm_sub_epi32(v[2], v[6]);
- v[23] = _mm_sub_epi32(v[3], v[7]);
- v[16] = _mm_add_epi32(v[16], k__DCT_CONST_ROUNDING);
- v[17] = _mm_add_epi32(v[17], k__DCT_CONST_ROUNDING);
- v[18] = _mm_add_epi32(v[18], k__DCT_CONST_ROUNDING);
- v[19] = _mm_add_epi32(v[19], k__DCT_CONST_ROUNDING);
- v[20] = _mm_add_epi32(v[20], k__DCT_CONST_ROUNDING);
- v[21] = _mm_add_epi32(v[21], k__DCT_CONST_ROUNDING);
- v[22] = _mm_add_epi32(v[22], k__DCT_CONST_ROUNDING);
- v[23] = _mm_add_epi32(v[23], k__DCT_CONST_ROUNDING);
- v[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
- v[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
- v[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
- v[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
- v[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
- v[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
- v[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
- v[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
- s[0] = _mm_packs_epi32(v[16], v[17]);
- s[1] = _mm_packs_epi32(v[18], v[19]);
- s[2] = _mm_packs_epi32(v[20], v[21]);
- s[3] = _mm_packs_epi32(v[22], v[23]);
-
- v[8] = _mm_sub_epi32(u[0], u[8]);
- v[9] = _mm_sub_epi32(u[1], u[9]);
- v[10] = _mm_sub_epi32(u[2], u[10]);
- v[11] = _mm_sub_epi32(u[3], u[11]);
- v[12] = _mm_sub_epi32(u[4], u[12]);
- v[13] = _mm_sub_epi32(u[5], u[13]);
- v[14] = _mm_sub_epi32(u[6], u[14]);
- v[15] = _mm_sub_epi32(u[7], u[15]);
-
- v[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- v[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- v[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- v[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- v[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- v[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- v[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- v[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- v[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
- s[4] = _mm_packs_epi32(v[8], v[9]);
- s[5] = _mm_packs_epi32(v[10], v[11]);
- s[6] = _mm_packs_epi32(v[12], v[13]);
- s[7] = _mm_packs_epi32(v[14], v[15]);
- //
-
- s[8] = _mm_packs_epi32(u[16], u[17]);
- s[9] = _mm_packs_epi32(u[18], u[19]);
- s[10] = _mm_packs_epi32(u[20], u[21]);
- s[11] = _mm_packs_epi32(u[22], u[23]);
- s[12] = _mm_packs_epi32(u[24], u[25]);
- s[13] = _mm_packs_epi32(u[26], u[27]);
- s[14] = _mm_packs_epi32(u[28], u[29]);
- s[15] = _mm_packs_epi32(u[30], u[31]);
-
- // stage 2
- u[0] = _mm_unpacklo_epi16(s[8], s[9]);
- u[1] = _mm_unpackhi_epi16(s[8], s[9]);
- u[2] = _mm_unpacklo_epi16(s[10], s[11]);
- u[3] = _mm_unpackhi_epi16(s[10], s[11]);
- u[4] = _mm_unpacklo_epi16(s[12], s[13]);
- u[5] = _mm_unpackhi_epi16(s[12], s[13]);
- u[6] = _mm_unpacklo_epi16(s[14], s[15]);
- u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
- v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
- v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
-
- u[0] = _mm_add_epi32(v[0], v[8]);
- u[1] = _mm_add_epi32(v[1], v[9]);
- u[2] = _mm_add_epi32(v[2], v[10]);
- u[3] = _mm_add_epi32(v[3], v[11]);
- u[4] = _mm_add_epi32(v[4], v[12]);
- u[5] = _mm_add_epi32(v[5], v[13]);
- u[6] = _mm_add_epi32(v[6], v[14]);
- u[7] = _mm_add_epi32(v[7], v[15]);
- u[8] = _mm_sub_epi32(v[0], v[8]);
- u[9] = _mm_sub_epi32(v[1], v[9]);
- u[10] = _mm_sub_epi32(v[2], v[10]);
- u[11] = _mm_sub_epi32(v[3], v[11]);
- u[12] = _mm_sub_epi32(v[4], v[12]);
- u[13] = _mm_sub_epi32(v[5], v[13]);
- u[14] = _mm_sub_epi32(v[6], v[14]);
- u[15] = _mm_sub_epi32(v[7], v[15]);
-
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
- v[8] = _mm_add_epi32(u[0], u[4]);
- v[9] = _mm_add_epi32(u[1], u[5]);
- v[10] = _mm_add_epi32(u[2], u[6]);
- v[11] = _mm_add_epi32(u[3], u[7]);
- v[12] = _mm_sub_epi32(u[0], u[4]);
- v[13] = _mm_sub_epi32(u[1], u[5]);
- v[14] = _mm_sub_epi32(u[2], u[6]);
- v[15] = _mm_sub_epi32(u[3], u[7]);
-
- v[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
- v[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- v[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- v[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- v[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- v[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- v[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- v[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- v[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
- s[8] = _mm_packs_epi32(v[8], v[9]);
- s[9] = _mm_packs_epi32(v[10], v[11]);
- s[10] = _mm_packs_epi32(v[12], v[13]);
- s[11] = _mm_packs_epi32(v[14], v[15]);
-
- x[12] = _mm_packs_epi32(u[8], u[9]);
- x[13] = _mm_packs_epi32(u[10], u[11]);
- x[14] = _mm_packs_epi32(u[12], u[13]);
- x[15] = _mm_packs_epi32(u[14], u[15]);
-
- // stage 3
- u[0] = _mm_unpacklo_epi16(s[4], s[5]);
- u[1] = _mm_unpackhi_epi16(s[4], s[5]);
- u[2] = _mm_unpacklo_epi16(s[6], s[7]);
- u[3] = _mm_unpackhi_epi16(s[6], s[7]);
- u[4] = _mm_unpacklo_epi16(x[12], x[13]);
- u[5] = _mm_unpackhi_epi16(x[12], x[13]);
- u[6] = _mm_unpacklo_epi16(x[14], x[15]);
- u[7] = _mm_unpackhi_epi16(x[14], x[15]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
- v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
- v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
-
- u[0] = _mm_add_epi32(v[0], v[4]);
- u[1] = _mm_add_epi32(v[1], v[5]);
- u[2] = _mm_add_epi32(v[2], v[6]);
- u[3] = _mm_add_epi32(v[3], v[7]);
- u[4] = _mm_sub_epi32(v[0], v[4]);
- u[5] = _mm_sub_epi32(v[1], v[5]);
- u[6] = _mm_sub_epi32(v[2], v[6]);
- u[7] = _mm_sub_epi32(v[3], v[7]);
- u[8] = _mm_add_epi32(v[8], v[12]);
- u[9] = _mm_add_epi32(v[9], v[13]);
- u[10] = _mm_add_epi32(v[10], v[14]);
- u[11] = _mm_add_epi32(v[11], v[15]);
- u[12] = _mm_sub_epi32(v[8], v[12]);
- u[13] = _mm_sub_epi32(v[9], v[13]);
- u[14] = _mm_sub_epi32(v[10], v[14]);
- u[15] = _mm_sub_epi32(v[11], v[15]);
-
- u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- s[4] = _mm_packs_epi32(v[0], v[1]);
- s[5] = _mm_packs_epi32(v[2], v[3]);
- s[6] = _mm_packs_epi32(v[4], v[5]);
- s[7] = _mm_packs_epi32(v[6], v[7]);
-
- s[12] = _mm_packs_epi32(v[8], v[9]);
- s[13] = _mm_packs_epi32(v[10], v[11]);
- s[14] = _mm_packs_epi32(v[12], v[13]);
- s[15] = _mm_packs_epi32(v[14], v[15]);
-
- // stage 4
- u[0] = _mm_unpacklo_epi16(s[2], s[3]);
- u[1] = _mm_unpackhi_epi16(s[2], s[3]);
- u[2] = _mm_unpacklo_epi16(s[6], s[7]);
- u[3] = _mm_unpackhi_epi16(s[6], s[7]);
- u[4] = _mm_unpacklo_epi16(s[10], s[11]);
- u[5] = _mm_unpackhi_epi16(s[10], s[11]);
- u[6] = _mm_unpacklo_epi16(s[14], s[15]);
- u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
- v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
- v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
- v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
- v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- in[0] = s[0];
- in[1] = _mm_sub_epi16(kZero, s[8]);
- in[2] = s[12];
- in[3] = _mm_sub_epi16(kZero, s[4]);
- in[4] = _mm_packs_epi32(v[4], v[5]);
- in[5] = _mm_packs_epi32(v[12], v[13]);
- in[6] = _mm_packs_epi32(v[8], v[9]);
- in[7] = _mm_packs_epi32(v[0], v[1]);
- in[8] = _mm_packs_epi32(v[2], v[3]);
- in[9] = _mm_packs_epi32(v[10], v[11]);
- in[10] = _mm_packs_epi32(v[14], v[15]);
- in[11] = _mm_packs_epi32(v[6], v[7]);
- in[12] = s[5];
- in[13] = _mm_sub_epi16(kZero, s[13]);
- in[14] = s[9];
- in[15] = _mm_sub_epi16(kZero, s[1]);
-}
-
-static void fdct16_sse2(__m128i *in0, __m128i *in1) {
- fdct16_8col(in0);
- fdct16_8col(in1);
- array_transpose_16x16(in0, in1);
-}
-
-static void fadst16_sse2(__m128i *in0, __m128i *in1) {
- fadst16_8col(in0);
- fadst16_8col(in1);
- array_transpose_16x16(in0, in1);
-}
-
-#if CONFIG_EXT_TX
-static void fidtx16_sse2(__m128i *in0, __m128i *in1) {
- idtx16_8col(in0);
- idtx16_8col(in1);
- array_transpose_16x16(in0, in1);
-}
-#endif // CONFIG_EXT_TX
-
-void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
- TxfmParam *txfm_param) {
- __m128i in0[16], in1[16];
- const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
- assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif
-
- switch (tx_type) {
- case DCT_DCT:
- load_buffer_16x16(input, in0, in1, stride, 0, 0);
- fdct16_sse2(in0, in1);
- right_shift_16x16(in0, in1);
- fdct16_sse2(in0, in1);
- write_buffer_16x16(output, in0, in1, 16);
- break;
- case ADST_DCT:
- load_buffer_16x16(input, in0, in1, stride, 0, 0);
- fadst16_sse2(in0, in1);
- right_shift_16x16(in0, in1);
- fdct16_sse2(in0, in1);
- write_buffer_16x16(output, in0, in1, 16);
- break;
- case DCT_ADST:
- load_buffer_16x16(input, in0, in1, stride, 0, 0);
- fdct16_sse2(in0, in1);
- right_shift_16x16(in0, in1);
- fadst16_sse2(in0, in1);
- write_buffer_16x16(output, in0, in1, 16);
- break;
- case ADST_ADST:
- load_buffer_16x16(input, in0, in1, stride, 0, 0);
- fadst16_sse2(in0, in1);
- right_shift_16x16(in0, in1);
- fadst16_sse2(in0, in1);
- write_buffer_16x16(output, in0, in1, 16);
- break;
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- load_buffer_16x16(input, in0, in1, stride, 1, 0);
- fadst16_sse2(in0, in1);
- right_shift_16x16(in0, in1);
- fdct16_sse2(in0, in1);
- write_buffer_16x16(output, in0, in1, 16);
- break;
- case DCT_FLIPADST:
- load_buffer_16x16(input, in0, in1, stride, 0, 1);
- fdct16_sse2(in0, in1);
- right_shift_16x16(in0, in1);
- fadst16_sse2(in0, in1);
- write_buffer_16x16(output, in0, in1, 16);
- break;
- case FLIPADST_FLIPADST:
- load_buffer_16x16(input, in0, in1, stride, 1, 1);
- fadst16_sse2(in0, in1);
- right_shift_16x16(in0, in1);
- fadst16_sse2(in0, in1);
- write_buffer_16x16(output, in0, in1, 16);
- break;
- case ADST_FLIPADST:
- load_buffer_16x16(input, in0, in1, stride, 0, 1);
- fadst16_sse2(in0, in1);
- right_shift_16x16(in0, in1);
- fadst16_sse2(in0, in1);
- write_buffer_16x16(output, in0, in1, 16);
- break;
- case FLIPADST_ADST:
- load_buffer_16x16(input, in0, in1, stride, 1, 0);
- fadst16_sse2(in0, in1);
- right_shift_16x16(in0, in1);
- fadst16_sse2(in0, in1);
- write_buffer_16x16(output, in0, in1, 16);
- break;
- case IDTX:
- load_buffer_16x16(input, in0, in1, stride, 0, 0);
- fidtx16_sse2(in0, in1);
- right_shift_16x16(in0, in1);
- fidtx16_sse2(in0, in1);
- write_buffer_16x16(output, in0, in1, 16);
- break;
- case V_DCT:
- load_buffer_16x16(input, in0, in1, stride, 0, 0);
- fdct16_sse2(in0, in1);
- right_shift_16x16(in0, in1);
- fidtx16_sse2(in0, in1);
- write_buffer_16x16(output, in0, in1, 16);
- break;
- case H_DCT:
- load_buffer_16x16(input, in0, in1, stride, 0, 0);
- fidtx16_sse2(in0, in1);
- right_shift_16x16(in0, in1);
- fdct16_sse2(in0, in1);
- write_buffer_16x16(output, in0, in1, 16);
- break;
- case V_ADST:
- load_buffer_16x16(input, in0, in1, stride, 0, 0);
- fadst16_sse2(in0, in1);
- right_shift_16x16(in0, in1);
- fidtx16_sse2(in0, in1);
- write_buffer_16x16(output, in0, in1, 16);
- break;
- case H_ADST:
- load_buffer_16x16(input, in0, in1, stride, 0, 0);
- fidtx16_sse2(in0, in1);
- right_shift_16x16(in0, in1);
- fadst16_sse2(in0, in1);
- write_buffer_16x16(output, in0, in1, 16);
- break;
- case V_FLIPADST:
- load_buffer_16x16(input, in0, in1, stride, 1, 0);
- fadst16_sse2(in0, in1);
- right_shift_16x16(in0, in1);
- fidtx16_sse2(in0, in1);
- write_buffer_16x16(output, in0, in1, 16);
- break;
- case H_FLIPADST:
- load_buffer_16x16(input, in0, in1, stride, 0, 1);
- fidtx16_sse2(in0, in1);
- right_shift_16x16(in0, in1);
- fadst16_sse2(in0, in1);
- write_buffer_16x16(output, in0, in1, 16);
- break;
-#endif // CONFIG_EXT_TX
- default: assert(0); break;
- }
-}
-
-static INLINE void prepare_4x8_row_first(__m128i *in) {
- in[0] = _mm_unpacklo_epi64(in[0], in[2]);
- in[1] = _mm_unpacklo_epi64(in[1], in[3]);
- transpose_4x4(in);
- in[4] = _mm_unpacklo_epi64(in[4], in[6]);
- in[5] = _mm_unpacklo_epi64(in[5], in[7]);
- transpose_4x4(in + 4);
-}
-
-// Load input into the left-hand half of in (ie, into lanes 0..3 of
-// each element of in). The right hand half (lanes 4..7) should be
-// treated as being filled with "don't care" values.
-static INLINE void load_buffer_4x8(const int16_t *input, __m128i *in,
- int stride, int flipud, int fliplr) {
- const int shift = 2;
- if (!flipud) {
- in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
- in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
- in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
- in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
- in[4] = _mm_loadl_epi64((const __m128i *)(input + 4 * stride));
- in[5] = _mm_loadl_epi64((const __m128i *)(input + 5 * stride));
- in[6] = _mm_loadl_epi64((const __m128i *)(input + 6 * stride));
- in[7] = _mm_loadl_epi64((const __m128i *)(input + 7 * stride));
- } else {
- in[0] = _mm_loadl_epi64((const __m128i *)(input + 7 * stride));
- in[1] = _mm_loadl_epi64((const __m128i *)(input + 6 * stride));
- in[2] = _mm_loadl_epi64((const __m128i *)(input + 5 * stride));
- in[3] = _mm_loadl_epi64((const __m128i *)(input + 4 * stride));
- in[4] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
- in[5] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
- in[6] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
- in[7] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
- }
-
- if (fliplr) {
- in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
- in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
- in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
- in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
- in[4] = _mm_shufflelo_epi16(in[4], 0x1b);
- in[5] = _mm_shufflelo_epi16(in[5], 0x1b);
- in[6] = _mm_shufflelo_epi16(in[6], 0x1b);
- in[7] = _mm_shufflelo_epi16(in[7], 0x1b);
- }
-
- in[0] = _mm_slli_epi16(in[0], shift);
- in[1] = _mm_slli_epi16(in[1], shift);
- in[2] = _mm_slli_epi16(in[2], shift);
- in[3] = _mm_slli_epi16(in[3], shift);
- in[4] = _mm_slli_epi16(in[4], shift);
- in[5] = _mm_slli_epi16(in[5], shift);
- in[6] = _mm_slli_epi16(in[6], shift);
- in[7] = _mm_slli_epi16(in[7], shift);
-
- scale_sqrt2_8x4(in);
- scale_sqrt2_8x4(in + 4);
- prepare_4x8_row_first(in);
-}
-
-static INLINE void write_buffer_4x8(tran_low_t *output, __m128i *res) {
- __m128i in01, in23, in45, in67, sign01, sign23, sign45, sign67;
- const int shift = 1;
-
- // revert the 8x8 txfm's transpose
- array_transpose_8x8(res, res);
-
- in01 = _mm_unpacklo_epi64(res[0], res[1]);
- in23 = _mm_unpacklo_epi64(res[2], res[3]);
- in45 = _mm_unpacklo_epi64(res[4], res[5]);
- in67 = _mm_unpacklo_epi64(res[6], res[7]);
-
- sign01 = _mm_srai_epi16(in01, 15);
- sign23 = _mm_srai_epi16(in23, 15);
- sign45 = _mm_srai_epi16(in45, 15);
- sign67 = _mm_srai_epi16(in67, 15);
-
- in01 = _mm_sub_epi16(in01, sign01);
- in23 = _mm_sub_epi16(in23, sign23);
- in45 = _mm_sub_epi16(in45, sign45);
- in67 = _mm_sub_epi16(in67, sign67);
-
- in01 = _mm_srai_epi16(in01, shift);
- in23 = _mm_srai_epi16(in23, shift);
- in45 = _mm_srai_epi16(in45, shift);
- in67 = _mm_srai_epi16(in67, shift);
-
- store_output(&in01, (output + 0 * 8));
- store_output(&in23, (output + 1 * 8));
- store_output(&in45, (output + 2 * 8));
- store_output(&in67, (output + 3 * 8));
-}
-
-void av1_fht4x8_sse2(const int16_t *input, tran_low_t *output, int stride,
- TxfmParam *txfm_param) {
- __m128i in[8];
- const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
- assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif
-
- switch (tx_type) {
- case DCT_DCT:
- load_buffer_4x8(input, in, stride, 0, 0);
- fdct4_sse2(in);
- fdct4_sse2(in + 4);
- fdct8_sse2(in);
- break;
- case ADST_DCT:
- load_buffer_4x8(input, in, stride, 0, 0);
- fdct4_sse2(in);
- fdct4_sse2(in + 4);
- fadst8_sse2(in);
- break;
- case DCT_ADST:
- load_buffer_4x8(input, in, stride, 0, 0);
- fadst4_sse2(in);
- fadst4_sse2(in + 4);
- fdct8_sse2(in);
- break;
- case ADST_ADST:
- load_buffer_4x8(input, in, stride, 0, 0);
- fadst4_sse2(in);
- fadst4_sse2(in + 4);
- fadst8_sse2(in);
- break;
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- load_buffer_4x8(input, in, stride, 1, 0);
- fdct4_sse2(in);
- fdct4_sse2(in + 4);
- fadst8_sse2(in);
- break;
- case DCT_FLIPADST:
- load_buffer_4x8(input, in, stride, 0, 1);
- fadst4_sse2(in);
- fadst4_sse2(in + 4);
- fdct8_sse2(in);
- break;
- case FLIPADST_FLIPADST:
- load_buffer_4x8(input, in, stride, 1, 1);
- fadst4_sse2(in);
- fadst4_sse2(in + 4);
- fadst8_sse2(in);
- break;
- case ADST_FLIPADST:
- load_buffer_4x8(input, in, stride, 0, 1);
- fadst4_sse2(in);
- fadst4_sse2(in + 4);
- fadst8_sse2(in);
- break;
- case FLIPADST_ADST:
- load_buffer_4x8(input, in, stride, 1, 0);
- fadst4_sse2(in);
- fadst4_sse2(in + 4);
- fadst8_sse2(in);
- break;
- case IDTX:
- load_buffer_4x8(input, in, stride, 0, 0);
- fidtx4_sse2(in);
- fidtx4_sse2(in + 4);
- fidtx8_sse2(in);
- break;
- case V_DCT:
- load_buffer_4x8(input, in, stride, 0, 0);
- fidtx4_sse2(in);
- fidtx4_sse2(in + 4);
- fdct8_sse2(in);
- break;
- case H_DCT:
- load_buffer_4x8(input, in, stride, 0, 0);
- fdct4_sse2(in);
- fdct4_sse2(in + 4);
- fidtx8_sse2(in);
- break;
- case V_ADST:
- load_buffer_4x8(input, in, stride, 0, 0);
- fidtx4_sse2(in);
- fidtx4_sse2(in + 4);
- fadst8_sse2(in);
- break;
- case H_ADST:
- load_buffer_4x8(input, in, stride, 0, 0);
- fadst4_sse2(in);
- fadst4_sse2(in + 4);
- fidtx8_sse2(in);
- break;
- case V_FLIPADST:
- load_buffer_4x8(input, in, stride, 1, 0);
- fidtx4_sse2(in);
- fidtx4_sse2(in + 4);
- fadst8_sse2(in);
- break;
- case H_FLIPADST:
- load_buffer_4x8(input, in, stride, 0, 1);
- fadst4_sse2(in);
- fadst4_sse2(in + 4);
- fidtx8_sse2(in);
- break;
-#endif
- default: assert(0); break;
- }
- write_buffer_4x8(output, in);
-}
-
-// Load input into the left-hand half of in (ie, into lanes 0..3 of
-// each element of in). The right hand half (lanes 4..7) should be
-// treated as being filled with "don't care" values.
-// The input is split horizontally into two 4x4
-// chunks 'l' and 'r'. Then 'l' is stored in the top-left 4x4
-// block of 'in' and 'r' is stored in the bottom-left block.
-// This is to allow us to reuse 4x4 transforms.
-static INLINE void load_buffer_8x4(const int16_t *input, __m128i *in,
- int stride, int flipud, int fliplr) {
- const int shift = 2;
- if (!flipud) {
- in[0] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
- in[1] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
- in[2] = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
- in[3] = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
- } else {
- in[0] = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
- in[1] = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
- in[2] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
- in[3] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
- }
-
- if (fliplr) {
- in[0] = mm_reverse_epi16(in[0]);
- in[1] = mm_reverse_epi16(in[1]);
- in[2] = mm_reverse_epi16(in[2]);
- in[3] = mm_reverse_epi16(in[3]);
- }
-
- in[0] = _mm_slli_epi16(in[0], shift);
- in[1] = _mm_slli_epi16(in[1], shift);
- in[2] = _mm_slli_epi16(in[2], shift);
- in[3] = _mm_slli_epi16(in[3], shift);
-
- scale_sqrt2_8x4(in);
-
- in[4] = _mm_shuffle_epi32(in[0], 0xe);
- in[5] = _mm_shuffle_epi32(in[1], 0xe);
- in[6] = _mm_shuffle_epi32(in[2], 0xe);
- in[7] = _mm_shuffle_epi32(in[3], 0xe);
-}
-
-static INLINE void write_buffer_8x4(tran_low_t *output, __m128i *res) {
- __m128i out0, out1, out2, out3, sign0, sign1, sign2, sign3;
- const int shift = 1;
- sign0 = _mm_srai_epi16(res[0], 15);
- sign1 = _mm_srai_epi16(res[1], 15);
- sign2 = _mm_srai_epi16(res[2], 15);
- sign3 = _mm_srai_epi16(res[3], 15);
-
- out0 = _mm_sub_epi16(res[0], sign0);
- out1 = _mm_sub_epi16(res[1], sign1);
- out2 = _mm_sub_epi16(res[2], sign2);
- out3 = _mm_sub_epi16(res[3], sign3);
-
- out0 = _mm_srai_epi16(out0, shift);
- out1 = _mm_srai_epi16(out1, shift);
- out2 = _mm_srai_epi16(out2, shift);
- out3 = _mm_srai_epi16(out3, shift);
-
- store_output(&out0, (output + 0 * 8));
- store_output(&out1, (output + 1 * 8));
- store_output(&out2, (output + 2 * 8));
- store_output(&out3, (output + 3 * 8));
-}
-
-void av1_fht8x4_sse2(const int16_t *input, tran_low_t *output, int stride,
- TxfmParam *txfm_param) {
- __m128i in[8];
- const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
- assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif
-
- switch (tx_type) {
- case DCT_DCT:
- load_buffer_8x4(input, in, stride, 0, 0);
- fdct4_sse2(in);
- fdct4_sse2(in + 4);
- fdct8_sse2(in);
- break;
- case ADST_DCT:
- load_buffer_8x4(input, in, stride, 0, 0);
- fadst4_sse2(in);
- fadst4_sse2(in + 4);
- fdct8_sse2(in);
- break;
- case DCT_ADST:
- load_buffer_8x4(input, in, stride, 0, 0);
- fdct4_sse2(in);
- fdct4_sse2(in + 4);
- fadst8_sse2(in);
- break;
- case ADST_ADST:
- load_buffer_8x4(input, in, stride, 0, 0);
- fadst4_sse2(in);
- fadst4_sse2(in + 4);
- fadst8_sse2(in);
- break;
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- load_buffer_8x4(input, in, stride, 1, 0);
- fadst4_sse2(in);
- fadst4_sse2(in + 4);
- fdct8_sse2(in);
- break;
- case DCT_FLIPADST:
- load_buffer_8x4(input, in, stride, 0, 1);
- fdct4_sse2(in);
- fdct4_sse2(in + 4);
- fadst8_sse2(in);
- break;
- case FLIPADST_FLIPADST:
- load_buffer_8x4(input, in, stride, 1, 1);
- fadst4_sse2(in);
- fadst4_sse2(in + 4);
- fadst8_sse2(in);
- break;
- case ADST_FLIPADST:
- load_buffer_8x4(input, in, stride, 0, 1);
- fadst4_sse2(in);
- fadst4_sse2(in + 4);
- fadst8_sse2(in);
- break;
- case FLIPADST_ADST:
- load_buffer_8x4(input, in, stride, 1, 0);
- fadst4_sse2(in);
- fadst4_sse2(in + 4);
- fadst8_sse2(in);
- break;
- case IDTX:
- load_buffer_8x4(input, in, stride, 0, 0);
- fidtx4_sse2(in);
- fidtx4_sse2(in + 4);
- fidtx8_sse2(in);
- break;
- case V_DCT:
- load_buffer_8x4(input, in, stride, 0, 0);
- fdct4_sse2(in);
- fdct4_sse2(in + 4);
- fidtx8_sse2(in);
- break;
- case H_DCT:
- load_buffer_8x4(input, in, stride, 0, 0);
- fidtx4_sse2(in);
- fidtx4_sse2(in + 4);
- fdct8_sse2(in);
- break;
- case V_ADST:
- load_buffer_8x4(input, in, stride, 0, 0);
- fadst4_sse2(in);
- fadst4_sse2(in + 4);
- fidtx8_sse2(in);
- break;
- case H_ADST:
- load_buffer_8x4(input, in, stride, 0, 0);
- fidtx4_sse2(in);
- fidtx4_sse2(in + 4);
- fadst8_sse2(in);
- break;
- case V_FLIPADST:
- load_buffer_8x4(input, in, stride, 1, 0);
- fadst4_sse2(in);
- fadst4_sse2(in + 4);
- fidtx8_sse2(in);
- break;
- case H_FLIPADST:
- load_buffer_8x4(input, in, stride, 0, 1);
- fidtx4_sse2(in);
- fidtx4_sse2(in + 4);
- fadst8_sse2(in);
- break;
-#endif
- default: assert(0); break;
- }
- write_buffer_8x4(output, in);
-}
-
-static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in,
- int stride, int flipud, int fliplr) {
- // Load 2 8x8 blocks
- const int16_t *t = input;
- const int16_t *b = input + 8 * stride;
-
- if (flipud) {
- const int16_t *const tmp = t;
- t = b;
- b = tmp;
- }
-
- load_buffer_8x8(t, in, stride, flipud, fliplr);
- scale_sqrt2_8x8(in);
- load_buffer_8x8(b, in + 8, stride, flipud, fliplr);
- scale_sqrt2_8x8(in + 8);
-}
-
-static INLINE void round_power_of_two_signed(__m128i *x, int n) {
- const __m128i rounding = _mm_set1_epi16((1 << n) >> 1);
- const __m128i sign = _mm_srai_epi16(*x, 15);
- const __m128i res = _mm_add_epi16(_mm_add_epi16(*x, rounding), sign);
- *x = _mm_srai_epi16(res, n);
-}
-
-static void row_8x16_rounding(__m128i *in, int bits) {
- int i;
- for (i = 0; i < 16; i++) {
- round_power_of_two_signed(&in[i], bits);
- }
-}
-
-void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride,
- TxfmParam *txfm_param) {
- __m128i in[16];
- const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
- assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif
-
- __m128i *const t = in; // Alias to top 8x8 sub block
- __m128i *const b = in + 8; // Alias to bottom 8x8 sub block
-
- switch (tx_type) {
- case DCT_DCT:
- load_buffer_8x16(input, in, stride, 0, 0);
- array_transpose_8x8(t, t);
- array_transpose_8x8(b, b);
- fdct8_sse2(t);
- fdct8_sse2(b);
- row_8x16_rounding(in, 2);
- fdct16_8col(in);
- break;
- case ADST_DCT:
- load_buffer_8x16(input, in, stride, 0, 0);
- array_transpose_8x8(t, t);
- array_transpose_8x8(b, b);
- fdct8_sse2(t);
- fdct8_sse2(b);
- row_8x16_rounding(in, 2);
- fadst16_8col(in);
- break;
- case DCT_ADST:
- load_buffer_8x16(input, in, stride, 0, 0);
- array_transpose_8x8(t, t);
- array_transpose_8x8(b, b);
- fadst8_sse2(t);
- fadst8_sse2(b);
- row_8x16_rounding(in, 2);
- fdct16_8col(in);
- break;
- case ADST_ADST:
- load_buffer_8x16(input, in, stride, 0, 0);
- array_transpose_8x8(t, t);
- array_transpose_8x8(b, b);
- fadst8_sse2(t);
- fadst8_sse2(b);
- row_8x16_rounding(in, 2);
- fadst16_8col(in);
- break;
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- load_buffer_8x16(input, in, stride, 1, 0);
- array_transpose_8x8(t, t);
- array_transpose_8x8(b, b);
- fdct8_sse2(t);
- fdct8_sse2(b);
- row_8x16_rounding(in, 2);
- fadst16_8col(in);
- break;
- case DCT_FLIPADST:
- load_buffer_8x16(input, in, stride, 0, 1);
- array_transpose_8x8(t, t);
- array_transpose_8x8(b, b);
- fadst8_sse2(t);
- fadst8_sse2(b);
- row_8x16_rounding(in, 2);
- fdct16_8col(in);
- break;
- case FLIPADST_FLIPADST:
- load_buffer_8x16(input, in, stride, 1, 1);
- array_transpose_8x8(t, t);
- array_transpose_8x8(b, b);
- fadst8_sse2(t);
- fadst8_sse2(b);
- row_8x16_rounding(in, 2);
- fadst16_8col(in);
- break;
- case ADST_FLIPADST:
- load_buffer_8x16(input, in, stride, 0, 1);
- array_transpose_8x8(t, t);
- array_transpose_8x8(b, b);
- fadst8_sse2(t);
- fadst8_sse2(b);
- row_8x16_rounding(in, 2);
- fadst16_8col(in);
- break;
- case FLIPADST_ADST:
- load_buffer_8x16(input, in, stride, 1, 0);
- array_transpose_8x8(t, t);
- array_transpose_8x8(b, b);
- fadst8_sse2(t);
- fadst8_sse2(b);
- row_8x16_rounding(in, 2);
- fadst16_8col(in);
- break;
- case IDTX:
- load_buffer_8x16(input, in, stride, 0, 0);
- array_transpose_8x8(t, t);
- array_transpose_8x8(b, b);
- fidtx8_sse2(t);
- fidtx8_sse2(b);
- row_8x16_rounding(in, 2);
- idtx16_8col(in);
- break;
- case V_DCT:
- load_buffer_8x16(input, in, stride, 0, 0);
- array_transpose_8x8(t, t);
- array_transpose_8x8(b, b);
- fidtx8_sse2(t);
- fidtx8_sse2(b);
- row_8x16_rounding(in, 2);
- fdct16_8col(in);
- break;
- case H_DCT:
- load_buffer_8x16(input, in, stride, 0, 0);
- array_transpose_8x8(t, t);
- array_transpose_8x8(b, b);
- fdct8_sse2(t);
- fdct8_sse2(b);
- row_8x16_rounding(in, 2);
- idtx16_8col(in);
- break;
- case V_ADST:
- load_buffer_8x16(input, in, stride, 0, 0);
- array_transpose_8x8(t, t);
- array_transpose_8x8(b, b);
- fidtx8_sse2(t);
- fidtx8_sse2(b);
- row_8x16_rounding(in, 2);
- fadst16_8col(in);
- break;
- case H_ADST:
- load_buffer_8x16(input, in, stride, 0, 0);
- array_transpose_8x8(t, t);
- array_transpose_8x8(b, b);
- fadst8_sse2(t);
- fadst8_sse2(b);
- row_8x16_rounding(in, 2);
- idtx16_8col(in);
- break;
- case V_FLIPADST:
- load_buffer_8x16(input, in, stride, 1, 0);
- array_transpose_8x8(t, t);
- array_transpose_8x8(b, b);
- fidtx8_sse2(t);
- fidtx8_sse2(b);
- row_8x16_rounding(in, 2);
- fadst16_8col(in);
- break;
- case H_FLIPADST:
- load_buffer_8x16(input, in, stride, 0, 1);
- array_transpose_8x8(t, t);
- array_transpose_8x8(b, b);
- fadst8_sse2(t);
- fadst8_sse2(b);
- row_8x16_rounding(in, 2);
- idtx16_8col(in);
- break;
-#endif
- default: assert(0); break;
- }
- write_buffer_8x8(output, t, 8);
- write_buffer_8x8(output + 64, b, 8);
-}
-
-static INLINE void load_buffer_16x8(const int16_t *input, __m128i *in,
- int stride, int flipud, int fliplr) {
- // Load 2 8x8 blocks
- const int16_t *l = input;
- const int16_t *r = input + 8;
-
- if (fliplr) {
- const int16_t *const tmp = l;
- l = r;
- r = tmp;
- }
-
- // load first 8 columns
- load_buffer_8x8(l, in, stride, flipud, fliplr);
- scale_sqrt2_8x8(in);
- load_buffer_8x8(r, in + 8, stride, flipud, fliplr);
- scale_sqrt2_8x8(in + 8);
-}
-
-#define col_16x8_rounding row_8x16_rounding
-
-void av1_fht16x8_sse2(const int16_t *input, tran_low_t *output, int stride,
- TxfmParam *txfm_param) {
- __m128i in[16];
- const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
- assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif
-
- __m128i *const l = in; // Alias to left 8x8 sub block
- __m128i *const r = in + 8; // Alias to right 8x8 sub block, which we store
- // in the second half of the array
-
- switch (tx_type) {
- case DCT_DCT:
- load_buffer_16x8(input, in, stride, 0, 0);
- fdct8_sse2(l);
- fdct8_sse2(r);
- col_16x8_rounding(in, 2);
- fdct16_8col(in);
- break;
- case ADST_DCT:
- load_buffer_16x8(input, in, stride, 0, 0);
- fadst8_sse2(l);
- fadst8_sse2(r);
- col_16x8_rounding(in, 2);
- fdct16_8col(in);
- break;
- case DCT_ADST:
- load_buffer_16x8(input, in, stride, 0, 0);
- fdct8_sse2(l);
- fdct8_sse2(r);
- col_16x8_rounding(in, 2);
- fadst16_8col(in);
- break;
- case ADST_ADST:
- load_buffer_16x8(input, in, stride, 0, 0);
- fadst8_sse2(l);
- fadst8_sse2(r);
- col_16x8_rounding(in, 2);
- fadst16_8col(in);
- break;
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- load_buffer_16x8(input, in, stride, 1, 0);
- fadst8_sse2(l);
- fadst8_sse2(r);
- col_16x8_rounding(in, 2);
- fdct16_8col(in);
- break;
- case DCT_FLIPADST:
- load_buffer_16x8(input, in, stride, 0, 1);
- fdct8_sse2(l);
- fdct8_sse2(r);
- col_16x8_rounding(in, 2);
- fadst16_8col(in);
- break;
- case FLIPADST_FLIPADST:
- load_buffer_16x8(input, in, stride, 1, 1);
- fadst8_sse2(l);
- fadst8_sse2(r);
- col_16x8_rounding(in, 2);
- fadst16_8col(in);
- break;
- case ADST_FLIPADST:
- load_buffer_16x8(input, in, stride, 0, 1);
- fadst8_sse2(l);
- fadst8_sse2(r);
- col_16x8_rounding(in, 2);
- fadst16_8col(in);
- break;
- case FLIPADST_ADST:
- load_buffer_16x8(input, in, stride, 1, 0);
- fadst8_sse2(l);
- fadst8_sse2(r);
- col_16x8_rounding(in, 2);
- fadst16_8col(in);
- break;
- case IDTX:
- load_buffer_16x8(input, in, stride, 0, 0);
- fidtx8_sse2(l);
- fidtx8_sse2(r);
- col_16x8_rounding(in, 2);
- idtx16_8col(in);
- break;
- case V_DCT:
- load_buffer_16x8(input, in, stride, 0, 0);
- fdct8_sse2(l);
- fdct8_sse2(r);
- col_16x8_rounding(in, 2);
- idtx16_8col(in);
- break;
- case H_DCT:
- load_buffer_16x8(input, in, stride, 0, 0);
- fidtx8_sse2(l);
- fidtx8_sse2(r);
- col_16x8_rounding(in, 2);
- fdct16_8col(in);
- break;
- case V_ADST:
- load_buffer_16x8(input, in, stride, 0, 0);
- fadst8_sse2(l);
- fadst8_sse2(r);
- col_16x8_rounding(in, 2);
- idtx16_8col(in);
- break;
- case H_ADST:
- load_buffer_16x8(input, in, stride, 0, 0);
- fidtx8_sse2(l);
- fidtx8_sse2(r);
- col_16x8_rounding(in, 2);
- fadst16_8col(in);
- break;
- case V_FLIPADST:
- load_buffer_16x8(input, in, stride, 1, 0);
- fadst8_sse2(l);
- fadst8_sse2(r);
- col_16x8_rounding(in, 2);
- idtx16_8col(in);
- break;
- case H_FLIPADST:
- load_buffer_16x8(input, in, stride, 0, 1);
- fidtx8_sse2(l);
- fidtx8_sse2(r);
- col_16x8_rounding(in, 2);
- fadst16_8col(in);
- break;
-#endif
- default: assert(0); break;
- }
- array_transpose_8x8(l, l);
- array_transpose_8x8(r, r);
- write_buffer_8x8(output, l, 16);
- write_buffer_8x8(output + 8, r, 16);
-}
-
-// Note: The 16-column 32-element transforms expect their input to be
-// split up into a 2x2 grid of 8x16 blocks
-static INLINE void fdct32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
- __m128i *br) {
- fdct32_8col(tl, bl);
- fdct32_8col(tr, br);
- array_transpose_16x16(tl, tr);
- array_transpose_16x16(bl, br);
-}
-
-#if CONFIG_EXT_TX
-static INLINE void fidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
- __m128i *br) {
- int i;
- for (i = 0; i < 16; ++i) {
- tl[i] = _mm_slli_epi16(tl[i], 2);
- tr[i] = _mm_slli_epi16(tr[i], 2);
- bl[i] = _mm_slli_epi16(bl[i], 2);
- br[i] = _mm_slli_epi16(br[i], 2);
- }
- array_transpose_16x16(tl, tr);
- array_transpose_16x16(bl, br);
-}
-#endif
-
-static INLINE void load_buffer_16x32(const int16_t *input, __m128i *intl,
- __m128i *intr, __m128i *inbl,
- __m128i *inbr, int stride, int flipud,
- int fliplr) {
- int i;
- if (flipud) {
- input = input + 31 * stride;
- stride = -stride;
- }
-
- for (i = 0; i < 16; ++i) {
- intl[i] = _mm_slli_epi16(
- _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2);
- intr[i] = _mm_slli_epi16(
- _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2);
- inbl[i] = _mm_slli_epi16(
- _mm_load_si128((const __m128i *)(input + (i + 16) * stride + 0)), 2);
- inbr[i] = _mm_slli_epi16(
- _mm_load_si128((const __m128i *)(input + (i + 16) * stride + 8)), 2);
- }
-
- if (fliplr) {
- __m128i tmp;
- for (i = 0; i < 16; ++i) {
- tmp = intl[i];
- intl[i] = mm_reverse_epi16(intr[i]);
- intr[i] = mm_reverse_epi16(tmp);
- tmp = inbl[i];
- inbl[i] = mm_reverse_epi16(inbr[i]);
- inbr[i] = mm_reverse_epi16(tmp);
- }
- }
-
- scale_sqrt2_8x16(intl);
- scale_sqrt2_8x16(intr);
- scale_sqrt2_8x16(inbl);
- scale_sqrt2_8x16(inbr);
-}
-
-static INLINE void write_buffer_16x32(tran_low_t *output, __m128i *restl,
- __m128i *restr, __m128i *resbl,
- __m128i *resbr) {
- int i;
- for (i = 0; i < 16; ++i) {
- store_output(&restl[i], output + i * 16 + 0);
- store_output(&restr[i], output + i * 16 + 8);
- store_output(&resbl[i], output + (i + 16) * 16 + 0);
- store_output(&resbr[i], output + (i + 16) * 16 + 8);
- }
-}
-
-static INLINE void round_signed_8x8(__m128i *in, const int bit) {
- const __m128i rounding = _mm_set1_epi16((1 << bit) >> 1);
- __m128i sign0 = _mm_srai_epi16(in[0], 15);
- __m128i sign1 = _mm_srai_epi16(in[1], 15);
- __m128i sign2 = _mm_srai_epi16(in[2], 15);
- __m128i sign3 = _mm_srai_epi16(in[3], 15);
- __m128i sign4 = _mm_srai_epi16(in[4], 15);
- __m128i sign5 = _mm_srai_epi16(in[5], 15);
- __m128i sign6 = _mm_srai_epi16(in[6], 15);
- __m128i sign7 = _mm_srai_epi16(in[7], 15);
-
- in[0] = _mm_add_epi16(_mm_add_epi16(in[0], rounding), sign0);
- in[1] = _mm_add_epi16(_mm_add_epi16(in[1], rounding), sign1);
- in[2] = _mm_add_epi16(_mm_add_epi16(in[2], rounding), sign2);
- in[3] = _mm_add_epi16(_mm_add_epi16(in[3], rounding), sign3);
- in[4] = _mm_add_epi16(_mm_add_epi16(in[4], rounding), sign4);
- in[5] = _mm_add_epi16(_mm_add_epi16(in[5], rounding), sign5);
- in[6] = _mm_add_epi16(_mm_add_epi16(in[6], rounding), sign6);
- in[7] = _mm_add_epi16(_mm_add_epi16(in[7], rounding), sign7);
-
- in[0] = _mm_srai_epi16(in[0], bit);
- in[1] = _mm_srai_epi16(in[1], bit);
- in[2] = _mm_srai_epi16(in[2], bit);
- in[3] = _mm_srai_epi16(in[3], bit);
- in[4] = _mm_srai_epi16(in[4], bit);
- in[5] = _mm_srai_epi16(in[5], bit);
- in[6] = _mm_srai_epi16(in[6], bit);
- in[7] = _mm_srai_epi16(in[7], bit);
-}
-
-static INLINE void round_signed_16x16(__m128i *in0, __m128i *in1) {
- const int bit = 4;
- round_signed_8x8(in0, bit);
- round_signed_8x8(in0 + 8, bit);
- round_signed_8x8(in1, bit);
- round_signed_8x8(in1 + 8, bit);
-}
-
-// Note:
-// suffix "t" indicates the transpose operation comes first
-static void fdct16t_sse2(__m128i *in0, __m128i *in1) {
- array_transpose_16x16(in0, in1);
- fdct16_8col(in0);
- fdct16_8col(in1);
-}
-
-static void fadst16t_sse2(__m128i *in0, __m128i *in1) {
- array_transpose_16x16(in0, in1);
- fadst16_8col(in0);
- fadst16_8col(in1);
-}
-
-static INLINE void fdct32t_16col(__m128i *tl, __m128i *tr, __m128i *bl,
- __m128i *br) {
- array_transpose_16x16(tl, tr);
- array_transpose_16x16(bl, br);
- fdct32_8col(tl, bl);
- fdct32_8col(tr, br);
-}
-
-typedef enum transpose_indicator_ {
- transpose,
- no_transpose,
-} transpose_indicator;
-
-static INLINE void fhalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
- __m128i *br, transpose_indicator t) {
- __m128i tmpl[16], tmpr[16];
- int i;
-
- // Copy the bottom half of the input to temporary storage
- for (i = 0; i < 16; ++i) {
- tmpl[i] = bl[i];
- tmpr[i] = br[i];
- }
-
- // Generate the bottom half of the output
- for (i = 0; i < 16; ++i) {
- bl[i] = _mm_slli_epi16(tl[i], 2);
- br[i] = _mm_slli_epi16(tr[i], 2);
- }
- array_transpose_16x16(bl, br);
-
- // Copy the temporary storage back to the top half of the input
- for (i = 0; i < 16; ++i) {
- tl[i] = tmpl[i];
- tr[i] = tmpr[i];
- }
-
- // Generate the top half of the output
- scale_sqrt2_8x16(tl);
- scale_sqrt2_8x16(tr);
- if (t == transpose)
- fdct16t_sse2(tl, tr);
- else
- fdct16_sse2(tl, tr);
-}
-
-// Note on data layout, for both this and the 32x16 transforms:
-// So that we can reuse the 16-element transforms easily,
-// we want to split the input into 8x16 blocks.
-// For 16x32, this means the input is a 2x2 grid of such blocks.
-// For 32x16, it means the input is a 4x1 grid.
-void av1_fht16x32_sse2(const int16_t *input, tran_low_t *output, int stride,
- TxfmParam *txfm_param) {
- __m128i intl[16], intr[16], inbl[16], inbr[16];
- const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
- assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif
-
- switch (tx_type) {
- case DCT_DCT:
- load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
- fdct16t_sse2(intl, intr);
- fdct16t_sse2(inbl, inbr);
- round_signed_16x16(intl, intr);
- round_signed_16x16(inbl, inbr);
- fdct32t_16col(intl, intr, inbl, inbr);
- break;
- case ADST_DCT:
- load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
- fdct16t_sse2(intl, intr);
- fdct16t_sse2(inbl, inbr);
- round_signed_16x16(intl, intr);
- round_signed_16x16(inbl, inbr);
- fhalfright32_16col(intl, intr, inbl, inbr, transpose);
- break;
- case DCT_ADST:
- load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
- fadst16t_sse2(intl, intr);
- fadst16t_sse2(inbl, inbr);
- round_signed_16x16(intl, intr);
- round_signed_16x16(inbl, inbr);
- fdct32t_16col(intl, intr, inbl, inbr);
- break;
- case ADST_ADST:
- load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
- fadst16t_sse2(intl, intr);
- fadst16t_sse2(inbl, inbr);
- round_signed_16x16(intl, intr);
- round_signed_16x16(inbl, inbr);
- fhalfright32_16col(intl, intr, inbl, inbr, transpose);
- break;
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
- fdct16t_sse2(intl, intr);
- fdct16t_sse2(inbl, inbr);
- round_signed_16x16(intl, intr);
- round_signed_16x16(inbl, inbr);
- fhalfright32_16col(intl, intr, inbl, inbr, transpose);
- break;
- case DCT_FLIPADST:
- load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
- fadst16t_sse2(intl, intr);
- fadst16t_sse2(inbl, inbr);
- round_signed_16x16(intl, intr);
- round_signed_16x16(inbl, inbr);
- fdct32t_16col(intl, intr, inbl, inbr);
- break;
- case FLIPADST_FLIPADST:
- load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 1);
- fadst16t_sse2(intl, intr);
- fadst16t_sse2(inbl, inbr);
- round_signed_16x16(intl, intr);
- round_signed_16x16(inbl, inbr);
- fhalfright32_16col(intl, intr, inbl, inbr, transpose);
- break;
- case ADST_FLIPADST:
- load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
- fadst16t_sse2(intl, intr);
- fadst16t_sse2(inbl, inbr);
- round_signed_16x16(intl, intr);
- round_signed_16x16(inbl, inbr);
- fhalfright32_16col(intl, intr, inbl, inbr, transpose);
- break;
- case FLIPADST_ADST:
- load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
- fadst16t_sse2(intl, intr);
- fadst16t_sse2(inbl, inbr);
- round_signed_16x16(intl, intr);
- round_signed_16x16(inbl, inbr);
- fhalfright32_16col(intl, intr, inbl, inbr, transpose);
- break;
- case IDTX:
- load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
- fidtx16_sse2(intl, intr);
- fidtx16_sse2(inbl, inbr);
- round_signed_16x16(intl, intr);
- round_signed_16x16(inbl, inbr);
- fidtx32_16col(intl, intr, inbl, inbr);
- break;
- case V_DCT:
- load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
- fidtx16_sse2(intl, intr);
- fidtx16_sse2(inbl, inbr);
- round_signed_16x16(intl, intr);
- round_signed_16x16(inbl, inbr);
- fdct32t_16col(intl, intr, inbl, inbr);
- break;
- case H_DCT:
- load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
- fdct16t_sse2(intl, intr);
- fdct16t_sse2(inbl, inbr);
- round_signed_16x16(intl, intr);
- round_signed_16x16(inbl, inbr);
- fidtx32_16col(intl, intr, inbl, inbr);
- break;
- case V_ADST:
- load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
- fidtx16_sse2(intl, intr);
- fidtx16_sse2(inbl, inbr);
- round_signed_16x16(intl, intr);
- round_signed_16x16(inbl, inbr);
- fhalfright32_16col(intl, intr, inbl, inbr, transpose);
- break;
- case H_ADST:
- load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
- fadst16t_sse2(intl, intr);
- fadst16t_sse2(inbl, inbr);
- round_signed_16x16(intl, intr);
- round_signed_16x16(inbl, inbr);
- fidtx32_16col(intl, intr, inbl, inbr);
- break;
- case V_FLIPADST:
- load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
- fidtx16_sse2(intl, intr);
- fidtx16_sse2(inbl, inbr);
- round_signed_16x16(intl, intr);
- round_signed_16x16(inbl, inbr);
- fhalfright32_16col(intl, intr, inbl, inbr, transpose);
- break;
- case H_FLIPADST:
- load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
- fadst16t_sse2(intl, intr);
- fadst16t_sse2(inbl, inbr);
- round_signed_16x16(intl, intr);
- round_signed_16x16(inbl, inbr);
- fidtx32_16col(intl, intr, inbl, inbr);
- break;
-#endif
- default: assert(0); break;
- }
- write_buffer_16x32(output, intl, intr, inbl, inbr);
-}
-
-static INLINE void load_buffer_32x16(const int16_t *input, __m128i *in0,
- __m128i *in1, __m128i *in2, __m128i *in3,
- int stride, int flipud, int fliplr) {
- int i;
- if (flipud) {
- input += 15 * stride;
- stride = -stride;
- }
-
- for (i = 0; i < 16; ++i) {
- in0[i] = _mm_slli_epi16(
- _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2);
- in1[i] = _mm_slli_epi16(
- _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2);
- in2[i] = _mm_slli_epi16(
- _mm_load_si128((const __m128i *)(input + i * stride + 16)), 2);
- in3[i] = _mm_slli_epi16(
- _mm_load_si128((const __m128i *)(input + i * stride + 24)), 2);
- }
-
- if (fliplr) {
- for (i = 0; i < 16; ++i) {
- __m128i tmp1 = in0[i];
- __m128i tmp2 = in1[i];
- in0[i] = mm_reverse_epi16(in3[i]);
- in1[i] = mm_reverse_epi16(in2[i]);
- in2[i] = mm_reverse_epi16(tmp2);
- in3[i] = mm_reverse_epi16(tmp1);
- }
- }
-
- scale_sqrt2_8x16(in0);
- scale_sqrt2_8x16(in1);
- scale_sqrt2_8x16(in2);
- scale_sqrt2_8x16(in3);
-}
-
-static INLINE void write_buffer_32x16(tran_low_t *output, __m128i *res0,
- __m128i *res1, __m128i *res2,
- __m128i *res3) {
- int i;
- for (i = 0; i < 16; ++i) {
- store_output(&res0[i], output + i * 32 + 0);
- store_output(&res1[i], output + i * 32 + 8);
- store_output(&res2[i], output + i * 32 + 16);
- store_output(&res3[i], output + i * 32 + 24);
- }
-}
-
-void av1_fht32x16_sse2(const int16_t *input, tran_low_t *output, int stride,
- TxfmParam *txfm_param) {
- __m128i in0[16], in1[16], in2[16], in3[16];
- const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
- assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif
-
- load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
- switch (tx_type) {
- case DCT_DCT:
- fdct16_sse2(in0, in1);
- fdct16_sse2(in2, in3);
- round_signed_16x16(in0, in1);
- round_signed_16x16(in2, in3);
- fdct32_16col(in0, in1, in2, in3);
- break;
- case ADST_DCT:
- fadst16_sse2(in0, in1);
- fadst16_sse2(in2, in3);
- round_signed_16x16(in0, in1);
- round_signed_16x16(in2, in3);
- fdct32_16col(in0, in1, in2, in3);
- break;
- case DCT_ADST:
- fdct16_sse2(in0, in1);
- fdct16_sse2(in2, in3);
- round_signed_16x16(in0, in1);
- round_signed_16x16(in2, in3);
- fhalfright32_16col(in0, in1, in2, in3, no_transpose);
- break;
- case ADST_ADST:
- fadst16_sse2(in0, in1);
- fadst16_sse2(in2, in3);
- round_signed_16x16(in0, in1);
- round_signed_16x16(in2, in3);
- fhalfright32_16col(in0, in1, in2, in3, no_transpose);
- break;
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
- fadst16_sse2(in0, in1);
- fadst16_sse2(in2, in3);
- round_signed_16x16(in0, in1);
- round_signed_16x16(in2, in3);
- fdct32_16col(in0, in1, in2, in3);
- break;
- case DCT_FLIPADST:
- load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
- fdct16_sse2(in0, in1);
- fdct16_sse2(in2, in3);
- round_signed_16x16(in0, in1);
- round_signed_16x16(in2, in3);
- fhalfright32_16col(in0, in1, in2, in3, no_transpose);
- break;
- case FLIPADST_FLIPADST:
- load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 1);
- fadst16_sse2(in0, in1);
- fadst16_sse2(in2, in3);
- round_signed_16x16(in0, in1);
- round_signed_16x16(in2, in3);
- fhalfright32_16col(in0, in1, in2, in3, no_transpose);
- break;
- case ADST_FLIPADST:
- load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
- fadst16_sse2(in0, in1);
- fadst16_sse2(in2, in3);
- round_signed_16x16(in0, in1);
- round_signed_16x16(in2, in3);
- fhalfright32_16col(in0, in1, in2, in3, no_transpose);
- break;
- case FLIPADST_ADST:
- load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
- fadst16_sse2(in0, in1);
- fadst16_sse2(in2, in3);
- round_signed_16x16(in0, in1);
- round_signed_16x16(in2, in3);
- fhalfright32_16col(in0, in1, in2, in3, no_transpose);
- break;
- case IDTX:
- load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
- fidtx16_sse2(in0, in1);
- fidtx16_sse2(in2, in3);
- round_signed_16x16(in0, in1);
- round_signed_16x16(in2, in3);
- fidtx32_16col(in0, in1, in2, in3);
- break;
- case V_DCT:
- load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
- fdct16_sse2(in0, in1);
- fdct16_sse2(in2, in3);
- round_signed_16x16(in0, in1);
- round_signed_16x16(in2, in3);
- fidtx32_16col(in0, in1, in2, in3);
- break;
- case H_DCT:
- load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
- fidtx16_sse2(in0, in1);
- fidtx16_sse2(in2, in3);
- round_signed_16x16(in0, in1);
- round_signed_16x16(in2, in3);
- fdct32_16col(in0, in1, in2, in3);
- break;
- case V_ADST:
- load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
- fadst16_sse2(in0, in1);
- fadst16_sse2(in2, in3);
- round_signed_16x16(in0, in1);
- round_signed_16x16(in2, in3);
- fidtx32_16col(in0, in1, in2, in3);
- break;
- case H_ADST:
- load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
- fidtx16_sse2(in0, in1);
- fidtx16_sse2(in2, in3);
- round_signed_16x16(in0, in1);
- round_signed_16x16(in2, in3);
- fhalfright32_16col(in0, in1, in2, in3, no_transpose);
- break;
- case V_FLIPADST:
- load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
- fadst16_sse2(in0, in1);
- fadst16_sse2(in2, in3);
- round_signed_16x16(in0, in1);
- round_signed_16x16(in2, in3);
- fidtx32_16col(in0, in1, in2, in3);
- break;
- case H_FLIPADST:
- load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
- fidtx16_sse2(in0, in1);
- fidtx16_sse2(in2, in3);
- round_signed_16x16(in0, in1);
- round_signed_16x16(in2, in3);
- fhalfright32_16col(in0, in1, in2, in3, no_transpose);
- break;
-#endif
- default: assert(0); break;
- }
- write_buffer_32x16(output, in0, in1, in2, in3);
-}
-
-// Note:
-// 32x32 hybrid fwd txfm
-// 4x2 grids of 8x16 block. Each block is represented by __m128i in[16]
-static INLINE void load_buffer_32x32(const int16_t *input,
- __m128i *in0 /*in0[32]*/,
- __m128i *in1 /*in1[32]*/,
- __m128i *in2 /*in2[32]*/,
- __m128i *in3 /*in3[32]*/, int stride,
- int flipud, int fliplr) {
- if (flipud) {
- input += 31 * stride;
- stride = -stride;
- }
-
- int i;
- for (i = 0; i < 32; ++i) {
- in0[i] = _mm_slli_epi16(
- _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2);
- in1[i] = _mm_slli_epi16(
- _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2);
- in2[i] = _mm_slli_epi16(
- _mm_load_si128((const __m128i *)(input + i * stride + 16)), 2);
- in3[i] = _mm_slli_epi16(
- _mm_load_si128((const __m128i *)(input + i * stride + 24)), 2);
- }
-
- if (fliplr) {
- for (i = 0; i < 32; ++i) {
- __m128i tmp1 = in0[i];
- __m128i tmp2 = in1[i];
- in0[i] = mm_reverse_epi16(in3[i]);
- in1[i] = mm_reverse_epi16(in2[i]);
- in2[i] = mm_reverse_epi16(tmp2);
- in3[i] = mm_reverse_epi16(tmp1);
- }
- }
-}
-
-static INLINE void swap_16x16(__m128i *b0l /*b0l[16]*/,
- __m128i *b0r /*b0r[16]*/,
- __m128i *b1l /*b1l[16]*/,
- __m128i *b1r /*b1r[16]*/) {
- int i;
- for (i = 0; i < 16; ++i) {
- __m128i tmp0 = b1l[i];
- __m128i tmp1 = b1r[i];
- b1l[i] = b0l[i];
- b1r[i] = b0r[i];
- b0l[i] = tmp0;
- b0r[i] = tmp1;
- }
-}
-
-static INLINE void fdct32(__m128i *in0, __m128i *in1, __m128i *in2,
- __m128i *in3) {
- fdct32_8col(in0, &in0[16]);
- fdct32_8col(in1, &in1[16]);
- fdct32_8col(in2, &in2[16]);
- fdct32_8col(in3, &in3[16]);
-
- array_transpose_16x16(in0, in1);
- array_transpose_16x16(&in0[16], &in1[16]);
- array_transpose_16x16(in2, in3);
- array_transpose_16x16(&in2[16], &in3[16]);
-
- swap_16x16(&in0[16], &in1[16], in2, in3);
-}
-
-static INLINE void fhalfright32(__m128i *in0, __m128i *in1, __m128i *in2,
- __m128i *in3) {
- fhalfright32_16col(in0, in1, &in0[16], &in1[16], no_transpose);
- fhalfright32_16col(in2, in3, &in2[16], &in3[16], no_transpose);
- swap_16x16(&in0[16], &in1[16], in2, in3);
-}
-
-#if CONFIG_EXT_TX
-static INLINE void fidtx32(__m128i *in0, __m128i *in1, __m128i *in2,
- __m128i *in3) {
- fidtx32_16col(in0, in1, &in0[16], &in1[16]);
- fidtx32_16col(in2, in3, &in2[16], &in3[16]);
- swap_16x16(&in0[16], &in1[16], in2, in3);
-}
-#endif
-
-static INLINE void round_signed_32x32(__m128i *in0, __m128i *in1, __m128i *in2,
- __m128i *in3) {
- round_signed_16x16(in0, in1);
- round_signed_16x16(&in0[16], &in1[16]);
- round_signed_16x16(in2, in3);
- round_signed_16x16(&in2[16], &in3[16]);
-}
-
-static INLINE void write_buffer_32x32(__m128i *in0, __m128i *in1, __m128i *in2,
- __m128i *in3, tran_low_t *output) {
- int i;
- for (i = 0; i < 32; ++i) {
- store_output(&in0[i], output + i * 32 + 0);
- store_output(&in1[i], output + i * 32 + 8);
- store_output(&in2[i], output + i * 32 + 16);
- store_output(&in3[i], output + i * 32 + 24);
- }
-}
-
-void av1_fht32x32_sse2(const int16_t *input, tran_low_t *output, int stride,
- TxfmParam *txfm_param) {
- __m128i in0[32], in1[32], in2[32], in3[32];
- const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
- assert(tx_type != MRC_DCT && "No 32x32 sse2 MRC_DCT implementation");
-#endif
-
- load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 0);
- switch (tx_type) {
- case DCT_DCT:
- fdct32(in0, in1, in2, in3);
- round_signed_32x32(in0, in1, in2, in3);
- fdct32(in0, in1, in2, in3);
- break;
- case ADST_DCT:
- fhalfright32(in0, in1, in2, in3);
- round_signed_32x32(in0, in1, in2, in3);
- fdct32(in0, in1, in2, in3);
- break;
- case DCT_ADST:
- fdct32(in0, in1, in2, in3);
- round_signed_32x32(in0, in1, in2, in3);
- fhalfright32(in0, in1, in2, in3);
- break;
- case ADST_ADST:
- fhalfright32(in0, in1, in2, in3);
- round_signed_32x32(in0, in1, in2, in3);
- fhalfright32(in0, in1, in2, in3);
- break;
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0);
- fhalfright32(in0, in1, in2, in3);
- round_signed_32x32(in0, in1, in2, in3);
- fdct32(in0, in1, in2, in3);
- break;
- case DCT_FLIPADST:
- load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1);
- fdct32(in0, in1, in2, in3);
- round_signed_32x32(in0, in1, in2, in3);
- fhalfright32(in0, in1, in2, in3);
- break;
- case FLIPADST_FLIPADST:
- load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 1);
- fhalfright32(in0, in1, in2, in3);
- round_signed_32x32(in0, in1, in2, in3);
- fhalfright32(in0, in1, in2, in3);
- break;
- case ADST_FLIPADST:
- load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1);
- fhalfright32(in0, in1, in2, in3);
- round_signed_32x32(in0, in1, in2, in3);
- fhalfright32(in0, in1, in2, in3);
- break;
- case FLIPADST_ADST:
- load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0);
- fhalfright32(in0, in1, in2, in3);
- round_signed_32x32(in0, in1, in2, in3);
- fhalfright32(in0, in1, in2, in3);
- break;
- case IDTX:
- fidtx32(in0, in1, in2, in3);
- round_signed_32x32(in0, in1, in2, in3);
- fidtx32(in0, in1, in2, in3);
- break;
- case V_DCT:
- fdct32(in0, in1, in2, in3);
- round_signed_32x32(in0, in1, in2, in3);
- fidtx32(in0, in1, in2, in3);
- break;
- case H_DCT:
- fidtx32(in0, in1, in2, in3);
- round_signed_32x32(in0, in1, in2, in3);
- fdct32(in0, in1, in2, in3);
- break;
- case V_ADST:
- fhalfright32(in0, in1, in2, in3);
- round_signed_32x32(in0, in1, in2, in3);
- fidtx32(in0, in1, in2, in3);
- break;
- case H_ADST:
- fidtx32(in0, in1, in2, in3);
- round_signed_32x32(in0, in1, in2, in3);
- fhalfright32(in0, in1, in2, in3);
- break;
- case V_FLIPADST:
- load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0);
- fhalfright32(in0, in1, in2, in3);
- round_signed_32x32(in0, in1, in2, in3);
- fidtx32(in0, in1, in2, in3);
- break;
- case H_FLIPADST:
- load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1);
- fidtx32(in0, in1, in2, in3);
- round_signed_32x32(in0, in1, in2, in3);
- fhalfright32(in0, in1, in2, in3);
- break;
-#endif
- default: assert(0);
- }
- write_buffer_32x32(in0, in1, in2, in3, output);
-}
diff --git a/third_party/aom/av1/encoder/x86/dct_sse2.asm b/third_party/aom/av1/encoder/x86/dct_sse2.asm
index a99db3d6e..b18554818 100644
--- a/third_party/aom/av1/encoder/x86/dct_sse2.asm
+++ b/third_party/aom/av1/encoder/x86/dct_sse2.asm
@@ -63,7 +63,6 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride
psllw m0, 2
psllw m1, 2
-%if CONFIG_HIGHBITDEPTH
; sign extension
mova m2, m0
mova m3, m1
@@ -79,9 +78,5 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride
mova [outputq + 16], m2
mova [outputq + 32], m1
mova [outputq + 48], m3
-%else
- mova [outputq], m0
- mova [outputq + 16], m1
-%endif
RET
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse2.c b/third_party/aom/av1/encoder/x86/encodetxb_sse2.c
new file mode 100644
index 000000000..dedb4d02f
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/encodetxb_sse2.c
@@ -0,0 +1,505 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+
+static INLINE void load_levels_4x4x5_sse2(const uint8_t *const src,
+ const int stride,
+ const ptrdiff_t *const offsets,
+ __m128i *const level) {
+ level[0] = load_8bit_4x4_to_1_reg_sse2(src + 1, stride);
+ level[1] = load_8bit_4x4_to_1_reg_sse2(src + stride, stride);
+ level[2] = load_8bit_4x4_to_1_reg_sse2(src + offsets[0], stride);
+ level[3] = load_8bit_4x4_to_1_reg_sse2(src + offsets[1], stride);
+ level[4] = load_8bit_4x4_to_1_reg_sse2(src + offsets[2], stride);
+}
+
+static INLINE void load_levels_8x2x5_sse2(const uint8_t *const src,
+ const int stride,
+ const ptrdiff_t *const offsets,
+ __m128i *const level) {
+ level[0] = load_8bit_8x2_to_1_reg_sse2(src + 1, stride);
+ level[1] = load_8bit_8x2_to_1_reg_sse2(src + stride, stride);
+ level[2] = load_8bit_8x2_to_1_reg_sse2(src + offsets[0], stride);
+ level[3] = load_8bit_8x2_to_1_reg_sse2(src + offsets[1], stride);
+ level[4] = load_8bit_8x2_to_1_reg_sse2(src + offsets[2], stride);
+}
+
+static INLINE void load_levels_16x1x5_sse2(const uint8_t *const src,
+ const int stride,
+ const ptrdiff_t *const offsets,
+ __m128i *const level) {
+ level[0] = _mm_loadu_si128((__m128i *)(src + 1));
+ level[1] = _mm_loadu_si128((__m128i *)(src + stride));
+ level[2] = _mm_loadu_si128((__m128i *)(src + offsets[0]));
+ level[3] = _mm_loadu_si128((__m128i *)(src + offsets[1]));
+ level[4] = _mm_loadu_si128((__m128i *)(src + offsets[2]));
+}
+
+static INLINE __m128i get_coeff_contexts_kernel_sse2(__m128i *const level) {
+ const __m128i const_3 = _mm_set1_epi8(3);
+ const __m128i const_4 = _mm_set1_epi8(4);
+ __m128i count;
+
+ count = _mm_min_epu8(level[0], const_3);
+ level[1] = _mm_min_epu8(level[1], const_3);
+ level[2] = _mm_min_epu8(level[2], const_3);
+ level[3] = _mm_min_epu8(level[3], const_3);
+ level[4] = _mm_min_epu8(level[4], const_3);
+ count = _mm_add_epi8(count, level[1]);
+ count = _mm_add_epi8(count, level[2]);
+ count = _mm_add_epi8(count, level[3]);
+ count = _mm_add_epi8(count, level[4]);
+ count = _mm_avg_epu8(count, _mm_setzero_si128());
+ count = _mm_min_epu8(count, const_4);
+ return count;
+}
+
+static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels,
+ const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *const coeff_contexts) {
+ const int stride = 4 + TX_PAD_HOR;
+ const __m128i pos_to_offset_large = _mm_set1_epi8(21);
+ __m128i pos_to_offset =
+ (height == 4)
+ ? _mm_setr_epi8(0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21)
+ : _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21,
+ 21, 21);
+ __m128i count;
+ __m128i level[5];
+ int8_t *cc = coeff_contexts;
+ int row = height;
+
+ assert(!(height % 4));
+
+ do {
+ load_levels_4x4x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)cc, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 4 * stride;
+ cc += 16;
+ row -= 4;
+ } while (row);
+
+ coeff_contexts[0] = 0;
+}
+
+static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels,
+ const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 4 + TX_PAD_HOR;
+ const __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ __m128i count;
+ __m128i level[5];
+ int row = height;
+
+ assert(!(height % 4));
+
+ do {
+ load_levels_4x4x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ levels += 4 * stride;
+ coeff_contexts += 16;
+ row -= 4;
+ } while (row);
+}
+
+static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels,
+ const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 4 + TX_PAD_HOR;
+ const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+ __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ __m128i count;
+ __m128i level[5];
+ int row = height;
+
+ assert(!(height % 4));
+
+ do {
+ load_levels_4x4x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 4 * stride;
+ coeff_contexts += 16;
+ row -= 4;
+ } while (row);
+}
+
+static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels,
+ const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 8 + TX_PAD_HOR;
+ int8_t *cc = coeff_contexts;
+ int row = height;
+ __m128i count;
+ __m128i level[5];
+ __m128i pos_to_offset[3];
+
+ assert(!(height % 2));
+
+ if (height == 8) {
+ pos_to_offset[0] =
+ _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21);
+ pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21,
+ 21, 21, 21, 21, 21);
+ } else if (height < 8) {
+ pos_to_offset[0] = _mm_setr_epi8(0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21,
+ 21, 21, 21, 21);
+ pos_to_offset[1] = _mm_setr_epi8(16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21,
+ 21, 21, 21, 21, 21);
+ } else {
+ pos_to_offset[0] = _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11);
+ pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21,
+ 21, 21, 21, 21, 21);
+ }
+ pos_to_offset[2] = _mm_set1_epi8(21);
+
+ do {
+ load_levels_8x2x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset[0]);
+ _mm_store_si128((__m128i *)cc, count);
+ pos_to_offset[0] = pos_to_offset[1];
+ pos_to_offset[1] = pos_to_offset[2];
+ levels += 2 * stride;
+ cc += 16;
+ row -= 2;
+ } while (row);
+
+ coeff_contexts[0] = 0;
+}
+
+static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels,
+ const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 8 + TX_PAD_HOR;
+ const __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ int row = height;
+ __m128i count;
+ __m128i level[5];
+
+ assert(!(height % 2));
+
+ do {
+ load_levels_8x2x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ levels += 2 * stride;
+ coeff_contexts += 16;
+ row -= 2;
+ } while (row);
+}
+
+static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels,
+ const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 8 + TX_PAD_HOR;
+ const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+ __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5);
+ int row = height;
+ __m128i count;
+ __m128i level[5];
+
+ assert(!(height % 2));
+
+ do {
+ load_levels_8x2x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 2 * stride;
+ coeff_contexts += 16;
+ row -= 2;
+ } while (row);
+}
+
+static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels,
+ const int real_width,
+ const int real_height,
+ const int width, const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = width + TX_PAD_HOR;
+ int8_t *cc = coeff_contexts;
+ int row = height;
+ __m128i pos_to_offset[5];
+ __m128i pos_to_offset_large[3];
+ __m128i count;
+ __m128i level[5];
+
+ assert(!(width % 16));
+
+ pos_to_offset_large[2] = _mm_set1_epi8(21);
+ if (real_width == real_height) {
+ pos_to_offset[0] = _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21);
+ pos_to_offset[1] = _mm_setr_epi8(1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] =
+ pos_to_offset_large[2];
+ } else if (real_width > real_height) {
+ pos_to_offset[0] = _mm_setr_epi8(0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[1] = _mm_setr_epi8(16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] = _mm_setr_epi8(
+ 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
+ pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
+ } else { // real_width < real_height
+ pos_to_offset[0] = pos_to_offset[1] = _mm_setr_epi8(
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11);
+ pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[4] = pos_to_offset_large[2];
+ pos_to_offset_large[0] = pos_to_offset_large[1] = _mm_set1_epi8(11);
+ }
+
+ do {
+ int w = width;
+
+ do {
+ load_levels_16x1x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset[0]);
+ _mm_store_si128((__m128i *)cc, count);
+ levels += 16;
+ cc += 16;
+ w -= 16;
+ pos_to_offset[0] = pos_to_offset_large[0];
+ } while (w);
+
+ pos_to_offset[0] = pos_to_offset[1];
+ pos_to_offset[1] = pos_to_offset[2];
+ pos_to_offset[2] = pos_to_offset[3];
+ pos_to_offset[3] = pos_to_offset[4];
+ pos_to_offset_large[0] = pos_to_offset_large[1];
+ pos_to_offset_large[1] = pos_to_offset_large[2];
+ levels += TX_PAD_HOR;
+ } while (--row);
+
+ coeff_contexts[0] = 0;
+}
+
+static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels,
+ const int width, const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = width + TX_PAD_HOR;
+ const __m128i pos_to_offset_large =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ __m128i count;
+ __m128i level[5];
+ int row = height;
+
+ assert(!(width % 16));
+
+ do {
+ __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ int w = width;
+
+ do {
+ load_levels_16x1x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 16;
+ coeff_contexts += 16;
+ w -= 16;
+ } while (w);
+
+ levels += TX_PAD_HOR;
+ } while (--row);
+}
+
+static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels,
+ const int width, const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = width + TX_PAD_HOR;
+ __m128i pos_to_offset[3];
+ __m128i count;
+ __m128i level[5];
+ int row = height;
+
+ assert(!(width % 16));
+
+ pos_to_offset[0] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 0);
+ pos_to_offset[1] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 5);
+ pos_to_offset[2] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+
+ do {
+ int w = width;
+
+ do {
+ load_levels_16x1x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset[0]);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ levels += 16;
+ coeff_contexts += 16;
+ w -= 16;
+ } while (w);
+
+ pos_to_offset[0] = pos_to_offset[1];
+ pos_to_offset[1] = pos_to_offset[2];
+ levels += TX_PAD_HOR;
+ } while (--row);
+}
+
+// Note: levels[] must be in the range [0, 127], inclusive.
+void av1_get_nz_map_contexts_sse2(const uint8_t *const levels,
+ const int16_t *const scan, const uint16_t eob,
+ const TX_SIZE tx_size,
+ const TX_CLASS tx_class,
+ int8_t *const coeff_contexts) {
+ const int last_idx = eob - 1;
+ if (!last_idx) {
+ coeff_contexts[0] = 0;
+ return;
+ }
+
+ const int real_width = tx_size_wide[tx_size];
+ const int real_height = tx_size_high[tx_size];
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ const int stride = width + TX_PAD_HOR;
+ ptrdiff_t offsets[3];
+
+ /* coeff_contexts must be 16 byte aligned. */
+ assert(!((intptr_t)coeff_contexts & 0xf));
+
+ if (tx_class == TX_CLASS_2D) {
+ offsets[0] = 0 * stride + 2;
+ offsets[1] = 1 * stride + 1;
+ offsets[2] = 2 * stride + 0;
+
+ if (width == 4) {
+ get_4_nz_map_contexts_2d(levels, height, offsets, coeff_contexts);
+ } else if (width == 8) {
+ get_8_coeff_contexts_2d(levels, height, offsets, coeff_contexts);
+ } else if (width == 16) {
+ get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
+ offsets, coeff_contexts);
+ } else {
+ get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
+ offsets, coeff_contexts);
+ }
+ } else if (tx_class == TX_CLASS_HORIZ) {
+ offsets[0] = 2;
+ offsets[1] = 3;
+ offsets[2] = 4;
+ if (width == 4) {
+ get_4_nz_map_contexts_hor(levels, height, offsets, coeff_contexts);
+ } else if (width == 8) {
+ get_8_coeff_contexts_hor(levels, height, offsets, coeff_contexts);
+ } else {
+ get_16n_coeff_contexts_hor(levels, width, height, offsets,
+ coeff_contexts);
+ }
+ } else { // TX_CLASS_VERT
+ offsets[0] = 2 * stride;
+ offsets[1] = 3 * stride;
+ offsets[2] = 4 * stride;
+ if (width == 4) {
+ get_4_nz_map_contexts_ver(levels, height, offsets, coeff_contexts);
+ } else if (width == 8) {
+ get_8_coeff_contexts_ver(levels, height, offsets, coeff_contexts);
+ } else {
+ get_16n_coeff_contexts_ver(levels, width, height, offsets,
+ coeff_contexts);
+ }
+ }
+
+ const int bwl = get_txb_bwl(tx_size);
+ const int pos = scan[last_idx];
+ if (last_idx <= (height << bwl) / 8)
+ coeff_contexts[pos] = 1;
+ else if (last_idx <= (height << bwl) / 4)
+ coeff_contexts[pos] = 2;
+ else
+ coeff_contexts[pos] = 3;
+}
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse4.c b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c
new file mode 100644
index 000000000..b3a879b0f
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+
+void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width,
+ const int height, uint8_t *const levels) {
+ const int stride = width + TX_PAD_HOR;
+ memset(levels - TX_PAD_TOP * stride, 0,
+ sizeof(*levels) * TX_PAD_TOP * stride);
+ memset(levels + stride * height, 0,
+ sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
+
+ const __m128i zeros = _mm_setzero_si128();
+ int i = 0;
+ uint8_t *ls = levels;
+ const tran_low_t *cf = coeff;
+ if (width == 4) {
+ do {
+ const __m128i coeffA = _mm_load_si128((__m128i *)(cf));
+ const __m128i coeffB = _mm_load_si128((__m128i *)(cf + width));
+ const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+ const __m128i absAB = _mm_abs_epi16(coeffAB);
+ const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
+ const __m128i lsAB = _mm_unpacklo_epi32(absAB8, zeros);
+ _mm_storeu_si128((__m128i *)ls, lsAB);
+ ls += (stride << 1);
+ cf += (width << 1);
+ i += 2;
+ } while (i < height);
+ } else if (width == 8) {
+ do {
+ const __m128i coeffA = _mm_load_si128((__m128i *)(cf));
+ const __m128i coeffB = _mm_load_si128((__m128i *)(cf + 4));
+ const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+ const __m128i absAB = _mm_abs_epi16(coeffAB);
+ const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
+ _mm_storeu_si128((__m128i *)ls, absAB8);
+ ls += stride;
+ cf += width;
+ i += 1;
+ } while (i < height);
+ } else {
+ do {
+ int j = 0;
+ do {
+ const __m128i coeffA = _mm_load_si128((__m128i *)(cf));
+ const __m128i coeffB = _mm_load_si128((__m128i *)(cf + 4));
+ const __m128i coeffC = _mm_load_si128((__m128i *)(cf + 8));
+ const __m128i coeffD = _mm_load_si128((__m128i *)(cf + 12));
+ const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+ const __m128i coeffCD = _mm_packs_epi32(coeffC, coeffD);
+ const __m128i absAB = _mm_abs_epi16(coeffAB);
+ const __m128i absCD = _mm_abs_epi16(coeffCD);
+ const __m128i absABCD = _mm_packs_epi16(absAB, absCD);
+ _mm_storeu_si128((__m128i *)(ls + j), absABCD);
+ j += 16;
+ cf += 16;
+ } while (j < width);
+ *(int32_t *)(ls + width) = 0;
+ ls += stride;
+ i += 1;
+ } while (i < height);
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
index 6599630d0..7d4f69585 100644
--- a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
+++ b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
@@ -11,7 +11,8 @@
#include <immintrin.h> // AVX2
-#include "./av1_rtcd.h"
+#include "config/av1_rtcd.h"
+
#include "aom/aom_integer.h"
static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
diff --git a/third_party/aom/av1/encoder/x86/error_sse2.asm b/third_party/aom/av1/encoder/x86/error_sse2.asm
index 4680f1fab..72e9e22b1 100644
--- a/third_party/aom/av1/encoder/x86/error_sse2.asm
+++ b/third_party/aom/av1/encoder/x86/error_sse2.asm
@@ -77,49 +77,3 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
movd edx, m5
%endif
RET
-
-; Compute the sum of squared difference between two int16_t vectors.
-; int64_t av1_block_error_fp(int16_t *coeff, int16_t *dqcoeff,
-; intptr_t block_size)
-
-INIT_XMM sse2
-cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
- pxor m4, m4 ; sse accumulator
- pxor m5, m5 ; dedicated zero register
- lea uqcq, [uqcq+sizeq*2]
- lea dqcq, [dqcq+sizeq*2]
- neg sizeq
-.loop:
- mova m2, [uqcq+sizeq*2]
- mova m0, [dqcq+sizeq*2]
- mova m3, [uqcq+sizeq*2+mmsize]
- mova m1, [dqcq+sizeq*2+mmsize]
- psubw m0, m2
- psubw m1, m3
- ; individual errors are max. 15bit+sign, so squares are 30bit, and
- ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
- pmaddwd m0, m0
- pmaddwd m1, m1
- ; accumulate in 64bit
- punpckldq m3, m0, m5
- punpckhdq m0, m5
- paddq m4, m3
- punpckldq m3, m1, m5
- paddq m4, m0
- punpckhdq m1, m5
- paddq m4, m3
- paddq m4, m1
- add sizeq, mmsize
- jl .loop
-
- ; accumulate horizontally and store in return value
- movhlps m5, m4
- paddq m4, m5
-%if ARCH_X86_64
- movq rax, m4
-%else
- pshufd m5, m4, 0x1
- movd eax, m4
- movd edx, m5
-%endif
- RET
diff --git a/third_party/aom/av1/encoder/x86/hash_sse42.c b/third_party/aom/av1/encoder/x86/hash_sse42.c
new file mode 100644
index 000000000..65fa46311
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/hash_sse42.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+#include <smmintrin.h>
+
+// Byte-boundary alignment issues
+#define ALIGN_SIZE 8
+#define ALIGN_MASK (ALIGN_SIZE - 1)
+
+#define CALC_CRC(op, crc, type, buf, len) \
+ while ((len) >= sizeof(type)) { \
+ (crc) = op((crc), *(type *)(buf)); \
+ (len) -= sizeof(type); \
+ buf += sizeof(type); \
+ }
+
+/**
+ * Calculates 32-bit CRC for the input buffer
+ * polynomial is 0x11EDC6F41
+ * @return A 32-bit unsigned integer representing the CRC
+ */
+uint32_t av1_get_crc32c_value_sse4_2(void *crc_calculator, uint8_t *p,
+ size_t len) {
+ (void)crc_calculator;
+ const uint8_t *buf = p;
+ uint32_t crc = 0xFFFFFFFF;
+
+ // Align the input to the word boundary
+ for (; (len > 0) && ((intptr_t)buf & ALIGN_MASK); len--, buf++) {
+ crc = _mm_crc32_u8(crc, *buf);
+ }
+
+#ifdef __x86_64__
+ uint64_t crc64 = crc;
+ CALC_CRC(_mm_crc32_u64, crc64, uint64_t, buf, len);
+ crc = (uint32_t)crc64;
+#endif
+ CALC_CRC(_mm_crc32_u32, crc, uint32_t, buf, len);
+ CALC_CRC(_mm_crc32_u16, crc, uint16_t, buf, len);
+ CALC_CRC(_mm_crc32_u8, crc, uint8_t, buf, len);
+ return (crc ^= 0xFFFFFFFF);
+}
diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
index b684f7a3a..4cd6371a6 100644
--- a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
+++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -11,11 +11,12 @@
#include <assert.h>
#include <smmintrin.h> /* SSE4.1 */
-#include "./av1_rtcd.h"
-#include "./aom_config.h"
-#include "av1/common/av1_fwd_txfm1d_cfg.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
#include "av1/common/av1_txfm.h"
#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
#include "aom_dsp/txfm_common.h"
#include "aom_dsp/x86/txfm_common_sse2.h"
#include "aom_ports/mem.h"
@@ -121,72 +122,57 @@ static INLINE void write_buffer_4x4(__m128i *res, int32_t *output) {
}
static void fadst4x4_sse4_1(__m128i *in, int bit) {
- const int32_t *cospi = cospi_arr(bit);
- const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
- const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
- const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
- const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
- const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const int32_t *sinpi = sinpi_arr(bit);
const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
- const __m128i kZero = _mm_setzero_si128();
- __m128i s0, s1, s2, s3;
+ const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
+ const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
+ const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
+ const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
+ __m128i t;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+ __m128i x0, x1, x2, x3;
__m128i u0, u1, u2, u3;
__m128i v0, v1, v2, v3;
- // stage 0
- // stage 1
- // stage 2
- u0 = _mm_mullo_epi32(in[3], cospi8);
- u1 = _mm_mullo_epi32(in[0], cospi56);
- u2 = _mm_add_epi32(u0, u1);
- s0 = _mm_add_epi32(u2, rnding);
- s0 = _mm_srai_epi32(s0, bit);
-
- v0 = _mm_mullo_epi32(in[3], cospi56);
- v1 = _mm_mullo_epi32(in[0], cospi8);
- v2 = _mm_sub_epi32(v0, v1);
- s1 = _mm_add_epi32(v2, rnding);
- s1 = _mm_srai_epi32(s1, bit);
-
- u0 = _mm_mullo_epi32(in[1], cospi40);
- u1 = _mm_mullo_epi32(in[2], cospi24);
- u2 = _mm_add_epi32(u0, u1);
- s2 = _mm_add_epi32(u2, rnding);
- s2 = _mm_srai_epi32(s2, bit);
-
- v0 = _mm_mullo_epi32(in[1], cospi24);
- v1 = _mm_mullo_epi32(in[2], cospi40);
- v2 = _mm_sub_epi32(v0, v1);
- s3 = _mm_add_epi32(v2, rnding);
- s3 = _mm_srai_epi32(s3, bit);
-
- // stage 3
- u0 = _mm_add_epi32(s0, s2);
- u2 = _mm_sub_epi32(s0, s2);
- u1 = _mm_add_epi32(s1, s3);
- u3 = _mm_sub_epi32(s1, s3);
-
- // stage 4
- v0 = _mm_mullo_epi32(u2, cospi32);
- v1 = _mm_mullo_epi32(u3, cospi32);
- v2 = _mm_add_epi32(v0, v1);
- s2 = _mm_add_epi32(v2, rnding);
- u2 = _mm_srai_epi32(s2, bit);
+ s0 = _mm_mullo_epi32(in[0], sinpi1);
+ s1 = _mm_mullo_epi32(in[0], sinpi4);
+ s2 = _mm_mullo_epi32(in[1], sinpi2);
+ s3 = _mm_mullo_epi32(in[1], sinpi1);
+ s4 = _mm_mullo_epi32(in[2], sinpi3);
+ s5 = _mm_mullo_epi32(in[3], sinpi4);
+ s6 = _mm_mullo_epi32(in[3], sinpi2);
+ t = _mm_add_epi32(in[0], in[1]);
+ s7 = _mm_sub_epi32(t, in[3]);
+
+ t = _mm_add_epi32(s0, s2);
+ x0 = _mm_add_epi32(t, s5);
+ x1 = _mm_mullo_epi32(s7, sinpi3);
+ t = _mm_sub_epi32(s1, s3);
+ x2 = _mm_add_epi32(t, s6);
+ x3 = s4;
+
+ s0 = _mm_add_epi32(x0, x3);
+ s1 = x1;
+ s2 = _mm_sub_epi32(x2, x3);
+ t = _mm_sub_epi32(x2, x0);
+ s3 = _mm_add_epi32(t, x3);
+
+ u0 = _mm_add_epi32(s0, rnding);
+ u0 = _mm_srai_epi32(u0, bit);
+
+ u1 = _mm_add_epi32(s1, rnding);
+ u1 = _mm_srai_epi32(u1, bit);
+
+ u2 = _mm_add_epi32(s2, rnding);
+ u2 = _mm_srai_epi32(u2, bit);
+
+ u3 = _mm_add_epi32(s3, rnding);
+ u3 = _mm_srai_epi32(u3, bit);
- v2 = _mm_sub_epi32(v0, v1);
- s3 = _mm_add_epi32(v2, rnding);
- u3 = _mm_srai_epi32(s3, bit);
-
- // u0, u1, u2, u3
- u2 = _mm_sub_epi32(kZero, u2);
- u1 = _mm_sub_epi32(kZero, u1);
-
- // u0, u2, u3, u1
- // Transpose 4x4 32-bit
- v0 = _mm_unpacklo_epi32(u0, u2);
- v1 = _mm_unpackhi_epi32(u0, u2);
- v2 = _mm_unpacklo_epi32(u3, u1);
- v3 = _mm_unpackhi_epi32(u3, u1);
+ v0 = _mm_unpacklo_epi32(u0, u1);
+ v1 = _mm_unpackhi_epi32(u0, u1);
+ v2 = _mm_unpacklo_epi32(u2, u3);
+ v3 = _mm_unpackhi_epi32(u2, u3);
in[0] = _mm_unpacklo_epi64(v0, v2);
in[1] = _mm_unpackhi_epi64(v0, v2);
@@ -197,84 +183,65 @@ static void fadst4x4_sse4_1(__m128i *in, int bit) {
void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
int input_stride, TX_TYPE tx_type, int bd) {
__m128i in[4];
- const TXFM_1D_CFG *row_cfg = NULL;
- const TXFM_1D_CFG *col_cfg = NULL;
+ const int8_t *shift = fwd_txfm_shift_ls[TX_4X4];
+ const int txw_idx = get_txw_idx(TX_4X4);
+ const int txh_idx = get_txh_idx(TX_4X4);
switch (tx_type) {
case DCT_DCT:
- row_cfg = &fwd_txfm_1d_row_cfg_dct_4;
- col_cfg = &fwd_txfm_1d_col_cfg_dct_4;
- load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]);
- fdct4x4_sse4_1(in, col_cfg->cos_bit[2]);
- fdct4x4_sse4_1(in, row_cfg->cos_bit[2]);
+ load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+ fdct4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fdct4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
write_buffer_4x4(in, coeff);
break;
case ADST_DCT:
- row_cfg = &fwd_txfm_1d_row_cfg_dct_4;
- col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
- load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]);
- fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
- fdct4x4_sse4_1(in, row_cfg->cos_bit[2]);
+ load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+ fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fdct4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
write_buffer_4x4(in, coeff);
break;
case DCT_ADST:
- row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
- col_cfg = &fwd_txfm_1d_col_cfg_dct_4;
- load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]);
- fdct4x4_sse4_1(in, col_cfg->cos_bit[2]);
- fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
+ load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+ fdct4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
write_buffer_4x4(in, coeff);
break;
case ADST_ADST:
- row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
- col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
- load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]);
- fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
- fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
+ load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+ fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
write_buffer_4x4(in, coeff);
break;
-#if CONFIG_EXT_TX
case FLIPADST_DCT:
- row_cfg = &fwd_txfm_1d_row_cfg_dct_4;
- col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
- load_buffer_4x4(input, in, input_stride, 1, 0, row_cfg->shift[0]);
- fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
- fdct4x4_sse4_1(in, row_cfg->cos_bit[2]);
+ load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
+ fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fdct4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
write_buffer_4x4(in, coeff);
break;
case DCT_FLIPADST:
- row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
- col_cfg = &fwd_txfm_1d_col_cfg_dct_4;
- load_buffer_4x4(input, in, input_stride, 0, 1, row_cfg->shift[0]);
- fdct4x4_sse4_1(in, col_cfg->cos_bit[2]);
- fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
+ load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
+ fdct4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
write_buffer_4x4(in, coeff);
break;
case FLIPADST_FLIPADST:
- row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
- col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
- load_buffer_4x4(input, in, input_stride, 1, 1, row_cfg->shift[0]);
- fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
- fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
+ load_buffer_4x4(input, in, input_stride, 1, 1, shift[0]);
+ fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
write_buffer_4x4(in, coeff);
break;
case ADST_FLIPADST:
- row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
- col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
- load_buffer_4x4(input, in, input_stride, 0, 1, row_cfg->shift[0]);
- fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
- fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
+ load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
+ fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
write_buffer_4x4(in, coeff);
break;
case FLIPADST_ADST:
- row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
- col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
- load_buffer_4x4(input, in, input_stride, 1, 0, row_cfg->shift[0]);
- fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
- fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
+ load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
+ fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
write_buffer_4x4(in, coeff);
break;
-#endif
default: assert(0);
}
(void)bd;
@@ -624,415 +591,274 @@ static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
- const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
- const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
- const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
- const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
- const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
- const __m128i kZero = _mm_setzero_si128();
- __m128i u[8], v[8], x;
-
- // Even 8 points: 0, 2, ..., 14
- // stage 0
- // stage 1
- // stage 2
- // (1)
- u[0] = _mm_mullo_epi32(in[14], cospi4);
- x = _mm_mullo_epi32(in[0], cospi60);
- u[0] = _mm_add_epi32(u[0], x);
- u[0] = _mm_add_epi32(u[0], rnding);
- u[0] = _mm_srai_epi32(u[0], bit);
-
- u[1] = _mm_mullo_epi32(in[14], cospi60);
- x = _mm_mullo_epi32(in[0], cospi4);
- u[1] = _mm_sub_epi32(u[1], x);
- u[1] = _mm_add_epi32(u[1], rnding);
- u[1] = _mm_srai_epi32(u[1], bit);
-
- // (2)
- u[2] = _mm_mullo_epi32(in[10], cospi20);
- x = _mm_mullo_epi32(in[4], cospi44);
- u[2] = _mm_add_epi32(u[2], x);
- u[2] = _mm_add_epi32(u[2], rnding);
- u[2] = _mm_srai_epi32(u[2], bit);
-
- u[3] = _mm_mullo_epi32(in[10], cospi44);
- x = _mm_mullo_epi32(in[4], cospi20);
- u[3] = _mm_sub_epi32(u[3], x);
- u[3] = _mm_add_epi32(u[3], rnding);
- u[3] = _mm_srai_epi32(u[3], bit);
-
- // (3)
- u[4] = _mm_mullo_epi32(in[6], cospi36);
- x = _mm_mullo_epi32(in[8], cospi28);
- u[4] = _mm_add_epi32(u[4], x);
- u[4] = _mm_add_epi32(u[4], rnding);
- u[4] = _mm_srai_epi32(u[4], bit);
-
- u[5] = _mm_mullo_epi32(in[6], cospi28);
- x = _mm_mullo_epi32(in[8], cospi36);
- u[5] = _mm_sub_epi32(u[5], x);
- u[5] = _mm_add_epi32(u[5], rnding);
- u[5] = _mm_srai_epi32(u[5], bit);
-
- // (4)
- u[6] = _mm_mullo_epi32(in[2], cospi52);
- x = _mm_mullo_epi32(in[12], cospi12);
- u[6] = _mm_add_epi32(u[6], x);
- u[6] = _mm_add_epi32(u[6], rnding);
- u[6] = _mm_srai_epi32(u[6], bit);
-
- u[7] = _mm_mullo_epi32(in[2], cospi12);
- x = _mm_mullo_epi32(in[12], cospi52);
- u[7] = _mm_sub_epi32(u[7], x);
- u[7] = _mm_add_epi32(u[7], rnding);
- u[7] = _mm_srai_epi32(u[7], bit);
-
- // stage 3
- v[0] = _mm_add_epi32(u[0], u[4]);
- v[4] = _mm_sub_epi32(u[0], u[4]);
- v[1] = _mm_add_epi32(u[1], u[5]);
- v[5] = _mm_sub_epi32(u[1], u[5]);
- v[2] = _mm_add_epi32(u[2], u[6]);
- v[6] = _mm_sub_epi32(u[2], u[6]);
- v[3] = _mm_add_epi32(u[3], u[7]);
- v[7] = _mm_sub_epi32(u[3], u[7]);
-
- // stage 4
- u[0] = v[0];
- u[1] = v[1];
- u[2] = v[2];
- u[3] = v[3];
-
- u[4] = _mm_mullo_epi32(v[4], cospi16);
- x = _mm_mullo_epi32(v[5], cospi48);
- u[4] = _mm_add_epi32(u[4], x);
- u[4] = _mm_add_epi32(u[4], rnding);
- u[4] = _mm_srai_epi32(u[4], bit);
-
- u[5] = _mm_mullo_epi32(v[4], cospi48);
- x = _mm_mullo_epi32(v[5], cospi16);
- u[5] = _mm_sub_epi32(u[5], x);
- u[5] = _mm_add_epi32(u[5], rnding);
- u[5] = _mm_srai_epi32(u[5], bit);
-
- u[6] = _mm_mullo_epi32(v[6], cospim48);
- x = _mm_mullo_epi32(v[7], cospi16);
- u[6] = _mm_add_epi32(u[6], x);
- u[6] = _mm_add_epi32(u[6], rnding);
- u[6] = _mm_srai_epi32(u[6], bit);
-
- u[7] = _mm_mullo_epi32(v[6], cospi16);
- x = _mm_mullo_epi32(v[7], cospim48);
- u[7] = _mm_sub_epi32(u[7], x);
- u[7] = _mm_add_epi32(u[7], rnding);
- u[7] = _mm_srai_epi32(u[7], bit);
-
- // stage 5
- v[0] = _mm_add_epi32(u[0], u[2]);
- v[2] = _mm_sub_epi32(u[0], u[2]);
- v[1] = _mm_add_epi32(u[1], u[3]);
- v[3] = _mm_sub_epi32(u[1], u[3]);
- v[4] = _mm_add_epi32(u[4], u[6]);
- v[6] = _mm_sub_epi32(u[4], u[6]);
- v[5] = _mm_add_epi32(u[5], u[7]);
- v[7] = _mm_sub_epi32(u[5], u[7]);
-
- // stage 6
- u[0] = v[0];
- u[1] = v[1];
- u[4] = v[4];
- u[5] = v[5];
-
- v[0] = _mm_mullo_epi32(v[2], cospi32);
- x = _mm_mullo_epi32(v[3], cospi32);
- u[2] = _mm_add_epi32(v[0], x);
- u[2] = _mm_add_epi32(u[2], rnding);
- u[2] = _mm_srai_epi32(u[2], bit);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x, y;
+ int col;
- u[3] = _mm_sub_epi32(v[0], x);
- u[3] = _mm_add_epi32(u[3], rnding);
- u[3] = _mm_srai_epi32(u[3], bit);
+ // Note:
+ // Even column: 0, 2, ..., 14
+ // Odd column: 1, 3, ..., 15
+ // one even column plus one odd column constructs one row (8 coeffs)
+ // total we have 8 rows (8x8).
+ for (col = 0; col < 2; ++col) {
+ // stage 0
+ // stage 1
+ u0 = in[2 * 0 + col];
+ u1 = _mm_sub_epi32(zero, in[2 * 7 + col]);
+ u2 = _mm_sub_epi32(zero, in[2 * 3 + col]);
+ u3 = in[2 * 4 + col];
+ u4 = _mm_sub_epi32(zero, in[2 * 1 + col]);
+ u5 = in[2 * 6 + col];
+ u6 = in[2 * 2 + col];
+ u7 = _mm_sub_epi32(zero, in[2 * 5 + col]);
- v[0] = _mm_mullo_epi32(v[6], cospi32);
- x = _mm_mullo_epi32(v[7], cospi32);
- u[6] = _mm_add_epi32(v[0], x);
- u[6] = _mm_add_epi32(u[6], rnding);
- u[6] = _mm_srai_epi32(u[6], bit);
-
- u[7] = _mm_sub_epi32(v[0], x);
- u[7] = _mm_add_epi32(u[7], rnding);
- u[7] = _mm_srai_epi32(u[7], bit);
-
- // stage 7
- out[0] = u[0];
- out[2] = _mm_sub_epi32(kZero, u[4]);
- out[4] = u[6];
- out[6] = _mm_sub_epi32(kZero, u[2]);
- out[8] = u[3];
- out[10] = _mm_sub_epi32(kZero, u[7]);
- out[12] = u[5];
- out[14] = _mm_sub_epi32(kZero, u[1]);
+ // stage 2
+ v0 = u0;
+ v1 = u1;
- // Odd 8 points: 1, 3, ..., 15
- // stage 0
- // stage 1
- // stage 2
- // (1)
- u[0] = _mm_mullo_epi32(in[15], cospi4);
- x = _mm_mullo_epi32(in[1], cospi60);
- u[0] = _mm_add_epi32(u[0], x);
- u[0] = _mm_add_epi32(u[0], rnding);
- u[0] = _mm_srai_epi32(u[0], bit);
+ x = _mm_mullo_epi32(u2, cospi32);
+ y = _mm_mullo_epi32(u3, cospi32);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
- u[1] = _mm_mullo_epi32(in[15], cospi60);
- x = _mm_mullo_epi32(in[1], cospi4);
- u[1] = _mm_sub_epi32(u[1], x);
- u[1] = _mm_add_epi32(u[1], rnding);
- u[1] = _mm_srai_epi32(u[1], bit);
+ v3 = _mm_sub_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
- // (2)
- u[2] = _mm_mullo_epi32(in[11], cospi20);
- x = _mm_mullo_epi32(in[5], cospi44);
- u[2] = _mm_add_epi32(u[2], x);
- u[2] = _mm_add_epi32(u[2], rnding);
- u[2] = _mm_srai_epi32(u[2], bit);
+ v4 = u4;
+ v5 = u5;
- u[3] = _mm_mullo_epi32(in[11], cospi44);
- x = _mm_mullo_epi32(in[5], cospi20);
- u[3] = _mm_sub_epi32(u[3], x);
- u[3] = _mm_add_epi32(u[3], rnding);
- u[3] = _mm_srai_epi32(u[3], bit);
+ x = _mm_mullo_epi32(u6, cospi32);
+ y = _mm_mullo_epi32(u7, cospi32);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
- // (3)
- u[4] = _mm_mullo_epi32(in[7], cospi36);
- x = _mm_mullo_epi32(in[9], cospi28);
- u[4] = _mm_add_epi32(u[4], x);
- u[4] = _mm_add_epi32(u[4], rnding);
- u[4] = _mm_srai_epi32(u[4], bit);
-
- u[5] = _mm_mullo_epi32(in[7], cospi28);
- x = _mm_mullo_epi32(in[9], cospi36);
- u[5] = _mm_sub_epi32(u[5], x);
- u[5] = _mm_add_epi32(u[5], rnding);
- u[5] = _mm_srai_epi32(u[5], bit);
-
- // (4)
- u[6] = _mm_mullo_epi32(in[3], cospi52);
- x = _mm_mullo_epi32(in[13], cospi12);
- u[6] = _mm_add_epi32(u[6], x);
- u[6] = _mm_add_epi32(u[6], rnding);
- u[6] = _mm_srai_epi32(u[6], bit);
-
- u[7] = _mm_mullo_epi32(in[3], cospi12);
- x = _mm_mullo_epi32(in[13], cospi52);
- u[7] = _mm_sub_epi32(u[7], x);
- u[7] = _mm_add_epi32(u[7], rnding);
- u[7] = _mm_srai_epi32(u[7], bit);
+ v7 = _mm_sub_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
- // stage 3
- v[0] = _mm_add_epi32(u[0], u[4]);
- v[4] = _mm_sub_epi32(u[0], u[4]);
- v[1] = _mm_add_epi32(u[1], u[5]);
- v[5] = _mm_sub_epi32(u[1], u[5]);
- v[2] = _mm_add_epi32(u[2], u[6]);
- v[6] = _mm_sub_epi32(u[2], u[6]);
- v[3] = _mm_add_epi32(u[3], u[7]);
- v[7] = _mm_sub_epi32(u[3], u[7]);
+ // stage 3
+ u0 = _mm_add_epi32(v0, v2);
+ u1 = _mm_add_epi32(v1, v3);
+ u2 = _mm_sub_epi32(v0, v2);
+ u3 = _mm_sub_epi32(v1, v3);
+ u4 = _mm_add_epi32(v4, v6);
+ u5 = _mm_add_epi32(v5, v7);
+ u6 = _mm_sub_epi32(v4, v6);
+ u7 = _mm_sub_epi32(v5, v7);
- // stage 4
- u[0] = v[0];
- u[1] = v[1];
- u[2] = v[2];
- u[3] = v[3];
-
- u[4] = _mm_mullo_epi32(v[4], cospi16);
- x = _mm_mullo_epi32(v[5], cospi48);
- u[4] = _mm_add_epi32(u[4], x);
- u[4] = _mm_add_epi32(u[4], rnding);
- u[4] = _mm_srai_epi32(u[4], bit);
-
- u[5] = _mm_mullo_epi32(v[4], cospi48);
- x = _mm_mullo_epi32(v[5], cospi16);
- u[5] = _mm_sub_epi32(u[5], x);
- u[5] = _mm_add_epi32(u[5], rnding);
- u[5] = _mm_srai_epi32(u[5], bit);
-
- u[6] = _mm_mullo_epi32(v[6], cospim48);
- x = _mm_mullo_epi32(v[7], cospi16);
- u[6] = _mm_add_epi32(u[6], x);
- u[6] = _mm_add_epi32(u[6], rnding);
- u[6] = _mm_srai_epi32(u[6], bit);
-
- u[7] = _mm_mullo_epi32(v[6], cospi16);
- x = _mm_mullo_epi32(v[7], cospim48);
- u[7] = _mm_sub_epi32(u[7], x);
- u[7] = _mm_add_epi32(u[7], rnding);
- u[7] = _mm_srai_epi32(u[7], bit);
+ // stage 4
+ v0 = u0;
+ v1 = u1;
+ v2 = u2;
+ v3 = u3;
+
+ x = _mm_mullo_epi32(u4, cospi16);
+ y = _mm_mullo_epi32(u5, cospi48);
+ v4 = _mm_add_epi32(x, y);
+ v4 = _mm_add_epi32(v4, rnding);
+ v4 = _mm_srai_epi32(v4, bit);
+
+ x = _mm_mullo_epi32(u4, cospi48);
+ y = _mm_mullo_epi32(u5, cospim16);
+ v5 = _mm_add_epi32(x, y);
+ v5 = _mm_add_epi32(v5, rnding);
+ v5 = _mm_srai_epi32(v5, bit);
+
+ x = _mm_mullo_epi32(u6, cospim48);
+ y = _mm_mullo_epi32(u7, cospi16);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
+
+ x = _mm_mullo_epi32(u6, cospi16);
+ y = _mm_mullo_epi32(u7, cospi48);
+ v7 = _mm_add_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
- // stage 5
- v[0] = _mm_add_epi32(u[0], u[2]);
- v[2] = _mm_sub_epi32(u[0], u[2]);
- v[1] = _mm_add_epi32(u[1], u[3]);
- v[3] = _mm_sub_epi32(u[1], u[3]);
- v[4] = _mm_add_epi32(u[4], u[6]);
- v[6] = _mm_sub_epi32(u[4], u[6]);
- v[5] = _mm_add_epi32(u[5], u[7]);
- v[7] = _mm_sub_epi32(u[5], u[7]);
-
- // stage 6
- u[0] = v[0];
- u[1] = v[1];
- u[4] = v[4];
- u[5] = v[5];
-
- v[0] = _mm_mullo_epi32(v[2], cospi32);
- x = _mm_mullo_epi32(v[3], cospi32);
- u[2] = _mm_add_epi32(v[0], x);
- u[2] = _mm_add_epi32(u[2], rnding);
- u[2] = _mm_srai_epi32(u[2], bit);
+ // stage 5
+ u0 = _mm_add_epi32(v0, v4);
+ u1 = _mm_add_epi32(v1, v5);
+ u2 = _mm_add_epi32(v2, v6);
+ u3 = _mm_add_epi32(v3, v7);
+ u4 = _mm_sub_epi32(v0, v4);
+ u5 = _mm_sub_epi32(v1, v5);
+ u6 = _mm_sub_epi32(v2, v6);
+ u7 = _mm_sub_epi32(v3, v7);
- u[3] = _mm_sub_epi32(v[0], x);
- u[3] = _mm_add_epi32(u[3], rnding);
- u[3] = _mm_srai_epi32(u[3], bit);
+ // stage 6
+ x = _mm_mullo_epi32(u0, cospi4);
+ y = _mm_mullo_epi32(u1, cospi60);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ x = _mm_mullo_epi32(u0, cospi60);
+ y = _mm_mullo_epi32(u1, cospim4);
+ v1 = _mm_add_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ x = _mm_mullo_epi32(u2, cospi20);
+ y = _mm_mullo_epi32(u3, cospi44);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ x = _mm_mullo_epi32(u2, cospi44);
+ y = _mm_mullo_epi32(u3, cospim20);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ x = _mm_mullo_epi32(u4, cospi36);
+ y = _mm_mullo_epi32(u5, cospi28);
+ v4 = _mm_add_epi32(x, y);
+ v4 = _mm_add_epi32(v4, rnding);
+ v4 = _mm_srai_epi32(v4, bit);
+
+ x = _mm_mullo_epi32(u4, cospi28);
+ y = _mm_mullo_epi32(u5, cospim36);
+ v5 = _mm_add_epi32(x, y);
+ v5 = _mm_add_epi32(v5, rnding);
+ v5 = _mm_srai_epi32(v5, bit);
+
+ x = _mm_mullo_epi32(u6, cospi52);
+ y = _mm_mullo_epi32(u7, cospi12);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
+
+ x = _mm_mullo_epi32(u6, cospi12);
+ y = _mm_mullo_epi32(u7, cospim52);
+ v7 = _mm_add_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
- v[0] = _mm_mullo_epi32(v[6], cospi32);
- x = _mm_mullo_epi32(v[7], cospi32);
- u[6] = _mm_add_epi32(v[0], x);
- u[6] = _mm_add_epi32(u[6], rnding);
- u[6] = _mm_srai_epi32(u[6], bit);
-
- u[7] = _mm_sub_epi32(v[0], x);
- u[7] = _mm_add_epi32(u[7], rnding);
- u[7] = _mm_srai_epi32(u[7], bit);
-
- // stage 7
- out[1] = u[0];
- out[3] = _mm_sub_epi32(kZero, u[4]);
- out[5] = u[6];
- out[7] = _mm_sub_epi32(kZero, u[2]);
- out[9] = u[3];
- out[11] = _mm_sub_epi32(kZero, u[7]);
- out[13] = u[5];
- out[15] = _mm_sub_epi32(kZero, u[1]);
+ // stage 7
+ out[2 * 0 + col] = v1;
+ out[2 * 1 + col] = v6;
+ out[2 * 2 + col] = v3;
+ out[2 * 3 + col] = v4;
+ out[2 * 4 + col] = v5;
+ out[2 * 5 + col] = v2;
+ out[2 * 6 + col] = v7;
+ out[2 * 7 + col] = v0;
+ }
}
void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
TX_TYPE tx_type, int bd) {
__m128i in[16], out[16];
- const TXFM_1D_CFG *row_cfg = NULL;
- const TXFM_1D_CFG *col_cfg = NULL;
+ const int8_t *shift = fwd_txfm_shift_ls[TX_8X8];
+ const int txw_idx = get_txw_idx(TX_8X8);
+ const int txh_idx = get_txh_idx(TX_8X8);
switch (tx_type) {
case DCT_DCT:
- row_cfg = &fwd_txfm_1d_row_cfg_dct_8;
- col_cfg = &fwd_txfm_1d_col_cfg_dct_8;
- load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]);
- fdct8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
- col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
+ load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+ fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
+ fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
case ADST_DCT:
- row_cfg = &fwd_txfm_1d_row_cfg_dct_8;
- col_cfg = &fwd_txfm_1d_col_cfg_adst_8;
- load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]);
- fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
- col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
+ load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
+ fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
case DCT_ADST:
- row_cfg = &fwd_txfm_1d_row_cfg_adst_8;
- col_cfg = &fwd_txfm_1d_col_cfg_dct_8;
- load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]);
- fdct8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
- col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
+ load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+ fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
case ADST_ADST:
- row_cfg = &fwd_txfm_1d_row_cfg_adst_8;
- col_cfg = &fwd_txfm_1d_col_cfg_adst_8;
- load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]);
- fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
- col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
+ load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
-#if CONFIG_EXT_TX
case FLIPADST_DCT:
- row_cfg = &fwd_txfm_1d_row_cfg_dct_8;
- col_cfg = &fwd_txfm_1d_col_cfg_adst_8;
- load_buffer_8x8(input, in, stride, 1, 0, row_cfg->shift[0]);
- fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
- col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
+ load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
+ fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
case DCT_FLIPADST:
- row_cfg = &fwd_txfm_1d_row_cfg_adst_8;
- col_cfg = &fwd_txfm_1d_col_cfg_dct_8;
- load_buffer_8x8(input, in, stride, 0, 1, row_cfg->shift[0]);
- fdct8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
- col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
+ load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
+ fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
case FLIPADST_FLIPADST:
- row_cfg = &fwd_txfm_1d_row_cfg_adst_8;
- col_cfg = &fwd_txfm_1d_col_cfg_adst_8;
- load_buffer_8x8(input, in, stride, 1, 1, row_cfg->shift[0]);
- fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
- col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
+ load_buffer_8x8(input, in, stride, 1, 1, shift[0]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
case ADST_FLIPADST:
- row_cfg = &fwd_txfm_1d_row_cfg_adst_8;
- col_cfg = &fwd_txfm_1d_col_cfg_adst_8;
- load_buffer_8x8(input, in, stride, 0, 1, row_cfg->shift[0]);
- fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
- col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
+ load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
case FLIPADST_ADST:
- row_cfg = &fwd_txfm_1d_row_cfg_adst_8;
- col_cfg = &fwd_txfm_1d_col_cfg_adst_8;
- load_buffer_8x8(input, in, stride, 1, 0, row_cfg->shift[0]);
- fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
- col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
+ load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
-#endif // CONFIG_EXT_TX
default: assert(0);
}
(void)bd;
@@ -1402,230 +1228,174 @@ static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
- const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
- const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
- const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
- const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
- const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
- const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
- const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
- const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
- const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
- const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i zero = _mm_setzero_si128();
+
__m128i u[16], v[16], x, y;
- const int col_num = 4;
int col;
- // Calculate the column 0, 1, 2, 3
- for (col = 0; col < col_num; ++col) {
+ for (col = 0; col < 4; ++col) {
// stage 0
// stage 1
- // stage 2
- v[0] = _mm_mullo_epi32(in[15 * col_num + col], cospi2);
- x = _mm_mullo_epi32(in[0 * col_num + col], cospi62);
- v[0] = _mm_add_epi32(v[0], x);
- v[0] = _mm_add_epi32(v[0], rnding);
- v[0] = _mm_srai_epi32(v[0], bit);
+ u[0] = in[0 * 4 + col];
+ u[1] = _mm_sub_epi32(zero, in[15 * 4 + col]);
+ u[2] = _mm_sub_epi32(zero, in[7 * 4 + col]);
+ u[3] = in[8 * 4 + col];
+ u[4] = _mm_sub_epi32(zero, in[3 * 4 + col]);
+ u[5] = in[12 * 4 + col];
+ u[6] = in[4 * 4 + col];
+ u[7] = _mm_sub_epi32(zero, in[11 * 4 + col]);
+ u[8] = _mm_sub_epi32(zero, in[1 * 4 + col]);
+ u[9] = in[14 * 4 + col];
+ u[10] = in[6 * 4 + col];
+ u[11] = _mm_sub_epi32(zero, in[9 * 4 + col]);
+ u[12] = in[2 * 4 + col];
+ u[13] = _mm_sub_epi32(zero, in[13 * 4 + col]);
+ u[14] = _mm_sub_epi32(zero, in[5 * 4 + col]);
+ u[15] = in[10 * 4 + col];
- v[1] = _mm_mullo_epi32(in[15 * col_num + col], cospi62);
- x = _mm_mullo_epi32(in[0 * col_num + col], cospi2);
- v[1] = _mm_sub_epi32(v[1], x);
- v[1] = _mm_add_epi32(v[1], rnding);
- v[1] = _mm_srai_epi32(v[1], bit);
+ // stage 2
+ v[0] = u[0];
+ v[1] = u[1];
- v[2] = _mm_mullo_epi32(in[13 * col_num + col], cospi10);
- x = _mm_mullo_epi32(in[2 * col_num + col], cospi54);
- v[2] = _mm_add_epi32(v[2], x);
+ x = _mm_mullo_epi32(u[2], cospi32);
+ y = _mm_mullo_epi32(u[3], cospi32);
+ v[2] = _mm_add_epi32(x, y);
v[2] = _mm_add_epi32(v[2], rnding);
v[2] = _mm_srai_epi32(v[2], bit);
- v[3] = _mm_mullo_epi32(in[13 * col_num + col], cospi54);
- x = _mm_mullo_epi32(in[2 * col_num + col], cospi10);
- v[3] = _mm_sub_epi32(v[3], x);
+ v[3] = _mm_sub_epi32(x, y);
v[3] = _mm_add_epi32(v[3], rnding);
v[3] = _mm_srai_epi32(v[3], bit);
- v[4] = _mm_mullo_epi32(in[11 * col_num + col], cospi18);
- x = _mm_mullo_epi32(in[4 * col_num + col], cospi46);
- v[4] = _mm_add_epi32(v[4], x);
- v[4] = _mm_add_epi32(v[4], rnding);
- v[4] = _mm_srai_epi32(v[4], bit);
-
- v[5] = _mm_mullo_epi32(in[11 * col_num + col], cospi46);
- x = _mm_mullo_epi32(in[4 * col_num + col], cospi18);
- v[5] = _mm_sub_epi32(v[5], x);
- v[5] = _mm_add_epi32(v[5], rnding);
- v[5] = _mm_srai_epi32(v[5], bit);
-
- v[6] = _mm_mullo_epi32(in[9 * col_num + col], cospi26);
- x = _mm_mullo_epi32(in[6 * col_num + col], cospi38);
- v[6] = _mm_add_epi32(v[6], x);
+ v[4] = u[4];
+ v[5] = u[5];
+
+ x = _mm_mullo_epi32(u[6], cospi32);
+ y = _mm_mullo_epi32(u[7], cospi32);
+ v[6] = _mm_add_epi32(x, y);
v[6] = _mm_add_epi32(v[6], rnding);
v[6] = _mm_srai_epi32(v[6], bit);
- v[7] = _mm_mullo_epi32(in[9 * col_num + col], cospi38);
- x = _mm_mullo_epi32(in[6 * col_num + col], cospi26);
- v[7] = _mm_sub_epi32(v[7], x);
+ v[7] = _mm_sub_epi32(x, y);
v[7] = _mm_add_epi32(v[7], rnding);
v[7] = _mm_srai_epi32(v[7], bit);
- v[8] = _mm_mullo_epi32(in[7 * col_num + col], cospi34);
- x = _mm_mullo_epi32(in[8 * col_num + col], cospi30);
- v[8] = _mm_add_epi32(v[8], x);
- v[8] = _mm_add_epi32(v[8], rnding);
- v[8] = _mm_srai_epi32(v[8], bit);
-
- v[9] = _mm_mullo_epi32(in[7 * col_num + col], cospi30);
- x = _mm_mullo_epi32(in[8 * col_num + col], cospi34);
- v[9] = _mm_sub_epi32(v[9], x);
- v[9] = _mm_add_epi32(v[9], rnding);
- v[9] = _mm_srai_epi32(v[9], bit);
+ v[8] = u[8];
+ v[9] = u[9];
- v[10] = _mm_mullo_epi32(in[5 * col_num + col], cospi42);
- x = _mm_mullo_epi32(in[10 * col_num + col], cospi22);
- v[10] = _mm_add_epi32(v[10], x);
+ x = _mm_mullo_epi32(u[10], cospi32);
+ y = _mm_mullo_epi32(u[11], cospi32);
+ v[10] = _mm_add_epi32(x, y);
v[10] = _mm_add_epi32(v[10], rnding);
v[10] = _mm_srai_epi32(v[10], bit);
- v[11] = _mm_mullo_epi32(in[5 * col_num + col], cospi22);
- x = _mm_mullo_epi32(in[10 * col_num + col], cospi42);
- v[11] = _mm_sub_epi32(v[11], x);
+ v[11] = _mm_sub_epi32(x, y);
v[11] = _mm_add_epi32(v[11], rnding);
v[11] = _mm_srai_epi32(v[11], bit);
- v[12] = _mm_mullo_epi32(in[3 * col_num + col], cospi50);
- x = _mm_mullo_epi32(in[12 * col_num + col], cospi14);
- v[12] = _mm_add_epi32(v[12], x);
- v[12] = _mm_add_epi32(v[12], rnding);
- v[12] = _mm_srai_epi32(v[12], bit);
-
- v[13] = _mm_mullo_epi32(in[3 * col_num + col], cospi14);
- x = _mm_mullo_epi32(in[12 * col_num + col], cospi50);
- v[13] = _mm_sub_epi32(v[13], x);
- v[13] = _mm_add_epi32(v[13], rnding);
- v[13] = _mm_srai_epi32(v[13], bit);
+ v[12] = u[12];
+ v[13] = u[13];
- v[14] = _mm_mullo_epi32(in[1 * col_num + col], cospi58);
- x = _mm_mullo_epi32(in[14 * col_num + col], cospi6);
- v[14] = _mm_add_epi32(v[14], x);
+ x = _mm_mullo_epi32(u[14], cospi32);
+ y = _mm_mullo_epi32(u[15], cospi32);
+ v[14] = _mm_add_epi32(x, y);
v[14] = _mm_add_epi32(v[14], rnding);
v[14] = _mm_srai_epi32(v[14], bit);
- v[15] = _mm_mullo_epi32(in[1 * col_num + col], cospi6);
- x = _mm_mullo_epi32(in[14 * col_num + col], cospi58);
- v[15] = _mm_sub_epi32(v[15], x);
+ v[15] = _mm_sub_epi32(x, y);
v[15] = _mm_add_epi32(v[15], rnding);
v[15] = _mm_srai_epi32(v[15], bit);
// stage 3
- u[0] = _mm_add_epi32(v[0], v[8]);
- u[8] = _mm_sub_epi32(v[0], v[8]);
- u[1] = _mm_add_epi32(v[1], v[9]);
- u[9] = _mm_sub_epi32(v[1], v[9]);
- u[2] = _mm_add_epi32(v[2], v[10]);
- u[10] = _mm_sub_epi32(v[2], v[10]);
- u[3] = _mm_add_epi32(v[3], v[11]);
- u[11] = _mm_sub_epi32(v[3], v[11]);
- u[4] = _mm_add_epi32(v[4], v[12]);
- u[12] = _mm_sub_epi32(v[4], v[12]);
- u[5] = _mm_add_epi32(v[5], v[13]);
- u[13] = _mm_sub_epi32(v[5], v[13]);
- u[6] = _mm_add_epi32(v[6], v[14]);
- u[14] = _mm_sub_epi32(v[6], v[14]);
- u[7] = _mm_add_epi32(v[7], v[15]);
- u[15] = _mm_sub_epi32(v[7], v[15]);
+ u[0] = _mm_add_epi32(v[0], v[2]);
+ u[1] = _mm_add_epi32(v[1], v[3]);
+ u[2] = _mm_sub_epi32(v[0], v[2]);
+ u[3] = _mm_sub_epi32(v[1], v[3]);
+ u[4] = _mm_add_epi32(v[4], v[6]);
+ u[5] = _mm_add_epi32(v[5], v[7]);
+ u[6] = _mm_sub_epi32(v[4], v[6]);
+ u[7] = _mm_sub_epi32(v[5], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[10]);
+ u[9] = _mm_add_epi32(v[9], v[11]);
+ u[10] = _mm_sub_epi32(v[8], v[10]);
+ u[11] = _mm_sub_epi32(v[9], v[11]);
+ u[12] = _mm_add_epi32(v[12], v[14]);
+ u[13] = _mm_add_epi32(v[13], v[15]);
+ u[14] = _mm_sub_epi32(v[12], v[14]);
+ u[15] = _mm_sub_epi32(v[13], v[15]);
// stage 4
v[0] = u[0];
v[1] = u[1];
v[2] = u[2];
v[3] = u[3];
- v[4] = u[4];
- v[5] = u[5];
- v[6] = u[6];
- v[7] = u[7];
-
- v[8] = _mm_mullo_epi32(u[8], cospi8);
- x = _mm_mullo_epi32(u[9], cospi56);
- v[8] = _mm_add_epi32(v[8], x);
- v[8] = _mm_add_epi32(v[8], rnding);
- v[8] = _mm_srai_epi32(v[8], bit);
-
- v[9] = _mm_mullo_epi32(u[8], cospi56);
- x = _mm_mullo_epi32(u[9], cospi8);
- v[9] = _mm_sub_epi32(v[9], x);
- v[9] = _mm_add_epi32(v[9], rnding);
- v[9] = _mm_srai_epi32(v[9], bit);
-
- v[10] = _mm_mullo_epi32(u[10], cospi40);
- x = _mm_mullo_epi32(u[11], cospi24);
- v[10] = _mm_add_epi32(v[10], x);
- v[10] = _mm_add_epi32(v[10], rnding);
- v[10] = _mm_srai_epi32(v[10], bit);
-
- v[11] = _mm_mullo_epi32(u[10], cospi24);
- x = _mm_mullo_epi32(u[11], cospi40);
- v[11] = _mm_sub_epi32(v[11], x);
- v[11] = _mm_add_epi32(v[11], rnding);
- v[11] = _mm_srai_epi32(v[11], bit);
-
- v[12] = _mm_mullo_epi32(u[12], cospim56);
- x = _mm_mullo_epi32(u[13], cospi8);
- v[12] = _mm_add_epi32(v[12], x);
- v[12] = _mm_add_epi32(v[12], rnding);
- v[12] = _mm_srai_epi32(v[12], bit);
-
- v[13] = _mm_mullo_epi32(u[12], cospi8);
- x = _mm_mullo_epi32(u[13], cospim56);
- v[13] = _mm_sub_epi32(v[13], x);
- v[13] = _mm_add_epi32(v[13], rnding);
- v[13] = _mm_srai_epi32(v[13], bit);
-
- v[14] = _mm_mullo_epi32(u[14], cospim24);
- x = _mm_mullo_epi32(u[15], cospi40);
- v[14] = _mm_add_epi32(v[14], x);
- v[14] = _mm_add_epi32(v[14], rnding);
- v[14] = _mm_srai_epi32(v[14], bit);
-
- v[15] = _mm_mullo_epi32(u[14], cospi40);
- x = _mm_mullo_epi32(u[15], cospim24);
- v[15] = _mm_sub_epi32(v[15], x);
- v[15] = _mm_add_epi32(v[15], rnding);
- v[15] = _mm_srai_epi32(v[15], bit);
+ v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
+ v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
+ v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
+ v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+ v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
+ v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
// stage 5
u[0] = _mm_add_epi32(v[0], v[4]);
- u[4] = _mm_sub_epi32(v[0], v[4]);
u[1] = _mm_add_epi32(v[1], v[5]);
- u[5] = _mm_sub_epi32(v[1], v[5]);
u[2] = _mm_add_epi32(v[2], v[6]);
- u[6] = _mm_sub_epi32(v[2], v[6]);
u[3] = _mm_add_epi32(v[3], v[7]);
+ u[4] = _mm_sub_epi32(v[0], v[4]);
+ u[5] = _mm_sub_epi32(v[1], v[5]);
+ u[6] = _mm_sub_epi32(v[2], v[6]);
u[7] = _mm_sub_epi32(v[3], v[7]);
u[8] = _mm_add_epi32(v[8], v[12]);
- u[12] = _mm_sub_epi32(v[8], v[12]);
u[9] = _mm_add_epi32(v[9], v[13]);
- u[13] = _mm_sub_epi32(v[9], v[13]);
u[10] = _mm_add_epi32(v[10], v[14]);
- u[14] = _mm_sub_epi32(v[10], v[14]);
u[11] = _mm_add_epi32(v[11], v[15]);
+ u[12] = _mm_sub_epi32(v[8], v[12]);
+ u[13] = _mm_sub_epi32(v[9], v[13]);
+ u[14] = _mm_sub_epi32(v[10], v[14]);
u[15] = _mm_sub_epi32(v[11], v[15]);
// stage 6
@@ -1633,148 +1403,72 @@ static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
v[1] = u[1];
v[2] = u[2];
v[3] = u[3];
-
- v[4] = _mm_mullo_epi32(u[4], cospi16);
- x = _mm_mullo_epi32(u[5], cospi48);
- v[4] = _mm_add_epi32(v[4], x);
- v[4] = _mm_add_epi32(v[4], rnding);
- v[4] = _mm_srai_epi32(v[4], bit);
-
- v[5] = _mm_mullo_epi32(u[4], cospi48);
- x = _mm_mullo_epi32(u[5], cospi16);
- v[5] = _mm_sub_epi32(v[5], x);
- v[5] = _mm_add_epi32(v[5], rnding);
- v[5] = _mm_srai_epi32(v[5], bit);
-
- v[6] = _mm_mullo_epi32(u[6], cospim48);
- x = _mm_mullo_epi32(u[7], cospi16);
- v[6] = _mm_add_epi32(v[6], x);
- v[6] = _mm_add_epi32(v[6], rnding);
- v[6] = _mm_srai_epi32(v[6], bit);
-
- v[7] = _mm_mullo_epi32(u[6], cospi16);
- x = _mm_mullo_epi32(u[7], cospim48);
- v[7] = _mm_sub_epi32(v[7], x);
- v[7] = _mm_add_epi32(v[7], rnding);
- v[7] = _mm_srai_epi32(v[7], bit);
-
- v[8] = u[8];
- v[9] = u[9];
- v[10] = u[10];
- v[11] = u[11];
-
- v[12] = _mm_mullo_epi32(u[12], cospi16);
- x = _mm_mullo_epi32(u[13], cospi48);
- v[12] = _mm_add_epi32(v[12], x);
- v[12] = _mm_add_epi32(v[12], rnding);
- v[12] = _mm_srai_epi32(v[12], bit);
-
- v[13] = _mm_mullo_epi32(u[12], cospi48);
- x = _mm_mullo_epi32(u[13], cospi16);
- v[13] = _mm_sub_epi32(v[13], x);
- v[13] = _mm_add_epi32(v[13], rnding);
- v[13] = _mm_srai_epi32(v[13], bit);
-
- v[14] = _mm_mullo_epi32(u[14], cospim48);
- x = _mm_mullo_epi32(u[15], cospi16);
- v[14] = _mm_add_epi32(v[14], x);
- v[14] = _mm_add_epi32(v[14], rnding);
- v[14] = _mm_srai_epi32(v[14], bit);
-
- v[15] = _mm_mullo_epi32(u[14], cospi16);
- x = _mm_mullo_epi32(u[15], cospim48);
- v[15] = _mm_sub_epi32(v[15], x);
- v[15] = _mm_add_epi32(v[15], rnding);
- v[15] = _mm_srai_epi32(v[15], bit);
-
- // stage 7
- u[0] = _mm_add_epi32(v[0], v[2]);
- u[2] = _mm_sub_epi32(v[0], v[2]);
- u[1] = _mm_add_epi32(v[1], v[3]);
- u[3] = _mm_sub_epi32(v[1], v[3]);
- u[4] = _mm_add_epi32(v[4], v[6]);
- u[6] = _mm_sub_epi32(v[4], v[6]);
- u[5] = _mm_add_epi32(v[5], v[7]);
- u[7] = _mm_sub_epi32(v[5], v[7]);
- u[8] = _mm_add_epi32(v[8], v[10]);
- u[10] = _mm_sub_epi32(v[8], v[10]);
- u[9] = _mm_add_epi32(v[9], v[11]);
- u[11] = _mm_sub_epi32(v[9], v[11]);
- u[12] = _mm_add_epi32(v[12], v[14]);
- u[14] = _mm_sub_epi32(v[12], v[14]);
- u[13] = _mm_add_epi32(v[13], v[15]);
- u[15] = _mm_sub_epi32(v[13], v[15]);
-
- // stage 8
- v[0] = u[0];
- v[1] = u[1];
-
- y = _mm_mullo_epi32(u[2], cospi32);
- x = _mm_mullo_epi32(u[3], cospi32);
- v[2] = _mm_add_epi32(y, x);
- v[2] = _mm_add_epi32(v[2], rnding);
- v[2] = _mm_srai_epi32(v[2], bit);
-
- v[3] = _mm_sub_epi32(y, x);
- v[3] = _mm_add_epi32(v[3], rnding);
- v[3] = _mm_srai_epi32(v[3], bit);
-
v[4] = u[4];
v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+ v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
+ v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
+ v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
+ v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
+ v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
- y = _mm_mullo_epi32(u[6], cospi32);
- x = _mm_mullo_epi32(u[7], cospi32);
- v[6] = _mm_add_epi32(y, x);
- v[6] = _mm_add_epi32(v[6], rnding);
- v[6] = _mm_srai_epi32(v[6], bit);
-
- v[7] = _mm_sub_epi32(y, x);
- v[7] = _mm_add_epi32(v[7], rnding);
- v[7] = _mm_srai_epi32(v[7], bit);
-
- v[8] = u[8];
- v[9] = u[9];
-
- y = _mm_mullo_epi32(u[10], cospi32);
- x = _mm_mullo_epi32(u[11], cospi32);
- v[10] = _mm_add_epi32(y, x);
- v[10] = _mm_add_epi32(v[10], rnding);
- v[10] = _mm_srai_epi32(v[10], bit);
-
- v[11] = _mm_sub_epi32(y, x);
- v[11] = _mm_add_epi32(v[11], rnding);
- v[11] = _mm_srai_epi32(v[11], bit);
-
- v[12] = u[12];
- v[13] = u[13];
-
- y = _mm_mullo_epi32(u[14], cospi32);
- x = _mm_mullo_epi32(u[15], cospi32);
- v[14] = _mm_add_epi32(y, x);
- v[14] = _mm_add_epi32(v[14], rnding);
- v[14] = _mm_srai_epi32(v[14], bit);
+ // stage 7
+ u[0] = _mm_add_epi32(v[0], v[8]);
+ u[1] = _mm_add_epi32(v[1], v[9]);
+ u[2] = _mm_add_epi32(v[2], v[10]);
+ u[3] = _mm_add_epi32(v[3], v[11]);
+ u[4] = _mm_add_epi32(v[4], v[12]);
+ u[5] = _mm_add_epi32(v[5], v[13]);
+ u[6] = _mm_add_epi32(v[6], v[14]);
+ u[7] = _mm_add_epi32(v[7], v[15]);
+ u[8] = _mm_sub_epi32(v[0], v[8]);
+ u[9] = _mm_sub_epi32(v[1], v[9]);
+ u[10] = _mm_sub_epi32(v[2], v[10]);
+ u[11] = _mm_sub_epi32(v[3], v[11]);
+ u[12] = _mm_sub_epi32(v[4], v[12]);
+ u[13] = _mm_sub_epi32(v[5], v[13]);
+ u[14] = _mm_sub_epi32(v[6], v[14]);
+ u[15] = _mm_sub_epi32(v[7], v[15]);
- v[15] = _mm_sub_epi32(y, x);
- v[15] = _mm_add_epi32(v[15], rnding);
- v[15] = _mm_srai_epi32(v[15], bit);
+ // stage 8
+ v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
+ v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
+ v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
+ v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
+ v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
+ v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
+ v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
+ v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
+ v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
+ v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
+ v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
+ v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
+ v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
// stage 9
- out[0 * col_num + col] = v[0];
- out[1 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[8]);
- out[2 * col_num + col] = v[12];
- out[3 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[4]);
- out[4 * col_num + col] = v[6];
- out[5 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[14]);
- out[6 * col_num + col] = v[10];
- out[7 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[2]);
- out[8 * col_num + col] = v[3];
- out[9 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[11]);
- out[10 * col_num + col] = v[15];
- out[11 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[7]);
- out[12 * col_num + col] = v[5];
- out[13 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[13]);
- out[14 * col_num + col] = v[9];
- out[15 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[1]);
+ out[0 * 4 + col] = v[1];
+ out[1 * 4 + col] = v[14];
+ out[2 * 4 + col] = v[3];
+ out[3 * 4 + col] = v[12];
+ out[4 * 4 + col] = v[5];
+ out[5 * 4 + col] = v[10];
+ out[6 * 4 + col] = v[7];
+ out[7 * 4 + col] = v[8];
+ out[8 * 4 + col] = v[9];
+ out[9 * 4 + col] = v[6];
+ out[10 * 4 + col] = v[11];
+ out[11 * 4 + col] = v[4];
+ out[12 * 4 + col] = v[13];
+ out[13 * 4 + col] = v[2];
+ out[14 * 4 + col] = v[15];
+ out[15 * 4 + col] = v[0];
}
}
@@ -1802,111 +1496,91 @@ static void write_buffer_16x16(const __m128i *in, int32_t *output) {
void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
int stride, TX_TYPE tx_type, int bd) {
__m128i in[64], out[64];
- const TXFM_1D_CFG *row_cfg = NULL;
- const TXFM_1D_CFG *col_cfg = NULL;
-
+ const int8_t *shift = fwd_txfm_shift_ls[TX_16X16];
+ const int txw_idx = get_txw_idx(TX_16X16);
+ const int txh_idx = get_txh_idx(TX_16X16);
switch (tx_type) {
case DCT_DCT:
- row_cfg = &fwd_txfm_1d_row_cfg_dct_16;
- col_cfg = &fwd_txfm_1d_col_cfg_dct_16;
- load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]);
- fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
- col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
+ fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
case ADST_DCT:
- row_cfg = &fwd_txfm_1d_row_cfg_dct_16;
- col_cfg = &fwd_txfm_1d_col_cfg_adst_16;
- load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]);
- fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
- col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
+ fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
case DCT_ADST:
- row_cfg = &fwd_txfm_1d_row_cfg_adst_16;
- col_cfg = &fwd_txfm_1d_col_cfg_dct_16;
- load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]);
- fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
- col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
case ADST_ADST:
- row_cfg = &fwd_txfm_1d_row_cfg_adst_16;
- col_cfg = &fwd_txfm_1d_col_cfg_adst_16;
- load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]);
- fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
- col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
-#if CONFIG_EXT_TX
case FLIPADST_DCT:
- row_cfg = &fwd_txfm_1d_row_cfg_dct_16;
- col_cfg = &fwd_txfm_1d_col_cfg_adst_16;
- load_buffer_16x16(input, in, stride, 1, 0, row_cfg->shift[0]);
- fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
- col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
+ load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
+ fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
case DCT_FLIPADST:
- row_cfg = &fwd_txfm_1d_row_cfg_adst_16;
- col_cfg = &fwd_txfm_1d_col_cfg_dct_16;
- load_buffer_16x16(input, in, stride, 0, 1, row_cfg->shift[0]);
- fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
- col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
+ load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
+ fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
case FLIPADST_FLIPADST:
- row_cfg = &fwd_txfm_1d_row_cfg_adst_16;
- col_cfg = &fwd_txfm_1d_col_cfg_adst_16;
- load_buffer_16x16(input, in, stride, 1, 1, row_cfg->shift[0]);
- fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
- col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
+ load_buffer_16x16(input, in, stride, 1, 1, shift[0]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
case ADST_FLIPADST:
- row_cfg = &fwd_txfm_1d_row_cfg_adst_16;
- col_cfg = &fwd_txfm_1d_col_cfg_adst_16;
- load_buffer_16x16(input, in, stride, 0, 1, row_cfg->shift[0]);
- fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
- col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
+ load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
case FLIPADST_ADST:
- row_cfg = &fwd_txfm_1d_row_cfg_adst_16;
- col_cfg = &fwd_txfm_1d_col_cfg_adst_16;
- load_buffer_16x16(input, in, stride, 1, 0, row_cfg->shift[0]);
- fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
- col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
+ load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
-#endif // CONFIG_EXT_TX
default: assert(0);
}
(void)bd;
diff --git a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
deleted file mode 100644
index 88621c82b..000000000
--- a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
+++ /dev/null
@@ -1,1627 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h> // avx2
-
-#include "./av1_rtcd.h"
-#include "./aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/fwd_txfm_avx2.h"
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/txfm_common_avx2.h"
-
-static INLINE void load_buffer_16x16(const int16_t *input, int stride,
- int flipud, int fliplr, __m256i *in) {
- if (!flipud) {
- in[0] = _mm256_loadu_si256((const __m256i *)(input + 0 * stride));
- in[1] = _mm256_loadu_si256((const __m256i *)(input + 1 * stride));
- in[2] = _mm256_loadu_si256((const __m256i *)(input + 2 * stride));
- in[3] = _mm256_loadu_si256((const __m256i *)(input + 3 * stride));
- in[4] = _mm256_loadu_si256((const __m256i *)(input + 4 * stride));
- in[5] = _mm256_loadu_si256((const __m256i *)(input + 5 * stride));
- in[6] = _mm256_loadu_si256((const __m256i *)(input + 6 * stride));
- in[7] = _mm256_loadu_si256((const __m256i *)(input + 7 * stride));
- in[8] = _mm256_loadu_si256((const __m256i *)(input + 8 * stride));
- in[9] = _mm256_loadu_si256((const __m256i *)(input + 9 * stride));
- in[10] = _mm256_loadu_si256((const __m256i *)(input + 10 * stride));
- in[11] = _mm256_loadu_si256((const __m256i *)(input + 11 * stride));
- in[12] = _mm256_loadu_si256((const __m256i *)(input + 12 * stride));
- in[13] = _mm256_loadu_si256((const __m256i *)(input + 13 * stride));
- in[14] = _mm256_loadu_si256((const __m256i *)(input + 14 * stride));
- in[15] = _mm256_loadu_si256((const __m256i *)(input + 15 * stride));
- } else {
- in[0] = _mm256_loadu_si256((const __m256i *)(input + 15 * stride));
- in[1] = _mm256_loadu_si256((const __m256i *)(input + 14 * stride));
- in[2] = _mm256_loadu_si256((const __m256i *)(input + 13 * stride));
- in[3] = _mm256_loadu_si256((const __m256i *)(input + 12 * stride));
- in[4] = _mm256_loadu_si256((const __m256i *)(input + 11 * stride));
- in[5] = _mm256_loadu_si256((const __m256i *)(input + 10 * stride));
- in[6] = _mm256_loadu_si256((const __m256i *)(input + 9 * stride));
- in[7] = _mm256_loadu_si256((const __m256i *)(input + 8 * stride));
- in[8] = _mm256_loadu_si256((const __m256i *)(input + 7 * stride));
- in[9] = _mm256_loadu_si256((const __m256i *)(input + 6 * stride));
- in[10] = _mm256_loadu_si256((const __m256i *)(input + 5 * stride));
- in[11] = _mm256_loadu_si256((const __m256i *)(input + 4 * stride));
- in[12] = _mm256_loadu_si256((const __m256i *)(input + 3 * stride));
- in[13] = _mm256_loadu_si256((const __m256i *)(input + 2 * stride));
- in[14] = _mm256_loadu_si256((const __m256i *)(input + 1 * stride));
- in[15] = _mm256_loadu_si256((const __m256i *)(input + 0 * stride));
- }
-
- if (fliplr) {
- mm256_reverse_epi16(&in[0]);
- mm256_reverse_epi16(&in[1]);
- mm256_reverse_epi16(&in[2]);
- mm256_reverse_epi16(&in[3]);
- mm256_reverse_epi16(&in[4]);
- mm256_reverse_epi16(&in[5]);
- mm256_reverse_epi16(&in[6]);
- mm256_reverse_epi16(&in[7]);
- mm256_reverse_epi16(&in[8]);
- mm256_reverse_epi16(&in[9]);
- mm256_reverse_epi16(&in[10]);
- mm256_reverse_epi16(&in[11]);
- mm256_reverse_epi16(&in[12]);
- mm256_reverse_epi16(&in[13]);
- mm256_reverse_epi16(&in[14]);
- mm256_reverse_epi16(&in[15]);
- }
-
- in[0] = _mm256_slli_epi16(in[0], 2);
- in[1] = _mm256_slli_epi16(in[1], 2);
- in[2] = _mm256_slli_epi16(in[2], 2);
- in[3] = _mm256_slli_epi16(in[3], 2);
- in[4] = _mm256_slli_epi16(in[4], 2);
- in[5] = _mm256_slli_epi16(in[5], 2);
- in[6] = _mm256_slli_epi16(in[6], 2);
- in[7] = _mm256_slli_epi16(in[7], 2);
- in[8] = _mm256_slli_epi16(in[8], 2);
- in[9] = _mm256_slli_epi16(in[9], 2);
- in[10] = _mm256_slli_epi16(in[10], 2);
- in[11] = _mm256_slli_epi16(in[11], 2);
- in[12] = _mm256_slli_epi16(in[12], 2);
- in[13] = _mm256_slli_epi16(in[13], 2);
- in[14] = _mm256_slli_epi16(in[14], 2);
- in[15] = _mm256_slli_epi16(in[15], 2);
-}
-
-static INLINE void write_buffer_16x16(const __m256i *in, tran_low_t *output) {
- int i;
- for (i = 0; i < 16; ++i) {
- storeu_output_avx2(&in[i], output + (i << 4));
- }
-}
-
-static void right_shift_16x16(__m256i *in) {
- const __m256i one = _mm256_set1_epi16(1);
- __m256i s0 = _mm256_srai_epi16(in[0], 15);
- __m256i s1 = _mm256_srai_epi16(in[1], 15);
- __m256i s2 = _mm256_srai_epi16(in[2], 15);
- __m256i s3 = _mm256_srai_epi16(in[3], 15);
- __m256i s4 = _mm256_srai_epi16(in[4], 15);
- __m256i s5 = _mm256_srai_epi16(in[5], 15);
- __m256i s6 = _mm256_srai_epi16(in[6], 15);
- __m256i s7 = _mm256_srai_epi16(in[7], 15);
- __m256i s8 = _mm256_srai_epi16(in[8], 15);
- __m256i s9 = _mm256_srai_epi16(in[9], 15);
- __m256i s10 = _mm256_srai_epi16(in[10], 15);
- __m256i s11 = _mm256_srai_epi16(in[11], 15);
- __m256i s12 = _mm256_srai_epi16(in[12], 15);
- __m256i s13 = _mm256_srai_epi16(in[13], 15);
- __m256i s14 = _mm256_srai_epi16(in[14], 15);
- __m256i s15 = _mm256_srai_epi16(in[15], 15);
-
- in[0] = _mm256_add_epi16(in[0], one);
- in[1] = _mm256_add_epi16(in[1], one);
- in[2] = _mm256_add_epi16(in[2], one);
- in[3] = _mm256_add_epi16(in[3], one);
- in[4] = _mm256_add_epi16(in[4], one);
- in[5] = _mm256_add_epi16(in[5], one);
- in[6] = _mm256_add_epi16(in[6], one);
- in[7] = _mm256_add_epi16(in[7], one);
- in[8] = _mm256_add_epi16(in[8], one);
- in[9] = _mm256_add_epi16(in[9], one);
- in[10] = _mm256_add_epi16(in[10], one);
- in[11] = _mm256_add_epi16(in[11], one);
- in[12] = _mm256_add_epi16(in[12], one);
- in[13] = _mm256_add_epi16(in[13], one);
- in[14] = _mm256_add_epi16(in[14], one);
- in[15] = _mm256_add_epi16(in[15], one);
-
- in[0] = _mm256_sub_epi16(in[0], s0);
- in[1] = _mm256_sub_epi16(in[1], s1);
- in[2] = _mm256_sub_epi16(in[2], s2);
- in[3] = _mm256_sub_epi16(in[3], s3);
- in[4] = _mm256_sub_epi16(in[4], s4);
- in[5] = _mm256_sub_epi16(in[5], s5);
- in[6] = _mm256_sub_epi16(in[6], s6);
- in[7] = _mm256_sub_epi16(in[7], s7);
- in[8] = _mm256_sub_epi16(in[8], s8);
- in[9] = _mm256_sub_epi16(in[9], s9);
- in[10] = _mm256_sub_epi16(in[10], s10);
- in[11] = _mm256_sub_epi16(in[11], s11);
- in[12] = _mm256_sub_epi16(in[12], s12);
- in[13] = _mm256_sub_epi16(in[13], s13);
- in[14] = _mm256_sub_epi16(in[14], s14);
- in[15] = _mm256_sub_epi16(in[15], s15);
-
- in[0] = _mm256_srai_epi16(in[0], 2);
- in[1] = _mm256_srai_epi16(in[1], 2);
- in[2] = _mm256_srai_epi16(in[2], 2);
- in[3] = _mm256_srai_epi16(in[3], 2);
- in[4] = _mm256_srai_epi16(in[4], 2);
- in[5] = _mm256_srai_epi16(in[5], 2);
- in[6] = _mm256_srai_epi16(in[6], 2);
- in[7] = _mm256_srai_epi16(in[7], 2);
- in[8] = _mm256_srai_epi16(in[8], 2);
- in[9] = _mm256_srai_epi16(in[9], 2);
- in[10] = _mm256_srai_epi16(in[10], 2);
- in[11] = _mm256_srai_epi16(in[11], 2);
- in[12] = _mm256_srai_epi16(in[12], 2);
- in[13] = _mm256_srai_epi16(in[13], 2);
- in[14] = _mm256_srai_epi16(in[14], 2);
- in[15] = _mm256_srai_epi16(in[15], 2);
-}
-
-static void fdct16_avx2(__m256i *in) {
- // sequence: cospi_L_H = pairs(L, H) and L first
- const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
- const __m256i cospi_p16_p16 = pair256_set_epi16(cospi_16_64, cospi_16_64);
- const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
- const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
- const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-
- const __m256i cospi_p28_p04 = pair256_set_epi16(cospi_28_64, cospi_4_64);
- const __m256i cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
- const __m256i cospi_p12_p20 = pair256_set_epi16(cospi_12_64, cospi_20_64);
- const __m256i cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
-
- const __m256i cospi_p30_p02 = pair256_set_epi16(cospi_30_64, cospi_2_64);
- const __m256i cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64);
-
- const __m256i cospi_p14_p18 = pair256_set_epi16(cospi_14_64, cospi_18_64);
- const __m256i cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64);
-
- const __m256i cospi_p22_p10 = pair256_set_epi16(cospi_22_64, cospi_10_64);
- const __m256i cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64);
-
- const __m256i cospi_p06_p26 = pair256_set_epi16(cospi_6_64, cospi_26_64);
- const __m256i cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64);
-
- __m256i u0, u1, u2, u3, u4, u5, u6, u7;
- __m256i s0, s1, s2, s3, s4, s5, s6, s7;
- __m256i t0, t1, t2, t3, t4, t5, t6, t7;
- __m256i v0, v1, v2, v3;
- __m256i x0, x1;
-
- // 0, 4, 8, 12
- u0 = _mm256_add_epi16(in[0], in[15]);
- u1 = _mm256_add_epi16(in[1], in[14]);
- u2 = _mm256_add_epi16(in[2], in[13]);
- u3 = _mm256_add_epi16(in[3], in[12]);
- u4 = _mm256_add_epi16(in[4], in[11]);
- u5 = _mm256_add_epi16(in[5], in[10]);
- u6 = _mm256_add_epi16(in[6], in[9]);
- u7 = _mm256_add_epi16(in[7], in[8]);
-
- s0 = _mm256_add_epi16(u0, u7);
- s1 = _mm256_add_epi16(u1, u6);
- s2 = _mm256_add_epi16(u2, u5);
- s3 = _mm256_add_epi16(u3, u4);
-
- // 0, 8
- v0 = _mm256_add_epi16(s0, s3);
- v1 = _mm256_add_epi16(s1, s2);
-
- x0 = _mm256_unpacklo_epi16(v0, v1);
- x1 = _mm256_unpackhi_epi16(v0, v1);
-
- t0 = butter_fly(&x0, &x1, &cospi_p16_p16);
- t1 = butter_fly(&x0, &x1, &cospi_p16_m16);
-
- // 4, 12
- v0 = _mm256_sub_epi16(s1, s2);
- v1 = _mm256_sub_epi16(s0, s3);
-
- x0 = _mm256_unpacklo_epi16(v0, v1);
- x1 = _mm256_unpackhi_epi16(v0, v1);
-
- t2 = butter_fly(&x0, &x1, &cospi_p24_p08);
- t3 = butter_fly(&x0, &x1, &cospi_m08_p24);
-
- // 2, 6, 10, 14
- s0 = _mm256_sub_epi16(u3, u4);
- s1 = _mm256_sub_epi16(u2, u5);
- s2 = _mm256_sub_epi16(u1, u6);
- s3 = _mm256_sub_epi16(u0, u7);
-
- v0 = s0; // output[4]
- v3 = s3; // output[7]
-
- x0 = _mm256_unpacklo_epi16(s2, s1);
- x1 = _mm256_unpackhi_epi16(s2, s1);
-
- v2 = butter_fly(&x0, &x1, &cospi_p16_p16); // output[5]
- v1 = butter_fly(&x0, &x1, &cospi_p16_m16); // output[6]
-
- s0 = _mm256_add_epi16(v0, v1); // step[4]
- s1 = _mm256_sub_epi16(v0, v1); // step[5]
- s2 = _mm256_sub_epi16(v3, v2); // step[6]
- s3 = _mm256_add_epi16(v3, v2); // step[7]
-
- // 2, 14
- x0 = _mm256_unpacklo_epi16(s0, s3);
- x1 = _mm256_unpackhi_epi16(s0, s3);
-
- t4 = butter_fly(&x0, &x1, &cospi_p28_p04);
- t5 = butter_fly(&x0, &x1, &cospi_m04_p28);
-
- // 10, 6
- x0 = _mm256_unpacklo_epi16(s1, s2);
- x1 = _mm256_unpackhi_epi16(s1, s2);
- t6 = butter_fly(&x0, &x1, &cospi_p12_p20);
- t7 = butter_fly(&x0, &x1, &cospi_m20_p12);
-
- // 1, 3, 5, 7, 9, 11, 13, 15
- s0 = _mm256_sub_epi16(in[7], in[8]); // step[8]
- s1 = _mm256_sub_epi16(in[6], in[9]); // step[9]
- u2 = _mm256_sub_epi16(in[5], in[10]);
- u3 = _mm256_sub_epi16(in[4], in[11]);
- u4 = _mm256_sub_epi16(in[3], in[12]);
- u5 = _mm256_sub_epi16(in[2], in[13]);
- s6 = _mm256_sub_epi16(in[1], in[14]); // step[14]
- s7 = _mm256_sub_epi16(in[0], in[15]); // step[15]
-
- in[0] = t0;
- in[8] = t1;
- in[4] = t2;
- in[12] = t3;
- in[2] = t4;
- in[14] = t5;
- in[10] = t6;
- in[6] = t7;
-
- x0 = _mm256_unpacklo_epi16(u5, u2);
- x1 = _mm256_unpackhi_epi16(u5, u2);
-
- s2 = butter_fly(&x0, &x1, &cospi_p16_p16); // step[13]
- s5 = butter_fly(&x0, &x1, &cospi_p16_m16); // step[10]
-
- x0 = _mm256_unpacklo_epi16(u4, u3);
- x1 = _mm256_unpackhi_epi16(u4, u3);
-
- s3 = butter_fly(&x0, &x1, &cospi_p16_p16); // step[12]
- s4 = butter_fly(&x0, &x1, &cospi_p16_m16); // step[11]
-
- u0 = _mm256_add_epi16(s0, s4); // output[8]
- u1 = _mm256_add_epi16(s1, s5);
- u2 = _mm256_sub_epi16(s1, s5);
- u3 = _mm256_sub_epi16(s0, s4);
- u4 = _mm256_sub_epi16(s7, s3);
- u5 = _mm256_sub_epi16(s6, s2);
- u6 = _mm256_add_epi16(s6, s2);
- u7 = _mm256_add_epi16(s7, s3);
-
- // stage 4
- s0 = u0;
- s3 = u3;
- s4 = u4;
- s7 = u7;
-
- x0 = _mm256_unpacklo_epi16(u1, u6);
- x1 = _mm256_unpackhi_epi16(u1, u6);
-
- s1 = butter_fly(&x0, &x1, &cospi_m08_p24);
- s6 = butter_fly(&x0, &x1, &cospi_p24_p08);
-
- x0 = _mm256_unpacklo_epi16(u2, u5);
- x1 = _mm256_unpackhi_epi16(u2, u5);
-
- s2 = butter_fly(&x0, &x1, &cospi_m24_m08);
- s5 = butter_fly(&x0, &x1, &cospi_m08_p24);
-
- // stage 5
- u0 = _mm256_add_epi16(s0, s1);
- u1 = _mm256_sub_epi16(s0, s1);
- u2 = _mm256_sub_epi16(s3, s2);
- u3 = _mm256_add_epi16(s3, s2);
- u4 = _mm256_add_epi16(s4, s5);
- u5 = _mm256_sub_epi16(s4, s5);
- u6 = _mm256_sub_epi16(s7, s6);
- u7 = _mm256_add_epi16(s7, s6);
-
- // stage 6
- x0 = _mm256_unpacklo_epi16(u0, u7);
- x1 = _mm256_unpackhi_epi16(u0, u7);
- in[1] = butter_fly(&x0, &x1, &cospi_p30_p02);
- in[15] = butter_fly(&x0, &x1, &cospi_m02_p30);
-
- x0 = _mm256_unpacklo_epi16(u1, u6);
- x1 = _mm256_unpackhi_epi16(u1, u6);
- in[9] = butter_fly(&x0, &x1, &cospi_p14_p18);
- in[7] = butter_fly(&x0, &x1, &cospi_m18_p14);
-
- x0 = _mm256_unpacklo_epi16(u2, u5);
- x1 = _mm256_unpackhi_epi16(u2, u5);
- in[5] = butter_fly(&x0, &x1, &cospi_p22_p10);
- in[11] = butter_fly(&x0, &x1, &cospi_m10_p22);
-
- x0 = _mm256_unpacklo_epi16(u3, u4);
- x1 = _mm256_unpackhi_epi16(u3, u4);
- in[13] = butter_fly(&x0, &x1, &cospi_p06_p26);
- in[3] = butter_fly(&x0, &x1, &cospi_m26_p06);
-}
-
-void fadst16_avx2(__m256i *in) {
- const __m256i cospi_p01_p31 = pair256_set_epi16(cospi_1_64, cospi_31_64);
- const __m256i cospi_p31_m01 = pair256_set_epi16(cospi_31_64, -cospi_1_64);
- const __m256i cospi_p05_p27 = pair256_set_epi16(cospi_5_64, cospi_27_64);
- const __m256i cospi_p27_m05 = pair256_set_epi16(cospi_27_64, -cospi_5_64);
- const __m256i cospi_p09_p23 = pair256_set_epi16(cospi_9_64, cospi_23_64);
- const __m256i cospi_p23_m09 = pair256_set_epi16(cospi_23_64, -cospi_9_64);
- const __m256i cospi_p13_p19 = pair256_set_epi16(cospi_13_64, cospi_19_64);
- const __m256i cospi_p19_m13 = pair256_set_epi16(cospi_19_64, -cospi_13_64);
- const __m256i cospi_p17_p15 = pair256_set_epi16(cospi_17_64, cospi_15_64);
- const __m256i cospi_p15_m17 = pair256_set_epi16(cospi_15_64, -cospi_17_64);
- const __m256i cospi_p21_p11 = pair256_set_epi16(cospi_21_64, cospi_11_64);
- const __m256i cospi_p11_m21 = pair256_set_epi16(cospi_11_64, -cospi_21_64);
- const __m256i cospi_p25_p07 = pair256_set_epi16(cospi_25_64, cospi_7_64);
- const __m256i cospi_p07_m25 = pair256_set_epi16(cospi_7_64, -cospi_25_64);
- const __m256i cospi_p29_p03 = pair256_set_epi16(cospi_29_64, cospi_3_64);
- const __m256i cospi_p03_m29 = pair256_set_epi16(cospi_3_64, -cospi_29_64);
- const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64);
- const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
- const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64);
- const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
- const __m256i cospi_m28_p04 = pair256_set_epi16(-cospi_28_64, cospi_4_64);
- const __m256i cospi_m12_p20 = pair256_set_epi16(-cospi_12_64, cospi_20_64);
- const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64);
- const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
- const __m256i cospi_m24_p08 = pair256_set_epi16(-cospi_24_64, cospi_8_64);
- const __m256i cospi_m16_m16 = _mm256_set1_epi16((int16_t)-cospi_16_64);
- const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
- const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
- const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
- const __m256i zero = _mm256_setzero_si256();
- const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
- __m256i s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
- __m256i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
- __m256i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
- __m256i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
- __m256i y0, y1;
-
- // stage 1, s takes low 256 bits; x takes high 256 bits
- y0 = _mm256_unpacklo_epi16(in[15], in[0]);
- y1 = _mm256_unpackhi_epi16(in[15], in[0]);
- s0 = _mm256_madd_epi16(y0, cospi_p01_p31);
- x0 = _mm256_madd_epi16(y1, cospi_p01_p31);
- s1 = _mm256_madd_epi16(y0, cospi_p31_m01);
- x1 = _mm256_madd_epi16(y1, cospi_p31_m01);
-
- y0 = _mm256_unpacklo_epi16(in[13], in[2]);
- y1 = _mm256_unpackhi_epi16(in[13], in[2]);
- s2 = _mm256_madd_epi16(y0, cospi_p05_p27);
- x2 = _mm256_madd_epi16(y1, cospi_p05_p27);
- s3 = _mm256_madd_epi16(y0, cospi_p27_m05);
- x3 = _mm256_madd_epi16(y1, cospi_p27_m05);
-
- y0 = _mm256_unpacklo_epi16(in[11], in[4]);
- y1 = _mm256_unpackhi_epi16(in[11], in[4]);
- s4 = _mm256_madd_epi16(y0, cospi_p09_p23);
- x4 = _mm256_madd_epi16(y1, cospi_p09_p23);
- s5 = _mm256_madd_epi16(y0, cospi_p23_m09);
- x5 = _mm256_madd_epi16(y1, cospi_p23_m09);
-
- y0 = _mm256_unpacklo_epi16(in[9], in[6]);
- y1 = _mm256_unpackhi_epi16(in[9], in[6]);
- s6 = _mm256_madd_epi16(y0, cospi_p13_p19);
- x6 = _mm256_madd_epi16(y1, cospi_p13_p19);
- s7 = _mm256_madd_epi16(y0, cospi_p19_m13);
- x7 = _mm256_madd_epi16(y1, cospi_p19_m13);
-
- y0 = _mm256_unpacklo_epi16(in[7], in[8]);
- y1 = _mm256_unpackhi_epi16(in[7], in[8]);
- s8 = _mm256_madd_epi16(y0, cospi_p17_p15);
- x8 = _mm256_madd_epi16(y1, cospi_p17_p15);
- s9 = _mm256_madd_epi16(y0, cospi_p15_m17);
- x9 = _mm256_madd_epi16(y1, cospi_p15_m17);
-
- y0 = _mm256_unpacklo_epi16(in[5], in[10]);
- y1 = _mm256_unpackhi_epi16(in[5], in[10]);
- s10 = _mm256_madd_epi16(y0, cospi_p21_p11);
- x10 = _mm256_madd_epi16(y1, cospi_p21_p11);
- s11 = _mm256_madd_epi16(y0, cospi_p11_m21);
- x11 = _mm256_madd_epi16(y1, cospi_p11_m21);
-
- y0 = _mm256_unpacklo_epi16(in[3], in[12]);
- y1 = _mm256_unpackhi_epi16(in[3], in[12]);
- s12 = _mm256_madd_epi16(y0, cospi_p25_p07);
- x12 = _mm256_madd_epi16(y1, cospi_p25_p07);
- s13 = _mm256_madd_epi16(y0, cospi_p07_m25);
- x13 = _mm256_madd_epi16(y1, cospi_p07_m25);
-
- y0 = _mm256_unpacklo_epi16(in[1], in[14]);
- y1 = _mm256_unpackhi_epi16(in[1], in[14]);
- s14 = _mm256_madd_epi16(y0, cospi_p29_p03);
- x14 = _mm256_madd_epi16(y1, cospi_p29_p03);
- s15 = _mm256_madd_epi16(y0, cospi_p03_m29);
- x15 = _mm256_madd_epi16(y1, cospi_p03_m29);
-
- // u takes low 256 bits; v takes high 256 bits
- u0 = _mm256_add_epi32(s0, s8);
- u1 = _mm256_add_epi32(s1, s9);
- u2 = _mm256_add_epi32(s2, s10);
- u3 = _mm256_add_epi32(s3, s11);
- u4 = _mm256_add_epi32(s4, s12);
- u5 = _mm256_add_epi32(s5, s13);
- u6 = _mm256_add_epi32(s6, s14);
- u7 = _mm256_add_epi32(s7, s15);
-
- u8 = _mm256_sub_epi32(s0, s8);
- u9 = _mm256_sub_epi32(s1, s9);
- u10 = _mm256_sub_epi32(s2, s10);
- u11 = _mm256_sub_epi32(s3, s11);
- u12 = _mm256_sub_epi32(s4, s12);
- u13 = _mm256_sub_epi32(s5, s13);
- u14 = _mm256_sub_epi32(s6, s14);
- u15 = _mm256_sub_epi32(s7, s15);
-
- v0 = _mm256_add_epi32(x0, x8);
- v1 = _mm256_add_epi32(x1, x9);
- v2 = _mm256_add_epi32(x2, x10);
- v3 = _mm256_add_epi32(x3, x11);
- v4 = _mm256_add_epi32(x4, x12);
- v5 = _mm256_add_epi32(x5, x13);
- v6 = _mm256_add_epi32(x6, x14);
- v7 = _mm256_add_epi32(x7, x15);
-
- v8 = _mm256_sub_epi32(x0, x8);
- v9 = _mm256_sub_epi32(x1, x9);
- v10 = _mm256_sub_epi32(x2, x10);
- v11 = _mm256_sub_epi32(x3, x11);
- v12 = _mm256_sub_epi32(x4, x12);
- v13 = _mm256_sub_epi32(x5, x13);
- v14 = _mm256_sub_epi32(x6, x14);
- v15 = _mm256_sub_epi32(x7, x15);
-
- // low 256 bits rounding
- u8 = _mm256_add_epi32(u8, dct_rounding);
- u9 = _mm256_add_epi32(u9, dct_rounding);
- u10 = _mm256_add_epi32(u10, dct_rounding);
- u11 = _mm256_add_epi32(u11, dct_rounding);
- u12 = _mm256_add_epi32(u12, dct_rounding);
- u13 = _mm256_add_epi32(u13, dct_rounding);
- u14 = _mm256_add_epi32(u14, dct_rounding);
- u15 = _mm256_add_epi32(u15, dct_rounding);
-
- u8 = _mm256_srai_epi32(u8, DCT_CONST_BITS);
- u9 = _mm256_srai_epi32(u9, DCT_CONST_BITS);
- u10 = _mm256_srai_epi32(u10, DCT_CONST_BITS);
- u11 = _mm256_srai_epi32(u11, DCT_CONST_BITS);
- u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS);
- u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS);
- u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS);
- u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS);
-
- // high 256 bits rounding
- v8 = _mm256_add_epi32(v8, dct_rounding);
- v9 = _mm256_add_epi32(v9, dct_rounding);
- v10 = _mm256_add_epi32(v10, dct_rounding);
- v11 = _mm256_add_epi32(v11, dct_rounding);
- v12 = _mm256_add_epi32(v12, dct_rounding);
- v13 = _mm256_add_epi32(v13, dct_rounding);
- v14 = _mm256_add_epi32(v14, dct_rounding);
- v15 = _mm256_add_epi32(v15, dct_rounding);
-
- v8 = _mm256_srai_epi32(v8, DCT_CONST_BITS);
- v9 = _mm256_srai_epi32(v9, DCT_CONST_BITS);
- v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS);
- v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS);
- v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS);
- v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS);
- v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS);
- v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);
-
- // Saturation pack 32-bit to 16-bit
- x8 = _mm256_packs_epi32(u8, v8);
- x9 = _mm256_packs_epi32(u9, v9);
- x10 = _mm256_packs_epi32(u10, v10);
- x11 = _mm256_packs_epi32(u11, v11);
- x12 = _mm256_packs_epi32(u12, v12);
- x13 = _mm256_packs_epi32(u13, v13);
- x14 = _mm256_packs_epi32(u14, v14);
- x15 = _mm256_packs_epi32(u15, v15);
-
- // stage 2
- y0 = _mm256_unpacklo_epi16(x8, x9);
- y1 = _mm256_unpackhi_epi16(x8, x9);
- s8 = _mm256_madd_epi16(y0, cospi_p04_p28);
- x8 = _mm256_madd_epi16(y1, cospi_p04_p28);
- s9 = _mm256_madd_epi16(y0, cospi_p28_m04);
- x9 = _mm256_madd_epi16(y1, cospi_p28_m04);
-
- y0 = _mm256_unpacklo_epi16(x10, x11);
- y1 = _mm256_unpackhi_epi16(x10, x11);
- s10 = _mm256_madd_epi16(y0, cospi_p20_p12);
- x10 = _mm256_madd_epi16(y1, cospi_p20_p12);
- s11 = _mm256_madd_epi16(y0, cospi_p12_m20);
- x11 = _mm256_madd_epi16(y1, cospi_p12_m20);
-
- y0 = _mm256_unpacklo_epi16(x12, x13);
- y1 = _mm256_unpackhi_epi16(x12, x13);
- s12 = _mm256_madd_epi16(y0, cospi_m28_p04);
- x12 = _mm256_madd_epi16(y1, cospi_m28_p04);
- s13 = _mm256_madd_epi16(y0, cospi_p04_p28);
- x13 = _mm256_madd_epi16(y1, cospi_p04_p28);
-
- y0 = _mm256_unpacklo_epi16(x14, x15);
- y1 = _mm256_unpackhi_epi16(x14, x15);
- s14 = _mm256_madd_epi16(y0, cospi_m12_p20);
- x14 = _mm256_madd_epi16(y1, cospi_m12_p20);
- s15 = _mm256_madd_epi16(y0, cospi_p20_p12);
- x15 = _mm256_madd_epi16(y1, cospi_p20_p12);
-
- x0 = _mm256_add_epi32(u0, u4);
- s0 = _mm256_add_epi32(v0, v4);
- x1 = _mm256_add_epi32(u1, u5);
- s1 = _mm256_add_epi32(v1, v5);
- x2 = _mm256_add_epi32(u2, u6);
- s2 = _mm256_add_epi32(v2, v6);
- x3 = _mm256_add_epi32(u3, u7);
- s3 = _mm256_add_epi32(v3, v7);
-
- v8 = _mm256_sub_epi32(u0, u4);
- v9 = _mm256_sub_epi32(v0, v4);
- v10 = _mm256_sub_epi32(u1, u5);
- v11 = _mm256_sub_epi32(v1, v5);
- v12 = _mm256_sub_epi32(u2, u6);
- v13 = _mm256_sub_epi32(v2, v6);
- v14 = _mm256_sub_epi32(u3, u7);
- v15 = _mm256_sub_epi32(v3, v7);
-
- v8 = _mm256_add_epi32(v8, dct_rounding);
- v9 = _mm256_add_epi32(v9, dct_rounding);
- v10 = _mm256_add_epi32(v10, dct_rounding);
- v11 = _mm256_add_epi32(v11, dct_rounding);
- v12 = _mm256_add_epi32(v12, dct_rounding);
- v13 = _mm256_add_epi32(v13, dct_rounding);
- v14 = _mm256_add_epi32(v14, dct_rounding);
- v15 = _mm256_add_epi32(v15, dct_rounding);
-
- v8 = _mm256_srai_epi32(v8, DCT_CONST_BITS);
- v9 = _mm256_srai_epi32(v9, DCT_CONST_BITS);
- v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS);
- v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS);
- v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS);
- v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS);
- v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS);
- v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);
-
- x4 = _mm256_packs_epi32(v8, v9);
- x5 = _mm256_packs_epi32(v10, v11);
- x6 = _mm256_packs_epi32(v12, v13);
- x7 = _mm256_packs_epi32(v14, v15);
-
- u8 = _mm256_add_epi32(s8, s12);
- u9 = _mm256_add_epi32(s9, s13);
- u10 = _mm256_add_epi32(s10, s14);
- u11 = _mm256_add_epi32(s11, s15);
- u12 = _mm256_sub_epi32(s8, s12);
- u13 = _mm256_sub_epi32(s9, s13);
- u14 = _mm256_sub_epi32(s10, s14);
- u15 = _mm256_sub_epi32(s11, s15);
-
- v8 = _mm256_add_epi32(x8, x12);
- v9 = _mm256_add_epi32(x9, x13);
- v10 = _mm256_add_epi32(x10, x14);
- v11 = _mm256_add_epi32(x11, x15);
- v12 = _mm256_sub_epi32(x8, x12);
- v13 = _mm256_sub_epi32(x9, x13);
- v14 = _mm256_sub_epi32(x10, x14);
- v15 = _mm256_sub_epi32(x11, x15);
-
- u12 = _mm256_add_epi32(u12, dct_rounding);
- u13 = _mm256_add_epi32(u13, dct_rounding);
- u14 = _mm256_add_epi32(u14, dct_rounding);
- u15 = _mm256_add_epi32(u15, dct_rounding);
-
- u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS);
- u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS);
- u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS);
- u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS);
-
- v12 = _mm256_add_epi32(v12, dct_rounding);
- v13 = _mm256_add_epi32(v13, dct_rounding);
- v14 = _mm256_add_epi32(v14, dct_rounding);
- v15 = _mm256_add_epi32(v15, dct_rounding);
-
- v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS);
- v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS);
- v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS);
- v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);
-
- x12 = _mm256_packs_epi32(u12, v12);
- x13 = _mm256_packs_epi32(u13, v13);
- x14 = _mm256_packs_epi32(u14, v14);
- x15 = _mm256_packs_epi32(u15, v15);
-
- // stage 3
- y0 = _mm256_unpacklo_epi16(x4, x5);
- y1 = _mm256_unpackhi_epi16(x4, x5);
- s4 = _mm256_madd_epi16(y0, cospi_p08_p24);
- x4 = _mm256_madd_epi16(y1, cospi_p08_p24);
- s5 = _mm256_madd_epi16(y0, cospi_p24_m08);
- x5 = _mm256_madd_epi16(y1, cospi_p24_m08);
-
- y0 = _mm256_unpacklo_epi16(x6, x7);
- y1 = _mm256_unpackhi_epi16(x6, x7);
- s6 = _mm256_madd_epi16(y0, cospi_m24_p08);
- x6 = _mm256_madd_epi16(y1, cospi_m24_p08);
- s7 = _mm256_madd_epi16(y0, cospi_p08_p24);
- x7 = _mm256_madd_epi16(y1, cospi_p08_p24);
-
- y0 = _mm256_unpacklo_epi16(x12, x13);
- y1 = _mm256_unpackhi_epi16(x12, x13);
- s12 = _mm256_madd_epi16(y0, cospi_p08_p24);
- x12 = _mm256_madd_epi16(y1, cospi_p08_p24);
- s13 = _mm256_madd_epi16(y0, cospi_p24_m08);
- x13 = _mm256_madd_epi16(y1, cospi_p24_m08);
-
- y0 = _mm256_unpacklo_epi16(x14, x15);
- y1 = _mm256_unpackhi_epi16(x14, x15);
- s14 = _mm256_madd_epi16(y0, cospi_m24_p08);
- x14 = _mm256_madd_epi16(y1, cospi_m24_p08);
- s15 = _mm256_madd_epi16(y0, cospi_p08_p24);
- x15 = _mm256_madd_epi16(y1, cospi_p08_p24);
-
- u0 = _mm256_add_epi32(x0, x2);
- v0 = _mm256_add_epi32(s0, s2);
- u1 = _mm256_add_epi32(x1, x3);
- v1 = _mm256_add_epi32(s1, s3);
- u2 = _mm256_sub_epi32(x0, x2);
- v2 = _mm256_sub_epi32(s0, s2);
- u3 = _mm256_sub_epi32(x1, x3);
- v3 = _mm256_sub_epi32(s1, s3);
-
- u0 = _mm256_add_epi32(u0, dct_rounding);
- v0 = _mm256_add_epi32(v0, dct_rounding);
- u1 = _mm256_add_epi32(u1, dct_rounding);
- v1 = _mm256_add_epi32(v1, dct_rounding);
- u2 = _mm256_add_epi32(u2, dct_rounding);
- v2 = _mm256_add_epi32(v2, dct_rounding);
- u3 = _mm256_add_epi32(u3, dct_rounding);
- v3 = _mm256_add_epi32(v3, dct_rounding);
-
- u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
- v0 = _mm256_srai_epi32(v0, DCT_CONST_BITS);
- u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
- v1 = _mm256_srai_epi32(v1, DCT_CONST_BITS);
- u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS);
- v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS);
- u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS);
- v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS);
-
- in[0] = _mm256_packs_epi32(u0, v0);
- x1 = _mm256_packs_epi32(u1, v1);
- x2 = _mm256_packs_epi32(u2, v2);
- x3 = _mm256_packs_epi32(u3, v3);
-
- // Rounding on s4 + s6, s5 + s7, s4 - s6, s5 - s7
- u4 = _mm256_add_epi32(s4, s6);
- u5 = _mm256_add_epi32(s5, s7);
- u6 = _mm256_sub_epi32(s4, s6);
- u7 = _mm256_sub_epi32(s5, s7);
-
- v4 = _mm256_add_epi32(x4, x6);
- v5 = _mm256_add_epi32(x5, x7);
- v6 = _mm256_sub_epi32(x4, x6);
- v7 = _mm256_sub_epi32(x5, x7);
-
- u4 = _mm256_add_epi32(u4, dct_rounding);
- u5 = _mm256_add_epi32(u5, dct_rounding);
- u6 = _mm256_add_epi32(u6, dct_rounding);
- u7 = _mm256_add_epi32(u7, dct_rounding);
-
- u4 = _mm256_srai_epi32(u4, DCT_CONST_BITS);
- u5 = _mm256_srai_epi32(u5, DCT_CONST_BITS);
- u6 = _mm256_srai_epi32(u6, DCT_CONST_BITS);
- u7 = _mm256_srai_epi32(u7, DCT_CONST_BITS);
-
- v4 = _mm256_add_epi32(v4, dct_rounding);
- v5 = _mm256_add_epi32(v5, dct_rounding);
- v6 = _mm256_add_epi32(v6, dct_rounding);
- v7 = _mm256_add_epi32(v7, dct_rounding);
-
- v4 = _mm256_srai_epi32(v4, DCT_CONST_BITS);
- v5 = _mm256_srai_epi32(v5, DCT_CONST_BITS);
- v6 = _mm256_srai_epi32(v6, DCT_CONST_BITS);
- v7 = _mm256_srai_epi32(v7, DCT_CONST_BITS);
-
- x4 = _mm256_packs_epi32(u4, v4);
- in[12] = _mm256_packs_epi32(u5, v5);
- x6 = _mm256_packs_epi32(u6, v6);
- x7 = _mm256_packs_epi32(u7, v7);
-
- u0 = _mm256_add_epi32(u8, u10);
- v0 = _mm256_add_epi32(v8, v10);
- u1 = _mm256_add_epi32(u9, u11);
- v1 = _mm256_add_epi32(v9, v11);
- u2 = _mm256_sub_epi32(u8, u10);
- v2 = _mm256_sub_epi32(v8, v10);
- u3 = _mm256_sub_epi32(u9, u11);
- v3 = _mm256_sub_epi32(v9, v11);
-
- u0 = _mm256_add_epi32(u0, dct_rounding);
- v0 = _mm256_add_epi32(v0, dct_rounding);
- u1 = _mm256_add_epi32(u1, dct_rounding);
- v1 = _mm256_add_epi32(v1, dct_rounding);
- u2 = _mm256_add_epi32(u2, dct_rounding);
- v2 = _mm256_add_epi32(v2, dct_rounding);
- u3 = _mm256_add_epi32(u3, dct_rounding);
- v3 = _mm256_add_epi32(v3, dct_rounding);
-
- u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
- v0 = _mm256_srai_epi32(v0, DCT_CONST_BITS);
- u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
- v1 = _mm256_srai_epi32(v1, DCT_CONST_BITS);
- u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS);
- v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS);
- u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS);
- v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS);
-
- x8 = _mm256_packs_epi32(u0, v0);
- in[14] = _mm256_packs_epi32(u1, v1);
- x10 = _mm256_packs_epi32(u2, v2);
- x11 = _mm256_packs_epi32(u3, v3);
-
- // Rounding on s12 + s14, s13 + s15, s12 - s14, s13 - s15
- u12 = _mm256_add_epi32(s12, s14);
- u13 = _mm256_add_epi32(s13, s15);
- u14 = _mm256_sub_epi32(s12, s14);
- u15 = _mm256_sub_epi32(s13, s15);
-
- v12 = _mm256_add_epi32(x12, x14);
- v13 = _mm256_add_epi32(x13, x15);
- v14 = _mm256_sub_epi32(x12, x14);
- v15 = _mm256_sub_epi32(x13, x15);
-
- u12 = _mm256_add_epi32(u12, dct_rounding);
- u13 = _mm256_add_epi32(u13, dct_rounding);
- u14 = _mm256_add_epi32(u14, dct_rounding);
- u15 = _mm256_add_epi32(u15, dct_rounding);
-
- u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS);
- u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS);
- u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS);
- u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS);
-
- v12 = _mm256_add_epi32(v12, dct_rounding);
- v13 = _mm256_add_epi32(v13, dct_rounding);
- v14 = _mm256_add_epi32(v14, dct_rounding);
- v15 = _mm256_add_epi32(v15, dct_rounding);
-
- v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS);
- v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS);
- v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS);
- v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);
-
- x12 = _mm256_packs_epi32(u12, v12);
- x13 = _mm256_packs_epi32(u13, v13);
- x14 = _mm256_packs_epi32(u14, v14);
- x15 = _mm256_packs_epi32(u15, v15);
- in[2] = x12;
-
- // stage 4
- y0 = _mm256_unpacklo_epi16(x2, x3);
- y1 = _mm256_unpackhi_epi16(x2, x3);
- s2 = _mm256_madd_epi16(y0, cospi_m16_m16);
- x2 = _mm256_madd_epi16(y1, cospi_m16_m16);
- s3 = _mm256_madd_epi16(y0, cospi_p16_m16);
- x3 = _mm256_madd_epi16(y1, cospi_p16_m16);
-
- y0 = _mm256_unpacklo_epi16(x6, x7);
- y1 = _mm256_unpackhi_epi16(x6, x7);
- s6 = _mm256_madd_epi16(y0, cospi_p16_p16);
- x6 = _mm256_madd_epi16(y1, cospi_p16_p16);
- s7 = _mm256_madd_epi16(y0, cospi_m16_p16);
- x7 = _mm256_madd_epi16(y1, cospi_m16_p16);
-
- y0 = _mm256_unpacklo_epi16(x10, x11);
- y1 = _mm256_unpackhi_epi16(x10, x11);
- s10 = _mm256_madd_epi16(y0, cospi_p16_p16);
- x10 = _mm256_madd_epi16(y1, cospi_p16_p16);
- s11 = _mm256_madd_epi16(y0, cospi_m16_p16);
- x11 = _mm256_madd_epi16(y1, cospi_m16_p16);
-
- y0 = _mm256_unpacklo_epi16(x14, x15);
- y1 = _mm256_unpackhi_epi16(x14, x15);
- s14 = _mm256_madd_epi16(y0, cospi_m16_m16);
- x14 = _mm256_madd_epi16(y1, cospi_m16_m16);
- s15 = _mm256_madd_epi16(y0, cospi_p16_m16);
- x15 = _mm256_madd_epi16(y1, cospi_p16_m16);
-
- // Rounding
- u2 = _mm256_add_epi32(s2, dct_rounding);
- u3 = _mm256_add_epi32(s3, dct_rounding);
- u6 = _mm256_add_epi32(s6, dct_rounding);
- u7 = _mm256_add_epi32(s7, dct_rounding);
-
- u10 = _mm256_add_epi32(s10, dct_rounding);
- u11 = _mm256_add_epi32(s11, dct_rounding);
- u14 = _mm256_add_epi32(s14, dct_rounding);
- u15 = _mm256_add_epi32(s15, dct_rounding);
-
- u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS);
- u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS);
- u6 = _mm256_srai_epi32(u6, DCT_CONST_BITS);
- u7 = _mm256_srai_epi32(u7, DCT_CONST_BITS);
-
- u10 = _mm256_srai_epi32(u10, DCT_CONST_BITS);
- u11 = _mm256_srai_epi32(u11, DCT_CONST_BITS);
- u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS);
- u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS);
-
- v2 = _mm256_add_epi32(x2, dct_rounding);
- v3 = _mm256_add_epi32(x3, dct_rounding);
- v6 = _mm256_add_epi32(x6, dct_rounding);
- v7 = _mm256_add_epi32(x7, dct_rounding);
-
- v10 = _mm256_add_epi32(x10, dct_rounding);
- v11 = _mm256_add_epi32(x11, dct_rounding);
- v14 = _mm256_add_epi32(x14, dct_rounding);
- v15 = _mm256_add_epi32(x15, dct_rounding);
-
- v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS);
- v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS);
- v6 = _mm256_srai_epi32(v6, DCT_CONST_BITS);
- v7 = _mm256_srai_epi32(v7, DCT_CONST_BITS);
-
- v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS);
- v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS);
- v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS);
- v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);
-
- in[7] = _mm256_packs_epi32(u2, v2);
- in[8] = _mm256_packs_epi32(u3, v3);
-
- in[4] = _mm256_packs_epi32(u6, v6);
- in[11] = _mm256_packs_epi32(u7, v7);
-
- in[6] = _mm256_packs_epi32(u10, v10);
- in[9] = _mm256_packs_epi32(u11, v11);
-
- in[5] = _mm256_packs_epi32(u14, v14);
- in[10] = _mm256_packs_epi32(u15, v15);
-
- in[1] = _mm256_sub_epi16(zero, x8);
- in[3] = _mm256_sub_epi16(zero, x4);
- in[13] = _mm256_sub_epi16(zero, x13);
- in[15] = _mm256_sub_epi16(zero, x1);
-}
-
-#if CONFIG_EXT_TX
-static void fidtx16_avx2(__m256i *in) {
- txfm_scaling16_avx2((int16_t)Sqrt2, in);
-}
-#endif
-
-void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride,
- TxfmParam *txfm_param) {
- __m256i in[16];
- const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
- assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif
-
- switch (tx_type) {
- case DCT_DCT:
- load_buffer_16x16(input, stride, 0, 0, in);
- fdct16_avx2(in);
- mm256_transpose_16x16(in, in);
- right_shift_16x16(in);
- fdct16_avx2(in);
- break;
- case ADST_DCT:
- load_buffer_16x16(input, stride, 0, 0, in);
- fadst16_avx2(in);
- mm256_transpose_16x16(in, in);
- right_shift_16x16(in);
- fdct16_avx2(in);
- break;
- case DCT_ADST:
- load_buffer_16x16(input, stride, 0, 0, in);
- fdct16_avx2(in);
- mm256_transpose_16x16(in, in);
- right_shift_16x16(in);
- fadst16_avx2(in);
- break;
- case ADST_ADST:
- load_buffer_16x16(input, stride, 0, 0, in);
- fadst16_avx2(in);
- mm256_transpose_16x16(in, in);
- right_shift_16x16(in);
- fadst16_avx2(in);
- break;
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- load_buffer_16x16(input, stride, 1, 0, in);
- fadst16_avx2(in);
- mm256_transpose_16x16(in, in);
- right_shift_16x16(in);
- fdct16_avx2(in);
- break;
- case DCT_FLIPADST:
- load_buffer_16x16(input, stride, 0, 1, in);
- fdct16_avx2(in);
- mm256_transpose_16x16(in, in);
- right_shift_16x16(in);
- fadst16_avx2(in);
- break;
- case FLIPADST_FLIPADST:
- load_buffer_16x16(input, stride, 1, 1, in);
- fadst16_avx2(in);
- mm256_transpose_16x16(in, in);
- right_shift_16x16(in);
- fadst16_avx2(in);
- break;
- case ADST_FLIPADST:
- load_buffer_16x16(input, stride, 0, 1, in);
- fadst16_avx2(in);
- mm256_transpose_16x16(in, in);
- right_shift_16x16(in);
- fadst16_avx2(in);
- break;
- case FLIPADST_ADST:
- load_buffer_16x16(input, stride, 1, 0, in);
- fadst16_avx2(in);
- mm256_transpose_16x16(in, in);
- right_shift_16x16(in);
- fadst16_avx2(in);
- break;
- case IDTX:
- load_buffer_16x16(input, stride, 0, 0, in);
- fidtx16_avx2(in);
- mm256_transpose_16x16(in, in);
- right_shift_16x16(in);
- fidtx16_avx2(in);
- break;
- case V_DCT:
- load_buffer_16x16(input, stride, 0, 0, in);
- fdct16_avx2(in);
- mm256_transpose_16x16(in, in);
- right_shift_16x16(in);
- fidtx16_avx2(in);
- break;
- case H_DCT:
- load_buffer_16x16(input, stride, 0, 0, in);
- fidtx16_avx2(in);
- mm256_transpose_16x16(in, in);
- right_shift_16x16(in);
- fdct16_avx2(in);
- break;
- case V_ADST:
- load_buffer_16x16(input, stride, 0, 0, in);
- fadst16_avx2(in);
- mm256_transpose_16x16(in, in);
- right_shift_16x16(in);
- fidtx16_avx2(in);
- break;
- case H_ADST:
- load_buffer_16x16(input, stride, 0, 0, in);
- fidtx16_avx2(in);
- mm256_transpose_16x16(in, in);
- right_shift_16x16(in);
- fadst16_avx2(in);
- break;
- case V_FLIPADST:
- load_buffer_16x16(input, stride, 1, 0, in);
- fadst16_avx2(in);
- mm256_transpose_16x16(in, in);
- right_shift_16x16(in);
- fidtx16_avx2(in);
- break;
- case H_FLIPADST:
- load_buffer_16x16(input, stride, 0, 1, in);
- fidtx16_avx2(in);
- mm256_transpose_16x16(in, in);
- right_shift_16x16(in);
- fadst16_avx2(in);
- break;
-#endif // CONFIG_EXT_TX
- default: assert(0); break;
- }
- mm256_transpose_16x16(in, in);
- write_buffer_16x16(in, output);
- _mm256_zeroupper();
-}
-
-static void mm256_vectors_swap(__m256i *a0, __m256i *a1, const int size) {
- int i = 0;
- __m256i temp;
- while (i < size) {
- temp = a0[i];
- a0[i] = a1[i];
- a1[i] = temp;
- i++;
- }
-}
-
-static void mm256_transpose_32x32(__m256i *in0, __m256i *in1) {
- mm256_transpose_16x16(in0, in0);
- mm256_transpose_16x16(&in0[16], &in0[16]);
- mm256_transpose_16x16(in1, in1);
- mm256_transpose_16x16(&in1[16], &in1[16]);
- mm256_vectors_swap(&in0[16], in1, 16);
-}
-
-static void prepare_16x16_even(const __m256i *in, __m256i *even) {
- even[0] = _mm256_add_epi16(in[0], in[31]);
- even[1] = _mm256_add_epi16(in[1], in[30]);
- even[2] = _mm256_add_epi16(in[2], in[29]);
- even[3] = _mm256_add_epi16(in[3], in[28]);
- even[4] = _mm256_add_epi16(in[4], in[27]);
- even[5] = _mm256_add_epi16(in[5], in[26]);
- even[6] = _mm256_add_epi16(in[6], in[25]);
- even[7] = _mm256_add_epi16(in[7], in[24]);
- even[8] = _mm256_add_epi16(in[8], in[23]);
- even[9] = _mm256_add_epi16(in[9], in[22]);
- even[10] = _mm256_add_epi16(in[10], in[21]);
- even[11] = _mm256_add_epi16(in[11], in[20]);
- even[12] = _mm256_add_epi16(in[12], in[19]);
- even[13] = _mm256_add_epi16(in[13], in[18]);
- even[14] = _mm256_add_epi16(in[14], in[17]);
- even[15] = _mm256_add_epi16(in[15], in[16]);
-}
-
-static void prepare_16x16_odd(const __m256i *in, __m256i *odd) {
- odd[0] = _mm256_sub_epi16(in[15], in[16]);
- odd[1] = _mm256_sub_epi16(in[14], in[17]);
- odd[2] = _mm256_sub_epi16(in[13], in[18]);
- odd[3] = _mm256_sub_epi16(in[12], in[19]);
- odd[4] = _mm256_sub_epi16(in[11], in[20]);
- odd[5] = _mm256_sub_epi16(in[10], in[21]);
- odd[6] = _mm256_sub_epi16(in[9], in[22]);
- odd[7] = _mm256_sub_epi16(in[8], in[23]);
- odd[8] = _mm256_sub_epi16(in[7], in[24]);
- odd[9] = _mm256_sub_epi16(in[6], in[25]);
- odd[10] = _mm256_sub_epi16(in[5], in[26]);
- odd[11] = _mm256_sub_epi16(in[4], in[27]);
- odd[12] = _mm256_sub_epi16(in[3], in[28]);
- odd[13] = _mm256_sub_epi16(in[2], in[29]);
- odd[14] = _mm256_sub_epi16(in[1], in[30]);
- odd[15] = _mm256_sub_epi16(in[0], in[31]);
-}
-
-static void collect_16col(const __m256i *even, const __m256i *odd,
- __m256i *out) {
- // fdct16_avx2() already maps the output
- out[0] = even[0];
- out[2] = even[1];
- out[4] = even[2];
- out[6] = even[3];
- out[8] = even[4];
- out[10] = even[5];
- out[12] = even[6];
- out[14] = even[7];
- out[16] = even[8];
- out[18] = even[9];
- out[20] = even[10];
- out[22] = even[11];
- out[24] = even[12];
- out[26] = even[13];
- out[28] = even[14];
- out[30] = even[15];
-
- out[1] = odd[0];
- out[17] = odd[1];
- out[9] = odd[2];
- out[25] = odd[3];
- out[5] = odd[4];
- out[21] = odd[5];
- out[13] = odd[6];
- out[29] = odd[7];
- out[3] = odd[8];
- out[19] = odd[9];
- out[11] = odd[10];
- out[27] = odd[11];
- out[7] = odd[12];
- out[23] = odd[13];
- out[15] = odd[14];
- out[31] = odd[15];
-}
-
-static void collect_coeffs(const __m256i *first_16col_even,
- const __m256i *first_16col_odd,
- const __m256i *second_16col_even,
- const __m256i *second_16col_odd, __m256i *in0,
- __m256i *in1) {
- collect_16col(first_16col_even, first_16col_odd, in0);
- collect_16col(second_16col_even, second_16col_odd, in1);
-}
-
-static void fdct16_odd_avx2(__m256i *in) {
- // sequence: cospi_L_H = pairs(L, H) and L first
- const __m256i cospi_p16_p16 = pair256_set_epi16(cospi_16_64, cospi_16_64);
- const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
- const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
- const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
- const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m256i cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
- const __m256i cospi_p28_p04 = pair256_set_epi16(cospi_28_64, cospi_4_64);
- const __m256i cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
- const __m256i cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
- const __m256i cospi_p12_p20 = pair256_set_epi16(cospi_12_64, cospi_20_64);
- const __m256i cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);
-
- const __m256i cospi_p31_p01 = pair256_set_epi16(cospi_31_64, cospi_1_64);
- const __m256i cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64);
- const __m256i cospi_p15_p17 = pair256_set_epi16(cospi_15_64, cospi_17_64);
- const __m256i cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64);
- const __m256i cospi_p23_p09 = pair256_set_epi16(cospi_23_64, cospi_9_64);
- const __m256i cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64);
- const __m256i cospi_p07_p25 = pair256_set_epi16(cospi_7_64, cospi_25_64);
- const __m256i cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64);
- const __m256i cospi_p27_p05 = pair256_set_epi16(cospi_27_64, cospi_5_64);
- const __m256i cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64);
- const __m256i cospi_p11_p21 = pair256_set_epi16(cospi_11_64, cospi_21_64);
- const __m256i cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64);
- const __m256i cospi_p19_p13 = pair256_set_epi16(cospi_19_64, cospi_13_64);
- const __m256i cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64);
- const __m256i cospi_p03_p29 = pair256_set_epi16(cospi_3_64, cospi_29_64);
- const __m256i cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64);
-
- __m256i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
- __m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15;
- __m256i u0, u1;
-
- // stage 1 is in prepare_16x16_odd()
-
- // stage 2
- y0 = in[0];
- y1 = in[1];
- y2 = in[2];
- y3 = in[3];
-
- u0 = _mm256_unpacklo_epi16(in[4], in[11]);
- u1 = _mm256_unpackhi_epi16(in[4], in[11]);
- y4 = butter_fly(&u0, &u1, &cospi_m16_p16);
- y11 = butter_fly(&u0, &u1, &cospi_p16_p16);
-
- u0 = _mm256_unpacklo_epi16(in[5], in[10]);
- u1 = _mm256_unpackhi_epi16(in[5], in[10]);
- y5 = butter_fly(&u0, &u1, &cospi_m16_p16);
- y10 = butter_fly(&u0, &u1, &cospi_p16_p16);
-
- u0 = _mm256_unpacklo_epi16(in[6], in[9]);
- u1 = _mm256_unpackhi_epi16(in[6], in[9]);
- y6 = butter_fly(&u0, &u1, &cospi_m16_p16);
- y9 = butter_fly(&u0, &u1, &cospi_p16_p16);
-
- u0 = _mm256_unpacklo_epi16(in[7], in[8]);
- u1 = _mm256_unpackhi_epi16(in[7], in[8]);
- y7 = butter_fly(&u0, &u1, &cospi_m16_p16);
- y8 = butter_fly(&u0, &u1, &cospi_p16_p16);
-
- y12 = in[12];
- y13 = in[13];
- y14 = in[14];
- y15 = in[15];
-
- // stage 3
- x0 = _mm256_add_epi16(y0, y7);
- x1 = _mm256_add_epi16(y1, y6);
- x2 = _mm256_add_epi16(y2, y5);
- x3 = _mm256_add_epi16(y3, y4);
- x4 = _mm256_sub_epi16(y3, y4);
- x5 = _mm256_sub_epi16(y2, y5);
- x6 = _mm256_sub_epi16(y1, y6);
- x7 = _mm256_sub_epi16(y0, y7);
- x8 = _mm256_sub_epi16(y15, y8);
- x9 = _mm256_sub_epi16(y14, y9);
- x10 = _mm256_sub_epi16(y13, y10);
- x11 = _mm256_sub_epi16(y12, y11);
- x12 = _mm256_add_epi16(y12, y11);
- x13 = _mm256_add_epi16(y13, y10);
- x14 = _mm256_add_epi16(y14, y9);
- x15 = _mm256_add_epi16(y15, y8);
-
- // stage 4
- y0 = x0;
- y1 = x1;
- y6 = x6;
- y7 = x7;
- y8 = x8;
- y9 = x9;
- y14 = x14;
- y15 = x15;
-
- u0 = _mm256_unpacklo_epi16(x2, x13);
- u1 = _mm256_unpackhi_epi16(x2, x13);
- y2 = butter_fly(&u0, &u1, &cospi_m08_p24);
- y13 = butter_fly(&u0, &u1, &cospi_p24_p08);
-
- u0 = _mm256_unpacklo_epi16(x3, x12);
- u1 = _mm256_unpackhi_epi16(x3, x12);
- y3 = butter_fly(&u0, &u1, &cospi_m08_p24);
- y12 = butter_fly(&u0, &u1, &cospi_p24_p08);
-
- u0 = _mm256_unpacklo_epi16(x4, x11);
- u1 = _mm256_unpackhi_epi16(x4, x11);
- y4 = butter_fly(&u0, &u1, &cospi_m24_m08);
- y11 = butter_fly(&u0, &u1, &cospi_m08_p24);
-
- u0 = _mm256_unpacklo_epi16(x5, x10);
- u1 = _mm256_unpackhi_epi16(x5, x10);
- y5 = butter_fly(&u0, &u1, &cospi_m24_m08);
- y10 = butter_fly(&u0, &u1, &cospi_m08_p24);
-
- // stage 5
- x0 = _mm256_add_epi16(y0, y3);
- x1 = _mm256_add_epi16(y1, y2);
- x2 = _mm256_sub_epi16(y1, y2);
- x3 = _mm256_sub_epi16(y0, y3);
- x4 = _mm256_sub_epi16(y7, y4);
- x5 = _mm256_sub_epi16(y6, y5);
- x6 = _mm256_add_epi16(y6, y5);
- x7 = _mm256_add_epi16(y7, y4);
-
- x8 = _mm256_add_epi16(y8, y11);
- x9 = _mm256_add_epi16(y9, y10);
- x10 = _mm256_sub_epi16(y9, y10);
- x11 = _mm256_sub_epi16(y8, y11);
- x12 = _mm256_sub_epi16(y15, y12);
- x13 = _mm256_sub_epi16(y14, y13);
- x14 = _mm256_add_epi16(y14, y13);
- x15 = _mm256_add_epi16(y15, y12);
-
- // stage 6
- y0 = x0;
- y3 = x3;
- y4 = x4;
- y7 = x7;
- y8 = x8;
- y11 = x11;
- y12 = x12;
- y15 = x15;
-
- u0 = _mm256_unpacklo_epi16(x1, x14);
- u1 = _mm256_unpackhi_epi16(x1, x14);
- y1 = butter_fly(&u0, &u1, &cospi_m04_p28);
- y14 = butter_fly(&u0, &u1, &cospi_p28_p04);
-
- u0 = _mm256_unpacklo_epi16(x2, x13);
- u1 = _mm256_unpackhi_epi16(x2, x13);
- y2 = butter_fly(&u0, &u1, &cospi_m28_m04);
- y13 = butter_fly(&u0, &u1, &cospi_m04_p28);
-
- u0 = _mm256_unpacklo_epi16(x5, x10);
- u1 = _mm256_unpackhi_epi16(x5, x10);
- y5 = butter_fly(&u0, &u1, &cospi_m20_p12);
- y10 = butter_fly(&u0, &u1, &cospi_p12_p20);
-
- u0 = _mm256_unpacklo_epi16(x6, x9);
- u1 = _mm256_unpackhi_epi16(x6, x9);
- y6 = butter_fly(&u0, &u1, &cospi_m12_m20);
- y9 = butter_fly(&u0, &u1, &cospi_m20_p12);
-
- // stage 7
- x0 = _mm256_add_epi16(y0, y1);
- x1 = _mm256_sub_epi16(y0, y1);
- x2 = _mm256_sub_epi16(y3, y2);
- x3 = _mm256_add_epi16(y3, y2);
- x4 = _mm256_add_epi16(y4, y5);
- x5 = _mm256_sub_epi16(y4, y5);
- x6 = _mm256_sub_epi16(y7, y6);
- x7 = _mm256_add_epi16(y7, y6);
-
- x8 = _mm256_add_epi16(y8, y9);
- x9 = _mm256_sub_epi16(y8, y9);
- x10 = _mm256_sub_epi16(y11, y10);
- x11 = _mm256_add_epi16(y11, y10);
- x12 = _mm256_add_epi16(y12, y13);
- x13 = _mm256_sub_epi16(y12, y13);
- x14 = _mm256_sub_epi16(y15, y14);
- x15 = _mm256_add_epi16(y15, y14);
-
- // stage 8
- u0 = _mm256_unpacklo_epi16(x0, x15);
- u1 = _mm256_unpackhi_epi16(x0, x15);
- in[0] = butter_fly(&u0, &u1, &cospi_p31_p01);
- in[15] = butter_fly(&u0, &u1, &cospi_m01_p31);
-
- u0 = _mm256_unpacklo_epi16(x1, x14);
- u1 = _mm256_unpackhi_epi16(x1, x14);
- in[1] = butter_fly(&u0, &u1, &cospi_p15_p17);
- in[14] = butter_fly(&u0, &u1, &cospi_m17_p15);
-
- u0 = _mm256_unpacklo_epi16(x2, x13);
- u1 = _mm256_unpackhi_epi16(x2, x13);
- in[2] = butter_fly(&u0, &u1, &cospi_p23_p09);
- in[13] = butter_fly(&u0, &u1, &cospi_m09_p23);
-
- u0 = _mm256_unpacklo_epi16(x3, x12);
- u1 = _mm256_unpackhi_epi16(x3, x12);
- in[3] = butter_fly(&u0, &u1, &cospi_p07_p25);
- in[12] = butter_fly(&u0, &u1, &cospi_m25_p07);
-
- u0 = _mm256_unpacklo_epi16(x4, x11);
- u1 = _mm256_unpackhi_epi16(x4, x11);
- in[4] = butter_fly(&u0, &u1, &cospi_p27_p05);
- in[11] = butter_fly(&u0, &u1, &cospi_m05_p27);
-
- u0 = _mm256_unpacklo_epi16(x5, x10);
- u1 = _mm256_unpackhi_epi16(x5, x10);
- in[5] = butter_fly(&u0, &u1, &cospi_p11_p21);
- in[10] = butter_fly(&u0, &u1, &cospi_m21_p11);
-
- u0 = _mm256_unpacklo_epi16(x6, x9);
- u1 = _mm256_unpackhi_epi16(x6, x9);
- in[6] = butter_fly(&u0, &u1, &cospi_p19_p13);
- in[9] = butter_fly(&u0, &u1, &cospi_m13_p19);
-
- u0 = _mm256_unpacklo_epi16(x7, x8);
- u1 = _mm256_unpackhi_epi16(x7, x8);
- in[7] = butter_fly(&u0, &u1, &cospi_p03_p29);
- in[8] = butter_fly(&u0, &u1, &cospi_m29_p03);
-}
-
-static void fdct32_avx2(__m256i *in0, __m256i *in1) {
- __m256i even0[16], even1[16], odd0[16], odd1[16];
- prepare_16x16_even(in0, even0);
- fdct16_avx2(even0);
-
- prepare_16x16_odd(in0, odd0);
- fdct16_odd_avx2(odd0);
-
- prepare_16x16_even(in1, even1);
- fdct16_avx2(even1);
-
- prepare_16x16_odd(in1, odd1);
- fdct16_odd_avx2(odd1);
-
- collect_coeffs(even0, odd0, even1, odd1, in0, in1);
-
- mm256_transpose_32x32(in0, in1);
-}
-
-static INLINE void write_buffer_32x32(const __m256i *in0, const __m256i *in1,
- tran_low_t *output) {
- int i = 0;
- const int stride = 32;
- tran_low_t *coeff = output;
- while (i < 32) {
- storeu_output_avx2(&in0[i], coeff);
- storeu_output_avx2(&in1[i], coeff + 16);
- coeff += stride;
- i += 1;
- }
-}
-
-#if CONFIG_EXT_TX
-static void fhalfright32_16col_avx2(__m256i *in) {
- int i = 0;
- const __m256i zero = _mm256_setzero_si256();
- const __m256i sqrt2 = _mm256_set1_epi16((int16_t)Sqrt2);
- const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
- __m256i x0, x1;
-
- while (i < 16) {
- in[i] = _mm256_slli_epi16(in[i], 2);
- x0 = _mm256_unpacklo_epi16(in[i + 16], zero);
- x1 = _mm256_unpackhi_epi16(in[i + 16], zero);
- x0 = _mm256_madd_epi16(x0, sqrt2);
- x1 = _mm256_madd_epi16(x1, sqrt2);
- x0 = _mm256_add_epi32(x0, dct_rounding);
- x1 = _mm256_add_epi32(x1, dct_rounding);
- x0 = _mm256_srai_epi32(x0, DCT_CONST_BITS);
- x1 = _mm256_srai_epi32(x1, DCT_CONST_BITS);
- in[i + 16] = _mm256_packs_epi32(x0, x1);
- i += 1;
- }
- fdct16_avx2(&in[16]);
-}
-
-static void fhalfright32_avx2(__m256i *in0, __m256i *in1) {
- fhalfright32_16col_avx2(in0);
- fhalfright32_16col_avx2(in1);
- mm256_vectors_swap(in0, &in0[16], 16);
- mm256_vectors_swap(in1, &in1[16], 16);
- mm256_transpose_32x32(in0, in1);
-}
-#endif // CONFIG_EXT_TX
-
-static INLINE void load_buffer_32x32(const int16_t *input, int stride,
- int flipud, int fliplr, __m256i *in0,
- __m256i *in1) {
- // Load 4 16x16 blocks
- const int16_t *topL = input;
- const int16_t *topR = input + 16;
- const int16_t *botL = input + 16 * stride;
- const int16_t *botR = input + 16 * stride + 16;
-
- const int16_t *tmp;
-
- if (flipud) {
- // Swap left columns
- tmp = topL;
- topL = botL;
- botL = tmp;
- // Swap right columns
- tmp = topR;
- topR = botR;
- botR = tmp;
- }
-
- if (fliplr) {
- // Swap top rows
- tmp = topL;
- topL = topR;
- topR = tmp;
- // Swap bottom rows
- tmp = botL;
- botL = botR;
- botR = tmp;
- }
-
- // load first 16 columns
- load_buffer_16x16(topL, stride, flipud, fliplr, in0);
- load_buffer_16x16(botL, stride, flipud, fliplr, in0 + 16);
-
- // load second 16 columns
- load_buffer_16x16(topR, stride, flipud, fliplr, in1);
- load_buffer_16x16(botR, stride, flipud, fliplr, in1 + 16);
-}
-
-static INLINE void right_shift_32x32_16col(int bit, __m256i *in) {
- int i = 0;
- const __m256i rounding = _mm256_set1_epi16((1 << bit) >> 1);
- __m256i sign;
- while (i < 32) {
- sign = _mm256_srai_epi16(in[i], 15);
- in[i] = _mm256_add_epi16(in[i], rounding);
- in[i] = _mm256_add_epi16(in[i], sign);
- in[i] = _mm256_srai_epi16(in[i], bit);
- i += 1;
- }
-}
-
-// Positive rounding
-static INLINE void right_shift_32x32(__m256i *in0, __m256i *in1) {
- const int bit = 4;
- right_shift_32x32_16col(bit, in0);
- right_shift_32x32_16col(bit, in1);
-}
-
-#if CONFIG_EXT_TX
-static void fidtx32_avx2(__m256i *in0, __m256i *in1) {
- int i = 0;
- while (i < 32) {
- in0[i] = _mm256_slli_epi16(in0[i], 2);
- in1[i] = _mm256_slli_epi16(in1[i], 2);
- i += 1;
- }
- mm256_transpose_32x32(in0, in1);
-}
-#endif
-
-void av1_fht32x32_avx2(const int16_t *input, tran_low_t *output, int stride,
- TxfmParam *txfm_param) {
- __m256i in0[32]; // left 32 columns
- __m256i in1[32]; // right 32 columns
- const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
- assert(tx_type != MRC_DCT && "No avx2 32x32 implementation of MRC_DCT");
-#endif
-
- switch (tx_type) {
- case DCT_DCT:
- load_buffer_32x32(input, stride, 0, 0, in0, in1);
- fdct32_avx2(in0, in1);
- right_shift_32x32(in0, in1);
- fdct32_avx2(in0, in1);
- break;
-#if CONFIG_EXT_TX
- case ADST_DCT:
- load_buffer_32x32(input, stride, 0, 0, in0, in1);
- fhalfright32_avx2(in0, in1);
- right_shift_32x32(in0, in1);
- fdct32_avx2(in0, in1);
- break;
- case DCT_ADST:
- load_buffer_32x32(input, stride, 0, 0, in0, in1);
- fdct32_avx2(in0, in1);
- right_shift_32x32(in0, in1);
- fhalfright32_avx2(in0, in1);
- break;
- case ADST_ADST:
- load_buffer_32x32(input, stride, 0, 0, in0, in1);
- fhalfright32_avx2(in0, in1);
- right_shift_32x32(in0, in1);
- fhalfright32_avx2(in0, in1);
- break;
- case FLIPADST_DCT:
- load_buffer_32x32(input, stride, 1, 0, in0, in1);
- fhalfright32_avx2(in0, in1);
- right_shift_32x32(in0, in1);
- fdct32_avx2(in0, in1);
- break;
- case DCT_FLIPADST:
- load_buffer_32x32(input, stride, 0, 1, in0, in1);
- fdct32_avx2(in0, in1);
- right_shift_32x32(in0, in1);
- fhalfright32_avx2(in0, in1);
- break;
- case FLIPADST_FLIPADST:
- load_buffer_32x32(input, stride, 1, 1, in0, in1);
- fhalfright32_avx2(in0, in1);
- right_shift_32x32(in0, in1);
- fhalfright32_avx2(in0, in1);
- break;
- case ADST_FLIPADST:
- load_buffer_32x32(input, stride, 0, 1, in0, in1);
- fhalfright32_avx2(in0, in1);
- right_shift_32x32(in0, in1);
- fhalfright32_avx2(in0, in1);
- break;
- case FLIPADST_ADST:
- load_buffer_32x32(input, stride, 1, 0, in0, in1);
- fhalfright32_avx2(in0, in1);
- right_shift_32x32(in0, in1);
- fhalfright32_avx2(in0, in1);
- break;
- case IDTX:
- load_buffer_32x32(input, stride, 0, 0, in0, in1);
- fidtx32_avx2(in0, in1);
- right_shift_32x32(in0, in1);
- fidtx32_avx2(in0, in1);
- break;
- case V_DCT:
- load_buffer_32x32(input, stride, 0, 0, in0, in1);
- fdct32_avx2(in0, in1);
- right_shift_32x32(in0, in1);
- fidtx32_avx2(in0, in1);
- break;
- case H_DCT:
- load_buffer_32x32(input, stride, 0, 0, in0, in1);
- fidtx32_avx2(in0, in1);
- right_shift_32x32(in0, in1);
- fdct32_avx2(in0, in1);
- break;
- case V_ADST:
- load_buffer_32x32(input, stride, 0, 0, in0, in1);
- fhalfright32_avx2(in0, in1);
- right_shift_32x32(in0, in1);
- fidtx32_avx2(in0, in1);
- break;
- case H_ADST:
- load_buffer_32x32(input, stride, 0, 0, in0, in1);
- fidtx32_avx2(in0, in1);
- right_shift_32x32(in0, in1);
- fhalfright32_avx2(in0, in1);
- break;
- case V_FLIPADST:
- load_buffer_32x32(input, stride, 1, 0, in0, in1);
- fhalfright32_avx2(in0, in1);
- right_shift_32x32(in0, in1);
- fidtx32_avx2(in0, in1);
- break;
- case H_FLIPADST:
- load_buffer_32x32(input, stride, 0, 1, in0, in1);
- fidtx32_avx2(in0, in1);
- right_shift_32x32(in0, in1);
- fhalfright32_avx2(in0, in1);
- break;
-#endif // CONFIG_EXT_TX
- default: assert(0); break;
- }
- write_buffer_32x32(in0, in1, output);
- _mm256_zeroupper();
-}
diff --git a/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm b/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm
index 7186b6b92..30983d1c1 100644
--- a/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm
+++ b/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm
@@ -14,6 +14,8 @@
%include "aom_ports/x86_abi_support.asm"
+SECTION .text
+
; void av1_temporal_filter_apply_sse2 | arg
; (unsigned char *frame1, | 0
; unsigned int stride, | 1
diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c
index bf233ca4d..4d2e99f25 100644
--- a/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c
+++ b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c
@@ -31,7 +31,7 @@ uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d,
uint64_t csse;
const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
- const __m128i v_zext_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+ const __m128i v_zext_q = xx_set1_64_from_32i(0xffffffff);
__m128i v_acc0_q = _mm_setzero_si128();