diff options
author | trav90 <travawine@palemoon.org> | 2018-10-19 23:05:00 -0500 |
---|---|---|
committer | trav90 <travawine@palemoon.org> | 2018-10-19 23:05:03 -0500 |
commit | d2499ead93dc4298c0882fe98902acb1b5209f99 (patch) | |
tree | cb0b942aed59e5108f9a3e9d64e7b77854383421 /third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c | |
parent | 41fbdea457bf50c0a43e1c27c5cbf7f0a3a9eb33 (diff) | |
download | UXP-d2499ead93dc4298c0882fe98902acb1b5209f99.tar UXP-d2499ead93dc4298c0882fe98902acb1b5209f99.tar.gz UXP-d2499ead93dc4298c0882fe98902acb1b5209f99.tar.lz UXP-d2499ead93dc4298c0882fe98902acb1b5209f99.tar.xz UXP-d2499ead93dc4298c0882fe98902acb1b5209f99.zip |
Update libaom to commit ID 1e227d41f0616de9548a673a83a21ef990b62591
Diffstat (limited to 'third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c')
-rw-r--r-- | third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c | 259 |
1 file changed, 130 insertions, 129 deletions
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c index c71f2e74c..07615543c 100644 --- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c @@ -395,7 +395,8 @@ void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output, } void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output, - int8_t cos_bit) { + int8_t cos_bit, const int instride, + const int outstride) { const int32_t *cospi = cospi_arr(cos_bit); const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); @@ -480,70 +481,70 @@ void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output, // stage 1 __m128i x1[64]; - x1[0] = _mm_add_epi32(input[0], input[63]); - x1[63] = _mm_sub_epi32(input[0], input[63]); - x1[1] = _mm_add_epi32(input[1], input[62]); - x1[62] = _mm_sub_epi32(input[1], input[62]); - x1[2] = _mm_add_epi32(input[2], input[61]); - x1[61] = _mm_sub_epi32(input[2], input[61]); - x1[3] = _mm_add_epi32(input[3], input[60]); - x1[60] = _mm_sub_epi32(input[3], input[60]); - x1[4] = _mm_add_epi32(input[4], input[59]); - x1[59] = _mm_sub_epi32(input[4], input[59]); - x1[5] = _mm_add_epi32(input[5], input[58]); - x1[58] = _mm_sub_epi32(input[5], input[58]); - x1[6] = _mm_add_epi32(input[6], input[57]); - x1[57] = _mm_sub_epi32(input[6], input[57]); - x1[7] = _mm_add_epi32(input[7], input[56]); - x1[56] = _mm_sub_epi32(input[7], input[56]); - x1[8] = _mm_add_epi32(input[8], input[55]); - x1[55] = _mm_sub_epi32(input[8], input[55]); - x1[9] = _mm_add_epi32(input[9], input[54]); - x1[54] = _mm_sub_epi32(input[9], input[54]); - x1[10] = _mm_add_epi32(input[10], input[53]); - x1[53] = _mm_sub_epi32(input[10], input[53]); - x1[11] = _mm_add_epi32(input[11], input[52]); - x1[52] = _mm_sub_epi32(input[11], input[52]); - x1[12] = _mm_add_epi32(input[12], input[51]); - x1[51] = _mm_sub_epi32(input[12], input[51]); - x1[13] = _mm_add_epi32(input[13], 
input[50]); - x1[50] = _mm_sub_epi32(input[13], input[50]); - x1[14] = _mm_add_epi32(input[14], input[49]); - x1[49] = _mm_sub_epi32(input[14], input[49]); - x1[15] = _mm_add_epi32(input[15], input[48]); - x1[48] = _mm_sub_epi32(input[15], input[48]); - x1[16] = _mm_add_epi32(input[16], input[47]); - x1[47] = _mm_sub_epi32(input[16], input[47]); - x1[17] = _mm_add_epi32(input[17], input[46]); - x1[46] = _mm_sub_epi32(input[17], input[46]); - x1[18] = _mm_add_epi32(input[18], input[45]); - x1[45] = _mm_sub_epi32(input[18], input[45]); - x1[19] = _mm_add_epi32(input[19], input[44]); - x1[44] = _mm_sub_epi32(input[19], input[44]); - x1[20] = _mm_add_epi32(input[20], input[43]); - x1[43] = _mm_sub_epi32(input[20], input[43]); - x1[21] = _mm_add_epi32(input[21], input[42]); - x1[42] = _mm_sub_epi32(input[21], input[42]); - x1[22] = _mm_add_epi32(input[22], input[41]); - x1[41] = _mm_sub_epi32(input[22], input[41]); - x1[23] = _mm_add_epi32(input[23], input[40]); - x1[40] = _mm_sub_epi32(input[23], input[40]); - x1[24] = _mm_add_epi32(input[24], input[39]); - x1[39] = _mm_sub_epi32(input[24], input[39]); - x1[25] = _mm_add_epi32(input[25], input[38]); - x1[38] = _mm_sub_epi32(input[25], input[38]); - x1[26] = _mm_add_epi32(input[26], input[37]); - x1[37] = _mm_sub_epi32(input[26], input[37]); - x1[27] = _mm_add_epi32(input[27], input[36]); - x1[36] = _mm_sub_epi32(input[27], input[36]); - x1[28] = _mm_add_epi32(input[28], input[35]); - x1[35] = _mm_sub_epi32(input[28], input[35]); - x1[29] = _mm_add_epi32(input[29], input[34]); - x1[34] = _mm_sub_epi32(input[29], input[34]); - x1[30] = _mm_add_epi32(input[30], input[33]); - x1[33] = _mm_sub_epi32(input[30], input[33]); - x1[31] = _mm_add_epi32(input[31], input[32]); - x1[32] = _mm_sub_epi32(input[31], input[32]); + x1[0] = _mm_add_epi32(input[0 * instride], input[63 * instride]); + x1[63] = _mm_sub_epi32(input[0 * instride], input[63 * instride]); + x1[1] = _mm_add_epi32(input[1 * instride], input[62 * instride]); + 
x1[62] = _mm_sub_epi32(input[1 * instride], input[62 * instride]); + x1[2] = _mm_add_epi32(input[2 * instride], input[61 * instride]); + x1[61] = _mm_sub_epi32(input[2 * instride], input[61 * instride]); + x1[3] = _mm_add_epi32(input[3 * instride], input[60 * instride]); + x1[60] = _mm_sub_epi32(input[3 * instride], input[60 * instride]); + x1[4] = _mm_add_epi32(input[4 * instride], input[59 * instride]); + x1[59] = _mm_sub_epi32(input[4 * instride], input[59 * instride]); + x1[5] = _mm_add_epi32(input[5 * instride], input[58 * instride]); + x1[58] = _mm_sub_epi32(input[5 * instride], input[58 * instride]); + x1[6] = _mm_add_epi32(input[6 * instride], input[57 * instride]); + x1[57] = _mm_sub_epi32(input[6 * instride], input[57 * instride]); + x1[7] = _mm_add_epi32(input[7 * instride], input[56 * instride]); + x1[56] = _mm_sub_epi32(input[7 * instride], input[56 * instride]); + x1[8] = _mm_add_epi32(input[8 * instride], input[55 * instride]); + x1[55] = _mm_sub_epi32(input[8 * instride], input[55 * instride]); + x1[9] = _mm_add_epi32(input[9 * instride], input[54 * instride]); + x1[54] = _mm_sub_epi32(input[9 * instride], input[54 * instride]); + x1[10] = _mm_add_epi32(input[10 * instride], input[53 * instride]); + x1[53] = _mm_sub_epi32(input[10 * instride], input[53 * instride]); + x1[11] = _mm_add_epi32(input[11 * instride], input[52 * instride]); + x1[52] = _mm_sub_epi32(input[11 * instride], input[52 * instride]); + x1[12] = _mm_add_epi32(input[12 * instride], input[51 * instride]); + x1[51] = _mm_sub_epi32(input[12 * instride], input[51 * instride]); + x1[13] = _mm_add_epi32(input[13 * instride], input[50 * instride]); + x1[50] = _mm_sub_epi32(input[13 * instride], input[50 * instride]); + x1[14] = _mm_add_epi32(input[14 * instride], input[49 * instride]); + x1[49] = _mm_sub_epi32(input[14 * instride], input[49 * instride]); + x1[15] = _mm_add_epi32(input[15 * instride], input[48 * instride]); + x1[48] = _mm_sub_epi32(input[15 * instride], input[48 * 
instride]); + x1[16] = _mm_add_epi32(input[16 * instride], input[47 * instride]); + x1[47] = _mm_sub_epi32(input[16 * instride], input[47 * instride]); + x1[17] = _mm_add_epi32(input[17 * instride], input[46 * instride]); + x1[46] = _mm_sub_epi32(input[17 * instride], input[46 * instride]); + x1[18] = _mm_add_epi32(input[18 * instride], input[45 * instride]); + x1[45] = _mm_sub_epi32(input[18 * instride], input[45 * instride]); + x1[19] = _mm_add_epi32(input[19 * instride], input[44 * instride]); + x1[44] = _mm_sub_epi32(input[19 * instride], input[44 * instride]); + x1[20] = _mm_add_epi32(input[20 * instride], input[43 * instride]); + x1[43] = _mm_sub_epi32(input[20 * instride], input[43 * instride]); + x1[21] = _mm_add_epi32(input[21 * instride], input[42 * instride]); + x1[42] = _mm_sub_epi32(input[21 * instride], input[42 * instride]); + x1[22] = _mm_add_epi32(input[22 * instride], input[41 * instride]); + x1[41] = _mm_sub_epi32(input[22 * instride], input[41 * instride]); + x1[23] = _mm_add_epi32(input[23 * instride], input[40 * instride]); + x1[40] = _mm_sub_epi32(input[23 * instride], input[40 * instride]); + x1[24] = _mm_add_epi32(input[24 * instride], input[39 * instride]); + x1[39] = _mm_sub_epi32(input[24 * instride], input[39 * instride]); + x1[25] = _mm_add_epi32(input[25 * instride], input[38 * instride]); + x1[38] = _mm_sub_epi32(input[25 * instride], input[38 * instride]); + x1[26] = _mm_add_epi32(input[26 * instride], input[37 * instride]); + x1[37] = _mm_sub_epi32(input[26 * instride], input[37 * instride]); + x1[27] = _mm_add_epi32(input[27 * instride], input[36 * instride]); + x1[36] = _mm_sub_epi32(input[27 * instride], input[36 * instride]); + x1[28] = _mm_add_epi32(input[28 * instride], input[35 * instride]); + x1[35] = _mm_sub_epi32(input[28 * instride], input[35 * instride]); + x1[29] = _mm_add_epi32(input[29 * instride], input[34 * instride]); + x1[34] = _mm_sub_epi32(input[29 * instride], input[34 * instride]); + x1[30] = 
_mm_add_epi32(input[30 * instride], input[33 * instride]); + x1[33] = _mm_sub_epi32(input[30 * instride], input[33 * instride]); + x1[31] = _mm_add_epi32(input[31 * instride], input[32 * instride]); + x1[32] = _mm_sub_epi32(input[31 * instride], input[32 * instride]); // stage 2 __m128i x2[64]; @@ -1149,68 +1150,68 @@ void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output, x10[48], __rounding, cos_bit); // stage 11 - output[0] = x10[0]; - output[1] = x10[32]; - output[2] = x10[16]; - output[3] = x10[48]; - output[4] = x10[8]; - output[5] = x10[40]; - output[6] = x10[24]; - output[7] = x10[56]; - output[8] = x10[4]; - output[9] = x10[36]; - output[10] = x10[20]; - output[11] = x10[52]; - output[12] = x10[12]; - output[13] = x10[44]; - output[14] = x10[28]; - output[15] = x10[60]; - output[16] = x10[2]; - output[17] = x10[34]; - output[18] = x10[18]; - output[19] = x10[50]; - output[20] = x10[10]; - output[21] = x10[42]; - output[22] = x10[26]; - output[23] = x10[58]; - output[24] = x10[6]; - output[25] = x10[38]; - output[26] = x10[22]; - output[27] = x10[54]; - output[28] = x10[14]; - output[29] = x10[46]; - output[30] = x10[30]; - output[31] = x10[62]; - output[32] = x10[1]; - output[33] = x10[33]; - output[34] = x10[17]; - output[35] = x10[49]; - output[36] = x10[9]; - output[37] = x10[41]; - output[38] = x10[25]; - output[39] = x10[57]; - output[40] = x10[5]; - output[41] = x10[37]; - output[42] = x10[21]; - output[43] = x10[53]; - output[44] = x10[13]; - output[45] = x10[45]; - output[46] = x10[29]; - output[47] = x10[61]; - output[48] = x10[3]; - output[49] = x10[35]; - output[50] = x10[19]; - output[51] = x10[51]; - output[52] = x10[11]; - output[53] = x10[43]; - output[54] = x10[27]; - output[55] = x10[59]; - output[56] = x10[7]; - output[57] = x10[39]; - output[58] = x10[23]; - output[59] = x10[55]; - output[60] = x10[15]; - output[61] = x10[47]; - output[62] = x10[31]; - output[63] = x10[63]; + output[0 * outstride] = x10[0]; + output[1 * 
outstride] = x10[32]; + output[2 * outstride] = x10[16]; + output[3 * outstride] = x10[48]; + output[4 * outstride] = x10[8]; + output[5 * outstride] = x10[40]; + output[6 * outstride] = x10[24]; + output[7 * outstride] = x10[56]; + output[8 * outstride] = x10[4]; + output[9 * outstride] = x10[36]; + output[10 * outstride] = x10[20]; + output[11 * outstride] = x10[52]; + output[12 * outstride] = x10[12]; + output[13 * outstride] = x10[44]; + output[14 * outstride] = x10[28]; + output[15 * outstride] = x10[60]; + output[16 * outstride] = x10[2]; + output[17 * outstride] = x10[34]; + output[18 * outstride] = x10[18]; + output[19 * outstride] = x10[50]; + output[20 * outstride] = x10[10]; + output[21 * outstride] = x10[42]; + output[22 * outstride] = x10[26]; + output[23 * outstride] = x10[58]; + output[24 * outstride] = x10[6]; + output[25 * outstride] = x10[38]; + output[26 * outstride] = x10[22]; + output[27 * outstride] = x10[54]; + output[28 * outstride] = x10[14]; + output[29 * outstride] = x10[46]; + output[30 * outstride] = x10[30]; + output[31 * outstride] = x10[62]; + output[32 * outstride] = x10[1]; + output[33 * outstride] = x10[33]; + output[34 * outstride] = x10[17]; + output[35 * outstride] = x10[49]; + output[36 * outstride] = x10[9]; + output[37 * outstride] = x10[41]; + output[38 * outstride] = x10[25]; + output[39 * outstride] = x10[57]; + output[40 * outstride] = x10[5]; + output[41 * outstride] = x10[37]; + output[42 * outstride] = x10[21]; + output[43 * outstride] = x10[53]; + output[44 * outstride] = x10[13]; + output[45 * outstride] = x10[45]; + output[46 * outstride] = x10[29]; + output[47 * outstride] = x10[61]; + output[48 * outstride] = x10[3]; + output[49 * outstride] = x10[35]; + output[50 * outstride] = x10[19]; + output[51 * outstride] = x10[51]; + output[52 * outstride] = x10[11]; + output[53 * outstride] = x10[43]; + output[54 * outstride] = x10[27]; + output[55 * outstride] = x10[59]; + output[56 * outstride] = x10[7]; + output[57 * 
outstride] = x10[39]; + output[58 * outstride] = x10[23]; + output[59 * outstride] = x10[55]; + output[60 * outstride] = x10[15]; + output[61 * outstride] = x10[47]; + output[62 * outstride] = x10[31]; + output[63 * outstride] = x10[63]; } |