summary refs log tree commit diff stats
path: root/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c')
-rw-r--r-- third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c 259
1 files changed, 130 insertions, 129 deletions
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
index c71f2e74c..07615543c 100644
--- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
@@ -395,7 +395,8 @@ void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
}
void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
- int8_t cos_bit) {
+ int8_t cos_bit, const int instride,
+ const int outstride) {
const int32_t *cospi = cospi_arr(cos_bit);
const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
@@ -480,70 +481,70 @@ void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
// stage 1
__m128i x1[64];
- x1[0] = _mm_add_epi32(input[0], input[63]);
- x1[63] = _mm_sub_epi32(input[0], input[63]);
- x1[1] = _mm_add_epi32(input[1], input[62]);
- x1[62] = _mm_sub_epi32(input[1], input[62]);
- x1[2] = _mm_add_epi32(input[2], input[61]);
- x1[61] = _mm_sub_epi32(input[2], input[61]);
- x1[3] = _mm_add_epi32(input[3], input[60]);
- x1[60] = _mm_sub_epi32(input[3], input[60]);
- x1[4] = _mm_add_epi32(input[4], input[59]);
- x1[59] = _mm_sub_epi32(input[4], input[59]);
- x1[5] = _mm_add_epi32(input[5], input[58]);
- x1[58] = _mm_sub_epi32(input[5], input[58]);
- x1[6] = _mm_add_epi32(input[6], input[57]);
- x1[57] = _mm_sub_epi32(input[6], input[57]);
- x1[7] = _mm_add_epi32(input[7], input[56]);
- x1[56] = _mm_sub_epi32(input[7], input[56]);
- x1[8] = _mm_add_epi32(input[8], input[55]);
- x1[55] = _mm_sub_epi32(input[8], input[55]);
- x1[9] = _mm_add_epi32(input[9], input[54]);
- x1[54] = _mm_sub_epi32(input[9], input[54]);
- x1[10] = _mm_add_epi32(input[10], input[53]);
- x1[53] = _mm_sub_epi32(input[10], input[53]);
- x1[11] = _mm_add_epi32(input[11], input[52]);
- x1[52] = _mm_sub_epi32(input[11], input[52]);
- x1[12] = _mm_add_epi32(input[12], input[51]);
- x1[51] = _mm_sub_epi32(input[12], input[51]);
- x1[13] = _mm_add_epi32(input[13], input[50]);
- x1[50] = _mm_sub_epi32(input[13], input[50]);
- x1[14] = _mm_add_epi32(input[14], input[49]);
- x1[49] = _mm_sub_epi32(input[14], input[49]);
- x1[15] = _mm_add_epi32(input[15], input[48]);
- x1[48] = _mm_sub_epi32(input[15], input[48]);
- x1[16] = _mm_add_epi32(input[16], input[47]);
- x1[47] = _mm_sub_epi32(input[16], input[47]);
- x1[17] = _mm_add_epi32(input[17], input[46]);
- x1[46] = _mm_sub_epi32(input[17], input[46]);
- x1[18] = _mm_add_epi32(input[18], input[45]);
- x1[45] = _mm_sub_epi32(input[18], input[45]);
- x1[19] = _mm_add_epi32(input[19], input[44]);
- x1[44] = _mm_sub_epi32(input[19], input[44]);
- x1[20] = _mm_add_epi32(input[20], input[43]);
- x1[43] = _mm_sub_epi32(input[20], input[43]);
- x1[21] = _mm_add_epi32(input[21], input[42]);
- x1[42] = _mm_sub_epi32(input[21], input[42]);
- x1[22] = _mm_add_epi32(input[22], input[41]);
- x1[41] = _mm_sub_epi32(input[22], input[41]);
- x1[23] = _mm_add_epi32(input[23], input[40]);
- x1[40] = _mm_sub_epi32(input[23], input[40]);
- x1[24] = _mm_add_epi32(input[24], input[39]);
- x1[39] = _mm_sub_epi32(input[24], input[39]);
- x1[25] = _mm_add_epi32(input[25], input[38]);
- x1[38] = _mm_sub_epi32(input[25], input[38]);
- x1[26] = _mm_add_epi32(input[26], input[37]);
- x1[37] = _mm_sub_epi32(input[26], input[37]);
- x1[27] = _mm_add_epi32(input[27], input[36]);
- x1[36] = _mm_sub_epi32(input[27], input[36]);
- x1[28] = _mm_add_epi32(input[28], input[35]);
- x1[35] = _mm_sub_epi32(input[28], input[35]);
- x1[29] = _mm_add_epi32(input[29], input[34]);
- x1[34] = _mm_sub_epi32(input[29], input[34]);
- x1[30] = _mm_add_epi32(input[30], input[33]);
- x1[33] = _mm_sub_epi32(input[30], input[33]);
- x1[31] = _mm_add_epi32(input[31], input[32]);
- x1[32] = _mm_sub_epi32(input[31], input[32]);
+ x1[0] = _mm_add_epi32(input[0 * instride], input[63 * instride]);
+ x1[63] = _mm_sub_epi32(input[0 * instride], input[63 * instride]);
+ x1[1] = _mm_add_epi32(input[1 * instride], input[62 * instride]);
+ x1[62] = _mm_sub_epi32(input[1 * instride], input[62 * instride]);
+ x1[2] = _mm_add_epi32(input[2 * instride], input[61 * instride]);
+ x1[61] = _mm_sub_epi32(input[2 * instride], input[61 * instride]);
+ x1[3] = _mm_add_epi32(input[3 * instride], input[60 * instride]);
+ x1[60] = _mm_sub_epi32(input[3 * instride], input[60 * instride]);
+ x1[4] = _mm_add_epi32(input[4 * instride], input[59 * instride]);
+ x1[59] = _mm_sub_epi32(input[4 * instride], input[59 * instride]);
+ x1[5] = _mm_add_epi32(input[5 * instride], input[58 * instride]);
+ x1[58] = _mm_sub_epi32(input[5 * instride], input[58 * instride]);
+ x1[6] = _mm_add_epi32(input[6 * instride], input[57 * instride]);
+ x1[57] = _mm_sub_epi32(input[6 * instride], input[57 * instride]);
+ x1[7] = _mm_add_epi32(input[7 * instride], input[56 * instride]);
+ x1[56] = _mm_sub_epi32(input[7 * instride], input[56 * instride]);
+ x1[8] = _mm_add_epi32(input[8 * instride], input[55 * instride]);
+ x1[55] = _mm_sub_epi32(input[8 * instride], input[55 * instride]);
+ x1[9] = _mm_add_epi32(input[9 * instride], input[54 * instride]);
+ x1[54] = _mm_sub_epi32(input[9 * instride], input[54 * instride]);
+ x1[10] = _mm_add_epi32(input[10 * instride], input[53 * instride]);
+ x1[53] = _mm_sub_epi32(input[10 * instride], input[53 * instride]);
+ x1[11] = _mm_add_epi32(input[11 * instride], input[52 * instride]);
+ x1[52] = _mm_sub_epi32(input[11 * instride], input[52 * instride]);
+ x1[12] = _mm_add_epi32(input[12 * instride], input[51 * instride]);
+ x1[51] = _mm_sub_epi32(input[12 * instride], input[51 * instride]);
+ x1[13] = _mm_add_epi32(input[13 * instride], input[50 * instride]);
+ x1[50] = _mm_sub_epi32(input[13 * instride], input[50 * instride]);
+ x1[14] = _mm_add_epi32(input[14 * instride], input[49 * instride]);
+ x1[49] = _mm_sub_epi32(input[14 * instride], input[49 * instride]);
+ x1[15] = _mm_add_epi32(input[15 * instride], input[48 * instride]);
+ x1[48] = _mm_sub_epi32(input[15 * instride], input[48 * instride]);
+ x1[16] = _mm_add_epi32(input[16 * instride], input[47 * instride]);
+ x1[47] = _mm_sub_epi32(input[16 * instride], input[47 * instride]);
+ x1[17] = _mm_add_epi32(input[17 * instride], input[46 * instride]);
+ x1[46] = _mm_sub_epi32(input[17 * instride], input[46 * instride]);
+ x1[18] = _mm_add_epi32(input[18 * instride], input[45 * instride]);
+ x1[45] = _mm_sub_epi32(input[18 * instride], input[45 * instride]);
+ x1[19] = _mm_add_epi32(input[19 * instride], input[44 * instride]);
+ x1[44] = _mm_sub_epi32(input[19 * instride], input[44 * instride]);
+ x1[20] = _mm_add_epi32(input[20 * instride], input[43 * instride]);
+ x1[43] = _mm_sub_epi32(input[20 * instride], input[43 * instride]);
+ x1[21] = _mm_add_epi32(input[21 * instride], input[42 * instride]);
+ x1[42] = _mm_sub_epi32(input[21 * instride], input[42 * instride]);
+ x1[22] = _mm_add_epi32(input[22 * instride], input[41 * instride]);
+ x1[41] = _mm_sub_epi32(input[22 * instride], input[41 * instride]);
+ x1[23] = _mm_add_epi32(input[23 * instride], input[40 * instride]);
+ x1[40] = _mm_sub_epi32(input[23 * instride], input[40 * instride]);
+ x1[24] = _mm_add_epi32(input[24 * instride], input[39 * instride]);
+ x1[39] = _mm_sub_epi32(input[24 * instride], input[39 * instride]);
+ x1[25] = _mm_add_epi32(input[25 * instride], input[38 * instride]);
+ x1[38] = _mm_sub_epi32(input[25 * instride], input[38 * instride]);
+ x1[26] = _mm_add_epi32(input[26 * instride], input[37 * instride]);
+ x1[37] = _mm_sub_epi32(input[26 * instride], input[37 * instride]);
+ x1[27] = _mm_add_epi32(input[27 * instride], input[36 * instride]);
+ x1[36] = _mm_sub_epi32(input[27 * instride], input[36 * instride]);
+ x1[28] = _mm_add_epi32(input[28 * instride], input[35 * instride]);
+ x1[35] = _mm_sub_epi32(input[28 * instride], input[35 * instride]);
+ x1[29] = _mm_add_epi32(input[29 * instride], input[34 * instride]);
+ x1[34] = _mm_sub_epi32(input[29 * instride], input[34 * instride]);
+ x1[30] = _mm_add_epi32(input[30 * instride], input[33 * instride]);
+ x1[33] = _mm_sub_epi32(input[30 * instride], input[33 * instride]);
+ x1[31] = _mm_add_epi32(input[31 * instride], input[32 * instride]);
+ x1[32] = _mm_sub_epi32(input[31 * instride], input[32 * instride]);
// stage 2
__m128i x2[64];
@@ -1149,68 +1150,68 @@ void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
x10[48], __rounding, cos_bit);
// stage 11
- output[0] = x10[0];
- output[1] = x10[32];
- output[2] = x10[16];
- output[3] = x10[48];
- output[4] = x10[8];
- output[5] = x10[40];
- output[6] = x10[24];
- output[7] = x10[56];
- output[8] = x10[4];
- output[9] = x10[36];
- output[10] = x10[20];
- output[11] = x10[52];
- output[12] = x10[12];
- output[13] = x10[44];
- output[14] = x10[28];
- output[15] = x10[60];
- output[16] = x10[2];
- output[17] = x10[34];
- output[18] = x10[18];
- output[19] = x10[50];
- output[20] = x10[10];
- output[21] = x10[42];
- output[22] = x10[26];
- output[23] = x10[58];
- output[24] = x10[6];
- output[25] = x10[38];
- output[26] = x10[22];
- output[27] = x10[54];
- output[28] = x10[14];
- output[29] = x10[46];
- output[30] = x10[30];
- output[31] = x10[62];
- output[32] = x10[1];
- output[33] = x10[33];
- output[34] = x10[17];
- output[35] = x10[49];
- output[36] = x10[9];
- output[37] = x10[41];
- output[38] = x10[25];
- output[39] = x10[57];
- output[40] = x10[5];
- output[41] = x10[37];
- output[42] = x10[21];
- output[43] = x10[53];
- output[44] = x10[13];
- output[45] = x10[45];
- output[46] = x10[29];
- output[47] = x10[61];
- output[48] = x10[3];
- output[49] = x10[35];
- output[50] = x10[19];
- output[51] = x10[51];
- output[52] = x10[11];
- output[53] = x10[43];
- output[54] = x10[27];
- output[55] = x10[59];
- output[56] = x10[7];
- output[57] = x10[39];
- output[58] = x10[23];
- output[59] = x10[55];
- output[60] = x10[15];
- output[61] = x10[47];
- output[62] = x10[31];
- output[63] = x10[63];
+ output[0 * outstride] = x10[0];
+ output[1 * outstride] = x10[32];
+ output[2 * outstride] = x10[16];
+ output[3 * outstride] = x10[48];
+ output[4 * outstride] = x10[8];
+ output[5 * outstride] = x10[40];
+ output[6 * outstride] = x10[24];
+ output[7 * outstride] = x10[56];
+ output[8 * outstride] = x10[4];
+ output[9 * outstride] = x10[36];
+ output[10 * outstride] = x10[20];
+ output[11 * outstride] = x10[52];
+ output[12 * outstride] = x10[12];
+ output[13 * outstride] = x10[44];
+ output[14 * outstride] = x10[28];
+ output[15 * outstride] = x10[60];
+ output[16 * outstride] = x10[2];
+ output[17 * outstride] = x10[34];
+ output[18 * outstride] = x10[18];
+ output[19 * outstride] = x10[50];
+ output[20 * outstride] = x10[10];
+ output[21 * outstride] = x10[42];
+ output[22 * outstride] = x10[26];
+ output[23 * outstride] = x10[58];
+ output[24 * outstride] = x10[6];
+ output[25 * outstride] = x10[38];
+ output[26 * outstride] = x10[22];
+ output[27 * outstride] = x10[54];
+ output[28 * outstride] = x10[14];
+ output[29 * outstride] = x10[46];
+ output[30 * outstride] = x10[30];
+ output[31 * outstride] = x10[62];
+ output[32 * outstride] = x10[1];
+ output[33 * outstride] = x10[33];
+ output[34 * outstride] = x10[17];
+ output[35 * outstride] = x10[49];
+ output[36 * outstride] = x10[9];
+ output[37 * outstride] = x10[41];
+ output[38 * outstride] = x10[25];
+ output[39 * outstride] = x10[57];
+ output[40 * outstride] = x10[5];
+ output[41 * outstride] = x10[37];
+ output[42 * outstride] = x10[21];
+ output[43 * outstride] = x10[53];
+ output[44 * outstride] = x10[13];
+ output[45 * outstride] = x10[45];
+ output[46 * outstride] = x10[29];
+ output[47 * outstride] = x10[61];
+ output[48 * outstride] = x10[3];
+ output[49 * outstride] = x10[35];
+ output[50 * outstride] = x10[19];
+ output[51 * outstride] = x10[51];
+ output[52 * outstride] = x10[11];
+ output[53 * outstride] = x10[43];
+ output[54 * outstride] = x10[27];
+ output[55 * outstride] = x10[59];
+ output[56 * outstride] = x10[7];
+ output[57 * outstride] = x10[39];
+ output[58 * outstride] = x10[23];
+ output[59 * outstride] = x10[55];
+ output[60 * outstride] = x10[15];
+ output[61 * outstride] = x10[47];
+ output[62 * outstride] = x10[31];
+ output[63 * outstride] = x10[63];
}