Diffstat (limited to 'third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c')
-rw-r--r--  third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c  180
1 file changed, 91 insertions(+), 89 deletions(-)
diff --git a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
index 198e4e4c4..8495ad1aa 100644
--- a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
+++ b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
@@ -269,8 +269,8 @@ static void fdct16_avx2(__m256i *in) {
x0 = _mm256_unpacklo_epi16(v0, v1);
x1 = _mm256_unpackhi_epi16(v0, v1);
- t0 = butter_fly(x0, x1, cospi_p16_p16);
- t1 = butter_fly(x0, x1, cospi_p16_m16);
+ t0 = butter_fly(&x0, &x1, &cospi_p16_p16);
+ t1 = butter_fly(&x0, &x1, &cospi_p16_m16);
// 4, 12
v0 = _mm256_sub_epi16(s1, s2);
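[Editor's note: this hunk and the many like it below are one mechanical refactor: butter_fly now receives its two interleaved inputs and the packed cosine-pair constant by pointer rather than by value, sparing three 32-byte __m256i copies per call on ABIs that pass wide vectors through memory. A minimal sketch of the pointer-taking helper, assuming the usual madd/round/shift butterfly body and the DCT_CONST_BITS / DCT_CONST_ROUNDING constants used elsewhere in this file:

#include <immintrin.h>

// Sketch under assumptions: the real definition lives earlier in this
// file; names and rounding constants are taken from the surrounding code.
static INLINE __m256i butter_fly(const __m256i *a0, const __m256i *a1,
                                 const __m256i *cospi) {
  const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
  __m256i y0 = _mm256_madd_epi16(*a0, *cospi);  // per-lane a*c0 + b*c1 (lo)
  __m256i y1 = _mm256_madd_epi16(*a1, *cospi);  // per-lane a*c0 + b*c1 (hi)
  y0 = _mm256_srai_epi32(_mm256_add_epi32(y0, dct_rounding), DCT_CONST_BITS);
  y1 = _mm256_srai_epi32(_mm256_add_epi32(y1, dct_rounding), DCT_CONST_BITS);
  return _mm256_packs_epi32(y0, y1);  // saturate back to 16-bit lanes
}]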
@@ -279,8 +279,8 @@ static void fdct16_avx2(__m256i *in) {
x0 = _mm256_unpacklo_epi16(v0, v1);
x1 = _mm256_unpackhi_epi16(v0, v1);
- t2 = butter_fly(x0, x1, cospi_p24_p08);
- t3 = butter_fly(x0, x1, cospi_m08_p24);
+ t2 = butter_fly(&x0, &x1, &cospi_p24_p08);
+ t3 = butter_fly(&x0, &x1, &cospi_m08_p24);
// 2, 6, 10, 14
s0 = _mm256_sub_epi16(u3, u4);
@@ -294,8 +294,8 @@ static void fdct16_avx2(__m256i *in) {
x0 = _mm256_unpacklo_epi16(s2, s1);
x1 = _mm256_unpackhi_epi16(s2, s1);
- v2 = butter_fly(x0, x1, cospi_p16_p16); // output[5]
- v1 = butter_fly(x0, x1, cospi_p16_m16); // output[6]
+ v2 = butter_fly(&x0, &x1, &cospi_p16_p16); // output[5]
+ v1 = butter_fly(&x0, &x1, &cospi_p16_m16); // output[6]
s0 = _mm256_add_epi16(v0, v1); // step[4]
s1 = _mm256_sub_epi16(v0, v1); // step[5]
@@ -306,14 +306,14 @@ static void fdct16_avx2(__m256i *in) {
x0 = _mm256_unpacklo_epi16(s0, s3);
x1 = _mm256_unpackhi_epi16(s0, s3);
- t4 = butter_fly(x0, x1, cospi_p28_p04);
- t5 = butter_fly(x0, x1, cospi_m04_p28);
+ t4 = butter_fly(&x0, &x1, &cospi_p28_p04);
+ t5 = butter_fly(&x0, &x1, &cospi_m04_p28);
// 10, 6
x0 = _mm256_unpacklo_epi16(s1, s2);
x1 = _mm256_unpackhi_epi16(s1, s2);
- t6 = butter_fly(x0, x1, cospi_p12_p20);
- t7 = butter_fly(x0, x1, cospi_m20_p12);
+ t6 = butter_fly(&x0, &x1, &cospi_p12_p20);
+ t7 = butter_fly(&x0, &x1, &cospi_m20_p12);
// 1, 3, 5, 7, 9, 11, 13, 15
s0 = _mm256_sub_epi16(in[7], in[8]); // step[8]
@@ -337,14 +337,14 @@ static void fdct16_avx2(__m256i *in) {
x0 = _mm256_unpacklo_epi16(u5, u2);
x1 = _mm256_unpackhi_epi16(u5, u2);
- s2 = butter_fly(x0, x1, cospi_p16_p16); // step[13]
- s5 = butter_fly(x0, x1, cospi_p16_m16); // step[10]
+ s2 = butter_fly(&x0, &x1, &cospi_p16_p16); // step[13]
+ s5 = butter_fly(&x0, &x1, &cospi_p16_m16); // step[10]
x0 = _mm256_unpacklo_epi16(u4, u3);
x1 = _mm256_unpackhi_epi16(u4, u3);
- s3 = butter_fly(x0, x1, cospi_p16_p16); // step[12]
- s4 = butter_fly(x0, x1, cospi_p16_m16); // step[11]
+ s3 = butter_fly(&x0, &x1, &cospi_p16_p16); // step[12]
+ s4 = butter_fly(&x0, &x1, &cospi_p16_m16); // step[11]
u0 = _mm256_add_epi16(s0, s4); // output[8]
u1 = _mm256_add_epi16(s1, s5);
@@ -364,14 +364,14 @@ static void fdct16_avx2(__m256i *in) {
x0 = _mm256_unpacklo_epi16(u1, u6);
x1 = _mm256_unpackhi_epi16(u1, u6);
- s1 = butter_fly(x0, x1, cospi_m08_p24);
- s6 = butter_fly(x0, x1, cospi_p24_p08);
+ s1 = butter_fly(&x0, &x1, &cospi_m08_p24);
+ s6 = butter_fly(&x0, &x1, &cospi_p24_p08);
x0 = _mm256_unpacklo_epi16(u2, u5);
x1 = _mm256_unpackhi_epi16(u2, u5);
- s2 = butter_fly(x0, x1, cospi_m24_m08);
- s5 = butter_fly(x0, x1, cospi_m08_p24);
+ s2 = butter_fly(&x0, &x1, &cospi_m24_m08);
+ s5 = butter_fly(&x0, &x1, &cospi_m08_p24);
// stage 5
u0 = _mm256_add_epi16(s0, s1);
@@ -386,23 +386,23 @@ static void fdct16_avx2(__m256i *in) {
// stage 6
x0 = _mm256_unpacklo_epi16(u0, u7);
x1 = _mm256_unpackhi_epi16(u0, u7);
- in[1] = butter_fly(x0, x1, cospi_p30_p02);
- in[15] = butter_fly(x0, x1, cospi_m02_p30);
+ in[1] = butter_fly(&x0, &x1, &cospi_p30_p02);
+ in[15] = butter_fly(&x0, &x1, &cospi_m02_p30);
x0 = _mm256_unpacklo_epi16(u1, u6);
x1 = _mm256_unpackhi_epi16(u1, u6);
- in[9] = butter_fly(x0, x1, cospi_p14_p18);
- in[7] = butter_fly(x0, x1, cospi_m18_p14);
+ in[9] = butter_fly(&x0, &x1, &cospi_p14_p18);
+ in[7] = butter_fly(&x0, &x1, &cospi_m18_p14);
x0 = _mm256_unpacklo_epi16(u2, u5);
x1 = _mm256_unpackhi_epi16(u2, u5);
- in[5] = butter_fly(x0, x1, cospi_p22_p10);
- in[11] = butter_fly(x0, x1, cospi_m10_p22);
+ in[5] = butter_fly(&x0, &x1, &cospi_p22_p10);
+ in[11] = butter_fly(&x0, &x1, &cospi_m10_p22);
x0 = _mm256_unpacklo_epi16(u3, u4);
x1 = _mm256_unpackhi_epi16(u3, u4);
- in[13] = butter_fly(x0, x1, cospi_p06_p26);
- in[3] = butter_fly(x0, x1, cospi_m26_p06);
+ in[13] = butter_fly(&x0, &x1, &cospi_p06_p26);
+ in[3] = butter_fly(&x0, &x1, &cospi_m26_p06);
}
void fadst16_avx2(__m256i *in) {
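[Editor's note: for context on the constants threaded through fdct16_avx2 above, each cospi_* vector packs two Q14 cosine table entries into every 32-bit lane, so _mm256_madd_epi16 against the unpacklo/unpackhi interleavings of two rows computes a*c0 + b*c1 per lane, and the paired butter_fly calls realize each stage's 2x2 rotation. A hedged sketch of how such a constant is presumably built (pair256_set_epi16 and cospi_16_64 are assumed from the surrounding tree, not shown in this diff):

#include <immintrin.h>
#include <stdint.h>

// Assumption: every 32-bit lane holds the little-endian pair (c0, c1),
// matching _mm256_madd_epi16's per-lane a0*c0 + a1*c1 dot product.
static INLINE __m256i pair256_set_epi16(int16_t c0, int16_t c1) {
  return _mm256_set1_epi32(
      (int)((uint16_t)c0 | ((uint32_t)(uint16_t)c1 << 16)));
}

// e.g., inside fdct16_avx2:
//   const __m256i cospi_p16_p16 = pair256_set_epi16(cospi_16_64, cospi_16_64);
//   const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);]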
@@ -953,7 +953,9 @@ void fadst16_avx2(__m256i *in) {
}
#if CONFIG_EXT_TX
-static void fidtx16_avx2(__m256i *in) { txfm_scaling16_avx2(Sqrt2, in); }
+static void fidtx16_avx2(__m256i *in) {
+ txfm_scaling16_avx2((int16_t)Sqrt2, in);
+}
#endif
void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride,
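[Editor's note: the fidtx16_avx2 change above, like the matching _mm256_set1_epi16 cast in fhalfright32_16col_avx2 at the bottom of this diff, only adds an explicit narrowing cast. Sqrt2 is declared as a wider integer (sqrt(2) in Q12 fixed point, presumably 5793) while the scaling helpers take a 16-bit value, so the cast documents the intentional truncation and silences implicit-conversion warnings without changing behavior. A hypothetical sanity check, not part of the tree:

#include <assert.h>
#include <stdint.h>

// Hypothetical: assumes av1's Sqrt2 constant is in scope. The narrowing
// is value-preserving because Sqrt2 (assumed 5793) is below INT16_MAX.
static void check_sqrt2_narrowing(void) {
  assert(Sqrt2 <= INT16_MAX);
  assert((int16_t)Sqrt2 == Sqrt2);
}]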
@@ -964,28 +966,28 @@ void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride,
case DCT_DCT:
load_buffer_16x16(input, stride, 0, 0, in);
fdct16_avx2(in);
- mm256_transpose_16x16(in);
+ mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fdct16_avx2(in);
break;
case ADST_DCT:
load_buffer_16x16(input, stride, 0, 0, in);
fadst16_avx2(in);
- mm256_transpose_16x16(in);
+ mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fdct16_avx2(in);
break;
case DCT_ADST:
load_buffer_16x16(input, stride, 0, 0, in);
fdct16_avx2(in);
- mm256_transpose_16x16(in);
+ mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fadst16_avx2(in);
break;
case ADST_ADST:
load_buffer_16x16(input, stride, 0, 0, in);
fadst16_avx2(in);
- mm256_transpose_16x16(in);
+ mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fadst16_avx2(in);
break;
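[Editor's note: every case in this switch repeats the same 2-D shape: load (with optional flips), first 1-D pass, transpose, rescale, second 1-D pass, plus a shared final transpose after the switch. The second argument added to mm256_transpose_16x16 is a destination array; every call site in this diff passes src == dst for an in-place transpose, but the new signature also permits out-of-place use. A hypothetical distillation of the repeated case bodies, using only names visible at the call sites above:

// Sketch under assumptions: helper names are taken from this diff's
// call sites; the wrapper itself does not exist in the tree.
typedef void (*txfm_1d_fn)(__m256i *in);

static void fht16x16_2d(const int16_t *input, int stride, int flipud,
                        int fliplr, txfm_1d_fn first_pass,
                        txfm_1d_fn second_pass, __m256i *in) {
  load_buffer_16x16(input, stride, flipud, fliplr, in);
  first_pass(in);
  mm256_transpose_16x16(in, in);  // in-place: src and dst may alias
  right_shift_16x16(in);          // rescale between the two 1-D passes
  second_pass(in);
}]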
@@ -993,91 +995,91 @@ void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride,
case FLIPADST_DCT:
load_buffer_16x16(input, stride, 1, 0, in);
fadst16_avx2(in);
- mm256_transpose_16x16(in);
+ mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fdct16_avx2(in);
break;
case DCT_FLIPADST:
load_buffer_16x16(input, stride, 0, 1, in);
fdct16_avx2(in);
- mm256_transpose_16x16(in);
+ mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fadst16_avx2(in);
break;
case FLIPADST_FLIPADST:
load_buffer_16x16(input, stride, 1, 1, in);
fadst16_avx2(in);
- mm256_transpose_16x16(in);
+ mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fadst16_avx2(in);
break;
case ADST_FLIPADST:
load_buffer_16x16(input, stride, 0, 1, in);
fadst16_avx2(in);
- mm256_transpose_16x16(in);
+ mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fadst16_avx2(in);
break;
case FLIPADST_ADST:
load_buffer_16x16(input, stride, 1, 0, in);
fadst16_avx2(in);
- mm256_transpose_16x16(in);
+ mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fadst16_avx2(in);
break;
case IDTX:
load_buffer_16x16(input, stride, 0, 0, in);
fidtx16_avx2(in);
- mm256_transpose_16x16(in);
+ mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fidtx16_avx2(in);
break;
case V_DCT:
load_buffer_16x16(input, stride, 0, 0, in);
fdct16_avx2(in);
- mm256_transpose_16x16(in);
+ mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fidtx16_avx2(in);
break;
case H_DCT:
load_buffer_16x16(input, stride, 0, 0, in);
fidtx16_avx2(in);
- mm256_transpose_16x16(in);
+ mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fdct16_avx2(in);
break;
case V_ADST:
load_buffer_16x16(input, stride, 0, 0, in);
fadst16_avx2(in);
- mm256_transpose_16x16(in);
+ mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fidtx16_avx2(in);
break;
case H_ADST:
load_buffer_16x16(input, stride, 0, 0, in);
fidtx16_avx2(in);
- mm256_transpose_16x16(in);
+ mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fadst16_avx2(in);
break;
case V_FLIPADST:
load_buffer_16x16(input, stride, 1, 0, in);
fadst16_avx2(in);
- mm256_transpose_16x16(in);
+ mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fidtx16_avx2(in);
break;
case H_FLIPADST:
load_buffer_16x16(input, stride, 0, 1, in);
fidtx16_avx2(in);
- mm256_transpose_16x16(in);
+ mm256_transpose_16x16(in, in);
right_shift_16x16(in);
fadst16_avx2(in);
break;
#endif // CONFIG_EXT_TX
default: assert(0); break;
}
- mm256_transpose_16x16(in);
+ mm256_transpose_16x16(in, in);
write_buffer_16x16(in, output);
_mm256_zeroupper();
}
@@ -1110,10 +1112,10 @@ static void mm256_vectors_swap(__m256i *a0, __m256i *a1, const int size) {
}
static void mm256_transpose_32x32(__m256i *in0, __m256i *in1) {
- mm256_transpose_16x16(in0);
- mm256_transpose_16x16(&in0[16]);
- mm256_transpose_16x16(in1);
- mm256_transpose_16x16(&in1[16]);
+ mm256_transpose_16x16(in0, in0);
+ mm256_transpose_16x16(&in0[16], &in0[16]);
+ mm256_transpose_16x16(in1, in1);
+ mm256_transpose_16x16(&in1[16], &in1[16]);
mm256_vectors_swap(&in0[16], in1, 16);
}
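[Editor's note: the 32x32 transpose above follows the block-matrix identity [[A,B],[C,D]]^T = [[A^T,C^T],[B^T,D^T]]: each 16x16 quadrant is transposed in place (again the new two-argument form with src == dst), then the two off-diagonal quadrants are exchanged by the single mm256_vectors_swap call. A hedged sketch of that swap helper, whose signature the hunk header shows but whose body this diff does not:

// Assumption: a plain element-wise exchange between the two
// off-diagonal quadrants; the real definition sits just above this hunk.
static void mm256_vectors_swap(__m256i *a0, __m256i *a1, const int size) {
  for (int i = 0; i < size; ++i) {
    const __m256i tmp = a0[i];
    a0[i] = a1[i];
    a1[i] = tmp;
  }
}]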
@@ -1247,23 +1249,23 @@ static void fdct16_odd_avx2(__m256i *in) {
u0 = _mm256_unpacklo_epi16(in[4], in[11]);
u1 = _mm256_unpackhi_epi16(in[4], in[11]);
- y4 = butter_fly(u0, u1, cospi_m16_p16);
- y11 = butter_fly(u0, u1, cospi_p16_p16);
+ y4 = butter_fly(&u0, &u1, &cospi_m16_p16);
+ y11 = butter_fly(&u0, &u1, &cospi_p16_p16);
u0 = _mm256_unpacklo_epi16(in[5], in[10]);
u1 = _mm256_unpackhi_epi16(in[5], in[10]);
- y5 = butter_fly(u0, u1, cospi_m16_p16);
- y10 = butter_fly(u0, u1, cospi_p16_p16);
+ y5 = butter_fly(&u0, &u1, &cospi_m16_p16);
+ y10 = butter_fly(&u0, &u1, &cospi_p16_p16);
u0 = _mm256_unpacklo_epi16(in[6], in[9]);
u1 = _mm256_unpackhi_epi16(in[6], in[9]);
- y6 = butter_fly(u0, u1, cospi_m16_p16);
- y9 = butter_fly(u0, u1, cospi_p16_p16);
+ y6 = butter_fly(&u0, &u1, &cospi_m16_p16);
+ y9 = butter_fly(&u0, &u1, &cospi_p16_p16);
u0 = _mm256_unpacklo_epi16(in[7], in[8]);
u1 = _mm256_unpackhi_epi16(in[7], in[8]);
- y7 = butter_fly(u0, u1, cospi_m16_p16);
- y8 = butter_fly(u0, u1, cospi_p16_p16);
+ y7 = butter_fly(&u0, &u1, &cospi_m16_p16);
+ y8 = butter_fly(&u0, &u1, &cospi_p16_p16);
y12 = in[12];
y13 = in[13];
@@ -1300,23 +1302,23 @@ static void fdct16_odd_avx2(__m256i *in) {
u0 = _mm256_unpacklo_epi16(x2, x13);
u1 = _mm256_unpackhi_epi16(x2, x13);
- y2 = butter_fly(u0, u1, cospi_m08_p24);
- y13 = butter_fly(u0, u1, cospi_p24_p08);
+ y2 = butter_fly(&u0, &u1, &cospi_m08_p24);
+ y13 = butter_fly(&u0, &u1, &cospi_p24_p08);
u0 = _mm256_unpacklo_epi16(x3, x12);
u1 = _mm256_unpackhi_epi16(x3, x12);
- y3 = butter_fly(u0, u1, cospi_m08_p24);
- y12 = butter_fly(u0, u1, cospi_p24_p08);
+ y3 = butter_fly(&u0, &u1, &cospi_m08_p24);
+ y12 = butter_fly(&u0, &u1, &cospi_p24_p08);
u0 = _mm256_unpacklo_epi16(x4, x11);
u1 = _mm256_unpackhi_epi16(x4, x11);
- y4 = butter_fly(u0, u1, cospi_m24_m08);
- y11 = butter_fly(u0, u1, cospi_m08_p24);
+ y4 = butter_fly(&u0, &u1, &cospi_m24_m08);
+ y11 = butter_fly(&u0, &u1, &cospi_m08_p24);
u0 = _mm256_unpacklo_epi16(x5, x10);
u1 = _mm256_unpackhi_epi16(x5, x10);
- y5 = butter_fly(u0, u1, cospi_m24_m08);
- y10 = butter_fly(u0, u1, cospi_m08_p24);
+ y5 = butter_fly(&u0, &u1, &cospi_m24_m08);
+ y10 = butter_fly(&u0, &u1, &cospi_m08_p24);
// stage 5
x0 = _mm256_add_epi16(y0, y3);
@@ -1349,23 +1351,23 @@ static void fdct16_odd_avx2(__m256i *in) {
u0 = _mm256_unpacklo_epi16(x1, x14);
u1 = _mm256_unpackhi_epi16(x1, x14);
- y1 = butter_fly(u0, u1, cospi_m04_p28);
- y14 = butter_fly(u0, u1, cospi_p28_p04);
+ y1 = butter_fly(&u0, &u1, &cospi_m04_p28);
+ y14 = butter_fly(&u0, &u1, &cospi_p28_p04);
u0 = _mm256_unpacklo_epi16(x2, x13);
u1 = _mm256_unpackhi_epi16(x2, x13);
- y2 = butter_fly(u0, u1, cospi_m28_m04);
- y13 = butter_fly(u0, u1, cospi_m04_p28);
+ y2 = butter_fly(&u0, &u1, &cospi_m28_m04);
+ y13 = butter_fly(&u0, &u1, &cospi_m04_p28);
u0 = _mm256_unpacklo_epi16(x5, x10);
u1 = _mm256_unpackhi_epi16(x5, x10);
- y5 = butter_fly(u0, u1, cospi_m20_p12);
- y10 = butter_fly(u0, u1, cospi_p12_p20);
+ y5 = butter_fly(&u0, &u1, &cospi_m20_p12);
+ y10 = butter_fly(&u0, &u1, &cospi_p12_p20);
u0 = _mm256_unpacklo_epi16(x6, x9);
u1 = _mm256_unpackhi_epi16(x6, x9);
- y6 = butter_fly(u0, u1, cospi_m12_m20);
- y9 = butter_fly(u0, u1, cospi_m20_p12);
+ y6 = butter_fly(&u0, &u1, &cospi_m12_m20);
+ y9 = butter_fly(&u0, &u1, &cospi_m20_p12);
// stage 7
x0 = _mm256_add_epi16(y0, y1);
@@ -1389,43 +1391,43 @@ static void fdct16_odd_avx2(__m256i *in) {
// stage 8
u0 = _mm256_unpacklo_epi16(x0, x15);
u1 = _mm256_unpackhi_epi16(x0, x15);
- in[0] = butter_fly(u0, u1, cospi_p31_p01);
- in[15] = butter_fly(u0, u1, cospi_m01_p31);
+ in[0] = butter_fly(&u0, &u1, &cospi_p31_p01);
+ in[15] = butter_fly(&u0, &u1, &cospi_m01_p31);
u0 = _mm256_unpacklo_epi16(x1, x14);
u1 = _mm256_unpackhi_epi16(x1, x14);
- in[1] = butter_fly(u0, u1, cospi_p15_p17);
- in[14] = butter_fly(u0, u1, cospi_m17_p15);
+ in[1] = butter_fly(&u0, &u1, &cospi_p15_p17);
+ in[14] = butter_fly(&u0, &u1, &cospi_m17_p15);
u0 = _mm256_unpacklo_epi16(x2, x13);
u1 = _mm256_unpackhi_epi16(x2, x13);
- in[2] = butter_fly(u0, u1, cospi_p23_p09);
- in[13] = butter_fly(u0, u1, cospi_m09_p23);
+ in[2] = butter_fly(&u0, &u1, &cospi_p23_p09);
+ in[13] = butter_fly(&u0, &u1, &cospi_m09_p23);
u0 = _mm256_unpacklo_epi16(x3, x12);
u1 = _mm256_unpackhi_epi16(x3, x12);
- in[3] = butter_fly(u0, u1, cospi_p07_p25);
- in[12] = butter_fly(u0, u1, cospi_m25_p07);
+ in[3] = butter_fly(&u0, &u1, &cospi_p07_p25);
+ in[12] = butter_fly(&u0, &u1, &cospi_m25_p07);
u0 = _mm256_unpacklo_epi16(x4, x11);
u1 = _mm256_unpackhi_epi16(x4, x11);
- in[4] = butter_fly(u0, u1, cospi_p27_p05);
- in[11] = butter_fly(u0, u1, cospi_m05_p27);
+ in[4] = butter_fly(&u0, &u1, &cospi_p27_p05);
+ in[11] = butter_fly(&u0, &u1, &cospi_m05_p27);
u0 = _mm256_unpacklo_epi16(x5, x10);
u1 = _mm256_unpackhi_epi16(x5, x10);
- in[5] = butter_fly(u0, u1, cospi_p11_p21);
- in[10] = butter_fly(u0, u1, cospi_m21_p11);
+ in[5] = butter_fly(&u0, &u1, &cospi_p11_p21);
+ in[10] = butter_fly(&u0, &u1, &cospi_m21_p11);
u0 = _mm256_unpacklo_epi16(x6, x9);
u1 = _mm256_unpackhi_epi16(x6, x9);
- in[6] = butter_fly(u0, u1, cospi_p19_p13);
- in[9] = butter_fly(u0, u1, cospi_m13_p19);
+ in[6] = butter_fly(&u0, &u1, &cospi_p19_p13);
+ in[9] = butter_fly(&u0, &u1, &cospi_m13_p19);
u0 = _mm256_unpacklo_epi16(x7, x8);
u1 = _mm256_unpackhi_epi16(x7, x8);
- in[7] = butter_fly(u0, u1, cospi_p03_p29);
- in[8] = butter_fly(u0, u1, cospi_m29_p03);
+ in[7] = butter_fly(&u0, &u1, &cospi_p03_p29);
+ in[8] = butter_fly(&u0, &u1, &cospi_m29_p03);
}
static void fdct32_avx2(__m256i *in0, __m256i *in1) {
@@ -1464,7 +1466,7 @@ static INLINE void write_buffer_32x32(const __m256i *in0, const __m256i *in1,
static void fhalfright32_16col_avx2(__m256i *in) {
int i = 0;
const __m256i zero = _mm256_setzero_si256();
- const __m256i sqrt2 = _mm256_set1_epi16(Sqrt2);
+ const __m256i sqrt2 = _mm256_set1_epi16((int16_t)Sqrt2);
const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
__m256i x0, x1;