diff options
Diffstat (limited to 'third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h')
-rw-r--r-- | third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h | 207 |
1 files changed, 0 insertions, 207 deletions
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h index 58e8971dd..12ccf7f26 100644 --- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h @@ -12,15 +12,10 @@ #ifndef AOM_DSP_X86_FWD_TXFM_SSE2_H_ #define AOM_DSP_X86_FWD_TXFM_SSE2_H_ -#include "aom_dsp/x86/txfm_common_intrin.h" - #ifdef __cplusplus extern "C" { #endif -#define pair_set_epi32(a, b) \ - _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a)) - static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) { __m128i buf0, buf1; buf0 = _mm_mul_epu32(a, b); @@ -140,112 +135,6 @@ static INLINE int check_epi16_overflow_x32( return res0 + res1; } -static INLINE int k_check_epi32_overflow_4(const __m128i *preg0, - const __m128i *preg1, - const __m128i *preg2, - const __m128i *preg3, - const __m128i *zero) { - __m128i minus_one = _mm_set1_epi32(-1); - // Check for overflows - __m128i reg0_shifted = _mm_slli_epi64(*preg0, 1); - __m128i reg1_shifted = _mm_slli_epi64(*preg1, 1); - __m128i reg2_shifted = _mm_slli_epi64(*preg2, 1); - __m128i reg3_shifted = _mm_slli_epi64(*preg3, 1); - __m128i reg0_top_dwords = - _mm_shuffle_epi32(reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1)); - __m128i reg1_top_dwords = - _mm_shuffle_epi32(reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1)); - __m128i reg2_top_dwords = - _mm_shuffle_epi32(reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1)); - __m128i reg3_top_dwords = - _mm_shuffle_epi32(reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1)); - __m128i top_dwords_01 = _mm_unpacklo_epi64(reg0_top_dwords, reg1_top_dwords); - __m128i top_dwords_23 = _mm_unpacklo_epi64(reg2_top_dwords, reg3_top_dwords); - __m128i valid_positve_01 = _mm_cmpeq_epi32(top_dwords_01, *zero); - __m128i valid_positve_23 = _mm_cmpeq_epi32(top_dwords_23, *zero); - __m128i valid_negative_01 = _mm_cmpeq_epi32(top_dwords_01, minus_one); - __m128i valid_negative_23 = _mm_cmpeq_epi32(top_dwords_23, minus_one); - int overflow_01 = - _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_01, valid_negative_01)); - int overflow_23 = - _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_23, valid_negative_23)); - return (overflow_01 + overflow_23); -} - -static INLINE int k_check_epi32_overflow_8( - const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, - const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, - const __m128i *preg6, const __m128i *preg7, const __m128i *zero) { - int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero); - if (!overflow) { - overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero); - } - return overflow; -} - -static INLINE int k_check_epi32_overflow_16( - const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, - const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, - const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, - const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, - const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, - const __m128i *preg15, const __m128i *zero) { - int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero); - if (!overflow) { - overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero); - if (!overflow) { - overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero); - if (!overflow) { - overflow = - k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero); - } - } - } - return overflow; -} - -static INLINE int k_check_epi32_overflow_32( - const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, - const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, - const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, - const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, - const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, - const __m128i *preg15, const __m128i *preg16, const __m128i *preg17, - const __m128i *preg18, const __m128i *preg19, const __m128i *preg20, - const __m128i *preg21, const __m128i *preg22, const __m128i *preg23, - const __m128i *preg24, const __m128i *preg25, const __m128i *preg26, - const __m128i *preg27, const __m128i *preg28, const __m128i *preg29, - const __m128i *preg30, const __m128i *preg31, const __m128i *zero) { - int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero); - if (!overflow) { - overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero); - if (!overflow) { - overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero); - if (!overflow) { - overflow = - k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero); - if (!overflow) { - overflow = - k_check_epi32_overflow_4(preg16, preg17, preg18, preg19, zero); - if (!overflow) { - overflow = - k_check_epi32_overflow_4(preg20, preg21, preg22, preg23, zero); - if (!overflow) { - overflow = k_check_epi32_overflow_4(preg24, preg25, preg26, - preg27, zero); - if (!overflow) { - overflow = k_check_epi32_overflow_4(preg28, preg29, preg30, - preg31, zero); - } - } - } - } - } - } - } - return overflow; -} - static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { if (sizeof(tran_low_t) == 4) { const __m128i zero = _mm_setzero_si128(); @@ -259,102 +148,6 @@ static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { } } -static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1, - const __m128i *pmultiplier, - const __m128i *prounding, int shift) { - const __m128i u0 = _mm_madd_epi16(*pin0, *pmultiplier); - const __m128i u1 = _mm_madd_epi16(*pin1, *pmultiplier); - const __m128i v0 = _mm_add_epi32(u0, *prounding); - const __m128i v1 = _mm_add_epi32(u1, *prounding); - const __m128i w0 = _mm_srai_epi32(v0, shift); - const __m128i w1 = _mm_srai_epi32(v1, shift); - return _mm_packs_epi32(w0, w1); -} - -static INLINE void transpose_and_output8x8( - const __m128i *pin00, const __m128i *pin01, const __m128i *pin02, - const __m128i *pin03, const __m128i *pin04, const __m128i *pin05, - const __m128i *pin06, const __m128i *pin07, int pass, int16_t *out0_ptr, - tran_low_t *out1_ptr) { - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // 40 41 42 43 44 45 46 47 - // 50 51 52 53 54 55 56 57 - // 60 61 62 63 64 65 66 67 - // 70 71 72 73 74 75 76 77 - const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01); - const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03); - const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01); - const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03); - const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05); - const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07); - const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05); - const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 54 54 55 55 56 56 57 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 21 36 - // 44 54 64 74 45 55 61 76 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); - const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); - const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); - const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); - const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); - const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); - const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); - const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - if (pass == 0) { - _mm_storeu_si128((__m128i *)(out0_ptr + 0 * 16), tr2_0); - _mm_storeu_si128((__m128i *)(out0_ptr + 1 * 16), tr2_1); - _mm_storeu_si128((__m128i *)(out0_ptr + 2 * 16), tr2_2); - _mm_storeu_si128((__m128i *)(out0_ptr + 3 * 16), tr2_3); - _mm_storeu_si128((__m128i *)(out0_ptr + 4 * 16), tr2_4); - _mm_storeu_si128((__m128i *)(out0_ptr + 5 * 16), tr2_5); - _mm_storeu_si128((__m128i *)(out0_ptr + 6 * 16), tr2_6); - _mm_storeu_si128((__m128i *)(out0_ptr + 7 * 16), tr2_7); - } else { - storeu_output(&tr2_0, (out1_ptr + 0 * 16)); - storeu_output(&tr2_1, (out1_ptr + 1 * 16)); - storeu_output(&tr2_2, (out1_ptr + 2 * 16)); - storeu_output(&tr2_3, (out1_ptr + 3 * 16)); - storeu_output(&tr2_4, (out1_ptr + 4 * 16)); - storeu_output(&tr2_5, (out1_ptr + 5 * 16)); - storeu_output(&tr2_6, (out1_ptr + 6 * 16)); - storeu_output(&tr2_7, (out1_ptr + 7 * 16)); - } -} - -void fdct32_8col(__m128i *in0, __m128i *in1); - #ifdef __cplusplus } // extern "C" #endif |