diff options
Diffstat (limited to 'third_party/aom/av1/common/x86/idct_intrin_sse2.c')
-rw-r--r-- | third_party/aom/av1/common/x86/idct_intrin_sse2.c | 1402 |
1 files changed, 1402 insertions, 0 deletions
diff --git a/third_party/aom/av1/common/x86/idct_intrin_sse2.c b/third_party/aom/av1/common/x86/idct_intrin_sse2.c new file mode 100644 index 000000000..522e8988c --- /dev/null +++ b/third_party/aom/av1/common/x86/idct_intrin_sse2.c @@ -0,0 +1,1402 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./av1_rtcd.h" +#include "aom_dsp/x86/inv_txfm_sse2.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/txfm_common_sse2.h" +#include "aom_ports/mem.h" +#include "av1/common/enums.h" + +#if CONFIG_EXT_TX +static INLINE void fliplr_4x4(__m128i in[2]) { + in[0] = _mm_shufflelo_epi16(in[0], 0x1b); + in[0] = _mm_shufflehi_epi16(in[0], 0x1b); + in[1] = _mm_shufflelo_epi16(in[1], 0x1b); + in[1] = _mm_shufflehi_epi16(in[1], 0x1b); +} + +static INLINE void fliplr_8x8(__m128i in[8]) { + in[0] = mm_reverse_epi16(in[0]); + in[1] = mm_reverse_epi16(in[1]); + in[2] = mm_reverse_epi16(in[2]); + in[3] = mm_reverse_epi16(in[3]); + + in[4] = mm_reverse_epi16(in[4]); + in[5] = mm_reverse_epi16(in[5]); + in[6] = mm_reverse_epi16(in[6]); + in[7] = mm_reverse_epi16(in[7]); +} + +static INLINE void fliplr_16x8(__m128i in[16]) { + fliplr_8x8(&in[0]); + fliplr_8x8(&in[8]); +} + +#define FLIPLR_16x16(in0, in1) \ + do { \ + __m128i *tmp; \ + fliplr_16x8(in0); \ + fliplr_16x8(in1); \ + tmp = (in0); \ + (in0) = (in1); \ + (in1) = tmp; \ + } while (0) + +#define FLIPUD_PTR(dest, stride, size) \ + do { \ + (dest) = (dest) + ((size)-1) * (stride); \ + (stride) = -(stride); \ + } while (0) +#endif + +void av1_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + __m128i in[2]; + const __m128i zero = _mm_setzero_si128(); + const __m128i eight = _mm_set1_epi16(8); + + in[0] = load_input_data(input); + in[1] = load_input_data(input + 8); + + switch (tx_type) { + case DCT_DCT: + aom_idct4_sse2(in); + aom_idct4_sse2(in); + break; + case ADST_DCT: + aom_idct4_sse2(in); + aom_iadst4_sse2(in); + break; + case DCT_ADST: + aom_iadst4_sse2(in); + aom_idct4_sse2(in); + break; + case ADST_ADST: + aom_iadst4_sse2(in); + aom_iadst4_sse2(in); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + aom_idct4_sse2(in); + aom_iadst4_sse2(in); + FLIPUD_PTR(dest, stride, 4); + break; + case DCT_FLIPADST: + aom_iadst4_sse2(in); + aom_idct4_sse2(in); + fliplr_4x4(in); + break; + case FLIPADST_FLIPADST: + aom_iadst4_sse2(in); + aom_iadst4_sse2(in); + FLIPUD_PTR(dest, stride, 4); + fliplr_4x4(in); + break; + case ADST_FLIPADST: + aom_iadst4_sse2(in); + aom_iadst4_sse2(in); + fliplr_4x4(in); + break; + case FLIPADST_ADST: + aom_iadst4_sse2(in); + aom_iadst4_sse2(in); + FLIPUD_PTR(dest, stride, 4); + break; +#endif // CONFIG_EXT_TX + default: assert(0); break; + } + + // Final round and shift + in[0] = _mm_add_epi16(in[0], eight); + in[1] = _mm_add_epi16(in[1], eight); + + in[0] = _mm_srai_epi16(in[0], 4); + in[1] = _mm_srai_epi16(in[1], 4); + + // Reconstruction and Store + { + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0)); + __m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1)); + __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); + __m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)); + d0 = _mm_unpacklo_epi32(d0, d1); + d2 = _mm_unpacklo_epi32(d2, d3); + d0 = _mm_unpacklo_epi8(d0, zero); + d2 = _mm_unpacklo_epi8(d2, zero); + d0 = _mm_add_epi16(d0, in[0]); + d2 = _mm_add_epi16(d2, in[1]); + d0 = _mm_packus_epi16(d0, d2); + // store result[0] + *(int *)dest = _mm_cvtsi128_si32(d0); + // store result[1] + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); + // store result[2] + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); + // store result[3] + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); + } +} + +void av1_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + __m128i in[8]; + const __m128i zero = _mm_setzero_si128(); + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + + // load input data + in[0] = load_input_data(input); + in[1] = load_input_data(input + 8 * 1); + in[2] = load_input_data(input + 8 * 2); + in[3] = load_input_data(input + 8 * 3); + in[4] = load_input_data(input + 8 * 4); + in[5] = load_input_data(input + 8 * 5); + in[6] = load_input_data(input + 8 * 6); + in[7] = load_input_data(input + 8 * 7); + + switch (tx_type) { + case DCT_DCT: + aom_idct8_sse2(in); + aom_idct8_sse2(in); + break; + case ADST_DCT: + aom_idct8_sse2(in); + aom_iadst8_sse2(in); + break; + case DCT_ADST: + aom_iadst8_sse2(in); + aom_idct8_sse2(in); + break; + case ADST_ADST: + aom_iadst8_sse2(in); + aom_iadst8_sse2(in); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + aom_idct8_sse2(in); + aom_iadst8_sse2(in); + FLIPUD_PTR(dest, stride, 8); + break; + case DCT_FLIPADST: + aom_iadst8_sse2(in); + aom_idct8_sse2(in); + fliplr_8x8(in); + break; + case FLIPADST_FLIPADST: + aom_iadst8_sse2(in); + aom_iadst8_sse2(in); + FLIPUD_PTR(dest, stride, 8); + fliplr_8x8(in); + break; + case ADST_FLIPADST: + aom_iadst8_sse2(in); + aom_iadst8_sse2(in); + fliplr_8x8(in); + break; + case FLIPADST_ADST: + aom_iadst8_sse2(in); + aom_iadst8_sse2(in); + FLIPUD_PTR(dest, stride, 8); + break; +#endif // CONFIG_EXT_TX + default: assert(0); break; + } + + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 5); + in[1] = _mm_srai_epi16(in[1], 5); + in[2] = _mm_srai_epi16(in[2], 5); + in[3] = _mm_srai_epi16(in[3], 5); + in[4] = _mm_srai_epi16(in[4], 5); + in[5] = _mm_srai_epi16(in[5], 5); + in[6] = _mm_srai_epi16(in[6], 5); + in[7] = _mm_srai_epi16(in[7], 5); + + RECON_AND_STORE(dest + 0 * stride, in[0]); + RECON_AND_STORE(dest + 1 * stride, in[1]); + RECON_AND_STORE(dest + 2 * stride, in[2]); + RECON_AND_STORE(dest + 3 * stride, in[3]); + RECON_AND_STORE(dest + 4 * stride, in[4]); + RECON_AND_STORE(dest + 5 * stride, in[5]); + RECON_AND_STORE(dest + 6 * stride, in[6]); + RECON_AND_STORE(dest + 7 * stride, in[7]); +} + +#if CONFIG_EXT_TX +static void iidtx16_sse2(__m128i *in0, __m128i *in1) { + array_transpose_16x16(in0, in1); + idtx16_8col(in0); + idtx16_8col(in1); +} +#endif // CONFIG_EXT_TX + +void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + __m128i in[32]; + __m128i *in0 = &in[0]; + __m128i *in1 = &in[16]; + + load_buffer_8x16(input, in0); + input += 8; + load_buffer_8x16(input, in1); + + switch (tx_type) { + case DCT_DCT: + aom_idct16_sse2(in0, in1); + aom_idct16_sse2(in0, in1); + break; + case ADST_DCT: + aom_idct16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + break; + case DCT_ADST: + aom_iadst16_sse2(in0, in1); + aom_idct16_sse2(in0, in1); + break; + case ADST_ADST: + aom_iadst16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + aom_idct16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + FLIPUD_PTR(dest, stride, 16); + break; + case DCT_FLIPADST: + aom_iadst16_sse2(in0, in1); + aom_idct16_sse2(in0, in1); + FLIPLR_16x16(in0, in1); + break; + case FLIPADST_FLIPADST: + aom_iadst16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + FLIPUD_PTR(dest, stride, 16); + FLIPLR_16x16(in0, in1); + break; + case ADST_FLIPADST: + aom_iadst16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + FLIPLR_16x16(in0, in1); + break; + case FLIPADST_ADST: + aom_iadst16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + FLIPUD_PTR(dest, stride, 16); + break; + case IDTX: + iidtx16_sse2(in0, in1); + iidtx16_sse2(in0, in1); + break; + case V_DCT: + iidtx16_sse2(in0, in1); + aom_idct16_sse2(in0, in1); + break; + case H_DCT: + aom_idct16_sse2(in0, in1); + iidtx16_sse2(in0, in1); + break; + case V_ADST: + iidtx16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + break; + case H_ADST: + aom_iadst16_sse2(in0, in1); + iidtx16_sse2(in0, in1); + break; + case V_FLIPADST: + iidtx16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + FLIPUD_PTR(dest, stride, 16); + break; + case H_FLIPADST: + aom_iadst16_sse2(in0, in1); + iidtx16_sse2(in0, in1); + FLIPLR_16x16(in0, in1); + break; +#endif // CONFIG_EXT_TX + default: assert(0); break; + } + + write_buffer_8x16(dest, in0, stride); + dest += 8; + write_buffer_8x16(dest, in1, stride); +} + +#if CONFIG_EXT_TX +static void iidtx8_sse2(__m128i *in) { + in[0] = _mm_slli_epi16(in[0], 1); + in[1] = _mm_slli_epi16(in[1], 1); + in[2] = _mm_slli_epi16(in[2], 1); + in[3] = _mm_slli_epi16(in[3], 1); + in[4] = _mm_slli_epi16(in[4], 1); + in[5] = _mm_slli_epi16(in[5], 1); + in[6] = _mm_slli_epi16(in[6], 1); + in[7] = _mm_slli_epi16(in[7], 1); +} + +static INLINE void iidtx4_sse2(__m128i *in) { + const __m128i v_scale_w = _mm_set1_epi16(Sqrt2); + + const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w); + const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w); + const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w); + const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w); + + const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w); + const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w); + const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w); + const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w); + + in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS)); + in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS)); +} + +// load 8x8 array +static INLINE void flip_buffer_lr_8x8(__m128i *in) { + in[0] = mm_reverse_epi16(in[0]); + in[1] = mm_reverse_epi16(in[1]); + in[2] = mm_reverse_epi16(in[2]); + in[3] = mm_reverse_epi16(in[3]); + in[4] = mm_reverse_epi16(in[4]); + in[5] = mm_reverse_epi16(in[5]); + in[6] = mm_reverse_epi16(in[6]); + in[7] = mm_reverse_epi16(in[7]); +} +#endif // CONFIG_EXT_TX + +void av1_iht8x16_128_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + __m128i in[16]; + + in[0] = load_input_data(input + 0 * 8); + in[1] = load_input_data(input + 1 * 8); + in[2] = load_input_data(input + 2 * 8); + in[3] = load_input_data(input + 3 * 8); + in[4] = load_input_data(input + 4 * 8); + in[5] = load_input_data(input + 5 * 8); + in[6] = load_input_data(input + 6 * 8); + in[7] = load_input_data(input + 7 * 8); + + in[8] = load_input_data(input + 8 * 8); + in[9] = load_input_data(input + 9 * 8); + in[10] = load_input_data(input + 10 * 8); + in[11] = load_input_data(input + 11 * 8); + in[12] = load_input_data(input + 12 * 8); + in[13] = load_input_data(input + 13 * 8); + in[14] = load_input_data(input + 14 * 8); + in[15] = load_input_data(input + 15 * 8); + + // Row transform + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: +#if CONFIG_EXT_TX + case FLIPADST_DCT: + case H_DCT: +#endif + aom_idct8_sse2(in); + array_transpose_8x8(in, in); + aom_idct8_sse2(in + 8); + array_transpose_8x8(in + 8, in + 8); + break; + case DCT_ADST: + case ADST_ADST: +#if CONFIG_EXT_TX + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + case H_ADST: + case H_FLIPADST: +#endif + aom_iadst8_sse2(in); + array_transpose_8x8(in, in); + aom_iadst8_sse2(in + 8); + array_transpose_8x8(in + 8, in + 8); + break; +#if CONFIG_EXT_TX + case V_FLIPADST: + case V_ADST: + case V_DCT: + case IDTX: + iidtx8_sse2(in); + iidtx8_sse2(in + 8); + break; +#endif + default: assert(0); break; + } + scale_sqrt2_8x8(in); + scale_sqrt2_8x8(in + 8); + + // Column transform + switch (tx_type) { + case DCT_DCT: + case DCT_ADST: +#if CONFIG_EXT_TX + case DCT_FLIPADST: + case V_DCT: +#endif + idct16_8col(in); + break; + case ADST_DCT: + case ADST_ADST: +#if CONFIG_EXT_TX + case FLIPADST_ADST: + case ADST_FLIPADST: + case FLIPADST_FLIPADST: + case FLIPADST_DCT: + case V_ADST: + case V_FLIPADST: +#endif + iadst16_8col(in); + break; +#if CONFIG_EXT_TX + case H_DCT: + case H_ADST: + case H_FLIPADST: + case IDTX: idtx16_8col(in); break; +#endif + default: assert(0); break; + } + + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: +#if CONFIG_EXT_TX + case H_DCT: +#endif + case DCT_ADST: + case ADST_ADST: +#if CONFIG_EXT_TX + case H_ADST: + case V_ADST: + case V_DCT: + case IDTX: +#endif + write_buffer_8x16(dest, in, stride); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + case FLIPADST_ADST: + case V_FLIPADST: write_buffer_8x16(dest + stride * 15, in, -stride); break; + case DCT_FLIPADST: + case ADST_FLIPADST: + case H_FLIPADST: + flip_buffer_lr_8x8(in); + flip_buffer_lr_8x8(in + 8); + write_buffer_8x16(dest, in, stride); + break; + case FLIPADST_FLIPADST: + flip_buffer_lr_8x8(in); + flip_buffer_lr_8x8(in + 8); + write_buffer_8x16(dest + stride * 15, in, -stride); + break; +#endif + default: assert(0); break; + } +} + +static INLINE void write_buffer_8x8_round6(uint8_t *dest, __m128i *in, + int stride) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + const __m128i zero = _mm_setzero_si128(); + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + + RECON_AND_STORE(dest + 0 * stride, in[0]); + RECON_AND_STORE(dest + 1 * stride, in[1]); + RECON_AND_STORE(dest + 2 * stride, in[2]); + RECON_AND_STORE(dest + 3 * stride, in[3]); + RECON_AND_STORE(dest + 4 * stride, in[4]); + RECON_AND_STORE(dest + 5 * stride, in[5]); + RECON_AND_STORE(dest + 6 * stride, in[6]); + RECON_AND_STORE(dest + 7 * stride, in[7]); +} + +void av1_iht16x8_128_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + __m128i in[16]; + + // Transpose 16x8 input into in[] + in[0] = load_input_data(input + 0 * 16); + in[1] = load_input_data(input + 1 * 16); + in[2] = load_input_data(input + 2 * 16); + in[3] = load_input_data(input + 3 * 16); + in[4] = load_input_data(input + 4 * 16); + in[5] = load_input_data(input + 5 * 16); + in[6] = load_input_data(input + 6 * 16); + in[7] = load_input_data(input + 7 * 16); + array_transpose_8x8(in, in); + + in[8] = load_input_data(input + 8 + 0 * 16); + in[9] = load_input_data(input + 8 + 1 * 16); + in[10] = load_input_data(input + 8 + 2 * 16); + in[11] = load_input_data(input + 8 + 3 * 16); + in[12] = load_input_data(input + 8 + 4 * 16); + in[13] = load_input_data(input + 8 + 5 * 16); + in[14] = load_input_data(input + 8 + 6 * 16); + in[15] = load_input_data(input + 8 + 7 * 16); + array_transpose_8x8(in + 8, in + 8); + + // Row transform + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: +#if CONFIG_EXT_TX + case FLIPADST_DCT: + case H_DCT: +#endif + idct16_8col(in); + break; + case DCT_ADST: + case ADST_ADST: +#if CONFIG_EXT_TX + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + case H_ADST: + case H_FLIPADST: +#endif + iadst16_8col(in); + break; +#if CONFIG_EXT_TX + case V_FLIPADST: + case V_ADST: + case V_DCT: + case IDTX: idtx16_8col(in); break; +#endif + default: assert(0); break; + } + + // Scale + scale_sqrt2_8x8(in); + scale_sqrt2_8x8(in + 8); + + // Column transform + switch (tx_type) { + case DCT_DCT: + case DCT_ADST: +#if CONFIG_EXT_TX + case DCT_FLIPADST: + case V_DCT: +#endif + aom_idct8_sse2(in); + aom_idct8_sse2(in + 8); + break; + case ADST_DCT: + case ADST_ADST: +#if CONFIG_EXT_TX + case FLIPADST_ADST: + case ADST_FLIPADST: + case FLIPADST_FLIPADST: + case FLIPADST_DCT: + case V_ADST: + case V_FLIPADST: +#endif + aom_iadst8_sse2(in); + aom_iadst8_sse2(in + 8); + break; +#if CONFIG_EXT_TX + case H_DCT: + case H_ADST: + case H_FLIPADST: + case IDTX: + array_transpose_8x8(in, in); + array_transpose_8x8(in + 8, in + 8); + iidtx8_sse2(in); + iidtx8_sse2(in + 8); + break; +#endif + default: assert(0); break; + } + + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: +#if CONFIG_EXT_TX + case H_DCT: + case H_ADST: + case V_ADST: + case V_DCT: + case IDTX: +#endif + write_buffer_8x8_round6(dest, in, stride); + write_buffer_8x8_round6(dest + 8, in + 8, stride); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + case FLIPADST_ADST: + case V_FLIPADST: + write_buffer_8x8_round6(dest + stride * 7, in, -stride); + write_buffer_8x8_round6(dest + stride * 7 + 8, in + 8, -stride); + break; + case DCT_FLIPADST: + case ADST_FLIPADST: + case H_FLIPADST: + flip_buffer_lr_8x8(in); + flip_buffer_lr_8x8(in + 8); + write_buffer_8x8_round6(dest, in + 8, stride); + write_buffer_8x8_round6(dest + 8, in, stride); + break; + case FLIPADST_FLIPADST: + flip_buffer_lr_8x8(in); + flip_buffer_lr_8x8(in + 8); + write_buffer_8x8_round6(dest + stride * 7, in + 8, -stride); + write_buffer_8x8_round6(dest + stride * 7 + 8, in, -stride); + break; +#endif + default: assert(0); break; + } +} + +static INLINE void write_buffer_8x4_round5(uint8_t *dest, __m128i *in, + int stride) { + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + const __m128i zero = _mm_setzero_si128(); + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 5); + in[1] = _mm_srai_epi16(in[1], 5); + in[2] = _mm_srai_epi16(in[2], 5); + in[3] = _mm_srai_epi16(in[3], 5); + + RECON_AND_STORE(dest + 0 * stride, in[0]); + RECON_AND_STORE(dest + 1 * stride, in[1]); + RECON_AND_STORE(dest + 2 * stride, in[2]); + RECON_AND_STORE(dest + 3 * stride, in[3]); +} + +void av1_iht8x4_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + __m128i in[8]; + + in[0] = load_input_data(input + 0 * 8); + in[1] = load_input_data(input + 1 * 8); + in[2] = load_input_data(input + 2 * 8); + in[3] = load_input_data(input + 3 * 8); + + // Row transform + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: +#if CONFIG_EXT_TX + case FLIPADST_DCT: + case H_DCT: +#endif + aom_idct8_sse2(in); + break; + case DCT_ADST: + case ADST_ADST: aom_iadst8_sse2(in); break; +#if CONFIG_EXT_TX + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + case H_ADST: + case H_FLIPADST: aom_iadst8_sse2(in); break; + case V_FLIPADST: + case V_ADST: + case V_DCT: + case IDTX: iidtx8_sse2(in); array_transpose_8x8(in, in); +#endif + break; + default: assert(0); break; + } + + scale_sqrt2_8x8(in); + + // Repack data. We pack into the bottom half of 'in' + // so that the next repacking stage can pack into the + // top half without overwriting anything + in[7] = _mm_unpacklo_epi64(in[6], in[7]); + in[6] = _mm_unpacklo_epi64(in[4], in[5]); + in[5] = _mm_unpacklo_epi64(in[2], in[3]); + in[4] = _mm_unpacklo_epi64(in[0], in[1]); + + // Column transform + switch (tx_type) { + case DCT_DCT: + case DCT_ADST: +#if CONFIG_EXT_TX + case DCT_FLIPADST: + case V_DCT: +#endif + aom_idct4_sse2(in + 4); + aom_idct4_sse2(in + 6); + break; + case ADST_DCT: + case ADST_ADST: +#if CONFIG_EXT_TX + case FLIPADST_ADST: + case ADST_FLIPADST: + case FLIPADST_FLIPADST: + case FLIPADST_DCT: + case V_ADST: + case V_FLIPADST: +#endif + aom_iadst4_sse2(in + 4); + aom_iadst4_sse2(in + 6); + break; +#if CONFIG_EXT_TX + case H_DCT: + case H_ADST: + case H_FLIPADST: + case IDTX: + iidtx4_sse2(in + 4); + array_transpose_4x4(in + 4); + iidtx4_sse2(in + 6); + array_transpose_4x4(in + 6); + break; +#endif + default: assert(0); break; + } + + // Repack data + in[0] = _mm_unpacklo_epi64(in[4], in[6]); + in[1] = _mm_unpackhi_epi64(in[4], in[6]); + in[2] = _mm_unpacklo_epi64(in[5], in[7]); + in[3] = _mm_unpackhi_epi64(in[5], in[7]); + + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: +#if CONFIG_EXT_TX + case H_DCT: + case H_ADST: + case V_ADST: + case V_DCT: + case IDTX: break; + case FLIPADST_DCT: + case FLIPADST_ADST: + case V_FLIPADST: FLIPUD_PTR(dest, stride, 4); break; + case DCT_FLIPADST: + case ADST_FLIPADST: + case H_FLIPADST: + in[0] = mm_reverse_epi16(in[0]); + in[1] = mm_reverse_epi16(in[1]); + in[2] = mm_reverse_epi16(in[2]); + in[3] = mm_reverse_epi16(in[3]); + break; + case FLIPADST_FLIPADST: + in[0] = mm_reverse_epi16(in[0]); + in[1] = mm_reverse_epi16(in[1]); + in[2] = mm_reverse_epi16(in[2]); + in[3] = mm_reverse_epi16(in[3]); + FLIPUD_PTR(dest, stride, 4); +#endif + break; + default: assert(0); break; + } + write_buffer_8x4_round5(dest, in, stride); +} + +static INLINE void write_buffer_4x8_round5(uint8_t *dest, __m128i *in, + int stride) { + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + const __m128i zero = _mm_setzero_si128(); + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 5); + in[1] = _mm_srai_epi16(in[1], 5); + in[2] = _mm_srai_epi16(in[2], 5); + in[3] = _mm_srai_epi16(in[3], 5); + + // Reconstruction and Store + { + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0)); + __m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1)); + __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); + __m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)); + __m128i d4 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 4)); + __m128i d5 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 5)); + __m128i d6 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 6)); + __m128i d7 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 7)); + + d0 = _mm_unpacklo_epi32(d0, d1); + d2 = _mm_unpacklo_epi32(d2, d3); + d4 = _mm_unpacklo_epi32(d4, d5); + d6 = _mm_unpacklo_epi32(d6, d7); + d0 = _mm_unpacklo_epi8(d0, zero); + d2 = _mm_unpacklo_epi8(d2, zero); + d4 = _mm_unpacklo_epi8(d4, zero); + d6 = _mm_unpacklo_epi8(d6, zero); + d0 = _mm_add_epi16(d0, in[0]); + d2 = _mm_add_epi16(d2, in[1]); + d4 = _mm_add_epi16(d4, in[2]); + d6 = _mm_add_epi16(d6, in[3]); + + d0 = _mm_packus_epi16(d0, d2); + *(int *)dest = _mm_cvtsi128_si32(d0); + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); + d0 = _mm_packus_epi16(d4, d6); + *(int *)(dest + stride * 4) = _mm_cvtsi128_si32(d0); + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 5) = _mm_cvtsi128_si32(d0); + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 6) = _mm_cvtsi128_si32(d0); + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 7) = _mm_cvtsi128_si32(d0); + } +} + +void av1_iht4x8_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + __m128i in[8]; + + // Load rows, packed two per element of 'in'. + // We pack into the bottom half of 'in' so that the + // later repacking stage can pack into the + // top half without overwriting anything + in[4] = load_input_data(input + 0 * 8); + in[5] = load_input_data(input + 1 * 8); + in[6] = load_input_data(input + 2 * 8); + in[7] = load_input_data(input + 3 * 8); + + // Row transform + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: +#if CONFIG_EXT_TX + case FLIPADST_DCT: + case H_DCT: +#endif + aom_idct4_sse2(in + 4); + aom_idct4_sse2(in + 6); + break; + case DCT_ADST: + case ADST_ADST: +#if CONFIG_EXT_TX + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + case H_ADST: + case H_FLIPADST: +#endif + aom_iadst4_sse2(in + 4); + aom_iadst4_sse2(in + 6); + break; +#if CONFIG_EXT_TX + case V_FLIPADST: + case V_ADST: + case V_DCT: + case IDTX: + iidtx4_sse2(in + 4); + array_transpose_4x4(in + 4); + iidtx4_sse2(in + 6); + array_transpose_4x4(in + 6); + break; +#endif + default: assert(0); break; + } + + scale_sqrt2_8x4(in + 4); + + // Repack data + in[0] = _mm_unpacklo_epi64(in[4], in[6]); + in[1] = _mm_unpackhi_epi64(in[4], in[6]); + in[2] = _mm_unpacklo_epi64(in[5], in[7]); + in[3] = _mm_unpackhi_epi64(in[5], in[7]); + + // Column transform + switch (tx_type) { + case DCT_DCT: + case DCT_ADST: +#if CONFIG_EXT_TX + case DCT_FLIPADST: + case V_DCT: +#endif + aom_idct8_sse2(in); + break; + case ADST_DCT: + case ADST_ADST: +#if CONFIG_EXT_TX + case FLIPADST_ADST: + case ADST_FLIPADST: + case FLIPADST_FLIPADST: + case FLIPADST_DCT: + case V_ADST: + case V_FLIPADST: +#endif + aom_iadst8_sse2(in); + break; +#if CONFIG_EXT_TX + case H_DCT: + case H_ADST: + case H_FLIPADST: + case IDTX: + iidtx8_sse2(in); + array_transpose_8x8(in, in); + break; +#endif + default: assert(0); break; + } + + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: +#if CONFIG_EXT_TX + case H_DCT: + case H_ADST: + case V_ADST: + case V_DCT: + case IDTX: +#endif + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + case FLIPADST_ADST: + case V_FLIPADST: FLIPUD_PTR(dest, stride, 8); break; + case DCT_FLIPADST: + case ADST_FLIPADST: + case H_FLIPADST: + in[0] = _mm_shufflelo_epi16(in[0], 0x1b); + in[1] = _mm_shufflelo_epi16(in[1], 0x1b); + in[2] = _mm_shufflelo_epi16(in[2], 0x1b); + in[3] = _mm_shufflelo_epi16(in[3], 0x1b); + in[4] = _mm_shufflelo_epi16(in[4], 0x1b); + in[5] = _mm_shufflelo_epi16(in[5], 0x1b); + in[6] = _mm_shufflelo_epi16(in[6], 0x1b); + in[7] = _mm_shufflelo_epi16(in[7], 0x1b); + break; + case FLIPADST_FLIPADST: + in[0] = _mm_shufflelo_epi16(in[0], 0x1b); + in[1] = _mm_shufflelo_epi16(in[1], 0x1b); + in[2] = _mm_shufflelo_epi16(in[2], 0x1b); + in[3] = _mm_shufflelo_epi16(in[3], 0x1b); + in[4] = _mm_shufflelo_epi16(in[4], 0x1b); + in[5] = _mm_shufflelo_epi16(in[5], 0x1b); + in[6] = _mm_shufflelo_epi16(in[6], 0x1b); + in[7] = _mm_shufflelo_epi16(in[7], 0x1b); + FLIPUD_PTR(dest, stride, 8); + break; +#endif + default: assert(0); break; + } + in[0] = _mm_unpacklo_epi64(in[0], in[1]); + in[1] = _mm_unpacklo_epi64(in[2], in[3]); + in[2] = _mm_unpacklo_epi64(in[4], in[5]); + in[3] = _mm_unpacklo_epi64(in[6], in[7]); + write_buffer_4x8_round5(dest, in, stride); +} + +// Note: The 16-column 32-element transforms take input in the form of four +// 8x16 blocks (each stored as a __m128i[16]), which are the four quadrants +// of the overall 16x32 input buffer. +static INLINE void idct32_16col(__m128i *tl, __m128i *tr, __m128i *bl, + __m128i *br) { + array_transpose_16x16(tl, tr); + array_transpose_16x16(bl, br); + idct32_8col(tl, bl); + idct32_8col(tr, br); +} + +static INLINE void ihalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl, + __m128i *br) { + __m128i tmpl[16], tmpr[16]; + int i; + + // Copy the top half of the input to temporary storage + for (i = 0; i < 16; ++i) { + tmpl[i] = tl[i]; + tmpr[i] = tr[i]; + } + + // Generate the top half of the output + for (i = 0; i < 16; ++i) { + tl[i] = _mm_slli_epi16(bl[i], 2); + tr[i] = _mm_slli_epi16(br[i], 2); + } + array_transpose_16x16(tl, tr); + + // Copy the temporary storage back to the bottom half of the input + for (i = 0; i < 16; ++i) { + bl[i] = tmpl[i]; + br[i] = tmpr[i]; + } + + // Generate the bottom half of the output + scale_sqrt2_8x16(bl); + scale_sqrt2_8x16(br); + aom_idct16_sse2(bl, br); // Includes a transposition +} + +#if CONFIG_EXT_TX +static INLINE void iidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl, + __m128i *br) { + int i; + array_transpose_16x16(tl, tr); + array_transpose_16x16(bl, br); + for (i = 0; i < 16; ++i) { + tl[i] = _mm_slli_epi16(tl[i], 2); + tr[i] = _mm_slli_epi16(tr[i], 2); + bl[i] = _mm_slli_epi16(bl[i], 2); + br[i] = _mm_slli_epi16(br[i], 2); + } +} +#endif // CONFIG_EXT_TX + +static INLINE void write_buffer_16x32_round6(uint8_t *dest, __m128i *intl, + __m128i *intr, __m128i *inbl, + __m128i *inbr, int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + int i; + + for (i = 0; i < 16; ++i) { + intl[i] = _mm_adds_epi16(intl[i], final_rounding); + intr[i] = _mm_adds_epi16(intr[i], final_rounding); + inbl[i] = _mm_adds_epi16(inbl[i], final_rounding); + inbr[i] = _mm_adds_epi16(inbr[i], final_rounding); + intl[i] = _mm_srai_epi16(intl[i], 6); + intr[i] = _mm_srai_epi16(intr[i], 6); + inbl[i] = _mm_srai_epi16(inbl[i], 6); + inbr[i] = _mm_srai_epi16(inbr[i], 6); + RECON_AND_STORE(dest + i * stride + 0, intl[i]); + RECON_AND_STORE(dest + i * stride + 8, intr[i]); + RECON_AND_STORE(dest + (i + 16) * stride + 0, inbl[i]); + RECON_AND_STORE(dest + (i + 16) * stride + 8, inbr[i]); + } +} + +void av1_iht16x32_512_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + __m128i intl[16], intr[16], inbl[16], inbr[16]; + + int i; + for (i = 0; i < 16; ++i) { + intl[i] = load_input_data(input + i * 16 + 0); + intr[i] = load_input_data(input + i * 16 + 8); + inbl[i] = load_input_data(input + (i + 16) * 16 + 0); + inbr[i] = load_input_data(input + (i + 16) * 16 + 8); + } + + // Row transform + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: +#if CONFIG_EXT_TX + case FLIPADST_DCT: + case H_DCT: +#endif + aom_idct16_sse2(intl, intr); + aom_idct16_sse2(inbl, inbr); + break; + case DCT_ADST: + case ADST_ADST: +#if CONFIG_EXT_TX + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + case H_ADST: + case H_FLIPADST: +#endif + aom_iadst16_sse2(intl, intr); + aom_iadst16_sse2(inbl, inbr); + break; +#if CONFIG_EXT_TX + case V_FLIPADST: + case V_ADST: + case V_DCT: + case IDTX: + iidtx16_sse2(intl, intr); + iidtx16_sse2(inbl, inbr); + break; +#endif + default: assert(0); break; + } + + scale_sqrt2_8x16(intl); + scale_sqrt2_8x16(intr); + scale_sqrt2_8x16(inbl); + scale_sqrt2_8x16(inbr); + + // Column transform + switch (tx_type) { + case DCT_DCT: + case DCT_ADST: +#if CONFIG_EXT_TX + case DCT_FLIPADST: + case V_DCT: +#endif + idct32_16col(intl, intr, inbl, inbr); + break; + case ADST_DCT: + case ADST_ADST: +#if CONFIG_EXT_TX + case FLIPADST_ADST: + case ADST_FLIPADST: + case FLIPADST_FLIPADST: + case FLIPADST_DCT: + case V_ADST: + case V_FLIPADST: +#endif + ihalfright32_16col(intl, intr, inbl, inbr); + break; +#if CONFIG_EXT_TX + case H_DCT: + case H_ADST: + case H_FLIPADST: + case IDTX: iidtx32_16col(intl, intr, inbl, inbr); break; +#endif + default: assert(0); break; + } + + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: +#if CONFIG_EXT_TX + case H_DCT: + case H_ADST: + case V_ADST: + case V_DCT: + case IDTX: +#endif + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + case FLIPADST_ADST: + case V_FLIPADST: FLIPUD_PTR(dest, stride, 32); break; + case DCT_FLIPADST: + case ADST_FLIPADST: + case H_FLIPADST: + for (i = 0; i < 16; ++i) { + __m128i tmp = intl[i]; + intl[i] = mm_reverse_epi16(intr[i]); + intr[i] = mm_reverse_epi16(tmp); + tmp = inbl[i]; + inbl[i] = mm_reverse_epi16(inbr[i]); + inbr[i] = mm_reverse_epi16(tmp); + } + break; + case FLIPADST_FLIPADST: + for (i = 0; i < 16; ++i) { + __m128i tmp = intl[i]; + intl[i] = mm_reverse_epi16(intr[i]); + intr[i] = mm_reverse_epi16(tmp); + tmp = inbl[i]; + inbl[i] = mm_reverse_epi16(inbr[i]); + inbr[i] = mm_reverse_epi16(tmp); + } + FLIPUD_PTR(dest, stride, 32); + break; +#endif + default: assert(0); break; + } + write_buffer_16x32_round6(dest, intl, intr, inbl, inbr, stride); +} + +static INLINE void write_buffer_32x16_round6(uint8_t *dest, __m128i *in0, + __m128i *in1, __m128i *in2, + __m128i *in3, int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + int i; + + for (i = 0; i < 16; ++i) { + in0[i] = _mm_adds_epi16(in0[i], final_rounding); + in1[i] = _mm_adds_epi16(in1[i], final_rounding); + in2[i] = _mm_adds_epi16(in2[i], final_rounding); + in3[i] = _mm_adds_epi16(in3[i], final_rounding); + in0[i] = _mm_srai_epi16(in0[i], 6); + in1[i] = _mm_srai_epi16(in1[i], 6); + in2[i] = _mm_srai_epi16(in2[i], 6); + in3[i] = _mm_srai_epi16(in3[i], 6); + RECON_AND_STORE(dest + i * stride + 0, in0[i]); + RECON_AND_STORE(dest + i * stride + 8, in1[i]); + RECON_AND_STORE(dest + i * stride + 16, in2[i]); + RECON_AND_STORE(dest + i * stride + 24, in3[i]); + } +} + +void av1_iht32x16_512_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + __m128i in0[16], in1[16], in2[16], in3[16]; + int i; + + for (i = 0; i < 16; ++i) { + in0[i] = load_input_data(input + i * 32 + 0); + in1[i] = load_input_data(input + i * 32 + 8); + in2[i] = load_input_data(input + i * 32 + 16); + in3[i] = load_input_data(input + i * 32 + 24); + } + + // Row transform + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: +#if CONFIG_EXT_TX + case FLIPADST_DCT: + case H_DCT: +#endif + idct32_16col(in0, in1, in2, in3); + break; + case DCT_ADST: + case ADST_ADST: +#if CONFIG_EXT_TX + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + case H_ADST: + case H_FLIPADST: +#endif + ihalfright32_16col(in0, in1, in2, in3); + break; +#if CONFIG_EXT_TX + case V_FLIPADST: + case V_ADST: + case V_DCT: + case IDTX: iidtx32_16col(in0, in1, in2, in3); break; +#endif + default: assert(0); break; + } + + scale_sqrt2_8x16(in0); + scale_sqrt2_8x16(in1); + scale_sqrt2_8x16(in2); + scale_sqrt2_8x16(in3); + + // Column transform + switch (tx_type) { + case DCT_DCT: + case DCT_ADST: +#if CONFIG_EXT_TX + case DCT_FLIPADST: + case V_DCT: +#endif + aom_idct16_sse2(in0, in1); + aom_idct16_sse2(in2, in3); + break; + case ADST_DCT: + case ADST_ADST: +#if CONFIG_EXT_TX + case FLIPADST_ADST: + case ADST_FLIPADST: + case FLIPADST_FLIPADST: + case FLIPADST_DCT: + case V_ADST: + case V_FLIPADST: +#endif + aom_iadst16_sse2(in0, in1); + aom_iadst16_sse2(in2, in3); + break; +#if CONFIG_EXT_TX + case H_DCT: + case H_ADST: + case H_FLIPADST: + case IDTX: + iidtx16_sse2(in0, in1); + iidtx16_sse2(in2, in3); + break; +#endif + default: assert(0); break; + } + + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: +#if CONFIG_EXT_TX + case H_DCT: + case H_ADST: + case V_ADST: + case V_DCT: + case IDTX: +#endif + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + case FLIPADST_ADST: + case V_FLIPADST: FLIPUD_PTR(dest, stride, 16); break; + case DCT_FLIPADST: + case ADST_FLIPADST: + case H_FLIPADST: + for (i = 0; i < 16; ++i) { + __m128i tmp1 = in0[i]; + __m128i tmp2 = in1[i]; + in0[i] = mm_reverse_epi16(in3[i]); + in1[i] = mm_reverse_epi16(in2[i]); + in2[i] = mm_reverse_epi16(tmp2); + in3[i] = mm_reverse_epi16(tmp1); + } + break; + case FLIPADST_FLIPADST: + for (i = 0; i < 16; ++i) { + __m128i tmp1 = in0[i]; + __m128i tmp2 = in1[i]; + in0[i] = mm_reverse_epi16(in3[i]); + in1[i] = mm_reverse_epi16(in2[i]); + in2[i] = mm_reverse_epi16(tmp2); + in3[i] = mm_reverse_epi16(tmp1); + } + FLIPUD_PTR(dest, stride, 16); + break; +#endif + default: assert(0); break; + } + write_buffer_32x16_round6(dest, in0, in1, in2, in3, stride); +} |