summaryrefslogtreecommitdiffstats
path: root/media/libaom/src/av1/encoder/x86/av1_highbd_quantize_sse4.c
diff options
context:
space:
mode:
Diffstat (limited to 'media/libaom/src/av1/encoder/x86/av1_highbd_quantize_sse4.c')
-rw-r--r--media/libaom/src/av1/encoder/x86/av1_highbd_quantize_sse4.c195
1 files changed, 195 insertions, 0 deletions
diff --git a/media/libaom/src/av1/encoder/x86/av1_highbd_quantize_sse4.c b/media/libaom/src/av1/encoder/x86/av1_highbd_quantize_sse4.c
new file mode 100644
index 000000000..40b3b460b
--- /dev/null
+++ b/media/libaom/src/av1/encoder/x86/av1_highbd_quantize_sse4.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+#include <stdint.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Coefficient quantization phase 1
+// param[0-2] : rounding/quan/dequan constants
+static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param,
+ const int shift, const int scale,
+ __m128i *qcoeff, __m128i *dquan,
+ __m128i *sign) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi32(1);
+
+ *sign = _mm_cmplt_epi32(*coeff, zero);
+ *sign = _mm_or_si128(*sign, one);
+ *coeff = _mm_abs_epi32(*coeff);
+
+ qcoeff[0] = _mm_add_epi32(*coeff, param[0]);
+ qcoeff[1] = _mm_unpackhi_epi32(qcoeff[0], zero);
+ qcoeff[0] = _mm_unpacklo_epi32(qcoeff[0], zero);
+
+ qcoeff[0] = _mm_mul_epi32(qcoeff[0], param[1]);
+ qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift);
+ dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]);
+ dquan[0] = _mm_srli_epi64(dquan[0], scale);
+ const __m128i abs_s = _mm_slli_epi32(*coeff, 1 + scale);
+ qcoeff[2] = _mm_cmplt_epi32(abs_s, param[3]);
+}
+
+// Coefficient quantization phase 2
+static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan,
+ const __m128i *sign,
+ const __m128i *param, const int shift,
+ const int scale, tran_low_t *qAddr,
+ tran_low_t *dqAddr) {
+ __m128i mask0L = _mm_set_epi32(-1, -1, 0, 0);
+ __m128i mask0H = _mm_set_epi32(0, 0, -1, -1);
+
+ qcoeff[1] = _mm_mul_epi32(qcoeff[1], param[1]);
+ qcoeff[1] = _mm_srli_epi64(qcoeff[1], shift);
+ dquan[1] = _mm_mul_epi32(qcoeff[1], param[2]);
+ dquan[1] = _mm_srli_epi64(dquan[1], scale);
+
+ // combine L&H
+ qcoeff[0] = _mm_shuffle_epi32(qcoeff[0], 0xd8);
+ qcoeff[1] = _mm_shuffle_epi32(qcoeff[1], 0x8d);
+
+ qcoeff[0] = _mm_and_si128(qcoeff[0], mask0H);
+ qcoeff[1] = _mm_and_si128(qcoeff[1], mask0L);
+
+ dquan[0] = _mm_shuffle_epi32(dquan[0], 0xd8);
+ dquan[1] = _mm_shuffle_epi32(dquan[1], 0x8d);
+
+ dquan[0] = _mm_and_si128(dquan[0], mask0H);
+ dquan[1] = _mm_and_si128(dquan[1], mask0L);
+
+ qcoeff[0] = _mm_or_si128(qcoeff[0], qcoeff[1]);
+ dquan[0] = _mm_or_si128(dquan[0], dquan[1]);
+
+ qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign);
+ dquan[0] = _mm_sign_epi32(dquan[0], *sign);
+ qcoeff[0] = _mm_andnot_si128(qcoeff[2], qcoeff[0]);
+ dquan[0] = _mm_andnot_si128(qcoeff[2], dquan[0]);
+ _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]);
+ _mm_storeu_si128((__m128i *)dqAddr, dquan[0]);
+}
+
+static INLINE void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan,
+ __m128i *eob) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, iscanIdx;
+ const __m128i q0 = _mm_loadu_si128((__m128i const *)qcoeff_ptr);
+ const __m128i q1 = _mm_loadu_si128((__m128i const *)(qcoeff_ptr + 4));
+ __m128i nz_flag0 = _mm_cmpeq_epi32(q0, zero);
+ __m128i nz_flag1 = _mm_cmpeq_epi32(q1, zero);
+
+ nz_flag0 = _mm_cmpeq_epi32(nz_flag0, zero);
+ nz_flag1 = _mm_cmpeq_epi32(nz_flag1, zero);
+
+ mask = _mm_packs_epi32(nz_flag0, nz_flag1);
+ iscanIdx = _mm_loadu_si128((__m128i const *)iscan);
+ iscanIdx = _mm_sub_epi16(iscanIdx, mask);
+ iscanIdx = _mm_and_si128(iscanIdx, mask);
+ *eob = _mm_max_epi16(*eob, iscanIdx);
+}
+
+static INLINE uint16_t get_accumulated_eob(__m128i *eob) {
+ __m128i eob_shuffled;
+ uint16_t eobValue;
+ eob_shuffled = _mm_shuffle_epi32(*eob, 0xe);
+ *eob = _mm_max_epi16(*eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(*eob, 0xe);
+ *eob = _mm_max_epi16(*eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(*eob, 0x1);
+ *eob = _mm_max_epi16(*eob, eob_shuffled);
+ eobValue = _mm_extract_epi16(*eob, 0);
+ return eobValue;
+}
+
+void av1_highbd_quantize_fp_sse4_1(
+ const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, int log_scale) {
+ __m128i coeff[2], qcoeff[3], dequant[2], qparam[4], coeff_sign;
+ __m128i eob = _mm_setzero_si128();
+ const tran_low_t *src = coeff_ptr;
+ tran_low_t *quanAddr = qcoeff_ptr;
+ tran_low_t *dquanAddr = dqcoeff_ptr;
+ const int shift = 16 - log_scale;
+ const int coeff_stride = 4;
+ const int quan_stride = coeff_stride;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)scan;
+
+ memset(quanAddr, 0, count * sizeof(quanAddr[0]));
+ memset(dquanAddr, 0, count * sizeof(dquanAddr[0]));
+
+ coeff[0] = _mm_loadu_si128((__m128i const *)src);
+ const int round1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale);
+ const int round0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
+
+ qparam[0] = _mm_set_epi32(round1, round1, round1, round0);
+ qparam[1] = xx_set_64_from_32i(quant_ptr[1], quant_ptr[0]);
+ qparam[2] = xx_set_64_from_32i(dequant_ptr[1], dequant_ptr[0]);
+ qparam[3] = _mm_set_epi32(dequant_ptr[1], dequant_ptr[1], dequant_ptr[1],
+ dequant_ptr[0]);
+
+ // DC and first 3 AC
+ quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+
+ // update round/quan/dquan for AC
+ qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]);
+ qparam[1] = xx_set1_64_from_32i(quant_ptr[1]);
+ qparam[2] = xx_set1_64_from_32i(dequant_ptr[1]);
+ qparam[3] = _mm_set1_epi32(dequant_ptr[1]);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale,
+ quanAddr, dquanAddr);
+
+ // next 4 AC
+ coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
+ quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale,
+ quanAddr + quan_stride, dquanAddr + quan_stride);
+
+ find_eob(quanAddr, iscan, &eob);
+
+ count -= 8;
+
+ // loop for the rest of AC
+ while (count > 0) {
+ src += coeff_stride << 1;
+ quanAddr += quan_stride << 1;
+ dquanAddr += quan_stride << 1;
+ iscan += quan_stride << 1;
+
+ coeff[0] = _mm_loadu_si128((__m128i const *)src);
+ coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
+
+ quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+ log_scale, quanAddr, dquanAddr);
+
+ quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+ log_scale, quanAddr + quan_stride,
+ dquanAddr + quan_stride);
+
+ find_eob(quanAddr, iscan, &eob);
+
+ count -= 8;
+ }
+ *eob_ptr = get_accumulated_eob(&eob);
+}