diff options
Diffstat (limited to 'third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c')
-rw-r--r-- | third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c | 149 |
1 files changed, 72 insertions, 77 deletions
diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c index 5570ca5b7..58e5f98e5 100644 --- a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c @@ -16,7 +16,7 @@ #include "aom_ports/mem.h" void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, - int skip_block, const int16_t *zbin_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, @@ -41,50 +41,48 @@ void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = ((int)count / 4) - 1; i >= 0; i--) { - __m128i coeffs, cmp1, cmp2; - int test; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); - cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); - cmp1 = _mm_and_si128(cmp1, cmp2); - test = _mm_movemask_epi8(cmp1); - if (test == 0xffff) - non_zero_regs--; - else - break; - } + // Pre-scan pass + for (i = ((int)count / 4) - 1; i >= 0; i--) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (test == 0xffff) + non_zero_regs--; + else + break; + } - // Quantization pass: - for (i = 0; i < non_zero_regs; i++) { - __m128i coeffs, coeffs_sign, tmp1, tmp2; - int test; - int abs_coeff[4]; - int coeff_sign[4]; - - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - coeffs_sign = _mm_srai_epi32(coeffs, 31); - coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); - tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); - tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); - tmp1 = _mm_or_si128(tmp1, tmp2); - test = _mm_movemask_epi8(tmp1); - _mm_storeu_si128((__m128i *)abs_coeff, coeffs); - _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); - - for (j = 0; j < 4; j++) { - if (test & (1 << (4 * j))) { - int k = 4 * i + j; - const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; - const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; - const uint32_t abs_qcoeff = - (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); - qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; - dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; - if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; - } + // Quantization pass: + for (i = 0; i < non_zero_regs; i++) { + __m128i coeffs, coeffs_sign, tmp1, tmp2; + int test; + int abs_coeff[4]; + int coeff_sign[4]; + + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + coeffs_sign = _mm_srai_epi32(coeffs, 31); + coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); + tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); + tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); + tmp1 = _mm_or_si128(tmp1, tmp2); + test = _mm_movemask_epi8(tmp1); + _mm_storeu_si128((__m128i *)abs_coeff, coeffs); + _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); + + for (j = 0; j < 4; j++) { + if (test & (1 << (4 * j))) { + int k = 4 * i + j; + const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; + const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; + const uint32_t abs_qcoeff = + (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); + qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; + dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; + if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; } } } @@ -92,8 +90,8 @@ void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, } void aom_highbd_quantize_b_32x32_sse2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -116,38 +114,35 @@ void aom_highbd_quantize_b_32x32_sse2( memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs / 4; i++) { - __m128i coeffs, cmp1, cmp2; - int test; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); - cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); - cmp1 = _mm_and_si128(cmp1, cmp2); - test = _mm_movemask_epi8(cmp1); - if (!(test & 0xf)) idx_arr[idx++] = i * 4; - if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; - if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; - if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; - } + // Pre-scan pass + for (i = 0; i < n_coeffs / 4; i++) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (!(test & 0xf)) idx_arr[idx++] = i * 4; + if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; + if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; + if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; + } - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = idx_arr[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; - } + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = idx_arr[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; } *eob_ptr = eob + 1; } |