diff options
Diffstat (limited to 'third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm')
-rw-r--r-- | third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm | 272 |
1 files changed, 272 insertions, 0 deletions
diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm new file mode 100644 index 000000000..39d4ca674 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm @@ -0,0 +1,272 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_1: times 8 dw 1 + +SECTION .text + +%macro QUANTIZE_FN 2 +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ + shift, qcoeff, dqcoeff, dequant, \ + eob, scan, iscan + + ; actual quantize loop - setup pointers, rounders, etc. + movifnidn coeffq, coeffmp + movifnidn ncoeffq, ncoeffmp + movifnidn zbinq, zbinmp + movifnidn roundq, roundmp + movifnidn quantq, quantmp + movifnidn dequantq, dequantmp + mova m0, [zbinq] ; m0 = zbin + mova m1, [roundq] ; m1 = round + mova m2, [quantq] ; m2 = quant +%ifidn %1, b_32x32 + pcmpeqw m5, m5 + psrlw m5, 15 + paddw m0, m5 + paddw m1, m5 + psrlw m0, 1 ; m0 = (m0 + 1) / 2 + psrlw m1, 1 ; m1 = (m1 + 1) / 2 +%endif + mova m3, [dequantq] ; m3 = dequant + mov r2, shiftmp + psubw m0, [GLOBAL(pw_1)] + mova m4, [r2] ; m4 = shift + mov r3, qcoeffmp + mov r4, dqcoeffmp + mov r5, iscanmp +%ifidn %1, b_32x32 + psllw m4, 1 +%endif + pxor m5, m5 ; m5 = dedicated zero + DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob + lea coeffq, [ coeffq+ncoeffq*4] + lea qcoeffq, [ qcoeffq+ncoeffq*4] + lea dqcoeffq, [dqcoeffq+ncoeffq*4] + lea iscanq, [ iscanq+ncoeffq*2] + neg ncoeffq + + ; get DC and first 15 AC coeffs + ; coeff stored as 32bit numbers & require 16bit numbers + mova m9, [ coeffq+ncoeffq*4+ 0] + packssdw m9, [ coeffq+ncoeffq*4+16] + mova m10, [ coeffq+ncoeffq*4+32] + packssdw m10, [ coeffq+ncoeffq*4+48] + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin + punpckhqdq m0, m0 + pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin + paddsw m6, m1 ; m6 += round + punpckhqdq m1, m1 + paddsw m11, m1 ; m11 += round + pmulhw m8, m6, m2 ; m8 = m6*q>>16 + punpckhqdq m2, m2 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + paddw m8, m6 ; m8 += m6 + paddw m13, m11 ; m13 += m11 + pmulhw m8, m4 ; m8 = m8*qsh>>16 + punpckhqdq m4, m4 + pmulhw m13, m4 ; m13 = m13*qsh>>16 + psignw m8, m9 ; m8 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + pand m8, m7 + pand m13, m12 + + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + mova m11, m8 + mova m6, m8 + pcmpgtw m5, m8 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+ 0], m11 + mova [qcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+32], m11 + mova [qcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 ; reset m5 to zero register + +%ifidn %1, b_32x32 + pabsw m8, m8 + pabsw m13, m13 +%endif + pmullw m8, m3 ; dqc[i] = qc[i] * q + punpckhqdq m3, m3 + pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, b_32x32 + psrlw m8, 1 + psrlw m13, 1 + psignw m8, m9 + psignw m13, m10 +%endif + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + mova m11, m8 + mova m6, m8 + pcmpgtw m5, m8 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+ 0], m11 + mova [dqcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+32], m11 + mova [dqcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 ; reset m5 to zero register + pcmpeqw m8, m5 ; m8 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m12 ; m11 = scan[i] + 1 + pandn m8, m6 ; m8 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m13 + add ncoeffq, mmsize + jz .accumulate_eob + +.ac_only_loop: + ; pack coeff from 32bit to 16bit array + mova m9, [ coeffq+ncoeffq*4+ 0] + packssdw m9, [ coeffq+ncoeffq*4+16] + mova m10, [ coeffq+ncoeffq*4+32] + packssdw m10, [ coeffq+ncoeffq*4+48] + + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin + pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin +%ifidn %1, b_32x32 + pmovmskb r6d, m7 + pmovmskb r2d, m12 + or r6, r2 + jz .skip_iter +%endif + paddsw m6, m1 ; m6 += round + paddsw m11, m1 ; m11 += round + pmulhw m14, m6, m2 ; m14 = m6*q>>16 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + paddw m14, m6 ; m14 += m6 + paddw m13, m11 ; m13 += m11 + pmulhw m14, m4 ; m14 = m14*qsh>>16 + pmulhw m13, m4 ; m13 = m13*qsh>>16 + psignw m14, m9 ; m14 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + pand m14, m7 + pand m13, m12 + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + pxor m11, m11 + mova m11, m14 + mova m6, m14 + pcmpgtw m5, m14 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+ 0], m11 + mova [qcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+32], m11 + mova [qcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 ; reset m5 to zero register + +%ifidn %1, b_32x32 + pabsw m14, m14 + pabsw m13, m13 +%endif + pmullw m14, m3 ; dqc[i] = qc[i] * q + pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, b_32x32 + psrlw m14, 1 + psrlw m13, 1 + psignw m14, m9 + psignw m13, m10 +%endif + + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + mova m11, m14 + mova m6, m14 + pcmpgtw m5, m14 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+ 0], m11 + mova [dqcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+32], m11 + mova [dqcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 + + pcmpeqw m14, m5 ; m14 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m12 ; m11 = scan[i] + 1 + pandn m14, m6 ; m14 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m14 + pmaxsw m8, m13 + add ncoeffq, mmsize + jl .ac_only_loop + +%ifidn %1, b_32x32 + jmp .accumulate_eob +.skip_iter: + mova [qcoeffq+ncoeffq*4+ 0], m5 + mova [qcoeffq+ncoeffq*4+16], m5 + mova [qcoeffq+ncoeffq*4+32], m5 + mova [qcoeffq+ncoeffq*4+48], m5 + mova [dqcoeffq+ncoeffq*4+ 0], m5 + mova [dqcoeffq+ncoeffq*4+16], m5 + mova [dqcoeffq+ncoeffq*4+32], m5 + mova [dqcoeffq+ncoeffq*4+48], m5 + add ncoeffq, mmsize + jl .ac_only_loop +%endif + +.accumulate_eob: + ; horizontally accumulate/max eobs and write into [eob] memory pointer + mov r2, eobmp + pshufd m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0x1 + pmaxsw m8, m7 + pextrw r6, m8, 0 + mov [r2], r6 + RET +%endmacro + +INIT_XMM ssse3 +QUANTIZE_FN b, 9 +QUANTIZE_FN b_32x32, 9 |