; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. ; ; %include "third_party/x86inc/x86inc.asm" SECTION_RODATA pw_1: times 8 dw 1 SECTION .text %macro QUANTIZE_FN 2 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ shift, qcoeff, dqcoeff, dequant, \ eob, scan, iscan ; actual quantize loop - setup pointers, rounders, etc. movifnidn coeffq, coeffmp movifnidn ncoeffq, ncoeffmp movifnidn zbinq, zbinmp movifnidn roundq, roundmp movifnidn quantq, quantmp movifnidn dequantq, dequantmp mova m0, [zbinq] ; m0 = zbin mova m1, [roundq] ; m1 = round mova m2, [quantq] ; m2 = quant %ifidn %1, b_32x32 pcmpeqw m5, m5 psrlw m5, 15 paddw m0, m5 paddw m1, m5 psrlw m0, 1 ; m0 = (m0 + 1) / 2 psrlw m1, 1 ; m1 = (m1 + 1) / 2 %endif mova m3, [dequantq] ; m3 = dequant mov r2, shiftmp psubw m0, [GLOBAL(pw_1)] mova m4, [r2] ; m4 = shift mov r3, qcoeffmp mov r4, dqcoeffmp mov r5, iscanmp %ifidn %1, b_32x32 psllw m4, 1 %endif pxor m5, m5 ; m5 = dedicated zero DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob lea coeffq, [ coeffq+ncoeffq*4] lea qcoeffq, [ qcoeffq+ncoeffq*4] lea dqcoeffq, [dqcoeffq+ncoeffq*4] lea iscanq, [ iscanq+ncoeffq*2] neg ncoeffq ; get DC and first 15 AC coeffs ; coeff stored as 32bit numbers & require 16bit numbers mova m9, [ coeffq+ncoeffq*4+ 0] packssdw m9, [ coeffq+ncoeffq*4+16] mova m10, [ coeffq+ncoeffq*4+32] packssdw m10, [ coeffq+ncoeffq*4+48] pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin punpckhqdq m0, m0 pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin paddsw m6, m1 ; m6 += round punpckhqdq m1, m1 paddsw m11, m1 ; m11 += round pmulhw m8, m6, m2 ; m8 = m6*q>>16 punpckhqdq m2, m2 pmulhw m13, m11, m2 ; m13 = m11*q>>16 paddw m8, m6 ; m8 += m6 paddw m13, m11 ; m13 += m11 pmulhw m8, m4 ; m8 = m8*qsh>>16 punpckhqdq m4, m4 pmulhw m13, m4 ; m13 = m13*qsh>>16 psignw m8, m9 ; m8 = reinsert sign psignw m13, m10 ; m13 = reinsert sign pand m8, m7 pand m13, m12 ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff mova m11, m8 mova m6, m8 pcmpgtw m5, m8 punpcklwd m11, m5 punpckhwd m6, m5 mova [qcoeffq+ncoeffq*4+ 0], m11 mova [qcoeffq+ncoeffq*4+16], m6 pxor m5, m5 mova m11, m13 mova m6, m13 pcmpgtw m5, m13 punpcklwd m11, m5 punpckhwd m6, m5 mova [qcoeffq+ncoeffq*4+32], m11 mova [qcoeffq+ncoeffq*4+48], m6 pxor m5, m5 ; reset m5 to zero register %ifidn %1, b_32x32 pabsw m8, m8 pabsw m13, m13 %endif pmullw m8, m3 ; dqc[i] = qc[i] * q punpckhqdq m3, m3 pmullw m13, m3 ; dqc[i] = qc[i] * q %ifidn %1, b_32x32 psrlw m8, 1 psrlw m13, 1 psignw m8, m9 psignw m13, m10 %endif ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff mova m11, m8 mova m6, m8 pcmpgtw m5, m8 punpcklwd m11, m5 punpckhwd m6, m5 mova [dqcoeffq+ncoeffq*4+ 0], m11 mova [dqcoeffq+ncoeffq*4+16], m6 pxor m5, m5 mova m11, m13 mova m6, m13 pcmpgtw m5, m13 punpcklwd m11, m5 punpckhwd m6, m5 mova [dqcoeffq+ncoeffq*4+32], m11 mova [dqcoeffq+ncoeffq*4+48], m6 pxor m5, m5 ; reset m5 to zero register pcmpeqw m8, m5 ; m8 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0 mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] psubw m6, m7 ; m6 = scan[i] + 1 psubw m11, m12 ; m11 = scan[i] + 1 pandn m8, m6 ; m8 = max(eob) pandn m13, m11 ; m13 = max(eob) pmaxsw m8, m13 add ncoeffq, mmsize jz .accumulate_eob .ac_only_loop: ; pack coeff from 32bit to 16bit array mova m9, [ coeffq+ncoeffq*4+ 0] packssdw m9, [ coeffq+ncoeffq*4+16] mova m10, [ coeffq+ncoeffq*4+32] packssdw m10, [ coeffq+ncoeffq*4+48] pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin %ifidn %1, b_32x32 pmovmskb r6d, m7 pmovmskb r2d, m12 or r6, r2 jz .skip_iter %endif paddsw m6, m1 ; m6 += round paddsw m11, m1 ; m11 += round pmulhw m14, m6, m2 ; m14 = m6*q>>16 pmulhw m13, m11, m2 ; m13 = m11*q>>16 paddw m14, m6 ; m14 += m6 paddw m13, m11 ; m13 += m11 pmulhw m14, m4 ; m14 = m14*qsh>>16 pmulhw m13, m4 ; m13 = m13*qsh>>16 psignw m14, m9 ; m14 = reinsert sign psignw m13, m10 ; m13 = reinsert sign pand m14, m7 pand m13, m12 ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff pxor m11, m11 mova m11, m14 mova m6, m14 pcmpgtw m5, m14 punpcklwd m11, m5 punpckhwd m6, m5 mova [qcoeffq+ncoeffq*4+ 0], m11 mova [qcoeffq+ncoeffq*4+16], m6 pxor m5, m5 mova m11, m13 mova m6, m13 pcmpgtw m5, m13 punpcklwd m11, m5 punpckhwd m6, m5 mova [qcoeffq+ncoeffq*4+32], m11 mova [qcoeffq+ncoeffq*4+48], m6 pxor m5, m5 ; reset m5 to zero register %ifidn %1, b_32x32 pabsw m14, m14 pabsw m13, m13 %endif pmullw m14, m3 ; dqc[i] = qc[i] * q pmullw m13, m3 ; dqc[i] = qc[i] * q %ifidn %1, b_32x32 psrlw m14, 1 psrlw m13, 1 psignw m14, m9 psignw m13, m10 %endif ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff mova m11, m14 mova m6, m14 pcmpgtw m5, m14 punpcklwd m11, m5 punpckhwd m6, m5 mova [dqcoeffq+ncoeffq*4+ 0], m11 mova [dqcoeffq+ncoeffq*4+16], m6 pxor m5, m5 mova m11, m13 mova m6, m13 pcmpgtw m5, m13 punpcklwd m11, m5 punpckhwd m6, m5 mova [dqcoeffq+ncoeffq*4+32], m11 mova [dqcoeffq+ncoeffq*4+48], m6 pxor m5, m5 pcmpeqw m14, m5 ; m14 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0 mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] psubw m6, m7 ; m6 = scan[i] + 1 psubw m11, m12 ; m11 = scan[i] + 1 pandn m14, m6 ; m14 = max(eob) pandn m13, m11 ; m13 = max(eob) pmaxsw m8, m14 pmaxsw m8, m13 add ncoeffq, mmsize jl .ac_only_loop %ifidn %1, b_32x32 jmp .accumulate_eob .skip_iter: mova [qcoeffq+ncoeffq*4+ 0], m5 mova [qcoeffq+ncoeffq*4+16], m5 mova [qcoeffq+ncoeffq*4+32], m5 mova [qcoeffq+ncoeffq*4+48], m5 mova [dqcoeffq+ncoeffq*4+ 0], m5 mova [dqcoeffq+ncoeffq*4+16], m5 mova [dqcoeffq+ncoeffq*4+32], m5 mova [dqcoeffq+ncoeffq*4+48], m5 add ncoeffq, mmsize jl .ac_only_loop %endif .accumulate_eob: ; horizontally accumulate/max eobs and write into [eob] memory pointer mov r2, eobmp pshufd m7, m8, 0xe pmaxsw m8, m7 pshuflw m7, m8, 0xe pmaxsw m8, m7 pshuflw m7, m8, 0x1 pmaxsw m8, m7 pextrw r6, m8, 0 mov [r2], r6 RET %endmacro INIT_XMM ssse3 QUANTIZE_FN b, 9 QUANTIZE_FN b_32x32, 9