;
;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

;  This file is a duplicate of mfqe_sse2.asm in VP8.
;  TODO(jackychen): Find a way to fix the duplicate.
%include "vpx_ports/x86_abi_support.asm"

;void vp9_filter_by_weight16x16_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
global sym(vp9_filter_by_weight16x16_sse2) PRIVATE
sym(vp9_filter_by_weight16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 16                     ; loop count
    pxor        xmm6, xmm6

.combine
    movdqa      xmm2, [rax]
    movdqa      xmm4, [rdx]
    add         rax, rsi

    ; src * src_weight
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm6
    punpckhbw   xmm3, xmm6
    pmullw      xmm2, xmm0
    pmullw      xmm3, xmm0

    ; dst * dst_weight
    movdqa      xmm5, xmm4
    punpcklbw   xmm4, xmm6
    punpckhbw   xmm5, xmm6
    pmullw      xmm4, xmm1
    pmullw      xmm5, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    paddw       xmm3, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4
    psrlw       xmm3, 4

    packuswb    xmm2, xmm3
    movdqa      [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp

    ret

;void vp9_filter_by_weight8x8_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
global sym(vp9_filter_by_weight8x8_sse2) PRIVATE
sym(vp9_filter_by_weight8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 8                      ; loop count
    pxor        xmm4, xmm4

.combine
    movq        xmm2, [rax]
    movq        xmm3, [rdx]
    add         rax, rsi

    ; src * src_weight
    punpcklbw   xmm2, xmm4
    pmullw      xmm2, xmm0

    ; dst * dst_weight
    punpcklbw   xmm3, xmm4
    pmullw      xmm3, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm3
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4

    packuswb    xmm2, xmm4
    movq        [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp

    ret

;void vp9_variance_and_sad_16x16_sse2 | arg
;(
;    unsigned char *src1,          0
;    int            stride1,       1
;    unsigned char *src2,          2
;    int            stride2,       3
;    unsigned int  *variance,      4
;    unsigned int  *sad,           5
;)
global sym(vp9_variance_and_sad_16x16_sse2) PRIVATE
sym(vp9_variance_and_sad_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax,        arg(0)          ; src1
    mov         rcx,        arg(1)          ; stride1
    mov         rdx,        arg(2)          ; src2
    mov         rdi,        arg(3)          ; stride2

    mov         rsi,        16              ; block height

    ; Prep accumulator registers
    pxor        xmm3, xmm3                  ; SAD
    pxor        xmm4, xmm4                  ; sum of src2
    pxor        xmm5, xmm5                  ; sum of src2^2

    ; Because we're working with the actual output frames
    ; we can't depend on any kind of data alignment.
.accumulate
    movdqa      xmm0, [rax]                 ; src1
    movdqa      xmm1, [rdx]                 ; src2
    add         rax, rcx                    ; src1 + stride1
    add         rdx, rdi                    ; src2 + stride2

    ; SAD(src1, src2)
    psadbw      xmm0, xmm1
    paddusw     xmm3, xmm0

    ; SUM(src2)
    pxor        xmm2, xmm2
    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
    paddusw     xmm4, xmm2

    ; pmaddubsw would be ideal if it took two unsigned values. instead,
    ; it expects a signed and an unsigned value. so instead we zero extend
    ; and operate on words.
    pxor        xmm2, xmm2
    movdqa      xmm0, xmm1
    punpcklbw   xmm0, xmm2
    punpckhbw   xmm1, xmm2
    pmaddwd     xmm0, xmm0
    pmaddwd     xmm1, xmm1
    paddd       xmm5, xmm0
    paddd       xmm5, xmm1

    sub         rsi,        1
    jnz         .accumulate

    ; phaddd only operates on adjacent double words.
    ; Finalize SAD and store
    movdqa      xmm0, xmm3
    psrldq      xmm0, 8
    paddusw     xmm0, xmm3
    paddd       xmm0, [GLOBAL(t128)]
    psrld       xmm0, 8

    mov         rax,  arg(5)
    movd        [rax], xmm0

    ; Accumulate sum of src2
    movdqa      xmm0, xmm4
    psrldq      xmm0, 8
    paddusw     xmm0, xmm4
    ; Square src2. Ignore high value
    pmuludq     xmm0, xmm0
    psrld       xmm0, 8

    ; phaddw could be used to sum adjacent values but we want
    ; all the values summed. promote to doubles, accumulate,
    ; shift and sum
    pxor        xmm2, xmm2
    movdqa      xmm1, xmm5
    punpckldq   xmm1, xmm2
    punpckhdq   xmm5, xmm2
    paddd       xmm1, xmm5
    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2

    psubd       xmm1, xmm0

    ; (variance + 128) >> 8
    paddd       xmm1, [GLOBAL(t128)]
    psrld       xmm1, 8
    mov         rax,  arg(4)

    movd        [rax], xmm1


    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
t128:
%ifndef __NASM_VER__
    ddq 128
%elif CONFIG_BIG_ENDIAN
    dq  0, 128
%else
    dq  128, 0
%endif
align 16
tMFQE: ; 1 << MFQE_PRECISION
    times 8 dw 0x10
align 16
tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
    times 8 dw 0x08