third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215

;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;


%include "aom_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; void av1_temporal_filter_apply_sse2 | arg
;  (unsigned char  *frame1,           |  0
;   unsigned int    stride,           |  1
;   unsigned char  *frame2,           |  2
;   unsigned int    block_width,      |  3
;   unsigned int    block_height,     |  4
;   int             strength,         |  5
;   int             filter_weight,    |  6
;   unsigned int   *accumulator,      |  7
;   unsigned short *count)            |  8
;
; For every pixel, 16 pixels per loop iteration:
;   diff      = frame1[i] - frame2[i]
;   modifier  = (3 * diff * diff + rounding) >> strength    (16-bit lanes)
;   modifier  = saturating_sub(16, modifier) * filter_weight
;   count[i]       += modifier                              (16-bit add)
;   accumulator[i] += modifier * frame2[i]                  (32-bit add)
; where rounding = 0x8000 >> (16 - strength).
;
; frame1 is walked with `stride`; frame2, accumulator and count are walked
; contiguously. block_width == 8 consumes two 8-pixel frame1 rows per
; iteration, any other width takes the 16-pixel path on the first pass.
;
; Alignment: frame2, accumulator, count and (on the 16-wide path) every
; frame1 row are accessed with movdqa and therefore must be 16-byte aligned.
;
; NOTE(review): modifier * pixel is computed with pmullw, i.e. only the low
; 16 bits of the product are kept before widening to 32 bits - presumably
; callers keep filter_weight small enough for that to be exact; confirm.
;
; Register roles inside the loop:
;   rsi = frame1 cursor        rdx = frame2 (predictor) cursor
;   rdi = accumulator cursor   rax = count cursor
;   rbp = stride               rcx = frame2 + block_width * block_height
;   xmm7 = zero (for byte->word / word->dword unpacking)
;-----------------------------------------------------------------------
global sym(av1_temporal_filter_apply_sse2) PRIVATE
sym(av1_temporal_filter_apply_sse2):

    ; Prolog helpers below come from aom_ports/x86_abi_support.asm.
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ALIGN_STACK 16, rax
    ; 16-byte local slots, addressed relative to the aligned rsp
    %define block_width    0
    %define block_height  16
    %define strength      32
    %define filter_weight 48
    %define rounding_bit  64
    %define rbp_backup    80
    %define stack_size    96
    sub         rsp,           stack_size
    mov         [rsp + rbp_backup], rbp    ; rbp is reused for stride below
    ; end prolog

        ; 32-bit loads: the arguments are 32-bit, and writing edx clears
        ; the upper half of rdx before the 64-bit store.
        mov         edx,            arg(3)
        mov         [rsp + block_width], rdx
        mov         edx,            arg(4)
        mov         [rsp + block_height], rdx
        movd        xmm6,           arg(5)
        movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read

        ; calculate the rounding bit outside the loop
        ; 0x8000 >> (16 - strength)
        ; strength is a 32-bit int: do the arithmetic in 32 bits so that
        ; undefined upper bits of its 64-bit argument slot never reach the
        ; shift count.
        mov         edx,            16
        sub         edx,            arg(5) ; 16 - strength
        movd        xmm4,           edx    ; psrlw takes its count from xmm/mem
        movdqa      xmm5,           [GLOBAL(_const_top_bit)]
        psrlw       xmm5,           xmm4
        movdqa      [rsp + rounding_bit], xmm5

        mov         rsi,            arg(0) ; src/frame1
        mov         rdx,            arg(2) ; predictor frame
        mov         rdi,            arg(7) ; accumulator
        mov         rax,            arg(8) ; count

        ; dup the filter weight into all 8 word lanes and store for later
        movd        xmm0,           arg(6) ; filter_weight
        pshuflw     xmm0,           xmm0, 0
        punpcklwd   xmm0,           xmm0
        movdqa      [rsp + filter_weight], xmm0

        ; stride is an unsigned int: the 32-bit load zero-extends into rbp.
        ; arg() is rbp-relative, so no argument may be read after this line.
        mov         ebp,            arg(1) ; stride
        pxor        xmm7,           xmm7   ; zero for extraction

        ; rcx = one past the last predictor byte
        mov         rcx,            [rsp + block_width]
        imul        rcx,            [rsp + block_height]
        add         rcx,            rdx
        cmp         dword ptr [rsp + block_width], 8
        jne         .temporal_filter_apply_load_16

.temporal_filter_apply_load_8:
        ; two 8-pixel source rows make up one group of 16
        movq        xmm0,           [rsi]  ; first row
        lea         rsi,            [rsi + rbp] ; += stride
        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
        movq        xmm1,           [rsi]  ; second row
        lea         rsi,            [rsi + rbp] ; += stride
        punpcklbw   xmm1,           xmm7   ; src[ 8-15]
        jmp         .temporal_filter_apply_load_finished

.temporal_filter_apply_load_16:
        ; one 16-pixel source row makes up one group of 16
        movdqa      xmm0,           [rsi]  ; src (frame1)
        lea         rsi,            [rsi + rbp] ; += stride
        movdqa      xmm1,           xmm0
        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
        punpckhbw   xmm1,           xmm7   ; src[ 8-15]

.temporal_filter_apply_load_finished:
        movdqa      xmm2,           [rdx]  ; predictor (frame2)
        movdqa      xmm3,           xmm2
        punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
        punpckhbw   xmm3,           xmm7   ; pred[ 8-15]

        ; modifier = src_byte - pixel_value
        psubw       xmm0,           xmm2   ; src - pred[ 0- 7]
        psubw       xmm1,           xmm3   ; src - pred[ 8-15]

        ; modifier *= modifier
        pmullw      xmm0,           xmm0   ; modifier[ 0- 7]^2
        pmullw      xmm1,           xmm1   ; modifier[ 8-15]^2

        ; modifier *= 3
        pmullw      xmm0,           [GLOBAL(_const_3w)]
        pmullw      xmm1,           [GLOBAL(_const_3w)]

        ; modifier += 0x8000 >> (16 - strength)
        paddw       xmm0,           [rsp + rounding_bit]
        paddw       xmm1,           [rsp + rounding_bit]

        ; modifier >>= strength
        psrlw       xmm0,           [rsp + strength]
        psrlw       xmm1,           [rsp + strength]

        ; modifier = 16 - modifier
        ; saturation takes care of modifier > 16
        movdqa      xmm3,           [GLOBAL(_const_16w)]
        movdqa      xmm2,           [GLOBAL(_const_16w)]
        psubusw     xmm3,           xmm1
        psubusw     xmm2,           xmm0

        ; modifier *= filter_weight
        pmullw      xmm2,           [rsp + filter_weight]
        pmullw      xmm3,           [rsp + filter_weight]

        ; count
        movdqa      xmm4,           [rax]
        movdqa      xmm5,           [rax+16]
        ; += modifier
        paddw       xmm4,           xmm2
        paddw       xmm5,           xmm3
        ; write back
        movdqa      [rax],          xmm4
        movdqa      [rax+16],       xmm5
        lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))

        ; load and extract the predictor up to shorts
        ; (reloaded: xmm2/xmm3 now hold the modifiers)
        pxor        xmm7,           xmm7
        movdqa      xmm0,           [rdx]
        lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))
        movdqa      xmm1,           xmm0
        punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]
        punpckhbw   xmm1,           xmm7   ; pred[ 8-15]

        ; modifier *= pixel_value (low 16 bits of each product)
        pmullw      xmm0,           xmm2
        pmullw      xmm1,           xmm3

        ; expand to double words
        movdqa      xmm2,           xmm0
        punpcklwd   xmm0,           xmm7   ; [ 0- 3]
        punpckhwd   xmm2,           xmm7   ; [ 4- 7]
        movdqa      xmm3,           xmm1
        punpcklwd   xmm1,           xmm7   ; [ 8-11]
        punpckhwd   xmm3,           xmm7   ; [12-15]

        ; accumulator (xmm7 stops being the zero register here)
        movdqa      xmm4,           [rdi]
        movdqa      xmm5,           [rdi+16]
        movdqa      xmm6,           [rdi+32]
        movdqa      xmm7,           [rdi+48]
        ; += modifier
        paddd       xmm4,           xmm0
        paddd       xmm5,           xmm2
        paddd       xmm6,           xmm1
        paddd       xmm7,           xmm3
        ; write back
        movdqa      [rdi],          xmm4
        movdqa      [rdi+16],       xmm5
        movdqa      [rdi+32],       xmm6
        movdqa      [rdi+48],       xmm7
        lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))

        ; Done once the predictor cursor reaches the end pointer. Unsigned
        ; >= rather than == so that a block size that is not a multiple of
        ; 16 cannot step over the end pointer and loop forever.
        cmp         rdx,            rcx
        jae         .temporal_filter_apply_epilog
        pxor        xmm7,           xmm7   ; zero for extraction
        cmp         dword ptr [rsp + block_width], 16
        je          .temporal_filter_apply_load_16
        jmp         .temporal_filter_apply_load_8

.temporal_filter_apply_epilog:
    ; begin epilog
    mov         rbp,            [rsp + rbp_backup] ; frame pointer back before rbp-relative macros
    add         rsp,            stack_size
    ; NOTE(review): presumably pops the pre-alignment rsp saved by
    ; ALIGN_STACK - confirm against x86_abi_support.asm.
    pop         rsp
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA

; Per-lane word constants for the temporal filter. Each table is eight
; 16-bit words (16 bytes) and is 16-byte aligned because it is used as a
; full-width SSE2 memory operand.

align 16
_const_3w:                          ; multiplier applied to the squared difference
    dw 3, 3, 3, 3, 3, 3, 3, 3

align 16
_const_top_bit:                     ; 0x8000 per lane; shifted right to form the rounding term
    dw 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000

align 16
_const_16w:                         ; minuend for the saturating "16 - modifier" step
    dw 16, 16, 16, 16, 16, 16, 16, 16