1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
|
;
; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
; This file is a duplicate of mfqe_sse2.asm in VP8.
; TODO(jackychen): Find a way to fix the duplicate.
%include "vpx_ports/x86_abi_support.asm"
;void vp9_filter_by_weight16x16_sse2
;(
; unsigned char *src,
; int src_stride,
; unsigned char *dst,
; int dst_stride,
; int src_weight
;)
global sym(vp9_filter_by_weight16x16_sse2) PRIVATE
sym(vp9_filter_by_weight16x16_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
SAVE_XMM 6
GET_GOT rbx
push rsi
push rdi
; end prolog
movd xmm0, arg(4) ; src_weight
pshuflw xmm0, xmm0, 0x0 ; replicate to all low words
punpcklqdq xmm0, xmm0 ; replicate to all hi words
movdqa xmm1, [GLOBAL(tMFQE)]
psubw xmm1, xmm0 ; dst_weight
mov rax, arg(0) ; src
mov rsi, arg(1) ; src_stride
mov rdx, arg(2) ; dst
mov rdi, arg(3) ; dst_stride
mov rcx, 16 ; loop count
pxor xmm6, xmm6
.combine
movdqa xmm2, [rax]
movdqa xmm4, [rdx]
add rax, rsi
; src * src_weight
movdqa xmm3, xmm2
punpcklbw xmm2, xmm6
punpckhbw xmm3, xmm6
pmullw xmm2, xmm0
pmullw xmm3, xmm0
; dst * dst_weight
movdqa xmm5, xmm4
punpcklbw xmm4, xmm6
punpckhbw xmm5, xmm6
pmullw xmm4, xmm1
pmullw xmm5, xmm1
; sum, round and shift
paddw xmm2, xmm4
paddw xmm3, xmm5
paddw xmm2, [GLOBAL(tMFQE_round)]
paddw xmm3, [GLOBAL(tMFQE_round)]
psrlw xmm2, 4
psrlw xmm3, 4
packuswb xmm2, xmm3
movdqa [rdx], xmm2
add rdx, rdi
dec rcx
jnz .combine
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp9_filter_by_weight8x8_sse2
;(
; unsigned char *src,
; int src_stride,
; unsigned char *dst,
; int dst_stride,
; int src_weight
;)
global sym(vp9_filter_by_weight8x8_sse2) PRIVATE
sym(vp9_filter_by_weight8x8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
GET_GOT rbx
push rsi
push rdi
; end prolog
movd xmm0, arg(4) ; src_weight
pshuflw xmm0, xmm0, 0x0 ; replicate to all low words
punpcklqdq xmm0, xmm0 ; replicate to all hi words
movdqa xmm1, [GLOBAL(tMFQE)]
psubw xmm1, xmm0 ; dst_weight
mov rax, arg(0) ; src
mov rsi, arg(1) ; src_stride
mov rdx, arg(2) ; dst
mov rdi, arg(3) ; dst_stride
mov rcx, 8 ; loop count
pxor xmm4, xmm4
.combine
movq xmm2, [rax]
movq xmm3, [rdx]
add rax, rsi
; src * src_weight
punpcklbw xmm2, xmm4
pmullw xmm2, xmm0
; dst * dst_weight
punpcklbw xmm3, xmm4
pmullw xmm3, xmm1
; sum, round and shift
paddw xmm2, xmm3
paddw xmm2, [GLOBAL(tMFQE_round)]
psrlw xmm2, 4
packuswb xmm2, xmm4
movq [rdx], xmm2
add rdx, rdi
dec rcx
jnz .combine
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void vp9_variance_and_sad_16x16_sse2 | arg
;(
; unsigned char *src1, 0
; int stride1, 1
; unsigned char *src2, 2
; int stride2, 3
; unsigned int *variance, 4
; unsigned int *sad, 5
;)
global sym(vp9_variance_and_sad_16x16_sse2) PRIVATE
sym(vp9_variance_and_sad_16x16_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
GET_GOT rbx
push rsi
push rdi
; end prolog
mov rax, arg(0) ; src1
mov rcx, arg(1) ; stride1
mov rdx, arg(2) ; src2
mov rdi, arg(3) ; stride2
mov rsi, 16 ; block height
; Prep accumulator registers
pxor xmm3, xmm3 ; SAD
pxor xmm4, xmm4 ; sum of src2
pxor xmm5, xmm5 ; sum of src2^2
; Because we're working with the actual output frames
; we can't depend on any kind of data alignment.
.accumulate
movdqa xmm0, [rax] ; src1
movdqa xmm1, [rdx] ; src2
add rax, rcx ; src1 + stride1
add rdx, rdi ; src2 + stride2
; SAD(src1, src2)
psadbw xmm0, xmm1
paddusw xmm3, xmm0
; SUM(src2)
pxor xmm2, xmm2
psadbw xmm2, xmm1 ; sum src2 by misusing SAD against 0
paddusw xmm4, xmm2
; pmaddubsw would be ideal if it took two unsigned values. instead,
; it expects a signed and an unsigned value. so instead we zero extend
; and operate on words.
pxor xmm2, xmm2
movdqa xmm0, xmm1
punpcklbw xmm0, xmm2
punpckhbw xmm1, xmm2
pmaddwd xmm0, xmm0
pmaddwd xmm1, xmm1
paddd xmm5, xmm0
paddd xmm5, xmm1
sub rsi, 1
jnz .accumulate
; phaddd only operates on adjacent double words.
; Finalize SAD and store
movdqa xmm0, xmm3
psrldq xmm0, 8
paddusw xmm0, xmm3
paddd xmm0, [GLOBAL(t128)]
psrld xmm0, 8
mov rax, arg(5)
movd [rax], xmm0
; Accumulate sum of src2
movdqa xmm0, xmm4
psrldq xmm0, 8
paddusw xmm0, xmm4
; Square src2. Ignore high value
pmuludq xmm0, xmm0
psrld xmm0, 8
; phaddw could be used to sum adjacent values but we want
; all the values summed. promote to doubles, accumulate,
; shift and sum
pxor xmm2, xmm2
movdqa xmm1, xmm5
punpckldq xmm1, xmm2
punpckhdq xmm5, xmm2
paddd xmm1, xmm5
movdqa xmm2, xmm1
psrldq xmm1, 8
paddd xmm1, xmm2
psubd xmm1, xmm0
; (variance + 128) >> 8
paddd xmm1, [GLOBAL(t128)]
psrld xmm1, 8
mov rax, arg(4)
movd [rax], xmm1
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
align 16
t128:
%ifndef __NASM_VER__
ddq 128
%elif CONFIG_BIG_ENDIAN
dq 0, 128
%else
dq 128, 0
%endif
align 16
tMFQE: ; 1 << MFQE_PRECISION
times 8 dw 0x10
align 16
tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
times 8 dw 0x08
|