1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
|
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;
;
%include "aom_ports/x86_abi_support.asm"
; TABULATE_SSIM - fold one batch of eight 16-bit samples into the running sums.
;   In:     xmm3 = eight source (s) words, xmm4 = eight reference (r) words
;   Accum:  xmm15 += s            (per-word, unsigned saturating)
;           xmm14 += r            (per-word, unsigned saturating)
;           xmm13 += s*s          (per-dword; pmaddwd adds adjacent products)
;           xmm12 += r*r          (per-dword)
;           xmm11 += s*r          (per-dword)
;   Clobb:  xmm1, xmm2, xmm3 (xmm4 is left intact)
%macro TABULATE_SSIM 0
        ; take private copies first so the squares can be formed in place
        movdqa          xmm1, xmm3              ; xmm1 = s
        movdqa          xmm2, xmm4              ; xmm2 = r

        ; plain sums
        paddusw         xmm15, xmm3             ; sum_s
        paddusw         xmm14, xmm4             ; sum_r

        ; products, pairwise-added into dwords
        pmaddwd         xmm1, xmm1              ; s*s
        pmaddwd         xmm2, xmm2              ; r*r
        pmaddwd         xmm3, xmm4              ; s*r

        paddd           xmm13, xmm1             ; sum_sq_s
        paddd           xmm12, xmm2             ; sum_sq_r
        paddd           xmm11, xmm3             ; sum_sxr
%endmacro
; SUM_ACROSS_Q - horizontal add of the four dwords held in register %1.
;   The dwords are zero-extended to qwords, added pairwise, and the two
;   resulting qwords are then added together.
;   In:     %1   = four dword partial sums
;           xmm0 = must be all zeros (used as the zero-extension source)
;   Out:    %1   = total in the low qword, upper qword zero
;   Clobb:  xmm2 (so %1 must not be xmm2)
%macro SUM_ACROSS_Q 1
        ; stage 1: 4 x dword -> 2 x qword
        movdqa          xmm2, %1
        punpckhdq       xmm2, xmm0              ; upper two dwords as qwords
        punpckldq       %1, xmm0                ; lower two dwords as qwords
        paddq           %1, xmm2

        ; stage 2: 2 x qword -> 1 x qword
        movdqa          xmm2, %1
        punpckhqdq      xmm2, xmm0              ; high qword moved to the low lane
        punpcklqdq      %1, xmm0                ; keep low qword, clear the high lane
        paddq           %1, xmm2
%endmacro
; SUM_ACROSS_W - horizontal add of the eight words held in register %1.
;   The words are zero-extended to dwords and added pairwise, then the
;   remaining four dwords are reduced by SUM_ACROSS_Q.
;   In:     %1   = eight word partial sums
;           xmm0 = must be all zeros (used as the zero-extension source)
;   Out:    %1   = total in the low qword, upper qword zero
;   Clobb:  xmm1, xmm2 (so %1 must be neither of them)
%macro SUM_ACROSS_W 1
        ; 8 x word -> 4 x dword
        movdqa          xmm1, %1
        punpckhwd       xmm1, xmm0              ; upper four words as dwords
        punpcklwd       %1, xmm0                ; lower four words as dwords
        paddd           %1, xmm1

        ; 4 x dword -> single total
        SUM_ACROSS_Q    %1
%endmacro
SECTION .text
;-----------------------------------------------------------------------
; void aom_ssim_parms_16x16_sse2(
;     unsigned char *s,         arg(0)  source pixels
;     int            sp,        arg(1)  source stride in bytes
;     unsigned char *r,         arg(2)  reference pixels
;     int            rp,        arg(3)  reference stride in bytes
;     uint32_t      *sum_s,     arg(4)  out: sum of s
;     uint32_t      *sum_r,     arg(5)  out: sum of r
;     uint32_t      *sum_sq_s,  arg(6)  out: sum of s*s
;     uint32_t      *sum_sq_r,  arg(7)  out: sum of r*r
;     uint32_t      *sum_sxr);  arg(8)  out: sum of s*r
;
; Walks a 16x16 block of bytes from s and r and stores the five sums as
; 32-bit values. Each output is overwritten, not accumulated into.
;
; Register roles inside the loop:
;   rsi / rdi    current row of s / r
;   rcx / rax    sp / rp, sign-extended to 64 bits
;   rdx          rows remaining
;   xmm0         zero (byte->word unpacking and the final reductions)
;   xmm15,xmm14  sum_s, sum_r        (eight word lanes)
;   xmm13..xmm11 sum_sq_s, sum_sq_r, sum_sxr (four dword lanes)
;   xmm1..xmm6   scratch
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
;-----------------------------------------------------------------------
global sym(aom_ssim_parms_16x16_sse2) PRIVATE
sym(aom_ssim_parms_16x16_sse2):
        push            rbp
        mov             rbp, rsp
        SHADOW_ARGS_TO_STACK 9
        SAVE_XMM 15
        push            rsi
        push            rdi
        ; end prolog

        mov             rsi, arg(0)             ; s
        ; sp and rp are C ints: only the low 32 bits of their argument slots
        ; are defined, so sign-extend them instead of reading all 64 bits.
        movsxd          rcx, dword arg(1)       ; sp
        mov             rdi, arg(2)             ; r
        movsxd          rax, dword arg(3)       ; rp

        pxor            xmm0, xmm0              ; constant zero
        pxor            xmm15, xmm15            ; sum_s
        pxor            xmm14, xmm14            ; sum_r
        pxor            xmm13, xmm13            ; sum_sq_s
        pxor            xmm12, xmm12            ; sum_sq_r
        pxor            xmm11, xmm11            ; sum_sxr

        mov             rdx, 16                 ; row counter

.NextRow:
        ; grab 16 source and 16 reference pixels (no alignment assumed)
        movdqu          xmm5, [rsi]
        movdqu          xmm6, [rdi]

        ; upper eight pixels of the row, widened to words
        movdqa          xmm3, xmm5
        movdqa          xmm4, xmm6
        punpckhbw       xmm3, xmm0              ; high_s
        punpckhbw       xmm4, xmm0              ; high_r
        TABULATE_SSIM

        ; lower eight pixels of the row, widened to words
        movdqa          xmm3, xmm5
        movdqa          xmm4, xmm6
        punpcklbw       xmm3, xmm0              ; low_s
        punpcklbw       xmm4, xmm0              ; low_r
        TABULATE_SSIM

        add             rsi, rcx                ; next s row
        add             rdi, rax                ; next r row
        dec             rdx                     ; counter
        jnz             .NextRow

        ; Collapse the per-lane partial sums to one total per register.
        ; Word lanes cannot have saturated: each lane received at most
        ; 2 * 16 additions of a value <= 255.
        SUM_ACROSS_W    xmm15
        SUM_ACROSS_W    xmm14
        SUM_ACROSS_Q    xmm13
        SUM_ACROSS_Q    xmm12
        SUM_ACROSS_Q    xmm11

        ; store the low 32 bits of each total through the output pointers
        mov             rdi, arg(4)
        movd            [rdi], xmm15            ; *sum_s
        mov             rdi, arg(5)
        movd            [rdi], xmm14            ; *sum_r
        mov             rdi, arg(6)
        movd            [rdi], xmm13            ; *sum_sq_s
        mov             rdi, arg(7)
        movd            [rdi], xmm12            ; *sum_sq_r
        mov             rdi, arg(8)
        movd            [rdi], xmm11            ; *sum_sxr

        ; begin epilog
        pop             rdi
        pop             rsi
        RESTORE_XMM
        UNSHADOW_ARGS
        pop             rbp
        ret
;-----------------------------------------------------------------------
; void aom_ssim_parms_8x8_sse2(
;     unsigned char *s,         arg(0)  source pixels
;     int            sp,        arg(1)  source stride in bytes
;     unsigned char *r,         arg(2)  reference pixels
;     int            rp,        arg(3)  reference stride in bytes
;     uint32_t      *sum_s,     arg(4)  out: sum of s
;     uint32_t      *sum_r,     arg(5)  out: sum of r
;     uint32_t      *sum_sq_s,  arg(6)  out: sum of s*s
;     uint32_t      *sum_sq_r,  arg(7)  out: sum of r*r
;     uint32_t      *sum_sxr);  arg(8)  out: sum of s*r
;
; Walks an 8x8 block of bytes from s and r and stores the five sums as
; 32-bit values. Each output is overwritten, not accumulated into.
;
; Register roles inside the loop:
;   rsi / rdi    current row of s / r
;   rcx / rax    sp / rp, sign-extended to 64 bits
;   rdx          rows remaining
;   xmm0         zero (byte->word unpacking and the final reductions)
;   xmm15,xmm14  sum_s, sum_r        (eight word lanes)
;   xmm13..xmm11 sum_sq_s, sum_sq_r, sum_sxr (four dword lanes)
;   xmm1..xmm4   scratch
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for 8x8 ssim so we can play with dssim as distortion
; in mode selection code.
;-----------------------------------------------------------------------
global sym(aom_ssim_parms_8x8_sse2) PRIVATE
sym(aom_ssim_parms_8x8_sse2):
        push            rbp
        mov             rbp, rsp
        SHADOW_ARGS_TO_STACK 9
        SAVE_XMM 15
        push            rsi
        push            rdi
        ; end prolog

        mov             rsi, arg(0)             ; s
        ; sp and rp are C ints: only the low 32 bits of their argument slots
        ; are defined, so sign-extend them instead of reading all 64 bits.
        movsxd          rcx, dword arg(1)       ; sp
        mov             rdi, arg(2)             ; r
        movsxd          rax, dword arg(3)       ; rp

        pxor            xmm0, xmm0              ; constant zero
        pxor            xmm15, xmm15            ; sum_s
        pxor            xmm14, xmm14            ; sum_r
        pxor            xmm13, xmm13            ; sum_sq_s
        pxor            xmm12, xmm12            ; sum_sq_r
        pxor            xmm11, xmm11            ; sum_sxr

        mov             rdx, 8                  ; row counter

.NextRow:
        ; grab 8 source and 8 reference pixels and widen them to words
        movq            xmm3, [rsi]
        movq            xmm4, [rdi]
        punpcklbw       xmm3, xmm0              ; low_s
        punpcklbw       xmm4, xmm0              ; low_r
        TABULATE_SSIM

        add             rsi, rcx                ; next s row
        add             rdi, rax                ; next r row
        dec             rdx                     ; counter
        jnz             .NextRow

        ; Collapse the per-lane partial sums to one total per register.
        ; Word lanes cannot have saturated: each lane received at most
        ; 8 additions of a value <= 255.
        SUM_ACROSS_W    xmm15
        SUM_ACROSS_W    xmm14
        SUM_ACROSS_Q    xmm13
        SUM_ACROSS_Q    xmm12
        SUM_ACROSS_Q    xmm11

        ; store the low 32 bits of each total through the output pointers
        mov             rdi, arg(4)
        movd            [rdi], xmm15            ; *sum_s
        mov             rdi, arg(5)
        movd            [rdi], xmm14            ; *sum_r
        mov             rdi, arg(6)
        movd            [rdi], xmm13            ; *sum_sq_s
        mov             rdi, arg(7)
        movd            [rdi], xmm12            ; *sum_sq_r
        mov             rdi, arg(8)
        movd            [rdi], xmm11            ; *sum_sxr

        ; begin epilog
        pop             rdi
        pop             rsi
        RESTORE_XMM
        UNSHADOW_ARGS
        pop             rbp
        ret
|