; path: root/media/libvpx/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
; blob: e81aef53d5146a285582550b2d27538ed8d4ffea (plain)
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_sixtap_predict8x4_armv6|

    AREA    |.text|, CODE, READONLY  ; name this block of code
;-------------------------------------
; vp8_sixtap_predict8x4_armv6
;
; Two-pass six-tap filter: a horizontal pass over 9 source rows followed by a
; vertical pass, producing a block of 8 columns x 4 rows of bytes at dst_ptr.
;
; r0    unsigned char *src_ptr,
; r1    int  src_pixels_per_line,
; r2    int  xoffset,               selects a 16-byte row of filter8_coeff; 0 = copy instead of filtering horizontally
; r3    int  yoffset,               selects a 16-byte row of filter8_coeff; 0 = copy instead of filtering vertically
; stack unsigned char *dst_ptr,
; stack int  dst_pitch
;
; r4-r11 are saved on entry and restored on exit; r0-r3, r12 and lr are used
; as scratch. The function returns by popping the saved lr into pc.
; NOTE(review): filter8_coeff has eight rows, so xoffset/yoffset are presumably
; in the range 0..7 - confirm against callers; no range check is done here.
;-------------------------------------
;Note: the first pass reads 9 source rows, starting 2 rows above src_ptr, and stores its
;8 results per row as halfwords in a transposed temporary buffer on the stack: 8 lines
;(one per output column) of 9 halfwords each. Line width is 20 bytes, that is 9 halfwords
;(18 bytes) plus 2 bytes of padding so that every line stays 4-byte aligned.
;184 bytes are reserved in total: 4 for the saved yoffset followed by the buffer
;(the 8 lines occupy 160 bytes of it).
;The second pass reads the buffer line by line, using sp itself as the read pointer, and
;writes each line's 4 results down one destination column (i.e. transposed back).
;
;Stack layout right after the two prologue instructions:
;   sp + 0      saved yoffset
;   sp + 4      temporary buffer, line k at sp + 4 + 20*k  (k = 0..7)
;   sp + 184    saved r4-r11, lr (36 bytes)
;   sp + 220    dst_ptr
;   sp + 224    dst_pitch
|vp8_sixtap_predict8x4_armv6| PROC
    stmdb       sp!, {r4 - r11, lr}         ;save callee-saved registers and return address (9 words = 36 bytes)
    str         r3, [sp, #-184]!            ;reserve space on stack for temporary storage, store yoffset

    cmp         r2, #0                      ;skip first_pass filter if xoffset=0
    add         lr, sp, #4                  ;point to temporary buffer
    beq         skip_firstpass_filter

;first-pass filter
    adr         r12, filter8_coeff
    sub         r0, r0, r1, lsl #1          ; start 2 rows above src_ptr

    add         r3, r1, #10                 ; preload next row
    pld         [r0, r3]

    add         r2, r12, r2, lsl #4         ;calculate filter location: filter8_coeff + xoffset*16
    add         r0, r0, #3                  ;adjust src only for loading convenience (r0 = row start + 3)

    ldr         r3, [r2]                    ; load up packed filter coefficients: r3 = taps 0,1
    ldr         r4, [r2, #4]                ; r4 = taps 2,3
    ldr         r5, [r2, #8]                ; r5 = taps 4,5

    mov         r2, #0x90000                ; height=9 is top part of counter

    sub         r1, r1, #8                  ; r1 = stride - 8, because r0 advances by 8 inside the width loop

|first_pass_hloop_v6|
    ldrb        r6, [r0, #-5]               ; load source data: pixels -2..+2 relative to the row start
    ldrb        r7, [r0, #-4]
    ldrb        r8, [r0, #-3]
    ldrb        r9, [r0, #-2]
    ldrb        r10, [r0, #-1]

    orr         r2, r2, #0x4                ; construct loop counter. width=8=4x2 (4 iterations, 2 pixels each, in low byte)

    pkhbt       r6, r6, r7, lsl #16         ; r7 | r6
    pkhbt       r7, r7, r8, lsl #16         ; r8 | r7

    pkhbt       r8, r8, r9, lsl #16         ; r9 | r8
    pkhbt       r9, r9, r10, lsl #16        ; r10 | r9

|first_pass_wloop_v6|
    smuad       r11, r6, r3                 ; vp8_filter[0], vp8_filter[1]  (r11 = first pixel of the pair)
    smuad       r12, r7, r3                 ; (r12 = second pixel of the pair)

    ldrb        r6, [r0], #1                ; fetch the next two source pixels

    smlad       r11, r8, r4, r11            ; vp8_filter[2], vp8_filter[3]
    ldrb        r7, [r0], #1
    smlad       r12, r9, r4, r12

    pkhbt       r10, r10, r6, lsl #16       ; r6 | r10
    pkhbt       r6, r6, r7, lsl #16         ; r7 | r6
    smlad       r11, r10, r5, r11           ; vp8_filter[4], vp8_filter[5]
    smlad       r12, r6, r5, r12

    sub         r2, r2, #1

    add         r11, r11, #0x40             ; round_shift_and_clamp: (sum + 64) >> 7, saturated to 0..255
    tst         r2, #0xff                   ; test loop counter (flags are consumed by the movne/bne below)
    usat        r11, #8, r11, asr #7
    add         r12, r12, #0x40
    strh        r11, [lr], #20              ; result is stored transposed: each pixel goes to the next 20-byte line
    usat        r12, #8, r12, asr #7

    strh        r12, [lr], #20

    movne       r11, r6                     ; not the last iteration: slide the pixel window along by two
    movne       r12, r7

    movne       r6, r8
    movne       r7, r9
    movne       r8, r10
    movne       r9, r11
    movne       r10, r12

    bne         first_pass_wloop_v6

    ;;add       r9, ppl, #30                ; attempt to load 2 adjacent cache lines
    ;;IF ARCHITECTURE=6
    ;pld        [src, ppl]
    ;;pld       [src, r9]
    ;;ENDIF

    subs        r2, r2, #0x10000            ; one source row done; flags are consumed by the bne below

    sub         lr, lr, #158                ; back up 8 lines (160 bytes) and step to the next halfword slot (+2)

    add         r0, r0, r1                  ; move to next input line

    add         r11, r1, #18                ; preload next row. adding back block width(=8), which is subtracted earlier
    pld         [r0, r11]

    bne         first_pass_hloop_v6

;second pass filter
secondpass_filter
    ldr         r3, [sp], #4                ; load back yoffset; sp now points at the temporary buffer
    ldr         r0, [sp, #216]              ; load dst address from stack 180+36
    ldr         r1, [sp, #220]              ; load dst stride from stack 180+40

    cmp         r3, #0
    beq         skip_secondpass_filter

    adr         r12, filter8_coeff
    add         lr, r12, r3, lsl #4         ;calculate filter location: filter8_coeff + yoffset*16

    mov         r2, #0x00080000             ; 8 buffer lines (one per output column) in top part of counter

    ldr         r3, [lr]                    ; load up packed filter coefficients
    ldr         r4, [lr, #4]
    ldr         r5, [lr, #8]

    pkhbt       r12, r4, r3                 ; pack the filter differently: r12 = tap1 | tap2 (high | low)
    pkhbt       r11, r5, r4                 ; r11 = tap3 | tap4 (high | low)

second_pass_hloop_v6
    ldr         r6, [sp]                    ; load the data: first four halfwords of the current line
    ldr         r7, [sp, #4]

    orr         r2, r2, #2                  ; loop counter: 2 iterations, 2 outputs each

second_pass_wloop_v6
    smuad       lr, r3, r6                  ; apply filter: lr accumulates the first output of the pair
    smulbt      r10, r3, r6                 ; r10 accumulates the second output (window shifted by one halfword)

    ldr         r8, [sp, #8]

    smlad       lr, r4, r7, lr
    smladx      r10, r12, r7, r10

    ldrh        r9, [sp, #12]

    smlad       lr, r5, r8, lr
    smladx      r10, r11, r8, r10

    add         sp, sp, #4                  ; advance the read pointer by two halfwords
    smlatb      r10, r5, r9, r10

    sub         r2, r2, #1

    add         lr, lr, #0x40               ; round_shift_and_clamp
    tst         r2, #0xff
    usat        lr, #8, lr, asr #7
    add         r10, r10, #0x40
    strb        lr, [r0], r1                ; the result is transposed back and stored
    usat        r10, #8, r10, asr #7

    strb        r10, [r0],r1

    movne       r6, r7                      ; not the last iteration: reuse the already-loaded halfwords
    movne       r7, r8

    bne         second_pass_wloop_v6

    subs        r2, r2, #0x10000
    add         sp, sp, #12                 ; update src for next loop (20-8)
    sub         r0, r0, r1, lsl #2          ; back up the 4 destination rows just written
    add         r0, r0, #1                  ; and move to the next destination column

    bne         second_pass_hloop_v6

    add         sp, sp, #20                 ; release the rest of the temporary area (180 - 8*20)
    ldmia       sp!, {r4 - r11, pc}         ; restore registers and return

;--------------------
; xoffset == 0: copy 8 pixels from each of the 9 source rows, unfiltered, into
; the transposed temporary buffer as halfwords.
skip_firstpass_filter
    sub         r0, r0, r1, lsl #1          ; start 2 rows above src_ptr
    sub         r1, r1, #8                  ; r1 = stride - 8, because r0 advances by 8 per row below
    mov         r2, #9                      ; 9 source rows

skip_firstpass_hloop
    ldrb        r4, [r0], #1                ; load data
    subs        r2, r2, #1                  ; flags are consumed by the bne at the end of the loop
    ldrb        r5, [r0], #1
    strh        r4, [lr], #20               ; store it to intermediate buffer
    ldrb        r6, [r0], #1                ; load data
    strh        r5, [lr], #20
    ldrb        r7, [r0], #1
    strh        r6, [lr], #20
    ldrb        r8, [r0], #1
    strh        r7, [lr], #20
    ldrb        r9, [r0], #1
    strh        r8, [lr], #20
    ldrb        r10, [r0], #1
    strh        r9, [lr], #20
    ldrb        r11, [r0], #1
    strh        r10, [lr], #20
    add         r0, r0, r1                  ; move to next input line
    strh        r11, [lr], #20

    sub         lr, lr, #158                ; move over to next column (back 8 lines, forward one halfword)
    bne         skip_firstpass_hloop

    b           secondpass_filter

;--------------------
; yoffset == 0: copy halfwords 2..5 of every buffer line to the destination
; as bytes, one destination column per line.
skip_secondpass_filter
    mov         r2, #8                      ; 8 buffer lines (one per output column)
    add         sp, sp, #4                  ;start from src[0] instead of src[-2]

skip_secondpass_hloop
    ldr         r6, [sp], #4
    subs        r2, r2, #1
    ldr         r8, [sp], #4

    mov         r7, r6, lsr #16             ; unpack
    strb        r6, [r0], r1
    mov         r9, r8, lsr #16
    strb        r7, [r0], r1
    add         sp, sp, #12                 ; 20-8
    strb        r8, [r0], r1
    strb        r9, [r0], r1

    sub         r0, r0, r1, lsl #2          ; back up the 4 destination rows just written
    add         r0, r0, #1                  ; and move to the next destination column

    bne         skip_secondpass_hloop

    add         sp, sp, #16                 ; 180 - (160 +4)

    ldmia       sp!, {r4 - r11, pc}         ; restore registers and return

    ENDP

;-----------------
;Six-tap filter coefficient table: eight rows of four words (16 bytes per row), so a row
;is addressed as filter8_coeff + (offset << 4).
;Each row packs the six taps as signed 16-bit values, two per word, with the lower-numbered
;tap in the low halfword: word 0 = tap1:tap0, word 1 = tap3:tap2, word 2 = tap5:tap4.
;The fourth word of each row is zero padding.
;Data address: filter8_coeff, filter8_coeff+4, filter8_coeff+8 ...
filter8_coeff
    DCD     0x00000000,     0x00000080,     0x00000000,     0x00000000
    DCD     0xfffa0000,     0x000c007b,     0x0000ffff,     0x00000000
    DCD     0xfff50002,     0x0024006c,     0x0001fff8,     0x00000000
    DCD     0xfff70000,     0x0032005d,     0x0000fffa,     0x00000000
    DCD     0xfff00003,     0x004d004d,     0x0003fff0,     0x00000000
    DCD     0xfffa0000,     0x005d0032,     0x0000fff7,     0x00000000
    DCD     0xfff80001,     0x006c0024,     0x0002fff5,     0x00000000
    DCD     0xffff0000,     0x007b000c,     0x0000fffa,     0x00000000

    ;The same taps in decimal (tap0 .. tap5), one row per table row above, for reference only:
    ;DCD        0,  0,  128,    0,   0,  0
    ;DCD        0, -6,  123,   12,  -1,  0
    ;DCD        2, -11, 108,   36,  -8,  1
    ;DCD        0, -9,   93,   50,  -6,  0
    ;DCD        3, -16,  77,   77, -16,  3
    ;DCD        0, -6,   50,   93,  -9,  0
    ;DCD        1, -8,   36,  108, -11,  2
    ;DCD        0, -1,   12,  123,  -6,  0

    END