third_party/aom/aom_dsp/arm/idct32x32_1_add_neon_asm.asm


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147

;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;


    EXPORT  |aom_idct32x32_1_add_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

    ;TODO(hkuang): put the following macros in a separate
    ;file so other idct functions could also use them.
    ; LD_16x8 base, pitch
    ; Loads eight consecutive 16-byte rows into q8-q15 (first row in q8,
    ; last row in q15). The base register is post-incremented by pitch
    ; after every row, so on exit it has advanced by eight rows.
    MACRO
    LD_16x8          $base, $pitch
    vld1.8           {q8},  [$base], $pitch
    vld1.8           {q9},  [$base], $pitch
    vld1.8           {q10}, [$base], $pitch
    vld1.8           {q11}, [$base], $pitch
    vld1.8           {q12}, [$base], $pitch
    vld1.8           {q13}, [$base], $pitch
    vld1.8           {q14}, [$base], $pitch
    vld1.8           {q15}, [$base], $pitch
    MEND

    ; ADD_DIFF_16x8 delta
    ; Adds the byte vector delta to each of q8-q15 using unsigned
    ; saturating arithmetic, so every lane clamps at 255 instead of
    ; wrapping. Intended for a delta register outside q8-q15, which is
    ; then left unchanged.
    MACRO
    ADD_DIFF_16x8    $delta
    vqadd.u8         q8,  q8,  $delta
    vqadd.u8         q9,  q9,  $delta
    vqadd.u8         q10, q10, $delta
    vqadd.u8         q11, q11, $delta
    vqadd.u8         q12, q12, $delta
    vqadd.u8         q13, q13, $delta
    vqadd.u8         q14, q14, $delta
    vqadd.u8         q15, q15, $delta
    MEND

    ; SUB_DIFF_16x8 delta
    ; Subtracts the byte vector delta from each of q8-q15 using unsigned
    ; saturating arithmetic, so every lane clamps at 0 instead of
    ; wrapping. Intended for a delta register outside q8-q15, which is
    ; then left unchanged.
    MACRO
    SUB_DIFF_16x8    $delta
    vqsub.u8         q8,  q8,  $delta
    vqsub.u8         q9,  q9,  $delta
    vqsub.u8         q10, q10, $delta
    vqsub.u8         q11, q11, $delta
    vqsub.u8         q12, q12, $delta
    vqsub.u8         q13, q13, $delta
    vqsub.u8         q14, q14, $delta
    vqsub.u8         q15, q15, $delta
    MEND

    ; ST_16x8 out, pitch
    ; Stores q8-q15 as eight consecutive 16-byte rows (q8 first, q15
    ; last). The out register is post-incremented by pitch after every
    ; row, so on exit it has advanced by eight rows.
    MACRO
    ST_16x8          $out, $pitch
    vst1.8           {q8},  [$out], $pitch
    vst1.8           {q9},  [$out], $pitch
    vst1.8           {q10}, [$out], $pitch
    vst1.8           {q11}, [$out], $pitch
    vst1.8           {q12}, [$out], $pitch
    vst1.8           {q13}, [$out], $pitch
    vst1.8           {q14}, [$out], $pitch
    vst1.8           {q15}, [$out], $pitch
    MEND

;void aom_idct32x32_1_add_neon(int16_t *input, uint8_t *dest,
;                              int dest_stride)
;
; DC-only 32x32 inverse transform: one offset is derived from input[0]
; and applied, with saturation to [0, 255], to every byte of the 32x32
; block at dest.
;
; r0  int16_t *input       (only input[0] is read)
; r1  uint8_t *dest
; r2  int dest_stride      (byte step between successive rows)
;
; Register roles once the offset has been computed:
;   r0      pass counter, 4 passes of 16 rows x 16 columns each
;   r1      load pointer
;   r12     store pointer (advances in lock step with r1)
;   r3      dest + 16, start of the right-hand 16 columns
;   q0      magnitude of the offset replicated into all 16 byte lanes
;   q8-q15  the eight pixel rows currently being updated
;
; Clobbers r0, r1, r3, r12, q0, q8-q15 and the condition flags; r2 is
; preserved. lr is never written and nothing is called, so the function
; uses no stack and returns with bx lr.

|aom_idct32x32_1_add_neon| PROC
    pld              [r1]
    add              r3, r1, #16               ; r3 dest + 16 for second loop
    ldrsh            r0, [r0]                  ; r0 = input[0], sign-extended

    ; generate cospi_16_64 = 11585
    mov              r12, #0x2d00
    add              r12, #0x41

    ; out = dct_const_round_shift(input[0] * cospi_16_64)
    mul              r0, r0, r12               ; input[0] * cospi_16_64
    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
    asr              r0, r0, #14               ; >> DCT_CONST_BITS

    ; out = dct_const_round_shift(out * cospi_16_64)
    mul              r0, r0, r12               ; out * cospi_16_64
    mov              r12, r1                   ; save dest
    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
    asr              r0, r0, #14               ; >> DCT_CONST_BITS

    ; a1 = ROUND_POWER_OF_TWO(out, 6)
    add              r0, r0, #32               ; + (1 <<((6) - 1))
    asrs             r0, r0, #6                ; >> 6, N/Z now reflect a1

    ; asrs does not update V and nothing earlier in this function sets
    ; the flags, so a signed condition such as GE (N == V) would depend
    ; on the V flag inherited from the caller. Branch on N alone.
    bpl              diff_positive_32_32

    ; a1 < 0: subtract |a1| (clamped to 255) from every pixel.
diff_negative_32_32
    neg              r0, r0
    usat             r0, #8, r0
    vdup.u8          q0, r0
    mov              r0, #4

diff_negative_32_32_loop
    sub              r0, #1
    LD_16x8          r1, r2
    SUB_DIFF_16x8    q0
    ST_16x8          r12, r2

    LD_16x8          r1, r2
    SUB_DIFF_16x8    q0
    ST_16x8          r12, r2
    cmp              r0, #2                    ; left 16 columns finished?
    moveq            r1, r3                    ; restart at dest + 16
    moveq            r12, r3
    cmp              r0, #0
    bne              diff_negative_32_32_loop
    bx               lr

    ; a1 >= 0: add a1 (clamped to 255) to every pixel.
diff_positive_32_32
    usat             r0, #8, r0
    vdup.u8          q0, r0
    mov              r0, #4

diff_positive_32_32_loop
    sub              r0, #1
    LD_16x8          r1, r2
    ADD_DIFF_16x8    q0
    ST_16x8          r12, r2

    LD_16x8          r1, r2
    ADD_DIFF_16x8    q0
    ST_16x8          r12, r2
    cmp              r0, #2                    ; left 16 columns finished?
    moveq            r1, r3                    ; restart at dest + 16
    moveq            r12, r3
    cmp              r0, #0
    bne              diff_positive_32_32_loop
    bx               lr

    ENDP             ; |aom_idct32x32_1_add_neon|
    END