@// Source path: root/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S
@// Blob: cbb8f25608726fc2a02458858486d3ecb75ae043
@//
@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
@//
@//  Use of this source code is governed by a BSD-style license
@//  that can be found in the LICENSE file in the root of the source
@//  tree. An additional intellectual property rights grant can be found
@//  in the file PATENTS.  All contributing project authors may
@//  be found in the AUTHORS file in the root of the source tree.
@//
@//
@//  This is a modification of armSP_FFT_CToC_SC32_Radix4_unsafe_s.s
@//  to support float instead of SC32.
@//

@//
@// Description:
@// Compute a radix-4 FFT stage for an N-point complex signal
@//
@//


@// Include standard headers

#include "dl/api/armCOMM_s.h"
#include "dl/api/omxtypes_s.h"


@// Import symbols required from other files
@// (For example tables)




@// Set debugging level
@//DEBUG_ON    SETL {TRUE}



@// Guarding implementation by the processor name




@// Guarding implementation by the processor name


@// Import symbols required from other files
@// (For example tables)


@//Input Registers

#define pSrc            r0
#define pDst            r2
#define pTwiddle        r1
#define subFFTNum       r6
#define subFFTSize      r7



@//Output Registers


@//Local Scratch Registers

#define grpCount        r3
#define pointStep       r4
#define outPointStep    r5
#define stepTwiddle     r12
#define setCount        r14
#define srcStep         r8
#define setStep         r9
#define dstStep         r10
#define twStep          r11
#define t1              r3

@// Neon Registers

#define dW1     D0.F32
#define dW2     D1.F32
#define dW3     D2.F32

#define dXr0    D4.F32
#define dXi0    D5.F32
#define dXr1    D6.F32
#define dXi1    D7.F32
#define dXr2    D8.F32
#define dXi2    D9.F32
#define dXr3    D10.F32
#define dXi3    D11.F32
#define dYr0    D12.F32
#define dYi0    D13.F32
#define dYr1    D14.F32
#define dYi1    D15.F32
#define dYr2    D16.F32
#define dYi2    D17.F32
#define dYr3    D18.F32
#define dYi3    D19.F32
#define qT0     d16.f32
#define qT1     d18.f32
#define qT2     d12.f32
#define qT3     d14.f32
#define dZr0    D20.F32
#define dZi0    D21.F32
#define dZr1    D22.F32
#define dZi1    D23.F32
#define dZr2    D24.F32
#define dZi2    D25.F32
#define dZr3    D26.F32
#define dZi3    D27.F32

#define qY0     Q6.F32
#define qY1     Q7.F32
#define qY2     Q8.F32
#define qY3     Q9.F32
#define qX0     Q2.F32
#define qZ0     Q10.F32
#define qZ1     Q11.F32
#define qZ2     Q12.F32
#define qZ3     Q13.F32

        .MACRO FFTSTAGE scaled, inverse , name

        @// Define stack arguments


        @// Update grpCount and grpSize rightaway inorder to reuse
        @// pGrpCount and pGrpSize regs

        LSL     grpCount,subFFTSize,#2
        LSR     subFFTNum,subFFTNum,#2
        MOV     subFFTSize,grpCount

        VLD1     dW1,[pTwiddle]                    @//[wi | wr]
        @// pT0+1 increments pT0 by 8 bytes
        @// pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
        MOV     pointStep,subFFTNum,LSL #1


        @// pOut0+1 increments pOut0 by 8 bytes
        @// pOut0+outPointStep == increment of 8*outPointStep bytes
        @//   = 2*size bytes

        MOV     stepTwiddle,#0
        VLD1     dW2,[pTwiddle]                    @//[wi | wr]
        SMULBB  outPointStep,grpCount,pointStep
        LSL     pointStep,pointStep,#2             @// 2*grpSize

        VLD1     dW3,[pTwiddle]                    @//[wi | wr]
        MOV     srcStep,pointStep,LSL #1           @// srcStep = 2*pointStep
        ADD     setStep,srcStep,pointStep          @// setStep = 3*pointStep

        RSB     setStep,setStep,#0                 @// setStep = - 3*pointStep
        SUB     srcStep,srcStep,#16                @// srcStep = 2*pointStep-16

        MOV     dstStep,outPointStep,LSL #1
        ADD     dstStep,dstStep,outPointStep       @// dstStep = 3*outPointStep
        @// dstStep = - 3*outPointStep+16
        RSB     dstStep,dstStep,#16



radix4GrpLoop\name :

        VLD2    {dXr0,dXi0},[pSrc],pointStep       @//  data[0]
        ADD      stepTwiddle,stepTwiddle,pointStep
        VLD2    {dXr1,dXi1},[pSrc],pointStep       @//  data[1]
        @// set pTwiddle to the first point
        ADD      pTwiddle,pTwiddle,stepTwiddle
        VLD2    {dXr2,dXi2},[pSrc],pointStep       @//  data[2]
        MOV      twStep,stepTwiddle,LSL #2

        @//  data[3] & update pSrc for the next set
        VLD2    {dXr3,dXi3},[pSrc],setStep
        SUB      twStep,stepTwiddle,twStep         @// twStep = -3*stepTwiddle

        MOV      setCount,pointStep,LSR #3
        @// set pSrc to data[0] of the next set
        ADD     pSrc,pSrc,#16
        @// increment to data[1] of the next set
        ADD     pSrc,pSrc,pointStep


        @// Loop on the sets

radix4SetLoop\name :



        .ifeqs  "\inverse", "TRUE"
            VMUL   dZr1,dXr1,dW1[0]
            VMUL   dZi1,dXi1,dW1[0]
            VMUL   dZr2,dXr2,dW2[0]
            VMUL   dZi2,dXi2,dW2[0]
            VMUL   dZr3,dXr3,dW3[0]
            VMUL   dZi3,dXi3,dW3[0]

            VMLA   dZr1,dXi1,dW1[1]                @// real part
            VMLS   dZi1,dXr1,dW1[1]                @// imag part

            @//  data[1] for next iteration
            VLD2    {dXr1,dXi1},[pSrc],pointStep

            VMLA   dZr2,dXi2,dW2[1]                @// real part
            VMLS   dZi2,dXr2,dW2[1]                @// imag part

            @//  data[2] for next iteration
            VLD2    {dXr2,dXi2},[pSrc],pointStep

            VMLA   dZr3,dXi3,dW3[1]                @// real part
            VMLS   dZi3,dXr3,dW3[1]                @// imag part
        .else
            VMUL   dZr1,dXr1,dW1[0]
            VMUL   dZi1,dXi1,dW1[0]
            VMUL   dZr2,dXr2,dW2[0]
            VMUL   dZi2,dXi2,dW2[0]
            VMUL   dZr3,dXr3,dW3[0]
            VMUL   dZi3,dXi3,dW3[0]

            VMLS   dZr1,dXi1,dW1[1]                @// real part
            VMLA   dZi1,dXr1,dW1[1]                @// imag part

            @//  data[1] for next iteration
            VLD2    {dXr1,dXi1},[pSrc],pointStep

            VMLS   dZr2,dXi2,dW2[1]                @// real part
            VMLA   dZi2,dXr2,dW2[1]                @// imag part

            @//  data[2] for next iteration
            VLD2    {dXr2,dXi2},[pSrc],pointStep

            VMLS   dZr3,dXi3,dW3[1]                @// real part
            VMLA   dZi3,dXr3,dW3[1]                @// imag part
        .endif

        @//  data[3] & update pSrc to data[0]
        @// But don't read on the very last iteration because that reads past 
	@// the end of pSrc. The last iteration is grpCount = 4, setCount = 2.
        cmp     grpCount, #4
        cmpeq   setCount, #2                      @// Test setCount if grpCount = 4
        @// These are executed only if both grpCount = 4 and setCount = 2       
        addeq   pSrc, pSrc, setStep
        beq     radix4SkipRead\name
        VLD2    {dXr3,dXi3},[pSrc],setStep
radix4SkipRead\name:
        SUBS    setCount,setCount,#2

        @// finish first stage of 4 point FFT
        VADD    qY0,qX0,qZ2
        VSUB    qY2,qX0,qZ2

        @//  data[0] for next iteration
        VLD2    {dXr0,dXi0},[pSrc, :128]!
        VADD    qY1,qZ1,qZ3
        VSUB    qY3,qZ1,qZ3

        @// finish second stage of 4 point FFT

        VSUB    qZ0,qY2,qY1


        .ifeqs  "\inverse", "TRUE"

            VADD    dZr3,dYr0,dYi3
            VST2    {dZr0,dZi0},[pDst, :128],outPointStep
            VSUB    dZi3,dYi0,dYr3

            VADD    qZ2,qY2,qY1
            VST2    {dZr3,dZi3},[pDst, :128],outPointStep

            VSUB    dZr1,dYr0,dYi3
            VST2    {dZr2,dZi2},[pDst, :128],outPointStep
            VADD    dZi1,dYi0,dYr3

            VST2    {dZr1,dZi1},[pDst, :128],dstStep


        .else

            VSUB    dZr1,dYr0,dYi3
            VST2    {dZr0,dZi0},[pDst, :128],outPointStep
            VADD    dZi1,dYi0,dYr3

            VADD    qZ2,qY2,qY1
            VST2    {dZr1,dZi1},[pDst, :128],outPointStep

            VADD    dZr3,dYr0,dYi3
            VST2    {dZr2,dZi2},[pDst, :128],outPointStep
            VSUB    dZi3,dYi0,dYr3

            VST2    {dZr3,dZi3},[pDst, :128],dstStep


        .endif

        @// increment to data[1] of the next set
        ADD     pSrc,pSrc,pointStep
        BGT     radix4SetLoop\name


        VLD1     dW1,[pTwiddle, :64],stepTwiddle    @//[wi | wr]
        @// subtract 4 since grpCount multiplied by 4
        SUBS    grpCount,grpCount,#4
        VLD1     dW2,[pTwiddle, :64],stepTwiddle    @//[wi | wr]
        @// increment pSrc for the next grp
        ADD     pSrc,pSrc,srcStep
        VLD1     dW3,[pTwiddle, :64],twStep         @//[wi | wr]
        BGT     radix4GrpLoop\name


        @// Reset and Swap pSrc and pDst for the next stage
        MOV     t1,pDst
        @// pDst -= 2*size; pSrc -= 8*size bytes
        SUB     pDst,pSrc,outPointStep,LSL #2
        SUB     pSrc,t1,outPointStep


        .endm


        M_START armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe,r4
            FFTSTAGE "FALSE","FALSE",FWD
        M_END


        M_START armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe,r4
            FFTSTAGE "FALSE","TRUE",INV
        M_END


        .end