Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
NE10_fft_float32.neon.c
/*
 * Copyright 2013-15 ARM Limited and Contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of ARM Limited nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * NE10 Library : dsp/NE10_fft_float32.neon.c
 */

#include <assert.h> // for the assert() in ne10_fft_c2c_1d_float32_neon
#include <arm_neon.h>

#include "NE10_types.h"
#include "NE10_macros.h"
#include "NE10_fft.h"
#include "NE10_dsp.h"
static inline void ne10_fft4_forward_float32 (ne10_fft_cpx_float32_t * Fout,
                                              ne10_fft_cpx_float32_t * Fin)
{
    ne10_float32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
    ne10_float32_t tmp_r, tmp_i;

    s2_r = Fin[0].r - Fin[2].r;
    s2_i = Fin[0].i - Fin[2].i;

    tmp_r = Fin[0].r + Fin[2].r;
    tmp_i = Fin[0].i + Fin[2].i;

    s0_r = Fin[1].r + Fin[3].r;
    s0_i = Fin[1].i + Fin[3].i;

    s1_r = Fin[1].r - Fin[3].r;
    s1_i = Fin[1].i - Fin[3].i;
    Fout[2].r = tmp_r - s0_r;
    Fout[2].i = tmp_i - s0_i;
    Fout[0].r = tmp_r + s0_r;
    Fout[0].i = tmp_i + s0_i;

    Fout[1].r = s2_r + s1_i;
    Fout[1].i = s2_i - s1_r;
    Fout[3].r = s2_r - s1_i;
    Fout[3].i = s2_i + s1_r;
}
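/*
 * The 4-point butterflies above are simply the DFT written out with its
 * trivial twiddles {1, -i, -1, i}:
 *   X[0] = (x0 + x2) + (x1 + x3)
 *   X[1] = (x0 - x2) - i*(x1 - x3)
 *   X[2] = (x0 + x2) - (x1 + x3)
 *   X[3] = (x0 - x2) + i*(x1 - x3)
 * Since -i*(a + b*i) = b - a*i, the real part of Fout[1] picks up s1_i and
 * its imaginary part subtracts s1_r, with the signs mirrored for Fout[3].
 */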
static inline void ne10_fft4_backward_float32 (ne10_fft_cpx_float32_t * Fout,
                                               ne10_fft_cpx_float32_t * Fin)
{
    ne10_float32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
    ne10_float32_t tmp_r, tmp_i;

    s2_r = Fin[0].r - Fin[2].r;
    s2_i = Fin[0].i - Fin[2].i;

    tmp_r = Fin[0].r + Fin[2].r;
    tmp_i = Fin[0].i + Fin[2].i;

    s0_r = Fin[1].r + Fin[3].r;
    s0_i = Fin[1].i + Fin[3].i;

    s1_r = Fin[1].r - Fin[3].r;
    s1_i = Fin[1].i - Fin[3].i;
    Fout[2].r = (tmp_r - s0_r) * 0.25f;
    Fout[2].i = (tmp_i - s0_i) * 0.25f;
    Fout[0].r = (tmp_r + s0_r) * 0.25f;
    Fout[0].i = (tmp_i + s0_i) * 0.25f;

    Fout[1].r = (s2_r - s1_i) * 0.25f;
    Fout[1].i = (s2_i + s1_r) * 0.25f;
    Fout[3].r = (s2_r + s1_i) * 0.25f;
    Fout[3].i = (s2_i - s1_r) * 0.25f;
}
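/*
 * The inverse transform conjugates the twiddles ({1, i, -1, -i}), which
 * swaps the sign pattern on s1_r/s1_i relative to the forward version, and
 * scales by 0.25f = 1/4 so that a forward/backward round trip returns the
 * original data.
 */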
static inline void ne10_fft8_forward_float32 (ne10_fft_cpx_float32_t * Fout,
                                              ne10_fft_cpx_float32_t * Fin)
{
    ne10_float32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
    ne10_float32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
    const ne10_float32_t TW_81 = 0.70710678;

    s0_r = Fin[0].r + Fin[4].r;
    s0_i = Fin[0].i + Fin[4].i;
    s1_r = Fin[0].r - Fin[4].r;
    s1_i = Fin[0].i - Fin[4].i;
    s2_r = Fin[1].r + Fin[5].r;
    s2_i = Fin[1].i + Fin[5].i;
    s3_r = Fin[1].r - Fin[5].r;
    s3_i = Fin[1].i - Fin[5].i;
    s4_r = Fin[2].r + Fin[6].r;
    s4_i = Fin[2].i + Fin[6].i;
    s5_r = Fin[2].r - Fin[6].r;
    s5_i = Fin[2].i - Fin[6].i;
    s6_r = Fin[3].r + Fin[7].r;
    s6_i = Fin[3].i + Fin[7].i;
    s7_r = Fin[3].r - Fin[7].r;
    s7_i = Fin[3].i - Fin[7].i;

    t0_r = s0_r - s4_r;
    t0_i = s0_i - s4_i;
    t1_r = s0_r + s4_r;
    t1_i = s0_i + s4_i;
    t2_r = s2_r + s6_r;
    t2_i = s2_i + s6_i;
    t3_r = s2_r - s6_r;
    t3_i = s2_i - s6_i;
    Fout[0].r = t1_r + t2_r;
    Fout[0].i = t1_i + t2_i;
    Fout[4].r = t1_r - t2_r;
    Fout[4].i = t1_i - t2_i;
    Fout[2].r = t0_r + t3_i;
    Fout[2].i = t0_i - t3_r;
    Fout[6].r = t0_r - t3_i;
    Fout[6].i = t0_i + t3_r;

    t4_r = (s3_r + s3_i) * TW_81;
    t4_i = - (s3_r - s3_i) * TW_81;
    t5_r = (s7_r - s7_i) * TW_81;
    t5_i = (s7_r + s7_i) * TW_81;

    t0_r = s1_r - s5_i;
    t0_i = s1_i + s5_r;
    t1_r = s1_r + s5_i;
    t1_i = s1_i - s5_r;
    t2_r = t4_r - t5_r;
    t2_i = t4_i - t5_i;
    t3_r = t4_r + t5_r;
    t3_i = t4_i + t5_i;
    Fout[1].r = t1_r + t2_r;
    Fout[1].i = t1_i + t2_i;
    Fout[5].r = t1_r - t2_r;
    Fout[5].i = t1_i - t2_i;
    Fout[3].r = t0_r + t3_i;
    Fout[3].i = t0_i - t3_r;
    Fout[7].r = t0_r - t3_i;
    Fout[7].i = t0_i + t3_r;
}
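/*
 * TW_81 is cos(pi/4) = sin(pi/4) = 1/sqrt(2). The odd-index 8-point twiddles
 * e^{-i*k*pi/4} all have components of magnitude TW_81, so each twiddle
 * multiply reduces to two scaled add/sub pairs (t4 and t5), with the
 * remaining sign flips folded into the t2/t3 butterflies.
 */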
static inline void ne10_fft8_backward_float32 (ne10_fft_cpx_float32_t * Fout,
                                               ne10_fft_cpx_float32_t * Fin)
{
    ne10_float32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
    ne10_float32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
    const ne10_float32_t TW_81 = 0.70710678;

    s0_r = Fin[0].r + Fin[4].r;
    s0_i = Fin[0].i + Fin[4].i;
    s1_r = Fin[0].r - Fin[4].r;
    s1_i = Fin[0].i - Fin[4].i;
    s2_r = Fin[1].r + Fin[5].r;
    s2_i = Fin[1].i + Fin[5].i;
    s3_r = Fin[1].r - Fin[5].r;
    s3_i = Fin[1].i - Fin[5].i;
    s4_r = Fin[2].r + Fin[6].r;
    s4_i = Fin[2].i + Fin[6].i;
    s5_r = Fin[2].r - Fin[6].r;
    s5_i = Fin[2].i - Fin[6].i;
    s6_r = Fin[3].r + Fin[7].r;
    s6_i = Fin[3].i + Fin[7].i;
    s7_r = Fin[3].r - Fin[7].r;
    s7_i = Fin[3].i - Fin[7].i;

    t0_r = s0_r - s4_r;
    t0_i = s0_i - s4_i;
    t1_r = s0_r + s4_r;
    t1_i = s0_i + s4_i;
    t2_r = s2_r + s6_r;
    t2_i = s2_i + s6_i;
    t3_r = s2_r - s6_r;
    t3_i = s2_i - s6_i;
    Fout[0].r = (t1_r + t2_r) * 0.125f;
    Fout[0].i = (t1_i + t2_i) * 0.125f;
    Fout[4].r = (t1_r - t2_r) * 0.125f;
    Fout[4].i = (t1_i - t2_i) * 0.125f;
    Fout[2].r = (t0_r - t3_i) * 0.125f;
    Fout[2].i = (t0_i + t3_r) * 0.125f;
    Fout[6].r = (t0_r + t3_i) * 0.125f;
    Fout[6].i = (t0_i - t3_r) * 0.125f;

    t4_r = (s3_r - s3_i) * TW_81;
    t4_i = (s3_r + s3_i) * TW_81;
    t5_r = (s7_r + s7_i) * TW_81;
    t5_i = - (s7_r - s7_i) * TW_81;

    t0_r = s1_r + s5_i;
    t0_i = s1_i - s5_r;
    t1_r = s1_r - s5_i;
    t1_i = s1_i + s5_r;
    t2_r = t4_r - t5_r;
    t2_i = t4_i - t5_i;
    t3_r = t4_r + t5_r;
    t3_i = t4_i + t5_i;
    Fout[1].r = (t1_r + t2_r) * 0.125f;
    Fout[1].i = (t1_i + t2_i) * 0.125f;
    Fout[5].r = (t1_r - t2_r) * 0.125f;
    Fout[5].i = (t1_i - t2_i) * 0.125f;
    Fout[3].r = (t0_r - t3_i) * 0.125f;
    Fout[3].i = (t0_i + t3_r) * 0.125f;
    Fout[7].r = (t0_r + t3_i) * 0.125f;
    Fout[7].i = (t0_i - t3_r) * 0.125f;
}
static void ne10_fft16_forward_float32_neon (ne10_fft_cpx_float32_t * Fout,
                                             ne10_fft_cpx_float32_t * Fin,
                                             ne10_fft_cpx_float32_t * twiddles)
{
    ne10_fft_cpx_float32_t *tw1, *tw2, *tw3;

    // the first stage
    float32_t *p_src0, *p_src4, *p_src8, *p_src12;
    float32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef;
    float32x4_t q_t0_r, q_t0_i, q_t1_r, q_t1_i, q_t2_r, q_t2_i, q_t3_r, q_t3_i;
    float32x4_t q_out_r048c, q_out_i048c, q_out_r159d, q_out_i159d;
    float32x4_t q_out_r26ae, q_out_i26ae, q_out_r37bf, q_out_i37bf;
    p_src0 = (float32_t*) (& (Fin[0]));
    p_src4 = (float32_t*) (& (Fin[4]));
    p_src8 = (float32_t*) (& (Fin[8]));
    p_src12 = (float32_t*) (& (Fin[12]));
    q2_in_0123 = vld2q_f32 (p_src0);
    q2_in_4567 = vld2q_f32 (p_src4);
    q2_in_89ab = vld2q_f32 (p_src8);
    q2_in_cdef = vld2q_f32 (p_src12);

    q_t2_r = vsubq_f32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t2_i = vsubq_f32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
    q_t3_r = vaddq_f32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t3_i = vaddq_f32 (q2_in_0123.val[1], q2_in_89ab.val[1]);

    q_t0_r = vaddq_f32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t0_i = vaddq_f32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
    q_t1_r = vsubq_f32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t1_i = vsubq_f32 (q2_in_4567.val[1], q2_in_cdef.val[1]);

    q_out_r26ae = vsubq_f32 (q_t3_r, q_t0_r);
    q_out_i26ae = vsubq_f32 (q_t3_i, q_t0_i);
    q_out_r048c = vaddq_f32 (q_t3_r, q_t0_r);
    q_out_i048c = vaddq_f32 (q_t3_i, q_t0_i);
    q_out_r159d = vaddq_f32 (q_t2_r, q_t1_i);
    q_out_i159d = vsubq_f32 (q_t2_i, q_t1_r);
    q_out_r37bf = vsubq_f32 (q_t2_r, q_t1_i);
    q_out_i37bf = vaddq_f32 (q_t2_i, q_t1_r);

    // the second stage
    float32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3;
    float32_t *p_tw1, *p_tw2, *p_tw3;
    float32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i;
    float32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i;
    float32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3;
    float32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef;
    float32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef;
    float32x4x2_t q2_tw1, q2_tw2, q2_tw3;
    float32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;
    tw1 = twiddles;
    tw2 = twiddles + 4;
    tw3 = twiddles + 8;
    p_dst0 = (float32_t*) (&Fout[0]);
    p_dst1 = (float32_t*) (&Fout[4]);
    p_dst2 = (float32_t*) (&Fout[8]);
    p_dst3 = (float32_t*) (&Fout[12]);
    p_tw1 = (float32_t*) tw1;
    p_tw2 = (float32_t*) tw2;
    p_tw3 = (float32_t*) tw3;
    q2_tmp_0 = vzipq_f32 (q_out_r048c, q_out_r159d);
    q2_tmp_1 = vzipq_f32 (q_out_i048c, q_out_i159d);
    q2_tmp_2 = vzipq_f32 (q_out_r26ae, q_out_r37bf);
    q2_tmp_3 = vzipq_f32 (q_out_i26ae, q_out_i37bf);
    q_in_r0123 = vcombine_f32 (vget_low_f32 (q2_tmp_0.val[0]), vget_low_f32 (q2_tmp_2.val[0]));
    q_in_i0123 = vcombine_f32 (vget_low_f32 (q2_tmp_1.val[0]), vget_low_f32 (q2_tmp_3.val[0]));
    q_in_r4567 = vcombine_f32 (vget_high_f32 (q2_tmp_0.val[0]), vget_high_f32 (q2_tmp_2.val[0]));
    q_in_i4567 = vcombine_f32 (vget_high_f32 (q2_tmp_1.val[0]), vget_high_f32 (q2_tmp_3.val[0]));
    q_in_r89ab = vcombine_f32 (vget_low_f32 (q2_tmp_0.val[1]), vget_low_f32 (q2_tmp_2.val[1]));
    q_in_i89ab = vcombine_f32 (vget_low_f32 (q2_tmp_1.val[1]), vget_low_f32 (q2_tmp_3.val[1]));
    q_in_rcdef = vcombine_f32 (vget_high_f32 (q2_tmp_0.val[1]), vget_high_f32 (q2_tmp_2.val[1]));
    q_in_icdef = vcombine_f32 (vget_high_f32 (q2_tmp_1.val[1]), vget_high_f32 (q2_tmp_3.val[1]));
    q2_tw1 = vld2q_f32 (p_tw1);
    q2_tw2 = vld2q_f32 (p_tw2);
    q2_tw3 = vld2q_f32 (p_tw3);

    q_s0_r = vmulq_f32 (q_in_r4567, q2_tw1.val[0]);
    q_s0_i = vmulq_f32 (q_in_r4567, q2_tw1.val[1]);
    q_s1_r = vmulq_f32 (q_in_r89ab, q2_tw2.val[0]);
    q_s1_i = vmulq_f32 (q_in_r89ab, q2_tw2.val[1]);
    q_s2_r = vmulq_f32 (q_in_rcdef, q2_tw3.val[0]);
    q_s2_i = vmulq_f32 (q_in_rcdef, q2_tw3.val[1]);
    q_s0_r = vmlsq_f32 (q_s0_r, q_in_i4567, q2_tw1.val[1]);
    q_s0_i = vmlaq_f32 (q_s0_i, q_in_i4567, q2_tw1.val[0]);
    q_s1_r = vmlsq_f32 (q_s1_r, q_in_i89ab, q2_tw2.val[1]);
    q_s1_i = vmlaq_f32 (q_s1_i, q_in_i89ab, q2_tw2.val[0]);
    q_s2_r = vmlsq_f32 (q_s2_r, q_in_icdef, q2_tw3.val[1]);
    q_s2_i = vmlaq_f32 (q_s2_i, q_in_icdef, q2_tw3.val[0]);

    q_s5_r = vsubq_f32 (q_in_r0123, q_s1_r);
    q_s5_i = vsubq_f32 (q_in_i0123, q_s1_i);
    q2_out_0123.val[0] = vaddq_f32 (q_in_r0123, q_s1_r);
    q2_out_0123.val[1] = vaddq_f32 (q_in_i0123, q_s1_i);

    q_s3_r = vaddq_f32 (q_s0_r, q_s2_r);
    q_s3_i = vaddq_f32 (q_s0_i, q_s2_i);
    q_s4_r = vsubq_f32 (q_s0_r, q_s2_r);
    q_s4_i = vsubq_f32 (q_s0_i, q_s2_i);

    q2_out_89ab.val[0] = vsubq_f32 (q2_out_0123.val[0], q_s3_r);
    q2_out_89ab.val[1] = vsubq_f32 (q2_out_0123.val[1], q_s3_i);
    q2_out_0123.val[0] = vaddq_f32 (q2_out_0123.val[0], q_s3_r);
    q2_out_0123.val[1] = vaddq_f32 (q2_out_0123.val[1], q_s3_i);

    q2_out_4567.val[0] = vaddq_f32 (q_s5_r, q_s4_i);
    q2_out_4567.val[1] = vsubq_f32 (q_s5_i, q_s4_r);
    q2_out_cdef.val[0] = vsubq_f32 (q_s5_r, q_s4_i);
    q2_out_cdef.val[1] = vaddq_f32 (q_s5_i, q_s4_r);

    vst2q_f32 (p_dst0, q2_out_0123);
    vst2q_f32 (p_dst1, q2_out_4567);
    vst2q_f32 (p_dst2, q2_out_89ab);
    vst2q_f32 (p_dst3, q2_out_cdef);
}
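/*
 * vld2q_f32/vst2q_f32 deinterleave and reinterleave the {r, i} pairs, so
 * each float32x4_t holds one component of four complex values and the
 * radix-4 butterflies above run four at a time. The vzipq/vcombine block
 * between the stages is a 4x4 transpose regrouping stage-1 outputs for
 * stage 2, and each twiddle multiply is the deinterleaved complex product:
 *   c_r = a_r*w_r - a_i*w_i   (vmulq_f32 then vmlsq_f32)
 *   c_i = a_r*w_i + a_i*w_r   (vmulq_f32 then vmlaq_f32)
 */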
static void ne10_fft16_backward_float32_neon (ne10_fft_cpx_float32_t * Fout,
                                              ne10_fft_cpx_float32_t * Fin,
                                              ne10_fft_cpx_float32_t * twiddles)
{
    ne10_fft_cpx_float32_t *tw1, *tw2, *tw3;

    // the first stage
    float32_t *p_src0, *p_src4, *p_src8, *p_src12;
    float32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef;
    float32x4_t q_t0_r, q_t0_i, q_t1_r, q_t1_i, q_t2_r, q_t2_i, q_t3_r, q_t3_i;
    float32x4_t q_out_r048c, q_out_i048c, q_out_r159d, q_out_i159d;
    float32x4_t q_out_r26ae, q_out_i26ae, q_out_r37bf, q_out_i37bf;
    p_src0 = (float32_t*) (& (Fin[0]));
    p_src4 = (float32_t*) (& (Fin[4]));
    p_src8 = (float32_t*) (& (Fin[8]));
    p_src12 = (float32_t*) (& (Fin[12]));
    q2_in_0123 = vld2q_f32 (p_src0);
    q2_in_4567 = vld2q_f32 (p_src4);
    q2_in_89ab = vld2q_f32 (p_src8);
    q2_in_cdef = vld2q_f32 (p_src12);

    q_t2_r = vsubq_f32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t2_i = vsubq_f32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
    q_t3_r = vaddq_f32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
    q_t3_i = vaddq_f32 (q2_in_0123.val[1], q2_in_89ab.val[1]);

    q_t0_r = vaddq_f32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t0_i = vaddq_f32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
    q_t1_r = vsubq_f32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
    q_t1_i = vsubq_f32 (q2_in_4567.val[1], q2_in_cdef.val[1]);

    q_out_r26ae = vsubq_f32 (q_t3_r, q_t0_r);
    q_out_i26ae = vsubq_f32 (q_t3_i, q_t0_i);
    q_out_r048c = vaddq_f32 (q_t3_r, q_t0_r);
    q_out_i048c = vaddq_f32 (q_t3_i, q_t0_i);
    q_out_r159d = vsubq_f32 (q_t2_r, q_t1_i);
    q_out_i159d = vaddq_f32 (q_t2_i, q_t1_r);
    q_out_r37bf = vaddq_f32 (q_t2_r, q_t1_i);
    q_out_i37bf = vsubq_f32 (q_t2_i, q_t1_r);

    // the second stage
    float32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3;
    float32_t *p_tw1, *p_tw2, *p_tw3;
    float32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i;
    float32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i;
    float32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3;
    float32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef;
    float32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef;
    float32x4x2_t q2_tw1, q2_tw2, q2_tw3;
    float32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;
    float32x4_t q_one_by_nfft;
    tw1 = twiddles;
    tw2 = twiddles + 4;
    tw3 = twiddles + 8;
    p_dst0 = (float32_t*) (&Fout[0]);
    p_dst1 = (float32_t*) (&Fout[4]);
    p_dst2 = (float32_t*) (&Fout[8]);
    p_dst3 = (float32_t*) (&Fout[12]);
    p_tw1 = (float32_t*) tw1;
    p_tw2 = (float32_t*) tw2;
    p_tw3 = (float32_t*) tw3;
    q2_tmp_0 = vzipq_f32 (q_out_r048c, q_out_r159d);
    q2_tmp_1 = vzipq_f32 (q_out_i048c, q_out_i159d);
    q2_tmp_2 = vzipq_f32 (q_out_r26ae, q_out_r37bf);
    q2_tmp_3 = vzipq_f32 (q_out_i26ae, q_out_i37bf);
    q_in_r0123 = vcombine_f32 (vget_low_f32 (q2_tmp_0.val[0]), vget_low_f32 (q2_tmp_2.val[0]));
    q_in_i0123 = vcombine_f32 (vget_low_f32 (q2_tmp_1.val[0]), vget_low_f32 (q2_tmp_3.val[0]));
    q_in_r4567 = vcombine_f32 (vget_high_f32 (q2_tmp_0.val[0]), vget_high_f32 (q2_tmp_2.val[0]));
    q_in_i4567 = vcombine_f32 (vget_high_f32 (q2_tmp_1.val[0]), vget_high_f32 (q2_tmp_3.val[0]));
    q_in_r89ab = vcombine_f32 (vget_low_f32 (q2_tmp_0.val[1]), vget_low_f32 (q2_tmp_2.val[1]));
    q_in_i89ab = vcombine_f32 (vget_low_f32 (q2_tmp_1.val[1]), vget_low_f32 (q2_tmp_3.val[1]));
    q_in_rcdef = vcombine_f32 (vget_high_f32 (q2_tmp_0.val[1]), vget_high_f32 (q2_tmp_2.val[1]));
    q_in_icdef = vcombine_f32 (vget_high_f32 (q2_tmp_1.val[1]), vget_high_f32 (q2_tmp_3.val[1]));
    q2_tw1 = vld2q_f32 (p_tw1);
    q2_tw2 = vld2q_f32 (p_tw2);
    q2_tw3 = vld2q_f32 (p_tw3);

    q_s0_r = vmulq_f32 (q_in_r4567, q2_tw1.val[0]);
    q_s0_i = vmulq_f32 (q_in_i4567, q2_tw1.val[0]);
    q_s1_r = vmulq_f32 (q_in_r89ab, q2_tw2.val[0]);
    q_s1_i = vmulq_f32 (q_in_i89ab, q2_tw2.val[0]);
    q_s2_r = vmulq_f32 (q_in_rcdef, q2_tw3.val[0]);
    q_s2_i = vmulq_f32 (q_in_icdef, q2_tw3.val[0]);
    q_s0_r = vmlaq_f32 (q_s0_r, q_in_i4567, q2_tw1.val[1]);
    q_s0_i = vmlsq_f32 (q_s0_i, q_in_r4567, q2_tw1.val[1]);
    q_s1_r = vmlaq_f32 (q_s1_r, q_in_i89ab, q2_tw2.val[1]);
    q_s1_i = vmlsq_f32 (q_s1_i, q_in_r89ab, q2_tw2.val[1]);
    q_s2_r = vmlaq_f32 (q_s2_r, q_in_icdef, q2_tw3.val[1]);
    q_s2_i = vmlsq_f32 (q_s2_i, q_in_rcdef, q2_tw3.val[1]);

    q_s5_r = vsubq_f32 (q_in_r0123, q_s1_r);
    q_s5_i = vsubq_f32 (q_in_i0123, q_s1_i);
    q2_out_0123.val[0] = vaddq_f32 (q_in_r0123, q_s1_r);
    q2_out_0123.val[1] = vaddq_f32 (q_in_i0123, q_s1_i);

    q_s3_r = vaddq_f32 (q_s0_r, q_s2_r);
    q_s3_i = vaddq_f32 (q_s0_i, q_s2_i);
    q_s4_r = vsubq_f32 (q_s0_r, q_s2_r);
    q_s4_i = vsubq_f32 (q_s0_i, q_s2_i);

    q_one_by_nfft = vdupq_n_f32 (0.0625f);
    q2_out_89ab.val[0] = vsubq_f32 (q2_out_0123.val[0], q_s3_r);
    q2_out_89ab.val[1] = vsubq_f32 (q2_out_0123.val[1], q_s3_i);
    q2_out_0123.val[0] = vaddq_f32 (q2_out_0123.val[0], q_s3_r);
    q2_out_0123.val[1] = vaddq_f32 (q2_out_0123.val[1], q_s3_i);

    q2_out_4567.val[0] = vsubq_f32 (q_s5_r, q_s4_i);
    q2_out_4567.val[1] = vaddq_f32 (q_s5_i, q_s4_r);
    q2_out_cdef.val[0] = vaddq_f32 (q_s5_r, q_s4_i);
    q2_out_cdef.val[1] = vsubq_f32 (q_s5_i, q_s4_r);

    q2_out_89ab.val[0] = vmulq_f32 (q2_out_89ab.val[0], q_one_by_nfft);
    q2_out_89ab.val[1] = vmulq_f32 (q2_out_89ab.val[1], q_one_by_nfft);
    q2_out_0123.val[0] = vmulq_f32 (q2_out_0123.val[0], q_one_by_nfft);
    q2_out_0123.val[1] = vmulq_f32 (q2_out_0123.val[1], q_one_by_nfft);
    q2_out_4567.val[0] = vmulq_f32 (q2_out_4567.val[0], q_one_by_nfft);
    q2_out_4567.val[1] = vmulq_f32 (q2_out_4567.val[1], q_one_by_nfft);
    q2_out_cdef.val[0] = vmulq_f32 (q2_out_cdef.val[0], q_one_by_nfft);
    q2_out_cdef.val[1] = vmulq_f32 (q2_out_cdef.val[1], q_one_by_nfft);

    vst2q_f32 (p_dst0, q2_out_0123);
    vst2q_f32 (p_dst1, q2_out_4567);
    vst2q_f32 (p_dst2, q2_out_89ab);
    vst2q_f32 (p_dst3, q2_out_cdef);
}
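/*
 * Relative to the forward path, the backward version multiplies by the
 * conjugated twiddles (the vmlaq_f32/vmlsq_f32 roles are swapped) and folds
 * the 1/16 normalisation into the final stage through q_one_by_nfft =
 * vdupq_n_f32 (0.0625f), avoiding a separate scaling pass.
 */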
static void ne10_fft_split_r2c_1d_float32_neon (ne10_fft_cpx_float32_t *dst,
                                                const ne10_fft_cpx_float32_t *src,
                                                ne10_fft_cpx_float32_t *twiddles,
                                                ne10_int32_t ncfft)
{
    ne10_int32_t k;
    ne10_int32_t count = ncfft / 2;
    ne10_fft_cpx_float32_t fpnk, fpk, f1k, f2k, tw, tdc;
    float32x4x2_t q2_fpk, q2_fpnk, q2_tw, q2_dst, q2_dst2;
    float32x4_t q_fpnk_r, q_fpnk_i;
    float32x4_t q_f1k_r, q_f1k_i, q_f2k_r, q_f2k_i;
    float32x4_t q_tw_r, q_tw_i;
    float32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_val;
    float32x4_t q_dst_r, q_dst_i, q_dst2_r, q_dst2_i;
    float32_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;

    tdc.r = src[0].r;
    tdc.i = src[0].i;

    dst[0].r = tdc.r + tdc.i;
    dst[ncfft].r = tdc.r - tdc.i;
    dst[ncfft].i = dst[0].i = 0;

    if (count >= 4)
    {
        for (k = 1; k <= count; k += 4)
        {
            p_src = (float32_t*) (& (src[k]));
            p_src2 = (float32_t*) (& (src[ncfft - k - 3]));
            p_twiddles = (float32_t*) (& (twiddles[k - 1]));
            p_dst = (float32_t*) (& (dst[k]));
            p_dst2 = (float32_t*) (& (dst[ncfft - k - 3]));

            q2_fpk = vld2q_f32 (p_src);
            q2_fpnk = vld2q_f32 (p_src2);
            q2_tw = vld2q_f32 (p_twiddles);
            q2_fpnk.val[0] = vrev64q_f32 (q2_fpnk.val[0]);
            q2_fpnk.val[1] = vrev64q_f32 (q2_fpnk.val[1]);
            q_fpnk_r = vcombine_f32 (vget_high_f32 (q2_fpnk.val[0]), vget_low_f32 (q2_fpnk.val[0]));
            q_fpnk_i = vcombine_f32 (vget_high_f32 (q2_fpnk.val[1]), vget_low_f32 (q2_fpnk.val[1]));
            q_fpnk_i = vnegq_f32 (q_fpnk_i);

            q_f1k_r = vaddq_f32 (q2_fpk.val[0], q_fpnk_r);
            q_f1k_i = vaddq_f32 (q2_fpk.val[1], q_fpnk_i);

            q_f2k_r = vsubq_f32 (q2_fpk.val[0], q_fpnk_r);
            q_f2k_i = vsubq_f32 (q2_fpk.val[1], q_fpnk_i);

            q_tmp0 = vmulq_f32 (q_f2k_r, q2_tw.val[0]);
            q_tmp1 = vmulq_f32 (q_f2k_i, q2_tw.val[1]);
            q_tmp2 = vmulq_f32 (q_f2k_r, q2_tw.val[1]);
            q_tmp3 = vmulq_f32 (q_f2k_i, q2_tw.val[0]);
            q_tw_r = vsubq_f32 (q_tmp0, q_tmp1);
            q_tw_i = vaddq_f32 (q_tmp2, q_tmp3);

            q_val = vdupq_n_f32 (0.5f);
            q_dst2_r = vsubq_f32 (q_f1k_r, q_tw_r);
            q_dst2_i = vsubq_f32 (q_tw_i, q_f1k_i);
            q_dst_r = vaddq_f32 (q_f1k_r, q_tw_r);
            q_dst_i = vaddq_f32 (q_f1k_i, q_tw_i);
            q_dst2_r = vmulq_f32 (q_dst2_r, q_val);
            q_dst2_i = vmulq_f32 (q_dst2_i, q_val);
            q2_dst.val[0] = vmulq_f32 (q_dst_r, q_val);
            q2_dst.val[1] = vmulq_f32 (q_dst_i, q_val);
            q_dst2_r = vrev64q_f32 (q_dst2_r);
            q_dst2_i = vrev64q_f32 (q_dst2_i);
            q2_dst2.val[0] = vcombine_f32 (vget_high_f32 (q_dst2_r), vget_low_f32 (q_dst2_r));
            q2_dst2.val[1] = vcombine_f32 (vget_high_f32 (q_dst2_i), vget_low_f32 (q_dst2_i));
            vst2q_f32 (p_dst, q2_dst);
            vst2q_f32 (p_dst2, q2_dst2);
        }
    }
    else
    {
        for (k = 1; k <= count; k++)
        {
            fpk = src[k];
            fpnk.r = src[ncfft - k].r;
            fpnk.i = - src[ncfft - k].i;

            f1k.r = fpk.r + fpnk.r;
            f1k.i = fpk.i + fpnk.i;

            f2k.r = fpk.r - fpnk.r;
            f2k.i = fpk.i - fpnk.i;

            tw.r = f2k.r * (twiddles[k - 1]).r - f2k.i * (twiddles[k - 1]).i;
            tw.i = f2k.r * (twiddles[k - 1]).i + f2k.i * (twiddles[k - 1]).r;

            dst[k].r = (f1k.r + tw.r) * 0.5f;
            dst[k].i = (f1k.i + tw.i) * 0.5f;
            dst[ncfft - k].r = (f1k.r - tw.r) * 0.5f;
            dst[ncfft - k].i = (tw.i - f1k.i) * 0.5f;
        }
    }
}
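/*
 * This is the standard split step for deriving a length-2N real FFT from a
 * length-N complex FFT (here N = ncfft): with F[k] the complex transform of
 * the packed input,
 *   f1k = F[k] + conj(F[N - k])        (even part)
 *   f2k = F[k] - conj(F[N - k])        (odd part)
 *   dst[k]     = (f1k + W[k] * f2k) / 2
 *   dst[N - k] = conj(f1k - W[k] * f2k) / 2
 * with W[k] the super twiddles. The NEON branch processes four k per
 * iteration, using vrev64q/vcombine to reverse the order of the F[N - k]
 * block; the scalar loop handles small transforms.
 */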
static void ne10_fft_split_c2r_1d_float32_neon (ne10_fft_cpx_float32_t *dst,
                                                const ne10_fft_cpx_float32_t *src,
                                                ne10_fft_cpx_float32_t *twiddles,
                                                ne10_int32_t ncfft)
{
    ne10_int32_t k;
    ne10_int32_t count = ncfft / 2;
    ne10_fft_cpx_float32_t fk, fnkc, fek, fok, tmp;
    float32x4x2_t q2_fk, q2_fnkc, q2_tw, q2_dst, q2_dst2;
    float32x4_t q_fnkc_r, q_fnkc_i;
    float32x4_t q_fek_r, q_fek_i, q_fok_r, q_fok_i;
    float32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_val;
    float32x4_t q_dst2_r, q_dst2_i;
    float32_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;

    dst[0].r = (src[0].r + src[ncfft].r) * 0.5f;
    dst[0].i = (src[0].r - src[ncfft].r) * 0.5f;

    if (count >= 4)
    {
        for (k = 1; k <= count; k += 4)
        {
            p_src = (float32_t*) (& (src[k]));
            p_src2 = (float32_t*) (& (src[ncfft - k - 3]));
            p_twiddles = (float32_t*) (& (twiddles[k - 1]));
            p_dst = (float32_t*) (& (dst[k]));
            p_dst2 = (float32_t*) (& (dst[ncfft - k - 3]));

            q2_fk = vld2q_f32 (p_src);
            q2_fnkc = vld2q_f32 (p_src2);
            q2_tw = vld2q_f32 (p_twiddles);
            q2_fnkc.val[0] = vrev64q_f32 (q2_fnkc.val[0]);
            q2_fnkc.val[1] = vrev64q_f32 (q2_fnkc.val[1]);
            q_fnkc_r = vcombine_f32 (vget_high_f32 (q2_fnkc.val[0]), vget_low_f32 (q2_fnkc.val[0]));
            q_fnkc_i = vcombine_f32 (vget_high_f32 (q2_fnkc.val[1]), vget_low_f32 (q2_fnkc.val[1]));
            q_fnkc_i = vnegq_f32 (q_fnkc_i);

            q_fek_r = vaddq_f32 (q2_fk.val[0], q_fnkc_r);
            q_fek_i = vaddq_f32 (q2_fk.val[1], q_fnkc_i);

            q_tmp0 = vsubq_f32 (q2_fk.val[0], q_fnkc_r);
            q_tmp1 = vsubq_f32 (q2_fk.val[1], q_fnkc_i);

            q_fok_r = vmulq_f32 (q_tmp0, q2_tw.val[0]);
            q_fok_i = vmulq_f32 (q_tmp1, q2_tw.val[0]);
            q_tmp2 = vmulq_f32 (q_tmp1, q2_tw.val[1]);
            q_tmp3 = vmulq_f32 (q_tmp0, q2_tw.val[1]);
            q_fok_r = vaddq_f32 (q_fok_r, q_tmp2);
            q_fok_i = vsubq_f32 (q_fok_i, q_tmp3);

            q_val = vdupq_n_f32 (0.5f);
            q_dst2_r = vsubq_f32 (q_fek_r, q_fok_r);
            q_dst2_i = vsubq_f32 (q_fok_i, q_fek_i);
            q2_dst.val[0] = vaddq_f32 (q_fek_r, q_fok_r);
            q2_dst.val[1] = vaddq_f32 (q_fek_i, q_fok_i);
            q_dst2_r = vmulq_f32 (q_dst2_r, q_val);
            q_dst2_i = vmulq_f32 (q_dst2_i, q_val);
            q2_dst.val[0] = vmulq_f32 (q2_dst.val[0], q_val);
            q2_dst.val[1] = vmulq_f32 (q2_dst.val[1], q_val);
            q_dst2_r = vrev64q_f32 (q_dst2_r);
            q_dst2_i = vrev64q_f32 (q_dst2_i);
            q2_dst2.val[0] = vcombine_f32 (vget_high_f32 (q_dst2_r), vget_low_f32 (q_dst2_r));
            q2_dst2.val[1] = vcombine_f32 (vget_high_f32 (q_dst2_i), vget_low_f32 (q_dst2_i));
            vst2q_f32 (p_dst, q2_dst);
            vst2q_f32 (p_dst2, q2_dst2);
        }
    }
    else
    {
        for (k = 1; k <= count; k++)
        {
            fk = src[k];
            fnkc.r = src[ncfft - k].r;
            fnkc.i = -src[ncfft - k].i;

            fek.r = fk.r + fnkc.r;
            fek.i = fk.i + fnkc.i;

            tmp.r = fk.r - fnkc.r;
            tmp.i = fk.i - fnkc.i;

            fok.r = tmp.r * twiddles[k - 1].r + tmp.i * twiddles[k - 1].i;
            fok.i = tmp.i * twiddles[k - 1].r - tmp.r * twiddles[k - 1].i;

            dst[k].r = (fek.r + fok.r) * 0.5f;
            dst[k].i = (fek.i + fok.i) * 0.5f;

            dst[ncfft - k].r = (fek.r - fok.r) * 0.5f;
            dst[ncfft - k].i = (fok.i - fek.i) * 0.5f;
        }
    }
}
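/*
 * This is the inverse of the split step above: fek and fok recover the even
 * and odd parts of the original complex transform, with the odd part
 * multiplied by the conjugated super twiddle (hence the + tmp.i *
 * twiddles[k - 1].i term), and dst[ncfft - k] is stored conjugate-reversed
 * so the subsequent inverse c2c FFT reproduces the real sequence.
 */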
/**
 * @addtogroup C2C_FFT_IFFT
 * @{
 */

/**
 * @brief Mixed radix-2/3/4/5 complex FFT/IFFT of float(32-bit) data.
 */
void ne10_fft_c2c_1d_float32_neon (ne10_fft_cpx_float32_t *fout,
                                   ne10_fft_cpx_float32_t *fin,
                                   ne10_fft_cfg_float32_t cfg,
                                   ne10_int32_t inverse_fft)
{
    // For inputs shorter than 16 points, fall back to the C version;
    // NEON would bring little improvement at these sizes.
    if (cfg->nfft < 16)
    {
        ne10_fft_c2c_1d_float32_c (fout, fin, cfg, inverse_fft);
        return;
    }

    ne10_int32_t stage_count = cfg->factors[0];
    ne10_int32_t algorithm_flag = cfg->factors[2 * (stage_count + 1)];

    assert ((algorithm_flag == NE10_FFT_ALG_24)
            || (algorithm_flag == NE10_FFT_ALG_ANY));

    // For NE10_FFT_ALG_ANY, the generic mixed-radix butterflies handle the
    // transform; the function returns inside this branch.
    if (algorithm_flag == NE10_FFT_ALG_ANY)
    {
        if (inverse_fft)
        {
            ne10_mixed_radix_generic_butterfly_inverse_float32_neon (fout, fin,
                    cfg->factors, cfg->twiddles, cfg->buffer, cfg->is_backward_scaled);
        }
        else
        {
            ne10_mixed_radix_generic_butterfly_float32_neon (fout, fin,
                    cfg->factors, cfg->twiddles, cfg->buffer, cfg->is_forward_scaled);
        }
        return;
    }

    // Having passed the assertion and skipped the branch above,
    // algorithm_flag must be NE10_FFT_ALG_24.
    if (inverse_fft)
    {
        switch (cfg->nfft)
        {
        case 4:
            ne10_fft4_backward_float32 (fout, fin);
            break;
        case 8:
            ne10_fft8_backward_float32 (fout, fin);
            break;
        case 16:
            ne10_fft16_backward_float32_neon (fout, fin, cfg->twiddles);
            break;
        default:
            ne10_mixed_radix_fft_backward_float32_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
            break;
        }
    }
    else
    {
        switch (cfg->nfft)
        {
        case 4:
            ne10_fft4_forward_float32 (fout, fin);
            break;
        case 8:
            ne10_fft8_forward_float32 (fout, fin);
            break;
        case 16:
            ne10_fft16_forward_float32_neon (fout, fin, cfg->twiddles);
            break;
        default:
            ne10_mixed_radix_fft_forward_float32_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
            break;
        }
    }
}
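/*
 * Typical usage (a sketch, assuming the allocation helpers declared in
 * NE10_dsp.h; consult that header for the exact names in your version):
 *
 *   ne10_fft_cfg_float32_t cfg = ne10_fft_alloc_c2c_float32_neon (1024);
 *   ne10_fft_c2c_1d_float32_neon (dst, src, cfg, 0); // forward FFT
 *   ne10_fft_c2c_1d_float32_neon (src, dst, cfg, 1); // inverse FFT
 *   ne10_fft_destroy_c2c_float32 (cfg);
 */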
/**
 * @}
 */ //end of C2C_FFT_IFFT group
/**
 * @brief Mixed radix-2/4 FFT (real to complex) of float(32-bit) data.
 */
void ne10_fft_r2c_1d_float32_neon (ne10_fft_cpx_float32_t *fout,
                                   ne10_float32_t *fin,
                                   ne10_fft_r2c_cfg_float32_t cfg)
{
    ne10_fft_cpx_float32_t * tmpbuf1 = cfg->buffer;
    ne10_fft_cpx_float32_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
    ne10_fft_state_float32_t c2c_state;

    c2c_state.nfft = cfg->ncfft;
    c2c_state.factors = cfg->factors;
    c2c_state.twiddles = cfg->twiddles;
    c2c_state.buffer = tmpbuf2;

    ne10_fft_c2c_1d_float32_neon (tmpbuf1, (ne10_fft_cpx_float32_t*) fin, &c2c_state, 0);
    ne10_fft_split_r2c_1d_float32_neon (fout, tmpbuf1, cfg->super_twiddles, cfg->ncfft);
}
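/*
 * The real-to-complex FFT is computed in two steps: the 2*ncfft real input
 * samples are reinterpreted as ncfft complex points and run through the
 * complex FFT above, and the split step then untangles the packed spectrum
 * into the ncfft + 1 unique bins of the real transform. This roughly halves
 * the work of a full-length complex transform.
 */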
/**
 * @brief Mixed radix-2/4 IFFT (complex to real) of float(32-bit) data.
 */
void ne10_fft_c2r_1d_float32_neon (ne10_float32_t *fout,
                                   ne10_fft_cpx_float32_t *fin,
                                   ne10_fft_r2c_cfg_float32_t cfg)
{
    ne10_fft_cpx_float32_t * tmpbuf1 = cfg->buffer;
    ne10_fft_cpx_float32_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
    ne10_fft_state_float32_t c2c_state;

    c2c_state.nfft = cfg->ncfft;
    c2c_state.factors = cfg->factors;
    c2c_state.twiddles = cfg->twiddles;
    c2c_state.buffer = tmpbuf2;

    ne10_fft_split_c2r_1d_float32_neon (tmpbuf1, fin, cfg->super_twiddles, cfg->ncfft);
    ne10_fft_c2c_1d_float32_neon ( (ne10_fft_cpx_float32_t*) fout, tmpbuf1, &c2c_state, 1);
}
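/*
 * The complex-to-real inverse runs the same pipeline in reverse: the split
 * step first repacks the ncfft + 1 spectrum bins into ncfft complex points,
 * and the inverse complex FFT then writes the 2*ncfft real samples through
 * the complex-typed fout pointer.
 */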