Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
Loading...
Searching...
No Matches
NE10_fft_generic_float32.h
1/*
2 * Copyright 2014-15 ARM Limited and Contributors.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of ARM Limited nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28/* license of Kiss FFT */
29/*
30Copyright (c) 2003-2010, Mark Borgerding
31
32All rights reserved.
33
34Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
35
36 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
37 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
38 * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
39
40THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41*/
42
43/*
44 * NE10 Library : dsp/NE10_fft_generic_float32.h
45 */
46
47#ifndef NE10_FFT_GENERIC_FLOAT32_H
48#define NE10_FFT_GENERIC_FLOAT32_H
49
50#include "NE10_types.h"
51#include "NE10_macros.h"
52#include "NE10_fft.h"
53
55// Multiply input with twiddles
57static inline void FFT2_MUL_TW (ne10_fft_cpx_float32_t scratch_out[2],
58 const ne10_fft_cpx_float32_t scratch_in[2],
59 const ne10_fft_cpx_float32_t scratch_tw[1])
60{
61 scratch_out[0] = scratch_in[0];
62 NE10_CPX_MUL_F32 (scratch_out[1], scratch_in[1], scratch_tw[0]);
63}
64
65static inline void FFT3_MUL_TW (ne10_fft_cpx_float32_t scratch_out[3],
66 const ne10_fft_cpx_float32_t scratch_in[3],
67 const ne10_fft_cpx_float32_t scratch_tw[2])
68{
69 FFT2_MUL_TW (scratch_out, scratch_in, scratch_tw);
70 NE10_CPX_MUL_F32 (scratch_out[2], scratch_in[2], scratch_tw[1]);
71}
72
73static inline void FFT4_MUL_TW (ne10_fft_cpx_float32_t scratch_out[4],
74 const ne10_fft_cpx_float32_t scratch_in[4],
75 const ne10_fft_cpx_float32_t scratch_tw[3])
76{
77 FFT3_MUL_TW (scratch_out, scratch_in, scratch_tw);
78 NE10_CPX_MUL_F32 (scratch_out[3], scratch_in[3], scratch_tw[2]);
79}
80
81static inline void FFT8_FCU (ne10_fft_cpx_float32_t out[8],
82 const ne10_fft_cpx_float32_t in[8])
83{
85
86 static const ne10_fft_cpx_float32_t TW_8[4] = {
87 { 1.00000, 0.00000 },
88 { 0.70711, -0.70711 },
89 { 0.00000, -1.00000 },
90 { -0.70711, -0.70711 },
91 };
92
93#define NE10_BUTTERFLY_INDEX_F32(OUT,IN,OUT_I,OUT_J,IN_I,IN_J) \
94 do { \
95 NE10_CPX_ADD (OUT[OUT_I],IN[IN_I],IN[IN_J]); \
96 NE10_CPX_SUB (OUT[OUT_J],IN[IN_I],IN[IN_J]); \
97 } while (0)
98
99 // STAGE - 1
100 // in -> s
101 {
102 NE10_BUTTERFLY_INDEX_F32 (s,in,0,4,0,4);
103 NE10_BUTTERFLY_INDEX_F32 (s,in,1,5,1,5);
104 NE10_BUTTERFLY_INDEX_F32 (s,in,2,6,2,6);
105 NE10_BUTTERFLY_INDEX_F32 (s,in,3,7,3,7);
106 }
107
108 // STAGE - 2
109 // s -> out
110 {
111 // TW
112#define NE10_CPX_MUL_TW8_F32(OUT,TW_8_TABLE,OUT_I,TW_J) \
113 do { \
114 ne10_fft_cpx_float32_t TW_TMP = TW_8_TABLE[TW_J]; \
115 NE10_CPX_MUL_F32 (OUT[OUT_I],OUT[OUT_I],TW_TMP); \
116 } while (0)
117
118 NE10_CPX_MUL_TW8_F32 (s,TW_8,4,0);
119 NE10_CPX_MUL_TW8_F32 (s,TW_8,5,1);
120 NE10_CPX_MUL_TW8_F32 (s,TW_8,6,2);
121 NE10_CPX_MUL_TW8_F32 (s,TW_8,7,3);
122
123 NE10_BUTTERFLY_INDEX_F32 (out,s,0,2,0,2);
124 NE10_BUTTERFLY_INDEX_F32 (out,s,1,3,1,3);
125 NE10_BUTTERFLY_INDEX_F32 (out,s,4,6,4,6);
126 NE10_BUTTERFLY_INDEX_F32 (out,s,5,7,5,7);
127 }
128 // STAGE - 3
129 // out -> s
130 {
131 // TW
132 NE10_CPX_MUL_TW8_F32 (out,TW_8,2,0);
133 NE10_CPX_MUL_TW8_F32 (out,TW_8,3,2);
134 NE10_CPX_MUL_TW8_F32 (out,TW_8,6,0);
135 NE10_CPX_MUL_TW8_F32 (out,TW_8,7,2);
136#undef NE10_CPX_MUL_TW8_F32
137
138 NE10_BUTTERFLY_INDEX_F32 (s,out,0,4,0,1);
139 NE10_BUTTERFLY_INDEX_F32 (s,out,2,6,2,3);
140 NE10_BUTTERFLY_INDEX_F32 (s,out,1,5,4,5);
141 NE10_BUTTERFLY_INDEX_F32 (s,out,3,7,6,7);
142 }
143
144 out[0] = s[0];
145 out[1] = s[1];
146 out[2] = s[2];
147 out[3] = s[3];
148 out[4] = s[4];
149 out[5] = s[5];
150 out[6] = s[6];
151 out[7] = s[7];
152}
153
154static inline void FFT5_MUL_TW (ne10_fft_cpx_float32_t scratch_out[5],
155 const ne10_fft_cpx_float32_t scratch_in[5],
156 const ne10_fft_cpx_float32_t scratch_tw[4])
157{
158 FFT4_MUL_TW (scratch_out, scratch_in, scratch_tw);
159 NE10_CPX_MUL_F32 (scratch_out[4], scratch_in[4], scratch_tw[3]);
160}
161
163// FFT Kernel
164// F: Forward
165// C: Complex
166// U: Unscaled
168static inline void FFT2_FCU (ne10_fft_cpx_float32_t scratch_out[2],
169 const ne10_fft_cpx_float32_t scratch_in[2])
170{
171 NE10_CPX_ADD (scratch_out[0], scratch_in[0], scratch_in[1]);
172 NE10_CPX_SUB (scratch_out[1], scratch_in[0], scratch_in[1]);
173}
174
175static inline void FFT3_FCU (ne10_fft_cpx_float32_t Fout[3],
176 const ne10_fft_cpx_float32_t Fin[3])
177{
178 ne10_fft_cpx_float32_t scratch[4];
179 ne10_fft_cpx_float32_t scratch_in[3];
180
181 scratch_in[0] = Fin[0];
182 scratch_in[1] = Fin[1];
183 scratch_in[2] = Fin[2];
184
185 scratch[1] = scratch_in[1];
186 scratch[2] = scratch_in[2];
187
188 NE10_CPX_ADD (scratch[3], scratch[1], scratch[2]);
189 NE10_CPX_SUB (scratch[0], scratch[1], scratch[2]);
190
191 scratch_in[1].r = scratch_in[0].r - scratch[3].r * 0.5;
192 scratch_in[1].i = scratch_in[0].i - scratch[3].i * 0.5;
193
194 scratch[0].r *= -TW_3I_F32;
195 scratch[0].i *= -TW_3I_F32;
196
197 scratch_in[0].r += scratch[3].r;
198 scratch_in[0].i += scratch[3].i;
199
200 scratch_in[2].r = scratch_in[1].r + scratch[0].i;
201 scratch_in[2].i = scratch_in[1].i - scratch[0].r;
202
203 scratch_in[1].r -= scratch[0].i;
204 scratch_in[1].i += scratch[0].r;
205
206 Fout[0] = scratch_in[0];
207 Fout[1] = scratch_in[1];
208 Fout[2] = scratch_in[2];
209}
210
211static inline void FFT4_FCU (ne10_fft_cpx_float32_t scratch_out[4],
212 const ne10_fft_cpx_float32_t scratch_in[4])
213{
214 ne10_fft_cpx_float32_t scratch[4];
215
216 NE10_CPX_ADD (scratch[0], scratch_in[0], scratch_in[2]);
217 NE10_CPX_SUB (scratch[1], scratch_in[0], scratch_in[2]);
218 NE10_CPX_ADD (scratch[2], scratch_in[1], scratch_in[3]);
219 NE10_CPX_SUB (scratch[3], scratch_in[1], scratch_in[3]);
220
221 NE10_CPX_SUB (scratch_out[2], scratch[0], scratch[2]);
222 NE10_CPX_ADD (scratch_out[0], scratch[0], scratch[2]);
223
224 scratch_out[1].r = scratch[1].r + scratch[3].i;
225 scratch_out[1].i = scratch[1].i - scratch[3].r;
226 scratch_out[3].r = scratch[1].r - scratch[3].i;
227 scratch_out[3].i = scratch[1].i + scratch[3].r;
228}
229
230static inline void FFT4_FCU_INPLACE (ne10_fft_cpx_float32_t scratch_out[4])
231{
232 ne10_fft_cpx_float32_t scratch[4];
233
234 NE10_CPX_ADD (scratch[0], scratch_out[0], scratch_out[2]);
235 NE10_CPX_SUB (scratch[1], scratch_out[0], scratch_out[2]);
236 NE10_CPX_ADD (scratch[2], scratch_out[1], scratch_out[3]);
237 NE10_CPX_SUB (scratch[3], scratch_out[1], scratch_out[3]);
238
239 NE10_CPX_SUB (scratch_out[2], scratch[0], scratch[2]);
240 NE10_CPX_ADD (scratch_out[0], scratch[0], scratch[2]);
241
242 scratch_out[1].r = scratch[1].r + scratch[3].i;
243 scratch_out[1].i = scratch[1].i - scratch[3].r;
244 scratch_out[3].r = scratch[1].r - scratch[3].i;
245 scratch_out[3].i = scratch[1].i + scratch[3].r;
246}
247
248static inline void FFT5_FCU (ne10_fft_cpx_float32_t Fout[5],
249 const ne10_fft_cpx_float32_t Fin[5])
250{
251 ne10_fft_cpx_float32_t scratch[13], scratch_in[5];
252
253 scratch_in[0] = Fin[0];
254 scratch_in[1] = Fin[1];
255 scratch_in[2] = Fin[2];
256 scratch_in[3] = Fin[3];
257 scratch_in[4] = Fin[4];
258
259 scratch[0] = scratch_in[0];
260 scratch[1] = scratch_in[1];
261 scratch[2] = scratch_in[2];
262 scratch[3] = scratch_in[3];
263 scratch[4] = scratch_in[4];
264
265 NE10_CPX_ADD (scratch[ 7], scratch[1], scratch[4]);
266 NE10_CPX_SUB (scratch[10], scratch[1], scratch[4]);
267 NE10_CPX_ADD (scratch[ 8], scratch[2], scratch[3]);
268 NE10_CPX_SUB (scratch[ 9], scratch[2], scratch[3]);
269
270 scratch_in[0].r += scratch[7].r + scratch[8].r;
271 scratch_in[0].i += scratch[7].i + scratch[8].i;
272
273 scratch[5].r = scratch[0].r
274 + NE10_S_MUL (scratch[7].r, TW_5A_F32.r)
275 + NE10_S_MUL (scratch[8].r, TW_5B_F32.r);
276 scratch[5].i = scratch[0].i
277 + NE10_S_MUL (scratch[7].i, TW_5A_F32.r)
278 + NE10_S_MUL (scratch[8].i, TW_5B_F32.r);
279
280 scratch[6].r = NE10_S_MUL (scratch[10].i, TW_5A_F32.i)
281 + NE10_S_MUL (scratch[9].i, TW_5B_F32.i);
282 scratch[6].i = -NE10_S_MUL (scratch[10].r, TW_5A_F32.i)
283 - NE10_S_MUL (scratch[9].r, TW_5B_F32.i);
284
285 NE10_CPX_SUB (scratch_in[1], scratch[5], scratch[6]);
286 NE10_CPX_ADD (scratch_in[4], scratch[5], scratch[6]);
287
288 scratch[11].r = scratch[0].r
289 + NE10_S_MUL (scratch[7].r, TW_5B_F32.r)
290 + NE10_S_MUL (scratch[8].r, TW_5A_F32.r);
291 scratch[11].i = scratch[0].i
292 + NE10_S_MUL (scratch[7].i, TW_5B_F32.r)
293 + NE10_S_MUL (scratch[8].i, TW_5A_F32.r);
294
295 scratch[12].r = -NE10_S_MUL (scratch[10].i, TW_5B_F32.i)
296 + NE10_S_MUL (scratch[9].i, TW_5A_F32.i);
297 scratch[12].i = NE10_S_MUL (scratch[10].r, TW_5B_F32.i)
298 - NE10_S_MUL (scratch[9].r, TW_5A_F32.i);
299
300 NE10_CPX_ADD (scratch_in[2], scratch[11], scratch[12]);
301 NE10_CPX_SUB (scratch_in[3], scratch[11], scratch[12]);
302
303 Fout[0] = scratch_in[0];
304 Fout[1] = scratch_in[1];
305 Fout[2] = scratch_in[2];
306 Fout[3] = scratch_in[3];
307 Fout[4] = scratch_in[4];
308}
309#endif // NE10_FFT_GENERIC_FLOAT32_H