Mercurial > hg > peckfft
annotate peck_fft.c @ 11:abdcde012978
backup
author | Peter Meerwald <p.meerwald@bct-electronic.com> |
---|---|
date | Thu, 22 Sep 2011 15:19:18 +0200 |
parents | 05f6ab0a17c0 |
children | 655dc5c14169 |
rev | line source |
---|---|
0 | 1 /* |
2 Copyright (c) 2003-2010, Mark Borgerding | |
3 | |
4 All rights reserved. | |
5 | |
6 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: | |
7 | |
8 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. | |
9 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. | |
10 * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission. | |
11 | |
12 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
13 */ | |
14 | |
9 | 15 #include "armv7_cycles.h" |
0 | 16 #include "_peck_fft_guts.h" |
17 /* The guts header contains all the multiplication and addition macros that are defined for | |
1 | 18 * fixed or floating point complex numbers. It also delares the kf_ internal functions. |
0 | 19 */ |
9 | 20 #if !BFLY2_ASM |
0 | 21 static void kf_bfly2( |
10 | 22 peck_fft_cpx * __restrict Fout, |
4
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
23 const size_t fstride, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
24 const peck_fft_cfg st, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
25 int m) { |
0 | 26 |
7 | 27 // printf("kf_bfly2, %d\n", fstride); |
0 | 28 |
9 | 29 peck_fft_cpx * __restrict tw1 = st->twiddles; |
30 peck_fft_cpx * __restrict Fout2 = Fout + m; | |
1 | 31 do { |
9 | 32 peck_fft_cpx t; |
1 | 33 C_MUL(t, *Fout2, *tw1); |
0 | 34 tw1 += fstride; |
1 | 35 C_SUB(*Fout2, *Fout, t); |
36 C_ADDTO(*Fout, t); | |
0 | 37 ++Fout2; |
38 ++Fout; | |
1 | 39 } while (--m); |
0 | 40 } |
9 | 41 #endif |
0 | 42 |
10 | 43 #if !BFLY4_ASM |
0 | 44 static void kf_bfly4( |
10 | 45 peck_fft_cpx * __restrict Fout, |
4
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
46 const size_t fstride, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
47 const peck_fft_cfg st, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
48 const size_t m) { |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
49 |
10 | 50 peck_fft_cpx scratch[4]; |
51 peck_fft_cpx * __restrict tw1, * __restrict tw2, * __restrict tw3; | |
52 size_t k = m; | |
53 const size_t m2 = 2*m; | |
54 const size_t m3 = 3*m; | |
0 | 55 |
7 | 56 // printf("kf_bfly4, %d\n", fstride); |
0 | 57 |
58 tw3 = tw2 = tw1 = st->twiddles; | |
59 | |
10 | 60 if (st->inverse) { |
61 do { | |
62 C_MUL(scratch[0], Fout[m], *tw1); | |
63 C_MUL(scratch[3], Fout[m2], *tw2); | |
64 C_MUL(scratch[2], Fout[m3], *tw3); | |
1 | 65 |
10 | 66 C_SUB(scratch[1], *Fout, scratch[3]); |
67 C_ADDTO(*Fout, scratch[3]); | |
68 | |
69 C_ADD(scratch[3], scratch[0], scratch[2]); | |
70 C_SUB(Fout[m2], *Fout, scratch[3]); | |
71 C_ADDTO(*Fout, scratch[3]); | |
72 | |
73 tw1 += fstride; | |
74 tw2 += fstride*2; | |
75 tw3 += fstride*3; | |
76 | |
77 C_SUB(scratch[3], scratch[0], scratch[2]); | |
78 Fout[m].r = scratch[1].r - scratch[3].i; | |
79 Fout[m].i = scratch[1].i + scratch[3].r; | |
80 Fout[m3].r = scratch[1].r + scratch[3].i; | |
81 Fout[m3].i = scratch[1].i - scratch[3].r; | |
0 | 82 |
10 | 83 ++Fout; |
84 } while (--k); | |
85 } | |
86 else { | |
87 do { | |
88 C_MUL(scratch[0], Fout[m], *tw1); | |
89 C_MUL(scratch[3], Fout[m2], *tw2); | |
90 C_MUL(scratch[2], Fout[m3], *tw3); | |
91 | |
92 C_SUB(scratch[1], *Fout, scratch[3]); | |
93 C_ADDTO(*Fout, scratch[3]); | |
94 | |
95 C_ADD(scratch[3], scratch[0], scratch[2]); | |
96 C_SUB(Fout[m2], *Fout, scratch[3]); | |
97 C_ADDTO(*Fout, scratch[3]); | |
98 | |
99 tw1 += fstride; | |
100 tw2 += fstride*2; | |
101 tw3 += fstride*3; | |
102 | |
103 C_SUB(scratch[3], scratch[0], scratch[2]); | |
104 Fout[m].r = scratch[1].r + scratch[3].i; | |
105 Fout[m].i = scratch[1].i - scratch[3].r; | |
106 Fout[m3].r = scratch[1].r - scratch[3].i; | |
107 Fout[m3].i = scratch[1].i + scratch[3].r; | |
108 | |
109 ++Fout; | |
110 } while (--k); | |
111 } | |
0 | 112 } |
10 | 113 #endif |
0 | 114 |
115 static void kf_bfly3( | |
4
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
116 peck_fft_cpx * Fout, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
117 const size_t fstride, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
118 const peck_fft_cfg st, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
119 size_t m) { |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
120 |
0 | 121 size_t k=m; |
122 const size_t m2 = 2*m; | |
1 | 123 peck_fft_cpx *tw1, *tw2; |
0 | 124 peck_fft_cpx scratch[5]; |
125 peck_fft_cpx epi3; | |
126 epi3 = st->twiddles[fstride*m]; | |
127 | |
128 printf("kf_bfly3\n"); | |
129 | |
5 | 130 tw1 = tw2 = st->twiddles; |
0 | 131 |
1 | 132 do { |
0 | 133 C_MUL(scratch[1],Fout[m] , *tw1); |
134 C_MUL(scratch[2],Fout[m2] , *tw2); | |
135 | |
136 C_ADD(scratch[3],scratch[1],scratch[2]); | |
137 C_SUB(scratch[0],scratch[1],scratch[2]); | |
138 tw1 += fstride; | |
139 tw2 += fstride*2; | |
140 | |
141 Fout[m].r = Fout->r - HALF_OF(scratch[3].r); | |
142 Fout[m].i = Fout->i - HALF_OF(scratch[3].i); | |
143 | |
144 C_MULBYSCALAR( scratch[0] , epi3.i ); | |
145 | |
146 C_ADDTO(*Fout,scratch[3]); | |
147 | |
148 Fout[m2].r = Fout[m].r + scratch[0].i; | |
149 Fout[m2].i = Fout[m].i - scratch[0].r; | |
150 | |
151 Fout[m].r -= scratch[0].i; | |
152 Fout[m].i += scratch[0].r; | |
153 | |
154 ++Fout; | |
1 | 155 } while (--k); |
0 | 156 } |
157 | |
158 static void kf_bfly5( | |
4
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
159 peck_fft_cpx * Fout, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
160 const size_t fstride, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
161 const peck_fft_cfg st, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
162 int m |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
163 ) { |
0 | 164 peck_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4; |
165 int u; | |
166 peck_fft_cpx scratch[13]; | |
167 peck_fft_cpx * twiddles = st->twiddles; | |
168 peck_fft_cpx *tw; | |
169 peck_fft_cpx ya,yb; | |
170 ya = twiddles[fstride*m]; | |
171 yb = twiddles[fstride*2*m]; | |
172 | |
173 printf("kf_bfly5\n"); | |
174 | |
5 | 175 Fout0 = Fout; |
176 Fout1 = Fout0 + m; | |
177 Fout2 = Fout0 + 2*m; | |
178 Fout3 = Fout0 + 3*m; | |
179 Fout4 = Fout0 + 4*m; | |
0 | 180 |
181 tw=st->twiddles; | |
4
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
182 for (u = 0; u < m; ++u) { |
0 | 183 scratch[0] = *Fout0; |
184 | |
6
fee54f1878f7
kill FIXED_POINT stuff, simplify
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
5
diff
changeset
|
185 C_MUL(scratch[1], *Fout1, tw[u*fstride]); |
fee54f1878f7
kill FIXED_POINT stuff, simplify
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
5
diff
changeset
|
186 C_MUL(scratch[2], *Fout2, tw[2*u*fstride]); |
fee54f1878f7
kill FIXED_POINT stuff, simplify
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
5
diff
changeset
|
187 C_MUL(scratch[3], *Fout3, tw[3*u*fstride]); |
fee54f1878f7
kill FIXED_POINT stuff, simplify
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
5
diff
changeset
|
188 C_MUL(scratch[4], *Fout4, tw[4*u*fstride]); |
0 | 189 |
6
fee54f1878f7
kill FIXED_POINT stuff, simplify
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
5
diff
changeset
|
190 C_ADD(scratch[7], scratch[1], scratch[4]); |
fee54f1878f7
kill FIXED_POINT stuff, simplify
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
5
diff
changeset
|
191 C_SUB(scratch[10], scratch[1], scratch[4]); |
fee54f1878f7
kill FIXED_POINT stuff, simplify
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
5
diff
changeset
|
192 C_ADD(scratch[8], scratch[2], scratch[3]); |
fee54f1878f7
kill FIXED_POINT stuff, simplify
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
5
diff
changeset
|
193 C_SUB(scratch[9], scratch[2], scratch[3]); |
0 | 194 |
195 Fout0->r += scratch[7].r + scratch[8].r; | |
196 Fout0->i += scratch[7].i + scratch[8].i; | |
197 | |
198 scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r); | |
199 scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r); | |
200 | |
201 scratch[6].r = S_MUL(scratch[10].i,ya.i) + S_MUL(scratch[9].i,yb.i); | |
202 scratch[6].i = -S_MUL(scratch[10].r,ya.i) - S_MUL(scratch[9].r,yb.i); | |
203 | |
204 C_SUB(*Fout1,scratch[5],scratch[6]); | |
205 C_ADD(*Fout4,scratch[5],scratch[6]); | |
206 | |
207 scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r); | |
208 scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r); | |
209 scratch[12].r = - S_MUL(scratch[10].i,yb.i) + S_MUL(scratch[9].i,ya.i); | |
210 scratch[12].i = S_MUL(scratch[10].r,yb.i) - S_MUL(scratch[9].r,ya.i); | |
211 | |
212 C_ADD(*Fout2,scratch[11],scratch[12]); | |
213 C_SUB(*Fout3,scratch[11],scratch[12]); | |
214 | |
215 ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4; | |
216 } | |
217 } | |
218 | |
219 /* perform the butterfly for one stage of a mixed radix FFT */ | |
220 static void kf_bfly_generic( | |
4
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
221 peck_fft_cpx * Fout, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
222 const size_t fstride, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
223 const peck_fft_cfg st, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
224 int m, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
225 int p) { |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
226 |
0 | 227 int u,k,q1,q; |
228 peck_fft_cpx * twiddles = st->twiddles; | |
229 peck_fft_cpx t; | |
230 int Norig = st->nfft; | |
231 | |
232 printf("kf_bfly_generic\n"); | |
233 | |
234 peck_fft_cpx * scratch = (peck_fft_cpx*)PECK_FFT_TMP_ALLOC(sizeof(peck_fft_cpx)*p); | |
235 | |
236 for ( u=0; u<m; ++u ) { | |
237 k=u; | |
238 for (q1 = 0; q1 < p; ++q1) { | |
239 scratch[q1] = Fout[k]; | |
240 k += m; | |
241 } | |
242 | |
243 k=u; | |
244 for ( q1=0 ; q1<p ; ++q1 ) { | |
245 int twidx=0; | |
246 Fout[ k ] = scratch[0]; | |
247 for (q=1;q<p;++q ) { | |
248 twidx += fstride * k; | |
249 if (twidx>=Norig) twidx-=Norig; | |
250 C_MUL(t,scratch[q] , twiddles[twidx] ); | |
251 C_ADDTO( Fout[ k ] ,t); | |
252 } | |
253 k += m; | |
254 } | |
255 } | |
256 PECK_FFT_TMP_FREE(scratch); | |
257 } | |
258 | |
259 static void kf_work( | |
260 peck_fft_cpx * Fout, | |
261 const peck_fft_cpx * f, | |
262 const size_t fstride, | |
263 int *factors, | |
264 const peck_fft_cfg st) { | |
5 | 265 |
0 | 266 peck_fft_cpx *Fout_beg = Fout; |
267 const int p = *factors++; /* the radix */ | |
268 const int m = *factors++; /* stage's FFT length / p */ | |
269 const peck_fft_cpx *Fout_end = Fout + p*m; | |
270 | |
7 | 271 // printf("kf_work, %d\n", fstride); |
0 | 272 |
273 if (m == 1) { | |
274 do { | |
275 *Fout = *f; | |
276 f += fstride; | |
277 } while (++Fout != Fout_end); | |
278 } else { | |
279 do { | |
280 // recursive call: | |
281 // DFT of size m*p performed by doing | |
282 // p instances of smaller DFTs of size m, | |
283 // each one takes a decimated version of the input | |
284 kf_work(Fout, f, fstride*p, factors, st); | |
285 f += fstride; | |
286 } while ((Fout += m) != Fout_end); | |
287 } | |
288 | |
5 | 289 Fout = Fout_beg; |
0 | 290 |
291 // recombine the p smaller DFTs | |
292 switch (p) { | |
9 | 293 case 2: |
294 kf_bfly2(Fout, fstride, st, m); | |
295 break; | |
0 | 296 case 3: kf_bfly3(Fout, fstride, st, m); break; |
11 | 297 case 4: |
298 { | |
299 static unsigned counter = 0; | |
300 armv7_cycles_start(); | |
301 unsigned int t1 = armv7_cycles_read(); | |
302 kf_bfly4(Fout, fstride, st, m); | |
303 unsigned int t2 = armv7_cycles_read(); | |
304 armv7_cycles_stop(); | |
305 counter++; | |
306 if (counter > 150 && counter < 160) printf("XX %d\n", t2-t1); | |
307 } | |
308 break; | |
0 | 309 case 5: kf_bfly5(Fout, fstride, st, m); break; |
310 default: kf_bfly_generic(Fout, fstride, st, m, p); break; | |
311 } | |
312 } | |
313 | |
314 /* | |
315 * facbuf is populated by p1, m1, p2, m2, ... | |
316 * where | |
317 * p[i] * m[i] = m[i-1] | |
318 * m0 = n | |
319 */ | |
320 static void kf_factor(int n, int * facbuf) { | |
321 int p = 4; | |
322 float floor_sqrt; | |
323 floor_sqrt = floorf(sqrtf(n)); | |
324 | |
325 /* factor out powers of 4, powers of 2, then any remaining primes */ | |
326 do { | |
327 while (n % p) { | |
328 switch (p) { | |
329 case 4: p = 2; break; | |
330 case 2: p = 3; break; | |
331 default: p += 2; break; | |
332 } | |
333 if (p > floor_sqrt) | |
334 p = n; /* no more factors, skip to end */ | |
335 } | |
336 n /= p; | |
337 *facbuf++ = p; | |
338 *facbuf++ = n; | |
339 } while (n > 1); | |
340 } | |
341 | |
342 /* | |
343 * User-callable function to allocate all necessary storage space for the fft. | |
344 * The return value is a contiguous block of memory, allocated with malloc. As such, | |
345 * it can be freed with free(), rather than a peck_fft-specific function. | |
346 */ | |
347 peck_fft_cfg peck_fft_alloc(int nfft, int inverse_fft, void * mem, size_t * lenmem) { | |
348 peck_fft_cfg st = NULL; | |
349 size_t memneeded = sizeof(struct peck_fft_state) | |
350 + sizeof(peck_fft_cpx)*(nfft-1); /* twiddle factors */ | |
351 | |
352 if (lenmem == NULL) { | |
353 st = ( peck_fft_cfg)PECK_FFT_MALLOC(memneeded); | |
354 } else { | |
355 if (mem != NULL && *lenmem >= memneeded) | |
356 st = (peck_fft_cfg)mem; | |
357 *lenmem = memneeded; | |
358 } | |
359 | |
360 if (st) { | |
361 int i; | |
362 st->nfft=nfft; | |
363 st->inverse = inverse_fft; | |
364 | |
365 for (i = 0; i < nfft; ++i) { | |
366 const float pi = 3.14159265359f; | |
367 float phase = -2*pi*i / nfft; | |
368 if (st->inverse) | |
369 phase *= -1; | |
370 kf_cexp(st->twiddles+i, phase); | |
371 } | |
372 | |
373 kf_factor(nfft, st->factors); | |
374 } | |
375 return st; | |
376 } | |
377 | |
378 void peck_fft(peck_fft_cfg cfg, const peck_fft_cpx *fin, peck_fft_cpx *fout) { | |
379 kf_work(fout, fin, 1, cfg->factors, cfg); | |
380 } | |
381 | |
382 void peck_fft_cleanup(void) { | |
383 /* nothing needed any more */ | |
384 } | |
385 | |
386 int peck_fft_next_fast_size(int n) { | |
387 while (1) { | |
388 int m = n; | |
389 while ((m % 2) == 0) m /= 2; | |
390 while ((m % 3) == 0) m /= 3; | |
391 while ((m % 5) == 0) m /= 5; | |
392 if (m <= 1) | |
393 break; /* n is completely factorable by twos, threes, and fives */ | |
394 n++; | |
395 } | |
396 return n; | |
397 } |