Mercurial > hg > peckfft
annotate peck_fft.c @ 9:8726585681f6
backup
author | Peter Meerwald <p.meerwald@bct-electronic.com> |
---|---|
date | Wed, 21 Sep 2011 12:18:40 +0200 |
parents | 707be088ccc3 |
children | 05f6ab0a17c0 |
rev | line source |
---|---|
0 | 1 /* |
2 Copyright (c) 2003-2010, Mark Borgerding | |
3 | |
4 All rights reserved. | |
5 | |
6 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: | |
7 | |
8 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. | |
9 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. | |
10 * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission. | |
11 | |
12 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
13 */ | |
14 | |
9 | 15 #include "armv7_cycles.h" |
0 | 16 #include "_peck_fft_guts.h" |
17 /* The guts header contains all the multiplication and addition macros that are defined for | |
1 | 18 * fixed or floating point complex numbers. It also delares the kf_ internal functions. |
0 | 19 */ |
9 | 20 #if !BFLY2_ASM |
0 | 21 static void kf_bfly2( |
5 | 22 peck_fft_cpx *Fout, |
4
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
23 const size_t fstride, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
24 const peck_fft_cfg st, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
25 int m) { |
0 | 26 |
7 | 27 // printf("kf_bfly2, %d\n", fstride); |
0 | 28 |
9 | 29 peck_fft_cpx * __restrict tw1 = st->twiddles; |
30 peck_fft_cpx * __restrict Fout2 = Fout + m; | |
1 | 31 do { |
9 | 32 peck_fft_cpx t; |
1 | 33 C_MUL(t, *Fout2, *tw1); |
0 | 34 tw1 += fstride; |
1 | 35 C_SUB(*Fout2, *Fout, t); |
36 C_ADDTO(*Fout, t); | |
0 | 37 ++Fout2; |
38 ++Fout; | |
1 | 39 } while (--m); |
0 | 40 } |
9 | 41 #endif |
0 | 42 |
43 static void kf_bfly4( | |
4
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
44 peck_fft_cpx * Fout, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
45 const size_t fstride, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
46 const peck_fft_cfg st, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
47 const size_t m) { |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
48 |
0 | 49 peck_fft_cpx *tw1,*tw2,*tw3; |
50 peck_fft_cpx scratch[6]; | |
51 size_t k=m; | |
52 const size_t m2=2*m; | |
53 const size_t m3=3*m; | |
54 | |
7 | 55 // printf("kf_bfly4, %d\n", fstride); |
0 | 56 |
57 tw3 = tw2 = tw1 = st->twiddles; | |
58 | |
59 do { | |
1 | 60 C_MUL(scratch[0], Fout[m], *tw1); |
61 C_MUL(scratch[1], Fout[m2], *tw2); | |
62 C_MUL(scratch[2], Fout[m3], *tw3); | |
63 | |
64 C_SUB(scratch[5], *Fout, scratch[1]); | |
0 | 65 C_ADDTO(*Fout, scratch[1]); |
1 | 66 C_ADD(scratch[3], scratch[0], scratch[2]); |
67 C_SUB(scratch[4], scratch[0], scratch[2]); | |
68 C_SUB(Fout[m2], *Fout, scratch[3]); | |
0 | 69 tw1 += fstride; |
70 tw2 += fstride*2; | |
71 tw3 += fstride*3; | |
1 | 72 C_ADDTO(*Fout, scratch[3]); |
0 | 73 |
1 | 74 if (st->inverse) { |
0 | 75 Fout[m].r = scratch[5].r - scratch[4].i; |
76 Fout[m].i = scratch[5].i + scratch[4].r; | |
77 Fout[m3].r = scratch[5].r + scratch[4].i; | |
78 Fout[m3].i = scratch[5].i - scratch[4].r; | |
1 | 79 } else { |
0 | 80 Fout[m].r = scratch[5].r + scratch[4].i; |
81 Fout[m].i = scratch[5].i - scratch[4].r; | |
82 Fout[m3].r = scratch[5].r - scratch[4].i; | |
83 Fout[m3].i = scratch[5].i + scratch[4].r; | |
84 } | |
85 ++Fout; | |
1 | 86 } while (--k); |
0 | 87 } |
88 | |
89 static void kf_bfly3( | |
4
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
90 peck_fft_cpx * Fout, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
91 const size_t fstride, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
92 const peck_fft_cfg st, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
93 size_t m) { |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
94 |
0 | 95 size_t k=m; |
96 const size_t m2 = 2*m; | |
1 | 97 peck_fft_cpx *tw1, *tw2; |
0 | 98 peck_fft_cpx scratch[5]; |
99 peck_fft_cpx epi3; | |
100 epi3 = st->twiddles[fstride*m]; | |
101 | |
102 printf("kf_bfly3\n"); | |
103 | |
5 | 104 tw1 = tw2 = st->twiddles; |
0 | 105 |
1 | 106 do { |
0 | 107 C_MUL(scratch[1],Fout[m] , *tw1); |
108 C_MUL(scratch[2],Fout[m2] , *tw2); | |
109 | |
110 C_ADD(scratch[3],scratch[1],scratch[2]); | |
111 C_SUB(scratch[0],scratch[1],scratch[2]); | |
112 tw1 += fstride; | |
113 tw2 += fstride*2; | |
114 | |
115 Fout[m].r = Fout->r - HALF_OF(scratch[3].r); | |
116 Fout[m].i = Fout->i - HALF_OF(scratch[3].i); | |
117 | |
118 C_MULBYSCALAR( scratch[0] , epi3.i ); | |
119 | |
120 C_ADDTO(*Fout,scratch[3]); | |
121 | |
122 Fout[m2].r = Fout[m].r + scratch[0].i; | |
123 Fout[m2].i = Fout[m].i - scratch[0].r; | |
124 | |
125 Fout[m].r -= scratch[0].i; | |
126 Fout[m].i += scratch[0].r; | |
127 | |
128 ++Fout; | |
1 | 129 } while (--k); |
0 | 130 } |
131 | |
132 static void kf_bfly5( | |
4
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
133 peck_fft_cpx * Fout, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
134 const size_t fstride, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
135 const peck_fft_cfg st, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
136 int m |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
137 ) { |
0 | 138 peck_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4; |
139 int u; | |
140 peck_fft_cpx scratch[13]; | |
141 peck_fft_cpx * twiddles = st->twiddles; | |
142 peck_fft_cpx *tw; | |
143 peck_fft_cpx ya,yb; | |
144 ya = twiddles[fstride*m]; | |
145 yb = twiddles[fstride*2*m]; | |
146 | |
147 printf("kf_bfly5\n"); | |
148 | |
5 | 149 Fout0 = Fout; |
150 Fout1 = Fout0 + m; | |
151 Fout2 = Fout0 + 2*m; | |
152 Fout3 = Fout0 + 3*m; | |
153 Fout4 = Fout0 + 4*m; | |
0 | 154 |
155 tw=st->twiddles; | |
4
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
156 for (u = 0; u < m; ++u) { |
0 | 157 scratch[0] = *Fout0; |
158 | |
6
fee54f1878f7
kill FIXED_POINT stuff, simplify
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
5
diff
changeset
|
159 C_MUL(scratch[1], *Fout1, tw[u*fstride]); |
fee54f1878f7
kill FIXED_POINT stuff, simplify
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
5
diff
changeset
|
160 C_MUL(scratch[2], *Fout2, tw[2*u*fstride]); |
fee54f1878f7
kill FIXED_POINT stuff, simplify
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
5
diff
changeset
|
161 C_MUL(scratch[3], *Fout3, tw[3*u*fstride]); |
fee54f1878f7
kill FIXED_POINT stuff, simplify
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
5
diff
changeset
|
162 C_MUL(scratch[4], *Fout4, tw[4*u*fstride]); |
0 | 163 |
6
fee54f1878f7
kill FIXED_POINT stuff, simplify
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
5
diff
changeset
|
164 C_ADD(scratch[7], scratch[1], scratch[4]); |
fee54f1878f7
kill FIXED_POINT stuff, simplify
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
5
diff
changeset
|
165 C_SUB(scratch[10], scratch[1], scratch[4]); |
fee54f1878f7
kill FIXED_POINT stuff, simplify
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
5
diff
changeset
|
166 C_ADD(scratch[8], scratch[2], scratch[3]); |
fee54f1878f7
kill FIXED_POINT stuff, simplify
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
5
diff
changeset
|
167 C_SUB(scratch[9], scratch[2], scratch[3]); |
0 | 168 |
169 Fout0->r += scratch[7].r + scratch[8].r; | |
170 Fout0->i += scratch[7].i + scratch[8].i; | |
171 | |
172 scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r); | |
173 scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r); | |
174 | |
175 scratch[6].r = S_MUL(scratch[10].i,ya.i) + S_MUL(scratch[9].i,yb.i); | |
176 scratch[6].i = -S_MUL(scratch[10].r,ya.i) - S_MUL(scratch[9].r,yb.i); | |
177 | |
178 C_SUB(*Fout1,scratch[5],scratch[6]); | |
179 C_ADD(*Fout4,scratch[5],scratch[6]); | |
180 | |
181 scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r); | |
182 scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r); | |
183 scratch[12].r = - S_MUL(scratch[10].i,yb.i) + S_MUL(scratch[9].i,ya.i); | |
184 scratch[12].i = S_MUL(scratch[10].r,yb.i) - S_MUL(scratch[9].r,ya.i); | |
185 | |
186 C_ADD(*Fout2,scratch[11],scratch[12]); | |
187 C_SUB(*Fout3,scratch[11],scratch[12]); | |
188 | |
189 ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4; | |
190 } | |
191 } | |
192 | |
193 /* perform the butterfly for one stage of a mixed radix FFT */ | |
194 static void kf_bfly_generic( | |
4
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
195 peck_fft_cpx * Fout, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
196 const size_t fstride, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
197 const peck_fft_cfg st, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
198 int m, |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
199 int p) { |
2d6c49fcafcb
neon2 and neon4 support
Peter Meerwald <p.meerwald@bct-electronic.com>
parents:
1
diff
changeset
|
200 |
0 | 201 int u,k,q1,q; |
202 peck_fft_cpx * twiddles = st->twiddles; | |
203 peck_fft_cpx t; | |
204 int Norig = st->nfft; | |
205 | |
206 printf("kf_bfly_generic\n"); | |
207 | |
208 peck_fft_cpx * scratch = (peck_fft_cpx*)PECK_FFT_TMP_ALLOC(sizeof(peck_fft_cpx)*p); | |
209 | |
210 for ( u=0; u<m; ++u ) { | |
211 k=u; | |
212 for (q1 = 0; q1 < p; ++q1) { | |
213 scratch[q1] = Fout[k]; | |
214 k += m; | |
215 } | |
216 | |
217 k=u; | |
218 for ( q1=0 ; q1<p ; ++q1 ) { | |
219 int twidx=0; | |
220 Fout[ k ] = scratch[0]; | |
221 for (q=1;q<p;++q ) { | |
222 twidx += fstride * k; | |
223 if (twidx>=Norig) twidx-=Norig; | |
224 C_MUL(t,scratch[q] , twiddles[twidx] ); | |
225 C_ADDTO( Fout[ k ] ,t); | |
226 } | |
227 k += m; | |
228 } | |
229 } | |
230 PECK_FFT_TMP_FREE(scratch); | |
231 } | |
232 | |
233 static void kf_work( | |
234 peck_fft_cpx * Fout, | |
235 const peck_fft_cpx * f, | |
236 const size_t fstride, | |
237 int *factors, | |
238 const peck_fft_cfg st) { | |
5 | 239 |
0 | 240 peck_fft_cpx *Fout_beg = Fout; |
241 const int p = *factors++; /* the radix */ | |
242 const int m = *factors++; /* stage's FFT length / p */ | |
243 const peck_fft_cpx *Fout_end = Fout + p*m; | |
244 | |
7 | 245 // printf("kf_work, %d\n", fstride); |
0 | 246 |
247 if (m == 1) { | |
248 do { | |
249 *Fout = *f; | |
250 f += fstride; | |
251 } while (++Fout != Fout_end); | |
252 } else { | |
253 do { | |
254 // recursive call: | |
255 // DFT of size m*p performed by doing | |
256 // p instances of smaller DFTs of size m, | |
257 // each one takes a decimated version of the input | |
258 kf_work(Fout, f, fstride*p, factors, st); | |
259 f += fstride; | |
260 } while ((Fout += m) != Fout_end); | |
261 } | |
262 | |
5 | 263 Fout = Fout_beg; |
0 | 264 |
265 // recombine the p smaller DFTs | |
266 switch (p) { | |
9 | 267 case 2: |
268 kf_bfly2(Fout, fstride, st, m); | |
269 break; | |
0 | 270 case 3: kf_bfly3(Fout, fstride, st, m); break; |
271 case 4: kf_bfly4(Fout, fstride, st, m); break; | |
272 case 5: kf_bfly5(Fout, fstride, st, m); break; | |
273 default: kf_bfly_generic(Fout, fstride, st, m, p); break; | |
274 } | |
275 } | |
276 | |
277 /* | |
278 * facbuf is populated by p1, m1, p2, m2, ... | |
279 * where | |
280 * p[i] * m[i] = m[i-1] | |
281 * m0 = n | |
282 */ | |
283 static void kf_factor(int n, int * facbuf) { | |
284 int p = 4; | |
285 float floor_sqrt; | |
286 floor_sqrt = floorf(sqrtf(n)); | |
287 | |
288 /* factor out powers of 4, powers of 2, then any remaining primes */ | |
289 do { | |
290 while (n % p) { | |
291 switch (p) { | |
292 case 4: p = 2; break; | |
293 case 2: p = 3; break; | |
294 default: p += 2; break; | |
295 } | |
296 if (p > floor_sqrt) | |
297 p = n; /* no more factors, skip to end */ | |
298 } | |
299 n /= p; | |
300 *facbuf++ = p; | |
301 *facbuf++ = n; | |
302 } while (n > 1); | |
303 } | |
304 | |
305 /* | |
306 * User-callable function to allocate all necessary storage space for the fft. | |
307 * The return value is a contiguous block of memory, allocated with malloc. As such, | |
308 * it can be freed with free(), rather than a peck_fft-specific function. | |
309 */ | |
310 peck_fft_cfg peck_fft_alloc(int nfft, int inverse_fft, void * mem, size_t * lenmem) { | |
311 peck_fft_cfg st = NULL; | |
312 size_t memneeded = sizeof(struct peck_fft_state) | |
313 + sizeof(peck_fft_cpx)*(nfft-1); /* twiddle factors */ | |
314 | |
315 if (lenmem == NULL) { | |
316 st = ( peck_fft_cfg)PECK_FFT_MALLOC(memneeded); | |
317 } else { | |
318 if (mem != NULL && *lenmem >= memneeded) | |
319 st = (peck_fft_cfg)mem; | |
320 *lenmem = memneeded; | |
321 } | |
322 | |
323 if (st) { | |
324 int i; | |
325 st->nfft=nfft; | |
326 st->inverse = inverse_fft; | |
327 | |
328 for (i = 0; i < nfft; ++i) { | |
329 const float pi = 3.14159265359f; | |
330 float phase = -2*pi*i / nfft; | |
331 if (st->inverse) | |
332 phase *= -1; | |
333 kf_cexp(st->twiddles+i, phase); | |
334 } | |
335 | |
336 kf_factor(nfft, st->factors); | |
337 } | |
338 return st; | |
339 } | |
340 | |
341 void peck_fft(peck_fft_cfg cfg, const peck_fft_cpx *fin, peck_fft_cpx *fout) { | |
342 kf_work(fout, fin, 1, cfg->factors, cfg); | |
343 } | |
344 | |
345 void peck_fft_cleanup(void) { | |
346 /* nothing needed any more */ | |
347 } | |
348 | |
349 int peck_fft_next_fast_size(int n) { | |
350 while (1) { | |
351 int m = n; | |
352 while ((m % 2) == 0) m /= 2; | |
353 while ((m % 3) == 0) m /= 3; | |
354 while ((m % 5) == 0) m /= 5; | |
355 if (m <= 1) | |
356 break; /* n is completely factorable by twos, threes, and fives */ | |
357 n++; | |
358 } | |
359 return n; | |
360 } |