# HG changeset patch
# User Peter Meerwald
# Date 1316703505 -7200
# Node ID 655dc5c14169409a45ddc43c16c25f98a71b7bda
# Parent  abdcde0129780a508120f363912c774f3954d032
backup

diff -r abdcde012978 -r 655dc5c14169 _peck_fft_guts.h
--- a/_peck_fft_guts.h	Thu Sep 22 15:19:18 2011 +0200
+++ b/_peck_fft_guts.h	Thu Sep 22 16:58:25 2011 +0200
@@ -13,16 +13,15 @@
 */
 
 /* peck_fft.h
-   defines peck_fft_scalar as either short or a float type
-   and defines
-   typedef struct { peck_fft_scalar r; peck_fft_scalar i; }peck_fft_cpx; */
+ * defines peck_fft_scalar as either short or a float type and defines
+ * typedef struct { peck_fft_scalar r; peck_fft_scalar i; } peck_fft_cpx;
+ */
 #include "peck_fft.h"
 #include 
 
 #define MAXFACTORS 32
 /* e.g. an fft of length 128 has 4 factors
- as far as kissfft is concerned
- 4*4*4*2
+ * as far as kissfft is concerned: 4*4*4*2
  */
 
 struct peck_fft_state{
diff -r abdcde012978 -r 655dc5c14169 kf_bfly4.S
--- a/kf_bfly4.S	Thu Sep 22 15:19:18 2011 +0200
+++ b/kf_bfly4.S	Thu Sep 22 16:58:25 2011 +0200
@@ -36,13 +36,13 @@
     mov r6, ip
 .inverse_loop:
     // C_MUL(scratch[0], Fout[m], *tw1);
-    vld1.32 {d18,d19}, [r5]
-    vld1.32 {d16,d17}, [ip]
+    vld1.32 {d18,d19}, [r5,:64]
+    vld1.32 {d16,d17}, [ip,:64]
     vmul.f32 d20, d18, d16
     vmul.f32 d21, d16, d19
     // load Fout[m2], *tw2
-    vld1.32 {d14,d15}, [r4]
-    vld1.32 {d12,d13}, [r6]
+    vld1.32 {d14,d15}, [r4,:64]
+    vld1.32 {d12,d13}, [r6,:64]
     vmls.f32 d20, d19, d17
     vmla.f32 d21, d18, d17
 
@@ -50,8 +50,8 @@
     vmul.f32 d22, d14, d12
     vmul.f32 d23, d12, d15
     // load Fout[m3], *tw3
-    vld1.32 {d18,d19}, [r0]
-    vld1.32 {d16,d17}, [sl]
+    vld1.32 {d18,d19}, [r0,:64]
+    vld1.32 {d16,d17}, [sl,:64]
     vmls.f32 d22, d15, d13
     vmla.f32 d23, d14, d13
 
@@ -60,7 +60,7 @@
     vmul.f32 d25, d16, d19
 
     // C_SUB(scratch[1], *Fout, scratch[3]);
-    vld1.32 {d14,d15}, [r2]
+    vld1.32 {d14,d15}, [r2,:64]
     vsub.f32 q13, q7, q11
 
     vmls.f32 d24, d19, d17
@@ -70,15 +70,15 @@
     vadd.f32 q7, q7, q11
     // C_ADD(scratch[3], scratch[0], scratch[2]);
     vadd.f32 q11, q10, q12
-    vst1.32 {d16,d17}, [r2]
+    vst1.32 {d16,d17}, [r2,:64]
 
     // C_SUB(Fout[m2], *Fout, scratch[3]);
     vsub.f32 q9, q7, q11
-    vst1.32 {d18,d19}, [r4]!
+    vst1.32 {d18,d19}, [r4,:64]!
 
     // C_ADDTO(*Fout, scratch[3]);
     vadd.f32 q7, q7, q11
-    vst1.32 {d14,d15}, [r2]!
+    vst1.32 {d14,d15}, [r2,:64]!
 
     add ip, ip, r8
     add r6, r6, r7
@@ -91,13 +91,13 @@
     // Fout[m].i = scratch[1].i + scratch[3].r;
     vsub.f32 d18, d26, d23
     vadd.f32 d19, d27, d22
-    vst1.32 {d18,d19}, [r5]!
+    vst1.32 {d18,d19}, [r5,:64]!
 
     // Fout[m3].r = scratch[1].r + scratch[3].i;
     // Fout[m3].i = scratch[1].i - scratch[3].r;
     vadd.f32 d18, d26, d23
     vsub.f32 d19, d27, d22
-    vst1.32 {d18,d19}, [r0]!
+    vst1.32 {d18,d19}, [r0,:64]!
 
     subs r3, r3, #1
     bne .inverse_loop
@@ -108,13 +108,13 @@
     mov r6, ip
 .forward_loop:
     // C_MUL(scratch[0], Fout[m], *tw1);
-    vld1.32 {d18,d19}, [r5]
-    vld1.32 {d16,d17}, [ip]
+    vld1.32 {d18,d19}, [r5,:64]
+    vld1.32 {d16,d17}, [ip,:64]
     vmul.f32 d20, d18, d16
     vmul.f32 d21, d16, d19
     // load Fout[m2], *tw2
-    vld1.32 {d14,d15}, [r4]
-    vld1.32 {d12,d13}, [r6]
+    vld1.32 {d14,d15}, [r4,:64]
+    vld1.32 {d12,d13}, [r6,:64]
     vmls.f32 d20, d19, d17
     vmla.f32 d21, d18, d17
 
@@ -122,8 +122,8 @@
     vmul.f32 d22, d14, d12
     vmul.f32 d23, d12, d15
     // load Fout[m3], *tw3
-    vld1.32 {d18,d19}, [r0]
-    vld1.32 {d16,d17}, [sl]
+    vld1.32 {d18,d19}, [r0,:64]
+    vld1.32 {d16,d17}, [sl,:64]
     vmls.f32 d22, d15, d13
     vmla.f32 d23, d14, d13
 
@@ -132,7 +132,7 @@
     vmul.f32 d25, d16, d19
 
     // C_SUB(scratch[1], *Fout, scratch[3]);
-    vld1.32 {d14,d15}, [r2]
+    vld1.32 {d14,d15}, [r2,:64]
     vsub.f32 q13, q7, q11
 
     vmls.f32 d24, d19, d17
@@ -142,15 +142,15 @@
     vadd.f32 q7, q7, q11
     // C_ADD(scratch[3], scratch[0], scratch[2]);
     vadd.f32 q11, q10, q12
-    vst1.32 {d16,d17}, [r2]
+    vst1.32 {d16,d17}, [r2,:64]
 
     // C_SUB(Fout[m2], *Fout, scratch[3]);
     vsub.f32 q9, q7, q11
-    vst1.32 {d18,d19}, [r4]!
+    vst1.32 {d18,d19}, [r4,:64]!
 
     // C_ADDTO(*Fout, scratch[3]);
     vadd.f32 q7, q7, q11
-    vst1.32 {d14,d15}, [r2]!
+    vst1.32 {d14,d15}, [r2,:64]!
 
     add ip, ip, r8
     add r6, r6, r7
@@ -163,13 +163,13 @@
     // Fout[m].i = scratch[1].i + scratch[3].r;
     vadd.f32 d18, d26, d23
     vsub.f32 d19, d27, d22
-    vst1.32 {d18,d19}, [r5]!
+    vst1.32 {d18,d19}, [r5,:64]!
 
     // Fout[m3].r = scratch[1].r + scratch[3].i;
     // Fout[m3].i = scratch[1].i - scratch[3].r;
     vsub.f32 d18, d26, d23
     vadd.f32 d19, d27, d22
-    vst1.32 {d18,d19}, [r0]!
+    vst1.32 {d18,d19}, [r0,:64]!
 
     subs r3, r3, #1
     bne .forward_loop
diff -r abdcde012978 -r 655dc5c14169 peck_fft.c
--- a/peck_fft.c	Thu Sep 22 15:19:18 2011 +0200
+++ b/peck_fft.c	Thu Sep 22 16:58:25 2011 +0200
@@ -296,14 +296,15 @@
         case 3: kf_bfly3(Fout, fstride, st, m); break;
         case 4: {
-static unsigned counter = 0;
-            armv7_cycles_start();
-            unsigned int t1 = armv7_cycles_read();
+//static unsigned counter = 0;
+//            armv7_cycles_start();
+//            unsigned int t1 = armv7_cycles_read();
+//printf("%08x %d %d\n", Fout, fstride, m);
             kf_bfly4(Fout, fstride, st, m);
-            unsigned int t2 = armv7_cycles_read();
-            armv7_cycles_stop();
-            counter++;
-            if (counter > 150 && counter < 160) printf("XX %d\n", t2-t1);
+//            unsigned int t2 = armv7_cycles_read();
+//            armv7_cycles_stop();
+//            counter++;
+//            if (counter > 150 && counter < 160) printf("XX %d\n", t2-t1);
             }
             break;
         case 5: kf_bfly5(Fout, fstride, st, m); break;
@@ -344,16 +345,16 @@
  * The return value is a contiguous block of memory, allocated with malloc. As such,
  * it can be freed with free(), rather than a peck_fft-specific function.
  */
-peck_fft_cfg peck_fft_alloc(int nfft, int inverse_fft, void * mem, size_t * lenmem) {
+peck_fft_cfg peck_fft_alloc(int nfft, int inverse_fft, void *mem, size_t *lenmem) {
     peck_fft_cfg st = NULL;
     size_t memneeded = sizeof(struct peck_fft_state)
         + sizeof(peck_fft_cpx)*(nfft-1); /* twiddle factors */
 
     if (lenmem == NULL) {
-        st = ( peck_fft_cfg)PECK_FFT_MALLOC(memneeded);
+        st = (peck_fft_cfg) PECK_FFT_MALLOC(memneeded);
     } else {
         if (mem != NULL && *lenmem >= memneeded)
-            st = (peck_fft_cfg)mem;
+            st = (peck_fft_cfg) mem;
         *lenmem = memneeded;
     }
 
diff -r abdcde012978 -r 655dc5c14169 peck_fftr.c
--- a/peck_fftr.c	Thu Sep 22 15:19:18 2011 +0200
+++ b/peck_fftr.c	Thu Sep 22 16:58:25 2011 +0200
@@ -19,9 +19,7 @@
     peck_fft_cfg substate;
     peck_fft_cpx *tmpbuf;
     peck_fft_cpx *super_twiddles;
-#if USE_SIMD == SIMD_SSE2
     void *pad;
-#endif
 };
 
 peck_fftr_cfg peck_fftr_alloc(int nfft, int inverse_fft, void *mem, size_t *lenmem) {
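
For reference: the ":64" qualifier added to the vld1.32/vst1.32 operands in kf_bfly4.S
(e.g. [r5,:64]) is the NEON address-alignment hint; it asserts that the base register
holds a 64-bit-aligned address, which allows the faster aligned form of the load/store
and raises an alignment fault if the pointer is in fact misaligned. The C_MUL, C_SUB,
C_ADD and C_ADDTO operations named in the assembly comments are the complex-arithmetic
macros this code inherits from kissfft; a minimal sketch of their floating-point form
follows (based on the kissfft originals, so the exact definitions in _peck_fft_guts.h
may differ slightly):

    /* peck_fft_cpx has members .r (real) and .i (imaginary), see _peck_fft_guts.h */

    /* complex multiply: m = a*b */
    #define C_MUL(m, a, b) \
        do { (m).r = (a).r*(b).r - (a).i*(b).i; \
             (m).i = (a).r*(b).i + (a).i*(b).r; } while (0)

    /* complex add/subtract: res = a + b, res = a - b */
    #define C_ADD(res, a, b) \
        do { (res).r = (a).r + (b).r; (res).i = (a).i + (b).i; } while (0)
    #define C_SUB(res, a, b) \
        do { (res).r = (a).r - (b).r; (res).i = (a).i - (b).i; } while (0)

    /* in-place accumulate: res += a */
    #define C_ADDTO(res, a) \
        do { (res).r += (a).r; (res).i += (a).i; } while (0)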