# HG changeset patch
# User Peter Meerwald
# Date 1316703505 -7200
# Node ID 655dc5c14169409a45ddc43c16c25f98a71b7bda
# Parent abdcde0129780a508120f363912c774f3954d032
Add :64 alignment hints to the NEON loads/stores in kf_bfly4.S, comment out
the ARMv7 cycle-count profiling in peck_fft.c, tidy comments in
_peck_fft_guts.h, and make the pad member of struct peck_fftr_state
unconditional.
diff -r abdcde012978 -r 655dc5c14169 _peck_fft_guts.h
--- a/_peck_fft_guts.h Thu Sep 22 15:19:18 2011 +0200
+++ b/_peck_fft_guts.h Thu Sep 22 16:58:25 2011 +0200
@@ -13,16 +13,15 @@
*/
/* peck_fft.h
- defines peck_fft_scalar as either short or a float type
- and defines
- typedef struct { peck_fft_scalar r; peck_fft_scalar i; }peck_fft_cpx; */
+ * defines peck_fft_scalar as either short or a float type and defines
+ * typedef struct { peck_fft_scalar r; peck_fft_scalar i; } peck_fft_cpx;
+ */
#include "peck_fft.h"
#include
#define MAXFACTORS 32
/* e.g. an fft of length 128 has 4 factors
- as far as kissfft is concerned
- 4*4*4*2
+ * as far as kissfft is concerned: 4*4*4*2
*/
struct peck_fft_state{
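
The "4*4*4*2" comment above refers to how the transform length is split into
butterfly radices before the individual kf_bfly* kernels run, with MAXFACTORS
bounding how many factors can be stored. A minimal C sketch of that
factorization, modelled on the kf_factor routine in the kissfft code this
project descends from (the function name and layout here are illustrative,
not taken from this patch):

    #include <math.h>

    #define MAXFACTORS 32

    /* Split n into butterfly radices, preferring 4, then 2, 3, 5, 7, ...
     * For n = 128 this yields 4, 4, 4, 2 -- the factorization cited above.
     * Each radix is stored together with the remaining length. */
    static void factor_sketch(int n, int facbuf[2 * MAXFACTORS]) {
        int p = 4;
        double floor_sqrt = floor(sqrt((double)n));
        do {
            while (n % p) {
                switch (p) {
                    case 4:  p = 2; break;
                    case 2:  p = 3; break;
                    default: p += 2; break;
                }
                if (p > floor_sqrt)
                    p = n;          /* no smaller factor left; n is prime */
            }
            n /= p;
            *facbuf++ = p;          /* radix of this stage */
            *facbuf++ = n;          /* length remaining after this stage */
        } while (n > 1);
    }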
diff -r abdcde012978 -r 655dc5c14169 kf_bfly4.S
--- a/kf_bfly4.S Thu Sep 22 15:19:18 2011 +0200
+++ b/kf_bfly4.S Thu Sep 22 16:58:25 2011 +0200
@@ -36,13 +36,13 @@
mov r6, ip
.inverse_loop:
// C_MUL(scratch[0], Fout[m], *tw1);
- vld1.32 {d18,d19}, [r5]
- vld1.32 {d16,d17}, [ip]
+ vld1.32 {d18,d19}, [r5,:64]
+ vld1.32 {d16,d17}, [ip,:64]
vmul.f32 d20, d18, d16
vmul.f32 d21, d16, d19
// load Fout[m2], *tw2
- vld1.32 {d14,d15}, [r4]
- vld1.32 {d12,d13}, [r6]
+ vld1.32 {d14,d15}, [r4,:64]
+ vld1.32 {d12,d13}, [r6,:64]
vmls.f32 d20, d19, d17
vmla.f32 d21, d18, d17
@@ -50,8 +50,8 @@
vmul.f32 d22, d14, d12
vmul.f32 d23, d12, d15
// load Fout[m3], *tw3
- vld1.32 {d18,d19}, [r0]
- vld1.32 {d16,d17}, [sl]
+ vld1.32 {d18,d19}, [r0,:64]
+ vld1.32 {d16,d17}, [sl,:64]
vmls.f32 d22, d15, d13
vmla.f32 d23, d14, d13
@@ -60,7 +60,7 @@
vmul.f32 d25, d16, d19
// C_SUB(scratch[1], *Fout, scratch[3]);
- vld1.32 {d14,d15}, [r2]
+ vld1.32 {d14,d15}, [r2,:64]
vsub.f32 q13, q7, q11
vmls.f32 d24, d19, d17
@@ -70,15 +70,15 @@
vadd.f32 q7, q7, q11
// C_ADD(scratch[3], scratch[0], scratch[2]);
vadd.f32 q11, q10, q12
- vst1.32 {d16,d17}, [r2]
+ vst1.32 {d16,d17}, [r2,:64]
// C_SUB(Fout[m2], *Fout, scratch[3]);
vsub.f32 q9, q7, q11
- vst1.32 {d18,d19}, [r4]!
+ vst1.32 {d18,d19}, [r4,:64]!
// C_ADDTO(*Fout, scratch[3]);
vadd.f32 q7, q7, q11
- vst1.32 {d14,d15}, [r2]!
+ vst1.32 {d14,d15}, [r2,:64]!
add ip, ip, r8
add r6, r6, r7
@@ -91,13 +91,13 @@
// Fout[m].i = scratch[1].i + scratch[3].r;
vsub.f32 d18, d26, d23
vadd.f32 d19, d27, d22
- vst1.32 {d18,d19}, [r5]!
+ vst1.32 {d18,d19}, [r5,:64]!
// Fout[m3].r = scratch[1].r + scratch[3].i;
// Fout[m3].i = scratch[1].i - scratch[3].r;
vadd.f32 d18, d26, d23
vsub.f32 d19, d27, d22
- vst1.32 {d18,d19}, [r0]!
+ vst1.32 {d18,d19}, [r0,:64]!
subs r3, r3, #1
bne .inverse_loop
@@ -108,13 +108,13 @@
mov r6, ip
.forward_loop:
// C_MUL(scratch[0], Fout[m], *tw1);
- vld1.32 {d18,d19}, [r5]
- vld1.32 {d16,d17}, [ip]
+ vld1.32 {d18,d19}, [r5,:64]
+ vld1.32 {d16,d17}, [ip,:64]
vmul.f32 d20, d18, d16
vmul.f32 d21, d16, d19
// load Fout[m2], *tw2
- vld1.32 {d14,d15}, [r4]
- vld1.32 {d12,d13}, [r6]
+ vld1.32 {d14,d15}, [r4,:64]
+ vld1.32 {d12,d13}, [r6,:64]
vmls.f32 d20, d19, d17
vmla.f32 d21, d18, d17
@@ -122,8 +122,8 @@
vmul.f32 d22, d14, d12
vmul.f32 d23, d12, d15
// load Fout[m3], *tw3
- vld1.32 {d18,d19}, [r0]
- vld1.32 {d16,d17}, [sl]
+ vld1.32 {d18,d19}, [r0,:64]
+ vld1.32 {d16,d17}, [sl,:64]
vmls.f32 d22, d15, d13
vmla.f32 d23, d14, d13
@@ -132,7 +132,7 @@
vmul.f32 d25, d16, d19
// C_SUB(scratch[1], *Fout, scratch[3]);
- vld1.32 {d14,d15}, [r2]
+ vld1.32 {d14,d15}, [r2,:64]
vsub.f32 q13, q7, q11
vmls.f32 d24, d19, d17
@@ -142,15 +142,15 @@
vadd.f32 q7, q7, q11
// C_ADD(scratch[3], scratch[0], scratch[2]);
vadd.f32 q11, q10, q12
- vst1.32 {d16,d17}, [r2]
+ vst1.32 {d16,d17}, [r2,:64]
// C_SUB(Fout[m2], *Fout, scratch[3]);
vsub.f32 q9, q7, q11
- vst1.32 {d18,d19}, [r4]!
+ vst1.32 {d18,d19}, [r4,:64]!
// C_ADDTO(*Fout, scratch[3]);
vadd.f32 q7, q7, q11
- vst1.32 {d14,d15}, [r2]!
+ vst1.32 {d14,d15}, [r2,:64]!
add ip, ip, r8
add r6, r6, r7
@@ -163,13 +163,13 @@
// Fout[m].i = scratch[1].i + scratch[3].r;
vadd.f32 d18, d26, d23
vsub.f32 d19, d27, d22
- vst1.32 {d18,d19}, [r5]!
+ vst1.32 {d18,d19}, [r5,:64]!
// Fout[m3].r = scratch[1].r + scratch[3].i;
// Fout[m3].i = scratch[1].i - scratch[3].r;
vsub.f32 d18, d26, d23
vadd.f32 d19, d27, d22
- vst1.32 {d18,d19}, [r0]!
+ vst1.32 {d18,d19}, [r0,:64]!
subs r3, r3, #1
bne .forward_loop
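
The only functional change in kf_bfly4.S is the ":64" qualifier added to every
vld1.32/vst1.32: it tells the NEON unit that the address in the base register
is 64-bit aligned, enabling the faster aligned transfer path (and raising an
alignment fault if that promise is broken). Since each peck_fft_cpx holds two
32-bit floats, element addresses are naturally 8-byte aligned whenever the
underlying buffer is. The C_MUL/C_SUB/C_ADD/C_ADDTO comments name the
kissfft-style complex-arithmetic macros the assembly mirrors; a plain-C sketch
of those operations (struct and function names here are illustrative):

    /* Complex helpers matching the macros named in the assembly comments.
     * peck_fft_cpx carries .r/.i members per _peck_fft_guts.h; float
     * scalars are assumed here. */
    typedef struct { float r, i; } cpx;

    static void c_mul(cpx *m, cpx a, cpx b) {   /* C_MUL(m, a, b)   */
        m->r = a.r * b.r - a.i * b.i;
        m->i = a.r * b.i + a.i * b.r;
    }
    static void c_sub(cpx *res, cpx a, cpx b) { /* C_SUB(res, a, b) */
        res->r = a.r - b.r;
        res->i = a.i - b.i;
    }
    static void c_add(cpx *res, cpx a, cpx b) { /* C_ADD(res, a, b) */
        res->r = a.r + b.r;
        res->i = a.i + b.i;
    }
    static void c_addto(cpx *res, cpx a) {      /* C_ADDTO(res, a)  */
        res->r += a.r;
        res->i += a.i;
    }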
diff -r abdcde012978 -r 655dc5c14169 peck_fft.c
--- a/peck_fft.c Thu Sep 22 15:19:18 2011 +0200
+++ b/peck_fft.c Thu Sep 22 16:58:25 2011 +0200
@@ -296,14 +296,15 @@
case 3: kf_bfly3(Fout, fstride, st, m); break;
case 4:
{
-static unsigned counter = 0;
- armv7_cycles_start();
- unsigned int t1 = armv7_cycles_read();
+//static unsigned counter = 0;
+// armv7_cycles_start();
+// unsigned int t1 = armv7_cycles_read();
+//printf("%08x %d %d\n", Fout, fstride, m);
kf_bfly4(Fout, fstride, st, m);
- unsigned int t2 = armv7_cycles_read();
- armv7_cycles_stop();
- counter++;
- if (counter > 150 && counter < 160) printf("XX %d\n", t2-t1);
+// unsigned int t2 = armv7_cycles_read();
+// armv7_cycles_stop();
+// counter++;
+// if (counter > 150 && counter < 160) printf("XX %d\n", t2-t1);
}
break;
case 5: kf_bfly5(Fout, fstride, st, m); break;
@@ -344,16 +345,16 @@
* The return value is a contiguous block of memory, allocated with malloc. As such,
* it can be freed with free(), rather than a peck_fft-specific function.
*/
-peck_fft_cfg peck_fft_alloc(int nfft, int inverse_fft, void * mem, size_t * lenmem) {
+peck_fft_cfg peck_fft_alloc(int nfft, int inverse_fft, void *mem, size_t *lenmem) {
peck_fft_cfg st = NULL;
size_t memneeded = sizeof(struct peck_fft_state)
+ sizeof(peck_fft_cpx)*(nfft-1); /* twiddle factors */
if (lenmem == NULL) {
- st = ( peck_fft_cfg)PECK_FFT_MALLOC(memneeded);
+ st = (peck_fft_cfg) PECK_FFT_MALLOC(memneeded);
} else {
if (mem != NULL && *lenmem >= memneeded)
- st = (peck_fft_cfg)mem;
+ st = (peck_fft_cfg) mem;
*lenmem = memneeded;
}
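
For reference, the lenmem/mem contract visible in peck_fft_alloc above allows
two calling patterns: pass lenmem == NULL to receive a malloc'd config
(released with plain free(), per the comment), or query the required size
first and supply caller-owned memory. A small usage sketch under those
assumptions (the nfft value and forward/inverse flag are arbitrary examples):

    #include <stdlib.h>
    #include "peck_fft.h"

    void alloc_examples(void) {
        /* 1. Let peck_fft_alloc allocate the state; release it with free(). */
        peck_fft_cfg cfg = peck_fft_alloc(1024, 0, NULL, NULL);
        /* ... run transforms with cfg ... */
        free(cfg);

        /* 2. Query the required size, then pass caller-owned memory. */
        size_t len = 0;
        peck_fft_alloc(1024, 0, NULL, &len);   /* writes needed size to len */
        void *mem = malloc(len);
        peck_fft_cfg cfg2 = peck_fft_alloc(1024, 0, mem, &len);
        /* ... run transforms with cfg2 ... */
        free(mem);
        (void)cfg2;
    }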
diff -r abdcde012978 -r 655dc5c14169 peck_fftr.c
--- a/peck_fftr.c Thu Sep 22 15:19:18 2011 +0200
+++ b/peck_fftr.c Thu Sep 22 16:58:25 2011 +0200
@@ -19,9 +19,7 @@
peck_fft_cfg substate;
peck_fft_cpx *tmpbuf;
peck_fft_cpx *super_twiddles;
-#if USE_SIMD == SIMD_SSE2
void *pad;
-#endif
};
peck_fftr_cfg peck_fftr_alloc(int nfft, int inverse_fft, void *mem, size_t *lenmem) {
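
Dropping the USE_SIMD guard means the pad member is present in every build, so
sizeof(struct peck_fftr_state) no longer depends on the SIMD configuration;
presumably this keeps size bookkeeping (the lenmem pattern above) consistent
across builds. A generic illustration of the problem a conditional member
causes (the struct and macro below are hypothetical, not from this code):

    #include <stdio.h>

    struct state_with_conditional_pad {
        void *substate;
        void *tmpbuf;
        void *super_twiddles;
    #ifdef WITH_SIMD_PAD            /* hypothetical configuration macro */
        void *pad;
    #endif
    };

    int main(void) {
        /* Compiled with and without -DWITH_SIMD_PAD, this prints different
         * sizes -- any size computed under one configuration is wrong for
         * the other. Making pad unconditional removes that mismatch. */
        printf("sizeof = %zu\n", sizeof(struct state_with_conditional_pad));
        return 0;
    }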