changeset 12:655dc5c14169

backup
author Peter Meerwald <p.meerwald@bct-electronic.com>
date Thu, 22 Sep 2011 16:58:25 +0200 (2011-09-22)
parents abdcde012978
children 3e85a9101f02
files _peck_fft_guts.h kf_bfly4.S peck_fft.c peck_fftr.c
diffstat 4 files changed, 39 insertions(+), 41 deletions(-) [+]
line wrap: on
line diff
--- a/_peck_fft_guts.h	Thu Sep 22 15:19:18 2011 +0200
+++ b/_peck_fft_guts.h	Thu Sep 22 16:58:25 2011 +0200
@@ -13,16 +13,15 @@
 */
 
 /* peck_fft.h
-   defines peck_fft_scalar as either short or a float type
-   and defines
-   typedef struct { peck_fft_scalar r; peck_fft_scalar i; }peck_fft_cpx; */
+ * defines peck_fft_scalar as either short or a float type and defines
+ * typedef struct { peck_fft_scalar r; peck_fft_scalar i; } peck_fft_cpx; 
+ */
 #include "peck_fft.h"
 #include <limits.h>
 
 #define MAXFACTORS 32
 /* e.g. an fft of length 128 has 4 factors 
- as far as kissfft is concerned
- 4*4*4*2
+ * as far as kissfft is concerned: 4*4*4*2
  */
 
 struct peck_fft_state{
--- a/kf_bfly4.S	Thu Sep 22 15:19:18 2011 +0200
+++ b/kf_bfly4.S	Thu Sep 22 16:58:25 2011 +0200
@@ -36,13 +36,13 @@
 	mov	r6, ip
 .inverse_loop:
     // C_MUL(scratch[0], Fout[m], *tw1);
-    vld1.32     {d18,d19}, [r5]
-    vld1.32     {d16,d17}, [ip]
+    vld1.32     {d18,d19}, [r5,:64]
+    vld1.32     {d16,d17}, [ip,:64]
 	vmul.f32	d20, d18, d16
 	vmul.f32	d21, d16, d19
         // load Fout[m2], *tw2
-        vld1.32     {d14,d15}, [r4]
-        vld1.32     {d12,d13}, [r6]
+        vld1.32     {d14,d15}, [r4,:64]
+        vld1.32     {d12,d13}, [r6,:64]
 	vmls.f32	d20, d19, d17
 	vmla.f32	d21, d18, d17
 	
@@ -50,8 +50,8 @@
 	vmul.f32	d22, d14, d12
 	vmul.f32	d23, d12, d15
         // load Fout[m3], *tw3
-        vld1.32     {d18,d19}, [r0]
-        vld1.32     {d16,d17}, [sl]
+        vld1.32     {d18,d19}, [r0,:64]
+        vld1.32     {d16,d17}, [sl,:64]
 	vmls.f32	d22, d15, d13
 	vmla.f32	d23, d14, d13
 
@@ -60,7 +60,7 @@
 	vmul.f32	d25, d16, d19
 	
     // C_SUB(scratch[1], *Fout, scratch[3]);
-    vld1.32     {d14,d15}, [r2]
+    vld1.32     {d14,d15}, [r2,:64]
     vsub.f32   q13, q7, q11
 
 	vmls.f32	d24, d19, d17
@@ -70,15 +70,15 @@
     vadd.f32   q7, q7, q11
         // C_ADD(scratch[3], scratch[0], scratch[2]);
         vadd.f32   q11, q10, q12
-	vst1.32     {d16,d17}, [r2]
+	vst1.32     {d16,d17}, [r2,:64]
 	
     // C_SUB(Fout[m2], *Fout, scratch[3]);
     vsub.f32   q9, q7, q11
-    vst1.32     {d18,d19}, [r4]!
+    vst1.32     {d18,d19}, [r4,:64]!
     
     // C_ADDTO(*Fout, scratch[3]);
     vadd.f32   q7, q7, q11
-    vst1.32     {d14,d15}, [r2]!
+    vst1.32     {d14,d15}, [r2,:64]!
 
 	add	ip, ip, r8
 	add	r6, r6, r7
@@ -91,13 +91,13 @@
     // Fout[m].i = scratch[1].i + scratch[3].r;
     vsub.f32    d18, d26, d23
     vadd.f32    d19, d27, d22
-    vst1.32     {d18,d19}, [r5]!
+    vst1.32     {d18,d19}, [r5,:64]!
     
     // Fout[m3].r = scratch[1].r + scratch[3].i;
     // Fout[m3].i = scratch[1].i - scratch[3].r;
     vadd.f32    d18, d26, d23
     vsub.f32    d19, d27, d22
-    vst1.32     {d18,d19}, [r0]!
+    vst1.32     {d18,d19}, [r0,:64]!
 
 	subs	r3, r3, #1
 	bne	.inverse_loop
@@ -108,13 +108,13 @@
 	mov	r6, ip
 .forward_loop:
     // C_MUL(scratch[0], Fout[m], *tw1);
-    vld1.32     {d18,d19}, [r5]
-    vld1.32     {d16,d17}, [ip]
+    vld1.32     {d18,d19}, [r5,:64]
+    vld1.32     {d16,d17}, [ip,:64]
 	vmul.f32	d20, d18, d16
 	vmul.f32	d21, d16, d19
         // load Fout[m2], *tw2
-        vld1.32     {d14,d15}, [r4]
-        vld1.32     {d12,d13}, [r6]
+        vld1.32     {d14,d15}, [r4,:64]
+        vld1.32     {d12,d13}, [r6,:64]
 	vmls.f32	d20, d19, d17
 	vmla.f32	d21, d18, d17
 	
@@ -122,8 +122,8 @@
 	vmul.f32	d22, d14, d12
 	vmul.f32	d23, d12, d15
         // load Fout[m3], *tw3
-        vld1.32     {d18,d19}, [r0]
-        vld1.32     {d16,d17}, [sl]
+        vld1.32     {d18,d19}, [r0,:64]
+        vld1.32     {d16,d17}, [sl,:64]
 	vmls.f32	d22, d15, d13
 	vmla.f32	d23, d14, d13
 
@@ -132,7 +132,7 @@
 	vmul.f32	d25, d16, d19
 	
     // C_SUB(scratch[1], *Fout, scratch[3]);
-    vld1.32     {d14,d15}, [r2]
+    vld1.32     {d14,d15}, [r2,:64]
     vsub.f32   q13, q7, q11
 
 	vmls.f32	d24, d19, d17
@@ -142,15 +142,15 @@
     vadd.f32   q7, q7, q11
         // C_ADD(scratch[3], scratch[0], scratch[2]);
         vadd.f32   q11, q10, q12
-	vst1.32     {d16,d17}, [r2]
+	vst1.32     {d16,d17}, [r2,:64]
 	
     // C_SUB(Fout[m2], *Fout, scratch[3]);
     vsub.f32   q9, q7, q11
-    vst1.32     {d18,d19}, [r4]!
+    vst1.32     {d18,d19}, [r4,:64]!
     
     // C_ADDTO(*Fout, scratch[3]);
     vadd.f32   q7, q7, q11
-    vst1.32     {d14,d15}, [r2]!
+    vst1.32     {d14,d15}, [r2,:64]!
 
 	add	ip, ip, r8
 	add	r6, r6, r7
@@ -163,13 +163,13 @@
     // Fout[m].i = scratch[1].i + scratch[3].r;
     vadd.f32    d18, d26, d23
     vsub.f32    d19, d27, d22
-    vst1.32     {d18,d19}, [r5]!
+    vst1.32     {d18,d19}, [r5,:64]!
     
     // Fout[m3].r = scratch[1].r + scratch[3].i;
     // Fout[m3].i = scratch[1].i - scratch[3].r;
     vsub.f32    d18, d26, d23
     vadd.f32    d19, d27, d22
-    vst1.32     {d18,d19}, [r0]!
+    vst1.32     {d18,d19}, [r0,:64]!
 
 	subs	r3, r3, #1
 	bne	.forward_loop
--- a/peck_fft.c	Thu Sep 22 15:19:18 2011 +0200
+++ b/peck_fft.c	Thu Sep 22 16:58:25 2011 +0200
@@ -296,14 +296,15 @@
         case 3: kf_bfly3(Fout, fstride, st, m); break; 
         case 4: 
         {
-static unsigned counter = 0;        
-            armv7_cycles_start();
-            unsigned int t1 = armv7_cycles_read();
+//static unsigned counter = 0;        
+//            armv7_cycles_start();
+//            unsigned int t1 = armv7_cycles_read();
+//printf("%08x %d %d\n", Fout, fstride, m);
             kf_bfly4(Fout, fstride, st, m); 
-            unsigned int t2 = armv7_cycles_read();
-            armv7_cycles_stop();
-            counter++;
-            if (counter > 150 && counter < 160) printf("XX %d\n", t2-t1);
+//            unsigned int t2 = armv7_cycles_read();
+//            armv7_cycles_stop();
+//            counter++;
+//            if (counter > 150 && counter < 160) printf("XX %d\n", t2-t1);
 }            
             break;
         case 5: kf_bfly5(Fout, fstride, st, m); break; 
@@ -344,16 +345,16 @@
  * The return value is a contiguous block of memory, allocated with malloc.  As such,
  * it can be freed with free(), rather than a peck_fft-specific function.
  */
-peck_fft_cfg peck_fft_alloc(int nfft, int inverse_fft, void * mem, size_t * lenmem) {
+peck_fft_cfg peck_fft_alloc(int nfft, int inverse_fft, void *mem, size_t *lenmem) {
     peck_fft_cfg st = NULL;
     size_t memneeded = sizeof(struct peck_fft_state)
         + sizeof(peck_fft_cpx)*(nfft-1); /* twiddle factors */
 
     if (lenmem == NULL) {
-        st = ( peck_fft_cfg)PECK_FFT_MALLOC(memneeded);
+        st = (peck_fft_cfg) PECK_FFT_MALLOC(memneeded);
     } else {
         if (mem != NULL && *lenmem >= memneeded)
-            st = (peck_fft_cfg)mem;
+            st = (peck_fft_cfg) mem;
         *lenmem = memneeded;
     }
     
--- a/peck_fftr.c	Thu Sep 22 15:19:18 2011 +0200
+++ b/peck_fftr.c	Thu Sep 22 16:58:25 2011 +0200
@@ -19,9 +19,7 @@
     peck_fft_cfg substate;
     peck_fft_cpx *tmpbuf;
     peck_fft_cpx *super_twiddles;
-#if USE_SIMD == SIMD_SSE2
     void *pad;
-#endif    
 };
 
 peck_fftr_cfg peck_fftr_alloc(int nfft, int inverse_fft, void *mem, size_t *lenmem) {

Repositories maintained by Peter Meerwald, pmeerw@pmeerw.net.