changeset 4:2d6c49fcafcb

neon2 and neon4 support
author Peter Meerwald <p.meerwald@bct-electronic.com>
date Fri, 16 Sep 2011 14:04:19 +0200 (2011-09-16)
parents 3b31bd44a09f
children c7237a7544eb
files _peck_fft_guts.h compile.sh peck_fft.c peck_fft.h peck_fftr.c peck_test.c
diffstat 6 files changed, 109 insertions(+), 72 deletions(-) [+]
line wrap: on
line diff
--- a/_peck_fft_guts.h	Fri Sep 16 13:08:20 2011 +0200
+++ b/_peck_fft_guts.h	Fri Sep 16 14:04:19 2011 +0200
@@ -125,17 +125,25 @@
 
 
 #ifdef FIXED_POINT
-#  define PECK_FFT_COS(phase)  floor(.5+SAMP_MAX * cos (phase))
-#  define PECK_FFT_SIN(phase)  floor(.5+SAMP_MAX * sin (phase))
-#  define HALF_OF(x) ((x)>>1)
-#elif defined(USE_SIMD)
-#  define PECK_FFT_COS(phase) _mm_set1_ps( cos(phase) )
-#  define PECK_FFT_SIN(phase) _mm_set1_ps( sin(phase) )
-#  define HALF_OF(x) ((x)*_mm_set1_ps(.5))
+    #define PECK_FFT_COS(phase) floorf(0.5f+SAMP_MAX * cosf(phase))
+    #define PECK_FFT_SIN(phase) floorf(0.5f+SAMP_MAX * sinf(phase))
+    #define HALF_OF(x) ((x)>>1)
+#elif USE_SIMD == SIMD_SSE2
+    #define PECK_FFT_COS(phase) _mm_set1_ps(cosf(phase))
+    #define PECK_FFT_SIN(phase) _mm_set1_ps(sinf(phase))
+    #define HALF_OF(x) ((x)*_mm_set1_ps(0.5f))
+#elif USE_SIMD == SIMD_NEON4
+    #define PECK_FFT_COS(phase) vdupq_n_f32(cosf(phase))
+    #define PECK_FFT_SIN(phase) vdupq_n_f32(sinf(phase))
+    #define HALF_OF(x) ((x)*vdupq_n_f32(0.5f))
+#elif USE_SIMD == SIMD_NEON2
+    #define PECK_FFT_COS(phase) vdup_n_f32(cosf(phase))
+    #define PECK_FFT_SIN(phase) vdup_n_f32(sinf(phase))
+    #define HALF_OF(x) ((x)*vdup_n_f32(0.5f))
 #else
-#  define PECK_FFT_COS(phase) (peck_fft_scalar) cos(phase)
-#  define PECK_FFT_SIN(phase) (peck_fft_scalar) sin(phase)
-#  define HALF_OF(x) ((x)*.5)
+    #define PECK_FFT_COS(phase) (peck_fft_scalar) cosf(phase)
+    #define PECK_FFT_SIN(phase) (peck_fft_scalar) sinf(phase)
+    #define HALF_OF(x) ((x)*0.5f)
 #endif
 
 #define  kf_cexp(x,phase) \
--- a/compile.sh	Fri Sep 16 13:08:20 2011 +0200
+++ b/compile.sh	Fri Sep 16 14:04:19 2011 +0200
@@ -1,21 +1,22 @@
+
+/opt/arm-2011.03/bin/arm-none-linux-gnueabi-gcc \
+	-O3 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp -ffast-math -fomit-frame-pointer \
+    -DUSE_SIMD=SIMD_NEON2 \
+    -I . \
+    -o peck_test_arm \
+    peck_fftr.c peck_fft.c \
+    peck_test.c \
+    -lm 
 
 gcc \
+    -O2 -march=native -msse2 -mfpmath=sse -ffast-math -fomit-frame-pointer \
+    -DUSE_SIMD=SIMD_SSE2 \
     -I . \
     -o peck_test_x86 \
     peck_fftr.c peck_fft.c \
     peck_test.c \
     -lm 
 
-exit
+# time ./peck_test_x86
 
-/opt/arm-2011.03/bin/arm-none-linux-gnueabi-gcc \
-    -I . \
-	-O3 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp -ffast-math -fomit-frame-pointer \
-    -o peck_test_arm \
-    peck_fftr.c peck_fft.c \
-    peck_test.c \
-    -lm 
-
-time ./peck_test_x86
-
-scp peck_test_arm root@192.168.233.114:.
+scp peck_test_arm root@192.168.233.104:.
--- a/peck_fft.c	Fri Sep 16 13:08:20 2011 +0200
+++ b/peck_fft.c	Fri Sep 16 14:04:19 2011 +0200
@@ -19,10 +19,10 @@
  */
 
 static void kf_bfly2(
-        peck_fft_cpx * Fout,
-        const size_t fstride,
-        const peck_fft_cfg st,
-        int m) {
+    peck_fft_cpx * Fout,
+    const size_t fstride,
+    const peck_fft_cfg st,
+    int m) {
 
 //printf("kf_bfly2\n");
 
@@ -44,10 +44,11 @@
 }
 
 static void kf_bfly4(
-        peck_fft_cpx * Fout,
-        const size_t fstride,
-        const peck_fft_cfg st,
-        const size_t m) {
+    peck_fft_cpx * Fout,
+    const size_t fstride,
+    const peck_fft_cfg st,
+    const size_t m) {
+
     peck_fft_cpx *tw1,*tw2,*tw3;
     peck_fft_cpx scratch[6];
     size_t k=m;
@@ -94,10 +95,11 @@
 }
 
 static void kf_bfly3(
-         peck_fft_cpx * Fout,
-         const size_t fstride,
-         const peck_fft_cfg st,
-         size_t m) {
+     peck_fft_cpx * Fout,
+     const size_t fstride,
+     const peck_fft_cfg st,
+     size_t m) {
+
      size_t k=m;
      const size_t m2 = 2*m;
      peck_fft_cpx *tw1, *tw2;
@@ -107,7 +109,6 @@
 
 printf("kf_bfly3\n");
 
-
      tw1=tw2=st->twiddles;
 
      do {
@@ -139,12 +140,11 @@
 }
 
 static void kf_bfly5(
-        peck_fft_cpx * Fout,
-        const size_t fstride,
-        const peck_fft_cfg st,
-        int m
-        )
-{
+    peck_fft_cpx * Fout,
+    const size_t fstride,
+    const peck_fft_cfg st,
+    int m
+    ) {
     peck_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
     int u;
     peck_fft_cpx scratch[13];
@@ -156,7 +156,6 @@
 
 printf("kf_bfly5\n");
 
-
     Fout0=Fout;
     Fout1=Fout0+m;
     Fout2=Fout0+2*m;
@@ -164,7 +163,7 @@
     Fout4=Fout0+4*m;
 
     tw=st->twiddles;
-    for ( u=0; u<m; ++u ) {
+    for (u = 0; u < m; ++u) {
         C_FIXDIV( *Fout0,5); C_FIXDIV( *Fout1,5); C_FIXDIV( *Fout2,5); C_FIXDIV( *Fout3,5); C_FIXDIV( *Fout4,5);
         scratch[0] = *Fout0;
 
@@ -204,13 +203,12 @@
 
 /* perform the butterfly for one stage of a mixed radix FFT */
 static void kf_bfly_generic(
-        peck_fft_cpx * Fout,
-        const size_t fstride,
-        const peck_fft_cfg st,
-        int m,
-        int p
-        )
-{
+    peck_fft_cpx * Fout,
+    const size_t fstride,
+    const peck_fft_cfg st,
+    int m,
+    int p) {
+    
     int u,k,q1,q;
     peck_fft_cpx * twiddles = st->twiddles;
     peck_fft_cpx t;
@@ -218,7 +216,6 @@
 
 printf("kf_bfly_generic\n");
 
-
     peck_fft_cpx * scratch = (peck_fft_cpx*)PECK_FFT_TMP_ALLOC(sizeof(peck_fft_cpx)*p);
 
     for ( u=0; u<m; ++u ) {
--- a/peck_fft.h	Fri Sep 16 13:08:20 2011 +0200
+++ b/peck_fft.h	Fri Sep 16 14:04:19 2011 +0200
@@ -10,14 +10,28 @@
 extern "C" {
 #endif
 
-#ifdef USE_SIMD
-# include <xmmintrin.h>
-# define peck_fft_scalar __m128
-#define PECK_FFT_MALLOC(nbytes) _mm_malloc(nbytes, 16)
-#define PECK_FFT_FREE _mm_free
+#define SIMD_SSE2 1
+#define SIMD_NEON4 2
+#define SIMD_NEON2 3
+
+#if USE_SIMD == SIMD_SSE2
+    #include <xmmintrin.h>
+    #define peck_fft_scalar __m128
+    #define PECK_FFT_MALLOC(nbytes) _mm_malloc(nbytes, 16)
+    #define PECK_FFT_FREE _mm_free
+#elif USE_SIMD == SIMD_NEON4
+    #include <arm_neon.h>
+    #define peck_fft_scalar float32x4_t
+    #define PECK_FFT_MALLOC malloc
+    #define PECK_FFT_FREE free
+#elif USE_SIMD == SIMD_NEON2
+    #include <arm_neon.h>
+    #define peck_fft_scalar float32x2_t
+    #define PECK_FFT_MALLOC malloc
+    #define PECK_FFT_FREE free
 #else	
-#define PECK_FFT_MALLOC malloc
-#define PECK_FFT_FREE free
+    #define PECK_FFT_MALLOC malloc
+    #define PECK_FFT_FREE free
 #endif	
 
 
--- a/peck_fftr.c	Fri Sep 16 13:08:20 2011 +0200
+++ b/peck_fftr.c	Fri Sep 16 14:04:19 2011 +0200
@@ -19,8 +19,8 @@
     peck_fft_cfg substate;
     peck_fft_cpx *tmpbuf;
     peck_fft_cpx *super_twiddles;
-#ifdef USE_SIMD    
-    void * pad;
+#if USE_SIMD == SIMD_SSE2
+    void *pad;
 #endif    
 };
 
@@ -34,10 +34,9 @@
         return NULL;
     }
     nfft >>= 1;
+    peck_fft_alloc(nfft, inverse_fft, NULL, &subsize);
 
-    peck_fft_alloc(nfft, inverse_fft, NULL, &subsize);
     memneeded = sizeof(struct peck_fftr_state) + subsize + sizeof(peck_fft_cpx) * (nfft * 3 / 2);
-
     if (lenmem == NULL) {
         st = (peck_fftr_cfg) PECK_FFT_MALLOC(memneeded);
     } else {
@@ -51,6 +50,7 @@
     st->substate = (peck_fft_cfg) (st + 1); /* just beyond peck_fftr_state struct */
     st->tmpbuf = (peck_fft_cpx *) (((char *) st->substate) + subsize);
     st->super_twiddles = st->tmpbuf + nfft;
+
     peck_fft_alloc(nfft, inverse_fft, st->substate, &subsize);
 
     for (i = 0; i < nfft/2; ++i) {
@@ -60,6 +60,7 @@
             phase *= -1;
         kf_cexp(st->super_twiddles+i, phase);
     }
+
     return st;
 }
 
@@ -94,8 +95,12 @@
     CHECK_OVERFLOW_OP(tdc.r ,-, tdc.i);
     freqdata[0].r = tdc.r + tdc.i;
     freqdata[ncfft].r = tdc.r - tdc.i;
-#ifdef USE_SIMD    
+#if USE_SIMD == SIMD_SSE2
     freqdata[ncfft].i = freqdata[0].i = _mm_set1_ps(0);
+#elif USE_SIMD == SIMD_NEON4
+    freqdata[ncfft].i = freqdata[0].i = vdupq_n_f32(0.0f);
+#elif USE_SIMD == SIMD_NEON2
+    freqdata[ncfft].i = freqdata[0].i = vdup_n_f32(0.0f);
 #else
     freqdata[ncfft].i = freqdata[0].i = 0;
 #endif
@@ -138,16 +143,20 @@
         fk = freqdata[k];
         fnkc.r = freqdata[ncfft - k].r;
         fnkc.i = -freqdata[ncfft - k].i;
-        C_FIXDIV(fk , 2);
-        C_FIXDIV(fnkc , 2);
+        C_FIXDIV(fk, 2);
+        C_FIXDIV(fnkc, 2);
 
         C_ADD(fek, fk, fnkc);
         C_SUB(tmp, fk, fnkc);
         C_MUL(fok, tmp, st->super_twiddles[k-1]);
         C_ADD(st->tmpbuf[k],     fek, fok);
         C_SUB(st->tmpbuf[ncfft - k], fek, fok);
-#ifdef USE_SIMD        
-        st->tmpbuf[ncfft - k].i *= _mm_set1_ps(-1.0);
+#if USE_SIMD == SIMD_SSE2
+        st->tmpbuf[ncfft - k].i *= _mm_set1_ps(-1.0f);
+#elif USE_SIMD == SIMD_NEON4
+        st->tmpbuf[ncfft - k].i *= vdupq_n_f32(-1.0f);
+#elif USE_SIMD == SIMD_NEON2
+        st->tmpbuf[ncfft - k].i *= vdup_n_f32(-1.0f);
 #else
         st->tmpbuf[ncfft - k].i *= -1;
 #endif
--- a/peck_test.c	Fri Sep 16 13:08:20 2011 +0200
+++ b/peck_test.c	Fri Sep 16 14:04:19 2011 +0200
@@ -25,7 +25,7 @@
     unsigned int i, j;
     peck_fftr_cfg p, pi;
 
-    enable_runfast();
+//    enable_runfast();
 
     const unsigned int N = 256;
 
@@ -34,7 +34,15 @@
     peck_fft_scalar res[N];
 
     for (i = 0; i < N; i++) {
+#if USE_SIMD == SIMD_SSE2
+        in[i] = _mm_set1_ps((i % 13) / 3);
+#elif USE_SIMD == SIMD_NEON4
+        in[i] = vdupq_n_f32((i % 13) / 3);
+#elif USE_SIMD == SIMD_NEON2
+        in[i] = vdup_n_f32((i % 13) / 3);
+#else
         in[i] = (i % 13) / 3; 
+#endif        
     }
 
     p = peck_fftr_alloc(N, 0, NULL, NULL);
@@ -43,7 +51,7 @@
     for (j = 0; j < 10000; j++) {
         if (j == 0) {
             for (i = 0; i < 8; i++)
-                printf("%d: %f\n", i, in[i]);
+                printf("%d: %f\n", i, *(float*)&in[i]);
             printf("----\n");
         }
 
@@ -51,7 +59,7 @@
 
         if (j == 0) {
             for (i = 0; i < 8; i++)
-                printf("%d: %f %f\n", i, out[i].r, out[i].i);
+                printf("%d: %f %f\n", i, *(float*)&out[i].r, *(float*)&out[i].i);
             printf("----\n");
         }
         
@@ -59,7 +67,7 @@
 
         if (j == 0) {
             for (i = 0; i < 8; i++)
-                printf("%d: %f\n", i, res[i] / N);
+                printf("%d: %f\n", i, *(float*)&res[i] / N);
         }
     }
     peck_fftr_free(p);    
@@ -67,7 +75,7 @@
     peck_fft_cleanup();
 
     for (i = 0; i < N; i++) {
-        if (fabs(in[i] - res[i]/N) > 0.00001) {
+        if (fabs(*(float*)&in[i] - *(float*)&res[i]/N) > 0.00001) {
             fprintf(stderr, "!!!! ERROR !!!! at %d\n", i);
             exit(EXIT_FAILURE);
         }

Repositories maintained by Peter Meerwald, pmeerw@pmeerw.net.