# HG changeset patch # User Peter Meerwald # Date 1316174659 -7200 # Node ID 2d6c49fcafcbe4b62cf1c0ad0ec6a7875f429938 # Parent 3b31bd44a09f7975d07f047952b78993b97ffe35 neon2 and neon4 support diff -r 3b31bd44a09f -r 2d6c49fcafcb _peck_fft_guts.h --- a/_peck_fft_guts.h Fri Sep 16 13:08:20 2011 +0200 +++ b/_peck_fft_guts.h Fri Sep 16 14:04:19 2011 +0200 @@ -125,17 +125,25 @@ #ifdef FIXED_POINT -# define PECK_FFT_COS(phase) floor(.5+SAMP_MAX * cos (phase)) -# define PECK_FFT_SIN(phase) floor(.5+SAMP_MAX * sin (phase)) -# define HALF_OF(x) ((x)>>1) -#elif defined(USE_SIMD) -# define PECK_FFT_COS(phase) _mm_set1_ps( cos(phase) ) -# define PECK_FFT_SIN(phase) _mm_set1_ps( sin(phase) ) -# define HALF_OF(x) ((x)*_mm_set1_ps(.5)) + #define PECK_FFT_COS(phase) floorf(0.5f+SAMP_MAX * cosf(phase)) + #define PECK_FFT_SIN(phase) floorf(0.5f+SAMP_MAX * sinf(phase)) + #define HALF_OF(x) ((x)>>1) +#elif USE_SIMD == SIMD_SSE2 + #define PECK_FFT_COS(phase) _mm_set1_ps(cosf(phase)) + #define PECK_FFT_SIN(phase) _mm_set1_ps(sinf(phase)) + #define HALF_OF(x) ((x)*_mm_set1_ps(0.5f)) +#elif USE_SIMD == SIMD_NEON4 + #define PECK_FFT_COS(phase) vdupq_n_f32(cosf(phase)) + #define PECK_FFT_SIN(phase) vdupq_n_f32(sinf(phase)) + #define HALF_OF(x) ((x)*vdupq_n_f32(0.5f)) +#elif USE_SIMD == SIMD_NEON2 + #define PECK_FFT_COS(phase) vdup_n_f32(cosf(phase)) + #define PECK_FFT_SIN(phase) vdup_n_f32(sinf(phase)) + #define HALF_OF(x) ((x)*vdup_n_f32(0.5f)) #else -# define PECK_FFT_COS(phase) (peck_fft_scalar) cos(phase) -# define PECK_FFT_SIN(phase) (peck_fft_scalar) sin(phase) -# define HALF_OF(x) ((x)*.5) + #define PECK_FFT_COS(phase) (peck_fft_scalar) cosf(phase) + #define PECK_FFT_SIN(phase) (peck_fft_scalar) sinf(phase) + #define HALF_OF(x) ((x)*0.5f) #endif #define kf_cexp(x,phase) \ diff -r 3b31bd44a09f -r 2d6c49fcafcb compile.sh --- a/compile.sh Fri Sep 16 13:08:20 2011 +0200 +++ b/compile.sh Fri Sep 16 14:04:19 2011 +0200 @@ -1,21 +1,22 @@ + +/opt/arm-2011.03/bin/arm-none-linux-gnueabi-gcc \ 
+ -O3 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp -ffast-math -fomit-frame-pointer \ + -DUSE_SIMD=SIMD_NEON2 \ + -I . \ + -o peck_test_arm \ + peck_fftr.c peck_fft.c \ + peck_test.c \ + -lm gcc \ + -O2 -march=native -msse2 -mfpmath=sse -ffast-math -fomit-frame-pointer \ + -DUSE_SIMD=SIMD_SSE2 \ -I . \ -o peck_test_x86 \ peck_fftr.c peck_fft.c \ peck_test.c \ -lm -exit +# time ./peck_test_x86 -/opt/arm-2011.03/bin/arm-none-linux-gnueabi-gcc \ - -I . \ - -O3 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp -ffast-math -fomit-frame-pointer \ - -o peck_test_arm \ - peck_fftr.c peck_fft.c \ - peck_test.c \ - -lm - -time ./peck_test_x86 - -scp peck_test_arm root@192.168.233.114:. +scp peck_test_arm root@192.168.233.104:. diff -r 3b31bd44a09f -r 2d6c49fcafcb peck_fft.c --- a/peck_fft.c Fri Sep 16 13:08:20 2011 +0200 +++ b/peck_fft.c Fri Sep 16 14:04:19 2011 +0200 @@ -19,10 +19,10 @@ */ static void kf_bfly2( - peck_fft_cpx * Fout, - const size_t fstride, - const peck_fft_cfg st, - int m) { + peck_fft_cpx * Fout, + const size_t fstride, + const peck_fft_cfg st, + int m) { //printf("kf_bfly2\n"); @@ -44,10 +44,11 @@ } static void kf_bfly4( - peck_fft_cpx * Fout, - const size_t fstride, - const peck_fft_cfg st, - const size_t m) { + peck_fft_cpx * Fout, + const size_t fstride, + const peck_fft_cfg st, + const size_t m) { + peck_fft_cpx *tw1,*tw2,*tw3; peck_fft_cpx scratch[6]; size_t k=m; @@ -94,10 +95,11 @@ } static void kf_bfly3( - peck_fft_cpx * Fout, - const size_t fstride, - const peck_fft_cfg st, - size_t m) { + peck_fft_cpx * Fout, + const size_t fstride, + const peck_fft_cfg st, + size_t m) { + size_t k=m; const size_t m2 = 2*m; peck_fft_cpx *tw1, *tw2; @@ -107,7 +109,6 @@ printf("kf_bfly3\n"); - tw1=tw2=st->twiddles; do { @@ -139,12 +140,11 @@ } static void kf_bfly5( - peck_fft_cpx * Fout, - const size_t fstride, - const peck_fft_cfg st, - int m - ) -{ + peck_fft_cpx * Fout, + const size_t fstride, + const peck_fft_cfg st, + int m + ) { peck_fft_cpx 
*Fout0,*Fout1,*Fout2,*Fout3,*Fout4; int u; peck_fft_cpx scratch[13]; @@ -156,7 +156,6 @@ printf("kf_bfly5\n"); - Fout0=Fout; Fout1=Fout0+m; Fout2=Fout0+2*m; @@ -164,7 +163,7 @@ Fout4=Fout0+4*m; tw=st->twiddles; - for ( u=0; utwiddles; peck_fft_cpx t; @@ -218,7 +216,6 @@ printf("kf_bfly_generic\n"); - peck_fft_cpx * scratch = (peck_fft_cpx*)PECK_FFT_TMP_ALLOC(sizeof(peck_fft_cpx)*p); for ( u=0; u -# define peck_fft_scalar __m128 -#define PECK_FFT_MALLOC(nbytes) _mm_malloc(nbytes, 16) -#define PECK_FFT_FREE _mm_free +#define SIMD_SSE2 1 +#define SIMD_NEON4 2 +#define SIMD_NEON2 3 + +#if USE_SIMD == SIMD_SSE2 + #include <xmmintrin.h> + #define peck_fft_scalar __m128 + #define PECK_FFT_MALLOC(nbytes) _mm_malloc(nbytes, 16) + #define PECK_FFT_FREE _mm_free +#elif USE_SIMD == SIMD_NEON4 + #include <arm_neon.h> + #define peck_fft_scalar float32x4_t + #define PECK_FFT_MALLOC malloc + #define PECK_FFT_FREE free +#elif USE_SIMD == SIMD_NEON2 + #include <arm_neon.h> + #define peck_fft_scalar float32x2_t + #define PECK_FFT_MALLOC malloc + #define PECK_FFT_FREE free #else -#define PECK_FFT_MALLOC malloc -#define PECK_FFT_FREE free + #define PECK_FFT_MALLOC malloc + #define PECK_FFT_FREE free #endif diff -r 3b31bd44a09f -r 2d6c49fcafcb peck_fftr.c --- a/peck_fftr.c Fri Sep 16 13:08:20 2011 +0200 +++ b/peck_fftr.c Fri Sep 16 14:04:19 2011 +0200 @@ -19,8 +19,8 @@ peck_fft_cfg substate; peck_fft_cpx *tmpbuf; peck_fft_cpx *super_twiddles; -#ifdef USE_SIMD - void * pad; +#if USE_SIMD == SIMD_SSE2 + void *pad; #endif }; @@ -34,10 +34,9 @@ return NULL; } nfft >>= 1; + peck_fft_alloc(nfft, inverse_fft, NULL, &subsize); - peck_fft_alloc(nfft, inverse_fft, NULL, &subsize); memneeded = sizeof(struct peck_fftr_state) + subsize + sizeof(peck_fft_cpx) * (nfft * 3 / 2); - if (lenmem == NULL) { st = (peck_fftr_cfg) PECK_FFT_MALLOC(memneeded); } else { @@ -51,6 +50,7 @@ st->substate = (peck_fft_cfg) (st + 1); /* just beyond peck_fftr_state struct */ st->tmpbuf = (peck_fft_cpx *) (((char *) st->substate) + subsize); st->super_twiddles 
= st->tmpbuf + nfft; + peck_fft_alloc(nfft, inverse_fft, st->substate, &subsize); for (i = 0; i < nfft/2; ++i) { @@ -60,6 +60,7 @@ phase *= -1; kf_cexp(st->super_twiddles+i, phase); } + return st; } @@ -94,8 +95,12 @@ CHECK_OVERFLOW_OP(tdc.r ,-, tdc.i); freqdata[0].r = tdc.r + tdc.i; freqdata[ncfft].r = tdc.r - tdc.i; -#ifdef USE_SIMD +#if USE_SIMD == SIMD_SSE2 freqdata[ncfft].i = freqdata[0].i = _mm_set1_ps(0); +#elif USE_SIMD == SIMD_NEON4 + freqdata[ncfft].i = freqdata[0].i = vdupq_n_f32(0.0f); +#elif USE_SIMD == SIMD_NEON2 + freqdata[ncfft].i = freqdata[0].i = vdup_n_f32(0.0f); #else freqdata[ncfft].i = freqdata[0].i = 0; #endif @@ -138,16 +143,20 @@ fk = freqdata[k]; fnkc.r = freqdata[ncfft - k].r; fnkc.i = -freqdata[ncfft - k].i; - C_FIXDIV(fk , 2); - C_FIXDIV(fnkc , 2); + C_FIXDIV(fk, 2); + C_FIXDIV(fnkc, 2); C_ADD(fek, fk, fnkc); C_SUB(tmp, fk, fnkc); C_MUL(fok, tmp, st->super_twiddles[k-1]); C_ADD(st->tmpbuf[k], fek, fok); C_SUB(st->tmpbuf[ncfft - k], fek, fok); -#ifdef USE_SIMD - st->tmpbuf[ncfft - k].i *= _mm_set1_ps(-1.0); +#if USE_SIMD == SIMD_SSE2 + st->tmpbuf[ncfft - k].i *= _mm_set1_ps(-1.0f); +#elif USE_SIMD == SIMD_NEON4 + st->tmpbuf[ncfft - k].i *= vdupq_n_f32(-1.0f); +#elif USE_SIMD == SIMD_NEON2 + st->tmpbuf[ncfft - k].i *= vdup_n_f32(-1.0f); #else st->tmpbuf[ncfft - k].i *= -1; #endif diff -r 3b31bd44a09f -r 2d6c49fcafcb peck_test.c --- a/peck_test.c Fri Sep 16 13:08:20 2011 +0200 +++ b/peck_test.c Fri Sep 16 14:04:19 2011 +0200 @@ -25,7 +25,7 @@ unsigned int i, j; peck_fftr_cfg p, pi; - enable_runfast(); +// enable_runfast(); const unsigned int N = 256; @@ -34,7 +34,15 @@ peck_fft_scalar res[N]; for (i = 0; i < N; i++) { +#if USE_SIMD == SIMD_SSE2 + in[i] = _mm_set1_ps((i % 13) / 3); +#elif USE_SIMD == SIMD_NEON4 + in[i] = vdupq_n_f32((i % 13) / 3); +#elif USE_SIMD == SIMD_NEON2 + in[i] = vdup_n_f32((i % 13) / 3); +#else in[i] = (i % 13) / 3; +#endif } p = peck_fftr_alloc(N, 0, NULL, NULL); @@ -43,7 +51,7 @@ for (j = 0; j < 10000; j++) { if 
(j == 0) { for (i = 0; i < 8; i++) - printf("%d: %f\n", i, in[i]); + printf("%d: %f\n", i, *(float*)&in[i]); printf("----\n"); } @@ -51,7 +59,7 @@ if (j == 0) { for (i = 0; i < 8; i++) - printf("%d: %f %f\n", i, out[i].r, out[i].i); + printf("%d: %f %f\n", i, *(float*)&out[i].r, *(float*)&out[i].i); printf("----\n"); } @@ -59,7 +67,7 @@ if (j == 0) { for (i = 0; i < 8; i++) - printf("%d: %f\n", i, res[i] / N); + printf("%d: %f\n", i, *(float*)&res[i] / N); } } peck_fftr_free(p); @@ -67,7 +75,7 @@ peck_fft_cleanup(); for (i = 0; i < N; i++) { - if (fabs(in[i] - res[i]/N) > 0.00001) { + if (fabs(*(float*)&in[i] - *(float*)&res[i]/N) > 0.00001) { fprintf(stderr, "!!!! ERROR !!!! at %d\n", i); exit(EXIT_FAILURE); }