# HG changeset patch
# User Peter Meerwald
# Date 1316174659 -7200
# Node ID 2d6c49fcafcbe4b62cf1c0ad0ec6a7875f429938
# Parent 3b31bd44a09f7975d07f047952b78993b97ffe35
neon2 and neon4 support
diff -r 3b31bd44a09f -r 2d6c49fcafcb _peck_fft_guts.h
--- a/_peck_fft_guts.h Fri Sep 16 13:08:20 2011 +0200
+++ b/_peck_fft_guts.h Fri Sep 16 14:04:19 2011 +0200
@@ -125,17 +125,25 @@
#ifdef FIXED_POINT
-# define PECK_FFT_COS(phase) floor(.5+SAMP_MAX * cos (phase))
-# define PECK_FFT_SIN(phase) floor(.5+SAMP_MAX * sin (phase))
-# define HALF_OF(x) ((x)>>1)
-#elif defined(USE_SIMD)
-# define PECK_FFT_COS(phase) _mm_set1_ps( cos(phase) )
-# define PECK_FFT_SIN(phase) _mm_set1_ps( sin(phase) )
-# define HALF_OF(x) ((x)*_mm_set1_ps(.5))
+ #define PECK_FFT_COS(phase) floorf(0.5f+SAMP_MAX * cosf(phase))
+ #define PECK_FFT_SIN(phase) floorf(0.5f+SAMP_MAX * sinf(phase))
+ #define HALF_OF(x) ((x)>>1)
+#elif USE_SIMD == SIMD_SSE2
+ #define PECK_FFT_COS(phase) _mm_set1_ps(cosf(phase))
+ #define PECK_FFT_SIN(phase) _mm_set1_ps(sinf(phase))
+ #define HALF_OF(x) ((x)*_mm_set1_ps(0.5f))
+#elif USE_SIMD == SIMD_NEON4
+ #define PECK_FFT_COS(phase) vdupq_n_f32(cosf(phase))
+ #define PECK_FFT_SIN(phase) vdupq_n_f32(sinf(phase))
+ #define HALF_OF(x) ((x)*vdupq_n_f32(0.5f))
+#elif USE_SIMD == SIMD_NEON2
+ #define PECK_FFT_COS(phase) vdup_n_f32(cosf(phase))
+ #define PECK_FFT_SIN(phase) vdup_n_f32(sinf(phase))
+ #define HALF_OF(x) ((x)*vdup_n_f32(0.5f))
#else
-# define PECK_FFT_COS(phase) (peck_fft_scalar) cos(phase)
-# define PECK_FFT_SIN(phase) (peck_fft_scalar) sin(phase)
-# define HALF_OF(x) ((x)*.5)
+ #define PECK_FFT_COS(phase) (peck_fft_scalar) cosf(phase)
+ #define PECK_FFT_SIN(phase) (peck_fft_scalar) sinf(phase)
+ #define HALF_OF(x) ((x)*0.5f)
#endif
#define kf_cexp(x,phase) \
diff -r 3b31bd44a09f -r 2d6c49fcafcb compile.sh
--- a/compile.sh Fri Sep 16 13:08:20 2011 +0200
+++ b/compile.sh Fri Sep 16 14:04:19 2011 +0200
@@ -1,21 +1,22 @@
+
+/opt/arm-2011.03/bin/arm-none-linux-gnueabi-gcc \
+ -O3 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp -ffast-math -fomit-frame-pointer \
+ -DUSE_SIMD=SIMD_NEON2 \
+ -I . \
+ -o peck_test_arm \
+ peck_fftr.c peck_fft.c \
+ peck_test.c \
+ -lm
gcc \
+ -O2 -march=native -msse2 -mfpmath=sse -ffast-math -fomit-frame-pointer \
+ -DUSE_SIMD=SIMD_SSE2 \
-I . \
-o peck_test_x86 \
peck_fftr.c peck_fft.c \
peck_test.c \
-lm
-exit
+# time ./peck_test_x86
-/opt/arm-2011.03/bin/arm-none-linux-gnueabi-gcc \
- -I . \
- -O3 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp -ffast-math -fomit-frame-pointer \
- -o peck_test_arm \
- peck_fftr.c peck_fft.c \
- peck_test.c \
- -lm
-
-time ./peck_test_x86
-
-scp peck_test_arm root@192.168.233.114:.
+scp peck_test_arm root@192.168.233.104:.
diff -r 3b31bd44a09f -r 2d6c49fcafcb peck_fft.c
--- a/peck_fft.c Fri Sep 16 13:08:20 2011 +0200
+++ b/peck_fft.c Fri Sep 16 14:04:19 2011 +0200
@@ -19,10 +19,10 @@
*/
static void kf_bfly2(
- peck_fft_cpx * Fout,
- const size_t fstride,
- const peck_fft_cfg st,
- int m) {
+ peck_fft_cpx * Fout,
+ const size_t fstride,
+ const peck_fft_cfg st,
+ int m) {
//printf("kf_bfly2\n");
@@ -44,10 +44,11 @@
}
static void kf_bfly4(
- peck_fft_cpx * Fout,
- const size_t fstride,
- const peck_fft_cfg st,
- const size_t m) {
+ peck_fft_cpx * Fout,
+ const size_t fstride,
+ const peck_fft_cfg st,
+ const size_t m) {
+
peck_fft_cpx *tw1,*tw2,*tw3;
peck_fft_cpx scratch[6];
size_t k=m;
@@ -94,10 +95,11 @@
}
static void kf_bfly3(
- peck_fft_cpx * Fout,
- const size_t fstride,
- const peck_fft_cfg st,
- size_t m) {
+ peck_fft_cpx * Fout,
+ const size_t fstride,
+ const peck_fft_cfg st,
+ size_t m) {
+
size_t k=m;
const size_t m2 = 2*m;
peck_fft_cpx *tw1, *tw2;
@@ -107,7 +109,6 @@
printf("kf_bfly3\n");
-
tw1=tw2=st->twiddles;
do {
@@ -139,12 +140,11 @@
}
static void kf_bfly5(
- peck_fft_cpx * Fout,
- const size_t fstride,
- const peck_fft_cfg st,
- int m
- )
-{
+ peck_fft_cpx * Fout,
+ const size_t fstride,
+ const peck_fft_cfg st,
+ int m
+ ) {
peck_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
int u;
peck_fft_cpx scratch[13];
@@ -156,7 +156,6 @@
printf("kf_bfly5\n");
-
Fout0=Fout;
Fout1=Fout0+m;
Fout2=Fout0+2*m;
@@ -164,7 +163,7 @@
Fout4=Fout0+4*m;
tw=st->twiddles;
- for ( u=0; utwiddles;
peck_fft_cpx t;
@@ -218,7 +216,6 @@
printf("kf_bfly_generic\n");
-
peck_fft_cpx * scratch = (peck_fft_cpx*)PECK_FFT_TMP_ALLOC(sizeof(peck_fft_cpx)*p);
for ( u=0; u
-# define peck_fft_scalar __m128
-#define PECK_FFT_MALLOC(nbytes) _mm_malloc(nbytes, 16)
-#define PECK_FFT_FREE _mm_free
+#define SIMD_SSE2 1
+#define SIMD_NEON4 2
+#define SIMD_NEON2 3
+
+#if USE_SIMD == SIMD_SSE2
+ #include
+ #define peck_fft_scalar __m128
+ #define PECK_FFT_MALLOC(nbytes) _mm_malloc(nbytes, 16)
+ #define PECK_FFT_FREE _mm_free
+#elif USE_SIMD == SIMD_NEON4
+ #include
+ #define peck_fft_scalar float32x4_t
+ #define PECK_FFT_MALLOC malloc
+ #define PECK_FFT_FREE free
+#elif USE_SIMD == SIMD_NEON2
+ #include
+ #define peck_fft_scalar float32x2_t
+ #define PECK_FFT_MALLOC malloc
+ #define PECK_FFT_FREE free
#else
-#define PECK_FFT_MALLOC malloc
-#define PECK_FFT_FREE free
+ #define PECK_FFT_MALLOC malloc
+ #define PECK_FFT_FREE free
#endif
diff -r 3b31bd44a09f -r 2d6c49fcafcb peck_fftr.c
--- a/peck_fftr.c Fri Sep 16 13:08:20 2011 +0200
+++ b/peck_fftr.c Fri Sep 16 14:04:19 2011 +0200
@@ -19,8 +19,8 @@
peck_fft_cfg substate;
peck_fft_cpx *tmpbuf;
peck_fft_cpx *super_twiddles;
-#ifdef USE_SIMD
- void * pad;
+#if USE_SIMD == SIMD_SSE2
+ void *pad;
#endif
};
@@ -34,10 +34,9 @@
return NULL;
}
nfft >>= 1;
+ peck_fft_alloc(nfft, inverse_fft, NULL, &subsize);
- peck_fft_alloc(nfft, inverse_fft, NULL, &subsize);
memneeded = sizeof(struct peck_fftr_state) + subsize + sizeof(peck_fft_cpx) * (nfft * 3 / 2);
-
if (lenmem == NULL) {
st = (peck_fftr_cfg) PECK_FFT_MALLOC(memneeded);
} else {
@@ -51,6 +50,7 @@
st->substate = (peck_fft_cfg) (st + 1); /* just beyond peck_fftr_state struct */
st->tmpbuf = (peck_fft_cpx *) (((char *) st->substate) + subsize);
st->super_twiddles = st->tmpbuf + nfft;
+
peck_fft_alloc(nfft, inverse_fft, st->substate, &subsize);
for (i = 0; i < nfft/2; ++i) {
@@ -60,6 +60,7 @@
phase *= -1;
kf_cexp(st->super_twiddles+i, phase);
}
+
return st;
}
@@ -94,8 +95,12 @@
CHECK_OVERFLOW_OP(tdc.r ,-, tdc.i);
freqdata[0].r = tdc.r + tdc.i;
freqdata[ncfft].r = tdc.r - tdc.i;
-#ifdef USE_SIMD
+#if USE_SIMD == SIMD_SSE2
freqdata[ncfft].i = freqdata[0].i = _mm_set1_ps(0);
+#elif USE_SIMD == SIMD_NEON4
+ freqdata[ncfft].i = freqdata[0].i = vdupq_n_f32(0.0f);
+#elif USE_SIMD == SIMD_NEON2
+ freqdata[ncfft].i = freqdata[0].i = vdup_n_f32(0.0f);
#else
freqdata[ncfft].i = freqdata[0].i = 0;
#endif
@@ -138,16 +143,20 @@
fk = freqdata[k];
fnkc.r = freqdata[ncfft - k].r;
fnkc.i = -freqdata[ncfft - k].i;
- C_FIXDIV(fk , 2);
- C_FIXDIV(fnkc , 2);
+ C_FIXDIV(fk, 2);
+ C_FIXDIV(fnkc, 2);
C_ADD(fek, fk, fnkc);
C_SUB(tmp, fk, fnkc);
C_MUL(fok, tmp, st->super_twiddles[k-1]);
C_ADD(st->tmpbuf[k], fek, fok);
C_SUB(st->tmpbuf[ncfft - k], fek, fok);
-#ifdef USE_SIMD
- st->tmpbuf[ncfft - k].i *= _mm_set1_ps(-1.0);
+#if USE_SIMD == SIMD_SSE2
+ st->tmpbuf[ncfft - k].i *= _mm_set1_ps(-1.0f);
+#elif USE_SIMD == SIMD_NEON4
+ st->tmpbuf[ncfft - k].i *= vdupq_n_f32(-1.0f);
+#elif USE_SIMD == SIMD_NEON2
+ st->tmpbuf[ncfft - k].i *= vdup_n_f32(-1.0f);
#else
st->tmpbuf[ncfft - k].i *= -1;
#endif
diff -r 3b31bd44a09f -r 2d6c49fcafcb peck_test.c
--- a/peck_test.c Fri Sep 16 13:08:20 2011 +0200
+++ b/peck_test.c Fri Sep 16 14:04:19 2011 +0200
@@ -25,7 +25,7 @@
unsigned int i, j;
peck_fftr_cfg p, pi;
- enable_runfast();
+// enable_runfast();
const unsigned int N = 256;
@@ -34,7 +34,15 @@
peck_fft_scalar res[N];
for (i = 0; i < N; i++) {
+#if USE_SIMD == SIMD_SSE2
+ in[i] = _mm_set1_ps((i % 13) / 3);
+#elif USE_SIMD == SIMD_NEON4
+ in[i] = vdupq_n_f32((i % 13) / 3);
+#elif USE_SIMD == SIMD_NEON2
+ in[i] = vdup_n_f32((i % 13) / 3);
+#else
in[i] = (i % 13) / 3;
+#endif
}
p = peck_fftr_alloc(N, 0, NULL, NULL);
@@ -43,7 +51,7 @@
for (j = 0; j < 10000; j++) {
if (j == 0) {
for (i = 0; i < 8; i++)
- printf("%d: %f\n", i, in[i]);
+ printf("%d: %f\n", i, *(float*)&in[i]);
printf("----\n");
}
@@ -51,7 +59,7 @@
if (j == 0) {
for (i = 0; i < 8; i++)
- printf("%d: %f %f\n", i, out[i].r, out[i].i);
+ printf("%d: %f %f\n", i, *(float*)&out[i].r, *(float*)&out[i].i);
printf("----\n");
}
@@ -59,7 +67,7 @@
if (j == 0) {
for (i = 0; i < 8; i++)
- printf("%d: %f\n", i, res[i] / N);
+ printf("%d: %f\n", i, *(float*)&res[i] / N);
}
}
peck_fftr_free(p);
@@ -67,7 +75,7 @@
peck_fft_cleanup();
for (i = 0; i < N; i++) {
- if (fabs(in[i] - res[i]/N) > 0.00001) {
+ if (fabs(*(float*)&in[i] - *(float*)&res[i]/N) > 0.00001) {
fprintf(stderr, "!!!! ERROR !!!! at %d\n", i);
exit(EXIT_FAILURE);
}