# HG changeset patch
# User Peter Meerwald
# Date 1316611258 -7200
# Node ID 05f6ab0a17c060ef80b3fcc2fe403111d37bc703
# Parent 8726585681f6835a8c2f4707ab36dcee120f3f04
Add NEON assembly for the radix-4 butterfly (kf_bfly4); split the C version into separate forward/inverse loops.
diff -r 8726585681f6 -r 05f6ab0a17c0 compile.sh
--- a/compile.sh Wed Sep 21 12:18:40 2011 +0200
+++ b/compile.sh Wed Sep 21 15:20:58 2011 +0200
@@ -1,6 +1,5 @@
/opt/arm-2011.03/bin/arm-none-linux-gnueabi-gcc \
-O3 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp -ffast-math -fomit-frame-pointer \
- -DUSE_SIMD=SIMD_NEON2 -DBFLY2_ASM=1 \
-I . \
-c -o kf_bfly2_only.o -g \
kf_bfly2.S \
@@ -8,15 +7,25 @@
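+# Assemble the new NEON radix-4 butterfly to a standalone object (kf_bfly4_only.o).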
/opt/arm-2011.03/bin/arm-none-linux-gnueabi-gcc \
-O3 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp -ffast-math -fomit-frame-pointer \
- -DUSE_SIMD=SIMD_NEON2 -DBFLY2_ASM=1 \
+ -I . \
+ -c -o kf_bfly4_only.o -g \
+ kf_bfly4.S \
+ -lm
+
+
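+# Link the ARM test binary with both assembly butterflies (BFLY2/BFLY4) enabled.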
+/opt/arm-2011.03/bin/arm-none-linux-gnueabi-gcc \
+ -O3 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp -ffast-math -fomit-frame-pointer \
+ -DUSE_SIMD=SIMD_NEON2 -DBFLY2_ASM=1 -DBFLY4_ASM=1 \
-I . -I ../armv7_cycles \
-o peck_test_arm -g \
- peck_fft.c peck_fftr.c peck_test.c kf_bfly2.S ../armv7_cycles/armv7_cycles.c \
+ peck_fft.c peck_fftr.c peck_test.c kf_bfly2.S kf_bfly4.S ../armv7_cycles/armv7_cycles.c \
-lm
gcc \
-O2 -march=native -msse2 -mfpmath=sse -ffast-math -fomit-frame-pointer \
- -DUSE_SIMD=SIMD_SSE2 -DBFLY2_ASM=0 \
+ -DUSE_SIMD=SIMD_SSE2 -DBFLY2_ASM=0 -DBFLY4_ASM=0 \
-I . -I ../armv7_cycles \
-o peck_test_x86 \
peck_fftr.c peck_fft.c \
@@ -25,7 +34,7 @@
/opt/arm-2011.03/bin/arm-none-linux-gnueabi-gcc \
-O3 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp -ffast-math -fomit-frame-pointer \
- -DUSE_SIMD=SIMD_NEON -DBFLY2_ASM=0 \
+ -DUSE_SIMD=SIMD_NEON -DBFLY2_ASM=0 -DBFLY4_ASM=0 \
-I . -I ../armv7_cycles \
-o peck_test_neon \
peck_fftr.c peck_fft.c \
@@ -34,7 +43,7 @@
/opt/arm-2011.03/bin/arm-none-linux-gnueabi-gcc \
-O3 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp -ffast-math -fomit-frame-pointer \
- -DUSE_SIMD=SIMD_NEON2 -DBFLY2_ASM=0 \
+ -DUSE_SIMD=SIMD_NEON2 -DBFLY2_ASM=0 -DBFLY4_ASM=0 \
-I . -I ../armv7_cycles \
-o peck_test_neon2 \
peck_fftr.c peck_fft.c \
@@ -43,7 +52,7 @@
/opt/arm-2011.03/bin/arm-none-linux-gnueabi-gcc \
-O3 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp -ffast-math -fomit-frame-pointer \
- -DUSE_SIMD=SIMD_NEON4 -DBFLY2_ASM=0 \
+ -DUSE_SIMD=SIMD_NEON4 -DBFLY2_ASM=0 -DBFLY4_ASM=0 \
-I . -I ../armv7_cycles \
-o peck_test_neon4 \
peck_fftr.c peck_fft.c \
diff -r 8726585681f6 -r 05f6ab0a17c0 kf_bfly4.S
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/kf_bfly4.S Wed Sep 21 15:20:58 2011 +0200
@@ -0,0 +1,194 @@
+ .cpu cortex-a8
+ .eabi_attribute 27, 3
+ .fpu neon
+ .eabi_attribute 23, 1
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+ .eabi_attribute 26, 2
+ .eabi_attribute 30, 1
+ .eabi_attribute 18, 4
+
+ .text
+ .align 2
+ .global kf_bfly4
+ .type kf_bfly4, %function
+kf_bfly4:
+ .fnstart
+ .cfi_startproc
+ stmfd sp!, {r4, r5, r6, r7, r8, sl}
+ .save {r4, r5, r6, r7, r8, sl}
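+ // Arguments (AAPCS, matching the C prototype in peck_fft.c):
+ // r0 = Fout, r1 = fstride, r2 = st, r3 = m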
+ mov r4, r3, asl #1
+ add r6, r4, r3
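+ // ip = st->twiddles, r2 = st->inverse (struct offsets 264 and 4)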
+ add ip, r2, #264
+ ldr r2, [r2, #4]
+ cmp r2, #0
+ beq .forward
+ mov r8, r1, asl #4
+ mov r7, r1, asl #5
+ add r1, r1, r1, asl #1
+ mov r1, r1, asl #4
+ add r2, r0, #0
+ add r5, r0, r3, asl #4
+ add r4, r0, r4, asl #4
+ add r0, r0, r6, asl #4
+ mov sl, ip
+ mov r6, ip
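+ // Element pointers: r2 = &Fout[0], r5 = &Fout[m], r4 = &Fout[m2], r0 = &Fout[m3];
+ // each element is 16 bytes (.r and .i are float32x2 in this build).
+ // Twiddle pointers: ip = tw1, r6 = tw2, sl = tw3; per-iteration steps in r8, r7, r1.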
+.inverse_loop:
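+ // q8 = *Fout, q10 = scratch[0], q11 = scratch[3], q12 = scratch[2], q13 = scratch[1]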
+ // C_MUL(scratch[0], Fout[m], *tw1);
+ vld1.32 {d18,d19}, [r5]
+ vld1.32 {d16,d17}, [ip]
+ vmul.f32 d20, d18, d16
+ vmls.f32 d20, d19, d17
+ vmul.f32 d21, d16, d19
+ vmla.f32 d21, d18, d17
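+ // (C_MUL: out.r = a.r*b.r - a.i*b.i, out.i = a.r*b.i + a.i*b.r)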
+
+ // C_MUL(scratch[3], Fout[m2], *tw2);
+ vld1.32 {d18,d19}, [r4]
+ vld1.32 {d16,d17}, [r6]
+ vmul.f32 d22, d18, d16
+ vmls.f32 d22, d19, d17
+ vmul.f32 d23, d16, d19
+ vmla.f32 d23, d18, d17
+
+ // C_MUL(scratch[2], Fout[m3], *tw3);
+ vld1.32 {d18,d19}, [r0]
+ vld1.32 {d16,d17}, [sl]
+ vmul.f32 d24, d18, d16
+ vmls.f32 d24, d19, d17
+ vmul.f32 d25, d16, d19
+ vmla.f32 d25, d18, d17
+
+ // C_SUB(scratch[1], *Fout, scratch[3]);
+ vld1.32 {d16,d17}, [r2]
+ vsub.f32 q13, q8, q11
+
+ // C_ADDTO(*Fout, scratch[3]);
+ vadd.f32 q8, q8, q11
+ vst1.32 {d16,d17}, [r2]
+
+ // C_ADD(scratch[3], scratch[0], scratch[2]);
+ vadd.f32 q11, q10, q12
+
+ // C_SUB(Fout[m2], *Fout, scratch[3]);
+ vsub.f32 q9, q8, q11
+ vst1.32 {d18,d19}, [r4]!
+
+ // C_ADDTO(*Fout, scratch[3]);
+ vadd.f32 q8, q8, q11
+ vst1.32 {d16,d17}, [r2]!
+
+ add ip, ip, r8
+ add r6, r6, r7
+ add sl, sl, r1
+
+ // C_SUB(scratch[3], scratch[0], scratch[2]);
+ vsub.f32 q11, q10, q12
+
+ // Fout[m].r = scratch[1].r - scratch[3].i;
+ // Fout[m].i = scratch[1].i + scratch[3].r;
+ vsub.f32 d18, d26, d23
+ vadd.f32 d19, d27, d22
+ vst1.32 {d18,d19}, [r5]!
+
+ // Fout[m3].r = scratch[1].r + scratch[3].i;
+ // Fout[m3].i = scratch[1].i - scratch[3].r;
+ vadd.f32 d18, d26, d23
+ vsub.f32 d19, d27, d22
+ vst1.32 {d18,d19}, [r0]!
+
+ subs r3, r3, #1
+ bne .inverse_loop
+ b .done
+.forward:
+ mov r8, r1, asl #4
+ mov r7, r1, asl #5
+ add r1, r1, r1, asl #1
+ mov r1, r1, asl #4
+ add r2, r0, #0
+ add r5, r0, r3, asl #4
+ add r4, r0, r4, asl #4
+ add r0, r0, r6, asl #4
+ mov sl, ip
+ mov r6, ip
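+ // Same body as .inverse_loop; only the signs in the final
+ // Fout[m]/Fout[m3] combinations differ (forward transform).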
+.forward_loop:
+ // C_MUL(scratch[0], Fout[m], *tw1);
+ vld1.32 {d18,d19}, [r5]
+ vld1.32 {d16,d17}, [ip]
+ vmul.f32 d20, d18, d16
+ vmls.f32 d20, d19, d17
+ vmul.f32 d21, d16, d19
+ vmla.f32 d21, d18, d17
+
+ // C_MUL(scratch[3], Fout[m2], *tw2);
+ vld1.32 {d18,d19}, [r4]
+ vld1.32 {d16,d17}, [r6]
+ vmul.f32 d22, d18, d16
+ vmls.f32 d22, d19, d17
+ vmul.f32 d23, d16, d19
+ vmla.f32 d23, d18, d17
+
+ // C_MUL(scratch[2], Fout[m3], *tw3);
+ vld1.32 {d18,d19}, [r0]
+ vld1.32 {d16,d17}, [sl]
+ vmul.f32 d24, d18, d16
+ vmls.f32 d24, d19, d17
+ vmul.f32 d25, d16, d19
+ vmla.f32 d25, d18, d17
+
+ // C_SUB(scratch[1], *Fout, scratch[3]);
+ vld1.32 {d16,d17}, [r2]
+ vsub.f32 q13, q8, q11
+
+ // C_ADDTO(*Fout, scratch[3]);
+ vadd.f32 q8, q8, q11
+ vst1.32 {d16,d17}, [r2]
+
+ // C_ADD(scratch[3], scratch[0], scratch[2]);
+ vadd.f32 q11, q10, q12
+
+ // C_SUB(Fout[m2], *Fout, scratch[3]);
+ vsub.f32 q9, q8, q11
+ vst1.32 {d18,d19}, [r4]!
+
+ // C_ADDTO(*Fout, scratch[3]);
+ vadd.f32 q8, q8, q11
+ vst1.32 {d16,d17}, [r2]!
+
+ add ip, ip, r8
+ add r6, r6, r7
+ add sl, sl, r1
+
+ // C_SUB(scratch[3], scratch[0], scratch[2]);
+ vsub.f32 q11, q10, q12
+
+ // Fout[m].r = scratch[1].r + scratch[3].i;
+ // Fout[m].i = scratch[1].i - scratch[3].r;
+ vadd.f32 d18, d26, d23
+ vsub.f32 d19, d27, d22
+ vst1.32 {d18,d19}, [r5]!
+
+ // Fout[m3].r = scratch[1].r - scratch[3].i;
+ // Fout[m3].i = scratch[1].i + scratch[3].r;
+ vsub.f32 d18, d26, d23
+ vadd.f32 d19, d27, d22
+ vst1.32 {d18,d19}, [r0]!
+
+ subs r3, r3, #1
+ bne .forward_loop
+.done:
+ ldmfd sp!, {r4, r5, r6, r7, r8, sl}
+ bx lr
+ .cfi_endproc
+ .fnend
+ .size kf_bfly4, .-kf_bfly4
+
diff -r 8726585681f6 -r 05f6ab0a17c0 peck_fft.c
--- a/peck_fft.c Wed Sep 21 12:18:40 2011 +0200
+++ b/peck_fft.c Wed Sep 21 15:20:58 2011 +0200
@@ -19,7 +19,7 @@
*/
#if !BFLY2_ASM
static void kf_bfly2(
- peck_fft_cpx *Fout,
+ peck_fft_cpx * __restrict Fout,
const size_t fstride,
const peck_fft_cfg st,
int m) {
@@ -40,51 +40,81 @@
}
#endif
+#if !BFLY4_ASM
static void kf_bfly4(
- peck_fft_cpx * Fout,
+ peck_fft_cpx * __restrict Fout,
const size_t fstride,
const peck_fft_cfg st,
const size_t m) {
- peck_fft_cpx *tw1,*tw2,*tw3;
- peck_fft_cpx scratch[6];
- size_t k=m;
- const size_t m2=2*m;
- const size_t m3=3*m;
+ peck_fft_cpx scratch[4];
+ peck_fft_cpx * __restrict tw1, * __restrict tw2, * __restrict tw3;
+ size_t k = m;
+ const size_t m2 = 2*m;
+ const size_t m3 = 3*m;
// printf("kf_bfly4, %d\n", fstride);
tw3 = tw2 = tw1 = st->twiddles;
- do {
- C_MUL(scratch[0], Fout[m], *tw1);
- C_MUL(scratch[1], Fout[m2], *tw2);
- C_MUL(scratch[2], Fout[m3], *tw3);
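+ // Hoist the forward/inverse test out of the loop (previously checked
+ // every iteration); each branch now matches the corresponding NEON
+ // assembly loop in kf_bfly4.S.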
+ if (st->inverse) {
+ do {
+ C_MUL(scratch[0], Fout[m], *tw1);
+ C_MUL(scratch[3], Fout[m2], *tw2);
+ C_MUL(scratch[2], Fout[m3], *tw3);
- C_SUB(scratch[5], *Fout, scratch[1]);
- C_ADDTO(*Fout, scratch[1]);
- C_ADD(scratch[3], scratch[0], scratch[2]);
- C_SUB(scratch[4], scratch[0], scratch[2]);
- C_SUB(Fout[m2], *Fout, scratch[3]);
- tw1 += fstride;
- tw2 += fstride*2;
- tw3 += fstride*3;
- C_ADDTO(*Fout, scratch[3]);
+ C_SUB(scratch[1], *Fout, scratch[3]);
+ C_ADDTO(*Fout, scratch[3]);
+
+ C_ADD(scratch[3], scratch[0], scratch[2]);
+ C_SUB(Fout[m2], *Fout, scratch[3]);
+ C_ADDTO(*Fout, scratch[3]);
+
+ tw1 += fstride;
+ tw2 += fstride*2;
+ tw3 += fstride*3;
+
+ C_SUB(scratch[3], scratch[0], scratch[2]);
+ Fout[m].r = scratch[1].r - scratch[3].i;
+ Fout[m].i = scratch[1].i + scratch[3].r;
+ Fout[m3].r = scratch[1].r + scratch[3].i;
+ Fout[m3].i = scratch[1].i - scratch[3].r;
- if (st->inverse) {
- Fout[m].r = scratch[5].r - scratch[4].i;
- Fout[m].i = scratch[5].i + scratch[4].r;
- Fout[m3].r = scratch[5].r + scratch[4].i;
- Fout[m3].i = scratch[5].i - scratch[4].r;
- } else {
- Fout[m].r = scratch[5].r + scratch[4].i;
- Fout[m].i = scratch[5].i - scratch[4].r;
- Fout[m3].r = scratch[5].r - scratch[4].i;
- Fout[m3].i = scratch[5].i + scratch[4].r;
- }
- ++Fout;
- } while (--k);
+ ++Fout;
+ } while (--k);
+ }
+ else {
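+ // Same butterfly; the scratch[3] cross-term signs are swapped.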
+ do {
+ C_MUL(scratch[0], Fout[m], *tw1);
+ C_MUL(scratch[3], Fout[m2], *tw2);
+ C_MUL(scratch[2], Fout[m3], *tw3);
+
+ C_SUB(scratch[1], *Fout, scratch[3]);
+ C_ADDTO(*Fout, scratch[3]);
+
+ C_ADD(scratch[3], scratch[0], scratch[2]);
+ C_SUB(Fout[m2], *Fout, scratch[3]);
+ C_ADDTO(*Fout, scratch[3]);
+
+ tw1 += fstride;
+ tw2 += fstride*2;
+ tw3 += fstride*3;
+
+ C_SUB(scratch[3], scratch[0], scratch[2]);
+ Fout[m].r = scratch[1].r + scratch[3].i;
+ Fout[m].i = scratch[1].i - scratch[3].r;
+ Fout[m3].r = scratch[1].r - scratch[3].i;
+ Fout[m3].i = scratch[1].i + scratch[3].r;
+
+ ++Fout;
+ } while (--k);
+ }
}
+#endif
static void kf_bfly3(
peck_fft_cpx * Fout,