# HG changeset patch
# User Peter Meerwald
# Date 1316611258 -7200
# Node ID 05f6ab0a17c060ef80b3fcc2fe403111d37bc703
# Parent 8726585681f6835a8c2f4707ab36dcee120f3f04
backup

diff -r 8726585681f6 -r 05f6ab0a17c0 compile.sh
--- a/compile.sh	Wed Sep 21 12:18:40 2011 +0200
+++ b/compile.sh	Wed Sep 21 15:20:58 2011 +0200
@@ -1,6 +1,5 @@
 /opt/arm-2011.03/bin/arm-none-linux-gnueabi-gcc \
     -O3 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp -ffast-math -fomit-frame-pointer \
-    -DUSE_SIMD=SIMD_NEON2 -DBFLY2_ASM=1 \
     -I . \
     -c -o kf_bfly2_only.o -g \
     kf_bfly2.S \
@@ -8,15 +7,23 @@
 
 /opt/arm-2011.03/bin/arm-none-linux-gnueabi-gcc \
     -O3 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp -ffast-math -fomit-frame-pointer \
-    -DUSE_SIMD=SIMD_NEON2 -DBFLY2_ASM=1 \
+    -I . \
+    -c -o kf_bfly4_only.o -g \
+    kf_bfly4.S \
+    -lm
+
+
+/opt/arm-2011.03/bin/arm-none-linux-gnueabi-gcc \
+    -O3 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp -ffast-math -fomit-frame-pointer \
+    -DUSE_SIMD=SIMD_NEON2 -DBFLY2_ASM=1 -DBFLY4_ASM=1 \
     -I . -I ../armv7_cycles \
     -o peck_test_arm -g \
-    peck_fft.c peck_fftr.c peck_test.c kf_bfly2.S ../armv7_cycles/armv7_cycles.c \
+    peck_fft.c peck_fftr.c peck_test.c kf_bfly2.S kf_bfly4.S ../armv7_cycles/armv7_cycles.c \
     -lm
 
 gcc \
     -O2 -march=native -msse2 -mfpmath=sse -ffast-math -fomit-frame-pointer \
-    -DUSE_SIMD=SIMD_SSE2 -DBFLY2_ASM=0 \
+    -DUSE_SIMD=SIMD_SSE2 -DBFLY2_ASM=0 -DBFLY4_ASM=0 \
     -I . -I ../armv7_cycles \
     -o peck_test_x86 \
     peck_fftr.c peck_fft.c \
@@ -25,7 +32,7 @@
 
 /opt/arm-2011.03/bin/arm-none-linux-gnueabi-gcc \
     -O3 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp -ffast-math -fomit-frame-pointer \
-    -DUSE_SIMD=SIMD_NEON -DBFLY2_ASM=0 \
+    -DUSE_SIMD=SIMD_NEON -DBFLY2_ASM=0 -DBFLY4_ASM=0 \
     -I . -I ../armv7_cycles \
     -o peck_test_neon \
     peck_fftr.c peck_fft.c \
@@ -34,7 +41,7 @@
 
 /opt/arm-2011.03/bin/arm-none-linux-gnueabi-gcc \
     -O3 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp -ffast-math -fomit-frame-pointer \
-    -DUSE_SIMD=SIMD_NEON2 -DBFLY2_ASM=0 \
+    -DUSE_SIMD=SIMD_NEON2 -DBFLY2_ASM=0 -DBFLY4_ASM=0 \
     -I . -I ../armv7_cycles \
     -o peck_test_neon2 \
     peck_fftr.c peck_fft.c \
@@ -43,7 +50,7 @@
 
 /opt/arm-2011.03/bin/arm-none-linux-gnueabi-gcc \
     -O3 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp -ffast-math -fomit-frame-pointer \
-    -DUSE_SIMD=SIMD_NEON4 -DBFLY2_ASM=0 \
+    -DUSE_SIMD=SIMD_NEON4 -DBFLY2_ASM=0 -DBFLY4_ASM=0 \
     -I . -I ../armv7_cycles \
     -o peck_test_neon4 \
     peck_fftr.c peck_fft.c \
diff -r 8726585681f6 -r 05f6ab0a17c0 kf_bfly4.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kf_bfly4.S	Wed Sep 21 15:20:58 2011 +0200
@@ -0,0 +1,184 @@
+    .cpu cortex-a8
+    .eabi_attribute 27, 3
+    .fpu neon
+    .eabi_attribute 23, 1
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+    .eabi_attribute 26, 2
+    .eabi_attribute 30, 1
+    .eabi_attribute 18, 4
+
+    .text
+    .align 2
+    .global kf_bfly4
+    .type kf_bfly4, %function
+kf_bfly4:
+    .fnstart
+    .cfi_startproc
+    stmfd sp!, {r4, r5, r6, r7, r8, sl}
+    .save {r4, r5, r6, r7, r8, sl}
+    mov r4, r3, asl #1          @ r4 = 2*m
+    add r6, r4, r3              @ r6 = 3*m
+    add ip, r2, #264            @ ip = st->twiddles
+    ldr r2, [r2, #4]            @ r2 = st->inverse
+    cmp r2, #0
+    beq .forward
+    mov r8, r1, asl #4          @ tw1 stride: fstride * sizeof(peck_fft_cpx)
+    mov r7, r1, asl #5          @ tw2 stride: 2*fstride
+    add r1, r1, r1, asl #1
+    mov r1, r1, asl #4          @ tw3 stride: 3*fstride
+    add r2, r0, #0              @ r2 = Fout
+    add r5, r0, r3, asl #4      @ r5 = &Fout[m]
+    add r4, r0, r4, asl #4      @ r4 = &Fout[m2]
+    add r0, r0, r6, asl #4      @ r0 = &Fout[m3]
+    mov sl, ip                  @ sl = tw3
+    mov r6, ip                  @ r6 = tw2
+.inverse_loop:
+    // C_MUL(scratch[0], Fout[m], *tw1);
+    vld1.32 {d18,d19}, [r5]
+    vld1.32 {d16,d17}, [ip]
+    vmul.f32 d20, d18, d16
+    vmls.f32 d20, d19, d17
+    vmul.f32 d21, d16, d19
+    vmla.f32 d21, d18, d17
+
+    // C_MUL(scratch[3], Fout[m2], *tw2);
+    vld1.32 {d18,d19}, [r4]
+    vld1.32 {d16,d17}, [r6]
+    vmul.f32 d22, d18, d16
+    vmls.f32 d22, d19, d17
+    vmul.f32 d23, d16, d19
+    vmla.f32 d23, d18, d17
+
+    // C_MUL(scratch[2], Fout[m3], *tw3);
+    vld1.32 {d18,d19}, [r0]
+    vld1.32 {d16,d17}, [sl]
+    vmul.f32 d24, d18, d16
+    vmls.f32 d24, d19, d17
+    vmul.f32 d25, d16, d19
+    vmla.f32 d25, d18, d17
+
+    // C_SUB(scratch[1], *Fout, scratch[3]);
+    vld1.32 {d16,d17}, [r2]
+    vsub.f32 q13, q8, q11
+
+    // C_ADDTO(*Fout, scratch[3]);
+    vadd.f32 q8, q8, q11
+    vst1.32 {d16,d17}, [r2]
+
+    // C_ADD(scratch[3], scratch[0], scratch[2]);
+    vadd.f32 q11, q10, q12
+
+    // C_SUB(Fout[m2], *Fout, scratch[3]);
+    vsub.f32 q9, q8, q11
+    vst1.32 {d18,d19}, [r4]!
+
+    // C_ADDTO(*Fout, scratch[3]);
+    vadd.f32 q8, q8, q11
+    vst1.32 {d16,d17}, [r2]!
+
+    add ip, ip, r8
+    add r6, r6, r7
+    add sl, sl, r1
+
+    // C_SUB(scratch[3], scratch[0], scratch[2]);
+    vsub.f32 q11, q10, q12
+
+    // Fout[m].r = scratch[1].r - scratch[3].i;
+    // Fout[m].i = scratch[1].i + scratch[3].r;
+    vsub.f32 d18, d26, d23
+    vadd.f32 d19, d27, d22
+    vst1.32 {d18,d19}, [r5]!
+
+    // Fout[m3].r = scratch[1].r + scratch[3].i;
+    // Fout[m3].i = scratch[1].i - scratch[3].r;
+    vadd.f32 d18, d26, d23
+    vsub.f32 d19, d27, d22
+    vst1.32 {d18,d19}, [r0]!
+
+    subs r3, r3, #1
+    bne .inverse_loop
+    b .done
+.forward:
+    mov r8, r1, asl #4
+    mov r7, r1, asl #5
+    add r1, r1, r1, asl #1
+    mov r1, r1, asl #4
+    add r2, r0, #0
+    add r5, r0, r3, asl #4
+    add r4, r0, r4, asl #4
+    add r0, r0, r6, asl #4
+    mov sl, ip
+    mov r6, ip
+.forward_loop:
+    // C_MUL(scratch[0], Fout[m], *tw1);
+    vld1.32 {d18,d19}, [r5]
+    vld1.32 {d16,d17}, [ip]
+    vmul.f32 d20, d18, d16
+    vmls.f32 d20, d19, d17
+    vmul.f32 d21, d16, d19
+    vmla.f32 d21, d18, d17
+
+    // C_MUL(scratch[3], Fout[m2], *tw2);
+    vld1.32 {d18,d19}, [r4]
+    vld1.32 {d16,d17}, [r6]
+    vmul.f32 d22, d18, d16
+    vmls.f32 d22, d19, d17
+    vmul.f32 d23, d16, d19
+    vmla.f32 d23, d18, d17
+
+    // C_MUL(scratch[2], Fout[m3], *tw3);
+    vld1.32 {d18,d19}, [r0]
+    vld1.32 {d16,d17}, [sl]
+    vmul.f32 d24, d18, d16
+    vmls.f32 d24, d19, d17
+    vmul.f32 d25, d16, d19
+    vmla.f32 d25, d18, d17
+
+    // C_SUB(scratch[1], *Fout, scratch[3]);
+    vld1.32 {d16,d17}, [r2]
+    vsub.f32 q13, q8, q11
+
+    // C_ADDTO(*Fout, scratch[3]);
+    vadd.f32 q8, q8, q11
+    vst1.32 {d16,d17}, [r2]
+
+    // C_ADD(scratch[3], scratch[0], scratch[2]);
+    vadd.f32 q11, q10, q12
+
+    // C_SUB(Fout[m2], *Fout, scratch[3]);
+    vsub.f32 q9, q8, q11
+    vst1.32 {d18,d19}, [r4]!
+
+    // C_ADDTO(*Fout, scratch[3]);
+    vadd.f32 q8, q8, q11
+    vst1.32 {d16,d17}, [r2]!
+
+    add ip, ip, r8
+    add r6, r6, r7
+    add sl, sl, r1
+
+    // C_SUB(scratch[3], scratch[0], scratch[2]);
+    vsub.f32 q11, q10, q12
+
+    // Fout[m].r = scratch[1].r + scratch[3].i;
+    // Fout[m].i = scratch[1].i - scratch[3].r;
+    vadd.f32 d18, d26, d23
+    vsub.f32 d19, d27, d22
+    vst1.32 {d18,d19}, [r5]!
+
+    // Fout[m3].r = scratch[1].r - scratch[3].i;
+    // Fout[m3].i = scratch[1].i + scratch[3].r;
+    vsub.f32 d18, d26, d23
+    vadd.f32 d19, d27, d22
+    vst1.32 {d18,d19}, [r0]!
+
+    subs r3, r3, #1
+    bne .forward_loop
+.done:
+    ldmfd sp!, {r4, r5, r6, r7, r8, sl}
+    bx lr
+    .cfi_endproc
+    .fnend
+    .size kf_bfly4, .-kf_bfly4
+
diff -r 8726585681f6 -r 05f6ab0a17c0 peck_fft.c
--- a/peck_fft.c	Wed Sep 21 12:18:40 2011 +0200
+++ b/peck_fft.c	Wed Sep 21 15:20:58 2011 +0200
@@ -19,7 +19,7 @@
  */
 #if !BFLY2_ASM
 static void kf_bfly2(
-    peck_fft_cpx *Fout,
+    peck_fft_cpx * __restrict Fout,
     const size_t fstride,
     const peck_fft_cfg st,
     int m) {
@@ -40,51 +40,77 @@
 }
 #endif
 
+#if !BFLY4_ASM
 static void kf_bfly4(
-    peck_fft_cpx * Fout,
+    peck_fft_cpx * __restrict Fout,
     const size_t fstride,
     const peck_fft_cfg st,
     const size_t m) {
-    peck_fft_cpx *tw1,*tw2,*tw3;
-    peck_fft_cpx scratch[6];
-    size_t k=m;
-    const size_t m2=2*m;
-    const size_t m3=3*m;
+    peck_fft_cpx scratch[4];
+    peck_fft_cpx * __restrict tw1, * __restrict tw2, * __restrict tw3;
+    size_t k = m;
+    const size_t m2 = 2*m;
+    const size_t m3 = 3*m;
 
     // printf("kf_bfly4, %d\n", fstride);
 
     tw3 = tw2 = tw1 = st->twiddles;
 
-    do {
-        C_MUL(scratch[0], Fout[m], *tw1);
-        C_MUL(scratch[1], Fout[m2], *tw2);
-        C_MUL(scratch[2], Fout[m3], *tw3);
+    if (st->inverse) {
+        do {
+            C_MUL(scratch[0], Fout[m], *tw1);
+            C_MUL(scratch[3], Fout[m2], *tw2);
+            C_MUL(scratch[2], Fout[m3], *tw3);
 
-        C_SUB(scratch[5], *Fout, scratch[1]);
-        C_ADDTO(*Fout, scratch[1]);
-        C_ADD(scratch[3], scratch[0], scratch[2]);
-        C_SUB(scratch[4], scratch[0], scratch[2]);
-        C_SUB(Fout[m2], *Fout, scratch[3]);
-        tw1 += fstride;
-        tw2 += fstride*2;
-        tw3 += fstride*3;
-        C_ADDTO(*Fout, scratch[3]);
+            C_SUB(scratch[1], *Fout, scratch[3]);
+            C_ADDTO(*Fout, scratch[3]);
+
+            C_ADD(scratch[3], scratch[0], scratch[2]);
+            C_SUB(Fout[m2], *Fout, scratch[3]);
+            C_ADDTO(*Fout, scratch[3]);
+
+            tw1 += fstride;
+            tw2 += fstride*2;
+            tw3 += fstride*3;
+
+            C_SUB(scratch[3], scratch[0], scratch[2]);
+            Fout[m].r = scratch[1].r - scratch[3].i;
+            Fout[m].i = scratch[1].i + scratch[3].r;
+            Fout[m3].r = scratch[1].r + scratch[3].i;
+            Fout[m3].i = scratch[1].i - scratch[3].r;
 
-        if (st->inverse) {
-            Fout[m].r = scratch[5].r - scratch[4].i;
-            Fout[m].i = scratch[5].i + scratch[4].r;
-            Fout[m3].r = scratch[5].r + scratch[4].i;
-            Fout[m3].i = scratch[5].i - scratch[4].r;
-        } else {
-            Fout[m].r = scratch[5].r + scratch[4].i;
-            Fout[m].i = scratch[5].i - scratch[4].r;
-            Fout[m3].r = scratch[5].r - scratch[4].i;
-            Fout[m3].i = scratch[5].i + scratch[4].r;
-        }
-        ++Fout;
-    } while (--k);
+            ++Fout;
+        } while (--k);
+    }
+    else {
+        do {
+            C_MUL(scratch[0], Fout[m], *tw1);
+            C_MUL(scratch[3], Fout[m2], *tw2);
+            C_MUL(scratch[2], Fout[m3], *tw3);
+
+            C_SUB(scratch[1], *Fout, scratch[3]);
+            C_ADDTO(*Fout, scratch[3]);
+
+            C_ADD(scratch[3], scratch[0], scratch[2]);
+            C_SUB(Fout[m2], *Fout, scratch[3]);
+            C_ADDTO(*Fout, scratch[3]);
+
+            tw1 += fstride;
+            tw2 += fstride*2;
+            tw3 += fstride*3;
+
+            C_SUB(scratch[3], scratch[0], scratch[2]);
+            Fout[m].r = scratch[1].r + scratch[3].i;
+            Fout[m].i = scratch[1].i - scratch[3].r;
+            Fout[m3].r = scratch[1].r - scratch[3].i;
+            Fout[m3].i = scratch[1].i + scratch[3].r;
+
+            ++Fout;
+        } while (--k);
+    }
 }
+#endif
 
 static void kf_bfly3(
     peck_fft_cpx * Fout,
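
For reference, the complex helpers used above (C_MUL, C_ADD, C_SUB, C_ADDTO) are not part of this patch; peck_fft follows the KISS FFT macro names, so a minimal plain-C sketch of the assumed definitions looks like this (scalar build shown; the SIMD builds presumably swap float for a vector type, e.g. float32x2_t for SIMD_NEON2, so each d-register in kf_bfly4.S then carries one lane of two independent transforms):

    typedef float peck_fft_scalar;
    typedef struct { peck_fft_scalar r, i; } peck_fft_cpx;

    /* Complex multiply: maps to one vmul/vmls pair (real part) and one
       vmul/vmla pair (imaginary part) per C_MUL in kf_bfly4.S. */
    #define C_MUL(m, a, b) do { \
            (m).r = (a).r*(b).r - (a).i*(b).i; \
            (m).i = (a).r*(b).i + (a).i*(b).r; \
        } while (0)
    #define C_ADD(res, a, b) do { (res).r = (a).r + (b).r; (res).i = (a).i + (b).i; } while (0)
    #define C_SUB(res, a, b) do { (res).r = (a).r - (b).r; (res).i = (a).i - (b).i; } while (0)
    #define C_ADDTO(res, a)  do { (res).r += (a).r; (res).i += (a).i; } while (0)

The forward and inverse butterflies differ only in the final recombination: the last scratch[3] difference term is rotated by -j (forward) or +j (inverse), and multiplying by ±j swaps the real and imaginary parts with one sign flip, which is why the two loops exchange the add/sub of scratch[3].i and scratch[3].r.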