# HG changeset patch # User Peter Meerwald # Date 1341408248 -7200 # Node ID 09ee6a01a3d3710416875d4ebca52eceb5b7e505 # Parent b829afbea564ccfc4ec54e75505c396c61ddaf0d new diff -r b829afbea564 -r 09ee6a01a3d3 compile.sh --- a/compile.sh Fri Apr 20 14:26:14 2012 +0200 +++ b/compile.sh Wed Jul 04 15:24:08 2012 +0200 @@ -1,11 +1,17 @@ -arm-linux-gnueabi-gcc -Wall -g -O2 -static \ - -mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon \ +#!/bin/sh + +#CC="arm-linux-gnueabi-gcc -Wall -g -O2 -static -marm -fomit-frame-pointer -mcpu=cortex-a9 -mfloat-abi=softfp -mfpu=neon" +#CC="/opt/arm-2012.03/bin/arm-none-linux-gnueabi-gcc -Wall -g -O3 -static -marm -fomit-frame-pointer -mcpu=cortex-a9 -mfloat-abi=softfp -mfpu=neon" +CC="/home/pmeerw/openbricks-bct2/build/build.brettl3.eglibc/toolchain/bin/armv7-openbricks-linux-gnueabi-gcc -Wall -g -O3 -static -marm -fomit-frame-pointer -mcpu=cortex-a9 -mfloat-abi=softfp -mfpu=neon" + +$CC \ -o sconv_neon sconv_neon.c -lm -arm-linux-gnueabi-gcc -Wall -g -O2 -static \ - -mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon \ - -o remap_neon remap_neon.c -lm +$CC \ + -o remap_neon remap_neon.c xxx.S -lm -arm-linux-gnueabi-gcc -Wall -g -O2 -static \ - -mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon \ +$CC \ -o svolume_neon svolume_neon.c -lm + +$CC \ + -S -o asdf.s remap_neon.c -lm diff -r b829afbea564 -r 09ee6a01a3d3 remap_neon.c --- a/remap_neon.c Fri Apr 20 14:26:14 2012 +0200 +++ b/remap_neon.c Wed Jul 04 15:24:08 2012 +0200 @@ -188,8 +188,6 @@ } } - - static void remap_stereo_to_mono_c(pa_remap_t *m, void *dst, const void *src, unsigned n) { unsigned i; @@ -235,115 +233,179 @@ } } - #if defined(__arm__) #include "arm_neon.h" -void remap_mono_to_stereo_neon(pa_remap_t *m, void *dst, const void *src, unsigned n) { - unsigned i; +static void mono_to_stereo_float_neon_a8(float *dst, const float *src, unsigned n) { + int i = n & 3; + + asm volatile ( + "mov %[n], %[n], lsr #2\n\t" + "1:\n\t" + "vld1.32 {q0}, [%[src]]!\n\t" + "vmov q1, q0\n\t" + "subs %[n], %[n], #1\n\t" + "vst2.32 {q0,q1}, [%[dst]]!\n\t" + "bgt 1b\n\t" + // output operands (or input operands that get modified) + : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) + : // input operands + : "memory", "cc" // clobber list + ); + + while (i--) { + dst[0] = dst[1] = src[0]; + src++; + dst += 2; + } +} + +static void mono_to_stereo_float_neon_a9(float *dst, const float *src, unsigned n) { + int i = n & 1; + + asm volatile ( + "mov %[n], %[n], lsr #1\n\t" + "1:\n\t" + "ldm %[src]!, {r4,r6}\n\t" + "mov r5, r4\n\t" + "mov r7, r6\n\t" + "subs %[n], %[n], #1\n\t" + "stm %[dst]!, {r4-r7}\n\t" + "bgt 1b\n\t" + // output operands (or input operands that get modified) + : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) + : // input operands + : "memory", "cc", "r4", "r5", "r6", "r7" // clobber list + ); + + while (i--) { + dst[0] = dst[1] = src[0]; + src++; + dst += 2; + } +} + +static void mono_to_stereo_int16_neon(int16_t *dst, const int16_t *src, unsigned n) { + int i = n & 7; + + asm volatile ( + "mov %[n], %[n], lsr #3\n\t" + "1:\n\t" + "vld1.16 {q0}, [%[src]]!\n\t" + "vmov q1, q0\n\t" + "subs %[n], %[n], #1\n\t" + "vst2.16 {q0,q1}, [%[dst]]!\n\t" + "bgt 1b\n\t" + // output operands (or input operands that get modified) + : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) + : // input operands + : "memory", "cc" // clobber list + ); + + while (i--) { + dst[0] = dst[1] = src[0]; + src++; + dst += 2; + } +} + +static void remap_mono_to_stereo_neon_a9(pa_remap_t *m, void *dst, const void *src, unsigned n) { switch (*m->format) { case PA_SAMPLE_FLOAT32NE: - { - float *d = (float *) dst, *s = (float *) src; - - for (i = 0; i < n/4; i++) { - float32x4x2_t stereo; - stereo.val[0] = vld1q_f32(s); - stereo.val[1] = stereo.val[0]; - vst2q_f32(d, stereo); - s += 4; - d += 8; - } - - for (i = n & ~3; i < n; i++) { - d[0] = d[1] = s[0]; - s++; - d += 2; - } + mono_to_stereo_float_neon_a9(dst, src, n); break; - } case PA_SAMPLE_S16NE: - { - int16_t *d = (int16_t *) dst, *s = (int16_t *) src; - - for (i = 0; i < n/8; i++) { - int16x8x2_t stereo; - stereo.val[0] = vld1q_s16(s); - stereo.val[1] = stereo.val[0]; - vst2q_s16(d, stereo); - s += 8; - d += 16; - } - - for (i = n & ~7; i < n; i++) { - d[0] = d[1] = s[0]; - s++; - d += 2; - } + mono_to_stereo_int16_neon(dst, src, n); break; - } default: pa_assert_not_reached(); } } -/* don't inline, causes ICE, see https://bugs.launchpad.net/bugs/936863 */ -static __attribute__ ((noinline)) void stereo_to_mono_float(float *d, const float *s, unsigned n) { - unsigned i; - - for (i = 0; i < n/4; i++) { - float32x4x2_t stereo = vld2q_f32(s); - float32x4_t mono = vaddq_f32(stereo.val[0], stereo.val[1]); - vst1q_f32(d, mono); - s += 8; - d += 4; - } - for (i = n & ~3; i < n; i++) { - d[0] = s[0] + s[1]; - s += 2; - d++; +static void remap_mono_to_stereo_neon_a8(pa_remap_t *m, void *dst, const void *src, unsigned n) { + switch (*m->format) { + case PA_SAMPLE_FLOAT32NE: + mono_to_stereo_float_neon_a8(dst, src, n); + break; + case PA_SAMPLE_S16NE: + mono_to_stereo_int16_neon(dst, src, n); + break; + default: + pa_assert_not_reached(); } } -/* don't inline, causes ICE, see https://bugs.launchpad.net/bugs/936863 */ -static __attribute__ ((noinline)) void stereo_to_mono_int16(int16_t *d, const int16_t *s, unsigned n) { - unsigned int i; +static void stereo_to_mono_float_neon(float *dst, const float *src, unsigned n) { + int i = n & 3; + + asm volatile ( + "mov %[n], %[n], lsr #2\n\t" + "1:\n\t" + "vld2.32 {q0,q1}, [%[src]]!\n\t" + "vadd.f32 q0, q0, q1\n\t" + "subs %[n], %[n], #1\n\t" + "vst1.32 {q0}, [%[dst]]!\n\t" + "bgt 1b\n\t" + // output operands (or input operands that get modified) + : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) + : // input operands + : "memory", "cc" // clobber list + ); - for (i = 0; i < n/8; i++) { - int16x8x2_t stereo = vld2q_s16(s); - int16x8_t mono = vaddq_s16(stereo.val[0], stereo.val[1]); - vst1q_s16(d, mono); - s += 16; - d += 8; + while (i--) { + dst[0] = src[0] + src[1]; + src += 2; + dst++; } - for (i = n & ~7; i < n; i++) { - d[0] = s[0] + s[1]; - s += 2; - d++; +} + +static void stereo_to_mono_int16_neon(int16_t *dst, const int16_t *src, unsigned n) { + int i = n & 7; + + asm volatile ( + "mov %[n], %[n], lsr #3\n\t" + "1:\n\t" + "vld2.16 {q0,q1}, [%[src]]!\n\t" + "vadd.s16 q0, q0, q1\n\t" + "subs %[n], %[n], #1\n\t" + "vst1.16 {q0}, [%[dst]]!\n\t" + "bgt 1b\n\t" + // output operands (or input operands that get modified) + : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) + : // input operands + : "memory", "cc" // clobber list + ); + + while (i--) { + dst[0] = src[0] + src[1]; + src += 2; + dst++; } } static void remap_stereo_to_mono_neon(pa_remap_t *m, void *dst, const void *src, unsigned n) { switch (*m->format) { case PA_SAMPLE_FLOAT32NE: - stereo_to_mono_float(dst, src, n); + stereo_to_mono_float_neon(dst, src, n); break; case PA_SAMPLE_S16NE: - stereo_to_mono_int16(dst, src, n); + stereo_to_mono_int16_neon(dst, src, n); break; default: pa_assert_not_reached(); } } + #define SAMPLES 1019 -#define TIMES 10000 +#define TIMES 500000 static void run_test_mono_to_stereo_float(void) { - float stereo[2*SAMPLES]; + float stereo_a9[2*SAMPLES]; + float stereo_a8[2*SAMPLES]; float stereo_ref[2*SAMPLES]; float stereo_gen[2*SAMPLES]; - float mono[SAMPLES]; + float mono[SAMPLES]; int i; pa_usec_t start, stop; pa_sample_format_t sf; @@ -353,7 +415,9 @@ pa_log_debug("checking NEON remap_mono_to_stereo(float, %d)", SAMPLES); memset(stereo_ref, 0, sizeof(stereo_ref)); - memset(stereo, 0, sizeof(stereo)); + memset(stereo_gen, 0, sizeof(stereo_gen)); + memset(stereo_a9, 0, sizeof(stereo_a9)); + memset(stereo_a8, 0, sizeof(stereo_a8)); for (i = 0; i < SAMPLES; i++) { mono[i] = rand()/(float) RAND_MAX - 0.5f; @@ -370,43 +434,57 @@ remap.map_table_f[0][0] = 1.0; remap.map_table_f[1][0] = 1.0; + remap_mono_to_stereo_neon_a9(&remap, stereo_a9, mono, SAMPLES); + remap_mono_to_stereo_neon_a8(&remap, stereo_a8, mono, SAMPLES); remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES); remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES); - remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES); for (i = 0; i < 2*SAMPLES; i++) { - if (fabsf(stereo[i] - stereo_ref[i]) > 0.00001) { - pa_log_debug("%d: %.3f != %.3f (%.3f)", i, stereo[i], stereo_ref[i], + if (fabsf(stereo_a9[i] - stereo_ref[i]) > 0.00001) { + pa_log_debug("NEON/A9 %d: %.3f != %.3f (%.3f)", i, stereo_a9[i], stereo_ref[i], mono[i/2]); } } for (i = 0; i < 2*SAMPLES; i++) { - if (fabsf(stereo[i] - stereo_gen[i]) > 0.00001) { - pa_log_debug("%d: %.3f != %.3f (%.3f)", i, stereo[i], stereo_gen[i], + if (fabsf(stereo_a8[i] - stereo_ref[i]) > 0.00001) { + pa_log_debug("NEON/A8 %d: %.3f != %.3f (%.3f)", i, stereo_a8[i], stereo_ref[i], + mono[i/2]); + } + } + for (i = 0; i < 2*SAMPLES; i++) { + if (fabsf(stereo_gen[i] - stereo_ref[i]) > 0.00001) { + pa_log_debug("generic %d: %.3f != %.3f (%.3f)", i, stereo_gen[i], stereo_ref[i], mono[i/2]); } } start = pa_rtclock_now(); for (i = 0; i < TIMES; i++) { - remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES); + remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES); } stop = pa_rtclock_now(); - pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start)); + pa_log_info("ref:\t\t%llu usec.", (long long unsigned int)(stop - start)); start = pa_rtclock_now(); for (i = 0; i < TIMES; i++) { - remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES); + remap_mono_to_stereo_neon_a9(&remap, stereo_a9, mono, SAMPLES); } stop = pa_rtclock_now(); - pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start)); + pa_log_info("NEON/A9:\t%llu usec.", (long long unsigned int)(stop - start)); + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + remap_mono_to_stereo_neon_a8(&remap, stereo_a8, mono, SAMPLES); + } + stop = pa_rtclock_now(); + pa_log_info("NEON/A8:\t%llu usec.", (long long unsigned int)(stop - start)); start = pa_rtclock_now(); for (i = 0; i < TIMES; i++) { remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES); } stop = pa_rtclock_now(); - pa_log_info("generic: %llu usec.", (long long unsigned int)(stop - start)); + pa_log_info("generic:\t%llu usec.", (long long unsigned int)(stop - start)); } static void run_test_stereo_to_mono_float(void) { @@ -456,25 +534,26 @@ remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES); } stop = pa_rtclock_now(); - pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start)); + pa_log_info("NEON:\t\t%llu usec.", (long long unsigned int)(stop - start)); start = pa_rtclock_now(); for (i = 0; i < TIMES; i++) { remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES); } stop = pa_rtclock_now(); - pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start)); + pa_log_info("ref:\t\t%llu usec.", (long long unsigned int)(stop - start)); start = pa_rtclock_now(); for (i = 0; i < TIMES; i++) { remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES); } stop = pa_rtclock_now(); - pa_log_info("generic: %llu usec.", (long long unsigned int)(stop - start)); + pa_log_info("generic:\t%llu usec.", (long long unsigned int)(stop - start)); } static void run_test_mono_to_stereo_s16(void) { - int16_t stereo[2*SAMPLES]; + int16_t stereo_a9[2*SAMPLES]; + int16_t stereo_a8[2*SAMPLES]; int16_t stereo_ref[2*SAMPLES]; int16_t stereo_gen[2*SAMPLES]; int16_t mono[SAMPLES]; @@ -487,7 +566,9 @@ pa_log_debug("checking NEON remap_mono_to_stereo(s16, %d)", SAMPLES); memset(stereo_ref, 0, sizeof(stereo_ref)); - memset(stereo, 0, sizeof(stereo)); + memset(stereo_a9, 0, sizeof(stereo_a9)); + memset(stereo_a8, 0, sizeof(stereo_a8)); + memset(stereo_gen, 0, sizeof(stereo_gen)); for (i = 0; i < SAMPLES; i++) { mono[i] = rand() - RAND_MAX/2; @@ -501,47 +582,61 @@ oss.channels = 2; remap.i_ss = &iss; remap.o_ss = &oss; - remap.map_table_f[0][0] = 1.0; - remap.map_table_f[1][0] = 1.0; + remap.map_table_i[0][0] = 0x10000; + remap.map_table_i[1][0] = 0x10000; remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES); remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES); - remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES); + remap_mono_to_stereo_neon_a9(&remap, stereo_a9, mono, SAMPLES); + remap_mono_to_stereo_neon_a8(&remap, stereo_a8, mono, SAMPLES); for (i = 0; i < 2*SAMPLES; i++) { - if (abs(stereo[i] - stereo_ref[i]) > 0) { - pa_log_debug("%d: %d != %d (%d)", i, stereo[i], stereo_ref[i], + if (abs(stereo_a9[i] - stereo_ref[i]) > 0) { + pa_log_debug("NEON/A9 %d: %d != %d (%d)", i, stereo_a9[i], stereo_ref[i], + mono[i/2]); + } + } + for (i = 0; i < 2*SAMPLES; i++) { + if (abs(stereo_a8[i] - stereo_ref[i]) > 0) { + pa_log_debug("NEON/A8 %d: %d != %d (%d)", i, stereo_a8[i], stereo_ref[i], mono[i/2]); } } for (i = 0; i < 2*SAMPLES; i++) { - if (abs(stereo[i] - stereo_gen[i]) > 0) { - pa_log_debug("%d: %d != %d (%d)", i, stereo[i], stereo_gen[i], + if (abs(stereo_gen[i] - stereo_ref[i]) > 0) { + pa_log_debug("generic %d: %d != %d (%d)", i, stereo_gen[i], stereo_ref[i], mono[i/2]); } } start = pa_rtclock_now(); for (i = 0; i < TIMES; i++) { - remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES); + remap_mono_to_stereo_neon_a9(&remap, stereo_a9, mono, SAMPLES); } stop = pa_rtclock_now(); - pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start)); + pa_log_info("NEON/A9:\t%llu usec.", (long long unsigned int)(stop - start)); + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + remap_mono_to_stereo_neon_a8(&remap, stereo_a8, mono, SAMPLES); + } + stop = pa_rtclock_now(); + pa_log_info("NEON/A8:\t%llu usec.", (long long unsigned int)(stop - start)); start = pa_rtclock_now(); for (i = 0; i < TIMES; i++) { remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES); } stop = pa_rtclock_now(); - pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start)); + pa_log_info("ref:\t\t%llu usec.", (long long unsigned int)(stop - start)); start = pa_rtclock_now(); for (i = 0; i < TIMES; i++) { remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES); } stop = pa_rtclock_now(); - pa_log_info("generic: %llu usec.", (long long unsigned int)(stop - start)); + pa_log_info("generic:\t%llu usec.", (long long unsigned int)(stop - start)); } static void run_test_stereo_to_mono_s16(void) { @@ -558,6 +653,7 @@ pa_log_debug("checking NEON remap_stereo_to_mono(s16, %d)", SAMPLES); memset(mono_ref, 0, sizeof(mono_ref)); + memset(mono_gen, 0, sizeof(mono_gen)); memset(mono, 0, sizeof(mono)); for (i = 0; i < 2*SAMPLES; i++) { @@ -572,8 +668,8 @@ oss.channels = 1; remap.i_ss = &iss; remap.o_ss = &oss; - remap.map_table_f[0][0] = 1.0; - remap.map_table_f[0][1] = 1.0; + remap.map_table_i[0][0] = 0x10000; + remap.map_table_i[0][1] = 0x10000; remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES); remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES); @@ -597,34 +693,31 @@ remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES); } stop = pa_rtclock_now(); - pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start)); + pa_log_info("NEON:\t\t%llu usec.", (long long unsigned int)(stop - start)); start = pa_rtclock_now(); for (i = 0; i < TIMES; i++) { remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES); } stop = pa_rtclock_now(); - pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start)); + pa_log_info("ref:\t\t%llu usec.", (long long unsigned int)(stop - start)); start = pa_rtclock_now(); for (i = 0; i < TIMES; i++) { remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES); } stop = pa_rtclock_now(); - pa_log_info("generic: %llu usec.", (long long unsigned int)(stop - start)); + pa_log_info("generic:\t%llu usec.", (long long unsigned int)(stop - start)); } - #endif /* defined(__arm__) */ int main() { - run_test_stereo_to_mono_float(); run_test_stereo_to_mono_s16(); run_test_mono_to_stereo_float(); run_test_mono_to_stereo_s16(); - return EXIT_SUCCESS; }