# HG changeset patch # User Peter Meerwald # Date 1334924774 -7200 # Node ID b829afbea564ccfc4ec54e75505c396c61ddaf0d # Parent e0040ee59c3c88b43a3ca2826188a2db74b15b51 more testing diff -r e0040ee59c3c -r b829afbea564 remap_neon.c --- a/remap_neon.c Thu Jan 12 17:27:46 2012 +0100 +++ b/remap_neon.c Fri Apr 20 14:26:14 2012 +0200 @@ -10,18 +10,31 @@ #include #include +typedef unsigned char uint8_t; +typedef short int16_t; +typedef unsigned int uint32_t; -typedef short int16_t; typedef enum pa_sample_format { PA_SAMPLE_S16LE, PA_SAMPLE_FLOAT32LE, } pa_sample_format_t; #define PA_SAMPLE_S16NE PA_SAMPLE_S16LE #define PA_SAMPLE_FLOAT32NE PA_SAMPLE_FLOAT32LE + +typedef struct pa_sample_spec { + pa_sample_format_t format; + uint32_t rate; + uint8_t channels; +} pa_sample_spec; + +#define PA_CHANNELS_MAX 32 typedef struct { pa_sample_format_t *format; + pa_sample_spec *i_ss, *o_ss; + float map_table_f[PA_CHANNELS_MAX][PA_CHANNELS_MAX]; + int32_t map_table_i[PA_CHANNELS_MAX][PA_CHANNELS_MAX]; } pa_remap_t; -typedef void (*pa_remap_func_t)(pa_remap_t *m, void *dst, const void *src, unsigned n); + typedef long long unsigned int pa_usec_t; #define pa_assert(x) assert(x) @@ -48,6 +61,80 @@ return tv.tv_sec * 1000000ULL + tv.tv_usec; } +static void remap_channels_matrix_c(pa_remap_t *m, void *dst, const void *src, unsigned n) { + unsigned oc, ic, i; + unsigned n_ic, n_oc; + + n_ic = m->i_ss->channels; + n_oc = m->o_ss->channels; + + switch (*m->format) { + case PA_SAMPLE_FLOAT32NE: + { + float *d, *s; + + memset(dst, 0, n * sizeof(float) * n_oc); + + for (oc = 0; oc < n_oc; oc++) { + + for (ic = 0; ic < n_ic; ic++) { + float vol; + + vol = m->map_table_f[oc][ic]; + + if (vol <= 0.0) + continue; + + d = (float *)dst + oc; + s = (float *)src + ic; + + if (vol >= 1.0) { + for (i = n; i > 0; i--, s += n_ic, d += n_oc) + *d += *s; + } else { + for (i = n; i > 0; i--, s += n_ic, d += n_oc) + *d += *s * vol; + } + } + } + + break; + } + case PA_SAMPLE_S16NE: + { + int16_t *d, *s; + + memset(dst, 0, n * sizeof(int16_t) * n_oc); + + for (oc = 0; oc < n_oc; oc++) { + + for (ic = 0; ic < n_ic; ic++) { + int32_t vol; + + vol = m->map_table_i[oc][ic]; + + if (vol <= 0) + continue; + + d = (int16_t *)dst + oc; + s = (int16_t *)src + ic; + + if (vol >= 0x10000) { + for (i = n; i > 0; i--, s += n_ic, d += n_oc) + *d += *s; + } else { + for (i = n; i > 0; i--, s += n_ic, d += n_oc) + *d += (int16_t) (((int32_t)*s * vol) >> 16); + } + } + } + break; + } + default: + pa_assert_not_reached(); + } +} + static void remap_mono_to_stereo_c(pa_remap_t *m, void *dst, const void *src, unsigned n) { unsigned i; @@ -101,6 +188,54 @@ } } + + +static void remap_stereo_to_mono_c(pa_remap_t *m, void *dst, const void *src, unsigned n) { + unsigned i; + + switch (*m->format) { + case PA_SAMPLE_FLOAT32NE: + { + float *d = (float *) dst, *s = (float *) src; + + for (i = n >> 2; i > 0; i--) { + d[0] = s[0] + s[1]; + d[1] = s[2] + s[3]; + d[2] = s[4] + s[5]; + d[3] = s[6] + s[7]; + s += 8; + d += 4; + } + for (i = n & 3; i; i--) { + d[0] = s[0] + s[1]; + s += 2; + d += 1; + } + break; + } + case PA_SAMPLE_S16NE: + { + int16_t *d = (int16_t *) dst, *s = (int16_t *) src; + + for (i = n >> 2; i > 0; i--) { + *d++ += s[0] + s[1]; + *d++ += s[2] + s[3]; + *d++ += s[4] + s[5]; + *d++ += s[6] + s[7]; + s += 8; + } + for (i = n & 3; i; i--) { + *d++ += s[0] + s[1]; + s += 2; + } + break; + } + default: + pa_assert_not_reached(); + } +} + + #if defined(__arm__) #include "arm_neon.h" @@ -153,17 +288,66 @@ } } +/* don't inline, causes ICE, see https://bugs.launchpad.net/bugs/936863 */ +static __attribute__ ((noinline)) void stereo_to_mono_float(float *d, const float *s, unsigned n) { + unsigned i; + + for (i = 0; i < n/4; i++) { + float32x4x2_t stereo = vld2q_f32(s); + float32x4_t mono = vaddq_f32(stereo.val[0], stereo.val[1]); + vst1q_f32(d, mono); + s += 8; + d += 4; + } + for (i = n & ~3; i < n; i++) { + d[0] = s[0] + s[1]; + s += 2; + d++; + } +} + +/* don't inline, causes ICE, see https://bugs.launchpad.net/bugs/936863 */ +static __attribute__ ((noinline)) void stereo_to_mono_int16(int16_t *d, const int16_t *s, unsigned n) { + unsigned int i; + + for (i = 0; i < n/8; i++) { + int16x8x2_t stereo = vld2q_s16(s); + int16x8_t mono = vaddq_s16(stereo.val[0], stereo.val[1]); + vst1q_s16(d, mono); + s += 16; + d += 8; + } + for (i = n & ~7; i < n; i++) { + d[0] = s[0] + s[1]; + s += 2; + d++; + } +} + +static void remap_stereo_to_mono_neon(pa_remap_t *m, void *dst, const void *src, unsigned n) { + switch (*m->format) { + case PA_SAMPLE_FLOAT32NE: + stereo_to_mono_float(dst, src, n); + break; + case PA_SAMPLE_S16NE: + stereo_to_mono_int16(dst, src, n); + break; + default: + pa_assert_not_reached(); + } +} #define SAMPLES 1019 #define TIMES 10000 -static void run_test_float(void) { +static void run_test_mono_to_stereo_float(void) { float stereo[2*SAMPLES]; float stereo_ref[2*SAMPLES]; + float stereo_gen[2*SAMPLES]; float mono[SAMPLES]; int i; pa_usec_t start, stop; - pa_remap_func_t func; pa_sample_format_t sf; + pa_sample_spec iss, oss; pa_remap_t remap; pa_log_debug("checking NEON remap_mono_to_stereo(float, %d)", SAMPLES); @@ -177,8 +361,17 @@ sf = PA_SAMPLE_FLOAT32NE; remap.format = &sf; - func = (pa_remap_func_t) remap_mono_to_stereo_c; - func(&remap, stereo_ref, mono, SAMPLES); + iss.format = PA_SAMPLE_FLOAT32NE; + iss.channels = 1; + oss.format = PA_SAMPLE_FLOAT32NE; + oss.channels = 2; + remap.i_ss = &iss; + remap.o_ss = &oss; + remap.map_table_f[0][0] = 1.0; + remap.map_table_f[1][0] = 1.0; + + remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES); + remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES); remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES); for (i = 0; i < 2*SAMPLES; i++) { @@ -187,6 +380,12 @@ mono[i/2]); } } + for (i = 0; i < 2*SAMPLES; i++) { + if (fabsf(stereo[i] - stereo_gen[i]) > 0.00001) { + pa_log_debug("%d: %.3f != %.3f (%.3f)", i, stereo[i], stereo_gen[i], + mono[i/2]); + } + } start = pa_rtclock_now(); for (i = 0; i < TIMES; i++) { @@ -197,20 +396,92 @@ start = pa_rtclock_now(); for (i = 0; i < TIMES; i++) { - func(&remap, stereo_ref, mono, SAMPLES); + remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES); } stop = pa_rtclock_now(); pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start)); + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES); + } + stop = pa_rtclock_now(); + pa_log_info("generic: %llu usec.", (long long unsigned int)(stop - start)); } -static void run_test_s16(void) { +static void run_test_stereo_to_mono_float(void) { + float stereo[2*SAMPLES]; + float mono_ref[SAMPLES]; + float mono_gen[SAMPLES]; + float mono[SAMPLES]; + int i; + pa_usec_t start, stop; + pa_sample_format_t sf; + pa_sample_spec iss, oss; + pa_remap_t remap; + + pa_log_debug("checking NEON remap_stereo_to_mono(float, %d)", SAMPLES); + + memset(mono_ref, 0, sizeof(mono_ref)); + memset(mono, 0, sizeof(mono)); + + for (i = 0; i < 2*SAMPLES; i++) { + stereo[i] = rand()/(float) RAND_MAX - 0.5f; + } + + sf = PA_SAMPLE_FLOAT32NE; + remap.format = &sf; + iss.format = PA_SAMPLE_FLOAT32NE; + iss.channels = 2; + oss.format = PA_SAMPLE_FLOAT32NE; + oss.channels = 1; + remap.i_ss = &iss; + remap.o_ss = &oss; + remap.map_table_f[0][0] = 1.0; + remap.map_table_f[0][1] = 1.0; + + remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES); + remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES); + remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES); + + for (i = 0; i < SAMPLES; i++) { + if (fabsf(mono[i] - mono_ref[i]) > 0.00001) { + pa_log_debug("%d: %.3f != %.3f (%.3f %0.3f)", i, mono[i], mono_ref[i], + stereo[2*i+0], stereo[2*i+1]); + } + } + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES); + } + stop = pa_rtclock_now(); + pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start)); + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES); + } + stop = pa_rtclock_now(); + pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start)); + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES); + } + stop = pa_rtclock_now(); + pa_log_info("generic: %llu usec.", (long long unsigned int)(stop - start)); +} + +static void run_test_mono_to_stereo_s16(void) { int16_t stereo[2*SAMPLES]; int16_t stereo_ref[2*SAMPLES]; + int16_t stereo_gen[2*SAMPLES]; int16_t mono[SAMPLES]; int i; pa_usec_t start, stop; - pa_remap_func_t func; pa_sample_format_t sf; + pa_sample_spec iss, oss; pa_remap_t remap; pa_log_debug("checking NEON remap_mono_to_stereo(s16, %d)", SAMPLES); @@ -224,10 +495,19 @@ sf = PA_SAMPLE_S16NE; remap.format = &sf; - func = (pa_remap_func_t) remap_mono_to_stereo_c; - func(&remap, stereo_ref, mono, SAMPLES); + iss.format = PA_SAMPLE_S16NE; + iss.channels = 1; + oss.format = PA_SAMPLE_S16NE; + oss.channels = 2; + remap.i_ss = &iss; + remap.o_ss = &oss; + remap.map_table_f[0][0] = 1.0; + remap.map_table_f[1][0] = 1.0; + + remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES); + remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES); remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES); - + for (i = 0; i < 2*SAMPLES; i++) { if (abs(stereo[i] - stereo_ref[i]) > 0) { pa_log_debug("%d: %d != %d (%d)", i, stereo[i], stereo_ref[i], @@ -235,6 +515,13 @@ } } + for (i = 0; i < 2*SAMPLES; i++) { + if (abs(stereo[i] - stereo_gen[i]) > 0) { + pa_log_debug("%d: %d != %d (%d)", i, stereo[i], stereo_gen[i], + mono[i/2]); + } + } + start = pa_rtclock_now(); for (i = 0; i < TIMES; i++) { remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES); @@ -244,18 +531,100 @@ start = pa_rtclock_now(); for (i = 0; i < TIMES; i++) { - func(&remap, stereo_ref, mono, SAMPLES); + remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES); } stop = pa_rtclock_now(); pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start)); + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES); + } + stop = pa_rtclock_now(); + pa_log_info("generic: %llu usec.", (long long unsigned int)(stop - start)); } +static void run_test_stereo_to_mono_s16(void) { + int16_t stereo[2*SAMPLES]; + int16_t mono_ref[SAMPLES]; + int16_t mono_gen[SAMPLES]; + int16_t mono[SAMPLES]; + int i; + pa_usec_t start, stop; + pa_sample_format_t sf; + pa_sample_spec iss, oss; + pa_remap_t remap; + + pa_log_debug("checking NEON remap_stereo_to_mono(s16, %d)", SAMPLES); + + memset(mono_ref, 0, sizeof(mono_ref)); + memset(mono, 0, sizeof(mono)); + + for (i = 0; i < 2*SAMPLES; i++) { + stereo[i] = rand() - RAND_MAX/2; + } + + sf = PA_SAMPLE_S16NE; + remap.format = &sf; + iss.format = PA_SAMPLE_S16NE; + iss.channels = 2; + oss.format = PA_SAMPLE_S16NE; + oss.channels = 1; + remap.i_ss = &iss; + remap.o_ss = &oss; + remap.map_table_f[0][0] = 1.0; + remap.map_table_f[0][1] = 1.0; + + remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES); + remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES); + remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES); + + for (i = 0; i < SAMPLES; i++) { + if (abs(mono[i] - mono_ref[i]) > 0) { + pa_log_debug("%d: %d != %d (%d)", i, mono[i], mono_ref[i], + stereo[2*i+0], stereo[2*i+1]); + } + } + for (i = 0; i < SAMPLES; i++) { + if (abs(mono[i] - mono_gen[i]) > 0) { + pa_log_debug("%d: %d != %d (%d)", i, mono[i], mono_gen[i], + stereo[2*i+0], stereo[2*i+1]); + } + } + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES); + } + stop = pa_rtclock_now(); + pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start)); + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES); + } + stop = pa_rtclock_now(); + pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start)); + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES); + } + stop = pa_rtclock_now(); + pa_log_info("generic: %llu usec.", (long long unsigned int)(stop - start)); +} + + #endif /* defined(__arm__) */ int main() { - run_test_float(); - run_test_s16(); + run_test_stereo_to_mono_float(); + run_test_stereo_to_mono_s16(); + + run_test_mono_to_stereo_float(); + run_test_mono_to_stereo_s16(); + return EXIT_SUCCESS; } diff -r e0040ee59c3c -r b829afbea564 sconv_neon.c --- a/sconv_neon.c Thu Jan 12 17:27:46 2012 +0100 +++ b/sconv_neon.c Fri Apr 20 14:26:14 2012 +0200 @@ -103,7 +103,7 @@ } #define SAMPLES 1019 -#define TIMES 300 +#define TIMES 10000 static void run_test_from(void) { int16_t samples[SAMPLES]; diff -r e0040ee59c3c -r b829afbea564 svolume_neon.c --- a/svolume_neon.c Thu Jan 12 17:27:46 2012 +0100 +++ b/svolume_neon.c Fri Apr 20 14:26:14 2012 +0200 @@ -333,7 +333,7 @@ } #define SAMPLES 1019 -#define TIMES 1000 +#define TIMES 3000 #define CHANNELS 4 #define PADDING 16