Mercurial > hg > pa-neon
view remap_neon.c @ 1:b829afbea564
more testing
author | Peter Meerwald <p.meerwald@bct-electronic.com> |
---|---|
date | Fri, 20 Apr 2012 14:26:14 +0200 |
parents | e0040ee59c3c |
children | 09ee6a01a3d3 |
line wrap: on
line source
/*
 * Copyright 2012 Peter Meerwald <p.meerwald@bct-electronic.com>
 *
 * Stand-alone check and micro-benchmark of NEON channel remapping routines
 * against plain C implementations (a special-cased one and a generic
 * matrix-based one).
 */

#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
#include <assert.h>

typedef enum pa_sample_format {
    PA_SAMPLE_S16LE,
    PA_SAMPLE_FLOAT32LE,
} pa_sample_format_t;

#define PA_SAMPLE_S16NE PA_SAMPLE_S16LE
#define PA_SAMPLE_FLOAT32NE PA_SAMPLE_FLOAT32LE

typedef struct pa_sample_spec {
    pa_sample_format_t format;
    uint32_t rate;
    uint8_t channels;
} pa_sample_spec;

#define PA_CHANNELS_MAX 32

/* Remapping state: sample format, input/output specs and the per-channel
 * weight tables, indexed as [output channel][input channel]. */
typedef struct {
    pa_sample_format_t *format;
    pa_sample_spec *i_ss, *o_ss;
    float map_table_f[PA_CHANNELS_MAX][PA_CHANNELS_MAX];
    int32_t map_table_i[PA_CHANNELS_MAX][PA_CHANNELS_MAX];
} pa_remap_t;

typedef long long unsigned int pa_usec_t;

#define pa_assert(x) assert(x)
#define pa_assert_not_reached() assert(0)

#define PA_CLAMP_UNLIKELY(x, low, high) \
    (((x) < (low)) ? (low) : (((x) > (high)) ? (high) : (x)))

/* Format a message printf-style and print it to stdout followed by a
 * newline. Messages longer than the internal buffer are truncated. */
static void pa_log_info(const char *format, ...)
{
    va_list ap;
    char buf[1024];

    va_start(ap, format);
    vsnprintf(buf, sizeof(buf), format, ap);
    va_end(ap);

    printf("%s\n", buf);
}

#define pa_log_debug pa_log_info

/* Return the current time of day in microseconds. */
static pa_usec_t pa_rtclock_now(void)
{
    struct timeval tv;

    gettimeofday(&tv, NULL);
    return (pa_usec_t) tv.tv_sec * 1000000ULL + (pa_usec_t) tv.tv_usec;
}

/*
 * Generic remapper driven by the weight tables in m.
 *
 * dst is zeroed first; then, for every (output, input) channel pair with a
 * positive weight, the input channel is accumulated into the output channel.
 * A weight of at least 1.0 (float) or 0x10000 (s16) adds the sample unscaled;
 * smaller weights scale it (the s16 product is shifted right by 16 bits).
 *
 * n is the number of frames: src holds n * input-channels interleaved
 * samples, dst receives n * output-channels interleaved samples.
 */
static void remap_channels_matrix_c(pa_remap_t *m, void *dst, const void *src, unsigned n)
{
    unsigned oc, ic, i;
    unsigned n_ic, n_oc;

    n_ic = m->i_ss->channels;
    n_oc = m->o_ss->channels;

    switch (*m->format) {
    case PA_SAMPLE_FLOAT32NE: {
        float *d;
        const float *s;

        memset(dst, 0, n * sizeof(float) * n_oc);

        for (oc = 0; oc < n_oc; oc++) {
            for (ic = 0; ic < n_ic; ic++) {
                float vol;

                vol = m->map_table_f[oc][ic];

                if (vol <= 0.0)
                    continue;

                d = (float *) dst + oc;
                s = (const float *) src + ic;

                if (vol >= 1.0) {
                    for (i = n; i > 0; i--, s += n_ic, d += n_oc)
                        *d += *s;
                } else {
                    for (i = n; i > 0; i--, s += n_ic, d += n_oc)
                        *d += *s * vol;
                }
            }
        }
        break;
    }
    case PA_SAMPLE_S16NE: {
        int16_t *d;
        const int16_t *s;

        memset(dst, 0, n * sizeof(int16_t) * n_oc);

        for (oc = 0; oc < n_oc; oc++) {
            for (ic = 0; ic < n_ic; ic++) {
                int32_t vol;

                vol = m->map_table_i[oc][ic];

                if (vol <= 0)
                    continue;

                d = (int16_t *) dst + oc;
                s = (const int16_t *) src + ic;

                if (vol >= 0x10000) {
                    for (i = n; i > 0; i--, s += n_ic, d += n_oc)
                        *d += *s;
                } else {
                    for (i = n; i > 0; i--, s += n_ic, d += n_oc)
                        *d += (int16_t) (((int32_t) *s * vol) >> 16);
                }
            }
        }
        break;
    }
    default:
        pa_assert_not_reached();
    }
}

/*
 * Plain C mono -> stereo: every input sample is written to both channels of
 * the corresponding output frame. n is the number of input samples; dst
 * receives 2 * n samples. The main loop is unrolled by four.
 */
static void remap_mono_to_stereo_c(pa_remap_t *m, void *dst, const void *src, unsigned n)
{
    unsigned i;

    switch (*m->format) {
    case PA_SAMPLE_FLOAT32NE: {
        float *d = (float *) dst;
        const float *s = (const float *) src;

        for (i = n >> 2; i; i--) {
            d[0] = d[1] = s[0];
            d[2] = d[3] = s[1];
            d[4] = d[5] = s[2];
            d[6] = d[7] = s[3];
            s += 4;
            d += 8;
        }
        for (i = n & 3; i; i--) {
            d[0] = d[1] = s[0];
            s++;
            d += 2;
        }
        break;
    }
    case PA_SAMPLE_S16NE: {
        int16_t *d = (int16_t *) dst;
        const int16_t *s = (const int16_t *) src;

        for (i = n >> 2; i; i--) {
            d[0] = d[1] = s[0];
            d[2] = d[3] = s[1];
            d[4] = d[5] = s[2];
            d[6] = d[7] = s[3];
            s += 4;
            d += 8;
        }
        for (i = n & 3; i; i--) {
            d[0] = d[1] = s[0];
            s++;
            d += 2;
        }
        break;
    }
    default:
        pa_assert_not_reached();
    }
}

/*
 * Plain C stereo -> mono: each output sample is the sum of the two samples
 * of the corresponding input frame. n is the number of output samples; src
 * holds 2 * n samples. The main loop is unrolled by four.
 *
 * Both formats overwrite dst; the result does not depend on what dst held
 * before the call. The s16 sum is narrowed back to 16 bits without
 * saturation.
 */
static void remap_stereo_to_mono_c(pa_remap_t *m, void *dst, const void *src, unsigned n)
{
    unsigned i;

    switch (*m->format) {
    case PA_SAMPLE_FLOAT32NE: {
        float *d = (float *) dst;
        const float *s = (const float *) src;

        for (i = n >> 2; i > 0; i--) {
            d[0] = s[0] + s[1];
            d[1] = s[2] + s[3];
            d[2] = s[4] + s[5];
            d[3] = s[6] + s[7];
            s += 8;
            d += 4;
        }
        for (i = n & 3; i; i--) {
            d[0] = s[0] + s[1];
            s += 2;
            d += 1;
        }
        break;
    }
    case PA_SAMPLE_S16NE: {
        int16_t *d = (int16_t *) dst;
        const int16_t *s = (const int16_t *) src;

        for (i = n >> 2; i > 0; i--) {
            d[0] = (int16_t) (s[0] + s[1]);
            d[1] = (int16_t) (s[2] + s[3]);
            d[2] = (int16_t) (s[4] + s[5]);
            d[3] = (int16_t) (s[6] + s[7]);
            s += 8;
            d += 4;
        }
        for (i = n & 3; i; i--) {
            d[0] = (int16_t) (s[0] + s[1]);
            s += 2;
            d += 1;
        }
        break;
    }
    default:
        pa_assert_not_reached();
    }
}

#if defined(__arm__)

#include "arm_neon.h"

/*
 * NEON mono -> stereo: loads a vector of input samples and stores it
 * interleaved with itself (4 floats or 8 int16 per iteration); the remaining
 * samples are handled by a scalar tail loop.
 */
void remap_mono_to_stereo_neon(pa_remap_t *m, void *dst, const void *src, unsigned n)
{
    unsigned i;

    switch (*m->format) {
    case PA_SAMPLE_FLOAT32NE: {
        float *d = (float *) dst;
        const float *s = (const float *) src;

        for (i = 0; i < n/4; i++) {
            float32x4x2_t stereo;

            stereo.val[0] = vld1q_f32(s);
            stereo.val[1] = stereo.val[0];
            vst2q_f32(d, stereo);
            s += 4;
            d += 8;
        }
        for (i = n & ~3u; i < n; i++) {
            d[0] = d[1] = s[0];
            s++;
            d += 2;
        }
        break;
    }
    case PA_SAMPLE_S16NE: {
        int16_t *d = (int16_t *) dst;
        const int16_t *s = (const int16_t *) src;

        for (i = 0; i < n/8; i++) {
            int16x8x2_t stereo;

            stereo.val[0] = vld1q_s16(s);
            stereo.val[1] = stereo.val[0];
            vst2q_s16(d, stereo);
            s += 8;
            d += 16;
        }
        for (i = n & ~7u; i < n; i++) {
            d[0] = d[1] = s[0];
            s++;
            d += 2;
        }
        break;
    }
    default:
        pa_assert_not_reached();
    }
}

/* NEON stereo -> mono for float: de-interleaving load of 4 frames, add the
 * two channels, store 4 results; scalar loop for the remaining frames. */
/* don't inline, causes ICE, see https://bugs.launchpad.net/bugs/936863 */
static __attribute__ ((noinline)) void stereo_to_mono_float(float *d, const float *s, unsigned n)
{
    unsigned i;

    for (i = 0; i < n/4; i++) {
        float32x4x2_t stereo = vld2q_f32(s);
        float32x4_t mono = vaddq_f32(stereo.val[0], stereo.val[1]);

        vst1q_f32(d, mono);
        s += 8;
        d += 4;
    }
    for (i = n & ~3u; i < n; i++) {
        d[0] = s[0] + s[1];
        s += 2;
        d++;
    }
}

/* NEON stereo -> mono for s16: de-interleaving load of 8 frames, add the
 * two channels, store 8 results; scalar loop for the remaining frames. */
/* don't inline, causes ICE, see https://bugs.launchpad.net/bugs/936863 */
static __attribute__ ((noinline)) void stereo_to_mono_int16(int16_t *d, const int16_t *s, unsigned n)
{
    unsigned int i;

    for (i = 0; i < n/8; i++) {
        int16x8x2_t stereo = vld2q_s16(s);
        int16x8_t mono = vaddq_s16(stereo.val[0], stereo.val[1]);

        vst1q_s16(d, mono);
        s += 16;
        d += 8;
    }
    for (i = n & ~7u; i < n; i++) {
        d[0] = (int16_t) (s[0] + s[1]);
        s += 2;
        d++;
    }
}

/* NEON stereo -> mono: dispatches on the sample format to the helpers above. */
static void remap_stereo_to_mono_neon(pa_remap_t *m, void *dst, const void *src, unsigned n)
{
    switch (*m->format) {
    case PA_SAMPLE_FLOAT32NE:
        stereo_to_mono_float(dst, src, n);
        break;
    case PA_SAMPLE_S16NE:
        stereo_to_mono_int16(dst, src, n);
        break;
    default:
        pa_assert_not_reached();
    }
}

/* SAMPLES is not a multiple of 4 or 8, so the scalar tail loops of the NEON
 * routines are exercised as well. */
#define SAMPLES 1019
#define TIMES 10000

typedef void (*remap_func_t)(pa_remap_t *m, void *dst, const void *src, unsigned n);

/* Run func TIMES times over SAMPLES frames and log the elapsed wall-clock
 * time in microseconds, prefixed with label. */
static void time_remap(const char *label, remap_func_t func, pa_remap_t *m, void *dst, const void *src)
{
    pa_usec_t start, stop;
    int i;

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++)
        func(m, dst, src, SAMPLES);
    stop = pa_rtclock_now();

    pa_log_info("%s: %llu usec.", label, (long long unsigned int)(stop - start));
}

/* Compare NEON mono -> stereo (float) against both C implementations on
 * random input, log every mismatch, then time all three. */
static void run_test_mono_to_stereo_float(void)
{
    float stereo[2*SAMPLES];
    float stereo_ref[2*SAMPLES];
    float stereo_gen[2*SAMPLES];
    float mono[SAMPLES];
    int i;
    pa_sample_format_t sf;
    pa_sample_spec iss, oss;
    pa_remap_t remap;

    pa_log_debug("checking NEON remap_mono_to_stereo(float, %d)", SAMPLES);

    memset(stereo_ref, 0, sizeof(stereo_ref));
    memset(stereo, 0, sizeof(stereo));

    for (i = 0; i < SAMPLES; i++) {
        mono[i] = rand()/(float) RAND_MAX - 0.5f;
    }

    /* Start from an all-zero state so no table entry is left indeterminate. */
    memset(&remap, 0, sizeof(remap));
    memset(&iss, 0, sizeof(iss));
    memset(&oss, 0, sizeof(oss));

    sf = PA_SAMPLE_FLOAT32NE;
    remap.format = &sf;
    iss.format = PA_SAMPLE_FLOAT32NE;
    iss.channels = 1;
    oss.format = PA_SAMPLE_FLOAT32NE;
    oss.channels = 2;
    remap.i_ss = &iss;
    remap.o_ss = &oss;
    remap.map_table_f[0][0] = 1.0;
    remap.map_table_f[1][0] = 1.0;

    remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
    remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES);
    remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES);

    for (i = 0; i < 2*SAMPLES; i++) {
        if (fabsf(stereo[i] - stereo_ref[i]) > 0.00001) {
            pa_log_debug("%d: %.3f != %.3f (%.3f)", i, stereo[i], stereo_ref[i],
                         mono[i/2]);
        }
    }
    for (i = 0; i < 2*SAMPLES; i++) {
        if (fabsf(stereo[i] - stereo_gen[i]) > 0.00001) {
            pa_log_debug("%d: %.3f != %.3f (%.3f)", i, stereo[i], stereo_gen[i],
                         mono[i/2]);
        }
    }

    time_remap("NEON", remap_mono_to_stereo_neon, &remap, stereo, mono);
    time_remap("ref", remap_mono_to_stereo_c, &remap, stereo_ref, mono);
    time_remap("generic", remap_channels_matrix_c, &remap, stereo_gen, mono);
}

/* Compare NEON stereo -> mono (float) against both C implementations on
 * random input, log every mismatch, then time all three. */
static void run_test_stereo_to_mono_float(void)
{
    float stereo[2*SAMPLES];
    float mono_ref[SAMPLES];
    float mono_gen[SAMPLES];
    float mono[SAMPLES];
    int i;
    pa_sample_format_t sf;
    pa_sample_spec iss, oss;
    pa_remap_t remap;

    pa_log_debug("checking NEON remap_stereo_to_mono(float, %d)", SAMPLES);

    memset(mono_ref, 0, sizeof(mono_ref));
    memset(mono, 0, sizeof(mono));

    for (i = 0; i < 2*SAMPLES; i++) {
        stereo[i] = rand()/(float) RAND_MAX - 0.5f;
    }

    /* Start from an all-zero state so no table entry is left indeterminate. */
    memset(&remap, 0, sizeof(remap));
    memset(&iss, 0, sizeof(iss));
    memset(&oss, 0, sizeof(oss));

    sf = PA_SAMPLE_FLOAT32NE;
    remap.format = &sf;
    iss.format = PA_SAMPLE_FLOAT32NE;
    iss.channels = 2;
    oss.format = PA_SAMPLE_FLOAT32NE;
    oss.channels = 1;
    remap.i_ss = &iss;
    remap.o_ss = &oss;
    remap.map_table_f[0][0] = 1.0;
    remap.map_table_f[0][1] = 1.0;

    remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES);
    remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES);
    remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES);

    for (i = 0; i < SAMPLES; i++) {
        if (fabsf(mono[i] - mono_ref[i]) > 0.00001) {
            pa_log_debug("%d: %.3f != %.3f (%.3f %0.3f)", i, mono[i], mono_ref[i],
                         stereo[2*i+0], stereo[2*i+1]);
        }
    }
    for (i = 0; i < SAMPLES; i++) {
        if (fabsf(mono[i] - mono_gen[i]) > 0.00001) {
            pa_log_debug("%d: %.3f != %.3f (%.3f %0.3f)", i, mono[i], mono_gen[i],
                         stereo[2*i+0], stereo[2*i+1]);
        }
    }

    time_remap("NEON", remap_stereo_to_mono_neon, &remap, mono, stereo);
    time_remap("ref", remap_stereo_to_mono_c, &remap, mono_ref, stereo);
    time_remap("generic", remap_channels_matrix_c, &remap, mono_gen, stereo);
}

/* Compare NEON mono -> stereo (s16) against both C implementations on
 * random input, log every mismatch, then time all three. */
static void run_test_mono_to_stereo_s16(void)
{
    int16_t stereo[2*SAMPLES];
    int16_t stereo_ref[2*SAMPLES];
    int16_t stereo_gen[2*SAMPLES];
    int16_t mono[SAMPLES];
    int i;
    pa_sample_format_t sf;
    pa_sample_spec iss, oss;
    pa_remap_t remap;

    pa_log_debug("checking NEON remap_mono_to_stereo(s16, %d)", SAMPLES);

    memset(stereo_ref, 0, sizeof(stereo_ref));
    memset(stereo, 0, sizeof(stereo));

    for (i = 0; i < SAMPLES; i++) {
        /* only the low 16 bits of the random value are kept */
        mono[i] = (int16_t) (rand() - RAND_MAX/2);
    }

    /* Start from an all-zero state so no table entry is left indeterminate. */
    memset(&remap, 0, sizeof(remap));
    memset(&iss, 0, sizeof(iss));
    memset(&oss, 0, sizeof(oss));

    sf = PA_SAMPLE_S16NE;
    remap.format = &sf;
    iss.format = PA_SAMPLE_S16NE;
    iss.channels = 1;
    oss.format = PA_SAMPLE_S16NE;
    oss.channels = 2;
    remap.i_ss = &iss;
    remap.o_ss = &oss;
    remap.map_table_f[0][0] = 1.0;
    remap.map_table_f[1][0] = 1.0;
    /* the s16 path of the generic remapper reads the integer table */
    remap.map_table_i[0][0] = 0x10000;
    remap.map_table_i[1][0] = 0x10000;

    remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
    remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES);
    remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES);

    for (i = 0; i < 2*SAMPLES; i++) {
        if (abs(stereo[i] - stereo_ref[i]) > 0) {
            pa_log_debug("%d: %d != %d (%d)", i, stereo[i], stereo_ref[i],
                         mono[i/2]);
        }
    }
    for (i = 0; i < 2*SAMPLES; i++) {
        if (abs(stereo[i] - stereo_gen[i]) > 0) {
            pa_log_debug("%d: %d != %d (%d)", i, stereo[i], stereo_gen[i],
                         mono[i/2]);
        }
    }

    time_remap("NEON", remap_mono_to_stereo_neon, &remap, stereo, mono);
    time_remap("ref", remap_mono_to_stereo_c, &remap, stereo_ref, mono);
    time_remap("generic", remap_channels_matrix_c, &remap, stereo_gen, mono);
}

/* Compare NEON stereo -> mono (s16) against both C implementations on
 * random input, log every mismatch, then time all three. */
static void run_test_stereo_to_mono_s16(void)
{
    int16_t stereo[2*SAMPLES];
    int16_t mono_ref[SAMPLES];
    int16_t mono_gen[SAMPLES];
    int16_t mono[SAMPLES];
    int i;
    pa_sample_format_t sf;
    pa_sample_spec iss, oss;
    pa_remap_t remap;

    pa_log_debug("checking NEON remap_stereo_to_mono(s16, %d)", SAMPLES);

    memset(mono_ref, 0, sizeof(mono_ref));
    memset(mono, 0, sizeof(mono));

    for (i = 0; i < 2*SAMPLES; i++) {
        /* only the low 16 bits of the random value are kept */
        stereo[i] = (int16_t) (rand() - RAND_MAX/2);
    }

    /* Start from an all-zero state so no table entry is left indeterminate. */
    memset(&remap, 0, sizeof(remap));
    memset(&iss, 0, sizeof(iss));
    memset(&oss, 0, sizeof(oss));

    sf = PA_SAMPLE_S16NE;
    remap.format = &sf;
    iss.format = PA_SAMPLE_S16NE;
    iss.channels = 2;
    oss.format = PA_SAMPLE_S16NE;
    oss.channels = 1;
    remap.i_ss = &iss;
    remap.o_ss = &oss;
    remap.map_table_f[0][0] = 1.0;
    remap.map_table_f[0][1] = 1.0;
    /* the s16 path of the generic remapper reads the integer table */
    remap.map_table_i[0][0] = 0x10000;
    remap.map_table_i[0][1] = 0x10000;

    remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES);
    remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES);
    remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES);

    for (i = 0; i < SAMPLES; i++) {
        if (abs(mono[i] - mono_ref[i]) > 0) {
            pa_log_debug("%d: %d != %d (%d %d)", i, mono[i], mono_ref[i],
                         stereo[2*i+0], stereo[2*i+1]);
        }
    }
    for (i = 0; i < SAMPLES; i++) {
        if (abs(mono[i] - mono_gen[i]) > 0) {
            pa_log_debug("%d: %d != %d (%d %d)", i, mono[i], mono_gen[i],
                         stereo[2*i+0], stereo[2*i+1]);
        }
    }

    time_remap("NEON", remap_stereo_to_mono_neon, &remap, mono, stereo);
    time_remap("ref", remap_stereo_to_mono_c, &remap, mono_ref, stereo);
    time_remap("generic", remap_channels_matrix_c, &remap, mono_gen, stereo);
}

#endif /* defined(__arm__) */

/* Run all NEON checks/benchmarks. The tests only exist when building for
 * ARM; elsewhere a notice is printed so the file still builds and runs. */
int main(void)
{
#if defined(__arm__)
    run_test_stereo_to_mono_float();
    run_test_stereo_to_mono_s16();
    run_test_mono_to_stereo_float();
    run_test_mono_to_stereo_s16();
#else
    pa_log_info("NEON remap tests skipped: not built for ARM");
#endif

    return EXIT_SUCCESS;
}