Mercurial > hg > pa-neon
view remap_neon.c @ 4:1f6289166006
complete
author | Peter Meerwald <p.meerwald@bct-electronic.com> |
---|---|
date | Sun, 08 Jul 2012 21:03:41 +0200 |
parents | e889fd0e7769 |
children | 07763f536182 |
line wrap: on
line source
/*
 * Copyright 2012 Peter Meerwald <p.meerwald@bct-electronic.com>
 *
 * Stand-alone check and benchmark of ARM assembly channel-remapping
 * routines (mono <-> stereo, float and signed 16 bit) against two plain C
 * implementations: a specialised reference and a generic matrix remapper.
 */

#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
#include <assert.h>

/*
 * Fixed-width integer types come from <stdint.h>; hand-written typedefs can
 * disagree with the platform's definitions (which arm_neon.h pulls in).
 */

typedef enum pa_sample_format {
    PA_SAMPLE_S16LE,
    PA_SAMPLE_FLOAT32LE,
} pa_sample_format_t;

#define PA_SAMPLE_S16NE PA_SAMPLE_S16LE
#define PA_SAMPLE_FLOAT32NE PA_SAMPLE_FLOAT32LE

typedef struct pa_sample_spec {
    pa_sample_format_t format;
    uint32_t rate;
    uint8_t channels;
} pa_sample_spec;

#define PA_CHANNELS_MAX 32

/*
 * Remapping context. Both map tables are indexed [output channel][input
 * channel]; the integer table is used for S16 samples, the float table for
 * float samples.
 */
typedef struct {
    pa_sample_format_t *format;
    pa_sample_spec *i_ss, *o_ss;
    float map_table_f[PA_CHANNELS_MAX][PA_CHANNELS_MAX];
    int32_t map_table_i[PA_CHANNELS_MAX][PA_CHANNELS_MAX];
} pa_remap_t;

typedef long long unsigned int pa_usec_t;

#define pa_assert(x) assert(x)
#define pa_assert_not_reached() assert(0)

#define PA_CLAMP_UNLIKELY(x, low, high) \
    (((x) < (low)) ? (low) : (((x) > (high)) ? (high) : (x)))

/*
 * printf-style logging to stdout, one line per call. Messages longer than the
 * internal buffer are truncated instead of overflowing it.
 */
static void pa_log_info(const char *format, ...)
{
    va_list ap;
    char buf[1024];

    va_start(ap, format);
    vsnprintf(buf, sizeof(buf), format, ap);
    va_end(ap);

    printf("%s\n", buf);
}

#define pa_log_debug pa_log_info

/* Current time of day in microseconds, as reported by gettimeofday(). */
static pa_usec_t pa_rtclock_now(void)
{
    struct timeval tv;

    gettimeofday(&tv, NULL);
    return (pa_usec_t) tv.tv_sec * 1000000ULL + (pa_usec_t) tv.tv_usec;
}

/*
 * Generic remapper: dst[oc] = sum over ic of src[ic] * map_table[oc][ic].
 *
 * m   - remap context; channel counts come from i_ss/o_ss, format from *format
 * dst - interleaved output, n frames of o_ss->channels samples (overwritten)
 * src - interleaved input, n frames of i_ss->channels samples
 * n   - number of frames
 *
 * Table entries <= 0 are skipped; entries >= unity (1.0, or 0x10000 for the
 * integer table) are added without scaling.
 */
static void remap_channels_matrix_c(pa_remap_t *m, void *dst, const void *src, unsigned n)
{
    unsigned oc, ic, i;
    unsigned n_ic, n_oc;

    n_ic = m->i_ss->channels;
    n_oc = m->o_ss->channels;

    switch (*m->format) {
        case PA_SAMPLE_FLOAT32NE: {
            float *d;
            const float *s;

            memset(dst, 0, n * sizeof(float) * n_oc);

            for (oc = 0; oc < n_oc; oc++) {
                for (ic = 0; ic < n_ic; ic++) {
                    float vol = m->map_table_f[oc][ic];

                    if (vol <= 0.0)
                        continue;

                    d = (float *) dst + oc;
                    s = (const float *) src + ic;

                    if (vol >= 1.0) {
                        for (i = n; i > 0; i--, s += n_ic, d += n_oc)
                            *d += *s;
                    } else {
                        for (i = n; i > 0; i--, s += n_ic, d += n_oc)
                            *d += *s * vol;
                    }
                }
            }
            break;
        }

        case PA_SAMPLE_S16NE: {
            int16_t *d;
            const int16_t *s;

            memset(dst, 0, n * sizeof(int16_t) * n_oc);

            for (oc = 0; oc < n_oc; oc++) {
                for (ic = 0; ic < n_ic; ic++) {
                    int32_t vol = m->map_table_i[oc][ic];

                    if (vol <= 0)
                        continue;

                    d = (int16_t *) dst + oc;
                    s = (const int16_t *) src + ic;

                    if (vol >= 0x10000) {
                        for (i = n; i > 0; i--, s += n_ic, d += n_oc)
                            *d += *s;
                    } else {
                        /* the product is shifted right by 16, so 0x10000 acts as unity gain */
                        for (i = n; i > 0; i--, s += n_ic, d += n_oc)
                            *d += (int16_t) (((int32_t) *s * vol) >> 16);
                    }
                }
            }
            break;
        }

        default:
            pa_assert_not_reached();
    }
}

/*
 * C reference: duplicate each of the n mono samples in src into a left/right
 * pair in dst (2*n samples). The main loop is unrolled four times.
 */
static void remap_mono_to_stereo_c(pa_remap_t *m, void *dst, const void *src, unsigned n)
{
    unsigned i;

    switch (*m->format) {
        case PA_SAMPLE_FLOAT32NE: {
            float *d = (float *) dst;
            const float *s = (const float *) src;

            for (i = n >> 2; i; i--) {
                d[0] = d[1] = s[0];
                d[2] = d[3] = s[1];
                d[4] = d[5] = s[2];
                d[6] = d[7] = s[3];
                s += 4;
                d += 8;
            }
            for (i = n & 3; i; i--) {
                d[0] = d[1] = s[0];
                s++;
                d += 2;
            }
            break;
        }

        case PA_SAMPLE_S16NE: {
            int16_t *d = (int16_t *) dst;
            const int16_t *s = (const int16_t *) src;

            for (i = n >> 2; i; i--) {
                d[0] = d[1] = s[0];
                d[2] = d[3] = s[1];
                d[4] = d[5] = s[2];
                d[6] = d[7] = s[3];
                s += 4;
                d += 8;
            }
            for (i = n & 3; i; i--) {
                d[0] = d[1] = s[0];
                s++;
                d += 2;
            }
            break;
        }

        default:
            pa_assert_not_reached();
    }
}

/*
 * C reference: sum each left/right pair of src (2*n samples) into one mono
 * sample of dst (n samples). dst is overwritten for both sample formats.
 */
static void remap_stereo_to_mono_c(pa_remap_t *m, void *dst, const void *src, unsigned n)
{
    unsigned i;

    switch (*m->format) {
        case PA_SAMPLE_FLOAT32NE: {
            float *d = (float *) dst;
            const float *s = (const float *) src;

            for (i = n >> 2; i > 0; i--) {
                d[0] = s[0] + s[1];
                d[1] = s[2] + s[3];
                d[2] = s[4] + s[5];
                d[3] = s[6] + s[7];
                s += 8;
                d += 4;
            }
            for (i = n & 3; i; i--) {
                d[0] = s[0] + s[1];
                s += 2;
                d += 1;
            }
            break;
        }

        case PA_SAMPLE_S16NE: {
            int16_t *d = (int16_t *) dst;
            const int16_t *s = (const int16_t *) src;

            /*
             * Plain assignment, as in the float path: accumulating with +=
             * would make the result depend on the previous contents of dst.
             */
            for (i = n >> 2; i > 0; i--) {
                d[0] = (int16_t) (s[0] + s[1]);
                d[1] = (int16_t) (s[2] + s[3]);
                d[2] = (int16_t) (s[4] + s[5]);
                d[3] = (int16_t) (s[6] + s[7]);
                s += 8;
                d += 4;
            }
            for (i = n & 3; i; i--) {
                d[0] = (int16_t) (s[0] + s[1]);
                s += 2;
                d += 1;
            }
            break;
        }

        default:
            pa_assert_not_reached();
    }
}

#if defined(__arm__)

#include "arm_neon.h"

/*
 * Note on all assembly helpers below: the loop body runs before the counter
 * is tested, so the asm statement is only entered when at least one full
 * block is available; the remaining samples are handled in C.
 */

/* Mono -> stereo, float, four input samples per iteration (vld1/vst2). */
static void mono_to_stereo_float_neon_a8(float *dst, const float *src, unsigned n)
{
    unsigned tail = n & 3;

    if (n >= 4) {
        __asm__ __volatile__ (
            "mov        %[n], %[n], lsr #2\n\t"
            "1:\n\t"
            "vld1.32    {q0}, [%[src]]!\n\t"
            "vmov       q1, q0\n\t"
            "subs       %[n], %[n], #1\n\t"
            "vst2.32    {q0,q1}, [%[dst]]!\n\t"
            "bgt        1b\n\t"
            /* output operands (or input operands that get modified) */
            : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
            : /* input operands */
            : "memory", "cc", "q0", "q1" /* clobber list */
        );
    }

    while (tail--) {
        dst[0] = dst[1] = src[0];
        src++;
        dst += 2;
    }
}

/* Mono -> stereo, float, two input samples per iteration using ldm/stm. */
static void mono_to_stereo_float_neon_a9(float *dst, const float *src, unsigned n)
{
    unsigned tail = n & 1;

    if (n >= 2) {
        __asm__ __volatile__ (
            "mov        %[n], %[n], lsr #1\n\t"
            "1:\n\t"
            "ldm        %[src]!, {r4,r6}\n\t"
            "mov        r5, r4\n\t"
            "mov        r7, r6\n\t"
            "subs       %[n], %[n], #1\n\t"
            "stm        %[dst]!, {r4-r7}\n\t"
            "bgt        1b\n\t"
            /* output operands (or input operands that get modified) */
            : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
            : /* input operands */
            : "memory", "cc", "r4", "r5", "r6", "r7" /* clobber list */
        );
    }

    while (tail--) {
        dst[0] = dst[1] = src[0];
        src++;
        dst += 2;
    }
}

/* Mono -> stereo, signed 16 bit, eight input samples per iteration. */
static void mono_to_stereo_int16_neon(int16_t *dst, const int16_t *src, unsigned n)
{
    unsigned tail = n & 7;

    if (n >= 8) {
        __asm__ __volatile__ (
            "mov        %[n], %[n], lsr #3\n\t"
            "1:\n\t"
            "vld1.16    {q0}, [%[src]]!\n\t"
            "vmov       q1, q0\n\t"
            "subs       %[n], %[n], #1\n\t"
            "vst2.16    {q0,q1}, [%[dst]]!\n\t"
            "bgt        1b\n\t"
            /* output operands (or input operands that get modified) */
            : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
            : /* input operands */
            : "memory", "cc", "q0", "q1" /* clobber list */
        );
    }

    while (tail--) {
        dst[0] = dst[1] = src[0];
        src++;
        dst += 2;
    }
}

/* Format dispatcher: float via the ldm/stm variant, S16 via NEON. */
static void remap_mono_to_stereo_neon_a9(pa_remap_t *m, void *dst, const void *src, unsigned n)
{
    switch (*m->format) {
        case PA_SAMPLE_FLOAT32NE:
            mono_to_stereo_float_neon_a9(dst, src, n);
            break;
        case PA_SAMPLE_S16NE:
            mono_to_stereo_int16_neon(dst, src, n);
            break;
        default:
            pa_assert_not_reached();
    }
}

/* Format dispatcher: float via the vld1/vst2 variant, S16 via NEON. */
static void remap_mono_to_stereo_neon_a8(pa_remap_t *m, void *dst, const void *src, unsigned n)
{
    switch (*m->format) {
        case PA_SAMPLE_FLOAT32NE:
            mono_to_stereo_float_neon_a8(dst, src, n);
            break;
        case PA_SAMPLE_S16NE:
            mono_to_stereo_int16_neon(dst, src, n);
            break;
        default:
            pa_assert_not_reached();
    }
}

/* Stereo -> mono, float, four output samples per iteration (vld2/vadd/vst1). */
static void stereo_to_mono_float_neon(float *dst, const float *src, unsigned n)
{
    unsigned tail = n & 3;

    if (n >= 4) {
        __asm__ __volatile__ (
            "mov        %[n], %[n], lsr #2\n\t"
            "1:\n\t"
            "vld2.32    {q0,q1}, [%[src]]!\n\t"
            "vadd.f32   q0, q0, q1\n\t"
            "subs       %[n], %[n], #1\n\t"
            "vst1.32    {q0}, [%[dst]]!\n\t"
            "bgt        1b\n\t"
            /* output operands (or input operands that get modified) */
            : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
            : /* input operands */
            : "memory", "cc", "q0", "q1" /* clobber list */
        );
    }

    while (tail--) {
        dst[0] = src[0] + src[1];
        src += 2;
        dst++;
    }
}

/* Stereo -> mono, signed 16 bit, eight output samples per iteration. */
static void stereo_to_mono_int16_neon(int16_t *dst, const int16_t *src, unsigned n)
{
    unsigned tail = n & 7;

    if (n >= 8) {
        __asm__ __volatile__ (
            "mov        %[n], %[n], lsr #3\n\t"
            "1:\n\t"
            "vld2.16    {q0,q1}, [%[src]]!\n\t"
            "vadd.s16   q0, q0, q1\n\t"
            "subs       %[n], %[n], #1\n\t"
            "vst1.16    {q0}, [%[dst]]!\n\t"
            "bgt        1b\n\t"
            /* output operands (or input operands that get modified) */
            : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
            : /* input operands */
            : "memory", "cc", "q0", "q1" /* clobber list */
        );
    }

    while (tail--) {
        dst[0] = src[0] + src[1];
        src += 2;
        dst++;
    }
}

/* Format dispatcher for the NEON stereo -> mono routines. */
static void remap_stereo_to_mono_neon(pa_remap_t *m, void *dst, const void *src, unsigned n)
{
    switch (*m->format) {
        case PA_SAMPLE_FLOAT32NE:
            stereo_to_mono_float_neon(dst, src, n);
            break;
        case PA_SAMPLE_S16NE:
            stereo_to_mono_int16_neon(dst, src, n);
            break;
        default:
            pa_assert_not_reached();
    }
}

#define SAMPLES 1019
#define TIMES 500000

/* Maximum absolute difference tolerated between two float results. */
#define FLOAT_TOLERANCE 0.00001

/*
 * Execute `call` TIMES times and log the elapsed wall-clock time.
 * `label` must be a string literal; it is pasted in front of the format.
 * The call is kept as a direct call so timing is not skewed by indirection.
 */
#define RUN_TIMED(label, call)                                              \
    do {                                                                    \
        pa_usec_t timed_start_, timed_stop_;                                \
        int timed_rep_;                                                     \
        timed_start_ = pa_rtclock_now();                                    \
        for (timed_rep_ = 0; timed_rep_ < TIMES; timed_rep_++) {            \
            call;                                                           \
        }                                                                   \
        timed_stop_ = pa_rtclock_now();                                     \
        pa_log_info(label "%llu usec.",                                     \
                    (long long unsigned int) (timed_stop_ - timed_start_)); \
    } while (0)

/* Verify and time mono -> stereo for float samples. */
static void run_test_mono_to_stereo_float(void)
{
    float stereo_a9[2*SAMPLES];
    float stereo_a8[2*SAMPLES];
    float stereo_ref[2*SAMPLES];
    float stereo_gen[2*SAMPLES];
    float mono[SAMPLES];
    int i;
    pa_sample_format_t sf = PA_SAMPLE_FLOAT32NE;
    pa_sample_spec iss = { .format = PA_SAMPLE_FLOAT32NE, .channels = 1 };
    pa_sample_spec oss = { .format = PA_SAMPLE_FLOAT32NE, .channels = 2 };
    pa_remap_t remap = { .format = &sf, .i_ss = &iss, .o_ss = &oss };

    pa_log_debug("checking NEON remap_mono_to_stereo(float, %d)", SAMPLES);

    memset(stereo_ref, 0, sizeof(stereo_ref));
    memset(stereo_gen, 0, sizeof(stereo_gen));
    memset(stereo_a9, 0, sizeof(stereo_a9));
    memset(stereo_a8, 0, sizeof(stereo_a8));

    for (i = 0; i < SAMPLES; i++)
        mono[i] = rand()/(float) RAND_MAX - 0.5f;

    remap.map_table_f[0][0] = 1.0;
    remap.map_table_f[1][0] = 1.0;

    remap_mono_to_stereo_neon_a9(&remap, stereo_a9, mono, SAMPLES);
    remap_mono_to_stereo_neon_a8(&remap, stereo_a8, mono, SAMPLES);
    remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
    remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES);

    for (i = 0; i < 2*SAMPLES; i++) {
        if (fabsf(stereo_a9[i] - stereo_ref[i]) > FLOAT_TOLERANCE) {
            pa_log_debug("NEON/A9 %d: %.3f != %.3f (%.3f)", i,
                         stereo_a9[i], stereo_ref[i], mono[i/2]);
        }
    }
    for (i = 0; i < 2*SAMPLES; i++) {
        if (fabsf(stereo_a8[i] - stereo_ref[i]) > FLOAT_TOLERANCE) {
            pa_log_debug("NEON/A8 %d: %.3f != %.3f (%.3f)", i,
                         stereo_a8[i], stereo_ref[i], mono[i/2]);
        }
    }
    for (i = 0; i < 2*SAMPLES; i++) {
        if (fabsf(stereo_gen[i] - stereo_ref[i]) > FLOAT_TOLERANCE) {
            pa_log_debug("generic %d: %.3f != %.3f (%.3f)", i,
                         stereo_gen[i], stereo_ref[i], mono[i/2]);
        }
    }

    RUN_TIMED("ref:\t\t", remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES));
    RUN_TIMED("NEON/A9:\t", remap_mono_to_stereo_neon_a9(&remap, stereo_a9, mono, SAMPLES));
    RUN_TIMED("NEON/A8:\t", remap_mono_to_stereo_neon_a8(&remap, stereo_a8, mono, SAMPLES));
    RUN_TIMED("generic:\t", remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES));
}

/* Verify and time stereo -> mono for float samples. */
static void run_test_stereo_to_mono_float(void)
{
    float stereo[2*SAMPLES];
    float mono_ref[SAMPLES];
    float mono_gen[SAMPLES];
    float mono[SAMPLES];
    int i;
    pa_sample_format_t sf = PA_SAMPLE_FLOAT32NE;
    pa_sample_spec iss = { .format = PA_SAMPLE_FLOAT32NE, .channels = 2 };
    pa_sample_spec oss = { .format = PA_SAMPLE_FLOAT32NE, .channels = 1 };
    pa_remap_t remap = { .format = &sf, .i_ss = &iss, .o_ss = &oss };

    pa_log_debug("checking NEON remap_stereo_to_mono(float, %d)", SAMPLES);

    memset(mono_ref, 0, sizeof(mono_ref));
    memset(mono_gen, 0, sizeof(mono_gen));
    memset(mono, 0, sizeof(mono));

    for (i = 0; i < 2*SAMPLES; i++)
        stereo[i] = rand()/(float) RAND_MAX - 0.5f;

    remap.map_table_f[0][0] = 1.0;
    remap.map_table_f[0][1] = 1.0;

    remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES);
    remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES);
    remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES);

    for (i = 0; i < SAMPLES; i++) {
        if (fabsf(mono[i] - mono_ref[i]) > FLOAT_TOLERANCE) {
            pa_log_debug("%d: %.3f != %.3f (%.3f %0.3f)", i,
                         mono[i], mono_ref[i], stereo[2*i+0], stereo[2*i+1]);
        }
    }
    /* the generic result is checked too, as in the other three tests */
    for (i = 0; i < SAMPLES; i++) {
        if (fabsf(mono_gen[i] - mono_ref[i]) > FLOAT_TOLERANCE) {
            pa_log_debug("generic %d: %.3f != %.3f (%.3f %0.3f)", i,
                         mono_gen[i], mono_ref[i], stereo[2*i+0], stereo[2*i+1]);
        }
    }

    RUN_TIMED("NEON:\t\t", remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES));
    RUN_TIMED("ref:\t\t", remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES));
    RUN_TIMED("generic:\t", remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES));
}

/* Verify and time mono -> stereo for signed 16 bit samples. */
static void run_test_mono_to_stereo_s16(void)
{
    int16_t stereo_a9[2*SAMPLES];
    int16_t stereo_a8[2*SAMPLES];
    int16_t stereo_ref[2*SAMPLES];
    int16_t stereo_gen[2*SAMPLES];
    int16_t mono[SAMPLES];
    int i;
    pa_sample_format_t sf = PA_SAMPLE_S16NE;
    pa_sample_spec iss = { .format = PA_SAMPLE_S16NE, .channels = 1 };
    pa_sample_spec oss = { .format = PA_SAMPLE_S16NE, .channels = 2 };
    pa_remap_t remap = { .format = &sf, .i_ss = &iss, .o_ss = &oss };

    pa_log_debug("checking NEON remap_mono_to_stereo(s16, %d)", SAMPLES);

    memset(stereo_ref, 0, sizeof(stereo_ref));
    memset(stereo_a9, 0, sizeof(stereo_a9));
    memset(stereo_a8, 0, sizeof(stereo_a8));
    memset(stereo_gen, 0, sizeof(stereo_gen));

    for (i = 0; i < SAMPLES; i++)
        mono[i] = (int16_t) (rand() - RAND_MAX/2);

    remap.map_table_i[0][0] = 0x10000;
    remap.map_table_i[1][0] = 0x10000;

    remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
    remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES);
    remap_mono_to_stereo_neon_a9(&remap, stereo_a9, mono, SAMPLES);
    remap_mono_to_stereo_neon_a8(&remap, stereo_a8, mono, SAMPLES);

    for (i = 0; i < 2*SAMPLES; i++) {
        if (stereo_a9[i] != stereo_ref[i]) {
            pa_log_debug("NEON/A9 %d: %d != %d (%d)", i,
                         stereo_a9[i], stereo_ref[i], mono[i/2]);
        }
    }
    for (i = 0; i < 2*SAMPLES; i++) {
        if (stereo_a8[i] != stereo_ref[i]) {
            pa_log_debug("NEON/A8 %d: %d != %d (%d)", i,
                         stereo_a8[i], stereo_ref[i], mono[i/2]);
        }
    }
    for (i = 0; i < 2*SAMPLES; i++) {
        if (stereo_gen[i] != stereo_ref[i]) {
            pa_log_debug("generic %d: %d != %d (%d)", i,
                         stereo_gen[i], stereo_ref[i], mono[i/2]);
        }
    }

    RUN_TIMED("NEON/A9:\t", remap_mono_to_stereo_neon_a9(&remap, stereo_a9, mono, SAMPLES));
    RUN_TIMED("NEON/A8:\t", remap_mono_to_stereo_neon_a8(&remap, stereo_a8, mono, SAMPLES));
    RUN_TIMED("ref:\t\t", remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES));
    RUN_TIMED("generic:\t", remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES));
}

/* Verify and time stereo -> mono for signed 16 bit samples. */
static void run_test_stereo_to_mono_s16(void)
{
    int16_t stereo[2*SAMPLES];
    int16_t mono_ref[SAMPLES];
    int16_t mono_gen[SAMPLES];
    int16_t mono[SAMPLES];
    int i;
    pa_sample_format_t sf = PA_SAMPLE_S16NE;
    pa_sample_spec iss = { .format = PA_SAMPLE_S16NE, .channels = 2 };
    pa_sample_spec oss = { .format = PA_SAMPLE_S16NE, .channels = 1 };
    pa_remap_t remap = { .format = &sf, .i_ss = &iss, .o_ss = &oss };

    pa_log_debug("checking NEON remap_stereo_to_mono(s16, %d)", SAMPLES);

    memset(mono_ref, 0, sizeof(mono_ref));
    memset(mono_gen, 0, sizeof(mono_gen));
    memset(mono, 0, sizeof(mono));

    for (i = 0; i < 2*SAMPLES; i++)
        stereo[i] = (int16_t) (rand() - RAND_MAX/2);

    remap.map_table_i[0][0] = 0x10000;
    remap.map_table_i[0][1] = 0x10000;

    remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES);
    remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES);
    remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES);

    for (i = 0; i < SAMPLES; i++) {
        if (mono[i] != mono_ref[i]) {
            pa_log_debug("%d: %d != %d (%d %d)", i,
                         mono[i], mono_ref[i], stereo[2*i+0], stereo[2*i+1]);
        }
    }
    for (i = 0; i < SAMPLES; i++) {
        if (mono[i] != mono_gen[i]) {
            pa_log_debug("%d: %d != %d (%d %d)", i,
                         mono[i], mono_gen[i], stereo[2*i+0], stereo[2*i+1]);
        }
    }

    RUN_TIMED("NEON:\t\t", remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES));
    RUN_TIMED("ref:\t\t", remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES));
    RUN_TIMED("generic:\t", remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES));
}

#endif /* defined(__arm__) */

int main(void)
{
    /* not in user space
    unsigned cpuid;
    asm volatile(
        "mrc p15, 0, %[cpuid], c0, c0, 0\n\t"
        : [cpuid] "=r" (cpuid)
        :
        : "cc");
    printf("%08x %03x\n", cpuid, (cpuid >> 4) & 0xfff);
    */

#if defined(__arm__)
    run_test_stereo_to_mono_float();
    run_test_stereo_to_mono_s16();
    run_test_mono_to_stereo_float();
    run_test_mono_to_stereo_s16();
#else
    /* The tests exist only for ARM; reference the portable helpers so other
     * targets still build without unused-function warnings. */
    (void) pa_rtclock_now;
    (void) remap_channels_matrix_c;
    (void) remap_mono_to_stereo_c;
    (void) remap_stereo_to_mono_c;
    pa_log_info("NEON remap tests require an ARM target; nothing to run.");
#endif

    return EXIT_SUCCESS;
}