Mercurial > hg > pa-neon
changeset 0:e0040ee59c3c
import
author | Peter Meerwald <p.meerwald@bct-electronic.com> |
---|---|
date | Thu, 12 Jan 2012 17:27:46 +0100 |
parents | |
children | b829afbea564 |
files | .hgignore compile.sh remap_neon.c sconv_neon.c svolume_neon.c |
diffstat | 5 files changed, 928 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.hgignore Thu Jan 12 17:27:46 2012 +0100 @@ -0,0 +1,4 @@ +syntax: glob +sconv_neon +svolume_neon +remap_neon
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/compile.sh Thu Jan 12 17:27:46 2012 +0100 @@ -0,0 +1,11 @@ +arm-linux-gnueabi-gcc -Wall -g -O2 -static \ + -mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon \ + -o sconv_neon sconv_neon.c -lm + +arm-linux-gnueabi-gcc -Wall -g -O2 -static \ + -mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon \ + -o remap_neon remap_neon.c -lm + +arm-linux-gnueabi-gcc -Wall -g -O2 -static \ + -mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon \ + -o svolume_neon svolume_neon.c -lm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/remap_neon.c Thu Jan 12 17:27:46 2012 +0100 @@ -0,0 +1,261 @@ +/* + * Copyright 2012 Peter Meerwald <p.meerwald@bct-electronic.com> + */ + +#include <stdlib.h> +#include <stdio.h> +#include <stdarg.h> +#include <string.h> +#include <math.h> +#include <sys/time.h> +#include <assert.h> + + +typedef short int16_t; +typedef enum pa_sample_format { + PA_SAMPLE_S16LE, + PA_SAMPLE_FLOAT32LE, +} pa_sample_format_t; +#define PA_SAMPLE_S16NE PA_SAMPLE_S16LE +#define PA_SAMPLE_FLOAT32NE PA_SAMPLE_FLOAT32LE +typedef struct { + pa_sample_format_t *format; +} pa_remap_t; +typedef void (*pa_remap_func_t)(pa_remap_t *m, void *dst, const void *src, unsigned n); +typedef long long unsigned int pa_usec_t; + +#define pa_assert(x) assert(x) +#define pa_assert_not_reached() assert(0) + +#define PA_CLAMP_UNLIKELY(x, low, high) \ + (((x) < (low)) ? (low) : (((x) > (high)) ? (high) : (x))) + +static void pa_log_info(const char *format, ...) { + va_list ap; + char buf[1024]; + va_start(ap, format); + vsprintf(buf, format, ap); + printf("%s\n", buf); + va_end(ap); +} + +#define pa_log_debug pa_log_info + +static pa_usec_t pa_rtclock_now() { + struct timeval tv; + gettimeofday(&tv, NULL); + + return tv.tv_sec * 1000000ULL + tv.tv_usec; +} + +static void remap_mono_to_stereo_c(pa_remap_t *m, void *dst, const void *src, unsigned n) { + unsigned i; + + switch (*m->format) { + case PA_SAMPLE_FLOAT32NE: + { + float *d, *s; + + d = (float *) dst; + s = (float *) src; + + for (i = n >> 2; i; i--) { + d[0] = d[1] = s[0]; + d[2] = d[3] = s[1]; + d[4] = d[5] = s[2]; + d[6] = d[7] = s[3]; + s += 4; + d += 8; + } + for (i = n & 3; i; i--) { + d[0] = d[1] = s[0]; + s++; + d += 2; + } + break; + } + case PA_SAMPLE_S16NE: + { + int16_t *d, *s; + + d = (int16_t *) dst; + s = (int16_t *) src; + + for (i = n >> 2; i; i--) { + d[0] = d[1] = s[0]; + d[2] = d[3] = s[1]; + d[4] = d[5] = s[2]; + d[6] = d[7] = s[3]; + s += 4; + d += 8; + } + for (i = n & 3; i; i--) { + d[0] = d[1] = s[0]; + s++; + d += 2; + } + break; + } + default: + pa_assert_not_reached(); + } +} + +#if defined(__arm__) + +#include "arm_neon.h" + +void remap_mono_to_stereo_neon(pa_remap_t *m, void *dst, const void *src, unsigned n) { + unsigned i; + switch (*m->format) { + case PA_SAMPLE_FLOAT32NE: + { + float *d = (float *) dst, *s = (float *) src; + + for (i = 0; i < n/4; i++) { + float32x4x2_t stereo; + stereo.val[0] = vld1q_f32(s); + stereo.val[1] = stereo.val[0]; + vst2q_f32(d, stereo); + s += 4; + d += 8; + } + + for (i = n & ~3; i < n; i++) { + d[0] = d[1] = s[0]; + s++; + d += 2; + } + break; + } + case PA_SAMPLE_S16NE: + { + int16_t *d = (int16_t *) dst, *s = (int16_t *) src; + + for (i = 0; i < n/8; i++) { + int16x8x2_t stereo; + stereo.val[0] = vld1q_s16(s); + stereo.val[1] = stereo.val[0]; + vst2q_s16(d, stereo); + s += 8; + d += 16; + } + + for (i = n & ~7; i < n; i++) { + d[0] = d[1] = s[0]; + s++; + d += 2; + } + break; + } + default: + pa_assert_not_reached(); + } +} + +#define SAMPLES 1019 +#define TIMES 10000 + +static void run_test_float(void) { + float stereo[2*SAMPLES]; + float stereo_ref[2*SAMPLES]; + float mono[SAMPLES]; + int i; + pa_usec_t start, stop; + pa_remap_func_t func; + pa_sample_format_t sf; + pa_remap_t remap; + + pa_log_debug("checking NEON remap_mono_to_stereo(float, %d)", SAMPLES); + + memset(stereo_ref, 0, sizeof(stereo_ref)); + memset(stereo, 0, sizeof(stereo)); + + for (i = 0; i < SAMPLES; i++) { + mono[i] = rand()/(float) RAND_MAX - 0.5f; + } + + sf = PA_SAMPLE_FLOAT32NE; + remap.format = &sf; + func = (pa_remap_func_t) remap_mono_to_stereo_c; + func(&remap, stereo_ref, mono, SAMPLES); + remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES); + + for (i = 0; i < 2*SAMPLES; i++) { + if (fabsf(stereo[i] - stereo_ref[i]) > 0.00001) { + pa_log_debug("%d: %.3f != %.3f (%.3f)", i, stereo[i], stereo_ref[i], + mono[i/2]); + } + } + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES); + } + stop = pa_rtclock_now(); + pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start)); + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + func(&remap, stereo_ref, mono, SAMPLES); + } + stop = pa_rtclock_now(); + pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start)); +} + +static void run_test_s16(void) { + int16_t stereo[2*SAMPLES]; + int16_t stereo_ref[2*SAMPLES]; + int16_t mono[SAMPLES]; + int i; + pa_usec_t start, stop; + pa_remap_func_t func; + pa_sample_format_t sf; + pa_remap_t remap; + + pa_log_debug("checking NEON remap_mono_to_stereo(s16, %d)", SAMPLES); + + memset(stereo_ref, 0, sizeof(stereo_ref)); + memset(stereo, 0, sizeof(stereo)); + + for (i = 0; i < SAMPLES; i++) { + mono[i] = rand() - RAND_MAX/2; + } + + sf = PA_SAMPLE_S16NE; + remap.format = &sf; + func = (pa_remap_func_t) remap_mono_to_stereo_c; + func(&remap, stereo_ref, mono, SAMPLES); + remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES); + + for (i = 0; i < 2*SAMPLES; i++) { + if (abs(stereo[i] - stereo_ref[i]) > 0) { + pa_log_debug("%d: %d != %d (%d)", i, stereo[i], stereo_ref[i], + mono[i/2]); + } + } + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES); + } + stop = pa_rtclock_now(); + pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start)); + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + func(&remap, stereo_ref, mono, SAMPLES); + } + stop = pa_rtclock_now(); + pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start)); +} + +#endif /* defined(__arm__) */ + +int main() { + + run_test_float(); + run_test_s16(); + + return EXIT_SUCCESS; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sconv_neon.c Thu Jan 12 17:27:46 2012 +0100 @@ -0,0 +1,202 @@ +/* + * Copyright 2012 Peter Meerwald <p.meerwald@bct-electronic.com> + */ + +#include <stdlib.h> +#include <stdio.h> +#include <stdarg.h> +#include <string.h> +#include <math.h> +#include <sys/time.h> +#include <assert.h> + +typedef short int16_t; +typedef void (*pa_convert_func_t)(unsigned n, const void *a, void *b); +typedef long long unsigned int pa_usec_t; + +#define pa_assert(x) assert(x) + +#define PA_CLAMP_UNLIKELY(x, low, high) \ + (((x) < (low)) ? (low) : (((x) > (high)) ? (high) : (x))) + +static void pa_log_info(const char *format, ...) { + va_list ap; + char buf[1024]; + va_start(ap, format); + vsprintf(buf, format, ap); + printf("%s\n", buf); + va_end(ap); +} + +#define pa_log_debug pa_log_info + +static pa_usec_t pa_rtclock_now() { + struct timeval tv; + gettimeofday(&tv, NULL); + + return tv.tv_sec * 1000000ULL + tv.tv_usec; +} + +#if defined(__arm__) + +#include "arm_neon.h" + +void pa_sconv_s16le_from_float32ne(unsigned n, const float *a, int16_t *b) { + pa_assert(a); + pa_assert(b); + + for (; n > 0; n--) { + float v = *(a++); + + v = PA_CLAMP_UNLIKELY(v, -1.0f, 1.0f); + *(b++) = (int16_t) lrintf(v * 0x7FFF); + } +} + +void pa_sconv_s16le_from_f32ne_neon(unsigned n, const float *a, int16_t *b) { + unsigned i; + + const float32x4_t plusone4 = vdupq_n_f32(1.0f); + const float32x4_t minusone4 = vdupq_n_f32(-1.0f); + const float32x4_t half4 = vdupq_n_f32(0.5f); + const float32x4_t scale4 = vdupq_n_f32(32767.0f); + const uint32x4_t mask4 = vdupq_n_u32(0x80000000); + + for (i = 0; i < n/4; i++) { + float32x4_t v4 = ((float32x4_t *)a)[i]; + v4 = vmulq_f32(vmaxq_f32(vminq_f32(v4, plusone4) , minusone4), scale4); + + const float32x4_t w4 = vreinterpretq_f32_u32(vorrq_u32(vandq_u32( + vreinterpretq_u32_f32(v4), mask4), vreinterpretq_u32_f32(half4))); + + ((int16x4_t *)b)[i] = vmovn_s32(vcvtq_s32_f32(vaddq_f32(v4, w4))); + } + + // leftovers + for (i = n & ~3; i < n; i++) { + b[i] = (int16_t) lrintf(PA_CLAMP_UNLIKELY(a[i], -1.0f, 1.0f) * 0x7FFF); + } +} + +void pa_sconv_s16le_to_float32ne(unsigned n, const int16_t *a, float *b) { + pa_assert(a); + pa_assert(b); + + for (; n > 0; n--) + *(b++) = ((float) (*(a++)))/(float) 0x7FFF; +} + +void pa_sconv_s16le_to_f32ne_neon(unsigned n, const int16_t *a, float *b) { + unsigned i; + + const float32x4_t invscale4 = vdupq_n_f32(1.0f / 0x7FFF); + + for (i = 0; i < n/4; i++) { + ((float32x4_t *)b)[i] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(((int16x4_t *)a)[i])), invscale4); + } + + // leftovers + const float invscale = 1.0f / 0x7FFF; + for (i = n & ~3; i < n; i++) { + b[i] = a[i] * invscale; + } +} + +#define SAMPLES 1019 +#define TIMES 300 + +static void run_test_from(void) { + int16_t samples[SAMPLES]; + int16_t samples_ref[SAMPLES]; + float floats[SAMPLES]; + int i; + pa_usec_t start, stop; + pa_convert_func_t func; + + pa_log_debug("checking NEON sconv_s16le_from_float(%d)", SAMPLES); + + memset(samples_ref, 0, sizeof(samples_ref)); + memset(samples, 0, sizeof(samples)); + + for (i = 0; i < SAMPLES; i++) { + floats[i] = 2.1f * (rand()/(float) RAND_MAX - 0.5f); + } + + func = (pa_convert_func_t) pa_sconv_s16le_from_float32ne; + func(SAMPLES, floats, samples_ref); + pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats, samples); + + for (i = 0; i < SAMPLES; i++) { + if (abs(samples[i] - samples_ref[i]) > 0) { + pa_log_debug("%d: %d != %d (%f)", i, samples[i], samples_ref[i], + floats[i]); + } + } + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats, samples); + } + stop = pa_rtclock_now(); + pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start)); + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + func(SAMPLES, floats, samples_ref); + } + stop = pa_rtclock_now(); + pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start)); +} + +static void run_test_to(void) { + int16_t samples[SAMPLES]; + float floats[SAMPLES]; + float floats_ref[SAMPLES]; + int i; + pa_usec_t start, stop; + pa_convert_func_t func; + + pa_log_debug("checking NEON sconv_s16le_to_float(%d)", SAMPLES); + + memset(floats_ref, 0, sizeof(floats_ref)); + memset(floats, 0, sizeof(float)); + + for (i = 0; i < SAMPLES; i++) { + samples[i] = rand() - RAND_MAX/2; + } + + func = (pa_convert_func_t) pa_sconv_s16le_to_float32ne; + func(SAMPLES, samples, floats_ref); + pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples, floats); + + for (i = 0; i < SAMPLES; i++) { + if (fabsf(floats[i] - floats_ref[i]) > 0.00001) { + pa_log_debug("%d: %.8f != %.8f (%d)", i, floats[i], floats_ref[i], + samples[i]); + } + } + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples, floats); + } + stop = pa_rtclock_now(); + pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start)); + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + func(SAMPLES, samples, floats_ref); + } + stop = pa_rtclock_now(); + pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start)); +} + +#endif /* defined(__arm__) */ + +int main() { + + run_test_from(); + run_test_to(); + + return EXIT_SUCCESS; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/svolume_neon.c Thu Jan 12 17:27:46 2012 +0100 @@ -0,0 +1,450 @@ +/* + * Copyright 2012 Peter Meerwald <p.meerwald@bct-electronic.com> + */ + +#include <stdlib.h> +#include <stdio.h> +#include <stdarg.h> +#include <string.h> +#include <math.h> +#include <sys/time.h> +#include <assert.h> + + +typedef short int16_t; +typedef unsigned int uint32_t; +typedef enum pa_sample_format { + PA_SAMPLE_S16LE, + PA_SAMPLE_FLOAT32LE, +} pa_sample_format_t; +#define PA_SAMPLE_S16NE PA_SAMPLE_S16LE +#define PA_SAMPLE_FLOAT32NE PA_SAMPLE_FLOAT32LE +typedef struct { + pa_sample_format_t *format; +} pa_remap_t; +typedef void (*pa_remap_func_t)(pa_remap_t *m, void *dst, const void *src, unsigned n); +typedef long long unsigned int pa_usec_t; + +#define pa_assert(x) assert(x) +#define pa_assert_not_reached() assert(0) + +#define PA_MAX(a, b) ((a) > (b) ? (a) : (b)) + +typedef uint32_t pa_volume_t; +#define PA_VOLUME_MUTED ((pa_volume_t) 0U) +#define PA_VOLUME_MAX ((pa_volume_t) UINT32_MAX/2) + +#define PA_UNLIKELY(x) (x) +#define PA_CLAMP_UNLIKELY(x, low, high) (PA_UNLIKELY((x) > (high)) ? (high) : (PA_UNLIKELY((x) < (low)) ? (low) : (x))) +#define PA_CLAMP_VOLUME(v) (PA_CLAMP_UNLIKELY((v), PA_VOLUME_MUTED, PA_VOLUME_MAX)) + +static void pa_log_info(const char *format, ...) { + va_list ap; + char buf[1024]; + va_start(ap, format); + vsprintf(buf, format, ap); + printf("%s\n", buf); + va_end(ap); +} + +#define pa_log_debug pa_log_info + +static pa_usec_t pa_rtclock_now() { + struct timeval tv; + gettimeofday(&tv, NULL); + + return tv.tv_sec * 1000000ULL + tv.tv_usec; +} + +void pa_volume_s16ne_c(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) { + unsigned channel; + + length /= sizeof(int16_t); + + for (channel = 0; length; length--) { + int32_t t, hi, lo; + + /* Multiplying the 32bit volume factor with the 16bit + * sample might result in an 48bit value. We want to + * do without 64 bit integers and hence do the + * multiplication independently for the HI and LO part + * of the volume. */ + + hi = volumes[channel] >> 16; + lo = volumes[channel] & 0xFFFF; + + t = (int32_t)(*samples); + t = ((t * lo) >> 16) + (t * hi); + t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); + *samples++ = (int16_t) t; + + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } +} + +void pa_volume_float32ne_c(float *samples, float *volumes, unsigned channels, unsigned length) { + unsigned channel; + + length /= sizeof(float); + + for (channel = 0; length; length--) { + *samples++ *= volumes[channel]; + + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } +} + +/* +void pa_volume_s16ne_orc(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) +{ + if (channels == 2) { + int64_t v = (int64_t)volumes[1] << 32 | volumes[0]; + pa_volume_s16ne_orc_2ch (samples, v, ((length / (sizeof(int16_t))) / 2)); + } else if (channels == 1) + pa_volume_s16ne_orc_1ch (samples, volumes[0], length / (sizeof(int16_t))); +} +*/ + +#if defined(__arm__) + +#include "arm_neon.h" + +#define MOD_INC() \ + " subs r0, r6, %2 \n\t" \ + " itt cs \n\t" \ + " addcs r0, %1 \n\t" \ + " movcs r6, r0 \n\t" + +static void pa_volume_s16ne_arm(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) { + int32_t *ve; + + /* Channels must be at least 4, and always a multiple of the original number. + * This is also the max amount we overread the volume array, which should + * have enough padding. */ + channels = channels == 3 ? 6 : PA_MAX (4U, channels); + ve = volumes + channels; + + __asm__ __volatile__ ( + " mov r6, %1 \n\t" + " mov %3, %3, LSR #1 \n\t" /* length /= sizeof (int16_t) */ + " tst %3, #1 \n\t" /* check for odd samples */ + " beq 2f \n\t" + + "1: \n\t" + " ldr r0, [r6], #4 \n\t" /* odd samples volumes */ + " ldrh r2, [%0] \n\t" + + " smulwb r0, r0, r2 \n\t" + " ssat r0, #16, r0 \n\t" + + " strh r0, [%0], #2 \n\t" + + MOD_INC() + + "2: \n\t" + " mov %3, %3, LSR #1 \n\t" + " tst %3, #1 \n\t" /* check for odd samples */ + " beq 4f \n\t" + + "3: \n\t" + " ldrd r2, [r6], #8 \n\t" /* 2 samples at a time */ + " ldr r0, [%0] \n\t" + + " smulwt r2, r2, r0 \n\t" + " smulwb r3, r3, r0 \n\t" + + " ssat r2, #16, r2 \n\t" + " ssat r3, #16, r3 \n\t" + + " pkhbt r0, r3, r2, LSL #16 \n\t" + " str r0, [%0], #4 \n\t" + + MOD_INC() + + "4: \n\t" + " movs %3, %3, LSR #1 \n\t" + " beq 6f \n\t" + + "5: \n\t" + " ldrd r2, [r6], #8 \n\t" /* 4 samples at a time */ + " ldrd r4, [r6], #8 \n\t" + " ldrd r0, [%0] \n\t" + + " smulwt r2, r2, r0 \n\t" + " smulwb r3, r3, r0 \n\t" + " smulwt r4, r4, r1 \n\t" + " smulwb r5, r5, r1 \n\t" + + " ssat r2, #16, r2 \n\t" + " ssat r3, #16, r3 \n\t" + " ssat r4, #16, r4 \n\t" + " ssat r5, #16, r5 \n\t" + + " pkhbt r0, r3, r2, LSL #16 \n\t" + " pkhbt r1, r5, r4, LSL #16 \n\t" + " strd r0, [%0], #8 \n\t" + + MOD_INC() + + " subs %3, %3, #1 \n\t" + " bne 5b \n\t" + "6: \n\t" + + : "+r" (samples), "+r" (volumes), "+r" (ve), "+r" (length) + : + : "r6", "r5", "r4", "r3", "r2", "r1", "r0", "cc" + ); +} + +static inline void vol_s16ne_neon(int32x4_t vol4, int16_t *samples, unsigned length) { + unsigned i; + int16x4_t hi = vshrn_n_s32(vol4, 16); + int32x4_t lo = vandq_s32(vol4, vdupq_n_s32(0xFFFF)); + + for (i = 0; i < length/8; i++) { + int16x4_t v1 = ((int16x4_t *) samples)[2*i]; + int16x4_t v2 = ((int16x4_t *) samples)[2*i+1]; + + int32x4_t t1 = vmull_s16(v1, hi); + int32x4_t t2 = vmull_s16(v2, hi); + + int16x4_t r1 = vqmovn_s32(vsraq_n_s32(t1, vmulq_s32(vmovl_s16(v1), lo), 16)); + int16x4_t r2 = vqmovn_s32(vsraq_n_s32(t2, vmulq_s32(vmovl_s16(v2), lo), 16)); + + ((int16x8_t *)samples)[i] = vcombine_s16(r1, r2); + } +} + +void pa_volume_s16ne_neon(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) { + unsigned channel = 0, i; + int32x4_t vol4; + + length /= sizeof(int16_t); + + switch (channels) { + case 1: + vol4 = vdupq_n_s32(*volumes); + vol_s16ne_neon(vol4, samples, length); + + for (i = length & ~7; i < length; i++) { + int32_t t = samples[i]; + t = ((t * (*volumes & 0xFFFF)) >> 16) + (t * (*volumes >> 16)); + samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); + } + break; + case 2: + vol4 = vcombine_s32(*(int32x2_t *)volumes, *(int32x2_t *)volumes); + vol_s16ne_neon(vol4, samples, length); + + for (i = length & ~7; i < length; i++) { + int32_t t = samples[i]; + int32_t vol = volumes[(channel++) & 1]; + t = ((t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16)); + samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); + } + break; + case 4: + vol4 = *(int32x4_t *)volumes; + vol_s16ne_neon(vol4, samples, length); + + for (i = length & ~7; i < length; i++) { + int32_t t = samples[i]; + int32_t vol = volumes[(channel++) & 3]; + t = ((t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16)); + samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); + } + break; + default: + for (; length; length--) { + int32_t t, hi, lo; + + /* Multiplying the 32bit volume factor with the 16bit + * sample might result in an 48bit value. We want to + * do without 64 bit integers and hence do the + * multiplication independently for the HI and LO part + * of the volume. */ + + hi = volumes[channel] >> 16; + lo = volumes[channel] & 0xFFFF; + + t = (int32_t)(*samples); + t = ((t * lo) >> 16) + (t * hi); + t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); + *samples++ = (int16_t) t; + + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } + break; + } +} + +void pa_volume_float32ne_neon(float *samples, float *volumes, unsigned channels, unsigned length) { + unsigned channel = 0, i; + float32x4_t vol4; + + length /= sizeof(float); + + switch (channels) { + case 1: + vol4 = vdupq_n_f32(*volumes); + for (i = 0; i < length/4; i++) { + ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4); + } + + for (i = length & ~3; i < length; i++) { + samples[i] *= volumes[0]; + } + break; + case 2: + vol4 = vcombine_f32(*(float32x2_t *)volumes, *(float32x2_t *)volumes); + for (i = 0; i < length/4; i++) { + ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4); + } + + for (i = length & ~3; i < length; i++) { + samples[i] *= volumes[channel]; + + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } + break; + case 4: + vol4 = *(float32x4_t *)volumes; + for (i = 0; i < length/4; i++) { + ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4); + } + + for (i = length & ~3; i < length; i++) { + samples[i] *= volumes[channel++]; + } + break; + default: + for (; length; length--) { + *samples++ *= volumes[channel]; + + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } + break; + } +} + +#define SAMPLES 1019 +#define TIMES 1000 +#define CHANNELS 4 +#define PADDING 16 + +static void run_test_float(void) { + float floats[SAMPLES]; + float floats_ref[SAMPLES]; + float floats_orig[SAMPLES]; + float volumes[CHANNELS]; + unsigned i; + pa_usec_t start, stop; + + pa_log_debug("checking NEON volume_float32ne(%d)", SAMPLES); + + for (i = 0; i < SAMPLES; i++) { + floats_orig[i] = rand()/(float) RAND_MAX - 0.5f; + } + memcpy(floats_ref, floats_orig, sizeof(floats_orig)); + memcpy(floats, floats_orig, sizeof(floats_orig)); + + for (i = 0; i < CHANNELS; i++) + volumes[i] = 0.5f * rand() / (float) RAND_MAX; + + pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats)); + pa_volume_float32ne_c(floats_ref, volumes, CHANNELS, sizeof(floats_ref)); + + for (i = 0; i < SAMPLES; i++) { + if (fabsf(floats[i] - floats_ref[i]) > 0.00001) { + pa_log_debug("%d: %.3f != %.3f (%.3f)", i, floats[i], floats_ref[i], + floats_orig[i]); + } + } + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + memcpy(floats, floats_orig, sizeof(floats_orig)); + pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats)); + } + stop = pa_rtclock_now(); + pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start)); + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + memcpy(floats_ref, floats_orig, sizeof(floats_orig)); + pa_volume_float32ne_c(floats_ref, volumes, CHANNELS, sizeof(floats_ref)); + } + stop = pa_rtclock_now(); + pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start)); +} + +static void run_test_s16(void) { + int16_t samples[SAMPLES]; + int16_t samples_ref[SAMPLES]; + int16_t samples_orig[SAMPLES]; + int32_t volumes[CHANNELS + PADDING]; + unsigned i, padding; + pa_usec_t start, stop; + + pa_log_debug("checking NEON volume_s16ne(%d)", SAMPLES); + + for (i = 0; i < SAMPLES; i++) { + samples_orig[i] = rand() - RAND_MAX/2; + } + memcpy(samples_ref, samples_orig, sizeof(samples_orig)); + memcpy(samples, samples_orig, sizeof(samples_orig)); + + for (i = 0; i < CHANNELS; i++) + volumes[i] = PA_CLAMP_VOLUME(rand() >> 15); + for (padding = 0; padding < PADDING; padding++, i++) + volumes[i] = volumes[padding]; + + pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples)); + pa_volume_s16ne_c(samples_ref, volumes, CHANNELS, sizeof(samples_ref)); + + for (i = 0; i < SAMPLES; i++) { + if (abs(samples[i] - samples_ref[i]) > 0) { + pa_log_debug("%d: %d != %d (%d)", i, samples[i], samples_ref[i], + samples_orig[i]); + } + } + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + memcpy(samples, samples_orig, sizeof(samples_orig)); + pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples)); + } + stop = pa_rtclock_now(); + pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start)); + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + memcpy(samples, samples_orig, sizeof(samples_orig)); + pa_volume_s16ne_arm(samples, volumes, CHANNELS, sizeof(samples)); + } + stop = pa_rtclock_now(); + pa_log_info("ARM: %llu usec.", (long long unsigned int)(stop - start)); + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + memcpy(samples_ref, samples_orig, sizeof(samples_orig)); + pa_volume_s16ne_c(samples_ref, volumes, CHANNELS, sizeof(samples_ref)); + } + stop = pa_rtclock_now(); + pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start)); +} + +#endif /* defined(__arm__) */ + +int main() { + + run_test_float(); + run_test_s16(); + + return EXIT_SUCCESS; +}