# HG changeset patch
# User Peter Meerwald
# Date 1326385666 -3600
# Node ID e0040ee59c3c88b43a3ca2826188a2db74b15b51
import
diff -r 000000000000 -r e0040ee59c3c .hgignore
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/.hgignore Thu Jan 12 17:27:46 2012 +0100
@@ -0,0 +1,4 @@
+syntax: glob
+sconv_neon
+svolume_neon
+remap_neon
diff -r 000000000000 -r e0040ee59c3c compile.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/compile.sh Thu Jan 12 17:27:46 2012 +0100
@@ -0,0 +1,11 @@
+arm-linux-gnueabi-gcc -Wall -g -O2 -static \
+ -mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon \
+ -o sconv_neon sconv_neon.c -lm
+
+arm-linux-gnueabi-gcc -Wall -g -O2 -static \
+ -mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon \
+ -o remap_neon remap_neon.c -lm
+
+arm-linux-gnueabi-gcc -Wall -g -O2 -static \
+ -mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon \
+ -o svolume_neon svolume_neon.c -lm
diff -r 000000000000 -r e0040ee59c3c remap_neon.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/remap_neon.c Thu Jan 12 17:27:46 2012 +0100
@@ -0,0 +1,261 @@
+/*
+ * Copyright 2012 Peter Meerwald
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+typedef short int16_t;
+typedef enum pa_sample_format {
+ PA_SAMPLE_S16LE,
+ PA_SAMPLE_FLOAT32LE,
+} pa_sample_format_t;
+#define PA_SAMPLE_S16NE PA_SAMPLE_S16LE
+#define PA_SAMPLE_FLOAT32NE PA_SAMPLE_FLOAT32LE
+typedef struct {
+ pa_sample_format_t *format;
+} pa_remap_t;
+typedef void (*pa_remap_func_t)(pa_remap_t *m, void *dst, const void *src, unsigned n);
+typedef long long unsigned int pa_usec_t;
+
+#define pa_assert(x) assert(x)
+#define pa_assert_not_reached() assert(0)
+
+#define PA_CLAMP_UNLIKELY(x, low, high) \
+ (((x) < (low)) ? (low) : (((x) > (high)) ? (high) : (x)))
+
+static void pa_log_info(const char *format, ...) {
+ va_list ap;
+ char buf[1024];
+ va_start(ap, format);
+ vsprintf(buf, format, ap);
+ printf("%s\n", buf);
+ va_end(ap);
+}
+
+#define pa_log_debug pa_log_info
+
+static pa_usec_t pa_rtclock_now() {
+ struct timeval tv;
+ gettimeofday(&tv, NULL);
+
+ return tv.tv_sec * 1000000ULL + tv.tv_usec;
+}
+
+static void remap_mono_to_stereo_c(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+ unsigned i;
+
+ switch (*m->format) {
+ case PA_SAMPLE_FLOAT32NE:
+ {
+ float *d, *s;
+
+ d = (float *) dst;
+ s = (float *) src;
+
+ for (i = n >> 2; i; i--) {
+ d[0] = d[1] = s[0];
+ d[2] = d[3] = s[1];
+ d[4] = d[5] = s[2];
+ d[6] = d[7] = s[3];
+ s += 4;
+ d += 8;
+ }
+ for (i = n & 3; i; i--) {
+ d[0] = d[1] = s[0];
+ s++;
+ d += 2;
+ }
+ break;
+ }
+ case PA_SAMPLE_S16NE:
+ {
+ int16_t *d, *s;
+
+ d = (int16_t *) dst;
+ s = (int16_t *) src;
+
+ for (i = n >> 2; i; i--) {
+ d[0] = d[1] = s[0];
+ d[2] = d[3] = s[1];
+ d[4] = d[5] = s[2];
+ d[6] = d[7] = s[3];
+ s += 4;
+ d += 8;
+ }
+ for (i = n & 3; i; i--) {
+ d[0] = d[1] = s[0];
+ s++;
+ d += 2;
+ }
+ break;
+ }
+ default:
+ pa_assert_not_reached();
+ }
+}
+
+#if defined(__arm__)
+
+#include "arm_neon.h"
+
+void remap_mono_to_stereo_neon(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+ unsigned i;
+ switch (*m->format) {
+ case PA_SAMPLE_FLOAT32NE:
+ {
+ float *d = (float *) dst, *s = (float *) src;
+
+ for (i = 0; i < n/4; i++) {
+ float32x4x2_t stereo;
+ stereo.val[0] = vld1q_f32(s);
+ stereo.val[1] = stereo.val[0];
+ vst2q_f32(d, stereo);
+ s += 4;
+ d += 8;
+ }
+
+ for (i = n & ~3; i < n; i++) {
+ d[0] = d[1] = s[0];
+ s++;
+ d += 2;
+ }
+ break;
+ }
+ case PA_SAMPLE_S16NE:
+ {
+ int16_t *d = (int16_t *) dst, *s = (int16_t *) src;
+
+ for (i = 0; i < n/8; i++) {
+ int16x8x2_t stereo;
+ stereo.val[0] = vld1q_s16(s);
+ stereo.val[1] = stereo.val[0];
+ vst2q_s16(d, stereo);
+ s += 8;
+ d += 16;
+ }
+
+ for (i = n & ~7; i < n; i++) {
+ d[0] = d[1] = s[0];
+ s++;
+ d += 2;
+ }
+ break;
+ }
+ default:
+ pa_assert_not_reached();
+ }
+}
+
+#define SAMPLES 1019
+#define TIMES 10000
+
+static void run_test_float(void) {
+ float stereo[2*SAMPLES];
+ float stereo_ref[2*SAMPLES];
+ float mono[SAMPLES];
+ int i;
+ pa_usec_t start, stop;
+ pa_remap_func_t func;
+ pa_sample_format_t sf;
+ pa_remap_t remap;
+
+ pa_log_debug("checking NEON remap_mono_to_stereo(float, %d)", SAMPLES);
+
+ memset(stereo_ref, 0, sizeof(stereo_ref));
+ memset(stereo, 0, sizeof(stereo));
+
+ for (i = 0; i < SAMPLES; i++) {
+ mono[i] = rand()/(float) RAND_MAX - 0.5f;
+ }
+
+ sf = PA_SAMPLE_FLOAT32NE;
+ remap.format = &sf;
+ func = (pa_remap_func_t) remap_mono_to_stereo_c;
+ func(&remap, stereo_ref, mono, SAMPLES);
+ remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES);
+
+ for (i = 0; i < 2*SAMPLES; i++) {
+ if (fabsf(stereo[i] - stereo_ref[i]) > 0.00001) {
+ pa_log_debug("%d: %.3f != %.3f (%.3f)", i, stereo[i], stereo_ref[i],
+ mono[i/2]);
+ }
+ }
+
+ start = pa_rtclock_now();
+ for (i = 0; i < TIMES; i++) {
+ remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES);
+ }
+ stop = pa_rtclock_now();
+ pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+ start = pa_rtclock_now();
+ for (i = 0; i < TIMES; i++) {
+ func(&remap, stereo_ref, mono, SAMPLES);
+ }
+ stop = pa_rtclock_now();
+ pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+
+static void run_test_s16(void) {
+ int16_t stereo[2*SAMPLES];
+ int16_t stereo_ref[2*SAMPLES];
+ int16_t mono[SAMPLES];
+ int i;
+ pa_usec_t start, stop;
+ pa_remap_func_t func;
+ pa_sample_format_t sf;
+ pa_remap_t remap;
+
+ pa_log_debug("checking NEON remap_mono_to_stereo(s16, %d)", SAMPLES);
+
+ memset(stereo_ref, 0, sizeof(stereo_ref));
+ memset(stereo, 0, sizeof(stereo));
+
+ for (i = 0; i < SAMPLES; i++) {
+ mono[i] = rand() - RAND_MAX/2;
+ }
+
+ sf = PA_SAMPLE_S16NE;
+ remap.format = &sf;
+ func = (pa_remap_func_t) remap_mono_to_stereo_c;
+ func(&remap, stereo_ref, mono, SAMPLES);
+ remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES);
+
+ for (i = 0; i < 2*SAMPLES; i++) {
+ if (abs(stereo[i] - stereo_ref[i]) > 0) {
+ pa_log_debug("%d: %d != %d (%d)", i, stereo[i], stereo_ref[i],
+ mono[i/2]);
+ }
+ }
+
+ start = pa_rtclock_now();
+ for (i = 0; i < TIMES; i++) {
+ remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES);
+ }
+ stop = pa_rtclock_now();
+ pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+ start = pa_rtclock_now();
+ for (i = 0; i < TIMES; i++) {
+ func(&remap, stereo_ref, mono, SAMPLES);
+ }
+ stop = pa_rtclock_now();
+ pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+
+#endif /* defined(__arm__) */
+
+int main() {
+
+ run_test_float();
+ run_test_s16();
+
+ return EXIT_SUCCESS;
+}
diff -r 000000000000 -r e0040ee59c3c sconv_neon.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sconv_neon.c Thu Jan 12 17:27:46 2012 +0100
@@ -0,0 +1,202 @@
+/*
+ * Copyright 2012 Peter Meerwald
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+typedef short int16_t;
+typedef void (*pa_convert_func_t)(unsigned n, const void *a, void *b);
+typedef long long unsigned int pa_usec_t;
+
+#define pa_assert(x) assert(x)
+
+#define PA_CLAMP_UNLIKELY(x, low, high) \
+ (((x) < (low)) ? (low) : (((x) > (high)) ? (high) : (x)))
+
+static void pa_log_info(const char *format, ...) {
+ va_list ap;
+ char buf[1024];
+ va_start(ap, format);
+ vsprintf(buf, format, ap);
+ printf("%s\n", buf);
+ va_end(ap);
+}
+
+#define pa_log_debug pa_log_info
+
+static pa_usec_t pa_rtclock_now() {
+ struct timeval tv;
+ gettimeofday(&tv, NULL);
+
+ return tv.tv_sec * 1000000ULL + tv.tv_usec;
+}
+
+#if defined(__arm__)
+
+#include "arm_neon.h"
+
+void pa_sconv_s16le_from_float32ne(unsigned n, const float *a, int16_t *b) {
+ pa_assert(a);
+ pa_assert(b);
+
+ for (; n > 0; n--) {
+ float v = *(a++);
+
+ v = PA_CLAMP_UNLIKELY(v, -1.0f, 1.0f);
+ *(b++) = (int16_t) lrintf(v * 0x7FFF);
+ }
+}
+
+void pa_sconv_s16le_from_f32ne_neon(unsigned n, const float *a, int16_t *b) {
+ unsigned i;
+
+ const float32x4_t plusone4 = vdupq_n_f32(1.0f);
+ const float32x4_t minusone4 = vdupq_n_f32(-1.0f);
+ const float32x4_t half4 = vdupq_n_f32(0.5f);
+ const float32x4_t scale4 = vdupq_n_f32(32767.0f);
+ const uint32x4_t mask4 = vdupq_n_u32(0x80000000);
+
+ for (i = 0; i < n/4; i++) {
+ float32x4_t v4 = ((float32x4_t *)a)[i];
+ v4 = vmulq_f32(vmaxq_f32(vminq_f32(v4, plusone4) , minusone4), scale4);
+
+ const float32x4_t w4 = vreinterpretq_f32_u32(vorrq_u32(vandq_u32(
+ vreinterpretq_u32_f32(v4), mask4), vreinterpretq_u32_f32(half4)));
+
+ ((int16x4_t *)b)[i] = vmovn_s32(vcvtq_s32_f32(vaddq_f32(v4, w4)));
+ }
+
+ // leftovers
+ for (i = n & ~3; i < n; i++) {
+ b[i] = (int16_t) lrintf(PA_CLAMP_UNLIKELY(a[i], -1.0f, 1.0f) * 0x7FFF);
+ }
+}
+
+void pa_sconv_s16le_to_float32ne(unsigned n, const int16_t *a, float *b) {
+ pa_assert(a);
+ pa_assert(b);
+
+ for (; n > 0; n--)
+ *(b++) = ((float) (*(a++)))/(float) 0x7FFF;
+}
+
+void pa_sconv_s16le_to_f32ne_neon(unsigned n, const int16_t *a, float *b) {
+ unsigned i;
+
+ const float32x4_t invscale4 = vdupq_n_f32(1.0f / 0x7FFF);
+
+ for (i = 0; i < n/4; i++) {
+ ((float32x4_t *)b)[i] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(((int16x4_t *)a)[i])), invscale4);
+ }
+
+ // leftovers
+ const float invscale = 1.0f / 0x7FFF;
+ for (i = n & ~3; i < n; i++) {
+ b[i] = a[i] * invscale;
+ }
+}
+
+#define SAMPLES 1019
+#define TIMES 300
+
+static void run_test_from(void) {
+ int16_t samples[SAMPLES];
+ int16_t samples_ref[SAMPLES];
+ float floats[SAMPLES];
+ int i;
+ pa_usec_t start, stop;
+ pa_convert_func_t func;
+
+ pa_log_debug("checking NEON sconv_s16le_from_float(%d)", SAMPLES);
+
+ memset(samples_ref, 0, sizeof(samples_ref));
+ memset(samples, 0, sizeof(samples));
+
+ for (i = 0; i < SAMPLES; i++) {
+ floats[i] = 2.1f * (rand()/(float) RAND_MAX - 0.5f);
+ }
+
+ func = (pa_convert_func_t) pa_sconv_s16le_from_float32ne;
+ func(SAMPLES, floats, samples_ref);
+ pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats, samples);
+
+ for (i = 0; i < SAMPLES; i++) {
+ if (abs(samples[i] - samples_ref[i]) > 0) {
+ pa_log_debug("%d: %d != %d (%f)", i, samples[i], samples_ref[i],
+ floats[i]);
+ }
+ }
+
+ start = pa_rtclock_now();
+ for (i = 0; i < TIMES; i++) {
+ pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats, samples);
+ }
+ stop = pa_rtclock_now();
+ pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+ start = pa_rtclock_now();
+ for (i = 0; i < TIMES; i++) {
+ func(SAMPLES, floats, samples_ref);
+ }
+ stop = pa_rtclock_now();
+ pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+
+static void run_test_to(void) {
+ int16_t samples[SAMPLES];
+ float floats[SAMPLES];
+ float floats_ref[SAMPLES];
+ int i;
+ pa_usec_t start, stop;
+ pa_convert_func_t func;
+
+ pa_log_debug("checking NEON sconv_s16le_to_float(%d)", SAMPLES);
+
+ memset(floats_ref, 0, sizeof(floats_ref));
+ memset(floats, 0, sizeof(float));
+
+ for (i = 0; i < SAMPLES; i++) {
+ samples[i] = rand() - RAND_MAX/2;
+ }
+
+ func = (pa_convert_func_t) pa_sconv_s16le_to_float32ne;
+ func(SAMPLES, samples, floats_ref);
+ pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples, floats);
+
+ for (i = 0; i < SAMPLES; i++) {
+ if (fabsf(floats[i] - floats_ref[i]) > 0.00001) {
+ pa_log_debug("%d: %.8f != %.8f (%d)", i, floats[i], floats_ref[i],
+ samples[i]);
+ }
+ }
+
+ start = pa_rtclock_now();
+ for (i = 0; i < TIMES; i++) {
+ pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples, floats);
+ }
+ stop = pa_rtclock_now();
+ pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+ start = pa_rtclock_now();
+ for (i = 0; i < TIMES; i++) {
+ func(SAMPLES, samples, floats_ref);
+ }
+ stop = pa_rtclock_now();
+ pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+
+#endif /* defined(__arm__) */
+
+int main() {
+
+ run_test_from();
+ run_test_to();
+
+ return EXIT_SUCCESS;
+}
diff -r 000000000000 -r e0040ee59c3c svolume_neon.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/svolume_neon.c Thu Jan 12 17:27:46 2012 +0100
@@ -0,0 +1,450 @@
+/*
+ * Copyright 2012 Peter Meerwald
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+typedef short int16_t;
+typedef unsigned int uint32_t;
+typedef enum pa_sample_format {
+ PA_SAMPLE_S16LE,
+ PA_SAMPLE_FLOAT32LE,
+} pa_sample_format_t;
+#define PA_SAMPLE_S16NE PA_SAMPLE_S16LE
+#define PA_SAMPLE_FLOAT32NE PA_SAMPLE_FLOAT32LE
+typedef struct {
+ pa_sample_format_t *format;
+} pa_remap_t;
+typedef void (*pa_remap_func_t)(pa_remap_t *m, void *dst, const void *src, unsigned n);
+typedef long long unsigned int pa_usec_t;
+
+#define pa_assert(x) assert(x)
+#define pa_assert_not_reached() assert(0)
+
+#define PA_MAX(a, b) ((a) > (b) ? (a) : (b))
+
+typedef uint32_t pa_volume_t;
+#define PA_VOLUME_MUTED ((pa_volume_t) 0U)
+#define PA_VOLUME_MAX ((pa_volume_t) UINT32_MAX/2)
+
+#define PA_UNLIKELY(x) (x)
+#define PA_CLAMP_UNLIKELY(x, low, high) (PA_UNLIKELY((x) > (high)) ? (high) : (PA_UNLIKELY((x) < (low)) ? (low) : (x)))
+#define PA_CLAMP_VOLUME(v) (PA_CLAMP_UNLIKELY((v), PA_VOLUME_MUTED, PA_VOLUME_MAX))
+
+static void pa_log_info(const char *format, ...) {
+ va_list ap;
+ char buf[1024];
+ va_start(ap, format);
+ vsprintf(buf, format, ap);
+ printf("%s\n", buf);
+ va_end(ap);
+}
+
+#define pa_log_debug pa_log_info
+
+static pa_usec_t pa_rtclock_now() {
+ struct timeval tv;
+ gettimeofday(&tv, NULL);
+
+ return tv.tv_sec * 1000000ULL + tv.tv_usec;
+}
+
+void pa_volume_s16ne_c(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
+ unsigned channel;
+
+ length /= sizeof(int16_t);
+
+ for (channel = 0; length; length--) {
+ int32_t t, hi, lo;
+
+ /* Multiplying the 32bit volume factor with the 16bit
+ * sample might result in an 48bit value. We want to
+ * do without 64 bit integers and hence do the
+ * multiplication independently for the HI and LO part
+ * of the volume. */
+
+ hi = volumes[channel] >> 16;
+ lo = volumes[channel] & 0xFFFF;
+
+ t = (int32_t)(*samples);
+ t = ((t * lo) >> 16) + (t * hi);
+ t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
+ *samples++ = (int16_t) t;
+
+ if (PA_UNLIKELY(++channel >= channels))
+ channel = 0;
+ }
+}
+
+void pa_volume_float32ne_c(float *samples, float *volumes, unsigned channels, unsigned length) {
+ unsigned channel;
+
+ length /= sizeof(float);
+
+ for (channel = 0; length; length--) {
+ *samples++ *= volumes[channel];
+
+ if (PA_UNLIKELY(++channel >= channels))
+ channel = 0;
+ }
+}
+
+/*
+void pa_volume_s16ne_orc(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
+{
+ if (channels == 2) {
+ int64_t v = (int64_t)volumes[1] << 32 | volumes[0];
+ pa_volume_s16ne_orc_2ch (samples, v, ((length / (sizeof(int16_t))) / 2));
+ } else if (channels == 1)
+ pa_volume_s16ne_orc_1ch (samples, volumes[0], length / (sizeof(int16_t)));
+}
+*/
+
+#if defined(__arm__)
+
+#include "arm_neon.h"
+
+#define MOD_INC() \
+ " subs r0, r6, %2 \n\t" \
+ " itt cs \n\t" \
+ " addcs r0, %1 \n\t" \
+ " movcs r6, r0 \n\t"
+
+static void pa_volume_s16ne_arm(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
+ int32_t *ve;
+
+ /* Channels must be at least 4, and always a multiple of the original number.
+ * This is also the max amount we overread the volume array, which should
+ * have enough padding. */
+ channels = channels == 3 ? 6 : PA_MAX (4U, channels);
+ ve = volumes + channels;
+
+ __asm__ __volatile__ (
+ " mov r6, %1 \n\t"
+ " mov %3, %3, LSR #1 \n\t" /* length /= sizeof (int16_t) */
+ " tst %3, #1 \n\t" /* check for odd samples */
+ " beq 2f \n\t"
+
+ "1: \n\t"
+ " ldr r0, [r6], #4 \n\t" /* odd samples volumes */
+ " ldrh r2, [%0] \n\t"
+
+ " smulwb r0, r0, r2 \n\t"
+ " ssat r0, #16, r0 \n\t"
+
+ " strh r0, [%0], #2 \n\t"
+
+ MOD_INC()
+
+ "2: \n\t"
+ " mov %3, %3, LSR #1 \n\t"
+ " tst %3, #1 \n\t" /* check for odd samples */
+ " beq 4f \n\t"
+
+ "3: \n\t"
+ " ldrd r2, [r6], #8 \n\t" /* 2 samples at a time */
+ " ldr r0, [%0] \n\t"
+
+ " smulwt r2, r2, r0 \n\t"
+ " smulwb r3, r3, r0 \n\t"
+
+ " ssat r2, #16, r2 \n\t"
+ " ssat r3, #16, r3 \n\t"
+
+ " pkhbt r0, r3, r2, LSL #16 \n\t"
+ " str r0, [%0], #4 \n\t"
+
+ MOD_INC()
+
+ "4: \n\t"
+ " movs %3, %3, LSR #1 \n\t"
+ " beq 6f \n\t"
+
+ "5: \n\t"
+ " ldrd r2, [r6], #8 \n\t" /* 4 samples at a time */
+ " ldrd r4, [r6], #8 \n\t"
+ " ldrd r0, [%0] \n\t"
+
+ " smulwt r2, r2, r0 \n\t"
+ " smulwb r3, r3, r0 \n\t"
+ " smulwt r4, r4, r1 \n\t"
+ " smulwb r5, r5, r1 \n\t"
+
+ " ssat r2, #16, r2 \n\t"
+ " ssat r3, #16, r3 \n\t"
+ " ssat r4, #16, r4 \n\t"
+ " ssat r5, #16, r5 \n\t"
+
+ " pkhbt r0, r3, r2, LSL #16 \n\t"
+ " pkhbt r1, r5, r4, LSL #16 \n\t"
+ " strd r0, [%0], #8 \n\t"
+
+ MOD_INC()
+
+ " subs %3, %3, #1 \n\t"
+ " bne 5b \n\t"
+ "6: \n\t"
+
+ : "+r" (samples), "+r" (volumes), "+r" (ve), "+r" (length)
+ :
+ : "r6", "r5", "r4", "r3", "r2", "r1", "r0", "cc"
+ );
+}
+
+static inline void vol_s16ne_neon(int32x4_t vol4, int16_t *samples, unsigned length) {
+ unsigned i;
+ int16x4_t hi = vshrn_n_s32(vol4, 16);
+ int32x4_t lo = vandq_s32(vol4, vdupq_n_s32(0xFFFF));
+
+ for (i = 0; i < length/8; i++) {
+ int16x4_t v1 = ((int16x4_t *) samples)[2*i];
+ int16x4_t v2 = ((int16x4_t *) samples)[2*i+1];
+
+ int32x4_t t1 = vmull_s16(v1, hi);
+ int32x4_t t2 = vmull_s16(v2, hi);
+
+ int16x4_t r1 = vqmovn_s32(vsraq_n_s32(t1, vmulq_s32(vmovl_s16(v1), lo), 16));
+ int16x4_t r2 = vqmovn_s32(vsraq_n_s32(t2, vmulq_s32(vmovl_s16(v2), lo), 16));
+
+ ((int16x8_t *)samples)[i] = vcombine_s16(r1, r2);
+ }
+}
+
+void pa_volume_s16ne_neon(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
+ unsigned channel = 0, i;
+ int32x4_t vol4;
+
+ length /= sizeof(int16_t);
+
+ switch (channels) {
+ case 1:
+ vol4 = vdupq_n_s32(*volumes);
+ vol_s16ne_neon(vol4, samples, length);
+
+ for (i = length & ~7; i < length; i++) {
+ int32_t t = samples[i];
+ t = ((t * (*volumes & 0xFFFF)) >> 16) + (t * (*volumes >> 16));
+ samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
+ }
+ break;
+ case 2:
+ vol4 = vcombine_s32(*(int32x2_t *)volumes, *(int32x2_t *)volumes);
+ vol_s16ne_neon(vol4, samples, length);
+
+ for (i = length & ~7; i < length; i++) {
+ int32_t t = samples[i];
+ int32_t vol = volumes[(channel++) & 1];
+ t = ((t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16));
+ samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
+ }
+ break;
+ case 4:
+ vol4 = *(int32x4_t *)volumes;
+ vol_s16ne_neon(vol4, samples, length);
+
+ for (i = length & ~7; i < length; i++) {
+ int32_t t = samples[i];
+ int32_t vol = volumes[(channel++) & 3];
+ t = ((t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16));
+ samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
+ }
+ break;
+ default:
+ for (; length; length--) {
+ int32_t t, hi, lo;
+
+ /* Multiplying the 32bit volume factor with the 16bit
+ * sample might result in an 48bit value. We want to
+ * do without 64 bit integers and hence do the
+ * multiplication independently for the HI and LO part
+ * of the volume. */
+
+ hi = volumes[channel] >> 16;
+ lo = volumes[channel] & 0xFFFF;
+
+ t = (int32_t)(*samples);
+ t = ((t * lo) >> 16) + (t * hi);
+ t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
+ *samples++ = (int16_t) t;
+
+ if (PA_UNLIKELY(++channel >= channels))
+ channel = 0;
+ }
+ break;
+ }
+}
+
+void pa_volume_float32ne_neon(float *samples, float *volumes, unsigned channels, unsigned length) {
+ unsigned channel = 0, i;
+ float32x4_t vol4;
+
+ length /= sizeof(float);
+
+ switch (channels) {
+ case 1:
+ vol4 = vdupq_n_f32(*volumes);
+ for (i = 0; i < length/4; i++) {
+ ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4);
+ }
+
+ for (i = length & ~3; i < length; i++) {
+ samples[i] *= volumes[0];
+ }
+ break;
+ case 2:
+ vol4 = vcombine_f32(*(float32x2_t *)volumes, *(float32x2_t *)volumes);
+ for (i = 0; i < length/4; i++) {
+ ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4);
+ }
+
+ for (i = length & ~3; i < length; i++) {
+ samples[i] *= volumes[channel];
+
+ if (PA_UNLIKELY(++channel >= channels))
+ channel = 0;
+ }
+ break;
+ case 4:
+ vol4 = *(float32x4_t *)volumes;
+ for (i = 0; i < length/4; i++) {
+ ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4);
+ }
+
+ for (i = length & ~3; i < length; i++) {
+ samples[i] *= volumes[channel++];
+ }
+ break;
+ default:
+ for (; length; length--) {
+ *samples++ *= volumes[channel];
+
+ if (PA_UNLIKELY(++channel >= channels))
+ channel = 0;
+ }
+ break;
+ }
+}
+
+#define SAMPLES 1019
+#define TIMES 1000
+#define CHANNELS 4
+#define PADDING 16
+
+static void run_test_float(void) {
+ float floats[SAMPLES];
+ float floats_ref[SAMPLES];
+ float floats_orig[SAMPLES];
+ float volumes[CHANNELS];
+ unsigned i;
+ pa_usec_t start, stop;
+
+ pa_log_debug("checking NEON volume_float32ne(%d)", SAMPLES);
+
+ for (i = 0; i < SAMPLES; i++) {
+ floats_orig[i] = rand()/(float) RAND_MAX - 0.5f;
+ }
+ memcpy(floats_ref, floats_orig, sizeof(floats_orig));
+ memcpy(floats, floats_orig, sizeof(floats_orig));
+
+ for (i = 0; i < CHANNELS; i++)
+ volumes[i] = 0.5f * rand() / (float) RAND_MAX;
+
+ pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats));
+ pa_volume_float32ne_c(floats_ref, volumes, CHANNELS, sizeof(floats_ref));
+
+ for (i = 0; i < SAMPLES; i++) {
+ if (fabsf(floats[i] - floats_ref[i]) > 0.00001) {
+ pa_log_debug("%d: %.3f != %.3f (%.3f)", i, floats[i], floats_ref[i],
+ floats_orig[i]);
+ }
+ }
+
+ start = pa_rtclock_now();
+ for (i = 0; i < TIMES; i++) {
+ memcpy(floats, floats_orig, sizeof(floats_orig));
+ pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats));
+ }
+ stop = pa_rtclock_now();
+ pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+ start = pa_rtclock_now();
+ for (i = 0; i < TIMES; i++) {
+ memcpy(floats_ref, floats_orig, sizeof(floats_orig));
+ pa_volume_float32ne_c(floats_ref, volumes, CHANNELS, sizeof(floats_ref));
+ }
+ stop = pa_rtclock_now();
+ pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+
+static void run_test_s16(void) {
+ int16_t samples[SAMPLES];
+ int16_t samples_ref[SAMPLES];
+ int16_t samples_orig[SAMPLES];
+ int32_t volumes[CHANNELS + PADDING];
+ unsigned i, padding;
+ pa_usec_t start, stop;
+
+ pa_log_debug("checking NEON volume_s16ne(%d)", SAMPLES);
+
+ for (i = 0; i < SAMPLES; i++) {
+ samples_orig[i] = rand() - RAND_MAX/2;
+ }
+ memcpy(samples_ref, samples_orig, sizeof(samples_orig));
+ memcpy(samples, samples_orig, sizeof(samples_orig));
+
+ for (i = 0; i < CHANNELS; i++)
+ volumes[i] = PA_CLAMP_VOLUME(rand() >> 15);
+ for (padding = 0; padding < PADDING; padding++, i++)
+ volumes[i] = volumes[padding];
+
+ pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples));
+ pa_volume_s16ne_c(samples_ref, volumes, CHANNELS, sizeof(samples_ref));
+
+ for (i = 0; i < SAMPLES; i++) {
+ if (abs(samples[i] - samples_ref[i]) > 0) {
+ pa_log_debug("%d: %d != %d (%d)", i, samples[i], samples_ref[i],
+ samples_orig[i]);
+ }
+ }
+
+ start = pa_rtclock_now();
+ for (i = 0; i < TIMES; i++) {
+ memcpy(samples, samples_orig, sizeof(samples_orig));
+ pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples));
+ }
+ stop = pa_rtclock_now();
+ pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+ start = pa_rtclock_now();
+ for (i = 0; i < TIMES; i++) {
+ memcpy(samples, samples_orig, sizeof(samples_orig));
+ pa_volume_s16ne_arm(samples, volumes, CHANNELS, sizeof(samples));
+ }
+ stop = pa_rtclock_now();
+ pa_log_info("ARM: %llu usec.", (long long unsigned int)(stop - start));
+
+ start = pa_rtclock_now();
+ for (i = 0; i < TIMES; i++) {
+ memcpy(samples_ref, samples_orig, sizeof(samples_orig));
+ pa_volume_s16ne_c(samples_ref, volumes, CHANNELS, sizeof(samples_ref));
+ }
+ stop = pa_rtclock_now();
+ pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+
+#endif /* defined(__arm__) */
+
+int main() {
+
+ run_test_float();
+ run_test_s16();
+
+ return EXIT_SUCCESS;
+}