Mercurial > hg > pa-neon
view svolume_neon.c @ 5:07763f536182 default tip
ALIGNment support
author | Peter Meerwald <p.meerwald@bct-electronic.com> |
---|---|
date | Sun, 08 Jul 2012 21:48:08 +0200 |
parents | 1f6289166006 |
children |
line wrap: on
line source
/*
 * Copyright 2012 Peter Meerwald <p.meerwald@bct-electronic.com>
 *
 * Stand-alone check and micro-benchmark for software volume scaling:
 * a portable C reference, an ARM (integer SIMD) assembly version and
 * NEON versions for signed 16 bit and 32 bit float samples.
 */

#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
#include <assert.h>

typedef enum pa_sample_format {
    PA_SAMPLE_S16LE,
    PA_SAMPLE_FLOAT32LE,
} pa_sample_format_t;

#define PA_SAMPLE_S16NE PA_SAMPLE_S16LE
#define PA_SAMPLE_FLOAT32NE PA_SAMPLE_FLOAT32LE

typedef long long unsigned int pa_usec_t;

#define pa_assert(x) assert(x)
#define pa_assert_not_reached() assert(0)

#define PA_MAX(a, b) ((a) > (b) ? (a) : (b))

typedef uint32_t pa_volume_t;
#define PA_VOLUME_MUTED ((pa_volume_t) 0U)
#define PA_VOLUME_MAX ((pa_volume_t) UINT32_MAX/2)

#define PA_UNLIKELY(x) (x)
#define PA_CLAMP_UNLIKELY(x, low, high) (PA_UNLIKELY((x) > (high)) ? (high) : (PA_UNLIKELY((x) < (low)) ? (low) : (x)))
#define PA_CLAMP_VOLUME(v) (PA_CLAMP_UNLIKELY((v), PA_VOLUME_MUTED, PA_VOLUME_MAX))

/*
 * Format a message printf-style and print it to stdout followed by a
 * newline.  Output longer than the internal buffer is truncated.
 */
static void pa_log_info(const char *format, ...) {
    va_list ap;
    char buf[1024];

    va_start(ap, format);
    /* bounded: a long message must not overrun buf */
    vsnprintf(buf, sizeof(buf), format, ap);
    va_end(ap);

    printf("%s\n", buf);
}

#define pa_log_debug pa_log_info

/* Current wall-clock time from gettimeofday(), in microseconds. */
static pa_usec_t pa_rtclock_now(void) {
    struct timeval tv;

    gettimeofday(&tv, NULL);

    return (pa_usec_t) tv.tv_sec * 1000000ULL + (pa_usec_t) tv.tv_usec;
}

/*
 * Scale one signed 16 bit sample by a 16.16 fixed point volume and
 * clamp the result to the int16_t range.
 */
static inline int16_t pa_scale_s16(int16_t sample, uint32_t volume) {
    int32_t t, hi, lo;

    /* Multiplying the 32bit volume factor with the 16bit
     * sample might result in an 48bit value. We want to
     * do without 64 bit integers and hence do the
     * multiplication independently for the HI and LO part
     * of the volume. */

    hi = volume >> 16;
    lo = volume & 0xFFFF;

    t = (int32_t) sample;
    t = ((t * lo) >> 16) + (t * hi);
    t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);

    return (int16_t) t;
}

/*
 * Reference implementation for signed 16 bit samples.
 *
 * samples:  interleaved samples, scaled in place
 * volumes:  one 16.16 fixed point factor per channel
 * channels: number of entries of volumes[] to cycle through
 * length:   size of the sample buffer in BYTES
 */
static void pa_volume_s16ne_c(int16_t *samples, const uint32_t *volumes, unsigned channels, unsigned length) {
    unsigned channel;

    length /= sizeof(int16_t);

    for (channel = 0; length; length--) {
        *samples = pa_scale_s16(*samples, volumes[channel]);
        samples++;

        if (PA_UNLIKELY(++channel >= channels))
            channel = 0;
    }
}

/*
 * Reference implementation for float samples.
 *
 * samples:  interleaved samples, scaled in place
 * volumes:  one linear factor per channel
 * channels: number of entries of volumes[] to cycle through
 * length:   size of the sample buffer in BYTES
 */
static void pa_volume_float32ne_c(float *samples, const float *volumes, unsigned channels, unsigned length) {
    unsigned channel;

    length /= sizeof(float);

    for (channel = 0; length; length--) {
        *samples++ *= volumes[channel];

        if (PA_UNLIKELY(++channel >= channels))
            channel = 0;
    }
}

/*
 * Everything below needs arm_neon.h and ARM inline assembly, so the
 * tests and the program entry point that runs them are ARM-only.
 */
#if defined(__arm__)

#include "arm_neon.h"

/* Wrap the volume pointer (r6) back to the start (%1) once it reaches the end (%2). */
#define MOD_INC() \
    " subs  r0, r6, %2              \n\t" \
    " itt cs                        \n\t" \
    " addcs r0, %1                  \n\t" \
    " movcs r6, r0                  \n\t"

/*
 * ARM assembly implementation for signed 16 bit samples.
 * length is the size of the sample buffer in BYTES.
 */
static void pa_volume_s16ne_arm(int16_t *samples, const uint32_t *volumes, unsigned channels, unsigned length) {
    /* Channels must be at least 4, and always a multiple of the original number.
     * This is also the max amount we overread the volume array, which should
     * have enough padding. */
    channels = channels == 3 ? 6 : PA_MAX(4U, channels);
    const uint32_t *ve = volumes + channels;

    __asm__ __volatile__ (
        " mov r6, %1                      \n\t"
        " mov %3, %3, LSR #1              \n\t" /* length /= sizeof (int16_t) */
        " tst %3, #1                      \n\t" /* check for odd samples */
        " beq  2f                         \n\t"

        "1:                               \n\t"
        " ldr  r0, [r6], #4               \n\t" /* odd samples volumes */
        " ldrh r2, [%0]                   \n\t"

        " smulwb r0, r0, r2               \n\t"
        " ssat r0, #16, r0                \n\t"

        " strh r0, [%0], #2               \n\t"

        MOD_INC()

        "2:                               \n\t"
        " mov %3, %3, LSR #1              \n\t"
        " tst %3, #1                      \n\t" /* check for odd samples */
        " beq  4f                         \n\t"

        "3:                               \n\t"
        " ldrd r2, [r6], #8               \n\t" /* 2 samples at a time */
        " ldr  r0, [%0]                   \n\t"

        " smulwt r2, r2, r0               \n\t"
        " smulwb r3, r3, r0               \n\t"

        " ssat r2, #16, r2                \n\t"
        " ssat r3, #16, r3                \n\t"

        " pkhbt r0, r3, r2, LSL #16       \n\t"
        " str  r0, [%0], #4               \n\t"

        MOD_INC()

        "4:                               \n\t"
        " movs %3, %3, LSR #1             \n\t"
        " beq  6f                         \n\t"

        "5:                               \n\t"
        " ldrd r2, [r6], #8               \n\t" /* 4 samples at a time */
        " ldrd r4, [r6], #8               \n\t"
        " ldrd r0, [%0]                   \n\t"

        " smulwt r2, r2, r0               \n\t"
        " smulwb r3, r3, r0               \n\t"
        " smulwt r4, r4, r1               \n\t"
        " smulwb r5, r5, r1               \n\t"

        " ssat r2, #16, r2                \n\t"
        " ssat r3, #16, r3                \n\t"
        " ssat r4, #16, r4                \n\t"
        " ssat r5, #16, r5                \n\t"

        " pkhbt r0, r3, r2, LSL #16       \n\t"
        " pkhbt r1, r5, r4, LSL #16       \n\t"
        " strd  r0, [%0], #8              \n\t"

        MOD_INC()

        " subs %3, %3, #1                 \n\t"
        " bne 5b                          \n\t"

        "6:                               \n\t"

        : "+r" (samples), "+r" (volumes), "+r" (ve), "+r" (length)
        :
        /* "memory": the asm stores to the sample buffer behind the compiler's back */
        : "r6", "r5", "r4", "r3", "r2", "r1", "r0", "cc", "memory"
    );
}

/*
 * Scale complete groups of four s16 samples by the four 16.16 volumes
 * in *vol4.  length is a sample count; the trailing (length % 4)
 * samples are left untouched for the caller to handle.
 */
static inline void vol_s16_neon(const uint32x4_t *vol4, int16_t *samples, unsigned length) {
    /* The loop below always executes its body once, so it must not be
     * entered when there is no complete group of four samples. */
    if (length < 4)
        return;

    __asm__ __volatile__ (
        "mov        %[length], %[length], lsr #2\n\t"
        "vld1.s32   {q0}, [%[vol]]              \n\t"
        "vshl.u32   q3, q0, #16                 \n\t" /* lo */
        "vshrn.s32  d1, q0, #16                 \n\t" /* hi */
        "vshr.u32   q3, q3, #16                 \n\t"
        "1:                                     \n\t"
        "vld1.16    {d0}, [%[samples]]          \n\t"
        "vmull.s16  q1, d0, d1                  \n\t"
        "vmovl.s16  q2, d0                      \n\t"
        "vmul.s32   q2, q2, q3                  \n\t"
        "vsra.s32   q1, q2, #16                 \n\t"
        "vqmovn.s32 d0, q1                      \n\t" /* saturating narrow, clamps like the C code */
        "subs       %[length], %[length], #1    \n\t"
        "vst1.16    {d0}, [%[samples]]!         \n\t"
        "bgt        1b                          \n\t"

        /* output operands (or input operands that get modified) */
        : [samples] "+r" (samples), [length] "+r" (length)
        : [vol] "r" (vol4) /* input operands */
        : "memory", "cc", "q0", "q1", "q2", "q3" /* clobber list */
    );
}

/*
 * Scale complete groups of four float samples by the four volumes in
 * *vol4.  length is a sample count; the trailing (length % 4) samples
 * are left untouched for the caller to handle.
 */
static inline void vol_float_neon(const float32x4_t *vol4, float *samples, unsigned length) {
    /* See vol_s16_neon(): the loop body runs at least once. */
    if (length < 4)
        return;

    __asm__ __volatile__ (
        "mov        %[length], %[length], lsr #2\n\t"
        "vld1.32    {q1}, [%[vol]]              \n\t"
        "1:                                     \n\t"
        "vld1.32    {q0}, [%[samples]]          \n\t"
        "vmul.f32   q0, q0, q1                  \n\t"
        "subs       %[length], %[length], #1    \n\t"
        "vst1.32    {q0}, [%[samples]]!         \n\t"
        "bgt        1b                          \n\t"

        /* output operands (or input operands that get modified) */
        : [samples] "+r" (samples), [length] "+r" (length)
        : [vol] "r" (vol4) /* input operands */
        : "memory", "cc", "q0", "q1" /* clobber list */
    );
}

/*
 * NEON implementation for signed 16 bit samples; length is in BYTES.
 * 1, 2 and 4 channels use the vector loop plus a scalar tail, any
 * other channel count falls back to the C reference.
 */
static void pa_volume_s16ne_neon(int16_t *samples, const uint32_t *volumes, unsigned channels, unsigned length) {
    const unsigned n = length / sizeof(int16_t);
    uint32x4_t vol4;
    unsigned i;

    switch (channels) {
        case 1:
            vol4 = vdupq_n_u32(volumes[0]);
            break;

        case 2: {
            /* vld1 loads impose no vector alignment on volumes[] */
            const uint32x2_t pair = vld1_u32(volumes);
            vol4 = vcombine_u32(pair, pair);
            break;
        }

        case 4:
            vol4 = vld1q_u32(volumes);
            break;

        default:
            pa_volume_s16ne_c(samples, volumes, channels, length);
            return;
    }

    vol_s16_neon(&vol4, samples, n);

    /* The tail starts at a multiple of 4 and channels is 1, 2 or 4,
     * so i % channels is the channel of sample i. */
    for (i = n & ~3U; i < n; i++)
        samples[i] = pa_scale_s16(samples[i], volumes[i % channels]);
}

/*
 * NEON implementation for float samples; length is in BYTES.
 * 1, 2 and 4 channels use the vector loop plus a scalar tail, any
 * other channel count falls back to the C reference.
 */
static void pa_volume_float32ne_neon(float *samples, const float *volumes, unsigned channels, unsigned length) {
    const unsigned n = length / sizeof(float);
    float32x4_t vol4;
    unsigned i;

    switch (channels) {
        case 1:
            vol4 = vdupq_n_f32(volumes[0]);
            break;

        case 2: {
            const float32x2_t pair = vld1_f32(volumes);
            vol4 = vcombine_f32(pair, pair);
            break;
        }

        case 4:
            vol4 = vld1q_f32(volumes);
            break;

        default:
            pa_volume_float32ne_c(samples, volumes, channels, length);
            return;
    }

    vol_float_neon(&vol4, samples, n);

    /* The tail starts at a multiple of 4 and channels is 1, 2 or 4,
     * so i % channels is the channel of sample i. */
    for (i = n & ~3U; i < n; i++)
        samples[i] *= volumes[i % channels];
}

#define SAMPLES 1019
#define TIMES 50000
#define CHANNELS 4
#define PADDING 16
#define ALIGN 1

/* Compare the NEON float kernel against the C reference, then time both. */
static void run_test_float(void) {
    float floats[SAMPLES+ALIGN];
    float floats_ref[SAMPLES+ALIGN];
    float floats_orig[SAMPLES+ALIGN];
    float volumes[CHANNELS];
    /* The kernels start ALIGN elements into the buffers, so only
     * SAMPLES elements may be processed. */
    const unsigned bytes = SAMPLES * sizeof(float);
    unsigned i;
    pa_usec_t start, stop;

    pa_log_debug("checking NEON volume_float32ne(%d)", SAMPLES);

    for (i = 0; i < SAMPLES+ALIGN; i++) {
        floats_orig[i] = rand()/(float) RAND_MAX - 0.5f;
    }
    memcpy(floats_ref, floats_orig, sizeof(floats_orig));
    memcpy(floats, floats_orig, sizeof(floats_orig));

    for (i = 0; i < CHANNELS; i++)
        volumes[i] = 0.5f * rand() / (float) RAND_MAX;

    pa_volume_float32ne_neon(floats+ALIGN, volumes, CHANNELS, bytes);
    pa_volume_float32ne_c(floats_ref+ALIGN, volumes, CHANNELS, bytes);

    for (i = ALIGN; i < SAMPLES+ALIGN; i++) {
        if (fabsf(floats[i] - floats_ref[i]) > 0.00001f) {
            pa_log_debug("%u: %.3f != %.3f (%.3f)", i, floats[i], floats_ref[i],
                floats_orig[i]);
        }
    }

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        memcpy(floats, floats_orig, sizeof(floats_orig));
        pa_volume_float32ne_neon(floats+ALIGN, volumes, CHANNELS, bytes);
    }
    stop = pa_rtclock_now();
    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        memcpy(floats_ref, floats_orig, sizeof(floats_orig));
        pa_volume_float32ne_c(floats_ref+ALIGN, volumes, CHANNELS, bytes);
    }
    stop = pa_rtclock_now();
    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
}

/* Compare the NEON s16 kernel against the C reference, then time NEON, ARM and C. */
static void run_test_s16(void) {
    int16_t samples[SAMPLES+ALIGN];
    int16_t samples_ref[SAMPLES+ALIGN];
    int16_t samples_orig[SAMPLES+ALIGN];
    uint32_t volumes[CHANNELS + PADDING];
    /* The kernels start ALIGN elements into the buffers, so only
     * SAMPLES elements may be processed. */
    const unsigned bytes = SAMPLES * sizeof(int16_t);
    unsigned i, padding;
    pa_usec_t start, stop;

    pa_log_debug("checking NEON volume_s16ne(%d)", SAMPLES);

    /* fill the whole buffer: all SAMPLES+ALIGN elements are copied below */
    for (i = 0; i < SAMPLES+ALIGN; i++) {
        samples_orig[i] = (int16_t) (rand() - RAND_MAX/2);
    }
    memcpy(samples_ref, samples_orig, sizeof(samples_orig));
    memcpy(samples, samples_orig, sizeof(samples_orig));

    for (i = 0; i < CHANNELS; i++)
        volumes[i] = PA_CLAMP_VOLUME(rand() >> 15);
    /* pa_volume_s16ne_arm() may read past CHANNELS entries; repeat the volumes */
    for (padding = 0; padding < PADDING; padding++, i++)
        volumes[i] = volumes[padding];

    pa_volume_s16ne_neon(samples+ALIGN, volumes, CHANNELS, bytes);
    pa_volume_s16ne_c(samples_ref+ALIGN, volumes, CHANNELS, bytes);

    for (i = ALIGN; i < SAMPLES+ALIGN; i++) {
        if (samples[i] != samples_ref[i]) {
            pa_log_debug("%u: %d != %d (%d)", i, samples[i], samples_ref[i],
                samples_orig[i]);
        }
    }

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        memcpy(samples, samples_orig, sizeof(samples_orig));
        pa_volume_s16ne_neon(samples+ALIGN, volumes, CHANNELS, bytes);
    }
    stop = pa_rtclock_now();
    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        memcpy(samples, samples_orig, sizeof(samples_orig));
        pa_volume_s16ne_arm(samples+ALIGN, volumes, CHANNELS, bytes);
    }
    stop = pa_rtclock_now();
    pa_log_info("ARM: %llu usec.", (long long unsigned int)(stop - start));

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        memcpy(samples_ref, samples_orig, sizeof(samples_orig));
        pa_volume_s16ne_c(samples_ref+ALIGN, volumes, CHANNELS, bytes);
    }
    stop = pa_rtclock_now();
    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
}

/* Entry point: runs both tests.  Kept inside the ARM guard because the
 * tests it calls only exist there. */
int main(void) {
    run_test_float();
    run_test_s16();

    return EXIT_SUCCESS;
}

#endif /* defined(__arm__) */