Mercurial > hg > pa-neon
view svolume_neon.c @ 1:b829afbea564
more testing
author | Peter Meerwald <p.meerwald@bct-electronic.com> |
---|---|
date | Fri, 20 Apr 2012 14:26:14 +0200 |
parents | e0040ee59c3c |
children | 1f6289166006 |
line wrap: on
line source
/*
 * Copyright 2012 Peter Meerwald <p.meerwald@bct-electronic.com>
 */

/*
 * Stand-alone test and micro-benchmark for software volume scaling.
 *
 * The file carries portable C reference implementations for signed 16 bit
 * and 32 bit float samples and, when compiled for ARM, an inline-assembly
 * variant and NEON-intrinsic variants. The ARM-only test drivers compare the
 * optimized output against the C reference and print timings.
 */

#include <assert.h>
#include <math.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

/* Minimal stand-ins for the project declarations this file relies on. */

typedef enum pa_sample_format {
    PA_SAMPLE_S16LE,
    PA_SAMPLE_FLOAT32LE,
} pa_sample_format_t;

#define PA_SAMPLE_S16NE PA_SAMPLE_S16LE
#define PA_SAMPLE_FLOAT32NE PA_SAMPLE_FLOAT32LE

typedef struct {
    pa_sample_format_t *format;
} pa_remap_t;

typedef void (*pa_remap_func_t)(pa_remap_t *m, void *dst, const void *src, unsigned n);

typedef long long unsigned int pa_usec_t;

#define pa_assert(x) assert(x)
#define pa_assert_not_reached() assert(0)

#define PA_MAX(a, b) ((a) > (b) ? (a) : (b))

typedef uint32_t pa_volume_t;

#define PA_VOLUME_MUTED ((pa_volume_t) 0U)
#define PA_VOLUME_MAX ((pa_volume_t) UINT32_MAX/2)

#define PA_UNLIKELY(x) (x)
#define PA_CLAMP_UNLIKELY(x, low, high) (PA_UNLIKELY((x) > (high)) ? (high) : (PA_UNLIKELY((x) < (low)) ? (low) : (x)))
#define PA_CLAMP_VOLUME(v) (PA_CLAMP_UNLIKELY((v), PA_VOLUME_MUTED, PA_VOLUME_MAX))

/*
 * Format a message printf-style and print it to stdout followed by a newline.
 * Messages that do not fit the 1024 byte buffer are truncated.
 */
static void pa_log_info(const char *format, ...) {
    va_list ap;
    char buf[1024];

    va_start(ap, format);
    /* Bounded formatting: vsprintf() would write past buf on long messages. */
    vsnprintf(buf, sizeof(buf), format, ap);
    va_end(ap);

    printf("%s\n", buf);
}

#define pa_log_debug pa_log_info

/* Return the gettimeofday() wall-clock time in microseconds. */
static pa_usec_t pa_rtclock_now(void) {
    struct timeval tv;

    gettimeofday(&tv, NULL);

    return (pa_usec_t) tv.tv_sec * 1000000ULL + (pa_usec_t) tv.tv_usec;
}

/*
 * Scale one 16 bit sample by a 32 bit volume factor and saturate the result
 * to the int16_t range.
 *
 * Multiplying the 32 bit volume factor with the 16 bit sample might result
 * in a 48 bit value. We want to do without 64 bit integers and hence do the
 * multiplication independently for the HI and LO part of the volume.
 */
static inline int16_t pa_scale_s16(int16_t sample, int32_t volume) {
    const int32_t hi = volume >> 16;
    const int32_t lo = volume & 0xFFFF;
    int32_t t = (int32_t) sample;

    t = ((t * lo) >> 16) + (t * hi);
    t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);

    return (int16_t) t;
}

/*
 * Reference implementation: scale interleaved 16 bit samples in place.
 *
 * samples   interleaved sample buffer, modified in place
 * volumes   one volume factor per channel
 * channels  number of interleaved channels
 * length    size of the sample buffer in BYTES
 */
void pa_volume_s16ne_c(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
    unsigned channel;

    length /= sizeof(int16_t);

    for (channel = 0; length; length--) {
        *samples = pa_scale_s16(*samples, volumes[channel]);
        samples++;

        if (PA_UNLIKELY(++channel >= channels))
            channel = 0;
    }
}

/*
 * Reference implementation: scale interleaved float samples in place.
 *
 * samples   interleaved sample buffer, modified in place
 * volumes   one volume factor per channel
 * channels  number of interleaved channels
 * length    size of the sample buffer in BYTES
 */
void pa_volume_float32ne_c(float *samples, float *volumes, unsigned channels, unsigned length) {
    unsigned channel;

    length /= sizeof(float);

    for (channel = 0; length; length--) {
        *samples++ *= volumes[channel];

        if (PA_UNLIKELY(++channel >= channels))
            channel = 0;
    }
}

/* Disabled ORC variant, kept for reference:

void
pa_volume_s16ne_orc(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    if (channels == 2) {
        int64_t v = (int64_t)volumes[1] << 32 | volumes[0];
        pa_volume_s16ne_orc_2ch (samples, v, ((length / (sizeof(int16_t))) / 2));
    } else if (channels == 1)
        pa_volume_s16ne_orc_1ch (samples, volumes[0], length / (sizeof(int16_t)));
}
*/

#if defined(__arm__)

#include <arm_neon.h>

/* Wrap the running volume pointer (r6) back by %1 once it reaches %2. */
#define MOD_INC()                   \
    " subs r0, r6, %2       \n\t"   \
    " itt cs                \n\t"   \
    " addcs r0, %1          \n\t"   \
    " movcs r6, r0          \n\t"

/*
 * A 32 bit load of two adjacent 16 bit samples places the sample at the lower
 * address in the bottom halfword on little endian and in the top halfword on
 * big endian. Pick the halfword-selecting multiply and the packing order
 * accordingly, so that the first volume is always applied to the first sample.
 */
#if defined(__ARMEB__)
#define PA_SMULW_1ST "smulwt"
#define PA_SMULW_2ND "smulwb"
#define PA_PACK(rd, first, second) " pkhbt " rd ", " second ", " first ", LSL #16 \n\t"
#else
#define PA_SMULW_1ST "smulwb"
#define PA_SMULW_2ND "smulwt"
#define PA_PACK(rd, first, second) " pkhbt " rd ", " first ", " second ", LSL #16 \n\t"
#endif

/*
 * ARM assembly variant of pa_volume_s16ne_c(); length is in BYTES.
 *
 * The volume array is read beyond its nominal end (see below), so the caller
 * has to pad it with repeated channel volumes.
 */
static void pa_volume_s16ne_arm(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
    int32_t *ve;

    /* Channels must be at least 4, and always a multiple of the original number.
     * This is also the max amount we overread the volume array, which should
     * have enough padding. */
    channels = channels == 3 ? 6 : PA_MAX (4U, channels);
    ve = volumes + channels;

    __asm__ __volatile__ (
        " mov r6, %1                    \n\t"
        " mov %3, %3, LSR #1            \n\t" /* length /= sizeof (int16_t) */
        " tst %3, #1                    \n\t" /* check for odd samples */
        " beq 2f                        \n\t"

        "1:                             \n\t"
        " ldr r0, [r6], #4              \n\t" /* odd samples volumes */
        " ldrh r2, [%0]                 \n\t"

        " smulwb r0, r0, r2             \n\t"
        " ssat r0, #16, r0              \n\t"

        " strh r0, [%0], #2             \n\t"

        MOD_INC()

        "2:                             \n\t"
        " mov %3, %3, LSR #1            \n\t"
        " tst %3, #1                    \n\t" /* check for odd samples */
        " beq 4f                        \n\t"

        "3:                             \n\t"
        " ldrd r2, [r6], #8             \n\t" /* 2 samples at a time */
        " ldr r0, [%0]                  \n\t"

        " " PA_SMULW_1ST " r2, r2, r0   \n\t"
        " " PA_SMULW_2ND " r3, r3, r0   \n\t"

        " ssat r2, #16, r2              \n\t"
        " ssat r3, #16, r3              \n\t"

        PA_PACK("r0", "r2", "r3")
        " str r0, [%0], #4              \n\t"

        MOD_INC()

        "4:                             \n\t"
        " movs %3, %3, LSR #1           \n\t"
        " beq 6f                        \n\t"

        "5:                             \n\t"
        " ldrd r2, [r6], #8             \n\t" /* 4 samples at a time */
        " ldrd r4, [r6], #8             \n\t"
        " ldrd r0, [%0]                 \n\t"

        " " PA_SMULW_1ST " r2, r2, r0   \n\t"
        " " PA_SMULW_2ND " r3, r3, r0   \n\t"
        " " PA_SMULW_1ST " r4, r4, r1   \n\t"
        " " PA_SMULW_2ND " r5, r5, r1   \n\t"

        " ssat r2, #16, r2              \n\t"
        " ssat r3, #16, r3              \n\t"
        " ssat r4, #16, r4              \n\t"
        " ssat r5, #16, r5              \n\t"

        PA_PACK("r0", "r2", "r3")
        PA_PACK("r1", "r4", "r5")
        " strd r0, [%0], #8             \n\t"

        MOD_INC()

        " subs %3, %3, #1               \n\t"
        " bne 5b                        \n\t"
        "6:                             \n\t"

        : "+r" (samples), "+r" (volumes), "+r" (ve), "+r" (length)
        :
        /* "memory": the asm reads and writes the sample buffer. */
        : "r6", "r5", "r4", "r3", "r2", "r1", "r0", "cc", "memory"
    );
}

/*
 * Scale the first (length / 8) * 8 samples in place, eight at a time. Lane k
 * of vol4 is applied to every sample whose index is k modulo 4; the remaining
 * length % 8 samples are left untouched for the caller.
 */
static inline void vol_s16ne_neon(int32x4_t vol4, int16_t *samples, unsigned length) {
    const int16x4_t hi = vshrn_n_s32(vol4, 16);
    const int32x4_t lo = vandq_s32(vol4, vdupq_n_s32(0xFFFF));
    unsigned i;

    for (i = 0; i < length / 8; i++) {
        /* vld1/vst1 make no alignment assumption, unlike dereferencing a
         * sample pointer cast to a vector type. */
        int16_t *p = samples + 8 * i;
        int16x4_t v1 = vld1_s16(p);
        int16x4_t v2 = vld1_s16(p + 4);

        int32x4_t t1 = vmull_s16(v1, hi);
        int32x4_t t2 = vmull_s16(v2, hi);

        /* t += (sample * lo) >> 16, then narrow with saturation. */
        int16x4_t r1 = vqmovn_s32(vsraq_n_s32(t1, vmulq_s32(vmovl_s16(v1), lo), 16));
        int16x4_t r2 = vqmovn_s32(vsraq_n_s32(t2, vmulq_s32(vmovl_s16(v2), lo), 16));

        vst1q_s16(p, vcombine_s16(r1, r2));
    }
}

/*
 * NEON variant of pa_volume_s16ne_c(); length is in BYTES.
 *
 * 1, 2 and 4 channels are vectorized; any other channel count is handed to
 * the C reference implementation.
 */
void pa_volume_s16ne_neon(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
    const unsigned n = length / sizeof(int16_t);
    int32x4_t vol4;
    unsigned i;

    switch (channels) {
        case 1:
            vol4 = vdupq_n_s32(volumes[0]);
            break;

        case 2: {
            const int32x2_t pair = vld1_s32(volumes);
            vol4 = vcombine_s32(pair, pair);
            break;
        }

        case 4:
            vol4 = vld1q_s32(volumes);
            break;

        default:
            pa_volume_s16ne_c(samples, volumes, channels, length);
            return;
    }

    vol_s16ne_neon(vol4, samples, n);

    /* Scalar tail. It starts at a multiple of 8 and channels is 1, 2 or 4
     * here, so the channel of sample i is i & (channels - 1). */
    for (i = n & ~7U; i < n; i++)
        samples[i] = pa_scale_s16(samples[i], volumes[i & (channels - 1)]);
}

/*
 * NEON variant of pa_volume_float32ne_c(); length is in BYTES.
 *
 * 1, 2 and 4 channels are vectorized; any other channel count is handed to
 * the C reference implementation.
 */
void pa_volume_float32ne_neon(float *samples, float *volumes, unsigned channels, unsigned length) {
    const unsigned n = length / sizeof(float);
    float32x4_t vol4;
    unsigned i;

    switch (channels) {
        case 1:
            vol4 = vdupq_n_f32(volumes[0]);
            break;

        case 2: {
            const float32x2_t pair = vld1_f32(volumes);
            vol4 = vcombine_f32(pair, pair);
            break;
        }

        case 4:
            vol4 = vld1q_f32(volumes);
            break;

        default:
            pa_volume_float32ne_c(samples, volumes, channels, length);
            return;
    }

    for (i = 0; i < n / 4; i++) {
        float *p = samples + 4 * i;
        vst1q_f32(p, vmulq_f32(vld1q_f32(p), vol4));
    }

    /* Scalar tail. It starts at a multiple of 4 and channels is 1, 2 or 4
     * here, so the channel of sample i is i & (channels - 1). */
    for (i = n & ~3U; i < n; i++)
        samples[i] *= volumes[i & (channels - 1)];
}

#define SAMPLES 1019
#define TIMES 3000
#define CHANNELS 4
#define PADDING 16

/*
 * Compare the NEON float implementation against the C reference on random
 * data, log every sample that differs by more than 0.00001, then time both.
 */
static void run_test_float(void) {
    float floats[SAMPLES];
    float floats_ref[SAMPLES];
    float floats_orig[SAMPLES];
    float volumes[CHANNELS];
    unsigned i;
    pa_usec_t start, stop;

    pa_log_debug("checking NEON volume_float32ne(%d)", SAMPLES);

    for (i = 0; i < SAMPLES; i++)
        floats_orig[i] = rand()/(float) RAND_MAX - 0.5f;
    memcpy(floats_ref, floats_orig, sizeof(floats_orig));
    memcpy(floats, floats_orig, sizeof(floats_orig));

    for (i = 0; i < CHANNELS; i++)
        volumes[i] = 0.5f * rand() / (float) RAND_MAX;

    pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats));
    pa_volume_float32ne_c(floats_ref, volumes, CHANNELS, sizeof(floats_ref));

    for (i = 0; i < SAMPLES; i++) {
        if (fabsf(floats[i] - floats_ref[i]) > 0.00001)
            pa_log_debug("%u: %.3f != %.3f (%.3f)", i, floats[i], floats_ref[i], floats_orig[i]);
    }

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        memcpy(floats, floats_orig, sizeof(floats_orig));
        pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats));
    }
    stop = pa_rtclock_now();
    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        memcpy(floats_ref, floats_orig, sizeof(floats_orig));
        pa_volume_float32ne_c(floats_ref, volumes, CHANNELS, sizeof(floats_ref));
    }
    stop = pa_rtclock_now();
    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
}

/*
 * Compare the NEON and the ARM assembly 16 bit implementations against the C
 * reference on random data, log every differing sample, then time all three.
 */
static void run_test_s16(void) {
    int16_t samples[SAMPLES];
    int16_t samples_ref[SAMPLES];
    int16_t samples_orig[SAMPLES];
    int32_t volumes[CHANNELS + PADDING];
    unsigned i, padding;
    pa_usec_t start, stop;

    pa_log_debug("checking NEON volume_s16ne(%d)", SAMPLES);

    for (i = 0; i < SAMPLES; i++)
        samples_orig[i] = (int16_t) (rand() - RAND_MAX/2);
    memcpy(samples_ref, samples_orig, sizeof(samples_orig));
    memcpy(samples, samples_orig, sizeof(samples_orig));

    for (i = 0; i < CHANNELS; i++)
        volumes[i] = PA_CLAMP_VOLUME(rand() >> 15);
    /* The ARM routine reads past the channel volumes; repeat them as padding. */
    for (padding = 0; padding < PADDING; padding++, i++)
        volumes[i] = volumes[padding];

    pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples));
    pa_volume_s16ne_c(samples_ref, volumes, CHANNELS, sizeof(samples_ref));

    for (i = 0; i < SAMPLES; i++) {
        if (samples[i] != samples_ref[i])
            pa_log_debug("%u: %d != %d (%d)", i, samples[i], samples_ref[i], samples_orig[i]);
    }

    /* Check the assembly variant as well instead of only timing it. */
    memcpy(samples, samples_orig, sizeof(samples_orig));
    pa_volume_s16ne_arm(samples, volumes, CHANNELS, sizeof(samples));

    for (i = 0; i < SAMPLES; i++) {
        if (samples[i] != samples_ref[i])
            pa_log_debug("ARM %u: %d != %d (%d)", i, samples[i], samples_ref[i], samples_orig[i]);
    }

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        memcpy(samples, samples_orig, sizeof(samples_orig));
        pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples));
    }
    stop = pa_rtclock_now();
    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        memcpy(samples, samples_orig, sizeof(samples_orig));
        pa_volume_s16ne_arm(samples, volumes, CHANNELS, sizeof(samples));
    }
    stop = pa_rtclock_now();
    pa_log_info("ARM: %llu usec.", (long long unsigned int)(stop - start));

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        memcpy(samples_ref, samples_orig, sizeof(samples_orig));
        pa_volume_s16ne_c(samples_ref, volumes, CHANNELS, sizeof(samples_ref));
    }
    stop = pa_rtclock_now();
    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
}

#endif /* defined(__arm__) */

/* Run both test drivers; they only exist when compiling for ARM. */
int main(void) {
#if defined(__arm__)
    run_test_float();
    run_test_s16();
#else
    pa_log_info("ARM/NEON code paths are not built for this target; nothing to test.");
#endif

    return EXIT_SUCCESS;
}