# HG changeset patch # User Peter Meerwald # Date 1341774221 -7200 # Node ID 1f6289166006786b83c76ae4937aacb58ba7edd4 # Parent e889fd0e7769b6b0bc6aa645dc3417c354ef9724 complete diff -r e889fd0e7769 -r 1f6289166006 compile.sh --- a/compile.sh Thu Jul 05 17:31:56 2012 +0200 +++ b/compile.sh Sun Jul 08 21:03:41 2012 +0200 @@ -14,6 +14,8 @@ $CC \ -o svolume_neon svolume_neon.c -lm +$CC \ + -S -o uuu.s svolume_neon.c -lm $CC \ -S -o asdf.s remap_neon.c -lm diff -r e889fd0e7769 -r 1f6289166006 svolume_neon.c --- a/svolume_neon.c Thu Jul 05 17:31:56 2012 +0200 +++ b/svolume_neon.c Sun Jul 08 21:03:41 2012 +0200 @@ -19,10 +19,6 @@ } pa_sample_format_t; #define PA_SAMPLE_S16NE PA_SAMPLE_S16LE #define PA_SAMPLE_FLOAT32NE PA_SAMPLE_FLOAT32LE -typedef struct { - pa_sample_format_t *format; -} pa_remap_t; -typedef void (*pa_remap_func_t)(pa_remap_t *m, void *dst, const void *src, unsigned n); typedef long long unsigned int pa_usec_t; #define pa_assert(x) assert(x) @@ -56,7 +52,7 @@ return tv.tv_sec * 1000000ULL + tv.tv_usec; } -void pa_volume_s16ne_c(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) { +static void pa_volume_s16ne_c(int16_t *samples, const uint32_t *volumes, unsigned channels, unsigned length) { unsigned channel; length /= sizeof(int16_t); @@ -83,7 +79,7 @@ } } -void pa_volume_float32ne_c(float *samples, float *volumes, unsigned channels, unsigned length) { +static void pa_volume_float32ne_c(float *samples, const float *volumes, unsigned channels, unsigned length) { unsigned channel; length /= sizeof(float); @@ -96,17 +92,6 @@ } } -/* -void pa_volume_s16ne_orc(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) -{ - if (channels == 2) { - int64_t v = (int64_t)volumes[1] << 32 | volumes[0]; - pa_volume_s16ne_orc_2ch (samples, v, ((length / (sizeof(int16_t))) / 2)); - } else if (channels == 1) - pa_volume_s16ne_orc_1ch (samples, volumes[0], length / (sizeof(int16_t))); -} -*/ - #if defined(__arm__) #include "arm_neon.h" @@ -117,14 +102,12 @@ " addcs r0, %1 \n\t" \ " movcs r6, r0 \n\t" -static void pa_volume_s16ne_arm(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) { - int32_t *ve; - +static void pa_volume_s16ne_arm(int16_t *samples, const uint32_t *volumes, unsigned channels, unsigned length) { /* Channels must be at least 4, and always a multiple of the original number. * This is also the max amount we overread the volume array, which should * have enough padding. */ - channels = channels == 3 ? 6 : PA_MAX (4U, channels); - ve = volumes + channels; + channels = channels == 3 ? 6 : PA_MAX(4U, channels); + const uint32_t *ve = volumes + channels; __asm__ __volatile__ ( " mov r6, %1 \n\t" @@ -198,67 +181,94 @@ ); } -static inline void vol_s16ne_neon(int32x4_t vol4, int16_t *samples, unsigned length) { - unsigned i; - int16x4_t hi = vshrn_n_s32(vol4, 16); - int32x4_t lo = vandq_s32(vol4, vdupq_n_s32(0xFFFF)); - - for (i = 0; i < length/8; i++) { - int16x4_t v1 = ((int16x4_t *) samples)[2*i]; - int16x4_t v2 = ((int16x4_t *) samples)[2*i+1]; +static inline void vol_s16_neon(const uint32x4_t *vol4, int16_t *samples, unsigned length) { + asm volatile ( + "mov %[length], %[length], lsr #2\n\t" + "vld1.s32 {q0}, [%[vol]]\n\t" + "vshl.u32 q3, q0, #16\n\t" /* lo */ + "vshrn.s32 d1, q0, #16\n\t" /* hi */ + "vshr.u32 q3, q3, #16\n\t" + "1:\n\t" + "vld1.16 {d0}, [%[samples]]\n\t" + + "vmull.s16 q1, d0, d1\n\t" - int32x4_t t1 = vmull_s16(v1, hi); - int32x4_t t2 = vmull_s16(v2, hi); + "vmovl.s16 q2, d0\n\t" + "vmul.s32 q2, q2, q3\n\t" + + "vsra.s32 q1, q2, #16\n\t" + "vmovn.s32 d0, q1\n\t" - int16x4_t r1 = vqmovn_s32(vsraq_n_s32(t1, vmulq_s32(vmovl_s16(v1), lo), 16)); - int16x4_t r2 = vqmovn_s32(vsraq_n_s32(t2, vmulq_s32(vmovl_s16(v2), lo), 16)); - - ((int16x8_t *)samples)[i] = vcombine_s16(r1, r2); - } + "subs %[length], %[length], #1\n\t" + "vst1.16 {d0}, [%[samples]]!\n\t" + "bgt 1b\n\t" + /* output operands (or input operands that get modified) */ + : [samples] "+r" (samples), [length] "+r" (length) + : [vol] "r" (vol4) /* input operands */ + : "memory", "cc", "q0", "q1", "q2", "q3" /* clobber list */ + ); } -void pa_volume_s16ne_neon(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) { +static inline void vol_float_neon(const float32x4_t *vol4, float *samples, unsigned length) { + asm volatile ( + "mov %[length], %[length], lsr #2\n\t" + "vld1.32 {q1}, [%[vol]]\n\t" + "1:\n\t" + "vld1.32 {q0}, [%[samples]]\n\t" + "vmul.f32 q0, q0, q1\n\t" + "subs %[length], %[length], #1\n\t" + "vst1.32 {q0}, [%[samples]]!\n\t" + "bgt 1b\n\t" + /* output operands (or input operands that get modified) */ + : [samples] "+r" (samples), [length] "+r" (length) + : [vol] "r" (vol4) /* input operands */ + : "memory", "cc", "q0", "q1" /* clobber list */ + ); +} + +static void pa_volume_s16ne_neon(int16_t *samples, const uint32_t *volumes, unsigned channels, unsigned length) { unsigned channel = 0, i; - int32x4_t vol4; + uint32x4_t vol4; length /= sizeof(int16_t); switch (channels) { case 1: - vol4 = vdupq_n_s32(*volumes); - vol_s16ne_neon(vol4, samples, length); - - for (i = length & ~7; i < length; i++) { + vol4 = vdupq_n_u32(*volumes); + vol_s16_neon(&vol4, samples, length); + + for (i = length & ~3; i < length; i++) { int32_t t = samples[i]; - t = ((t * (*volumes & 0xFFFF)) >> 16) + (t * (*volumes >> 16)); + t = ((int32_t) (t * (*volumes & 0xFFFF)) >> 16) + (t * (*volumes >> 16)); samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); } break; case 2: - vol4 = vcombine_s32(*(int32x2_t *)volumes, *(int32x2_t *)volumes); - vol_s16ne_neon(vol4, samples, length); - - for (i = length & ~7; i < length; i++) { + vol4 = vcombine_u32(*(uint32x2_t *)volumes, *(uint32x2_t *)volumes); + vol_s16_neon(&vol4, samples, length); + + for (i = length & ~3; i < length; i++) { int32_t t = samples[i]; - int32_t vol = volumes[(channel++) & 1]; - t = ((t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16)); + uint32_t vol = volumes[(channel++) & 1]; + t = ((int32_t) (t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16)); samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); } break; case 4: - vol4 = *(int32x4_t *)volumes; - vol_s16ne_neon(vol4, samples, length); - - for (i = length & ~7; i < length; i++) { + vol4 = *(uint32x4_t *)volumes; + vol_s16_neon(&vol4, samples, length); + + for (i = length & ~3; i < length; i++) { int32_t t = samples[i]; - int32_t vol = volumes[(channel++) & 3]; - t = ((t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16)); + uint32_t vol = volumes[(channel++) & 3]; + t = ((int32_t) (t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16)); samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); } break; default: for (; length; length--) { - int32_t t, hi, lo; + int32_t t; + uint32_t hi, lo; /* Multiplying the 32bit volume factor with the 16bit * sample might result in an 48bit value. We want to @@ -270,7 +280,7 @@ lo = volumes[channel] & 0xFFFF; t = (int32_t)(*samples); - t = ((t * lo) >> 16) + (t * hi); + t = ((int32_t) (t * lo) >> 16) + (t * hi); t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); *samples++ = (int16_t) t; @@ -281,7 +291,7 @@ } } -void pa_volume_float32ne_neon(float *samples, float *volumes, unsigned channels, unsigned length) { +static void pa_volume_float32ne_neon(float *samples, const float *volumes, unsigned channels, unsigned length) { unsigned channel = 0, i; float32x4_t vol4; @@ -290,9 +300,7 @@ switch (channels) { case 1: vol4 = vdupq_n_f32(*volumes); - for (i = 0; i < length/4; i++) { - ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4); - } + vol_float_neon(&vol4, samples, length); for (i = length & ~3; i < length; i++) { samples[i] *= volumes[0]; @@ -300,9 +308,7 @@ break; case 2: vol4 = vcombine_f32(*(float32x2_t *)volumes, *(float32x2_t *)volumes); - for (i = 0; i < length/4; i++) { - ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4); - } + vol_float_neon(&vol4, samples, length); for (i = length & ~3; i < length; i++) { samples[i] *= volumes[channel]; @@ -313,9 +319,7 @@ break; case 4: vol4 = *(float32x4_t *)volumes; - for (i = 0; i < length/4; i++) { - ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4); - } + vol_float_neon(&vol4, samples, length); for (i = length & ~3; i < length; i++) { samples[i] *= volumes[channel++]; @@ -333,7 +337,7 @@ } #define SAMPLES 1019 -#define TIMES 3000 +#define TIMES 50000 #define CHANNELS 4 #define PADDING 16 @@ -358,7 +362,7 @@ pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats)); pa_volume_float32ne_c(floats_ref, volumes, CHANNELS, sizeof(floats_ref)); - + for (i = 0; i < SAMPLES; i++) { if (fabsf(floats[i] - floats_ref[i]) > 0.00001) { pa_log_debug("%d: %.3f != %.3f (%.3f)", i, floats[i], floats_ref[i], @@ -387,7 +391,7 @@ int16_t samples[SAMPLES]; int16_t samples_ref[SAMPLES]; int16_t samples_orig[SAMPLES]; - int32_t volumes[CHANNELS + PADDING]; + uint32_t volumes[CHANNELS + PADDING]; unsigned i, padding; pa_usec_t start, stop; @@ -413,7 +417,7 @@ samples_orig[i]); } } - +exit(0); start = pa_rtclock_now(); for (i = 0; i < TIMES; i++) { memcpy(samples, samples_orig, sizeof(samples_orig));