pa-neon: svolume_neon.c comparison

comparison svolume_neon.c @ 4:1f6289166006

complete

author	Peter Meerwald <p.meerwald@bct-electronic.com>
date	Sun, 08 Jul 2012 21:03:41 +0200
parents	b829afbea564
children	07763f536182

comparison

equal deleted inserted replaced

-:e889fd0e7769
+:1f6289166006
 PA_SAMPLE_S16LE,
 PA_SAMPLE_FLOAT32LE,
 } pa_sample_format_t;
 #define PA_SAMPLE_S16NE PA_SAMPLE_S16LE
 #define PA_SAMPLE_FLOAT32NE PA_SAMPLE_FLOAT32LE
-typedef struct {
-pa_sample_format_t *format;
-} pa_remap_t;
-typedef void (*pa_remap_func_t)(pa_remap_t *m, void *dst, const void *src, unsigned n);
 typedef long long unsigned int pa_usec_t;
 #define pa_assert(x) assert(x)
 #define pa_assert_not_reached() assert(0)
 gettimeofday(&tv, NULL);
 return tv.tv_sec * 1000000ULL + tv.tv_usec;
 }
-void pa_volume_s16ne_c(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
+static void pa_volume_s16ne_c(int16_t *samples, const uint32_t *volumes, unsigned channels, unsigned length) {
 unsigned channel;
 length /= sizeof(int16_t);
 for (channel = 0; length; length--) {
 if (PA_UNLIKELY(++channel >= channels))
 channel = 0;
 }
 }
-void pa_volume_float32ne_c(float *samples, float *volumes, unsigned channels, unsigned length) {
+static void pa_volume_float32ne_c(float *samples, const float *volumes, unsigned channels, unsigned length) {
 unsigned channel;
 length /= sizeof(float);
 for (channel = 0; length; length--) {
 if (PA_UNLIKELY(++channel >= channels))
 channel = 0;
 }
 }
-/*
-void pa_volume_s16ne_orc(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
-{
-if (channels == 2) {
-int64_t v = (int64_t)volumes[1] << 32 | volumes[0];
-pa_volume_s16ne_orc_2ch (samples, v, ((length / (sizeof(int16_t))) / 2));
-} else if (channels == 1)
-pa_volume_s16ne_orc_1ch (samples, volumes[0], length / (sizeof(int16_t)));
-}
-*/
 #if defined(__arm__)
 #include "arm_neon.h"
 " subs  r0, r6, %2              \n\t" \
 " itt cs                        \n\t" \
 " addcs r0, %1                  \n\t" \
 " movcs r6, r0                  \n\t"
-static void pa_volume_s16ne_arm(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
+static void pa_volume_s16ne_arm(int16_t *samples, const uint32_t *volumes, unsigned channels, unsigned length) {
-int32_t *ve;
 /* Channels must be at least 4, and always a multiple of the original number.
 * This is also the max amount we overread the volume array, which should
 * have enough padding. */
-channels = channels == 3 ? 6 : PA_MAX (4U, channels);
+channels = channels == 3 ? 6 : PA_MAX(4U, channels);
-ve = volumes + channels;
+const uint32_t *ve = volumes + channels;
 __asm__ __volatile__ (
 " mov r6, %1                      \n\t"
 " mov %3, %3, LSR #1              \n\t" /* length /= sizeof (int16_t) */
 " tst %3, #1                      \n\t" /* check for odd samples */
 :
 : "r6", "r5", "r4", "r3", "r2", "r1", "r0", "cc"
 );
 }
-static inline void vol_s16ne_neon(int32x4_t vol4, int16_t *samples, unsigned length) {
+static inline void vol_s16_neon(const uint32x4_t *vol4, int16_t *samples, unsigned length) {
-unsigned i;
+asm volatile (
-int16x4_t hi = vshrn_n_s32(vol4, 16);
+"mov        %[length], %[length], lsr #2\n\t"
-int32x4_t lo = vandq_s32(vol4, vdupq_n_s32(0xFFFF));
+"vld1.s32   {q0}, [%[vol]]\n\t"
+"vshl.u32   q3, q0, #16\n\t" /* lo */
-for (i = 0; i < length/8; i++) {
+"vshrn.s32  d1, q0, #16\n\t" /* hi */
-int16x4_t v1 = ((int16x4_t *) samples)[2*i];
+"vshr.u32   q3, q3, #16\n\t"
-int16x4_t v2 = ((int16x4_t *) samples)[2*i+1];
+"1:\n\t"
+"vld1.16	{d0}, [%[samples]]\n\t"
-int32x4_t t1 = vmull_s16(v1, hi);
-int32x4_t t2 = vmull_s16(v2, hi);
+"vmull.s16  q1, d0, d1\n\t"
-int16x4_t r1 = vqmovn_s32(vsraq_n_s32(t1, vmulq_s32(vmovl_s16(v1), lo), 16));
+"vmovl.s16  q2, d0\n\t"
-int16x4_t r2 = vqmovn_s32(vsraq_n_s32(t2, vmulq_s32(vmovl_s16(v2), lo), 16));
+"vmul.s32   q2, q2, q3\n\t"
-((int16x8_t *)samples)[i] = vcombine_s16(r1, r2);
+"vsra.s32   q1, q2, #16\n\t"
-}
+"vmovn.s32  d0, q1\n\t"
-}
+"subs       %[length], %[length], #1\n\t"
-void pa_volume_s16ne_neon(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
+"vst1.16	{d0}, [%[samples]]!\n\t"
+"bgt        1b\n\t"
+/* output operands (or input operands that get modified) */
+: [samples] "+r" (samples), [length] "+r" (length)
+: [vol] "r" (vol4) /* input operands */
+: "memory", "cc", "q0", "q1", "q2", "q3" /* clobber list */
+);
+}
+static inline void vol_float_neon(const float32x4_t *vol4, float *samples, unsigned length) {
+asm volatile (
+"mov        %[length], %[length], lsr #2\n\t"
+"vld1.32    {q1}, [%[vol]]\n\t"
+"1:\n\t"
+"vld1.32	{q0}, [%[samples]]\n\t"
+"vmul.f32   q0, q0, q1\n\t"
+"subs       %[length], %[length], #1\n\t"
+"vst1.32	{q0}, [%[samples]]!\n\t"
+"bgt        1b\n\t"
+/* output operands (or input operands that get modified) */
+: [samples] "+r" (samples), [length] "+r" (length)
+: [vol] "r" (vol4) /* input operands */
+: "memory", "cc", "q0", "q1" /* clobber list */
+);
+}
+static void pa_volume_s16ne_neon(int16_t *samples, const uint32_t *volumes, unsigned channels, unsigned length) {
 unsigned channel = 0, i;
-int32x4_t vol4;
+uint32x4_t vol4;
 length /= sizeof(int16_t);
 switch (channels) {
 case 1:
-vol4 = vdupq_n_s32(*volumes);
+vol4 = vdupq_n_u32(*volumes);
-vol_s16ne_neon(vol4, samples, length);
+vol_s16_neon(&vol4, samples, length);
-for (i = length & ~7; i < length; i++) {
+for (i = length & ~3; i < length; i++) {
 int32_t t = samples[i];
-t = ((t * (*volumes & 0xFFFF)) >> 16) + (t * (*volumes >> 16));
+t = ((int32_t) (t * (*volumes & 0xFFFF)) >> 16) + (t * (*volumes >> 16));
 samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
 }
 break;
 case 2:
-vol4 = vcombine_s32(*(int32x2_t *)volumes, *(int32x2_t *)volumes);
+vol4 = vcombine_u32(*(uint32x2_t *)volumes, *(uint32x2_t *)volumes);
-vol_s16ne_neon(vol4, samples, length);
+vol_s16_neon(&vol4, samples, length);
-for (i = length & ~7; i < length; i++) {
+for (i = length & ~3; i < length; i++) {
 int32_t t = samples[i];
-int32_t vol = volumes[(channel++) & 1];
+uint32_t vol = volumes[(channel++) & 1];
-t = ((t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16));
+t = ((int32_t) (t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16));
 samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
 }
 break;
 case 4:
-vol4 = *(int32x4_t *)volumes;
+vol4 = *(uint32x4_t *)volumes;
-vol_s16ne_neon(vol4, samples, length);
+vol_s16_neon(&vol4, samples, length);
-for (i = length & ~7; i < length; i++) {
+for (i = length & ~3; i < length; i++) {
 int32_t t = samples[i];
-int32_t vol = volumes[(channel++) & 3];
+uint32_t vol = volumes[(channel++) & 3];
-t = ((t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16));
+t = ((int32_t) (t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16));
 samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
 }
 break;
 default:
 for (; length; length--) {
-int32_t t, hi, lo;
+int32_t t;
+uint32_t hi, lo;
 /* Multiplying the 32-bit volume factor with the 16-bit
 * sample might result in a 48-bit value. We want to
 * do without 64-bit integers and hence do the
 * multiplication independently for the HI and LO parts
 hi = volumes[channel] >> 16;
 lo = volumes[channel] & 0xFFFF;
 t = (int32_t)(*samples);
-t = ((t * lo) >> 16) + (t * hi);
+t = ((int32_t) (t * lo) >> 16) + (t * hi);
 t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
 *samples++ = (int16_t) t;
 if (PA_UNLIKELY(++channel >= channels))
 channel = 0;
 }
 break;
 }
 }
-void pa_volume_float32ne_neon(float *samples, float *volumes, unsigned channels, unsigned length) {
+static void pa_volume_float32ne_neon(float *samples, const float *volumes, unsigned channels, unsigned length) {
 unsigned channel = 0, i;
 float32x4_t vol4;
 length /= sizeof(float);
 switch (channels) {
 case 1:
 vol4 = vdupq_n_f32(*volumes);
-for (i = 0; i < length/4; i++) {
+vol_float_neon(&vol4, samples, length);
-((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4);
-}
 for (i = length & ~3; i < length; i++) {
 samples[i] *= volumes[0];
 }
 break;
 case 2:
 vol4 = vcombine_f32(*(float32x2_t *)volumes, *(float32x2_t *)volumes);
-for (i = 0; i < length/4; i++) {
+vol_float_neon(&vol4, samples, length);
-((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4);
-}
 for (i = length & ~3; i < length; i++) {
 samples[i] *= volumes[channel];
 if (PA_UNLIKELY(++channel >= channels))
 channel = 0;
 }
 break;
 case 4:
 vol4 = *(float32x4_t *)volumes;
-for (i = 0; i < length/4; i++) {
+vol_float_neon(&vol4, samples, length);
-((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4);
-}
 for (i = length & ~3; i < length; i++) {
 samples[i] *= volumes[channel++];
 }
 break;
 break;
 }
 }
 #define SAMPLES 1019
-#define TIMES 3000
+#define TIMES 50000
 #define CHANNELS 4
 #define PADDING 16
 static void run_test_float(void) {
 float floats[SAMPLES];
 for (i = 0; i < CHANNELS; i++)
 volumes[i] = 0.5f * rand() / (float) RAND_MAX;
 pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats));
 pa_volume_float32ne_c(floats_ref, volumes, CHANNELS, sizeof(floats_ref));
 for (i = 0; i < SAMPLES; i++) {
 if (fabsf(floats[i] - floats_ref[i]) > 0.00001) {
 pa_log_debug("%d: %.3f != %.3f (%.3f)", i, floats[i], floats_ref[i],
 floats_orig[i]);
 }
 static void run_test_s16(void) {
 int16_t samples[SAMPLES];
 int16_t samples_ref[SAMPLES];
 int16_t samples_orig[SAMPLES];
-int32_t volumes[CHANNELS + PADDING];
+uint32_t volumes[CHANNELS + PADDING];
 unsigned i, padding;
 pa_usec_t start, stop;
 pa_log_debug("checking NEON volume_s16ne(%d)", SAMPLES);
 if (abs(samples[i] - samples_ref[i]) > 0) {
 pa_log_debug("%d: %d != %d (%d)", i, samples[i], samples_ref[i],
 samples_orig[i]);
 }
 }
+exit(0);
 start = pa_rtclock_now();
 for (i = 0; i < TIMES; i++) {
 memcpy(samples, samples_orig, sizeof(samples_orig));
 pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples));
 }

Mercurial > hg > pa-neon

comparison svolume_neon.c @ 4:1f6289166006