# HG changeset patch
# User Peter Meerwald <p.meerwald@bct-electronic.com>
# Date 1341774221 -7200
# Node ID 1f6289166006786b83c76ae4937aacb58ba7edd4
# Parent  e889fd0e7769b6b0bc6aa645dc3417c354ef9724
complete

diff -r e889fd0e7769 -r 1f6289166006 compile.sh
--- a/compile.sh	Thu Jul 05 17:31:56 2012 +0200
+++ b/compile.sh	Sun Jul 08 21:03:41 2012 +0200
@@ -14,6 +14,8 @@
 
 $CC \
     -o svolume_neon svolume_neon.c -lm
+$CC \
+    -S -o uuu.s svolume_neon.c -lm
 
 $CC \
     -S -o asdf.s remap_neon.c -lm
diff -r e889fd0e7769 -r 1f6289166006 svolume_neon.c
--- a/svolume_neon.c	Thu Jul 05 17:31:56 2012 +0200
+++ b/svolume_neon.c	Sun Jul 08 21:03:41 2012 +0200
@@ -19,10 +19,6 @@
 } pa_sample_format_t;
 #define PA_SAMPLE_S16NE PA_SAMPLE_S16LE
 #define PA_SAMPLE_FLOAT32NE PA_SAMPLE_FLOAT32LE
-typedef struct {
-    pa_sample_format_t *format;
-} pa_remap_t;
-typedef void (*pa_remap_func_t)(pa_remap_t *m, void *dst, const void *src, unsigned n);
 typedef long long unsigned int pa_usec_t;
 
 #define pa_assert(x) assert(x)
@@ -56,7 +52,7 @@
     return tv.tv_sec * 1000000ULL + tv.tv_usec;
 }
 
-void pa_volume_s16ne_c(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
+static void pa_volume_s16ne_c(int16_t *samples, const uint32_t *volumes, unsigned channels, unsigned length) {
     unsigned channel;
 
     length /= sizeof(int16_t);
@@ -83,7 +79,7 @@
     }
 }
 
-void pa_volume_float32ne_c(float *samples, float *volumes, unsigned channels, unsigned length) {
+static void pa_volume_float32ne_c(float *samples, const float *volumes, unsigned channels, unsigned length) {
     unsigned channel;
 
     length /= sizeof(float);
@@ -96,17 +92,6 @@
     }
 }
 
-/*
-void pa_volume_s16ne_orc(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
-{
-    if (channels == 2) {
-        int64_t v = (int64_t)volumes[1] << 32 | volumes[0];
-        pa_volume_s16ne_orc_2ch (samples, v, ((length / (sizeof(int16_t))) / 2));
-    } else if (channels == 1)
-        pa_volume_s16ne_orc_1ch (samples, volumes[0], length / (sizeof(int16_t)));
-}
-*/
-
 #if defined(__arm__)
 
 #include "arm_neon.h"
@@ -117,14 +102,12 @@
     " addcs r0, %1                  \n\t" \
     " movcs r6, r0                  \n\t"
 
-static void pa_volume_s16ne_arm(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
-    int32_t *ve;
-
+static void pa_volume_s16ne_arm(int16_t *samples, const uint32_t *volumes, unsigned channels, unsigned length) {
     /* Channels must be at least 4, and always a multiple of the original number.
      * This is also the max amount we overread the volume array, which should
      * have enough padding. */
-    channels = channels == 3 ? 6 : PA_MAX (4U, channels);
-    ve = volumes + channels;
+    channels = channels == 3 ? 6 : PA_MAX(4U, channels);
+    const uint32_t *ve = volumes + channels;
 
     __asm__ __volatile__ (
         " mov r6, %1                      \n\t"
@@ -198,67 +181,94 @@
     );
 }
 
-static inline void vol_s16ne_neon(int32x4_t vol4, int16_t *samples, unsigned length) {
-    unsigned i;
-    int16x4_t hi = vshrn_n_s32(vol4, 16);
-    int32x4_t lo = vandq_s32(vol4, vdupq_n_s32(0xFFFF));
-    
-    for (i = 0; i < length/8; i++) {
-        int16x4_t v1 = ((int16x4_t *) samples)[2*i];
-        int16x4_t v2 = ((int16x4_t *) samples)[2*i+1];
+static inline void vol_s16_neon(const uint32x4_t *vol4, int16_t *samples, unsigned length) {
+    asm volatile (
+    "mov        %[length], %[length], lsr #2\n\t"
+    "vld1.s32   {q0}, [%[vol]]\n\t"
+    "vshl.u32   q3, q0, #16\n\t" /* lo */
+    "vshrn.s32  d1, q0, #16\n\t" /* hi */
+    "vshr.u32   q3, q3, #16\n\t"
+    "1:\n\t"
+    "vld1.16	{d0}, [%[samples]]\n\t"
+
+    "vmull.s16  q1, d0, d1\n\t"
 
-        int32x4_t t1 = vmull_s16(v1, hi);
-        int32x4_t t2 = vmull_s16(v2, hi);
+    "vmovl.s16  q2, d0\n\t"
+    "vmul.s32   q2, q2, q3\n\t"
+
+    "vsra.s32   q1, q2, #16\n\t"
+    "vmovn.s32  d0, q1\n\t"
 
-        int16x4_t r1 = vqmovn_s32(vsraq_n_s32(t1, vmulq_s32(vmovl_s16(v1), lo), 16));
-        int16x4_t r2 = vqmovn_s32(vsraq_n_s32(t2, vmulq_s32(vmovl_s16(v2), lo), 16));
-      
-        ((int16x8_t *)samples)[i] = vcombine_s16(r1, r2);
-    }
+    "subs       %[length], %[length], #1\n\t"
+    "vst1.16	{d0}, [%[samples]]!\n\t"
+    "bgt        1b\n\t"
+      /* output operands (or input operands that get modified) */
+    : [samples] "+r" (samples), [length] "+r" (length)
+    : [vol] "r" (vol4) /* input operands */
+    : "memory", "cc", "q0", "q1", "q2", "q3" /* clobber list */
+    );
 }
 
-void pa_volume_s16ne_neon(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
+static inline void vol_float_neon(const float32x4_t *vol4, float *samples, unsigned length) {
+    asm volatile (
+    "mov        %[length], %[length], lsr #2\n\t"
+    "vld1.32    {q1}, [%[vol]]\n\t"
+    "1:\n\t"
+    "vld1.32	{q0}, [%[samples]]\n\t"
+    "vmul.f32   q0, q0, q1\n\t"
+    "subs       %[length], %[length], #1\n\t"
+    "vst1.32	{q0}, [%[samples]]!\n\t"
+    "bgt        1b\n\t"
+      /* output operands (or input operands that get modified) */
+    : [samples] "+r" (samples), [length] "+r" (length)
+    : [vol] "r" (vol4) /* input operands */
+    : "memory", "cc", "q0", "q1" /* clobber list */
+    );
+}
+
+static void pa_volume_s16ne_neon(int16_t *samples, const uint32_t *volumes, unsigned channels, unsigned length) {
     unsigned channel = 0, i;
-    int32x4_t vol4;
+    uint32x4_t vol4;
 
     length /= sizeof(int16_t);
 
     switch (channels) {
         case 1:
-            vol4 = vdupq_n_s32(*volumes);
-            vol_s16ne_neon(vol4, samples, length);
-            
-            for (i = length & ~7; i < length; i++) {
+            vol4 = vdupq_n_u32(*volumes);
+            vol_s16_neon(&vol4, samples, length);
+
+            for (i = length & ~3; i < length; i++) {
                 int32_t t = samples[i];
-                t = ((t * (*volumes & 0xFFFF)) >> 16) + (t * (*volumes >> 16));
+                t = ((int32_t) (t * (*volumes & 0xFFFF)) >> 16) + (t * (*volumes >> 16));
                 samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
             }
             break;
         case 2:
-            vol4 = vcombine_s32(*(int32x2_t *)volumes, *(int32x2_t *)volumes);
-            vol_s16ne_neon(vol4, samples, length);
-            
-            for (i = length & ~7; i < length; i++) {
+            vol4 = vcombine_u32(*(uint32x2_t *)volumes, *(uint32x2_t *)volumes);
+            vol_s16_neon(&vol4, samples, length);
+
+            for (i = length & ~3; i < length; i++) {
                 int32_t t = samples[i];
-                int32_t vol = volumes[(channel++) & 1];
-                t = ((t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16));
+                uint32_t vol = volumes[(channel++) & 1];
+                t = ((int32_t) (t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16));
                 samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
             }
             break;
         case 4:
-            vol4 = *(int32x4_t *)volumes;
-            vol_s16ne_neon(vol4, samples, length);
-            
-            for (i = length & ~7; i < length; i++) {
+            vol4 = *(uint32x4_t *)volumes;
+            vol_s16_neon(&vol4, samples, length);
+
+            for (i = length & ~3; i < length; i++) {
                 int32_t t = samples[i];
-                int32_t vol = volumes[(channel++) & 3];
-                t = ((t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16));
+                uint32_t vol = volumes[(channel++) & 3];
+                t = ((int32_t) (t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16));
                 samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
             }
             break;
         default:
             for (; length; length--) {
-                int32_t t, hi, lo;
+                int32_t t;
+                uint32_t hi, lo;
 
                 /* Multiplying the 32bit volume factor with the 16bit
                  * sample might result in an 48bit value. We want to
@@ -270,7 +280,7 @@
                 lo = volumes[channel] & 0xFFFF;
 
                 t = (int32_t)(*samples);
-                t = ((t * lo) >> 16) + (t * hi);
+                t = ((int32_t) (t * lo) >> 16) + (t * hi);
                 t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
                 *samples++ = (int16_t) t;
 
@@ -281,7 +291,7 @@
     }
 }
 
-void pa_volume_float32ne_neon(float *samples, float *volumes, unsigned channels, unsigned length) {
+static void pa_volume_float32ne_neon(float *samples, const float *volumes, unsigned channels, unsigned length) {
     unsigned channel = 0, i;
     float32x4_t vol4;
 
@@ -290,9 +300,7 @@
     switch (channels) {
         case 1:
             vol4 = vdupq_n_f32(*volumes);
-            for (i = 0; i < length/4; i++) {
-                ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4);
-            }
+            vol_float_neon(&vol4, samples, length);
 
             for (i = length & ~3; i < length; i++) {
                 samples[i] *= volumes[0];
@@ -300,9 +308,7 @@
             break;
         case 2:
             vol4 = vcombine_f32(*(float32x2_t *)volumes, *(float32x2_t *)volumes);
-            for (i = 0; i < length/4; i++) {
-                ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4);
-            }
+            vol_float_neon(&vol4, samples, length);
 
             for (i = length & ~3; i < length; i++) {
                 samples[i] *= volumes[channel];
@@ -313,9 +319,7 @@
             break;
         case 4:
             vol4 = *(float32x4_t *)volumes;
-            for (i = 0; i < length/4; i++) {
-                ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4);
-            }
+            vol_float_neon(&vol4, samples, length);
 
             for (i = length & ~3; i < length; i++) {
                 samples[i] *= volumes[channel++];
@@ -333,7 +337,7 @@
 }
 
 #define SAMPLES 1019
-#define TIMES 3000
+#define TIMES 50000
 #define CHANNELS 4
 #define PADDING 16
 
@@ -358,7 +362,7 @@
 
     pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats));
     pa_volume_float32ne_c(floats_ref, volumes, CHANNELS, sizeof(floats_ref));
-    
+
     for (i = 0; i < SAMPLES; i++) {
         if (fabsf(floats[i] - floats_ref[i]) > 0.00001) {
             pa_log_debug("%d: %.3f != %.3f (%.3f)", i, floats[i], floats_ref[i],
@@ -387,7 +391,7 @@
     int16_t samples[SAMPLES];
     int16_t samples_ref[SAMPLES];
     int16_t samples_orig[SAMPLES];
-    int32_t volumes[CHANNELS + PADDING];
+    uint32_t volumes[CHANNELS + PADDING];
     unsigned i, padding;
     pa_usec_t start, stop;
 
@@ -413,7 +417,7 @@
                       samples_orig[i]);
         }
     }
-
+exit(0);
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
         memcpy(samples, samples_orig, sizeof(samples_orig));