# HG changeset patch
# User Peter Meerwald <p.meerwald@bct-electronic.com>
# Date 1341502316 -7200
# Node ID e889fd0e7769b6b0bc6aa645dc3417c354ef9724
# Parent  09ee6a01a3d3710416875d4ebca52eceb5b7e505
stuff

diff -r 09ee6a01a3d3 -r e889fd0e7769 compile.sh
--- a/compile.sh	Wed Jul 04 15:24:08 2012 +0200
+++ b/compile.sh	Thu Jul 05 17:31:56 2012 +0200
@@ -1,11 +1,13 @@
 #!/bin/sh
 
 #CC="arm-linux-gnueabi-gcc -Wall -g -O2 -static -marm -fomit-frame-pointer -mcpu=cortex-a9 -mfloat-abi=softfp -mfpu=neon"
-#CC="/opt/arm-2012.03/bin/arm-none-linux-gnueabi-gcc -Wall -g -O3 -static -marm -fomit-frame-pointer -mcpu=cortex-a9 -mfloat-abi=softfp -mfpu=neon"
-CC="/home/pmeerw/openbricks-bct2/build/build.brettl3.eglibc/toolchain/bin/armv7-openbricks-linux-gnueabi-gcc -Wall -g -O3 -static -marm -fomit-frame-pointer -mcpu=cortex-a9 -mfloat-abi=softfp -mfpu=neon"
+CC="/opt/arm-2012.03/bin/arm-none-linux-gnueabi-gcc -Wall -g -O3 -static -marm -fomit-frame-pointer -mcpu=cortex-a9 -mfloat-abi=softfp -mfpu=neon"
+#CC="/home/pmeerw/openbricks-bct2/build/build.brettl3.eglibc/toolchain/bin/armv7-openbricks-linux-gnueabi-gcc -Wall -g -O3 -static -marm -fomit-frame-pointer -mcpu=cortex-a9 -mfloat-abi=softfp -mfpu=neon"
 
 $CC \
     -o sconv_neon sconv_neon.c -lm
+$CC \
+    -S -o bla.s sconv_neon.c -lm
 
 $CC \
     -o remap_neon remap_neon.c xxx.S -lm
diff -r 09ee6a01a3d3 -r e889fd0e7769 remap_neon.c
--- a/remap_neon.c	Wed Jul 04 15:24:08 2012 +0200
+++ b/remap_neon.c	Thu Jul 05 17:31:56 2012 +0200
@@ -713,6 +713,18 @@
 #endif /* defined(__arm__) */
 
 int main() {
+
+/* not in user space
+    unsigned cpuid;
+    asm volatile(
+    "mrc    p15, 0, %[cpuid], c0, c0, 0\n\t"
+    : [cpuid] "=r" (cpuid)
+    :
+    : "cc");
+
+    printf("%08x %03x\n", cpuid,  (cpuid >> 4) & 0xfff);
+*/
+
     run_test_stereo_to_mono_float();
     run_test_stereo_to_mono_s16();
 
diff -r 09ee6a01a3d3 -r e889fd0e7769 sconv_neon.c
--- a/sconv_neon.c	Wed Jul 04 15:24:08 2012 +0200
+++ b/sconv_neon.c	Thu Jul 05 17:31:56 2012 +0200
@@ -41,69 +41,95 @@
 
 #include "arm_neon.h"
 
-void pa_sconv_s16le_from_float32ne(unsigned n, const float *a, int16_t *b) {
-    pa_assert(a);
-    pa_assert(b);
+void pa_sconv_s16le_from_float32ne(unsigned n, const float *src, int16_t *dst) {
+    pa_assert(src);
+    pa_assert(dst);
 
     for (; n > 0; n--) {
-        float v = *(a++);
+        float v = *(src++);
 
         v = PA_CLAMP_UNLIKELY(v, -1.0f, 1.0f);
-        *(b++) = (int16_t) lrintf(v * 0x7FFF);
+        *(dst++) = (int16_t) lrintf(v * 0x7FFF);
     }
 }
 
-void pa_sconv_s16le_from_f32ne_neon(unsigned n, const float *a, int16_t *b) {
-    unsigned i;
-
-    const float32x4_t plusone4 = vdupq_n_f32(1.0f);
-    const float32x4_t minusone4 = vdupq_n_f32(-1.0f);
-    const float32x4_t half4 = vdupq_n_f32(0.5f);
-    const float32x4_t scale4 = vdupq_n_f32(32767.0f);
-    const uint32x4_t mask4 = vdupq_n_u32(0x80000000);
+void pa_sconv_s16le_from_f32ne_neon(unsigned n, const float *src, int16_t *dst) {
+    unsigned i = n & 3;
 
-    for (i = 0; i < n/4; i++) {
-        float32x4_t v4 = ((float32x4_t *)a)[i];
-        v4 = vmulq_f32(vmaxq_f32(vminq_f32(v4, plusone4) , minusone4), scale4);
-
-        const float32x4_t w4 = vreinterpretq_f32_u32(vorrq_u32(vandq_u32(
-                vreinterpretq_u32_f32(v4), mask4), vreinterpretq_u32_f32(half4)));
+    asm volatile (
+    "mov        %[n], %[n], lsr #2\n\t"
+    "vdup.f32   q2, %[plusone]\n\t"
+    "vneg.f32   q3, q2\n\t"
+    "vdup.f32   q4, %[scale]\n\t"
+    "vdup.u32   q5, %[mask]\n\t"
+    "vdup.f32   q6, %[half]\n\t"
+    "1:\n\t"
+    "vld1.32    {q0}, [%[src]]!\n\t"
+    "vmin.f32   q0, q0, q2\n\t" /* clamp */
+    "vmax.f32   q0, q0, q3\n\t"
+    "vmul.f32   q0, q0, q4\n\t" /* scale */
+    "vand.u32   q1, q0, q5\n\t" 
+    "vorr.u32   q1, q1, q6\n\t" /* round */
+    "vadd.f32   q0, q0, q1\n\t"
+    "vcvt.s32.f32 q0, q0\n\t" /* narrow */
+    "vmovn.i32  d0, q0\n\t"
+    "subs       %[n], %[n], #1\n\t"
+    "vst1.16    {d0}, [%[dst]]!\n\t"
+    "bgt        1b\n\t"
+      /* output operands (or input operands that get modified) */
+    : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
+    : [plusone] "r" (1.0f), [scale] "r" (32767.0f),
+      [half] "r" (0.5f), [mask] "r" (0x80000000) /* input operands */
+    : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6" /* clobber list */
+    );
 
-        ((int16x4_t *)b)[i] = vmovn_s32(vcvtq_s32_f32(vaddq_f32(v4, w4)));
-    }
- 
     // leftovers
-    for (i = n & ~3; i < n; i++) {
-        b[i] = (int16_t) lrintf(PA_CLAMP_UNLIKELY(a[i], -1.0f, 1.0f) * 0x7FFF);
+    while (i--) {
+        *dst++ = (int16_t) lrintf(PA_CLAMP_UNLIKELY(*src, -1.0f, 1.0f) * 0x7FFF);
+        src++;
     }
 }
 
-void pa_sconv_s16le_to_float32ne(unsigned n, const int16_t *a, float *b) {
-    pa_assert(a);
-    pa_assert(b);
+void pa_sconv_s16le_to_float32ne(unsigned n, const int16_t *src, float *dst) {
+    pa_assert(src);
+    pa_assert(dst);
 
     for (; n > 0; n--)
-        *(b++) = ((float) (*(a++)))/(float) 0x7FFF;
+        *(dst++) = ((float) (*(src++)))/(float) 0x7FFF;
 }
 
-void pa_sconv_s16le_to_f32ne_neon(unsigned n, const int16_t *a, float *b) {
-    unsigned i;
+void pa_sconv_s16le_to_f32ne_neon(unsigned n, const int16_t *src, float *dst) {
+    unsigned i = n & 3;
+
+    const float invscale = 1.0f / 0x7FFF;
 
-    const float32x4_t invscale4 = vdupq_n_f32(1.0f / 0x7FFF);
+    asm volatile (
+    "mov        %[n], %[n], lsr #2\n\t"
+    "vdup.f32   q1, %[invscale]\n\t"
+    "1:\n\t"
+    "vld1.16    {d0}, [%[src]]!\n\t"
+    "vmovl.s16  q0, d0\n\t"
+    
+    "vcvt.f32.s32 q0, q0\n\t"
+    "vmul.f32   q0, q0, q1\n\t"
 
-    for (i = 0; i < n/4; i++) {
-        ((float32x4_t *)b)[i] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(((int16x4_t *)a)[i])), invscale4);
-    }
+    "subs       %[n], %[n], #1\n\t"
+    "vst1.32    {q0}, [%[dst]]!\n\t"
+    "bgt        1b\n\t"
+      /* output operands (or input operands that get modified) */
+    : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
+    : [invscale] "r" (invscale) /* input operands */
+    : "memory", "cc", "q0", "q1" /* clobber list */
+    );
 
     // leftovers
-    const float invscale = 1.0f / 0x7FFF;
-    for (i = n & ~3; i < n; i++) {
-        b[i] = a[i] * invscale;
+    while (i--) {
+        *dst++ = *src++ * invscale;
     }
 }
 
 #define SAMPLES 1019
-#define TIMES 10000
+#define TIMES 100000
 
 static void run_test_from(void) {
     int16_t samples[SAMPLES];
@@ -111,7 +137,6 @@
     float floats[SAMPLES];
     int i;
     pa_usec_t start, stop;
-    pa_convert_func_t func;
 
     pa_log_debug("checking NEON sconv_s16le_from_float(%d)", SAMPLES);
 
@@ -122,8 +147,7 @@
         floats[i] = 2.1f * (rand()/(float) RAND_MAX - 0.5f);
     }
 
-    func = (pa_convert_func_t) pa_sconv_s16le_from_float32ne;
-    func(SAMPLES, floats, samples_ref);
+    pa_sconv_s16le_from_float32ne(SAMPLES, floats, samples_ref);
     pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats, samples);
 
     for (i = 0; i < SAMPLES; i++) {
@@ -142,7 +166,7 @@
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
-        func(SAMPLES, floats, samples_ref);
+        pa_sconv_s16le_from_float32ne(SAMPLES, floats, samples_ref);
     }
     stop = pa_rtclock_now();
     pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));