diff remap_neon.c @ 2:09ee6a01a3d3

new
author Peter Meerwald <p.meerwald@bct-electronic.com>
date Wed, 04 Jul 2012 15:24:08 +0200
parents b829afbea564
children e889fd0e7769
line wrap: on
line diff
--- a/remap_neon.c	Fri Apr 20 14:26:14 2012 +0200
+++ b/remap_neon.c	Wed Jul 04 15:24:08 2012 +0200
@@ -188,8 +188,6 @@
     }
 }
 
-
-
 static void remap_stereo_to_mono_c(pa_remap_t *m, void *dst, const void *src, unsigned n) {
     unsigned i;
 
@@ -235,115 +233,179 @@
     }
 }
 
-
 #if defined(__arm__)
 
 #include "arm_neon.h"
 
-void remap_mono_to_stereo_neon(pa_remap_t *m, void *dst, const void *src, unsigned n) {
-    unsigned i;
+static void mono_to_stereo_float_neon_a8(float *dst, const float *src, unsigned n) {
+    int i = n & 3;
+    
+    asm volatile (
+    "mov        %[n], %[n], lsr #2\n\t"
+    "1:\n\t"
+    "vld1.32    {q0}, [%[src]]!\n\t"
+    "vmov       q1, q0\n\t"
+    "subs       %[n], %[n], #1\n\t"
+    "vst2.32    {q0,q1}, [%[dst]]!\n\t"
+    "bgt        1b\n\t"
+      // output operands (or input operands that get modified)
+    : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) 
+    : // input operands
+    : "memory", "cc" // clobber list
+    );
+
+    while (i--) {
+        dst[0] = dst[1] = src[0];
+        src++;
+        dst += 2;
+    }
+}
+
+static void mono_to_stereo_float_neon_a9(float *dst, const float *src, unsigned n) {
+    int i = n & 1;
+
+    asm volatile (
+    "mov        %[n], %[n], lsr #1\n\t"
+    "1:\n\t"
+    "ldm        %[src]!, {r4,r6}\n\t"
+    "mov        r5, r4\n\t"
+    "mov        r7, r6\n\t"
+    "subs       %[n], %[n], #1\n\t"
+    "stm        %[dst]!, {r4-r7}\n\t"
+    "bgt        1b\n\t"
+      // output operands (or input operands that get modified)
+    : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
+    : // input operands
+    : "memory", "cc", "r4", "r5", "r6", "r7" // clobber list
+    );
+
+    while (i--) {
+        dst[0] = dst[1] = src[0];
+        src++;
+        dst += 2;
+    }
+}
+
+static void mono_to_stereo_int16_neon(int16_t *dst, const int16_t *src, unsigned n) {
+    int i = n & 7;
+    
+    asm volatile (
+    "mov        %[n], %[n], lsr #3\n\t"
+    "1:\n\t"
+    "vld1.16	{q0}, [%[src]]!\n\t"
+    "vmov	    q1, q0\n\t"
+    "subs	    %[n], %[n], #1\n\t"
+    "vst2.16	{q0,q1}, [%[dst]]!\n\t"
+    "bgt	    1b\n\t"
+      // output operands (or input operands that get modified)
+    : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) 
+    : // input operands
+    : "memory", "cc" // clobber list
+    );
+
+    while (i--) {
+        dst[0] = dst[1] = src[0];
+        src++;
+        dst += 2;
+    }
+}
+
+static void remap_mono_to_stereo_neon_a9(pa_remap_t *m, void *dst, const void *src, unsigned n) {
     switch (*m->format) {
         case PA_SAMPLE_FLOAT32NE:
-        {
-            float *d = (float *) dst, *s = (float *) src;
-            
-            for (i = 0; i < n/4; i++) {
-                float32x4x2_t stereo; 
-                stereo.val[0] = vld1q_f32(s);
-                stereo.val[1] = stereo.val[0];
-                vst2q_f32(d, stereo);
-                s += 4;
-                d += 8;
-            }
-            
-            for (i = n & ~3; i < n; i++) {
-                d[0] = d[1] = s[0];
-                s++;
-                d += 2;
-            }
+            mono_to_stereo_float_neon_a9(dst, src, n);
             break;
-        }
         case PA_SAMPLE_S16NE:
-        {
-            int16_t *d = (int16_t *) dst, *s = (int16_t *) src;
-
-            for (i = 0; i < n/8; i++) {
-                int16x8x2_t stereo; 
-                stereo.val[0] = vld1q_s16(s);
-                stereo.val[1] = stereo.val[0];
-                vst2q_s16(d, stereo);
-                s += 8;
-                d += 16;
-            }
-
-            for (i = n & ~7; i < n; i++) {
-                d[0] = d[1] = s[0];
-                s++;
-                d += 2;
-            }
+            mono_to_stereo_int16_neon(dst, src, n);
             break;
-        }
         default:
             pa_assert_not_reached();
     }
 }
 
-/* don't inline, causes ICE, see https://bugs.launchpad.net/bugs/936863 */
-static __attribute__ ((noinline)) void stereo_to_mono_float(float *d, const float *s, unsigned n) {
-    unsigned i;
-
-    for (i = 0; i < n/4; i++) {
-        float32x4x2_t stereo = vld2q_f32(s);
-        float32x4_t mono = vaddq_f32(stereo.val[0], stereo.val[1]);
-        vst1q_f32(d, mono);
-        s += 8;
-        d += 4;
-    }
-    for (i = n & ~3; i < n; i++) {
-        d[0] = s[0] + s[1];
-        s += 2;
-        d++;
+static void remap_mono_to_stereo_neon_a8(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+    switch (*m->format) {
+        case PA_SAMPLE_FLOAT32NE:
+            mono_to_stereo_float_neon_a8(dst, src, n);
+            break;
+        case PA_SAMPLE_S16NE:
+            mono_to_stereo_int16_neon(dst, src, n);
+            break;
+        default:
+            pa_assert_not_reached();
     }
 }
 
-/* don't inline, causes ICE, see https://bugs.launchpad.net/bugs/936863 */
-static __attribute__ ((noinline)) void stereo_to_mono_int16(int16_t *d, const int16_t *s, unsigned n) {
-    unsigned int i;
+static void stereo_to_mono_float_neon(float *dst, const float *src, unsigned n) {
+    int i = n & 3;
+
+    asm volatile (
+    "mov        %[n], %[n], lsr #2\n\t"
+    "1:\n\t"
+    "vld2.32    {q0,q1}, [%[src]]!\n\t"
+    "vadd.f32    q0, q0, q1\n\t"
+    "subs       %[n], %[n], #1\n\t"
+    "vst1.32    {q0}, [%[dst]]!\n\t"
+    "bgt        1b\n\t"
+      // output operands (or input operands that get modified)
+    : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) 
+    : // input operands
+    : "memory", "cc" // clobber list
+    );
 
-    for (i = 0; i < n/8; i++) {
-        int16x8x2_t stereo = vld2q_s16(s);
-        int16x8_t mono = vaddq_s16(stereo.val[0], stereo.val[1]);
-        vst1q_s16(d, mono);
-        s += 16;
-        d += 8;
+    while (i--) {
+        dst[0] = src[0] + src[1];
+        src += 2;
+        dst++;
     }
-    for (i = n & ~7; i < n; i++) {
-        d[0] = s[0] + s[1];
-        s += 2;
-        d++;
+}
+
+static void stereo_to_mono_int16_neon(int16_t *dst, const int16_t *src, unsigned n) {
+    int i = n & 7;
+
+    asm volatile (
+    "mov        %[n], %[n], lsr #3\n\t"
+    "1:\n\t"
+    "vld2.16    {q0,q1}, [%[src]]!\n\t"
+    "vadd.s16    q0, q0, q1\n\t"
+    "subs       %[n], %[n], #1\n\t"
+    "vst1.16    {q0}, [%[dst]]!\n\t"
+    "bgt        1b\n\t"
+      // output operands (or input operands that get modified)
+    : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) 
+    : // input operands
+    : "memory", "cc" // clobber list
+    );
+
+   while (i--) {
+        dst[0] = src[0] + src[1];
+        src += 2;
+        dst++;
     }
 }
 
 static void remap_stereo_to_mono_neon(pa_remap_t *m, void *dst, const void *src, unsigned n) {
     switch (*m->format) {
         case PA_SAMPLE_FLOAT32NE:
-            stereo_to_mono_float(dst, src, n);
+            stereo_to_mono_float_neon(dst, src, n);
             break;
         case PA_SAMPLE_S16NE:
-            stereo_to_mono_int16(dst, src, n);
+            stereo_to_mono_int16_neon(dst, src, n);
             break;
         default:
             pa_assert_not_reached();
     }
 }
+
 #define SAMPLES 1019
-#define TIMES 10000
+#define TIMES 500000
 
 static void run_test_mono_to_stereo_float(void) {
-    float stereo[2*SAMPLES];
+    float stereo_a9[2*SAMPLES];
+    float stereo_a8[2*SAMPLES];
     float stereo_ref[2*SAMPLES];
     float stereo_gen[2*SAMPLES];
-    float mono[SAMPLES];
+    float mono[SAMPLES]; 
     int i;
     pa_usec_t start, stop;
     pa_sample_format_t sf;
@@ -353,7 +415,9 @@
     pa_log_debug("checking NEON remap_mono_to_stereo(float, %d)", SAMPLES);
 
     memset(stereo_ref, 0, sizeof(stereo_ref));
-    memset(stereo, 0, sizeof(stereo));
+    memset(stereo_gen, 0, sizeof(stereo_gen));
+    memset(stereo_a9, 0, sizeof(stereo_a9));
+    memset(stereo_a8, 0, sizeof(stereo_a8));
 
     for (i = 0; i < SAMPLES; i++) {
         mono[i] = rand()/(float) RAND_MAX - 0.5f;
@@ -370,43 +434,57 @@
     remap.map_table_f[0][0] = 1.0;
     remap.map_table_f[1][0] = 1.0;
 
+    remap_mono_to_stereo_neon_a9(&remap, stereo_a9, mono, SAMPLES);
+    remap_mono_to_stereo_neon_a8(&remap, stereo_a8, mono, SAMPLES);
     remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
     remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES);
-    remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES);
 
     for (i = 0; i < 2*SAMPLES; i++) {
-        if (fabsf(stereo[i] - stereo_ref[i]) > 0.00001) {
-            pa_log_debug("%d: %.3f != %.3f (%.3f)", i, stereo[i], stereo_ref[i],
+        if (fabsf(stereo_a9[i] - stereo_ref[i]) > 0.00001) {
+            pa_log_debug("NEON/A9 %d: %.3f != %.3f (%.3f)", i, stereo_a9[i], stereo_ref[i],
                       mono[i/2]);
         }
     }
     for (i = 0; i < 2*SAMPLES; i++) {
-        if (fabsf(stereo[i] - stereo_gen[i]) > 0.00001) {
-            pa_log_debug("%d: %.3f != %.3f (%.3f)", i, stereo[i], stereo_gen[i],
+        if (fabsf(stereo_a8[i] - stereo_ref[i]) > 0.00001) {
+            pa_log_debug("NEON/A8 %d: %.3f != %.3f (%.3f)", i, stereo_a8[i], stereo_ref[i],
+                      mono[i/2]);
+        }
+    }
+    for (i = 0; i < 2*SAMPLES; i++) {
+        if (fabsf(stereo_gen[i] - stereo_ref[i]) > 0.00001) {
+            pa_log_debug("generic %d: %.3f != %.3f (%.3f)", i, stereo_gen[i], stereo_ref[i],
                       mono[i/2]);
         }
     }
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
-        remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES);
+        remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
     }
     stop = pa_rtclock_now();
-    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+    pa_log_info("ref:\t\t%llu usec.", (long long unsigned int)(stop - start));
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
-        remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
+        remap_mono_to_stereo_neon_a9(&remap, stereo_a9, mono, SAMPLES);
     }
     stop = pa_rtclock_now();
-    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+    pa_log_info("NEON/A9:\t%llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        remap_mono_to_stereo_neon_a8(&remap, stereo_a8, mono, SAMPLES);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON/A8:\t%llu usec.", (long long unsigned int)(stop - start));
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
         remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES);
     }
     stop = pa_rtclock_now();
-    pa_log_info("generic: %llu usec.", (long long unsigned int)(stop - start));
+    pa_log_info("generic:\t%llu usec.", (long long unsigned int)(stop - start));
 }
 
 static void run_test_stereo_to_mono_float(void) {
@@ -456,25 +534,26 @@
         remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES);
     }
     stop = pa_rtclock_now();
-    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+    pa_log_info("NEON:\t\t%llu usec.", (long long unsigned int)(stop - start));
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
         remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES);
     }
     stop = pa_rtclock_now();
-    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+    pa_log_info("ref:\t\t%llu usec.", (long long unsigned int)(stop - start));
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
         remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES);
     }
     stop = pa_rtclock_now();
-    pa_log_info("generic: %llu usec.", (long long unsigned int)(stop - start));
+    pa_log_info("generic:\t%llu usec.", (long long unsigned int)(stop - start));
 }
 
 static void run_test_mono_to_stereo_s16(void) {
-    int16_t stereo[2*SAMPLES];
+    int16_t stereo_a9[2*SAMPLES];
+    int16_t stereo_a8[2*SAMPLES];
     int16_t stereo_ref[2*SAMPLES];
     int16_t stereo_gen[2*SAMPLES];
     int16_t mono[SAMPLES];
@@ -487,7 +566,9 @@
     pa_log_debug("checking NEON remap_mono_to_stereo(s16, %d)", SAMPLES);
 
     memset(stereo_ref, 0, sizeof(stereo_ref));
-    memset(stereo, 0, sizeof(stereo));
+    memset(stereo_a9, 0, sizeof(stereo_a9));
+    memset(stereo_a8, 0, sizeof(stereo_a8));
+    memset(stereo_gen, 0, sizeof(stereo_gen));
 
     for (i = 0; i < SAMPLES; i++) {
         mono[i] = rand() - RAND_MAX/2;
@@ -501,47 +582,61 @@
     oss.channels = 2;
     remap.i_ss = &iss;
     remap.o_ss = &oss;
-    remap.map_table_f[0][0] = 1.0;
-    remap.map_table_f[1][0] = 1.0;
+    remap.map_table_i[0][0] = 0x10000;
+    remap.map_table_i[1][0] = 0x10000;
     
     remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
     remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES);
-    remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES);
+    remap_mono_to_stereo_neon_a9(&remap, stereo_a9, mono, SAMPLES);
+    remap_mono_to_stereo_neon_a8(&remap, stereo_a8, mono, SAMPLES);
     
     for (i = 0; i < 2*SAMPLES; i++) {
-        if (abs(stereo[i] - stereo_ref[i]) > 0) {
-            pa_log_debug("%d: %d != %d (%d)", i, stereo[i], stereo_ref[i],
+        if (abs(stereo_a9[i] - stereo_ref[i]) > 0) {
+            pa_log_debug("NEON/A9 %d: %d != %d (%d)", i, stereo_a9[i], stereo_ref[i],
+                      mono[i/2]);
+        }
+    }
+    for (i = 0; i < 2*SAMPLES; i++) {
+        if (abs(stereo_a8[i] - stereo_ref[i]) > 0) {
+            pa_log_debug("NEON/A8 %d: %d != %d (%d)", i, stereo_a8[i], stereo_ref[i],
                       mono[i/2]);
         }
     }
 
     for (i = 0; i < 2*SAMPLES; i++) {
-        if (abs(stereo[i] - stereo_gen[i]) > 0) {
-            pa_log_debug("%d: %d != %d (%d)", i, stereo[i], stereo_gen[i],
+        if (abs(stereo_gen[i] - stereo_ref[i]) > 0) {
+            pa_log_debug("generic %d: %d != %d (%d)", i, stereo_gen[i], stereo_ref[i],
                       mono[i/2]);
         }
     }
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
-        remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES);
+        remap_mono_to_stereo_neon_a9(&remap, stereo_a9, mono, SAMPLES);
     }
     stop = pa_rtclock_now();
-    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+    pa_log_info("NEON/A9:\t%llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        remap_mono_to_stereo_neon_a8(&remap, stereo_a8, mono, SAMPLES);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON/A8:\t%llu usec.", (long long unsigned int)(stop - start));
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
         remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
     }
     stop = pa_rtclock_now();
-    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+    pa_log_info("ref:\t\t%llu usec.", (long long unsigned int)(stop - start));
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
         remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES);
     }
     stop = pa_rtclock_now();
-    pa_log_info("generic: %llu usec.", (long long unsigned int)(stop - start));
+    pa_log_info("generic:\t%llu usec.", (long long unsigned int)(stop - start));
 }
 
 static void run_test_stereo_to_mono_s16(void) {
@@ -558,6 +653,7 @@
     pa_log_debug("checking NEON remap_stereo_to_mono(s16, %d)", SAMPLES);
 
     memset(mono_ref, 0, sizeof(mono_ref));
+    memset(mono_gen, 0, sizeof(mono_gen));
     memset(mono, 0, sizeof(mono));
 
     for (i = 0; i < 2*SAMPLES; i++) {
@@ -572,8 +668,8 @@
     oss.channels = 1;
     remap.i_ss = &iss;
     remap.o_ss = &oss;
-    remap.map_table_f[0][0] = 1.0;
-    remap.map_table_f[0][1] = 1.0;
+    remap.map_table_i[0][0] = 0x10000;
+    remap.map_table_i[0][1] = 0x10000;
     
     remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES);
     remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES);
@@ -597,34 +693,31 @@
         remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES);
     }
     stop = pa_rtclock_now();
-    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+    pa_log_info("NEON:\t\t%llu usec.", (long long unsigned int)(stop - start));
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
         remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES);
     }
     stop = pa_rtclock_now();
-    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+    pa_log_info("ref:\t\t%llu usec.", (long long unsigned int)(stop - start));
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
         remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES);
     }
     stop = pa_rtclock_now();
-    pa_log_info("generic: %llu usec.", (long long unsigned int)(stop - start));
+    pa_log_info("generic:\t%llu usec.", (long long unsigned int)(stop - start));
 }
 
-
 #endif /* defined(__arm__) */
 
 int main() {
-
     run_test_stereo_to_mono_float();
     run_test_stereo_to_mono_s16();
 
     run_test_mono_to_stereo_float();
     run_test_mono_to_stereo_s16();
 
-
     return EXIT_SUCCESS;
 }

Repositories maintained by Peter Meerwald, pmeerw@pmeerw.net.