changeset 5:07763f536182 default tip

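ALIGNment support: add an ALIGN macro and offset the test buffers by ALIGN elements in the NEON remap, sconv and svolume test routines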
author Peter Meerwald <p.meerwald@bct-electronic.com>
date Sun, 08 Jul 2012 21:48:08 +0200
parents 1f6289166006
children
files remap_neon.c sconv_neon.c svolume_neon.c
diffstat 3 files changed, 105 insertions(+), 104 deletions(-) [+]
line wrap: on
line diff
--- a/remap_neon.c	Sun Jul 08 21:03:41 2012 +0200
+++ b/remap_neon.c	Sun Jul 08 21:48:08 2012 +0200
@@ -239,7 +239,7 @@
 
 static void mono_to_stereo_float_neon_a8(float *dst, const float *src, unsigned n) {
     int i = n & 3;
-    
+
     asm volatile (
     "mov        %[n], %[n], lsr #2\n\t"
     "1:\n\t"
@@ -288,7 +288,7 @@
 
 static void mono_to_stereo_int16_neon(int16_t *dst, const int16_t *src, unsigned n) {
     int i = n & 7;
-    
+
     asm volatile (
     "mov        %[n], %[n], lsr #3\n\t"
     "1:\n\t"
@@ -298,7 +298,7 @@
     "vst2.16	{q0,q1}, [%[dst]]!\n\t"
     "bgt	    1b\n\t"
       // output operands (or input operands that get modified)
-    : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) 
+    : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
     : // input operands
     : "memory", "cc" // clobber list
     );
@@ -399,13 +399,14 @@
 
 #define SAMPLES 1019
 #define TIMES 500000
+#define ALIGN 1
 
 static void run_test_mono_to_stereo_float(void) {
-    float stereo_a9[2*SAMPLES];
-    float stereo_a8[2*SAMPLES];
-    float stereo_ref[2*SAMPLES];
-    float stereo_gen[2*SAMPLES];
-    float mono[SAMPLES]; 
+    float stereo_a9[2*SAMPLES+ALIGN];
+    float stereo_a8[2*SAMPLES+ALIGN];
+    float stereo_ref[2*SAMPLES+ALIGN];
+    float stereo_gen[2*SAMPLES+ALIGN];
+    float mono[SAMPLES+ALIGN];
     int i;
     pa_usec_t start, stop;
     pa_sample_format_t sf;
@@ -419,7 +420,7 @@
     memset(stereo_a9, 0, sizeof(stereo_a9));
     memset(stereo_a8, 0, sizeof(stereo_a8));
 
-    for (i = 0; i < SAMPLES; i++) {
+    for (i = 0; i < SAMPLES+ALIGN; i++) {
         mono[i] = rand()/(float) RAND_MAX - 0.5f;
     }
 
@@ -434,24 +435,24 @@
     remap.map_table_f[0][0] = 1.0;
     remap.map_table_f[1][0] = 1.0;
 
-    remap_mono_to_stereo_neon_a9(&remap, stereo_a9, mono, SAMPLES);
-    remap_mono_to_stereo_neon_a8(&remap, stereo_a8, mono, SAMPLES);
-    remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
-    remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES);
+    remap_mono_to_stereo_neon_a9(&remap, stereo_a9+ALIGN, mono+ALIGN, SAMPLES);
+    remap_mono_to_stereo_neon_a8(&remap, stereo_a8+ALIGN, mono+ALIGN, SAMPLES);
+    remap_mono_to_stereo_c(&remap, stereo_ref+ALIGN, mono+ALIGN, SAMPLES);
+    remap_channels_matrix_c(&remap, stereo_gen+ALIGN, mono+ALIGN, SAMPLES);
 
-    for (i = 0; i < 2*SAMPLES; i++) {
+    for (i = ALIGN; i < 2*SAMPLES+ALIGN; i++) {
         if (fabsf(stereo_a9[i] - stereo_ref[i]) > 0.00001) {
             pa_log_debug("NEON/A9 %d: %.3f != %.3f (%.3f)", i, stereo_a9[i], stereo_ref[i],
                       mono[i/2]);
         }
     }
-    for (i = 0; i < 2*SAMPLES; i++) {
+    for (i = ALIGN; i < 2*SAMPLES+ALIGN; i++) {
         if (fabsf(stereo_a8[i] - stereo_ref[i]) > 0.00001) {
             pa_log_debug("NEON/A8 %d: %.3f != %.3f (%.3f)", i, stereo_a8[i], stereo_ref[i],
                       mono[i/2]);
         }
     }
-    for (i = 0; i < 2*SAMPLES; i++) {
+    for (i = ALIGN; i < 2*SAMPLES+ALIGN; i++) {
         if (fabsf(stereo_gen[i] - stereo_ref[i]) > 0.00001) {
             pa_log_debug("generic %d: %.3f != %.3f (%.3f)", i, stereo_gen[i], stereo_ref[i],
                       mono[i/2]);
@@ -460,38 +461,38 @@
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
-        remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
+        remap_mono_to_stereo_c(&remap, stereo_ref+ALIGN, mono+ALIGN, SAMPLES);
     }
     stop = pa_rtclock_now();
     pa_log_info("ref:\t\t%llu usec.", (long long unsigned int)(stop - start));
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
-        remap_mono_to_stereo_neon_a9(&remap, stereo_a9, mono, SAMPLES);
+        remap_mono_to_stereo_neon_a9(&remap, stereo_a9+ALIGN, mono+ALIGN, SAMPLES);
     }
     stop = pa_rtclock_now();
     pa_log_info("NEON/A9:\t%llu usec.", (long long unsigned int)(stop - start));
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
-        remap_mono_to_stereo_neon_a8(&remap, stereo_a8, mono, SAMPLES);
+        remap_mono_to_stereo_neon_a8(&remap, stereo_a8+ALIGN, mono+ALIGN, SAMPLES);
     }
     stop = pa_rtclock_now();
     pa_log_info("NEON/A8:\t%llu usec.", (long long unsigned int)(stop - start));
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
-        remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES);
+        remap_channels_matrix_c(&remap, stereo_gen+ALIGN, mono+ALIGN, SAMPLES);
     }
     stop = pa_rtclock_now();
     pa_log_info("generic:\t%llu usec.", (long long unsigned int)(stop - start));
 }
 
 static void run_test_stereo_to_mono_float(void) {
-    float stereo[2*SAMPLES];
-    float mono_ref[SAMPLES];
-    float mono_gen[SAMPLES];
-    float mono[SAMPLES];
+    float stereo[2*SAMPLES+ALIGN];
+    float mono_ref[SAMPLES+ALIGN];
+    float mono_gen[SAMPLES+ALIGN];
+    float mono[SAMPLES+ALIGN];
     int i;
     pa_usec_t start, stop;
     pa_sample_format_t sf;
@@ -503,7 +504,7 @@
     memset(mono_ref, 0, sizeof(mono_ref));
     memset(mono, 0, sizeof(mono));
 
-    for (i = 0; i < 2*SAMPLES; i++) {
+    for (i = 0; i < 2*SAMPLES+ALIGN; i++) {
         stereo[i] = rand()/(float) RAND_MAX - 0.5f;
     }
 
@@ -518,11 +519,11 @@
     remap.map_table_f[0][0] = 1.0;
     remap.map_table_f[0][1] = 1.0;
 
-    remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES);
-    remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES);
-    remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES);
+    remap_stereo_to_mono_c(&remap, mono_ref+ALIGN, stereo+ALIGN, SAMPLES);
+    remap_channels_matrix_c(&remap, mono_gen+ALIGN, stereo+ALIGN, SAMPLES);
+    remap_stereo_to_mono_neon(&remap, mono+ALIGN, stereo+ALIGN, SAMPLES);
 
-    for (i = 0; i < SAMPLES; i++) {
+    for (i = ALIGN; i < SAMPLES+ALIGN; i++) {
         if (fabsf(mono[i] - mono_ref[i]) > 0.00001) {
             pa_log_debug("%d: %.3f != %.3f (%.3f %0.3f)", i, mono[i], mono_ref[i],
                       stereo[2*i+0], stereo[2*i+1]);
@@ -531,32 +532,32 @@
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
-        remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES);
+        remap_stereo_to_mono_neon(&remap, mono+ALIGN, stereo+ALIGN, SAMPLES);
     }
     stop = pa_rtclock_now();
     pa_log_info("NEON:\t\t%llu usec.", (long long unsigned int)(stop - start));
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
-        remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES);
+        remap_stereo_to_mono_c(&remap, mono_ref+ALIGN, stereo+ALIGN, SAMPLES);
     }
     stop = pa_rtclock_now();
     pa_log_info("ref:\t\t%llu usec.", (long long unsigned int)(stop - start));
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
-        remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES);
+        remap_channels_matrix_c(&remap, mono_gen+ALIGN, stereo+ALIGN, SAMPLES);
     }
     stop = pa_rtclock_now();
     pa_log_info("generic:\t%llu usec.", (long long unsigned int)(stop - start));
 }
 
 static void run_test_mono_to_stereo_s16(void) {
-    int16_t stereo_a9[2*SAMPLES];
-    int16_t stereo_a8[2*SAMPLES];
-    int16_t stereo_ref[2*SAMPLES];
-    int16_t stereo_gen[2*SAMPLES];
-    int16_t mono[SAMPLES];
+    int16_t stereo_a9[2*SAMPLES+ALIGN];
+    int16_t stereo_a8[2*SAMPLES+ALIGN];
+    int16_t stereo_ref[2*SAMPLES+ALIGN];
+    int16_t stereo_gen[2*SAMPLES+ALIGN];
+    int16_t mono[SAMPLES+ALIGN];
     int i;
     pa_usec_t start, stop;
     pa_sample_format_t sf;
@@ -570,7 +571,7 @@
     memset(stereo_a8, 0, sizeof(stereo_a8));
     memset(stereo_gen, 0, sizeof(stereo_gen));
 
-    for (i = 0; i < SAMPLES; i++) {
+    for (i = 0; i < SAMPLES+ALIGN; i++) {
         mono[i] = rand() - RAND_MAX/2;
     }
 
@@ -584,26 +585,26 @@
     remap.o_ss = &oss;
     remap.map_table_i[0][0] = 0x10000;
     remap.map_table_i[1][0] = 0x10000;
-    
-    remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
-    remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES);
-    remap_mono_to_stereo_neon_a9(&remap, stereo_a9, mono, SAMPLES);
-    remap_mono_to_stereo_neon_a8(&remap, stereo_a8, mono, SAMPLES);
-    
-    for (i = 0; i < 2*SAMPLES; i++) {
+
+    remap_mono_to_stereo_c(&remap, stereo_ref+ALIGN, mono+ALIGN, SAMPLES);
+    remap_channels_matrix_c(&remap, stereo_gen+ALIGN, mono+ALIGN, SAMPLES);
+    remap_mono_to_stereo_neon_a9(&remap, stereo_a9+ALIGN, mono+ALIGN, SAMPLES);
+    remap_mono_to_stereo_neon_a8(&remap, stereo_a8+ALIGN, mono+ALIGN, SAMPLES);
+
+    for (i = ALIGN; i < 2*SAMPLES+ALIGN; i++) {
         if (abs(stereo_a9[i] - stereo_ref[i]) > 0) {
             pa_log_debug("NEON/A9 %d: %d != %d (%d)", i, stereo_a9[i], stereo_ref[i],
                       mono[i/2]);
         }
     }
-    for (i = 0; i < 2*SAMPLES; i++) {
+    for (i = ALIGN; i < 2*SAMPLES+ALIGN; i++) {
         if (abs(stereo_a8[i] - stereo_ref[i]) > 0) {
             pa_log_debug("NEON/A8 %d: %d != %d (%d)", i, stereo_a8[i], stereo_ref[i],
                       mono[i/2]);
         }
     }
 
-    for (i = 0; i < 2*SAMPLES; i++) {
+    for (i = ALIGN; i < 2*SAMPLES+ALIGN; i++) {
         if (abs(stereo_gen[i] - stereo_ref[i]) > 0) {
             pa_log_debug("generic %d: %d != %d (%d)", i, stereo_gen[i], stereo_ref[i],
                       mono[i/2]);
@@ -612,38 +613,38 @@
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
-        remap_mono_to_stereo_neon_a9(&remap, stereo_a9, mono, SAMPLES);
+        remap_mono_to_stereo_neon_a9(&remap, stereo_a9+ALIGN, mono+ALIGN, SAMPLES);
     }
     stop = pa_rtclock_now();
     pa_log_info("NEON/A9:\t%llu usec.", (long long unsigned int)(stop - start));
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
-        remap_mono_to_stereo_neon_a8(&remap, stereo_a8, mono, SAMPLES);
+        remap_mono_to_stereo_neon_a8(&remap, stereo_a8+ALIGN, mono+ALIGN, SAMPLES);
     }
     stop = pa_rtclock_now();
     pa_log_info("NEON/A8:\t%llu usec.", (long long unsigned int)(stop - start));
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
-        remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
+        remap_mono_to_stereo_c(&remap, stereo_ref+ALIGN, mono+ALIGN, SAMPLES);
     }
     stop = pa_rtclock_now();
     pa_log_info("ref:\t\t%llu usec.", (long long unsigned int)(stop - start));
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
-        remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES);
+        remap_channels_matrix_c(&remap, stereo_gen+ALIGN, mono+ALIGN, SAMPLES);
     }
     stop = pa_rtclock_now();
     pa_log_info("generic:\t%llu usec.", (long long unsigned int)(stop - start));
 }
 
 static void run_test_stereo_to_mono_s16(void) {
-    int16_t stereo[2*SAMPLES];
-    int16_t mono_ref[SAMPLES];
-    int16_t mono_gen[SAMPLES];
-    int16_t mono[SAMPLES];
+    int16_t stereo[2*SAMPLES+ALIGN];
+    int16_t mono_ref[SAMPLES+ALIGN];
+    int16_t mono_gen[SAMPLES+ALIGN];
+    int16_t mono[SAMPLES+ALIGN];
     int i;
     pa_usec_t start, stop;
     pa_sample_format_t sf;
@@ -656,7 +657,7 @@
     memset(mono_gen, 0, sizeof(mono_gen));
     memset(mono, 0, sizeof(mono));
 
-    for (i = 0; i < 2*SAMPLES; i++) {
+    for (i = 0; i < 2*SAMPLES+ALIGN; i++) {
         stereo[i] = rand() - RAND_MAX/2;
     }
 
@@ -670,18 +671,18 @@
     remap.o_ss = &oss;
     remap.map_table_i[0][0] = 0x10000;
     remap.map_table_i[0][1] = 0x10000;
-    
-    remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES);
-    remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES);
-    remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES);
 
-    for (i = 0; i < SAMPLES; i++) {
+    remap_stereo_to_mono_c(&remap, mono_ref+ALIGN, stereo+ALIGN, SAMPLES);
+    remap_channels_matrix_c(&remap, mono_gen+ALIGN, stereo+ALIGN, SAMPLES);
+    remap_stereo_to_mono_neon(&remap, mono+ALIGN, stereo+ALIGN, SAMPLES);
+
+    for (i = ALIGN; i < SAMPLES+ALIGN; i++) {
         if (abs(mono[i] - mono_ref[i]) > 0) {
             pa_log_debug("%d: %d != %d (%d)", i, mono[i], mono_ref[i],
                       stereo[2*i+0], stereo[2*i+1]);
         }
     }
-    for (i = 0; i < SAMPLES; i++) {
+    for (i = ALIGN; i < SAMPLES+ALIGN; i++) {
         if (abs(mono[i] - mono_gen[i]) > 0) {
             pa_log_debug("%d: %d != %d (%d)", i, mono[i], mono_gen[i],
                       stereo[2*i+0], stereo[2*i+1]);
@@ -690,21 +691,21 @@
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
-        remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES);
+        remap_stereo_to_mono_neon(&remap, mono+ALIGN, stereo+ALIGN, SAMPLES);
     }
     stop = pa_rtclock_now();
     pa_log_info("NEON:\t\t%llu usec.", (long long unsigned int)(stop - start));
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
-        remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES);
+        remap_stereo_to_mono_c(&remap, mono_ref+ALIGN, stereo+ALIGN, SAMPLES);
     }
     stop = pa_rtclock_now();
     pa_log_info("ref:\t\t%llu usec.", (long long unsigned int)(stop - start));
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
-        remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES);
+        remap_channels_matrix_c(&remap, mono_gen+ALIGN, stereo+ALIGN, SAMPLES);
     }
     stop = pa_rtclock_now();
     pa_log_info("generic:\t%llu usec.", (long long unsigned int)(stop - start));
--- a/sconv_neon.c	Sun Jul 08 21:03:41 2012 +0200
+++ b/sconv_neon.c	Sun Jul 08 21:48:08 2012 +0200
@@ -109,7 +109,7 @@
     "1:\n\t"
     "vld1.16    {d0}, [%[src]]!\n\t"
     "vmovl.s16  q0, d0\n\t"
-    
+
     "vcvt.f32.s32 q0, q0\n\t"
     "vmul.f32   q0, q0, q1\n\t"
 
@@ -130,11 +130,12 @@
 
 #define SAMPLES 1019
 #define TIMES 100000
+#define ALIGN 1
 
 static void run_test_from(void) {
-    int16_t samples[SAMPLES];
-    int16_t samples_ref[SAMPLES];
-    float floats[SAMPLES];
+    int16_t samples[SAMPLES+ALIGN];
+    int16_t samples_ref[SAMPLES+ALIGN];
+    float floats[SAMPLES+ALIGN];
     int i;
     pa_usec_t start, stop;
 
@@ -143,14 +144,14 @@
     memset(samples_ref, 0, sizeof(samples_ref));
     memset(samples, 0, sizeof(samples));
 
-    for (i = 0; i < SAMPLES; i++) {
+    for (i = 0; i < SAMPLES+ALIGN; i++) {
         floats[i] = 2.1f * (rand()/(float) RAND_MAX - 0.5f);
     }
 
-    pa_sconv_s16le_from_float32ne(SAMPLES, floats, samples_ref);
-    pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats, samples);
+    pa_sconv_s16le_from_float32ne(SAMPLES, floats+ALIGN, samples_ref+ALIGN);
+    pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats+ALIGN, samples+ALIGN);
 
-    for (i = 0; i < SAMPLES; i++) {
+    for (i = ALIGN; i < SAMPLES+ALIGN; i++) {
         if (abs(samples[i] - samples_ref[i]) > 0) {
             pa_log_debug("%d: %d != %d (%f)", i, samples[i], samples_ref[i],
                       floats[i]);
@@ -159,41 +160,39 @@
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
-        pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats, samples);
+        pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats+ALIGN, samples+ALIGN);
     }
     stop = pa_rtclock_now();
     pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
-        pa_sconv_s16le_from_float32ne(SAMPLES, floats, samples_ref);
+        pa_sconv_s16le_from_float32ne(SAMPLES, floats+ALIGN, samples_ref+ALIGN);
     }
     stop = pa_rtclock_now();
     pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
 }
 
 static void run_test_to(void) {
-    int16_t samples[SAMPLES];
-    float floats[SAMPLES];
-    float floats_ref[SAMPLES];
+    int16_t samples[SAMPLES+ALIGN];
+    float floats[SAMPLES+ALIGN];
+    float floats_ref[SAMPLES+ALIGN];
     int i;
     pa_usec_t start, stop;
-    pa_convert_func_t func;
 
     pa_log_debug("checking NEON sconv_s16le_to_float(%d)", SAMPLES);
 
     memset(floats_ref, 0, sizeof(floats_ref));
     memset(floats, 0, sizeof(float));
 
-    for (i = 0; i < SAMPLES; i++) {
+    for (i = 0; i < SAMPLES+ALIGN; i++) {
         samples[i] = rand() - RAND_MAX/2;
     }
 
-    func = (pa_convert_func_t) pa_sconv_s16le_to_float32ne;
-    func(SAMPLES, samples, floats_ref);
-    pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples, floats);
+    pa_sconv_s16le_to_float32ne(SAMPLES, samples+ALIGN, floats_ref+ALIGN);
+    pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples+ALIGN, floats+ALIGN);
 
-    for (i = 0; i < SAMPLES; i++) {
+    for (i = ALIGN; i < SAMPLES+ALIGN; i++) {
         if (fabsf(floats[i] - floats_ref[i]) > 0.00001) {
             pa_log_debug("%d: %.8f != %.8f (%d)", i, floats[i], floats_ref[i],
                       samples[i]);
@@ -202,14 +201,14 @@
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
-        pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples, floats);
+        pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples+ALIGN, floats+ALIGN);
     }
     stop = pa_rtclock_now();
     pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
 
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
-        func(SAMPLES, samples, floats_ref);
+        pa_sconv_s16le_to_float32ne(SAMPLES, samples+ALIGN, floats_ref+ALIGN);
     }
     stop = pa_rtclock_now();
     pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
--- a/svolume_neon.c	Sun Jul 08 21:03:41 2012 +0200
+++ b/svolume_neon.c	Sun Jul 08 21:48:08 2012 +0200
@@ -340,18 +340,19 @@
 #define TIMES 50000
 #define CHANNELS 4
 #define PADDING 16
+#define ALIGN 1
 
 static void run_test_float(void) {
-    float floats[SAMPLES];
-    float floats_ref[SAMPLES];
-    float floats_orig[SAMPLES];
+    float floats[SAMPLES+ALIGN];
+    float floats_ref[SAMPLES+ALIGN];
+    float floats_orig[SAMPLES+ALIGN];
     float volumes[CHANNELS];
     unsigned i;
     pa_usec_t start, stop;
 
     pa_log_debug("checking NEON volume_float32ne(%d)", SAMPLES);
 
-    for (i = 0; i < SAMPLES; i++) {
+    for (i = 0; i < SAMPLES+ALIGN; i++) {
         floats_orig[i] = rand()/(float) RAND_MAX - 0.5f;
     }
     memcpy(floats_ref, floats_orig, sizeof(floats_orig));
@@ -360,10 +361,10 @@
     for (i = 0; i < CHANNELS; i++)
         volumes[i] = 0.5f * rand() / (float) RAND_MAX;
 
-    pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats));
-    pa_volume_float32ne_c(floats_ref, volumes, CHANNELS, sizeof(floats_ref));
+    pa_volume_float32ne_neon(floats+ALIGN, volumes, CHANNELS, sizeof(floats));
+    pa_volume_float32ne_c(floats_ref+ALIGN, volumes, CHANNELS, sizeof(floats_ref));
 
-    for (i = 0; i < SAMPLES; i++) {
+    for (i = ALIGN; i < SAMPLES+ALIGN; i++) {
         if (fabsf(floats[i] - floats_ref[i]) > 0.00001) {
             pa_log_debug("%d: %.3f != %.3f (%.3f)", i, floats[i], floats_ref[i],
                       floats_orig[i]);
@@ -373,7 +374,7 @@
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
         memcpy(floats, floats_orig, sizeof(floats_orig));
-        pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats));
+        pa_volume_float32ne_neon(floats+ALIGN, volumes, CHANNELS, sizeof(floats));
     }
     stop = pa_rtclock_now();
     pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
@@ -381,16 +382,16 @@
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
         memcpy(floats_ref, floats_orig, sizeof(floats_orig));
-        pa_volume_float32ne_c(floats_ref, volumes, CHANNELS, sizeof(floats_ref));
+        pa_volume_float32ne_c(floats_ref+ALIGN, volumes, CHANNELS, sizeof(floats_ref));
     }
     stop = pa_rtclock_now();
     pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
 }
 
 static void run_test_s16(void) {
-    int16_t samples[SAMPLES];
-    int16_t samples_ref[SAMPLES];
-    int16_t samples_orig[SAMPLES];
+    int16_t samples[SAMPLES+ALIGN];
+    int16_t samples_ref[SAMPLES+ALIGN];
+    int16_t samples_orig[SAMPLES+ALIGN];
     uint32_t volumes[CHANNELS + PADDING];
     unsigned i, padding;
     pa_usec_t start, stop;
@@ -408,20 +409,20 @@
     for (padding = 0; padding < PADDING; padding++, i++)
         volumes[i] = volumes[padding];
 
-    pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples));
-    pa_volume_s16ne_c(samples_ref, volumes, CHANNELS, sizeof(samples_ref));
+    pa_volume_s16ne_neon(samples+ALIGN, volumes, CHANNELS, sizeof(samples));
+    pa_volume_s16ne_c(samples_ref+ALIGN, volumes, CHANNELS, sizeof(samples_ref));
 
-    for (i = 0; i < SAMPLES; i++) {
+    for (i = ALIGN; i < SAMPLES+ALIGN; i++) {
         if (abs(samples[i] - samples_ref[i]) > 0) {
             pa_log_debug("%d: %d != %d (%d)", i, samples[i], samples_ref[i],
                       samples_orig[i]);
         }
     }
-exit(0);
+
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
         memcpy(samples, samples_orig, sizeof(samples_orig));
-        pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples));
+        pa_volume_s16ne_neon(samples+ALIGN, volumes, CHANNELS, sizeof(samples));
     }
     stop = pa_rtclock_now();
     pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
@@ -429,7 +430,7 @@
     start = pa_rtclock_now();
     for (i = 0; i < TIMES; i++) {
         memcpy(samples, samples_orig, sizeof(samples_orig));
-        pa_volume_s16ne_arm(samples, volumes, CHANNELS, sizeof(samples));
+        pa_volume_s16ne_arm(samples+ALIGN, volumes, CHANNELS, sizeof(samples));
     }
     stop = pa_rtclock_now();
     pa_log_info("ARM: %llu usec.", (long long unsigned int)(stop - start));

Repositories maintained by Peter Meerwald, pmeerw@pmeerw.net.