# HG changeset patch
# User Peter Meerwald
# Date 1341408248 -7200
# Node ID 09ee6a01a3d3710416875d4ebca52eceb5b7e505
# Parent b829afbea564ccfc4ec54e75505c396c61ddaf0d
new
diff -r b829afbea564 -r 09ee6a01a3d3 compile.sh
--- a/compile.sh Fri Apr 20 14:26:14 2012 +0200
+++ b/compile.sh Wed Jul 04 15:24:08 2012 +0200
@@ -1,11 +1,17 @@
-arm-linux-gnueabi-gcc -Wall -g -O2 -static \
- -mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon \
+#!/bin/sh
+
+#CC="arm-linux-gnueabi-gcc -Wall -g -O2 -static -marm -fomit-frame-pointer -mcpu=cortex-a9 -mfloat-abi=softfp -mfpu=neon"
+#CC="/opt/arm-2012.03/bin/arm-none-linux-gnueabi-gcc -Wall -g -O3 -static -marm -fomit-frame-pointer -mcpu=cortex-a9 -mfloat-abi=softfp -mfpu=neon"
+CC="/home/pmeerw/openbricks-bct2/build/build.brettl3.eglibc/toolchain/bin/armv7-openbricks-linux-gnueabi-gcc -Wall -g -O3 -static -marm -fomit-frame-pointer -mcpu=cortex-a9 -mfloat-abi=softfp -mfpu=neon"
+
+$CC \
-o sconv_neon sconv_neon.c -lm
-arm-linux-gnueabi-gcc -Wall -g -O2 -static \
- -mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon \
- -o remap_neon remap_neon.c -lm
+$CC \
+ -o remap_neon remap_neon.c xxx.S -lm
-arm-linux-gnueabi-gcc -Wall -g -O2 -static \
- -mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon \
+$CC \
-o svolume_neon svolume_neon.c -lm
+
+$CC \
+ -S -o asdf.s remap_neon.c -lm
diff -r b829afbea564 -r 09ee6a01a3d3 remap_neon.c
--- a/remap_neon.c Fri Apr 20 14:26:14 2012 +0200
+++ b/remap_neon.c Wed Jul 04 15:24:08 2012 +0200
@@ -188,8 +188,6 @@
}
}
-
-
static void remap_stereo_to_mono_c(pa_remap_t *m, void *dst, const void *src, unsigned n) {
unsigned i;
@@ -235,115 +233,179 @@
}
}
-
#if defined(__arm__)
#include "arm_neon.h"
-void remap_mono_to_stereo_neon(pa_remap_t *m, void *dst, const void *src, unsigned n) {
- unsigned i;
+static void mono_to_stereo_float_neon_a8(float *dst, const float *src, unsigned n) {
+ int i = n & 3;
+
+ asm volatile (
+ "mov %[n], %[n], lsr #2\n\t"
+ "1:\n\t"
+ "vld1.32 {q0}, [%[src]]!\n\t"
+ "vmov q1, q0\n\t"
+ "subs %[n], %[n], #1\n\t"
+ "vst2.32 {q0,q1}, [%[dst]]!\n\t"
+ "bgt 1b\n\t"
+ // output operands (or input operands that get modified)
+ : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
+ : // input operands
+ : "memory", "cc" // clobber list
+ );
+
+ while (i--) {
+ dst[0] = dst[1] = src[0];
+ src++;
+ dst += 2;
+ }
+}
+
+static void mono_to_stereo_float_neon_a9(float *dst, const float *src, unsigned n) {
+ int i = n & 1;
+
+ asm volatile (
+ "mov %[n], %[n], lsr #1\n\t"
+ "1:\n\t"
+ "ldm %[src]!, {r4,r6}\n\t"
+ "mov r5, r4\n\t"
+ "mov r7, r6\n\t"
+ "subs %[n], %[n], #1\n\t"
+ "stm %[dst]!, {r4-r7}\n\t"
+ "bgt 1b\n\t"
+ // output operands (or input operands that get modified)
+ : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
+ : // input operands
+ : "memory", "cc", "r4", "r5", "r6", "r7" // clobber list
+ );
+
+ while (i--) {
+ dst[0] = dst[1] = src[0];
+ src++;
+ dst += 2;
+ }
+}
+
+static void mono_to_stereo_int16_neon(int16_t *dst, const int16_t *src, unsigned n) {
+ int i = n & 7;
+
+ asm volatile (
+ "mov %[n], %[n], lsr #3\n\t"
+ "1:\n\t"
+ "vld1.16 {q0}, [%[src]]!\n\t"
+ "vmov q1, q0\n\t"
+ "subs %[n], %[n], #1\n\t"
+ "vst2.16 {q0,q1}, [%[dst]]!\n\t"
+ "bgt 1b\n\t"
+ // output operands (or input operands that get modified)
+ : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
+ : // input operands
+ : "memory", "cc" // clobber list
+ );
+
+ while (i--) {
+ dst[0] = dst[1] = src[0];
+ src++;
+ dst += 2;
+ }
+}
+
+static void remap_mono_to_stereo_neon_a9(pa_remap_t *m, void *dst, const void *src, unsigned n) {
switch (*m->format) {
case PA_SAMPLE_FLOAT32NE:
- {
- float *d = (float *) dst, *s = (float *) src;
-
- for (i = 0; i < n/4; i++) {
- float32x4x2_t stereo;
- stereo.val[0] = vld1q_f32(s);
- stereo.val[1] = stereo.val[0];
- vst2q_f32(d, stereo);
- s += 4;
- d += 8;
- }
-
- for (i = n & ~3; i < n; i++) {
- d[0] = d[1] = s[0];
- s++;
- d += 2;
- }
+ mono_to_stereo_float_neon_a9(dst, src, n);
break;
- }
case PA_SAMPLE_S16NE:
- {
- int16_t *d = (int16_t *) dst, *s = (int16_t *) src;
-
- for (i = 0; i < n/8; i++) {
- int16x8x2_t stereo;
- stereo.val[0] = vld1q_s16(s);
- stereo.val[1] = stereo.val[0];
- vst2q_s16(d, stereo);
- s += 8;
- d += 16;
- }
-
- for (i = n & ~7; i < n; i++) {
- d[0] = d[1] = s[0];
- s++;
- d += 2;
- }
+ mono_to_stereo_int16_neon(dst, src, n);
break;
- }
default:
pa_assert_not_reached();
}
}
-/* don't inline, causes ICE, see https://bugs.launchpad.net/bugs/936863 */
-static __attribute__ ((noinline)) void stereo_to_mono_float(float *d, const float *s, unsigned n) {
- unsigned i;
-
- for (i = 0; i < n/4; i++) {
- float32x4x2_t stereo = vld2q_f32(s);
- float32x4_t mono = vaddq_f32(stereo.val[0], stereo.val[1]);
- vst1q_f32(d, mono);
- s += 8;
- d += 4;
- }
- for (i = n & ~3; i < n; i++) {
- d[0] = s[0] + s[1];
- s += 2;
- d++;
+static void remap_mono_to_stereo_neon_a8(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+ switch (*m->format) {
+ case PA_SAMPLE_FLOAT32NE:
+ mono_to_stereo_float_neon_a8(dst, src, n);
+ break;
+ case PA_SAMPLE_S16NE:
+ mono_to_stereo_int16_neon(dst, src, n);
+ break;
+ default:
+ pa_assert_not_reached();
}
}
-/* don't inline, causes ICE, see https://bugs.launchpad.net/bugs/936863 */
-static __attribute__ ((noinline)) void stereo_to_mono_int16(int16_t *d, const int16_t *s, unsigned n) {
- unsigned int i;
+static void stereo_to_mono_float_neon(float *dst, const float *src, unsigned n) {
+ int i = n & 3;
+
+ asm volatile (
+ "mov %[n], %[n], lsr #2\n\t"
+ "1:\n\t"
+ "vld2.32 {q0,q1}, [%[src]]!\n\t"
+ "vadd.f32 q0, q0, q1\n\t"
+ "subs %[n], %[n], #1\n\t"
+ "vst1.32 {q0}, [%[dst]]!\n\t"
+ "bgt 1b\n\t"
+ // output operands (or input operands that get modified)
+ : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
+ : // input operands
+ : "memory", "cc" // clobber list
+ );
- for (i = 0; i < n/8; i++) {
- int16x8x2_t stereo = vld2q_s16(s);
- int16x8_t mono = vaddq_s16(stereo.val[0], stereo.val[1]);
- vst1q_s16(d, mono);
- s += 16;
- d += 8;
+ while (i--) {
+ dst[0] = src[0] + src[1];
+ src += 2;
+ dst++;
}
- for (i = n & ~7; i < n; i++) {
- d[0] = s[0] + s[1];
- s += 2;
- d++;
+}
+
+static void stereo_to_mono_int16_neon(int16_t *dst, const int16_t *src, unsigned n) {
+ int i = n & 7;
+
+ asm volatile (
+ "mov %[n], %[n], lsr #3\n\t"
+ "1:\n\t"
+ "vld2.16 {q0,q1}, [%[src]]!\n\t"
+ "vadd.s16 q0, q0, q1\n\t"
+ "subs %[n], %[n], #1\n\t"
+ "vst1.16 {q0}, [%[dst]]!\n\t"
+ "bgt 1b\n\t"
+ // output operands (or input operands that get modified)
+ : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
+ : // input operands
+ : "memory", "cc" // clobber list
+ );
+
+ while (i--) {
+ dst[0] = src[0] + src[1];
+ src += 2;
+ dst++;
}
}
static void remap_stereo_to_mono_neon(pa_remap_t *m, void *dst, const void *src, unsigned n) {
switch (*m->format) {
case PA_SAMPLE_FLOAT32NE:
- stereo_to_mono_float(dst, src, n);
+ stereo_to_mono_float_neon(dst, src, n);
break;
case PA_SAMPLE_S16NE:
- stereo_to_mono_int16(dst, src, n);
+ stereo_to_mono_int16_neon(dst, src, n);
break;
default:
pa_assert_not_reached();
}
}
+
#define SAMPLES 1019
-#define TIMES 10000
+#define TIMES 500000
static void run_test_mono_to_stereo_float(void) {
- float stereo[2*SAMPLES];
+ float stereo_a9[2*SAMPLES];
+ float stereo_a8[2*SAMPLES];
float stereo_ref[2*SAMPLES];
float stereo_gen[2*SAMPLES];
- float mono[SAMPLES];
+ float mono[SAMPLES];
int i;
pa_usec_t start, stop;
pa_sample_format_t sf;
@@ -353,7 +415,9 @@
pa_log_debug("checking NEON remap_mono_to_stereo(float, %d)", SAMPLES);
memset(stereo_ref, 0, sizeof(stereo_ref));
- memset(stereo, 0, sizeof(stereo));
+ memset(stereo_gen, 0, sizeof(stereo_gen));
+ memset(stereo_a9, 0, sizeof(stereo_a9));
+ memset(stereo_a8, 0, sizeof(stereo_a8));
for (i = 0; i < SAMPLES; i++) {
mono[i] = rand()/(float) RAND_MAX - 0.5f;
@@ -370,43 +434,57 @@
remap.map_table_f[0][0] = 1.0;
remap.map_table_f[1][0] = 1.0;
+ remap_mono_to_stereo_neon_a9(&remap, stereo_a9, mono, SAMPLES);
+ remap_mono_to_stereo_neon_a8(&remap, stereo_a8, mono, SAMPLES);
remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES);
- remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES);
for (i = 0; i < 2*SAMPLES; i++) {
- if (fabsf(stereo[i] - stereo_ref[i]) > 0.00001) {
- pa_log_debug("%d: %.3f != %.3f (%.3f)", i, stereo[i], stereo_ref[i],
+ if (fabsf(stereo_a9[i] - stereo_ref[i]) > 0.00001) {
+ pa_log_debug("NEON/A9 %d: %.3f != %.3f (%.3f)", i, stereo_a9[i], stereo_ref[i],
mono[i/2]);
}
}
for (i = 0; i < 2*SAMPLES; i++) {
- if (fabsf(stereo[i] - stereo_gen[i]) > 0.00001) {
- pa_log_debug("%d: %.3f != %.3f (%.3f)", i, stereo[i], stereo_gen[i],
+ if (fabsf(stereo_a8[i] - stereo_ref[i]) > 0.00001) {
+ pa_log_debug("NEON/A8 %d: %.3f != %.3f (%.3f)", i, stereo_a8[i], stereo_ref[i],
+ mono[i/2]);
+ }
+ }
+ for (i = 0; i < 2*SAMPLES; i++) {
+ if (fabsf(stereo_gen[i] - stereo_ref[i]) > 0.00001) {
+ pa_log_debug("generic %d: %.3f != %.3f (%.3f)", i, stereo_gen[i], stereo_ref[i],
mono[i/2]);
}
}
start = pa_rtclock_now();
for (i = 0; i < TIMES; i++) {
- remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES);
+ remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
}
stop = pa_rtclock_now();
- pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+ pa_log_info("ref:\t\t%llu usec.", (long long unsigned int)(stop - start));
start = pa_rtclock_now();
for (i = 0; i < TIMES; i++) {
- remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
+ remap_mono_to_stereo_neon_a9(&remap, stereo_a9, mono, SAMPLES);
}
stop = pa_rtclock_now();
- pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+ pa_log_info("NEON/A9:\t%llu usec.", (long long unsigned int)(stop - start));
+
+ start = pa_rtclock_now();
+ for (i = 0; i < TIMES; i++) {
+ remap_mono_to_stereo_neon_a8(&remap, stereo_a8, mono, SAMPLES);
+ }
+ stop = pa_rtclock_now();
+ pa_log_info("NEON/A8:\t%llu usec.", (long long unsigned int)(stop - start));
start = pa_rtclock_now();
for (i = 0; i < TIMES; i++) {
remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES);
}
stop = pa_rtclock_now();
- pa_log_info("generic: %llu usec.", (long long unsigned int)(stop - start));
+ pa_log_info("generic:\t%llu usec.", (long long unsigned int)(stop - start));
}
static void run_test_stereo_to_mono_float(void) {
@@ -456,25 +534,26 @@
remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES);
}
stop = pa_rtclock_now();
- pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+ pa_log_info("NEON:\t\t%llu usec.", (long long unsigned int)(stop - start));
start = pa_rtclock_now();
for (i = 0; i < TIMES; i++) {
remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES);
}
stop = pa_rtclock_now();
- pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+ pa_log_info("ref:\t\t%llu usec.", (long long unsigned int)(stop - start));
start = pa_rtclock_now();
for (i = 0; i < TIMES; i++) {
remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES);
}
stop = pa_rtclock_now();
- pa_log_info("generic: %llu usec.", (long long unsigned int)(stop - start));
+ pa_log_info("generic:\t%llu usec.", (long long unsigned int)(stop - start));
}
static void run_test_mono_to_stereo_s16(void) {
- int16_t stereo[2*SAMPLES];
+ int16_t stereo_a9[2*SAMPLES];
+ int16_t stereo_a8[2*SAMPLES];
int16_t stereo_ref[2*SAMPLES];
int16_t stereo_gen[2*SAMPLES];
int16_t mono[SAMPLES];
@@ -487,7 +566,9 @@
pa_log_debug("checking NEON remap_mono_to_stereo(s16, %d)", SAMPLES);
memset(stereo_ref, 0, sizeof(stereo_ref));
- memset(stereo, 0, sizeof(stereo));
+ memset(stereo_a9, 0, sizeof(stereo_a9));
+ memset(stereo_a8, 0, sizeof(stereo_a8));
+ memset(stereo_gen, 0, sizeof(stereo_gen));
for (i = 0; i < SAMPLES; i++) {
mono[i] = rand() - RAND_MAX/2;
@@ -501,47 +582,61 @@
oss.channels = 2;
remap.i_ss = &iss;
remap.o_ss = &oss;
- remap.map_table_f[0][0] = 1.0;
- remap.map_table_f[1][0] = 1.0;
+ remap.map_table_i[0][0] = 0x10000;
+ remap.map_table_i[1][0] = 0x10000;
remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES);
- remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES);
+ remap_mono_to_stereo_neon_a9(&remap, stereo_a9, mono, SAMPLES);
+ remap_mono_to_stereo_neon_a8(&remap, stereo_a8, mono, SAMPLES);
for (i = 0; i < 2*SAMPLES; i++) {
- if (abs(stereo[i] - stereo_ref[i]) > 0) {
- pa_log_debug("%d: %d != %d (%d)", i, stereo[i], stereo_ref[i],
+ if (abs(stereo_a9[i] - stereo_ref[i]) > 0) {
+ pa_log_debug("NEON/A9 %d: %d != %d (%d)", i, stereo_a9[i], stereo_ref[i],
+ mono[i/2]);
+ }
+ }
+ for (i = 0; i < 2*SAMPLES; i++) {
+ if (abs(stereo_a8[i] - stereo_ref[i]) > 0) {
+ pa_log_debug("NEON/A8 %d: %d != %d (%d)", i, stereo_a8[i], stereo_ref[i],
mono[i/2]);
}
}
for (i = 0; i < 2*SAMPLES; i++) {
- if (abs(stereo[i] - stereo_gen[i]) > 0) {
- pa_log_debug("%d: %d != %d (%d)", i, stereo[i], stereo_gen[i],
+ if (abs(stereo_gen[i] - stereo_ref[i]) > 0) {
+ pa_log_debug("generic %d: %d != %d (%d)", i, stereo_gen[i], stereo_ref[i],
mono[i/2]);
}
}
start = pa_rtclock_now();
for (i = 0; i < TIMES; i++) {
- remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES);
+ remap_mono_to_stereo_neon_a9(&remap, stereo_a9, mono, SAMPLES);
}
stop = pa_rtclock_now();
- pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+ pa_log_info("NEON/A9:\t%llu usec.", (long long unsigned int)(stop - start));
+
+ start = pa_rtclock_now();
+ for (i = 0; i < TIMES; i++) {
+ remap_mono_to_stereo_neon_a8(&remap, stereo_a8, mono, SAMPLES);
+ }
+ stop = pa_rtclock_now();
+ pa_log_info("NEON/A8:\t%llu usec.", (long long unsigned int)(stop - start));
start = pa_rtclock_now();
for (i = 0; i < TIMES; i++) {
remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
}
stop = pa_rtclock_now();
- pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+ pa_log_info("ref:\t\t%llu usec.", (long long unsigned int)(stop - start));
start = pa_rtclock_now();
for (i = 0; i < TIMES; i++) {
remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES);
}
stop = pa_rtclock_now();
- pa_log_info("generic: %llu usec.", (long long unsigned int)(stop - start));
+ pa_log_info("generic:\t%llu usec.", (long long unsigned int)(stop - start));
}
static void run_test_stereo_to_mono_s16(void) {
@@ -558,6 +653,7 @@
pa_log_debug("checking NEON remap_stereo_to_mono(s16, %d)", SAMPLES);
memset(mono_ref, 0, sizeof(mono_ref));
+ memset(mono_gen, 0, sizeof(mono_gen));
memset(mono, 0, sizeof(mono));
for (i = 0; i < 2*SAMPLES; i++) {
@@ -572,8 +668,8 @@
oss.channels = 1;
remap.i_ss = &iss;
remap.o_ss = &oss;
- remap.map_table_f[0][0] = 1.0;
- remap.map_table_f[0][1] = 1.0;
+ remap.map_table_i[0][0] = 0x10000;
+ remap.map_table_i[0][1] = 0x10000;
remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES);
remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES);
@@ -597,34 +693,31 @@
remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES);
}
stop = pa_rtclock_now();
- pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+ pa_log_info("NEON:\t\t%llu usec.", (long long unsigned int)(stop - start));
start = pa_rtclock_now();
for (i = 0; i < TIMES; i++) {
remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES);
}
stop = pa_rtclock_now();
- pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+ pa_log_info("ref:\t\t%llu usec.", (long long unsigned int)(stop - start));
start = pa_rtclock_now();
for (i = 0; i < TIMES; i++) {
remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES);
}
stop = pa_rtclock_now();
- pa_log_info("generic: %llu usec.", (long long unsigned int)(stop - start));
+ pa_log_info("generic:\t%llu usec.", (long long unsigned int)(stop - start));
}
-
#endif /* defined(__arm__) */
int main() {
-
run_test_stereo_to_mono_float();
run_test_stereo_to_mono_s16();
run_test_mono_to_stereo_float();
run_test_mono_to_stereo_s16();
-
return EXIT_SUCCESS;
}