# HG changeset patch # User Peter Meerwald # Date 1341502316 -7200 # Node ID e889fd0e7769b6b0bc6aa645dc3417c354ef9724 # Parent 09ee6a01a3d3710416875d4ebca52eceb5b7e505 stuff diff -r 09ee6a01a3d3 -r e889fd0e7769 compile.sh --- a/compile.sh Wed Jul 04 15:24:08 2012 +0200 +++ b/compile.sh Thu Jul 05 17:31:56 2012 +0200 @@ -1,11 +1,13 @@ #!/bin/sh #CC="arm-linux-gnueabi-gcc -Wall -g -O2 -static -marm -fomit-frame-pointer -mcpu=cortex-a9 -mfloat-abi=softfp -mfpu=neon" -#CC="/opt/arm-2012.03/bin/arm-none-linux-gnueabi-gcc -Wall -g -O3 -static -marm -fomit-frame-pointer -mcpu=cortex-a9 -mfloat-abi=softfp -mfpu=neon" -CC="/home/pmeerw/openbricks-bct2/build/build.brettl3.eglibc/toolchain/bin/armv7-openbricks-linux-gnueabi-gcc -Wall -g -O3 -static -marm -fomit-frame-pointer -mcpu=cortex-a9 -mfloat-abi=softfp -mfpu=neon" +CC="/opt/arm-2012.03/bin/arm-none-linux-gnueabi-gcc -Wall -g -O3 -static -marm -fomit-frame-pointer -mcpu=cortex-a9 -mfloat-abi=softfp -mfpu=neon" +#CC="/home/pmeerw/openbricks-bct2/build/build.brettl3.eglibc/toolchain/bin/armv7-openbricks-linux-gnueabi-gcc -Wall -g -O3 -static -marm -fomit-frame-pointer -mcpu=cortex-a9 -mfloat-abi=softfp -mfpu=neon" $CC \ -o sconv_neon sconv_neon.c -lm +$CC \ + -S -o bla.s sconv_neon.c -lm $CC \ -o remap_neon remap_neon.c xxx.S -lm diff -r 09ee6a01a3d3 -r e889fd0e7769 remap_neon.c --- a/remap_neon.c Wed Jul 04 15:24:08 2012 +0200 +++ b/remap_neon.c Thu Jul 05 17:31:56 2012 +0200 @@ -713,6 +713,18 @@ #endif /* defined(__arm__) */ int main() { + +/* not in user space + unsigned cpuid; + asm volatile( + "mrc p15, 0, %[cpuid], c0, c0, 0\n\t" + : [cpuid] "=r" (cpuid) + : + : "cc"); + + printf("%08x %03x\n", cpuid, (cpuid >> 4) & 0xfff); +*/ + run_test_stereo_to_mono_float(); run_test_stereo_to_mono_s16(); diff -r 09ee6a01a3d3 -r e889fd0e7769 sconv_neon.c --- a/sconv_neon.c Wed Jul 04 15:24:08 2012 +0200 +++ b/sconv_neon.c Thu Jul 05 17:31:56 2012 +0200 @@ -41,69 +41,95 @@ #include "arm_neon.h" -void pa_sconv_s16le_from_float32ne(unsigned n, const float *a, int16_t *b) { - pa_assert(a); - pa_assert(b); +void pa_sconv_s16le_from_float32ne(unsigned n, const float *src, int16_t *dst) { + pa_assert(src); + pa_assert(dst); for (; n > 0; n--) { - float v = *(a++); + float v = *(src++); v = PA_CLAMP_UNLIKELY(v, -1.0f, 1.0f); - *(b++) = (int16_t) lrintf(v * 0x7FFF); + *(dst++) = (int16_t) lrintf(v * 0x7FFF); } } -void pa_sconv_s16le_from_f32ne_neon(unsigned n, const float *a, int16_t *b) { - unsigned i; - - const float32x4_t plusone4 = vdupq_n_f32(1.0f); - const float32x4_t minusone4 = vdupq_n_f32(-1.0f); - const float32x4_t half4 = vdupq_n_f32(0.5f); - const float32x4_t scale4 = vdupq_n_f32(32767.0f); - const uint32x4_t mask4 = vdupq_n_u32(0x80000000); +void pa_sconv_s16le_from_f32ne_neon(unsigned n, const float *src, int16_t *dst) { + unsigned i = n & 3; - for (i = 0; i < n/4; i++) { - float32x4_t v4 = ((float32x4_t *)a)[i]; - v4 = vmulq_f32(vmaxq_f32(vminq_f32(v4, plusone4) , minusone4), scale4); - - const float32x4_t w4 = vreinterpretq_f32_u32(vorrq_u32(vandq_u32( - vreinterpretq_u32_f32(v4), mask4), vreinterpretq_u32_f32(half4))); + asm volatile ( + "mov %[n], %[n], lsr #2\n\t" + "vdup.f32 q2, %[plusone]\n\t" + "vneg.f32 q3, q2\n\t" + "vdup.f32 q4, %[scale]\n\t" + "vdup.u32 q5, %[mask]\n\t" + "vdup.f32 q6, %[half]\n\t" + "1:\n\t" + "vld1.32 {q0}, [%[src]]!\n\t" + "vmin.f32 q0, q0, q2\n\t" /* clamp */ + "vmax.f32 q0, q0, q3\n\t" + "vmul.f32 q0, q0, q4\n\t" /* scale */ + "vand.u32 q1, q0, q5\n\t" + "vorr.u32 q1, q1, q6\n\t" /* round */ + "vadd.f32 q0, q0, q1\n\t" + "vcvt.s32.f32 q0, q0\n\t" /* narrow */ + "vmovn.i32 d0, q0\n\t" + "subs %[n], %[n], #1\n\t" + "vst1.16 {d0}, [%[dst]]!\n\t" + "bgt 1b\n\t" + /* output operands (or input operands that get modified) */ + : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) + : [plusone] "r" (1.0f), [scale] "r" (32767.0f), + [half] "r" (0.5f), [mask] "r" (0x80000000) /* input operands */ + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6" /* clobber list */ + ); - ((int16x4_t *)b)[i] = vmovn_s32(vcvtq_s32_f32(vaddq_f32(v4, w4))); - } - // leftovers - for (i = n & ~3; i < n; i++) { - b[i] = (int16_t) lrintf(PA_CLAMP_UNLIKELY(a[i], -1.0f, 1.0f) * 0x7FFF); + while (i--) { + *dst++ = (int16_t) lrintf(PA_CLAMP_UNLIKELY(*src, -1.0f, 1.0f) * 0x7FFF); + src++; } } -void pa_sconv_s16le_to_float32ne(unsigned n, const int16_t *a, float *b) { - pa_assert(a); - pa_assert(b); +void pa_sconv_s16le_to_float32ne(unsigned n, const int16_t *src, float *dst) { + pa_assert(src); + pa_assert(dst); for (; n > 0; n--) - *(b++) = ((float) (*(a++)))/(float) 0x7FFF; + *(dst++) = ((float) (*(src++)))/(float) 0x7FFF; } -void pa_sconv_s16le_to_f32ne_neon(unsigned n, const int16_t *a, float *b) { - unsigned i; +void pa_sconv_s16le_to_f32ne_neon(unsigned n, const int16_t *src, float *dst) { + unsigned i = n & 3; + + const float invscale = 1.0f / 0x7FFF; - const float32x4_t invscale4 = vdupq_n_f32(1.0f / 0x7FFF); + asm volatile ( + "mov %[n], %[n], lsr #2\n\t" + "vdup.f32 q1, %[invscale]\n\t" + "1:\n\t" + "vld1.16 {d0}, [%[src]]!\n\t" + "vmovl.s16 q0, d0\n\t" + + "vcvt.f32.s32 q0, q0\n\t" + "vmul.f32 q0, q0, q1\n\t" - for (i = 0; i < n/4; i++) { - ((float32x4_t *)b)[i] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(((int16x4_t *)a)[i])), invscale4); - } + "subs %[n], %[n], #1\n\t" + "vst1.32 {q0}, [%[dst]]!\n\t" + "bgt 1b\n\t" + /* output operands (or input operands that get modified) */ + : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) + : [invscale] "r" (invscale) /* input operands */ + : "memory", "cc", "q0", "q1" /* clobber list */ + ); // leftovers - const float invscale = 1.0f / 0x7FFF; - for (i = n & ~3; i < n; i++) { - b[i] = a[i] * invscale; + while (i--) { + *dst++ = *src++ * invscale; } } #define SAMPLES 1019 -#define TIMES 10000 +#define TIMES 100000 static void run_test_from(void) { int16_t samples[SAMPLES]; @@ -111,7 +137,6 @@ float floats[SAMPLES]; int i; pa_usec_t start, stop; - pa_convert_func_t func; pa_log_debug("checking NEON sconv_s16le_from_float(%d)", SAMPLES); @@ -122,8 +147,7 @@ floats[i] = 2.1f * (rand()/(float) RAND_MAX - 0.5f); } - func = (pa_convert_func_t) pa_sconv_s16le_from_float32ne; - func(SAMPLES, floats, samples_ref); + pa_sconv_s16le_from_float32ne(SAMPLES, floats, samples_ref); pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats, samples); for (i = 0; i < SAMPLES; i++) { @@ -142,7 +166,7 @@ start = pa_rtclock_now(); for (i = 0; i < TIMES; i++) { - func(SAMPLES, floats, samples_ref); + pa_sconv_s16le_from_float32ne(SAMPLES, floats, samples_ref); } stop = pa_rtclock_now(); pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));