pa-neon: sconv_neon.c comparison

comparison sconv_neon.c @ 3:e889fd0e7769

stuff

author	Peter Meerwald <p.meerwald@bct-electronic.com>
date	Thu, 05 Jul 2012 17:31:56 +0200
parents	b829afbea564
children	07763f536182

comparison

Legend: equal (leading space), deleted (-), inserted (+), replaced (- followed by +)

-:09ee6a01a3d3
+:e889fd0e7769
 #if defined(__arm__)
 #include "arm_neon.h"
-void pa_sconv_s16le_from_float32ne(unsigned n, const float *a, int16_t *b) {
+void pa_sconv_s16le_from_float32ne(unsigned n, const float *src, int16_t *dst) {
-pa_assert(a);
+pa_assert(src);
-pa_assert(b);
+pa_assert(dst);
 for (; n > 0; n--) {
-float v = *(a++);
+float v = *(src++);
 v = PA_CLAMP_UNLIKELY(v, -1.0f, 1.0f);
-*(b++) = (int16_t) lrintf(v * 0x7FFF);
+*(dst++) = (int16_t) lrintf(v * 0x7FFF);
 }
 }
-void pa_sconv_s16le_from_f32ne_neon(unsigned n, const float *a, int16_t *b) {
+void pa_sconv_s16le_from_f32ne_neon(unsigned n, const float *src, int16_t *dst) {
-unsigned i;
+unsigned i = n & 3;
-const float32x4_t plusone4 = vdupq_n_f32(1.0f);
+asm volatile (
-const float32x4_t minusone4 = vdupq_n_f32(-1.0f);
+"mov        %[n], %[n], lsr #2\n\t"
-const float32x4_t half4 = vdupq_n_f32(0.5f);
+"vdup.f32   q2, %[plusone]\n\t"
-const float32x4_t scale4 = vdupq_n_f32(32767.0f);
+"vneg.f32   q3, q2\n\t"
-const uint32x4_t mask4 = vdupq_n_u32(0x80000000);
+"vdup.f32   q4, %[scale]\n\t"
+"vdup.u32   q5, %[mask]\n\t"
-for (i = 0; i < n/4; i++) {
+"vdup.f32   q6, %[half]\n\t"
-float32x4_t v4 = ((float32x4_t *)a)[i];
+"1:\n\t"
-v4 = vmulq_f32(vmaxq_f32(vminq_f32(v4, plusone4) , minusone4), scale4);
+"vld1.32    {q0}, [%[src]]!\n\t"
+"vmin.f32   q0, q0, q2\n\t" /* clamp */
-const float32x4_t w4 = vreinterpretq_f32_u32(vorrq_u32(vandq_u32(
+"vmax.f32   q0, q0, q3\n\t"
-vreinterpretq_u32_f32(v4), mask4), vreinterpretq_u32_f32(half4)));
+"vmul.f32   q0, q0, q4\n\t" /* scale */
+"vand.u32   q1, q0, q5\n\t"
-((int16x4_t *)b)[i] = vmovn_s32(vcvtq_s32_f32(vaddq_f32(v4, w4)));
+"vorr.u32   q1, q1, q6\n\t" /* round */
-}
+"vadd.f32   q0, q0, q1\n\t"
+"vcvt.s32.f32 q0, q0\n\t" /* narrow */
+"vmovn.i32  d0, q0\n\t"
+"subs       %[n], %[n], #1\n\t"
+"vst1.16    {d0}, [%[dst]]!\n\t"
+"bgt        1b\n\t"
+/* output operands (or input operands that get modified) */
+: [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
+: [plusone] "r" (1.0f), [scale] "r" (32767.0f),
+[half] "r" (0.5f), [mask] "r" (0x80000000) /* input operands */
+: "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6" /* clobber list */
+);
 // leftovers
-for (i = n & ~3; i < n; i++) {
+while (i--) {
-b[i] = (int16_t) lrintf(PA_CLAMP_UNLIKELY(a[i], -1.0f, 1.0f) * 0x7FFF);
+*dst++ = (int16_t) lrintf(PA_CLAMP_UNLIKELY(*src, -1.0f, 1.0f) * 0x7FFF);
-}
+src++;
 }
+}
-void pa_sconv_s16le_to_float32ne(unsigned n, const int16_t *a, float *b) {
-pa_assert(a);
+void pa_sconv_s16le_to_float32ne(unsigned n, const int16_t *src, float *dst) {
-pa_assert(b);
+pa_assert(src);
+pa_assert(dst);
 for (; n > 0; n--)
-*(b++) = ((float) (*(a++)))/(float) 0x7FFF;
+*(dst++) = ((float) (*(src++)))/(float) 0x7FFF;
 }
-void pa_sconv_s16le_to_f32ne_neon(unsigned n, const int16_t *a, float *b) {
+void pa_sconv_s16le_to_f32ne_neon(unsigned n, const int16_t *src, float *dst) {
-unsigned i;
+unsigned i = n & 3;
-const float32x4_t invscale4 = vdupq_n_f32(1.0f / 0x7FFF);
+const float invscale = 1.0f / 0x7FFF;
-for (i = 0; i < n/4; i++) {
+asm volatile (
-((float32x4_t *)b)[i] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(((int16x4_t *)a)[i])), invscale4);
+"mov        %[n], %[n], lsr #2\n\t"
-}
+"vdup.f32   q1, %[invscale]\n\t"
+"1:\n\t"
+"vld1.16    {d0}, [%[src]]!\n\t"
+"vmovl.s16  q0, d0\n\t"
+"vcvt.f32.s32 q0, q0\n\t"
+"vmul.f32   q0, q0, q1\n\t"
+"subs       %[n], %[n], #1\n\t"
+"vst1.32    {q0}, [%[dst]]!\n\t"
+"bgt        1b\n\t"
+/* output operands (or input operands that get modified) */
+: [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
+: [invscale] "r" (invscale) /* input operands */
+: "memory", "cc", "q0", "q1" /* clobber list */
+);
 // leftovers
-const float invscale = 1.0f / 0x7FFF;
+while (i--) {
-for (i = n & ~3; i < n; i++) {
+*dst++ = *src++ * invscale;
-b[i] = a[i] * invscale;
 }
 }
 #define SAMPLES 1019
-#define TIMES 10000
+#define TIMES 100000
 static void run_test_from(void) {
 int16_t samples[SAMPLES];
 int16_t samples_ref[SAMPLES];
 float floats[SAMPLES];
 int i;
 pa_usec_t start, stop;
-pa_convert_func_t func;
 pa_log_debug("checking NEON sconv_s16le_from_float(%d)", SAMPLES);
 memset(samples_ref, 0, sizeof(samples_ref));
 memset(samples, 0, sizeof(samples));
 for (i = 0; i < SAMPLES; i++) {
 floats[i] = 2.1f * (rand()/(float) RAND_MAX - 0.5f);
 }
-func = (pa_convert_func_t) pa_sconv_s16le_from_float32ne;
+pa_sconv_s16le_from_float32ne(SAMPLES, floats, samples_ref);
-func(SAMPLES, floats, samples_ref);
 pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats, samples);
 for (i = 0; i < SAMPLES; i++) {
 if (abs(samples[i] - samples_ref[i]) > 0) {
 pa_log_debug("%d: %d != %d (%f)", i, samples[i], samples_ref[i],
 stop = pa_rtclock_now();
 pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
 start = pa_rtclock_now();
 for (i = 0; i < TIMES; i++) {
-func(SAMPLES, floats, samples_ref);
+pa_sconv_s16le_from_float32ne(SAMPLES, floats, samples_ref);
 }
 stop = pa_rtclock_now();
 pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
 }

Mercurial > hg > pa-neon

comparison sconv_neon.c @ 3:e889fd0e7769