diff sconv_neon.c @ 0:e0040ee59c3c

import
author Peter Meerwald <p.meerwald@bct-electronic.com>
date Thu, 12 Jan 2012 17:27:46 +0100
parents
children b829afbea564
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sconv_neon.c	Thu Jan 12 17:27:46 2012 +0100
@@ -0,0 +1,202 @@
+/*
+ * Copyright 2012 Peter Meerwald <p.meerwald@bct-electronic.com>
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <math.h>
+#include <sys/time.h>
+#include <assert.h>
+
+typedef short int16_t;
+typedef void (*pa_convert_func_t)(unsigned n, const void *a, void *b);
+typedef long long unsigned int pa_usec_t;
+
+#define pa_assert(x) assert(x)
+
+#define PA_CLAMP_UNLIKELY(x, low, high) \
+    (((x) < (low)) ? (low) : (((x) > (high)) ? (high) : (x)))
+
+static void pa_log_info(const char *format, ...)  {
+    va_list ap;
+    char buf[1024];
+    va_start(ap, format);
+    vsprintf(buf, format, ap);
+    printf("%s\n", buf);
+    va_end(ap);
+}
+
+#define pa_log_debug pa_log_info
+
+static pa_usec_t pa_rtclock_now() {
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+
+    return tv.tv_sec * 1000000ULL + tv.tv_usec;
+}
+
+#if defined(__arm__)
+
+#include "arm_neon.h"
+
+void pa_sconv_s16le_from_float32ne(unsigned n, const float *a, int16_t *b) {
+    pa_assert(a);
+    pa_assert(b);
+
+    for (; n > 0; n--) {
+        float v = *(a++);
+
+        v = PA_CLAMP_UNLIKELY(v, -1.0f, 1.0f);
+        *(b++) = (int16_t) lrintf(v * 0x7FFF);
+    }
+}
+
+void pa_sconv_s16le_from_f32ne_neon(unsigned n, const float *a, int16_t *b) {
+    unsigned i;
+
+    const float32x4_t plusone4 = vdupq_n_f32(1.0f);
+    const float32x4_t minusone4 = vdupq_n_f32(-1.0f);
+    const float32x4_t half4 = vdupq_n_f32(0.5f);
+    const float32x4_t scale4 = vdupq_n_f32(32767.0f);
+    const uint32x4_t mask4 = vdupq_n_u32(0x80000000);
+
+    for (i = 0; i < n/4; i++) {
+        float32x4_t v4 = ((float32x4_t *)a)[i];
+        v4 = vmulq_f32(vmaxq_f32(vminq_f32(v4, plusone4) , minusone4), scale4);
+
+        const float32x4_t w4 = vreinterpretq_f32_u32(vorrq_u32(vandq_u32(
+                vreinterpretq_u32_f32(v4), mask4), vreinterpretq_u32_f32(half4)));
+
+        ((int16x4_t *)b)[i] = vmovn_s32(vcvtq_s32_f32(vaddq_f32(v4, w4)));
+    }
+ 
+    // leftovers
+    for (i = n & ~3; i < n; i++) {
+        b[i] = (int16_t) lrintf(PA_CLAMP_UNLIKELY(a[i], -1.0f, 1.0f) * 0x7FFF);
+    }
+}
+
+void pa_sconv_s16le_to_float32ne(unsigned n, const int16_t *a, float *b) {
+    pa_assert(a);
+    pa_assert(b);
+
+    for (; n > 0; n--)
+        *(b++) = ((float) (*(a++)))/(float) 0x7FFF;
+}
+
+void pa_sconv_s16le_to_f32ne_neon(unsigned n, const int16_t *a, float *b) {
+    unsigned i;
+
+    const float32x4_t invscale4 = vdupq_n_f32(1.0f / 0x7FFF);
+
+    for (i = 0; i < n/4; i++) {
+        ((float32x4_t *)b)[i] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(((int16x4_t *)a)[i])), invscale4);
+    }
+
+    // leftovers
+    const float invscale = 1.0f / 0x7FFF;
+    for (i = n & ~3; i < n; i++) {
+        b[i] = a[i] * invscale;
+    }
+}
+
+#define SAMPLES 1019
+#define TIMES 300
+
+static void run_test_from(void) {
+    int16_t samples[SAMPLES];
+    int16_t samples_ref[SAMPLES];
+    float floats[SAMPLES];
+    int i;
+    pa_usec_t start, stop;
+    pa_convert_func_t func;
+
+    pa_log_debug("checking NEON sconv_s16le_from_float(%d)", SAMPLES);
+
+    memset(samples_ref, 0, sizeof(samples_ref));
+    memset(samples, 0, sizeof(samples));
+
+    for (i = 0; i < SAMPLES; i++) {
+        floats[i] = 2.1f * (rand()/(float) RAND_MAX - 0.5f);
+    }
+
+    func = (pa_convert_func_t) pa_sconv_s16le_from_float32ne;
+    func(SAMPLES, floats, samples_ref);
+    pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats, samples);
+
+    for (i = 0; i < SAMPLES; i++) {
+        if (abs(samples[i] - samples_ref[i]) > 0) {
+            pa_log_debug("%d: %d != %d (%f)", i, samples[i], samples_ref[i],
+                      floats[i]);
+        }
+    }
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats, samples);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        func(SAMPLES, floats, samples_ref);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+
+static void run_test_to(void) {
+    int16_t samples[SAMPLES];
+    float floats[SAMPLES];
+    float floats_ref[SAMPLES];
+    int i;
+    pa_usec_t start, stop;
+    pa_convert_func_t func;
+
+    pa_log_debug("checking NEON sconv_s16le_to_float(%d)", SAMPLES);
+
+    memset(floats_ref, 0, sizeof(floats_ref));
+    memset(floats, 0, sizeof(float));
+
+    for (i = 0; i < SAMPLES; i++) {
+        samples[i] = rand() - RAND_MAX/2;
+    }
+
+    func = (pa_convert_func_t) pa_sconv_s16le_to_float32ne;
+    func(SAMPLES, samples, floats_ref);
+    pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples, floats);
+
+    for (i = 0; i < SAMPLES; i++) {
+        if (fabsf(floats[i] - floats_ref[i]) > 0.00001) {
+            pa_log_debug("%d: %.8f != %.8f (%d)", i, floats[i], floats_ref[i],
+                      samples[i]);
+        }
+    }
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples, floats);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        func(SAMPLES, samples, floats_ref);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+
+#endif /* defined(__arm__) */
+
+int main() {
+
+    run_test_from();
+    run_test_to();
+
+    return EXIT_SUCCESS;
+}

Repositories maintained by Peter Meerwald, pmeerw@pmeerw.net.