diff svolume_neon.c @ 0:e0040ee59c3c

import
author Peter Meerwald <p.meerwald@bct-electronic.com>
date Thu, 12 Jan 2012 17:27:46 +0100
parents
children b829afbea564
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/svolume_neon.c	Thu Jan 12 17:27:46 2012 +0100
@@ -0,0 +1,450 @@
+/*
+ * Copyright 2012 Peter Meerwald <p.meerwald@bct-electronic.com>
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <math.h>
+#include <sys/time.h>
+#include <assert.h>
+
+
+typedef short int16_t;
+typedef unsigned int uint32_t;
+typedef enum pa_sample_format {
+    PA_SAMPLE_S16LE,
+    PA_SAMPLE_FLOAT32LE,
+} pa_sample_format_t;
+#define PA_SAMPLE_S16NE PA_SAMPLE_S16LE
+#define PA_SAMPLE_FLOAT32NE PA_SAMPLE_FLOAT32LE
+typedef struct {
+    pa_sample_format_t *format;
+} pa_remap_t;
+typedef void (*pa_remap_func_t)(pa_remap_t *m, void *dst, const void *src, unsigned n);
+typedef long long unsigned int pa_usec_t;
+
+#define pa_assert(x) assert(x)
+#define pa_assert_not_reached() assert(0)
+
+#define PA_MAX(a, b) ((a) > (b) ? (a) : (b))
+
+typedef uint32_t pa_volume_t;
+#define PA_VOLUME_MUTED ((pa_volume_t) 0U)
+#define PA_VOLUME_MAX ((pa_volume_t) UINT32_MAX/2)
+
+#define PA_UNLIKELY(x) (x)
+#define PA_CLAMP_UNLIKELY(x, low, high) (PA_UNLIKELY((x) > (high)) ? (high) : (PA_UNLIKELY((x) < (low)) ? (low) : (x)))
+#define PA_CLAMP_VOLUME(v) (PA_CLAMP_UNLIKELY((v), PA_VOLUME_MUTED, PA_VOLUME_MAX))
+
+static void pa_log_info(const char *format, ...)  {
+    va_list ap;
+    char buf[1024];
+    va_start(ap, format);
+    vsprintf(buf, format, ap);
+    printf("%s\n", buf);
+    va_end(ap);
+}
+
+#define pa_log_debug pa_log_info
+
+static pa_usec_t pa_rtclock_now() {
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+
+    return tv.tv_sec * 1000000ULL + tv.tv_usec;
+}
+
+void pa_volume_s16ne_c(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
+    unsigned channel;
+
+    length /= sizeof(int16_t);
+
+    for (channel = 0; length; length--) {
+        int32_t t, hi, lo;
+
+        /* Multiplying the 32bit volume factor with the 16bit
+         * sample might result in an 48bit value. We want to
+         * do without 64 bit integers and hence do the
+         * multiplication independently for the HI and LO part
+         * of the volume. */
+
+        hi = volumes[channel] >> 16;
+        lo = volumes[channel] & 0xFFFF;
+
+        t = (int32_t)(*samples);
+        t = ((t * lo) >> 16) + (t * hi);
+        t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
+        *samples++ = (int16_t) t;
+
+        if (PA_UNLIKELY(++channel >= channels))
+            channel = 0;
+    }
+}
+
+void pa_volume_float32ne_c(float *samples, float *volumes, unsigned channels, unsigned length) {
+    unsigned channel;
+
+    length /= sizeof(float);
+
+    for (channel = 0; length; length--) {
+        *samples++ *= volumes[channel];
+
+        if (PA_UNLIKELY(++channel >= channels))
+            channel = 0;
+    }
+}
+
+/*
+void pa_volume_s16ne_orc(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
+{
+    if (channels == 2) {
+        int64_t v = (int64_t)volumes[1] << 32 | volumes[0];
+        pa_volume_s16ne_orc_2ch (samples, v, ((length / (sizeof(int16_t))) / 2));
+    } else if (channels == 1)
+        pa_volume_s16ne_orc_1ch (samples, volumes[0], length / (sizeof(int16_t)));
+}
+*/
+
+#if defined(__arm__)
+
+#include "arm_neon.h"
+
+#define MOD_INC() \
+    " subs  r0, r6, %2              \n\t" \
+    " itt cs                        \n\t" \
+    " addcs r0, %1                  \n\t" \
+    " movcs r6, r0                  \n\t"
+
+static void pa_volume_s16ne_arm(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
+    int32_t *ve;
+
+    /* Channels must be at least 4, and always a multiple of the original number.
+     * This is also the max amount we overread the volume array, which should
+     * have enough padding. */
+    channels = channels == 3 ? 6 : PA_MAX (4U, channels);
+    ve = volumes + channels;
+
+    __asm__ __volatile__ (
+        " mov r6, %1                      \n\t"
+        " mov %3, %3, LSR #1              \n\t" /* length /= sizeof (int16_t) */
+        " tst %3, #1                      \n\t" /* check for odd samples */
+        " beq  2f                         \n\t"
+
+        "1:                               \n\t"
+        " ldr  r0, [r6], #4               \n\t" /* odd samples volumes */
+        " ldrh r2, [%0]                   \n\t"
+
+        " smulwb r0, r0, r2               \n\t"
+        " ssat r0, #16, r0                \n\t"
+
+        " strh r0, [%0], #2               \n\t"
+
+        MOD_INC()
+
+        "2:                               \n\t"
+        " mov %3, %3, LSR #1              \n\t"
+        " tst %3, #1                      \n\t" /* check for odd samples */
+        " beq  4f                         \n\t"
+
+        "3:                               \n\t"
+        " ldrd r2, [r6], #8               \n\t" /* 2 samples at a time */
+        " ldr  r0, [%0]                   \n\t"
+
+        " smulwt r2, r2, r0               \n\t"
+        " smulwb r3, r3, r0               \n\t"
+
+        " ssat r2, #16, r2                \n\t"
+        " ssat r3, #16, r3                \n\t"
+
+        " pkhbt r0, r3, r2, LSL #16       \n\t"
+        " str  r0, [%0], #4               \n\t"
+
+        MOD_INC()
+
+        "4:                               \n\t"
+        " movs %3, %3, LSR #1             \n\t"
+        " beq  6f                         \n\t"
+
+        "5:                               \n\t"
+        " ldrd r2, [r6], #8               \n\t" /* 4 samples at a time */
+        " ldrd r4, [r6], #8               \n\t"
+        " ldrd r0, [%0]                   \n\t"
+
+        " smulwt r2, r2, r0               \n\t"
+        " smulwb r3, r3, r0               \n\t"
+        " smulwt r4, r4, r1               \n\t"
+        " smulwb r5, r5, r1               \n\t"
+
+        " ssat r2, #16, r2                \n\t"
+        " ssat r3, #16, r3                \n\t"
+        " ssat r4, #16, r4                \n\t"
+        " ssat r5, #16, r5                \n\t"
+
+        " pkhbt r0, r3, r2, LSL #16       \n\t"
+        " pkhbt r1, r5, r4, LSL #16       \n\t"
+        " strd  r0, [%0], #8              \n\t"
+
+        MOD_INC()
+
+        " subs %3, %3, #1                 \n\t"
+        " bne 5b                          \n\t"
+        "6:                               \n\t"
+
+        : "+r" (samples), "+r" (volumes), "+r" (ve), "+r" (length)
+        :
+        : "r6", "r5", "r4", "r3", "r2", "r1", "r0", "cc"
+    );
+}
+
+static inline void vol_s16ne_neon(int32x4_t vol4, int16_t *samples, unsigned length) {
+    unsigned i;
+    int16x4_t hi = vshrn_n_s32(vol4, 16);
+    int32x4_t lo = vandq_s32(vol4, vdupq_n_s32(0xFFFF));
+    
+    for (i = 0; i < length/8; i++) {
+        int16x4_t v1 = ((int16x4_t *) samples)[2*i];
+        int16x4_t v2 = ((int16x4_t *) samples)[2*i+1];
+
+        int32x4_t t1 = vmull_s16(v1, hi);
+        int32x4_t t2 = vmull_s16(v2, hi);
+
+        int16x4_t r1 = vqmovn_s32(vsraq_n_s32(t1, vmulq_s32(vmovl_s16(v1), lo), 16));
+        int16x4_t r2 = vqmovn_s32(vsraq_n_s32(t2, vmulq_s32(vmovl_s16(v2), lo), 16));
+      
+        ((int16x8_t *)samples)[i] = vcombine_s16(r1, r2);
+    }
+}
+
+void pa_volume_s16ne_neon(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
+    unsigned channel = 0, i;
+    int32x4_t vol4;
+
+    length /= sizeof(int16_t);
+
+    switch (channels) {
+        case 1:
+            vol4 = vdupq_n_s32(*volumes);
+            vol_s16ne_neon(vol4, samples, length);
+            
+            for (i = length & ~7; i < length; i++) {
+                int32_t t = samples[i];
+                t = ((t * (*volumes & 0xFFFF)) >> 16) + (t * (*volumes >> 16));
+                samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
+            }
+            break;
+        case 2:
+            vol4 = vcombine_s32(*(int32x2_t *)volumes, *(int32x2_t *)volumes);
+            vol_s16ne_neon(vol4, samples, length);
+            
+            for (i = length & ~7; i < length; i++) {
+                int32_t t = samples[i];
+                int32_t vol = volumes[(channel++) & 1];
+                t = ((t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16));
+                samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
+            }
+            break;
+        case 4:
+            vol4 = *(int32x4_t *)volumes;
+            vol_s16ne_neon(vol4, samples, length);
+            
+            for (i = length & ~7; i < length; i++) {
+                int32_t t = samples[i];
+                int32_t vol = volumes[(channel++) & 3];
+                t = ((t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16));
+                samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
+            }
+            break;
+        default:
+            for (; length; length--) {
+                int32_t t, hi, lo;
+
+                /* Multiplying the 32bit volume factor with the 16bit
+                 * sample might result in an 48bit value. We want to
+                 * do without 64 bit integers and hence do the
+                 * multiplication independently for the HI and LO part
+                 * of the volume. */
+
+                hi = volumes[channel] >> 16;
+                lo = volumes[channel] & 0xFFFF;
+
+                t = (int32_t)(*samples);
+                t = ((t * lo) >> 16) + (t * hi);
+                t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
+                *samples++ = (int16_t) t;
+
+                if (PA_UNLIKELY(++channel >= channels))
+                    channel = 0;
+            }
+            break;
+    }
+}
+
+void pa_volume_float32ne_neon(float *samples, float *volumes, unsigned channels, unsigned length) {
+    unsigned channel = 0, i;
+    float32x4_t vol4;
+
+    length /= sizeof(float);
+
+    switch (channels) {
+        case 1:
+            vol4 = vdupq_n_f32(*volumes);
+            for (i = 0; i < length/4; i++) {
+                ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4);
+            }
+
+            for (i = length & ~3; i < length; i++) {
+                samples[i] *= volumes[0];
+            }
+            break;
+        case 2:
+            vol4 = vcombine_f32(*(float32x2_t *)volumes, *(float32x2_t *)volumes);
+            for (i = 0; i < length/4; i++) {
+                ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4);
+            }
+
+            for (i = length & ~3; i < length; i++) {
+                samples[i] *= volumes[channel];
+
+                if (PA_UNLIKELY(++channel >= channels))
+                    channel = 0;
+            }
+            break;
+        case 4:
+            vol4 = *(float32x4_t *)volumes;
+            for (i = 0; i < length/4; i++) {
+                ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4);
+            }
+
+            for (i = length & ~3; i < length; i++) {
+                samples[i] *= volumes[channel++];
+            }
+            break;
+        default:
+            for (; length; length--) {
+                *samples++ *= volumes[channel];
+
+                if (PA_UNLIKELY(++channel >= channels))
+                    channel = 0;
+            }
+            break;
+    }
+}
+
+#define SAMPLES 1019
+#define TIMES 1000
+#define CHANNELS 4
+#define PADDING 16
+
+static void run_test_float(void) {
+    float floats[SAMPLES];
+    float floats_ref[SAMPLES];
+    float floats_orig[SAMPLES];
+    float volumes[CHANNELS];
+    unsigned i;
+    pa_usec_t start, stop;
+
+    pa_log_debug("checking NEON volume_float32ne(%d)", SAMPLES);
+
+    for (i = 0; i < SAMPLES; i++) {
+        floats_orig[i] = rand()/(float) RAND_MAX - 0.5f;
+    }
+    memcpy(floats_ref, floats_orig, sizeof(floats_orig));
+    memcpy(floats, floats_orig, sizeof(floats_orig));
+
+    for (i = 0; i < CHANNELS; i++)
+        volumes[i] = 0.5f * rand() / (float) RAND_MAX;
+
+    pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats));
+    pa_volume_float32ne_c(floats_ref, volumes, CHANNELS, sizeof(floats_ref));
+    
+    for (i = 0; i < SAMPLES; i++) {
+        if (fabsf(floats[i] - floats_ref[i]) > 0.00001) {
+            pa_log_debug("%d: %.3f != %.3f (%.3f)", i, floats[i], floats_ref[i],
+                      floats_orig[i]);
+        }
+    }
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        memcpy(floats, floats_orig, sizeof(floats_orig));
+        pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats));
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        memcpy(floats_ref, floats_orig, sizeof(floats_orig));
+        pa_volume_float32ne_c(floats_ref, volumes, CHANNELS, sizeof(floats_ref));
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+
+static void run_test_s16(void) {
+    int16_t samples[SAMPLES];
+    int16_t samples_ref[SAMPLES];
+    int16_t samples_orig[SAMPLES];
+    int32_t volumes[CHANNELS + PADDING];
+    unsigned i, padding;
+    pa_usec_t start, stop;
+
+    pa_log_debug("checking NEON volume_s16ne(%d)", SAMPLES);
+
+    for (i = 0; i < SAMPLES; i++) {
+        samples_orig[i] = rand() - RAND_MAX/2;
+    }
+    memcpy(samples_ref, samples_orig, sizeof(samples_orig));
+    memcpy(samples, samples_orig, sizeof(samples_orig));
+
+    for (i = 0; i < CHANNELS; i++)
+        volumes[i] = PA_CLAMP_VOLUME(rand() >> 15);
+    for (padding = 0; padding < PADDING; padding++, i++)
+        volumes[i] = volumes[padding];
+
+    pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples));
+    pa_volume_s16ne_c(samples_ref, volumes, CHANNELS, sizeof(samples_ref));
+
+    for (i = 0; i < SAMPLES; i++) {
+        if (abs(samples[i] - samples_ref[i]) > 0) {
+            pa_log_debug("%d: %d != %d (%d)", i, samples[i], samples_ref[i],
+                      samples_orig[i]);
+        }
+    }
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        memcpy(samples, samples_orig, sizeof(samples_orig));
+        pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples));
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        memcpy(samples, samples_orig, sizeof(samples_orig));
+        pa_volume_s16ne_arm(samples, volumes, CHANNELS, sizeof(samples));
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ARM: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        memcpy(samples_ref, samples_orig, sizeof(samples_orig));
+        pa_volume_s16ne_c(samples_ref, volumes, CHANNELS, sizeof(samples_ref));
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+
+#endif /* defined(__arm__) */
+
+int main() {
+
+    run_test_float();
+    run_test_s16();
+
+    return EXIT_SUCCESS;
+}

Repositories maintained by Peter Meerwald, pmeerw@pmeerw.net.