view svolume_neon.c @ 5:07763f536182 default tip

ALIGNment support
author Peter Meerwald <p.meerwald@bct-electronic.com>
date Sun, 08 Jul 2012 21:48:08 +0200
parents 1f6289166006
children
line wrap: on
line source

/*
 * Copyright 2012 Peter Meerwald <p.meerwald@bct-electronic.com>
 */

#include <assert.h>
#include <math.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>


/*
 * Minimal local definitions needed by the volume routines below.
 *
 * The fixed-width integer types (int16_t, int32_t, uint32_t) and UINT32_MAX
 * are taken from <stdint.h> instead of being re-declared by hand, so they no
 * longer depend on which other header happens to pull them in.
 */
typedef enum pa_sample_format {
    PA_SAMPLE_S16LE,
    PA_SAMPLE_FLOAT32LE,
} pa_sample_format_t;
/* "Native endian" aliases; this file maps them to the little-endian formats. */
#define PA_SAMPLE_S16NE PA_SAMPLE_S16LE
#define PA_SAMPLE_FLOAT32NE PA_SAMPLE_FLOAT32LE

/* Time value in microseconds (see pa_rtclock_now()). */
typedef long long unsigned int pa_usec_t;

/* Assertions are mapped directly onto <assert.h>. */
#define pa_assert(x) assert(x)
#define pa_assert_not_reached() assert(0)

/* NOTE: function-like macros below evaluate their arguments more than once;
 * do not pass expressions with side effects. */
#define PA_MAX(a, b) ((a) > (b) ? (a) : (b))

typedef uint32_t pa_volume_t;
#define PA_VOLUME_MUTED ((pa_volume_t) 0U)
#define PA_VOLUME_MAX ((pa_volume_t) UINT32_MAX/2)

/* Branch-prediction hint placeholder: expands to the plain expression here. */
#define PA_UNLIKELY(x) (x)
/* Clamp x into [low, high]; both out-of-range cases are marked as unlikely. */
#define PA_CLAMP_UNLIKELY(x, low, high) (PA_UNLIKELY((x) > (high)) ? (high) : (PA_UNLIKELY((x) < (low)) ? (low) : (x)))
#define PA_CLAMP_VOLUME(v) (PA_CLAMP_UNLIKELY((v), PA_VOLUME_MUTED, PA_VOLUME_MAX))

/*
 * printf-style logger: formats the message and writes it to stdout, followed
 * by a newline.
 *
 * The message is formatted directly onto the stream, so its length is not
 * limited by an intermediate fixed-size buffer.
 */
static void pa_log_info(const char *format, ...)  {
    va_list ap;

    va_start(ap, format);
    vprintf(format, ap);
    va_end(ap);

    putchar('\n');
}

/* Debug messages go through the same logger as info messages. */
#define pa_log_debug pa_log_info

/* Return the current gettimeofday() time expressed in microseconds. */
static pa_usec_t pa_rtclock_now() {
    struct timeval now;
    pa_usec_t usec;

    gettimeofday(&now, NULL);

    /* Seconds are scaled in unsigned 64-bit arithmetic, then the
     * sub-second part is added. */
    usec = now.tv_sec * 1000000ULL;
    usec += now.tv_usec;

    return usec;
}

/*
 * Reference (plain C) volume scaling for interleaved 16-bit samples.
 *
 * samples:  buffer scaled in place
 * volumes:  one 32-bit volume factor per channel
 * channels: number of interleaved channels
 * length:   size of the sample buffer in bytes
 */
static void pa_volume_s16ne_c(int16_t *samples, const uint32_t *volumes, unsigned channels, unsigned length) {
    const unsigned n_samples = length / sizeof(int16_t);
    unsigned ch = 0;
    unsigned idx;

    for (idx = 0; idx < n_samples; idx++) {
        /* The product of a 32-bit volume and a 16-bit sample can need
         * 48 bits. To stay within 32-bit arithmetic the volume is split
         * into its upper and lower 16-bit halves and each half is
         * multiplied separately. */
        const uint32_t vol = volumes[ch];
        const int32_t vol_hi = vol >> 16;
        const int32_t vol_lo = vol & 0xFFFF;
        const int32_t in = (int32_t) samples[idx];
        int32_t scaled = ((in * vol_lo) >> 16) + (in * vol_hi);

        scaled = PA_CLAMP_UNLIKELY(scaled, -0x8000, 0x7FFF);
        samples[idx] = (int16_t) scaled;

        /* Advance to the next channel, wrapping after the last one. */
        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;
    }
}

/*
 * Reference (plain C) volume scaling for interleaved float samples.
 *
 * samples:  buffer scaled in place
 * volumes:  one multiplier per channel
 * channels: number of interleaved channels
 * length:   size of the sample buffer in bytes
 */
static void pa_volume_float32ne_c(float *samples, const float *volumes, unsigned channels, unsigned length) {
    const unsigned n_samples = length / sizeof(float);
    unsigned ch = 0;
    unsigned idx;

    for (idx = 0; idx < n_samples; idx++) {
        samples[idx] *= volumes[ch];

        /* Advance to the next channel, wrapping after the last one. */
        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;
    }
}

#if defined(__arm__)

#include "arm_neon.h"

/*
 * Assembly fragment that wraps the pointer held in r6.
 *
 * It computes r0 = r6 - %2 and, only when that subtraction sets the carry
 * flag (the "cs" condition), sets r6 = r0 + %1. In pa_volume_s16ne_arm()
 * operand %1 is `volumes` and operand %2 is `ve`, and r6 is the register
 * the volumes are loaded through.
 *
 * Overwrites r0 and the condition flags.
 */
#define MOD_INC() \
    " subs  r0, r6, %2              \n\t" \
    " itt cs                        \n\t" \
    " addcs r0, %1                  \n\t" \
    " movcs r6, r0                  \n\t"

/*
 * Volume scaling for interleaved 16-bit samples written in ARM assembly.
 *
 * samples:  buffer scaled in place
 * volumes:  per-channel 32-bit volume factors; the code reads past
 *           volumes[channels - 1] (see the comment below), so the caller
 *           must supply padding
 * channels: number of interleaved channels
 * length:   size of the sample buffer in bytes (halved at the start of the asm)
 *
 * Layout of the assembly:
 *   label 1: one sample, executed only when the sample count is odd
 *   label 3: two samples, executed only when the remaining pair count is odd
 *   label 5: loop over the remaining groups of four samples
 * After each of these steps MOD_INC() wraps the volume pointer in r6.
 *
 * NOTE(review): in the two- and four-sample paths the first loaded volume
 * (r2/r4) is multiplied with the TOP halfword of the sample word (smulwt)
 * and the second volume with the BOTTOM halfword (smulwb). On a
 * little-endian target the first sample is in the bottom halfword, so this
 * looks like the volumes of adjacent samples are swapped - confirm.
 * The result of this function is only timed, not verified, by run_test_s16().
 *
 * NOTE(review): the clobber list has no "memory" entry although the asm
 * stores through `samples` - confirm this is intended.
 */
static void pa_volume_s16ne_arm(int16_t *samples, const uint32_t *volumes, unsigned channels, unsigned length) {
    /* Channels must be at least 4, and always a multiple of the original number.
     * This is also the max amount we overread the volume array, which should
     * have enough padding. */
    channels = channels == 3 ? 6 : PA_MAX(4U, channels);
    /* End of the (padded) volume window; MOD_INC() wraps r6 against this. */
    const uint32_t *ve = volumes + channels;

    __asm__ __volatile__ (
        " mov r6, %1                      \n\t"
        " mov %3, %3, LSR #1              \n\t" /* length /= sizeof (int16_t) */
        " tst %3, #1                      \n\t" /* check for odd samples */
        " beq  2f                         \n\t"

        "1:                               \n\t"
        " ldr  r0, [r6], #4               \n\t" /* odd samples volumes */
        " ldrh r2, [%0]                   \n\t"

        " smulwb r0, r0, r2               \n\t"
        " ssat r0, #16, r0                \n\t"

        " strh r0, [%0], #2               \n\t"

        MOD_INC()

        "2:                               \n\t"
        " mov %3, %3, LSR #1              \n\t"
        " tst %3, #1                      \n\t" /* check for odd samples */
        " beq  4f                         \n\t"

        "3:                               \n\t"
        " ldrd r2, [r6], #8               \n\t" /* 2 samples at a time */
        " ldr  r0, [%0]                   \n\t"

        " smulwt r2, r2, r0               \n\t"
        " smulwb r3, r3, r0               \n\t"

        " ssat r2, #16, r2                \n\t"
        " ssat r3, #16, r3                \n\t"

        " pkhbt r0, r3, r2, LSL #16       \n\t"
        " str  r0, [%0], #4               \n\t"

        MOD_INC()

        "4:                               \n\t"
        " movs %3, %3, LSR #1             \n\t"
        " beq  6f                         \n\t"

        "5:                               \n\t"
        " ldrd r2, [r6], #8               \n\t" /* 4 samples at a time */
        " ldrd r4, [r6], #8               \n\t"
        " ldrd r0, [%0]                   \n\t"

        " smulwt r2, r2, r0               \n\t"
        " smulwb r3, r3, r0               \n\t"
        " smulwt r4, r4, r1               \n\t"
        " smulwb r5, r5, r1               \n\t"

        " ssat r2, #16, r2                \n\t"
        " ssat r3, #16, r3                \n\t"
        " ssat r4, #16, r4                \n\t"
        " ssat r5, #16, r5                \n\t"

        " pkhbt r0, r3, r2, LSL #16       \n\t"
        " pkhbt r1, r5, r4, LSL #16       \n\t"
        " strd  r0, [%0], #8              \n\t"

        MOD_INC()

        " subs %3, %3, #1                 \n\t"
        " bne 5b                          \n\t"
        "6:                               \n\t"

        /* %0 = samples, %1 = volumes, %2 = ve, %3 = length */
        : "+r" (samples), "+r" (volumes), "+r" (ve), "+r" (length)
        :
        : "r6", "r5", "r4", "r3", "r2", "r1", "r0", "cc"
    );
}

/*
 * Scale 16-bit samples with NEON, four samples per loop iteration.
 *
 * vol4:    four 32-bit volume factors, applied to samples 0..3 of every group
 * samples: buffer scaled in place
 * length:  number of samples; only the first (length / 4) * 4 samples are
 *          processed, the caller has to handle the remainder
 *
 * Each volume is split into its upper and lower 16 bits; the result is
 * sample * hi + ((sample * lo) >> 16), narrowed to 16 bits with saturation
 * so that it clips the same way as the clamping C code.
 */
static inline void vol_s16_neon(const uint32x4_t *vol4, int16_t *samples, unsigned length) {
    /* The loop below tests its counter at the bottom, so it would process
     * (and write) one full group of four samples even for a zero count. */
    if (length < 4)
        return;

    asm volatile (
    "mov        %[length], %[length], lsr #2\n\t" /* number of 4-sample groups */
    "vld1.s32   {q0}, [%[vol]]\n\t"
    "vshl.u32   q3, q0, #16\n\t" /* lo */
    "vshrn.s32  d1, q0, #16\n\t" /* hi */
    "vshr.u32   q3, q3, #16\n\t"
    "1:\n\t"
    "vld1.16    {d0}, [%[samples]]\n\t"

    "vmull.s16  q1, d0, d1\n\t" /* sample * hi */

    "vmovl.s16  q2, d0\n\t"
    "vmul.s32   q2, q2, q3\n\t" /* sample * lo */

    "vsra.s32   q1, q2, #16\n\t" /* q1 += (sample * lo) >> 16 */
    "vqmovn.s32 d0, q1\n\t" /* saturating narrow to 16 bit */

    "subs       %[length], %[length], #1\n\t"
    "vst1.16    {d0}, [%[samples]]!\n\t"
    "bgt        1b\n\t"
      /* output operands (or input operands that get modified) */
    : [samples] "+r" (samples), [length] "+r" (length)
    : [vol] "r" (vol4) /* input operands */
    : "memory", "cc", "q0", "q1", "q2", "q3" /* clobber list */
    );
}

/*
 * Scale float samples with NEON, four samples per loop iteration.
 *
 * vol4:    four multipliers, applied to samples 0..3 of every group
 * samples: buffer scaled in place
 * length:  number of samples; only the first (length / 4) * 4 samples are
 *          processed, the caller has to handle the remainder
 */
static inline void vol_float_neon(const float32x4_t *vol4, float *samples, unsigned length) {
    /* The loop below tests its counter at the bottom, so it would process
     * (and write) one full group of four samples even for a zero count. */
    if (length < 4)
        return;

    asm volatile (
    "mov        %[length], %[length], lsr #2\n\t" /* number of 4-sample groups */
    "vld1.32    {q1}, [%[vol]]\n\t"
    "1:\n\t"
    "vld1.32    {q0}, [%[samples]]\n\t"
    "vmul.f32   q0, q0, q1\n\t"
    "subs       %[length], %[length], #1\n\t"
    "vst1.32    {q0}, [%[samples]]!\n\t"
    "bgt        1b\n\t"
      /* output operands (or input operands that get modified) */
    : [samples] "+r" (samples), [length] "+r" (length)
    : [vol] "r" (vol4) /* input operands */
    : "memory", "cc", "q0", "q1" /* clobber list */
    );
}

static void pa_volume_s16ne_neon(int16_t *samples, const uint32_t *volumes, unsigned channels, unsigned length) {
    unsigned channel = 0, i;
    uint32x4_t vol4;

    length /= sizeof(int16_t);

    switch (channels) {
        case 1:
            vol4 = vdupq_n_u32(*volumes);
            vol_s16_neon(&vol4, samples, length);

            for (i = length & ~3; i < length; i++) {
                int32_t t = samples[i];
                t = ((int32_t) (t * (*volumes & 0xFFFF)) >> 16) + (t * (*volumes >> 16));
                samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
            }
            break;
        case 2:
            vol4 = vcombine_u32(*(uint32x2_t *)volumes, *(uint32x2_t *)volumes);
            vol_s16_neon(&vol4, samples, length);

            for (i = length & ~3; i < length; i++) {
                int32_t t = samples[i];
                uint32_t vol = volumes[(channel++) & 1];
                t = ((int32_t) (t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16));
                samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
            }
            break;
        case 4:
            vol4 = *(uint32x4_t *)volumes;
            vol_s16_neon(&vol4, samples, length);

            for (i = length & ~3; i < length; i++) {
                int32_t t = samples[i];
                uint32_t vol = volumes[(channel++) & 3];
                t = ((int32_t) (t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16));
                samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
            }
            break;
        default:
            for (; length; length--) {
                int32_t t;
                uint32_t hi, lo;

                /* Multiplying the 32bit volume factor with the 16bit
                 * sample might result in an 48bit value. We want to
                 * do without 64 bit integers and hence do the
                 * multiplication independently for the HI and LO part
                 * of the volume. */

                hi = volumes[channel] >> 16;
                lo = volumes[channel] & 0xFFFF;

                t = (int32_t)(*samples);
                t = ((int32_t) (t * lo) >> 16) + (t * hi);
                t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
                *samples++ = (int16_t) t;

                if (PA_UNLIKELY(++channel >= channels))
                    channel = 0;
            }
            break;
    }
}

static void pa_volume_float32ne_neon(float *samples, const float *volumes, unsigned channels, unsigned length) {
    unsigned channel = 0, i;
    float32x4_t vol4;

    length /= sizeof(float);

    switch (channels) {
        case 1:
            vol4 = vdupq_n_f32(*volumes);
            vol_float_neon(&vol4, samples, length);

            for (i = length & ~3; i < length; i++) {
                samples[i] *= volumes[0];
            }
            break;
        case 2:
            vol4 = vcombine_f32(*(float32x2_t *)volumes, *(float32x2_t *)volumes);
            vol_float_neon(&vol4, samples, length);

            for (i = length & ~3; i < length; i++) {
                samples[i] *= volumes[channel];

                if (PA_UNLIKELY(++channel >= channels))
                    channel = 0;
            }
            break;
        case 4:
            vol4 = *(float32x4_t *)volumes;
            vol_float_neon(&vol4, samples, length);

            for (i = length & ~3; i < length; i++) {
                samples[i] *= volumes[channel++];
            }
            break;
        default:
            for (; length; length--) {
                *samples++ *= volumes[channel];

                if (PA_UNLIKELY(++channel >= channels))
                    channel = 0;
            }
            break;
    }
}

/*
 * Parameters of the tests below.
 *
 * SAMPLES:  number of samples per test buffer; 1019 is not a multiple of
 *           four, so the scalar tail of the NEON routines is exercised too
 * TIMES:    number of iterations of each timing loop
 * CHANNELS: channel count passed to the volume routines
 * PADDING:  extra entries appended to the volume array in run_test_s16()
 *           (pa_volume_s16ne_arm() reads past the per-channel volumes)
 * ALIGN:    element offset added to the buffer start before it is passed
 *           to the routines under test
 */
#define SAMPLES 1019
#define TIMES 50000
#define CHANNELS 4
#define PADDING 16
#define ALIGN 1

/*
 * Check pa_volume_float32ne_neon() against the C reference on random data
 * and then time both implementations.
 *
 * The buffers hold SAMPLES + ALIGN elements and the routines are started
 * ALIGN elements into them, so exactly SAMPLES elements may be processed;
 * passing the size of the whole array would run ALIGN elements past the end.
 */
static void run_test_float(void) {
    float floats[SAMPLES+ALIGN];
    float floats_ref[SAMPLES+ALIGN];
    float floats_orig[SAMPLES+ALIGN];
    float volumes[CHANNELS];
    const unsigned nbytes = SAMPLES * sizeof(float);
    unsigned i;
    pa_usec_t start, stop;

    pa_log_debug("checking NEON volume_float32ne(%d)", SAMPLES);

    for (i = 0; i < SAMPLES+ALIGN; i++) {
        floats_orig[i] = rand()/(float) RAND_MAX - 0.5f;
    }
    memcpy(floats_ref, floats_orig, sizeof(floats_orig));
    memcpy(floats, floats_orig, sizeof(floats_orig));

    for (i = 0; i < CHANNELS; i++)
        volumes[i] = 0.5f * rand() / (float) RAND_MAX;

    pa_volume_float32ne_neon(floats+ALIGN, volumes, CHANNELS, nbytes);
    pa_volume_float32ne_c(floats_ref+ALIGN, volumes, CHANNELS, nbytes);

    /* Report every sample where the two implementations disagree. */
    for (i = ALIGN; i < SAMPLES+ALIGN; i++) {
        if (fabsf(floats[i] - floats_ref[i]) > 0.00001f) {
            pa_log_debug("%u: %.3f != %.3f (%.3f)", i, floats[i], floats_ref[i],
                      floats_orig[i]);
        }
    }

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        memcpy(floats, floats_orig, sizeof(floats_orig));
        pa_volume_float32ne_neon(floats+ALIGN, volumes, CHANNELS, nbytes);
    }
    stop = pa_rtclock_now();
    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        memcpy(floats_ref, floats_orig, sizeof(floats_orig));
        pa_volume_float32ne_c(floats_ref+ALIGN, volumes, CHANNELS, nbytes);
    }
    stop = pa_rtclock_now();
    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
}

static void run_test_s16(void) {
    int16_t samples[SAMPLES+ALIGN];
    int16_t samples_ref[SAMPLES+ALIGN];
    int16_t samples_orig[SAMPLES+ALIGN];
    uint32_t volumes[CHANNELS + PADDING];
    unsigned i, padding;
    pa_usec_t start, stop;

    pa_log_debug("checking NEON volume_s16ne(%d)", SAMPLES);

    for (i = 0; i < SAMPLES; i++) {
        samples_orig[i] = rand() - RAND_MAX/2;
    }
    memcpy(samples_ref, samples_orig, sizeof(samples_orig));
    memcpy(samples, samples_orig, sizeof(samples_orig));

    for (i = 0; i < CHANNELS; i++)
        volumes[i] = PA_CLAMP_VOLUME(rand() >> 15);
    for (padding = 0; padding < PADDING; padding++, i++)
        volumes[i] = volumes[padding];

    pa_volume_s16ne_neon(samples+ALIGN, volumes, CHANNELS, sizeof(samples));
    pa_volume_s16ne_c(samples_ref+ALIGN, volumes, CHANNELS, sizeof(samples_ref));

    for (i = ALIGN; i < SAMPLES+ALIGN; i++) {
        if (abs(samples[i] - samples_ref[i]) > 0) {
            pa_log_debug("%d: %d != %d (%d)", i, samples[i], samples_ref[i],
                      samples_orig[i]);
        }
    }

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        memcpy(samples, samples_orig, sizeof(samples_orig));
        pa_volume_s16ne_neon(samples+ALIGN, volumes, CHANNELS, sizeof(samples));
    }
    stop = pa_rtclock_now();
    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        memcpy(samples, samples_orig, sizeof(samples_orig));
        pa_volume_s16ne_arm(samples+ALIGN, volumes, CHANNELS, sizeof(samples));
    }
    stop = pa_rtclock_now();
    pa_log_info("ARM: %llu usec.", (long long unsigned int)(stop - start));

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        memcpy(samples_ref, samples_orig, sizeof(samples_orig));
        pa_volume_s16ne_c(samples_ref, volumes, CHANNELS, sizeof(samples_ref));
    }
    stop = pa_rtclock_now();
    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
}

#endif /* defined(__arm__) */

/*
 * Run the float and 16-bit volume tests.
 *
 * The test functions are only compiled when __arm__ is defined, so on any
 * other target the program just reports that it cannot run and fails,
 * instead of breaking the build with undefined references.
 */
int main(void) {
#if defined(__arm__)
    run_test_float();
    run_test_s16();

    return EXIT_SUCCESS;
#else
    fputs("svolume_neon: the ARM/NEON volume tests require an ARM target\n", stderr);

    return EXIT_FAILURE;
#endif
}

Repositories maintained by Peter Meerwald, pmeerw@pmeerw.net.