view svolume_neon.c @ 0:e0040ee59c3c

import
author Peter Meerwald <p.meerwald@bct-electronic.com>
date Thu, 12 Jan 2012 17:27:46 +0100
parents
children b829afbea564
line wrap: on
line source

/*
 * Copyright 2012 Peter Meerwald <p.meerwald@bct-electronic.com>
 */

#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
#include <assert.h>


/* Local stand-ins for the fixed-width integer and pa_* types used below,
 * declared here instead of being pulled in from library headers
 * (presumably mirroring PulseAudio's definitions -- TODO confirm). */
typedef short int16_t;
typedef unsigned int uint32_t;
/* Sample formats; only the two enumerators below are declared. */
typedef enum pa_sample_format {
    PA_SAMPLE_S16LE,
    PA_SAMPLE_FLOAT32LE,
} pa_sample_format_t;
/* The native-endian names are aliased to the little-endian enumerators. */
#define PA_SAMPLE_S16NE PA_SAMPLE_S16LE
#define PA_SAMPLE_FLOAT32NE PA_SAMPLE_FLOAT32LE
/* Remap context and callback type; not referenced by the functions visible
 * in this file. */
typedef struct {
    pa_sample_format_t *format;
} pa_remap_t;
typedef void (*pa_remap_func_t)(pa_remap_t *m, void *dst, const void *src, unsigned n);
/* Time value in microseconds (see pa_rtclock_now()). */
typedef long long unsigned int pa_usec_t;

/* Assertion helpers mapped directly onto assert(). */
#define pa_assert(x) assert(x)
#define pa_assert_not_reached() assert(0)

/* Larger of a and b. Function-like macro: the selected argument is
 * evaluated twice, so avoid arguments with side effects. */
#define PA_MAX(a, b) ((a) > (b) ? (a) : (b))

/* Volume factor type and its bounds. */
typedef uint32_t pa_volume_t;
#define PA_VOLUME_MUTED ((pa_volume_t) 0U)
/* The cast binds to UINT32_MAX before the division by 2.
 * NOTE(review): UINT32_MAX comes from <stdint.h>, which this file never
 * includes by name -- it has to arrive through another header. */
#define PA_VOLUME_MAX ((pa_volume_t) UINT32_MAX/2)

/* Branch hint placeholder: expands to its argument unchanged. */
#define PA_UNLIKELY(x) (x)
/* Clamp x to [low, high]. Arguments can be evaluated more than once. */
#define PA_CLAMP_UNLIKELY(x, low, high) (PA_UNLIKELY((x) > (high)) ? (high) : (PA_UNLIKELY((x) < (low)) ? (low) : (x)))
/* Clamp a volume to [PA_VOLUME_MUTED, PA_VOLUME_MAX]. */
#define PA_CLAMP_VOLUME(v) (PA_CLAMP_UNLIKELY((v), PA_VOLUME_MUTED, PA_VOLUME_MAX))

/*
 * Format a printf-style message and print it to stdout followed by a
 * newline.
 *
 * format: printf-style format string; the variadic arguments must match it.
 *
 * The message is rendered into a fixed 1024-byte buffer. vsnprintf() bounds
 * the write, so longer messages are truncated instead of overrunning the
 * stack buffer.
 */
static void pa_log_info(const char *format, ...) {
    va_list ap;
    char buf[1024];

    va_start(ap, format);
    vsnprintf(buf, sizeof(buf), format, ap);
    va_end(ap);

    printf("%s\n", buf);
}

/* Debug logging goes through the same routine as info logging. */
#define pa_log_debug pa_log_info

/*
 * Return the current time of day, as reported by gettimeofday(), expressed
 * in microseconds.
 */
static pa_usec_t pa_rtclock_now() {
    struct timeval now;
    pa_usec_t usec;

    gettimeofday(&now, NULL);

    /* Seconds are scaled to microseconds, then the sub-second part is added. */
    usec = now.tv_sec * 1000000ULL;
    usec = usec + now.tv_usec;

    return usec;
}

/*
 * Reference C implementation: scale interleaved 16-bit samples in place.
 *
 * samples:  sample buffer, modified in place.
 * volumes:  one 32-bit volume factor per channel; the upper 16 bits act as
 *           the integer part and the lower 16 bits as the fraction.
 * channels: number of entries of volumes[] that are cycled through.
 * length:   size of the sample buffer in bytes.
 */
void pa_volume_s16ne_c(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
    unsigned ch = 0;
    unsigned idx;
    unsigned count = length / sizeof(int16_t);

    for (idx = 0; idx < count; idx++) {
        /* A 32-bit volume times a 16-bit sample can need 48 bits. To stay
         * within 32-bit arithmetic the volume is split into its high and
         * low halves, which are multiplied separately. */
        int32_t vol = volumes[ch];
        int32_t hi = vol >> 16;
        int32_t lo = vol & 0xFFFF;
        int32_t s = (int32_t) samples[idx];
        int32_t scaled = ((s * lo) >> 16) + (s * hi);

        scaled = PA_CLAMP_UNLIKELY(scaled, -0x8000, 0x7FFF);
        samples[idx] = (int16_t) scaled;

        /* Advance to the next channel, wrapping after the last one. */
        if (PA_UNLIKELY(++ch >= channels))
            ch = 0;
    }
}

/*
 * Reference C implementation: multiply interleaved float samples in place
 * by their channel's volume factor.
 *
 * samples:  sample buffer, modified in place.
 * volumes:  one factor per channel.
 * channels: number of entries of volumes[] that are cycled through.
 * length:   size of the sample buffer in bytes.
 */
void pa_volume_float32ne_c(float *samples, float *volumes, unsigned channels, unsigned length) {
    unsigned ch = 0;
    unsigned idx;
    unsigned count = length / sizeof(float);

    for (idx = 0; idx < count; idx++) {
        samples[idx] = samples[idx] * volumes[ch];

        /* Advance to the next channel, wrapping after the last one. */
        if (PA_UNLIKELY(++ch >= channels))
            ch = 0;
    }
}

/*
void pa_volume_s16ne_orc(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    if (channels == 2) {
        int64_t v = (int64_t)volumes[1] << 32 | volumes[0];
        pa_volume_s16ne_orc_2ch (samples, v, ((length / (sizeof(int16_t))) / 2));
    } else if (channels == 1)
        pa_volume_s16ne_orc_1ch (samples, volumes[0], length / (sizeof(int16_t)));
}
*/

#if defined(__arm__)

#include "arm_neon.h"

/* Asm fragment that wraps the running pointer held in r6.
 * It computes r0 = r6 - %2; when that subtraction does not borrow
 * (r6 >= %2, unsigned) it sets r6 = %1 + (r6 - %2). Otherwise r6 is left
 * alone. r0 is used as scratch and the condition flags are updated. */
#define MOD_INC() \
    " subs  r0, r6, %2              \n\t" \
    " itt cs                        \n\t" \
    " addcs r0, %1                  \n\t" \
    " movcs r6, r0                  \n\t"

/*
 * Scale interleaved 16-bit samples in place using hand-written ARM assembly.
 *
 * samples:  sample buffer, modified in place.
 * volumes:  32-bit volume factors, read cyclically through r6. The array is
 *           read past `channels` entries and must be padded (see below).
 * channels: number of channels.
 * length:   size of the sample buffer in bytes; the asm shifts it right by
 *           one to get the sample count.
 *
 * Processing order: one sample if the sample count is odd, then one pair if
 * the remaining pair count is odd, then four samples per loop iteration.
 * Each product uses smulwb/smulwt (32x16-bit multiply keeping the upper
 * 32 bits of the 48-bit result) followed by ssat to saturate to 16 bits.
 * MOD_INC() wraps the volume pointer after each step.
 *
 * NOTE(review): in the 2- and 4-sample paths the first volume word of each
 * pair is multiplied with the TOP halfword of the loaded sample word and
 * packed into the top halfword of the result. On a little-endian target the
 * top halfword is the second sample in memory, so adjacent volumes look
 * swapped relative to the C reference -- confirm on hardware.
 */
static void pa_volume_s16ne_arm(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
    int32_t *ve;

    /* Channels must be at least 4, and always a multiple of the original number.
     * This is also the max amount we overread the volume array, which should
     * have enough padding. */
    channels = channels == 3 ? 6 : PA_MAX (4U, channels);
    ve = volumes + channels;

    __asm__ __volatile__ (
        " mov r6, %1                      \n\t"
        " mov %3, %3, LSR #1              \n\t" /* length /= sizeof (int16_t) */
        " tst %3, #1                      \n\t" /* check for odd samples */
        " beq  2f                         \n\t"

        "1:                               \n\t"
        " ldr  r0, [r6], #4               \n\t" /* odd samples volumes */
        " ldrh r2, [%0]                   \n\t"

        " smulwb r0, r0, r2               \n\t"
        " ssat r0, #16, r0                \n\t"

        " strh r0, [%0], #2               \n\t"

        MOD_INC()

        "2:                               \n\t"
        " mov %3, %3, LSR #1              \n\t"
        " tst %3, #1                      \n\t" /* check for odd samples */
        " beq  4f                         \n\t"

        "3:                               \n\t"
        " ldrd r2, [r6], #8               \n\t" /* 2 samples at a time */
        " ldr  r0, [%0]                   \n\t"

        " smulwt r2, r2, r0               \n\t"
        " smulwb r3, r3, r0               \n\t"

        " ssat r2, #16, r2                \n\t"
        " ssat r3, #16, r3                \n\t"

        " pkhbt r0, r3, r2, LSL #16       \n\t"
        " str  r0, [%0], #4               \n\t"

        MOD_INC()

        "4:                               \n\t"
        " movs %3, %3, LSR #1             \n\t"
        " beq  6f                         \n\t"

        "5:                               \n\t"
        " ldrd r2, [r6], #8               \n\t" /* 4 samples at a time */
        " ldrd r4, [r6], #8               \n\t"
        " ldrd r0, [%0]                   \n\t"

        " smulwt r2, r2, r0               \n\t"
        " smulwb r3, r3, r0               \n\t"
        " smulwt r4, r4, r1               \n\t"
        " smulwb r5, r5, r1               \n\t"

        " ssat r2, #16, r2                \n\t"
        " ssat r3, #16, r3                \n\t"
        " ssat r4, #16, r4                \n\t"
        " ssat r5, #16, r5                \n\t"

        " pkhbt r0, r3, r2, LSL #16       \n\t"
        " pkhbt r1, r5, r4, LSL #16       \n\t"
        " strd  r0, [%0], #8              \n\t"

        MOD_INC()

        " subs %3, %3, #1                 \n\t"
        " bne 5b                          \n\t"
        "6:                               \n\t"

        : "+r" (samples), "+r" (volumes), "+r" (ve), "+r" (length)
        :
        /* "memory": the asm loads the volumes and stores the samples through
         * pointers, accesses that the register operands do not describe to
         * the compiler. */
        : "r6", "r5", "r4", "r3", "r2", "r1", "r0", "cc", "memory"
    );
}

/*
 * NEON kernel: scale 16-bit samples in place, eight at a time.
 *
 * vol4:    four 32-bit volume factors. Lane k is applied to samples whose
 *          index is k modulo 4, so both halves of each group of eight use
 *          the same four lanes.
 * samples: sample buffer, modified in place.
 * length:  number of samples. Only the first length/8 complete groups are
 *          processed; the caller handles the remainder.
 *
 * The arithmetic follows the scalar split-multiply:
 * sample * hi + ((sample * lo) >> 16), saturated to 16 bits on narrowing.
 *
 * Samples are moved with vld1q_s16()/vst1q_s16() rather than by
 * dereferencing the int16_t pointer as a vector pointer, so the buffer only
 * needs int16_t alignment.
 */
static inline void vol_s16ne_neon(int32x4_t vol4, int16_t *samples, unsigned length) {
    unsigned i;
    /* Upper 16 bits of each volume, narrowed to 16-bit lanes. */
    int16x4_t hi = vshrn_n_s32(vol4, 16);
    /* Lower 16 bits of each volume, kept in 32-bit lanes. */
    int32x4_t lo = vandq_s32(vol4, vdupq_n_s32(0xFFFF));

    for (i = 0; i < length/8; i++) {
        int16_t *p = samples + 8*i;
        int16x8_t v = vld1q_s16(p);
        int16x4_t v1 = vget_low_s16(v);
        int16x4_t v2 = vget_high_s16(v);

        /* Widening multiply by the high part. */
        int32x4_t t1 = vmull_s16(v1, hi);
        int32x4_t t2 = vmull_s16(v2, hi);

        /* Add (sample * lo) >> 16, then narrow with saturation. */
        int16x4_t r1 = vqmovn_s32(vsraq_n_s32(t1, vmulq_s32(vmovl_s16(v1), lo), 16));
        int16x4_t r2 = vqmovn_s32(vsraq_n_s32(t2, vmulq_s32(vmovl_s16(v2), lo), 16));

        vst1q_s16(p, vcombine_s16(r1, r2));
    }
}

void pa_volume_s16ne_neon(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
    unsigned channel = 0, i;
    int32x4_t vol4;

    length /= sizeof(int16_t);

    switch (channels) {
        case 1:
            vol4 = vdupq_n_s32(*volumes);
            vol_s16ne_neon(vol4, samples, length);
            
            for (i = length & ~7; i < length; i++) {
                int32_t t = samples[i];
                t = ((t * (*volumes & 0xFFFF)) >> 16) + (t * (*volumes >> 16));
                samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
            }
            break;
        case 2:
            vol4 = vcombine_s32(*(int32x2_t *)volumes, *(int32x2_t *)volumes);
            vol_s16ne_neon(vol4, samples, length);
            
            for (i = length & ~7; i < length; i++) {
                int32_t t = samples[i];
                int32_t vol = volumes[(channel++) & 1];
                t = ((t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16));
                samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
            }
            break;
        case 4:
            vol4 = *(int32x4_t *)volumes;
            vol_s16ne_neon(vol4, samples, length);
            
            for (i = length & ~7; i < length; i++) {
                int32_t t = samples[i];
                int32_t vol = volumes[(channel++) & 3];
                t = ((t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16));
                samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
            }
            break;
        default:
            for (; length; length--) {
                int32_t t, hi, lo;

                /* Multiplying the 32bit volume factor with the 16bit
                 * sample might result in an 48bit value. We want to
                 * do without 64 bit integers and hence do the
                 * multiplication independently for the HI and LO part
                 * of the volume. */

                hi = volumes[channel] >> 16;
                lo = volumes[channel] & 0xFFFF;

                t = (int32_t)(*samples);
                t = ((t * lo) >> 16) + (t * hi);
                t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
                *samples++ = (int16_t) t;

                if (PA_UNLIKELY(++channel >= channels))
                    channel = 0;
            }
            break;
    }
}

void pa_volume_float32ne_neon(float *samples, float *volumes, unsigned channels, unsigned length) {
    unsigned channel = 0, i;
    float32x4_t vol4;

    length /= sizeof(float);

    switch (channels) {
        case 1:
            vol4 = vdupq_n_f32(*volumes);
            for (i = 0; i < length/4; i++) {
                ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4);
            }

            for (i = length & ~3; i < length; i++) {
                samples[i] *= volumes[0];
            }
            break;
        case 2:
            vol4 = vcombine_f32(*(float32x2_t *)volumes, *(float32x2_t *)volumes);
            for (i = 0; i < length/4; i++) {
                ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4);
            }

            for (i = length & ~3; i < length; i++) {
                samples[i] *= volumes[channel];

                if (PA_UNLIKELY(++channel >= channels))
                    channel = 0;
            }
            break;
        case 4:
            vol4 = *(float32x4_t *)volumes;
            for (i = 0; i < length/4; i++) {
                ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4);
            }

            for (i = length & ~3; i < length; i++) {
                samples[i] *= volumes[channel++];
            }
            break;
        default:
            for (; length; length--) {
                *samples++ *= volumes[channel];

                if (PA_UNLIKELY(++channel >= channels))
                    channel = 0;
            }
            break;
    }
}

/* Test parameters. */
/* Samples per test buffer; 1019 is not a multiple of 4 or 8, so the scalar
 * tail loops run as well as the vector loops. */
#define SAMPLES 1019
/* Iterations of each timing loop. */
#define TIMES 1000
/* Channel count passed to every implementation under test. */
#define CHANNELS 4
/* Extra entries appended to the s16 volume array in run_test_s16(). */
#define PADDING 16

/*
 * Check and time pa_volume_float32ne_neon() against the C reference.
 *
 * Fills a buffer with random samples and random per-channel volumes, runs
 * both implementations once, and logs every sample whose results differ by
 * more than 0.00001. Each implementation is then run TIMES times on a fresh
 * copy of the input and the elapsed time is logged.
 */
static void run_test_float(void) {
    float floats[SAMPLES];
    float floats_ref[SAMPLES];
    float floats_orig[SAMPLES];
    float volumes[CHANNELS];
    unsigned i;
    pa_usec_t start, stop;

    pa_log_debug("checking NEON volume_float32ne(%d)", SAMPLES);

    for (i = 0; i < SAMPLES; i++) {
        floats_orig[i] = rand()/(float) RAND_MAX - 0.5f;
    }
    memcpy(floats_ref, floats_orig, sizeof(floats_orig));
    memcpy(floats, floats_orig, sizeof(floats_orig));

    for (i = 0; i < CHANNELS; i++)
        volumes[i] = 0.5f * rand() / (float) RAND_MAX;

    pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats));
    pa_volume_float32ne_c(floats_ref, volumes, CHANNELS, sizeof(floats_ref));

    for (i = 0; i < SAMPLES; i++) {
        if (fabsf(floats[i] - floats_ref[i]) > 0.00001) {
            /* i is unsigned, so it is printed with %u. */
            pa_log_debug("%u: %.3f != %.3f (%.3f)", i, floats[i], floats_ref[i],
                      floats_orig[i]);
        }
    }

    /* Timing: the input is restored before every run so each iteration
     * processes the same data. */
    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        memcpy(floats, floats_orig, sizeof(floats_orig));
        pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats));
    }
    stop = pa_rtclock_now();
    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        memcpy(floats_ref, floats_orig, sizeof(floats_orig));
        pa_volume_float32ne_c(floats_ref, volumes, CHANNELS, sizeof(floats_ref));
    }
    stop = pa_rtclock_now();
    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
}

/*
 * Check and time the 16-bit volume implementations.
 *
 * Fills a buffer with random samples and random per-channel volumes, runs
 * pa_volume_s16ne_neon() and the C reference once, and logs every sample
 * whose results differ. The NEON, ARM assembly and C implementations are
 * then each run TIMES times on a fresh copy of the input and the elapsed
 * time is logged.
 *
 * NOTE(review): the ARM assembly variant is only timed here; its output is
 * never compared against the reference.
 */
static void run_test_s16(void) {
    int16_t samples[SAMPLES];
    int16_t samples_ref[SAMPLES];
    int16_t samples_orig[SAMPLES];
    int32_t volumes[CHANNELS + PADDING];
    unsigned i, padding;
    pa_usec_t start, stop;

    pa_log_debug("checking NEON volume_s16ne(%d)", SAMPLES);

    for (i = 0; i < SAMPLES; i++) {
        samples_orig[i] = rand() - RAND_MAX/2;
    }
    memcpy(samples_ref, samples_orig, sizeof(samples_orig));
    memcpy(samples, samples_orig, sizeof(samples_orig));

    for (i = 0; i < CHANNELS; i++)
        volumes[i] = PA_CLAMP_VOLUME(rand() >> 15);
    /* Fill the padding by repeating the channel volumes cyclically; i
     * continues from CHANNELS. pa_volume_s16ne_arm() reads past the first
     * CHANNELS entries. */
    for (padding = 0; padding < PADDING; padding++, i++)
        volumes[i] = volumes[padding];

    pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples));
    pa_volume_s16ne_c(samples_ref, volumes, CHANNELS, sizeof(samples_ref));

    for (i = 0; i < SAMPLES; i++) {
        if (abs(samples[i] - samples_ref[i]) > 0) {
            /* i is unsigned, so it is printed with %u. */
            pa_log_debug("%u: %d != %d (%d)", i, samples[i], samples_ref[i],
                      samples_orig[i]);
        }
    }

    /* Timing: the input is restored before every run so each iteration
     * processes the same data. */
    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        memcpy(samples, samples_orig, sizeof(samples_orig));
        pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples));
    }
    stop = pa_rtclock_now();
    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        memcpy(samples, samples_orig, sizeof(samples_orig));
        pa_volume_s16ne_arm(samples, volumes, CHANNELS, sizeof(samples));
    }
    stop = pa_rtclock_now();
    pa_log_info("ARM: %llu usec.", (long long unsigned int)(stop - start));

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        memcpy(samples_ref, samples_orig, sizeof(samples_orig));
        pa_volume_s16ne_c(samples_ref, volumes, CHANNELS, sizeof(samples_ref));
    }
    stop = pa_rtclock_now();
    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
}

#endif /* defined(__arm__) */

/*
 * Entry point: run the float and 16-bit volume tests.
 *
 * The test routines are only compiled when __arm__ is defined, so the calls
 * are guarded the same way. On other targets the program builds and reports
 * that there is nothing to run.
 */
int main(void) {
#if defined(__arm__)
    run_test_float();
    run_test_s16();
#else
    puts("NEON volume tests are only built on ARM; nothing to run.");
#endif

    return EXIT_SUCCESS;
}

Repositories maintained by Peter Meerwald, pmeerw@pmeerw.net.