view sconv_neon.c @ 5:07763f536182 default tip

ALIGNment support
author Peter Meerwald <p.meerwald@bct-electronic.com>
date Sun, 08 Jul 2012 21:48:08 +0200
parents e889fd0e7769
children
line wrap: on
line source

/*
 * Copyright 2012 Peter Meerwald <p.meerwald@bct-electronic.com>
 */

#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
#include <assert.h>

/* Local definition of the 16-bit sample type (assumes short is 16 bits wide
 * on the target -- TODO confirm if this is ever built elsewhere). */
typedef short int16_t;
/* Signature of a generic conversion routine: n items from a to b. */
typedef void (*pa_convert_func_t)(unsigned n, const void *a, void *b);
/* Unsigned 64-bit-or-wider type used for microsecond timestamps. */
typedef long long unsigned int pa_usec_t;

/* Assertion wrapper; maps straight onto the standard assert(). */
#define pa_assert(x) assert(x)

/* Clamp x to [low, high]. NOTE: arguments are evaluated more than once,
 * so do not pass expressions with side effects. */
#define PA_CLAMP_UNLIKELY(x, low, high) \
    (((x) < (low)) ? (low) : (((x) > (high)) ? (high) : (x)))

/*
 * Print a printf-style formatted message to stdout followed by a newline.
 *
 * The message is written straight to the stream with vprintf() rather than
 * being staged in a fixed-size stack buffer, so arbitrarily long messages
 * can neither overflow a buffer nor get truncated.
 *
 * format: printf-style format string
 * ...:    arguments consumed by format
 */
static void pa_log_info(const char *format, ...)  {
    va_list ap;

    va_start(ap, format);
    vprintf(format, ap);
    va_end(ap);

    putchar('\n');
}

/* Debug messages go through the same printer as info messages. */
#define pa_log_debug pa_log_info

/* Return the current gettimeofday() time expressed in microseconds. */
static pa_usec_t pa_rtclock_now() {
    struct timeval now;
    pa_usec_t usec;

    gettimeofday(&now, NULL);

    /* seconds -> microseconds, then add the sub-second part */
    usec = now.tv_sec * 1000000ULL;
    usec += now.tv_usec;

    return usec;
}

#if defined(__arm__)

#include "arm_neon.h"

/*
 * Scalar reference conversion from float samples to signed 16-bit samples.
 *
 * n:   number of samples to convert
 * src: input floats (must not be NULL)
 * dst: output buffer for n int16_t values (must not be NULL)
 *
 * Each sample is limited to [-1.0, 1.0], scaled by 0x7FFF and rounded
 * with lrintf().
 */
void pa_sconv_s16le_from_float32ne(unsigned n, const float *src, int16_t *dst) {
    unsigned k;

    pa_assert(src);
    pa_assert(dst);

    for (k = 0; k < n; k++) {
        float sample = src[k];

        /* limit to the nominal range before scaling */
        if (sample < -1.0f)
            sample = -1.0f;
        else if (sample > 1.0f)
            sample = 1.0f;

        dst[k] = (int16_t) lrintf(sample * 0x7FFF);
    }
}

/*
 * NEON conversion from float samples to signed 16-bit samples.
 *
 * n:   number of samples to convert
 * src: input floats (must not be NULL)
 * dst: output buffer for n int16_t values (must not be NULL)
 *
 * Groups of four samples are handled by the assembly loop; the remaining
 * n & 3 samples are converted with the scalar expression afterwards.
 */
void pa_sconv_s16le_from_f32ne_neon(unsigned n, const float *src, int16_t *dst) {
    unsigned i = n & 3;

    pa_assert(src);
    pa_assert(dst);

    /*
     * The assembly loop checks its counter only after the first pass, so
     * entering it with fewer than four samples would still convert four:
     * reading and writing past the buffers and advancing src/dst before
     * the leftover loop runs. Skip it entirely in that case.
     */
    if (n >= 4) {
        asm volatile (
        "mov        %[n], %[n], lsr #2\n\t" /* n = number of 4-sample groups */
        "vdup.f32   q2, %[plusone]\n\t"     /* q2 = +1.0 in every lane */
        "vneg.f32   q3, q2\n\t"             /* q3 = -1.0 in every lane */
        "vdup.f32   q4, %[scale]\n\t"       /* q4 = 32767.0 */
        "vdup.u32   q5, %[mask]\n\t"        /* q5 = sign-bit mask */
        "vdup.f32   q6, %[half]\n\t"        /* q6 = 0.5 */
        "1:\n\t"
        "vld1.32    {q0}, [%[src]]!\n\t"    /* load four floats, advance src */
        "vmin.f32   q0, q0, q2\n\t" /* clamp */
        "vmax.f32   q0, q0, q3\n\t"
        "vmul.f32   q0, q0, q4\n\t" /* scale */
        "vand.u32   q1, q0, q5\n\t" /* isolate each sample's sign bit */
        "vorr.u32   q1, q1, q6\n\t" /* round: q1 = 0.5 carrying that sign */
        "vadd.f32   q0, q0, q1\n\t"
        "vcvt.s32.f32 q0, q0\n\t" /* narrow */
        "vmovn.i32  d0, q0\n\t"
        "subs       %[n], %[n], #1\n\t"
        "vst1.16    {d0}, [%[dst]]!\n\t"    /* store four int16, advance dst */
        "bgt        1b\n\t"
          /* output operands (or input operands that get modified) */
        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
        : [plusone] "r" (1.0f), [scale] "r" (32767.0f),
          [half] "r" (0.5f), [mask] "r" (0x80000000) /* input operands */
        : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6" /* clobber list */
        );
    }

    /* leftovers: src/dst now point just past the samples handled above */
    while (i--) {
        *dst++ = (int16_t) lrintf(PA_CLAMP_UNLIKELY(*src, -1.0f, 1.0f) * 0x7FFF);
        src++;
    }
}

/*
 * Scalar reference conversion from signed 16-bit samples to floats.
 *
 * n:   number of samples to convert
 * src: input int16_t samples (must not be NULL)
 * dst: output buffer for n floats (must not be NULL)
 *
 * Each sample is divided by 0x7FFF.
 */
void pa_sconv_s16le_to_float32ne(unsigned n, const int16_t *src, float *dst) {
    unsigned k;

    pa_assert(src);
    pa_assert(dst);

    for (k = 0; k < n; k++)
        dst[k] = (float) src[k] / (float) 0x7FFF;
}

/*
 * NEON conversion from signed 16-bit samples to floats.
 *
 * n:   number of samples to convert
 * src: input int16_t samples (must not be NULL)
 * dst: output buffer for n floats (must not be NULL)
 *
 * Groups of four samples are handled by the assembly loop; the remaining
 * n & 3 samples are converted with the scalar expression afterwards.
 * Samples are multiplied by 1/0x7FFF.
 */
void pa_sconv_s16le_to_f32ne_neon(unsigned n, const int16_t *src, float *dst) {
    unsigned i = n & 3;

    const float invscale = 1.0f / 0x7FFF;

    pa_assert(src);
    pa_assert(dst);

    /*
     * The assembly loop checks its counter only after the first pass, so
     * entering it with fewer than four samples would still convert four:
     * reading and writing past the buffers and advancing src/dst before
     * the leftover loop runs. Skip it entirely in that case.
     */
    if (n >= 4) {
        asm volatile (
        "mov        %[n], %[n], lsr #2\n\t" /* n = number of 4-sample groups */
        "vdup.f32   q1, %[invscale]\n\t"    /* q1 = 1/32767 in every lane */
        "1:\n\t"
        "vld1.16    {d0}, [%[src]]!\n\t"    /* load four int16, advance src */
        "vmovl.s16  q0, d0\n\t"             /* sign-extend to 32 bit */

        "vcvt.f32.s32 q0, q0\n\t"           /* integer -> float */
        "vmul.f32   q0, q0, q1\n\t"         /* scale */

        "subs       %[n], %[n], #1\n\t"
        "vst1.32    {q0}, [%[dst]]!\n\t"    /* store four floats, advance dst */
        "bgt        1b\n\t"
          /* output operands (or input operands that get modified) */
        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
        : [invscale] "r" (invscale) /* input operands */
        : "memory", "cc", "q0", "q1" /* clobber list */
        );
    }

    /* leftovers: src/dst now point just past the samples handled above */
    while (i--) {
        *dst++ = *src++ * invscale;
    }
}

/* Samples per conversion call; 1019 is not a multiple of four, so the
 * scalar leftover path of the NEON routines runs as well. */
#define SAMPLES 1019
/* Number of conversion calls in each timed loop. */
#define TIMES 100000
/* Element offset into the test buffers at which conversion starts. */
#define ALIGN 1

/*
 * Check the NEON float -> s16 converter against the scalar reference on
 * random input, log every mismatching sample, then time TIMES calls of
 * each implementation.
 */
static void run_test_from(void) {
    float input[SAMPLES+ALIGN];
    int16_t out_neon[SAMPLES+ALIGN];
    int16_t out_ref[SAMPLES+ALIGN];
    pa_usec_t t0, t1;
    int k;

    pa_log_debug("checking NEON sconv_s16le_from_float(%d)", SAMPLES);

    memset(out_ref, 0, sizeof(out_ref));
    memset(out_neon, 0, sizeof(out_neon));

    /* random values in roughly [-1.05, 1.05], so clamping is exercised */
    k = 0;
    while (k < SAMPLES+ALIGN) {
        input[k] = 2.1f * (rand()/(float) RAND_MAX - 0.5f);
        k++;
    }

    pa_sconv_s16le_from_float32ne(SAMPLES, floats_unused_guard(input)+ALIGN, out_ref+ALIGN);
    pa_sconv_s16le_from_f32ne_neon(SAMPLES, input+ALIGN, out_neon+ALIGN);

    /* report every sample where the two implementations disagree */
    for (k = ALIGN; k < SAMPLES+ALIGN; k++) {
        if (out_neon[k] == out_ref[k])
            continue;

        pa_log_debug("%d: %d != %d (%f)", k, out_neon[k], out_ref[k],
                  input[k]);
    }

    /* time the NEON implementation */
    t0 = pa_rtclock_now();
    k = 0;
    while (k < TIMES) {
        pa_sconv_s16le_from_f32ne_neon(SAMPLES, input+ALIGN, out_neon+ALIGN);
        k++;
    }
    t1 = pa_rtclock_now();
    pa_log_info("NEON: %llu usec.", (long long unsigned int)(t1 - t0));

    /* time the scalar reference */
    t0 = pa_rtclock_now();
    k = 0;
    while (k < TIMES) {
        pa_sconv_s16le_from_float32ne(SAMPLES, input+ALIGN, out_ref+ALIGN);
        k++;
    }
    t1 = pa_rtclock_now();
    pa_log_info("ref: %llu usec.", (long long unsigned int)(t1 - t0));
}

static void run_test_to(void) {
    int16_t samples[SAMPLES+ALIGN];
    float floats[SAMPLES+ALIGN];
    float floats_ref[SAMPLES+ALIGN];
    int i;
    pa_usec_t start, stop;

    pa_log_debug("checking NEON sconv_s16le_to_float(%d)", SAMPLES);

    memset(floats_ref, 0, sizeof(floats_ref));
    memset(floats, 0, sizeof(float));

    for (i = 0; i < SAMPLES+ALIGN; i++) {
        samples[i] = rand() - RAND_MAX/2;
    }

    pa_sconv_s16le_to_float32ne(SAMPLES, samples+ALIGN, floats_ref+ALIGN);
    pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples+ALIGN, floats+ALIGN);

    for (i = ALIGN; i < SAMPLES+ALIGN; i++) {
        if (fabsf(floats[i] - floats_ref[i]) > 0.00001) {
            pa_log_debug("%d: %.8f != %.8f (%d)", i, floats[i], floats_ref[i],
                      samples[i]);
        }
    }

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples+ALIGN, floats+ALIGN);
    }
    stop = pa_rtclock_now();
    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        pa_sconv_s16le_to_float32ne(SAMPLES, samples+ALIGN, floats_ref+ALIGN);
    }
    stop = pa_rtclock_now();
    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
}

#endif /* defined(__arm__) */

/*
 * Run both conversion tests. The test functions exist only when the file
 * is compiled for ARM, so on other targets just say so and exit instead of
 * referencing undefined functions.
 */
int main(void) {
#if defined(__arm__)
    run_test_from();
    run_test_to();
#else
    pa_log_info("NEON tests skipped: not compiled for ARM");
#endif

    return EXIT_SUCCESS;
}

Repositories maintained by Peter Meerwald, pmeerw@pmeerw.net.