view remap_neon.c @ 1:b829afbea564

more testing
author Peter Meerwald <p.meerwald@bct-electronic.com>
date Fri, 20 Apr 2012 14:26:14 +0200
parents e0040ee59c3c
children 09ee6a01a3d3
line wrap: on
line source

/*
 * Copyright 2012 Peter Meerwald <p.meerwald@bct-electronic.com>
 */

#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
#include <assert.h>

/* Fixed-width integer types (uint8_t, int16_t, int32_t, uint32_t) come from
 * the standard header instead of hand-written typedefs: local typedefs bake
 * in assumptions about the widths of char/short/int and did not declare
 * int32_t, which the remap code in this file uses. */
#include <stdint.h>

/* Sample formats handled by the remap routines in this file. */
typedef enum pa_sample_format {
    PA_SAMPLE_S16LE,       /* processed as int16_t samples by the remap code */
    PA_SAMPLE_FLOAT32LE,   /* processed as float samples by the remap code */
} pa_sample_format_t;
/* "NE" aliases (presumably "native endian"); this file maps them directly
 * onto the LE variants above. */
#define PA_SAMPLE_S16NE PA_SAMPLE_S16LE
#define PA_SAMPLE_FLOAT32NE PA_SAMPLE_FLOAT32LE

/* Description of one side (input or output) of a remap operation. */
typedef struct pa_sample_spec {
  pa_sample_format_t format;  /* sample format of the stream */
  uint32_t rate;              /* not read by any code visible in this file */
  uint8_t channels;           /* number of interleaved channels per frame */
} pa_sample_spec;

/* Upper bound on channels per stream; sizes the mixing tables below. */
#define PA_CHANNELS_MAX 32
/* State consumed by the remap functions. */
typedef struct {
    /* Points at the sample format; dereferenced by every remap function to
     * choose between the float and the int16_t code path. */
    pa_sample_format_t *format;
    /* Input / output stream descriptions; the generic matrix remapper reads
     * their channel counts. */
    pa_sample_spec *i_ss, *o_ss;
    /* Float mixing gains, indexed [output channel][input channel]. */
    float map_table_f[PA_CHANNELS_MAX][PA_CHANNELS_MAX];
    /* Integer mixing gains, indexed [output channel][input channel]; the
     * matrix remapper shifts products right by 16 and treats values
     * >= 0x10000 as unity gain. */
    int32_t map_table_i[PA_CHANNELS_MAX][PA_CHANNELS_MAX];
} pa_remap_t;

/* Time value in microseconds, as returned by pa_rtclock_now(). */
typedef long long unsigned int pa_usec_t;

/* Assertion helpers mapped onto the standard assert(). */
#define pa_assert(x) assert(x)
/* Marks code paths that must never execute (fails the assertion if reached,
 * unless assertions are compiled out). */
#define pa_assert_not_reached() assert(0)

/* Clamp x to the inclusive range [low, high].
 * NOTE: arguments may be evaluated more than once, so do not pass
 * expressions with side effects. Not used by the code visible in this file. */
#define PA_CLAMP_UNLIKELY(x, low, high) \
    (((x) < (low)) ? (low) : (((x) > (high)) ? (high) : (x)))

/*
 * Print a printf-style formatted message to stdout, followed by a newline.
 *
 * format: printf-style format string
 * ...:    arguments consumed by the format string
 *
 * The message is written directly with vprintf() rather than being staged in
 * a fixed-size stack buffer with vsprintf(), so arbitrarily long messages can
 * no longer overflow the buffer.
 */
static void pa_log_info(const char *format, ...)  {
    va_list ap;

    va_start(ap, format);
    vprintf(format, ap);
    va_end(ap);

    putchar('\n');
}

/* Debug logging is not distinguished from info logging in this test program. */
#define pa_log_debug pa_log_info

/* Return the current time as reported by gettimeofday(), in microseconds. */
static pa_usec_t pa_rtclock_now() {
    struct timeval now;
    pa_usec_t usec;

    gettimeofday(&now, NULL);

    /* whole seconds scaled to microseconds, plus the sub-second part */
    usec = now.tv_sec * 1000000ULL;
    usec += now.tv_usec;

    return usec;
}

/*
 * Generic (matrix based) channel remapper, plain C.
 *
 * m:   remap state; supplies the sample format, the input/output channel
 *      counts and the gain tables (map_table_f for float samples,
 *      map_table_i for 16-bit samples)
 * dst: interleaved output, n frames of o_ss->channels samples; it is zeroed
 *      first and then accumulated into
 * src: interleaved input, n frames of i_ss->channels samples
 * n:   number of frames
 *
 * Every (output channel, input channel) pair with a gain > 0 adds the input
 * channel to the output channel. Float gains >= 1.0 and integer gains
 * >= 0x10000 are treated as unity (plain addition); smaller gains scale the
 * input first. No clamping is applied to the accumulated result.
 */
static void remap_channels_matrix_c(pa_remap_t *m, void *dst, const void *src, unsigned n) {
    const unsigned in_channels = m->i_ss->channels;
    const unsigned out_channels = m->o_ss->channels;
    unsigned out_ch, in_ch, frames;

    if (*m->format == PA_SAMPLE_FLOAT32NE) {
        memset(dst, 0, n * sizeof(float) * out_channels);

        for (out_ch = 0; out_ch < out_channels; out_ch++) {
            for (in_ch = 0; in_ch < in_channels; in_ch++) {
                const float gain = m->map_table_f[out_ch][in_ch];
                float *out, *in;

                /* input channel does not contribute to this output channel */
                if (gain <= 0.0)
                    continue;

                out = (float *)dst + out_ch;
                in = (float *)src + in_ch;

                if (gain >= 1.0) {
                    /* unity gain: skip the multiplication */
                    for (frames = n; frames > 0; frames--) {
                        *out += *in;
                        in += in_channels;
                        out += out_channels;
                    }
                } else {
                    for (frames = n; frames > 0; frames--) {
                        *out += *in * gain;
                        in += in_channels;
                        out += out_channels;
                    }
                }
            }
        }
    } else if (*m->format == PA_SAMPLE_S16NE) {
        memset(dst, 0, n * sizeof(int16_t) * out_channels);

        for (out_ch = 0; out_ch < out_channels; out_ch++) {
            for (in_ch = 0; in_ch < in_channels; in_ch++) {
                const int32_t gain = m->map_table_i[out_ch][in_ch];
                int16_t *out, *in;

                /* input channel does not contribute to this output channel */
                if (gain <= 0)
                    continue;

                out = (int16_t *)dst + out_ch;
                in = (int16_t *)src + in_ch;

                if (gain >= 0x10000) {
                    /* unity gain: skip the multiplication */
                    for (frames = n; frames > 0; frames--) {
                        *out += *in;
                        in += in_channels;
                        out += out_channels;
                    }
                } else {
                    /* gain is scaled by 2^16: multiply, then shift back */
                    for (frames = n; frames > 0; frames--) {
                        *out += (int16_t) (((int32_t)*in * gain) >> 16);
                        in += in_channels;
                        out += out_channels;
                    }
                }
            }
        }
    } else {
        pa_assert_not_reached();
    }
}

/*
 * Reference mono -> stereo remapper, plain C.
 *
 * m:   remap state; only the sample format is consulted
 * dst: interleaved stereo output, 2*n samples
 * src: mono input, n samples
 * n:   number of frames
 *
 * Each input sample is copied to both channels of the matching output frame.
 * Frames are handled four at a time, followed by the 0..3 leftover frames.
 */
static void remap_mono_to_stereo_c(pa_remap_t *m, void *dst, const void *src, unsigned n) {
    unsigned quads = n >> 2;    /* complete groups of four frames */
    unsigned rest = n & 3;      /* frames left after the groups */

    switch (*m->format) {
        case PA_SAMPLE_FLOAT32NE:
        {
            float *out = (float *) dst;
            const float *in = (const float *) src;
            float v;

            for (; quads != 0; quads--) {
                v = *in++; *out++ = v; *out++ = v;
                v = *in++; *out++ = v; *out++ = v;
                v = *in++; *out++ = v; *out++ = v;
                v = *in++; *out++ = v; *out++ = v;
            }
            for (; rest != 0; rest--) {
                v = *in++; *out++ = v; *out++ = v;
            }
            break;
        }
        case PA_SAMPLE_S16NE:
        {
            int16_t *out = (int16_t *) dst;
            const int16_t *in = (const int16_t *) src;
            int16_t v;

            for (; quads != 0; quads--) {
                v = *in++; *out++ = v; *out++ = v;
                v = *in++; *out++ = v; *out++ = v;
                v = *in++; *out++ = v; *out++ = v;
                v = *in++; *out++ = v; *out++ = v;
            }
            for (; rest != 0; rest--) {
                v = *in++; *out++ = v; *out++ = v;
            }
            break;
        }
        default:
            pa_assert_not_reached();
    }
}



/*
 * Reference stereo -> mono remapper, plain C.
 *
 * m:   remap state; only the sample format is consulted
 * dst: mono output, n samples; every sample is overwritten
 * src: interleaved stereo input, 2*n samples
 * n:   number of frames
 *
 * Each output sample is the sum of the two channels of the matching input
 * frame. Frames are handled four at a time, followed by the 0..3 leftover
 * frames.
 *
 * The 16-bit path assigns the sum, exactly like the float path and the NEON
 * implementation. It previously accumulated into dst with "+=", which only
 * produced the right answer when dst had been zeroed beforehand and gave
 * wrong results on every repeated call with the same buffer.
 */
static void remap_stereo_to_mono_c(pa_remap_t *m, void *dst, const void *src, unsigned n) {
    unsigned i;

    switch (*m->format) {
        case PA_SAMPLE_FLOAT32NE:
        {
            float *d = (float *) dst;
            const float *s = (const float *) src;

            for (i = n >> 2; i > 0; i--) {
                d[0] = s[0] + s[1];
                d[1] = s[2] + s[3];
                d[2] = s[4] + s[5];
                d[3] = s[6] + s[7];
                s += 8;
                d += 4;
            }
            for (i = n & 3; i; i--) {
                d[0] = s[0] + s[1];
                s += 2;
                d += 1;
            }
            break;
        }
        case PA_SAMPLE_S16NE:
        {
            int16_t *d = (int16_t *) dst;
            const int16_t *s = (const int16_t *) src;

            /* the sum is computed as int and narrowed back to 16 bits
             * without clamping */
            for (i = n >> 2; i > 0; i--) {
                d[0] = (int16_t) (s[0] + s[1]);
                d[1] = (int16_t) (s[2] + s[3]);
                d[2] = (int16_t) (s[4] + s[5]);
                d[3] = (int16_t) (s[6] + s[7]);
                s += 8;
                d += 4;
            }
            for (i = n & 3; i; i--) {
                d[0] = (int16_t) (s[0] + s[1]);
                s += 2;
                d += 1;
            }
            break;
        }
        default:
            pa_assert_not_reached();
    }
}


#if defined(__arm__)

#include "arm_neon.h"

/*
 * NEON mono -> stereo remapper.
 *
 * m:   remap state; only the sample format is consulted
 * dst: interleaved stereo output, 2*n samples
 * src: mono input, n samples
 * n:   number of frames
 *
 * A vector of input samples is loaded once and stored through an
 * interleaving two-register store, so every sample lands in both channels.
 * Float data is processed 4 frames per iteration, 16-bit data 8 frames per
 * iteration; the remaining frames are copied one at a time.
 */
void remap_mono_to_stereo_neon(pa_remap_t *m, void *dst, const void *src, unsigned n) {
    unsigned vec_iters, tail;

    switch (*m->format) {
        case PA_SAMPLE_FLOAT32NE:
        {
            float *out = (float *) dst;
            const float *in = (const float *) src;

            for (vec_iters = n / 4; vec_iters != 0; vec_iters--) {
                const float32x4_t samples = vld1q_f32(in);
                float32x4x2_t pair;

                pair.val[0] = samples;
                pair.val[1] = samples;
                vst2q_f32(out, pair);

                in += 4;
                out += 8;
            }

            /* scalar copy of the frames that do not fill a whole vector */
            for (tail = n & 3; tail != 0; tail--) {
                const float v = *in++;

                out[0] = v;
                out[1] = v;
                out += 2;
            }
            break;
        }
        case PA_SAMPLE_S16NE:
        {
            int16_t *out = (int16_t *) dst;
            const int16_t *in = (const int16_t *) src;

            for (vec_iters = n / 8; vec_iters != 0; vec_iters--) {
                const int16x8_t samples = vld1q_s16(in);
                int16x8x2_t pair;

                pair.val[0] = samples;
                pair.val[1] = samples;
                vst2q_s16(out, pair);

                in += 8;
                out += 16;
            }

            /* scalar copy of the frames that do not fill a whole vector */
            for (tail = n & 7; tail != 0; tail--) {
                const int16_t v = *in++;

                out[0] = v;
                out[1] = v;
                out += 2;
            }
            break;
        }
        default:
            pa_assert_not_reached();
    }
}

/*
 * NEON stereo -> mono downmix for float samples.
 *
 * d: mono output, n samples
 * s: interleaved stereo input, 2*n samples
 * n: number of frames
 *
 * Kept out of line on purpose (noinline): inlining this function triggers an
 * internal compiler error (ICE), see https://bugs.launchpad.net/bugs/936863
 */
static __attribute__ ((noinline)) void stereo_to_mono_float(float *d, const float *s, unsigned n) {
    unsigned i;

    /* vector part: 4 frames (8 input samples) per iteration */
    for (i = 0; i < n/4; i++) {
        /* de-interleaving load: val[0] holds the first sample of each frame,
         * val[1] the second */
        float32x4x2_t stereo = vld2q_f32(s);
        float32x4_t mono = vaddq_f32(stereo.val[0], stereo.val[1]);
        vst1q_f32(d, mono);
        s += 8;
        d += 4;
    }
    /* scalar tail: the n % 4 frames the vector loop did not cover */
    for (i = n & ~3; i < n; i++) {
        d[0] = s[0] + s[1];
        s += 2;
        d++;
    }
}

/*
 * NEON stereo -> mono downmix for 16-bit integer samples.
 *
 * d: mono output, n samples
 * s: interleaved stereo input, 2*n samples
 * n: number of frames
 *
 * Kept out of line on purpose (noinline): inlining this function triggers an
 * internal compiler error (ICE), see https://bugs.launchpad.net/bugs/936863
 */
static __attribute__ ((noinline)) void stereo_to_mono_int16(int16_t *d, const int16_t *s, unsigned n) {
    unsigned int i;

    /* vector part: 8 frames (16 input samples) per iteration */
    for (i = 0; i < n/8; i++) {
        /* de-interleaving load: val[0] holds the first sample of each frame,
         * val[1] the second */
        int16x8x2_t stereo = vld2q_s16(s);
        /* non-saturating add: sums outside the 16-bit range are not clamped */
        int16x8_t mono = vaddq_s16(stereo.val[0], stereo.val[1]);
        vst1q_s16(d, mono);
        s += 16;
        d += 8;
    }
    /* scalar tail: the n % 8 frames the vector loop did not cover */
    for (i = n & ~7; i < n; i++) {
        d[0] = s[0] + s[1];
        s += 2;
        d++;
    }
}

/*
 * NEON stereo -> mono remapper: dispatches on the sample format to the
 * float or the 16-bit downmix helper.
 *
 * m:   remap state; only the sample format is consulted
 * dst: mono output, n samples
 * src: interleaved stereo input, 2*n samples
 * n:   number of frames
 */
static void remap_stereo_to_mono_neon(pa_remap_t *m, void *dst, const void *src, unsigned n) {
    const pa_sample_format_t fmt = *m->format;

    if (fmt == PA_SAMPLE_FLOAT32NE) {
        stereo_to_mono_float(dst, src, n);
        return;
    }

    if (fmt == PA_SAMPLE_S16NE) {
        stereo_to_mono_int16(dst, src, n);
        return;
    }

    /* unsupported sample format */
    pa_assert_not_reached();
}
/* Frames per test buffer. 1019 is a multiple of neither 4 nor 8, so the
 * scalar tail loops of the remap functions are exercised as well. */
#define SAMPLES 1019
/* Number of calls per implementation in each timing loop. */
#define TIMES 10000

/*
 * Check and benchmark mono -> stereo remapping of float samples.
 *
 * Random mono input is converted by the reference C remapper, the generic
 * matrix remapper and the NEON remapper. Every NEON output sample that
 * differs by more than 0.00001 from the reference or from the generic output
 * is logged. Afterwards each implementation is called TIMES times and the
 * elapsed time is logged.
 */
static void run_test_mono_to_stereo_float(void) {
    float in_mono[SAMPLES];
    float out_neon[2*SAMPLES];
    float out_ref[2*SAMPLES];
    float out_gen[2*SAMPLES];
    pa_sample_format_t fmt;
    pa_sample_spec in_spec, out_spec;
    pa_remap_t remap;
    pa_usec_t t_begin, t_end;
    int k;

    pa_log_debug("checking NEON remap_mono_to_stereo(float, %d)", SAMPLES);

    memset(out_ref, 0, sizeof(out_ref));
    memset(out_neon, 0, sizeof(out_neon));

    /* random input samples in [-0.5, 0.5] */
    k = 0;
    while (k < SAMPLES) {
        in_mono[k] = rand()/(float) RAND_MAX - 0.5f;
        k++;
    }

    /* 1 input channel -> 2 output channels, unity gain on both */
    fmt = PA_SAMPLE_FLOAT32NE;
    in_spec.format = PA_SAMPLE_FLOAT32NE;
    in_spec.channels = 1;
    out_spec.format = PA_SAMPLE_FLOAT32NE;
    out_spec.channels = 2;
    remap.format = &fmt;
    remap.i_ss = &in_spec;
    remap.o_ss = &out_spec;
    remap.map_table_f[0][0] = 1.0;
    remap.map_table_f[1][0] = 1.0;

    remap_mono_to_stereo_c(&remap, out_ref, in_mono, SAMPLES);
    remap_channels_matrix_c(&remap, out_gen, in_mono, SAMPLES);
    remap_mono_to_stereo_neon(&remap, out_neon, in_mono, SAMPLES);

    /* NEON vs. reference */
    for (k = 0; k < 2*SAMPLES; k++) {
        if (fabsf(out_neon[k] - out_ref[k]) > 0.00001) {
            pa_log_debug("%d: %.3f != %.3f (%.3f)", k, out_neon[k], out_ref[k],
                      in_mono[k/2]);
        }
    }
    /* NEON vs. generic matrix remapper */
    for (k = 0; k < 2*SAMPLES; k++) {
        if (fabsf(out_neon[k] - out_gen[k]) > 0.00001) {
            pa_log_debug("%d: %.3f != %.3f (%.3f)", k, out_neon[k], out_gen[k],
                      in_mono[k/2]);
        }
    }

    t_begin = pa_rtclock_now();
    for (k = 0; k < TIMES; k++)
        remap_mono_to_stereo_neon(&remap, out_neon, in_mono, SAMPLES);
    t_end = pa_rtclock_now();
    pa_log_info("NEON: %llu usec.", (long long unsigned int)(t_end - t_begin));

    t_begin = pa_rtclock_now();
    for (k = 0; k < TIMES; k++)
        remap_mono_to_stereo_c(&remap, out_ref, in_mono, SAMPLES);
    t_end = pa_rtclock_now();
    pa_log_info("ref: %llu usec.", (long long unsigned int)(t_end - t_begin));

    t_begin = pa_rtclock_now();
    for (k = 0; k < TIMES; k++)
        remap_channels_matrix_c(&remap, out_gen, in_mono, SAMPLES);
    t_end = pa_rtclock_now();
    pa_log_info("generic: %llu usec.", (long long unsigned int)(t_end - t_begin));
}

/*
 * Check and benchmark stereo -> mono remapping of float samples.
 *
 * Random stereo input is converted by the reference C remapper, the generic
 * matrix remapper and the NEON remapper. Every NEON output sample that
 * differs by more than 0.00001 from the reference or from the generic output
 * is logged (the comparison against the generic output was previously
 * missing, although that output was computed). Afterwards each
 * implementation is called TIMES times and the elapsed time is logged.
 */
static void run_test_stereo_to_mono_float(void) {
    float stereo[2*SAMPLES];
    float mono_ref[SAMPLES];
    float mono_gen[SAMPLES];
    float mono[SAMPLES];
    int i;
    pa_usec_t start, stop;
    pa_sample_format_t sf;
    pa_sample_spec iss, oss;
    pa_remap_t remap;

    pa_log_debug("checking NEON remap_stereo_to_mono(float, %d)", SAMPLES);

    memset(mono_ref, 0, sizeof(mono_ref));
    memset(mono, 0, sizeof(mono));

    /* random input samples in [-0.5, 0.5] */
    for (i = 0; i < 2*SAMPLES; i++) {
        stereo[i] = rand()/(float) RAND_MAX - 0.5f;
    }

    /* 2 input channels -> 1 output channel, unity gain for both inputs */
    sf = PA_SAMPLE_FLOAT32NE;
    remap.format = &sf;
    iss.format = PA_SAMPLE_FLOAT32NE;
    iss.channels = 2;
    oss.format = PA_SAMPLE_FLOAT32NE;
    oss.channels = 1;
    remap.i_ss = &iss;
    remap.o_ss = &oss;
    remap.map_table_f[0][0] = 1.0;
    remap.map_table_f[0][1] = 1.0;

    remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES);
    remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES);
    remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES);

    /* NEON vs. reference */
    for (i = 0; i < SAMPLES; i++) {
        if (fabsf(mono[i] - mono_ref[i]) > 0.00001) {
            pa_log_debug("%d: %.3f != %.3f (%.3f %0.3f)", i, mono[i], mono_ref[i],
                      stereo[2*i+0], stereo[2*i+1]);
        }
    }
    /* NEON vs. generic matrix remapper */
    for (i = 0; i < SAMPLES; i++) {
        if (fabsf(mono[i] - mono_gen[i]) > 0.00001) {
            pa_log_debug("%d: %.3f != %.3f (%.3f %0.3f)", i, mono[i], mono_gen[i],
                      stereo[2*i+0], stereo[2*i+1]);
        }
    }

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES);
    }
    stop = pa_rtclock_now();
    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES);
    }
    stop = pa_rtclock_now();
    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES);
    }
    stop = pa_rtclock_now();
    pa_log_info("generic: %llu usec.", (long long unsigned int)(stop - start));
}

static void run_test_mono_to_stereo_s16(void) {
    int16_t stereo[2*SAMPLES];
    int16_t stereo_ref[2*SAMPLES];
    int16_t stereo_gen[2*SAMPLES];
    int16_t mono[SAMPLES];
    int i;
    pa_usec_t start, stop;
    pa_sample_format_t sf;
    pa_sample_spec iss, oss;
    pa_remap_t remap;

    pa_log_debug("checking NEON remap_mono_to_stereo(s16, %d)", SAMPLES);

    memset(stereo_ref, 0, sizeof(stereo_ref));
    memset(stereo, 0, sizeof(stereo));

    for (i = 0; i < SAMPLES; i++) {
        mono[i] = rand() - RAND_MAX/2;
    }

    sf = PA_SAMPLE_S16NE;
    remap.format = &sf;
    iss.format = PA_SAMPLE_S16NE;
    iss.channels = 1;
    oss.format = PA_SAMPLE_S16NE;
    oss.channels = 2;
    remap.i_ss = &iss;
    remap.o_ss = &oss;
    remap.map_table_f[0][0] = 1.0;
    remap.map_table_f[1][0] = 1.0;
    
    remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
    remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES);
    remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES);
    
    for (i = 0; i < 2*SAMPLES; i++) {
        if (abs(stereo[i] - stereo_ref[i]) > 0) {
            pa_log_debug("%d: %d != %d (%d)", i, stereo[i], stereo_ref[i],
                      mono[i/2]);
        }
    }

    for (i = 0; i < 2*SAMPLES; i++) {
        if (abs(stereo[i] - stereo_gen[i]) > 0) {
            pa_log_debug("%d: %d != %d (%d)", i, stereo[i], stereo_gen[i],
                      mono[i/2]);
        }
    }

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES);
    }
    stop = pa_rtclock_now();
    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
    }
    stop = pa_rtclock_now();
    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        remap_channels_matrix_c(&remap, stereo_gen, mono, SAMPLES);
    }
    stop = pa_rtclock_now();
    pa_log_info("generic: %llu usec.", (long long unsigned int)(stop - start));
}

static void run_test_stereo_to_mono_s16(void) {
    int16_t stereo[2*SAMPLES];
    int16_t mono_ref[SAMPLES];
    int16_t mono_gen[SAMPLES];
    int16_t mono[SAMPLES];
    int i;
    pa_usec_t start, stop;
    pa_sample_format_t sf;
    pa_sample_spec iss, oss;
    pa_remap_t remap;

    pa_log_debug("checking NEON remap_stereo_to_mono(s16, %d)", SAMPLES);

    memset(mono_ref, 0, sizeof(mono_ref));
    memset(mono, 0, sizeof(mono));

    for (i = 0; i < 2*SAMPLES; i++) {
        stereo[i] = rand() - RAND_MAX/2;
    }

    sf = PA_SAMPLE_S16NE;
    remap.format = &sf;
    iss.format = PA_SAMPLE_S16NE;
    iss.channels = 2;
    oss.format = PA_SAMPLE_S16NE;
    oss.channels = 1;
    remap.i_ss = &iss;
    remap.o_ss = &oss;
    remap.map_table_f[0][0] = 1.0;
    remap.map_table_f[0][1] = 1.0;
    
    remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES);
    remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES);
    remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES);

    for (i = 0; i < SAMPLES; i++) {
        if (abs(mono[i] - mono_ref[i]) > 0) {
            pa_log_debug("%d: %d != %d (%d)", i, mono[i], mono_ref[i],
                      stereo[2*i+0], stereo[2*i+1]);
        }
    }
    for (i = 0; i < SAMPLES; i++) {
        if (abs(mono[i] - mono_gen[i]) > 0) {
            pa_log_debug("%d: %d != %d (%d)", i, mono[i], mono_gen[i],
                      stereo[2*i+0], stereo[2*i+1]);
        }
    }

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        remap_stereo_to_mono_neon(&remap, mono, stereo, SAMPLES);
    }
    stop = pa_rtclock_now();
    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        remap_stereo_to_mono_c(&remap, mono_ref, stereo, SAMPLES);
    }
    stop = pa_rtclock_now();
    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        remap_channels_matrix_c(&remap, mono_gen, stereo, SAMPLES);
    }
    stop = pa_rtclock_now();
    pa_log_info("generic: %llu usec.", (long long unsigned int)(stop - start));
}


#endif /* defined(__arm__) */

/*
 * Run all NEON remap checks and benchmarks.
 *
 * The test functions only exist when compiling for ARM, so the calls are
 * guarded the same way; on any other target the program still builds and
 * reports that the tests were skipped.
 *
 * Returns EXIT_SUCCESS.
 */
int main(void) {
#if defined(__arm__)
    run_test_stereo_to_mono_float();
    run_test_stereo_to_mono_s16();

    run_test_mono_to_stereo_float();
    run_test_mono_to_stereo_s16();
#else
    puts("NEON remap tests skipped: not compiled for an ARM target");
#endif

    return EXIT_SUCCESS;
}

Repositories maintained by Peter Meerwald, pmeerw@pmeerw.net.