Mercurial > hg > pa-neon
diff sconv_neon.c @ 0:e0040ee59c3c
import
author | Peter Meerwald <p.meerwald@bct-electronic.com> |
---|---|
date | Thu, 12 Jan 2012 17:27:46 +0100 |
parents | |
children | b829afbea564 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sconv_neon.c Thu Jan 12 17:27:46 2012 +0100 @@ -0,0 +1,202 @@ +/* + * Copyright 2012 Peter Meerwald <p.meerwald@bct-electronic.com> + */ + +#include <stdlib.h> +#include <stdio.h> +#include <stdarg.h> +#include <string.h> +#include <math.h> +#include <sys/time.h> +#include <assert.h> + +typedef short int16_t; +typedef void (*pa_convert_func_t)(unsigned n, const void *a, void *b); +typedef long long unsigned int pa_usec_t; + +#define pa_assert(x) assert(x) + +#define PA_CLAMP_UNLIKELY(x, low, high) \ + (((x) < (low)) ? (low) : (((x) > (high)) ? (high) : (x))) + +static void pa_log_info(const char *format, ...) { + va_list ap; + char buf[1024]; + va_start(ap, format); + vsprintf(buf, format, ap); + printf("%s\n", buf); + va_end(ap); +} + +#define pa_log_debug pa_log_info + +static pa_usec_t pa_rtclock_now() { + struct timeval tv; + gettimeofday(&tv, NULL); + + return tv.tv_sec * 1000000ULL + tv.tv_usec; +} + +#if defined(__arm__) + +#include "arm_neon.h" + +void pa_sconv_s16le_from_float32ne(unsigned n, const float *a, int16_t *b) { + pa_assert(a); + pa_assert(b); + + for (; n > 0; n--) { + float v = *(a++); + + v = PA_CLAMP_UNLIKELY(v, -1.0f, 1.0f); + *(b++) = (int16_t) lrintf(v * 0x7FFF); + } +} + +void pa_sconv_s16le_from_f32ne_neon(unsigned n, const float *a, int16_t *b) { + unsigned i; + + const float32x4_t plusone4 = vdupq_n_f32(1.0f); + const float32x4_t minusone4 = vdupq_n_f32(-1.0f); + const float32x4_t half4 = vdupq_n_f32(0.5f); + const float32x4_t scale4 = vdupq_n_f32(32767.0f); + const uint32x4_t mask4 = vdupq_n_u32(0x80000000); + + for (i = 0; i < n/4; i++) { + float32x4_t v4 = ((float32x4_t *)a)[i]; + v4 = vmulq_f32(vmaxq_f32(vminq_f32(v4, plusone4) , minusone4), scale4); + + const float32x4_t w4 = vreinterpretq_f32_u32(vorrq_u32(vandq_u32( + vreinterpretq_u32_f32(v4), mask4), vreinterpretq_u32_f32(half4))); + + ((int16x4_t *)b)[i] = vmovn_s32(vcvtq_s32_f32(vaddq_f32(v4, w4))); + } + + // leftovers + for (i = n & ~3; i < n; i++) { + b[i] = (int16_t) lrintf(PA_CLAMP_UNLIKELY(a[i], -1.0f, 1.0f) * 0x7FFF); + } +} + +void pa_sconv_s16le_to_float32ne(unsigned n, const int16_t *a, float *b) { + pa_assert(a); + pa_assert(b); + + for (; n > 0; n--) + *(b++) = ((float) (*(a++)))/(float) 0x7FFF; +} + +void pa_sconv_s16le_to_f32ne_neon(unsigned n, const int16_t *a, float *b) { + unsigned i; + + const float32x4_t invscale4 = vdupq_n_f32(1.0f / 0x7FFF); + + for (i = 0; i < n/4; i++) { + ((float32x4_t *)b)[i] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(((int16x4_t *)a)[i])), invscale4); + } + + // leftovers + const float invscale = 1.0f / 0x7FFF; + for (i = n & ~3; i < n; i++) { + b[i] = a[i] * invscale; + } +} + +#define SAMPLES 1019 +#define TIMES 300 + +static void run_test_from(void) { + int16_t samples[SAMPLES]; + int16_t samples_ref[SAMPLES]; + float floats[SAMPLES]; + int i; + pa_usec_t start, stop; + pa_convert_func_t func; + + pa_log_debug("checking NEON sconv_s16le_from_float(%d)", SAMPLES); + + memset(samples_ref, 0, sizeof(samples_ref)); + memset(samples, 0, sizeof(samples)); + + for (i = 0; i < SAMPLES; i++) { + floats[i] = 2.1f * (rand()/(float) RAND_MAX - 0.5f); + } + + func = (pa_convert_func_t) pa_sconv_s16le_from_float32ne; + func(SAMPLES, floats, samples_ref); + pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats, samples); + + for (i = 0; i < SAMPLES; i++) { + if (abs(samples[i] - samples_ref[i]) > 0) { + pa_log_debug("%d: %d != %d (%f)", i, samples[i], samples_ref[i], + floats[i]); + } + } + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats, samples); + } + stop = pa_rtclock_now(); + pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start)); + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + func(SAMPLES, floats, samples_ref); + } + stop = pa_rtclock_now(); + pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start)); +} + +static void run_test_to(void) { + int16_t samples[SAMPLES]; + float floats[SAMPLES]; + float floats_ref[SAMPLES]; + int i; + pa_usec_t start, stop; + pa_convert_func_t func; + + pa_log_debug("checking NEON sconv_s16le_to_float(%d)", SAMPLES); + + memset(floats_ref, 0, sizeof(floats_ref)); + memset(floats, 0, sizeof(float)); + + for (i = 0; i < SAMPLES; i++) { + samples[i] = rand() - RAND_MAX/2; + } + + func = (pa_convert_func_t) pa_sconv_s16le_to_float32ne; + func(SAMPLES, samples, floats_ref); + pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples, floats); + + for (i = 0; i < SAMPLES; i++) { + if (fabsf(floats[i] - floats_ref[i]) > 0.00001) { + pa_log_debug("%d: %.8f != %.8f (%d)", i, floats[i], floats_ref[i], + samples[i]); + } + } + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples, floats); + } + stop = pa_rtclock_now(); + pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start)); + + start = pa_rtclock_now(); + for (i = 0; i < TIMES; i++) { + func(SAMPLES, samples, floats_ref); + } + stop = pa_rtclock_now(); + pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start)); +} + +#endif /* defined(__arm__) */ + +int main() { + + run_test_from(); + run_test_to(); + + return EXIT_SUCCESS; +}