Mercurial > hg > pa-neon
view sconv_neon.c @ 3:e889fd0e7769
stuff
author | Peter Meerwald <p.meerwald@bct-electronic.com> |
---|---|
date | Thu, 05 Jul 2012 17:31:56 +0200 |
parents | b829afbea564 |
children | 07763f536182 |
line wrap: on
line source
/*
 * Copyright 2012 Peter Meerwald <p.meerwald@bct-electronic.com>
 *
 * Stand-alone correctness check and micro-benchmark for NEON implementations
 * of the float32 <-> signed 16-bit sample conversions, compared against
 * plain C reference implementations.
 */

#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
#include <assert.h>

typedef void (*pa_convert_func_t)(unsigned n, const void *a, void *b);
typedef long long unsigned int pa_usec_t;

#define pa_assert(x) assert(x)

/* Clamp x to [low, high]. Evaluates its arguments more than once, so only
 * pass side-effect-free expressions. */
#define PA_CLAMP_UNLIKELY(x, low, high) \
    (((x) < (low)) ? (low) : (((x) > (high)) ? (high) : (x)))

/*
 * Print a printf-style formatted message followed by a newline to stdout.
 *
 * Formats straight to the stream, so there is no intermediate buffer whose
 * size a long message could exceed.
 */
#if defined(__GNUC__)
__attribute__((format(printf, 1, 2)))
#endif
static void pa_log_info(const char *format, ...) {
    va_list ap;

    va_start(ap, format);
    vprintf(format, ap);
    va_end(ap);
    putchar('\n');
}

#define pa_log_debug pa_log_info

#if defined(__arm__)

#include "arm_neon.h"

/* Return the current wall-clock time (gettimeofday) in microseconds. */
static pa_usec_t pa_rtclock_now(void) {
    struct timeval tv;

    gettimeofday(&tv, NULL);
    return (pa_usec_t) tv.tv_sec * 1000000ULL + (pa_usec_t) tv.tv_usec;
}

/*
 * Reference conversion: n float samples -> n signed 16-bit samples.
 *
 * Each input is clamped to [-1.0, 1.0], scaled by 0x7FFF and rounded with
 * lrintf() (i.e. according to the current floating-point rounding mode).
 */
void pa_sconv_s16le_from_float32ne(unsigned n, const float *src, int16_t *dst) {
    pa_assert(src);
    pa_assert(dst);

    for (; n > 0; n--) {
        float v = *(src++);

        v = PA_CLAMP_UNLIKELY(v, -1.0f, 1.0f);
        *(dst++) = (int16_t) lrintf(v * 0x7FFF);
    }
}

/*
 * NEON conversion: n float samples -> n signed 16-bit samples.
 *
 * Full groups of four samples are handled by the assembly loop; the
 * remaining n & 3 samples are converted in C exactly like the reference
 * implementation.
 *
 * Rounding in the vector path is done by adding 0.5 carrying the sign of the
 * sample and then converting with vcvt (which truncates toward zero), so
 * exact .5 cases may differ from the lrintf()-based reference.
 */
void pa_sconv_s16le_from_f32ne_neon(unsigned n, const float *src, int16_t *dst) {
    unsigned i = n & 3;

    pa_assert(src);
    pa_assert(dst);

    /* The assembly loop tests its counter only after the first iteration,
     * so it must not be entered unless there is at least one full group of
     * four samples; otherwise it would read and write past the buffers. */
    if (n >= 4) {
        __asm__ __volatile__ (
            "mov %[n], %[n], lsr #2\n\t"        /* n = number of 4-sample groups */
            "vdup.f32 q2, %[plusone]\n\t"       /* q2 = +1.0 (upper clamp bound) */
            "vneg.f32 q3, q2\n\t"               /* q3 = -1.0 (lower clamp bound) */
            "vdup.f32 q4, %[scale]\n\t"         /* q4 = 32767.0 */
            "vdup.u32 q5, %[mask]\n\t"          /* q5 = sign-bit mask */
            "vdup.f32 q6, %[half]\n\t"          /* q6 = 0.5 */
            "1:\n\t"
            "vld1.32 {q0}, [%[src]]!\n\t"       /* load 4 floats, advance src */
            "vmin.f32 q0, q0, q2\n\t"           /* clamp to [-1.0, 1.0] */
            "vmax.f32 q0, q0, q3\n\t"
            "vmul.f32 q0, q0, q4\n\t"           /* scale */
            "vand.u32 q1, q0, q5\n\t"           /* q1 = sign bit of each sample */
            "vorr.u32 q1, q1, q6\n\t"           /* q1 = 0.5 with that sign */
            "vadd.f32 q0, q0, q1\n\t"           /* add signed 0.5 for rounding */
            "vcvt.s32.f32 q0, q0\n\t"           /* float -> s32 */
            "vmovn.i32 d0, q0\n\t"              /* narrow s32 -> s16 */
            "subs %[n], %[n], #1\n\t"
            "vst1.16 {d0}, [%[dst]]!\n\t"       /* store 4 samples, advance dst */
            "bgt 1b\n\t"

            /* output operands (or input operands that get modified) */
            : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
            /* input operands */
            : [plusone] "r" (1.0f), [scale] "r" (32767.0f),
              [half] "r" (0.5f), [mask] "r" (0x80000000)
            /* clobber list */
            : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6"
        );
    }

    /* leftovers: src/dst were advanced past the vectorised part by the asm */
    while (i--) {
        *dst++ = (int16_t) lrintf(PA_CLAMP_UNLIKELY(*src, -1.0f, 1.0f) * 0x7FFF);
        src++;
    }
}

/*
 * Reference conversion: n signed 16-bit samples -> n float samples,
 * each divided by 0x7FFF.
 */
void pa_sconv_s16le_to_float32ne(unsigned n, const int16_t *src, float *dst) {
    pa_assert(src);
    pa_assert(dst);

    for (; n > 0; n--)
        *(dst++) = ((float) (*(src++))) / (float) 0x7FFF;
}

/*
 * NEON conversion: n signed 16-bit samples -> n float samples.
 *
 * Full groups of four samples are handled by the assembly loop; the
 * remaining n & 3 samples are converted in C. Both paths multiply by
 * 1.0f / 0x7FFF rather than dividing as the reference does.
 */
void pa_sconv_s16le_to_f32ne_neon(unsigned n, const int16_t *src, float *dst) {
    unsigned i = n & 3;
    const float invscale = 1.0f / 0x7FFF;

    pa_assert(src);
    pa_assert(dst);

    /* The assembly loop always runs at least once, so skip it when there is
     * no full group of four samples to avoid overrunning the buffers. */
    if (n >= 4) {
        __asm__ __volatile__ (
            "mov %[n], %[n], lsr #2\n\t"        /* n = number of 4-sample groups */
            "vdup.f32 q1, %[invscale]\n\t"      /* q1 = 1 / 32767 */
            "1:\n\t"
            "vld1.16 {d0}, [%[src]]!\n\t"       /* load 4 s16, advance src */
            "vmovl.s16 q0, d0\n\t"              /* widen s16 -> s32 */
            "vcvt.f32.s32 q0, q0\n\t"           /* s32 -> float */
            "vmul.f32 q0, q0, q1\n\t"           /* scale */
            "subs %[n], %[n], #1\n\t"
            "vst1.32 {q0}, [%[dst]]!\n\t"       /* store 4 floats, advance dst */
            "bgt 1b\n\t"

            /* output operands (or input operands that get modified) */
            : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
            /* input operands */
            : [invscale] "r" (invscale)
            /* clobber list */
            : "memory", "cc", "q0", "q1"
        );
    }

    /* leftovers: src/dst were advanced past the vectorised part by the asm */
    while (i--) {
        *dst++ = *src++ * invscale;
    }
}

/* Deliberately not a multiple of four so the leftover path is exercised. */
#define SAMPLES 1019
#define TIMES 100000

/*
 * Compare the NEON float -> s16 conversion against the reference on random
 * input slightly exceeding [-1, 1] (so clamping is exercised), log every
 * differing sample, then time TIMES runs of each implementation.
 */
static void run_test_from(void) {
    int16_t samples[SAMPLES];
    int16_t samples_ref[SAMPLES];
    float floats[SAMPLES];
    int i;
    pa_usec_t start, stop;

    pa_log_debug("checking NEON sconv_s16le_from_float(%d)", SAMPLES);

    memset(samples_ref, 0, sizeof(samples_ref));
    memset(samples, 0, sizeof(samples));

    for (i = 0; i < SAMPLES; i++) {
        floats[i] = 2.1f * (rand() / (float) RAND_MAX - 0.5f);
    }

    pa_sconv_s16le_from_float32ne(SAMPLES, floats, samples_ref);
    pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats, samples);

    for (i = 0; i < SAMPLES; i++) {
        if (abs(samples[i] - samples_ref[i]) > 0) {
            pa_log_debug("%d: %d != %d (%f)", i, samples[i], samples_ref[i], floats[i]);
        }
    }

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats, samples);
    }
    stop = pa_rtclock_now();
    pa_log_info("NEON: %llu usec.", (long long unsigned int) (stop - start));

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        pa_sconv_s16le_from_float32ne(SAMPLES, floats, samples_ref);
    }
    stop = pa_rtclock_now();
    pa_log_info("ref: %llu usec.", (long long unsigned int) (stop - start));
}

/*
 * Compare the NEON s16 -> float conversion against the reference on random
 * input, log every sample differing by more than 0.00001, then time TIMES
 * runs of each implementation.
 */
static void run_test_to(void) {
    int16_t samples[SAMPLES];
    float floats[SAMPLES];
    float floats_ref[SAMPLES];
    int i;
    pa_usec_t start, stop;
    /* The reference is called through a pointer, as before, but the pointer
     * now has the function's real type: calling through an incompatible
     * function pointer type is undefined behaviour. */
    void (*func)(unsigned, const int16_t *, float *) = pa_sconv_s16le_to_float32ne;

    pa_log_debug("checking NEON sconv_s16le_to_float(%d)", SAMPLES);

    memset(floats_ref, 0, sizeof(floats_ref));
    memset(floats, 0, sizeof(floats));   /* whole array, not just one element */

    for (i = 0; i < SAMPLES; i++) {
        /* The value may not fit in 16 bits; the conversion is
         * implementation-defined and the cast only makes it explicit. */
        samples[i] = (int16_t) (rand() - RAND_MAX / 2);
    }

    func(SAMPLES, samples, floats_ref);
    pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples, floats);

    for (i = 0; i < SAMPLES; i++) {
        if (fabsf(floats[i] - floats_ref[i]) > 0.00001) {
            pa_log_debug("%d: %.8f != %.8f (%d)", i, floats[i], floats_ref[i], samples[i]);
        }
    }

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples, floats);
    }
    stop = pa_rtclock_now();
    pa_log_info("NEON: %llu usec.", (long long unsigned int) (stop - start));

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        func(SAMPLES, samples, floats_ref);
    }
    stop = pa_rtclock_now();
    pa_log_info("ref: %llu usec.", (long long unsigned int) (stop - start));
}

#endif /* defined(__arm__) */

/*
 * Run both conversion tests. The tests only exist when building for 32-bit
 * ARM; on any other target the program builds and reports that they were
 * skipped.
 */
int main(void) {
#if defined(__arm__)
    run_test_from();
    run_test_to();
#else
    pa_log_info("NEON tests skipped: not built for 32-bit ARM");
#endif

    return EXIT_SUCCESS;
}