# HG changeset patch
# User Peter Meerwald
# Date 1341774221 -7200
# Node ID 1f6289166006786b83c76ae4937aacb58ba7edd4
# Parent e889fd0e7769b6b0bc6aa645dc3417c354ef9724
complete
diff -r e889fd0e7769 -r 1f6289166006 compile.sh
--- a/compile.sh Thu Jul 05 17:31:56 2012 +0200
+++ b/compile.sh Sun Jul 08 21:03:41 2012 +0200
@@ -14,6 +14,8 @@
$CC \
-o svolume_neon svolume_neon.c -lm
+$CC \
+ -S -o uuu.s svolume_neon.c -lm
$CC \
-S -o asdf.s remap_neon.c -lm
diff -r e889fd0e7769 -r 1f6289166006 svolume_neon.c
--- a/svolume_neon.c Thu Jul 05 17:31:56 2012 +0200
+++ b/svolume_neon.c Sun Jul 08 21:03:41 2012 +0200
@@ -19,10 +19,6 @@
} pa_sample_format_t;
#define PA_SAMPLE_S16NE PA_SAMPLE_S16LE
#define PA_SAMPLE_FLOAT32NE PA_SAMPLE_FLOAT32LE
-typedef struct {
- pa_sample_format_t *format;
-} pa_remap_t;
-typedef void (*pa_remap_func_t)(pa_remap_t *m, void *dst, const void *src, unsigned n);
typedef long long unsigned int pa_usec_t;
#define pa_assert(x) assert(x)
@@ -56,7 +52,7 @@
return tv.tv_sec * 1000000ULL + tv.tv_usec;
}
-void pa_volume_s16ne_c(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
+static void pa_volume_s16ne_c(int16_t *samples, const uint32_t *volumes, unsigned channels, unsigned length) {
unsigned channel;
length /= sizeof(int16_t);
@@ -83,7 +79,7 @@
}
}
-void pa_volume_float32ne_c(float *samples, float *volumes, unsigned channels, unsigned length) {
+static void pa_volume_float32ne_c(float *samples, const float *volumes, unsigned channels, unsigned length) {
unsigned channel;
length /= sizeof(float);
@@ -96,17 +92,6 @@
}
}
-/*
-void pa_volume_s16ne_orc(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
-{
- if (channels == 2) {
- int64_t v = (int64_t)volumes[1] << 32 | volumes[0];
- pa_volume_s16ne_orc_2ch (samples, v, ((length / (sizeof(int16_t))) / 2));
- } else if (channels == 1)
- pa_volume_s16ne_orc_1ch (samples, volumes[0], length / (sizeof(int16_t)));
-}
-*/
-
#if defined(__arm__)
#include "arm_neon.h"
@@ -117,14 +102,12 @@
" addcs r0, %1 \n\t" \
" movcs r6, r0 \n\t"
-static void pa_volume_s16ne_arm(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
- int32_t *ve;
-
+static void pa_volume_s16ne_arm(int16_t *samples, const uint32_t *volumes, unsigned channels, unsigned length) {
/* Channels must be at least 4, and always a multiple of the original number.
* This is also the max amount we overread the volume array, which should
* have enough padding. */
- channels = channels == 3 ? 6 : PA_MAX (4U, channels);
- ve = volumes + channels;
+ channels = channels == 3 ? 6 : PA_MAX(4U, channels);
+ const uint32_t *ve = volumes + channels;
__asm__ __volatile__ (
" mov r6, %1 \n\t"
@@ -198,67 +181,94 @@
);
}
-static inline void vol_s16ne_neon(int32x4_t vol4, int16_t *samples, unsigned length) {
- unsigned i;
- int16x4_t hi = vshrn_n_s32(vol4, 16);
- int32x4_t lo = vandq_s32(vol4, vdupq_n_s32(0xFFFF));
-
- for (i = 0; i < length/8; i++) {
- int16x4_t v1 = ((int16x4_t *) samples)[2*i];
- int16x4_t v2 = ((int16x4_t *) samples)[2*i+1];
+static inline void vol_s16_neon(const uint32x4_t *vol4, int16_t *samples, unsigned length) {
+ asm volatile (
+ "mov %[length], %[length], lsr #2\n\t"
+ "vld1.s32 {q0}, [%[vol]]\n\t"
+ "vshl.u32 q3, q0, #16\n\t" /* lo */
+ "vshrn.s32 d1, q0, #16\n\t" /* hi */
+ "vshr.u32 q3, q3, #16\n\t"
+ "1:\n\t"
+ "vld1.16 {d0}, [%[samples]]\n\t"
+
+ "vmull.s16 q1, d0, d1\n\t"
- int32x4_t t1 = vmull_s16(v1, hi);
- int32x4_t t2 = vmull_s16(v2, hi);
+ "vmovl.s16 q2, d0\n\t"
+ "vmul.s32 q2, q2, q3\n\t"
+
+ "vsra.s32 q1, q2, #16\n\t"
+ "vmovn.s32 d0, q1\n\t"
- int16x4_t r1 = vqmovn_s32(vsraq_n_s32(t1, vmulq_s32(vmovl_s16(v1), lo), 16));
- int16x4_t r2 = vqmovn_s32(vsraq_n_s32(t2, vmulq_s32(vmovl_s16(v2), lo), 16));
-
- ((int16x8_t *)samples)[i] = vcombine_s16(r1, r2);
- }
+ "subs %[length], %[length], #1\n\t"
+ "vst1.16 {d0}, [%[samples]]!\n\t"
+ "bgt 1b\n\t"
+ /* output operands (or input operands that get modified) */
+ : [samples] "+r" (samples), [length] "+r" (length)
+ : [vol] "r" (vol4) /* input operands */
+ : "memory", "cc", "q0", "q1", "q2", "q3" /* clobber list */
+ );
}
-void pa_volume_s16ne_neon(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
+static inline void vol_float_neon(const float32x4_t *vol4, float *samples, unsigned length) {
+ asm volatile (
+ "mov %[length], %[length], lsr #2\n\t"
+ "vld1.32 {q1}, [%[vol]]\n\t"
+ "1:\n\t"
+ "vld1.32 {q0}, [%[samples]]\n\t"
+ "vmul.f32 q0, q0, q1\n\t"
+ "subs %[length], %[length], #1\n\t"
+ "vst1.32 {q0}, [%[samples]]!\n\t"
+ "bgt 1b\n\t"
+ /* output operands (or input operands that get modified) */
+ : [samples] "+r" (samples), [length] "+r" (length)
+ : [vol] "r" (vol4) /* input operands */
+ : "memory", "cc", "q0", "q1" /* clobber list */
+ );
+}
+
+static void pa_volume_s16ne_neon(int16_t *samples, const uint32_t *volumes, unsigned channels, unsigned length) {
unsigned channel = 0, i;
- int32x4_t vol4;
+ uint32x4_t vol4;
length /= sizeof(int16_t);
switch (channels) {
case 1:
- vol4 = vdupq_n_s32(*volumes);
- vol_s16ne_neon(vol4, samples, length);
-
- for (i = length & ~7; i < length; i++) {
+ vol4 = vdupq_n_u32(*volumes);
+ vol_s16_neon(&vol4, samples, length);
+
+ for (i = length & ~3; i < length; i++) {
int32_t t = samples[i];
- t = ((t * (*volumes & 0xFFFF)) >> 16) + (t * (*volumes >> 16));
+ t = ((int32_t) (t * (*volumes & 0xFFFF)) >> 16) + (t * (*volumes >> 16));
samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
}
break;
case 2:
- vol4 = vcombine_s32(*(int32x2_t *)volumes, *(int32x2_t *)volumes);
- vol_s16ne_neon(vol4, samples, length);
-
- for (i = length & ~7; i < length; i++) {
+ vol4 = vcombine_u32(*(uint32x2_t *)volumes, *(uint32x2_t *)volumes);
+ vol_s16_neon(&vol4, samples, length);
+
+ for (i = length & ~3; i < length; i++) {
int32_t t = samples[i];
- int32_t vol = volumes[(channel++) & 1];
- t = ((t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16));
+ uint32_t vol = volumes[(channel++) & 1];
+ t = ((int32_t) (t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16));
samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
}
break;
case 4:
- vol4 = *(int32x4_t *)volumes;
- vol_s16ne_neon(vol4, samples, length);
-
- for (i = length & ~7; i < length; i++) {
+ vol4 = *(uint32x4_t *)volumes;
+ vol_s16_neon(&vol4, samples, length);
+
+ for (i = length & ~3; i < length; i++) {
int32_t t = samples[i];
- int32_t vol = volumes[(channel++) & 3];
- t = ((t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16));
+ uint32_t vol = volumes[(channel++) & 3];
+ t = ((int32_t) (t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16));
samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
}
break;
default:
for (; length; length--) {
- int32_t t, hi, lo;
+ int32_t t;
+ uint32_t hi, lo;
/* Multiplying the 32bit volume factor with the 16bit
* sample might result in an 48bit value. We want to
@@ -270,7 +280,7 @@
lo = volumes[channel] & 0xFFFF;
t = (int32_t)(*samples);
- t = ((t * lo) >> 16) + (t * hi);
+ t = ((int32_t) (t * lo) >> 16) + (t * hi);
t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
*samples++ = (int16_t) t;
@@ -281,7 +291,7 @@
}
}
-void pa_volume_float32ne_neon(float *samples, float *volumes, unsigned channels, unsigned length) {
+static void pa_volume_float32ne_neon(float *samples, const float *volumes, unsigned channels, unsigned length) {
unsigned channel = 0, i;
float32x4_t vol4;
@@ -290,9 +300,7 @@
switch (channels) {
case 1:
vol4 = vdupq_n_f32(*volumes);
- for (i = 0; i < length/4; i++) {
- ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4);
- }
+ vol_float_neon(&vol4, samples, length);
for (i = length & ~3; i < length; i++) {
samples[i] *= volumes[0];
@@ -300,9 +308,7 @@
break;
case 2:
vol4 = vcombine_f32(*(float32x2_t *)volumes, *(float32x2_t *)volumes);
- for (i = 0; i < length/4; i++) {
- ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4);
- }
+ vol_float_neon(&vol4, samples, length);
for (i = length & ~3; i < length; i++) {
samples[i] *= volumes[channel];
@@ -313,9 +319,7 @@
break;
case 4:
vol4 = *(float32x4_t *)volumes;
- for (i = 0; i < length/4; i++) {
- ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4);
- }
+ vol_float_neon(&vol4, samples, length);
for (i = length & ~3; i < length; i++) {
samples[i] *= volumes[channel++];
@@ -333,7 +337,7 @@
}
#define SAMPLES 1019
-#define TIMES 3000
+#define TIMES 50000
#define CHANNELS 4
#define PADDING 16
@@ -358,7 +362,7 @@
pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats));
pa_volume_float32ne_c(floats_ref, volumes, CHANNELS, sizeof(floats_ref));
-
+
for (i = 0; i < SAMPLES; i++) {
if (fabsf(floats[i] - floats_ref[i]) > 0.00001) {
pa_log_debug("%d: %.3f != %.3f (%.3f)", i, floats[i], floats_ref[i],
@@ -387,7 +391,7 @@
int16_t samples[SAMPLES];
int16_t samples_ref[SAMPLES];
int16_t samples_orig[SAMPLES];
- int32_t volumes[CHANNELS + PADDING];
+ uint32_t volumes[CHANNELS + PADDING];
unsigned i, padding;
pa_usec_t start, stop;
@@ -413,7 +417,7 @@
samples_orig[i]);
}
}
-
+exit(0);
start = pa_rtclock_now();
for (i = 0; i < TIMES; i++) {
memcpy(samples, samples_orig, sizeof(samples_orig));