comparison svolume_neon.c @ 0:e0040ee59c3c

import
author Peter Meerwald <p.meerwald@bct-electronic.com>
date Thu, 12 Jan 2012 17:27:46 +0100
parents
children b829afbea564
comparison
equal deleted inserted replaced
-1:000000000000 0:e0040ee59c3c
1 /*
2 * Copyright 2012 Peter Meerwald <p.meerwald@bct-electronic.com>
3 */
4
#include <assert.h>
#include <math.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
12
13
/* Minimal stand-ins for the PulseAudio declarations this standalone
 * benchmark needs. The fixed-width integer types (int16_t, int32_t,
 * uint32_t) and UINT32_MAX come from <stdint.h> rather than being
 * re-declared here, so they cannot disagree with the system headers. */
typedef enum pa_sample_format {
    PA_SAMPLE_S16LE,
    PA_SAMPLE_FLOAT32LE,
} pa_sample_format_t;
#define PA_SAMPLE_S16NE PA_SAMPLE_S16LE
#define PA_SAMPLE_FLOAT32NE PA_SAMPLE_FLOAT32LE
typedef struct {
    pa_sample_format_t *format;
} pa_remap_t;
typedef void (*pa_remap_func_t)(pa_remap_t *m, void *dst, const void *src, unsigned n);
typedef long long unsigned int pa_usec_t;

#define pa_assert(x) assert(x)
#define pa_assert_not_reached() assert(0)

/* Note: evaluates its arguments more than once. */
#define PA_MAX(a, b) ((a) > (b) ? (a) : (b))

typedef uint32_t pa_volume_t;
#define PA_VOLUME_MUTED ((pa_volume_t) 0U)
/* Fully parenthesized so the expansion stays a single operand when it is
 * used next to higher-precedence operators such as % or *. */
#define PA_VOLUME_MAX ((pa_volume_t) (UINT32_MAX / 2))

/* Branch-prediction hint placeholder; a plain pass-through in this file. */
#define PA_UNLIKELY(x) (x)
/* Clamp x into [low, high]. Evaluates its arguments more than once. */
#define PA_CLAMP_UNLIKELY(x, low, high) (PA_UNLIKELY((x) > (high)) ? (high) : (PA_UNLIKELY((x) < (low)) ? (low) : (x)))
#define PA_CLAMP_VOLUME(v) (PA_CLAMP_UNLIKELY((v), PA_VOLUME_MUTED, PA_VOLUME_MAX))
40
/* printf-style logger: formats the message and writes it to stdout followed
 * by a newline.
 *
 * The message is rendered into a fixed 1024-byte buffer with vsnprintf, so
 * output longer than 1023 characters is truncated instead of overrunning
 * the stack buffer. */
static void pa_log_info(const char *format, ...) {
    va_list ap;
    char buf[1024];

    va_start(ap, format);
    vsnprintf(buf, sizeof(buf), format, ap);
    va_end(ap);

    printf("%s\n", buf);
}

/* Debug messages go through the same sink as info messages. */
#define pa_log_debug pa_log_info
51
52 static pa_usec_t pa_rtclock_now() {
53 struct timeval tv;
54 gettimeofday(&tv, NULL);
55
56 return tv.tv_sec * 1000000ULL + tv.tv_usec;
57 }
58
/* Reference C implementation: scales interleaved signed 16-bit samples in
 * place by per-channel 32-bit volume factors.
 *
 * samples:  buffer of interleaved samples, modified in place
 * volumes:  one factor per channel; the upper 16 bits are the integer part
 *           and the lower 16 bits the fractional part of the multiplier
 * channels: number of entries cycled through in volumes
 * length:   size of the sample buffer in bytes */
void pa_volume_s16ne_c(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
    unsigned remaining = length / sizeof(int16_t);
    unsigned ch = 0;

    while (remaining > 0) {
        /* A 32-bit volume times a 16-bit sample can need 48 bits. To stay
         * within 32-bit arithmetic the volume is split into its high and
         * low halves and the two partial products are combined. */
        const int32_t vol_hi = volumes[ch] >> 16;
        const int32_t vol_lo = volumes[ch] & 0xFFFF;
        const int32_t in = (int32_t) *samples;
        int32_t scaled = ((in * vol_lo) >> 16) + (in * vol_hi);

        scaled = PA_CLAMP_UNLIKELY(scaled, -0x8000, 0x7FFF);
        *samples = (int16_t) scaled;
        samples++;

        /* Step to the next channel, wrapping after the last one. */
        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;

        remaining--;
    }
}
85
/* Reference C implementation: multiplies interleaved float samples in place
 * by per-channel volume factors.
 *
 * samples:  buffer of interleaved samples, modified in place
 * volumes:  one multiplier per channel
 * channels: number of entries cycled through in volumes
 * length:   size of the sample buffer in bytes */
void pa_volume_float32ne_c(float *samples, float *volumes, unsigned channels, unsigned length) {
    unsigned remaining = length / sizeof(float);
    unsigned ch = 0;

    while (remaining > 0) {
        *samples = *samples * volumes[ch];
        samples++;

        /* Step to the next channel, wrapping after the last one. */
        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;

        remaining--;
    }
}
98
99 /*
100 void pa_volume_s16ne_orc(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
101 {
102 if (channels == 2) {
103 int64_t v = (int64_t)volumes[1] << 32 | volumes[0];
104 pa_volume_s16ne_orc_2ch (samples, v, ((length / (sizeof(int16_t))) / 2));
105 } else if (channels == 1)
106 pa_volume_s16ne_orc_1ch (samples, volumes[0], length / (sizeof(int16_t)));
107 }
108 */
109
110 #if defined(__arm__)
111
112 #include "arm_neon.h"
113
/* Volume-cursor step shared by the three processing stages below. It
 * computes r0 = r6 - %2 and, only when that subtraction leaves the carry
 * flag set (r6 >= %2, unsigned), rewrites r6 as r0 + %1. With %1 = volumes
 * and %2 = ve this moves the cursor back by (ve - volumes) bytes once it
 * has reached the end of the volume window. Uses r0 as scratch and updates
 * the condition flags. */
#define MOD_INC() \
    " subs r0, r6, %2 \n\t" \
    " itt cs \n\t" \
    " addcs r0, %1 \n\t" \
    " movcs r6, r0 \n\t"

/* ARM assembly implementation of the signed 16-bit volume scaler.
 *
 * samples:  buffer of interleaved samples, modified in place
 * volumes:  per-channel 32-bit factors; read beyond `channels` entries
 *           (see the padding note below)
 * channels: number of channels
 * length:   size of the sample buffer in bytes
 *
 * The sample count is consumed bit by bit: one leftover sample, then one
 * leftover pair, then the remaining groups of four. */
static void pa_volume_s16ne_arm(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
    int32_t *ve;

    /* Channels must be at least 4, and always a multiple of the original number.
     * This is also the max amount we overread the volume array, which should
     * have enough padding. */
    channels = channels == 3 ? 6 : PA_MAX (4U, channels);
    ve = volumes + channels;

    __asm__ __volatile__ (
        " mov r6, %1 \n\t"                      /* r6 = running volume cursor */
        " mov %3, %3, LSR #1 \n\t" /* length /= sizeof (int16_t) */
        " tst %3, #1 \n\t" /* check for odd samples */
        " beq 2f \n\t"

        "1: \n\t"
        " ldr r0, [r6], #4 \n\t" /* odd samples volumes */
        " ldrh r2, [%0] \n\t"

        " smulwb r0, r0, r2 \n\t"
        " ssat r0, #16, r0 \n\t"

        " strh r0, [%0], #2 \n\t"

        MOD_INC()

        "2: \n\t"
        " mov %3, %3, LSR #1 \n\t"
        " tst %3, #1 \n\t" /* check for odd samples */
        " beq 4f \n\t"

        "3: \n\t"
        " ldrd r2, [r6], #8 \n\t" /* 2 samples at a time */
        " ldr r0, [%0] \n\t"

        " smulwt r2, r2, r0 \n\t"
        " smulwb r3, r3, r0 \n\t"

        " ssat r2, #16, r2 \n\t"
        " ssat r3, #16, r3 \n\t"

        " pkhbt r0, r3, r2, LSL #16 \n\t"
        " str r0, [%0], #4 \n\t"

        MOD_INC()

        "4: \n\t"
        " movs %3, %3, LSR #1 \n\t"
        " beq 6f \n\t"

        "5: \n\t"
        " ldrd r2, [r6], #8 \n\t" /* 4 samples at a time */
        " ldrd r4, [r6], #8 \n\t"
        " ldrd r0, [%0] \n\t"

        " smulwt r2, r2, r0 \n\t"
        " smulwb r3, r3, r0 \n\t"
        " smulwt r4, r4, r1 \n\t"
        " smulwb r5, r5, r1 \n\t"

        " ssat r2, #16, r2 \n\t"
        " ssat r3, #16, r3 \n\t"
        " ssat r4, #16, r4 \n\t"
        " ssat r5, #16, r5 \n\t"

        " pkhbt r0, r3, r2, LSL #16 \n\t"
        " pkhbt r1, r5, r4, LSL #16 \n\t"
        " strd r0, [%0], #8 \n\t"

        MOD_INC()

        " subs %3, %3, #1 \n\t"
        " bne 5b \n\t"
        "6: \n\t"

        : "+r" (samples), "+r" (volumes), "+r" (ve), "+r" (length)
        :
        /* "memory": the code loads and stores the sample buffer and loads
         * the volume array through pointers passed in registers, which the
         * compiler cannot see; without this clobber it may reorder or drop
         * surrounding accesses to those buffers. */
        : "r6", "r5", "r4", "r3", "r2", "r1", "r0", "cc", "memory"
    );
}
200
201 static inline void vol_s16ne_neon(int32x4_t vol4, int16_t *samples, unsigned length) {
202 unsigned i;
203 int16x4_t hi = vshrn_n_s32(vol4, 16);
204 int32x4_t lo = vandq_s32(vol4, vdupq_n_s32(0xFFFF));
205
206 for (i = 0; i < length/8; i++) {
207 int16x4_t v1 = ((int16x4_t *) samples)[2*i];
208 int16x4_t v2 = ((int16x4_t *) samples)[2*i+1];
209
210 int32x4_t t1 = vmull_s16(v1, hi);
211 int32x4_t t2 = vmull_s16(v2, hi);
212
213 int16x4_t r1 = vqmovn_s32(vsraq_n_s32(t1, vmulq_s32(vmovl_s16(v1), lo), 16));
214 int16x4_t r2 = vqmovn_s32(vsraq_n_s32(t2, vmulq_s32(vmovl_s16(v2), lo), 16));
215
216 ((int16x8_t *)samples)[i] = vcombine_s16(r1, r2);
217 }
218 }
219
220 void pa_volume_s16ne_neon(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
221 unsigned channel = 0, i;
222 int32x4_t vol4;
223
224 length /= sizeof(int16_t);
225
226 switch (channels) {
227 case 1:
228 vol4 = vdupq_n_s32(*volumes);
229 vol_s16ne_neon(vol4, samples, length);
230
231 for (i = length & ~7; i < length; i++) {
232 int32_t t = samples[i];
233 t = ((t * (*volumes & 0xFFFF)) >> 16) + (t * (*volumes >> 16));
234 samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
235 }
236 break;
237 case 2:
238 vol4 = vcombine_s32(*(int32x2_t *)volumes, *(int32x2_t *)volumes);
239 vol_s16ne_neon(vol4, samples, length);
240
241 for (i = length & ~7; i < length; i++) {
242 int32_t t = samples[i];
243 int32_t vol = volumes[(channel++) & 1];
244 t = ((t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16));
245 samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
246 }
247 break;
248 case 4:
249 vol4 = *(int32x4_t *)volumes;
250 vol_s16ne_neon(vol4, samples, length);
251
252 for (i = length & ~7; i < length; i++) {
253 int32_t t = samples[i];
254 int32_t vol = volumes[(channel++) & 3];
255 t = ((t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16));
256 samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
257 }
258 break;
259 default:
260 for (; length; length--) {
261 int32_t t, hi, lo;
262
263 /* Multiplying the 32bit volume factor with the 16bit
264 * sample might result in an 48bit value. We want to
265 * do without 64 bit integers and hence do the
266 * multiplication independently for the HI and LO part
267 * of the volume. */
268
269 hi = volumes[channel] >> 16;
270 lo = volumes[channel] & 0xFFFF;
271
272 t = (int32_t)(*samples);
273 t = ((t * lo) >> 16) + (t * hi);
274 t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
275 *samples++ = (int16_t) t;
276
277 if (PA_UNLIKELY(++channel >= channels))
278 channel = 0;
279 }
280 break;
281 }
282 }
283
284 void pa_volume_float32ne_neon(float *samples, float *volumes, unsigned channels, unsigned length) {
285 unsigned channel = 0, i;
286 float32x4_t vol4;
287
288 length /= sizeof(float);
289
290 switch (channels) {
291 case 1:
292 vol4 = vdupq_n_f32(*volumes);
293 for (i = 0; i < length/4; i++) {
294 ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4);
295 }
296
297 for (i = length & ~3; i < length; i++) {
298 samples[i] *= volumes[0];
299 }
300 break;
301 case 2:
302 vol4 = vcombine_f32(*(float32x2_t *)volumes, *(float32x2_t *)volumes);
303 for (i = 0; i < length/4; i++) {
304 ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4);
305 }
306
307 for (i = length & ~3; i < length; i++) {
308 samples[i] *= volumes[channel];
309
310 if (PA_UNLIKELY(++channel >= channels))
311 channel = 0;
312 }
313 break;
314 case 4:
315 vol4 = *(float32x4_t *)volumes;
316 for (i = 0; i < length/4; i++) {
317 ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4);
318 }
319
320 for (i = length & ~3; i < length; i++) {
321 samples[i] *= volumes[channel++];
322 }
323 break;
324 default:
325 for (; length; length--) {
326 *samples++ *= volumes[channel];
327
328 if (PA_UNLIKELY(++channel >= channels))
329 channel = 0;
330 }
331 break;
332 }
333 }
334
335 #define SAMPLES 1019
336 #define TIMES 1000
337 #define CHANNELS 4
338 #define PADDING 16
339
340 static void run_test_float(void) {
341 float floats[SAMPLES];
342 float floats_ref[SAMPLES];
343 float floats_orig[SAMPLES];
344 float volumes[CHANNELS];
345 unsigned i;
346 pa_usec_t start, stop;
347
348 pa_log_debug("checking NEON volume_float32ne(%d)", SAMPLES);
349
350 for (i = 0; i < SAMPLES; i++) {
351 floats_orig[i] = rand()/(float) RAND_MAX - 0.5f;
352 }
353 memcpy(floats_ref, floats_orig, sizeof(floats_orig));
354 memcpy(floats, floats_orig, sizeof(floats_orig));
355
356 for (i = 0; i < CHANNELS; i++)
357 volumes[i] = 0.5f * rand() / (float) RAND_MAX;
358
359 pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats));
360 pa_volume_float32ne_c(floats_ref, volumes, CHANNELS, sizeof(floats_ref));
361
362 for (i = 0; i < SAMPLES; i++) {
363 if (fabsf(floats[i] - floats_ref[i]) > 0.00001) {
364 pa_log_debug("%d: %.3f != %.3f (%.3f)", i, floats[i], floats_ref[i],
365 floats_orig[i]);
366 }
367 }
368
369 start = pa_rtclock_now();
370 for (i = 0; i < TIMES; i++) {
371 memcpy(floats, floats_orig, sizeof(floats_orig));
372 pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats));
373 }
374 stop = pa_rtclock_now();
375 pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
376
377 start = pa_rtclock_now();
378 for (i = 0; i < TIMES; i++) {
379 memcpy(floats_ref, floats_orig, sizeof(floats_orig));
380 pa_volume_float32ne_c(floats_ref, volumes, CHANNELS, sizeof(floats_ref));
381 }
382 stop = pa_rtclock_now();
383 pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
384 }
385
386 static void run_test_s16(void) {
387 int16_t samples[SAMPLES];
388 int16_t samples_ref[SAMPLES];
389 int16_t samples_orig[SAMPLES];
390 int32_t volumes[CHANNELS + PADDING];
391 unsigned i, padding;
392 pa_usec_t start, stop;
393
394 pa_log_debug("checking NEON volume_s16ne(%d)", SAMPLES);
395
396 for (i = 0; i < SAMPLES; i++) {
397 samples_orig[i] = rand() - RAND_MAX/2;
398 }
399 memcpy(samples_ref, samples_orig, sizeof(samples_orig));
400 memcpy(samples, samples_orig, sizeof(samples_orig));
401
402 for (i = 0; i < CHANNELS; i++)
403 volumes[i] = PA_CLAMP_VOLUME(rand() >> 15);
404 for (padding = 0; padding < PADDING; padding++, i++)
405 volumes[i] = volumes[padding];
406
407 pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples));
408 pa_volume_s16ne_c(samples_ref, volumes, CHANNELS, sizeof(samples_ref));
409
410 for (i = 0; i < SAMPLES; i++) {
411 if (abs(samples[i] - samples_ref[i]) > 0) {
412 pa_log_debug("%d: %d != %d (%d)", i, samples[i], samples_ref[i],
413 samples_orig[i]);
414 }
415 }
416
417 start = pa_rtclock_now();
418 for (i = 0; i < TIMES; i++) {
419 memcpy(samples, samples_orig, sizeof(samples_orig));
420 pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples));
421 }
422 stop = pa_rtclock_now();
423 pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
424
425 start = pa_rtclock_now();
426 for (i = 0; i < TIMES; i++) {
427 memcpy(samples, samples_orig, sizeof(samples_orig));
428 pa_volume_s16ne_arm(samples, volumes, CHANNELS, sizeof(samples));
429 }
430 stop = pa_rtclock_now();
431 pa_log_info("ARM: %llu usec.", (long long unsigned int)(stop - start));
432
433 start = pa_rtclock_now();
434 for (i = 0; i < TIMES; i++) {
435 memcpy(samples_ref, samples_orig, sizeof(samples_orig));
436 pa_volume_s16ne_c(samples_ref, volumes, CHANNELS, sizeof(samples_ref));
437 }
438 stop = pa_rtclock_now();
439 pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
440 }
441
442 #endif /* defined(__arm__) */
443
/* Entry point: runs the float and s16 comparison/benchmark routines.
 * Those routines are only compiled when __arm__ is defined, so the calls
 * are guarded the same way to keep other builds linkable. */
int main(void) {
#if defined(__arm__)
    run_test_float();
    run_test_s16();
#else
    pa_log_info("NEON volume tests skipped: not built for ARM");
#endif

    return EXIT_SUCCESS;
}

Repositories maintained by Peter Meerwald, pmeerw@pmeerw.net.