comparison svolume_neon.c @ 4:1f6289166006

complete
author Peter Meerwald <p.meerwald@bct-electronic.com>
date Sun, 08 Jul 2012 21:03:41 +0200
parents b829afbea564
children 07763f536182
comparison
equal deleted inserted replaced
3:e889fd0e7769 4:1f6289166006
17 PA_SAMPLE_S16LE, 17 PA_SAMPLE_S16LE,
18 PA_SAMPLE_FLOAT32LE, 18 PA_SAMPLE_FLOAT32LE,
19 } pa_sample_format_t; 19 } pa_sample_format_t;
20 #define PA_SAMPLE_S16NE PA_SAMPLE_S16LE 20 #define PA_SAMPLE_S16NE PA_SAMPLE_S16LE
21 #define PA_SAMPLE_FLOAT32NE PA_SAMPLE_FLOAT32LE 21 #define PA_SAMPLE_FLOAT32NE PA_SAMPLE_FLOAT32LE
22 typedef struct {
23 pa_sample_format_t *format;
24 } pa_remap_t;
25 typedef void (*pa_remap_func_t)(pa_remap_t *m, void *dst, const void *src, unsigned n);
26 typedef long long unsigned int pa_usec_t; 22 typedef long long unsigned int pa_usec_t;
27 23
28 #define pa_assert(x) assert(x) 24 #define pa_assert(x) assert(x)
29 #define pa_assert_not_reached() assert(0) 25 #define pa_assert_not_reached() assert(0)
30 26
54 gettimeofday(&tv, NULL); 50 gettimeofday(&tv, NULL);
55 51
56 return tv.tv_sec * 1000000ULL + tv.tv_usec; 52 return tv.tv_sec * 1000000ULL + tv.tv_usec;
57 } 53 }
58 54
59 void pa_volume_s16ne_c(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) { 55 static void pa_volume_s16ne_c(int16_t *samples, const uint32_t *volumes, unsigned channels, unsigned length) {
60 unsigned channel; 56 unsigned channel;
61 57
62 length /= sizeof(int16_t); 58 length /= sizeof(int16_t);
63 59
64 for (channel = 0; length; length--) { 60 for (channel = 0; length; length--) {
81 if (PA_UNLIKELY(++channel >= channels)) 77 if (PA_UNLIKELY(++channel >= channels))
82 channel = 0; 78 channel = 0;
83 } 79 }
84 } 80 }
85 81
86 void pa_volume_float32ne_c(float *samples, float *volumes, unsigned channels, unsigned length) { 82 static void pa_volume_float32ne_c(float *samples, const float *volumes, unsigned channels, unsigned length) {
87 unsigned channel; 83 unsigned channel;
88 84
89 length /= sizeof(float); 85 length /= sizeof(float);
90 86
91 for (channel = 0; length; length--) { 87 for (channel = 0; length; length--) {
93 89
94 if (PA_UNLIKELY(++channel >= channels)) 90 if (PA_UNLIKELY(++channel >= channels))
95 channel = 0; 91 channel = 0;
96 } 92 }
97 } 93 }
98
99 /*
100 void pa_volume_s16ne_orc(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
101 {
102 if (channels == 2) {
103 int64_t v = (int64_t)volumes[1] << 32 | volumes[0];
104 pa_volume_s16ne_orc_2ch (samples, v, ((length / (sizeof(int16_t))) / 2));
105 } else if (channels == 1)
106 pa_volume_s16ne_orc_1ch (samples, volumes[0], length / (sizeof(int16_t)));
107 }
108 */
109 94
110 #if defined(__arm__) 95 #if defined(__arm__)
111 96
112 #include "arm_neon.h" 97 #include "arm_neon.h"
113 98
115 " subs r0, r6, %2 \n\t" \ 100 " subs r0, r6, %2 \n\t" \
116 " itt cs \n\t" \ 101 " itt cs \n\t" \
117 " addcs r0, %1 \n\t" \ 102 " addcs r0, %1 \n\t" \
118 " movcs r6, r0 \n\t" 103 " movcs r6, r0 \n\t"
119 104
120 static void pa_volume_s16ne_arm(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) { 105 static void pa_volume_s16ne_arm(int16_t *samples, const uint32_t *volumes, unsigned channels, unsigned length) {
121 int32_t *ve;
122
123 /* Channels must be at least 4, and always a multiple of the original number. 106 /* Channels must be at least 4, and always a multiple of the original number.
124 * This is also the max amount we overread the volume array, which should 107 * This is also the max amount we overread the volume array, which should
125 * have enough padding. */ 108 * have enough padding. */
126 channels = channels == 3 ? 6 : PA_MAX (4U, channels); 109 channels = channels == 3 ? 6 : PA_MAX(4U, channels);
127 ve = volumes + channels; 110 const uint32_t *ve = volumes + channels;
128 111
129 __asm__ __volatile__ ( 112 __asm__ __volatile__ (
130 " mov r6, %1 \n\t" 113 " mov r6, %1 \n\t"
131 " mov %3, %3, LSR #1 \n\t" /* length /= sizeof (int16_t) */ 114 " mov %3, %3, LSR #1 \n\t" /* length /= sizeof (int16_t) */
132 " tst %3, #1 \n\t" /* check for odd samples */ 115 " tst %3, #1 \n\t" /* check for odd samples */
196 : 179 :
197 : "r6", "r5", "r4", "r3", "r2", "r1", "r0", "cc" 180 : "r6", "r5", "r4", "r3", "r2", "r1", "r0", "cc"
198 ); 181 );
199 } 182 }
200 183
201 static inline void vol_s16ne_neon(int32x4_t vol4, int16_t *samples, unsigned length) { 184 static inline void vol_s16_neon(const uint32x4_t *vol4, int16_t *samples, unsigned length) {
202 unsigned i; 185 asm volatile (
203 int16x4_t hi = vshrn_n_s32(vol4, 16); 186 "mov %[length], %[length], lsr #2\n\t"
204 int32x4_t lo = vandq_s32(vol4, vdupq_n_s32(0xFFFF)); 187 "vld1.s32 {q0}, [%[vol]]\n\t"
205 188 "vshl.u32 q3, q0, #16\n\t" /* lo */
206 for (i = 0; i < length/8; i++) { 189 "vshrn.s32 d1, q0, #16\n\t" /* hi */
207 int16x4_t v1 = ((int16x4_t *) samples)[2*i]; 190 "vshr.u32 q3, q3, #16\n\t"
208 int16x4_t v2 = ((int16x4_t *) samples)[2*i+1]; 191 "1:\n\t"
209 192 "vld1.16 {d0}, [%[samples]]\n\t"
210 int32x4_t t1 = vmull_s16(v1, hi); 193
211 int32x4_t t2 = vmull_s16(v2, hi); 194 "vmull.s16 q1, d0, d1\n\t"
212 195
213 int16x4_t r1 = vqmovn_s32(vsraq_n_s32(t1, vmulq_s32(vmovl_s16(v1), lo), 16)); 196 "vmovl.s16 q2, d0\n\t"
214 int16x4_t r2 = vqmovn_s32(vsraq_n_s32(t2, vmulq_s32(vmovl_s16(v2), lo), 16)); 197 "vmul.s32 q2, q2, q3\n\t"
215 198
216 ((int16x8_t *)samples)[i] = vcombine_s16(r1, r2); 199 "vsra.s32 q1, q2, #16\n\t"
217 } 200 "vmovn.s32 d0, q1\n\t"
218 } 201
219 202 "subs %[length], %[length], #1\n\t"
220 void pa_volume_s16ne_neon(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) { 203 "vst1.16 {d0}, [%[samples]]!\n\t"
204 "bgt 1b\n\t"
205 /* output operands (or input operands that get modified) */
206 : [samples] "+r" (samples), [length] "+r" (length)
207 : [vol] "r" (vol4) /* input operands */
208 : "memory", "cc", "q0", "q1", "q2", "q3" /* clobber list */
209 );
210 }
211
212 static inline void vol_float_neon(const float32x4_t *vol4, float *samples, unsigned length) {
213 asm volatile (
214 "mov %[length], %[length], lsr #2\n\t"
215 "vld1.32 {q1}, [%[vol]]\n\t"
216 "1:\n\t"
217 "vld1.32 {q0}, [%[samples]]\n\t"
218 "vmul.f32 q0, q0, q1\n\t"
219 "subs %[length], %[length], #1\n\t"
220 "vst1.32 {q0}, [%[samples]]!\n\t"
221 "bgt 1b\n\t"
222 /* output operands (or input operands that get modified) */
223 : [samples] "+r" (samples), [length] "+r" (length)
224 : [vol] "r" (vol4) /* input operands */
225 : "memory", "cc", "q0", "q1" /* clobber list */
226 );
227 }
228
229 static void pa_volume_s16ne_neon(int16_t *samples, const uint32_t *volumes, unsigned channels, unsigned length) {
221 unsigned channel = 0, i; 230 unsigned channel = 0, i;
222 int32x4_t vol4; 231 uint32x4_t vol4;
223 232
224 length /= sizeof(int16_t); 233 length /= sizeof(int16_t);
225 234
226 switch (channels) { 235 switch (channels) {
227 case 1: 236 case 1:
228 vol4 = vdupq_n_s32(*volumes); 237 vol4 = vdupq_n_u32(*volumes);
229 vol_s16ne_neon(vol4, samples, length); 238 vol_s16_neon(&vol4, samples, length);
230 239
231 for (i = length & ~7; i < length; i++) { 240 for (i = length & ~3; i < length; i++) {
232 int32_t t = samples[i]; 241 int32_t t = samples[i];
233 t = ((t * (*volumes & 0xFFFF)) >> 16) + (t * (*volumes >> 16)); 242 t = ((int32_t) (t * (*volumes & 0xFFFF)) >> 16) + (t * (*volumes >> 16));
234 samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); 243 samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
235 } 244 }
236 break; 245 break;
237 case 2: 246 case 2:
238 vol4 = vcombine_s32(*(int32x2_t *)volumes, *(int32x2_t *)volumes); 247 vol4 = vcombine_u32(*(uint32x2_t *)volumes, *(uint32x2_t *)volumes);
239 vol_s16ne_neon(vol4, samples, length); 248 vol_s16_neon(&vol4, samples, length);
240 249
241 for (i = length & ~7; i < length; i++) { 250 for (i = length & ~3; i < length; i++) {
242 int32_t t = samples[i]; 251 int32_t t = samples[i];
243 int32_t vol = volumes[(channel++) & 1]; 252 uint32_t vol = volumes[(channel++) & 1];
244 t = ((t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16)); 253 t = ((int32_t) (t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16));
245 samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); 254 samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
246 } 255 }
247 break; 256 break;
248 case 4: 257 case 4:
249 vol4 = *(int32x4_t *)volumes; 258 vol4 = *(uint32x4_t *)volumes;
250 vol_s16ne_neon(vol4, samples, length); 259 vol_s16_neon(&vol4, samples, length);
251 260
252 for (i = length & ~7; i < length; i++) { 261 for (i = length & ~3; i < length; i++) {
253 int32_t t = samples[i]; 262 int32_t t = samples[i];
254 int32_t vol = volumes[(channel++) & 3]; 263 uint32_t vol = volumes[(channel++) & 3];
255 t = ((t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16)); 264 t = ((int32_t) (t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16));
256 samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); 265 samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
257 } 266 }
258 break; 267 break;
259 default: 268 default:
260 for (; length; length--) { 269 for (; length; length--) {
261 int32_t t, hi, lo; 270 int32_t t;
271 uint32_t hi, lo;
262 272
263 /* Multiplying the 32bit volume factor with the 16bit 273 /* Multiplying the 32bit volume factor with the 16bit
264 * sample might result in an 48bit value. We want to 274 * sample might result in an 48bit value. We want to
265 * do without 64 bit integers and hence do the 275 * do without 64 bit integers and hence do the
266 * multiplication independently for the HI and LO part 276 * multiplication independently for the HI and LO part
268 278
269 hi = volumes[channel] >> 16; 279 hi = volumes[channel] >> 16;
270 lo = volumes[channel] & 0xFFFF; 280 lo = volumes[channel] & 0xFFFF;
271 281
272 t = (int32_t)(*samples); 282 t = (int32_t)(*samples);
273 t = ((t * lo) >> 16) + (t * hi); 283 t = ((int32_t) (t * lo) >> 16) + (t * hi);
274 t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); 284 t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
275 *samples++ = (int16_t) t; 285 *samples++ = (int16_t) t;
276 286
277 if (PA_UNLIKELY(++channel >= channels)) 287 if (PA_UNLIKELY(++channel >= channels))
278 channel = 0; 288 channel = 0;
279 } 289 }
280 break; 290 break;
281 } 291 }
282 } 292 }
283 293
284 void pa_volume_float32ne_neon(float *samples, float *volumes, unsigned channels, unsigned length) { 294 static void pa_volume_float32ne_neon(float *samples, const float *volumes, unsigned channels, unsigned length) {
285 unsigned channel = 0, i; 295 unsigned channel = 0, i;
286 float32x4_t vol4; 296 float32x4_t vol4;
287 297
288 length /= sizeof(float); 298 length /= sizeof(float);
289 299
290 switch (channels) { 300 switch (channels) {
291 case 1: 301 case 1:
292 vol4 = vdupq_n_f32(*volumes); 302 vol4 = vdupq_n_f32(*volumes);
293 for (i = 0; i < length/4; i++) { 303 vol_float_neon(&vol4, samples, length);
294 ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4);
295 }
296 304
297 for (i = length & ~3; i < length; i++) { 305 for (i = length & ~3; i < length; i++) {
298 samples[i] *= volumes[0]; 306 samples[i] *= volumes[0];
299 } 307 }
300 break; 308 break;
301 case 2: 309 case 2:
302 vol4 = vcombine_f32(*(float32x2_t *)volumes, *(float32x2_t *)volumes); 310 vol4 = vcombine_f32(*(float32x2_t *)volumes, *(float32x2_t *)volumes);
303 for (i = 0; i < length/4; i++) { 311 vol_float_neon(&vol4, samples, length);
304 ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4);
305 }
306 312
307 for (i = length & ~3; i < length; i++) { 313 for (i = length & ~3; i < length; i++) {
308 samples[i] *= volumes[channel]; 314 samples[i] *= volumes[channel];
309 315
310 if (PA_UNLIKELY(++channel >= channels)) 316 if (PA_UNLIKELY(++channel >= channels))
311 channel = 0; 317 channel = 0;
312 } 318 }
313 break; 319 break;
314 case 4: 320 case 4:
315 vol4 = *(float32x4_t *)volumes; 321 vol4 = *(float32x4_t *)volumes;
316 for (i = 0; i < length/4; i++) { 322 vol_float_neon(&vol4, samples, length);
317 ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4);
318 }
319 323
320 for (i = length & ~3; i < length; i++) { 324 for (i = length & ~3; i < length; i++) {
321 samples[i] *= volumes[channel++]; 325 samples[i] *= volumes[channel++];
322 } 326 }
323 break; 327 break;
331 break; 335 break;
332 } 336 }
333 } 337 }
334 338
335 #define SAMPLES 1019 339 #define SAMPLES 1019
336 #define TIMES 3000 340 #define TIMES 50000
337 #define CHANNELS 4 341 #define CHANNELS 4
338 #define PADDING 16 342 #define PADDING 16
339 343
340 static void run_test_float(void) { 344 static void run_test_float(void) {
341 float floats[SAMPLES]; 345 float floats[SAMPLES];
356 for (i = 0; i < CHANNELS; i++) 360 for (i = 0; i < CHANNELS; i++)
357 volumes[i] = 0.5f * rand() / (float) RAND_MAX; 361 volumes[i] = 0.5f * rand() / (float) RAND_MAX;
358 362
359 pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats)); 363 pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats));
360 pa_volume_float32ne_c(floats_ref, volumes, CHANNELS, sizeof(floats_ref)); 364 pa_volume_float32ne_c(floats_ref, volumes, CHANNELS, sizeof(floats_ref));
361 365
362 for (i = 0; i < SAMPLES; i++) { 366 for (i = 0; i < SAMPLES; i++) {
363 if (fabsf(floats[i] - floats_ref[i]) > 0.00001) { 367 if (fabsf(floats[i] - floats_ref[i]) > 0.00001) {
364 pa_log_debug("%d: %.3f != %.3f (%.3f)", i, floats[i], floats_ref[i], 368 pa_log_debug("%d: %.3f != %.3f (%.3f)", i, floats[i], floats_ref[i],
365 floats_orig[i]); 369 floats_orig[i]);
366 } 370 }
385 389
386 static void run_test_s16(void) { 390 static void run_test_s16(void) {
387 int16_t samples[SAMPLES]; 391 int16_t samples[SAMPLES];
388 int16_t samples_ref[SAMPLES]; 392 int16_t samples_ref[SAMPLES];
389 int16_t samples_orig[SAMPLES]; 393 int16_t samples_orig[SAMPLES];
390 int32_t volumes[CHANNELS + PADDING]; 394 uint32_t volumes[CHANNELS + PADDING];
391 unsigned i, padding; 395 unsigned i, padding;
392 pa_usec_t start, stop; 396 pa_usec_t start, stop;
393 397
394 pa_log_debug("checking NEON volume_s16ne(%d)", SAMPLES); 398 pa_log_debug("checking NEON volume_s16ne(%d)", SAMPLES);
395 399
411 if (abs(samples[i] - samples_ref[i]) > 0) { 415 if (abs(samples[i] - samples_ref[i]) > 0) {
412 pa_log_debug("%d: %d != %d (%d)", i, samples[i], samples_ref[i], 416 pa_log_debug("%d: %d != %d (%d)", i, samples[i], samples_ref[i],
413 samples_orig[i]); 417 samples_orig[i]);
414 } 418 }
415 } 419 }
416 420 exit(0);
417 start = pa_rtclock_now(); 421 start = pa_rtclock_now();
418 for (i = 0; i < TIMES; i++) { 422 for (i = 0; i < TIMES; i++) {
419 memcpy(samples, samples_orig, sizeof(samples_orig)); 423 memcpy(samples, samples_orig, sizeof(samples_orig));
420 pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples)); 424 pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples));
421 } 425 }

Repositories maintained by Peter Meerwald, pmeerw@pmeerw.net.