Mercurial > hg > pa-neon
comparison svolume_neon.c @ 4:1f6289166006
complete
author | Peter Meerwald <p.meerwald@bct-electronic.com> |
---|---|
date | Sun, 08 Jul 2012 21:03:41 +0200 |
parents | b829afbea564 |
children | 07763f536182 |
comparison
equal
deleted
inserted
replaced
3:e889fd0e7769 | 4:1f6289166006 |
---|---|
17 PA_SAMPLE_S16LE, | 17 PA_SAMPLE_S16LE, |
18 PA_SAMPLE_FLOAT32LE, | 18 PA_SAMPLE_FLOAT32LE, |
19 } pa_sample_format_t; | 19 } pa_sample_format_t; |
20 #define PA_SAMPLE_S16NE PA_SAMPLE_S16LE | 20 #define PA_SAMPLE_S16NE PA_SAMPLE_S16LE |
21 #define PA_SAMPLE_FLOAT32NE PA_SAMPLE_FLOAT32LE | 21 #define PA_SAMPLE_FLOAT32NE PA_SAMPLE_FLOAT32LE |
22 typedef struct { | |
23 pa_sample_format_t *format; | |
24 } pa_remap_t; | |
25 typedef void (*pa_remap_func_t)(pa_remap_t *m, void *dst, const void *src, unsigned n); | |
26 typedef long long unsigned int pa_usec_t; | 22 typedef long long unsigned int pa_usec_t; |
27 | 23 |
28 #define pa_assert(x) assert(x) | 24 #define pa_assert(x) assert(x) |
29 #define pa_assert_not_reached() assert(0) | 25 #define pa_assert_not_reached() assert(0) |
30 | 26 |
54 gettimeofday(&tv, NULL); | 50 gettimeofday(&tv, NULL); |
55 | 51 |
56 return tv.tv_sec * 1000000ULL + tv.tv_usec; | 52 return tv.tv_sec * 1000000ULL + tv.tv_usec; |
57 } | 53 } |
58 | 54 |
59 void pa_volume_s16ne_c(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) { | 55 static void pa_volume_s16ne_c(int16_t *samples, const uint32_t *volumes, unsigned channels, unsigned length) { |
60 unsigned channel; | 56 unsigned channel; |
61 | 57 |
62 length /= sizeof(int16_t); | 58 length /= sizeof(int16_t); |
63 | 59 |
64 for (channel = 0; length; length--) { | 60 for (channel = 0; length; length--) { |
81 if (PA_UNLIKELY(++channel >= channels)) | 77 if (PA_UNLIKELY(++channel >= channels)) |
82 channel = 0; | 78 channel = 0; |
83 } | 79 } |
84 } | 80 } |
85 | 81 |
86 void pa_volume_float32ne_c(float *samples, float *volumes, unsigned channels, unsigned length) { | 82 static void pa_volume_float32ne_c(float *samples, const float *volumes, unsigned channels, unsigned length) { |
87 unsigned channel; | 83 unsigned channel; |
88 | 84 |
89 length /= sizeof(float); | 85 length /= sizeof(float); |
90 | 86 |
91 for (channel = 0; length; length--) { | 87 for (channel = 0; length; length--) { |
93 | 89 |
94 if (PA_UNLIKELY(++channel >= channels)) | 90 if (PA_UNLIKELY(++channel >= channels)) |
95 channel = 0; | 91 channel = 0; |
96 } | 92 } |
97 } | 93 } |
98 | |
99 /* | |
100 void pa_volume_s16ne_orc(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) | |
101 { | |
102 if (channels == 2) { | |
103 int64_t v = (int64_t)volumes[1] << 32 | volumes[0]; | |
104 pa_volume_s16ne_orc_2ch (samples, v, ((length / (sizeof(int16_t))) / 2)); | |
105 } else if (channels == 1) | |
106 pa_volume_s16ne_orc_1ch (samples, volumes[0], length / (sizeof(int16_t))); | |
107 } | |
108 */ | |
109 | 94 |
110 #if defined(__arm__) | 95 #if defined(__arm__) |
111 | 96 |
112 #include "arm_neon.h" | 97 #include "arm_neon.h" |
113 | 98 |
115 " subs r0, r6, %2 \n\t" \ | 100 " subs r0, r6, %2 \n\t" \ |
116 " itt cs \n\t" \ | 101 " itt cs \n\t" \ |
117 " addcs r0, %1 \n\t" \ | 102 " addcs r0, %1 \n\t" \ |
118 " movcs r6, r0 \n\t" | 103 " movcs r6, r0 \n\t" |
119 | 104 |
120 static void pa_volume_s16ne_arm(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) { | 105 static void pa_volume_s16ne_arm(int16_t *samples, const uint32_t *volumes, unsigned channels, unsigned length) { |
121 int32_t *ve; | |
122 | |
123 /* Channels must be at least 4, and always a multiple of the original number. | 106 /* Channels must be at least 4, and always a multiple of the original number. |
124 * This is also the max amount we overread the volume array, which should | 107 * This is also the max amount we overread the volume array, which should |
125 * have enough padding. */ | 108 * have enough padding. */ |
126 channels = channels == 3 ? 6 : PA_MAX (4U, channels); | 109 channels = channels == 3 ? 6 : PA_MAX(4U, channels); |
127 ve = volumes + channels; | 110 const uint32_t *ve = volumes + channels; |
128 | 111 |
129 __asm__ __volatile__ ( | 112 __asm__ __volatile__ ( |
130 " mov r6, %1 \n\t" | 113 " mov r6, %1 \n\t" |
131 " mov %3, %3, LSR #1 \n\t" /* length /= sizeof (int16_t) */ | 114 " mov %3, %3, LSR #1 \n\t" /* length /= sizeof (int16_t) */ |
132 " tst %3, #1 \n\t" /* check for odd samples */ | 115 " tst %3, #1 \n\t" /* check for odd samples */ |
196 : | 179 : |
197 : "r6", "r5", "r4", "r3", "r2", "r1", "r0", "cc" | 180 : "r6", "r5", "r4", "r3", "r2", "r1", "r0", "cc" |
198 ); | 181 ); |
199 } | 182 } |
200 | 183 |
201 static inline void vol_s16ne_neon(int32x4_t vol4, int16_t *samples, unsigned length) { | 184 static inline void vol_s16_neon(const uint32x4_t *vol4, int16_t *samples, unsigned length) { |
202 unsigned i; | 185 asm volatile ( |
203 int16x4_t hi = vshrn_n_s32(vol4, 16); | 186 "mov %[length], %[length], lsr #2\n\t" |
204 int32x4_t lo = vandq_s32(vol4, vdupq_n_s32(0xFFFF)); | 187 "vld1.s32 {q0}, [%[vol]]\n\t" |
205 | 188 "vshl.u32 q3, q0, #16\n\t" /* lo */ |
206 for (i = 0; i < length/8; i++) { | 189 "vshrn.s32 d1, q0, #16\n\t" /* hi */ |
207 int16x4_t v1 = ((int16x4_t *) samples)[2*i]; | 190 "vshr.u32 q3, q3, #16\n\t" |
208 int16x4_t v2 = ((int16x4_t *) samples)[2*i+1]; | 191 "1:\n\t" |
209 | 192 "vld1.16 {d0}, [%[samples]]\n\t" |
210 int32x4_t t1 = vmull_s16(v1, hi); | 193 |
211 int32x4_t t2 = vmull_s16(v2, hi); | 194 "vmull.s16 q1, d0, d1\n\t" |
212 | 195 |
213 int16x4_t r1 = vqmovn_s32(vsraq_n_s32(t1, vmulq_s32(vmovl_s16(v1), lo), 16)); | 196 "vmovl.s16 q2, d0\n\t" |
214 int16x4_t r2 = vqmovn_s32(vsraq_n_s32(t2, vmulq_s32(vmovl_s16(v2), lo), 16)); | 197 "vmul.s32 q2, q2, q3\n\t" |
215 | 198 |
216 ((int16x8_t *)samples)[i] = vcombine_s16(r1, r2); | 199 "vsra.s32 q1, q2, #16\n\t" |
217 } | 200 "vmovn.s32 d0, q1\n\t" |
218 } | 201 |
219 | 202 "subs %[length], %[length], #1\n\t" |
220 void pa_volume_s16ne_neon(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) { | 203 "vst1.16 {d0}, [%[samples]]!\n\t" |
204 "bgt 1b\n\t" | |
205 /* output operands (or input operands that get modified) */ | |
206 : [samples] "+r" (samples), [length] "+r" (length) | |
207 : [vol] "r" (vol4) /* input operands */ | |
208 : "memory", "cc", "q0", "q1", "q2", "q3" /* clobber list */ | |
209 ); | |
210 } | |
211 | |
212 static inline void vol_float_neon(const float32x4_t *vol4, float *samples, unsigned length) { | |
213 asm volatile ( | |
214 "mov %[length], %[length], lsr #2\n\t" | |
215 "vld1.32 {q1}, [%[vol]]\n\t" | |
216 "1:\n\t" | |
217 "vld1.32 {q0}, [%[samples]]\n\t" | |
218 "vmul.f32 q0, q0, q1\n\t" | |
219 "subs %[length], %[length], #1\n\t" | |
220 "vst1.32 {q0}, [%[samples]]!\n\t" | |
221 "bgt 1b\n\t" | |
222 /* output operands (or input operands that get modified) */ | |
223 : [samples] "+r" (samples), [length] "+r" (length) | |
224 : [vol] "r" (vol4) /* input operands */ | |
225 : "memory", "cc", "q0", "q1" /* clobber list */ | |
226 ); | |
227 } | |
228 | |
229 static void pa_volume_s16ne_neon(int16_t *samples, const uint32_t *volumes, unsigned channels, unsigned length) { | |
221 unsigned channel = 0, i; | 230 unsigned channel = 0, i; |
222 int32x4_t vol4; | 231 uint32x4_t vol4; |
223 | 232 |
224 length /= sizeof(int16_t); | 233 length /= sizeof(int16_t); |
225 | 234 |
226 switch (channels) { | 235 switch (channels) { |
227 case 1: | 236 case 1: |
228 vol4 = vdupq_n_s32(*volumes); | 237 vol4 = vdupq_n_u32(*volumes); |
229 vol_s16ne_neon(vol4, samples, length); | 238 vol_s16_neon(&vol4, samples, length); |
230 | 239 |
231 for (i = length & ~7; i < length; i++) { | 240 for (i = length & ~3; i < length; i++) { |
232 int32_t t = samples[i]; | 241 int32_t t = samples[i]; |
233 t = ((t * (*volumes & 0xFFFF)) >> 16) + (t * (*volumes >> 16)); | 242 t = ((int32_t) (t * (*volumes & 0xFFFF)) >> 16) + (t * (*volumes >> 16)); |
234 samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); | 243 samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); |
235 } | 244 } |
236 break; | 245 break; |
237 case 2: | 246 case 2: |
238 vol4 = vcombine_s32(*(int32x2_t *)volumes, *(int32x2_t *)volumes); | 247 vol4 = vcombine_u32(*(uint32x2_t *)volumes, *(uint32x2_t *)volumes); |
239 vol_s16ne_neon(vol4, samples, length); | 248 vol_s16_neon(&vol4, samples, length); |
240 | 249 |
241 for (i = length & ~7; i < length; i++) { | 250 for (i = length & ~3; i < length; i++) { |
242 int32_t t = samples[i]; | 251 int32_t t = samples[i]; |
243 int32_t vol = volumes[(channel++) & 1]; | 252 uint32_t vol = volumes[(channel++) & 1]; |
244 t = ((t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16)); | 253 t = ((int32_t) (t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16)); |
245 samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); | 254 samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); |
246 } | 255 } |
247 break; | 256 break; |
248 case 4: | 257 case 4: |
249 vol4 = *(int32x4_t *)volumes; | 258 vol4 = *(uint32x4_t *)volumes; |
250 vol_s16ne_neon(vol4, samples, length); | 259 vol_s16_neon(&vol4, samples, length); |
251 | 260 |
252 for (i = length & ~7; i < length; i++) { | 261 for (i = length & ~3; i < length; i++) { |
253 int32_t t = samples[i]; | 262 int32_t t = samples[i]; |
254 int32_t vol = volumes[(channel++) & 3]; | 263 uint32_t vol = volumes[(channel++) & 3]; |
255 t = ((t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16)); | 264 t = ((int32_t) (t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16)); |
256 samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); | 265 samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); |
257 } | 266 } |
258 break; | 267 break; |
259 default: | 268 default: |
260 for (; length; length--) { | 269 for (; length; length--) { |
261 int32_t t, hi, lo; | 270 int32_t t; |
271 uint32_t hi, lo; | |
262 | 272 |
263 /* Multiplying the 32bit volume factor with the 16bit | 273 /* Multiplying the 32bit volume factor with the 16bit |
264 * sample might result in a 48bit value. We want to | 274 * sample might result in a 48bit value. We want to |
265 * do without 64 bit integers and hence do the | 275 * do without 64 bit integers and hence do the |
266 * multiplication independently for the HI and LO part | 276 * multiplication independently for the HI and LO part |
268 | 278 |
269 hi = volumes[channel] >> 16; | 279 hi = volumes[channel] >> 16; |
270 lo = volumes[channel] & 0xFFFF; | 280 lo = volumes[channel] & 0xFFFF; |
271 | 281 |
272 t = (int32_t)(*samples); | 282 t = (int32_t)(*samples); |
273 t = ((t * lo) >> 16) + (t * hi); | 283 t = ((int32_t) (t * lo) >> 16) + (t * hi); |
274 t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); | 284 t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); |
275 *samples++ = (int16_t) t; | 285 *samples++ = (int16_t) t; |
276 | 286 |
277 if (PA_UNLIKELY(++channel >= channels)) | 287 if (PA_UNLIKELY(++channel >= channels)) |
278 channel = 0; | 288 channel = 0; |
279 } | 289 } |
280 break; | 290 break; |
281 } | 291 } |
282 } | 292 } |
283 | 293 |
284 void pa_volume_float32ne_neon(float *samples, float *volumes, unsigned channels, unsigned length) { | 294 static void pa_volume_float32ne_neon(float *samples, const float *volumes, unsigned channels, unsigned length) { |
285 unsigned channel = 0, i; | 295 unsigned channel = 0, i; |
286 float32x4_t vol4; | 296 float32x4_t vol4; |
287 | 297 |
288 length /= sizeof(float); | 298 length /= sizeof(float); |
289 | 299 |
290 switch (channels) { | 300 switch (channels) { |
291 case 1: | 301 case 1: |
292 vol4 = vdupq_n_f32(*volumes); | 302 vol4 = vdupq_n_f32(*volumes); |
293 for (i = 0; i < length/4; i++) { | 303 vol_float_neon(&vol4, samples, length); |
294 ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4); | |
295 } | |
296 | 304 |
297 for (i = length & ~3; i < length; i++) { | 305 for (i = length & ~3; i < length; i++) { |
298 samples[i] *= volumes[0]; | 306 samples[i] *= volumes[0]; |
299 } | 307 } |
300 break; | 308 break; |
301 case 2: | 309 case 2: |
302 vol4 = vcombine_f32(*(float32x2_t *)volumes, *(float32x2_t *)volumes); | 310 vol4 = vcombine_f32(*(float32x2_t *)volumes, *(float32x2_t *)volumes); |
303 for (i = 0; i < length/4; i++) { | 311 vol_float_neon(&vol4, samples, length); |
304 ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4); | |
305 } | |
306 | 312 |
307 for (i = length & ~3; i < length; i++) { | 313 for (i = length & ~3; i < length; i++) { |
308 samples[i] *= volumes[channel]; | 314 samples[i] *= volumes[channel]; |
309 | 315 |
310 if (PA_UNLIKELY(++channel >= channels)) | 316 if (PA_UNLIKELY(++channel >= channels)) |
311 channel = 0; | 317 channel = 0; |
312 } | 318 } |
313 break; | 319 break; |
314 case 4: | 320 case 4: |
315 vol4 = *(float32x4_t *)volumes; | 321 vol4 = *(float32x4_t *)volumes; |
316 for (i = 0; i < length/4; i++) { | 322 vol_float_neon(&vol4, samples, length); |
317 ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4); | |
318 } | |
319 | 323 |
320 for (i = length & ~3; i < length; i++) { | 324 for (i = length & ~3; i < length; i++) { |
321 samples[i] *= volumes[channel++]; | 325 samples[i] *= volumes[channel++]; |
322 } | 326 } |
323 break; | 327 break; |
331 break; | 335 break; |
332 } | 336 } |
333 } | 337 } |
334 | 338 |
335 #define SAMPLES 1019 | 339 #define SAMPLES 1019 |
336 #define TIMES 3000 | 340 #define TIMES 50000 |
337 #define CHANNELS 4 | 341 #define CHANNELS 4 |
338 #define PADDING 16 | 342 #define PADDING 16 |
339 | 343 |
340 static void run_test_float(void) { | 344 static void run_test_float(void) { |
341 float floats[SAMPLES]; | 345 float floats[SAMPLES]; |
356 for (i = 0; i < CHANNELS; i++) | 360 for (i = 0; i < CHANNELS; i++) |
357 volumes[i] = 0.5f * rand() / (float) RAND_MAX; | 361 volumes[i] = 0.5f * rand() / (float) RAND_MAX; |
358 | 362 |
359 pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats)); | 363 pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats)); |
360 pa_volume_float32ne_c(floats_ref, volumes, CHANNELS, sizeof(floats_ref)); | 364 pa_volume_float32ne_c(floats_ref, volumes, CHANNELS, sizeof(floats_ref)); |
361 | 365 |
362 for (i = 0; i < SAMPLES; i++) { | 366 for (i = 0; i < SAMPLES; i++) { |
363 if (fabsf(floats[i] - floats_ref[i]) > 0.00001) { | 367 if (fabsf(floats[i] - floats_ref[i]) > 0.00001) { |
364 pa_log_debug("%d: %.3f != %.3f (%.3f)", i, floats[i], floats_ref[i], | 368 pa_log_debug("%d: %.3f != %.3f (%.3f)", i, floats[i], floats_ref[i], |
365 floats_orig[i]); | 369 floats_orig[i]); |
366 } | 370 } |
385 | 389 |
386 static void run_test_s16(void) { | 390 static void run_test_s16(void) { |
387 int16_t samples[SAMPLES]; | 391 int16_t samples[SAMPLES]; |
388 int16_t samples_ref[SAMPLES]; | 392 int16_t samples_ref[SAMPLES]; |
389 int16_t samples_orig[SAMPLES]; | 393 int16_t samples_orig[SAMPLES]; |
390 int32_t volumes[CHANNELS + PADDING]; | 394 uint32_t volumes[CHANNELS + PADDING]; |
391 unsigned i, padding; | 395 unsigned i, padding; |
392 pa_usec_t start, stop; | 396 pa_usec_t start, stop; |
393 | 397 |
394 pa_log_debug("checking NEON volume_s16ne(%d)", SAMPLES); | 398 pa_log_debug("checking NEON volume_s16ne(%d)", SAMPLES); |
395 | 399 |
411 if (abs(samples[i] - samples_ref[i]) > 0) { | 415 if (abs(samples[i] - samples_ref[i]) > 0) { |
412 pa_log_debug("%d: %d != %d (%d)", i, samples[i], samples_ref[i], | 416 pa_log_debug("%d: %d != %d (%d)", i, samples[i], samples_ref[i], |
413 samples_orig[i]); | 417 samples_orig[i]); |
414 } | 418 } |
415 } | 419 } |
416 | 420 exit(0); |
417 start = pa_rtclock_now(); | 421 start = pa_rtclock_now(); |
418 for (i = 0; i < TIMES; i++) { | 422 for (i = 0; i < TIMES; i++) { |
419 memcpy(samples, samples_orig, sizeof(samples_orig)); | 423 memcpy(samples, samples_orig, sizeof(samples_orig)); |
420 pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples)); | 424 pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples)); |
421 } | 425 } |