Mercurial > hg > pa-neon
comparison svolume_neon.c @ 0:e0040ee59c3c
import
| author | Peter Meerwald <p.meerwald@bct-electronic.com> |
|---|---|
| date | Thu, 12 Jan 2012 17:27:46 +0100 |
| parents | |
| children | b829afbea564 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:e0040ee59c3c |
|---|---|
| 1 /* | |
| 2 * Copyright 2012 Peter Meerwald <p.meerwald@bct-electronic.com> | |
| 3 */ | |
| 4 | |
#include <assert.h>
#include <math.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
| 12 | |
| 13 | |
/*
 * Minimal stand-ins for the PulseAudio declarations this file refers to.
 *
 * The fixed-width integer types used throughout the file (int16_t, int32_t,
 * uint32_t) and UINT32_MAX are taken from <stdint.h> instead of being
 * re-declared by hand; the previous hand-written typedefs covered only
 * int16_t and uint32_t.
 */
typedef enum pa_sample_format {
    PA_SAMPLE_S16LE,
    PA_SAMPLE_FLOAT32LE,
} pa_sample_format_t;

/* The "native endian" names are mapped onto the little-endian formats. */
#define PA_SAMPLE_S16NE PA_SAMPLE_S16LE
#define PA_SAMPLE_FLOAT32NE PA_SAMPLE_FLOAT32LE

typedef struct {
    pa_sample_format_t *format;
} pa_remap_t;

typedef void (*pa_remap_func_t)(pa_remap_t *m, void *dst, const void *src, unsigned n);

/* Time value in microseconds (see pa_rtclock_now()). */
typedef long long unsigned int pa_usec_t;
| 27 | |
/* Map the pa_assert helpers onto the C library's assert(). */
#define pa_assert(x) assert(x)
#define pa_assert_not_reached() assert(0)

/* Larger of a and b; an argument may be evaluated more than once. */
#define PA_MAX(a, b) ((a) > (b) ? (a) : (b))

/* Volume factor type and its lower/upper bounds. */
typedef uint32_t pa_volume_t;
#define PA_VOLUME_MUTED ((pa_volume_t) 0U)
/* The cast binds to UINT32_MAX first; the result is then divided by 2. */
#define PA_VOLUME_MAX ((pa_volume_t) UINT32_MAX/2)

/* Stand-in that expands to its argument unchanged. */
#define PA_UNLIKELY(x) (x)
/* Clamp x to [low, high]; x may be evaluated up to three times. */
#define PA_CLAMP_UNLIKELY(x, low, high) (PA_UNLIKELY((x) > (high)) ? (high) : (PA_UNLIKELY((x) < (low)) ? (low) : (x)))
/* Clamp v to [PA_VOLUME_MUTED, PA_VOLUME_MAX]. */
#define PA_CLAMP_VOLUME(v) (PA_CLAMP_UNLIKELY((v), PA_VOLUME_MUTED, PA_VOLUME_MAX))
| 40 | |
/*
 * Print a printf-style formatted message to stdout, followed by a newline.
 *
 * format: printf-style format string; the variadic arguments supply its values.
 *
 * The message is written directly with vprintf(), so its length is not bounded
 * by a fixed-size intermediate buffer (the earlier vsprintf() into a 1024-byte
 * stack array overflowed on longer messages).
 */
static void pa_log_info(const char *format, ...) {
    va_list ap;

    va_start(ap, format);
    vprintf(format, ap);
    va_end(ap);

    putchar('\n');
}

/* Debug messages go to the same sink as informational ones. */
#define pa_log_debug pa_log_info
| 51 | |
| 52 static pa_usec_t pa_rtclock_now() { | |
| 53 struct timeval tv; | |
| 54 gettimeofday(&tv, NULL); | |
| 55 | |
| 56 return tv.tv_sec * 1000000ULL + tv.tv_usec; | |
| 57 } | |
| 58 | |
/*
 * Scale interleaved 16-bit samples in place by per-channel volume factors.
 *
 * samples:  buffer of int16_t samples, modified in place.
 * volumes:  one 32-bit factor per channel; the result for a sample is
 *           (sample * factor) >> 16, saturated to the int16_t range.
 * channels: number of entries of volumes to cycle through.
 * length:   size of the sample buffer in bytes.
 */
void pa_volume_s16ne_c(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
    const unsigned count = length / sizeof(int16_t);
    unsigned ch = 0;
    unsigned n;

    for (n = 0; n < count; n++) {
        /* The 32-bit factor is split into 16-bit halves so that the
         * (up to 48-bit) product never needs a 64-bit intermediate. */
        const int32_t vol_hi = volumes[ch] >> 16;
        const int32_t vol_lo = volumes[ch] & 0xFFFF;
        const int32_t in = (int32_t) samples[n];
        int32_t out = ((in * vol_lo) >> 16) + (in * vol_hi);

        out = PA_CLAMP_UNLIKELY(out, -0x8000, 0x7FFF);
        samples[n] = (int16_t) out;

        /* Move on to the next channel's factor, wrapping at the end. */
        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;
    }
}
| 85 | |
/*
 * Multiply interleaved float samples in place by per-channel volume factors.
 *
 * samples:  buffer of float samples, modified in place.
 * volumes:  one factor per channel.
 * channels: number of entries of volumes to cycle through.
 * length:   size of the sample buffer in bytes.
 */
void pa_volume_float32ne_c(float *samples, float *volumes, unsigned channels, unsigned length) {
    const unsigned count = length / sizeof(float);
    unsigned ch = 0;
    unsigned n;

    for (n = 0; n < count; n++) {
        samples[n] *= volumes[ch];

        /* Move on to the next channel's factor, wrapping at the end. */
        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;
    }
}
| 98 | |
| 99 /* | |
| 100 void pa_volume_s16ne_orc(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) | |
| 101 { | |
| 102 if (channels == 2) { | |
| 103 int64_t v = (int64_t)volumes[1] << 32 | volumes[0]; | |
| 104 pa_volume_s16ne_orc_2ch (samples, v, ((length / (sizeof(int16_t))) / 2)); | |
| 105 } else if (channels == 1) | |
| 106 pa_volume_s16ne_orc_1ch (samples, volumes[0], length / (sizeof(int16_t))); | |
| 107 } | |
| 108 */ | |
| 109 | |
| 110 #if defined(__arm__) | |
| 111 | |
| 112 #include "arm_neon.h" | |
| 113 | |
/*
 * Wrap step for the volume cursor kept in r6.
 * Operand %1 is the start of the volume array and %2 its end (ve):
 * r0 = r6 - %2, and if that subtraction does not borrow (r6 >= %2) the
 * cursor is rewound to %1 plus the overshoot. Clobbers r0 and the flags.
 */
#define MOD_INC() \
    " subs r0, r6, %2 \n\t" \
    " itt cs \n\t" \
    " addcs r0, %1 \n\t" \
    " movcs r6, r0 \n\t"

/*
 * Scale interleaved 16-bit samples in place using ARM integer instructions.
 *
 * samples:  buffer of int16_t samples, modified in place.
 * volumes:  per-channel 32-bit factors; each product is taken as
 *           (factor * sample) >> 16 (smulw*) and saturated to 16 bits (ssat).
 *           The array is read up to the adjusted channel count below, so it
 *           must be padded with repeated factors.
 * channels: number of channels.
 * length:   size of the sample buffer in bytes.
 *
 * The sample count is consumed bit by bit: one sample if it is odd, then one
 * pair if the remaining pair count is odd, then groups of four.
 *
 * On little-endian targets a 32-bit load of two samples puts the first sample
 * in the low half-word. The first volume of each pair must therefore be
 * multiplied with the bottom half (smulwb) and packed into the low half
 * (first pkhbt operand). The previous code used the big-endian pairing
 * unconditionally, which applied each volume to its neighbour's sample.
 */
static void pa_volume_s16ne_arm(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
    int32_t *ve;

    /* Channels must be at least 4, and always a multiple of the original number.
     * This is also the max amount we overread the volume array, which should
     * have enough padding. */
    channels = channels == 3 ? 6 : PA_MAX (4U, channels);
    ve = volumes + channels;

    __asm__ __volatile__ (
        " mov r6, %1 \n\t"                   /* r6 = volume cursor */
        " mov %3, %3, LSR #1 \n\t"           /* length /= sizeof (int16_t) */
        " tst %3, #1 \n\t"                   /* check for odd samples */
        " beq 2f \n\t"

        "1: \n\t"
        " ldr r0, [r6], #4 \n\t"             /* odd samples volumes */
        " ldrh r2, [%0] \n\t"

        " smulwb r0, r0, r2 \n\t"
        " ssat r0, #16, r0 \n\t"

        " strh r0, [%0], #2 \n\t"

        MOD_INC()

        "2: \n\t"
        " mov %3, %3, LSR #1 \n\t"
        " tst %3, #1 \n\t"                   /* check for odd samples */
        " beq 4f \n\t"

        "3: \n\t"
        " ldrd r2, [r6], #8 \n\t"            /* 2 samples at a time */
        " ldr r0, [%0] \n\t"

#if defined(__ARMEB__)
        " smulwt r2, r2, r0 \n\t"
        " smulwb r3, r3, r0 \n\t"
#else
        " smulwb r2, r2, r0 \n\t"            /* first volume * first sample */
        " smulwt r3, r3, r0 \n\t"            /* second volume * second sample */
#endif

        " ssat r2, #16, r2 \n\t"
        " ssat r3, #16, r3 \n\t"

#if defined(__ARMEB__)
        " pkhbt r0, r3, r2, LSL #16 \n\t"
#else
        " pkhbt r0, r2, r3, LSL #16 \n\t"    /* first result in the low half */
#endif
        " str r0, [%0], #4 \n\t"

        MOD_INC()

        "4: \n\t"
        " movs %3, %3, LSR #1 \n\t"
        " beq 6f \n\t"

        "5: \n\t"
        " ldrd r2, [r6], #8 \n\t"            /* 4 samples at a time */
        " ldrd r4, [r6], #8 \n\t"
        " ldrd r0, [%0] \n\t"

#if defined(__ARMEB__)
        " smulwt r2, r2, r0 \n\t"
        " smulwb r3, r3, r0 \n\t"
        " smulwt r4, r4, r1 \n\t"
        " smulwb r5, r5, r1 \n\t"
#else
        " smulwb r2, r2, r0 \n\t"
        " smulwt r3, r3, r0 \n\t"
        " smulwb r4, r4, r1 \n\t"
        " smulwt r5, r5, r1 \n\t"
#endif

        " ssat r2, #16, r2 \n\t"
        " ssat r3, #16, r3 \n\t"
        " ssat r4, #16, r4 \n\t"
        " ssat r5, #16, r5 \n\t"

#if defined(__ARMEB__)
        " pkhbt r0, r3, r2, LSL #16 \n\t"
        " pkhbt r1, r5, r4, LSL #16 \n\t"
#else
        " pkhbt r0, r2, r3, LSL #16 \n\t"
        " pkhbt r1, r4, r5, LSL #16 \n\t"
#endif
        " strd r0, [%0], #8 \n\t"

        MOD_INC()

        " subs %3, %3, #1 \n\t"
        " bne 5b \n\t"
        "6: \n\t"

        : "+r" (samples), "+r" (volumes), "+r" (ve), "+r" (length)
        :
        /* "memory": the code stores through samples, which is not an output operand. */
        : "r6", "r5", "r4", "r3", "r2", "r1", "r0", "cc", "memory"
    );
}
| 200 | |
| 201 static inline void vol_s16ne_neon(int32x4_t vol4, int16_t *samples, unsigned length) { | |
| 202 unsigned i; | |
| 203 int16x4_t hi = vshrn_n_s32(vol4, 16); | |
| 204 int32x4_t lo = vandq_s32(vol4, vdupq_n_s32(0xFFFF)); | |
| 205 | |
| 206 for (i = 0; i < length/8; i++) { | |
| 207 int16x4_t v1 = ((int16x4_t *) samples)[2*i]; | |
| 208 int16x4_t v2 = ((int16x4_t *) samples)[2*i+1]; | |
| 209 | |
| 210 int32x4_t t1 = vmull_s16(v1, hi); | |
| 211 int32x4_t t2 = vmull_s16(v2, hi); | |
| 212 | |
| 213 int16x4_t r1 = vqmovn_s32(vsraq_n_s32(t1, vmulq_s32(vmovl_s16(v1), lo), 16)); | |
| 214 int16x4_t r2 = vqmovn_s32(vsraq_n_s32(t2, vmulq_s32(vmovl_s16(v2), lo), 16)); | |
| 215 | |
| 216 ((int16x8_t *)samples)[i] = vcombine_s16(r1, r2); | |
| 217 } | |
| 218 } | |
| 219 | |
| 220 void pa_volume_s16ne_neon(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) { | |
| 221 unsigned channel = 0, i; | |
| 222 int32x4_t vol4; | |
| 223 | |
| 224 length /= sizeof(int16_t); | |
| 225 | |
| 226 switch (channels) { | |
| 227 case 1: | |
| 228 vol4 = vdupq_n_s32(*volumes); | |
| 229 vol_s16ne_neon(vol4, samples, length); | |
| 230 | |
| 231 for (i = length & ~7; i < length; i++) { | |
| 232 int32_t t = samples[i]; | |
| 233 t = ((t * (*volumes & 0xFFFF)) >> 16) + (t * (*volumes >> 16)); | |
| 234 samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); | |
| 235 } | |
| 236 break; | |
| 237 case 2: | |
| 238 vol4 = vcombine_s32(*(int32x2_t *)volumes, *(int32x2_t *)volumes); | |
| 239 vol_s16ne_neon(vol4, samples, length); | |
| 240 | |
| 241 for (i = length & ~7; i < length; i++) { | |
| 242 int32_t t = samples[i]; | |
| 243 int32_t vol = volumes[(channel++) & 1]; | |
| 244 t = ((t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16)); | |
| 245 samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); | |
| 246 } | |
| 247 break; | |
| 248 case 4: | |
| 249 vol4 = *(int32x4_t *)volumes; | |
| 250 vol_s16ne_neon(vol4, samples, length); | |
| 251 | |
| 252 for (i = length & ~7; i < length; i++) { | |
| 253 int32_t t = samples[i]; | |
| 254 int32_t vol = volumes[(channel++) & 3]; | |
| 255 t = ((t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16)); | |
| 256 samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); | |
| 257 } | |
| 258 break; | |
| 259 default: | |
| 260 for (; length; length--) { | |
| 261 int32_t t, hi, lo; | |
| 262 | |
| 263 /* Multiplying the 32bit volume factor with the 16bit | |
| 264 * sample might result in an 48bit value. We want to | |
| 265 * do without 64 bit integers and hence do the | |
| 266 * multiplication independently for the HI and LO part | |
| 267 * of the volume. */ | |
| 268 | |
| 269 hi = volumes[channel] >> 16; | |
| 270 lo = volumes[channel] & 0xFFFF; | |
| 271 | |
| 272 t = (int32_t)(*samples); | |
| 273 t = ((t * lo) >> 16) + (t * hi); | |
| 274 t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); | |
| 275 *samples++ = (int16_t) t; | |
| 276 | |
| 277 if (PA_UNLIKELY(++channel >= channels)) | |
| 278 channel = 0; | |
| 279 } | |
| 280 break; | |
| 281 } | |
| 282 } | |
| 283 | |
| 284 void pa_volume_float32ne_neon(float *samples, float *volumes, unsigned channels, unsigned length) { | |
| 285 unsigned channel = 0, i; | |
| 286 float32x4_t vol4; | |
| 287 | |
| 288 length /= sizeof(float); | |
| 289 | |
| 290 switch (channels) { | |
| 291 case 1: | |
| 292 vol4 = vdupq_n_f32(*volumes); | |
| 293 for (i = 0; i < length/4; i++) { | |
| 294 ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4); | |
| 295 } | |
| 296 | |
| 297 for (i = length & ~3; i < length; i++) { | |
| 298 samples[i] *= volumes[0]; | |
| 299 } | |
| 300 break; | |
| 301 case 2: | |
| 302 vol4 = vcombine_f32(*(float32x2_t *)volumes, *(float32x2_t *)volumes); | |
| 303 for (i = 0; i < length/4; i++) { | |
| 304 ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4); | |
| 305 } | |
| 306 | |
| 307 for (i = length & ~3; i < length; i++) { | |
| 308 samples[i] *= volumes[channel]; | |
| 309 | |
| 310 if (PA_UNLIKELY(++channel >= channels)) | |
| 311 channel = 0; | |
| 312 } | |
| 313 break; | |
| 314 case 4: | |
| 315 vol4 = *(float32x4_t *)volumes; | |
| 316 for (i = 0; i < length/4; i++) { | |
| 317 ((float32x4_t *)samples)[i] = vmulq_f32(((float32x4_t *)samples)[i], vol4); | |
| 318 } | |
| 319 | |
| 320 for (i = length & ~3; i < length; i++) { | |
| 321 samples[i] *= volumes[channel++]; | |
| 322 } | |
| 323 break; | |
| 324 default: | |
| 325 for (; length; length--) { | |
| 326 *samples++ *= volumes[channel]; | |
| 327 | |
| 328 if (PA_UNLIKELY(++channel >= channels)) | |
| 329 channel = 0; | |
| 330 } | |
| 331 break; | |
| 332 } | |
| 333 } | |
| 334 | |
/* Number of samples in each test buffer; not a multiple of 4 or 8. */
#define SAMPLES 1019
/* Number of iterations of each timed loop. */
#define TIMES 1000
/* Channel count passed to the volume functions under test. */
#define CHANNELS 4
/* Extra entries appended to the s16 volume array (repeating the factors). */
#define PADDING 16
| 339 | |
| 340 static void run_test_float(void) { | |
| 341 float floats[SAMPLES]; | |
| 342 float floats_ref[SAMPLES]; | |
| 343 float floats_orig[SAMPLES]; | |
| 344 float volumes[CHANNELS]; | |
| 345 unsigned i; | |
| 346 pa_usec_t start, stop; | |
| 347 | |
| 348 pa_log_debug("checking NEON volume_float32ne(%d)", SAMPLES); | |
| 349 | |
| 350 for (i = 0; i < SAMPLES; i++) { | |
| 351 floats_orig[i] = rand()/(float) RAND_MAX - 0.5f; | |
| 352 } | |
| 353 memcpy(floats_ref, floats_orig, sizeof(floats_orig)); | |
| 354 memcpy(floats, floats_orig, sizeof(floats_orig)); | |
| 355 | |
| 356 for (i = 0; i < CHANNELS; i++) | |
| 357 volumes[i] = 0.5f * rand() / (float) RAND_MAX; | |
| 358 | |
| 359 pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats)); | |
| 360 pa_volume_float32ne_c(floats_ref, volumes, CHANNELS, sizeof(floats_ref)); | |
| 361 | |
| 362 for (i = 0; i < SAMPLES; i++) { | |
| 363 if (fabsf(floats[i] - floats_ref[i]) > 0.00001) { | |
| 364 pa_log_debug("%d: %.3f != %.3f (%.3f)", i, floats[i], floats_ref[i], | |
| 365 floats_orig[i]); | |
| 366 } | |
| 367 } | |
| 368 | |
| 369 start = pa_rtclock_now(); | |
| 370 for (i = 0; i < TIMES; i++) { | |
| 371 memcpy(floats, floats_orig, sizeof(floats_orig)); | |
| 372 pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats)); | |
| 373 } | |
| 374 stop = pa_rtclock_now(); | |
| 375 pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start)); | |
| 376 | |
| 377 start = pa_rtclock_now(); | |
| 378 for (i = 0; i < TIMES; i++) { | |
| 379 memcpy(floats_ref, floats_orig, sizeof(floats_orig)); | |
| 380 pa_volume_float32ne_c(floats_ref, volumes, CHANNELS, sizeof(floats_ref)); | |
| 381 } | |
| 382 stop = pa_rtclock_now(); | |
| 383 pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start)); | |
| 384 } | |
| 385 | |
| 386 static void run_test_s16(void) { | |
| 387 int16_t samples[SAMPLES]; | |
| 388 int16_t samples_ref[SAMPLES]; | |
| 389 int16_t samples_orig[SAMPLES]; | |
| 390 int32_t volumes[CHANNELS + PADDING]; | |
| 391 unsigned i, padding; | |
| 392 pa_usec_t start, stop; | |
| 393 | |
| 394 pa_log_debug("checking NEON volume_s16ne(%d)", SAMPLES); | |
| 395 | |
| 396 for (i = 0; i < SAMPLES; i++) { | |
| 397 samples_orig[i] = rand() - RAND_MAX/2; | |
| 398 } | |
| 399 memcpy(samples_ref, samples_orig, sizeof(samples_orig)); | |
| 400 memcpy(samples, samples_orig, sizeof(samples_orig)); | |
| 401 | |
| 402 for (i = 0; i < CHANNELS; i++) | |
| 403 volumes[i] = PA_CLAMP_VOLUME(rand() >> 15); | |
| 404 for (padding = 0; padding < PADDING; padding++, i++) | |
| 405 volumes[i] = volumes[padding]; | |
| 406 | |
| 407 pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples)); | |
| 408 pa_volume_s16ne_c(samples_ref, volumes, CHANNELS, sizeof(samples_ref)); | |
| 409 | |
| 410 for (i = 0; i < SAMPLES; i++) { | |
| 411 if (abs(samples[i] - samples_ref[i]) > 0) { | |
| 412 pa_log_debug("%d: %d != %d (%d)", i, samples[i], samples_ref[i], | |
| 413 samples_orig[i]); | |
| 414 } | |
| 415 } | |
| 416 | |
| 417 start = pa_rtclock_now(); | |
| 418 for (i = 0; i < TIMES; i++) { | |
| 419 memcpy(samples, samples_orig, sizeof(samples_orig)); | |
| 420 pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples)); | |
| 421 } | |
| 422 stop = pa_rtclock_now(); | |
| 423 pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start)); | |
| 424 | |
| 425 start = pa_rtclock_now(); | |
| 426 for (i = 0; i < TIMES; i++) { | |
| 427 memcpy(samples, samples_orig, sizeof(samples_orig)); | |
| 428 pa_volume_s16ne_arm(samples, volumes, CHANNELS, sizeof(samples)); | |
| 429 } | |
| 430 stop = pa_rtclock_now(); | |
| 431 pa_log_info("ARM: %llu usec.", (long long unsigned int)(stop - start)); | |
| 432 | |
| 433 start = pa_rtclock_now(); | |
| 434 for (i = 0; i < TIMES; i++) { | |
| 435 memcpy(samples_ref, samples_orig, sizeof(samples_orig)); | |
| 436 pa_volume_s16ne_c(samples_ref, volumes, CHANNELS, sizeof(samples_ref)); | |
| 437 } | |
| 438 stop = pa_rtclock_now(); | |
| 439 pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start)); | |
| 440 } | |
| 441 | |
| 442 #endif /* defined(__arm__) */ | |
| 443 | |
/*
 * Program entry point: run the float and s16 volume checks and benchmarks.
 *
 * The test routines are only compiled when __arm__ is defined, so on other
 * targets the program logs that they were skipped instead of referring to
 * functions that do not exist.
 */
int main(void) {
#if defined(__arm__)
    run_test_float();
    run_test_s16();
#else
    pa_log_info("NEON volume tests skipped: not built for ARM");
#endif

    return EXIT_SUCCESS;
}
