spandsp-0.0.6pre17/src/vector_float.c @ changeset 4:26cd8f1ef0b1

author: Peter Meerwald <pmeerw@cosy.sbg.ac.at>
date:   Fri, 25 Jun 2010 15:50:58 +0200
commit: import spandsp-0.0.6pre17
/*
 * SpanDSP - a series of DSP components for telephony
 *
 * vector_float.c - Floating vector arithmetic routines.
 *
 * Written by Steve Underwood <steveu@coppice.org>
 *
 * Copyright (C) 2006 Steve Underwood
 *
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License version 2.1,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: vector_float.c,v 1.22 2009/07/12 09:23:09 steveu Exp $
 */

/*! \file */

#if defined(HAVE_CONFIG_H)
#include "config.h"
#endif

#include <inttypes.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#if defined(HAVE_TGMATH_H)
#include <tgmath.h>
#endif
#if defined(HAVE_MATH_H)
#include <math.h>
#endif
#include <assert.h>

#include "floating_fudge.h"
#include "mmx_sse_decs.h"

#include "spandsp/telephony.h"
#include "spandsp/vector_float.h"

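/* The SSE2 code paths below all follow the same pattern: process the bulk of
   the vector four floats at a time with unaligned loads and stores, then
   handle the remaining 1 to 3 elements in a switch statement whose cases
   deliberately fall through. */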
#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(void) vec_copyf(float z[], const float x[], int n)
{
    int i;
    __m128 n1;

    if ((i = n & ~3))
    {
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            _mm_storeu_ps(z + i, n1);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3];
    case 2:
        z[n - 2] = x[n - 2];
    case 1:
        z[n - 1] = x[n - 1];
    }
}
#else
SPAN_DECLARE(void) vec_copyf(float z[], const float x[], int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i];
}
#endif
/*- End of function --------------------------------------------------------*/

SPAN_DECLARE(void) vec_copy(double z[], const double x[], int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i];
}
/*- End of function --------------------------------------------------------*/

#if defined(HAVE_LONG_DOUBLE)
SPAN_DECLARE(void) vec_copyl(long double z[], const long double x[], int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i];
}
/*- End of function --------------------------------------------------------*/
#endif

#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(void) vec_negatef(float z[], const float x[], int n)
{
    int i;
    static const uint32_t mask = 0x80000000;
    static const float *fmask = (float *) &mask;
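    /* mask holds the IEEE 754 single precision sign bit. fmask aliases those
       bits as a float, so the sign bit can be broadcast into an SSE register
       and XORed with each element to flip its sign. */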
    __m128 n1;
    __m128 n2;

    if ((i = n & ~3))
    {
        n2 = _mm_set1_ps(*fmask);
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n1 = _mm_xor_ps(n1, n2);
            _mm_storeu_ps(z + i, n1);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = -x[n - 3];
    case 2:
        z[n - 2] = -x[n - 2];
    case 1:
        z[n - 1] = -x[n - 1];
    }
}
#else
SPAN_DECLARE(void) vec_negatef(float z[], const float x[], int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = -x[i];
}
#endif
/*- End of function --------------------------------------------------------*/

SPAN_DECLARE(void) vec_negate(double z[], const double x[], int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = -x[i];
}
/*- End of function --------------------------------------------------------*/

#if defined(HAVE_LONG_DOUBLE)
SPAN_DECLARE(void) vec_negatel(long double z[], const long double x[], int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = -x[i];
}
/*- End of function --------------------------------------------------------*/
#endif

#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(void) vec_zerof(float z[], int n)
{
    int i;
    __m128 n1;

    if ((i = n & ~3))
    {
        n1 = _mm_setzero_ps();
        for (i -= 4; i >= 0; i -= 4)
            _mm_storeu_ps(z + i, n1);
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = 0;
    case 2:
        z[n - 2] = 0;
    case 1:
        z[n - 1] = 0;
    }
}
#else
SPAN_DECLARE(void) vec_zerof(float z[], int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = 0.0f;
}
#endif
/*- End of function --------------------------------------------------------*/

SPAN_DECLARE(void) vec_zero(double z[], int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = 0.0;
}
/*- End of function --------------------------------------------------------*/

#if defined(HAVE_LONG_DOUBLE)
SPAN_DECLARE(void) vec_zerol(long double z[], int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = 0.0L;
}
/*- End of function --------------------------------------------------------*/
#endif

#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(void) vec_setf(float z[], float x, int n)
{
    int i;
    __m128 n1;

    if ((i = n & ~3))
    {
        n1 = _mm_set1_ps(x);
        for (i -= 4; i >= 0; i -= 4)
            _mm_storeu_ps(z + i, n1);
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x;
    case 2:
        z[n - 2] = x;
    case 1:
        z[n - 1] = x;
    }
}
#else
SPAN_DECLARE(void) vec_setf(float z[], float x, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x;
}
#endif
/*- End of function --------------------------------------------------------*/

SPAN_DECLARE(void) vec_set(double z[], double x, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x;
}
/*- End of function --------------------------------------------------------*/

#if defined(HAVE_LONG_DOUBLE)
SPAN_DECLARE(void) vec_setl(long double z[], long double x, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x;
}
/*- End of function --------------------------------------------------------*/
#endif

#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(void) vec_addf(float z[], const float x[], const float y[], int n)
{
    int i;
    __m128 n1;
    __m128 n2;

    if ((i = n & ~3))
    {
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n2 = _mm_loadu_ps(y + i);
            n2 = _mm_add_ps(n1, n2);
            _mm_storeu_ps(z + i, n2);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3] + y[n - 3];
    case 2:
        z[n - 2] = x[n - 2] + y[n - 2];
    case 1:
        z[n - 1] = x[n - 1] + y[n - 1];
    }
}
#else
SPAN_DECLARE(void) vec_addf(float z[], const float x[], const float y[], int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i] + y[i];
}
#endif
/*- End of function --------------------------------------------------------*/

SPAN_DECLARE(void) vec_add(double z[], const double x[], const double y[], int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i] + y[i];
}
/*- End of function --------------------------------------------------------*/

#if defined(HAVE_LONG_DOUBLE)
SPAN_DECLARE(void) vec_addl(long double z[], const long double x[], const long double y[], int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i] + y[i];
}
/*- End of function --------------------------------------------------------*/
#endif

#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(void) vec_scaledxy_addf(float z[], const float x[], float x_scale, const float y[], float y_scale, int n)
{
    int i;
    __m128 n1;
    __m128 n2;
    __m128 n3;
    __m128 n4;

    if ((i = n & ~3))
    {
        n3 = _mm_set1_ps(x_scale);
        n4 = _mm_set1_ps(y_scale);
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n2 = _mm_loadu_ps(y + i);
            n1 = _mm_mul_ps(n1, n3);
            n2 = _mm_mul_ps(n2, n4);
            n2 = _mm_add_ps(n1, n2);
            _mm_storeu_ps(z + i, n2);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3]*x_scale + y[n - 3]*y_scale;
    case 2:
        z[n - 2] = x[n - 2]*x_scale + y[n - 2]*y_scale;
    case 1:
        z[n - 1] = x[n - 1]*x_scale + y[n - 1]*y_scale;
    }
}
#else
SPAN_DECLARE(void) vec_scaledxy_addf(float z[], const float x[], float x_scale, const float y[], float y_scale, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i]*x_scale + y[i]*y_scale;
}
#endif
/*- End of function --------------------------------------------------------*/

SPAN_DECLARE(void) vec_scaledxy_add(double z[], const double x[], double x_scale, const double y[], double y_scale, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i]*x_scale + y[i]*y_scale;
}
/*- End of function --------------------------------------------------------*/

#if defined(HAVE_LONG_DOUBLE)
SPAN_DECLARE(void) vec_scaledxy_addl(long double z[], const long double x[], long double x_scale, const long double y[], long double y_scale, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i]*x_scale + y[i]*y_scale;
}
/*- End of function --------------------------------------------------------*/
#endif

#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(void) vec_scaledy_addf(float z[], const float x[], const float y[], float y_scale, int n)
{
    int i;
    __m128 n1;
    __m128 n2;
    __m128 n3;

    if ((i = n & ~3))
    {
        n3 = _mm_set1_ps(y_scale);
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n2 = _mm_loadu_ps(y + i);
            n2 = _mm_mul_ps(n2, n3);
            n2 = _mm_add_ps(n1, n2);
            _mm_storeu_ps(z + i, n2);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3] + y[n - 3]*y_scale;
    case 2:
        z[n - 2] = x[n - 2] + y[n - 2]*y_scale;
    case 1:
        z[n - 1] = x[n - 1] + y[n - 1]*y_scale;
    }
}
#else
SPAN_DECLARE(void) vec_scaledy_addf(float z[], const float x[], const float y[], float y_scale, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i] + y[i]*y_scale;
}
#endif
/*- End of function --------------------------------------------------------*/

SPAN_DECLARE(void) vec_scaledy_add(double z[], const double x[], const double y[], double y_scale, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i] + y[i]*y_scale;
}
/*- End of function --------------------------------------------------------*/

#if defined(HAVE_LONG_DOUBLE)
SPAN_DECLARE(void) vec_scaledy_addl(long double z[], const long double x[], const long double y[], long double y_scale, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i] + y[i]*y_scale;
}
/*- End of function --------------------------------------------------------*/
#endif

#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(void) vec_subf(float z[], const float x[], const float y[], int n)
{
    int i;
    __m128 n1;
    __m128 n2;

    if ((i = n & ~3))
    {
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n2 = _mm_loadu_ps(y + i);
            n2 = _mm_sub_ps(n1, n2);
            _mm_storeu_ps(z + i, n2);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3] - y[n - 3];
    case 2:
        z[n - 2] = x[n - 2] - y[n - 2];
    case 1:
        z[n - 1] = x[n - 1] - y[n - 1];
    }
}
#else
SPAN_DECLARE(void) vec_subf(float z[], const float x[], const float y[], int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i] - y[i];
}
#endif
/*- End of function --------------------------------------------------------*/

SPAN_DECLARE(void) vec_sub(double z[], const double x[], const double y[], int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i] - y[i];
}
/*- End of function --------------------------------------------------------*/

#if defined(HAVE_LONG_DOUBLE)
SPAN_DECLARE(void) vec_subl(long double z[], const long double x[], const long double y[], int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i] - y[i];
}
/*- End of function --------------------------------------------------------*/
#endif

SPAN_DECLARE(void) vec_scaledxy_subf(float z[], const float x[], float x_scale, const float y[], float y_scale, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i]*x_scale - y[i]*y_scale;
}
/*- End of function --------------------------------------------------------*/

SPAN_DECLARE(void) vec_scaledxy_sub(double z[], const double x[], double x_scale, const double y[], double y_scale, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i]*x_scale - y[i]*y_scale;
}
/*- End of function --------------------------------------------------------*/

#if defined(HAVE_LONG_DOUBLE)
SPAN_DECLARE(void) vec_scaledxy_subl(long double z[], const long double x[], long double x_scale, const long double y[], long double y_scale, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i]*x_scale - y[i]*y_scale;
}
/*- End of function --------------------------------------------------------*/
#endif

#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(void) vec_scalar_mulf(float z[], const float x[], float y, int n)
{
    int i;
    __m128 n1;
    __m128 n2;

    if ((i = n & ~3))
    {
        n2 = _mm_set1_ps(y);
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n1 = _mm_mul_ps(n1, n2);
            _mm_storeu_ps(z + i, n1);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3]*y;
    case 2:
        z[n - 2] = x[n - 2]*y;
    case 1:
        z[n - 1] = x[n - 1]*y;
    }
}
#else
SPAN_DECLARE(void) vec_scalar_mulf(float z[], const float x[], float y, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i]*y;
}
#endif
/*- End of function --------------------------------------------------------*/

SPAN_DECLARE(void) vec_scalar_mul(double z[], const double x[], double y, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i]*y;
}
/*- End of function --------------------------------------------------------*/

#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(void) vec_scalar_addf(float z[], const float x[], float y, int n)
{
    int i;
    __m128 n1;
    __m128 n2;

    if ((i = n & ~3))
    {
        n2 = _mm_set1_ps(y);
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n1 = _mm_add_ps(n1, n2);
            _mm_storeu_ps(z + i, n1);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3] + y;
    case 2:
        z[n - 2] = x[n - 2] + y;
    case 1:
        z[n - 1] = x[n - 1] + y;
    }
}
#else
SPAN_DECLARE(void) vec_scalar_addf(float z[], const float x[], float y, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i] + y;
}
#endif
/*- End of function --------------------------------------------------------*/

SPAN_DECLARE(void) vec_scalar_add(double z[], const double x[], double y, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i] + y;
}
/*- End of function --------------------------------------------------------*/

#if defined(HAVE_LONG_DOUBLE)
SPAN_DECLARE(void) vec_scalar_addl(long double z[], const long double x[], long double y, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i] + y;
}
/*- End of function --------------------------------------------------------*/
#endif

#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(void) vec_scalar_subf(float z[], const float x[], float y, int n)
{
    int i;
    __m128 n1;
    __m128 n2;

    if ((i = n & ~3))
    {
        n2 = _mm_set1_ps(y);
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n1 = _mm_sub_ps(n1, n2);
            _mm_storeu_ps(z + i, n1);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3] - y;
    case 2:
        z[n - 2] = x[n - 2] - y;
    case 1:
        z[n - 1] = x[n - 1] - y;
    }
}
#else
SPAN_DECLARE(void) vec_scalar_subf(float z[], const float x[], float y, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i] - y;
}
#endif
/*- End of function --------------------------------------------------------*/

SPAN_DECLARE(void) vec_scalar_sub(double z[], const double x[], double y, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i] - y;
}
/*- End of function --------------------------------------------------------*/

#if defined(HAVE_LONG_DOUBLE)
SPAN_DECLARE(void) vec_scalar_subl(long double z[], const long double x[], long double y, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i] - y;
}
/*- End of function --------------------------------------------------------*/
#endif

#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(void) vec_mulf(float z[], const float x[], const float y[], int n)
{
    int i;
    __m128 n1;
    __m128 n2;
    __m128 n3;

    if ((i = n & ~3))
    {
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n2 = _mm_loadu_ps(y + i);
            n3 = _mm_mul_ps(n1, n2);
            _mm_storeu_ps(z + i, n3);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3]*y[n - 3];
    case 2:
        z[n - 2] = x[n - 2]*y[n - 2];
    case 1:
        z[n - 1] = x[n - 1]*y[n - 1];
    }
}
#else
SPAN_DECLARE(void) vec_mulf(float z[], const float x[], const float y[], int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i]*y[i];
}
/*- End of function --------------------------------------------------------*/
#endif

SPAN_DECLARE(void) vec_mul(double z[], const double x[], const double y[], int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i]*y[i];
}
/*- End of function --------------------------------------------------------*/

#if defined(HAVE_LONG_DOUBLE)
SPAN_DECLARE(void) vec_mull(long double z[], const long double x[], const long double y[], int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i]*y[i];
}
/*- End of function --------------------------------------------------------*/
#endif

#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(float) vec_dot_prodf(const float x[], const float y[], int n)
{
    int i;
    float z;
    __m128 n1;
    __m128 n2;
    __m128 n3;
    __m128 n4;

    z = 0.0f;
    if ((i = n & ~3))
    {
        n4 = _mm_setzero_ps();  //sets sum to zero
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n2 = _mm_loadu_ps(y + i);
            n3 = _mm_mul_ps(n1, n2);
            n4 = _mm_add_ps(n4, n3);
        }
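        /* Fold the four partial sums in n4 horizontally into a single float */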
        n4 = _mm_add_ps(_mm_movehl_ps(n4, n4), n4);
        n4 = _mm_add_ss(_mm_shuffle_ps(n4, n4, 1), n4);
        _mm_store_ss(&z, n4);
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z += x[n - 3]*y[n - 3];
    case 2:
        z += x[n - 2]*y[n - 2];
    case 1:
        z += x[n - 1]*y[n - 1];
    }
    return z;
}
#else
SPAN_DECLARE(float) vec_dot_prodf(const float x[], const float y[], int n)
{
    int i;
    float z;

    z = 0.0f;
    for (i = 0; i < n; i++)
        z += x[i]*y[i];
    return z;
}
/*- End of function --------------------------------------------------------*/
#endif

SPAN_DECLARE(double) vec_dot_prod(const double x[], const double y[], int n)
{
    int i;
    double z;

    z = 0.0;
    for (i = 0; i < n; i++)
        z += x[i]*y[i];
    return z;
}
/*- End of function --------------------------------------------------------*/

#if defined(HAVE_LONG_DOUBLE)
SPAN_DECLARE(long double) vec_dot_prodl(const long double x[], const long double y[], int n)
{
    int i;
    long double z;

    z = 0.0L;
    for (i = 0; i < n; i++)
        z += x[i]*y[i];
    return z;
}
/*- End of function --------------------------------------------------------*/
#endif

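/* Dot product where x is treated as a circular buffer starting at offset pos:
   x[pos..n-1] pairs with y[0..n-pos-1], and the wrapped x[0..pos-1] pairs
   with y[n-pos..n-1]. */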
SPAN_DECLARE(float) vec_circular_dot_prodf(const float x[], const float y[], int n, int pos)
{
    float z;

    z = vec_dot_prodf(&x[pos], &y[0], n - pos);
    z += vec_dot_prodf(&x[0], &y[n - pos], pos);
    return z;
}
/*- End of function --------------------------------------------------------*/

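/* Leaky LMS tap update: each coefficient in y is scaled by LMS_LEAK_RATE and
   then adjusted by the corresponding input sample weighted by the error term,
   i.e. y[i] = y[i]*LMS_LEAK_RATE + x[i]*error. */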
#define LMS_LEAK_RATE 0.9999f

#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(void) vec_lmsf(const float x[], float y[], int n, float error)
{
    int i;
    __m128 n1;
    __m128 n2;
    __m128 n3;
    __m128 n4;

    if ((i = n & ~3))
    {
        n3 = _mm_set1_ps(error);
        n4 = _mm_set1_ps(LMS_LEAK_RATE);
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n2 = _mm_loadu_ps(y + i);
            n1 = _mm_mul_ps(n1, n3);
            n2 = _mm_mul_ps(n2, n4);
            n1 = _mm_add_ps(n1, n2);
            _mm_storeu_ps(y + i, n1);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        y[n - 3] = y[n - 3]*LMS_LEAK_RATE + x[n - 3]*error;
    case 2:
        y[n - 2] = y[n - 2]*LMS_LEAK_RATE + x[n - 2]*error;
    case 1:
        y[n - 1] = y[n - 1]*LMS_LEAK_RATE + x[n - 1]*error;
    }
}
#else
SPAN_DECLARE(void) vec_lmsf(const float x[], float y[], int n, float error)
{
    int i;

    for (i = 0; i < n; i++)
    {
        /* Leak a little to tame uncontrolled wandering */
        y[i] = y[i]*LMS_LEAK_RATE + x[i]*error;
    }
}
#endif
/*- End of function --------------------------------------------------------*/

SPAN_DECLARE(void) vec_circular_lmsf(const float x[], float y[], int n, int pos, float error)
{
    vec_lmsf(&x[pos], &y[0], n - pos, error);
    vec_lmsf(&x[0], &y[n - pos], pos, error);
}
/*- End of function --------------------------------------------------------*/
/*- End of file ------------------------------------------------------------*/