Mercurial > hg > audiostuff
diff spandsp-0.0.6pre17/src/vector_float.c @ 4:26cd8f1ef0b1
import spandsp-0.0.6pre17
author | Peter Meerwald <pmeerw@cosy.sbg.ac.at> |
---|---|
date | Fri, 25 Jun 2010 15:50:58 +0200 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spandsp-0.0.6pre17/src/vector_float.c Fri Jun 25 15:50:58 2010 +0200 @@ -0,0 +1,920 @@ +/* + * SpanDSP - a series of DSP components for telephony + * + * vector_float.c - Floating vector arithmetic routines. + * + * Written by Steve Underwood <steveu@coppice.org> + * + * Copyright (C) 2006 Steve Underwood + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * $Id: vector_float.c,v 1.22 2009/07/12 09:23:09 steveu Exp $ + */ + +/*! \file */ + +#if defined(HAVE_CONFIG_H) +#include "config.h" +#endif + +#include <inttypes.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#if defined(HAVE_TGMATH_H) +#include <tgmath.h> +#endif +#if defined(HAVE_MATH_H) +#include <math.h> +#endif +#include <assert.h> + +#include "floating_fudge.h" +#include "mmx_sse_decs.h" + +#include "spandsp/telephony.h" +#include "spandsp/vector_float.h" + +#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2) +SPAN_DECLARE(void) vec_copyf(float z[], const float x[], int n) +{ + int i; + __m128 n1; + + if ((i = n & ~3)) + { + for (i -= 4; i >= 0; i -= 4) + { + n1 = _mm_loadu_ps(x + i); + _mm_storeu_ps(z + i, n1); + } + } + /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */ + switch (n & 3) + { + case 3: + z[n - 3] = x[n - 3]; + case 2: + z[n - 2] = x[n - 2]; + case 1: + z[n - 1] = x[n - 1]; + } +} +#else +SPAN_DECLARE(void) vec_copyf(float z[], const float x[], int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i]; +} +#endif +/*- End of function --------------------------------------------------------*/ + +SPAN_DECLARE(void) vec_copy(double z[], const double x[], int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i]; +} +/*- End of function --------------------------------------------------------*/ + +#if defined(HAVE_LONG_DOUBLE) +SPAN_DECLARE(void) vec_copyl(long double z[], const long double x[], int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i]; +} +/*- End of function --------------------------------------------------------*/ +#endif + +#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2) +SPAN_DECLARE(void) vec_negatef(float z[], const float x[], int n) +{ + int i; + static const uint32_t mask = 0x80000000; + static const float *fmask = (float *) &mask; + __m128 n1; + __m128 n2; + + if ((i = n & ~3)) + { + n2 = _mm_set1_ps(*fmask); + for (i -= 4; i >= 0; i -= 4) + { + n1 = _mm_loadu_ps(x + i); + n1 = _mm_xor_ps(n1, n2); + _mm_storeu_ps(z + i, n1); + } + } + /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */ + switch (n & 3) + { + case 3: + z[n - 3] = -x[n - 3]; + case 2: + z[n - 2] = -x[n - 2]; + case 1: + z[n - 1] = -x[n - 1]; + } +} +#else +SPAN_DECLARE(void) vec_negatef(float z[], const float x[], int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = -x[i]; +} +#endif +/*- End of function --------------------------------------------------------*/ + +SPAN_DECLARE(void) vec_negate(double z[], const double x[], int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = -x[i]; +} +/*- End of function --------------------------------------------------------*/ + +#if defined(HAVE_LONG_DOUBLE) +SPAN_DECLARE(void) vec_negatel(long double z[], const long double x[], int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = -x[i]; +} +/*- End of function --------------------------------------------------------*/ +#endif + +#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2) +SPAN_DECLARE(void) vec_zerof(float z[], int n) +{ + int i; + __m128 n1; + + if ((i = n & ~3)) + { + n1 = _mm_setzero_ps(); + for (i -= 4; i >= 0; i -= 4) + _mm_storeu_ps(z + i, n1); + } + /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */ + switch (n & 3) + { + case 3: + z[n - 3] = 0; + case 2: + z[n - 2] = 0; + case 1: + z[n - 1] = 0; + } +} +#else +SPAN_DECLARE(void) vec_zerof(float z[], int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = 0.0f; +} +#endif +/*- End of function --------------------------------------------------------*/ + +SPAN_DECLARE(void) vec_zero(double z[], int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = 0.0; +} +/*- End of function --------------------------------------------------------*/ + +#if defined(HAVE_LONG_DOUBLE) +SPAN_DECLARE(void) vec_zerol(long double z[], int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = 0.0L; +} +/*- End of function --------------------------------------------------------*/ +#endif + +#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2) +SPAN_DECLARE(void) vec_setf(float z[], float x, int n) +{ + int i; + __m128 n1; + + if ((i = n & ~3)) + { + n1 = _mm_set1_ps(x); + for (i -= 4; i >= 0; i -= 4) + _mm_storeu_ps(z + i, n1); + } + /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */ + switch (n & 3) + { + case 3: + z[n - 3] = x; + case 2: + z[n - 2] = x; + case 1: + z[n - 1] = x; + } +} +#else +SPAN_DECLARE(void) vec_setf(float z[], float x, int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x; +} +#endif +/*- End of function --------------------------------------------------------*/ + +SPAN_DECLARE(void) vec_set(double z[], double x, int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x; +} +/*- End of function --------------------------------------------------------*/ + +#if defined(HAVE_LONG_DOUBLE) +SPAN_DECLARE(void) vec_setl(long double z[], long double x, int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x; +} +/*- End of function --------------------------------------------------------*/ +#endif + +#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2) +SPAN_DECLARE(void) vec_addf(float z[], const float x[], const float y[], int n) +{ + int i; + __m128 n1; + __m128 n2; + + if ((i = n & ~3)) + { + for (i -= 4; i >= 0; i -= 4) + { + n1 = _mm_loadu_ps(x + i); + n2 = _mm_loadu_ps(y + i); + n2 = _mm_add_ps(n1, n2); + _mm_storeu_ps(z + i, n2); + } + } + /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */ + switch (n & 3) + { + case 3: + z[n - 3] = x[n - 3] + y[n - 3]; + case 2: + z[n - 2] = x[n - 2] + y[n - 2]; + case 1: + z[n - 1] = x[n - 1] + y[n - 1]; + } +} +#else +SPAN_DECLARE(void) vec_addf(float z[], const float x[], const float y[], int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i] + y[i]; +} +#endif +/*- End of function --------------------------------------------------------*/ + +SPAN_DECLARE(void) vec_add(double z[], const double x[], const double y[], int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i] + y[i]; +} +/*- End of function --------------------------------------------------------*/ + +#if defined(HAVE_LONG_DOUBLE) +SPAN_DECLARE(void) vec_addl(long double z[], const long double x[], const long double y[], int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i] + y[i]; +} +/*- End of function --------------------------------------------------------*/ +#endif + +#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2) +SPAN_DECLARE(void) vec_scaledxy_addf(float z[], const float x[], float x_scale, const float y[], float y_scale, int n) +{ + int i; + __m128 n1; + __m128 n2; + __m128 n3; + __m128 n4; + + if ((i = n & ~3)) + { + n3 = _mm_set1_ps(x_scale); + n4 = _mm_set1_ps(y_scale); + for (i -= 4; i >= 0; i -= 4) + { + n1 = _mm_loadu_ps(x + i); + n2 = _mm_loadu_ps(y + i); + n1 = _mm_mul_ps(n1, n3); + n2 = _mm_mul_ps(n2, n4); + n2 = _mm_add_ps(n1, n2); + _mm_storeu_ps(z + i, n2); + } + } + /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */ + switch (n & 3) + { + case 3: + z[n - 3] = x[n - 3]*x_scale + y[n - 3]*y_scale; + case 2: + z[n - 2] = x[n - 2]*x_scale + y[n - 2]*y_scale; + case 1: + z[n - 1] = x[n - 1]*x_scale + y[n - 1]*y_scale; + } +} +#else +SPAN_DECLARE(void) vec_scaledxy_addf(float z[], const float x[], float x_scale, const float y[], float y_scale, int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i]*x_scale + y[i]*y_scale; +} +#endif +/*- End of function --------------------------------------------------------*/ + +SPAN_DECLARE(void) vec_scaledxy_add(double z[], const double x[], double x_scale, const double y[], double y_scale, int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i]*x_scale + y[i]*y_scale; +} +/*- End of function --------------------------------------------------------*/ + +#if defined(HAVE_LONG_DOUBLE) +SPAN_DECLARE(void) vec_scaledxy_addl(long double z[], const long double x[], long double x_scale, const long double y[], long double y_scale, int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i]*x_scale + y[i]*y_scale; +} +/*- End of function --------------------------------------------------------*/ +#endif + +#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2) +SPAN_DECLARE(void) vec_scaledy_addf(float z[], const float x[], const float y[], float y_scale, int n) +{ + int i; + __m128 n1; + __m128 n2; + __m128 n3; + + if ((i = n & ~3)) + { + n3 = _mm_set1_ps(y_scale); + for (i -= 4; i >= 0; i -= 4) + { + n1 = _mm_loadu_ps(x + i); + n2 = _mm_loadu_ps(y + i); + n2 = _mm_mul_ps(n2, n3); + n2 = _mm_add_ps(n1, n2); + _mm_storeu_ps(z + i, n2); + } + } + /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */ + switch (n & 3) + { + case 3: + z[n - 3] = x[n - 3] + y[n - 3]*y_scale; + case 2: + z[n - 2] = x[n - 2] + y[n - 2]*y_scale; + case 1: + z[n - 1] = x[n - 1] + y[n - 1]*y_scale; + } +} +#else +SPAN_DECLARE(void) vec_scaledy_addf(float z[], const float x[], const float y[], float y_scale, int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i] + y[i]*y_scale; +} +#endif +/*- End of function --------------------------------------------------------*/ + +SPAN_DECLARE(void) vec_scaledy_add(double z[], const double x[], const double y[], double y_scale, int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i] + y[i]*y_scale; +} +/*- End of function --------------------------------------------------------*/ + +#if defined(HAVE_LONG_DOUBLE) +SPAN_DECLARE(void) vec_scaledy_addl(long double z[], const long double x[], const long double y[], long double y_scale, int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i] + y[i]*y_scale; +} +/*- End of function --------------------------------------------------------*/ +#endif + +#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2) +SPAN_DECLARE(void) vec_subf(float z[], const float x[], const float y[], int n) +{ + int i; + __m128 n1; + __m128 n2; + + if ((i = n & ~3)) + { + for (i -= 4; i >= 0; i -= 4) + { + n1 = _mm_loadu_ps(x + i); + n2 = _mm_loadu_ps(y + i); + n2 = _mm_sub_ps(n1, n2); + _mm_storeu_ps(z + i, n2); + } + } + /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */ + switch (n & 3) + { + case 3: + z[n - 3] = x[n - 3] - y[n - 3]; + case 2: + z[n - 2] = x[n - 2] - y[n - 2]; + case 1: + z[n - 1] = x[n - 1] - y[n - 1]; + } +} +#else +SPAN_DECLARE(void) vec_subf(float z[], const float x[], const float y[], int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i] - y[i]; +} +#endif +/*- End of function --------------------------------------------------------*/ + +SPAN_DECLARE(void) vec_sub(double z[], const double x[], const double y[], int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i] - y[i]; +} +/*- End of function --------------------------------------------------------*/ + +#if defined(HAVE_LONG_DOUBLE) +SPAN_DECLARE(void) vec_subl(long double z[], const long double x[], const long double y[], int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i] - y[i]; +} +/*- End of function --------------------------------------------------------*/ +#endif + +SPAN_DECLARE(void) vec_scaledxy_subf(float z[], const float x[], float x_scale, const float y[], float y_scale, int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i]*x_scale - y[i]*y_scale; +} +/*- End of function --------------------------------------------------------*/ + +SPAN_DECLARE(void) vec_scaledxy_sub(double z[], const double x[], double x_scale, const double y[], double y_scale, int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i]*x_scale - y[i]*y_scale; +} +/*- End of function --------------------------------------------------------*/ + +#if defined(HAVE_LONG_DOUBLE) +SPAN_DECLARE(void) vec_scaledxy_subl(long double z[], const long double x[], long double x_scale, const long double y[], long double y_scale, int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i]*x_scale - y[i]*y_scale; +} +/*- End of function --------------------------------------------------------*/ +#endif + +#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2) +SPAN_DECLARE(void) vec_scalar_mulf(float z[], const float x[], float y, int n) +{ + int i; + __m128 n1; + __m128 n2; + + if ((i = n & ~3)) + { + n2 = _mm_set1_ps(y); + for (i -= 4; i >= 0; i -= 4) + { + n1 = _mm_loadu_ps(x + i); + n1 = _mm_mul_ps(n1, n2); + _mm_storeu_ps(z + i, n1); + } + } + /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */ + switch (n & 3) + { + case 3: + z[n - 3] = x[n - 3]*y; + case 2: + z[n - 2] = x[n - 2]*y; + case 1: + z[n - 1] = x[n - 1]*y; + } +} +#else +SPAN_DECLARE(void) vec_scalar_mulf(float z[], const float x[], float y, int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i]*y; +} +#endif +/*- End of function --------------------------------------------------------*/ + +SPAN_DECLARE(void) vec_scalar_mul(double z[], const double x[], double y, int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i]*y; +} +/*- End of function --------------------------------------------------------*/ + +#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2) +SPAN_DECLARE(void) vec_scalar_addf(float z[], const float x[], float y, int n) +{ + int i; + __m128 n1; + __m128 n2; + + if ((i = n & ~3)) + { + n2 = _mm_set1_ps(y); + for (i -= 4; i >= 0; i -= 4) + { + n1 = _mm_loadu_ps(x + i); + n1 = _mm_add_ps(n1, n2); + _mm_storeu_ps(z + i, n1); + } + } + /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */ + switch (n & 3) + { + case 3: + z[n - 3] = x[n - 3] + y; + case 2: + z[n - 2] = x[n - 2] + y; + case 1: + z[n - 1] = x[n - 1] + y; + } +} +#else +SPAN_DECLARE(void) vec_scalar_addf(float z[], const float x[], float y, int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i] + y; +} +#endif +/*- End of function --------------------------------------------------------*/ + +SPAN_DECLARE(void) vec_scalar_add(double z[], const double x[], double y, int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i] + y; +} +/*- End of function --------------------------------------------------------*/ + +#if defined(HAVE_LONG_DOUBLE) +SPAN_DECLARE(void) vec_scalar_addl(long double z[], const long double x[], long double y, int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i] + y; +} +/*- End of function --------------------------------------------------------*/ +#endif + +#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2) +SPAN_DECLARE(void) vec_scalar_subf(float z[], const float x[], float y, int n) +{ + int i; + __m128 n1; + __m128 n2; + + if ((i = n & ~3)) + { + n2 = _mm_set1_ps(y); + for (i -= 4; i >= 0; i -= 4) + { + n1 = _mm_loadu_ps(x + i); + n1 = _mm_sub_ps(n1, n2); + _mm_storeu_ps(z + i, n1); + } + } + /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */ + switch (n & 3) + { + case 3: + z[n - 3] = x[n - 3] - y; + case 2: + z[n - 2] = x[n - 2] - y; + case 1: + z[n - 1] = x[n - 1] - y; + } +} +#else +SPAN_DECLARE(void) vec_scalar_subf(float z[], const float x[], float y, int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i] - y; +} +#endif +/*- End of function --------------------------------------------------------*/ + +SPAN_DECLARE(void) vec_scalar_sub(double z[], const double x[], double y, int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i] - y; +} +/*- End of function --------------------------------------------------------*/ + +#if defined(HAVE_LONG_DOUBLE) +SPAN_DECLARE(void) vec_scalar_subl(long double z[], const long double x[], long double y, int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i] - y; +} +/*- End of function --------------------------------------------------------*/ +#endif + +#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2) +SPAN_DECLARE(void) vec_mulf(float z[], const float x[], const float y[], int n) +{ + int i; + __m128 n1; + __m128 n2; + __m128 n3; + + if ((i = n & ~3)) + { + for (i -= 4; i >= 0; i -= 4) + { + n1 = _mm_loadu_ps(x + i); + n2 = _mm_loadu_ps(y + i); + n3 = _mm_mul_ps(n1, n2); + _mm_storeu_ps(z + i, n3); + } + } + /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */ + switch (n & 3) + { + case 3: + z[n - 3] = x[n - 3]*y[n - 3]; + case 2: + z[n - 2] = x[n - 2]*y[n - 2]; + case 1: + z[n - 1] = x[n - 1]*y[n - 1]; + } +} +#else +SPAN_DECLARE(void) vec_mulf(float z[], const float x[], const float y[], int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i]*y[i]; +} +/*- End of function --------------------------------------------------------*/ +#endif + +SPAN_DECLARE(void) vec_mul(double z[], const double x[], const double y[], int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i]*y[i]; +} +/*- End of function --------------------------------------------------------*/ + +#if defined(HAVE_LONG_DOUBLE) +SPAN_DECLARE(void) vec_mull(long double z[], const long double x[], const long double y[], int n) +{ + int i; + + for (i = 0; i < n; i++) + z[i] = x[i]*y[i]; +} +/*- End of function --------------------------------------------------------*/ +#endif + +#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2) +SPAN_DECLARE(float) vec_dot_prodf(const float x[], const float y[], int n) +{ + int i; + float z; + __m128 n1; + __m128 n2; + __m128 n3; + __m128 n4; + + z = 0.0f; + if ((i = n & ~3)) + { + n4 = _mm_setzero_ps(); //sets sum to zero + for (i -= 4; i >= 0; i -= 4) + { + n1 = _mm_loadu_ps(x + i); + n2 = _mm_loadu_ps(y + i); + n3 = _mm_mul_ps(n1, n2); + n4 = _mm_add_ps(n4, n3); + } + n4 = _mm_add_ps(_mm_movehl_ps(n4, n4), n4); + n4 = _mm_add_ss(_mm_shuffle_ps(n4, n4, 1), n4); + _mm_store_ss(&z, n4); + } + /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */ + switch (n & 3) + { + case 3: + z += x[n - 3]*y[n - 3]; + case 2: + z += x[n - 2]*y[n - 2]; + case 1: + z += x[n - 1]*y[n - 1]; + } + return z; +} +#else +SPAN_DECLARE(float) vec_dot_prodf(const float x[], const float y[], int n) +{ + int i; + float z; + + z = 0.0f; + for (i = 0; i < n; i++) + z += x[i]*y[i]; + return z; +} +/*- End of function --------------------------------------------------------*/ +#endif + +SPAN_DECLARE(double) vec_dot_prod(const double x[], const double y[], int n) +{ + int i; + double z; + + z = 0.0; + for (i = 0; i < n; i++) + z += x[i]*y[i]; + return z; +} +/*- End of function --------------------------------------------------------*/ + +#if defined(HAVE_LONG_DOUBLE) +SPAN_DECLARE(long double) vec_dot_prodl(const long double x[], const long double y[], int n) +{ + int i; + long double z; + + z = 0.0L; + for (i = 0; i < n; i++) + z += x[i]*y[i]; + return z; +} +/*- End of function --------------------------------------------------------*/ +#endif + +SPAN_DECLARE(float) vec_circular_dot_prodf(const float x[], const float y[], int n, int pos) +{ + float z; + + z = vec_dot_prodf(&x[pos], &y[0], n - pos); + z += vec_dot_prodf(&x[0], &y[n - pos], pos); + return z; +} +/*- End of function --------------------------------------------------------*/ + +#define LMS_LEAK_RATE 0.9999f + +#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2) +SPAN_DECLARE(void) vec_lmsf(const float x[], float y[], int n, float error) +{ + int i; + __m128 n1; + __m128 n2; + __m128 n3; + __m128 n4; + + if ((i = n & ~3)) + { + n3 = _mm_set1_ps(error); + n4 = _mm_set1_ps(LMS_LEAK_RATE); + for (i -= 4; i >= 0; i -= 4) + { + n1 = _mm_loadu_ps(x + i); + n2 = _mm_loadu_ps(y + i); + n1 = _mm_mul_ps(n1, n3); + n2 = _mm_mul_ps(n2, n4); + n1 = _mm_add_ps(n1, n2); + _mm_storeu_ps(y + i, n1); + } + } + /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */ + switch (n & 3) + { + case 3: + y[n - 3] = y[n - 3]*LMS_LEAK_RATE + x[n - 3]*error; + case 2: + y[n - 2] = y[n - 2]*LMS_LEAK_RATE + x[n - 2]*error; + case 1: + y[n - 1] = y[n - 1]*LMS_LEAK_RATE + x[n - 1]*error; + } +} +#else +SPAN_DECLARE(void) vec_lmsf(const float x[], float y[], int n, float error) +{ + int i; + + for (i = 0; i < n; i++) + { + /* Leak a little to tame uncontrolled wandering */ + y[i] = y[i]*LMS_LEAK_RATE + x[i]*error; + } +} +#endif +/*- End of function --------------------------------------------------------*/ + +SPAN_DECLARE(void) vec_circular_lmsf(const float x[], float y[], int n, int pos, float error) +{ + vec_lmsf(&x[pos], &y[0], n - pos, error); + vec_lmsf(&x[0], &y[n - pos], pos, error); +} +/*- End of function --------------------------------------------------------*/ +/*- End of file ------------------------------------------------------------*/