Mercurial > hg > audiostuff

diff spandsp-0.0.3/spandsp-0.0.3/src/lpc10_voicing.c @ 5:f762bf195c4b
import spandsp-0.0.3
author: Peter Meerwald <pmeerw@cosy.sbg.ac.at>
date: Fri, 25 Jun 2010 16:00:21 +0200
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/spandsp-0.0.3/spandsp-0.0.3/src/lpc10_voicing.c	Fri Jun 25 16:00:21 2010 +0200
@@ -0,0 +1,487 @@
+/*
+ * SpanDSP - a series of DSP components for telephony
+ *
+ * lpc10_voicing.c - LPC10 low bit rate speech codec.
+ *
+ * Written by Steve Underwood <steveu@coppice.org>
+ *
+ * Copyright (C) 2006 Steve Underwood
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * This code is based on the U.S. Department of Defense reference
+ * implementation of the LPC-10 2400 bps Voice Coder. They do not
+ * exert copyright claims on their code, and it may be freely used.
+ *
+ * $Id: lpc10_voicing.c,v 1.7 2006/11/30 15:41:47 steveu Exp $
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include <memory.h>
+#if defined(HAVE_TGMATH_H)
+#include <tgmath.h>
+#endif
+#if defined(HAVE_MATH_H)
+#include <math.h>
+#endif
+
+#include "spandsp/telephony.h"
+#include "spandsp/dc_restore.h"
+#include "spandsp/lpc10.h"
+
+#include "lpc10_encdecs.h"
+
+static void vparms(int32_t vwin[],
+                    float *inbuf,
+                    float *lpbuf,
+                    const int32_t buflim[],
+                    int32_t half,
+                    float *dither,
+                    int32_t *mintau,
+                    int32_t *zc, 
+                    int32_t *lbe,
+                    int32_t *fbe,
+                    float *qs,
+                    float *rc1,
+                    float *ar_b,
+                    float *ar_f)
+{
+    int32_t inbuf_offset;
+    int32_t lpbuf_offset;
+    int32_t vlen;
+    int32_t stop;
+    int32_t i;
+    int32_t start;
+    float r1;
+    float r2;
+    float e_pre;
+    float ap_rms;
+    float e_0;
+    float oldsgn;
+    float lp_rms;
+    float e_b;
+    float e_f;
+    float r_b;
+    float r_f;
+    float e0ap;
+
+    /* Calculate zero crossings (ZC) and several energy and correlation */
+    /* measures on low band and full band speech.  Each measure is taken */
+    /* over either the first or the second half of the voicing window, */
+    /* depending on the variable HALF. */
+    lpbuf_offset = buflim[2];
+    lpbuf -= lpbuf_offset;
+    inbuf_offset = buflim[0];
+    inbuf -= inbuf_offset;
+
+    lp_rms = 0.0f;
+    ap_rms = 0.0f;
+    e_pre = 0.0f;
+    e0ap = 0.0f;
+    *rc1 = 0.0f;
+    e_0 = 0.0f;
+    e_b = 0.0f;
+    e_f = 0.0f;
+    r_f = 0.0f;
+    r_b = 0.0f;
+    *zc = 0;
+    vlen = vwin[1] - vwin[0] + 1;
+    start = vwin[0] + half*vlen/2 + 1;
+    stop = start + vlen/2 - 1;
+
+    /* I'll use the symbol HVL in the table below to represent the value */
+    /* VLEN/2.  Note that if VLEN is odd, then HVL should be rounded down, */
+    /* i.e., HVL = (VLEN-1)/2. */
+
+    /* HALF  START          STOP */
+
+    /* 1     VWIN(1)+1      VWIN(1)+HVL */
+    /* 2     VWIN(1)+HVL+1  VWIN(1)+2*HVL */
+    oldsgn = r_sign(1.0f, inbuf[start - 1] - *dither);
+    for (i = start;  i <= stop;  i++)
+    {
+        lp_rms += fabsf(lpbuf[i]);
+        ap_rms += fabsf(inbuf[i]);
+        e_pre += fabsf(inbuf[i] - inbuf[i - 1]);
+        r1 = inbuf[i];
+        e0ap += r1*r1;
+        *rc1 += inbuf[i]*inbuf[i - 1];
+        r1 = lpbuf[i];
+        e_0 += r1*r1;
+        r1 = lpbuf[i - *mintau];
+        e_b += r1*r1;
+        r1 = lpbuf[i + *mintau];
+        e_f += r1*r1;
+        r_f += lpbuf[i]*lpbuf[i + *mintau];
+        r_b += lpbuf[i]*lpbuf[i - *mintau];
+        r1 = inbuf[i] + *dither;
+        if (r_sign(1.0f, r1) != oldsgn)
+        {
+            ++(*zc);
+            oldsgn = -oldsgn;
+        }
+        *dither = -(*dither);
+    }
+    /* Normalized short-term autocovariance coefficient at unit sample delay */
+    *rc1 /= max(e0ap, 1.0f);
+    /* Ratio of the energy of the first difference signal (6 dB/oct preemphasis)*/
+    /* to the energy of the full band signal */
+    /* Computing MAX */
+    r1 = ap_rms*2.0f;
+    *qs = e_pre/max(r1, 1.0f);
+    /* aR_b is the product of the forward and reverse prediction gains, */
+    /* looking backward in time (the causal case). */
+    *ar_b = r_b/max(e_b, 1.0f)*(r_b/max(e_0, 1.0f));
+    /* aR_f is the same as aR_b, but looking forward in time (non causal case).*/
+    *ar_f = r_f/max(e_f, 1.0f)*(r_f/max(e_0, 1.0f));
+    /* Normalize ZC, LBE, and FBE to old fixed window length of 180. */
+    /* (The fraction 90/VLEN has a range of 0.58 to 1) */
+    r2 = (float) (*zc << 1);
+    *zc = lrintf(r2*(90.0f/vlen));
+    r1 = lp_rms/4*(90.0f/vlen);
+    *lbe = min(lrintf(r1), 32767);
+    r1 = ap_rms/4*(90.0f/vlen);
+    *fbe = min(lrintf(r1), 32767);
+}
+/*- End of function --------------------------------------------------------*/
+
+/* Voicing detection makes voicing decisions for each half */
+/* frame of input speech.  Tentative voicing decisions are made two frames*/
+/* in the future (2F) for each half frame.  These decisions are carried */
+/* through one frame in the future (1F) to the present (P) frame where */
+/* they are examined and smoothed, resulting in the final voicing */
+/* decisions for each half frame. */
+
+/* The voicing parameter (signal measurement) column vector (VALUE) */
+/* is based on a rectangular window of speech samples determined by the */
+/* window placement algorithm.  The voicing parameter vector contains the*/
+/* AMDF windowed maximum-to-minimum ratio, the zero crossing rate, energy*/
+/* measures, reflection coefficients, and prediction gains.  The voicing */
+/* window is placed to avoid contamination of the voicing parameter vector*/
+/* with speech onsets. */
+
+/* The input signal is then classified as unvoiced (including */
+/* silence) or voiced.  This decision is made by a linear discriminant */
+/* function consisting of a dot product of the voicing decision */
+/* coefficient (VDC) row vector with the measurement column vector */
+/* (VALUE).  The VDC vector is 2-dimensional, each row vector is optimized*/
+/* for a particular signal-to-noise ratio (SNR).  So, before the dot */
+/* product is performed, the SNR is estimated to select the appropriate */
+/* VDC vector. */
+
+/* The smoothing algorithm is a modified median smoother.  The */
+/* voicing discriminant function is used by the smoother to determine how*/
+/* strongly voiced or unvoiced a signal is.  The smoothing is further */
+/* modified if a speech onset and a voicing decision transition occur */
+/* within one half frame.  In this case, the voicing decision transition */
+/* is extended to the speech onset.  For transmission purposes, there are*/
+/* constraints on the duration and transition of voicing decisions.  The */
+/* smoother takes these constraints into account. */
+
+/* Finally, the energy estimates are updated along with the dither */
+/* threshold used to calculate the zero crossing rate (ZC). */
+
+void lpc10_voicing(lpc10_encode_state_t *s,
+                   int32_t vwin[],
+                   float *inbuf,
+                   float *lpbuf,
+                   const int32_t buflim[],
+                   int32_t half,
+                   float *minamd,
+                   float *maxamd, 
+                   int32_t *mintau,
+                   float ivrc[],
+                   int32_t obound[])
+{
+    static const float vdc[100] =
+    {
+        0.0f, 1714.0f, -110.0f, 334.0f, -4096.0f,  -654.0f, 3752.0f, 3769.0f, 0.0f,  1181.0f,
+        0.0f,  874.0f,  -97.0f, 300.0f, -4096.0f, -1021.0f, 2451.0f, 2527.0f, 0.0f,  -500.0f,
+        0.0f,  510.0f,  -70.0f, 250.0f, -4096.0f, -1270.0f, 2194.0f, 2491.0f, 0.0f, -1500.0f,
+        0.0f,  500.0f,  -10.0f, 200.0f, -4096.0f, -1300.0f,  2.0e3f,  2.0e3f, 0.0f,  -2.0e3f,
+        0.0f,  500.0f,    0.0f,   0.0f, -4096.0f, -1300.0f,  2.0e3f,  2.0e3f, 0.0f, -2500.0f,
+        0.0f,    0.0f,    0.0f,   0.0f,     0.0f,     0.0f,    0.0f,    0.0f, 0.0f,     0.0f,
+        0.0f,    0.0f,    0.0f,   0.0f,     0.0f,     0.0f,    0.0f,    0.0f, 0.0f,     0.0f,
+        0.0f,    0.0f,    0.0f,   0.0f,     0.0f,     0.0f,    0.0f,    0.0f, 0.0f,     0.0f,
+        0.0f,    0.0f,    0.0f,   0.0f,     0.0f,     0.0f,    0.0f,    0.0f, 0.0f,     0.0f,
+        0.0f,    0.0f,    0.0f,   0.0f,     0.0f,     0.0f,    0.0f,    0.0f, 0.0f,     0.0f
+    };
+    static const int nvdcl = 5;
+    static const float vdcl[10] =
+    {
+        600.0f, 450.0f, 300.0f, 200.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
+    };
+
+    int32_t inbuf_offset;
+    int32_t lpbuf_offset;
+    int32_t i1;
+    float r1;
+    float r2;
+    float ar_b;
+    float ar_f;
+    int32_t snrl;
+    int32_t i;
+    float value[9];
+    int32_t zc;
+    int ot;
+    float qs;
+    int32_t vstate;
+    float rc1;
+    int32_t fbe;
+    int32_t lbe;
+    float snr2;
+
+    inbuf_offset = 0;
+    lpbuf_offset = 0;
+    if (inbuf)
+    {
+        inbuf_offset = buflim[0];
+        inbuf -= inbuf_offset;
+    }
+    if (lpbuf)
+    {
+        lpbuf_offset = buflim[2];
+        lpbuf -= lpbuf_offset;
+    }
+
+    /* Voicing Decision Parameter vector (* denotes zero coefficient): */
+
+    /*     * MAXMIN */
+    /*       LBE/LBVE */
+    /*       ZC */
+    /*       RC1 */
+    /*       QS */
+    /*       IVRC2 */
+    /*       aR_B */
+    /*       aR_F */
+    /*     * LOG(LBE/LBVE) */
+    /* Define 2-D voicing decision coefficient vector according to the voicing */
+    /* parameter order above.  Each row (VDC vector) is optimized for a specific */
+    /*   SNR.  The last element of the vector is the constant. */
+    /*              E    ZC    RC1    Qs   IVRC2  aRb   aRf        c */
+
+    /* The VOICE array contains the result of the linear discriminant function*/
+    /* (analog values).  The VOIBUF array contains the hard-limited binary */
+    /* voicing decisions.  The VOICE and VOIBUF arrays, according to FORTRAN */
+    /* memory allocation, are addressed as: */
+
+    /*        (half-frame number, future-frame number) */
+
+    /*        |   Past    |  Present  |  Future1  |  Future2  | */
+    /*        | 1,0 | 2,0 | 1,1 | 2,1 | 1,2 | 2,2 | 1,3 | 2,3 |  --->  time */
+
+    /* Update linear discriminant function history each frame: */
+    if (half == 0)
+    {
+        s->voice[0][0] = s->voice[1][0];
+        s->voice[0][1] = s->voice[1][1];
+        s->voice[1][0] = s->voice[2][0];
+        s->voice[1][1] = s->voice[2][1];
+        s->maxmin = *maxamd / max(*minamd, 1.0f);
+    }
+    /* Calculate voicing parameters twice per frame */
+    vparms(vwin,
+           &inbuf[inbuf_offset],
+           &lpbuf[lpbuf_offset],
+           buflim, 
+           half,
+           &s->dither,
+           mintau,
+           &zc,
+           &lbe,
+           &fbe,
+           &qs,
+           &rc1,
+           &ar_b,
+           &ar_f);
+    /* Estimate signal-to-noise ratio to select the appropriate VDC vector. */
+    /* The SNR is estimated as the running average of the ratio of the */
+    /* running average full-band voiced energy to the running average */
+    /* full-band unvoiced energy. SNR filter has gain of 63. */
+    r1 = (s->snr + s->fbve/(float) max(s->fbue, 1))*63/64.0f;
+    s->snr = (float) lrintf(r1);
+    snr2 = s->snr*s->fbue/max(s->lbue, 1);
+    /* Quantize SNR to SNRL according to VDCL thresholds. */
+    i1 = nvdcl - 1;
+    for (snrl = 0;  snrl < i1;  snrl++)
+    {
+        if (snr2 > vdcl[snrl])
+            break;
+    }
+    /* (Note:  SNRL = NVDCL here) */
+    /* Linear discriminant voicing parameters: */
+    value[0] = s->maxmin;
+    value[1] = (float) lbe/max(s->lbve, 1);
+    value[2] = (float) zc;
+    value[3] = rc1;
+    value[4] = qs;
+    value[5] = ivrc[1];
+    value[6] = ar_b;
+    value[7] = ar_f;
+    /* Evaluation of linear discriminant function: */
+    s->voice[2][half] = vdc[snrl*10 + 9];
+    for (i = 0;  i < 8;  i++)
+        s->voice[2][half] += vdc[snrl*10 + i]*value[i];
+    /* Classify as voiced if discriminant > 0, otherwise unvoiced */
+    /* Voicing decision for current half-frame:  1 = Voiced; 0 = Unvoiced */
+    s->voibuf[3][half] = (s->voice[2][half] > 0.0f)  ?  1  :  0;
+    /* Skip voicing decision smoothing in first half-frame: */
+    /* Give a value to VSTATE, so that trace statements below will print */
+    /* a consistent value from one call to the next when HALF .EQ. 1. */
+    /* The value of VSTATE is not used for any other purpose when this is */
+    /* true. */
+    vstate = -1;
+    if (half != 0)
+    {
+        /* Voicing decision smoothing rules (override of linear combination): */
+
+        /*     Unvoiced half-frames:  At least two in a row. */
+        /*     -------------------- */
+
+        /*     Voiced half-frames:    At least two in a row in one frame. */
+        /*     -------------------    Otherwise at least three in a row. */
+        /*                    (Due to the way transition frames are encoded) */
+
+        /* In many cases, the discriminant function determines how to smooth. */
+        /* In the following chart, the decisions marked with a * may be overridden. */
+
+        /* Voicing override of transitions at onsets: */
+        /* If a V/UV or UV/V voicing decision transition occurs within one-half */
+        /* frame of an onset bounding a voicing window, then the transition is */
+        /* moved to occur at the onset. */
+
+        /*     P    1F */
+        /*     -----    ----- */
+        /*     0   0   0   0 */
+        /*     0   0   0*  1    (If there is an onset there) */
+        /*     0   0   1*  0*    (Based on 2F and discriminant distance) */
+        /*     0   0   1   1 */
+        /*     0   1*  0   0    (Always) */
+        /*     0   1*  0*  1    (Based on discriminant distance) */
+        /*     0*  1   1   0*    (Based on past, 2F, and discriminant distance) */
+        /*     0   1*  1   1    (If there is an onset there) */
+        /*     1   0*  0   0    (If there is an onset there) */
+        /*     1   0   0   1 */
+        /*     1   0*  1*  0    (Based on discriminant distance) */
+        /*     1   0*  1   1    (Always) */
+        /*     1   1   0   0 */
+        /*     1   1   0*  1*    (Based on 2F and discriminant distance) */
+        /*     1   1   1*  0    (If there is an onset there) */
+        /*     1   1   1   1 */
+
+        /* Determine if there is an onset transition between P and 1F. */
+        /* OT (Onset Transition) is true if there is an onset between */
+        /* P and 1F but not after 1F. */
+        ot = ((obound[0] & 2) != 0  ||  obound[1] == 1)  &&  (obound[2] & 1) == 0;
+        /* Multi-way dispatch on voicing decision history: */
+        vstate = (s->voibuf[1][0] << 3) + (s->voibuf[1][1] << 2) + (s->voibuf[2][0] << 1) + s->voibuf[2][1];
+        switch (vstate + 1)
+        {
+        case 2:
+            if (ot  &&  s->voibuf[3][0] == 1)
+                s->voibuf[2][0] = 1;
+            break;
+        case 3:
+            if (s->voibuf[3][0] == 0  ||  s->voice[1][0] < -s->voice[1][1])
+                s->voibuf[2][0] = 0;
+            else
+                s->voibuf[2][1] = 1;
+            break;
+        case 5:
+            s->voibuf[1][1] = 0;
+            break;
+        case 6:
+            if (s->voice[0][1] < -s->voice[1][0])
+                s->voibuf[1][1] = 0;
+            else
+                s->voibuf[2][0] = 1;
+            break;
+        case 7:
+            if (s->voibuf[0][0] == 1  ||  s->voibuf[3][0] == 1  ||  s->voice[1][1] > s->voice[0][0])
+                s->voibuf[2][1] = 1;
+            else
+                s->voibuf[1][0] = 1;
+            break;
+        case 8:
+            if (ot)
+                s->voibuf[1][1] = 0;
+            break;
+        case 9:
+            if (ot)
+                s->voibuf[1][1] = 1;
+            break;
+        case 11:
+            if (s->voice[1][9] < -s->voice[0][1])
+                s->voibuf[2][0] = 0;
+            else
+                s->voibuf[1][1] = 1;
+            break;
+        case 12:
+            s->voibuf[1][1] = 1;
+            break;
+        case 14:
+            if (s->voibuf[3][0] == 0  &&  s->voice[1][1] < -s->voice[1][0])
+                s->voibuf[2][1] = 0;
+            else
+                s->voibuf[2][0] = 1;
+            break;
+        case 15:
+            if (ot  &&  s->voibuf[3][0] == 0)
+                s->voibuf[2][0] = 0;
+            break;
+        }
+    }
+    /* During unvoiced half-frames, update the low band and full band unvoiced*/
+    /* energy estimates (LBUE and FBUE) and also the zero crossing */
+    /* threshold (DITHER).  (The input to the unvoiced energy filters is */
+    /* restricted to be less than 10dB above the previous inputs of the */
+    /* filters.) */
+    /* During voiced half-frames, update the low-pass (LBVE) and all-pass */
+    /* (FBVE) voiced energy estimates. */
+    if (s->voibuf[3][half] == 0)
+    {
+        r1 = (s->sfbue*63 + (min(fbe, s->ofbue*3) << 3))/64.0f;
+        s->sfbue = lrintf(r1);
+        s->fbue = s->sfbue/8;
+        s->ofbue = fbe;
+        r1 = (s->slbue*63 + (min(lbe, s->olbue*3) << 3))/64.0f;
+        s->slbue = lrintf(r1);
+        s->lbue = s->slbue/8;
+        s->olbue = lbe;
+    }
+    else
+    {
+        s->lbve = lrintf((s->lbve*63 + lbe)/64.0f);
+        s->fbve = lrintf((s->fbve*63 + fbe)/64.0f);
+    }
+    /* Set dither threshold to yield proper zero crossing rates in the */
+    /* presence of low frequency noise and low level signal input. */
+    /* NOTE: The divisor is a function of REF, the expected energies. */
+    /* Computing MIN */
+    /* Computing MAX */
+    r2 = sqrtf((float) (s->lbue*s->lbve))*64/3000;
+    r1 = max(r2, 1.0f);
+    s->dither = min(r1, 20.0f);
+    /* Voicing decisions are returned in VOIBUF. */
+}
+/*- End of function --------------------------------------------------------*/
+/*- End of file ------------------------------------------------------------*/
author	Peter Meerwald <pmeerw@cosy.sbg.ac.at>
date	Fri, 25 Jun 2010 16:00:21 +0200
parents
children