Mercurial > hg > audiostuff
comparison spandsp-0.0.3/spandsp-0.0.3/src/lpc10_voicing.c @ 5:f762bf195c4b
import spandsp-0.0.3
author | Peter Meerwald <pmeerw@cosy.sbg.ac.at> |
---|---|
date | Fri, 25 Jun 2010 16:00:21 +0200 |
parents | |
children |
comparison: changeset 4:26cd8f1ef0b1 → 5:f762bf195c4b (no lines deleted, inserted, or replaced in this hunk)
---|---|
1 /* | |
2 * SpanDSP - a series of DSP components for telephony | |
3 * | |
4 * lpc10_voicing.c - LPC10 low bit rate speech codec. | |
5 * | |
6 * Written by Steve Underwood <steveu@coppice.org> | |
7 * | |
8 * Copyright (C) 2006 Steve Underwood | |
9 * | |
10 * All rights reserved. | |
11 * | |
12 * This program is free software; you can redistribute it and/or modify | |
13 * it under the terms of the GNU General Public License version 2, as | |
14 * published by the Free Software Foundation. | |
15 * | |
16 * This program is distributed in the hope that it will be useful, | |
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
19 * GNU General Public License for more details. | |
20 * | |
21 * You should have received a copy of the GNU General Public License | |
22 * along with this program; if not, write to the Free Software | |
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
24 * | |
25 * This code is based on the U.S. Department of Defense reference | |
26 * implementation of the LPC-10 2400 bps Voice Coder. They do not | |
27 * exert copyright claims on their code, and it may be freely used. | |
28 * | |
29 * $Id: lpc10_voicing.c,v 1.7 2006/11/30 15:41:47 steveu Exp $ | |
30 */ | |
31 | |
32 #ifdef HAVE_CONFIG_H | |
33 #include <config.h> | |
34 #endif | |
35 | |
36 #include <stdlib.h> | |
37 #include <stdio.h> | |
38 #include <inttypes.h> | |
39 #include <memory.h> | |
40 #if defined(HAVE_TGMATH_H) | |
41 #include <tgmath.h> | |
42 #endif | |
43 #if defined(HAVE_MATH_H) | |
44 #include <math.h> | |
45 #endif | |
46 | |
47 #include "spandsp/telephony.h" | |
48 #include "spandsp/dc_restore.h" | |
49 #include "spandsp/lpc10.h" | |
50 | |
51 #include "lpc10_encdecs.h" | |
52 | |
/* Compute voicing parameters over one half of the current voicing window.
 *
 * The measures produced here feed the linear discriminant classifier in
 * lpc10_voicing(): zero crossing rate, low/full band energies, the
 * first-order normalized autocovariance, a preemphasis energy ratio, and
 * forward/backward prediction gain products at the AMDF pitch lag.
 *
 * vwin   - inclusive start/end sample indices of the voicing window
 * inbuf  - full band speech; indexed in the coordinate system
 *          buflim[0]..buflim[1] after the offset adjustment below
 * lpbuf  - low band (low-pass filtered) speech; indexed buflim[2]..buflim[3]
 * buflim - index limits for inbuf (elements 0,1) and lpbuf (elements 2,3)
 * half   - 0 => first half of the window, 1 => second half
 * dither - in/out zero crossing dither threshold; its sign is flipped on
 *          every sample, so the caller's value is updated
 * mintau - pitch lag at which the AMDF is minimum
 * zc     - out: zero crossing count, normalized to a 180 sample window
 * lbe    - out: low band energy, normalized and clamped to 32767
 * fbe    - out: full band energy, normalized and clamped to 32767
 * qs     - out: ratio of first-difference (preemphasized) energy to full
 *          band energy
 * rc1    - out: normalized short-term autocovariance at unit sample delay
 * ar_b   - out: product of forward and reverse prediction gains, looking
 *          backward in time (causal)
 * ar_f   - out: as ar_b, but looking forward in time (non-causal)
 */
static void vparms(int32_t vwin[],
                   float *inbuf,
                   float *lpbuf,
                   const int32_t buflim[],
                   int32_t half,
                   float *dither,
                   int32_t *mintau,
                   int32_t *zc,
                   int32_t *lbe,
                   int32_t *fbe,
                   float *qs,
                   float *rc1,
                   float *ar_b,
                   float *ar_f)
{
    int32_t inbuf_offset;
    int32_t lpbuf_offset;
    int32_t vlen;
    int32_t stop;
    int32_t i;
    int32_t start;
    float r1;
    float r2;
    float e_pre;
    float ap_rms;
    float e_0;
    float oldsgn;
    float lp_rms;
    float e_b;
    float e_f;
    float r_b;
    float r_f;
    float e0ap;

    /* Calculate zero crossings (ZC) and several energy and correlation */
    /* measures on low band and full band speech. Each measure is taken */
    /* over either the first or the second half of the voicing window, */
    /* depending on the variable HALF. */
    /* Rebase the buffer pointers so they can be indexed directly with the */
    /* FORTRAN-style sample indices held in vwin[]/buflim[]. */
    lpbuf_offset = buflim[2];
    lpbuf -= lpbuf_offset;
    inbuf_offset = buflim[0];
    inbuf -= inbuf_offset;

    lp_rms = 0.0f;
    ap_rms = 0.0f;
    e_pre = 0.0f;
    e0ap = 0.0f;
    *rc1 = 0.0f;
    e_0 = 0.0f;
    e_b = 0.0f;
    e_f = 0.0f;
    r_f = 0.0f;
    r_b = 0.0f;
    *zc = 0;
    vlen = vwin[1] - vwin[0] + 1;
    start = vwin[0] + half*vlen/2 + 1;
    stop = start + vlen/2 - 1;

    /* I'll use the symbol HVL in the table below to represent the value */
    /* VLEN/2. Note that if VLEN is odd, then HVL should be rounded down, */
    /* i.e., HVL = (VLEN-1)/2. */

    /* HALF  START          STOP */

    /* 1     VWIN(1)+1      VWIN(1)+HVL */
    /* 2     VWIN(1)+HVL+1  VWIN(1)+2*HVL */
    oldsgn = r_sign(1.0f, inbuf[start - 1] - *dither);
    for (i = start;  i <= stop;  i++)
    {
        lp_rms += fabsf(lpbuf[i]);
        ap_rms += fabsf(inbuf[i]);
        e_pre += fabsf(inbuf[i] - inbuf[i - 1]);
        r1 = inbuf[i];
        e0ap += r1*r1;
        *rc1 += inbuf[i]*inbuf[i - 1];
        r1 = lpbuf[i];
        e_0 += r1*r1;
        r1 = lpbuf[i - *mintau];
        e_b += r1*r1;
        r1 = lpbuf[i + *mintau];
        e_f += r1*r1;
        r_f += lpbuf[i]*lpbuf[i + *mintau];
        r_b += lpbuf[i]*lpbuf[i - *mintau];
        /* Count a zero crossing whenever the dithered signal changes sign. */
        r1 = inbuf[i] + *dither;
        if (r_sign(1.0f, r1) != oldsgn)
        {
            ++(*zc);
            oldsgn = -oldsgn;
        }
        /* Alternate the dither sign every sample. */
        *dither = -(*dither);
    }
    /* Normalized short-term autocovariance coefficient at unit sample delay */
    *rc1 /= max(e0ap, 1.0f);
    /* Ratio of the energy of the first difference signal (6 dB/oct preemphasis)*/
    /* to the energy of the full band signal */
    /* Computing MAX */
    r1 = ap_rms*2.0f;
    *qs = e_pre/max(r1, 1.0f);
    /* aR_b is the product of the forward and reverse prediction gains, */
    /* looking backward in time (the causal case). */
    *ar_b = r_b/max(e_b, 1.0f)*(r_b/max(e_0, 1.0f));
    /* aR_f is the same as aR_b, but looking forward in time (non causal case).*/
    *ar_f = r_f/max(e_f, 1.0f)*(r_f/max(e_0, 1.0f));
    /* Normalize ZC, LBE, and FBE to old fixed window length of 180. */
    /* (The fraction 90/VLEN has a range of 0.58 to 1) */
    r2 = (float) (*zc << 1);
    *zc = lrintf(r2*(90.0f/vlen));
    r1 = lp_rms/4*(90.0f/vlen);
    *lbe = min(lrintf(r1), 32767);
    r1 = ap_rms/4*(90.0f/vlen);
    *fbe = min(lrintf(r1), 32767);
}
165 /*- End of function --------------------------------------------------------*/ | |
166 | |
167 /* Voicing detection makes voicing decisions for each half */ | |
168 /* frame of input speech. Tentative voicing decisions are made two frames*/ | |
169 /* in the future (2F) for each half frame. These decisions are carried */ | |
170 /* through one frame in the future (1F) to the present (P) frame where */ | |
171 /* they are examined and smoothed, resulting in the final voicing */ | |
172 /* decisions for each half frame. */ | |
173 | |
174 /* The voicing parameter (signal measurement) column vector (VALUE) */ | |
175 /* is based on a rectangular window of speech samples determined by the */ | |
176 /* window placement algorithm. The voicing parameter vector contains the*/ | |
177 /* AMDF windowed maximum-to-minimum ratio, the zero crossing rate, energy*/ | |
178 /* measures, reflection coefficients, and prediction gains. The voicing */ | |
179 /* window is placed to avoid contamination of the voicing parameter vector*/ | |
180 /* with speech onsets. */ | |
181 | |
182 /* The input signal is then classified as unvoiced (including */ | |
183 /* silence) or voiced. This decision is made by a linear discriminant */ | |
184 /* function consisting of a dot product of the voicing decision */ | |
185 /* coefficient (VDC) row vector with the measurement column vector */ | |
186 /* (VALUE). The VDC vector is 2-dimensional, each row vector is optimized*/ | |
187 /* for a particular signal-to-noise ratio (SNR). So, before the dot */ | |
188 /* product is performed, the SNR is estimated to select the appropriate */ | |
189 /* VDC vector. */ | |
190 | |
191 /* The smoothing algorithm is a modified median smoother. The */ | |
192 /* voicing discriminant function is used by the smoother to determine how*/ | |
193 /* strongly voiced or unvoiced a signal is. The smoothing is further */ | |
194 /* modified if a speech onset and a voicing decision transition occur */ | |
195 /* within one half frame. In this case, the voicing decision transition */ | |
196 /* is extended to the speech onset. For transmission purposes, there are*/ | |
197 /* constraints on the duration and transition of voicing decisions. The */ | |
198 /* smoother takes these constraints into account. */ | |
199 | |
200 /* Finally, the energy estimates are updated along with the dither */ | |
201 /* threshold used to calculate the zero crossing rate (ZC). */ | |
202 | |
203 void lpc10_voicing(lpc10_encode_state_t *s, | |
204 int32_t vwin[], | |
205 float *inbuf, | |
206 float *lpbuf, | |
207 const int32_t buflim[], | |
208 int32_t half, | |
209 float *minamd, | |
210 float *maxamd, | |
211 int32_t *mintau, | |
212 float ivrc[], | |
213 int32_t obound[]) | |
214 { | |
215 static const float vdc[100] = | |
216 { | |
217 0.0f, 1714.0f, -110.0f, 334.0f, -4096.0f, -654.0f, 3752.0f, 3769.0f, 0.0f, 1181.0f, | |
218 0.0f, 874.0f, -97.0f, 300.0f, -4096.0f, -1021.0f, 2451.0f, 2527.0f, 0.0f, -500.0f, | |
219 0.0f, 510.0f, -70.0f, 250.0f, -4096.0f, -1270.0f, 2194.0f, 2491.0f, 0.0f, -1500.0f, | |
220 0.0f, 500.0f, -10.0f, 200.0f, -4096.0f, -1300.0f, 2.0e3f, 2.0e3f, 0.0f, -2.0e3f, | |
221 0.0f, 500.0f, 0.0f, 0.0f, -4096.0f, -1300.0f, 2.0e3f, 2.0e3f, 0.0f, -2500.0f, | |
222 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, | |
223 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, | |
224 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, | |
225 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, | |
226 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f | |
227 }; | |
228 static const int nvdcl = 5; | |
229 static const float vdcl[10] = | |
230 { | |
231 600.0f, 450.0f, 300.0f, 200.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f | |
232 }; | |
233 | |
234 int32_t inbuf_offset; | |
235 int32_t lpbuf_offset; | |
236 int32_t i1; | |
237 float r1; | |
238 float r2; | |
239 float ar_b; | |
240 float ar_f; | |
241 int32_t snrl; | |
242 int32_t i; | |
243 float value[9]; | |
244 int32_t zc; | |
245 int ot; | |
246 float qs; | |
247 int32_t vstate; | |
248 float rc1; | |
249 int32_t fbe; | |
250 int32_t lbe; | |
251 float snr2; | |
252 | |
253 inbuf_offset = 0; | |
254 lpbuf_offset = 0; | |
255 if (inbuf) | |
256 { | |
257 inbuf_offset = buflim[0]; | |
258 inbuf -= inbuf_offset; | |
259 } | |
260 if (lpbuf) | |
261 { | |
262 lpbuf_offset = buflim[2]; | |
263 lpbuf -= lpbuf_offset; | |
264 } | |
265 | |
266 /* Voicing Decision Parameter vector (* denotes zero coefficient): */ | |
267 | |
268 /* * MAXMIN */ | |
269 /* LBE/LBVE */ | |
270 /* ZC */ | |
271 /* RC1 */ | |
272 /* QS */ | |
273 /* IVRC2 */ | |
274 /* aR_B */ | |
275 /* aR_F */ | |
276 /* * LOG(LBE/LBVE) */ | |
277 /* Define 2-D voicing decision coefficient vector according to the voicing */ | |
278 /* parameter order above. Each row (VDC vector) is optimized for a specific */ | |
279 /* SNR. The last element of the vector is the constant. */ | |
280 /* E ZC RC1 Qs IVRC2 aRb aRf c */ | |
281 | |
282 /* The VOICE array contains the result of the linear discriminant function*/ | |
283 /* (analog values). The VOIBUF array contains the hard-limited binary */ | |
284 /* voicing decisions. The VOICE and VOIBUF arrays, according to FORTRAN */ | |
285 /* memory allocation, are addressed as: */ | |
286 | |
287 /* (half-frame number, future-frame number) */ | |
288 | |
289 /* | Past | Present | Future1 | Future2 | */ | |
290 /* | 1,0 | 2,0 | 1,1 | 2,1 | 1,2 | 2,2 | 1,3 | 2,3 | ---> time */ | |
291 | |
292 /* Update linear discriminant function history each frame: */ | |
293 if (half == 0) | |
294 { | |
295 s->voice[0][0] = s->voice[1][0]; | |
296 s->voice[0][1] = s->voice[1][1]; | |
297 s->voice[1][0] = s->voice[2][0]; | |
298 s->voice[1][1] = s->voice[2][1]; | |
299 s->maxmin = *maxamd / max(*minamd, 1.0f); | |
300 } | |
301 /* Calculate voicing parameters twice per frame */ | |
302 vparms(vwin, | |
303 &inbuf[inbuf_offset], | |
304 &lpbuf[lpbuf_offset], | |
305 buflim, | |
306 half, | |
307 &s->dither, | |
308 mintau, | |
309 &zc, | |
310 &lbe, | |
311 &fbe, | |
312 &qs, | |
313 &rc1, | |
314 &ar_b, | |
315 &ar_f); | |
316 /* Estimate signal-to-noise ratio to select the appropriate VDC vector. */ | |
317 /* The SNR is estimated as the running average of the ratio of the */ | |
318 /* running average full-band voiced energy to the running average */ | |
319 /* full-band unvoiced energy. SNR filter has gain of 63. */ | |
320 r1 = (s->snr + s->fbve/(float) max(s->fbue, 1))*63/64.0f; | |
321 s->snr = (float) lrintf(r1); | |
322 snr2 = s->snr*s->fbue/max(s->lbue, 1); | |
323 /* Quantize SNR to SNRL according to VDCL thresholds. */ | |
324 i1 = nvdcl - 1; | |
325 for (snrl = 0; snrl < i1; snrl++) | |
326 { | |
327 if (snr2 > vdcl[snrl]) | |
328 break; | |
329 } | |
330 /* (Note: SNRL = NVDCL here) */ | |
331 /* Linear discriminant voicing parameters: */ | |
332 value[0] = s->maxmin; | |
333 value[1] = (float) lbe/max(s->lbve, 1); | |
334 value[2] = (float) zc; | |
335 value[3] = rc1; | |
336 value[4] = qs; | |
337 value[5] = ivrc[1]; | |
338 value[6] = ar_b; | |
339 value[7] = ar_f; | |
340 /* Evaluation of linear discriminant function: */ | |
341 s->voice[2][half] = vdc[snrl*10 + 9]; | |
342 for (i = 0; i < 8; i++) | |
343 s->voice[2][half] += vdc[snrl*10 + i]*value[i]; | |
344 /* Classify as voiced if discriminant > 0, otherwise unvoiced */ | |
345 /* Voicing decision for current half-frame: 1 = Voiced; 0 = Unvoiced */ | |
346 s->voibuf[3][half] = (s->voice[2][half] > 0.0f) ? 1 : 0; | |
347 /* Skip voicing decision smoothing in first half-frame: */ | |
348 /* Give a value to VSTATE, so that trace statements below will print */ | |
349 /* a consistent value from one call to the next when HALF .EQ. 1. */ | |
350 /* The value of VSTATE is not used for any other purpose when this is */ | |
351 /* true. */ | |
352 vstate = -1; | |
353 if (half != 0) | |
354 { | |
355 /* Voicing decision smoothing rules (override of linear combination): */ | |
356 | |
357 /* Unvoiced half-frames: At least two in a row. */ | |
358 /* -------------------- */ | |
359 | |
360 /* Voiced half-frames: At least two in a row in one frame. */ | |
361 /* ------------------- Otherwise at least three in a row. */ | |
362 /* (Due to the way transition frames are encoded) */ | |
363 | |
364 /* In many cases, the discriminant function determines how to smooth. */ | |
365 /* In the following chart, the decisions marked with a * may be overridden. */ | |
366 | |
367 /* Voicing override of transitions at onsets: */ | |
368 /* If a V/UV or UV/V voicing decision transition occurs within one-half */ | |
369 /* frame of an onset bounding a voicing window, then the transition is */ | |
370 /* moved to occur at the onset. */ | |
371 | |
372 /* P 1F */ | |
373 /* ----- ----- */ | |
374 /* 0 0 0 0 */ | |
375 /* 0 0 0* 1 (If there is an onset there) */ | |
376 /* 0 0 1* 0* (Based on 2F and discriminant distance) */ | |
377 /* 0 0 1 1 */ | |
378 /* 0 1* 0 0 (Always) */ | |
379 /* 0 1* 0* 1 (Based on discriminant distance) */ | |
380 /* 0* 1 1 0* (Based on past, 2F, and discriminant distance) */ | |
381 /* 0 1* 1 1 (If there is an onset there) */ | |
382 /* 1 0* 0 0 (If there is an onset there) */ | |
383 /* 1 0 0 1 */ | |
384 /* 1 0* 1* 0 (Based on discriminant distance) */ | |
385 /* 1 0* 1 1 (Always) */ | |
386 /* 1 1 0 0 */ | |
387 /* 1 1 0* 1* (Based on 2F and discriminant distance) */ | |
388 /* 1 1 1* 0 (If there is an onset there) */ | |
389 /* 1 1 1 1 */ | |
390 | |
391 /* Determine if there is an onset transition between P and 1F. */ | |
392 /* OT (Onset Transition) is true if there is an onset between */ | |
393 /* P and 1F but not after 1F. */ | |
394 ot = ((obound[0] & 2) != 0 || obound[1] == 1) && (obound[2] & 1) == 0; | |
395 /* Multi-way dispatch on voicing decision history: */ | |
396 vstate = (s->voibuf[1][0] << 3) + (s->voibuf[1][1] << 2) + (s->voibuf[2][0] << 1) + s->voibuf[2][1]; | |
397 switch (vstate + 1) | |
398 { | |
399 case 2: | |
400 if (ot && s->voibuf[3][0] == 1) | |
401 s->voibuf[2][0] = 1; | |
402 break; | |
403 case 3: | |
404 if (s->voibuf[3][0] == 0 || s->voice[1][0] < -s->voice[1][1]) | |
405 s->voibuf[2][0] = 0; | |
406 else | |
407 s->voibuf[2][1] = 1; | |
408 break; | |
409 case 5: | |
410 s->voibuf[1][1] = 0; | |
411 break; | |
412 case 6: | |
413 if (s->voice[0][1] < -s->voice[1][0]) | |
414 s->voibuf[1][1] = 0; | |
415 else | |
416 s->voibuf[2][0] = 1; | |
417 break; | |
418 case 7: | |
419 if (s->voibuf[0][0] == 1 || s->voibuf[3][0] == 1 || s->voice[1][1] > s->voice[0][0]) | |
420 s->voibuf[2][1] = 1; | |
421 else | |
422 s->voibuf[1][0] = 1; | |
423 break; | |
424 case 8: | |
425 if (ot) | |
426 s->voibuf[1][1] = 0; | |
427 break; | |
428 case 9: | |
429 if (ot) | |
430 s->voibuf[1][1] = 1; | |
431 break; | |
432 case 11: | |
433 if (s->voice[1][9] < -s->voice[0][1]) | |
434 s->voibuf[2][0] = 0; | |
435 else | |
436 s->voibuf[1][1] = 1; | |
437 break; | |
438 case 12: | |
439 s->voibuf[1][1] = 1; | |
440 break; | |
441 case 14: | |
442 if (s->voibuf[3][0] == 0 && s->voice[1][1] < -s->voice[1][0]) | |
443 s->voibuf[2][1] = 0; | |
444 else | |
445 s->voibuf[2][0] = 1; | |
446 break; | |
447 case 15: | |
448 if (ot && s->voibuf[3][0] == 0) | |
449 s->voibuf[2][0] = 0; | |
450 break; | |
451 } | |
452 } | |
453 /* During unvoiced half-frames, update the low band and full band unvoiced*/ | |
454 /* energy estimates (LBUE and FBUE) and also the zero crossing */ | |
455 /* threshold (DITHER). (The input to the unvoiced energy filters is */ | |
456 /* restricted to be less than 10dB above the previous inputs of the */ | |
457 /* filters.) */ | |
458 /* During voiced half-frames, update the low-pass (LBVE) and all-pass */ | |
459 /* (FBVE) voiced energy estimates. */ | |
460 if (s->voibuf[3][half] == 0) | |
461 { | |
462 r1 = (s->sfbue*63 + (min(fbe, s->ofbue*3) << 3))/64.0f; | |
463 s->sfbue = lrintf(r1); | |
464 s->fbue = s->sfbue/8; | |
465 s->ofbue = fbe; | |
466 r1 = (s->slbue*63 + (min(lbe, s->olbue*3) << 3))/64.0f; | |
467 s->slbue = lrintf(r1); | |
468 s->lbue = s->slbue/8; | |
469 s->olbue = lbe; | |
470 } | |
471 else | |
472 { | |
473 s->lbve = lrintf((s->lbve*63 + lbe)/64.0f); | |
474 s->fbve = lrintf((s->fbve*63 + fbe)/64.0f); | |
475 } | |
476 /* Set dither threshold to yield proper zero crossing rates in the */ | |
477 /* presence of low frequency noise and low level signal input. */ | |
478 /* NOTE: The divisor is a function of REF, the expected energies. */ | |
479 /* Computing MIN */ | |
480 /* Computing MAX */ | |
481 r2 = sqrtf((float) (s->lbue*s->lbve))*64/3000; | |
482 r1 = max(r2, 1.0f); | |
483 s->dither = min(r1, 20.0f); | |
484 /* Voicing decisions are returned in VOIBUF. */ | |
485 } | |
486 /*- End of function --------------------------------------------------------*/ | |
487 /*- End of file ------------------------------------------------------------*/ |