Mercurial > hg > audiostuff
diff spandsp-0.0.3/spandsp-0.0.3/src/lpc10_voicing.c @ 5:f762bf195c4b
import spandsp-0.0.3
author | Peter Meerwald <pmeerw@cosy.sbg.ac.at> |
---|---|
date | Fri, 25 Jun 2010 16:00:21 +0200 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spandsp-0.0.3/spandsp-0.0.3/src/lpc10_voicing.c Fri Jun 25 16:00:21 2010 +0200 @@ -0,0 +1,487 @@ +/* + * SpanDSP - a series of DSP components for telephony + * + * lpc10_voicing.c - LPC10 low bit rate speech codec. + * + * Written by Steve Underwood <steveu@coppice.org> + * + * Copyright (C) 2006 Steve Underwood + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * This code is based on the U.S. Department of Defense reference + * implementation of the LPC-10 2400 bps Voice Coder. They do not + * exert copyright claims on their code, and it may be freely used. + * + * $Id: lpc10_voicing.c,v 1.7 2006/11/30 15:41:47 steveu Exp $ + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <stdlib.h> +#include <stdio.h> +#include <inttypes.h> +#include <memory.h> +#if defined(HAVE_TGMATH_H) +#include <tgmath.h> +#endif +#if defined(HAVE_MATH_H) +#include <math.h> +#endif + +#include "spandsp/telephony.h" +#include "spandsp/dc_restore.h" +#include "spandsp/lpc10.h" + +#include "lpc10_encdecs.h" + +static void vparms(int32_t vwin[], + float *inbuf, + float *lpbuf, + const int32_t buflim[], + int32_t half, + float *dither, + int32_t *mintau, + int32_t *zc, + int32_t *lbe, + int32_t *fbe, + float *qs, + float *rc1, + float *ar_b, + float *ar_f) +{ + int32_t inbuf_offset; + int32_t lpbuf_offset; + int32_t vlen; + int32_t stop; + int32_t i; + int32_t start; + float r1; + float r2; + float e_pre; + float ap_rms; + float e_0; + float oldsgn; + float lp_rms; + float e_b; + float e_f; + float r_b; + float r_f; + float e0ap; + + /* Calculate zero crossings (ZC) and several energy and correlation */ + /* measures on low band and full band speech. Each measure is taken */ + /* over either the first or the second half of the voicing window, */ + /* depending on the variable HALF. */ + lpbuf_offset = buflim[2]; + lpbuf -= lpbuf_offset; + inbuf_offset = buflim[0]; + inbuf -= inbuf_offset; + + lp_rms = 0.0f; + ap_rms = 0.0f; + e_pre = 0.0f; + e0ap = 0.0f; + *rc1 = 0.0f; + e_0 = 0.0f; + e_b = 0.0f; + e_f = 0.0f; + r_f = 0.0f; + r_b = 0.0f; + *zc = 0; + vlen = vwin[1] - vwin[0] + 1; + start = vwin[0] + half*vlen/2 + 1; + stop = start + vlen/2 - 1; + + /* I'll use the symbol HVL in the table below to represent the value */ + /* VLEN/2. Note that if VLEN is odd, then HVL should be rounded down, */ + /* i.e., HVL = (VLEN-1)/2. */ + + /* HALF START STOP */ + + /* 1 VWIN(1)+1 VWIN(1)+HVL */ + /* 2 VWIN(1)+HVL+1 VWIN(1)+2*HVL */ + oldsgn = r_sign(1.0f, inbuf[start - 1] - *dither); + for (i = start; i <= stop; i++) + { + lp_rms += fabsf(lpbuf[i]); + ap_rms += fabsf(inbuf[i]); + e_pre += fabsf(inbuf[i] - inbuf[i - 1]); + r1 = inbuf[i]; + e0ap += r1*r1; + *rc1 += inbuf[i]*inbuf[i - 1]; + r1 = lpbuf[i]; + e_0 += r1*r1; + r1 = lpbuf[i - *mintau]; + e_b += r1*r1; + r1 = lpbuf[i + *mintau]; + e_f += r1*r1; + r_f += lpbuf[i]*lpbuf[i + *mintau]; + r_b += lpbuf[i]*lpbuf[i - *mintau]; + r1 = inbuf[i] + *dither; + if (r_sign(1.0f, r1) != oldsgn) + { + ++(*zc); + oldsgn = -oldsgn; + } + *dither = -(*dither); + } + /* Normalized short-term autocovariance coefficient at unit sample delay */ + *rc1 /= max(e0ap, 1.0f); + /* Ratio of the energy of the first difference signal (6 dB/oct preemphasis)*/ + /* to the energy of the full band signal */ + /* Computing MAX */ + r1 = ap_rms*2.0f; + *qs = e_pre/max(r1, 1.0f); + /* aR_b is the product of the forward and reverse prediction gains, */ + /* looking backward in time (the causal case). */ + *ar_b = r_b/max(e_b, 1.0f)*(r_b/max(e_0, 1.0f)); + /* aR_f is the same as aR_b, but looking forward in time (non causal case).*/ + *ar_f = r_f/max(e_f, 1.0f)*(r_f/max(e_0, 1.0f)); + /* Normalize ZC, LBE, and FBE to old fixed window length of 180. */ + /* (The fraction 90/VLEN has a range of 0.58 to 1) */ + r2 = (float) (*zc << 1); + *zc = lrintf(r2*(90.0f/vlen)); + r1 = lp_rms/4*(90.0f/vlen); + *lbe = min(lrintf(r1), 32767); + r1 = ap_rms/4*(90.0f/vlen); + *fbe = min(lrintf(r1), 32767); +} +/*- End of function --------------------------------------------------------*/ + +/* Voicing detection makes voicing decisions for each half */ +/* frame of input speech. Tentative voicing decisions are made two frames*/ +/* in the future (2F) for each half frame. These decisions are carried */ +/* through one frame in the future (1F) to the present (P) frame where */ +/* they are examined and smoothed, resulting in the final voicing */ +/* decisions for each half frame. */ + +/* The voicing parameter (signal measurement) column vector (VALUE) */ +/* is based on a rectangular window of speech samples determined by the */ +/* window placement algorithm. The voicing parameter vector contains the*/ +/* AMDF windowed maximum-to-minimum ratio, the zero crossing rate, energy*/ +/* measures, reflection coefficients, and prediction gains. The voicing */ +/* window is placed to avoid contamination of the voicing parameter vector*/ +/* with speech onsets. */ + +/* The input signal is then classified as unvoiced (including */ +/* silence) or voiced. This decision is made by a linear discriminant */ +/* function consisting of a dot product of the voicing decision */ +/* coefficient (VDC) row vector with the measurement column vector */ +/* (VALUE). The VDC vector is 2-dimensional, each row vector is optimized*/ +/* for a particular signal-to-noise ratio (SNR). So, before the dot */ +/* product is performed, the SNR is estimated to select the appropriate */ +/* VDC vector. */ + +/* The smoothing algorithm is a modified median smoother. The */ +/* voicing discriminant function is used by the smoother to determine how*/ +/* strongly voiced or unvoiced a signal is. The smoothing is further */ +/* modified if a speech onset and a voicing decision transition occur */ +/* within one half frame. In this case, the voicing decision transition */ +/* is extended to the speech onset. For transmission purposes, there are*/ +/* constraints on the duration and transition of voicing decisions. The */ +/* smoother takes these constraints into account. */ + +/* Finally, the energy estimates are updated along with the dither */ +/* threshold used to calculate the zero crossing rate (ZC). */ + +void lpc10_voicing(lpc10_encode_state_t *s, + int32_t vwin[], + float *inbuf, + float *lpbuf, + const int32_t buflim[], + int32_t half, + float *minamd, + float *maxamd, + int32_t *mintau, + float ivrc[], + int32_t obound[]) +{ + static const float vdc[100] = + { + 0.0f, 1714.0f, -110.0f, 334.0f, -4096.0f, -654.0f, 3752.0f, 3769.0f, 0.0f, 1181.0f, + 0.0f, 874.0f, -97.0f, 300.0f, -4096.0f, -1021.0f, 2451.0f, 2527.0f, 0.0f, -500.0f, + 0.0f, 510.0f, -70.0f, 250.0f, -4096.0f, -1270.0f, 2194.0f, 2491.0f, 0.0f, -1500.0f, + 0.0f, 500.0f, -10.0f, 200.0f, -4096.0f, -1300.0f, 2.0e3f, 2.0e3f, 0.0f, -2.0e3f, + 0.0f, 500.0f, 0.0f, 0.0f, -4096.0f, -1300.0f, 2.0e3f, 2.0e3f, 0.0f, -2500.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f + }; + static const int nvdcl = 5; + static const float vdcl[10] = + { + 600.0f, 450.0f, 300.0f, 200.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f + }; + + int32_t inbuf_offset; + int32_t lpbuf_offset; + int32_t i1; + float r1; + float r2; + float ar_b; + float ar_f; + int32_t snrl; + int32_t i; + float value[9]; + int32_t zc; + int ot; + float qs; + int32_t vstate; + float rc1; + int32_t fbe; + int32_t lbe; + float snr2; + + inbuf_offset = 0; + lpbuf_offset = 0; + if (inbuf) + { + inbuf_offset = buflim[0]; + inbuf -= inbuf_offset; + } + if (lpbuf) + { + lpbuf_offset = buflim[2]; + lpbuf -= lpbuf_offset; + } + + /* Voicing Decision Parameter vector (* denotes zero coefficient): */ + + /* * MAXMIN */ + /* LBE/LBVE */ + /* ZC */ + /* RC1 */ + /* QS */ + /* IVRC2 */ + /* aR_B */ + /* aR_F */ + /* * LOG(LBE/LBVE) */ + /* Define 2-D voicing decision coefficient vector according to the voicing */ + /* parameter order above. Each row (VDC vector) is optimized for a specific */ + /* SNR. The last element of the vector is the constant. */ + /* E ZC RC1 Qs IVRC2 aRb aRf c */ + + /* The VOICE array contains the result of the linear discriminant function*/ + /* (analog values). The VOIBUF array contains the hard-limited binary */ + /* voicing decisions. The VOICE and VOIBUF arrays, according to FORTRAN */ + /* memory allocation, are addressed as: */ + + /* (half-frame number, future-frame number) */ + + /* | Past | Present | Future1 | Future2 | */ + /* | 1,0 | 2,0 | 1,1 | 2,1 | 1,2 | 2,2 | 1,3 | 2,3 | ---> time */ + + /* Update linear discriminant function history each frame: */ + if (half == 0) + { + s->voice[0][0] = s->voice[1][0]; + s->voice[0][1] = s->voice[1][1]; + s->voice[1][0] = s->voice[2][0]; + s->voice[1][1] = s->voice[2][1]; + s->maxmin = *maxamd / max(*minamd, 1.0f); + } + /* Calculate voicing parameters twice per frame */ + vparms(vwin, + &inbuf[inbuf_offset], + &lpbuf[lpbuf_offset], + buflim, + half, + &s->dither, + mintau, + &zc, + &lbe, + &fbe, + &qs, + &rc1, + &ar_b, + &ar_f); + /* Estimate signal-to-noise ratio to select the appropriate VDC vector. */ + /* The SNR is estimated as the running average of the ratio of the */ + /* running average full-band voiced energy to the running average */ + /* full-band unvoiced energy. SNR filter has gain of 63. */ + r1 = (s->snr + s->fbve/(float) max(s->fbue, 1))*63/64.0f; + s->snr = (float) lrintf(r1); + snr2 = s->snr*s->fbue/max(s->lbue, 1); + /* Quantize SNR to SNRL according to VDCL thresholds. */ + i1 = nvdcl - 1; + for (snrl = 0; snrl < i1; snrl++) + { + if (snr2 > vdcl[snrl]) + break; + } + /* (Note: SNRL = NVDCL here) */ + /* Linear discriminant voicing parameters: */ + value[0] = s->maxmin; + value[1] = (float) lbe/max(s->lbve, 1); + value[2] = (float) zc; + value[3] = rc1; + value[4] = qs; + value[5] = ivrc[1]; + value[6] = ar_b; + value[7] = ar_f; + /* Evaluation of linear discriminant function: */ + s->voice[2][half] = vdc[snrl*10 + 9]; + for (i = 0; i < 8; i++) + s->voice[2][half] += vdc[snrl*10 + i]*value[i]; + /* Classify as voiced if discriminant > 0, otherwise unvoiced */ + /* Voicing decision for current half-frame: 1 = Voiced; 0 = Unvoiced */ + s->voibuf[3][half] = (s->voice[2][half] > 0.0f) ? 1 : 0; + /* Skip voicing decision smoothing in first half-frame: */ + /* Give a value to VSTATE, so that trace statements below will print */ + /* a consistent value from one call to the next when HALF .EQ. 1. */ + /* The value of VSTATE is not used for any other purpose when this is */ + /* true. */ + vstate = -1; + if (half != 0) + { + /* Voicing decision smoothing rules (override of linear combination): */ + + /* Unvoiced half-frames: At least two in a row. */ + /* -------------------- */ + + /* Voiced half-frames: At least two in a row in one frame. */ + /* ------------------- Otherwise at least three in a row. */ + /* (Due to the way transition frames are encoded) */ + + /* In many cases, the discriminant function determines how to smooth. */ + /* In the following chart, the decisions marked with a * may be overridden. */ + + /* Voicing override of transitions at onsets: */ + /* If a V/UV or UV/V voicing decision transition occurs within one-half */ + /* frame of an onset bounding a voicing window, then the transition is */ + /* moved to occur at the onset. */ + + /* P 1F */ + /* ----- ----- */ + /* 0 0 0 0 */ + /* 0 0 0* 1 (If there is an onset there) */ + /* 0 0 1* 0* (Based on 2F and discriminant distance) */ + /* 0 0 1 1 */ + /* 0 1* 0 0 (Always) */ + /* 0 1* 0* 1 (Based on discriminant distance) */ + /* 0* 1 1 0* (Based on past, 2F, and discriminant distance) */ + /* 0 1* 1 1 (If there is an onset there) */ + /* 1 0* 0 0 (If there is an onset there) */ + /* 1 0 0 1 */ + /* 1 0* 1* 0 (Based on discriminant distance) */ + /* 1 0* 1 1 (Always) */ + /* 1 1 0 0 */ + /* 1 1 0* 1* (Based on 2F and discriminant distance) */ + /* 1 1 1* 0 (If there is an onset there) */ + /* 1 1 1 1 */ + + /* Determine if there is an onset transition between P and 1F. */ + /* OT (Onset Transition) is true if there is an onset between */ + /* P and 1F but not after 1F. */ + ot = ((obound[0] & 2) != 0 || obound[1] == 1) && (obound[2] & 1) == 0; + /* Multi-way dispatch on voicing decision history: */ + vstate = (s->voibuf[1][0] << 3) + (s->voibuf[1][1] << 2) + (s->voibuf[2][0] << 1) + s->voibuf[2][1]; + switch (vstate + 1) + { + case 2: + if (ot && s->voibuf[3][0] == 1) + s->voibuf[2][0] = 1; + break; + case 3: + if (s->voibuf[3][0] == 0 || s->voice[1][0] < -s->voice[1][1]) + s->voibuf[2][0] = 0; + else + s->voibuf[2][1] = 1; + break; + case 5: + s->voibuf[1][1] = 0; + break; + case 6: + if (s->voice[0][1] < -s->voice[1][0]) + s->voibuf[1][1] = 0; + else + s->voibuf[2][0] = 1; + break; + case 7: + if (s->voibuf[0][0] == 1 || s->voibuf[3][0] == 1 || s->voice[1][1] > s->voice[0][0]) + s->voibuf[2][1] = 1; + else + s->voibuf[1][0] = 1; + break; + case 8: + if (ot) + s->voibuf[1][1] = 0; + break; + case 9: + if (ot) + s->voibuf[1][1] = 1; + break; + case 11: + if (s->voice[1][9] < -s->voice[0][1]) + s->voibuf[2][0] = 0; + else + s->voibuf[1][1] = 1; + break; + case 12: + s->voibuf[1][1] = 1; + break; + case 14: + if (s->voibuf[3][0] == 0 && s->voice[1][1] < -s->voice[1][0]) + s->voibuf[2][1] = 0; + else + s->voibuf[2][0] = 1; + break; + case 15: + if (ot && s->voibuf[3][0] == 0) + s->voibuf[2][0] = 0; + break; + } + } + /* During unvoiced half-frames, update the low band and full band unvoiced*/ + /* energy estimates (LBUE and FBUE) and also the zero crossing */ + /* threshold (DITHER). (The input to the unvoiced energy filters is */ + /* restricted to be less than 10dB above the previous inputs of the */ + /* filters.) */ + /* During voiced half-frames, update the low-pass (LBVE) and all-pass */ + /* (FBVE) voiced energy estimates. */ + if (s->voibuf[3][half] == 0) + { + r1 = (s->sfbue*63 + (min(fbe, s->ofbue*3) << 3))/64.0f; + s->sfbue = lrintf(r1); + s->fbue = s->sfbue/8; + s->ofbue = fbe; + r1 = (s->slbue*63 + (min(lbe, s->olbue*3) << 3))/64.0f; + s->slbue = lrintf(r1); + s->lbue = s->slbue/8; + s->olbue = lbe; + } + else + { + s->lbve = lrintf((s->lbve*63 + lbe)/64.0f); + s->fbve = lrintf((s->fbve*63 + fbe)/64.0f); + } + /* Set dither threshold to yield proper zero crossing rates in the */ + /* presence of low frequency noise and low level signal input. */ + /* NOTE: The divisor is a function of REF, the expected energies. */ + /* Computing MIN */ + /* Computing MAX */ + r2 = sqrtf((float) (s->lbue*s->lbve))*64/3000; + r1 = max(r2, 1.0f); + s->dither = min(r1, 20.0f); + /* Voicing decisions are returned in VOIBUF. */ +} +/*- End of function --------------------------------------------------------*/ +/*- End of file ------------------------------------------------------------*/