comparison spandsp-0.0.3/spandsp-0.0.3/src/lpc10_voicing.c @ 5:f762bf195c4b

import spandsp-0.0.3
author Peter Meerwald <pmeerw@cosy.sbg.ac.at>
date Fri, 25 Jun 2010 16:00:21 +0200
parents
children
comparison
equal deleted inserted replaced
4:26cd8f1ef0b1 5:f762bf195c4b
1 /*
2 * SpanDSP - a series of DSP components for telephony
3 *
4 * lpc10_voicing.c - LPC10 low bit rate speech codec.
5 *
6 * Written by Steve Underwood <steveu@coppice.org>
7 *
8 * Copyright (C) 2006 Steve Underwood
9 *
10 * All rights reserved.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License version 2, as
14 * published by the Free Software Foundation.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 * This code is based on the U.S. Department of Defense reference
26 * implementation of the LPC-10 2400 bps Voice Coder. They do not
27 * exert copyright claims on their code, and it may be freely used.
28 *
29 * $Id: lpc10_voicing.c,v 1.7 2006/11/30 15:41:47 steveu Exp $
30 */
31
32 #ifdef HAVE_CONFIG_H
33 #include <config.h>
34 #endif
35
36 #include <stdlib.h>
37 #include <stdio.h>
38 #include <inttypes.h>
39 #include <memory.h>
40 #if defined(HAVE_TGMATH_H)
41 #include <tgmath.h>
42 #endif
43 #if defined(HAVE_MATH_H)
44 #include <math.h>
45 #endif
46
47 #include "spandsp/telephony.h"
48 #include "spandsp/dc_restore.h"
49 #include "spandsp/lpc10.h"
50
51 #include "lpc10_encdecs.h"
52
53 static void vparms(int32_t vwin[],
54 float *inbuf,
55 float *lpbuf,
56 const int32_t buflim[],
57 int32_t half,
58 float *dither,
59 int32_t *mintau,
60 int32_t *zc,
61 int32_t *lbe,
62 int32_t *fbe,
63 float *qs,
64 float *rc1,
65 float *ar_b,
66 float *ar_f)
67 {
68 int32_t inbuf_offset;
69 int32_t lpbuf_offset;
70 int32_t vlen;
71 int32_t stop;
72 int32_t i;
73 int32_t start;
74 float r1;
75 float r2;
76 float e_pre;
77 float ap_rms;
78 float e_0;
79 float oldsgn;
80 float lp_rms;
81 float e_b;
82 float e_f;
83 float r_b;
84 float r_f;
85 float e0ap;
86
87 /* Calculate zero crossings (ZC) and several energy and correlation */
88 /* measures on low band and full band speech. Each measure is taken */
89 /* over either the first or the second half of the voicing window, */
90 /* depending on the variable HALF. */
91 lpbuf_offset = buflim[2];
92 lpbuf -= lpbuf_offset;
93 inbuf_offset = buflim[0];
94 inbuf -= inbuf_offset;
95
96 lp_rms = 0.0f;
97 ap_rms = 0.0f;
98 e_pre = 0.0f;
99 e0ap = 0.0f;
100 *rc1 = 0.0f;
101 e_0 = 0.0f;
102 e_b = 0.0f;
103 e_f = 0.0f;
104 r_f = 0.0f;
105 r_b = 0.0f;
106 *zc = 0;
107 vlen = vwin[1] - vwin[0] + 1;
108 start = vwin[0] + half*vlen/2 + 1;
109 stop = start + vlen/2 - 1;
110
111 /* I'll use the symbol HVL in the table below to represent the value */
112 /* VLEN/2. Note that if VLEN is odd, then HVL should be rounded down, */
113 /* i.e., HVL = (VLEN-1)/2. */
114
115 /* HALF START STOP */
116
117 /* 1 VWIN(1)+1 VWIN(1)+HVL */
118 /* 2 VWIN(1)+HVL+1 VWIN(1)+2*HVL */
119 oldsgn = r_sign(1.0f, inbuf[start - 1] - *dither);
120 for (i = start; i <= stop; i++)
121 {
122 lp_rms += fabsf(lpbuf[i]);
123 ap_rms += fabsf(inbuf[i]);
124 e_pre += fabsf(inbuf[i] - inbuf[i - 1]);
125 r1 = inbuf[i];
126 e0ap += r1*r1;
127 *rc1 += inbuf[i]*inbuf[i - 1];
128 r1 = lpbuf[i];
129 e_0 += r1*r1;
130 r1 = lpbuf[i - *mintau];
131 e_b += r1*r1;
132 r1 = lpbuf[i + *mintau];
133 e_f += r1*r1;
134 r_f += lpbuf[i]*lpbuf[i + *mintau];
135 r_b += lpbuf[i]*lpbuf[i - *mintau];
136 r1 = inbuf[i] + *dither;
137 if (r_sign(1.0f, r1) != oldsgn)
138 {
139 ++(*zc);
140 oldsgn = -oldsgn;
141 }
142 *dither = -(*dither);
143 }
144 /* Normalized short-term autocovariance coefficient at unit sample delay */
145 *rc1 /= max(e0ap, 1.0f);
146 /* Ratio of the energy of the first difference signal (6 dB/oct preemphasis)*/
147 /* to the energy of the full band signal */
148 /* Computing MAX */
149 r1 = ap_rms*2.0f;
150 *qs = e_pre/max(r1, 1.0f);
151 /* aR_b is the product of the forward and reverse prediction gains, */
152 /* looking backward in time (the causal case). */
153 *ar_b = r_b/max(e_b, 1.0f)*(r_b/max(e_0, 1.0f));
154 /* aR_f is the same as aR_b, but looking forward in time (non causal case).*/
155 *ar_f = r_f/max(e_f, 1.0f)*(r_f/max(e_0, 1.0f));
156 /* Normalize ZC, LBE, and FBE to old fixed window length of 180. */
157 /* (The fraction 90/VLEN has a range of 0.58 to 1) */
158 r2 = (float) (*zc << 1);
159 *zc = lrintf(r2*(90.0f/vlen));
160 r1 = lp_rms/4*(90.0f/vlen);
161 *lbe = min(lrintf(r1), 32767);
162 r1 = ap_rms/4*(90.0f/vlen);
163 *fbe = min(lrintf(r1), 32767);
164 }
165 /*- End of function --------------------------------------------------------*/
166
167 /* Voicing detection makes voicing decisions for each half */
168 /* frame of input speech. Tentative voicing decisions are made two frames*/
169 /* in the future (2F) for each half frame. These decisions are carried */
170 /* through one frame in the future (1F) to the present (P) frame where */
171 /* they are examined and smoothed, resulting in the final voicing */
172 /* decisions for each half frame. */
173
174 /* The voicing parameter (signal measurement) column vector (VALUE) */
175 /* is based on a rectangular window of speech samples determined by the */
176 /* window placement algorithm. The voicing parameter vector contains the*/
177 /* AMDF windowed maximum-to-minimum ratio, the zero crossing rate, energy*/
178 /* measures, reflection coefficients, and prediction gains. The voicing */
179 /* window is placed to avoid contamination of the voicing parameter vector*/
180 /* with speech onsets. */
181
182 /* The input signal is then classified as unvoiced (including */
183 /* silence) or voiced. This decision is made by a linear discriminant */
184 /* function consisting of a dot product of the voicing decision */
185 /* coefficient (VDC) row vector with the measurement column vector */
186 /* (VALUE). The VDC vector is 2-dimensional, each row vector is optimized*/
187 /* for a particular signal-to-noise ratio (SNR). So, before the dot */
188 /* product is performed, the SNR is estimated to select the appropriate */
189 /* VDC vector. */
190
191 /* The smoothing algorithm is a modified median smoother. The */
192 /* voicing discriminant function is used by the smoother to determine how*/
193 /* strongly voiced or unvoiced a signal is. The smoothing is further */
194 /* modified if a speech onset and a voicing decision transition occur */
195 /* within one half frame. In this case, the voicing decision transition */
196 /* is extended to the speech onset. For transmission purposes, there are*/
197 /* constraints on the duration and transition of voicing decisions. The */
198 /* smoother takes these constraints into account. */
199
200 /* Finally, the energy estimates are updated along with the dither */
201 /* threshold used to calculate the zero crossing rate (ZC). */
202
203 void lpc10_voicing(lpc10_encode_state_t *s,
204 int32_t vwin[],
205 float *inbuf,
206 float *lpbuf,
207 const int32_t buflim[],
208 int32_t half,
209 float *minamd,
210 float *maxamd,
211 int32_t *mintau,
212 float ivrc[],
213 int32_t obound[])
214 {
215 static const float vdc[100] =
216 {
217 0.0f, 1714.0f, -110.0f, 334.0f, -4096.0f, -654.0f, 3752.0f, 3769.0f, 0.0f, 1181.0f,
218 0.0f, 874.0f, -97.0f, 300.0f, -4096.0f, -1021.0f, 2451.0f, 2527.0f, 0.0f, -500.0f,
219 0.0f, 510.0f, -70.0f, 250.0f, -4096.0f, -1270.0f, 2194.0f, 2491.0f, 0.0f, -1500.0f,
220 0.0f, 500.0f, -10.0f, 200.0f, -4096.0f, -1300.0f, 2.0e3f, 2.0e3f, 0.0f, -2.0e3f,
221 0.0f, 500.0f, 0.0f, 0.0f, -4096.0f, -1300.0f, 2.0e3f, 2.0e3f, 0.0f, -2500.0f,
222 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
223 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
224 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
225 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
226 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
227 };
228 static const int nvdcl = 5;
229 static const float vdcl[10] =
230 {
231 600.0f, 450.0f, 300.0f, 200.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
232 };
233
234 int32_t inbuf_offset;
235 int32_t lpbuf_offset;
236 int32_t i1;
237 float r1;
238 float r2;
239 float ar_b;
240 float ar_f;
241 int32_t snrl;
242 int32_t i;
243 float value[9];
244 int32_t zc;
245 int ot;
246 float qs;
247 int32_t vstate;
248 float rc1;
249 int32_t fbe;
250 int32_t lbe;
251 float snr2;
252
253 inbuf_offset = 0;
254 lpbuf_offset = 0;
255 if (inbuf)
256 {
257 inbuf_offset = buflim[0];
258 inbuf -= inbuf_offset;
259 }
260 if (lpbuf)
261 {
262 lpbuf_offset = buflim[2];
263 lpbuf -= lpbuf_offset;
264 }
265
266 /* Voicing Decision Parameter vector (* denotes zero coefficient): */
267
268 /* * MAXMIN */
269 /* LBE/LBVE */
270 /* ZC */
271 /* RC1 */
272 /* QS */
273 /* IVRC2 */
274 /* aR_B */
275 /* aR_F */
276 /* * LOG(LBE/LBVE) */
277 /* Define 2-D voicing decision coefficient vector according to the voicing */
278 /* parameter order above. Each row (VDC vector) is optimized for a specific */
279 /* SNR. The last element of the vector is the constant. */
280 /* E ZC RC1 Qs IVRC2 aRb aRf c */
281
282 /* The VOICE array contains the result of the linear discriminant function*/
283 /* (analog values). The VOIBUF array contains the hard-limited binary */
284 /* voicing decisions. The VOICE and VOIBUF arrays, according to FORTRAN */
285 /* memory allocation, are addressed as: */
286
287 /* (half-frame number, future-frame number) */
288
289 /* | Past | Present | Future1 | Future2 | */
290 /* | 1,0 | 2,0 | 1,1 | 2,1 | 1,2 | 2,2 | 1,3 | 2,3 | ---> time */
291
292 /* Update linear discriminant function history each frame: */
293 if (half == 0)
294 {
295 s->voice[0][0] = s->voice[1][0];
296 s->voice[0][1] = s->voice[1][1];
297 s->voice[1][0] = s->voice[2][0];
298 s->voice[1][1] = s->voice[2][1];
299 s->maxmin = *maxamd / max(*minamd, 1.0f);
300 }
301 /* Calculate voicing parameters twice per frame */
302 vparms(vwin,
303 &inbuf[inbuf_offset],
304 &lpbuf[lpbuf_offset],
305 buflim,
306 half,
307 &s->dither,
308 mintau,
309 &zc,
310 &lbe,
311 &fbe,
312 &qs,
313 &rc1,
314 &ar_b,
315 &ar_f);
316 /* Estimate signal-to-noise ratio to select the appropriate VDC vector. */
317 /* The SNR is estimated as the running average of the ratio of the */
318 /* running average full-band voiced energy to the running average */
319 /* full-band unvoiced energy. SNR filter has gain of 63. */
320 r1 = (s->snr + s->fbve/(float) max(s->fbue, 1))*63/64.0f;
321 s->snr = (float) lrintf(r1);
322 snr2 = s->snr*s->fbue/max(s->lbue, 1);
323 /* Quantize SNR to SNRL according to VDCL thresholds. */
324 i1 = nvdcl - 1;
325 for (snrl = 0; snrl < i1; snrl++)
326 {
327 if (snr2 > vdcl[snrl])
328 break;
329 }
330 /* (Note: SNRL = NVDCL here) */
331 /* Linear discriminant voicing parameters: */
332 value[0] = s->maxmin;
333 value[1] = (float) lbe/max(s->lbve, 1);
334 value[2] = (float) zc;
335 value[3] = rc1;
336 value[4] = qs;
337 value[5] = ivrc[1];
338 value[6] = ar_b;
339 value[7] = ar_f;
340 /* Evaluation of linear discriminant function: */
341 s->voice[2][half] = vdc[snrl*10 + 9];
342 for (i = 0; i < 8; i++)
343 s->voice[2][half] += vdc[snrl*10 + i]*value[i];
344 /* Classify as voiced if discriminant > 0, otherwise unvoiced */
345 /* Voicing decision for current half-frame: 1 = Voiced; 0 = Unvoiced */
346 s->voibuf[3][half] = (s->voice[2][half] > 0.0f) ? 1 : 0;
347 /* Skip voicing decision smoothing in first half-frame: */
348 /* Give a value to VSTATE, so that trace statements below will print */
349 /* a consistent value from one call to the next when HALF .EQ. 1. */
350 /* The value of VSTATE is not used for any other purpose when this is */
351 /* true. */
352 vstate = -1;
353 if (half != 0)
354 {
355 /* Voicing decision smoothing rules (override of linear combination): */
356
357 /* Unvoiced half-frames: At least two in a row. */
358 /* -------------------- */
359
360 /* Voiced half-frames: At least two in a row in one frame. */
361 /* ------------------- Otherwise at least three in a row. */
362 /* (Due to the way transition frames are encoded) */
363
364 /* In many cases, the discriminant function determines how to smooth. */
365 /* In the following chart, the decisions marked with a * may be overridden. */
366
367 /* Voicing override of transitions at onsets: */
368 /* If a V/UV or UV/V voicing decision transition occurs within one-half */
369 /* frame of an onset bounding a voicing window, then the transition is */
370 /* moved to occur at the onset. */
371
372 /* P 1F */
373 /* ----- ----- */
374 /* 0 0 0 0 */
375 /* 0 0 0* 1 (If there is an onset there) */
376 /* 0 0 1* 0* (Based on 2F and discriminant distance) */
377 /* 0 0 1 1 */
378 /* 0 1* 0 0 (Always) */
379 /* 0 1* 0* 1 (Based on discriminant distance) */
380 /* 0* 1 1 0* (Based on past, 2F, and discriminant distance) */
381 /* 0 1* 1 1 (If there is an onset there) */
382 /* 1 0* 0 0 (If there is an onset there) */
383 /* 1 0 0 1 */
384 /* 1 0* 1* 0 (Based on discriminant distance) */
385 /* 1 0* 1 1 (Always) */
386 /* 1 1 0 0 */
387 /* 1 1 0* 1* (Based on 2F and discriminant distance) */
388 /* 1 1 1* 0 (If there is an onset there) */
389 /* 1 1 1 1 */
390
391 /* Determine if there is an onset transition between P and 1F. */
392 /* OT (Onset Transition) is true if there is an onset between */
393 /* P and 1F but not after 1F. */
394 ot = ((obound[0] & 2) != 0 || obound[1] == 1) && (obound[2] & 1) == 0;
395 /* Multi-way dispatch on voicing decision history: */
396 vstate = (s->voibuf[1][0] << 3) + (s->voibuf[1][1] << 2) + (s->voibuf[2][0] << 1) + s->voibuf[2][1];
397 switch (vstate + 1)
398 {
399 case 2:
400 if (ot && s->voibuf[3][0] == 1)
401 s->voibuf[2][0] = 1;
402 break;
403 case 3:
404 if (s->voibuf[3][0] == 0 || s->voice[1][0] < -s->voice[1][1])
405 s->voibuf[2][0] = 0;
406 else
407 s->voibuf[2][1] = 1;
408 break;
409 case 5:
410 s->voibuf[1][1] = 0;
411 break;
412 case 6:
413 if (s->voice[0][1] < -s->voice[1][0])
414 s->voibuf[1][1] = 0;
415 else
416 s->voibuf[2][0] = 1;
417 break;
418 case 7:
419 if (s->voibuf[0][0] == 1 || s->voibuf[3][0] == 1 || s->voice[1][1] > s->voice[0][0])
420 s->voibuf[2][1] = 1;
421 else
422 s->voibuf[1][0] = 1;
423 break;
424 case 8:
425 if (ot)
426 s->voibuf[1][1] = 0;
427 break;
428 case 9:
429 if (ot)
430 s->voibuf[1][1] = 1;
431 break;
432 case 11:
433 if (s->voice[1][9] < -s->voice[0][1])
434 s->voibuf[2][0] = 0;
435 else
436 s->voibuf[1][1] = 1;
437 break;
438 case 12:
439 s->voibuf[1][1] = 1;
440 break;
441 case 14:
442 if (s->voibuf[3][0] == 0 && s->voice[1][1] < -s->voice[1][0])
443 s->voibuf[2][1] = 0;
444 else
445 s->voibuf[2][0] = 1;
446 break;
447 case 15:
448 if (ot && s->voibuf[3][0] == 0)
449 s->voibuf[2][0] = 0;
450 break;
451 }
452 }
453 /* During unvoiced half-frames, update the low band and full band unvoiced*/
454 /* energy estimates (LBUE and FBUE) and also the zero crossing */
455 /* threshold (DITHER). (The input to the unvoiced energy filters is */
456 /* restricted to be less than 10dB above the previous inputs of the */
457 /* filters.) */
458 /* During voiced half-frames, update the low-pass (LBVE) and all-pass */
459 /* (FBVE) voiced energy estimates. */
460 if (s->voibuf[3][half] == 0)
461 {
462 r1 = (s->sfbue*63 + (min(fbe, s->ofbue*3) << 3))/64.0f;
463 s->sfbue = lrintf(r1);
464 s->fbue = s->sfbue/8;
465 s->ofbue = fbe;
466 r1 = (s->slbue*63 + (min(lbe, s->olbue*3) << 3))/64.0f;
467 s->slbue = lrintf(r1);
468 s->lbue = s->slbue/8;
469 s->olbue = lbe;
470 }
471 else
472 {
473 s->lbve = lrintf((s->lbve*63 + lbe)/64.0f);
474 s->fbve = lrintf((s->fbve*63 + fbe)/64.0f);
475 }
476 /* Set dither threshold to yield proper zero crossing rates in the */
477 /* presence of low frequency noise and low level signal input. */
478 /* NOTE: The divisor is a function of REF, the expected energies. */
479 /* Computing MIN */
480 /* Computing MAX */
481 r2 = sqrtf((float) (s->lbue*s->lbve))*64/3000;
482 r1 = max(r2, 1.0f);
483 s->dither = min(r1, 20.0f);
484 /* Voicing decisions are returned in VOIBUF. */
485 }
486 /*- End of function --------------------------------------------------------*/
487 /*- End of file ------------------------------------------------------------*/

Repositories maintained by Peter Meerwald, pmeerw@pmeerw.net.