comparison spandsp-0.0.6pre17/src/lpc10_voicing.c @ 4:26cd8f1ef0b1

import spandsp-0.0.6pre17
author Peter Meerwald <pmeerw@cosy.sbg.ac.at>
date Fri, 25 Jun 2010 15:50:58 +0200
/*
 * SpanDSP - a series of DSP components for telephony
 *
 * lpc10_voicing.c - LPC10 low bit rate speech codec.
 *
 * Written by Steve Underwood <steveu@coppice.org>
 *
 * Copyright (C) 2006 Steve Underwood
 *
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License version 2.1,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * This code is based on the U.S. Department of Defense reference
 * implementation of the LPC-10 2400 bps Voice Coder. They do not
 * exert copyright claims on their code, and it may be freely used.
 *
 * $Id: lpc10_voicing.c,v 1.18 2009/02/03 16:28:39 steveu Exp $
 */

#if defined(HAVE_CONFIG_H)
#include "config.h"
#endif

#include <stdlib.h>
#include <stdio.h>
#include <inttypes.h>
#include <memory.h>
#if defined(HAVE_TGMATH_H)
#include <tgmath.h>
#endif
#if defined(HAVE_MATH_H)
#include <math.h>
#endif
#include "floating_fudge.h"

#include "spandsp/telephony.h"
#include "spandsp/fast_convert.h"
#include "spandsp/dc_restore.h"
#include "spandsp/lpc10.h"
#include "spandsp/private/lpc10.h"

#include "lpc10_encdecs.h"

static void vparms(int32_t vwin[],
                   float *inbuf,
                   float *lpbuf,
                   const int32_t buflim[],
                   int32_t half,
                   float *dither,
                   int32_t *mintau,
                   int32_t *zc,
                   int32_t *lbe,
                   int32_t *fbe,
                   float *qs,
                   float *rc1,
                   float *ar_b,
                   float *ar_f)
{
    int32_t inbuf_offset;
    int32_t lpbuf_offset;
    int32_t vlen;
    int32_t stop;
    int32_t i;
    int32_t start;
    float r1;
    float r2;
    float e_pre;
    float ap_rms;
    float e_0;
    float oldsgn;
    float lp_rms;
    float e_b;
    float e_f;
    float r_b;
    float r_f;
    float e0ap;

    /* Calculate zero crossings (ZC) and several energy and correlation */
    /* measures on low band and full band speech. Each measure is taken */
    /* over either the first or the second half of the voicing window, */
    /* depending on the variable HALF. */
    lpbuf_offset = buflim[2];
    lpbuf -= lpbuf_offset;
    inbuf_offset = buflim[0];
    inbuf -= inbuf_offset;

    lp_rms = 0.0f;
    ap_rms = 0.0f;
    e_pre = 0.0f;
    e0ap = 0.0f;
    *rc1 = 0.0f;
    e_0 = 0.0f;
    e_b = 0.0f;
    e_f = 0.0f;
    r_f = 0.0f;
    r_b = 0.0f;
    *zc = 0;
    vlen = vwin[1] - vwin[0] + 1;
    start = vwin[0] + half*vlen/2 + 1;
    stop = start + vlen/2 - 1;

    /* I'll use the symbol HVL in the table below to represent the value */
    /* VLEN/2. Note that if VLEN is odd, then HVL should be rounded down, */
    /* i.e., HVL = (VLEN-1)/2. */

    /* HALF  START            STOP */

    /* 1     VWIN(1)+1        VWIN(1)+HVL */
    /* 2     VWIN(1)+HVL+1    VWIN(1)+2*HVL */
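    /* As a purely illustrative example: with VWIN(1) = 0 and an even window */
    /* length VLEN = 156, HVL = 78, so the first call (HALF = 0 in this C code) */
    /* measures samples 1..78 and the second call (HALF = 1) measures samples */
    /* 79..156, i.e. the two non-overlapping halves of the voicing window. */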
    oldsgn = r_sign(1.0f, inbuf[start - 1] - *dither);
    for (i = start; i <= stop; i++)
    {
        lp_rms += fabsf(lpbuf[i]);
        ap_rms += fabsf(inbuf[i]);
        e_pre += fabsf(inbuf[i] - inbuf[i - 1]);
        r1 = inbuf[i];
        e0ap += r1*r1;
        *rc1 += inbuf[i]*inbuf[i - 1];
        r1 = lpbuf[i];
        e_0 += r1*r1;
        r1 = lpbuf[i - *mintau];
        e_b += r1*r1;
        r1 = lpbuf[i + *mintau];
        e_f += r1*r1;
        r_f += lpbuf[i]*lpbuf[i + *mintau];
        r_b += lpbuf[i]*lpbuf[i - *mintau];
        r1 = inbuf[i] + *dither;
        if (r_sign(1.0f, r1) != oldsgn)
        {
            ++(*zc);
            oldsgn = -oldsgn;
        }
        *dither = -(*dither);
    }
    /* Normalized short-term autocovariance coefficient at unit sample delay */
    *rc1 /= max(e0ap, 1.0f);
    /* Ratio of the energy of the first difference signal (6 dB/oct preemphasis) */
    /* to the energy of the full band signal */
    /* Computing MAX */
    r1 = ap_rms*2.0f;
    *qs = e_pre/max(r1, 1.0f);
    /* aR_b is the product of the forward and reverse prediction gains, */
    /* looking backward in time (the causal case). */
    *ar_b = r_b/max(e_b, 1.0f)*(r_b/max(e_0, 1.0f));
    /* aR_f is the same as aR_b, but looking forward in time (non causal case). */
    *ar_f = r_f/max(e_f, 1.0f)*(r_f/max(e_0, 1.0f));
    /* Normalize ZC, LBE, and FBE to old fixed window length of 180. */
    /* (The fraction 90/VLEN has a range of 0.58 to 1) */
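    /* ZC was counted over VLEN/2 samples, so doubling it and scaling by */
    /* 90/VLEN expresses it as a count per 180 samples; LBE and FBE are */
    /* rescaled to the same 180 sample reference and limited to 32767. */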
    r2 = (float) (*zc << 1);
    *zc = lfastrintf(r2*(90.0f/vlen));
    r1 = lp_rms/4*(90.0f/vlen);
    *lbe = min(lfastrintf(r1), 32767);
    r1 = ap_rms/4*(90.0f/vlen);
    *fbe = min(lfastrintf(r1), 32767);
}
/*- End of function --------------------------------------------------------*/

/* Voicing detection makes voicing decisions for each half */
/* frame of input speech. Tentative voicing decisions are made two frames */
/* in the future (2F) for each half frame. These decisions are carried */
/* through one frame in the future (1F) to the present (P) frame where */
/* they are examined and smoothed, resulting in the final voicing */
/* decisions for each half frame. */

/* The voicing parameter (signal measurement) column vector (VALUE) */
/* is based on a rectangular window of speech samples determined by the */
/* window placement algorithm. The voicing parameter vector contains the */
/* AMDF windowed maximum-to-minimum ratio, the zero crossing rate, energy */
/* measures, reflection coefficients, and prediction gains. The voicing */
/* window is placed to avoid contamination of the voicing parameter vector */
/* with speech onsets. */

/* The input signal is then classified as unvoiced (including */
/* silence) or voiced. This decision is made by a linear discriminant */
/* function consisting of a dot product of the voicing decision */
/* coefficient (VDC) row vector with the measurement column vector */
/* (VALUE). The VDC array is two-dimensional; each row vector is optimized */
/* for a particular signal-to-noise ratio (SNR). So, before the dot */
/* product is performed, the SNR is estimated to select the appropriate */
/* VDC vector. */

/* The smoothing algorithm is a modified median smoother. The */
/* voicing discriminant function is used by the smoother to determine how */
/* strongly voiced or unvoiced a signal is. The smoothing is further */
/* modified if a speech onset and a voicing decision transition occur */
/* within one half frame. In this case, the voicing decision transition */
/* is extended to the speech onset. For transmission purposes, there are */
/* constraints on the duration and transition of voicing decisions. The */
/* smoother takes these constraints into account. */

/* Finally, the energy estimates are updated along with the dither */
/* threshold used to calculate the zero crossing rate (ZC). */
void lpc10_voicing(lpc10_encode_state_t *s,
                   int32_t vwin[],
                   float *inbuf,
                   float *lpbuf,
                   const int32_t buflim[],
                   int32_t half,
                   float *minamd,
                   float *maxamd,
                   int32_t *mintau,
                   float ivrc[],
                   int32_t obound[])
{
    static const float vdc[100] =
    {
        0.0f, 1714.0f, -110.0f, 334.0f, -4096.0f, -654.0f, 3752.0f, 3769.0f, 0.0f, 1181.0f,
        0.0f, 874.0f, -97.0f, 300.0f, -4096.0f, -1021.0f, 2451.0f, 2527.0f, 0.0f, -500.0f,
        0.0f, 510.0f, -70.0f, 250.0f, -4096.0f, -1270.0f, 2194.0f, 2491.0f, 0.0f, -1500.0f,
        0.0f, 500.0f, -10.0f, 200.0f, -4096.0f, -1300.0f, 2.0e3f, 2.0e3f, 0.0f, -2.0e3f,
        0.0f, 500.0f, 0.0f, 0.0f, -4096.0f, -1300.0f, 2.0e3f, 2.0e3f, 0.0f, -2500.0f,
        0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
        0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
        0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
        0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
        0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
    };
    static const int nvdcl = 5;
    static const float vdcl[10] =
    {
        600.0f, 450.0f, 300.0f, 200.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
    };
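    /* Each row of VDC holds 10 entries: columns 0-7 are the coefficients */
    /* applied to VALUE[0..7], column 8 is unused, and column 9 is the */
    /* constant term. Only the first NVDCL rows, and the first NVDCL - 1 */
    /* thresholds in VDCL, are actually used; the rest is zero padding. */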

    int32_t inbuf_offset;
    int32_t lpbuf_offset;
    int32_t i1;
    float r1;
    float r2;
    float ar_b;
    float ar_f;
    int32_t snrl;
    int32_t i;
    float value[9];
    int32_t zc;
    int ot;
    float qs;
    int32_t vstate;
    float rc1;
    int32_t fbe;
    int32_t lbe;
    float snr2;

#if (_MSC_VER >= 1400)
    __analysis_assume(half >= 0 && half < 2);
#endif
    inbuf_offset = 0;
    lpbuf_offset = 0;
    if (inbuf)
    {
        inbuf_offset = buflim[0];
        inbuf -= inbuf_offset;
    }
    if (lpbuf)
    {
        lpbuf_offset = buflim[2];
        lpbuf -= lpbuf_offset;
    }

    /* Voicing Decision Parameter vector (* denotes zero coefficient): */

    /*  *  MAXMIN */
    /*     LBE/LBVE */
    /*     ZC */
    /*     RC1 */
    /*     QS */
    /*     IVRC2 */
    /*     aR_B */
    /*     aR_F */
    /*  *  LOG(LBE/LBVE) */
    /* Define 2-D voicing decision coefficient vector according to the voicing */
    /* parameter order above. Each row (VDC vector) is optimized for a specific */
    /* SNR. The last element of the vector is the constant. */
    /*    E     ZC    RC1    Qs   IVRC2   aRb   aRf    c */

    /* The VOICE array contains the result of the linear discriminant function */
    /* (analog values). The VOIBUF array contains the hard-limited binary */
    /* voicing decisions. The VOICE and VOIBUF arrays, according to FORTRAN */
    /* memory allocation, are addressed as: */

    /*     (half-frame number, future-frame number) */

    /*    |   Past    |  Present  |  Future1  |  Future2  | */
    /*    | 1,0 | 2,0 | 1,1 | 2,1 | 1,2 | 2,2 | 1,3 | 2,3 |  --->  time */
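    /* In this C translation the indexing is [frame][half], with VOIBUF */
    /* frames 0 = past, 1 = present (P), 2 = future 1 (1F), 3 = future 2 (2F), */
    /* while VOICE keeps just three frames: 0 = P, 1 = 1F, and 2 = 2F (the */
    /* discriminant value computed below). */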

    /* Update linear discriminant function history each frame: */
    if (half == 0)
    {
        s->voice[0][0] = s->voice[1][0];
        s->voice[0][1] = s->voice[1][1];
        s->voice[1][0] = s->voice[2][0];
        s->voice[1][1] = s->voice[2][1];
        s->maxmin = *maxamd / max(*minamd, 1.0f);
    }
    /* Calculate voicing parameters twice per frame */
    vparms(vwin,
           &inbuf[inbuf_offset],
           &lpbuf[lpbuf_offset],
           buflim,
           half,
           &s->dither,
           mintau,
           &zc,
           &lbe,
           &fbe,
           &qs,
           &rc1,
           &ar_b,
           &ar_f);
    /* Estimate signal-to-noise ratio to select the appropriate VDC vector. */
    /* The SNR is estimated as the running average of the ratio of the */
    /* running average full-band voiced energy to the running average */
    /* full-band unvoiced energy. SNR filter has gain of 63. */
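    /* The "gain of 63" is the steady state of the recursion below: */
    /* SNR = (SNR + FBVE/FBUE)*63/64 settles at 63*FBVE/FBUE, i.e. a one */
    /* pole averager with coefficient 63/64 and a DC gain of 63. */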
    r1 = (s->snr + s->fbve/(float) max(s->fbue, 1))*63/64.0f;
    s->snr = (float) lfastrintf(r1);
    snr2 = s->snr*s->fbue/max(s->lbue, 1);
    /* Quantize SNR to SNRL according to VDCL thresholds. */
    i1 = nvdcl - 1;
    for (snrl = 0; snrl < i1; snrl++)
    {
        if (snr2 > vdcl[snrl])
            break;
    }
    /* (Note: if no threshold is exceeded, SNRL = NVDCL - 1 here) */
    /* Linear discriminant voicing parameters: */
    value[0] = s->maxmin;
    value[1] = (float) lbe/max(s->lbve, 1);
    value[2] = (float) zc;
    value[3] = rc1;
    value[4] = qs;
    value[5] = ivrc[1];
    value[6] = ar_b;
    value[7] = ar_f;
    /* Evaluation of linear discriminant function: */
    s->voice[2][half] = vdc[snrl*10 + 9];
    for (i = 0; i < 8; i++)
        s->voice[2][half] += vdc[snrl*10 + i]*value[i];
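    /* i.e. VOICE(2F) = c + sum of vdc[snrl*10 + i]*value[i] for i = 0..7, */
    /* using the VDC row selected by SNRL; a positive result means the */
    /* half-frame looks voiced. */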
    /* Classify as voiced if discriminant > 0, otherwise unvoiced */
    /* Voicing decision for current half-frame: 1 = Voiced; 0 = Unvoiced */
    s->voibuf[3][half] = (s->voice[2][half] > 0.0f) ? 1 : 0;
    /* Skip voicing decision smoothing in first half-frame: */
    /* Give a value to VSTATE, so that trace statements below will print */
    /* a consistent value from one call to the next when HALF .EQ. 1. */
    /* The value of VSTATE is not used for any other purpose when this is */
    /* true. */
    vstate = -1;
    if (half != 0)
    {
        /* Voicing decision smoothing rules (override of linear combination): */

        /* Unvoiced half-frames: At least two in a row. */
        /* -------------------- */

        /* Voiced half-frames: At least two in a row in one frame. */
        /* ------------------- Otherwise at least three in a row. */
        /* (Due to the way transition frames are encoded) */

        /* In many cases, the discriminant function determines how to smooth. */
        /* In the following chart, the decisions marked with a * may be overridden. */

        /* Voicing override of transitions at onsets: */
        /* If a V/UV or UV/V voicing decision transition occurs within one-half */
        /* frame of an onset bounding a voicing window, then the transition is */
        /* moved to occur at the onset. */

        /*    P      1F */
        /*  -----  ----- */
        /*  0   0  0   0 */
        /*  0   0  0*  1   (If there is an onset there) */
        /*  0   0  1*  0*  (Based on 2F and discriminant distance) */
        /*  0   0  1   1 */
        /*  0   1* 0   0   (Always) */
        /*  0   1* 0*  1   (Based on discriminant distance) */
        /*  0*  1  1   0*  (Based on past, 2F, and discriminant distance) */
        /*  0   1* 1   1   (If there is an onset there) */
        /*  1   0* 0   0   (If there is an onset there) */
        /*  1   0  0   1 */
        /*  1   0* 1*  0   (Based on discriminant distance) */
        /*  1   0* 1   1   (Always) */
        /*  1   1  0   0 */
        /*  1   1  0*  1*  (Based on 2F and discriminant distance) */
        /*  1   1  1*  0   (If there is an onset there) */
        /*  1   1  1   1 */

        /* Determine if there is an onset transition between P and 1F. */
        /* OT (Onset Transition) is true if there is an onset between */
        /* P and 1F but not after 1F. */
        ot = ((obound[0] & 2) != 0 || obound[1] == 1) && (obound[2] & 1) == 0;
        /* Multi-way dispatch on voicing decision history: */
        vstate = (s->voibuf[1][0] << 3) + (s->voibuf[1][1] << 2) + (s->voibuf[2][0] << 1) + s->voibuf[2][1];
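        /* VSTATE packs the four binary decisions (P half 1, P half 2, */
        /* 1F half 1, 1F half 2) into bits 3..0, so VSTATE + 1 runs from 1 to */
        /* 16 and the cases below match the rows of the chart above in order. */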
        switch (vstate + 1)
        {
        case 2:
            if (ot && s->voibuf[3][0] == 1)
                s->voibuf[2][0] = 1;
            break;
        case 3:
            if (s->voibuf[3][0] == 0 || s->voice[1][0] < -s->voice[1][1])
                s->voibuf[2][0] = 0;
            else
                s->voibuf[2][1] = 1;
            break;
        case 5:
            s->voibuf[1][1] = 0;
            break;
        case 6:
            if (s->voice[0][1] < -s->voice[1][0])
                s->voibuf[1][1] = 0;
            else
                s->voibuf[2][0] = 1;
            break;
        case 7:
            if (s->voibuf[0][0] == 1 || s->voibuf[3][0] == 1 || s->voice[1][1] > s->voice[0][0])
                s->voibuf[2][1] = 1;
            else
                s->voibuf[1][0] = 1;
            break;
        case 8:
            if (ot)
                s->voibuf[1][1] = 0;
            break;
        case 9:
            if (ot)
                s->voibuf[1][1] = 1;
            break;
        case 11:
            if (s->voice[1][0] < -s->voice[0][1])
                s->voibuf[2][0] = 0;
            else
                s->voibuf[1][1] = 1;
            break;
        case 12:
            s->voibuf[1][1] = 1;
            break;
        case 14:
            if (s->voibuf[3][0] == 0 && s->voice[1][1] < -s->voice[1][0])
                s->voibuf[2][1] = 0;
            else
                s->voibuf[2][0] = 1;
            break;
        case 15:
            if (ot && s->voibuf[3][0] == 0)
                s->voibuf[2][0] = 0;
            break;
        }
    }
    /* During unvoiced half-frames, update the low band and full band unvoiced */
    /* energy estimates (LBUE and FBUE) and also the zero crossing */
    /* threshold (DITHER). (The input to the unvoiced energy filters is */
    /* restricted to be less than 10dB above the previous inputs of the */
    /* filters.) */
    /* During voiced half-frames, update the low-pass (LBVE) and all-pass */
    /* (FBVE) voiced energy estimates. */
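    /* The factor of 3 in min(FBE, OFBUE*3) implements the 10dB limit noted */
    /* above (a factor of 3 on these magnitude sums is about 9.5dB), and the */
    /* << 3 on input with the /8 on output keep SFBUE/SLBUE as 63/64 leaky */
    /* averages held at 8 times the precision of FBUE/LBUE. */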
    if (s->voibuf[3][half] == 0)
    {
        r1 = (s->sfbue*63 + (min(fbe, s->ofbue*3) << 3))/64.0f;
        s->sfbue = lfastrintf(r1);
        s->fbue = s->sfbue/8;
        s->ofbue = fbe;
        r1 = (s->slbue*63 + (min(lbe, s->olbue*3) << 3))/64.0f;
        s->slbue = lfastrintf(r1);
        s->lbue = s->slbue/8;
        s->olbue = lbe;
    }
    else
    {
        s->lbve = lfastrintf((s->lbve*63 + lbe)/64.0f);
        s->fbve = lfastrintf((s->fbve*63 + fbe)/64.0f);
    }
    /* Set dither threshold to yield proper zero crossing rates in the */
    /* presence of low frequency noise and low level signal input. */
    /* NOTE: The divisor is a function of REF, the expected energies. */
    /* Computing MIN */
    /* Computing MAX */
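    /* i.e. DITHER = sqrt(LBUE*LBVE)*64/3000, limited to the range [1.0, 20.0]. */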
    r2 = sqrtf((float) (s->lbue*s->lbve))*64/3000;
    r1 = max(r2, 1.0f);
    s->dither = min(r1, 20.0f);
    /* Voicing decisions are returned in VOIBUF. */
}
/*- End of function --------------------------------------------------------*/
/*- End of file ------------------------------------------------------------*/
