Mercurial > hg > audiostuff
comparison spandsp-0.0.3/spandsp-0.0.3/src/lpc10_voicing.c @ 5:f762bf195c4b
import spandsp-0.0.3
author | Peter Meerwald <pmeerw@cosy.sbg.ac.at> |
---|---|
date | Fri, 25 Jun 2010 16:00:21 +0200 |
parents | |
children |
comparison: changeset 4:26cd8f1ef0b1 → 5:f762bf195c4b (no lines deleted, inserted, or replaced in this hunk)
---|---|
1 /* | |
2 * SpanDSP - a series of DSP components for telephony | |
3 * | |
4 * lpc10_voicing.c - LPC10 low bit rate speech codec. | |
5 * | |
6 * Written by Steve Underwood <steveu@coppice.org> | |
7 * | |
8 * Copyright (C) 2006 Steve Underwood | |
9 * | |
10 * All rights reserved. | |
11 * | |
12 * This program is free software; you can redistribute it and/or modify | |
13 * it under the terms of the GNU General Public License version 2, as | |
14 * published by the Free Software Foundation. | |
15 * | |
16 * This program is distributed in the hope that it will be useful, | |
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
19 * GNU General Public License for more details. | |
20 * | |
21 * You should have received a copy of the GNU General Public License | |
22 * along with this program; if not, write to the Free Software | |
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
24 * | |
25 * This code is based on the U.S. Department of Defense reference | |
26 * implementation of the LPC-10 2400 bps Voice Coder. They do not | |
27 * exert copyright claims on their code, and it may be freely used. | |
28 * | |
29 * $Id: lpc10_voicing.c,v 1.7 2006/11/30 15:41:47 steveu Exp $ | |
30 */ | |
31 | |
32 #ifdef HAVE_CONFIG_H | |
33 #include <config.h> | |
34 #endif | |
35 | |
36 #include <stdlib.h> | |
37 #include <stdio.h> | |
38 #include <inttypes.h> | |
39 #include <memory.h> | |
40 #if defined(HAVE_TGMATH_H) | |
41 #include <tgmath.h> | |
42 #endif | |
43 #if defined(HAVE_MATH_H) | |
44 #include <math.h> | |
45 #endif | |
46 | |
47 #include "spandsp/telephony.h" | |
48 #include "spandsp/dc_restore.h" | |
49 #include "spandsp/lpc10.h" | |
50 | |
51 #include "lpc10_encdecs.h" | |
52 | |
/* Compute voicing parameters over one half of the current voicing window.
 *
 * The measures produced here feed the linear discriminant classifier in
 * lpc10_voicing(): zero crossing rate, low/full band energies, the
 * first-order normalized autocovariance, a preemphasis energy ratio, and
 * forward/backward prediction gain products at the AMDF pitch lag.
 *
 * vwin   - inclusive start/end sample indices of the voicing window
 * inbuf  - full band speech; indexed in the coordinate system
 *          buflim[0]..buflim[1] after the offset adjustment below
 * lpbuf  - low band (low-pass filtered) speech; indexed buflim[2]..buflim[3]
 * buflim - index limits for inbuf (elements 0,1) and lpbuf (elements 2,3)
 * half   - 0 => first half of the window, 1 => second half
 * dither - in/out zero crossing dither threshold; its sign is flipped on
 *          every sample, so the caller's value is updated
 * mintau - pitch lag at which the AMDF is minimum
 * zc     - out: zero crossing count, normalized to a 180 sample window
 * lbe    - out: low band energy, normalized and clamped to 32767
 * fbe    - out: full band energy, normalized and clamped to 32767
 * qs     - out: ratio of first-difference (preemphasized) energy to full
 *          band energy
 * rc1    - out: normalized short-term autocovariance at unit sample delay
 * ar_b   - out: product of forward and reverse prediction gains, looking
 *          backward in time (causal)
 * ar_f   - out: as ar_b, but looking forward in time (non-causal)
 */
static void vparms(int32_t vwin[],
                   float *inbuf,
                   float *lpbuf,
                   const int32_t buflim[],
                   int32_t half,
                   float *dither,
                   int32_t *mintau,
                   int32_t *zc,
                   int32_t *lbe,
                   int32_t *fbe,
                   float *qs,
                   float *rc1,
                   float *ar_b,
                   float *ar_f)
{
    int32_t inbuf_offset;
    int32_t lpbuf_offset;
    int32_t vlen;
    int32_t stop;
    int32_t i;
    int32_t start;
    float r1;
    float r2;
    float e_pre;
    float ap_rms;
    float e_0;
    float oldsgn;
    float lp_rms;
    float e_b;
    float e_f;
    float r_b;
    float r_f;
    float e0ap;

    /* Calculate zero crossings (ZC) and several energy and correlation */
    /* measures on low band and full band speech. Each measure is taken */
    /* over either the first or the second half of the voicing window, */
    /* depending on the variable HALF. */
    /* Rebase the buffer pointers so they can be indexed directly with the */
    /* FORTRAN-style sample indices held in vwin[]/buflim[]. */
    lpbuf_offset = buflim[2];
    lpbuf -= lpbuf_offset;
    inbuf_offset = buflim[0];
    inbuf -= inbuf_offset;

    lp_rms = 0.0f;
    ap_rms = 0.0f;
    e_pre = 0.0f;
    e0ap = 0.0f;
    *rc1 = 0.0f;
    e_0 = 0.0f;
    e_b = 0.0f;
    e_f = 0.0f;
    r_f = 0.0f;
    r_b = 0.0f;
    *zc = 0;
    vlen = vwin[1] - vwin[0] + 1;
    start = vwin[0] + half*vlen/2 + 1;
    stop = start + vlen/2 - 1;

    /* I'll use the symbol HVL in the table below to represent the value */
    /* VLEN/2. Note that if VLEN is odd, then HVL should be rounded down, */
    /* i.e., HVL = (VLEN-1)/2. */

    /* HALF  START          STOP */

    /* 1     VWIN(1)+1      VWIN(1)+HVL */
    /* 2     VWIN(1)+HVL+1  VWIN(1)+2*HVL */
    oldsgn = r_sign(1.0f, inbuf[start - 1] - *dither);
    for (i = start;  i <= stop;  i++)
    {
        lp_rms += fabsf(lpbuf[i]);
        ap_rms += fabsf(inbuf[i]);
        e_pre += fabsf(inbuf[i] - inbuf[i - 1]);
        r1 = inbuf[i];
        e0ap += r1*r1;
        *rc1 += inbuf[i]*inbuf[i - 1];
        r1 = lpbuf[i];
        e_0 += r1*r1;
        r1 = lpbuf[i - *mintau];
        e_b += r1*r1;
        r1 = lpbuf[i + *mintau];
        e_f += r1*r1;
        r_f += lpbuf[i]*lpbuf[i + *mintau];
        r_b += lpbuf[i]*lpbuf[i - *mintau];
        /* Count a zero crossing whenever the dithered signal changes sign. */
        r1 = inbuf[i] + *dither;
        if (r_sign(1.0f, r1) != oldsgn)
        {
            ++(*zc);
            oldsgn = -oldsgn;
        }
        /* Alternate the dither sign every sample. */
        *dither = -(*dither);
    }
    /* Normalized short-term autocovariance coefficient at unit sample delay */
    *rc1 /= max(e0ap, 1.0f);
    /* Ratio of the energy of the first difference signal (6 dB/oct preemphasis)*/
    /* to the energy of the full band signal */
    /* Computing MAX */
    r1 = ap_rms*2.0f;
    *qs = e_pre/max(r1, 1.0f);
    /* aR_b is the product of the forward and reverse prediction gains, */
    /* looking backward in time (the causal case). */
    *ar_b = r_b/max(e_b, 1.0f)*(r_b/max(e_0, 1.0f));
    /* aR_f is the same as aR_b, but looking forward in time (non causal case).*/
    *ar_f = r_f/max(e_f, 1.0f)*(r_f/max(e_0, 1.0f));
    /* Normalize ZC, LBE, and FBE to old fixed window length of 180. */
    /* (The fraction 90/VLEN has a range of 0.58 to 1) */
    r2 = (float) (*zc << 1);
    *zc = lrintf(r2*(90.0f/vlen));
    r1 = lp_rms/4*(90.0f/vlen);
    *lbe = min(lrintf(r1), 32767);
    r1 = ap_rms/4*(90.0f/vlen);
    *fbe = min(lrintf(r1), 32767);
}
165 /*- End of function --------------------------------------------------------*/ | |
166 | |
167 /* Voicing detection makes voicing decisions for each half */ | |
168 /* frame of input speech. Tentative voicing decisions are made two frames*/ | |
169 /* in the future (2F) for each half frame. These decisions are carried */ | |
170 /* through one frame in the future (1F) to the present (P) frame where */ | |
171 /* they are examined and smoothed, resulting in the final voicing */ | |
172 /* decisions for each half frame. */ | |
173 | |
174 /* The voicing parameter (signal measurement) column vector (VALUE) */ | |
175 /* is based on a rectangular window of speech samples determined by the */ | |
176 /* window placement algorithm. The voicing parameter vector contains the*/ | |
177 /* AMDF windowed maximum-to-minimum ratio, the zero crossing rate, energy*/ | |
178 /* measures, reflection coefficients, and prediction gains. The voicing */ | |
179 /* window is placed to avoid contamination of the voicing parameter vector*/ | |
180 /* with speech onsets. */ | |
181 | |
182 /* The input signal is then classified as unvoiced (including */ | |
183 /* silence) or voiced. This decision is made by a linear discriminant */ | |
184 /* function consisting of a dot product of the voicing decision */ | |
185 /* coefficient (VDC) row vector with the measurement column vector */ | |
186 /* (VALUE). The VDC vector is 2-dimensional, each row vector is optimized*/ | |
187 /* for a particular signal-to-noise ratio (SNR). So, before the dot */ | |
188 /* product is performed, the SNR is estimated to select the appropriate */ | |
189 /* VDC vector. */ | |
190 | |
191 /* The smoothing algorithm is a modified median smoother. The */ | |
192 /* voicing discriminant function is used by the smoother to determine how*/ | |
193 /* strongly voiced or unvoiced a signal is. The smoothing is further */ | |
194 /* modified if a speech onset and a voicing decision transition occur */ | |
195 /* within one half frame. In this case, the voicing decision transition */ | |
196 /* is extended to the speech onset. For transmission purposes, there are*/ | |
197 /* constraints on the duration and transition of voicing decisions. The */ | |
198 /* smoother takes these constraints into account. */ | |
199 | |
200 /* Finally, the energy estimates are updated along with the dither */ | |
201 /* threshold used to calculate the zero crossing rate (ZC). */ | |
202 | |
203 void lpc10_voicing(lpc10_encode_state_t *s, | |
204 int32_t vwin[], | |
205 float *inbuf, | |
206 float *lpbuf, | |
207 const int32_t buflim[], | |
208 int32_t half, | |
209 float *minamd, | |
210 float *maxamd, | |
211 int32_t *mintau, | |
212 float ivrc[], | |
213 int32_t obound[]) | |
214 { | |
215 static const float vdc[100] = | |
216 { | |
217 0.0f, 1714.0f, -110.0f, 334.0f, -4096.0f, -654.0f, 3752.0f, 3769.0f, 0.0f, 1181.0f, | |
218 0.0f, 874.0f, -97.0f, 300.0f, -4096.0f, -1021.0f, 2451.0f, 2527.0f, 0.0f, -500.0f, | |
219 0.0f, 510.0f, -70.0f, 250.0f, -4096.0f, -1270.0f, 2194.0f, 2491.0f, 0.0f, -1500.0f, | |
220 0.0f, 500.0f, -10.0f, 200.0f, -4096.0f, -1300.0f, 2.0e3f, 2.0e3f, 0.0f, -2.0e3f, | |
221 0.0f, 500.0f, 0.0f, 0.0f, -4096.0f, -1300.0f, 2.0e3f, 2.0e3f, 0.0f, -2500.0f, | |
222 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, | |
223 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, | |
224 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, | |
225 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, | |
226 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f | |
227 }; | |
228 static const int nvdcl = 5; | |
229 static const float vdcl[10] = | |
230 { | |
231 600.0f, 450.0f, 300.0f, 200.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f | |
232 }; | |
233 | |
234 int32_t inbuf_offset; | |
235 int32_t lpbuf_offset; | |
236 int32_t i1; | |
237 float r1; | |
238 float r2; | |
239 float ar_b; | |
240 float ar_f; | |
241 int32_t snrl; | |
242 int32_t i; | |
243 float value[9]; | |
244 int32_t zc; | |
245 int ot; | |
246 float qs; | |
247 int32_t vstate; | |
248 float rc1; | |
249 int32_t fbe; | |
250 int32_t lbe; | |
251 float snr2; | |
252 | |
253 inbuf_offset = 0; | |
254 lpbuf_offset = 0; | |
255 if (inbuf) | |
256 { | |
257 inbuf_offset = buflim[0]; | |
258 inbuf -= inbuf_offset; | |
259 } | |
260 if (lpbuf) | |
261 { | |
262 lpbuf_offset = buflim[2]; | |
263 lpbuf -= lpbuf_offset; | |
264 } | |
265 | |
266 /* Voicing Decision Parameter vector (* denotes zero coefficient): */ | |
267 | |
268 /* * MAXMIN */ | |
269 /* LBE/LBVE */ | |
270 /* ZC */ | |
271 /* RC1 */ | |
272 /* QS */ | |
273 /* IVRC2 */ | |
274 /* aR_B */ | |
275 /* aR_F */ | |
276 /* * LOG(LBE/LBVE) */ | |
277 /* Define 2-D voicing decision coefficient vector according to the voicing */ | |
278 /* parameter order above. Each row (VDC vector) is optimized for a specific */ | |
279 /* SNR. The last element of the vector is the constant. */ | |
280 /* E ZC RC1 Qs IVRC2 aRb aRf c */ | |
281 | |
282 /* The VOICE array contains the result of the linear discriminant function*/ | |
283 /* (analog values). The VOIBUF array contains the hard-limited binary */ | |
284 /* voicing decisions. The VOICE and VOIBUF arrays, according to FORTRAN */ | |
285 /* memory allocation, are addressed as: */ | |
286 | |
287 /* (half-frame number, future-frame number) */ | |
288 | |
289 /* | Past | Present | Future1 | Future2 | */ | |
290 /* | 1,0 | 2,0 | 1,1 | 2,1 | 1,2 | 2,2 | 1,3 | 2,3 | ---> time */ | |
291 | |
292 /* Update linear discriminant function history each frame: */ | |
293 if (half == 0) | |
294 { | |
295 s->voice[0][0] = s->voice[1][0]; | |
296 s->voice[0][1] = s->voice[1][1]; | |
297 s->voice[1][0] = s->voice[2][0]; | |
298 s->voice[1][1] = s->voice[2][1]; | |
299 s->maxmin = *maxamd / max(*minamd, 1.0f); | |
300 } | |
301 /* Calculate voicing parameters twice per frame */ | |
302 vparms(vwin, | |
303 &inbuf[inbuf_offset], | |
304 &lpbuf[lpbuf_offset], | |
305 buflim, | |
306 half, | |
307 &s->dither, | |
308 mintau, | |
309 &zc, | |
310 &lbe, | |
311 &fbe, | |
312 &qs, | |
313 &rc1, | |
314 &ar_b, | |
315 &ar_f); | |
316 /* Estimate signal-to-noise ratio to select the appropriate VDC vector. */ | |
317 /* The SNR is estimated as the running average of the ratio of the */ | |
318 /* running average full-band voiced energy to the running average */ | |
319 /* full-band unvoiced energy. SNR filter has gain of 63. */ | |
320 r1 = (s->snr + s->fbve/(float) max(s->fbue, 1))*63/64.0f; | |
321 s->snr = (float) lrintf(r1); | |
322 snr2 = s->snr*s->fbue/max(s->lbue, 1); | |
323 /* Quantize SNR to SNRL according to VDCL thresholds. */ | |
324 i1 = nvdcl - 1; | |
325 for (snrl = 0; snrl < i1; snrl++) | |
326 { | |
327 if (snr2 > vdcl[snrl]) | |
328 break; | |
329 } | |
330 /* (Note: SNRL = NVDCL here) */ | |
331 /* Linear discriminant voicing parameters: */ | |
332 value[0] = s->maxmin; | |
333 value[1] = (float) lbe/max(s->lbve, 1); | |
334 value[2] = (float) zc; | |
335 value[3] = rc1; | |
336 value[4] = qs; | |
337 value[5] = ivrc[1]; | |
338 value[6] = ar_b; | |
339 value[7] = ar_f; | |
340 /* Evaluation of linear discriminant function: */ | |
341 s->voice[2][half] = vdc[snrl*10 + 9]; | |
342 for (i = 0; i < 8; i++) | |
343 s->voice[2][half] += vdc[snrl*10 + i]*value[i]; | |
344 /* Classify as voiced if discriminant > 0, otherwise unvoiced */ | |
345 /* Voicing decision for current half-frame: 1 = Voiced; 0 = Unvoiced */ | |
346 s->voibuf[3][half] = (s->voice[2][half] > 0.0f) ? 1 : 0; | |
347 /* Skip voicing decision smoothing in first half-frame: */ | |
348 /* Give a value to VSTATE, so that trace statements below will print */ | |
349 /* a consistent value from one call to the next when HALF .EQ. 1. */ | |
350 /* The value of VSTATE is not used for any other purpose when this is */ | |
351 /* true. */ | |
352 vstate = -1; | |
353 if (half != 0) | |
354 { | |
355 /* Voicing decision smoothing rules (override of linear combination): */ | |
356 | |
357 /* Unvoiced half-frames: At least two in a row. */ | |
358 /* -------------------- */ | |
359 | |
360 /* Voiced half-frames: At least two in a row in one frame. */ | |
361 /* ------------------- Otherwise at least three in a row. */ | |
362 /* (Due to the way transition frames are encoded) */ | |
363 | |
364 /* In many cases, the discriminant function determines how to smooth. */ | |
365 /* In the following chart, the decisions marked with a * may be overridden. */ | |
366 | |
367 /* Voicing override of transitions at onsets: */ | |
368 /* If a V/UV or UV/V voicing decision transition occurs within one-half */ | |
369 /* frame of an onset bounding a voicing window, then the transition is */ | |
370 /* moved to occur at the onset. */ | |
371 | |
372 /* P 1F */ | |
373 /* ----- ----- */ | |
374 /* 0 0 0 0 */ | |
375 /* 0 0 0* 1 (If there is an onset there) */ | |
376 /* 0 0 1* 0* (Based on 2F and discriminant distance) */ | |
377 /* 0 0 1 1 */ | |
378 /* 0 1* 0 0 (Always) */ | |
379 /* 0 1* 0* 1 (Based on discriminant distance) */ | |
380 /* 0* 1 1 0* (Based on past, 2F, and discriminant distance) */ | |
381 /* 0 1* 1 1 (If there is an onset there) */ | |
382 /* 1 0* 0 0 (If there is an onset there) */ | |
383 /* 1 0 0 1 */ | |
384 /* 1 0* 1* 0 (Based on discriminant distance) */ | |
385 /* 1 0* 1 1 (Always) */ | |
386 /* 1 1 0 0 */ | |
387 /* 1 1 0* 1* (Based on 2F and discriminant distance) */ | |
388 /* 1 1 1* 0 (If there is an onset there) */ | |
389 /* 1 1 1 1 */ | |
390 | |
391 /* Determine if there is an onset transition between P and 1F. */ | |
392 /* OT (Onset Transition) is true if there is an onset between */ | |
393 /* P and 1F but not after 1F. */ | |
394 ot = ((obound[0] & 2) != 0 || obound[1] == 1) && (obound[2] & 1) == 0; | |
395 /* Multi-way dispatch on voicing decision history: */ | |
396 vstate = (s->voibuf[1][0] << 3) + (s->voibuf[1][1] << 2) + (s->voibuf[2][0] << 1) + s->voibuf[2][1]; | |
397 switch (vstate + 1) | |
398 { | |
399 case 2: | |
400 if (ot && s->voibuf[3][0] == 1) | |
401 s->voibuf[2][0] = 1; | |
402 break; | |
403 case 3: | |
404 if (s->voibuf[3][0] == 0 || s->voice[1][0] < -s->voice[1][1]) | |
405 s->voibuf[2][0] = 0; | |
406 else | |
407 s->voibuf[2][1] = 1; | |
408 break; | |
409 case 5: | |
410 s->voibuf[1][1] = 0; | |
411 break; | |
412 case 6: | |
413 if (s->voice[0][1] < -s->voice[1][0]) | |
414 s->voibuf[1][1] = 0; | |
415 else | |
416 s->voibuf[2][0] = 1; | |
417 break; | |
418 case 7: | |
419 if (s->voibuf[0][0] == 1 || s->voibuf[3][0] == 1 || s->voice[1][1] > s->voice[0][0]) | |
420 s->voibuf[2][1] = 1; | |
421 else | |
422 s->voibuf[1][0] = 1; | |
423 break; | |
424 case 8: | |
425 if (ot) | |
426 s->voibuf[1][1] = 0; | |
427 break; | |
428 case 9: | |
429 if (ot) | |
430 s->voibuf[1][1] = 1; | |
431 break; | |
432 case 11: | |
433 if (s->voice[1][9] < -s->voice[0][1]) | |
434 s->voibuf[2][0] = 0; | |
435 else | |
436 s->voibuf[1][1] = 1; | |
437 break; | |
438 case 12: | |
439 s->voibuf[1][1] = 1; | |
440 break; | |
441 case 14: | |
442 if (s->voibuf[3][0] == 0 && s->voice[1][1] < -s->voice[1][0]) | |
443 s->voibuf[2][1] = 0; | |
444 else | |
445 s->voibuf[2][0] = 1; | |
446 break; | |
447 case 15: | |
448 if (ot && s->voibuf[3][0] == 0) | |
449 s->voibuf[2][0] = 0; | |
450 break; | |
451 } | |
452 } | |
453 /* During unvoiced half-frames, update the low band and full band unvoiced*/ | |
454 /* energy estimates (LBUE and FBUE) and also the zero crossing */ | |
455 /* threshold (DITHER). (The input to the unvoiced energy filters is */ | |
456 /* restricted to be less than 10dB above the previous inputs of the */ | |
457 /* filters.) */ | |
458 /* During voiced half-frames, update the low-pass (LBVE) and all-pass */ | |
459 /* (FBVE) voiced energy estimates. */ | |
460 if (s->voibuf[3][half] == 0) | |
461 { | |
462 r1 = (s->sfbue*63 + (min(fbe, s->ofbue*3) << 3))/64.0f; | |
463 s->sfbue = lrintf(r1); | |
464 s->fbue = s->sfbue/8; | |
465 s->ofbue = fbe; | |
466 r1 = (s->slbue*63 + (min(lbe, s->olbue*3) << 3))/64.0f; | |
467 s->slbue = lrintf(r1); | |
468 s->lbue = s->slbue/8; | |
469 s->olbue = lbe; | |
470 } | |
471 else | |
472 { | |
473 s->lbve = lrintf((s->lbve*63 + lbe)/64.0f); | |
474 s->fbve = lrintf((s->fbve*63 + fbe)/64.0f); | |
475 } | |
476 /* Set dither threshold to yield proper zero crossing rates in the */ | |
477 /* presence of low frequency noise and low level signal input. */ | |
478 /* NOTE: The divisor is a function of REF, the expected energies. */ | |
479 /* Computing MIN */ | |
480 /* Computing MAX */ | |
481 r2 = sqrtf((float) (s->lbue*s->lbve))*64/3000; | |
482 r1 = max(r2, 1.0f); | |
483 s->dither = min(r1, 20.0f); | |
484 /* Voicing decisions are returned in VOIBUF. */ | |
485 } | |
486 /*- End of function --------------------------------------------------------*/ | |
487 /*- End of file ------------------------------------------------------------*/ |