Mercurial > hg > audiostuff
view spandsp-0.0.6pre17/src/echo.c @ 4:26cd8f1ef0b1
import spandsp-0.0.6pre17
author | Peter Meerwald <pmeerw@cosy.sbg.ac.at> |
---|---|
date | Fri, 25 Jun 2010 15:50:58 +0200 |
parents | |
children |
line wrap: on
line source
/* * SpanDSP - a series of DSP components for telephony * * echo.c - An echo cancellor, suitable for electrical and acoustic * cancellation. This code does not currently comply with * any relevant standards (e.g. G.164/5/7/8). One day.... * * Written by Steve Underwood <steveu@coppice.org> * * Copyright (C) 2001, 2003 Steve Underwood * * Based on a bit from here, a bit from there, eye of toad, * ear of bat, etc - plus, of course, my own 2 cents. * * All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License version 2.1, * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * $Id: echo.c,v 1.33 2009/09/22 13:11:04 steveu Exp $ */ /*! \file */ /* TODO: Finish the echo suppressor option, however nasty suppression may be. Add an option to reintroduce side tone at -24dB under appropriate conditions. Improve double talk detector (iterative!) */ /* We need to differentiate between transmitted energy which will train the echo canceller well (voice, white noise, and other broadband sources) and energy which will train it badly (supervisory tones, DTMF, whistles, and other narrowband sources). There are many ways this might be done. This canceller uses a method based on the autocorrelation qualities of the transmitted signal. A rather peaky autocorrelation function is a clear sign of a narrowband signal. We only need perform the autocorrelation at well spaced intervals, so the compute load is not too great. Multiple successive autocorrelation functions with a similar peaky shape are a clear indication of a stationary narrowband signal. Using TKEO, it should be possible to greatly reduce the compute requirement for narrowband detection. */ /* The FIR taps must be adapted as 32 bit values, to get the necessary finesse in the adaption process. However, they are applied as 16 bit values (bits 30-15 of the 32 bit values) in the FIR. For the working 16 bit values, we need 4 sets. 3 of the 16 bit sets are used on a rotating basis. Normally the canceller steps round these 3 sets at regular intervals. Any time we detect double talk, we can go back to the set from two steps ago with reasonable assurance it is a well adapted set. We cannot just go back one step, as we may have rotated the sets just before double talk or tone was detected, and that set may already be somewhat corrupted. When narrowband energy is detected we need to continue adapting to it, to echo cancel it. However, the adaption will almost certainly be going astray. Broadband (or even complex sequences of narrowband) energy will normally lead to a well trained cancellor, with taps matching the impulse response of the channel. For stationary narrowband energy, there is usually has an infinite number of alternative tap sets which will cancel it well. A previously well trained set of taps will tend to drift amongst the alternatives. When broadband energy resumes, the taps may be a total mismatch for the signal, and could even amplify rather than attenuate the echo. The solution is to use a fourth set of 16 bit taps. When we first detect the narrowband energy we save the oldest of the group of three sets, but do not change back to an older set. We let the canceller cancel, and it adaption drift while the narrowband energy is present. When we detect the narrowband energy has ceased, we switch to using the fourth set of taps which was saved. When we revert to an older set of taps, we must replace both the 16 bit and 32 bit working tap sets. The saved 16 bit values are good enough to also be used as a replacement for the 32 bit values. We loose the fractions, but they should soon settle down in a reasonable way. */ #if defined(HAVE_CONFIG_H) #include "config.h" #endif #include <inttypes.h> #include <stdlib.h> #if defined(HAVE_TGMATH_H) #include <tgmath.h> #endif #if defined(HAVE_MATH_H) #include <math.h> #endif #include "floating_fudge.h" #include <string.h> #include <stdio.h> #include "spandsp/telephony.h" #include "spandsp/fast_convert.h" #include "spandsp/logging.h" #include "spandsp/saturated.h" #include "spandsp/dc_restore.h" #include "spandsp/bit_operations.h" #include "spandsp/echo.h" #include "spandsp/private/echo.h" #if !defined(NULL) #define NULL (void *) 0 #endif #if !defined(FALSE) #define FALSE 0 #endif #if !defined(TRUE) #define TRUE (!FALSE) #endif #define NONUPDATE_DWELL_TIME 600 /* 600 samples, or 75ms */ #define MIN_TX_POWER_FOR_ADAPTION 64*64 #define MIN_RX_POWER_FOR_ADAPTION 64*64 static int narrowband_detect(echo_can_state_t *ec) { int k; int i; float temp; float scale; float sf[128]; float f_acf[128]; int32_t acf[28]; int score; int len = 32; int alen = 9; k = ec->curr_pos; for (i = 0; i < len; i++) { sf[i] = ec->fir_state.history[k++]; if (k >= 256) k = 0; } for (k = 0; k < alen; k++) { temp = 0; for (i = k; i < len; i++) temp += sf[i]*sf[i - k]; f_acf[k] = temp; } scale = 0x1FFFFFFF/f_acf[0]; for (k = 0; k < alen; k++) acf[k] = (int32_t) (f_acf[k]*scale); score = 0; for (i = 0; i < 9; i++) { if (ec->last_acf[i] >= 0 && acf[i] >= 0) { if ((ec->last_acf[i] >> 1) < acf[i] && acf[i] < (ec->last_acf[i] << 1)) score++; } else if (ec->last_acf[i] < 0 && acf[i] < 0) { if ((ec->last_acf[i] >> 1) > acf[i] && acf[i] > (ec->last_acf[i] << 1)) score++; } } memcpy(ec->last_acf, acf, alen*sizeof(ec->last_acf[0])); return score; } static __inline__ void lms_adapt(echo_can_state_t *ec, int factor) { int i; #if 0 mmx_t *mmx_taps; mmx_t *mmx_coeffs; mmx_t *mmx_hist; mmx_t mmx; mmx.w[0] = mmx.w[1] = mmx.w[2] = mmx.w[3] = factor; mmx_hist = (mmx_t *) &fir->history[fir->curr_pos]; mmx_taps = (mmx_t *) &fir->taps; mmx_coeffs = (mmx_t *) fir->coeffs; i = fir->taps; movq_m2r(mmx, mm0); while (i > 0) { movq_m2r(mmx_hist[0], mm1); movq_m2r(mmx_taps[0], mm0); movq_m2r(mmx_taps[1], mm1); movq_r2r(mm1, mm2); pmulhw(mm0, mm1); pmullw(mm0, mm2); pmaddwd_r2r(mm1, mm0); pmaddwd_r2r(mm3, mm2); paddd_r2r(mm0, mm4); paddd_r2r(mm2, mm4); movq_r2m(mm0, mmx_taps[0]); movq_r2m(mm1, mmx_taps[0]); movq_r2m(mm2, mmx_coeffs[0]); mmx_taps += 2; mmx_coeffs += 1; mmx_hist += 1; i -= 4; ) emms(); #elif 0 /* Update the FIR taps */ for (i = ec->taps - 1; i >= 0; i--) { /* Leak to avoid the coefficients drifting beyond the ability of the adaption process to bring them back under control. */ ec->fir_taps32[i] -= (ec->fir_taps32[i] >> 23); ec->fir_taps32[i] += (ec->fir_state.history[i + ec->curr_pos]*factor); ec->latest_correction = (ec->fir_state.history[i + ec->curr_pos]*factor); ec->fir_taps16[ec->tap_set][i] = ec->fir_taps32[i] >> 15; } #else int offset1; int offset2; /* Update the FIR taps */ offset2 = ec->curr_pos; offset1 = ec->taps - offset2; for (i = ec->taps - 1; i >= offset1; i--) { ec->fir_taps32[i] += (ec->fir_state.history[i - offset1]*factor); ec->fir_taps16[ec->tap_set][i] = (int16_t) (ec->fir_taps32[i] >> 15); } for ( ; i >= 0; i--) { ec->fir_taps32[i] += (ec->fir_state.history[i + offset2]*factor); ec->fir_taps16[ec->tap_set][i] = (int16_t) (ec->fir_taps32[i] >> 15); } #endif } /*- End of function --------------------------------------------------------*/ SPAN_DECLARE(echo_can_state_t *) echo_can_init(int len, int adaption_mode) { echo_can_state_t *ec; int i; int j; if ((ec = (echo_can_state_t *) malloc(sizeof(*ec))) == NULL) return NULL; memset(ec, 0, sizeof(*ec)); ec->taps = len; ec->curr_pos = ec->taps - 1; ec->tap_mask = ec->taps - 1; if ((ec->fir_taps32 = (int32_t *) malloc(ec->taps*sizeof(int32_t))) == NULL) { free(ec); return NULL; } memset(ec->fir_taps32, 0, ec->taps*sizeof(int32_t)); for (i = 0; i < 4; i++) { if ((ec->fir_taps16[i] = (int16_t *) malloc(ec->taps*sizeof(int16_t))) == NULL) { for (j = 0; j < i; j++) free(ec->fir_taps16[j]); free(ec->fir_taps32); free(ec); return NULL; } memset(ec->fir_taps16[i], 0, ec->taps*sizeof(int16_t)); } fir16_create(&ec->fir_state, ec->fir_taps16[0], ec->taps); ec->rx_power_threshold = 10000000; ec->geigel_max = 0; ec->geigel_lag = 0; ec->dtd_onset = FALSE; ec->tap_set = 0; ec->tap_rotate_counter = 1600; ec->cng_level = 1000; echo_can_adaption_mode(ec, adaption_mode); return ec; } /*- End of function --------------------------------------------------------*/ SPAN_DECLARE(int) echo_can_release(echo_can_state_t *ec) { return 0; } /*- End of function --------------------------------------------------------*/ SPAN_DECLARE(int) echo_can_free(echo_can_state_t *ec) { int i; fir16_free(&ec->fir_state); free(ec->fir_taps32); for (i = 0; i < 4; i++) free(ec->fir_taps16[i]); free(ec); return 0; } /*- End of function --------------------------------------------------------*/ SPAN_DECLARE(void) echo_can_adaption_mode(echo_can_state_t *ec, int adaption_mode) { ec->adaption_mode = adaption_mode; } /*- End of function --------------------------------------------------------*/ SPAN_DECLARE(void) echo_can_flush(echo_can_state_t *ec) { int i; for (i = 0; i < 4; i++) ec->tx_power[i] = 0; for (i = 0; i < 3; i++) ec->rx_power[i] = 0; ec->clean_rx_power = 0; ec->nonupdate_dwell = 0; fir16_flush(&ec->fir_state); ec->fir_state.curr_pos = ec->taps - 1; memset(ec->fir_taps32, 0, ec->taps*sizeof(int32_t)); for (i = 0; i < 4; i++) memset(ec->fir_taps16[i], 0, ec->taps*sizeof(int16_t)); ec->curr_pos = ec->taps - 1; ec->supp_test1 = 0; ec->supp_test2 = 0; ec->supp1 = 0; ec->supp2 = 0; ec->vad = 0; ec->cng_level = 1000; ec->cng_filter = 0; ec->geigel_max = 0; ec->geigel_lag = 0; ec->dtd_onset = FALSE; ec->tap_set = 0; ec->tap_rotate_counter = 1600; ec->latest_correction = 0; memset(ec->last_acf, 0, sizeof(ec->last_acf)); ec->narrowband_count = 0; ec->narrowband_score = 0; } /*- End of function --------------------------------------------------------*/ int sample_no = 0; SPAN_DECLARE(void) echo_can_snapshot(echo_can_state_t *ec) { memcpy(ec->snapshot, ec->fir_taps16[0], ec->taps*sizeof(int16_t)); } /*- End of function --------------------------------------------------------*/ static __inline__ int16_t echo_can_hpf(int32_t coeff[2], int16_t amp) { int32_t z; /* Filter DC, 3dB point is 160Hz (I think), note 32 bit precision required otherwise values do not track down to 0. Zero at DC, Pole at (1-Beta) only real axis. Some chip sets (like Si labs) don't need this, but something like a $10 X100P card does. Any DC really slows down convergence. Note: removes some low frequency from the signal, this reduces the speech quality when listening to samples through headphones but may not be obvious through a telephone handset. Note that the 3dB frequency in radians is approx Beta, e.g. for Beta = 2^(-3) = 0.125, 3dB freq is 0.125 rads = 159Hz. This is one of the classic DC removal filters, adjusted to provide sufficient bass rolloff to meet the above requirement to protect hybrids from things that upset them. The difference between successive samples produces a lousy HPF, and then a suitably placed pole flattens things out. The final result is a nicely rolled off bass end. The filtering is implemented with extended fractional precision, which noise shapes things, giving very clean DC removal. Make sure the gain of the HPF is 1.0. The first can still saturate a little under impulse conditions, and it might roll to 32768 and need clipping on sustained peak level signals. However, the scale of such clipping is small, and the error due to any saturation should not markedly affect the downstream processing. */ z = amp << 15; z -= (z >> 4); coeff[0] += z - (coeff[0] >> 3) - coeff[1]; coeff[1] = z; z = coeff[0] >> 15; return saturate(z); } /*- End of function --------------------------------------------------------*/ SPAN_DECLARE(int16_t) echo_can_update(echo_can_state_t *ec, int16_t tx, int16_t rx) { int32_t echo_value; int clean_rx; int nsuppr; int score; int i; sample_no++; if (ec->adaption_mode & ECHO_CAN_USE_RX_HPF) rx = echo_can_hpf(ec->rx_hpf, rx); ec->latest_correction = 0; /* Evaluate the echo - i.e. apply the FIR filter */ /* Assume the gain of the FIR does not exceed unity. Exceeding unity would seem like a rather poor thing for an echo cancellor to do :) This means we can compute the result with a total disregard for overflows. 16bits x 16bits -> 31bits, so no overflow can occur in any multiply. While accumulating we may overflow and underflow the 32 bit scale often. However, if the gain does not exceed unity, everything should work itself out, and the final result will be OK, without any saturation logic. */ /* Overflow is very much possible here, and we do nothing about it because of the compute costs */ /* 16 bit coeffs for the LMS give lousy results (maths good, actual sound bad!), but 32 bit coeffs require some shifting. On balance 32 bit seems best */ echo_value = fir16(&ec->fir_state, tx); /* And the answer is..... */ clean_rx = rx - echo_value; printf("echo is %" PRId32 "\n", echo_value); /* That was the easy part. Now we need to adapt! */ if (ec->nonupdate_dwell > 0) ec->nonupdate_dwell--; /* Calculate short term power levels using very simple single pole IIRs */ /* TODO: Is the nasty modulus approach the fastest, or would a real tx*tx power calculation actually be faster? Using the squares makes the numbers grow a lot! */ ec->tx_power[3] += ((abs(tx) - ec->tx_power[3]) >> 5); ec->tx_power[2] += ((tx*tx - ec->tx_power[2]) >> 8); ec->tx_power[1] += ((tx*tx - ec->tx_power[1]) >> 5); ec->tx_power[0] += ((tx*tx - ec->tx_power[0]) >> 3); ec->rx_power[1] += ((rx*rx - ec->rx_power[1]) >> 6); ec->rx_power[0] += ((rx*rx - ec->rx_power[0]) >> 3); ec->clean_rx_power += ((clean_rx*clean_rx - ec->clean_rx_power) >> 6); score = 0; /* If there is very little being transmitted, any attempt to train is futile. We would either be training on the far end's noise or signal, the channel's own noise, or our noise. Either way, this is hardly good training, so don't do it (avoid trouble). */ if (ec->tx_power[0] > MIN_TX_POWER_FOR_ADAPTION) { /* If the received power is very low, either we are sending very little or we are already well adapted. There is little point in trying to improve the adaption under these circumstances, so don't do it (reduce the compute load). */ if (ec->tx_power[1] > ec->rx_power[0]) { /* There is no (or little) far-end speech. */ if (ec->nonupdate_dwell == 0) { if (++ec->narrowband_count >= 160) { ec->narrowband_count = 0; score = narrowband_detect(ec); printf("Do the narrowband test %d at %d\n", score, ec->curr_pos); if (score > 6) { if (ec->narrowband_score == 0) memcpy(ec->fir_taps16[3], ec->fir_taps16[(ec->tap_set + 1)%3], ec->taps*sizeof(int16_t)); ec->narrowband_score += score; } else { if (ec->narrowband_score > 200) { printf("Revert to %d at %d\n", (ec->tap_set + 1)%3, sample_no); memcpy(ec->fir_taps16[ec->tap_set], ec->fir_taps16[3], ec->taps*sizeof(int16_t)); memcpy(ec->fir_taps16[(ec->tap_set - 1)%3], ec->fir_taps16[3], ec->taps*sizeof(int16_t)); for (i = 0; i < ec->taps; i++) ec->fir_taps32[i] = ec->fir_taps16[3][i] << 15; ec->tap_rotate_counter = 1600; } ec->narrowband_score = 0; } } ec->dtd_onset = FALSE; if (--ec->tap_rotate_counter <= 0) { printf("Rotate to %d at %d\n", ec->tap_set, sample_no); ec->tap_rotate_counter = 1600; ec->tap_set++; if (ec->tap_set > 2) ec->tap_set = 0; ec->fir_state.coeffs = ec->fir_taps16[ec->tap_set]; } /* ... and we are not in the dwell time from previous speech. */ if ((ec->adaption_mode & ECHO_CAN_USE_ADAPTION) && ec->narrowband_score == 0) { //nsuppr = saturate((clean_rx << 16)/ec->tx_power[1]); //nsuppr = clean_rx/ec->tx_power[1]; /* If a sudden surge in signal level (e.g. the onset of a tone burst) cause an abnormally high instantaneous to average signal power ratio, we could kick the adaption badly in the wrong direction. This is because the tx_power takes too long to react and rise. We need to stop too rapid adaption to the new signal. We normalise to a value derived from the instantaneous signal if it exceeds the peak by too much. */ nsuppr = clean_rx; /* Divide isn't very quick, but the "where is the top bit" and shift instructions are single cycle. */ if (tx > 4*ec->tx_power[3]) i = top_bit(tx) - 8; else i = top_bit(ec->tx_power[3]) - 8; if (i > 0) nsuppr >>= i; lms_adapt(ec, nsuppr); } } //printf("%10d %10d %10d %10d %10d\n", rx, clean_rx, nsuppr, ec->tx_power[1], ec->rx_power[1]); //printf("%.4f\n", (float) ec->rx_power[1]/(float) ec->clean_rx_power); } else { if (!ec->dtd_onset) { printf("Revert to %d at %d\n", (ec->tap_set + 1)%3, sample_no); memcpy(ec->fir_taps16[ec->tap_set], ec->fir_taps16[(ec->tap_set + 1)%3], ec->taps*sizeof(int16_t)); memcpy(ec->fir_taps16[(ec->tap_set - 1)%3], ec->fir_taps16[(ec->tap_set + 1)%3], ec->taps*sizeof(int16_t)); for (i = 0; i < ec->taps; i++) ec->fir_taps32[i] = ec->fir_taps16[(ec->tap_set + 1)%3][i] << 15; ec->tap_rotate_counter = 1600; ec->dtd_onset = TRUE; } ec->nonupdate_dwell = NONUPDATE_DWELL_TIME; } } if (ec->rx_power[1]) ec->vad = (8000*ec->clean_rx_power)/ec->rx_power[1]; else ec->vad = 0; if (ec->rx_power[1] > 2048*2048 && ec->clean_rx_power > 4*ec->rx_power[1]) { /* The EC seems to be making things worse, instead of better. Zap it! */ memset(ec->fir_taps32, 0, ec->taps*sizeof(int32_t)); for (i = 0; i < 4; i++) memset(ec->fir_taps16[i], 0, ec->taps*sizeof(int16_t)); } #if defined(XYZZY) if ((ec->adaption_mode & ECHO_CAN_USE_SUPPRESSOR)) { ec->supp_test1 += (ec->fir_state.history[ec->curr_pos] - ec->fir_state.history[(ec->curr_pos - 7) & ec->tap_mask]); ec->supp_test2 += (ec->fir_state.history[(ec->curr_pos - 24) & ec->tap_mask] - ec->fir_state.history[(ec->curr_pos - 31) & ec->tap_mask]); if (ec->supp_test1 > 42 && ec->supp_test2 > 42) supp_change = 25; else supp_change = 50; supp = supp_change + k1*ec->supp1 + k2*ec->supp2; ec->supp2 = ec->supp1; ec->supp1 = supp; clean_rx *= (1 - supp); } #endif if ((ec->adaption_mode & ECHO_CAN_USE_NLP)) { /* Non-linear processor - a fancy way to say "zap small signals, to avoid residual echo due to (uLaw/ALaw) non-linearity in the channel.". */ if (ec->rx_power[1] < 30000000) { if (!ec->cng) { ec->cng_level = ec->clean_rx_power; ec->cng = TRUE; } if ((ec->adaption_mode & ECHO_CAN_USE_CNG)) { /* Very elementary comfort noise generation */ /* Just random numbers rolled off very vaguely Hoth-like */ ec->cng_rndnum = 1664525U*ec->cng_rndnum + 1013904223U; ec->cng_filter = ((ec->cng_rndnum & 0xFFFF) - 32768 + 5*ec->cng_filter) >> 3; clean_rx = (ec->cng_filter*ec->cng_level) >> 17; /* TODO: A better CNG, with more accurate (tracking) spectral shaping! */ } else { clean_rx = 0; } //clean_rx = -16000; } else { ec->cng = FALSE; } } else { ec->cng = FALSE; } printf("Narrowband score %4d %5d at %d\n", ec->narrowband_score, score, sample_no); /* Roll around the rolling buffer */ if (ec->curr_pos <= 0) ec->curr_pos = ec->taps; ec->curr_pos--; return (int16_t) clean_rx; } /*- End of function --------------------------------------------------------*/ SPAN_DECLARE(int16_t) echo_can_hpf_tx(echo_can_state_t *ec, int16_t tx) { if (ec->adaption_mode & ECHO_CAN_USE_TX_HPF) tx = echo_can_hpf(ec->tx_hpf, tx); return tx; } /*- End of function --------------------------------------------------------*/ /*- End of file ------------------------------------------------------------*/