enc_dtx.c
上传用户:dangjiwu
上传日期:2013-07-19
资源大小:42019k
文件大小:40k
- /*
- *===================================================================
- * 3GPP AMR Wideband Floating-point Speech Codec
- *===================================================================
- */
- #include <stdlib.h>
- #include "hlxclib/memory.h"
- #include <math.h>
- #include "typedef.h"
- #include "enc_lpc.h"
- #include "enc_util.h"
- #define DTX_HIST_SIZE_MIN_ONE 7
- #define DTX_HANG_CONST 7 /* yields eight frames of SP HANGOVER */
- #define DTX_ELAPSED_FRAMES_THRESH (24 + 7 -1)
- #define MED_THRESH 2.25
- #define GAIN_THR 1.406
- #define ORDER 16 /* order of linear prediction filter */
- #define RANDOM_INITSEED 21845 /* own random init value */
- #define MRDTX 10
- #define SIZE_BK_NOISE1 64
- #define SIZE_BK_NOISE2 64
- #define SIZE_BK_NOISE3 64
- #define SIZE_BK_NOISE4 32
- #define SIZE_BK_NOISE5 32
- #define FRAME_LEN 256 /* Length (samples) of the input frame */
- #define SCALE 128 /* (UNITY * UNITY) / 512 */
- #define TONE_THR 0.65f /* Threshold for tone detection */
- /* constants for speech level estimation */
- #define SP_EST_COUNT 80
- #define SP_ACTIVITY_COUNT 25
- #define ALPHA_SP_UP (1.0f - 0.85f)
- #define ALPHA_SP_DOWN (1.0f - 0.85f)
- #define NOM_LEVEL 2050.0F /* about -26 dBov */
- #define SPEECH_LEVEL_INIT NOM_LEVEL
- #define MIN_SPEECH_LEVEL1 (NOM_LEVEL * 0.063F) /* NOM_LEVEL -24 dB */
- #define MIN_SPEECH_LEVEL2 (NOM_LEVEL * 0.2F) /* NOM_LEVEL -14 dB */
- #define MIN_SPEECH_SNR 0.125F /* 0 dB, lowest SNR estimation */
- /* Constants for background spectrum update */
- #define ALPHA_UP1 (1.0f - 0.95f) /* Normal update, upwards: */
- #define ALPHA_DOWN1 (1.0f - 0.936f) /* Normal update, downwards */
- #define ALPHA_UP2 (1.0f - 0.985f) /* Forced update, upwards */
- #define ALPHA_DOWN2 (1.0f - 0.943f) /* Forced update, downwards */
- #define ALPHA3 (1.0f - 0.95f) /* Update downwards */
- #define ALPHA4 (1.0f - 0.9f) /* For stationary estimation */
- #define ALPHA5 (1.0f - 0.5f) /* For stationary estimation */
- /* Constants for VAD threshold */
- #define THR_MIN (1.6F * SCALE) /* Minimum threshold */
- #define THR_HIGH (6.0F * SCALE) /* Highest threshold */
- #define THR_LOW (1.7F * SCALE) /* Lowest threshold */
- #define NO_P1 31744.0F /* ilog2(1), Noise level for highest threshold */
- #define NO_P2 19786.0F /* ilog2(0.1, Noise level for lowest threshold */
- #define NO_SLOPE ((Float32)(THR_LOW - THR_HIGH) / (Float32)(NO_P2 - NO_P1))
- #define SP_CH_MIN (-0.75F * SCALE)
- #define SP_CH_MAX (0.75F * SCALE)
- #define SP_P1 22527.0F /* ilog2(NOM_LEVEL / 4) */
- #define SP_P2 17832.0F /* ilog2(NOM_LEVEL * 4) */
- #define SP_SLOPE ((Float32)(SP_CH_MAX - SP_CH_MIN) / (Float32)(SP_P2 - SP_P1))
- /* Constants for hangover length */
- #define HANG_HIGH 12 /* longest hangover */
- #define HANG_LOW 2 /* shortest hangover */
- #define HANG_P1 THR_LOW /* threshold for longest hangover */
- #define HANG_P2 (4 * SCALE) /* threshold for Word16est hangover */
- #define HANG_SLOPE ((Float32)(HANG_LOW - HANG_HIGH) / (Float32)(HANG_P2 - HANG_P1))
- /* Constants for burst length */
- #define BURST_HIGH 8 /* longest burst length */
- #define BURST_LOW 3 /* shortest burst length */
- #define BURST_P1 THR_HIGH /* threshold for Word32est burst */
- #define BURST_P2 THR_LOW /* threshold for Word16est burst */
- #define BURST_SLOPE ((Float32)(BURST_LOW - BURST_HIGH) / (Float32)(BURST_P2 - BURST_P1))
- /* Parameters for background spectrum recovery function */
- #define STAT_COUNT 20 /* threshold of stationary detection counter */
- #define STAT_THR_LEVEL 184 /* Threshold level for stationarity detection */
- #define STAT_THR 1000 /* Threshold for stationarity detection */
- /* Limits for background noise estimate */
- #define NOISE_MIN 40 /* minimum */
- #define NOISE_MAX 20000 /* maximum */
- #define NOISE_INIT 150 /* initial */
- /* Thresholds for signal power (now calculated on 2 frames) */
- #define VAD_POW_LOW 30000.0f /* If input power is lower than this, VAD is set to 0 */
- #define POW_PITCH_TONE_THR 686080.0f /* If input power is lower, pitch detection is ignored */
- /* Constants for the filter bank */
- #define COEFF3 0.407806f /* coefficient for the 3rd order filter */
- #define COEFF5_1 0.670013f /* 1st coefficient the for 5th order filter */
- #define COEFF5_2 0.195007f /* 2nd coefficient the for 5th order filter */
- extern const Float32 E_ROM_en_adjust[];
- extern const Float32 E_ROM_mean_isf_noise[];
- extern const Float32 E_ROM_dico1_isf_noise[];
- extern const Float32 E_ROM_dico2_isf_noise[];
- extern const Float32 E_ROM_dico3_isf_noise[];
- extern const Float32 E_ROM_dico4_isf_noise[];
- extern const Float32 E_ROM_dico5_isf_noise[];
- extern const Float32 E_ROM_isf[];
- /*
- * E_DTX_isf_history_aver
- *
- * Parameters:
- * isf_old I/O: ISF vectors
- * indices I: ISF indices
- * isf_aver O: averaged ISFs
- *
- * Function:
- * Perform the ISF averaging
- *
- * Returns:
- * void
- */
- static void E_DTX_isf_history_aver(Float32 isf_old[], Word16 indices[],
- Float32 isf_aver[])
- {
- Float32 isf_tmp[2 * M];
- Float32 tmp;
- Word32 i, j, k;
- /*
- * Memorize in isf_tmp[][] the ISF vectors to be replaced by
- * the median ISF vector prior to the averaging
- */
- for (k = 0; k < 2; k++)
- {
- if (indices[k] != -1)
- {
- for (i = 0; i < M; i++)
- {
- isf_tmp[k * M + i] = isf_old[indices[k] * M + i];
- isf_old[indices[k] * M + i] = isf_old[indices[2] * M + i];
- }
- }
- }
- /* Perform the ISF averaging */
- for (j = 0; j < M; j++)
- {
- tmp = 0;
- for (i = 0; i < DTX_HIST_SIZE; i++)
- {
- tmp += isf_old[i * M + j];
- }
- isf_aver[j] = tmp;
- }
- /* Retrieve from isf_tmp[][] the ISF vectors saved prior to averaging */
- for (k = 0; k < 2; k++)
- {
- if (indices[k] != -1)
- {
- for (i = 0; i < M; i++)
- {
- isf_old[indices[k] * M + i] = isf_tmp[k * M + i];
- }
- }
- }
- return;
- }
- /*
- * E_DTX_dithering_control
- *
- * Parameters:
- * st I: state struct
- *
- * Function:
- * Analysis of the variation and stationarity
- * of the background noise.
- *
- * Returns:
- * Dithering decision
- */
- static Word16 E_DTX_dithering_control(E_DTX_State * st)
- {
- Float32 ISF_diff, gain_diff, mean, tmp;
- Word32 i;
- Word16 CN_dith;
- /* determine how stationary the spectrum of background noise is */
- ISF_diff = 0.0F;
- for (i = 0; i < 8; i++)
- {
- ISF_diff += st->mem_distance_sum[i];
- }
- if (ISF_diff > 5147609.0f)
- {
- CN_dith = 1;
- }
- else
- {
- CN_dith = 0;
- }
- /* determine how stationary the energy of background noise is */
- mean = 0.0f;
- for (i = 0; i < DTX_HIST_SIZE; i++)
- {
- mean += st->mem_log_en[i] / (Float32)DTX_HIST_SIZE;
- }
- gain_diff = 0.0f;
- for (i = 0; i < DTX_HIST_SIZE; i++)
- {
- tmp = (Float32)fabs(st->mem_log_en[i] - mean);
- gain_diff += tmp;
- }
- if (gain_diff > GAIN_THR)
- {
- CN_dith = 1;
- }
- return CN_dith;
- }
- /*
- * E_DTX_buffer
- *
- * Parameters:
- * st I/O: state struct
- * isf_new I: isf vector
- * enr I: residual energy (for L_FRAME)
- * codec_mode I: speech coder mode
- *
- * Function:
- * Handles the DTX buffer
- *
- * Returns:
- * void
- */
- void E_DTX_buffer(E_DTX_State *st, Float32 isf_new[], Float32 enr,
- Word16 codec_mode)
- {
- Float32 log_en;
- /* update pointer to circular buffer */
- st->mem_hist_ptr++;
- if (st->mem_hist_ptr == DTX_HIST_SIZE)
- {
- st->mem_hist_ptr = 0;
- }
- /* copy isf vector into buffer */
- memcpy(&st->mem_isf[st->mem_hist_ptr * M], isf_new, M * sizeof(Float32));
- enr += 1e-10F;
- log_en = (Float32)(log10(enr / ((Float64)L_FRAME)) / log10(2.0F));
- /* Subtract ~ 3 dB */
- st->mem_log_en[st->mem_hist_ptr] = log_en + E_ROM_en_adjust[codec_mode];
- return;
- }
- /*
- * E_DTX_frame_indices_find
- *
- * Parameters:
- * st I/O: state struct
- * isf_old_tx I: isf vector
- * indices I: distance indices
- *
- * Function:
- * Find indices for min/max distances
- *
- * Returns:
- * void
- */
- static void E_DTX_frame_indices_find(E_DTX_State * st, Word16 indices[])
- {
- Float32 L_tmp, tmp, summin, summax, summax2nd;
- Word32 i, j, k;
- Word16 ptr;
- /*
- * Remove the effect of the oldest frame from the column
- * sum sumD[0..E_DTX_HIST_SIZE-1]. sumD[E_DTX_HIST_SIZE] is
- * not updated since it will be removed later.
- */
- k = DTX_HIST_SIZE_MIN_ONE;
- j = -1;
- for (i = 0; i < DTX_HIST_SIZE_MIN_ONE; i++)
- {
- j = j + k;
- st->mem_distance_sum[i] = st->mem_distance_sum[i] - st->mem_distance[j];
- k--;
- }
- /*
- * Shift the column sum sumD. The element sumD[E_DTX_HIST_SIZE-1]
- * corresponding to the oldest frame is removed. The sum of
- * the distances between the latest isf and other isfs,
- * i.e. the element sumD[0], will be computed during this call.
- * Hence this element is initialized to zero.
- */
- for (i = DTX_HIST_SIZE_MIN_ONE; i > 0; i--)
- {
- st->mem_distance_sum[i] = st->mem_distance_sum[i - 1];
- }
- st->mem_distance_sum[0] = 0.0F;
- /*
- * Remove the oldest frame from the distance matrix.
- * Note that the distance matrix is replaced by a one-
- * dimensional array to save static memory.
- */
- k = 0;
- for (i = 27; i >= 12; i = i - k)
- {
- k++;
- for (j = k; j > 0; j--)
- {
- st->mem_distance[i - j + 1] = st->mem_distance[i - j - k];
- }
- }
- /*
- * Compute the first column of the distance matrix D
- * (squared Euclidean distances from isf1[] to isf_old_tx[][]).
- */
- ptr = st->mem_hist_ptr;
- for (i = 1; i < DTX_HIST_SIZE; i++)
- {
- /* Compute the distance between the latest isf and the other isfs. */
- ptr--;
- if (ptr < 0)
- {
- ptr = DTX_HIST_SIZE_MIN_ONE;
- }
- L_tmp = 0;
- for (j = 0; j < M; j++)
- {
- tmp = st->mem_isf[st->mem_hist_ptr * M + j] - st->mem_isf[ptr * M + j];
- L_tmp += tmp * tmp;
- }
- st->mem_distance[i - 1] = L_tmp;
- /* Update also the column sums. */
- st->mem_distance_sum[0] += st->mem_distance[i - 1];
- st->mem_distance_sum[i] += st->mem_distance[i - 1];
- }
- /* Find the minimum and maximum distances */
- summax = st->mem_distance_sum[0];
- summin = st->mem_distance_sum[0];
- indices[0] = 0;
- indices[2] = 0;
- for (i = 1; i < DTX_HIST_SIZE; i++)
- {
- if (st->mem_distance_sum[i] > summax)
- {
- indices[0] = (Word16)i;
- summax = st->mem_distance_sum[i];
- }
- if (st->mem_distance_sum[i] < summin)
- {
- indices[2] = (Word16)i;
- summin = st->mem_distance_sum[i];
- }
- }
- /* Find the second largest distance */
- summax2nd = -100000000.0;
- indices[1] = -1;
- for (i = 0; i < DTX_HIST_SIZE; i++)
- {
- if ((st->mem_distance_sum[i] > summax2nd) && (i != indices[0]))
- {
- indices[1] = (Word16)i;
- summax2nd = st->mem_distance_sum[i];
- }
- }
- for (i = 0; i < 3; i++)
- {
- indices[i] = (Word16)(st->mem_hist_ptr - indices[i]);
- if (indices[i] < 0)
- {
- indices[i] += DTX_HIST_SIZE;
- }
- }
- /*
- * If maximum distance / MED_THRESH is smaller than minimum distance
- * then the median ISF vector replacement is not performed
- */
- L_tmp = (Float32)(summax / MED_THRESH);
- if (L_tmp <= summin)
- {
- indices[0] = -1;
- }
- /*
- * If second largest distance/MED_THRESH is smaller than
- * minimum distance then the median ISF vector replacement is
- * not performed
- */
- L_tmp = (Float32)(summax2nd / MED_THRESH);
- if (L_tmp <= summin)
- {
- indices[1] = -1;
- }
- return;
- }
- /*
- * E_DTX_isf_q
- *
- * Parameters:
- * isf I: ISF in the frequency domain (0..6400)
- * isf_q O: quantised ISF
- * indice O: quantisation indices
- *
- * Function:
- * The ISF vector is quantized using VQ with split-by-5
- *
- * Returns:
- * void
- */
- static void E_DTX_isf_q(Float32 *isf, Word16 **indice)
- {
- Word32 i;
- Float32 tmp;
- for (i = 0; i < ORDER; i++)
- {
- isf[i] = isf[i] - E_ROM_mean_isf_noise[i];
- }
- (*indice)[0] = E_LPC_isf_sub_vq(&isf[0], E_ROM_dico1_isf_noise, 2,
- SIZE_BK_NOISE1, &tmp);
- (*indice)[1] = E_LPC_isf_sub_vq(&isf[2], E_ROM_dico2_isf_noise, 3,
- SIZE_BK_NOISE2, &tmp);
- (*indice)[2] = E_LPC_isf_sub_vq(&isf[5], E_ROM_dico3_isf_noise, 3,
- SIZE_BK_NOISE3, &tmp);
- (*indice)[3] = E_LPC_isf_sub_vq(&isf[8], E_ROM_dico4_isf_noise, 4,
- SIZE_BK_NOISE4, &tmp);
- (*indice)[4] = E_LPC_isf_sub_vq(&isf[12], E_ROM_dico5_isf_noise, 4,
- SIZE_BK_NOISE5, &tmp);
- return;
- }
- /*
- * E_DTX_exe
- *
- * Parameters:
- * st I/O: state struct
- * exc2 O: CN excitation
- * pt_prms O: analysis parameters
- *
- * Function:
- * Confort noise parameters are encoded for the SID frame
- *
- * Returns:
- * void
- */
- void E_DTX_exe(E_DTX_State *st, Float32 *exc2, Word16 **pt_prms)
- {
- Float32 isf[M];
- Float32 log_en, level, gain, ener;
- Word32 i,j;
- Word16 isf_order[3];
- Word16 CN_dith;
- /* VOX mode computation of SID parameters */
- log_en = 0.0F;
- memset(isf, 0, M * sizeof(Float32));
- /* average energy and isf */
- for (i = 0; i < DTX_HIST_SIZE; i++)
- {
- log_en += st->mem_log_en[i] / (Float32)DTX_HIST_SIZE;
- }
- E_DTX_frame_indices_find(st, isf_order);
- E_DTX_isf_history_aver(st->mem_isf, isf_order, isf);
- for (j = 0; j < M; j++)
- {
- isf[j] = isf[j] / (Float32)DTX_HIST_SIZE; /* divide by 8 */
- }
- /* quantize logarithmic energy to 6 bits (-6 : 66 dB) */
- st->mem_log_en_index = (Word16)((log_en + 2.0F) * 2.625F);
- if(st->mem_log_en_index > 63)
- {
- st->mem_log_en_index = 63;
- }
- if(st->mem_log_en_index < 0)
- {
- st->mem_log_en_index = 0;
- }
- E_DTX_isf_q(isf, pt_prms);
- (*pt_prms) += 5;
- **pt_prms = st->mem_log_en_index;
- (*pt_prms) += 1;
- CN_dith = E_DTX_dithering_control(st);
- **pt_prms = CN_dith;
- (*pt_prms) += 1;
- /* adjust level to speech coder mode */
- log_en = (Float32)((Float32)st->mem_log_en_index / 2.625 - 2.0);
- level = (Float32)(pow( 2.0, log_en ));
- /* generate white noise vector */
- for (i = 0; i < L_FRAME; i++)
- {
- exc2[i] = (Float32)E_UTIL_random(&(st->mem_cng_seed));
- }
- ener = 0.01F;
- for (i = 0; i < L_FRAME; i++)
- {
- ener += exc2[i] * exc2[i];
- }
- gain = (Float32)sqrt(level * L_FRAME / ener);
- for (i = 0; i < L_FRAME; i++)
- {
- exc2[i] *= gain;
- }
- return;
- }
- /*
- * E_DTX_reset
- *
- * Parameters:
- * st O: state struct
- *
- * Function:
- * Initializes state memory
- *
- * Returns:
- * non-zero with error, zero for ok
- */
- Word32 E_DTX_reset(E_DTX_State *st)
- {
- Word32 i;
- if (st == (E_DTX_State *) NULL)
- {
- return -1;
- }
- st->mem_hist_ptr = 0;
- st->mem_log_en_index = 0;
- /* Init isf_hist[] */
- for(i = 0; i < DTX_HIST_SIZE; i++)
- {
- memcpy(&st->mem_isf[i * M], E_ROM_isf, M * sizeof(Float32));
- }
- st->mem_cng_seed = RANDOM_INITSEED;
- /* Reset energy history */
- memset(st->mem_log_en, 0, DTX_HIST_SIZE * sizeof(Float32));
- st->mem_dtx_hangover_count = DTX_HANG_CONST;
- st->mem_dec_ana_elapsed_count = DTX_ELAPSED_FRAMES_THRESH;
- memset(st->mem_distance, 0, 28 * sizeof(Float32));
- memset(st->mem_distance_sum, 0, (DTX_HIST_SIZE - 1) * sizeof(Float32));
- return 0;
- }
- /*
- * E_DTX_init
- *
- * Parameters:
- * st I/O: state struct
- *
- * Function:
- * Allocates state memory and initializes state memory
- *
- * Returns:
- * non-zero with error, zero for ok
- */
- Word32 E_DTX_init (E_DTX_State **st)
- {
- E_DTX_State* s;
- if (st == (E_DTX_State **) NULL)
- {
- return -1;
- }
- *st = NULL;
- /* allocate memory */
- if ((s= (E_DTX_State *) malloc(sizeof(E_DTX_State))) == NULL)
- {
- return -1;
- }
- E_DTX_reset(s);
- *st = s;
- return 0;
- }
- /*
- * E_DTX_exit
- *
- * Parameters:
- * state I/0: State struct
- *
- * Function:
- * The memory used for state memory is freed
- *
- * Returns:
- * void
- */
- void E_DTX_exit (E_DTX_State **st)
- {
- if (st == NULL || *st == NULL)
- {
- return;
- }
- /* deallocate memory */
- free(*st);
- *st = NULL;
- return;
- }
- /*
- * E_DTX_tx_handler
- *
- * Parameters:
- * st I/O: State struct
- * vad_flag I: vad decision
- * usedMode I/O: mode changed or not
- *
- * Function:
- * Adds extra speech hangover to analyze speech on the decoding side.
- *
- * Returns:
- * void
- */
- void E_DTX_tx_handler(E_DTX_State *st, Word32 vad_flag, Word16 *usedMode)
- {
- /* this state machine is in synch with the GSMEFR txDtx machine */
- st->mem_dec_ana_elapsed_count++;
- if (vad_flag != 0)
- {
- st->mem_dtx_hangover_count = DTX_HANG_CONST;
- }
- else
- { /* non-speech */
- if (st->mem_dtx_hangover_count == 0)
- { /* out of decoder analysis hangover */
- st->mem_dec_ana_elapsed_count = 0;
- *usedMode = MRDTX;
- }
- else
- { /* in possible analysis hangover */
- st->mem_dtx_hangover_count--;
- /* decAnaElapsedCount + dtxHangoverCount < E_DTX_ELAPSED_FRAMES_THRESH */
- if ((st->mem_dec_ana_elapsed_count + st->mem_dtx_hangover_count)
- < DTX_ELAPSED_FRAMES_THRESH)
- {
- *usedMode = MRDTX;
- /* if Word16 time since decoder update, do not add extra HO */
- }
- /*
- else
- override VAD and stay in
- speech mode *usedMode
- and add extra hangover
- */
- }
- }
- return;
- }
- /*
- * E_DTX_filter5
- *
- * Parameters:
- * in0 I/O: input values / output low-pass part
- * in1 I/O: input values / output high-pass part
- * data I/O: updated filter memory
- *
- * Function:
- * Fifth-order half-band lowpass/highpass filter pair with decimation.
- *
- * Returns:
- * void
- */
- static void E_DTX_filter5(Float32 *in0, Float32 *in1, Float32 data[])
- {
- Float32 temp0, temp1, temp2;
- temp0 = *in0 - COEFF5_1 * data[0];
- temp1 = data[0] + COEFF5_1 * temp0;
- data[0] = ((temp0 > 1e-10) | (temp0 < -1e-10)) ? temp0 : 0;
- temp0 = *in1 - COEFF5_2 * data[1];
- temp2 = data[1] + COEFF5_2 * temp0;
- data[1] = ((temp0 > 1e-10) | (temp0 < -1e-10)) ? temp0 : 0;
- *in0 = (temp1 + temp2) * 0.5F;
- *in1 = (temp1 - temp2) * 0.5F;
- }
- /*
- * E_DTX_filter3
- *
- * Parameters:
- * in0 I/O: input values / output low-pass part
- * in1 I/O: input values / output high-pass part
- * data I/O: updated filter memory
- *
- * Function:
- * Third-order half-band lowpass/highpass filter pair with decimation.
- *
- * Returns:
- * void
- */
- static void E_DTX_filter3(Float32 *in0, Float32 *in1, Float32 *data)
- {
- Float32 temp1, temp2;
- temp1 = *in1 - COEFF3 * *data;
- temp2 = *data + COEFF3 * temp1;
- *data = ((temp1 > 1e-10) | (temp1 < -1e-10)) ? temp1 : 0;
- *in1 = (*in0 - temp2) * 0.5F;
- *in0 = (*in0 + temp2) * 0.5F;
- }
- /*
- * E_DTX_level_calculation
- *
- * Parameters:
- * data I: signal buffer
- * sub_level I/0: level calculated at the end of the previous frame /
- * level of signal calculated from the last
- * (count2 - count1) samples
- * count1 I: number of samples to be counted
- * count2 I: number of samples to be counted
- * ind_m I: step size for the index of the data buffer
- * ind_a I: starting index of the data buffer
- * scale I: scaling for the level calculation
- *
- * Function:
- * Calculate signal level in a sub-band. Level is calculated
- * by summing absolute values of the input data.
- *
- * Because speech coder has a lookahead, signal level calculated
- * over the lookahead (data[count1 - count2]) is stored (*sub_level)
- * and added to the level of the next frame. Additionally, group
- * delay and decimation of the filter bank is taken into the count
- * for the values of the counters (count1, count2).
- *
- * Returns:
- * signal level
- */
- static Float32 E_DTX_level_calculation(Float32 data[], Float32 *sub_level,
- Word16 count1, Word16 count2,
- Word16 ind_m, Word16 ind_a,
- Float32 scale)
- {
- Float64 l_temp1, l_temp2;
- Float32 level;
- Word32 i;
- l_temp1 = 0.0;
- for (i = count1; i < count2; i++)
- {
- l_temp1 += fabs(data[ind_m * i + ind_a]);
- }
- l_temp1 *= 2.0;
- l_temp2 = l_temp1 + *sub_level / scale;
- *sub_level = (Float32)(l_temp1 * scale);
- for (i = 0; i < count1; i++)
- {
- l_temp2 += 2.0f * fabs(data[ind_m * i + ind_a]);
- }
- level = (Float32)(l_temp2 * scale);
- return level;
- }
- /*
- * E_DTX_filter_bank
- *
- * Parameters:
- * st I/0: State struct
- * in I: input frame
- * level I: signal levels at each band
- *
- * Function:
- * Divide input signal into bands and calculate level of
- * the signal in each band
- *
- * Returns:
- * void
- */
- static void E_DTX_filter_bank(E_DTX_Vad_State *st, Float32 in[],
- Float32 level[])
- {
- Float32 tmp_buf[FRAME_LEN];
- Word32 i, j;
- /* shift input 1 bit down for safe scaling */
- for (i = 0; i < FRAME_LEN; i++)
- {
- tmp_buf[i] = in[i] * 0.5F;
- }
- /* run the filter bank */
- for (i = 0; i < (FRAME_LEN >> 1); i++)
- {
- j = i << 1;
- E_DTX_filter5(&tmp_buf[j], &tmp_buf[j + 1], st->mem_a_data5[0]);
- }
- for (i = 0; i < (FRAME_LEN >> 2); i++)
- {
- j = i << 2;
- E_DTX_filter5(&tmp_buf[j], &tmp_buf[j + 2], st->mem_a_data5[1]);
- E_DTX_filter5(&tmp_buf[j + 1], &tmp_buf[j + 3], st->mem_a_data5[2]);
- }
- for (i = 0; i < (FRAME_LEN >> 3); i++)
- {
- j = i << 3;
- E_DTX_filter5(&tmp_buf[j], &tmp_buf[j + 4], st->mem_a_data5[3]);
- E_DTX_filter5(&tmp_buf[j + 2], &tmp_buf[j + 6], st->mem_a_data5[4]);
- E_DTX_filter3(&tmp_buf[j + 3], &tmp_buf[j + 7], &st->mem_a_data3[0]);
- }
- for (i = 0; i < (FRAME_LEN >> 4); i++)
- {
- j = i << 4;
- E_DTX_filter3(&tmp_buf[j], &tmp_buf[j + 8], &st->mem_a_data3[1]);
- E_DTX_filter3(&tmp_buf[j + 4], &tmp_buf[j + 12], &st->mem_a_data3[2]);
- E_DTX_filter3(&tmp_buf[j + 6], &tmp_buf[j + 14], &st->mem_a_data3[3]);
- }
- for (i = 0; i < (FRAME_LEN >> 5); i++)
- {
- j = i << 5;
- E_DTX_filter3(&tmp_buf[j + 0], &tmp_buf[j + 16], &st->mem_a_data3[4]);
- E_DTX_filter3(&tmp_buf[j + 8], &tmp_buf[j + 24], &st->mem_a_data3[5]);
- }
- /* calculate levels in each frequency band */
- /* 4800 - 6400 Hz*/
- level[11] = E_DTX_level_calculation(tmp_buf, &st->mem_sub_level[11],
- (FRAME_LEN >> 2) - 48, FRAME_LEN >> 2, 4, 1, 0.25F);
- /* 4000 - 4800 Hz*/
- level[10] = E_DTX_level_calculation(tmp_buf, &st->mem_sub_level[10],
- (FRAME_LEN >> 3) - 24, FRAME_LEN >> 3, 8, 7, 0.5F);
- /* 3200 - 4000 Hz*/
- level[9] = E_DTX_level_calculation(tmp_buf, &st->mem_sub_level[9],
- (FRAME_LEN >> 3) - 24, FRAME_LEN >> 3, 8, 3, 0.5F);
- /* 2400 - 3200 Hz*/
- level[8] = E_DTX_level_calculation(tmp_buf, &st->mem_sub_level[8],
- (FRAME_LEN >> 3) - 24, FRAME_LEN >> 3, 8, 2, 0.5F);
- /* 2000 - 2400 Hz*/
- level[7] = E_DTX_level_calculation(tmp_buf, &st->mem_sub_level[7],
- (FRAME_LEN >> 4) - 12, FRAME_LEN >> 4, 16, 14, 1.0F);
- /* 1600 - 2000 Hz*/
- level[6] = E_DTX_level_calculation(tmp_buf, &st->mem_sub_level[6],
- (FRAME_LEN >> 4) - 12, FRAME_LEN >> 4, 16, 6, 1.0F);
- /* 1200 - 1600 Hz*/
- level[5] = E_DTX_level_calculation(tmp_buf, &st->mem_sub_level[5],
- (FRAME_LEN >> 4) - 12, FRAME_LEN >> 4, 16, 4, 1.0F);
- /* 800 - 1200 Hz*/
- level[4] = E_DTX_level_calculation(tmp_buf, &st->mem_sub_level[4],
- (FRAME_LEN >> 4) - 12, FRAME_LEN >> 4, 16, 12, 1.0F);
- /* 600 - 800 Hz*/
- level[3] = E_DTX_level_calculation(tmp_buf, &st->mem_sub_level[3],
- (FRAME_LEN >> 5) - 6, FRAME_LEN >> 5, 32, 8, 2.0F);
- /* 400 - 600 Hz*/
- level[2] = E_DTX_level_calculation(tmp_buf, &st->mem_sub_level[2],
- (FRAME_LEN >> 5) - 6, FRAME_LEN >> 5, 32, 24, 2.0F);
- /* 200 - 400 Hz*/
- level[1] = E_DTX_level_calculation(tmp_buf, &st->mem_sub_level[1],
- (FRAME_LEN >> 5) - 6, FRAME_LEN >> 5, 32, 16, 2.0F);
- /* 0 - 200 Hz*/
- level[0] = E_DTX_level_calculation(tmp_buf, &st->mem_sub_level[0],
- (FRAME_LEN >> 5) - 6, FRAME_LEN >> 5, 32, 0, 2.0F);
- }
- /*
- * E_DTX_update_cntrl
- *
- * Parameters:
- * st I/0: State struct
- * level I: sub-band levels of the input frame
- *
- * Function:
- * Control update of the background noise estimate.
- *
- * Returns:
- * void
- */
- static void E_DTX_update_cntrl(E_DTX_Vad_State *st, Float32 level[])
- {
- Float32 stat_rat;
- Float32 num, denom;
- Float32 alpha;
- Word32 i;
- /* if fullband pitch or tone have been detected for a while, initialize stat_count */
- if ((st->mem_pitch_tone & 0x7c00) == 0x7c00)
- {
- st->mem_stat_count = STAT_COUNT;
- }
- else
- {
- /* if 8 last vad-decisions have been "0", reinitialize stat_count */
- if ((st->mem_vadreg & 0x7f80) == 0)
- {
- st->mem_stat_count = STAT_COUNT;
- }
- else
- {
- stat_rat = 0;
- for (i = 0; i < COMPLEN; i++)
- {
- if (level[i] > st->mem_ave_level[i])
- {
- num = level[i];
- denom = st->mem_ave_level[i];
- }
- else
- {
- num = st->mem_ave_level[i];
- denom = level[i];
- }
- /* Limit nimimum value of num and denom to STAT_THR_LEVEL */
- if (num < STAT_THR_LEVEL)
- {
- num = STAT_THR_LEVEL;
- }
- if (denom < STAT_THR_LEVEL)
- {
- denom = STAT_THR_LEVEL;
- }
- stat_rat += num/denom * 64;
- }
- /* compare stat_rat with a threshold and update stat_count */
- if (stat_rat > STAT_THR)
- {
- st->mem_stat_count = STAT_COUNT;
- }
- else
- {
- if ((st->mem_vadreg & 0x4000) != 0)
- {
- if (st->mem_stat_count != 0)
- {
- st->mem_stat_count--;
- }
- }
- }
- }
- }
- /* Update average amplitude estimate for stationarity estimation */
- alpha = ALPHA4;
- if (st->mem_stat_count == STAT_COUNT)
- {
- alpha = 1.0;
- }
- else if ((st->mem_vadreg & 0x4000) == 0)
- {
- alpha = ALPHA5;
- }
- for (i = 0; i < COMPLEN; i++)
- {
- st->mem_ave_level[i] += alpha * (level[i] - st->mem_ave_level[i]);
- }
- }
- /*
- * E_DTX_hangover_addition
- *
- * Parameters:
- * st I/0: State struct
- * low_power I: flag power of the input frame
- * hang_len I: hangover length
- * burst_len I: minimum burst length for hangover addition
- *
- * Function:
- * Add hangover after speech bursts.
- *
- * Returns:
- * VAD_flag indicating final VAD decision
- */
- static Word16 E_DTX_hangover_addition(E_DTX_Vad_State *st, Word16 low_power,
- Word16 hang_len, Word16 burst_len)
- {
- /*
- * if the input power (pow_sum) is lower than a threshold, clear
- * counters and set VAD_flag to "0" "fast exit"
- */
- if (low_power != 0)
- {
- st->mem_burst_count = 0;
- st->mem_hang_count = 0;
- return 0;
- }
- /* update the counters (hang_count, burst_count) */
- if ((st->mem_vadreg & 0x4000) != 0)
- {
- st->mem_burst_count++;
- if (st->mem_burst_count >= burst_len)
- {
- st->mem_hang_count = hang_len;
- }
- return 1;
- }
- else
- {
- st->mem_burst_count = 0;
- if (st->mem_hang_count > 0)
- {
- st->mem_hang_count--;
- return 1;
- }
- }
- return 0;
- }
- /*
- * E_DTX_noise_estimate_update
- *
- * Parameters:
- * st I/0: State struct
- * level I: sub-band levels of the input frame
- *
- * Function:
- * Update of background noise estimate
- *
- * Returns:
- * void
- */
- static void E_DTX_noise_estimate_update(E_DTX_Vad_State *st, Float32 level[])
- {
- Float32 alpha_up, alpha_down, bckr_add, temp;
- Word32 i;
- /* Control update of bckr_est[] */
- E_DTX_update_cntrl(st, level);
- /* Choose update speed */
- bckr_add = 2.0;
- if ((0x7800 & st->mem_vadreg) == 0)
- {
- alpha_up = ALPHA_UP1;
- alpha_down = ALPHA_DOWN1;
- }
- else
- {
- if (st->mem_stat_count == 0)
- {
- alpha_up = ALPHA_UP2;
- alpha_down = ALPHA_DOWN2;
- }
- else
- {
- alpha_up = 0.0;
- alpha_down = ALPHA3;
- bckr_add = 0.0;
- }
- }
- /* Update noise estimate (bckr_est) */
- for (i = 0; i < COMPLEN; i++)
- {
- temp = st->mem_level[i] - st->mem_bckr_est[i];
- if (temp < 0.0)
- { /* update downwards*/
- st->mem_bckr_est[i] += -2 + (alpha_down * temp);
- /* limit minimum value of the noise estimate to NOISE_MIN */
- if (st->mem_bckr_est[i] < NOISE_MIN)
- {
- st->mem_bckr_est[i] = NOISE_MIN;
- }
- }
- else
- { /* update upwards */
- st->mem_bckr_est[i] += bckr_add + (alpha_up * temp);
- /* limit maximum value of the noise estimate to NOISE_MAX */
- if (st->mem_bckr_est[i] > NOISE_MAX)
- {
- st->mem_bckr_est[i] = NOISE_MAX;
- }
- }
- }
- /* Update signal levels of the previous frame (old_level) */
- memcpy(st->mem_level, level, COMPLEN * sizeof(Float32));
- }
- /*
- * E_DTX_decision
- *
- * Parameters:
- * st I/0: State struct
- * level I: sub-band levels of the input frame
- * pow_sum I: power of the input frame
- *
- * Function:
- * Calculates VAD_flag
- *
- * Returns:
- * VAD_flag
- */
- static Word16 E_DTX_decision(E_DTX_Vad_State *st, Float32 level[COMPLEN], Float64 pow_sum)
- {
- Float64 snr_sum;
- Float32 vad_thr, temp, noise_level;
- Float32 ilog2_speech_level, ilog2_noise_level;
- Float32 temp2;
- Word32 i;
- Word16 low_power_flag;
- Word16 hang_len,burst_len;
- /*
- * Calculate squared sum of the input levels (level)
- * divided by the background noise components (bckr_est).
- */
- snr_sum = 0.0;
- for (i = 0; i < COMPLEN; i++)
- {
- temp = level[i] / st->mem_bckr_est[i];
- snr_sum += temp * temp;
- }
- /* Calculate average level of estimated background noise */
- temp = 0.0;
- for (i = 1; i < COMPLEN; i++) /* ignore lowest band */
- {
- temp += st->mem_bckr_est[i];
- }
- noise_level = (Float32)(temp * 0.0625);
- /*
- * if SNR is lower than a threshold (MIN_SPEECH_SNR),
- * and increase speech_level
- */
- temp = noise_level * MIN_SPEECH_SNR * 8;
- if (st->mem_speech_level < temp)
- {
- st->mem_speech_level = temp;
- }
- ilog2_noise_level = (Float32)(-1024.0F * log10(noise_level / 2147483648.0F) / log10(2.0F));
- /*
- * If SNR is very poor, speech_level is probably corrupted by noise level. This
- * is correctred by subtracting -MIN_SPEECH_SNR*noise_level from speech level
- */
- ilog2_speech_level = (Float32)(-1024.0F * log10((st->mem_speech_level - temp) / 2147483648.0F) / log10(2.0F));
- /* ilog2_speech_level = ilog2(st->speech_level); */
- temp = NO_SLOPE * (ilog2_noise_level- NO_P1)+ THR_HIGH;
- temp2 = SP_CH_MIN + SP_SLOPE * (ilog2_speech_level - SP_P1);
- if (temp2 < SP_CH_MIN)
- {
- temp2 = SP_CH_MIN;
- }
- if (temp2 > SP_CH_MAX)
- {
- temp2 = SP_CH_MAX;
- }
- vad_thr = temp + temp2;
- if (vad_thr < THR_MIN)
- {
- vad_thr = THR_MIN;
- }
- /* Shift VAD decision register */
- st->mem_vadreg = (Word16)(st->mem_vadreg >> 1);
- /* Make intermediate VAD decision */
- if (snr_sum > (vad_thr * (Float32)COMPLEN / 128.0F))
- {
- st->mem_vadreg = (Word16)(st->mem_vadreg | 0x4000);
- }
- /* primary vad decsion made */
- /* check if the input power (pow_sum) is lower than a threshold" */
- if (pow_sum < VAD_POW_LOW)
- {
- low_power_flag = 1;
- }
- else
- {
- low_power_flag = 0;
- }
- /* Update speech subband background noise estimates */
- E_DTX_noise_estimate_update(st, level);
- hang_len = (Word16)((HANG_SLOPE * (vad_thr - HANG_P1) - 0.5) + HANG_HIGH);
- if (hang_len < HANG_LOW)
- {
- hang_len = HANG_LOW;
- }
- burst_len = (Word16)((BURST_SLOPE * (vad_thr - BURST_P1) - 0.5) + BURST_HIGH);
- return(E_DTX_hangover_addition(st, low_power_flag, hang_len,burst_len));
- }
- /*
- * E_DTX_dpeech_estimate
- *
- * Parameters:
- * st I/0: State struct
- * in_level I: level of the input frame
- *
- * Function:
- * Estimate speech level
- *
- * Maximum signal level is searched and stored to the variable sp_max.
- * The speech frames must locate within SP_EST_COUNT number of frames to be counted.
- * Thus, noisy frames having occasional VAD = "1" decisions will not
- * affect to the estimated speech_level.
- *
- * Returns:
- * void
- */
- static void E_DTX_speech_estimate(E_DTX_Vad_State *st, Float32 in_level)
- {
- Float32 alpha, tmp;
- /* if the required activity count cannot be achieved, reset counters */
- if (SP_ACTIVITY_COUNT > (SP_EST_COUNT - st->mem_sp_est_cnt + st->mem_sp_max_cnt))
- {
- st->mem_sp_est_cnt = 0;
- st->mem_sp_max = 0.0;
- st->mem_sp_max_cnt = 0;
- }
- st->mem_sp_est_cnt++;
- if (((st->mem_vadreg & 0x4000) || (in_level > st->mem_speech_level))
- && (in_level > MIN_SPEECH_LEVEL1))
- {
- if (in_level > st->mem_sp_max)
- {
- st->mem_sp_max = in_level;
- }
- st->mem_sp_max_cnt++;
- if (st->mem_sp_max_cnt >= SP_ACTIVITY_COUNT)
- {
- tmp = st->mem_sp_max / 2.0F; /* scale to get "average" speech level*/
- if (tmp > st->mem_speech_level)
- {
- alpha = ALPHA_SP_UP;
- }
- else
- {
- alpha = ALPHA_SP_DOWN;
- }
- if (tmp > MIN_SPEECH_LEVEL2)
- {
- st->mem_speech_level += alpha * (tmp - st->mem_speech_level);
- }
- st->mem_sp_max = 0.0;
- st->mem_sp_max_cnt = 0;
- st->mem_sp_est_cnt = 0;
- }
- }
- }
- /*
- * E_DTX_vad_reset
- *
- * Parameters:
- * state I/0: State struct
- *
- * Function:
- * Initialises state memory
- *
- * Returns:
- * non-zero with error, zero for ok
- */
- Word32 E_DTX_vad_reset (E_DTX_Vad_State *state)
- {
- Word32 i;
- if (state == (E_DTX_Vad_State *) NULL)
- {
- return -1;
- }
- /* Initialize pitch detection variables */
- state->mem_pitch_tone = 0;
- state->mem_vadreg = 0;
- state->mem_hang_count = 0;
- state->mem_burst_count = 0;
- state->mem_hang_count = 0;
- /* initialize memory used by the filter bank */
- memset(state->mem_a_data5, 0, F_5TH_CNT * 2 * sizeof(Float32));
- memset(state->mem_a_data3, 0, F_3TH_CNT * sizeof(Float32));
- /* initialize the rest of the memory */
- for (i = 0; i < COMPLEN; i++)
- {
- state->mem_bckr_est[i] = NOISE_INIT;
- state->mem_level[i] = NOISE_INIT;
- state->mem_ave_level[i] = NOISE_INIT;
- state->mem_sub_level[i] = 0;
- }
- state->mem_sp_est_cnt = 0;
- state->mem_sp_max = 0;
- state->mem_sp_max_cnt = 0;
- state->mem_speech_level = SPEECH_LEVEL_INIT;
- state->mem_pow_sum = 0;
- state->mem_stat_count = 0;
- return 0;
- }
- /*
- * E_DTX_vad_init
- *
- * Parameters:
- * state I/0: State struct
- *
- * Function:
- * Allocates state memory and initializes state memory
- *
- * Returns:
- * non-zero with error, zero for ok
- */
- Word32 E_DTX_vad_init (E_DTX_Vad_State **state)
- {
- E_DTX_Vad_State* s;
- if (state == (E_DTX_Vad_State **) NULL)
- {
- return -1;
- }
- *state = NULL;
- /* allocate memory */
- if ((s = (E_DTX_Vad_State *) malloc(sizeof(E_DTX_Vad_State))) == NULL)
- {
- return -1;
- }
- E_DTX_vad_reset(s);
- *state = s;
- return 0;
- }
- /*
- * E_DTX_vad_exit
- *
- * Parameters:
- * state I/0: State struct
- *
- * Function:
- * The memory used for state memory is freed
- *
- * Returns:
- * void
- */
- void E_DTX_vad_exit (E_DTX_Vad_State **state)
- {
- if (state == NULL || *state == NULL)
- {
- return;
- }
- /* deallocate memory */
- free(*state);
- *state = NULL;
- return;
- }
- /*
- * E_DTX_pitch_tone_detection
- *
- * Parameters:
- * state I/0: State struct
- * p_gain I: pitch gain
- *
- * Function:
- * Set tone flag if pitch gain is high. This is used to detect
- * signaling tones and other signals with high pitch gain.
- *
- * Returns:
- * void
- */
- void E_DTX_pitch_tone_detection (E_DTX_Vad_State *st, Float32 p_gain)
- {
- /* update tone flag and pitch flag */
- st->mem_pitch_tone = (Word16)(st->mem_pitch_tone >> 1);
- /* if (pitch_gain > TONE_THR) set tone flag */
- if (p_gain > TONE_THR)
- {
- st->mem_pitch_tone = (Word16)(st->mem_pitch_tone | 0x4000);
- }
- }
- /*
- * E_DTX_vad
- *
- * Parameters:
- * st I/0: State struct
- * in_buf I: samples of the input frame
- *
- * Function:
- * Main program for Voice Activity Detection (VAD)
- *
- * Returns:
- * VAD Decision, 1 = speech, 0 = noise
- */
- Word16 E_DTX_vad(E_DTX_Vad_State *st, Float32 in_buf[])
- {
- Float64 L_temp, pow_sum;
- Float32 level[COMPLEN];
- Float32 temp;
- Word32 i;
- Word16 VAD_flag;
- /* Calculate power of the input frame. */
- L_temp = 0.0;
- for (i = 0; i < FRAME_LEN; i++)
- {
- L_temp += in_buf[i] * in_buf[i];
- }
- L_temp *= 2.0;
- /* pow_sum = power of current frame and previous frame */
- pow_sum = L_temp + st->mem_pow_sum;
- /* save power of current frame for next call */
- st->mem_pow_sum = L_temp;
- /* If input power is very low, clear tone flag */
- if (pow_sum < POW_PITCH_TONE_THR)
- {
- st->mem_pitch_tone = (Word16)(st->mem_pitch_tone & 0x1fff);
- }
- /* Run the filter bank and calculate signal levels at each band */
- E_DTX_filter_bank(st, in_buf, level);
- /* compute VAD decision */
- VAD_flag = E_DTX_decision(st, level, pow_sum);
- /* Calculate input level */
- L_temp = 0.0;
- for (i = 1; i < COMPLEN; i++) /* ignore lowest band */
- {
- L_temp += level[i];
- }
- temp = (Float32)(L_temp / 16.0F);
- E_DTX_speech_estimate(st, temp); /* Estimate speech level */
- return(VAD_flag);
- }