FE_vus.cpp
上传用户:italyroyal
上传日期:2013-05-06
资源大小:473k
文件大小:7k
- ///////////////////////////////////////////////////////////////////////////////
- // This is a part of the Feature program.
- // Version: 1.0
- // Date: February 22, 2003
- // Programmer: Oh-Wook Kwon
- // Copyright(c) 2003 Oh-Wook Kwon. All rights reserved. owkwon@ucsd.edu
- ///////////////////////////////////////////////////////////////////////////////
- #include "StdAfx.h"
- #include "FE_feature.h"
/* Tuning constants for the frame classifier in Fe::vus_basic(). */

/* Number of initial frames during which the running averages use the
   growing-window factor 1-1/(count+1) instead of a fixed forgetting factor. */
static const float EPD_NB_FRAME_THRESHOLD_LTE = static_cast<float>(10);
/* Fixed forgetting factor of the noise-energy tracker. */
static const float EPD_LAMBDA_LTE = static_cast<float>(0.97);
/* Fixed forgetting factor of the mean zero-crossing-rate tracker (old=0.99). */
static const float EPD_LAMBDA_ZCR = static_cast<float>(0.97);
/* The noise estimate is updated while (frame energy - noise energy) is below this value. */
static const float EPD_SNR_THRESHOLD_UPD_LTE = static_cast<float>(4);
/* The mean ZCR is updated while (frame ZCR - mean ZCR) is below this value. */
static const float EPD_ZCR_THRESHOLD_UPD_LTE = static_cast<float>(10);
/* Lower clamp applied to both the frame energy and the noise-energy estimate. */
static const float EPD_NOISE_ENERGY_FLOOR = static_cast<float>(14.5);
/* Frames whose energy is below this value always update the noise estimate. */
static const float EPD_SPEECH_ENERGY_FLOOR = static_cast<float>(37.5);
/* Slower forgetting factor used when the noise estimate is pulled upwards. */
static const float lambdaLTEhigherE = static_cast<float>(0.99);
/* Frames (or ZCR-active frames) below this count always update the trackers. */
static const int EPD_MIN_FRAME = 10;
- int Fe::vus_basic(short *sample, int sampleN, int frameSize, vector<EVusType>& vusA)
- {
- int n,i;
- vector<float> energyA;
- vector<float> zcrA;
- vector<float> noiseEnA;
- vector<float> meanZcrA;
- float noiseEn=0;
- float meanZcr=0;
- int zcrFrameN=0;
- #ifdef _DEBUG
- #define TEST_FRAME_LEN 300
- float tmp1A[TEST_FRAME_LEN];
- float tmp2A[TEST_FRAME_LEN];
- EVusType vusTmpA[TEST_FRAME_LEN];
- float meanEnergyTmpA[TEST_FRAME_LEN];
- float meanZcrTmpA[TEST_FRAME_LEN];
- #endif
- int shiftSize=GetShiftSize();
- int frameN=(int)((sampleN-(frameSize-shiftSize))/shiftSize);
- energyA.resize(frameN);
- zcrA.resize(frameN);
- noiseEnA.resize(frameN);
- meanZcrA.resize(frameN);
- vusA.resize(frameN);
- float VUS_ENERGY_TH_VOICED=30;
- float VUS_ENERGY_TH_UNVOICED=60;
- float VUS_ENERGY_TH_START=20;
- float VUS_ENERGY_TH_INSPEECH=15; // old=6
- int VUS_ZCR_TH_MAX=50;
- int VUS_ZCR_TH_MIN=5;
- int VUS_VOICE_ZCR_TH_MIN=0;
- int VUS_ZCR_UPD_TH=50;
- vector<float> x(frameSize);
- float vus_energy_th=VUS_ENERGY_TH_START;
- for(n=0;n<frameN;n++){
- float lambdaLTE, lambdaZcr;
- if(n<EPD_NB_FRAME_THRESHOLD_LTE)
- lambdaLTE=1-1/(float)(n+1);
- else
- lambdaLTE=EPD_LAMBDA_LTE;
- if(zcrFrameN<EPD_NB_FRAME_THRESHOLD_LTE)
- lambdaZcr=1-1/(float)(zcrFrameN+1);
- else
- lambdaZcr=EPD_LAMBDA_ZCR;
-
- int begX=n*shiftSize;
- {
- /* DC offset removal, H(z)=(1-z^(-1))/(1-a*z^(-1)) */
- float a=1-1/(float)1024;
- x[0]=sample[begX];
- for(i=1;i<frameSize;i++){
- x[i]=(sample[begX+i]-sample[begX+i-1]+a*x[i-1]);
- }
- /* low-pass filter, x=filter([1 2 1]/4, [1], x); */
- vector<float> xorg=x;
- x[0]=(xorg[0]+2*xorg[0]+xorg[1])/4;
- for(i=1;i<frameSize-1;i++){
- x[i]=(xorg[i-1]+2*xorg[i]+xorg[i+1])/4;
- }
- x[frameSize-1]=(xorg[frameSize-2]+2*xorg[frameSize-1]+xorg[frameSize-1])/4;
- }
- /* Normalize energy to frame length */
- float sum=0;
- for(i=0;i<frameSize;i++){
- sum += (x[i]*x[i]);
- }
- energyA[n] = (0.5+10*LOG10(1+sum/frameSize)); /* dB scale */
- if(energyA[n]<EPD_NOISE_ENERGY_FLOOR) energyA[n]=EPD_NOISE_ENERGY_FLOOR;
- /* Estimate noise */
- float noiseLevel=0;
- if((energyA[n]-noiseEn)<EPD_SNR_THRESHOLD_UPD_LTE || n<EPD_MIN_FRAME || (energyA[n] < EPD_SPEECH_ENERGY_FLOOR)){
- if((energyA[n]<noiseEn) | (n<EPD_MIN_FRAME) | (energyA[n] < EPD_SPEECH_ENERGY_FLOOR)){
- noiseEn=noiseEn+(1-lambdaLTE)*(energyA[n]-noiseEn);
- }
- else{
- noiseEn=noiseEn+(1-lambdaLTEhigherE)*(energyA[n]-noiseEn);
- }
- /* noise level should be computed from the original noise energy */
- noiseLevel=(2*sqrt(exp(log(10)/10*(noiseEn-0.5))-1));
- if(noiseEn<EPD_NOISE_ENERGY_FLOOR) noiseEn=EPD_NOISE_ENERGY_FLOOR;
- }
- noiseEnA[n]=noiseEn;
- /* compute zero crossing rate (changed) */
- float prev = x[0]-noiseLevel;
- int zcr=0;
- for(i=1;i<frameSize;i++){
- float val = x[i]-noiseLevel;
- float ztmp=val*prev;
- if(ztmp<0) zcr = zcr+1;
- prev=val;
- }
- zcrA[n]=zcr;
- if(zcr>0 || zcrFrameN>0){
- zcrFrameN++;
- }
- if(zcrA[n]<VUS_ZCR_UPD_TH && ((zcrA[n]-meanZcr)<EPD_ZCR_THRESHOLD_UPD_LTE || zcrFrameN<EPD_MIN_FRAME)){
- meanZcr=meanZcr+(1-lambdaZcr)*(zcrA[n]-meanZcr);
- }
- meanZcrA[n]=meanZcr;
- float deltaEnergy=energyA[n]-noiseEn;
- float deltaZcr=zcrA[n]-meanZcr;
- if(deltaEnergy<VUS_ENERGY_TH_UNVOICED && deltaEnergy<vus_energy_th && deltaZcr<VUS_VOICE_ZCR_TH_MIN){
- vusA[n]=FRM_SILENCE;
- vus_energy_th=VUS_ENERGY_TH_START;
- }
- else if(deltaZcr>VUS_ZCR_TH_MAX){
- vus_energy_th=VUS_ENERGY_TH_START;
- vusA[n]=FRM_UNVOICED;
- }
- else if(deltaZcr>VUS_ZCR_TH_MIN && deltaEnergy<vus_energy_th){
- vus_energy_th=VUS_ENERGY_TH_START;
- vusA[n]=FRM_UNVOICED;
- }
- else if(zcr>VUS_ZCR_TH_MIN && deltaEnergy<vus_energy_th){
- vus_energy_th=VUS_ENERGY_TH_START;
- vusA[n]=FRM_UNVOICED;
- }
- else if(deltaEnergy>VUS_ENERGY_TH_VOICED){
- vus_energy_th=VUS_ENERGY_TH_INSPEECH;
- vusA[n]=FRM_VOICED;
- }
- else if(deltaEnergy>vus_energy_th && (deltaZcr>VUS_VOICE_ZCR_TH_MIN || zcr>VUS_ZCR_TH_MIN)){
- vus_energy_th=VUS_ENERGY_TH_INSPEECH;
- vusA[n]=FRM_VOICED;
- }
- else{
- vusA[n]=FRM_SILENCE;
- vus_energy_th=VUS_ENERGY_TH_START;
- }
- /* TRACE("%d %f %f %f %f %dn",n,energyA[n],zcrA[n],deltaEnergy,deltaZcr,(int)vusA[n]); */
- }
- #ifdef _DEBUG
- int k;
- for(k=0;k<my_min(TEST_FRAME_LEN,vusA.size());k++){
- tmp1A[k]=energyA[k];
- tmp2A[k]=zcrA[k];
- meanEnergyTmpA[k]=noiseEnA[k];
- meanZcrTmpA[k]=meanZcrA[k];
- vusTmpA[k]=vusA[k];
- }
- #endif
- /* median filtering */
- vus_median_filter(vusA);
- #ifdef _DEBUG
- for(k=0;k<my_min(TEST_FRAME_LEN,vusA.size());k++){
- vusTmpA[k]=vusA[k];
- }
- #endif
- /* remove short segments */
- vus_remove_short_segments(vusA);
- return frameN;
- }
- int Fe::vus_median_filter(vector<EVusType>& vusA)
- {
- int n,i,k;
- int frameN=vusA.size();
- /* two-stage median filtering, length 5 */
- vector<EVusType> vusOrgA=vusA;
- float tmp[5];
- for(n=2;n<frameN-2;n++){
- for(i=n-2,k=0;i<=n+2;i++,k++) tmp[k]=(float)(vusOrgA[i]);
- vusA[n]=(EVusType)((int)(GetMedian(tmp,5)+0.5));
- }
- vusOrgA=vusA;
- for(n=2;n<frameN-2;n++){
- for(i=n-2,k=0;i<=n+2;i++,k++) tmp[k]=(float)(vusOrgA[i]);
- vusA[n]=(EVusType)((int)(GetMedian(tmp,5)+0.5));
- }
- return frameN;
- }
- int Fe::vus_remove_short_segments(vector<EVusType>& vusA)
- {
- /* remove short voiced segments */
- vus_remove_short_segments_sub(vusA, FRM_VOICED, 5); // old=7
- /* remove short silent segments */
- vus_remove_short_segments_sub(vusA, FRM_SILENCE, 2); // old=2
- /* remove short unvoiced segments */
- vus_remove_short_segments_sub(vusA, FRM_UNVOICED, 3); // old=4
- return 1;
- }
- /* remove short segments with duration less than or equal to minDur */
- int Fe::vus_remove_short_segments_sub(vector<EVusType>& vusA, EVusType type, int minDur)
- {
- int startX=1;
- int n,i,k=0;
- EVusType prevType=vusA[0];
- for(n=1;n<vusA.size();n++){
- if(vusA[n] == type && vusA[n-1] != type){
- startX=n;
- }
- else if(vusA[n] != type && vusA[n-1] == type){
- if(n-startX<=minDur){
- for(i=startX;i<n;i++) vusA[i]=vusA[startX-1];
- k++;
- }
- startX=n;
- }
- }
- if(n-startX<=minDur && vusA[n-1]==type){
- for(i=startX;i<n;i++) vusA[i]=vusA[startX-1];
- k++;
- }
- return k;
- }