// FE_endpoint.cpp
// (source-archive metadata: uploaded by user "italyroyal" on 2013-05-06;
//  archive size 473k, this file 21k)
///////////////////////////////////////////////////////////////////////////////
// This is a part of the Feature program.
// Version: 1.0
// Date: February 22, 2003
// Programmer: Oh-Wook Kwon
// Copyright(c) 2003 Oh-Wook Kwon. All rights reserved. owkwon@ucsd.edu
///////////////////////////////////////////////////////////////////////////////
#include "StdAfx.h"
#include "FE_feature.h"
#include <limits.h>
#include <math.h>
#include <assert.h>

/*-----------------------------------------------
 * Definition of End-point Detection parameters
 *----------------------------------------------*/
#ifndef DEFAULT_SAMPLING_RATE
#define DEFAULT_SAMPLING_RATE 16000
#endif
#define DEFAULT_LONG_PAUSE_IN_MS 100 /* default duration of long-pause in millisecond (=100 ms) (changed for kWaves) */

/* For provision when there is no speech detected */
#define EPD_OUTPUT_ALWAYS 1 /* write speech even if no speech is detected in order to satisfy the speech recognizer. */
#define EPD_DUMMY_FRAMES 1 /* number of frames to be added when no speech is detected. */
#define EPD_MULTIPLE_END_POINT 1 /* detect multiple speech segments (changed for kWaves) */

/* Use terminology in the Aurora-3 VADNest */
#define EPD_NB_FRAME_THRESHOLD_LTE 10 /* warm-up frames for the long-term energy estimate */
#define EPD_LAMBDA_LTE 0.97           /* forgetting factor for the long-term (noise) energy */
#define EPD_SNR_THRESHOLD_UPD_LTE 4 /* in dB scale <-- 20 */
#define EPD_MIN_FRAME 10            /* frames before the detector trusts its statistics */

/* The minimum power of noise and speech are assumed as 14.5 and 37.5,
   which roughly correspond to the amplitude levels of 5 and 71. */
#define EPD_NOISE_ENERGY_FLOOR 14.5 /* ~ 0.5+10*log10(5*5) */
#define EPD_SPEECH_ENERGY_FLOOR 37.5 /* ~ 0.5+10*log10(71*71) */
#define EPD_NOISE_CLEAN 30.5 /* below this noise energy the environment is treated as clean */

/* Input frame is decided as speech if the difference between the log frame energy
   and the log mean energy exceeds the following threshold. The thresholds 40 and 48
   denote that the log frame energy of speech is 6 and 8 times larger than the log
   mean energy, respectively. Note that the threshold is 15 in the Aurora-3 VADNest. */
/* high noise case: internal microphone */
/* The threshold should be optimized later for the target environment. */
#define EPD_LOW_SNR 10 /* */
#define EPD_LOW_SNR_ENERGY_TH 12 /* ~ 10*log10(4*4) (changed for kWaves) */
#define EPD_LOW_SNR_ZCR_TH 30 /* */

/* low noise case: headset microphone */
/* We increase the threshold in the denoising case because the input signals have been
   denoised in the preceding noise reduction module and therefore have larger SNR. */
#define EPD_HIGH_SNR 20 /* (changed for kWaves) */
#define EPD_HIGH_SNR_ENERGY_TH 20 /* ~ 10*log10(10*10). (changed for kWaves) */
#define EPD_HIGH_SNR_ZCR_TH 4 /* */

/* Update signal energy if frame energy is larger than mean by this value in dB scale */
#define EPD_SNR_THRESHOLD_UPD_SIGNAL_EN 10
#define EPD_LAMBDA_SIGNAL_EN 0.95
#define EPD_LAMBDA_ZCR 0.98
#define EPD_SPEECH_END_ENERGY_OFFSET 6 /* threshold decrease at the speech end */
#define EPD_ZCR_THRESHOLD_UPD_LTE 20 /* threshold to update mean ZCR */

/*---------------------
 * local variables
 *---------------------*/
/* Pause length override in milliseconds; 0 means "use DEFAULT_LONG_PAUSE_IN_MS".
   Set via Epd::SetMaxPause(), read by Epd::ParmInit(). */
static int g_longPauseInMs = 0;
- bool Fe::EpdMain(const char *inputfile, int sampleRate, const char *outputfile)
- {
- FILE *fi;
- if( (fi = fopen(inputfile, "rb")) == NULL)
- return 0;
- fseek(fi,0L,SEEK_END);
- int fsize = ftell(fi);
- rewind(fi);
- vector<short> sample(fsize/sizeof(short));
- int sampleN = FREAD(&sample[0],sizeof(short),fsize/sizeof(short),fi);
- vector<CSegment> endPointA;
-
- if(!epd_basic(&sample[0], sampleN, sampleRate, endPointA)){
- fclose(fi);
- return false;
- }
- FILE *fo = fopen(outputfile, "wb");
- if(!fo){
- fprintf(stderr, "Cannot open %sn", outputfile);
- fclose(fi);
- return false;
- }
- fprintf(fo, "#LABELn");
- for(int i=0; i<endPointA.size(); i++){
- fprintf(fo,"%f -1 %sn",endPointA[i].m_fSegment, endPointA[i].m_szLabel.c_str());
- }
- fclose(fi);
- fclose(fo);
- return true;
- }
/*
 * Runs the end-point detector over a whole utterance held in memory.
 *
 * sampleA/sampleN : 16-bit PCM samples and their count
 * sampleRate      : sampling rate in Hz
 * endPointA       : receives begin/end markers (in seconds) for each
 *                   detected speech segment
 * returns the number of markers appended (0 means no speech was found,
 * which the caller treats as failure).
 */
int Fe::epd_basic(short *sampleA, int sampleN, int sampleRate, vector<CSegment>& endPointA)
{
    int i,t,frameX;
    Epd epd;
    epd.Init(sampleRate,0,1,1);
    epd.InitNewUtterance();
    vector<float> in(epd.m_config.m_winSize);
    vector<float> out(epd.m_config.m_winSize);
    int frameSize=epd.m_config.m_winSize;
    int shiftSize=epd.m_config.m_shiftSize;
    /* number of complete analysis frames that fit in the sample buffer */
    int frameN=(int)((sampleN-(frameSize-shiftSize))/float(shiftSize));
    /* frame shift in seconds; used to convert frame indices to time stamps */
    float period=epd.m_config.m_shiftSize/(float)sampleRate;
    FeReturnCode prevStatus=FE_NULL;
    int prevEndPt=-1;
    frameX=0;
    /* Iterate t = 0..frameN inclusive: the extra t==frameN pass feeds FE_EOF
       (with the stale contents of `in`) so the detector can flush its state. */
    for(t=0;;t++){
        if(t>frameN) break;
        FeReturnCode inStatus;
        EpdFrameKind frameKind;
        if(t<frameN) {
            inStatus=FE_SPEECH;
            for(i=0;i<frameSize;i++) in[i]=sampleA[t*shiftSize+i];
        }
        else{ /* t==frameN */
            inStatus=FE_EOF;
        }
        FeReturnCode status=epd.OneFrame(&in[0], &out[0], frameX, frameSize, inStatus, &frameKind);
        if(status==FE_NULL || status==FE_WAITING){
            continue; /* nothing to report yet; note frameX is intentionally not advanced */
        }
        /* Record a segment at speech -> pause/endpoint transitions and on the final EOF pass. */
        if(((prevStatus==FE_SPEECH) && (status==FE_END_POINT||status==FE_PAUSE)) || t==frameN){
            /* Only emit a marker pair for a new, non-empty segment. */
            if(prevEndPt<epd.m_uttBeginX && epd.m_uttBeginX<epd.m_uttEndX){
                epd_insert_endpoint(endPointA,epd.m_uttBeginX*period,epd.m_uttEndX*period);
                prevEndPt=epd.m_uttEndX;
            }
        }
        prevStatus=status;
        frameX++;
    }
    return endPointA.size();
}
- bool Fe::epd_insert_endpoint(vector<CSegment>& endPointA, float startPt, float endPt)
- {
- CSegment segment;
- segment.m_fSegment = startPt;
- segment.m_szLabel = "1";
- endPointA.push_back(segment);
- segment.m_fSegment = endPt;
- segment.m_szLabel = "0";
- endPointA.push_back(segment);
- return true;
- }
/* Default-construct with the default sampling rate. The absolute frame
   counter is reset only here (and in Init), never per utterance. */
Epd::Epd()
{
    ParmInit(&m_config, DEFAULT_SAMPLING_RATE, 1);
    m_absTimeX=0; /* initialized only once */
}
- int Epd::ParmInit(EpdParm *epdParm, int samplingRate, int isDenoised)
- {
- epdParm->m_sampleRate=samplingRate;
- if(samplingRate==8000){
- epdParm->m_shiftSize = 80; /* 10 ms. Should be same as frameShift in feature extraction */
- epdParm->m_winSize = 200; /* 25 ms. */
- }
- else if(samplingRate==11025 || samplingRate==11000){
- epdParm->m_shiftSize = 110; /* 10 ms. Should be same as frameShift in feature extraction */
- epdParm->m_winSize = 256; /* 23.27 ms. */
- }
- else if(samplingRate==16000){
- epdParm->m_shiftSize = 160; /* 10 ms. Should be same as frameShift in feature extraction */
- epdParm->m_winSize = 400; /* 25 ms. */
- }
- else{
- float shiftMs=10, winMs=25;
- epdParm->m_winSize = (int)(winMs/1000*epdParm->m_sampleRate);
- epdParm->m_shiftSize = (int)(shiftMs/1000*epdParm->m_sampleRate);
- }
- epdParm->m_threshFrameN = EPD_NB_FRAME_THRESHOLD_LTE; /* 100 ms for determining thresholds (<= m_startSilenceFrameN) */
- epdParm->m_startFrameN = 10; /* 100 ms for speech start detection */
- epdParm->m_startSilenceFrameN = 0; /* add 0 ms of silence before speech start (<= epdParm->m_startFrameN) (changed for kWaves) */
- epdParm->m_endFrameN = 30; /* 300 ms for pause detection */
- epdParm->m_endSilenceFrameN = 5; /* add 50 ms of silence after speech end (changed for kWaves) */
- if(g_longPauseInMs>0){
- epdParm->m_longPauseFrameN = g_longPauseInMs/10;
- }
- else{
- epdParm->m_longPauseFrameN = DEFAULT_LONG_PAUSE_IN_MS/10; /* 500 ms of silence for speech end detection (<= m_endSilenceFrameN) */
- }
- return 1;
- }
/*
 * (Re)initializes the detector for a new stream.
 *
 * samplingRate : input sampling rate in Hz
 * isAudio      : audio-source flag; affects EOF handling in OneFrame
 * isActive     : 0 disables detection (pass-through mode), 1 enables it
 * isDenoised   : forwarded to ParmInit
 * Always returns 1.
 */
int Epd::Init(int samplingRate, int isAudio, int isActive, int isDenoised)
{
    m_isAudio=isAudio;
    m_isActive=isActive;
    ParmInit(&m_config, samplingRate, isDenoised);
    m_uttBeginX=0;
    m_uttEndX=0;
    m_localStatus=EPD_STATUS_WAITING;
    m_localFrameX=0;
    m_sampleEndX=0;
    m_absTimeX=0; /* initialized only once */
    /* initial values defined in the Aurora-3 VADNest */
    m_lambdaLTE=(float)EPD_LAMBDA_LTE;
    m_lambdaLTEhigherE=(float)0.99;
    m_lambdaSignalE=(float)EPD_LAMBDA_SIGNAL_EN;
    m_nbSpeechFrame=0;
    m_noiseEn=0;
    m_signalEn=0;
    m_meanZcr=0;
    /* start the SNR estimate halfway between the low- and high-SNR anchors */
    m_lastSnr = (EPD_HIGH_SNR+EPD_LOW_SNR)/2;
    m_flagVAD=EPD_FK_SILENCE;
    return 1;
}
- void Epd::SetMaxPause(int msec)
- {
- if(msec>EPD_MAX_PAUSE_IN_MS){
- fprintf(stderr, "[ERROR] Too large pause duration. Must be less than %dn",EPD_MAX_PAUSE_IN_MS);
- assert(0);
- }
- g_longPauseInMs = msec;
- }
/* No dynamically allocated members are visible in this file; nothing to release. */
Epd::~Epd()
{
}
- int Epd::InitNewUtterance()
- {
- int i;
- for(i=0;i<EPD_FRAME_BUF_SIZE;i++) m_isSpeechA[i]=(EpdFrameKind)(-1);
- for(i=0;i<EPD_FRAME_BUF_SIZE;i++) m_zcrA[i]=0;
- m_localStatus=EPD_STATUS_WAITING;
- m_uttBeginX = 0;
- m_uttEndX = 0;
- m_localFrameX = 0;
- m_sampleEndX = 0;
- m_speechSegN = 0;
- return 1;
- }
- int Epd::PutSample(float *sampleA, int sampleN)
- {
- int i;
- for(i=0;i<sampleN;i++){
- m_epdSpeech[(m_sampleEndX)%EPD_SPEECH_BUF_SIZE]=(short)sampleA[i];
- m_sampleEndX=(m_sampleEndX+1)%EPD_SPEECH_BUF_SIZE;
- }
- return sampleN;
- }
/* Teardown hook kept for API symmetry; Epd holds no resources that need
   explicit release, so this is intentionally a no-op. */
void EpdClose(Epd *epd)
{
}
/* This algorithm is complicated for now and I need more elegant way of endpoint detection. */
/*
 * Streaming end-point state machine; call once per analysis frame.
 * Buffers the raw samples (PutSample), classifies the frame with the
 * per-frame VAD (the OneFrame(const float*) overload), and walks the
 * states WAITING -> SPEECH -> PAUSE -> ENDPOINT (and back to WAITING
 * when EPD_MULTIPLE_END_POINT is enabled).
 *
 * in        : input frame of winSize samples
 * out       : receives winSize samples copied from the circular buffer,
 *             delayed so that output tracks the detected utterance start
 * frameX    : caller's output-frame index relative to the current utterance
 * winSize   : analysis window length in samples
 * inStatus  : FE_SPEECH for a normal frame, FE_EOF when input is exhausted
 * frameKind : receives this frame's VAD decision (speech/silence)
 * returns one of FE_SPEECH, FE_PAUSE, FE_WAITING, FE_NULL, FE_END_POINT,
 * FE_NO_SPEECH, FE_EARLY_END, FE_UNK_ERROR.
 */
FeReturnCode Epd::OneFrame(float *in, float *out, int frameX, int winSize, FeReturnCode inStatus, EpdFrameKind *frameKind)
{
    int i;
    int frameShift=m_config.m_shiftSize;
    int maxDataN=EPD_MAX_RECORD_TIME*m_config.m_sampleRate; /* NOTE(review): computed but never used below */
    /* The first frame contributes a full window; later frames only the new shift's worth. */
    if(m_localFrameX==0) PutSample(in,winSize);
    else PutSample(in+my_max(0,winSize-frameShift),frameShift);
    *frameKind=EPD_FK_SILENCE;
    if(m_isActive==0){
        /* Pass-through mode: detection disabled, every frame is forwarded as speech. */
        if(inStatus==FE_EOF){
            m_uttEndX=m_localFrameX;
            return FE_END_POINT; /* end-point detected */
        }
        else{
            for(i=0;i<winSize;i++) out[i]=m_epdSpeech[(m_localFrameX*frameShift+i)%EPD_SPEECH_BUF_SIZE];
            *frameKind=OneFrame(in);
            m_uttEndX=m_localFrameX;
            return FE_SPEECH; /* in-speech */
        }
    }

    /* EOF handling when little or no speech remains to be flushed. */
    if(inStatus==FE_EOF && frameX+m_uttBeginX+m_config.m_startFrameN >= m_uttEndX){
        if(m_isSpeechA[(m_localFrameX-1)%EPD_FRAME_BUF_SIZE]==EPD_FK_SPEECH && m_isSpeechA[(m_localFrameX-2)%EPD_FRAME_BUF_SIZE]==EPD_FK_SPEECH){
            /* printf("n"); Error("Epd","[ERROR] EOF found in the middle of speechn"); */
            return FE_EARLY_END;
        }
        else if(m_isAudio==0 && EPD_OUTPUT_ALWAYS && (m_uttBeginX>m_uttEndX || m_speechSegN<1)){
            /* Because subsequent frames are all assumed silence, I regard the frame after the last speech as the end-point. */
            m_isSpeechA[m_localFrameX%EPD_FRAME_BUF_SIZE]=EPD_FK_SILENCE;
            /* Scan backwards for the last speech frame; `i` is deliberately
               reused after this loop as the position of that frame. */
            for(i=m_localFrameX-3;i>=m_localFrameX-m_config.m_endFrameN+1 && i>=0;i--){
                if(m_isSpeechA[i%EPD_FRAME_BUF_SIZE]==EPD_FK_SPEECH) break;
            }
            /* Let's make the speech recognizer happy by making one-frame dummy speech */
            if(m_uttEndX<m_uttBeginX+EPD_DUMMY_FRAMES){
                m_uttEndX=m_uttBeginX+EPD_DUMMY_FRAMES;
            }
            m_localStatus=EPD_STATUS_ENDPOINT;
            m_uttEndX=my_max(0,my_min(m_localFrameX,(i+1)+m_config.m_endSilenceFrameN));
            if(m_speechSegN==0) m_speechSegN++;
            /* printf("EPD: Endpoint detected (begin=%d, end=%d)n",m_uttBeginX,m_uttEndX); */
            for(i=0;i<winSize;i++) out[i]=m_epdSpeech[((m_uttBeginX+frameX)*frameShift+i)%EPD_SPEECH_BUF_SIZE];
            return FE_END_POINT; /* in-speech */
        }
        else if(m_localStatus==EPD_STATUS_WAITING){
            /* printf("n"); Error("Epd","[ERROR] No speechn"); */
            return FE_NO_SPEECH;
        }
    }
    if(m_localStatus==EPD_STATUS_WAITING){
        /* find begin-point */
        /* printf("EPD: Searching for speech...n"); */
        *frameKind=OneFrame(in);
        if(FindBeginPoint(m_localFrameX-1)) {
            /* Add extra frames before the speech start-point of localFrameX-startFrameN */
            m_uttBeginX=my_max(0,(m_localFrameX-m_config.m_startFrameN-m_config.m_startSilenceFrameN+1));
            m_uttEndX=m_localFrameX;
            m_localStatus=EPD_STATUS_SPEECH;
            /* printf("EPD: Speech detected (begin=%d)n",m_uttBeginX); */
            for(i=0;i<winSize;i++) out[i]=m_epdSpeech[((m_uttBeginX+frameX)*frameShift+i)%EPD_SPEECH_BUF_SIZE];
            return FE_SPEECH; /* in-speech */
        }
        else{
            return FE_WAITING;
        }
    }
    else if(m_localStatus==EPD_STATUS_SPEECH){
        if(inStatus==FE_EOF){
            /* Because subsequent frames are all assumed silence, I regard the frame after the last speech as the end-point. */
            m_isSpeechA[m_localFrameX%EPD_FRAME_BUF_SIZE]=EPD_FK_SILENCE;
            /* As above, `i` carries the last speech frame index out of the loop. */
            for(i=m_localFrameX-3;i>=m_localFrameX-m_config.m_endFrameN+1 && i>=0;i--){
                if(m_isSpeechA[i%EPD_FRAME_BUF_SIZE]==EPD_FK_SPEECH) break;
            }
            m_localStatus=EPD_STATUS_PAUSE;
            m_uttEndX=my_max(0,my_min(m_localFrameX,(i+1)+m_config.m_endSilenceFrameN));
            /* printf("EPD: Endpoint detected (begin=%d, end=%d)n",m_uttBeginX,m_uttEndX); */
            for(i=0;i<winSize;i++) out[i]=m_epdSpeech[((m_uttBeginX+frameX)*frameShift+i)%EPD_SPEECH_BUF_SIZE];
            return FE_SPEECH; /* in-speech */
        }
        /* prepare to detect end-point */
        if(m_localFrameX<m_uttBeginX+m_config.m_endFrameN-1){
            /* Not enough history since speech start to judge an end-point yet. */
            *frameKind=OneFrame(in);
            return FE_NULL;
        }
        *frameKind=OneFrame(in);
        if(FindEndPoint(m_localFrameX-1, m_config.m_endFrameN)) {
            m_uttEndX=m_localFrameX-m_config.m_endFrameN;
            m_localStatus=EPD_STATUS_PAUSE;
            m_uttEndX=m_uttEndX+my_max(0,m_config.m_endSilenceFrameN-m_config.m_endFrameN);
            /* printf("EPD: Endpoint detected (begin=%d, end=%d)n",m_uttBeginX,m_uttEndX); */
            for(i=0;i<winSize;i++) out[i]=m_epdSpeech[((m_uttBeginX+frameX)*frameShift+i)%EPD_SPEECH_BUF_SIZE];
            return FE_SPEECH; /* in-speech */
        }
        else{
            m_uttEndX=m_localFrameX;
            for(i=0;i<winSize;i++) out[i]=m_epdSpeech[((m_uttBeginX+frameX)*frameShift+i)%EPD_SPEECH_BUF_SIZE];
            return FE_SPEECH; /* in-speech */
        }
    }
    else if(m_localStatus==EPD_STATUS_PAUSE){
        if(inStatus!=FE_EOF){
            *frameKind=OneFrame(in);
        }
        if(FindBeginPoint(m_localFrameX-1)) {
            /* Speech resumed within the pause: keep the original begin point. */
            m_uttEndX=m_localFrameX;
            m_localStatus=EPD_STATUS_SPEECH;
            /* printf("EPD: Speech detected again (begin=%d)n",m_localFrameX); */
            for(i=0;i<winSize;i++) out[i]=m_epdSpeech[((m_uttBeginX+frameX)*frameShift+i)%EPD_SPEECH_BUF_SIZE];
            return FE_SPEECH; /* in-speech */
        }
        else if(m_localFrameX<m_uttBeginX+m_config.m_longPauseFrameN-1){
            /* Pause still shorter than the long-pause limit: keep draining
               buffered speech, then report pause (or flush at EOF). */
            if(frameX+m_uttBeginX < m_uttEndX){
                for(i=0;i<winSize;i++) out[i]=m_epdSpeech[((m_uttBeginX+frameX)*frameShift+i)%EPD_SPEECH_BUF_SIZE];
                return FE_SPEECH; /* in-speech */
            }
            else if(inStatus==FE_EOF){
                m_uttEndX=m_uttBeginX+frameX;
                m_speechSegN++;
                return FE_END_POINT; /* end-point */
            }
            else{
                for(i=0;i<winSize;i++) out[i]=m_epdSpeech[((m_uttBeginX+frameX)*frameShift+i)%EPD_SPEECH_BUF_SIZE];
                return FE_PAUSE; /* pause */
            }
        }
        else{
            /* Pause exceeded the long-pause window: confirm it is silence. */
            if(FindEndPoint(m_localFrameX-1, m_config.m_longPauseFrameN)) {
                m_localStatus=EPD_STATUS_ENDPOINT;
                if(m_uttEndX < m_uttBeginX+frameX){
                    m_speechSegN++;
                    /* printf("EPD: Endpoint detected (begin=%d, end=%d)n",m_uttBeginX,m_uttEndX); */
                    return FE_END_POINT; /* end-point */
                }
                else{
                    /* Output still lags the detected end; keep draining. */
                    m_uttEndX=m_uttBeginX+frameX;
                    for(i=0;i<winSize;i++) out[i]=m_epdSpeech[((m_uttBeginX+frameX)*frameShift+i)%EPD_SPEECH_BUF_SIZE];
                    return FE_SPEECH; /* in-speech */
                }
            }
            else{
                for(i=0;i<winSize;i++) out[i]=m_epdSpeech[((m_uttBeginX+frameX)*frameShift+i)%EPD_SPEECH_BUF_SIZE];
                return FE_PAUSE; /* pause */
            }
        }
    }
    else if(m_localStatus==EPD_STATUS_ENDPOINT){
        if(m_uttBeginX+frameX < m_uttEndX){
            /* Still draining buffered speech frames up to the end-point. */
            for(i=0;i<winSize;i++) out[i]=m_epdSpeech[((m_uttBeginX+frameX)*frameShift+i)%EPD_SPEECH_BUF_SIZE];
            return FE_SPEECH; /* in-speech */
        }
        else{
            if(EPD_MULTIPLE_END_POINT){
                /* Re-arm the detector so further speech segments can be found. */
                m_uttBeginX=m_localFrameX;
                m_uttEndX=m_localFrameX;
                m_localStatus=EPD_STATUS_WAITING;
                return FE_WAITING; /* wait for another speech */
            }
            else{
                m_speechSegN++;
                return FE_END_POINT; /* end-point */
            }
        }
    }
    else{
        assert(0); /* unreachable: unknown m_localStatus */
        return FE_UNK_ERROR; /* unknown error */
    }
    return FE_UNK_ERROR; /* unknown error */
}
- int Epd::FindBeginPoint(int endX) {
- int i, sum=0;
- assert(m_isSpeechA[endX%EPD_FRAME_BUF_SIZE] != -1);
- assert(m_localStatus==EPD_STATUS_WAITING || m_localStatus==EPD_STATUS_PAUSE);
- if(endX<m_config.m_startFrameN) return 0;
- for(i=endX;i>=endX-m_config.m_startFrameN+1;i--){ /* look back startFrameN frames */
- sum += (int)m_isSpeechA[i%EPD_FRAME_BUF_SIZE];
- }
- if(m_isSpeechA[endX%EPD_FRAME_BUF_SIZE]==EPD_FK_SPEECH && sum>=m_config.m_startFrameN-2) {
- /* Remove breath noise because it has low pitch frequency. */
- for(i=endX, sum=0;i>=endX-m_config.m_startFrameN+1;i--){
- if(m_zcrA[i%EPD_FRAME_BUF_SIZE] >= EPD_HIGH_SNR_ZCR_TH) sum++;
- }
- if(sum > m_config.m_startFrameN/2)
- return 1;
- else
- return 0;
- }
- else return 0;
- }
- int Epd::FindEndPoint(int endX, int reqSilN) {
- int i, sum=0;
- assert(m_isSpeechA[endX%EPD_FRAME_BUF_SIZE] != -1);
- assert(m_localStatus==EPD_STATUS_SPEECH || m_localStatus==EPD_STATUS_PAUSE);
- for(i=endX;i>=endX-reqSilN+1;i--){ /* look back endFrameN frames */
- sum += (int)m_isSpeechA[i%EPD_FRAME_BUF_SIZE];
- }
- if(m_isSpeechA[endX%EPD_FRAME_BUF_SIZE]==EPD_FK_SILENCE && sum<=1) return 1;
- else return 0;
- }
- EpdFrameKind Epd::OneFrame(const float *s)
- {
- int i, N=m_config.m_winSize;
- float frameEn, frameEnTh, sum, zcr, prev, snrEn, zcrTh;
- float x[EPD_MAX_WIN_SIZE];
-
- if(m_localFrameX<EPD_NB_FRAME_THRESHOLD_LTE)
- m_lambdaLTE=1-1/(float)(m_localFrameX+1);
- else
- m_lambdaLTE=(float)EPD_LAMBDA_LTE;
- if(m_localFrameX<EPD_NB_FRAME_THRESHOLD_LTE)
- m_lambdaZcr=1-1/(float)(m_localFrameX+1);
- else
- m_lambdaZcr=(float)EPD_LAMBDA_ZCR;
-
- /*
- owkwon: Prevent DC level from drifting and high frquency babble noise from surviving noise reduction.
- Here a band-pass filter with pass band 500-2800 Hz is used by cascading H1(z)=1-z^(-1)
- and H2(z)=z+1-z^(-1). This filter may sometimes lose unvoiced frames like /s/ and /ch/.
- The output of the Mel FB in the noise reduction module can be used for this purpose
- as the Aurora-3 uses the 2nd, 3rd, 4th FB coefficients. A more elaborate method is needed here.
- */
- if(1){
- /* DC offset removal, H(z)=(1-z^(-1))/(1-0.999*z^(-1)), y[i]=x[i]-x[i-1]+(1-1/1024)*y[i-1] */
- float x0=0, y0=0, a=(1-1/(float)1024);
- for (i=0; i<N; i++){
- y0 = (float)(x[i] - x0 + a * y0); x0 = x[i]; x[i] = y0; /* in-place output */
- }
- /* low-pass filtering, H(z)=[1 2 1]/4 */
- x[0]=(s[1]+2*s[0]+s[0])/4; x[N-1]=(s[N-1]+2*s[N-1]+s[N-2])/4;
- for(i=1;i<N-1; i++) {
- x[i]=(s[i+1]+2*s[i]+s[i-1])/4;
- }
- }
- for(i=0, sum=0; i<N; i++) {
- sum += x[i]*x[i];
- }
- frameEn = (float)(0.5+10/log(10)*log(1+sum/N));
- /*
- owkwon: Added the condition (frameEn < EPD_SPEECH_ENERGY_FLOOR) for babble noise.
- The babble noise is not removed completely by noise reduction; there exist
- residual speech-like signals.
- */
- if((frameEn-m_noiseEn)<EPD_SNR_THRESHOLD_UPD_LTE || m_localFrameX<EPD_MIN_FRAME || (frameEn < EPD_SPEECH_ENERGY_FLOOR)){
- if((frameEn<m_noiseEn) || (m_localFrameX<EPD_MIN_FRAME) || (frameEn < EPD_SPEECH_ENERGY_FLOOR)){
- m_noiseEn=m_noiseEn+(1-m_lambdaLTE)*(frameEn-m_noiseEn);
- }
- else{
- m_noiseEn=m_noiseEn+(1-m_lambdaLTEhigherE)*(frameEn-m_noiseEn);
- }
- if(m_noiseEn<EPD_NOISE_ENERGY_FLOOR) m_noiseEn=(float)EPD_NOISE_ENERGY_FLOOR;
- m_noiseLevel=(float)(2*sqrt(exp(log(10)/10*(m_noiseEn-0.5))-1));
- }
- if((frameEn-m_noiseEn)>EPD_SNR_THRESHOLD_UPD_SIGNAL_EN){
- if(m_localFrameX>=EPD_MIN_FRAME){
- m_signalEn=m_signalEn+(1-m_lambdaSignalE)*(frameEn-m_signalEn);
- }
- else{
- m_signalEn=m_noiseEn+EPD_SNR_THRESHOLD_UPD_SIGNAL_EN;
- }
- if(m_signalEn-m_noiseEn < m_lastSnr/2)
- m_signalEn = m_noiseEn + m_lastSnr/2;
- }
- else if(frameEn>m_signalEn){
- m_signalEn=frameEn+EPD_SNR_THRESHOLD_UPD_SIGNAL_EN;
- if(m_signalEn-m_noiseEn < m_lastSnr/2)
- m_signalEn = m_noiseEn + m_lastSnr/2;
- }
- prev = x[0]-m_noiseLevel;
- for(i=1, zcr=0; i<N; i++) {
- float val = x[i]-m_noiseLevel;
- float ztmp=val*prev;
- if(ztmp<0) zcr++;
- prev=val;
- }
- if((zcr-m_meanZcr) < EPD_ZCR_THRESHOLD_UPD_LTE || m_localFrameX<EPD_MIN_FRAME){
- m_meanZcr=m_meanZcr+(1-m_lambdaZcr)*(zcr-m_meanZcr);
- }
- snrEn=m_signalEn-m_noiseEn;
- {
- float slopeEn=(EPD_HIGH_SNR_ENERGY_TH-EPD_LOW_SNR_ENERGY_TH)/(float)(EPD_HIGH_SNR-EPD_LOW_SNR);
- float slopeZcr=(EPD_HIGH_SNR_ZCR_TH-EPD_LOW_SNR_ZCR_TH)/(float)(EPD_HIGH_SNR-EPD_LOW_SNR);
- frameEnTh=(float)(EPD_LOW_SNR_ENERGY_TH+slopeEn*(snrEn-EPD_LOW_SNR));
- if(m_localStatus==EPD_STATUS_SPEECH){
- frameEnTh = frameEnTh-EPD_SPEECH_END_ENERGY_OFFSET;
- }
- frameEnTh=my_max(EPD_LOW_SNR_ENERGY_TH,my_min(frameEnTh,EPD_HIGH_SNR_ENERGY_TH));
- zcrTh=(float)(EPD_LOW_SNR_ZCR_TH+slopeZcr*(snrEn-EPD_LOW_SNR));
- zcrTh=my_max(EPD_HIGH_SNR_ZCR_TH,my_min(zcrTh,EPD_LOW_SNR_ZCR_TH));
- }
- #if 0
- if(m_localFrameX%10==0){
- printf("%fn",snrEn);
- }
- #endif
- if(frameEn < EPD_SPEECH_ENERGY_FLOOR){
- m_flagVAD=EPD_FK_SILENCE;
- }
- else if(frameEn-m_noiseEn < EPD_LOW_SNR){
- m_flagVAD=EPD_FK_SILENCE;
- }
- else if((frameEn-m_noiseEn)>frameEnTh){
- m_flagVAD=EPD_FK_SPEECH;
- m_nbSpeechFrame=m_nbSpeechFrame+1;
- m_lastSnr=snrEn;
- }
- else{
- if(m_localFrameX>EPD_MIN_FRAME && m_noiseEn < EPD_NOISE_CLEAN && zcr-m_meanZcr >= zcrTh){
- m_flagVAD=EPD_FK_SPEECH;
- m_nbSpeechFrame=m_nbSpeechFrame+1;
- }
- else
- {
- m_flagVAD=EPD_FK_SILENCE;
- }
- }
- m_zcrA[m_localFrameX%EPD_FRAME_BUF_SIZE]=zcr;
- m_isSpeechA[m_localFrameX%EPD_FRAME_BUF_SIZE]=m_flagVAD;
- m_localFrameX++;
- m_absTimeX++;
- return m_flagVAD;
- }