FE_pitch.cpp
上传用户:italyroyal
上传日期:2013-05-06
资源大小:473k
文件大小:14k
- ///////////////////////////////////////////////////////////////////////////////
- // This is a part of the Feature program.
- // Version: 1.0
- // Date: February 22, 2003
- // Programmer: Oh-Wook Kwon
- // Copyright(c) 2003 Oh-Wook Kwon. All rights reserved. owkwon@ucsd.edu
- ///////////////////////////////////////////////////////////////////////////////
- #include "StdAfx.h"
- #include "FE_feature.h"
- struct FePitchSeg /* Pitch detection parameters */
- {
- EVusType v;
- float f;
- };
- int Fe::pitch_basic(short *sample, int sampleN, int sampleRate, int shiftSize, vector<EVusType>& vusA, vector<float>& pitchA)
- {
- int i,n;
- int maxFrameSize = 2*sampleRate/MIN_PITCH_FREQ; /* 40 ms */
- int minFrameSize = 3*sampleRate/MAX_PITCH_FREQ; /* 12 ms */
- int num_frames = sampleN/shiftSize;
- int lpfLen = (sampleRate/(2*MAX_PITCH_FREQ)); if(lpfLen%2 == 0) lpfLen += 1;
- float maxDeltaPitch = (MAX_PITCH_CHANGE*MAX_PITCH_FREQ); /* 50 Hz for 10 ms */
- #ifdef _DEBUG
- #define TEST_FRAME_LEN 300
- EVusType vusTmpA[TEST_FRAME_LEN];
- int k;
- #endif
- pitchA.resize(num_frames+1);
- for(i=0;i<=num_frames;i++) pitchA[i]=0;
- if(!(sampleN > 0 && sampleRate > 0)) return false;
- if(sampleN<maxFrameSize) return false;
- vector<float> amdfA(maxFrameSize);
- vector<float> frameA(maxFrameSize);
- vector<float> p(num_frames+1+PITCH_BUF_SIZE/2);
- FePitchSeg bufA[PITCH_BUF_SIZE];
- FePitchSeg curr;
-
- if(vusA.size()==0) vus_basic(sample, sampleN, GetFrameSize(), vusA);
- pitch_low_pass_filter(sample, sampleN, lpfLen);
- for(i=0;i<PITCH_BUF_SIZE;i++) bufA[i].v = FRM_SILENCE;
-
- int nCurrPos = (PITCH_BUF_SIZE-1)/2;
- int PITCH_BUF_SIZE2 = (PITCH_BUF_SIZE-1)/2;
- m_meanPitch = 0;
- m_pitchFrameN = 0;
- for(n=0;n < PITCH_BUF_SIZE/2; n++){
- curr.v=vusA[n];
- if(curr.v == FRM_VOICED){
- for(i=0;i<maxFrameSize;i++) frameA[i]=sample[n*shiftSize+i];
- m_window.Windowing(&frameA[0],maxFrameSize,WIN_HANNING);
- curr.f = pitch_find(&frameA[0], maxFrameSize, &p[0], n, &amdfA[0], (int)(shiftSize/(float)sampleRate), sampleRate);
- }
- else{
- curr.f = 0;
- }
- p[n] = curr.f;
-
- for(i=0;i<PITCH_BUF_SIZE-1;i++){
- bufA[i] = bufA[i+1];
- }
- bufA[PITCH_BUF_SIZE-1] = curr;
- }
-
- int frameN=0;
- int blockSize = maxFrameSize;
- for(;n*shiftSize+blockSize < sampleN; n++){
- if(!CheckWinMessage()) break;
- ShowProgress((int)((n*shiftSize*100.0)/sampleN));
-
- curr.v=vusA[n];
- if(curr.v == FRM_VOICED){
- for(i=0;i<blockSize;i++) frameA[i]=sample[n*shiftSize+i];
- m_window.Windowing(&frameA[0],blockSize,WIN_HANNING);
- curr.f = pitch_find(&frameA[0], blockSize, &p[0], n, &amdfA[0], (int)(shiftSize/(float)sampleRate), sampleRate);
- if(curr.f == 0){
- //printf("Correcting pitch to zeron");
- curr.v = FRM_UNVOICED;
- }
- }
- else{
- curr.f = 0;
- }
-
- p[n] = curr.f;
-
- for(i=0;i<PITCH_BUF_SIZE-1;i++){
- bufA[i] = bufA[i+1];
- }
- bufA[PITCH_BUF_SIZE-1] = curr;
-
- int sum;
- for(i=1,sum=0;i<PITCH_BUF_SIZE-1;i++) sum+=((bufA[i].v == FRM_VOICED)?1:0);
- if(sum>=PITCH_BUF_SIZE-3 && (bufA[nCurrPos].v == FRM_UNVOICED || bufA[nCurrPos].v == FRM_SILENCE)) {
- if(bufA[nCurrPos].v == FRM_UNVOICED){
- //printf("Correcting FRM_UNVOICED->FRM_VOICEDn");
- }
- else{
- //printf("Correcting FRM_SILENCE->FRM_VOICEDn");
- }
- bufA[nCurrPos].v = FRM_VOICED;
- bufA[nCurrPos].f = (bufA[nCurrPos-1].f+bufA[nCurrPos+1].f)/2;
- p[n-(PITCH_BUF_SIZE-1-nCurrPos)] = bufA[nCurrPos].f;
- }
-
- for(i=0,sum=0;i<PITCH_BUF_SIZE;i++) sum+=((bufA[i].v == FRM_VOICED)?1:0);
- if(sum==PITCH_BUF_SIZE){
- int peak[PITCH_BUF_SIZE];
- int lmin = -1;
- int lmax = PITCH_BUF_SIZE;
- for(i=1;i<PITCH_BUF_SIZE;i++){
- float delta = (float)fabs(bufA[i].f - bufA[i-1].f);
- if(delta>maxDeltaPitch){
- peak[i] = 1;
- if(lmin < 0) lmin = i;
- lmax = i;
- }
- else{
- peak[i] = 0;
- }
- }
-
- if(lmax-lmin <= 2 && lmin >= 3 && lmax < PITCH_BUF_SIZE){
- //printf("Correcting pitchn");
- for(i=lmin; i<lmax;i++){
- bufA[i].f = (bufA[lmin-1].f+bufA[lmax].f)/2;
- p[n-(PITCH_BUF_SIZE-1-i)] = bufA[i].f;
- }
- }
- }
-
- for(i=1,sum=0;i<PITCH_BUF_SIZE-1;i++) sum+=((bufA[i].v == FRM_UNVOICED || bufA[i].v == FRM_SILENCE)?1:0);
- if(sum>=PITCH_BUF_SIZE-3 && bufA[nCurrPos].v == FRM_VOICED) {
- //printf("Correcting FRM_VOICED->FRM_UNVOICEDn");
- bufA[nCurrPos].v = FRM_UNVOICED;
- bufA[nCurrPos].f = 0;
- p[n-(PITCH_BUF_SIZE-1-nCurrPos)] = bufA[nCurrPos].f;
-
- //printf("Correcting Pitch historyn");
- for(i=nCurrPos+1;i<PITCH_BUF_SIZE;i++){
- curr.v=vusA[n];
- if(curr.v == FRM_VOICED){
- int k;
- for(k=0;k<blockSize;k++) frameA[k]=sample[n*shiftSize+(i-3)*shiftSize+k];
- m_window.Windowing(&frameA[0],blockSize,WIN_HANNING);
- curr.f = pitch_find(&frameA[0], blockSize, &p[0], n-(PITCH_BUF_SIZE-1-i), &amdfA[0], (int)(shiftSize/(float)sampleRate), sampleRate);
- if(curr.f == 0){
- //printf("Correcting pitch to zeron");
- curr.v = FRM_UNVOICED;
- }
- }
- else{
- curr.f = 0;
- }
- p[n-(PITCH_BUF_SIZE-1-i)] = curr.f;
- bufA[i] = curr;
- }
- }
-
- if(n < num_frames){
- pitchA[frameN] = bufA[nCurrPos].f;
- frameN++;
- }
- /* use variable block size */
- if(bufA[nCurrPos].f > 0)
- blockSize = my_max(minFrameSize, my_min(maxFrameSize,(int)(3*sampleRate/(bufA[nCurrPos].f/3)) ));
- else
- blockSize = maxFrameSize;
- }
-
- #ifdef _DEBUG
- for(k=0;k<my_min(TEST_FRAME_LEN,vusA.size());k++){
- vusTmpA[k]=vusA[k];
- }
- #endif
- vector<EVusType> vusOrgA(num_frames+1);
- for(i=0;i<=num_frames;i++) vusOrgA[i] = FRM_SILENCE;
- frameN = 0;
- for(n=0;n*shiftSize+blockSize < sampleN; n++){
- EVusType v=vusA[n];
- if(n < num_frames){
- if(v == FRM_VOICED && pitchA[n]==0)
- vusOrgA[frameN] = FRM_UNVOICED;
- else
- vusOrgA[frameN] = v;
- frameN++;
- }
- vusA[n] = vusOrgA[n];
- }
- #ifdef _DEBUG
- for(k=0;k<my_min(TEST_FRAME_LEN,vusA.size());k++){
- vusTmpA[k]=vusA[k];
- }
- #endif
- /* correct isolated pitch */
- for(i=0;i<frameN;i++){
- if(vusA[i]==FRM_VOICED && pitchA[i]==0) vusA[i]=FRM_UNVOICED;
- }
- vus_remove_short_segments(vusA); /* remove short segments */
- for(i=0;i<frameN;i++){
- if(vusA[i]==FRM_UNVOICED || vusA[i]==FRM_SILENCE) pitchA[i]=0;
- }
- /* we have to shift a few frames because a longer block size is used for pitch estimation */
- int feFrameSize=GetFrameSize();
- int corrFrameN=(int)(0.5*(maxFrameSize-feFrameSize)/(float)feFrameSize);
- for(i=frameN-1;i>=corrFrameN;i--){
- pitchA[i]=pitchA[i-corrFrameN];
- }
- /* spike removal */
- pitch_remove_spike(pitchA);
- /* low-pass filter, x=filter([1 2 1]/4, [1], x); */
- pitch_linear_filter(pitchA);
- return frameN;
- }
- bool Fe::pitch_amdf(float *sample, float *amdfA, int blockSize, int pmin, int pmax)
- {
- int s,n;
- for(s=pmin;s<=pmax;s++){
- amdfA[s] = 0;
- for(n=0;n<blockSize;n++){
- if(n+s < blockSize){
- amdfA[s] += my_abs(sample[n]-sample[n+s]);
- }
- else{
- amdfA[s] += my_abs(sample[n]);
- }
- }
- }
- return true;
- }
- float Fe::pitch_find(float *sample, int blockSize, float *pitchA, int t, float *amdfA, int shiftSize, int sampleRate)
- {
- int pmin = sampleRate/MAX_PITCH_FREQ;
- int pmax = sampleRate/MIN_PITCH_FREQ;
- int i;
-
- float lastPitch = (float)0;
- int lastTime = 0;
- for(i=0;i<t;i++){
- if(pitchA[i]>0) {
- lastPitch = pitchA[i];
- lastTime = i;
- }
- }
- int startSegTime=t;
- for(i=t-1;i>=0;i--){
- if(pitchA[i]==0) {
- startSegTime=i+1;
- break;
- }
- }
-
- if(t>=167 && t<180){
- int aaa=1;
- aaa++;
- }
- float lowF = (float)MIN_PITCH_FREQ;
- float highF = (float)MAX_PITCH_FREQ;
- float maxDeltaPitch = (MAX_PITCH_CHANGE*MAX_PITCH_FREQ); // 40Hz for 10ms
- if(lastPitch > 0){
- if(t >= 3 && lastTime == t-1 && (pitchA[t-2] == (float)0 || pitchA[t-3] == (float)0)){
- // Start of voice
- maxDeltaPitch = (2*MAX_PITCH_CHANGE*lastPitch);
- }
- else{
- // Middle of voice
- maxDeltaPitch = (MAX_PITCH_CHANGE*lastPitch);
- }
- }
-
- float deltaPitch = maxDeltaPitch;
- if(lastPitch > 0){
- int dur = (t-lastTime);
- deltaPitch = dur*maxDeltaPitch;
- lowF = my_max(MIN_PITCH_FREQ,lastPitch - deltaPitch);
- highF = my_min(MAX_PITCH_FREQ,lastPitch + deltaPitch);
- pmin = (int)(sampleRate/highF);
- pmax = (int)(sampleRate/lowF);
- }
- int smin;
- int smax;
-
- pitch_amdf(sample, amdfA, blockSize, pmin, pmax);
- pitch_find_minmax(amdfA, pmin, pmax, &smin, &smax);
- float avgAmdf=0;
- for(i=pmin;i<=pmax;i++) avgAmdf += amdfA[i];
- avgAmdf /= (pmax-pmin+1);
- float amdf=amdfA[smin];
- if(t>=167 && t<180){
- int aaa=1;
- aaa++;
- }
- float currPitch;
- bool reliable=false;
- if(pmin < smin && smin < pmax && amdf<avgAmdf*F0_SHARP_TH_1){
- currPitch = 1/(float)smin * sampleRate;
- if(amdf<avgAmdf*0.9) reliable=true;
- }
- else{
- // try once more with increased range
- if(smin == pmax){
- lowF = my_max(MIN_PITCH_FREQ,lowF-deltaPitch);
- }
- if(smin == pmin){
- highF = my_min(MAX_PITCH_FREQ,highF+deltaPitch);
- }
- pmin = (int)(sampleRate/highF);
- pmax = (int)(sampleRate/lowF);
-
- pitch_amdf(sample, amdfA, blockSize, pmin, pmax);
- pitch_find_minmax(amdfA, pmin, pmax, &smin, &smax);
-
- avgAmdf=0;
- for(i=pmin;i<=pmax;i++) avgAmdf += amdfA[i];
- avgAmdf /= (pmax-pmin+1);
- amdf=amdfA[smin];
- if((pmin < smin && smin < pmax) && amdf<avgAmdf*F0_SHARP_TH_2){
- currPitch = 1/(float)smin * sampleRate;
- }
- else{
- currPitch = (float)0;
- }
- if(amdf<avgAmdf*0.5) reliable=true;
- }
- int gender=0;
- if(m_pitchFrameN>20){
- if(m_meanPitch>200)
- gender=1;
- else if(m_meanPitch<150)
- gender=-1;
- }
- /* Correct F0 doubling/halving error */
- float orgMag=pitch_fft_one_freq(sample,blockSize,smin);
- bool checkDoubling = ((currPitch>F0_DOUBLE_FREQ_TH) || (currPitch>1.6*m_meanPitch));
- bool checkHalving = ((currPitch>0) && (currPitch<F0_HALVE_FREQ_TH || currPitch<0.6*m_meanPitch));
- bool checkGender = ((gender==1 && currPitch<150) || (gender== -1 && currPitch>200));
- float f0_double_th = (float)F0_DOUBLE_TH;
- float f0_halve_th = (float)F0_HALVE_TH;
- if(checkGender != 0){
- if(checkDoubling){
- f0_double_th *= 1;
- f0_halve_th *= (float)0.2;
- }
- else if(checkHalving){
- f0_double_th *= (float)0.5;
- f0_halve_th *= 1;
- }
- }
- if(t>=252){
- int aaa=1;
- aaa++;
- }
- /* Apply correction when the estimated F0 is very different at the start of segment */
- /* and magnitude of the frequency is large enough */
- if(t-startSegTime <= 5 && orgMag > 0.5*blockSize && (checkDoubling || checkHalving || checkGender)){
- float hypoDouble, hypoHalve;
- hypoDouble=pitch_fft_one_freq(sample,blockSize,smin/2);
- hypoHalve=pitch_fft_one_freq(sample,blockSize,smin*2);
- if(checkGender==0 && orgMag>hypoDouble*F0_NO_DOUBLE_TH && orgMag>hypoHalve*F0_NO_HALVE_TH){
- ;
- }
- else if(checkDoubling){
- if(orgMag > f0_double_th*hypoDouble && hypoHalve > f0_halve_th*orgMag)
- currPitch /= 2;
- }
- else if(checkHalving){
- if(orgMag < f0_double_th*hypoDouble)
- currPitch *= 2;
- }
- }
-
- /* update the mean pitch frequency */
- if(reliable){
- float lambda;
- if(m_pitchFrameN<10){
- lambda = 1-1/(float)(m_pitchFrameN+1);
- }
- else{
- lambda = (float)lambdaPitchTrack;
- }
- m_meanPitch = lambda*m_meanPitch + (1-lambda)*currPitch;
- m_pitchFrameN++;
- }
- return currPitch;
- }
- float Fe::pitch_fft_one_freq(float *sample, int blockSize, int p)
- {
- int i;
- float xre=0, xim=0;
- for(i=0;i<blockSize;i++){
- xre += sample[i]*cos(2*M_PI*i/(float)p);
- xim += sample[i]*sin(2*M_PI*i/(float)p);
- }
- return (float)sqrt(xre*xre + xim*xim);
- }
- bool Fe::pitch_find_minmax(float *amdfA, int pmin, int pmax, int* smin, int* smax)
- {
- int s;
- *smin = pmin;
- *smax = pmin;
- float amin = amdfA[pmin];
- float amax = amdfA[pmin];
- for(s=pmin;s<=pmax;s++){
- if(amdfA[s] < amin){
- amin = amdfA[s];
- *smin = s;
- }
- else if(amdfA[s] > amax){
- amax = amdfA[s];
- *smax = s;
- }
- }
- return true;
- }
- bool Fe::pitch_low_pass_filter(short *sample, int sampleN, int lpfLen)
- {
- int i,n,sum,kk;
- vector<int> tmp(sampleN);
- for(n=0;n<lpfLen/2;n++){
- sum=0;
- kk=0;
- for(i=-lpfLen/2;i<=lpfLen/2;i++){
- sum += (((n+i) >= 0) ? sample[n+i] : 0);
- kk += (((n+i) >= 0) ? 1 : 0);
- }
- tmp[n] = sum/kk;
- }
-
- for(n=lpfLen/2;n<sampleN-lpfLen/2;n++){
- sum = 0;
- for(i=-lpfLen/2;i<=lpfLen/2;i++) sum += sample[n+i];
- tmp[n] = sum/lpfLen;
- }
-
- for(n=sampleN-lpfLen/2;n<sampleN;n++){
- sum=0;
- kk=0;
- for(i=-lpfLen/2;i<=lpfLen/2;i++){
- sum += (((n+i) < sampleN) ? sample[n+i] : 0);
- kk += (((n+i) >= 0) ? 1 : 0);
- }
- tmp[n] = sum/kk;
- }
-
- /* clipping to 16 bit integer */
- for(n=0;n<sampleN;n++){
- sample[n] = my_min(my_max(SHRT_MIN, tmp[n]), SHRT_MAX);
- }
-
- return true;
- }
- int Fe::pitch_remove_spike(vector<float>& pitchA)
- {
- int i;
- float maxChange=(float)MAX_PITCH_CHANGE_POSTPROC;
- int frameN=pitchA.size();
- for(i=2;i<frameN-2;i++){
- if(pitchA[i-1]==0 && pitchA[i+1]==0){
- pitchA[i]=0;
- }
- else if(pitchA[i-1]>0 && pitchA[i+1]>0){
- float ftmp=GetMedian(&pitchA[i-2],5);
- if(fabs(ftmp-pitchA[i])>maxChange*pitchA[i]){
- pitchA[i]=(pitchA[i-1]+pitchA[i+1])/2;
- }
- }
- else if(i>=5 && pitchA[i]>0 && pitchA[i-1]>0 && pitchA[i-2]>0 && pitchA[i+1]==0){
- float ftmp=GetMedian(&pitchA[i-5],5);
- if(fabs(ftmp-pitchA[i])>maxChange*pitchA[i]){
- pitchA[i]=my_max(-pitchA[i-2]+my_min(2*pitchA[i-1],MAX_PITCH_FREQ), 0);
- }
- }
- else if(i<frameN-5 && pitchA[i]>0 && pitchA[i+1]>0 && pitchA[i+2]>0 && pitchA[i-1]==0){
- float ftmp=GetMedian(&pitchA[i],5);
- if(fabs(ftmp-pitchA[i])>maxChange*pitchA[i]){
- pitchA[i]=my_max(my_min(2*pitchA[i+1],MAX_PITCH_FREQ)-pitchA[i+2], 0);
- }
- }
- }
- return frameN;
- }
- int Fe::pitch_linear_filter(vector<float>& pitchA)
- {
- int i;
- int frameN=pitchA.size();
- vector<float> x;
- x.resize(frameN);
- for(i=0;i<frameN;i++) x[i]=pitchA[i];
-
- for(i=2;i<frameN-2;i++){
- if(x[i-2]>0 && x[i-1]>0 && x[i+1]>0 && x[i+2]>0){
- pitchA[i]=(x[i-1]+2*x[i]+x[i+1])/4;
- }
- }
- return frameN;
- }