FE_pitch.cpp
上传用户:italyroyal
上传日期:2013-05-06
资源大小:473k
文件大小:14k
源码类别:

语音合成与识别

开发平台:

Visual C++

  1. ///////////////////////////////////////////////////////////////////////////////
  2. // This is a part of the Feature program.
  3. // Version: 1.0
  4. // Date: February 22, 2003
  5. // Programmer: Oh-Wook Kwon
  6. // Copyright(c) 2003 Oh-Wook Kwon. All rights reserved. owkwon@ucsd.edu
  7. ///////////////////////////////////////////////////////////////////////////////
  8. #include "StdAfx.h"
  9. #include "FE_feature.h"
  10. struct FePitchSeg /* Pitch detection parameters */
  11. {
  12. EVusType v;
  13. float f;
  14. };
  15. int Fe::pitch_basic(short *sample, int sampleN, int sampleRate, int shiftSize, vector<EVusType>& vusA, vector<float>& pitchA)
  16. {
  17. int i,n;
  18. int maxFrameSize = 2*sampleRate/MIN_PITCH_FREQ; /* 40 ms */
  19. int minFrameSize = 3*sampleRate/MAX_PITCH_FREQ; /* 12 ms */
  20. int num_frames = sampleN/shiftSize;
  21. int lpfLen = (sampleRate/(2*MAX_PITCH_FREQ)); if(lpfLen%2 == 0) lpfLen += 1;
  22. float maxDeltaPitch = (MAX_PITCH_CHANGE*MAX_PITCH_FREQ); /* 50 Hz for 10 ms */
  23. #ifdef _DEBUG
  24. #define TEST_FRAME_LEN 300
  25. EVusType vusTmpA[TEST_FRAME_LEN];
  26. int k;
  27. #endif
  28. pitchA.resize(num_frames+1);
  29. for(i=0;i<=num_frames;i++) pitchA[i]=0;
  30. if(!(sampleN > 0 && sampleRate > 0)) return false;
  31. if(sampleN<maxFrameSize) return false;
  32. vector<float> amdfA(maxFrameSize);
  33. vector<float> frameA(maxFrameSize);
  34. vector<float> p(num_frames+1+PITCH_BUF_SIZE/2);
  35. FePitchSeg bufA[PITCH_BUF_SIZE];
  36. FePitchSeg curr;
  37. if(vusA.size()==0) vus_basic(sample, sampleN, GetFrameSize(), vusA);
  38. pitch_low_pass_filter(sample, sampleN, lpfLen);
  39. for(i=0;i<PITCH_BUF_SIZE;i++) bufA[i].v = FRM_SILENCE;
  40. int nCurrPos = (PITCH_BUF_SIZE-1)/2;
  41. int PITCH_BUF_SIZE2 = (PITCH_BUF_SIZE-1)/2;
  42. m_meanPitch = 0;
  43. m_pitchFrameN = 0;
  44. for(n=0;n < PITCH_BUF_SIZE/2; n++){
  45. curr.v=vusA[n];
  46. if(curr.v == FRM_VOICED){
  47. for(i=0;i<maxFrameSize;i++) frameA[i]=sample[n*shiftSize+i];
  48. m_window.Windowing(&frameA[0],maxFrameSize,WIN_HANNING);
  49. curr.f = pitch_find(&frameA[0], maxFrameSize, &p[0], n, &amdfA[0], (int)(shiftSize/(float)sampleRate), sampleRate);
  50. }
  51. else{
  52. curr.f = 0;
  53. }
  54. p[n] = curr.f;
  55. for(i=0;i<PITCH_BUF_SIZE-1;i++){
  56. bufA[i] = bufA[i+1];
  57. }
  58. bufA[PITCH_BUF_SIZE-1] = curr;
  59. }
  60. int frameN=0;
  61. int blockSize = maxFrameSize;
  62. for(;n*shiftSize+blockSize < sampleN; n++){
  63. if(!CheckWinMessage()) break;
  64. ShowProgress((int)((n*shiftSize*100.0)/sampleN));
  65. curr.v=vusA[n];
  66. if(curr.v == FRM_VOICED){
  67. for(i=0;i<blockSize;i++) frameA[i]=sample[n*shiftSize+i];
  68. m_window.Windowing(&frameA[0],blockSize,WIN_HANNING);
  69. curr.f = pitch_find(&frameA[0], blockSize, &p[0], n, &amdfA[0], (int)(shiftSize/(float)sampleRate), sampleRate);
  70. if(curr.f == 0){
  71. //printf("Correcting pitch to zeron");
  72. curr.v = FRM_UNVOICED;
  73. }
  74. }
  75. else{
  76. curr.f = 0;
  77. }
  78. p[n] = curr.f;
  79. for(i=0;i<PITCH_BUF_SIZE-1;i++){
  80. bufA[i] = bufA[i+1];
  81. }
  82. bufA[PITCH_BUF_SIZE-1] = curr;
  83. int sum;
  84. for(i=1,sum=0;i<PITCH_BUF_SIZE-1;i++) sum+=((bufA[i].v == FRM_VOICED)?1:0);
  85. if(sum>=PITCH_BUF_SIZE-3 && (bufA[nCurrPos].v == FRM_UNVOICED || bufA[nCurrPos].v == FRM_SILENCE)) {
  86. if(bufA[nCurrPos].v == FRM_UNVOICED){
  87. //printf("Correcting FRM_UNVOICED->FRM_VOICEDn");
  88. }
  89. else{
  90. //printf("Correcting FRM_SILENCE->FRM_VOICEDn");
  91. }
  92. bufA[nCurrPos].v = FRM_VOICED;
  93. bufA[nCurrPos].f = (bufA[nCurrPos-1].f+bufA[nCurrPos+1].f)/2;
  94. p[n-(PITCH_BUF_SIZE-1-nCurrPos)] = bufA[nCurrPos].f;
  95. }
  96. for(i=0,sum=0;i<PITCH_BUF_SIZE;i++) sum+=((bufA[i].v == FRM_VOICED)?1:0);
  97. if(sum==PITCH_BUF_SIZE){
  98. int peak[PITCH_BUF_SIZE];
  99. int lmin = -1;
  100. int lmax = PITCH_BUF_SIZE;
  101. for(i=1;i<PITCH_BUF_SIZE;i++){
  102. float delta = (float)fabs(bufA[i].f - bufA[i-1].f);
  103. if(delta>maxDeltaPitch){
  104. peak[i] = 1;
  105. if(lmin < 0) lmin = i;
  106. lmax = i;
  107. }
  108. else{
  109. peak[i] = 0;
  110. }
  111. }
  112. if(lmax-lmin <= 2 && lmin >= 3 && lmax < PITCH_BUF_SIZE){
  113. //printf("Correcting pitchn");
  114. for(i=lmin; i<lmax;i++){
  115. bufA[i].f = (bufA[lmin-1].f+bufA[lmax].f)/2;
  116. p[n-(PITCH_BUF_SIZE-1-i)] = bufA[i].f;
  117. }
  118. }
  119. }
  120. for(i=1,sum=0;i<PITCH_BUF_SIZE-1;i++) sum+=((bufA[i].v == FRM_UNVOICED || bufA[i].v == FRM_SILENCE)?1:0);
  121. if(sum>=PITCH_BUF_SIZE-3 && bufA[nCurrPos].v == FRM_VOICED) {
  122. //printf("Correcting FRM_VOICED->FRM_UNVOICEDn");
  123. bufA[nCurrPos].v = FRM_UNVOICED;
  124. bufA[nCurrPos].f = 0;
  125. p[n-(PITCH_BUF_SIZE-1-nCurrPos)] = bufA[nCurrPos].f;
  126. //printf("Correcting Pitch historyn");
  127. for(i=nCurrPos+1;i<PITCH_BUF_SIZE;i++){
  128. curr.v=vusA[n];
  129. if(curr.v == FRM_VOICED){
  130. int k;
  131. for(k=0;k<blockSize;k++) frameA[k]=sample[n*shiftSize+(i-3)*shiftSize+k];
  132. m_window.Windowing(&frameA[0],blockSize,WIN_HANNING);
  133. curr.f = pitch_find(&frameA[0], blockSize, &p[0], n-(PITCH_BUF_SIZE-1-i), &amdfA[0], (int)(shiftSize/(float)sampleRate), sampleRate);
  134. if(curr.f == 0){
  135. //printf("Correcting pitch to zeron");
  136. curr.v = FRM_UNVOICED;
  137. }
  138. }
  139. else{
  140. curr.f = 0;
  141. }
  142. p[n-(PITCH_BUF_SIZE-1-i)] = curr.f;
  143. bufA[i] = curr;
  144. }
  145. }
  146. if(n < num_frames){
  147. pitchA[frameN] = bufA[nCurrPos].f;
  148. frameN++;
  149. }
  150. /* use variable block size */
  151. if(bufA[nCurrPos].f > 0)
  152. blockSize = my_max(minFrameSize, my_min(maxFrameSize,(int)(3*sampleRate/(bufA[nCurrPos].f/3)) ));
  153. else
  154. blockSize = maxFrameSize;
  155. }
  156. #ifdef _DEBUG
  157. for(k=0;k<my_min(TEST_FRAME_LEN,vusA.size());k++){
  158. vusTmpA[k]=vusA[k];
  159. }
  160. #endif
  161. vector<EVusType> vusOrgA(num_frames+1);
  162. for(i=0;i<=num_frames;i++) vusOrgA[i] = FRM_SILENCE;
  163. frameN = 0;
  164. for(n=0;n*shiftSize+blockSize < sampleN; n++){
  165. EVusType v=vusA[n];
  166. if(n < num_frames){
  167. if(v == FRM_VOICED && pitchA[n]==0)
  168. vusOrgA[frameN] = FRM_UNVOICED;
  169. else
  170. vusOrgA[frameN] = v;
  171. frameN++;
  172. }
  173. vusA[n] = vusOrgA[n];
  174. }
  175. #ifdef _DEBUG
  176. for(k=0;k<my_min(TEST_FRAME_LEN,vusA.size());k++){
  177. vusTmpA[k]=vusA[k];
  178. }
  179. #endif
  180. /* correct isolated pitch */
  181. for(i=0;i<frameN;i++){
  182. if(vusA[i]==FRM_VOICED && pitchA[i]==0) vusA[i]=FRM_UNVOICED;
  183. }
  184. vus_remove_short_segments(vusA); /* remove short segments */
  185. for(i=0;i<frameN;i++){
  186. if(vusA[i]==FRM_UNVOICED || vusA[i]==FRM_SILENCE) pitchA[i]=0;
  187. }
  188. /* we have to shift a few frames because a longer block size is used for pitch estimation */
  189. int feFrameSize=GetFrameSize();
  190. int corrFrameN=(int)(0.5*(maxFrameSize-feFrameSize)/(float)feFrameSize);
  191. for(i=frameN-1;i>=corrFrameN;i--){
  192. pitchA[i]=pitchA[i-corrFrameN];
  193. }
  194. /* spike removal */
  195. pitch_remove_spike(pitchA);
  196. /* low-pass filter, x=filter([1 2 1]/4, [1], x); */
  197. pitch_linear_filter(pitchA);
  198. return frameN;
  199. }
  200. bool Fe::pitch_amdf(float *sample, float *amdfA, int blockSize, int pmin, int pmax)
  201. {
  202. int s,n;
  203. for(s=pmin;s<=pmax;s++){
  204. amdfA[s] = 0;
  205. for(n=0;n<blockSize;n++){
  206. if(n+s < blockSize){
  207. amdfA[s] += my_abs(sample[n]-sample[n+s]);
  208. }
  209. else{
  210. amdfA[s] += my_abs(sample[n]);
  211. }
  212. }
  213. }
  214. return true;
  215. }
  216. float Fe::pitch_find(float *sample, int blockSize, float *pitchA, int t, float *amdfA, int shiftSize, int sampleRate)
  217. {
  218. int pmin = sampleRate/MAX_PITCH_FREQ;
  219. int pmax = sampleRate/MIN_PITCH_FREQ;
  220. int i;
  221. float lastPitch = (float)0;
  222. int lastTime = 0;
  223. for(i=0;i<t;i++){
  224. if(pitchA[i]>0) {
  225. lastPitch = pitchA[i];
  226. lastTime = i;
  227. }
  228. }
  229. int startSegTime=t;
  230. for(i=t-1;i>=0;i--){
  231. if(pitchA[i]==0) {
  232. startSegTime=i+1;
  233. break;
  234. }
  235. }
  236. if(t>=167 && t<180){
  237. int aaa=1;
  238. aaa++;
  239. }
  240. float lowF = (float)MIN_PITCH_FREQ;
  241. float highF = (float)MAX_PITCH_FREQ;
  242. float maxDeltaPitch = (MAX_PITCH_CHANGE*MAX_PITCH_FREQ); // 40Hz for 10ms
  243. if(lastPitch > 0){
  244. if(t >= 3 && lastTime == t-1 && (pitchA[t-2] == (float)0 || pitchA[t-3] == (float)0)){
  245. // Start of voice
  246. maxDeltaPitch = (2*MAX_PITCH_CHANGE*lastPitch);
  247. }
  248. else{
  249. // Middle of voice
  250. maxDeltaPitch = (MAX_PITCH_CHANGE*lastPitch);
  251. }
  252. }
  253. float deltaPitch = maxDeltaPitch;
  254. if(lastPitch > 0){
  255. int dur = (t-lastTime);
  256. deltaPitch = dur*maxDeltaPitch;
  257. lowF = my_max(MIN_PITCH_FREQ,lastPitch - deltaPitch);
  258. highF = my_min(MAX_PITCH_FREQ,lastPitch + deltaPitch);
  259. pmin = (int)(sampleRate/highF);
  260. pmax = (int)(sampleRate/lowF);
  261. }
  262. int smin;
  263. int smax;
  264. pitch_amdf(sample, amdfA, blockSize, pmin, pmax);
  265. pitch_find_minmax(amdfA, pmin, pmax, &smin, &smax);
  266. float avgAmdf=0;
  267. for(i=pmin;i<=pmax;i++) avgAmdf += amdfA[i];
  268. avgAmdf /= (pmax-pmin+1);
  269. float amdf=amdfA[smin];
  270. if(t>=167 && t<180){
  271. int aaa=1;
  272. aaa++;
  273. }
  274. float currPitch;
  275. bool reliable=false;
  276. if(pmin < smin && smin < pmax && amdf<avgAmdf*F0_SHARP_TH_1){
  277. currPitch = 1/(float)smin * sampleRate;
  278. if(amdf<avgAmdf*0.9) reliable=true;
  279. }
  280. else{
  281. // try once more with increased range
  282. if(smin == pmax){
  283. lowF = my_max(MIN_PITCH_FREQ,lowF-deltaPitch);
  284. }
  285. if(smin == pmin){
  286. highF = my_min(MAX_PITCH_FREQ,highF+deltaPitch);
  287. }
  288. pmin = (int)(sampleRate/highF);
  289. pmax = (int)(sampleRate/lowF);
  290. pitch_amdf(sample, amdfA, blockSize, pmin, pmax);
  291. pitch_find_minmax(amdfA, pmin, pmax, &smin, &smax);
  292. avgAmdf=0;
  293. for(i=pmin;i<=pmax;i++) avgAmdf += amdfA[i];
  294. avgAmdf /= (pmax-pmin+1);
  295. amdf=amdfA[smin];
  296. if((pmin < smin && smin < pmax) && amdf<avgAmdf*F0_SHARP_TH_2){
  297. currPitch = 1/(float)smin * sampleRate;
  298. }
  299. else{
  300. currPitch = (float)0;
  301. }
  302. if(amdf<avgAmdf*0.5) reliable=true;
  303. }
  304. int gender=0;
  305. if(m_pitchFrameN>20){
  306. if(m_meanPitch>200)
  307. gender=1;
  308. else if(m_meanPitch<150)
  309. gender=-1;
  310. }
  311. /* Correct F0 doubling/halving error */
  312. float orgMag=pitch_fft_one_freq(sample,blockSize,smin);
  313. bool checkDoubling = ((currPitch>F0_DOUBLE_FREQ_TH) || (currPitch>1.6*m_meanPitch));
  314. bool checkHalving = ((currPitch>0) && (currPitch<F0_HALVE_FREQ_TH || currPitch<0.6*m_meanPitch));
  315. bool checkGender = ((gender==1 && currPitch<150) || (gender== -1 && currPitch>200));
  316. float f0_double_th = (float)F0_DOUBLE_TH;
  317. float f0_halve_th = (float)F0_HALVE_TH;
  318. if(checkGender != 0){
  319. if(checkDoubling){
  320. f0_double_th *= 1;
  321. f0_halve_th *= (float)0.2;
  322. }
  323. else if(checkHalving){
  324. f0_double_th *= (float)0.5;
  325. f0_halve_th *= 1;
  326. }
  327. }
  328. if(t>=252){
  329. int aaa=1;
  330. aaa++;
  331. }
  332. /* Apply correction when the estimated F0 is very different at the start of segment */
  333. /* and magnitude of the frequency is large enough  */
  334. if(t-startSegTime <= 5 && orgMag > 0.5*blockSize && (checkDoubling || checkHalving || checkGender)){ 
  335. float hypoDouble, hypoHalve;
  336. hypoDouble=pitch_fft_one_freq(sample,blockSize,smin/2);
  337. hypoHalve=pitch_fft_one_freq(sample,blockSize,smin*2);
  338. if(checkGender==0 && orgMag>hypoDouble*F0_NO_DOUBLE_TH && orgMag>hypoHalve*F0_NO_HALVE_TH){
  339. ;
  340. }
  341. else if(checkDoubling){
  342. if(orgMag > f0_double_th*hypoDouble && hypoHalve > f0_halve_th*orgMag)
  343. currPitch /= 2;
  344. }
  345. else if(checkHalving){
  346. if(orgMag < f0_double_th*hypoDouble)
  347. currPitch *= 2;
  348. }
  349. }
  350. /* update the mean pitch frequency */
  351. if(reliable){
  352. float lambda;
  353. if(m_pitchFrameN<10){
  354. lambda = 1-1/(float)(m_pitchFrameN+1);
  355. }
  356. else{
  357. lambda = (float)lambdaPitchTrack;
  358. }
  359. m_meanPitch = lambda*m_meanPitch + (1-lambda)*currPitch;
  360. m_pitchFrameN++;
  361. }
  362. return currPitch;
  363. }
  364. float Fe::pitch_fft_one_freq(float *sample, int blockSize, int p)
  365. {
  366. int i;
  367. float xre=0, xim=0;
  368. for(i=0;i<blockSize;i++){
  369. xre += sample[i]*cos(2*M_PI*i/(float)p);
  370. xim += sample[i]*sin(2*M_PI*i/(float)p);
  371. }
  372. return (float)sqrt(xre*xre + xim*xim);
  373. }
  374. bool Fe::pitch_find_minmax(float *amdfA, int pmin, int pmax, int* smin, int* smax)
  375. {
  376. int s;
  377. *smin = pmin;
  378. *smax = pmin;
  379. float amin = amdfA[pmin];
  380. float amax = amdfA[pmin];
  381. for(s=pmin;s<=pmax;s++){
  382. if(amdfA[s] < amin){
  383. amin = amdfA[s];
  384. *smin = s;
  385. }
  386. else if(amdfA[s] > amax){
  387. amax = amdfA[s];
  388. *smax = s;
  389. }
  390. }
  391. return true;
  392. }
  393. bool Fe::pitch_low_pass_filter(short *sample, int sampleN, int lpfLen)
  394. {
  395. int i,n,sum,kk;
  396. vector<int> tmp(sampleN);
  397. for(n=0;n<lpfLen/2;n++){
  398. sum=0;
  399. kk=0;
  400. for(i=-lpfLen/2;i<=lpfLen/2;i++){
  401. sum += (((n+i) >= 0) ? sample[n+i] : 0);
  402. kk += (((n+i) >= 0) ? 1 : 0);
  403. }
  404. tmp[n] = sum/kk;
  405. }
  406. for(n=lpfLen/2;n<sampleN-lpfLen/2;n++){
  407. sum = 0;
  408. for(i=-lpfLen/2;i<=lpfLen/2;i++) sum += sample[n+i];
  409. tmp[n] = sum/lpfLen;
  410. }
  411. for(n=sampleN-lpfLen/2;n<sampleN;n++){
  412. sum=0;
  413. kk=0;
  414. for(i=-lpfLen/2;i<=lpfLen/2;i++){
  415. sum += (((n+i) < sampleN) ? sample[n+i] : 0);
  416. kk += (((n+i) >= 0) ? 1 : 0);
  417. }
  418. tmp[n] = sum/kk;
  419. }
  420. /* clipping to 16 bit integer */
  421. for(n=0;n<sampleN;n++){
  422. sample[n] = my_min(my_max(SHRT_MIN, tmp[n]), SHRT_MAX);
  423. }
  424. return true;
  425. }
  426. int Fe::pitch_remove_spike(vector<float>& pitchA)
  427. {
  428. int i;
  429. float maxChange=(float)MAX_PITCH_CHANGE_POSTPROC;
  430. int frameN=pitchA.size();
  431. for(i=2;i<frameN-2;i++){
  432. if(pitchA[i-1]==0 && pitchA[i+1]==0){
  433. pitchA[i]=0;
  434. }
  435. else if(pitchA[i-1]>0 && pitchA[i+1]>0){
  436. float ftmp=GetMedian(&pitchA[i-2],5);
  437. if(fabs(ftmp-pitchA[i])>maxChange*pitchA[i]){
  438. pitchA[i]=(pitchA[i-1]+pitchA[i+1])/2;
  439. }
  440. }
  441. else if(i>=5 && pitchA[i]>0 && pitchA[i-1]>0 && pitchA[i-2]>0 && pitchA[i+1]==0){
  442. float ftmp=GetMedian(&pitchA[i-5],5);
  443. if(fabs(ftmp-pitchA[i])>maxChange*pitchA[i]){
  444. pitchA[i]=my_max(-pitchA[i-2]+my_min(2*pitchA[i-1],MAX_PITCH_FREQ), 0);
  445. }
  446. }
  447. else if(i<frameN-5 && pitchA[i]>0 && pitchA[i+1]>0 && pitchA[i+2]>0 && pitchA[i-1]==0){
  448. float ftmp=GetMedian(&pitchA[i],5);
  449. if(fabs(ftmp-pitchA[i])>maxChange*pitchA[i]){
  450. pitchA[i]=my_max(my_min(2*pitchA[i+1],MAX_PITCH_FREQ)-pitchA[i+2], 0);
  451. }
  452. }
  453. }
  454. return frameN;
  455. }
  456. int Fe::pitch_linear_filter(vector<float>& pitchA)
  457. {
  458. int i;
  459. int frameN=pitchA.size();
  460. vector<float> x;
  461. x.resize(frameN);
  462. for(i=0;i<frameN;i++) x[i]=pitchA[i];
  463. for(i=2;i<frameN-2;i++){
  464. if(x[i-2]>0 && x[i-1]>0 && x[i+1]>0 && x[i+2]>0){
  465. pitchA[i]=(x[i-1]+2*x[i]+x[i+1])/4;
  466. }
  467. }
  468. return frameN;
  469. }