FE_vus.cpp
Uploaded by: italyroyal
Upload date: 2013-05-06
Package size: 473k
File size: 7k
Category: Speech synthesis and recognition
Development platform: Visual C++

///////////////////////////////////////////////////////////////////////////////
// This is a part of the Feature program.
// Version: 1.0
// Date: February 22, 2003
// Programmer: Oh-Wook Kwon
// Copyright(c) 2003 Oh-Wook Kwon. All rights reserved. owkwon@ucsd.edu
///////////////////////////////////////////////////////////////////////////////
#include "StdAfx.h"
#include "FE_feature.h"

static const float EPD_NB_FRAME_THRESHOLD_LTE=(float)10;
static const float EPD_LAMBDA_LTE=(float)0.97;
static const float EPD_LAMBDA_ZCR=(float)0.97; // old=0.99
static const float EPD_SNR_THRESHOLD_UPD_LTE=(float)4;
static const float EPD_ZCR_THRESHOLD_UPD_LTE=(float)10;
static const float EPD_NOISE_ENERGY_FLOOR=(float)14.5;
static const float EPD_SPEECH_ENERGY_FLOOR=(float)37.5;
static const float lambdaLTEhigherE=(float)0.99;
static const int EPD_MIN_FRAME=10;
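
/*
 * vus_basic(): labels each analysis frame of `sample` as voiced (FRM_VOICED),
 * unvoiced (FRM_UNVOICED), or silence (FRM_SILENCE) from smoothed log-energy
 * and zero-crossing-rate statistics, writes the labels to vusA, and returns
 * the number of frames.
 */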
int Fe::vus_basic(short *sample, int sampleN, int frameSize, vector<EVusType>& vusA)
{
    int n,i;
    vector<float> energyA;
    vector<float> zcrA;
    vector<float> noiseEnA;
    vector<float> meanZcrA;
    float noiseEn=0;
    float meanZcr=0;
    int zcrFrameN=0;
#ifdef _DEBUG
#define TEST_FRAME_LEN 300
    float tmp1A[TEST_FRAME_LEN];
    float tmp2A[TEST_FRAME_LEN];
    EVusType vusTmpA[TEST_FRAME_LEN];
    float meanEnergyTmpA[TEST_FRAME_LEN];
    float meanZcrTmpA[TEST_FRAME_LEN];
#endif
    int shiftSize=GetShiftSize();
    int frameN=(int)((sampleN-(frameSize-shiftSize))/shiftSize);
    energyA.resize(frameN);
    zcrA.resize(frameN);
    noiseEnA.resize(frameN);
    meanZcrA.resize(frameN);
    vusA.resize(frameN);
    float VUS_ENERGY_TH_VOICED=30;
    float VUS_ENERGY_TH_UNVOICED=60;
    float VUS_ENERGY_TH_START=20;
    float VUS_ENERGY_TH_INSPEECH=15; // old=6
    int VUS_ZCR_TH_MAX=50;
    int VUS_ZCR_TH_MIN=5;
    int VUS_VOICE_ZCR_TH_MIN=0;
    int VUS_ZCR_UPD_TH=50;
    vector<float> x(frameSize);
    float vus_energy_th=VUS_ENERGY_TH_START;
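    /* Per-frame loop. lambdaLTE and lambdaZcr below are forgetting factors for
       the long-term noise-energy and mean-ZCR trackers; for the first
       EPD_NB_FRAME_THRESHOLD_LTE frames they are set so the trackers behave as
       plain running averages before switching to exponential smoothing. */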
    for(n=0;n<frameN;n++){
        float lambdaLTE, lambdaZcr;
        if(n<EPD_NB_FRAME_THRESHOLD_LTE)
            lambdaLTE=1-1/(float)(n+1);
        else
            lambdaLTE=EPD_LAMBDA_LTE;
        if(zcrFrameN<EPD_NB_FRAME_THRESHOLD_LTE)
            lambdaZcr=1-1/(float)(zcrFrameN+1);
        else
            lambdaZcr=EPD_LAMBDA_ZCR;
        int begX=n*shiftSize;
        {
            /* DC offset removal, H(z)=(1-z^(-1))/(1-a*z^(-1)) */
            float a=1-1/(float)1024;
            x[0]=sample[begX];
            for(i=1;i<frameSize;i++){
                x[i]=(sample[begX+i]-sample[begX+i-1]+a*x[i-1]);
            }
            /* low-pass filter, x=filter([1 2 1]/4, [1], x); */
            vector<float> xorg=x;
            x[0]=(xorg[0]+2*xorg[0]+xorg[1])/4;
            for(i=1;i<frameSize-1;i++){
                x[i]=(xorg[i-1]+2*xorg[i]+xorg[i+1])/4;
            }
            x[frameSize-1]=(xorg[frameSize-2]+2*xorg[frameSize-1]+xorg[frameSize-1])/4;
        }
        /* Normalize energy to frame length */
        float sum=0;
        for(i=0;i<frameSize;i++){
            sum += (x[i]*x[i]);
        }
        energyA[n] = (0.5+10*LOG10(1+sum/frameSize)); /* dB scale */
        if(energyA[n]<EPD_NOISE_ENERGY_FLOOR) energyA[n]=EPD_NOISE_ENERGY_FLOOR;
        /* Estimate noise */
        float noiseLevel=0;
        if((energyA[n]-noiseEn)<EPD_SNR_THRESHOLD_UPD_LTE || n<EPD_MIN_FRAME || (energyA[n] < EPD_SPEECH_ENERGY_FLOOR)){
            if((energyA[n]<noiseEn) || (n<EPD_MIN_FRAME) || (energyA[n] < EPD_SPEECH_ENERGY_FLOOR)){
                noiseEn=noiseEn+(1-lambdaLTE)*(energyA[n]-noiseEn);
            }
            else{
                noiseEn=noiseEn+(1-lambdaLTEhigherE)*(energyA[n]-noiseEn);
            }
            /* noise level should be computed from the original noise energy */
            noiseLevel=(2*sqrt(exp(log(10)/10*(noiseEn-0.5))-1));
            if(noiseEn<EPD_NOISE_ENERGY_FLOOR) noiseEn=EPD_NOISE_ENERGY_FLOOR;
        }
        noiseEnA[n]=noiseEn;
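        /* The zero-crossing count below is taken about noiseLevel (non-zero
           only when the noise estimate was just updated) rather than about
           zero, so the noise floor does not inflate the ZCR of low-energy
           frames. */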
        /* compute zero crossing rate (changed) */
        float prev = x[0]-noiseLevel;
        int zcr=0;
        for(i=1;i<frameSize;i++){
            float val  = x[i]-noiseLevel;
            float ztmp=val*prev;
            if(ztmp<0) zcr = zcr+1;
            prev=val;
        }
        zcrA[n]=zcr;
        if(zcr>0 || zcrFrameN>0){
            zcrFrameN++;
        }
        if(zcrA[n]<VUS_ZCR_UPD_TH && ((zcrA[n]-meanZcr)<EPD_ZCR_THRESHOLD_UPD_LTE || zcrFrameN<EPD_MIN_FRAME)){
            meanZcr=meanZcr+(1-lambdaZcr)*(zcrA[n]-meanZcr);
        }
        meanZcrA[n]=meanZcr;
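        /* Frame decision: deltaEnergy is the frame energy above the noise
           estimate and deltaZcr is the ZCR above its running mean. Frames with
           high ZCR are labeled unvoiced, frames with large excess energy are
           labeled voiced, and the rest are silence. vus_energy_th drops to
           VUS_ENERGY_TH_INSPEECH after a voiced frame and resets to
           VUS_ENERGY_TH_START on silence or unvoiced frames. */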
        float deltaEnergy=energyA[n]-noiseEn;
        float deltaZcr=zcrA[n]-meanZcr;
        if(deltaEnergy<VUS_ENERGY_TH_UNVOICED && deltaEnergy<vus_energy_th && deltaZcr<VUS_VOICE_ZCR_TH_MIN){
            vusA[n]=FRM_SILENCE;
            vus_energy_th=VUS_ENERGY_TH_START;
        }
        else if(deltaZcr>VUS_ZCR_TH_MAX){
            vus_energy_th=VUS_ENERGY_TH_START;
            vusA[n]=FRM_UNVOICED;
        }
        else if(deltaZcr>VUS_ZCR_TH_MIN && deltaEnergy<vus_energy_th){
            vus_energy_th=VUS_ENERGY_TH_START;
            vusA[n]=FRM_UNVOICED;
        }
        else if(zcr>VUS_ZCR_TH_MIN && deltaEnergy<vus_energy_th){
            vus_energy_th=VUS_ENERGY_TH_START;
            vusA[n]=FRM_UNVOICED;
        }
        else if(deltaEnergy>VUS_ENERGY_TH_VOICED){
            vus_energy_th=VUS_ENERGY_TH_INSPEECH;
            vusA[n]=FRM_VOICED;
        }
        else if(deltaEnergy>vus_energy_th && (deltaZcr>VUS_VOICE_ZCR_TH_MIN || zcr>VUS_ZCR_TH_MIN)){
            vus_energy_th=VUS_ENERGY_TH_INSPEECH;
            vusA[n]=FRM_VOICED;
        }
        else{
            vusA[n]=FRM_SILENCE;
            vus_energy_th=VUS_ENERGY_TH_START;
        }
        /* TRACE("%d  %f %f %f %f %d\n",n,energyA[n],zcrA[n],deltaEnergy,deltaZcr,(int)vusA[n]); */
    }
#ifdef _DEBUG
    int k;
    for(k=0;k<my_min(TEST_FRAME_LEN,vusA.size());k++){
        tmp1A[k]=energyA[k];
        tmp2A[k]=zcrA[k];
        meanEnergyTmpA[k]=noiseEnA[k];
        meanZcrTmpA[k]=meanZcrA[k];
        vusTmpA[k]=vusA[k];
    }
#endif
    /* median filtering */
    vus_median_filter(vusA);
#ifdef _DEBUG
    for(k=0;k<my_min(TEST_FRAME_LEN,vusA.size());k++){
        vusTmpA[k]=vusA[k];
    }
#endif
    /* remove short segments */
    vus_remove_short_segments(vusA);
    return frameN;
}
int Fe::vus_median_filter(vector<EVusType>& vusA)
{
    int n,i,k;
    int frameN=vusA.size();
    /* two-stage median filtering, length 5 */
    vector<EVusType> vusOrgA=vusA;
    float tmp[5];
    for(n=2;n<frameN-2;n++){
        for(i=n-2,k=0;i<=n+2;i++,k++) tmp[k]=(float)(vusOrgA[i]);
        vusA[n]=(EVusType)((int)(GetMedian(tmp,5)+0.5));
    }
    vusOrgA=vusA;
    for(n=2;n<frameN-2;n++){
        for(i=n-2,k=0;i<=n+2;i++,k++) tmp[k]=(float)(vusOrgA[i]);
        vusA[n]=(EVusType)((int)(GetMedian(tmp,5)+0.5));
    }
    return frameN;
}
int Fe::vus_remove_short_segments(vector<EVusType>& vusA)
{
    /* remove short voiced segments */
    vus_remove_short_segments_sub(vusA, FRM_VOICED, 5); // old=7
    /* remove short silent segments */
    vus_remove_short_segments_sub(vusA, FRM_SILENCE, 2); // old=2
    /* remove short unvoiced segments */
    vus_remove_short_segments_sub(vusA, FRM_UNVOICED, 3); // old=4
    return 1;
}
/* remove short segments with duration less than or equal to minDur */
int Fe::vus_remove_short_segments_sub(vector<EVusType>& vusA, EVusType type, int minDur)
{
    int startX=1;
    int n,i,k=0;
    EVusType prevType=vusA[0];
    for(n=1;n<vusA.size();n++){
        if(vusA[n] == type && vusA[n-1] != type){
            startX=n;
        }
        else if(vusA[n] != type && vusA[n-1] == type){
            if(n-startX<=minDur){
                for(i=startX;i<n;i++) vusA[i]=vusA[startX-1];
                k++;
            }
            startX=n;
        }
    }
    if(n-startX<=minDur && vusA[n-1]==type){
        for(i=startX;i<n;i++) vusA[i]=vusA[startX-1];
        k++;
    }
    return k;
}
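
For reference, a minimal calling sketch. Only the vus_basic signature and the EVusType labels come from this file; the wrapper function name and the assumption that FE_feature.h brings in vector, EVusType, and a usable Fe object are hypothetical and depend on the rest of the Feature program.

#include "FE_feature.h"

/* Hypothetical helper: counts voiced frames in a PCM buffer using vus_basic. */
int count_voiced_frames_example(Fe& fe, short* pcm, int pcmLen, int frameSize)
{
    vector<EVusType> labels;
    int frameN = fe.vus_basic(pcm, pcmLen, frameSize, labels); /* one label per analysis frame */
    int voiced = 0;
    for (int n = 0; n < frameN; n++) {
        if (labels[n] == FRM_VOICED) voiced++; /* labels are FRM_SILENCE/FRM_UNVOICED/FRM_VOICED */
    }
    return voiced;
}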