WordSegment.cpp
上传用户:sanxfzhen
上传日期:2014-12-28
资源大小:2324k
文件大小:9k
源码类别:

多国语言处理

开发平台:

Visual C++

  1. // WordSegment.cpp: implementation of the CWordSegment class.
  2. //
  3. //////////////////////////////////////////////////////////////////////
  4. #include "stdafx.h"
  5. #include "WordSegment.h"
  6. #include ".\Utility\Utility.h"
  7. const unsigned short CWordSegment::uTag = 0x0001;  // word classify tag
  8. const unsigned short CWordSegment::uPlace = 0x0002; // Chinese place name
  9. const unsigned short CWordSegment::uPerson = 0x0004;  // chinese name
  10. const unsigned short CWordSegment::uTransPerson = 0x0008; // translation name, include translation
  11. char CWordSegment::m_pSentence[MAX_PATH*10];        //用来存放经过去掉空格回车等字母后的句子
  12. CWordSegment g_wordSeg;
  13. //////////////////////////////////////////////////////////////////////
  14. // Construction/Destruction
  15. //////////////////////////////////////////////////////////////////////
  16. AFX_INLINE UINT AFXAPI HashKey(CString key)
  17. {
  18. return HashKey((LPCTSTR)key);
  19. }
  20. CWordSegment::CWordSegment()
  21. {
  22. m_nInit=0;
  23. m_nOperateType=0;
  24. m_dSmoothingPara=0.1;//Smoothing parameter
  25. }
  26. CWordSegment::~CWordSegment()
  27. {
  28. FreeWordSegment();
  29. }
  30. bool CWordSegment::InitWorgSegment(char *pPath, int nLanguage)
  31. {
  32. if(!m_nInit)
  33. {
  34. char filename[MAX_PATH];
  35. if(nLanguage==0)
  36. {
  37. strcpy(filename,pPath);
  38. strcat(filename,"\data\coreDict.dct");
  39. if(!m_dictCore.Load(filename)) return false;
  40. strcpy(filename,pPath);
  41. strcat(filename,"\data\lexical.ctx");
  42. if(!m_POSTagger.LoadContext(filename))  return false;
  43. m_POSTagger.SetTagType();
  44. strcpy(filename,pPath);
  45. strcat(filename,"\data\nr");
  46. if(!m_uPerson.Configure(filename,TT_PERSON)) return false;
  47. strcpy(filename,pPath);
  48. strcat(filename,"\data\ns");
  49. if(!m_uPlace.Configure(filename,TT_PLACE)) return false;
  50. strcpy(filename,pPath);
  51. strcat(filename,"\data\tr");
  52. if(!m_uTransPerson.Configure(filename,TT_TRANS_PERSON)) return false;
  53. strcpy(filename,pPath);
  54. strcat(filename,"\data\BigramDict.dct");
  55. if(!m_dictBigram.Load(filename)) return false;
  56. }
  57. FILE *stream;
  58. char word[MAX_PATH];
  59. strcpy(filename,pPath);
  60. if(nLanguage==0)
  61. {
  62. strcat(filename,"\data\stopwords.txt");
  63. m_nInit=1;
  64. }
  65. else
  66. {
  67. strcat(filename,"\data\engstopwords.txt");
  68. m_nInit=2;
  69. }
  70. if((stream  = fopen( filename, "r" )) == NULL)
  71. return false;
  72. while(!feof(stream))
  73. {
  74. fscanf(stream,"%sn",word); 
  75. if(word[0]!='') m_lstStopWords.SetAt(word,1);
  76. }
  77. m_lstStopWords.SetAt("rn",1);
  78. m_lstStopWords.SetAt(" rn",1);
  79. m_lstStopWords.SetAt("n",1);
  80. m_lstStopWords.SetAt(" n",1);
  81. fclose(stream);
  82. }
  83. return true;
  84. }
  85. void CWordSegment::FreeWordSegment(void)
  86. {
  87. if(m_nInit==1)
  88. {
  89. m_lstStopWords.RemoveAll();
  90. m_dictCore.ReleaseDict();
  91. m_dictBigram.ReleaseDict();
  92. m_POSTagger.ReleaseSpan();
  93. m_uPerson.ReleaseUnknowWord();
  94. m_uTransPerson.ReleaseUnknowWord();
  95. m_uPlace.ReleaseUnknowWord();
  96. }
  97. if(m_nInit==2)
  98. {
  99. m_lstStopWords.RemoveAll();
  100. }
  101. m_nInit=0;
  102. }
  103. void CWordSegment::SetSegSetting(unsigned short Setting)
  104. {
  105. m_nOperateType=Setting;
  106. }
  107. unsigned short CWordSegment::GetSegSetting()
  108. {
  109. return m_nOperateType;
  110. }
  111. bool CWordSegment::Segment(char* sentence)
  112. {
  113. int nIndex,nResultCount;
  114. strcpy(m_pNewSentence,SENTENCE_BEGIN);
  115. strcat(m_pNewSentence,sentence);
  116. strcat(m_pNewSentence,SENTENCE_END);
  117. m_Seg.BiSegment(m_pNewSentence, m_dSmoothingPara,m_dictCore,m_dictBigram,1);
  118. nResultCount=m_Seg.m_nSegmentCount;
  119. for(nIndex=0;nIndex<m_Seg.m_nSegmentCount;nIndex++)
  120. {
  121. if(m_nOperateType&&uPerson)
  122. m_uPerson.Recognition(m_Seg.m_pWordSeg[nIndex],m_Seg.m_graphOptimum,m_Seg.m_graphSeg,m_dictCore);
  123. if(m_nOperateType&&uTransPerson)
  124. m_uTransPerson.Recognition(m_Seg.m_pWordSeg[nIndex],m_Seg.m_graphOptimum,m_Seg.m_graphSeg,m_dictCore);
  125. if(m_nOperateType&&uPlace)
  126. m_uPlace.Recognition(m_Seg.m_pWordSeg[nIndex],m_Seg.m_graphOptimum,m_Seg.m_graphSeg,m_dictCore);
  127. }
  128. m_Seg.BiOptimumSegment(1,m_dSmoothingPara,m_dictBigram,m_dictCore);
  129. for(nIndex=0;nIndex<m_Seg.m_nSegmentCount;nIndex++)
  130. {
  131. if(m_nOperateType&&uTag)
  132. m_POSTagger.POSTagging(m_Seg.m_pWordSeg[nIndex],m_dictCore,m_dictCore);
  133. }
  134. return true;
  135. }
  136. int CWordSegment::GetSegmentCount()
  137. {
  138. return m_Seg.m_nSegmentCount;
  139. }
  140. PWORD_RESULT CWordSegment::GetWordSeg(int index)
  141. {
  142. return m_Seg.m_pWordSeg[index];
  143. }
  144. bool CWordSegment::isInStopWords(char *pWord)
  145. {
  146.   int t;
  147.   return(m_lstStopWords.Lookup(pWord,t)>0);
  148. }
  149. //bStopWord=true 去掉停用词
  150. //bTag=true      结果文件中包含词性标记
  151. int CWordSegment::SegmentFile(const char *source, const char *target, BOOL bStopWord, BOOL bTag)
  152. {
  153. CFile fin;
  154. FILE *fout;
  155. if(!fin.Open(source,CFile::modeRead))
  156. return -1;
  157. if((fout=fopen(target,"w+"))==NULL)
  158. {
  159. fin.Close();
  160. return -1;
  161. }
  162. //读入文件的内容
  163. int num=0;
  164. unsigned int flen=fin.GetLength();
  165. char *buffer=new char[flen+1];
  166. flen=fin.ReadHuge(buffer,flen);
  167. buffer[flen]='';
  168. fin.Close();
  169. int i,j,sum;
  170. char *w;
  171. //realcnt为文章中去掉停用词后剩下的总共词数
  172. //nStart为一个句子在buffer中的开始位置
  173. int nStart=0,nNewStart=0;
  174. bool flag=true;
  175. int nSentenceLen=0;
  176. int realcnt=0;
  177. while(buffer[nStart]!='')
  178. {
  179. flag=true;
  180. nSentenceLen=ParseFile(buffer,nStart,nNewStart);
  181. nStart=nNewStart;
  182. if(nSentenceLen==0) continue;
  183. if(m_pSentence[0]>0) //如果是一个英文单词
  184. {
  185. //如果英文单词的长度大于等于2,且不是数字
  186. if((nSentenceLen>=2)&&((m_pSentence[0]<'0')||(m_pSentence[0]>'9')))
  187. {
  188. fprintf(fout,"%sn",m_pSentence);
  189. realcnt++;
  190. }
  191. }
  192. else //如果是汉字串
  193. {
  194. if(nSentenceLen%2!=0) continue;
  195. if(nSentenceLen==2) //如果是单个汉字
  196. {
  197. fprintf(fout,"%sn",m_pSentence);;
  198. realcnt++;
  199. }
  200. else
  201. {
  202. g_wordSeg.Segment(m_pSentence);
  203. for(i=0;i<g_wordSeg.GetSegmentCount();i++)
  204. {
  205. sum=0;
  206. PWORD_RESULT pItem=g_wordSeg.GetWordSeg(i);
  207. while(pItem[sum].sWord[0]!=0) sum++;
  208. for(j=1;j<sum-1;j++)
  209. {
  210. w=pItem[j].sWord;
  211. if(w[0]==''||(bStopWord&&g_wordSeg.isInStopWords(w))) flag=false;
  212. if(flag)
  213. {
  214. fprintf(fout,"%sn",w);
  215. realcnt++;
  216. }
  217. }
  218. }
  219. }
  220. }
  221. }
  222. fclose(fout);
  223. delete[] buffer;
  224. return realcnt;
  225. }
  226. //对pBuffer中的文字进行处理,得到一个句子的结束位置nEnd
  227. //并且将去掉空格回车等字母后的句子放入变量m_pSentence
  228. int CWordSegment::ParseFile(char *pBuffer, int nStart, int &nEnd)
  229. {
  230. //nSum为得到的句子包含的字节数
  231. int nCurrent,nSum=0;
  232. byte bChar[2];
  233. //是否为中文句子
  234. bool bChinese=true;
  235. nCurrent=nStart;
  236. //去掉句首的全角和半角空格
  237. //判断句子是以中文字母开头,还是英文字母开头
  238. while(pBuffer[nCurrent]!='')
  239. {
  240. bChar[0]=pBuffer[nCurrent];
  241. if(bChar[0]==' '||bChar[0]=='r'||bChar[0]=='n')
  242. nCurrent++;
  243. else if(bChar[0]==0xA1)
  244. {
  245. bChar[1]=pBuffer[nCurrent+1];
  246. if(bChar[1]==0xA1) 
  247. nCurrent+=2;
  248. else
  249. {
  250. bChinese=true;
  251. break;
  252. }
  253. }
  254. else if(pBuffer[nCurrent]>0)
  255. {
  256. bChinese=false;
  257. break;
  258. }
  259. else
  260. {
  261. bChinese=true;
  262. break;
  263. }
  264. }
  265. while(pBuffer[nCurrent]!='')
  266. {
  267. bChar[0]=pBuffer[nCurrent];
  268. if(bChar[0]>127)
  269. {
  270. if(!bChinese) break;
  271. nCurrent++;
  272. bChar[1]=pBuffer[nCurrent];
  273. //0xA1A1为全角的空格
  274. if((bChar[0]!=0xA1)||(bChar[1]!=0xA1))
  275. {
  276. //如果为"的"字,或为标点符号或其它全角字母
  277. if(((bChar[0]==0xB5)&&(bChar[1]==0xC4))||
  278. ((bChar[0]==0xA1)&&(bChar[1]>0xA1)&&(bChar[1]<=0xFE))||
  279. ((bChar[0]==0xA2)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xFC))||
  280. ((bChar[0]==0xA3)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xFE))||
  281. ((bChar[0]==0xA4)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xF3))||
  282. ((bChar[0]==0xA5)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xF6))||
  283. ((bChar[0]==0xA6)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xF5))||
  284. ((bChar[0]==0xA7)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xF1))||
  285. ((bChar[0]==0xA8)&&(bChar[1]>=0x40)&&(bChar[1]<=0xE9))||
  286. ((bChar[0]==0xA9)&&(bChar[1]>=0x40)&&(bChar[1]<=0xEF))||
  287. ((bChar[0]==0xAA)&&(bChar[1]==0xA5)))
  288. {
  289. nCurrent++;
  290. break;
  291. }
  292. //如果后半部分小于0x80,代表是一个错误的汉字
  293. else if(bChar[1]>=0x80)
  294. {
  295. m_pSentence[nSum]=pBuffer[nCurrent-1];
  296. nSum++;
  297. m_pSentence[nSum]=pBuffer[nCurrent];
  298. nSum++;
  299. }
  300. }
  301. }
  302. else
  303. {
  304. if(bChar[0]==' '||bChar[0]=='r'||bChar[0]=='n')
  305. {
  306. if(!bChinese)
  307. {
  308. nCurrent++;
  309. break;
  310. }
  311. }
  312. else
  313. {
  314. //if(bChar[0]=='!'||bChar[0]=='?'||bChar[0]==':'||
  315. // bChar[0]==';'||bChar[0]=='.')
  316. if((bChar[0]>32&&bChar[0]<=47)||(bChar[0]>=58&&bChar[0]<=64)||
  317. (bChar[0]>=91&&bChar[0]<=96)||(bChar[0]>=123&&bChar[0]<=127))
  318. {
  319. nCurrent++;
  320. break;
  321. }
  322. else if(bChinese) break;
  323. else
  324. {
  325. m_pSentence[nSum]=pBuffer[nCurrent];
  326. nSum++;
  327. }
  328. }
  329. }
  330. nCurrent++;
  331. }
  332. m_pSentence[nSum]='';
  333. nEnd=nCurrent;
  334. return nSum;
  335. }