Segment.cpp
上传用户:sanxfzhen
上传日期:2014-12-28
资源大小:2324k
文件大小:16k
源码类别:

多国语言处理

开发平台:

Visual C++

  1. //////////////////////////////////////////////////////////////////////
  2. //ICTCLAS简介:计算所汉语词法分析系统ICTCLAS(Institute of Computing Technology, Chinese Lexical Analysis System),
  3. //             功能有:中文分词;词性标注;未登录词识别。
  4. //             分词正确率高达97.58%(973专家评测结果),
  5. //             未登录词识别召回率均高于90%,其中中国人名的识别召回率接近98%;
  6. //             处理速度为31.5Kbytes/s。
  7. //著作权:  Copyright?2002-2005中科院计算所 职务著作权人:张华平 刘群
  8. //遵循协议:自然语言处理开放资源许可证1.0
  9. //Email: zhanghp@software.ict.ac.cn
  10. //Homepage:www.nlp.org.cn;mtgroup.ict.ac.cn
  11. /****************************************************************************
  12.  *
  13.  * Copyright (c) 2000, 2001 
  14.  *     Machine Group
  15.  *     Software Research Lab.
  16.  *     Institute of Computing Tech.
  17.  *     Chinese Academy of Sciences
  18.  *     All rights reserved.
  19.  *
  20.  * This file is the confidential and proprietary property of 
  21.  * Institute of Computing Tech. and the posession or use of this file requires 
  22.  * a written license from the author.
  23.  * Filename: Segment.cpp
  24.  * Abstract:
  25.  *           implementation of the CSegment class.
  26.  * Author:   Kevin Zhang 
  27.  *          (zhanghp@software.ict.ac.cn)
  28.  * Date:     2002-4-23
  29.  *
  30.  * Notes:  N-Shortest paths Word segmentation
  31.  *                
  32.  ****************************************************************************/
  33. #include "stdafx.h"
  34. #include "Segment.h"
  35. #include "..\Utility\Dictionary.h"
  36. #include "..\Utility\Utility.h"
  37. #include "NShortPath.h"
  38. #include <string.h>
  39. #include <math.h>
  40. //////////////////////////////////////////////////////////////////////
  41. // Construction/Destruction
  42. //////////////////////////////////////////////////////////////////////
  43. CSegment::CSegment()
  44. {
  45. //malloc buffer
  46. m_pWordSeg=new PWORD_RESULT[MAX_SEGMENT_NUM];
  47. for(int i=0;i<MAX_SEGMENT_NUM;i++)
  48. {
  49. m_pWordSeg[i]=new WORD_RESULT[MAX_WORDS];
  50. }
  51. m_npWordPosMapTable=0;//Record the start position of possible words
  52. m_nWordCount=0;//Record the End position of possible words
  53. m_graphOptimum.SetRowFirst();//Set row first
  54. }
  55. CSegment::~CSegment()
  56. {
  57. //free buffer
  58. for(int i=0;i<MAX_SEGMENT_NUM;i++)
  59. {
  60. delete m_pWordSeg[i];
  61. }
  62. delete m_pWordSeg;
  63. }
  64. bool CSegment::Segment(char *sSentence,CDictionary &dictCore,int nResultCount)
  65. {
  66. int **nSegRoute;//The segmentation route
  67. nSegRoute=new int*[MAX_SEGMENT_NUM];
  68. for(int i=0;i<MAX_SEGMENT_NUM;i++)
  69. {
  70. nSegRoute[i]=new int[MAX_SENTENCE_LEN/2];
  71. memset(nSegRoute[i],0,MAX_SENTENCE_LEN/2*sizeof(int));
  72. }
  73. m_graphSeg.m_segGraph.SetRowFirst(false);
  74. m_graphOptimum.SetRowFirst(false);
  75.   m_graphSeg.GenerateWordNet(sSentence,dictCore);
  76. CNShortPath sp(&m_graphSeg.m_segGraph,nResultCount);
  77. sp.ShortPath();
  78. sp.Output((int **)nSegRoute,false,&m_nSegmentCount);
  79. m_graphOptimum.SetEmpty();//Set graph optimum empty
  80. i=0;
  81. while(i<m_nSegmentCount)
  82. {
  83. GenerateWord(nSegRoute,i);
  84. //Gernerate word according the Segmentation route
  85. i++;
  86. }
  87. //free the memory
  88. for(i=0;i<MAX_SEGMENT_NUM;i++)
  89. {
  90. delete [] nSegRoute[i];//free the pointer memory
  91. }
  92. delete [] nSegRoute;//free the pointer array
  93. return true;
  94. }
  95. //Generate Word according the segmentation route
  96. bool CSegment::GenerateWord(int **nSegRoute, int nIndex)
  97. {
  98. unsigned int i=0,k=0;
  99. int j,nStartVertex,nEndVertex,nPOS;
  100. char sAtom[WORD_MAXLENGTH],sNumCandidate[100],sCurWord[100];
  101. ELEMENT_TYPE fValue;
  102. while(nSegRoute[nIndex][i]!=-1&&nSegRoute[nIndex][i+1]!=-1&&nSegRoute[nIndex][i]<nSegRoute[nIndex][i+1])
  103. {
  104. nStartVertex=nSegRoute[nIndex][i];
  105. j=nStartVertex;//Set the start vertex
  106. nEndVertex=nSegRoute[nIndex][i+1];//Set the end vertex
  107. nPOS=0;
  108. m_graphSeg.m_segGraph.GetElement(nStartVertex,nEndVertex,&fValue,&nPOS);
  109. sAtom[0]=0;
  110. while(j<nEndVertex)
  111. {//Generate the word according the segmentation route
  112. strcat(sAtom,m_graphSeg.m_sAtom[j]);
  113. j++;
  114. }
  115. m_pWordSeg[nIndex][k].sWord[0]=0;//Init the result ending
  116. strcpy(sNumCandidate,sAtom);
  117. while(sAtom[0]!=0&&(IsAllNum((unsigned char *)sNumCandidate)||IsAllChineseNum(sNumCandidate)))
  118. {//Merge all seperate continue num into one number
  119.  //sAtom[0]!=0: add in 2002-5-9
  120. strcpy(m_pWordSeg[nIndex][k].sWord,sNumCandidate);
  121. //Save them in the result segmentation
  122. i++;//Skip to next atom now 
  123. sAtom[0]=0;
  124. while(j<nSegRoute[nIndex][i+1])
  125. {//Generate the word according the segmentation route
  126. strcat(sAtom,m_graphSeg.m_sAtom[j]);
  127. j++;
  128. }
  129. strcat(sNumCandidate,sAtom);
  130. }
  131. unsigned int nLen=strlen(m_pWordSeg[nIndex][k].sWord);
  132. if(nLen==4&&CC_Find("第上成±—+∶·./",m_pWordSeg[nIndex][k].sWord)||nLen==1&&strchr("+-./",m_pWordSeg[nIndex][k].sWord[0]))
  133. {//Only one word
  134. strcpy(sCurWord,m_pWordSeg[nIndex][k].sWord);//Record current word
  135. i--;
  136. }
  137. else if(m_pWordSeg[nIndex][k].sWord[0]==0)//Have never entering the while loop
  138. {
  139. strcpy(m_pWordSeg[nIndex][k].sWord,sAtom);
  140. //Save them in the result segmentation
  141. strcpy(sCurWord,sAtom);//Record current word
  142. }
  143. else
  144. {//It is a num
  145. if(strcmp("--",m_pWordSeg[nIndex][k].sWord)==0||strcmp("—",m_pWordSeg[nIndex][k].sWord)==0||m_pWordSeg[nIndex][k].sWord[0]=='-'&&m_pWordSeg[nIndex][k].sWord[1]==0)//The delimiter "--"
  146. {
  147. nPOS=30464;//'w'*256;Set the POS with 'w'
  148. i--;//Not num, back to previous word
  149. }
  150. else
  151. {//Adding time suffix
  152. char sInitChar[3];
  153. unsigned int nCharIndex=0;//Get first char
  154. sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex];
  155. if(sInitChar[nCharIndex]<0)
  156. {
  157. nCharIndex+=1;
  158. sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex];
  159. }
  160. nCharIndex+=1;
  161. sInitChar[nCharIndex]='';
  162. if(k>0&&(abs(m_pWordSeg[nIndex][k-1].nHandle)==27904||abs(m_pWordSeg[nIndex][k-1].nHandle)==29696)&&(strcmp(sInitChar,"—")==0||sInitChar[0]=='-')&&(strlen(m_pWordSeg[nIndex][k].sWord)>nCharIndex))
  163. {//3-4月                                 //27904='m'*256
  164.    //Split the sInitChar from the original word
  165. strcpy(m_pWordSeg[nIndex][k+1].sWord,m_pWordSeg[nIndex][k].sWord+nCharIndex);
  166. m_pWordSeg[nIndex][k+1].dValue=m_pWordSeg[nIndex][k].dValue;
  167. m_pWordSeg[nIndex][k+1].nHandle=27904;
  168. m_pWordSeg[nIndex][k].sWord[nCharIndex]=0;
  169. m_pWordSeg[nIndex][k].dValue=0;
  170. m_pWordSeg[nIndex][k].nHandle=30464;//'w'*256;
  171. m_graphOptimum.SetElement(nStartVertex,nStartVertex+1,m_pWordSeg[nIndex][k].dValue,m_pWordSeg[nIndex][k].nHandle,m_pWordSeg[nIndex][k].sWord);
  172. nStartVertex+=1;
  173. k+=1;
  174. }
  175. nLen=strlen(m_pWordSeg[nIndex][k].sWord);
  176. if((strlen(sAtom)==2&&CC_Find("月日时分秒",sAtom))||strcmp(sAtom,"月份")==0)
  177. {//2001年
  178. strcat(m_pWordSeg[nIndex][k].sWord,sAtom);
  179. strcpy(sCurWord,"未##时");
  180. nPOS=-29696;//'t'*256;//Set the POS with 'm'
  181. }
  182. else if(strcmp(sAtom,"年")==0)
  183. {
  184.  if(IsYearTime(m_pWordSeg[nIndex][k].sWord))//strncmp(sAtom,"年",2)==0&&
  185.  {//1998年,
  186. strcat(m_pWordSeg[nIndex][k].sWord,sAtom);
  187. strcpy(sCurWord,"未##时");
  188. nPOS=-29696;//Set the POS with 't'
  189.  }
  190.  else
  191.  {
  192. strcpy(sCurWord,"未##数");
  193. nPOS=-27904;//Set the POS with 'm'
  194. i--;//Can not be a time word
  195.  }
  196. }
  197.       else
  198. {
  199. //早晨/t  五点/t 
  200. if(strcmp(m_pWordSeg[nIndex][k].sWord+strlen(m_pWordSeg[nIndex][k].sWord)-2,"点")==0)
  201. {
  202. strcpy(sCurWord,"未##时");
  203. nPOS=-29696;//Set the POS with 't'
  204. }
  205. else 
  206. {
  207. if(!CC_Find("∶·./",m_pWordSeg[nIndex][k].sWord+nLen-2)&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='.'&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='/')
  208. {
  209. strcpy(sCurWord,"未##数");
  210. nPOS=-27904;//'m'*256;Set the POS with 'm'
  211. }
  212. else if(nLen>strlen(sInitChar))
  213. {//Get rid of . example 1.
  214. if(m_pWordSeg[nIndex][k].sWord[nLen-1]=='.'||m_pWordSeg[nIndex][k].sWord[nLen-1]=='/')
  215. m_pWordSeg[nIndex][k].sWord[nLen-1]=0;
  216. else
  217. m_pWordSeg[nIndex][k].sWord[nLen-2]=0;
  218. strcpy(sCurWord,"未##数");
  219. nPOS=-27904;//'m'*256;Set the POS with 'm'
  220. i--;
  221. }
  222. }
  223. i--;//Not num, back to previous word
  224. }
  225. }
  226. fValue=0;
  227. nEndVertex=nSegRoute[nIndex][i+1];//Ending POS changed to latter
  228. }
  229. m_pWordSeg[nIndex][k].nHandle=nPOS;//Get the POS of current word
  230. m_pWordSeg[nIndex][k].dValue=fValue;//(int)(MAX_FREQUENCE*exp(-fValue));//Return the frequency of current word
  231. m_graphOptimum.SetElement(nStartVertex,nEndVertex,fValue,nPOS,sCurWord);
  232. //Generate optimum segmentation graph according the segmentation result
  233. i++;//Skip to next atom
  234. k++;//Accept next word
  235. }
  236. m_pWordSeg[nIndex][k].sWord[0]=0;
  237. m_pWordSeg[nIndex][k].nHandle=-1;//Set ending
  238. return true;
  239. }
  240. //DEL bool CSegment::GetSegmentResult(int nIndex,char *sResult)
  241. //DEL {
  242. //DEL  int i=0;
  243. //DEL  char sTempBuffer[WORD_MAXLENGTH];
  244. //DEL  sResult[0]=0;
  245. //DEL  if(nIndex<0||nIndex>=m_nSegmentCount)
  246. //DEL  return false;
  247. //DEL  while(m_WordSeg[nIndex][i].sWord[0]!=0)
  248. //DEL  {
  249. //DEL  sprintf(sTempBuffer,"%s/%c%c",m_WordSeg[nIndex][i].sWord,m_WordSeg[nIndex][i].nHandle/256,m_WordSeg[nIndex][i].nHandle%256);
  250. //DEL  strcat(sResult,sTempBuffer);
  251. //DEL  strcat(sResult,"  ");
  252. //DEL  i++;
  253. //DEL  }
  254. //DEL  return true;
  255. //DEL }
  256. //Word Segmentation based on optimum segmentation graph
  257. //After unknown word recognition
  258. bool CSegment::OptimumSegmet(int nResultCount)
  259. {
  260. int **nSegRoute;//The segmentation route
  261. nSegRoute=new int*[MAX_SEGMENT_NUM];
  262. for(int i=0;i<MAX_SEGMENT_NUM;i++)
  263. {
  264. nSegRoute[i]=new int[MAX_SENTENCE_LEN/2];
  265. }
  266. CNShortPath sp(&m_graphOptimum,nResultCount);
  267. sp.ShortPath();
  268. sp.Output((int **)nSegRoute,false,&m_nSegmentCount);
  269. i=0;
  270. m_graphSeg.m_segGraph=m_graphOptimum;
  271. m_graphOptimum.SetEmpty();//Set graph optimum empty
  272. while(i<m_nSegmentCount)
  273. {
  274. GenerateWord(nSegRoute,i);
  275. //Gernerate word according the Segmentation route
  276. i++;
  277. }
  278. //free the memory
  279. for(i=0;i<MAX_SEGMENT_NUM;i++)
  280. {
  281. delete [] nSegRoute[i];//free the pointer memory
  282. }
  283. delete [] nSegRoute;//free the pointer array
  284. return true;
  285. }
  286. int CSegment::GetResultCount(PWORD_RESULT pItem)
  287. {
  288. int nCount=0;
  289. while(pItem[nCount].sWord[0]!=0)
  290. {
  291. nCount+=1;
  292. }
  293. return nCount;
  294. }
  295. bool CSegment::GetLastWord(PWORD_RESULT pItem, char *sWordRet)
  296. {
  297. int nCount=0;
  298. sWordRet[0]=0;
  299. while(pItem[nCount].sWord[0]!=0)
  300. {
  301. strcpy(sWordRet,pItem[nCount].sWord);
  302. nCount+=1;
  303. }
  304. return !sWordRet[0];
  305. }
  306. bool CSegment::IsYearTime(char *sNum)
  307. {//Judge whether the sNum is a num genearating year
  308. unsigned int nLen=strlen(sNum);
  309. char sTemp[3];
  310. strncpy(sTemp,sNum,2);
  311. sTemp[2]=0;
  312. if(IsAllSingleByte((unsigned char *)sNum)&&(nLen==4||nLen==2&&sNum[0]>'4'))//1992年, 90年
  313. return true;
  314. if(IsAllNum((unsigned char *)sNum)&&(nLen>=6||nLen==4&&CC_Find("56789",sTemp)))
  315. return true;
  316. if(GetCharCount("零○一二三四五六七八九壹贰叁肆伍陆柒捌玖",sNum)==(int)nLen/2&&nLen>=3)
  317. return true;
  318. if(nLen==8&&GetCharCount("千仟零○",sNum)==2)//二仟零二年
  319. return true;
  320. if(nLen==2&&GetCharCount("千仟",sNum)==1)
  321. return true;
  322. if(nLen==4&&GetCharCount("甲乙丙丁戊己庚辛壬癸",sNum)==1&&GetCharCount("子丑寅卯辰巳午未申酉戌亥",sNum+2)==1)
  323. return true;
  324. return false;
  325. }
  326. //CDynamicArray &aWord: the words array
  327. //CDynamicArray &aWordBinaryNet:the net between words
  328. //double dSmoothingPara: the parameter of data smoothing
  329. //CDictionary &DictBinary: the binary dictionary
  330. //CDictionary &DictCore: the Core dictionary
  331. bool CSegment::BiGraphGenerate(CDynamicArray &aWord, CDynamicArray &aBinaryWordNet,double dSmoothingPara,CDictionary &DictBinary,CDictionary &DictCore)
  332. {
  333. PARRAY_CHAIN pTail,pCur,pNextWords;//Temp buffer
  334. unsigned int nWordIndex=0,nTwoWordsFreq=0,nCurWordIndex,nNextWordIndex;
  335. //nWordIndex: the index number of current word
  336. double dCurFreqency,dValue,dTemp;
  337. char sTwoWords[WORD_MAXLENGTH];
  338. m_nWordCount=aWord.GetTail(&pTail);//Get tail element and return the words count
  339. if(m_npWordPosMapTable)
  340. {//free buffer
  341. delete [] m_npWordPosMapTable;
  342. m_npWordPosMapTable=0;
  343. }
  344. if(m_nWordCount>0)//Word count is greater than 0
  345. m_npWordPosMapTable=new int[m_nWordCount];//Record the  position of possible words
  346. pCur=aWord.GetHead();
  347. while(pCur!=NULL)//Set the position map of words
  348. {
  349. m_npWordPosMapTable[nWordIndex++]=pCur->row*MAX_SENTENCE_LEN+pCur->col;
  350. pCur=pCur->next;
  351. }
  352. pCur=aWord.GetHead();
  353. while(pCur!=NULL)//
  354. {
  355. if(pCur->nPOS>=0)//It's not an unknown words
  356. dCurFreqency=pCur->value;
  357. else//Unknown words
  358. dCurFreqency=DictCore.GetFrequency(pCur->sWord,2);
  359. aWord.GetElement(pCur->col,-1,pCur,&pNextWords);//Get next words which begin with pCur->col
  360. while(pNextWords&&pNextWords->row==pCur->col)//Next words
  361. {
  362. //Current words frequency
  363. strcpy(sTwoWords,pCur->sWord);
  364. strcat(sTwoWords,WORD_SEGMENTER);
  365. strcat(sTwoWords,pNextWords->sWord);
  366. nTwoWordsFreq=DictBinary.GetFrequency(sTwoWords,3);
  367. //Two linked Words frequency
  368. dTemp=(double)1/MAX_FREQUENCE;
  369. //Smoothing
  370. dValue=-log(dSmoothingPara*(1+dCurFreqency)/(MAX_FREQUENCE+80000)+(1-dSmoothingPara)*((1-dTemp)*nTwoWordsFreq/(1+dCurFreqency)+dTemp));
  371. //-log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
  372. if(pCur->nPOS<0)//Unknown words: P(Wi|Ci);while known words:1
  373.     dValue+=pCur->value;
  374. //Get the position index of current word in the position map table
  375. nCurWordIndex=BinarySearch(pCur->row*MAX_SENTENCE_LEN+pCur->col,m_npWordPosMapTable,m_nWordCount);
  376. nNextWordIndex=BinarySearch(pNextWords->row*MAX_SENTENCE_LEN+pNextWords->col,m_npWordPosMapTable,m_nWordCount);
  377. aBinaryWordNet.SetElement(nCurWordIndex,nNextWordIndex,dValue,pCur->nPOS);
  378. pNextWords=pNextWords->next;//Get next word
  379. }
  380. pCur=pCur->next;
  381. }
  382. return true;
  383. }
  384. bool CSegment::BiSegment(char *sSentence, double dSmoothingPara, CDictionary &dictCore, CDictionary &dictBinary, unsigned int nResultCount)
  385. {
  386. int **nSegRoute;//The segmentation route
  387. nSegRoute=new int*[MAX_SEGMENT_NUM];
  388. unsigned int nLen=strlen(sSentence)+10;
  389. for(int i=0;i<MAX_SEGMENT_NUM;i++)
  390. {
  391. nSegRoute[i]=new int[nLen/2];
  392. memset(nSegRoute[i],-1,nLen/2*sizeof(int));
  393. }
  394.   m_graphSeg.GenerateWordNet(sSentence,dictCore,true);//Generate words array
  395.     CDynamicArray aBiwordsNet;
  396. BiGraphGenerate(m_graphSeg.m_segGraph,aBiwordsNet,dSmoothingPara,dictBinary,dictCore);
  397.     //Generate the biword link net
  398.     
  399. CNShortPath sp(&aBiwordsNet,nResultCount);
  400. sp.ShortPath();
  401. sp.Output(nSegRoute,false,&m_nSegmentCount);
  402. m_graphOptimum.SetEmpty();//Set graph optimum empty
  403. i=0;
  404. while(i<m_nSegmentCount)
  405. {
  406. BiPath2UniPath(nSegRoute[i]);
  407. //Path convert to unipath
  408. GenerateWord(nSegRoute,i);
  409. //Gernerate word according the Segmentation route
  410. i++;
  411. }
  412. //free the memory
  413. for(i=0;i<MAX_SEGMENT_NUM;i++)
  414. {
  415. delete [] nSegRoute[i];//free the pointer memory
  416. }
  417. delete [] nSegRoute;//free the pointer array
  418. return true;
  419. }
  420. bool CSegment::BiPath2UniPath(int *npPath)
  421. {//BiPath convert to unipath
  422. int i=0,nTemp=-1;
  423. if(!m_npWordPosMapTable)
  424. return false;
  425. while(npPath[i]!=-1&&npPath[i]<m_nWordCount)
  426. {
  427. nTemp=m_npWordPosMapTable[npPath[i]];
  428. npPath[i]=nTemp/MAX_SENTENCE_LEN;
  429. i++;
  430. }
  431. if(nTemp>0)
  432. npPath[i++]=nTemp%MAX_SENTENCE_LEN;
  433. npPath[i]=-1;
  434. return true;
  435. }
  436. bool CSegment::BiOptimumSegment(unsigned int nResultCount,double dSmoothingPara, CDictionary &dictBinary, CDictionary &dictCore)
  437. {
  438. int **nSegRoute;//The segmentation route
  439. nSegRoute=new int*[MAX_SEGMENT_NUM];
  440. for(int i=0;i<MAX_SEGMENT_NUM;i++)
  441. {
  442. nSegRoute[i]=new int[MAX_SENTENCE_LEN/2];
  443. memset(nSegRoute[i],-1,MAX_SENTENCE_LEN/2*sizeof(int));
  444. }
  445.     CDynamicArray aBiwordsNet;
  446. BiGraphGenerate(m_graphOptimum,aBiwordsNet,dSmoothingPara,dictBinary,dictCore);
  447.     //Generate the biword link net
  448.     
  449. CNShortPath sp(&aBiwordsNet,nResultCount);
  450. sp.ShortPath();
  451. sp.Output((int **)nSegRoute,false,&m_nSegmentCount);
  452. i=0;
  453. m_graphSeg.m_segGraph=m_graphOptimum;
  454. m_graphOptimum.SetEmpty();//Set graph optimum empty
  455. while(i<m_nSegmentCount)
  456. {
  457. BiPath2UniPath(nSegRoute[i]);
  458. //Path convert to unipath
  459. GenerateWord(nSegRoute,i);
  460. //Gernerate word according the Segmentation route
  461. i++;
  462. }
  463. //free the memory
  464. for(i=0;i<MAX_SEGMENT_NUM;i++)
  465. {
  466. delete [] nSegRoute[i];//free the pointer memory
  467. }
  468. delete [] nSegRoute;//free the pointer array
  469. return true;
  470. }