Segment.cpp
上传用户:sunyong76
上传日期:2021-10-03
资源大小:2236k
文件大小:22k
源码类别:

多国语言处理

开发平台:

Java

  1. //////////////////////////////////////////////////////////////////////
  2. //ICTCLAS简介:计算所汉语词法分析系统ICTCLAS(Institute of Computing Technology, Chinese Lexical Analysis System),
  3. //             功能有:中文分词;词性标注;未登录词识别。
  4. //             分词正确率高达97.58%(973专家评测结果),
  5. //             未登录词识别召回率均高于90%,其中中国人名的识别召回率接近98%;
  6. //             处理速度为31.5Kbytes/s。
  7. //著作权:  Copyright (c) 2002-2005 中科院计算所 职务著作权人:张华平 刘群
  8. //遵循协议:自然语言处理开放资源许可证1.0
  9. //Email: zhanghp@software.ict.ac.cn
  10. //Homepage:www.nlp.org.cn;mtgroup.ict.ac.cn
  11. /****************************************************************************
  12.  *
  13.  * Copyright (c) 2000, 2001 
  14.  *     Machine Group
  15.  *     Software Research Lab.
  16.  *     Institute of Computing Tech.
  17.  *     Chinese Academy of Sciences
  18.  *     All rights reserved.
  19.  *
  20.  * This file is the confidential and proprietary property of 
  21.  * Institute of Computing Tech. and the possession or use of this file requires 
  22.  * a written license from the author.
  23.  * Filename: Segment.cpp
  24.  * Abstract:
  25.  *           implementation of the CSegment class.
  26.  * Author:   Kevin Zhang 
  27.  *          (zhanghp@software.ict.ac.cn)
  28.  * Date:     2002-4-23
  29.  *
  30.  * Notes:  N-Shortest paths Word segmentation
  31.  *                
  32.  ****************************************************************************/
  33. #include "stdafx.h"
  34. #include "Segment.h"
  35. #include "..\Utility\Dictionary.h"
  36. #include "..\Utility\Utility.h"
  37. #include "NShortPath.h"
  38. #include <string.h>
  39. #include <math.h>
  40. //////////////////////////////////////////////////////////////////////
  41. // Construction/Destruction
  42. //////////////////////////////////////////////////////////////////////
  43. CSegment::CSegment()
  44. {
  45. //malloc buffer
  46. m_pWordSeg=new PWORD_RESULT[MAX_SEGMENT_NUM];
  47. for(int i=0;i<MAX_SEGMENT_NUM;i++)
  48. {
  49. m_pWordSeg[i]=new WORD_RESULT[MAX_WORDS];
  50. }
  51. m_npWordPosMapTable=0;//Record the start position of possible words
  52. m_nWordCount=0;//Record the End position of possible words
  53. m_graphOptimum.SetRowFirst();//Set row first
  54. }
  55. CSegment::~CSegment()
  56. {
  57. //free buffer
  58. /* 
  59.      * ----- commented by huangjin@ict.ac.cn 2006-5-29 ------ 
  60.      * 
  61.      * for(int i=0;i<MAX_SEGMENT_NUM;i++)
  62.  * {
  63.  * delete m_pWordSeg[i];
  64.  * }
  65.  * delete m_pWordSeg;
  66.      *
  67.      */
  68. /*----Added By huangjin@ict.ac.cn 2006-5-29----*/
  69. if(m_pWordSeg!=NULL)
  70. {
  71. for(int i=0;i<MAX_SEGMENT_NUM;i++)
  72. {
  73. delete []m_pWordSeg[i];
  74. m_pWordSeg[i]=NULL;
  75. }
  76. delete []m_pWordSeg;
  77. m_pWordSeg=NULL;
  78. }
  79. /*-----------------------------------------------*/
  80. /*----Added By huangjin@ict.ac.cn 2006-5-31----*/
  81. if(m_npWordPosMapTable!=NULL)
  82. {
  83. delete []m_npWordPosMapTable;
  84. m_npWordPosMapTable=NULL;
  85. }
  86. /*-----------------------------------------------*/
  87. }
  88. bool CSegment::Segment(char *sSentence,CDictionary &dictCore,int nResultCount)
  89. {
  90. int **nSegRoute;//The segmentation route
  91. nSegRoute=new int*[MAX_SEGMENT_NUM];
  92. for(int i=0;i<MAX_SEGMENT_NUM;i++)
  93. {
  94. nSegRoute[i]=new int[MAX_SENTENCE_LEN/2];
  95. memset(nSegRoute[i],0,MAX_SENTENCE_LEN/2*sizeof(int));
  96. }
  97. m_graphSeg.m_segGraph.SetRowFirst(false);
  98. m_graphOptimum.SetRowFirst(false);
  99.   m_graphSeg.GenerateWordNet(sSentence,dictCore);
  100. CNShortPath sp(&m_graphSeg.m_segGraph,nResultCount);
  101. sp.ShortPath();
  102. sp.Output((int **)nSegRoute,false,&m_nSegmentCount);
  103. m_graphOptimum.SetEmpty();//Set graph optimum empty
  104. i=0;
  105. while(i<m_nSegmentCount)
  106. {
  107. GenerateWord(nSegRoute,i);
  108. //Gernerate word according the Segmentation route
  109. i++;
  110. }
  111. //free the memory
  112. for(i=0;i<MAX_SEGMENT_NUM;i++)
  113. {
  114. delete [] nSegRoute[i];//free the pointer memory
  115. /*----Added By huangjin@ict.ac.cn 2006-5-29----*/
  116. nSegRoute[i]=NULL;
  117. /*-----------------------------------------------*/
  118. }
  119. delete [] nSegRoute;//free the pointer array
  120. /*----Added By huangjin@ict.ac.cn 2006-5-29----*/
  121. nSegRoute=NULL;
  122. /*-----------------------------------------------*/
  123. return true;
  124. }
  125. //Generate Word according the segmentation route
  126. bool CSegment::GenerateWord(int **nSegRoute, int nIndex)
  127. {
  128. unsigned int i=0,k=0;
  129. int j,nStartVertex,nEndVertex,nPOS;
  130. /* 
  131. * ----- commented by huangjin@ict.ac.cn 2006-9-17 ------ 
  132. *
  133. *  char sAtom[WORD_MAXLENGTH],sNumCandidate[100],sCurWord[100];
  134. *
  135. */
  136. /*----Added By huangjin@ict.ac.cn 2006-9-17----*/
  137. char sAtom[WORD_MAXLENGTH],sNumCandidate[WORD_MAXLENGTH],sCurWord[WORD_MAXLENGTH];
  138. /*---------------------------------------------*/
  139. ELEMENT_TYPE fValue;
  140. while(nSegRoute[nIndex][i]!=-1&&nSegRoute[nIndex][i+1]!=-1&&nSegRoute[nIndex][i]<nSegRoute[nIndex][i+1])
  141. {
  142. nStartVertex=nSegRoute[nIndex][i];
  143. j=nStartVertex;//Set the start vertex
  144. nEndVertex=nSegRoute[nIndex][i+1];//Set the end vertex
  145. nPOS=0;
  146. m_graphSeg.m_segGraph.GetElement(nStartVertex,nEndVertex,&fValue,&nPOS);
  147. sAtom[0]=0;
  148. while(j<nEndVertex)
  149. {//Generate the word according the segmentation route
  150. strcat(sAtom,m_graphSeg.m_sAtom[j]);
  151. j++;
  152. }
  153. m_pWordSeg[nIndex][k].sWord[0]=0;//Init the result ending
  154. strcpy(sNumCandidate,sAtom);
  155. /* 
  156. * ----- commented by huangjin@ict.ac.cn 2006-9-12 ------ 
  157. *
  158. * while(sAtom[0]!=0&&(IsAllNum((unsigned char *)sNumCandidate)||IsAllChineseNum(sNumCandidate)))
  159. *
  160. */
  161. /*----Added By huangjin@ict.ac.cn 2006-9-12----*/
  162. bool bHaveDot=false;
  163. while(sAtom[0]!=0&&
  164. (IsAllNum((unsigned char *)sNumCandidate)||
  165. (IsAllChineseNum(sNumCandidate)&&
  166. !(bHaveDot=CC_Find(sNumCandidate,"点")||
  167. (bHaveDot&&!strncmp(m_graphSeg.m_sAtom[j],"分",2)))
  168. )))
  169. /*---------------------------------------------*/
  170. {//Merge all seperate continue num into one number
  171.  //sAtom[0]!=0: add in 2002-5-9
  172. strcpy(m_pWordSeg[nIndex][k].sWord,sNumCandidate);
  173. //Save them in the result segmentation
  174. i++;//Skip to next atom now 
  175. sAtom[0]=0;
  176. while(j<nSegRoute[nIndex][i+1])
  177. {//Generate the word according the segmentation route
  178. strcat(sAtom,m_graphSeg.m_sAtom[j]);
  179. j++;
  180. }
  181. strcat(sNumCandidate,sAtom);
  182. }
  183. unsigned int nLen=strlen(m_pWordSeg[nIndex][k].sWord);
  184. /* 
  185. * ----- commented by huangjin@ict.ac.cn 2006-7-18 ------ 
  186. *
  187. * if(nLen==4&&CC_Find("第上成±—+∶·./",m_pWordSeg[nIndex][k].sWord)||nLen==1&&strchr("+-./",m_pWordSeg[nIndex][k].sWord[0]))
  188. *
  189. */
  190. /* 
  191. * ----- commented by huangjin@ict.ac.cn 2006-9-17 ------ 
  192. *
  193. *  /*----Added By huangjin@ict.ac.cn 2006-7-18----*/
  194. if(nLen==2&&CC_Find("第上成±—+∶·./",m_pWordSeg[nIndex][k].sWord)||nLen==1&&strchr("+-./",m_pWordSeg[nIndex][k].sWord[0]))
  195. /*---------------------------------------------*
  196. *
  197. */
  198. /*----Added By huangjin@ict.ac.cn 2006-9-17----
  199. if(nLen>0 &&
  200.   (!(nLen%2)&&GetCharCount("第上成±—+∶·./",m_pWordSeg[nIndex][k].sWord)==nLen/2 ||
  201.    (nLen%2)&&GetCharCount("+-./",m_pWordSeg[nIndex][k].sWord) == nLen ))
  202. /*---------------------------------------------*/
  203. {//Only one word
  204. strcpy(sCurWord,m_pWordSeg[nIndex][k].sWord);//Record current word
  205. i--;
  206. }
  207. else if(m_pWordSeg[nIndex][k].sWord[0]==0)//Have never entering the while loop
  208. {
  209. strcpy(m_pWordSeg[nIndex][k].sWord,sAtom);
  210. //Save them in the result segmentation
  211. strcpy(sCurWord,sAtom);//Record current word
  212. }
  213. else
  214. {//It is a num.
  215. if(strcmp("--",m_pWordSeg[nIndex][k].sWord)==0||strcmp("—",m_pWordSeg[nIndex][k].sWord)==0||m_pWordSeg[nIndex][k].sWord[0]=='-'&&m_pWordSeg[nIndex][k].sWord[1]==0)//The delimiter "--"
  216. {
  217. nPOS=30464;//'w'*256;Set the POS with 'w'
  218. i--;//Not num, back to previous word
  219. }
  220. else
  221. {//Adding time suffix
  222. char sInitChar[3];
  223. unsigned int nCharIndex=0;//Get first char
  224. sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex];
  225. if(sInitChar[nCharIndex]<0)
  226. {
  227. nCharIndex+=1;
  228. sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex];
  229. }
  230. nCharIndex+=1;
  231. sInitChar[nCharIndex]='';
  232. if(k>0
  233. &&(abs(m_pWordSeg[nIndex][k-1].nHandle)==27904||abs(m_pWordSeg[nIndex][k-1].nHandle)==29696 )
  234. &&(strcmp(sInitChar,"—")==0||sInitChar[0]=='-')
  235. &&(strlen(m_pWordSeg[nIndex][k].sWord)>nCharIndex))
  236. {//3-4月                                 //27904='m'*256
  237.    //Split the sInitChar from the original word
  238. strcpy(m_pWordSeg[nIndex][k+1].sWord,m_pWordSeg[nIndex][k].sWord+nCharIndex);
  239. m_pWordSeg[nIndex][k+1].dValue=m_pWordSeg[nIndex][k].dValue;
  240. m_pWordSeg[nIndex][k+1].nHandle=27904;
  241. m_pWordSeg[nIndex][k].sWord[nCharIndex]=0;
  242. m_pWordSeg[nIndex][k].dValue=0;
  243. m_pWordSeg[nIndex][k].nHandle=30464;//'w'*256;
  244. m_graphOptimum.SetElement(nStartVertex,nStartVertex+1,m_pWordSeg[nIndex][k].dValue,m_pWordSeg[nIndex][k].nHandle,m_pWordSeg[nIndex][k].sWord);
  245. nStartVertex+=1;
  246. k+=1;
  247. }
  248. nLen=strlen(m_pWordSeg[nIndex][k].sWord);
  249. /* 
  250. * ----- commented by huangjin@ict.ac.cn 2006-7-18 ------ 
  251. *
  252. * if((strlen(sAtom)==2&&CC_Find("月日时分秒",sAtom))||strcmp(sAtom,"月份")==0)
  253. * {
  254. *
  255. */
  256. /*----Added By huangjin@ict.ac.cn 2006-7-18----*/
  257. if(((strlen(sAtom)==2&&CC_Find("月日时分秒",sAtom))||
  258. strcmp(sAtom,"月份")==0||
  259. strcmp(sAtom,"点钟")==0||
  260. strcmp(sAtom,"刻钟")==0))
  261. {
  262. if(!CC_Find(m_pWordSeg[nIndex][k].sWord,".")&&
  263. !strchr(m_pWordSeg[nIndex][k].sWord,'.')&&
  264. strlen(m_pWordSeg[nIndex][k].sWord)<=6)
  265. {
  266. /*---------------------------------------------*/
  267. strcat(m_pWordSeg[nIndex][k].sWord,sAtom);
  268. strcpy(sCurWord,"未##时");
  269. nPOS=-29696;//'t'*256;
  270. /*----Added By huangjin@ict.ac.cn 2006-7-18----*/
  271. }
  272. else
  273. {
  274. strcpy(sCurWord,"未##数");
  275. nPOS=-27904;//Set the POS with 'm'
  276. i--;//Can not be a time word
  277. }
  278. /*---------------------------------------------*/
  279. }
  280. else if(strcmp(sAtom,"年")==0)
  281. {//2001年
  282.  if(IsYearTime(m_pWordSeg[nIndex][k].sWord))//strncmp(sAtom,"年",2)==0&&
  283.  {//1998年,
  284. strcat(m_pWordSeg[nIndex][k].sWord,sAtom);
  285. strcpy(sCurWord,"未##时");
  286. nPOS=-29696;//Set the POS with 't'
  287.  }
  288.  else
  289.  {
  290. strcpy(sCurWord,"未##数");
  291. nPOS=-27904;//Set the POS with 'm'
  292. i--;//Can not be a time word
  293.  }
  294. }
  295.       else
  296. {
  297. //早晨/t  五点/t 
  298. if(strcmp(m_pWordSeg[nIndex][k].sWord+strlen(m_pWordSeg[nIndex][k].sWord)-2,"点")==0)
  299. {
  300. strcpy(sCurWord,"未##时");
  301. nPOS=-29696;//Set the POS with 't'
  302. }
  303. else 
  304. {
  305. if(!CC_Find("∶·/",m_pWordSeg[nIndex][k].sWord+nLen-2)&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='.'&&m_pWordSeg[nIndex][k].sWord[nLen-1]!='/')
  306. {
  307. strcpy(sCurWord,"未##数");
  308. nPOS=-27904;//'m'*256;Set the POS with 'm'
  309. }
  310. else if(nLen>strlen(sInitChar))
  311. {//Get rid of . example 1.
  312. /*----Added By huangjin@ict.ac.cn 2006-7-19----*/
  313. //前面还有点
  314. if((CC_Find((const char*)m_pWordSeg[nIndex][k].sWord,".")!=m_pWordSeg[nIndex][k].sWord+nLen-2)
  315. &&(strchr((const char*)m_pWordSeg[nIndex][k].sWord,'.')!=m_pWordSeg[nIndex][k].sWord+nLen-1))
  316. {
  317. /*---------------------------------------------*/
  318. if(m_pWordSeg[nIndex][k].sWord[nLen-1]=='.'||m_pWordSeg[nIndex][k].sWord[nLen-1]=='/')
  319. m_pWordSeg[nIndex][k].sWord[nLen-1]=0;
  320. else
  321. m_pWordSeg[nIndex][k].sWord[nLen-2]=0;
  322. strcpy(sCurWord,"未##数");
  323. nPOS=-27904;//'m'*256;Set the POS with 'm'
  324. i--;
  325. /*----Added By huangjin@ict.ac.cn 2006-7-19----*/
  326. }
  327. /*---------------------------------------------*/
  328. }
  329. }
  330. i--;//Not num, back to previous word
  331. }
  332. }
  333. fValue=0;
  334. nEndVertex=nSegRoute[nIndex][i+1];//Ending POS changed to latter
  335. }
  336. m_pWordSeg[nIndex][k].nHandle=nPOS;//Get the POS of current word
  337. m_pWordSeg[nIndex][k].dValue=fValue;//(int)(MAX_FREQUENCE*exp(-fValue));//Return the frequency of current word
  338. m_graphOptimum.SetElement(nStartVertex,nEndVertex,fValue,nPOS,sCurWord);
  339. //Generate optimum segmentation graph according the segmentation result
  340. i++;//Skip to next atom
  341. k++;//Accept next word
  342. }
  343. m_pWordSeg[nIndex][k].sWord[0]=0;
  344. m_pWordSeg[nIndex][k].nHandle=-1;//Set ending
  345. return true;
  346. }
  347. //DEL bool CSegment::GetSegmentResult(int nIndex,char *sResult)
  348. //DEL {
  349. //DEL  int i=0;
  350. //DEL  char sTempBuffer[WORD_MAXLENGTH];
  351. //DEL  sResult[0]=0;
  352. //DEL  if(nIndex<0||nIndex>=m_nSegmentCount)
  353. //DEL  return false;
  354. //DEL  while(m_WordSeg[nIndex][i].sWord[0]!=0)
  355. //DEL  {
  356. //DEL  sprintf(sTempBuffer,"%s/%c%c",m_WordSeg[nIndex][i].sWord,m_WordSeg[nIndex][i].nHandle/256,m_WordSeg[nIndex][i].nHandle%256);
  357. //DEL  strcat(sResult,sTempBuffer);
  358. //DEL  strcat(sResult,"  ");
  359. //DEL  i++;
  360. //DEL  }
  361. //DEL  return true;
  362. //DEL }
  363. //Word Segmentation based on optimum segmentation graph
  364. //After unknown word recognition
  365. bool CSegment::OptimumSegmet(int nResultCount)
  366. {
  367. int **nSegRoute;//The segmentation route
  368. nSegRoute=new int*[MAX_SEGMENT_NUM];
  369. for(int i=0;i<MAX_SEGMENT_NUM;i++)
  370. {
  371. nSegRoute[i]=new int[MAX_SENTENCE_LEN/2];
  372. }
  373. CNShortPath sp(&m_graphOptimum,nResultCount);
  374. sp.ShortPath();
  375. sp.Output((int **)nSegRoute,false,&m_nSegmentCount);
  376. i=0;
  377. m_graphSeg.m_segGraph=m_graphOptimum;
  378. m_graphOptimum.SetEmpty();//Set graph optimum empty
  379. while(i<m_nSegmentCount)
  380. {
  381. GenerateWord(nSegRoute,i);
  382. //Gernerate word according the Segmentation route
  383. i++;
  384. }
  385. //free the memory
  386. for(i=0;i<MAX_SEGMENT_NUM;i++)
  387. {
  388. delete [] nSegRoute[i];//free the pointer memory
  389. /*----Added By huangjin@ict.ac.cn 2006-5-29----*/
  390. nSegRoute[i]=NULL;
  391. /*-----------------------------------------------*/
  392. }
  393. delete [] nSegRoute;//free the pointer array
  394. /*----Added By huangjin@ict.ac.cn 2006-5-29----*/
  395. nSegRoute=NULL;
  396. /*-----------------------------------------------*/
  397. return true;
  398. }
  399. int CSegment::GetResultCount(PWORD_RESULT pItem)
  400. {
  401. int nCount=0;
  402. while(pItem[nCount].sWord[0]!=0)
  403. {
  404. nCount+=1;
  405. }
  406. return nCount;
  407. }
  408. bool CSegment::GetLastWord(PWORD_RESULT pItem, char *sWordRet)
  409. {
  410. int nCount=0;
  411. sWordRet[0]=0;
  412. while(pItem[nCount].sWord[0]!=0)
  413. {
  414. strcpy(sWordRet,pItem[nCount].sWord);
  415. nCount+=1;
  416. }
  417. return !sWordRet[0];
  418. }
  419. bool CSegment::IsYearTime(char *sNum)
  420. {//Judge whether the sNum is a num genearating year
  421. unsigned int nLen=strlen(sNum);
  422. char sTemp[3];
  423. strncpy(sTemp,sNum,2);
  424. sTemp[2]=0;
  425. if(IsAllSingleByte((unsigned char *)sNum)&&(nLen==4||nLen==2&&sNum[0]>'4'))//1992年, 90年
  426. return true;
  427. if(IsAllNum((unsigned char *)sNum)&&(nLen>=6||nLen==4&&CC_Find("56789",sTemp)))
  428. return true;
  429. /* 
  430. * ----- commented by huangjin@ict.ac.cn 2006-7-21 ------ 
  431. *
  432. *  if(GetCharCount("零○一二三四五六七八九壹贰叁肆伍陆柒捌玖",sNum)==(int)nLen/2&&nLen>=3)
  433. *
  434. */
  435. /*----Added By huangjin@ict.ac.cn 2006-7-21----*/
  436. if(GetCharCount("零○〇一二三四五六七八九壹贰叁肆伍陆柒捌玖",sNum)==(int)nLen/2&&nLen>=3)
  437. /*---------------------------------------------*/
  438. return true;
  439. /* 
  440. * ----- commented by huangjin@ict.ac.cn 2006-7-21 ------ 
  441. *
  442. *  if(nLen==8&&GetCharCount("千仟零○",sNum)==2)//二仟零二年
  443. *
  444. */
  445. /*----Added By huangjin@ict.ac.cn 2006-7-21----*/
  446. if(nLen==8&&GetCharCount("千仟零○〇",sNum)==2)//二仟零二年
  447. /*---------------------------------------------*/
  448. return true;
  449. if(nLen==2&&GetCharCount("千仟",sNum)==1)
  450. return true;
  451. if(nLen==4&&GetCharCount("甲乙丙丁戊己庚辛壬癸",sNum)==1&&GetCharCount("子丑寅卯辰巳午未申酉戌亥",sNum+2)==1)
  452. return true;
  453. return false;
  454. }
  455. //CDynamicArray &aWord: the words array
  456. //CDynamicArray &aWordBinaryNet:the net between words
  457. //double dSmoothingPara: the parameter of data smoothing
  458. //CDictionary &DictBinary: the binary dictionary
  459. //CDictionary &DictCore: the Core dictionary
  460. bool CSegment::BiGraphGenerate(CDynamicArray &aWord, CDynamicArray &aBinaryWordNet,double dSmoothingPara,CDictionary &DictBinary,CDictionary &DictCore)
  461. {
  462. PARRAY_CHAIN pTail,pCur,pNextWords;//Temp buffer
  463. unsigned int nWordIndex=0,nTwoWordsFreq=0,nCurWordIndex,nNextWordIndex;
  464. //nWordIndex: the index number of current word
  465. double dCurFreqency,dValue,dTemp;
  466. char sTwoWords[WORD_MAXLENGTH];
  467. m_nWordCount=aWord.GetTail(&pTail);//Get tail element and return the words count
  468. if(m_npWordPosMapTable)
  469. {//free buffer
  470. delete [] m_npWordPosMapTable;
  471. m_npWordPosMapTable=0;
  472. }
  473. if(m_nWordCount>0)//Word count is greater than 0
  474. m_npWordPosMapTable=new int[m_nWordCount];//Record the  position of possible words
  475. pCur=aWord.GetHead();
  476. while(pCur!=NULL)//Set the position map of words
  477. {
  478. m_npWordPosMapTable[nWordIndex++]=pCur->row*MAX_SENTENCE_LEN+pCur->col;
  479. pCur=pCur->next;
  480. }
  481. pCur=aWord.GetHead();
  482. while(pCur!=NULL)//
  483. {
  484. if(pCur->nPOS>=0)//It's not an unknown words
  485. dCurFreqency=pCur->value;
  486. else//Unknown words
  487. dCurFreqency=DictCore.GetFrequency(pCur->sWord,2);
  488. aWord.GetElement(pCur->col,-1,pCur,&pNextWords);//Get next words which begin with pCur->col
  489. while(pNextWords&&pNextWords->row==pCur->col)//Next words
  490. {
  491. //Current words frequency
  492. strcpy(sTwoWords,pCur->sWord);
  493. strcat(sTwoWords,WORD_SEGMENTER);
  494. strcat(sTwoWords,pNextWords->sWord);
  495. nTwoWordsFreq=DictBinary.GetFrequency(sTwoWords,3);
  496. //Two linked Words frequency
  497. dTemp=(double)1/MAX_FREQUENCE;
  498. //Smoothing
  499. dValue=-log(dSmoothingPara*(1+dCurFreqency)/(MAX_FREQUENCE+80000)+(1-dSmoothingPara)*((1-dTemp)*nTwoWordsFreq/(1+dCurFreqency)+dTemp));
  500. //-log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
  501. if(pCur->nPOS<0)//Unknown words: P(Wi|Ci);while known words:1
  502.     dValue+=pCur->value;
  503. //Get the position index of current word in the position map table
  504. nCurWordIndex=BinarySearch(pCur->row*MAX_SENTENCE_LEN+pCur->col,m_npWordPosMapTable,m_nWordCount);
  505. nNextWordIndex=BinarySearch(pNextWords->row*MAX_SENTENCE_LEN+pNextWords->col,m_npWordPosMapTable,m_nWordCount);
  506. aBinaryWordNet.SetElement(nCurWordIndex,nNextWordIndex,dValue,pCur->nPOS);
  507. pNextWords=pNextWords->next;//Get next word
  508. }
  509. pCur=pCur->next;
  510. }
  511. return true;
  512. }
  513. bool CSegment::BiSegment(char *sSentence, double dSmoothingPara, CDictionary &dictCore, CDictionary &dictBinary, unsigned int nResultCount)
  514. {
  515. int **nSegRoute;//The segmentation route
  516. nSegRoute=new int*[MAX_SEGMENT_NUM];
  517. unsigned int nLen=strlen(sSentence)+10;
  518. for(int i=0;i<MAX_SEGMENT_NUM;i++)
  519. {
  520. /* 
  521. * ----- commented by huangjin@ict.ac.cn 2006-5-29 ------ 
  522. *  nSegRoute[i]=new int[nLen/2];
  523. * memset(nSegRoute[i],-1,nLen/2*sizeof(int));
  524. *
  525. */
  526. /*----Added By huangjin@ict.ac.cn 2006-5-31----*/
  527. nSegRoute[i]=new int[nLen+1];
  528. memset(nSegRoute[i],-1,(nLen+1)*sizeof(int));
  529. /*-----------------------------------------------*/
  530. }
  531. m_graphSeg.GenerateWordNet(sSentence,dictCore,true);//Generate words array
  532.     CDynamicArray aBiwordsNet;
  533. BiGraphGenerate(m_graphSeg.m_segGraph,aBiwordsNet,dSmoothingPara,dictBinary,dictCore);
  534.     //Generate the biword link net
  535.     
  536. CNShortPath sp(&aBiwordsNet,nResultCount);
  537. sp.ShortPath();
  538. sp.Output(nSegRoute,false,&m_nSegmentCount);
  539. m_graphOptimum.SetEmpty();//Set graph optimum empty
  540. i=0;
  541. while(i<m_nSegmentCount)
  542. {
  543. BiPath2UniPath(nSegRoute[i]);
  544. //Path convert to unipath
  545. GenerateWord(nSegRoute,i);
  546. //Gernerate word according the Segmentation route
  547. i++;
  548. }
  549. //free the memory
  550. for(i=0;i<MAX_SEGMENT_NUM;i++)
  551. {
  552. delete [] nSegRoute[i];//free the pointer memory
  553. /*----Added By huangjin@ict.ac.cn 2006-5-29----*/
  554. nSegRoute[i]=NULL;
  555. /*---------------------------------------------*/
  556. }
  557. delete [] nSegRoute;//free the pointer array
  558. /*----Added By huangjin@ict.ac.cn 2006-5-29----*/
  559. nSegRoute=NULL;
  560. /*---------------------------------------------*/
  561. return true;
  562. }
  563. bool CSegment::BiPath2UniPath(int *npPath)
  564. {//BiPath convert to unipath
  565. int i=0,nTemp=-1;
  566. if(!m_npWordPosMapTable)
  567. return false;
  568. while(npPath[i]!=-1&&npPath[i]<m_nWordCount)
  569. {
  570. nTemp=m_npWordPosMapTable[npPath[i]];
  571. npPath[i]=nTemp/MAX_SENTENCE_LEN;
  572. i++;
  573. }
  574. if(nTemp>0)
  575. npPath[i++]=nTemp%MAX_SENTENCE_LEN;
  576. npPath[i]=-1;
  577. return true;
  578. }
  579. bool CSegment::BiOptimumSegment(unsigned int nResultCount,double dSmoothingPara, CDictionary &dictBinary, CDictionary &dictCore)
  580. {
  581. int **nSegRoute;//The segmentation route
  582. nSegRoute=new int*[MAX_SEGMENT_NUM];
  583. for(int i=0;i<MAX_SEGMENT_NUM;i++)
  584. {
  585. nSegRoute[i]=new int[MAX_SENTENCE_LEN/2];
  586. memset(nSegRoute[i],-1,MAX_SENTENCE_LEN/2*sizeof(int));
  587. }
  588.     CDynamicArray aBiwordsNet;
  589. BiGraphGenerate(m_graphOptimum,aBiwordsNet,dSmoothingPara,dictBinary,dictCore);
  590.     //Generate the biword link net
  591.    
  592. CNShortPath sp(&aBiwordsNet,nResultCount);
  593. sp.ShortPath();
  594. sp.Output((int **)nSegRoute,false,&m_nSegmentCount);
  595. i=0;
  596. m_graphSeg.m_segGraph=m_graphOptimum;
  597. m_graphOptimum.SetEmpty();//Set graph optimum empty
  598. while(i<m_nSegmentCount)
  599. {
  600. BiPath2UniPath(nSegRoute[i]);
  601. //Path convert to unipath
  602. GenerateWord(nSegRoute,i);
  603. //Gernerate word according the Segmentation route
  604. i++;
  605. }
  606. //free the memory
  607. for(i=0;i<MAX_SEGMENT_NUM;i++)
  608. {
  609. delete [] nSegRoute[i];//free the pointer memory
  610. /*----Added By huangjin@ict.ac.cn 2006-5-29----*/
  611. nSegRoute[i]=NULL;
  612. /*---------------------------------------------*/
  613. }
  614. delete [] nSegRoute;//free the pointer array
  615. /*----Added By huangjin@ict.ac.cn 2006-5-29----*/
  616. nSegRoute=NULL;
  617. /*---------------------------------------------*/
  618. return true;
  619. }
  620. /*----Added By huangjin@ict.ac.cn 2006-5-30----*/
  621. void CSegment::ClearSegmentWord(void)
  622. //reset the char buffer
  623. {
  624. if(m_pWordSeg!=NULL)
  625. {
  626. for(int i=0;i<MAX_SEGMENT_NUM;i++)
  627. {
  628. if(m_pWordSeg[i]!=NULL)
  629. {
  630. /*---------------------------------------------*/
  631. /* 
  632. * ----- commented by huangjin@ict.ac.cn 2006-9-12 ------ 
  633. *
  634. *  for(int j=0;j<MAX_WORDS;j++)
  635. *
  636. */
  637. /*----Added By huangjin@ict.ac.cn 2006-9-12----*/
  638. /*---------------------------------------------*/
  639. for(int j=0;j<MAX_WORDS&&m_pWordSeg[i][j].sWord[0];j++)
  640. {
  641. m_pWordSeg[i][j].sWord[0]='';
  642. m_pWordSeg[i][j].dValue=0.0;
  643. m_pWordSeg[i][j].nHandle=0;
  644. }
  645. }
  646. }
  647. }
  648. }
  649. /*-----------------------------------------------*/