SegGraph.cpp
上传用户:sunyong76
上传日期:2021-10-03
资源大小:2236k
文件大小:17k
源码类别:

多国语言处理

开发平台:

Java

  1. //////////////////////////////////////////////////////////////////////
  2. //ICTCLAS简介:计算所汉语词法分析系统ICTCLAS(Institute of Computing Technology, Chinese Lexical Analysis System),
  3. //             功能有:中文分词;词性标注;未登录词识别。
  4. //             分词正确率高达97.58%(973专家评测结果),
  5. //             未登录词识别召回率均高于90%,其中中国人名的识别召回率接近98%;
  6. //             处理速度为31.5Kbytes/s。
  7. //著作权:  Copyright?2002-2005中科院计算所 职务著作权人:张华平 刘群
  8. //遵循协议:自然语言处理开放资源许可证1.0
  9. //Email: zhanghp@software.ict.ac.cn
  10. //Homepage:www.nlp.org.cn;mtgroup.ict.ac.cn
  11. /****************************************************************************
  12.  *
  13.  * Copyright (c) 2000, 2001 
  14.  *     Machine Group
  15.  *     Software Research Lab.
  16.  *     Institute of Computing Tech.
  17.  *     Chinese Academy of Sciences
  18.  *     All rights reserved.
  19.  *
  20.  * This file is the confidential and proprietary property of 
  21.  * Institute of Computing Tech. and the possession or use of this file requires 
  22.  * a written license from the author.
  23.  * Filename: SegGraph.cpp
  24.  * Abstract:
  25.  *            implement for the Word Segmentation Directed Graph.
  26.  *
  27.  * Author:   Kevin Zhang 
  28.  *          (zhanghp@software.ict.ac.cn)
  29.  * Date:     2002-1-8
  30.  *
  31.  * Notes:
  32.  *                
  33.  * 
  34.  ****************************************************************************/
  35. // SegGraph.cpp: implementation of the CSegGraph class.
  36. //
  37. //////////////////////////////////////////////////////////////////////
  38. #include "stdafx.h"
  39. #include "SegGraph.h"
  40. #include "..\Utility\Utility.h"
  41. #include <string.h>
  42. #include <math.h>
  43. //////////////////////////////////////////////////////////////////////
  44. // Construction/Destruction
  45. //////////////////////////////////////////////////////////////////////
  46. CSegGraph::CSegGraph()
  47. {
  48. m_segGraph.SetRowFirst();
  49. //segGraph: The segmentation word graph
  50. //Row first array
  51. }
  52. CSegGraph::~CSegGraph()
  53. {
  54. }
  55. bool CSegGraph::GenerateWordNet(char *sSentence,CDictionary &dictCore,bool bOriginalFreq)
  56. {
  57. //Gernerate the word net from the sLine, that's list all the possible word
  58. unsigned int i=0,j,nLen=strlen(sSentence);
  59. /* 
  60. * ----- commented by huangjin@ict.ac.cn 2006-6-8 ------ 
  61. *
  62. *  char sWord[WORD_MAXLENGTH]="",sTempWord[WORD_MAXLENGTH]="",sWordMatch[WORD_MAXLENGTH];
  63. *
  64. */
  65. /*----Added By huangjin@ict.ac.cn 2006-6-8----*/
  66. char sWord[WORD_MAXLENGTH]="",sTempWord[WORD_MAXLENGTH]="",sWordMatch[WORD_MAXLENGTH]="";
  67. /*---------------------------------------------*/
  68. int nWordIndex=0,nHandleTemp,k,nPOS;
  69. int nMatchFreq[20],nMatchHandle[20],nTotalFreq,nMatchCount;
  70. double dValue=0;
  71. m_nAtomCount=0;
  72. m_segGraph.SetEmpty();//Set segmentation graph empty
  73. AtomSegment(sSentence);
  74. //Atomic Segmentation
  75.     for(i=0;i<m_nAtomCount;i++)//Init the cost array
  76.     {
  77. if(m_nAtomPOS[i]==CT_CHINESE)//The atom is a Chinese Char
  78. {
  79. if(!bOriginalFreq)//Not original frequency
  80. /* 
  81. * ----- commented by huangjin@ict.ac.cn 2006-5-30 ------ 
  82. *  m_segGraph.SetElement(i,i+1,log(MAX_FREQUENCE),0);//init the link with the maximum value
  83. *
  84. */
  85. /*----Added By huangjin@ict.ac.cn 2006-5-30----*/
  86. m_segGraph.SetElement(i,i+1,log(double(MAX_FREQUENCE)),0);//init the link with the maximum value
  87. /*--------------------------------------------*/
  88. else
  89. m_segGraph.SetElement(i,i+1,0,0,m_sAtom[i]);//init the link with the maximum value
  90. }
  91. else//Other atom
  92. {
  93. strcpy(sWord,m_sAtom[i]);//init the word 
  94. dValue=MAX_FREQUENCE;
  95. switch(m_nAtomPOS[i])
  96. {
  97. case CT_INDEX:
  98. case CT_NUM:
  99. /*----Added By huangjin@ict.ac.cn 2006-7-11----*/
  100. case CT_SINGLE_NUM:
  101. /*---------------------------------------------*/
  102. nPOS=-27904;//'m'*256
  103. strcpy(sWord,"未##数");
  104. dValue=0;
  105. break;
  106. case CT_DELIMITER:
  107. nPOS=30464;//'w'*256;
  108. break;
  109. case CT_LETTER:
  110. nPOS=-'n'*256-'x';//
  111. dValue=0;
  112. strcpy(sWord,"未##串");
  113. break;
  114. /*----Added By huangjin@ict.ac.cn 2006-7-11----*/
  115. case CT_SINGLE_DELIMITER://12021-2129-3121
  116. if(GetCharCount("+-1234567890",m_sAtom[i])==(int)strlen(m_sAtom[i]))
  117. {
  118. nPOS=-27904;//'m'*256
  119. dValue=0;
  120. strcpy(sWord,"未##数");
  121. }
  122. else
  123. {
  124. nPOS=30464; //'w'*256
  125. }
  126. break;
  127. /*---------------------------------------------*/
  128. case CT_SINGLE://12021-2129-3121
  129. if(GetCharCount("+-1234567890",m_sAtom[i])==(int)strlen(m_sAtom[i]))
  130. {
  131. nPOS=-27904;//'m'*256
  132. strcpy(sWord,"未##数");
  133. }
  134. else
  135. {
  136. nPOS=-'n'*256-'x';//
  137. strcpy(sWord,"未##串");
  138. }
  139. dValue=0;
  140. break;
  141. default:
  142. nPOS=m_nAtomPOS[i];//'?'*256;
  143. break;
  144. }
  145. if(!bOriginalFreq)//Not original frequency
  146. m_segGraph.SetElement(i,i+1,0,nPOS);//init the link with minimum
  147. else
  148. m_segGraph.SetElement(i,i+1,dValue,nPOS,sWord);//init the link with minimum
  149. }
  150.     }
  151. i=0;
  152. while(i<m_nAtomCount)//All the word
  153. {
  154.   strcpy(sWord,m_sAtom[i]);//Get the current atom
  155.   j=i+1;
  156.   /* 
  157.   * ----- commented by huangjin@ict.ac.cn 2006-5-31 ------ 
  158.   *
  159.   *  if(strcmp(sWord,"月")==0&&strcmp(m_sAtom[i+1],"份")==0)//Don't split 月份
  160.   *
  161.   */
  162.   /*----Added By huangjin@ict.ac.cn 2006-5-31----*/
  163.   //Add "i<m_nAtomCount-1" so that the i will not slop over when compare m_sAtom[i+1]
  164.   if(strcmp(sWord,"月")==0&&i<m_nAtomCount-1&&strcmp(m_sAtom[i+1],"份")==0)//Don't split 月份
  165.   /*---------------------------------------------*/
  166.   j+=1;
  167.   /*----Added By huangjin@ict.ac.cn 2006-7-17----*/
  168.   if((strcmp(sWord,"点")==0||strcmp(sWord,"刻")==0)&&
  169.   i<m_nAtomCount-1&&strcmp(m_sAtom[i+1],"钟")==0)//Don't split 点钟|刻钟
  170.   j+=1;
  171.   /*---------------------------------------------*/
  172.   //while(j<=m_nAtomCount&&dictCore.GetMaxMatch(sWord,sWordMatch,&nHandleTemp))
  173. //=============================modify by Jiang Wenbin===================================
  174.   while(j<=m_nAtomCount)
  175.   {
  176.   
  177.   bool find=dictCore.GetMaxMatch(sWord,sWordMatch,&nHandleTemp);
  178.   if(find)
  179.   {
  180.   
  181. if(strcmp(sWordMatch,sWord)==0)//find the current word
  182. {
  183.   
  184. nTotalFreq=0;
  185. dictCore.GetHandle(sWord,&nMatchCount,nMatchHandle,nMatchFreq);
  186. for(k=0;k<nMatchCount;k++)//Add the frequency
  187. {
  188. nTotalFreq+=nMatchFreq[k];
  189. }
  190. //Adding a rule to exclude some words to be formed.
  191. if(strlen(sWord)==4&&i>=1&&(IsAllNum((unsigned char *)m_sAtom[i-1])||IsAllChineseNum(m_sAtom[i-1]))&&(strncmp(sWord,"年",2)==0||strncmp(sWord,"月",2)==0))
  192. {//1年内、1999年末
  193. if(CC_Find("末内中底前间初",sWord+2))
  194. break;
  195.  }
  196. if(nMatchCount==1)//The possible word has only one POS, store it
  197. {
  198. if(!bOriginalFreq)//Not original frequency
  199. /* 
  200. * ----- commented by huangjin@ict.ac.cn 2006-5-30 ------ 
  201. *  m_segGraph.SetElement(i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),nMatchHandle[0]);
  202. *
  203. */
  204. /*----Added By huangjin@ict.ac.cn 2006-5-30----*/
  205. m_segGraph.SetElement(i,j,-log(double(nTotalFreq+1))+log(double(MAX_FREQUENCE)),nMatchHandle[0]);
  206. /*---------------------------------------------*/
  207. else
  208. m_segGraph.SetElement(i,j,nTotalFreq,nMatchHandle[0],sWord);
  209. }
  210. else 
  211. {
  212. if(!bOriginalFreq)//Not original frequency
  213. /* 
  214. * ----- commented by huangjin@ict.ac.cn 2006-5-30 ------ 
  215. *  m_segGraph.SetElement(i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),0);
  216. *
  217. */
  218. /*----Added By huangjin@ict.ac.cn 2006-5-30----*/
  219. m_segGraph.SetElement(i,j,-log(double(nTotalFreq+1))+log(double(MAX_FREQUENCE)),0);
  220. /*---------------------------------------------*/
  221. else
  222. m_segGraph.SetElement(i,j,nTotalFreq,0,sWord);
  223. }
  224. }
  225.   }
  226.   else if(j-i>8)
  227.   {
  228. break;
  229.   }
  230.   //Add a condition to control the end of string
  231.    //retrieve the dictionary with the word
  232.   //cout<<"Word: "<<sWord<<endl;
  233.   
  234.   strcat(sWord,m_sAtom[j++]);
  235.   }
  236.   i+=1;//Start from i++;
  237. }
  238. return true;
  239. }
  240. /* 
  241. * ----- commented by huangjin@ict.ac.cn 2006-7-11 ------ 
  242. *
  243. //DELbool CSegGraph::AtomSegment(char *sSentence)
  244. //DEL{
  245. //DEL unsigned int i=0,j=0,nCurType,nNextType;
  246. //DEL //i is the pointer of sentence string
  247. //DEL //j is the pointer of pAtoms
  248. //DEL char sChar[3];
  249. //DEL sChar[2]=0;//Set the char ending
  250. //DEL m_sAtom[j][0]=0;//Set the first word as null
  251. //DEL m_nAtomLength[j]=0;
  252. //DEL if(strncmp(sSentence,SENTENCE_BEGIN,strlen(SENTENCE_BEGIN))==0)
  253. //DEL {
  254. //DEL strcpy(m_sAtom[j],SENTENCE_BEGIN);//Set the first word as sentence begining
  255. //DEL m_nAtomLength[j]=strlen(SENTENCE_BEGIN);
  256. //DEL m_nAtomPOS[j]=CT_SENTENCE_BEGIN;//init
  257. //DEL i+=m_nAtomLength[j];
  258. //DEL j+=1;
  259. //DEL m_sAtom[j][0]=0;//Set the first word as null
  260. //DEL m_nAtomLength[j]=0;
  261. //DEL }
  262. //DEL while(i<strlen(sSentence))
  263. //DEL {
  264. //DEL if(strncmp(sSentence+i,SENTENCE_END,strlen(SENTENCE_END))==0)
  265. //DEL {
  266. //DEL strcpy(m_sAtom[j],SENTENCE_END);//Set the first word as null
  267. //DEL m_nAtomLength[j]=strlen(SENTENCE_END);
  268. //DEL m_nAtomPOS[j]=CT_SENTENCE_END;//init
  269. //DEL i+=m_nAtomLength[j];
  270. //DEL j+=1;
  271. //DEL m_sAtom[j][0]=0;//Set the first word as null
  272. //DEL m_nAtomLength[j]=0;
  273. //DEL continue;
  274. //DEL }
  275. //DEL /*----Added By huangjin@ict.ac.cn 2006-6-8----*/
  276. //DEL //连续的三个点到六个点认为是省略号,不要切开
  277. //DEL char c=*(sSentence+i);
  278. //DEL if(c=='.')
  279. //DEL {
  280. //DEL char ellipsis[]= "......";
  281. //DEL bool bEllipsis = false;
  282. //DEL for( int tmpindex = 5; tmpindex>=2; tmpindex-- )
  283. //DEL {
  284. //DEL if( strncmp(sSentence+i,ellipsis,strlen(ellipsis))==0)
  285. //DEL {
  286. //DEL strcpy(m_sAtom[j],ellipsis);
  287. //DEL m_nAtomLength[j]=strlen(ellipsis);
  288. //DEL m_nAtomPOS[j]=CT_DELIMITER;//init
  289. //DEL i+=m_nAtomLength[j];
  290. //DEL j+=1;
  291. //DEL m_sAtom[j][0]=0;//Set the first word as null
  292. //DEL m_nAtomLength[j]=0;
  293. //DEL bEllipsis=true;
  294. //DEL break;
  295. //DEL }
  296. //DEL else
  297. //DEL {
  298. //DEL ellipsis[tmpindex]='';
  299. //DEL }
  300. //DEL }
  301. //DEL if( bEllipsis )
  302. //DEL {
  303. //DEL continue;
  304. //DEL }
  305. //DEL }
  306. //DEL else if(c=='-'&&i+1<strlen(sSentence))
  307. //DEL {
  308. //DEL c=*(sSentence+i+1);
  309. //DEL if(c=='-')
  310. //DEL {
  311. //DEL strcpy(m_sAtom[j],"--");
  312. //DEL m_nAtomLength[j]=strlen("--");
  313. //DEL m_nAtomPOS[j]=CT_DELIMITER;//init
  314. //DEL i+=m_nAtomLength[j];
  315. //DEL j+=1;
  316. //DEL m_sAtom[j][0]=0;//Set the first word as null
  317. //DEL m_nAtomLength[j]=0;
  318. //DEL continue;
  319. //DEL }
  320. //DEL }
  321. //DEL /*---------------------------------------------*/
  322. //DEL
  323. //DEL sChar[0]=*(sSentence+i);//Get the char with first byte
  324. //DEL sChar[1]=0;//
  325. //DEL i+=1;
  326. //DEL if(sChar[0]<0)//Two byte char
  327. //DEL {
  328. //DEL sChar[1]=*(sSentence+i);//Get the char with second byte
  329. //DEL i+=1;//i increased by 1
  330. //DEL }
  331. //DEL strcat(m_sAtom[j],sChar);
  332. //DEL nCurType=charType((unsigned char *)sChar);
  333. //DEL if(sChar[0]=='.'&&(charType((unsigned char *)sSentence+i)==CT_NUM||(*(sSentence+i)>='0'&&*(sSentence+i)<='9')))
  334. //DEL nCurType=CT_NUM;//Digit after . indicate . as a point in the numeric
  335. //DEL
  336. //DEL m_nAtomPOS[j]=nCurType;
  337. //DEL //Record its property, just convience for continuous processing
  338. //DEL
  339. //DEL if(nCurType==CT_CHINESE||nCurType==CT_INDEX||nCurType==CT_DELIMITER||nCurType==CT_OTHER)
  340. //DEL {//Chinese char, index number,delimiter and other is treated as atom
  341. //DEL m_nAtomLength[j]=strlen(m_sAtom[j]);//Save its length
  342. //DEL j+=1;//Skip to next atom
  343. //DEL m_sAtom[j][0]=0;//init
  344. //DEL /*----Added By huangjin@ict.ac.cn 2006-6-8----*/
  345. //DEL m_nAtomLength[j]=0;
  346. //DEL /*---------------------------------------------*/
  347. //DEL }
  348. //DEL else
  349. //DEL {//Number,single char, letter
  350. //DEL nNextType=255;
  351. //DEL if(i<strlen(sSentence))
  352. //DEL nNextType=charType((unsigned char *)(sSentence+i));
  353. //DEL if(nNextType!=nCurType||i==strlen(sSentence))
  354. //DEL //Reaching end or next char type is different from current char
  355. //DEL {
  356. //DEL m_nAtomLength[j]=strlen(m_sAtom[j]);//Save its length
  357. //DEL j+=1;
  358. //DEL m_sAtom[j][0]=0;//init
  359. //DEL /*----Added By huangjin@ict.ac.cn 2006-6-8----*/
  360. //DEL   m_nAtomLength[j]=0;
  361. //DEL   /*---------------------------------------------*/
  362. //DEL   }
  363. //DEL  }
  364. //DEL }
  365. //DEL m_nAtomCount=j;//The count of segmentation atoms
  366. //DEL return true;
  367. //DEL}
  368. //DEL*
  369. //DEL*/
  370. /*----Added By huangjin@ict.ac.cn 2006-9-12----*/
  371. bool CSegGraph::AtomSegment(char *sSentence)
  372. {
  373. unsigned int i=0, j=0, nCurType, nNextType;
  374. //i is the pointer of sSentence string
  375. //j is the pointer of m_sAtom, m_nAtomLength an m_nAtomPOS
  376. char sChar[3]="";
  377. const unsigned int nLen = strlen(sSentence); //store the length of sSentence
  378. //Set the first word as null
  379. UpdateAtoms(j,"",-1,false);
  380. if(!strncmp(sSentence,SENTENCE_BEGIN,strlen(SENTENCE_BEGIN))) //sentence begin
  381. {
  382. i+=UpdateAtoms(j,SENTENCE_BEGIN,CT_SENTENCE_BEGIN,false);//Set the word as sentence beginning
  383. j++;
  384. UpdateAtoms(j,"",-1,false);
  385. }
  386. while(i<nLen)
  387. {
  388. if(strncmp(sSentence+i,SENTENCE_END,strlen(SENTENCE_END))==0)
  389. {//Set the word as sentence ending
  390. i+=UpdateAtoms(j,SENTENCE_END,CT_SENTENCE_END,false);
  391. j++;
  392. UpdateAtoms(j,"",-1,false);
  393. continue;
  394. }
  395. i+=GetChar(sSentence+i, sChar); //Get current word
  396. nCurType=charType((unsigned char *)sChar);
  397. UpdateAtoms(j,sChar,nCurType);
  398. if ( nCurType==CT_CHINESE )
  399. {//一万八千六百五十二
  400. unsigned int k=i;
  401. char sNumCandidate[WORD_MAXLENGTH]="";
  402. if(k<nLen)
  403. {
  404. bool bNum=false;
  405. do
  406. {
  407. bNum=IsChineseNumCadidate(sChar);
  408. if(bNum)
  409. {
  410. strcat(sNumCandidate,sChar);
  411. k+=GetChar(sSentence+k,sChar);
  412. }
  413. else if(!strncmp(sSentence+k-2,"分之",4))
  414. {
  415. strcat(sNumCandidate,"分之");
  416. bNum=true;
  417. k+=GetChar(sSentence+k+2,sChar)+2;
  418. }
  419. }while(k<nLen&&bNum);
  420. }
  421. unsigned int h=ChineseNumRecognize(sNumCandidate, sSentence+k-strlen(sChar));
  422. if(h!=(unsigned int)-1)
  423. {
  424. sNumCandidate[h]=0;//截断
  425. UpdateAtoms(j,sNumCandidate,CT_NUM,false);
  426. i+=strlen(sNumCandidate)-2;
  427. }
  428. }
  429. else
  430. {
  431. nNextType=255;
  432. if(i<nLen)
  433. nNextType=charType((unsigned char*)(sSentence+i));
  434. //Numeric
  435. if(nCurType==CT_NUM||nCurType==CT_SINGLE_NUM||
  436. strchr("+-.",sChar[0])&&nNextType==CT_SINGLE_NUM||
  437. CC_Find("±-+",sChar)&&nNextType==CT_NUM)
  438. {//Numeric and Numeric Prefix
  439. unsigned int k=i; //tmp index save i
  440. bool bSBC = false;
  441. if(strchr("+-.",sChar[0])||nCurType==CT_SINGLE_NUM)
  442. bSBC = true;
  443. bool bPuncEnd=true;
  444. if(nCurType==CT_NUM||nCurType==CT_SINGLE_NUM)
  445. bPuncEnd=false;
  446. while(k<nLen)
  447. {
  448. k+=GetChar(sSentence+k,sChar);
  449. nNextType=charType((unsigned char*)sChar);
  450. if(!bPuncEnd&&(strchr(".+-:",sChar[0])&&sChar[1]==0)||CC_Find(".·:∶/",sChar))
  451. bPuncEnd=true;
  452. else if(bSBC&&nNextType==CT_SINGLE_NUM||!bSBC&&nNextType==CT_NUM)
  453. bPuncEnd=false;
  454. else
  455. break;
  456. strcat(m_sAtom[j],sChar);//Get the atom
  457. i=k;
  458. }
  459. if(bPuncEnd)//末尾是符号或者标点,要分开
  460. {
  461. if(CC_Find(".·:∶/",m_sAtom[j]+strlen(m_sAtom[j])-2))
  462. {
  463. m_sAtom[j][strlen(m_sAtom[j])-2]=0;
  464. i-=2;
  465. }
  466. else //.+-:
  467. {
  468. m_sAtom[j][strlen(m_sAtom[j])-1]=0;
  469. i-=1;
  470. }
  471. }
  472. UpdateAtoms(j,"",CT_NUM);
  473. }//end of Numeric
  474. else if(i<nLen-1&&strchr(".-",sChar[0])&&!sChar[1]&&*(sSentence+i)==sChar[0])
  475. {//for ...... and --
  476. if(sChar[0]=='.')
  477. {
  478. char ellipsis[]= ".....";
  479. bool bEllipsis = false;
  480. for( int tmpindex = 4; tmpindex>=1; tmpindex-- )
  481. {
  482. if( strncmp(sSentence+i,ellipsis,strlen(ellipsis))==0)
  483. {
  484. bEllipsis=true;
  485. break;
  486. }
  487. else
  488. {
  489. ellipsis[tmpindex]='';
  490. }
  491. }
  492. if( bEllipsis )
  493. {
  494. i+=UpdateAtoms(j,ellipsis,CT_DELIMITER);
  495. }
  496. }
  497. else if(sChar[0]=='-')//the next character must be '-'
  498. {
  499. i+=UpdateAtoms(j,"-",CT_DELIMITER);
  500. }
  501. }
  502. else if(nCurType==CT_LETTER||
  503. nCurType==CT_SINGLE&&sChar[0]!='+'&&sChar[0]!='-'||
  504. (strchr("+-",sChar[0])||CC_Find("±-+",sChar))&&nNextType!=CT_CHINESE)
  505. {//letters
  506. unsigned int k=i;
  507. bool bSBC = true;
  508. if(nCurType==CT_SINGLE||strchr("+-",sChar[0]))
  509. bSBC=false;
  510. while(k<nLen)
  511. {
  512. i=k;
  513. k+=GetChar(sSentence+k,sChar);
  514. nNextType=charType((unsigned char *)sChar);//Get the type
  515. if( bSBC&&(nNextType==CT_LETTER||nNextType==CT_NUM) ||
  516. !bSBC&&(nNextType==CT_SINGLE||nNextType==CT_SINGLE_NUM))
  517. {
  518. strcat(m_sAtom[j],sChar);
  519. }
  520. else
  521. {
  522. break;
  523. }
  524. }
  525. m_nAtomPOS[j]=CT_LETTER;
  526. m_nAtomLength[j]=strlen(m_sAtom[j]);
  527. }
  528. /*---------------------------------------------*/
  529. else if(!strcmp(sChar,"-") && !strncmp(sSentence+i,"-",2) )
  530. {
  531. i+=UpdateAtoms(j,"-",CT_DELIMITER);
  532. }
  533. }
  534. j++;
  535. UpdateAtoms(j,"",-1,false);
  536. }
  537. m_nAtomCount=j;
  538. return true;
  539. }
  540. /*---------------------------------------------*/
  541. /*----Added By huangjin@ict.ac.cn 2006-9-12----*/
  542. int CSegGraph::UpdateAtoms( int j, const char* str, int nPOS, bool bApp )
  543. {
  544. if(!bApp)
  545. {
  546. m_sAtom[j][0]=0;
  547. }
  548. strcat(m_sAtom[j],str);
  549. m_nAtomLength[j]=strlen(m_sAtom[j]);
  550. m_nAtomPOS[j]=nPOS;
  551. return strlen(str);
  552. }
  553. /*---------------------------------------------*/