Result.cpp
上传用户:sunyong76
上传日期:2021-10-03
资源大小:2236k
文件大小:25k
源码类别:

多国语言处理

开发平台:

Java

//////////////////////////////////////////////////////////////////////
//ICTCLAS简介:计算所汉语词法分析系统ICTCLAS(Institute of Computing Technology, Chinese Lexical Analysis System),
//             功能有:中文分词;词性标注;未登录词识别。
//             分词正确率高达97.58%(973专家评测结果),
//             未登录词识别召回率均高于90%,其中中国人名的识别召回率接近98%;
//             处理速度为31.5Kbytes/s。
//著作权:  Copyright © 2002-2005 中科院计算所 职务著作权人:张华平 刘群
//遵循协议:自然语言处理开放资源许可证1.0
//Email: zhanghp@software.ict.ac.cn
//Homepage: www.nlp.org.cn; mtgroup.ict.ac.cn
// Result.cpp: implementation of the CResult class.
//
//////////////////////////////////////////////////////////////////////
  14. #include "stdafx.h"
  15. #include "Result.h"
  16. #include <string.h>
  17. #include <stdio.h>
  18. #include <math.h>
  19. #include "..\Utility\Utility.h"
  20. //////////////////////////////////////////////////////////////////////
  21. // Construction/Destruction
  22. //////////////////////////////////////////////////////////////////////
  23. CResult::CResult()
  24. {
  25. //malloc buffer
  26. m_pResult=new PWORD_RESULT[MAX_SEGMENT_NUM];
  27. for(int i=0;i<MAX_SEGMENT_NUM;i++)
  28. {
  29. m_pResult[i]=new WORD_RESULT[MAX_WORDS];
  30. }
  31. m_dictCore.Load("data\coreDict.dct");
  32. m_POSTagger.LoadContext("data\lexical.ctx");
  33. /*
  34. m_dictCore.Load("data\Dict.dct");
  35. m_POSTagger.LoadContext("data\trainTest.ctx");
  36. */
  37. /*
  38. m_dictCore.AddItem("十分",'d'*256,+500);
  39. m_dictCore.AddItem("十分",'m'*256,-500);
  40. m_dictCore.AddItem("我国",'n'*256,-2000);
  41. m_dictCore.AddItem("我国",'r'*256,+2000);
  42. m_dictCore.AddItem("千年",'t'*256,200);
  43.     m_dictCore.Optimum();
  44. m_dictCore.Save("data\coreDictOptimum.dct");
  45. */
  46. m_POSTagger.SetTagType();
  47. m_uPerson.Configure("data\nr",TT_PERSON);
  48. //Set the person recognition configure
  49. m_uPlace.Configure("data\ns",TT_PLACE);
  50. //Set the place recognition configure
  51. m_uTransPerson.Configure("data\tr",TT_TRANS_PERSON);
  52. //Set the transliteration person recognition configure
  53. m_nOperateType=2;//0:Only Segment;1: First Tag; 2:Second Type
  54. m_nOutputFormat=0;//0:PKU criterion;1:973 criterion; 2: XML criterion
  55. m_dSmoothingPara=0.1;//Smoothing parameter
  56. m_dictBigram.Load("data\BigramDict.dct");
  57. }
  58. CResult::~CResult()
  59. {
  60. //free buffer
  61. for(int i=0;i<MAX_SEGMENT_NUM;i++)
  62. {
  63. delete [] m_pResult[i];
  64. }
  65. /* 
  66.      * ----- commented by huangjin@ict.ac.cn 2006-5-29 ------ 
  67.      * 
  68.      *  delete m_pResult;
  69.      *
  70.      */
  71. /*----Added By huangjin@ict.ac.cn 2006-5-29----*/
  72. delete []m_pResult;
  73. m_pResult=NULL;
  74. /*---------------------------------------------*/
  75. }
  76. bool CResult::Output(PWORD_RESULT pItem, CString &sResult,bool bFirstWordIgnore)
  77. {
  78. int i=0;
  79. char sTempBuffer[WORD_MAXLENGTH],sPOS[3];
  80. sPOS[2]=0;
  81. sResult="";
  82. if(bFirstWordIgnore)//Ignore first valid
  83. i=1;
  84. while(pItem[i].sWord[0]!=0&&pItem[i].nHandle!=CT_SENTENCE_END)//Not sentence ending flag
  85. {
  86. //Get the POS string
  87. if(m_nOutputFormat!=0)//Not PKU format
  88. PKU2973POS(pItem[i].nHandle,sPOS);
  89. else//PKU format
  90. {
  91. sPOS[0]=pItem[i].nHandle/256;
  92. sPOS[1]=pItem[i].nHandle%256;
  93. }
  94. sPOS[m_nOperateType]=0;//Set the sPOS with operate type
  95. if(m_nOutputFormat==0)//PKU format
  96. {
  97. sprintf(sTempBuffer,"%s",pItem[i].sWord);
  98.     //strcat(sResult,sTempBuffer);
  99.             sResult+=sTempBuffer;
  100. if(sPOS[0]!=0)//need POS 
  101. {
  102. sprintf(sTempBuffer,"/%s",sPOS);
  103. //strcat(sResult,sTempBuffer);
  104.                 sResult+=sTempBuffer;
  105. }
  106.     //strcat(sResult,"  ");
  107.             sResult+="  ";
  108. }
  109. else if(m_nOutputFormat==1)//973 format
  110. {
  111. sprintf(sTempBuffer,"%s\",pItem[i].sWord);
  112.     //strcat(sResult,sTempBuffer);
  113.             sResult+=sTempBuffer;
  114. if(sPOS[0]!=0)//need POS 
  115. {
  116. sprintf(sTempBuffer,"[%s]",sPOS);
  117. //strcat(sResult,sTempBuffer);
  118.                 sResult+=sTempBuffer;
  119. }
  120. }
  121. else if(m_nOutputFormat==2)//XML format
  122. {
  123. if(sPOS[0]!=0)//POS
  124. {
  125. sprintf(sTempBuffer,"<any type=42%s42>",sPOS);
  126. //strcat(sResult,sTempBuffer);
  127.                 sResult+=sTempBuffer;
  128. }
  129. sprintf(sTempBuffer,"<src>%s</src>",pItem[i].sWord);
  130. //strcat(sResult,sTempBuffer);
  131.             sResult+=sTempBuffer;
  132. if(sPOS[0]!=0)
  133. {
  134. //strcat(sResult,"</any>");
  135.                 sResult+="</any>";
  136. }
  137. }
  138. i++;
  139. }
  140. return true;
  141. }
  142. bool CResult::Processing(char *sSentence,unsigned int nCount)
  143. {
  144. int nIndex;
  145. #if _ICT_DEBUG
  146. char *sSegment;
  147. sSegment=new char[MAX_SENTENCE_LEN*2];
  148. #endif
  149. /*----Added By huangjin@ict.ac.cn 2006-5-30----*/
  150. m_Seg.ClearSegmentWord();
  151. this->ResetWord();
  152. /*---------------------------------------------*/
  153. //Unigram segment
  154. //m_Seg.Segment(sSentence,m_dictCore,nCount);
  155. //Bigram segment
  156. m_Seg.BiSegment(sSentence, m_dSmoothingPara, m_dictCore, m_dictBigram, nCount);
  157. m_nResultCount=m_Seg.m_nSegmentCount;
  158. //Record the number of result
  159. for(nIndex=0;nIndex<m_Seg.m_nSegmentCount;nIndex++)
  160. {
  161. #if _ICT_DEBUG
  162. m_POSTagger.POSTagging(m_Seg.m_pWordSeg[nIndex],m_dictCore,m_dictCore);
  163. Output(m_Seg.m_pWordSeg[nIndex],sSegment);
  164. printf("POS Tag%d:%sn",nIndex+1,sSegment);
  165. #endif
  166. m_uPerson.Recognition(m_Seg.m_pWordSeg[nIndex],m_Seg.m_graphOptimum,m_Seg.m_graphSeg,m_dictCore);
  167. m_uTransPerson.Recognition(m_Seg.m_pWordSeg[nIndex],m_Seg.m_graphOptimum,m_Seg.m_graphSeg,m_dictCore);
  168. m_uPlace.Recognition(m_Seg.m_pWordSeg[nIndex],m_Seg.m_graphOptimum,m_Seg.m_graphSeg,m_dictCore);
  169. }
  170. //m_uPerson.Recognition(m_Seg.m_WordSeg[0],m_Seg.m_graphOptimum,m_Seg.m_graphSeg,m_dictCore);
  171. //Person Recognition
  172. #if _ICT_DEBUG
  173. printf("After person recognition.n");
  174. #endif
  175. //Unigram
  176. //m_Seg.OptimumSegmet(nCount);
  177. //Bigram
  178. m_Seg.BiOptimumSegment(nCount,m_dSmoothingPara,m_dictBigram,m_dictCore);
  179. for(nIndex=0;nIndex<m_Seg.m_nSegmentCount;nIndex++)
  180. {
  181. m_POSTagger.POSTagging(m_Seg.m_pWordSeg[nIndex],m_dictCore,m_dictCore);
  182. #if _ICT_DEBUG
  183. Output(m_Seg.m_pWordSeg[nIndex],sSegment);
  184. printf("POS Tag%d:%sn",nIndex+1,sSegment);
  185. #endif
  186. }
  187. #if _ICT_DEBUG
  188. printf("After Sorting.n");
  189. #endif
  190. Sort();//Sort the ending 
  191. #if _ICT_DEBUG
  192. for(nIndex=0;nIndex<m_Seg.m_nSegmentCount;nIndex++)
  193. {
  194. Output(m_pResult[nIndex],sSegment);
  195. printf("POS Tag%d(P=Exp(%f)):%sn",nIndex+1,m_dResultPossibility[nIndex],sSegment);
  196. }
  197. delete [] sSegment;
  198. #endif
  199. return true;
  200. }
  201. //Sort the segmentation and POS result according its possibility
  202. bool CResult::Sort()
  203. {
  204. ELEMENT_TYPE dPossibility[MAX_SEGMENT_NUM],dTemp;
  205. int nIndex[MAX_SEGMENT_NUM],nTemp;//Index
  206. memset(dPossibility,0,sizeof(dPossibility));
  207. //Init the possibility
  208. for(int i=0;i<m_Seg.m_nSegmentCount;i++)
  209. {//Computing the possibility
  210. dPossibility[i]=ComputePossibility(m_Seg.m_pWordSeg[i]);
  211. nIndex[i]=i;//Record the index
  212. }
  213. //Sort with Bubble sort algorithm
  214. for(i=0;i<m_Seg.m_nSegmentCount;i++)
  215. for(int j=i+1;j<m_Seg.m_nSegmentCount;j++)
  216. {
  217. if(dPossibility[i]<dPossibility[j])
  218. {//Swap the possition and value
  219. nTemp=nIndex[i];
  220. dTemp=dPossibility[i];
  221. nIndex[i]=nIndex[j];
  222. dPossibility[i]=dPossibility[j];
  223. nIndex[j]=nTemp;
  224. dPossibility[j]=dTemp;
  225. }
  226. }
  227. for(i=0;i<m_Seg.m_nSegmentCount;i++)
  228. {//Adjust the segmentation and POS result and store them in the final result array
  229.  //Store them according their possibility ascendly
  230. Adjust(m_Seg.m_pWordSeg[nIndex[i]],m_pResult[i]);
  231. m_dResultPossibility[i]=dPossibility[i];
  232. }
  233. return true;
  234. }
  235. //Compute the possibility of current segmentation and POS result
  236. ELEMENT_TYPE CResult::ComputePossibility(PWORD_RESULT pItem)
  237. {
  238. int i=0;
  239. ELEMENT_TYPE dResultPossibility=0;
  240. while(pItem[i].sWord[0]!=0)
  241. {
  242. dResultPossibility+=pItem[i].dValue;
  243. //Compute the possibility of logP(Wi|Ti)
  244. if(pItem[i+1].sWord[0]!=0)//Not the last one
  245. {//Compute the possibility of logP(Ti|Ti-1)
  246. dResultPossibility+=log((double)(m_POSTagger.m_context.GetContextPossibility(0,pItem[i].nHandle,pItem[i+1].nHandle)+1));
  247. dResultPossibility-=log((double)(m_POSTagger.m_context.GetFrequency(0,pItem[i].nHandle)+1));
  248. }
  249. i++;
  250. }
  251. return dResultPossibility;
  252. }
  253. //Adjust the result with some rules
  254. bool CResult::Adjust(PWORD_RESULT pItem,PWORD_RESULT pItemRet)
  255. {
  256. int i=0,j=0;
  257. unsigned int nLen;
  258. char sSurName[10],sSurName2[10],sGivenName[10];
  259. bool bProcessed=false;//Have been processed
  260. while(pItem[i].sWord[0]!=0)
  261. {
  262. nLen=strlen(pItem[i].sWord);
  263. bProcessed=false;
  264. //Rule1: adjust person name
  265. if(pItem[i].nHandle==28274&&ChineseNameSplit(pItem[i].sWord,sSurName,sSurName2,sGivenName,m_uPerson.m_dict)&&strcmp(pItem[i].sWord,"叶利钦")!=0)//'nr'
  266. {//Divide name into surname and given name
  267. if(sSurName[0])
  268. {
  269. strcpy(pItemRet[j].sWord,sSurName);
  270. pItemRet[j++].nHandle=28274;
  271. }
  272. if(sSurName2[0])
  273. {
  274. strcpy(pItemRet[j].sWord,sSurName2);
  275. pItemRet[j++].nHandle=28274;
  276. }
  277. if(sGivenName[0])
  278. {
  279. strcpy(pItemRet[j].sWord,sGivenName);
  280. pItemRet[j++].nHandle=28274;
  281. }
  282. bProcessed=true;
  283. }
  284. //Rule2 for overlap words ABB 一段段、一片片
  285. /* 
  286. * ----- commented by huangjin@ict.ac.cn 2006-7-17 ------ 
  287. *
  288. *  else if(pItem[i].nHandle==27904&&strlen(pItem[i+1].sWord)==2&&strcmp(pItem[i+1].sWord,pItem[i+2].sWord)==0)
  289. *
  290. */
  291. /*----Added By huangjin@ict.ac.cn 2006-7-17----*/
  292. else if(pItem[i].nHandle==27904&&
  293. strlen(pItem[i].sWord)<=2&&
  294. strlen(pItem[i+1].sWord)==2&&
  295. charType((unsigned char*)(pItem[i+1].sWord))==CT_CHINESE&&
  296. strcmp(pItem[i+1].sWord,pItem[i+2].sWord)==0)
  297. /*---------------------------------------------*/
  298. {//(pItem[i+1].nHandle/256=='q'||pItem[i+1].nHandle/256=='a')&&
  299. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  300. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  301. strcat(pItemRet[j].sWord,pItem[i+2].sWord);
  302. pItemRet[j].nHandle=27904;
  303. j+=1;
  304. i+=2;
  305. bProcessed=true;
  306. }
  307. //Rule3 for overlap words AA
  308. /* 
  309. * ----- commented by huangjin@ict.ac.cn 2006-7-17 ------ 
  310. *
  311. *  else if(nLen==2&&strcmp(pItem[i].sWord,pItem[i+1].sWord)==0)
  312. *
  313. */
  314. /*----Added By huangjin@ict.ac.cn 2006-7-17----*/
  315. else if(nLen==2&&
  316. charType((unsigned char*)(pItem[i].sWord))==CT_CHINESE&&
  317. strcmp(pItem[i].sWord,pItem[i+1].sWord)==0)
  318. /*---------------------------------------------*/
  319. {
  320. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  321. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  322.      //24832=='a'*256
  323. pItemRet[j].nHandle=24832;//a
  324. if(pItem[i].nHandle/256=='v'||pItem[i+1].nHandle/256=='v')//30208='v'8256
  325. {
  326. pItemRet[j].nHandle=30208;
  327. }
  328. if(pItem[i].nHandle/256=='n'||pItem[i+1].nHandle/256=='n')//30208='v'8256
  329. {
  330. pItemRet[j].nHandle='n'*256;
  331. }
  332. i+=1;
  333. if(strlen(pItem[i+1].sWord)==2)
  334. {//AAB:洗/洗/脸、蒙蒙亮
  335. if((pItemRet[j].nHandle==30208&&pItem[i+1].nHandle/256=='n')||
  336.    (pItemRet[j].nHandle==24832&&pItem[i+1].nHandle/256=='a')
  337.    )
  338. {
  339. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  340. i+=1;
  341. }
  342. }
  343. j+=1;
  344. bProcessed=true;
  345. }
  346. //Rule 4: AAB 洗/洗澡
  347. else if(nLen==2&&strncmp(pItem[i].sWord,pItem[i+1].sWord,2)==0&&strlen(pItem[i+1].sWord)==4&&(pItem[i].nHandle/256=='v'||pItem[i].nHandle==24832))//v,a
  348. {
  349. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  350. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  351.      //24832=='a'*256
  352. pItemRet[j].nHandle=24832;//'a'
  353. if(pItem[i].nHandle/256=='v'||pItem[i+1].nHandle/256=='v')//30208='v'8256
  354. {
  355. pItemRet[j].nHandle=30208;
  356. }
  357. i+=1;
  358. j+=1;
  359. bProcessed=true;
  360. }
  361. else if(pItem[i].nHandle/256=='u'&&pItem[i].nHandle%256)//uj,ud,uv,uz,ul,ug->u
  362. pItem[i].nHandle='u'*256;
  363. else if(nLen==2&&strncmp(pItem[i].sWord,pItem[i+1].sWord,2)==0&&strlen(pItem[i+1].sWord)==4&&strncmp(pItem[i+1].sWord+2,pItem[i+2].sWord,2)==0)
  364. {//AABB 朴朴素素 枝枝叶叶
  365. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  366. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  367. strcat(pItemRet[j].sWord,pItem[i+2].sWord);
  368. pItemRet[j].nHandle=pItem[i+1].nHandle;
  369. i+=2;
  370. j+=1;
  371. bProcessed=true;
  372. }
  373. else if(pItem[i].nHandle==28275)//PostFix
  374. {
  375. if(m_uPlace.m_dict.IsExist(pItem[i+1].sWord,4))
  376. {
  377. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  378. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  379. pItemRet[j].nHandle=28275;
  380. i+=1;
  381. j+=1;
  382. bProcessed=true;
  383. }
  384. else if(strlen(pItem[i+1].sWord)==2&&CC_Find("队",pItem[i+1].sWord))
  385. {
  386. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  387. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  388. pItemRet[j].nHandle=28276;
  389. i+=1;
  390. j+=1;
  391. bProcessed=true;
  392. }
  393. else if(strlen(pItem[i+1].sWord)==2&&CC_Find("语文字杯",pItem[i+1].sWord))
  394. {
  395. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  396. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  397. pItemRet[j].nHandle=28282;
  398. i+=1;
  399. j+=1;
  400. bProcessed=true;
  401. }
  402. else if(strlen(pItem[i+1].sWord)==2&&CC_Find("裔",pItem[i+1].sWord))
  403. {
  404. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  405. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  406. pItemRet[j].nHandle=28160;
  407. i+=1;
  408. j+=1;
  409. bProcessed=true;
  410. }
  411. }
  412. else if(pItem[i].nHandle==30208||pItem[i].nHandle==28160)//v
  413. {
  414. if(strlen(pItem[i+1].sWord)==2&&CC_Find("员",pItem[i+1].sWord))
  415. {
  416. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  417. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  418. pItemRet[j].nHandle=28160;
  419. i+=1;
  420. j+=1;
  421. bProcessed=true;
  422. }
  423. }
  424. else if(pItem[i].nHandle==28280)
  425. {//www/nx ./w sina/nx; EIM/nx  -601/m 
  426. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  427. pItemRet[j].nHandle=28280;
  428. /* 
  429. * ----- commented by huangjin@ict.ac.cn 2006-5-29 ------ 
  430. *  while(pItem[i+1].nHandle==28280||strstr("..",pItem[i+1].sWord)||(pItem[i+1].nHandle==27904&&IsAllNum((unsigned char *)pItem[i+1].sWord)))
  431. *
  432. */
  433. /*----Added By huangjin@ict.ac.cn 2006-5-30----*/
  434. while(pItem[i+1].sWord[0]!=0 
  435.   && ( pItem[i+1].nHandle==28280
  436. ||strstr("..",pItem[i+1].sWord)
  437. ||(pItem[i+1].nHandle==27904&&IsAllNum((unsigned char *)pItem[i+1].sWord))))
  438. /*---------------------------------------------*/
  439. {
  440. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  441. i+=1;
  442. }
  443. j+=1;
  444. bProcessed=true;
  445. }
  446. /*----Added By huangjin@ict.ac.cn 2006-9-12----*/
  447. //additional rule for 三点/m 十五分
  448. else if(pItem[i].nHandle==27904&&
  449. !strncmp(pItem[i].sWord+strlen(pItem[i].sWord)-2,"点",2)&&
  450. pItem[i+1].sWord[0]!=0&&
  451. pItem[i+1].nHandle==29696&&
  452. !strncmp(pItem[i+1].sWord+strlen(pItem[i+1].sWord)-2,"分",2))
  453. {
  454. strcpy(pItemRet[j].sWord, pItem[i].sWord);
  455. pItemRet[j++].nHandle = pItem[i+1].nHandle;
  456. bProcessed=true;
  457. }
  458. /*---------------------------------------------*/
  459. if(!bProcessed)
  460. {//If not processed,that's mean: not need to adjust;
  461.  //just copy to the final result
  462. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  463. pItemRet[j++].nHandle=pItem[i].nHandle;
  464. }
  465. i++;
  466. }
  467. pItemRet[j].sWord[0]=0;//Set ending
  468. return true;
  469. }
  470. //Paragraph Segment and POS Tagging
  471. bool CResult::ParagraphProcessing(char *sParagraph,CString &sResult)
  472. {
  473. char *sSentence,sChar[3];
  474. // char *sSentenceResult;
  475.     CString sSentenceResult;
  476. unsigned int nLen=strlen(sParagraph)+13;
  477. sSentence=new char[nLen];//malloc buffer
  478. /* 
  479.      * ----- commented by huangjin@ict.ac.cn 2006-5-29 ------ 
  480.      * 
  481.      *  sSentenceResult=new char[nLen*3];//malloc buffer
  482.      *
  483.     */
  484. /*----Added By huangjin@ict.ac.cn 2006-5-31----*/
  485. //single letter + '/' + postag + ' ',so multiply 4 
  486. //sSentenceResult=new char[nLen*4];//malloc buffer
  487. /*---------------------------------------------*/
  488. sSentence[0]=0;
  489. /*----Added By huangjin@ict.ac.cn 2006-5-29----*/
  490. //sSentenceResult[0]=0;
  491.     sSentenceResult="";
  492. /*---------------------------------------------*/
  493. unsigned int nPosIndex=0,nParagraphLen=strlen(sParagraph),nSentenceIndex=0;
  494. sChar[2]=0;
  495. //sResult[0]=0;//Init the result
  496.     sResult="";
  497. bool bFirstIgnore=true;
  498. strcpy(sSentence,SENTENCE_BEGIN);//Add a sentence begin flag
  499. while(nPosIndex<nParagraphLen)
  500. {//Find a whole sentence which separated by ! . n r
  501. sChar[0]=sParagraph[nPosIndex];//Get a char
  502. sChar[1]=0;
  503. if(sParagraph[nPosIndex]<0)
  504. {//double byte char
  505. nPosIndex+=1;
  506. sChar[1]=sParagraph[nPosIndex];
  507. }
  508. nPosIndex+=1;
  509. /*
  510. #define  SEPERATOR_C_SENTENCE "。!?:;…"
  511. #define  SEPERATOR_C_SUB_SENTENCE "、,()“”‘’"
  512. #define  SEPERATOR_E_SENTENCE "!?:;"
  513. #define  SEPERATOR_E_SUB_SENTENCE ",()42'"
  514. #define  SEPERATOR_LINK "nr  "
  515. */
  516. if(CC_Find(SEPERATOR_C_SENTENCE,sChar)||CC_Find(SEPERATOR_C_SUB_SENTENCE,sChar)||strstr(SEPERATOR_E_SENTENCE,sChar)||strstr(SEPERATOR_E_SUB_SENTENCE,sChar)||strstr(SEPERATOR_LINK,sChar))
  517. {//Reach end of a sentence.Get a whole sentence
  518. if(!strstr(SEPERATOR_LINK,sChar))//Not link seperator
  519. {
  520. strcat(sSentence,sChar);
  521. }
  522. /*----Added By huangjin@ict.ac.cn 2006-7-17----*/
  523. //for ……, do not split them
  524. if(!strcmp(sChar,"…")&&nPosIndex<nParagraphLen-1)
  525. {
  526. sChar[0]=sParagraph[nPosIndex];
  527. sChar[1]=sParagraph[nPosIndex+1];
  528.                 if(strcmp(sChar,"…")==0)
  529. {
  530. nPosIndex+=2;
  531. strcat(sSentence,sChar);
  532. }
  533. else
  534. {
  535. strcpy(sChar,"…");//back off
  536. }
  537. }
  538. /*---------------------------------------------*/
  539. if(sSentence[0]!=0&&strcmp(sSentence,SENTENCE_BEGIN)!=0)
  540. {
  541. if(!strstr(SEPERATOR_C_SUB_SENTENCE,sChar)&&!strstr(SEPERATOR_E_SUB_SENTENCE,sChar))
  542. strcat(sSentence,SENTENCE_END);//Add sentence ending flag
  543. Processing(sSentence,1);//Processing and output the result of current sentence.
  544. Output(m_pResult[0],sSentenceResult,bFirstIgnore);//Output to the imediate result
  545. //bFirstIgnore=true;
  546. //strcat(sResult,sSentenceResult);//Store in the result buffer
  547.                 sResult+=sSentenceResult;
  548. }
  549. if(strstr(SEPERATOR_LINK,sChar))//Link the result with the SEPERATOR_LINK
  550. {
  551. //strcat(sResult,sChar);
  552.                 sResult+=sChar;
  553. strcpy(sSentence,SENTENCE_BEGIN);//Add a sentence begin flag
  554. //sSentence[0]=0;//New sentence, and begin new segmentation
  555. //bFirstIgnore=false;
  556. }
  557. /* 
  558. * ----- commented by huangjin@ict.ac.cn 2006-7-17 ------ 
  559. *
  560. *  else if(strstr(SEPERATOR_C_SENTENCE,sChar)||strstr(SEPERATOR_E_SENTENCE,sChar))
  561. *
  562. */
  563. /*----Added By huangjin@ict.ac.cn 2006-7-17----*/
  564. else if(CC_Find(SEPERATOR_C_SENTENCE,sChar)||strstr(SEPERATOR_E_SENTENCE,sChar))
  565. /*---------------------------------------------*/
  566. {
  567. strcpy(sSentence,SENTENCE_BEGIN);//Add a sentence begin flag
  568. //sSentence[0]=0;//New sentence, and begin new segmentation
  569. //bFirstIgnore=false;
  570. }
  571. else
  572. {
  573. strcpy(sSentence,sChar);//reset current sentence, and add the previous end at begin position
  574. }
  575. }
  576. else //Other chars and store in the sentence buffer
  577. strcat(sSentence,sChar);
  578. }
  579. if(sSentence[0]!=0&&strcmp(sSentence,SENTENCE_BEGIN)!=0)
  580. {
  581. strcat(sSentence,SENTENCE_END);//Add sentence ending flag
  582. Processing(sSentence,1);//Processing and output the result of current sentence.
  583. Output(m_pResult[0],sSentenceResult,bFirstIgnore);//Output to the imediate result
  584. //strcat(sResult,sSentenceResult);//Store in the result buffer
  585.         sResult+=sSentenceResult;
  586. }
  587. delete []  sSentence;//FREE sentence buffer 
  588. // delete []  sSentenceResult;//free buffer
  589. return true;
  590. }
  591. bool CResult::FileProcessing(char *sSourceFile,char *sResultFile)
  592. {
  593. FILE *fpSource,*fpResult;//The file pointer of read and write
  594. char *sParagraph;//,*sParagraphResult;
  595.     CString sParagraphResult;
  596. int nLineIndex=1;
  597. /* 
  598.      * ----- commented by huangjin@ict.ac.cn 2006-5-29 ------ 
  599.      * 
  600.      *  sParagraph=new char[4*1024];
  601.  * sParagraphResult=new char[8*1024];
  602.  * if((fpSource=fopen(sSourceFile,"rt"))==NULL)
  603.  * return false;//Cannot open the source file to read
  604.  * if((fpResult=fopen(sResultFile,"wt"))==NULL) 
  605.  * return false;//Cannot open the result  file to write
  606.      *
  607.     */
  608. /*----Added By huangjin@ict.ac.cn 2006-5-29----*/
  609. //malloc memory after the files are opened successfully
  610.  if((fpSource=fopen(sSourceFile,"rt"))==NULL)
  611.  return false;//Cannot open the source file to read
  612.  if((fpResult=fopen(sResultFile,"wt"))==NULL) 
  613.  return false;//Cannot open the result  file to write
  614.  sParagraph=new char[MAX_SENTENCE_LEN*2];
  615.  //sParagraphResult=new char[MAX_SENTENCE_LEN*4];
  616. /*--------------------------------------------*/
  617. if(m_nOutputFormat==2)//XML format
  618. fprintf(fpResult,"<?xml version=42 1.042 encoding=42gb231242?><result>");
  619. while(!feof(fpSource))
  620. {
  621. /* 
  622. * ----- commented by huangjin@ict.ac.cn 2006-7-23 ------ 
  623. *
  624. *  if(fgets(sParagraph,4*1024,fpSource)==0)//Get a paragrah
  625. *
  626. */
  627. /*----Added By huangjin@ict.ac.cn 2006-7-23----*/
  628. if(fgets(sParagraph,MAX_SENTENCE_LEN,fpSource)==0)//Get a paragrah
  629. /*---------------------------------------------*/
  630. continue;
  631. /*----Added By huangjin@ict.ac.cn 2006-7-23----*/
  632. //如果读到尾正好把一个全角字符截断了会特别处理一下,采取回退而不是多读
  633. if(strlen(sParagraph)==MAX_SENTENCE_LEN-1)
  634. {
  635. bool bTrunc=false;
  636. for(int i=0;i<MAX_SENTENCE_LEN-1;i++)
  637. {
  638. if(sParagraph[i]<0)
  639. {
  640. if(i==MAX_SENTENCE_LEN-2)//末尾是被截断的字符
  641. {
  642. bTrunc=true;
  643. break;
  644. }
  645. i++;
  646. }
  647. }
  648. if(bTrunc)
  649. {
  650. ungetc(sParagraph[MAX_SENTENCE_LEN-2],fpSource);
  651. sParagraph[MAX_SENTENCE_LEN-2]='';
  652. }
  653. }
  654. /*---------------------------------------------*/
  655. //TRACE("%dn",nLineIndex++);
  656. ParagraphProcessing(sParagraph,sParagraphResult);
  657. fprintf(fpResult,"%s",(LPCTSTR)sParagraphResult);
  658. }
  659. delete [] sParagraph;
  660. //delete [] sParagraphResult;
  661. fclose(fpSource);
  662. if(m_nOutputFormat==2)//XML format
  663. fprintf(fpResult,"</result>");
  664. fclose(fpResult);
  665. return true;
  666. }
  667. bool CResult::PKU2973POS(int nHandle, char *sPOS973)
  668. {
  669. int nHandleSet[46]={24832,24932,24935,24942,25088,25344,25600,25703,25856,26112,26368,26624,26880,27136,27392,27648,27904,28160,28263,28274,28275,28276,28280,28282,28416,28672,28928,29184,29440,29696,29799,29952,30052,30055,30058,30060,30070,30074,30208,30308,30311,30318,30464,30720,30976,31232};
  670. //   "a", "ad","ag","an","b", "c", "d", "dg","e", "f","g", "h", "i", "j", "k", "l", "m", "n", "ng","nr","ns","nt","nx","nz","o", "p", "q", "r", "s", "t", "tg","u", "ud","ug","uj","ul","uv","uz","v", "vd","vg","vn","w", "x", "y", "z"
  671. char sPOSRelated[46][3]={"a", "ad","ga","an","f", "c", "d", "d", "e","nd","g", "h", "i", "j", "k", "l", "m", "n", "gn","nh","ns","ni","ws", "nz","o", "p", "q", "r", "nl","nt","gt","u", "ud","ug","uj","ul","uv","uz","v", "vd","gv","vn","w", "x", "u", "a"};
  672. /* 
  673.  "Bg","gf",
  674.  "Rg","gr",
  675.  "Mg","gm",
  676.  "Yg","u",
  677.  "Ug","u",
  678.  "Qg","q",
  679. */
  680. int nIndex=BinarySearch(nHandle,nHandleSet,46);
  681. if(nIndex==-1)
  682. strcpy(sPOS973,"@");
  683. else
  684. strcpy(sPOS973,sPOSRelated[nIndex]);
  685. return true;
  686. }
  687. bool CResult::ChineseNameSplit(char *sPersonName, char *sSurname, char *sSurname2, char *sGivenName, CDictionary &personDict)
  688. {
  689. int nSurNameLen=4,nLen=strlen(sPersonName),nFreq,i=0,nCharType,nFreqGiven;
  690. char sTemp[3];
  691. if(nLen<3||nLen>8)//Not a traditional Chinese person name
  692. return false;
  693. while(i<nLen)//No Including non-CHinese char
  694. {
  695. nCharType=charType((unsigned char*)sPersonName+i);
  696. if(nCharType!=CT_CHINESE&&nCharType!=CT_OTHER)
  697. return false;
  698. i+=2;
  699. }
  700. sSurname2[0]=0;//init 
  701. strncpy(sSurname,sPersonName,nSurNameLen);
  702. sSurname[nSurNameLen]=0;
  703. if(!personDict.IsExist(sSurname,1))
  704. {
  705. nSurNameLen=2;
  706. sSurname[nSurNameLen]=0;
  707. if(!personDict.IsExist(sSurname,1))
  708. {
  709. nSurNameLen=0;
  710. sSurname[nSurNameLen]=0;
  711. }
  712. }
  713. strcpy(sGivenName,sPersonName+nSurNameLen);
  714. if(nLen>6)
  715. {
  716. strncpy(sTemp,sPersonName+nSurNameLen,2);
  717. sTemp[2]=0;//Get the second possible surname
  718. if(personDict.IsExist(sTemp,1))
  719. {//Hongkong women's name: Surname+surname+given name
  720. strcpy(sSurname2,sTemp);
  721. strcpy(sGivenName,sPersonName+nSurNameLen+2);
  722. }
  723. }
  724. nFreq=personDict.GetFrequency(sSurname,1);
  725. strncpy(sTemp,sGivenName,2);
  726. sTemp[2]=0;
  727. nFreqGiven=personDict.GetFrequency(sTemp,2);
  728. if(nSurNameLen!=4&&((nSurNameLen==0&&nLen>4)||strlen(sGivenName)>4||(GetForeignCharCount(sPersonName)>=3&&nFreq<personDict.GetFrequency("张",1)/40&&nFreqGiven<personDict.GetFrequency("华",2)/20)||(nFreq<10&&GetForeignCharCount(sGivenName)==(nLen-nSurNameLen)/2)))
  729. return false;
  730. if(nLen==4&&m_uPerson.IsGivenName(sPersonName))
  731. {//Single Surname+given name
  732. return false;
  733. }
  734. return true;
  735. }
  736. /*----Added By huangjin@ict.ac.cn 2006-5-30----*/
  737. void CResult::ResetWord(void)
  738. {
  739. if(m_pResult)
  740. {
  741. for(int i=0;i<MAX_SEGMENT_NUM;i++)
  742. {
  743. if(m_pResult[i])
  744. {
  745. /*---------------------------------------------*/
  746. /* 
  747. * ----- commented by huangjin@ict.ac.cn 2006-9-12 ------ 
  748. *
  749. *  for(int j=0;j<MAX_WORDS;j++)
  750. *
  751. */
  752. /*----Added By huangjin@ict.ac.cn 2006-9-12----*/
  753. for(int j=0;j<MAX_WORDS&&m_pResult[i][j].sWord[0];j++)
  754. /*---------------------------------------------*/
  755. {
  756. m_pResult[i][j].sWord[0]='';
  757. m_pResult[i][j].dValue=0.0;
  758. m_pResult[i][j].nHandle=0;
  759. }
  760. }
  761. }
  762. }
  763. }
  764. /*---------------------------------------------*/