Result.cpp
上传用户:yxl0916
上传日期:2007-05-25
资源大小:2245k
文件大小:19k
源码类别:

多国语言处理

开发平台:

Visual C++

  1. //////////////////////////////////////////////////////////////////////
  2. //ICTCLAS简介:计算所汉语词法分析系统ICTCLAS(Institute of Computing Technology, Chinese Lexical Analysis System),
  3. //             功能有:中文分词;词性标注;未登录词识别。
  4. //             分词正确率高达97.58%(973专家评测结果),
  5. //             未登录词识别召回率均高于90%,其中中国人名的识别召回率接近98%;
  6. //             处理速度为31.5Kbytes/s。
  7. //著作权:  Copyright © 2002-2005 中科院计算所 职务著作权人:张华平 刘群
  8. //遵循协议:自然语言处理开放资源许可证1.0
  9. //Email: zhanghp@software.ict.ac.cn
  10. //Homepage:www.nlp.org.cn;mtgroup.ict.ac.cn
  11. // Result.cpp: implementation of the CResult class.
  12. //
  13. //////////////////////////////////////////////////////////////////////
  14. #include "stdafx.h"
  15. #include "Result.h"
  16. #include <string.h>
  17. #include <stdio.h>
  18. #include <math.h>
  19. #include "..\Utility\Utility.h"
  20. //////////////////////////////////////////////////////////////////////
  21. // Construction/Destruction
  22. //////////////////////////////////////////////////////////////////////
  23. CResult::CResult()
  24. {
  25. //malloc buffer
  26. m_pResult=new PWORD_RESULT[MAX_SEGMENT_NUM];
  27. for(int i=0;i<MAX_SEGMENT_NUM;i++)
  28. {
  29. m_pResult[i]=new WORD_RESULT[MAX_WORDS];
  30. }
  31. m_dictCore.Load("data\coreDict.dct");
  32. m_POSTagger.LoadContext("data\lexical.ctx");
  33. /*
  34. m_dictCore.Load("data\Dict.dct");
  35. m_POSTagger.LoadContext("data\trainTest.ctx");
  36. */
  37. /*
  38. m_dictCore.AddItem("十分",'d'*256,+500);
  39. m_dictCore.AddItem("十分",'m'*256,-500);
  40. m_dictCore.AddItem("我国",'n'*256,-2000);
  41. m_dictCore.AddItem("我国",'r'*256,+2000);
  42. m_dictCore.AddItem("千年",'t'*256,200);
  43.     m_dictCore.Optimum();
  44. m_dictCore.Save("data\coreDictOptimum.dct");
  45. */
  46. m_POSTagger.SetTagType();
  47. m_uPerson.Configure("data\nr",TT_PERSON);
  48. //Set the person recognition configure
  49. m_uPlace.Configure("data\ns",TT_PLACE);
  50. //Set the place recognition configure
  51. m_uTransPerson.Configure("data\tr",TT_TRANS_PERSON);
  52. //Set the transliteration person recognition configure
  53. m_nOperateType=2;//0:Only Segment;1: First Tag; 2:Second Type
  54. m_nOutputFormat=0;//0:PKU criterion;1:973 criterion; 2: XML criterion
  55. m_dSmoothingPara=0.1;//Smoothing parameter
  56. m_dictBigram.Load("data\BigramDict.dct");
  57. }
  58. CResult::~CResult()
  59. {
  60. //free buffer
  61. for(int i=0;i<MAX_SEGMENT_NUM;i++)
  62. {
  63. delete [] m_pResult[i];
  64. }
  65. delete m_pResult;
  66. }
  67. bool CResult::Output(PWORD_RESULT pItem, char *sResult,bool bFirstWordIgnore)
  68. {
  69. int i=0;
  70. char sTempBuffer[WORD_MAXLENGTH],sPOS[3];
  71. sPOS[2]=0;
  72. sResult[0]=0;
  73. if(bFirstWordIgnore)//Ignore first valid
  74. i=1;
  75. while(pItem[i].sWord[0]!=0&&pItem[i].nHandle!=CT_SENTENCE_END)//Not sentence ending flag
  76. {
  77. //Get the POS string
  78. if(m_nOutputFormat!=0)//Not PKU format
  79. PKU2973POS(pItem[i].nHandle,sPOS);
  80. else//PKU format
  81. {
  82. sPOS[0]=pItem[i].nHandle/256;
  83. sPOS[1]=pItem[i].nHandle%256;
  84. }
  85. sPOS[m_nOperateType]=0;//Set the sPOS with operate type
  86. if(m_nOutputFormat==0)//PKU format
  87. {
  88. sprintf(sTempBuffer,"%s",pItem[i].sWord);
  89.     strcat(sResult,sTempBuffer);
  90. if(sPOS[0]!=0)//need POS 
  91. {
  92. sprintf(sTempBuffer,"/%s",sPOS);
  93. strcat(sResult,sTempBuffer);
  94. }
  95.     strcat(sResult,"  ");
  96. }
  97. else if(m_nOutputFormat==1)//973 format
  98. {
  99. sprintf(sTempBuffer,"%s\",pItem[i].sWord);
  100.     strcat(sResult,sTempBuffer);
  101. if(sPOS[0]!=0)//need POS 
  102. {
  103. sprintf(sTempBuffer,"[%s]",sPOS);
  104. strcat(sResult,sTempBuffer);
  105. }
  106. }
  107. else if(m_nOutputFormat==2)//XML format
  108. {
  109. if(sPOS[0]!=0)//POS
  110. {
  111. sprintf(sTempBuffer,"<any type=42%s42>",sPOS);
  112. strcat(sResult,sTempBuffer);
  113. }
  114. sprintf(sTempBuffer,"<src>%s</src>",pItem[i].sWord);
  115. strcat(sResult,sTempBuffer);
  116. if(sPOS[0]!=0)
  117. {
  118. strcat(sResult,"</any>");
  119. }
  120. }
  121. i++;
  122. }
  123. return true;
  124. }
  125. bool CResult::Processing(char *sSentence,unsigned int nCount)
  126. {
  127. int nIndex;
  128. #if _ICT_DEBUG
  129. char *sSegment;
  130. sSegment=new char[MAX_SENTENCE_LEN*2];
  131. #endif
  132. //Unigram segment
  133. //m_Seg.Segment(sSentence,m_dictCore,nCount);
  134. //Bigram segment
  135. m_Seg.BiSegment(sSentence, m_dSmoothingPara,m_dictCore,m_dictBigram,nCount);
  136. m_nResultCount=m_Seg.m_nSegmentCount;
  137. //Record the number of result
  138. for(nIndex=0;nIndex<m_Seg.m_nSegmentCount;nIndex++)
  139. {
  140. #if _ICT_DEBUG
  141. m_POSTagger.POSTagging(m_Seg.m_pWordSeg[nIndex],m_dictCore,m_dictCore);
  142. Output(m_Seg.m_pWordSeg[nIndex],sSegment);
  143. printf("POS Tag%d:%sn",nIndex+1,sSegment);
  144. #endif
  145. m_uPerson.Recognition(m_Seg.m_pWordSeg[nIndex],m_Seg.m_graphOptimum,m_Seg.m_graphSeg,m_dictCore);
  146. m_uTransPerson.Recognition(m_Seg.m_pWordSeg[nIndex],m_Seg.m_graphOptimum,m_Seg.m_graphSeg,m_dictCore);
  147. m_uPlace.Recognition(m_Seg.m_pWordSeg[nIndex],m_Seg.m_graphOptimum,m_Seg.m_graphSeg,m_dictCore);
  148. }
  149. //m_uPerson.Recognition(m_Seg.m_WordSeg[0],m_Seg.m_graphOptimum,m_Seg.m_graphSeg,m_dictCore);
  150. //Person Recognition
  151. #if _ICT_DEBUG
  152. printf("After person recognition.n");
  153. #endif
  154. //Unigram
  155. //m_Seg.OptimumSegmet(nCount);
  156. //Bigram
  157. m_Seg.BiOptimumSegment(nCount,m_dSmoothingPara,m_dictBigram,m_dictCore);
  158. for(nIndex=0;nIndex<m_Seg.m_nSegmentCount;nIndex++)
  159. {
  160. m_POSTagger.POSTagging(m_Seg.m_pWordSeg[nIndex],m_dictCore,m_dictCore);
  161. #if _ICT_DEBUG
  162. Output(m_Seg.m_pWordSeg[nIndex],sSegment);
  163. printf("POS Tag%d:%sn",nIndex+1,sSegment);
  164. #endif
  165. }
  166. #if _ICT_DEBUG
  167. printf("After Sorting.n");
  168. #endif
  169. Sort();//Sort the ending 
  170. #if _ICT_DEBUG
  171. for(nIndex=0;nIndex<m_Seg.m_nSegmentCount;nIndex++)
  172. {
  173. Output(m_pResult[nIndex],sSegment);
  174. printf("POS Tag%d(P=Exp(%f)):%sn",nIndex+1,m_dResultPossibility[nIndex],sSegment);
  175. }
  176. delete [] sSegment;
  177. #endif
  178. return true;
  179. }
  180. //Sort the segmentation and POS result according its possibility
  181. bool CResult::Sort()
  182. {
  183. ELEMENT_TYPE dPossibility[MAX_SEGMENT_NUM],dTemp;
  184. int nIndex[MAX_SEGMENT_NUM],nTemp;//Index
  185. memset(dPossibility,0,sizeof(dPossibility));
  186. //Init the possibility
  187. for(int i=0;i<m_Seg.m_nSegmentCount;i++)
  188. {//Computing the possibility
  189. dPossibility[i]=ComputePossibility(m_Seg.m_pWordSeg[i]);
  190. nIndex[i]=i;//Record the index
  191. }
  192. //Sort with Bubble sort algorithm
  193. for(i=0;i<m_Seg.m_nSegmentCount;i++)
  194. for(int j=i+1;j<m_Seg.m_nSegmentCount;j++)
  195. {
  196. if(dPossibility[i]<dPossibility[j])
  197. {//Swap the possition and value
  198. nTemp=nIndex[i];
  199. dTemp=dPossibility[i];
  200. nIndex[i]=nIndex[j];
  201. dPossibility[i]=dPossibility[j];
  202. nIndex[j]=nTemp;
  203. dPossibility[j]=dTemp;
  204. }
  205. }
  206. for(i=0;i<m_Seg.m_nSegmentCount;i++)
  207. {//Adjust the segmentation and POS result and store them in the final result array
  208.  //Store them according their possibility ascendly
  209. Adjust(m_Seg.m_pWordSeg[nIndex[i]],m_pResult[i]);
  210. m_dResultPossibility[i]=dPossibility[i];
  211. }
  212. return true;
  213. }
  214. //Compute the possibility of current segmentation and POS result
  215. ELEMENT_TYPE CResult::ComputePossibility(PWORD_RESULT pItem)
  216. {
  217. int i=0;
  218. ELEMENT_TYPE dResultPossibility=0;
  219. while(pItem[i].sWord[0]!=0)
  220. {
  221. dResultPossibility+=pItem[i].dValue;
  222. //Compute the possibility of logP(Wi|Ti)
  223. if(pItem[i+1].sWord[0]!=0)//Not the last one
  224. {//Compute the possibility of logP(Ti|Ti-1)
  225. dResultPossibility+=log((double)(m_POSTagger.m_context.GetContextPossibility(0,pItem[i].nHandle,pItem[i+1].nHandle)+1));
  226. dResultPossibility-=log((double)(m_POSTagger.m_context.GetFrequency(0,pItem[i].nHandle)+1));
  227. }
  228. i++;
  229. }
  230. return dResultPossibility;
  231. }
  232. //Adjust the result with some rules
  233. bool CResult::Adjust(PWORD_RESULT pItem,PWORD_RESULT pItemRet)
  234. {
  235. int i=0,j=0;
  236. unsigned int nLen;
  237. char sSurName[10],sSurName2[10],sGivenName[10];
  238. bool bProcessed=false;//Have been processed
  239. while(pItem[i].sWord[0]!=0)
  240. {
  241. nLen=strlen(pItem[i].sWord);
  242. bProcessed=false;
  243. //Rule1: adjust person name
  244. if(pItem[i].nHandle==28274&&ChineseNameSplit(pItem[i].sWord,sSurName,sSurName2,sGivenName,m_uPerson.m_dict)&&strcmp(pItem[i].sWord,"叶利钦")!=0)//'nr'
  245. {//Divide name into surname and given name
  246. if(sSurName[0])
  247. {
  248. strcpy(pItemRet[j].sWord,sSurName);
  249. pItemRet[j++].nHandle=28274;
  250. }
  251. if(sSurName2[0])
  252. {
  253. strcpy(pItemRet[j].sWord,sSurName2);
  254. pItemRet[j++].nHandle=28274;
  255. }
  256. if(sGivenName[0])
  257. {
  258. strcpy(pItemRet[j].sWord,sGivenName);
  259. pItemRet[j++].nHandle=28274;
  260. }
  261. bProcessed=true;
  262. }
  263. //Rule2 for overlap words ABB 一段段、一片片
  264. else if(pItem[i].nHandle==27904&&strlen(pItem[i+1].sWord)==2&&strcmp(pItem[i+1].sWord,pItem[i+2].sWord)==0)
  265. {//(pItem[i+1].nHandle/256=='q'||pItem[i+1].nHandle/256=='a')&&
  266. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  267. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  268. strcat(pItemRet[j].sWord,pItem[i+2].sWord);
  269. pItemRet[j].nHandle=27904;
  270. j+=1;
  271. i+=2;
  272. bProcessed=true;
  273. }
  274. //Rule3 for overlap words AA
  275. else if(nLen==2&&strcmp(pItem[i].sWord,pItem[i+1].sWord)==0)
  276. {
  277. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  278. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  279.      //24832=='a'*256
  280. pItemRet[j].nHandle=24832;//a
  281. if(pItem[i].nHandle/256=='v'||pItem[i+1].nHandle/256=='v')//30208='v'8256
  282. {
  283. pItemRet[j].nHandle=30208;
  284. }
  285. if(pItem[i].nHandle/256=='n'||pItem[i+1].nHandle/256=='n')//30208='v'8256
  286. {
  287. pItemRet[j].nHandle='n'*256;
  288. }
  289. i+=1;
  290. if(strlen(pItem[i+1].sWord)==2)
  291. {//AAB:洗/洗/脸、蒙蒙亮
  292. if((pItemRet[j].nHandle==30208&&pItem[i+1].nHandle/256=='n')||
  293.    (pItemRet[j].nHandle==24832&&pItem[i+1].nHandle/256=='a')
  294.    )
  295. {
  296. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  297. i+=1;
  298. }
  299. }
  300. j+=1;
  301. bProcessed=true;
  302. }
  303. //Rule 4: AAB 洗/洗澡
  304. else if(nLen==2&&strncmp(pItem[i].sWord,pItem[i+1].sWord,2)==0&&strlen(pItem[i+1].sWord)==4&&(pItem[i].nHandle/256=='v'||pItem[i].nHandle==24832))//v,a
  305. {
  306. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  307. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  308.      //24832=='a'*256
  309. pItemRet[j].nHandle=24832;//'a'
  310. if(pItem[i].nHandle/256=='v'||pItem[i+1].nHandle/256=='v')//30208='v'8256
  311. {
  312. pItemRet[j].nHandle=30208;
  313. }
  314. i+=1;
  315. j+=1;
  316. bProcessed=true;
  317. }
  318. else if(pItem[i].nHandle/256=='u'&&pItem[i].nHandle%256)//uj,ud,uv,uz,ul,ug->u
  319. pItem[i].nHandle='u'*256;
  320. else if(nLen==2&&strncmp(pItem[i].sWord,pItem[i+1].sWord,2)==0&&strlen(pItem[i+1].sWord)==4&&strncmp(pItem[i+1].sWord+2,pItem[i+2].sWord,2)==0)
  321. {//AABB 朴朴素素 枝枝叶叶
  322. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  323. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  324. strcat(pItemRet[j].sWord,pItem[i+2].sWord);
  325. pItemRet[j].nHandle=pItem[i+1].nHandle;
  326. i+=2;
  327. j+=1;
  328. bProcessed=true;
  329. }
  330. else if(pItem[i].nHandle==28275)//PostFix
  331. {
  332. if(m_uPlace.m_dict.IsExist(pItem[i+1].sWord,4))
  333. {
  334. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  335. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  336. pItemRet[j].nHandle=28275;
  337. i+=1;
  338. j+=1;
  339. bProcessed=true;
  340. }
  341. else if(strlen(pItem[i+1].sWord)==2&&CC_Find("队",pItem[i+1].sWord))
  342. {
  343. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  344. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  345. pItemRet[j].nHandle=28276;
  346. i+=1;
  347. j+=1;
  348. bProcessed=true;
  349. }
  350. else if(strlen(pItem[i+1].sWord)==2&&CC_Find("语文字杯",pItem[i+1].sWord))
  351. {
  352. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  353. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  354. pItemRet[j].nHandle=28282;
  355. i+=1;
  356. j+=1;
  357. bProcessed=true;
  358. }
  359. else if(strlen(pItem[i+1].sWord)==2&&CC_Find("裔",pItem[i+1].sWord))
  360. {
  361. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  362. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  363. pItemRet[j].nHandle=28160;
  364. i+=1;
  365. j+=1;
  366. bProcessed=true;
  367. }
  368. }
  369. else if(pItem[i].nHandle==30208||pItem[i].nHandle==28160)//v
  370. {
  371. if(strlen(pItem[i+1].sWord)==2&&CC_Find("员",pItem[i+1].sWord))
  372. {
  373. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  374. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  375. pItemRet[j].nHandle=28160;
  376. i+=1;
  377. j+=1;
  378. bProcessed=true;
  379. }
  380. }
  381. else if(pItem[i].nHandle==28280)
  382. {//www/nx ./w sina/nx; EIM/nx  -601/m 
  383. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  384. pItemRet[j].nHandle=28280;
  385. while(pItem[i+1].nHandle==28280||strstr("..",pItem[i+1].sWord)||(pItem[i+1].nHandle==27904&&IsAllNum((unsigned char *)pItem[i+1].sWord)))
  386. {
  387. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  388. i+=1;
  389. }
  390. j+=1;
  391. bProcessed=true;
  392. }
  393. if(!bProcessed)
  394. {//If not processed,that's mean: not need to adjust;
  395.  //just copy to the final result
  396. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  397. pItemRet[j++].nHandle=pItem[i].nHandle;
  398. }
  399. i++;
  400. }
  401. pItemRet[j].sWord[0]=0;//Set ending
  402. return true;
  403. }
  404. //Paragraph Segment and POS Tagging
  405. bool CResult::ParagraphProcessing(char *sParagraph,char *sResult)
  406. {
  407. char *sSentence,sChar[3];
  408. char *sSentenceResult;
  409. unsigned int nLen=strlen(sParagraph)+13;
  410. sSentence=new char[nLen];//malloc buffer
  411. sSentenceResult=new char[nLen*3];//malloc buffer
  412. sSentence[0]=0;
  413. unsigned int nPosIndex=0,nParagraphLen=strlen(sParagraph),nSentenceIndex=0;
  414. sChar[2]=0;
  415. sResult[0]=0;//Init the result
  416. bool bFirstIgnore=true;
  417. strcpy(sSentence,SENTENCE_BEGIN);//Add a sentence begin flag
  418. while(nPosIndex<nParagraphLen)
  419. {//Find a whole sentence which separated by ! . n r
  420. sChar[0]=sParagraph[nPosIndex];//Get a char
  421. sChar[1]=0;
  422. if(sParagraph[nPosIndex]<0)
  423. {//double byte char
  424. nPosIndex+=1;
  425. sChar[1]=sParagraph[nPosIndex];
  426. }
  427. nPosIndex+=1;
  428. /*
  429. #define  SEPERATOR_C_SENTENCE "。!?:;…"
  430. #define  SEPERATOR_C_SUB_SENTENCE "、,()“”‘’"
  431. #define  SEPERATOR_E_SENTENCE "!?:;"
  432. #define  SEPERATOR_E_SUB_SENTENCE ",()42'"
  433. #define  SEPERATOR_LINK "nr  "
  434. */
  435. if(CC_Find(SEPERATOR_C_SENTENCE,sChar)||CC_Find(SEPERATOR_C_SUB_SENTENCE,sChar)||strstr(SEPERATOR_E_SENTENCE,sChar)||strstr(SEPERATOR_E_SUB_SENTENCE,sChar)||strstr(SEPERATOR_LINK,sChar))
  436. {//Reach end of a sentence.Get a whole sentence
  437. if(!strstr(SEPERATOR_LINK,sChar))//Not link seperator
  438. {
  439. strcat(sSentence,sChar);
  440. }
  441. if(sSentence[0]!=0&&strcmp(sSentence,SENTENCE_BEGIN)!=0)
  442. {
  443. if(!strstr(SEPERATOR_C_SUB_SENTENCE,sChar)&&!strstr(SEPERATOR_E_SUB_SENTENCE,sChar))
  444. strcat(sSentence,SENTENCE_END);//Add sentence ending flag
  445. Processing(sSentence,1);//Processing and output the result of current sentence.
  446. Output(m_pResult[0],sSentenceResult,bFirstIgnore);//Output to the imediate result
  447. //bFirstIgnore=true;
  448. strcat(sResult,sSentenceResult);//Store in the result buffer
  449. }
  450. if(strstr(SEPERATOR_LINK,sChar))//Link the result with the SEPERATOR_LINK
  451. {
  452. strcat(sResult,sChar);
  453. strcpy(sSentence,SENTENCE_BEGIN);//Add a sentence begin flag
  454. //sSentence[0]=0;//New sentence, and begin new segmentation
  455. //bFirstIgnore=false;
  456. }
  457. else if(strstr(SEPERATOR_C_SENTENCE,sChar)||strstr(SEPERATOR_E_SENTENCE,sChar))
  458. {
  459. strcpy(sSentence,SENTENCE_BEGIN);//Add a sentence begin flag
  460. //sSentence[0]=0;//New sentence, and begin new segmentation
  461. //bFirstIgnore=false;
  462. }
  463. else
  464. {
  465. strcpy(sSentence,sChar);//reset current sentence, and add the previous end at begin position
  466. }
  467. }
  468. else //Other chars and store in the sentence buffer
  469. strcat(sSentence,sChar);
  470. }
  471. if(sSentence[0]!=0&&strcmp(sSentence,SENTENCE_BEGIN)!=0)
  472. {
  473. strcat(sSentence,SENTENCE_END);//Add sentence ending flag
  474. Processing(sSentence,1);//Processing and output the result of current sentence.
  475. Output(m_pResult[0],sSentenceResult,bFirstIgnore);//Output to the imediate result
  476. strcat(sResult,sSentenceResult);//Store in the result buffer
  477. }
  478. delete []  sSentence;//FREE sentence buffer 
  479. delete []  sSentenceResult;//free buffer
  480. return true;
  481. }
  482. bool CResult::FileProcessing(char *sSourceFile,char *sResultFile)
  483. {
  484. FILE *fpSource,*fpResult;//The file pointer of read and write
  485. char *sParagraph,*sParagraphResult;
  486. int nLineIndex=1;
  487. sParagraph=new char[4*1024];
  488. sParagraphResult=new char[8*1024];
  489.     if((fpSource=fopen(sSourceFile,"rt"))==NULL)
  490. return false;//Cannot open the source file to read
  491.     if((fpResult=fopen(sResultFile,"wt"))==NULL) 
  492. return false;//Cannot open the result  file to write
  493. if(m_nOutputFormat==2)//XML format
  494. fprintf(fpResult,"<?xml version=42 1.042 encoding=42gb231242?><result>");
  495. while(!feof(fpSource))
  496. {
  497. if(fgets(sParagraph,4*1024,fpSource)==0)//Get a paragrah
  498. continue;
  499. TRACE("%dn",nLineIndex++);
  500. ParagraphProcessing(sParagraph,sParagraphResult);
  501. fprintf(fpResult,"%s",sParagraphResult);
  502. }
  503. delete [] sParagraph;
  504. delete [] sParagraphResult;
  505. fclose(fpSource);
  506. if(m_nOutputFormat==2)//XML format
  507. fprintf(fpResult,"</result>");
  508. fclose(fpResult);
  509. return true;
  510. }
  511. bool CResult::PKU2973POS(int nHandle, char *sPOS973)
  512. {
  513. int nHandleSet[46]={24832,24932,24935,24942,25088,25344,25600,25703,25856,26112,26368,26624,26880,27136,27392,27648,27904,28160,28263,28274,28275,28276,28280,28282,28416,28672,28928,29184,29440,29696,29799,29952,30052,30055,30058,30060,30070,30074,30208,30308,30311,30318,30464,30720,30976,31232};
  514. //   "a", "ad","ag","an","b", "c", "d", "dg","e", "f","g", "h", "i", "j", "k", "l", "m", "n", "ng","nr","ns","nt","nx","nz","o", "p", "q", "r", "s", "t", "tg","u", "ud","ug","uj","ul","uv","uz","v", "vd","vg","vn","w", "x", "y", "z"
  515. char sPOSRelated[46][3]={"a", "ad","ga","an","f", "c", "d", "d", "e","nd","g", "h", "i", "j", "k", "l", "m", "n", "gn","nh","ns","ni","ws", "nz","o", "p", "q", "r", "nl","nt","gt","u", "ud","ug","uj","ul","uv","uz","v", "vd","gv","vn","w", "x", "u", "a"};
  516. /* 
  517.  "Bg","gf",
  518.  "Rg","gr",
  519.  "Mg","gm",
  520.  "Yg","u",
  521.  "Ug","u",
  522.  "Qg","q",
  523. */
  524. int nIndex=BinarySearch(nHandle,nHandleSet,46);
  525. if(nIndex==-1)
  526. strcpy(sPOS973,"@");
  527. else
  528. strcpy(sPOS973,sPOSRelated[nIndex]);
  529. return true;
  530. }
  531. bool CResult::ChineseNameSplit(char *sPersonName, char *sSurname, char *sSurname2, char *sGivenName, CDictionary &personDict)
  532. {
  533. int nSurNameLen=4,nLen=strlen(sPersonName),nFreq,i=0,nCharType,nFreqGiven;
  534. char sTemp[3];
  535. if(nLen<3||nLen>8)//Not a traditional Chinese person name
  536. return false;
  537. while(i<nLen)//No Including non-CHinese char
  538. {
  539. nCharType=charType((unsigned char*)sPersonName+i);
  540. if(nCharType!=CT_CHINESE&&nCharType!=CT_OTHER)
  541. return false;
  542. i+=2;
  543. }
  544. sSurname2[0]=0;//init 
  545. strncpy(sSurname,sPersonName,nSurNameLen);
  546. sSurname[nSurNameLen]=0;
  547. if(!personDict.IsExist(sSurname,1))
  548. {
  549. nSurNameLen=2;
  550. sSurname[nSurNameLen]=0;
  551. if(!personDict.IsExist(sSurname,1))
  552. {
  553. nSurNameLen=0;
  554. sSurname[nSurNameLen]=0;
  555. }
  556. }
  557. strcpy(sGivenName,sPersonName+nSurNameLen);
  558. if(nLen>6)
  559. {
  560. strncpy(sTemp,sPersonName+nSurNameLen,2);
  561. sTemp[2]=0;//Get the second possible surname
  562. if(personDict.IsExist(sTemp,1))
  563. {//Hongkong women's name: Surname+surname+given name
  564. strcpy(sSurname2,sTemp);
  565. strcpy(sGivenName,sPersonName+nSurNameLen+2);
  566. }
  567. }
  568. nFreq=personDict.GetFrequency(sSurname,1);
  569. strncpy(sTemp,sGivenName,2);
  570. sTemp[2]=0;
  571. nFreqGiven=personDict.GetFrequency(sTemp,2);
  572. if(nSurNameLen!=4&&((nSurNameLen==0&&nLen>4)||strlen(sGivenName)>4||(GetForeignCharCount(sPersonName)>=3&&nFreq<personDict.GetFrequency("张",1)/40&&nFreqGiven<personDict.GetFrequency("华",2)/20)||(nFreq<10&&GetForeignCharCount(sGivenName)==(nLen-nSurNameLen)/2)))
  573. return false;
  574. if(nLen==4&&m_uPerson.IsGivenName(sPersonName))
  575. {//Single Surname+given name
  576. return false;
  577. }
  578. return true;
  579. }