Result.cpp
上传用户:chen_dj
上传日期:2013-04-22
资源大小:111k
文件大小:16k
源码类别:

多国语言处理

开发平台:

C/C++

  1. // Result.cpp: implementation of the CResult class.
  2. //
  3. //////////////////////////////////////////////////////////////////////
  4. #include "stdafx.h"
  5. #include "Result.h"
  6. #include <string.h>
  7. #include <stdio.h>
  8. #include <math.h>
  9. #include "..\Utility\Utility.h"
  10. //////////////////////////////////////////////////////////////////////
  11. // Construction/Destruction
  12. //////////////////////////////////////////////////////////////////////
  13. CResult::CResult()
  14. {
  15. //malloc buffer
  16. m_pResult=new PWORD_RESULT[MAX_SEGMENT_NUM];
  17. for(int i=0;i<MAX_SEGMENT_NUM;i++)
  18. {
  19. m_pResult[i]=new WORD_RESULT[MAX_WORDS];
  20. }
  21. m_dictCore.Load("data\coreDict.dct");
  22. m_POSTagger.LoadContext("data\lexical.ctx");
  23. /*
  24. m_dictCore.Load("data\Dict.dct");
  25. m_POSTagger.LoadContext("data\trainTest.ctx");
  26. */
  27. /*
  28. m_dictCore.AddItem("十分",'d'*256,500);
  29. m_dictCore.AddItem("十分",'m'*256,-500);
  30. m_dictCore.Save("data\coreDict.dct");
  31. */
  32. m_POSTagger.SetTagType();
  33. m_Person.Configure("data\Person",TT_PERSON);
  34. //Set the person recognition configure
  35. m_Transliteration.Configure("data\Trans",TT_TRANS);
  36. //Set the transliteration recognition configure
  37. m_Place.Configure("data\Place",TT_PLACE);
  38. //Set the place recognition configure
  39. m_nOperateType=2;//0:Only Segment;1: First Tag; 2:Second Type
  40. m_nOutputFormat=0;//0:PKU criterion;1:973 criterion; 2: XML criterion
  41. }
  42. CResult::~CResult()
  43. {
  44. //free buffer
  45. for(int i=0;i<MAX_SEGMENT_NUM;i++)
  46. {
  47. delete [] m_pResult[i];
  48. }
  49. delete m_pResult;
  50. }
  51. bool CResult::Output(PWORD_RESULT pItem, char *sResult,bool bFirstWordIgnore)
  52. {
  53. int i=0;
  54. char sTempBuffer[WORD_MAXLENGTH],sPOS[3];
  55. sPOS[2]=0;
  56. sResult[0]=0;
  57. if(bFirstWordIgnore)//Ignore first valid
  58. i=1;
  59. while(pItem[i].sWord[0]!=0&&pItem[i].nHandle!=CT_SENTENCE_END)//Not sentence ending flag
  60. {
  61. //Get the POS string
  62. if(m_nOutputFormat!=0)//Not PKU format
  63. PKU2973POS(pItem[i].nHandle,sPOS);
  64. else//PKU format
  65. {
  66. sPOS[0]=pItem[i].nHandle/256;
  67. sPOS[1]=pItem[i].nHandle%256;
  68. }
  69. sPOS[m_nOperateType]=0;//Set the sPOS with operate type
  70. if(m_nOutputFormat==0)//PKU format
  71. {
  72. sprintf(sTempBuffer,"%s",pItem[i].sWord);
  73.     strcat(sResult,sTempBuffer);
  74. if(sPOS[0]!=0)//need POS 
  75. {
  76. sprintf(sTempBuffer,"/%s",sPOS);
  77. strcat(sResult,sTempBuffer);
  78. }
  79.     strcat(sResult,"  ");
  80. }
  81. else if(m_nOutputFormat==1)//973 format
  82. {
  83. sprintf(sTempBuffer,"%s\",pItem[i].sWord);
  84.     strcat(sResult,sTempBuffer);
  85. if(sPOS[0]!=0)//need POS 
  86. {
  87. sprintf(sTempBuffer,"[%s]",sPOS);
  88. strcat(sResult,sTempBuffer);
  89. }
  90. }
  91. else if(m_nOutputFormat==2)//XML format
  92. {
  93. if(sPOS[0]!=0)//POS
  94. {
  95. sprintf(sTempBuffer,"<any type=42%s42>",sPOS);
  96. strcat(sResult,sTempBuffer);
  97. }
  98. sprintf(sTempBuffer,"<src>%s</src>",pItem[i].sWord);
  99. strcat(sResult,sTempBuffer);
  100. if(sPOS[0]!=0)
  101. {
  102. strcat(sResult,"</any>");
  103. }
  104. }
  105. i++;
  106. }
  107. return true;
  108. }
  109. bool CResult::Processing(char *sSentence,unsigned int nCount)
  110. {
  111. int nIndex;
  112. #if _ICT_DEBUG
  113. char *sSegment;
  114. sSegment=new char[MAX_SENTENCE_LEN*2];
  115. #endif
  116. m_Seg.Segment(sSentence,m_dictCore,nCount);
  117. m_nResultCount=m_Seg.m_nSegmentCount;
  118. //Record the number of result
  119. for(nIndex=0;nIndex<m_Seg.m_nSegmentCount;nIndex++)
  120. {
  121. m_POSTagger.POSTagging(m_Seg.m_pWordSeg[nIndex],m_dictCore,m_dictCore);
  122. #if _ICT_DEBUG
  123. Output(m_Seg.m_pWordSeg[nIndex],sSegment);
  124. printf("POS Tag%d:%sn",nIndex+1,sSegment);
  125. #endif
  126. m_Person.Recognition(m_Seg.m_pWordSeg[nIndex],m_Seg.m_graphOptimum,m_Seg.m_graphSeg,m_dictCore);
  127. m_Transliteration.Recognition(m_Seg.m_pWordSeg[nIndex],m_Seg.m_graphOptimum,m_Seg.m_graphSeg,m_dictCore);
  128. m_Place.Recognition(m_Seg.m_pWordSeg[nIndex],m_Seg.m_graphOptimum,m_Seg.m_graphSeg,m_dictCore);
  129. }
  130. //m_Person.Recognition(m_Seg.m_WordSeg[0],m_Seg.m_graphOptimum,m_Seg.m_graphSeg,m_dictCore);
  131. //Person Recognition
  132. #if _ICT_DEBUG
  133. printf("After person recognition.n");
  134. #endif
  135. m_Seg.OptimumSegmet(nCount);
  136. for(nIndex=0;nIndex<m_Seg.m_nSegmentCount;nIndex++)
  137. {
  138. m_POSTagger.POSTagging(m_Seg.m_pWordSeg[nIndex],m_dictCore,m_dictCore);
  139. #if _ICT_DEBUG
  140. Output(m_Seg.m_pWordSeg[nIndex],sSegment);
  141. printf("POS Tag%d:%sn",nIndex+1,sSegment);
  142. #endif
  143. }
  144. #if _ICT_DEBUG
  145. printf("After Sorting.n");
  146. #endif
  147. Sort();//Sort the ending 
  148. #if _ICT_DEBUG
  149. for(nIndex=0;nIndex<m_Seg.m_nSegmentCount;nIndex++)
  150. {
  151. Output(m_pResult[nIndex],sSegment);
  152. printf("POS Tag%d(P=Exp(%f)):%sn",nIndex+1,m_dResultPossibility[nIndex],sSegment);
  153. }
  154. delete [] sSegment;
  155. #endif
  156. return true;
  157. }
  158. //Sort the segmentation and POS result according its possibility
  159. bool CResult::Sort()
  160. {
  161. ELEMENT_TYPE dPossibility[MAX_SEGMENT_NUM],dTemp;
  162. int nIndex[MAX_SEGMENT_NUM],nTemp;//Index
  163. memset(dPossibility,0,sizeof(dPossibility));
  164. //Init the possibility
  165. for(int i=0;i<m_Seg.m_nSegmentCount;i++)
  166. {//Computing the possibility
  167. dPossibility[i]=ComputePossibility(m_Seg.m_pWordSeg[i]);
  168. nIndex[i]=i;//Record the index
  169. }
  170. //Sort with Bubble sort algorithm
  171. for(i=0;i<m_Seg.m_nSegmentCount;i++)
  172. for(int j=i+1;j<m_Seg.m_nSegmentCount;j++)
  173. {
  174. if(dPossibility[i]<dPossibility[j])
  175. {//Swap the possition and value
  176. nTemp=nIndex[i];
  177. dTemp=dPossibility[i];
  178. nIndex[i]=nIndex[j];
  179. dPossibility[i]=dPossibility[j];
  180. nIndex[j]=nTemp;
  181. dPossibility[j]=dTemp;
  182. }
  183. }
  184. for(i=0;i<m_Seg.m_nSegmentCount;i++)
  185. {//Adjust the segmentation and POS result and store them in the final result array
  186.  //Store them according their possibility ascendly
  187. Adjust(m_Seg.m_pWordSeg[nIndex[i]],m_pResult[i]);
  188. m_dResultPossibility[i]=dPossibility[i];
  189. }
  190. return true;
  191. }
  192. //Compute the possibility of current segmentation and POS result
  193. ELEMENT_TYPE CResult::ComputePossibility(PWORD_RESULT pItem)
  194. {
  195. int i=0;
  196. ELEMENT_TYPE dResultPossibility=0;
  197. while(pItem[i].sWord[0]!=0)
  198. {
  199. dResultPossibility+=pItem[i].dValue;
  200. //Compute the possibility of logP(Wi|Ti)
  201. if(pItem[i+1].sWord[0]!=0)//Not the last one
  202. {//Compute the possibility of logP(Ti|Ti-1)
  203. dResultPossibility+=log((double)(m_POSTagger.m_context.GetContextPossibility(0,pItem[i].nHandle,pItem[i+1].nHandle)+1));
  204. dResultPossibility-=log((double)(m_POSTagger.m_context.GetFrequency(0,pItem[i].nHandle)+1));
  205. }
  206. i++;
  207. }
  208. return dResultPossibility;
  209. }
  210. //Adjust the result with some rules
  211. bool CResult::Adjust(PWORD_RESULT pItem,PWORD_RESULT pItemRet)
  212. {
  213. int i=0,j=0;
  214. unsigned int nLen;
  215. char sSurName[10],sSurName2[10],sGivenName[10];
  216. bool bProcessed=false;//Have been processed
  217. while(pItem[i].sWord[0]!=0)
  218. {
  219. nLen=strlen(pItem[i].sWord);
  220. bProcessed=false;
  221. //Rule1: adjust person name
  222. if(pItem[i].nHandle==28274&&ChineseNameSplit(pItem[i].sWord,sSurName,sSurName2,sGivenName,m_Person.m_dict)&&strcmp(pItem[i].sWord,"叶利钦")!=0)//'nr'
  223. {//Divide name into surname and given name
  224. if(sSurName[0])
  225. {
  226. strcpy(pItemRet[j].sWord,sSurName);
  227. pItemRet[j++].nHandle=28274;
  228. }
  229. if(sSurName2[0])
  230. {
  231. strcpy(pItemRet[j].sWord,sSurName2);
  232. pItemRet[j++].nHandle=28274;
  233. }
  234. if(sGivenName[0])
  235. {
  236. strcpy(pItemRet[j].sWord,sGivenName);
  237. pItemRet[j++].nHandle=28274;
  238. }
  239. bProcessed=true;
  240. }
  241. //Rule2 for overlap words ABB 一段段、一片片
  242. else if(pItem[i].nHandle==256*'m'&&pItem[i+1].nHandle/256=='q'&&strcmp(pItem[i+1].sWord,pItem[i+2].sWord)==0)
  243. {
  244. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  245. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  246. strcat(pItemRet[j].sWord,pItem[i+2].sWord);
  247. pItemRet[j].nHandle=256*'m';
  248. j+=1;
  249. i+=2;
  250. bProcessed=true;
  251. }
  252. //Rule3 for overlap words AA or AABB
  253. else if(nLen==2&&strcmp(pItem[i].sWord,pItem[i+1].sWord)==0)
  254. {
  255. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  256. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  257.      //24832=='a'*256
  258. pItemRet[j].nHandle=24832;//a
  259. if(pItem[i].nHandle/256=='v'||pItem[i+1].nHandle/256=='v')//30208='v'8256
  260. {
  261. pItemRet[j].nHandle=30208;
  262. }
  263. if(pItem[i].nHandle/256=='n'||pItem[i+1].nHandle/256=='n')//30208='v'8256
  264. {
  265. pItemRet[j].nHandle='n'*256;
  266. }
  267. i+=1;
  268. if(strlen(pItem[i+1].sWord)==2)
  269. {//AAB:洗/洗/脸、蒙蒙亮
  270. if((pItemRet[j].nHandle==30208&&pItem[i+1].nHandle/256=='n')||
  271.    (pItemRet[j].nHandle==24832&&pItem[i+1].nHandle/256=='a')
  272.    )
  273. {
  274. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  275. i+=1;
  276. }
  277. }
  278. j+=1;
  279. bProcessed=true;
  280. }
  281. //Rule 4: AAB 洗/洗澡
  282. else if(nLen==2&&strncmp(pItem[i].sWord,pItem[i+1].sWord,2)==0&&strlen(pItem[i+1].sWord)==4&&(pItem[i].nHandle/256=='v'||pItem[i].nHandle/256=='a'))
  283. {
  284. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  285. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  286.      //24832=='a'*256
  287. pItemRet[j].nHandle=24832;//'a'
  288. if(pItem[i].nHandle/256=='v'||pItem[i+1].nHandle/256=='v')//30208='v'8256
  289. {
  290. pItemRet[j].nHandle=30208;
  291. }
  292. i+=1;
  293. j+=1;
  294. bProcessed=true;
  295. }
  296. else if(pItem[i].nHandle/256=='u'&&pItem[i].nHandle%256)//uj,ud,uv,uz,ul,ug->u
  297. pItem[i].nHandle='u'*256;
  298. else if(pItem[i].nHandle/256=='s'&&pItem[i].nHandle%256=='s')//ss->nx
  299. pItem[i].nHandle='n'*256+'x';
  300. else if(pItem[i].nHandle==28275&&m_Transliteration.m_dict.IsExist(pItem[i+1].sWord,30))
  301. {
  302. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  303. strcat(pItemRet[j].sWord,pItem[i+1].sWord);
  304. pItemRet[j].nHandle=28275;
  305. i+=1;
  306. j+=1;
  307. bProcessed=true;
  308. }
  309. if(!bProcessed)
  310. {//If not processed,that's mean: not need to adjust;
  311.  //just copy to the final result
  312. strcpy(pItemRet[j].sWord,pItem[i].sWord);
  313. pItemRet[j++].nHandle=pItem[i].nHandle;
  314. }
  315. i++;
  316. }
  317. pItemRet[j].sWord[0]=0;//Set ending
  318. return true;
  319. }
  320. //Paragraph Segment and POS Tagging
  321. bool CResult::ParagraphProcessing(char *sParagraph,char *sResult)
  322. {
  323. char *sSentence,sChar[3];
  324. char *sSentenceResult;
  325. unsigned int nLen=strlen(sParagraph)+13;
  326. sSentence=new char[nLen];//malloc buffer
  327. sSentenceResult=new char[nLen*3];//malloc buffer
  328. sSentence[0]=0;
  329. unsigned int nPosIndex=0,nParagraphLen=strlen(sParagraph),nSentenceIndex=0;
  330. sChar[2]=0;
  331. sResult[0]=0;//Init the result
  332. bool bFirstIgnore=true;
  333. strcpy(sSentence,SENTENCE_BEGIN);//Add a sentence begin flag
  334. while(nPosIndex<nParagraphLen)
  335. {//Find a whole sentence which separated by ! . n r
  336. sChar[0]=sParagraph[nPosIndex];//Get a char
  337. sChar[1]=0;
  338. if(sParagraph[nPosIndex]<0)
  339. {//double byte char
  340. nPosIndex+=1;
  341. sChar[1]=sParagraph[nPosIndex];
  342. }
  343. nPosIndex+=1;
  344. /*
  345. #define  SEPERATOR_C_SENTENCE "。!?:;…"
  346. #define  SEPERATOR_C_SUB_SENTENCE "、,()“”‘’"
  347. #define  SEPERATOR_E_SENTENCE "!?:;"
  348. #define  SEPERATOR_E_SUB_SENTENCE ",()42'"
  349. #define  SEPERATOR_LINK "nr  "
  350. */
  351. if(CC_Find(SEPERATOR_C_SENTENCE,sChar)||CC_Find(SEPERATOR_C_SUB_SENTENCE,sChar)||strstr(SEPERATOR_E_SENTENCE,sChar)||strstr(SEPERATOR_E_SUB_SENTENCE,sChar)||strstr(SEPERATOR_LINK,sChar))
  352. {//Reach end of a sentence.Get a whole sentence
  353. if(!strstr(SEPERATOR_LINK,sChar))//Not link seperator
  354. {
  355. strcat(sSentence,sChar);
  356. }
  357. if(sSentence[0]!=0)
  358. {
  359. if(!strstr(SEPERATOR_C_SUB_SENTENCE,sChar)&&!strstr(SEPERATOR_E_SUB_SENTENCE,sChar))
  360. strcat(sSentence,SENTENCE_END);//Add sentence ending flag
  361. Processing(sSentence,1);//Processing and output the result of current sentence.
  362. Output(m_pResult[0],sSentenceResult,bFirstIgnore);//Output to the imediate result
  363. //bFirstIgnore=true;
  364. strcat(sResult,sSentenceResult);//Store in the result buffer
  365. }
  366. if(strstr(SEPERATOR_LINK,sChar))//Link the result with the SEPERATOR_LINK
  367. {
  368. strcat(sResult,sChar);
  369. strcpy(sSentence,SENTENCE_BEGIN);//Add a sentence begin flag
  370. //sSentence[0]=0;//New sentence, and begin new segmentation
  371. //bFirstIgnore=false;
  372. }
  373. else if(strstr(SEPERATOR_C_SENTENCE,sChar)||strstr(SEPERATOR_E_SENTENCE,sChar))
  374. {
  375. strcpy(sSentence,SENTENCE_BEGIN);//Add a sentence begin flag
  376. //sSentence[0]=0;//New sentence, and begin new segmentation
  377. //bFirstIgnore=false;
  378. }
  379. else
  380. {
  381. strcpy(sSentence,sChar);//reset current sentence, and add the previous end at begin position
  382. }
  383. }
  384. else //Other chars and store in the sentence buffer
  385. strcat(sSentence,sChar);
  386. }
  387. if(sSentence[0]!=0&&strcmp(sSentence,SENTENCE_BEGIN)!=0)
  388. {
  389. strcat(sSentence,SENTENCE_END);//Add sentence ending flag
  390. Processing(sSentence,1);//Processing and output the result of current sentence.
  391. Output(m_pResult[0],sSentenceResult,bFirstIgnore);//Output to the imediate result
  392. strcat(sResult,sSentenceResult);//Store in the result buffer
  393. }
  394. delete []  sSentence;//FREE sentence buffer 
  395. delete []  sSentenceResult;//free buffer
  396. return true;
  397. }
  398. bool CResult::FileProcessing(char *sSourceFile,char *sResultFile)
  399. {
  400. FILE *fpSource,*fpResult;//The file pointer of read and write
  401. char *sParagraph,*sParagraphResult;
  402. sParagraph=new char[4*1024];
  403. sParagraphResult=new char[8*1024];
  404.     if((fpSource=fopen(sSourceFile,"rt"))==NULL)
  405. return false;//Cannot open the source file to read
  406.     if((fpResult=fopen(sResultFile,"wt"))==NULL) 
  407. return false;//Cannot open the result  file to write
  408. if(m_nOutputFormat==2)//XML format
  409. fprintf(fpResult,"<?xml version=42 1.042 encoding=42gb231242?><result>");
  410. while(!feof(fpSource))
  411. {
  412. if(fgets(sParagraph,4*1024,fpSource)==0)//Get a paragrah
  413. continue;
  414. ParagraphProcessing(sParagraph,sParagraphResult);
  415. fprintf(fpResult,"%s",sParagraphResult);
  416. }
  417. delete [] sParagraph;
  418. delete [] sParagraphResult;
  419. fclose(fpSource);
  420. if(m_nOutputFormat==2)//XML format
  421. fprintf(fpResult,"</result>");
  422. fclose(fpResult);
  423. return true;
  424. }
  425. bool CResult::PKU2973POS(int nHandle, char *sPOS973)
  426. {
  427. int nHandleSet[46]={24832,24932,24935,24942,25088,25344,25600,25703,25856,26112,26368,26624,26880,27136,27392,27648,27904,28160,28263,28274,28275,28276,28280,28282,28416,28672,28928,29184,29440,29696,29799,29952,30052,30055,30058,30060,30070,30074,30208,30308,30311,30318,30464,30720,30976,31232};
  428. //   "a", "ad","ag","an","b", "c", "d", "dg","e", "f","g", "h", "i", "j", "k", "l", "m", "n", "ng","nr","ns","nt","nx","nz","o", "p", "q", "r", "s", "t", "tg","u", "ud","ug","uj","ul","uv","uz","v", "vd","vg","vn","w", "x", "y", "z"
  429. char sPOSRelated[46][3]={"a", "ad","ga","an","f", "c", "d", "d", "e","nd","g", "h", "i", "j", "k", "l", "m", "n", "gn","nh","ns","ni","ws", "nz","o", "p", "q", "r", "nl","nt","gt","u", "ud","ug","uj","ul","uv","uz","v", "vd","gv","vn","w", "x", "u", "a"};
  430. /* "Bg","gf",
  431.  "Rg","gr",
  432.  "Mg","gm",
  433.  "Yg","u",
  434.  "Ug","u",
  435.  "Qg","q",
  436. */
  437. int nIndex=BinarySearch(nHandle,nHandleSet,46);
  438. if(nIndex==-1)
  439. strcpy(sPOS973,"@");
  440. else
  441. strcpy(sPOS973,sPOSRelated[nIndex]);
  442. return true;
  443. }
  444. bool CResult::ChineseNameSplit(char *sPersonName, char *sSurname, char *sSurname2, char *sGivenName, CDictionary &personDict)
  445. {
  446. int nSurNameLen=4,nLen=strlen(sPersonName),nFreq,i=0,nCharType;
  447. char sTemp[3];
  448. if(nLen<3||nLen>8)//Not a traditional Chinese person name
  449. return false;
  450. while(i<nLen)//No Including non-CHinese char
  451. {
  452. nCharType=charType((unsigned char*)sPersonName+i);
  453. if(nCharType!=CT_CHINESE&&nCharType!=CT_OTHER)
  454. return false;
  455. i+=2;
  456. }
  457. sSurname2[0]=0;//init 
  458. strncpy(sSurname,sPersonName,nSurNameLen);
  459. sSurname[nSurNameLen]=0;
  460. if(!personDict.IsExist(sSurname,1))
  461. {
  462. nSurNameLen=2;
  463. sSurname[nSurNameLen]=0;
  464. if(!personDict.IsExist(sSurname,1))
  465. {
  466. nSurNameLen=0;
  467. sSurname[nSurNameLen]=0;
  468. }
  469. }
  470. strcpy(sGivenName,sPersonName+nSurNameLen);
  471. if(nLen>6)
  472. {
  473. strncpy(sTemp,sPersonName+nSurNameLen,2);
  474. sTemp[2]=0;//Get the second possible surname
  475. if(personDict.IsExist(sTemp,1))
  476. {//Hongkong women's name: Surname+surname+given name
  477. strcpy(sSurname2,sTemp);
  478. strcpy(sGivenName,sPersonName+nSurNameLen+2);
  479. }
  480. }
  481. nFreq=personDict.GetFrequency(sSurname,1);
  482. if(nSurNameLen!=4&&((nSurNameLen==0&&nLen>4)||strlen(sGivenName)>4||(GetForeignCharCount(sPersonName)==3&&nFreq<personDict.GetFrequency("张",1)/4)||(nFreq<10&&GetForeignCharCount(sGivenName)==(nLen-nSurNameLen)/2)))
  483. return false;
  484. if(nLen==4&&m_Person.IsGivenName(sPersonName))
  485. {//Single Surname+given name
  486. return false;
  487. }
  488. return true;
  489. }