Span.cpp
上传用户:chen_dj
上传日期:2013-04-22
资源大小:111k
文件大小:27k
源码类别:

多国语言处理

开发平台:

C/C++

  1. /****************************************************************************
  2.  *
  3.  * Copyright (c) 2000, 2001 
  4.  *     Machine Group
  5.  *     Software Research Lab.
  6.  *     Institute of Computing Tech.
  7.  *     Chinese Academy of Sciences
  8.  *     All rights reserved.
  9.  *
  10.  * This file is the confidential and proprietary property of 
  11.  * Institute of Computing Tech. and the possession or use of this file requires 
  12.  * a written license from the author.
  13.  * Filename: Span.cpp
  14.  * Abstract:
  15.  *           implementation of the CSpan class.
  16.  * Author:   Kevin Zhang 
  17.  *          (zhanghp@software.ict.ac.cn)
  18.  * Date:     2002-4-23
  19.  *
  20.  * Notes:    Tagging with Hidden Markov Model
  21.  *                
  22.  ****************************************************************************/
  23. #include "stdafx.h"
  24. #include "Span.h"
  25. #include "..\Segment\Segment.h"
  26. #include "..\Utility\Utility.h"
  27. #include <math.h>
  28. #include <string.h>
  29. #include <stdio.h>
  30. #include <time.h>
  31. //////////////////////////////////////////////////////////////////////
  32. // Construction/Destruction
  33. //////////////////////////////////////////////////////////////////////
  34. CSpan::CSpan()
  35. {
  36. if(m_tagType!=TT_NORMAL)
  37.       m_nTags[0][0]=100;//Begin tag
  38. else
  39.       m_nTags[0][0]=0;//Begin tag
  40. m_nTags[0][1]=-1;
  41. m_dFrequency[0][0]=0;
  42. m_nCurLength=1;
  43. m_nUnknownIndex=0;
  44. m_nStartPos=0;
  45. m_nWordPosition[1]=0;
  46. m_sWords[0][0]=0;
  47. m_tagType=TT_NORMAL;//Default tagging type
  48. }
  49. CSpan::~CSpan()
  50. {
  51. }
  52. bool CSpan::Disamb()
  53. {
  54. int i,j,k,nMinCandidate;
  55. double dMinFee,dTmp;
  56. for(i=1;i<m_nCurLength;i++)//For every word
  57. {
  58. for(j=0;m_nTags[i][j]>=0;j++)//For every word
  59. {
  60. nMinCandidate=MAX_POS_PER_WORD+1;
  61. for(k=0;m_nTags[i-1][k]>=0;k++)
  62. {
  63. //ConvertPOS(m_nTags[i-1][k],&nKey,&nPrevPOS);
  64. //ConvertPOS(m_nTags[i][j],&nKey,&nCurPOS);
  65. //dTmp=m_context.GetContextPossibility(nKey,nPrevPOS,nCurPOS);
  66. dTmp=-log(m_context.GetContextPossibility(0,m_nTags[i-1][k],m_nTags[i][j]));
  67. dTmp+=m_dFrequency[i-1][k];//Add the fees
  68. if(nMinCandidate>10||dTmp<dMinFee)//Get the minimum fee
  69. {
  70. nMinCandidate=k;
  71. dMinFee=dTmp;
  72. }
  73. }
  74. m_nBestPrev[i][j]=nMinCandidate;//The best previous for j
  75. m_dFrequency[i][j]=m_dFrequency[i][j]+dMinFee;
  76. }
  77. }
  78. return true;
  79. }
  80. bool CSpan::Reset(bool bContinue)
  81. {
  82. if(!bContinue)
  83. {//||CC_Find("。!”〕〉》」〗】",m_sWords[m_nCurLength-1])
  84. if(m_tagType!=TT_NORMAL)//Get the last POS in the last sentence
  85.       m_nTags[0][0]=100;//Begin tag
  86. else
  87.       m_nTags[0][0]=0;//Begin tag
  88. m_nUnknownIndex=0;
  89. m_dFrequency[0][0]=0;
  90. m_nStartPos=0;
  91. }
  92. else
  93. {
  94. m_nTags[0][0]=m_nTags[m_nCurLength-1][0];//Get the last POS in the last sentence
  95. m_dFrequency[0][0]=m_dFrequency[m_nCurLength-1][0];
  96. }
  97.     m_nTags[0][1]=-1;//Get the last POS in the last sentence,set the -1 as end flag
  98. m_nCurLength=1;
  99. m_nWordPosition[1]=m_nStartPos;
  100. m_sWords[0][0]=0;
  101. return true;
  102. }
  103. bool CSpan::LoadContext(char *sFilename)
  104. {
  105. return m_context.Load(sFilename);
  106. }
  107. bool CSpan::UnknownMatch()
  108. {//Find the template
  109.   char sPOS[MAX_WORDS_PER_SENTENCE]="z";
  110.   int nStart,nEnd;
  111.   for(int i=1;m_nBestTag[i]>-1;i++)
  112. sPOS[i]=m_nBestTag[i]+'a';
  113.   sPOS[i]=0;
  114.   char *pFind=strchr(sPOS+1,'b');
  115.   char *pFindEnd;
  116.   while(pFind!=NULL&&*pFind!=NULL)
  117.   {
  118.     pFindEnd=pFind+1;
  119.     while(pFindEnd!=NULL&&*pFindEnd!=NULL&&(*pFindEnd=='c'||*pFindEnd=='d'))
  120.        pFindEnd=pFindEnd+1;
  121.     nStart=pFind-sPOS;
  122.     nEnd=pFindEnd-sPOS;
  123. //Save the unknown word position
  124.     m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[nStart];
  125. m_nUnknownWords[m_nUnknownIndex++][1]=m_nWordPosition[nEnd];
  126. pFind=strchr(pFindEnd,'b');
  127.   }
  128.   return true;
  129. }
  130. bool CSpan::GetBestPOS()
  131. {
  132.   Disamb();
  133.   for(int i=m_nCurLength-1,j=0;i>0;i--)
  134.   {
  135.  if(m_sWords[i][0])
  136.  {//Not virtual ending
  137.  m_nBestTag[i]=m_nTags[i][j];//Record the best POS and its possibility
  138.  }
  139.  j=m_nBestPrev[i][j];
  140.   }
  141.   int nEnd=m_nCurLength;//Set the end of POS tagging
  142.   if(m_sWords[m_nCurLength-1][0]==0)
  143.   nEnd=m_nCurLength-1;
  144.   m_nBestTag[nEnd]=-1;
  145.   return true;
  146. }
  147. bool CSpan::SplitPersonPOS(CDictionary &unlistDict)
  148. {//Split the word with POS 21 and 22
  149.     int i=m_nCurLength-1,j;
  150. unsigned int nLenWord,nLenPart;
  151. char sFirstPart[50],sLastPart[50];
  152. int nFirstPOS,nLastPOS;
  153. for(;i>0;i--)
  154. {
  155. if(m_nBestTag[i]==21||m_nBestTag[i]==22)
  156. {//Find the POS which need to split
  157. for(j=m_nCurLength-1;j>i;j--)
  158. {//Move the POS and words
  159. strcpy(m_sWords[j+1],m_sWords[j]);
  160. m_nBestTag[j+1]=m_nBestTag[j];
  161. m_nWordPosition[j+1]=m_nWordPosition[j];
  162. }
  163. m_nCurLength+=1;//The length increment 
  164.         /*
  165. CSegment segment;
  166. segment.Segment(m_sWords[i],unlistDict,1);
  167. */
  168. //Generate new segment words and POS
  169. if(m_nBestTag[i]==21)
  170. {//Combination by Previous and first component
  171. nLenWord=strlen(m_sWords[i]);
  172. if(nLenWord>4)//Get first component
  173. {
  174. strcpy(sLastPart,m_sWords[i]+nLenWord-4);
  175. if(!unlistDict.IsExist(sLastPart,-1))
  176. strcpy(sLastPart,m_sWords[i]+nLenWord-2);
  177. }
  178. else
  179. {
  180. strcpy(sLastPart,m_sWords[i]+nLenWord-2);
  181. }
  182. nLenPart=strlen(sLastPart);
  183. if(nLenPart<nLenWord)
  184. {//Get first part
  185. strncpy(sFirstPart,m_sWords[i],nLenWord-nLenPart);
  186. sFirstPart[nLenWord-nLenPart]=0;
  187. }
  188. else
  189. {
  190. strncpy(sFirstPart,m_sWords[i],nLenWord-2);
  191. sFirstPart[nLenWord-2]=0;
  192. strncpy(sLastPart,m_sWords[i]+nLenWord-2,2);
  193. sLastPart[2]=0;
  194. }
  195. nFirstPOS=11;
  196. nLastPOS=1;
  197. }
  198. else
  199. {//Combination by Next word and last component
  200. nLenWord=strlen(m_sWords[i]);
  201. if(nLenWord>4)//Get last component
  202. {
  203. strncpy(sFirstPart,m_sWords[i],4);
  204. sFirstPart[4]=0;
  205. if(!unlistDict.IsExist(sFirstPart,-1))
  206. sFirstPart[2]=0;
  207. }
  208. else
  209. {
  210. strncpy(sFirstPart,m_sWords[i],2);
  211. sFirstPart[2]=0;
  212. }
  213. nLenPart=strlen(sFirstPart);
  214. if(nLenPart<nLenWord)
  215. {//Get first part
  216. strncpy(sLastPart,m_sWords[i]+nLenPart,nLenWord-nLenPart);
  217. sLastPart[nLenWord-nLenPart]=0;
  218. }
  219. else
  220. {
  221. strncpy(sFirstPart,m_sWords[i],2);
  222. sFirstPart[2]=0;
  223. strncpy(sLastPart,m_sWords[i]+2,nLenWord-2);
  224. sLastPart[nLenWord-2]=0;
  225. }
  226. if(unlistDict.IsExist(sFirstPart,1)&&m_nBestTag[i-1]==5)
  227. //小陈说:
  228. nFirstPOS=1;
  229. else if(unlistDict.IsExist(m_sWords[i-1],1)&&!unlistDict.IsExist(m_sWords[i-2],1))
  230. nFirstPOS=4;
  231. else
  232. nFirstPOS=3;
  233. nLastPOS=12;
  234. }
  235.             strcpy(m_sWords[i],sFirstPart);
  236. m_nBestTag[i]=nFirstPOS;
  237.             strcpy(m_sWords[i+1],sLastPart);
  238. m_nBestTag[i+1]=nLastPOS;
  239. m_nWordPosition[i+1]=m_nWordPosition[i]+strlen(sFirstPart);
  240. }
  241. }
  242. return true;
  243. }
  244. bool CSpan::PersonRecognize(CDictionary &personDict)
  245. {
  246.   char sPOS[MAX_WORDS_PER_SENTENCE]="z",sPersonName[100];
  247.                           //0     1    2    3    4   5   
  248.   char sPatterns[][5]={ "BBCD","BBC","BBE","BBZ","BCD","BEE","BE",
  249.  "BG",  "BXD","BZ", "CDCD","CD","EE", 
  250.  "FB", "Y","XD",""};
  251.   double dFactor[]={0.0011,0.0011,0.0011,0.0011,0.7614,0.0011,0.2055,
  252.  0.0160,0.0011,0.0011,0,0.0160,0.0011,
  253.  0.0160,0.0011,0.0011,0
  254.   };
  255.   //About parameter:
  256. /*
  257. Given Name: 486     0.0160
  258. Surname+postfix:484 0.0160
  259. m_lPerson2Num:6265   0.2055
  260. m_lPerson3Num: 23184 0.7614
  261. m_lPerson4Num:32     0.0011
  262.   */
  263.   //The person recognition patterns set
  264.   //BBCD:姓+姓+名1+名2;
  265.   //BBE: 姓+姓+单名;
  266.   //BBZ: 姓+姓+双名成词;
  267.   //BCD: 姓+名1+名2;
  268.   //BE:  姓+单名;
  269.   //BEE: 姓+单名+单名;韩磊磊
  270.   //BG:  姓+后缀
  271.   //BXD: 姓+姓双名首字成词+双名末字
  272.   //BZ:  姓+双名成词;
  273.   //B:  姓
  274.   //CD:  名1+名2;
  275.   //EE:  单名+单名;
  276.   //FB:  前缀+姓
  277.   //XD:  姓双名首字成词+双名末字
  278.   //Y:   姓单名成词
  279.   int nPatternLen[]={4,3,3,3,3,3,2,2,3,2,4,2,2,2,1,2,0};
  280.   for(int i=1;m_nBestTag[i]>-1;i++)//Convert to string from POS
  281. sPOS[i]=m_nBestTag[i]+'A';
  282.   sPOS[i]=0;
  283.   int j=1,k,nPos;//Find the proper pattern from the first POS
  284.   int nLittleFreqCount;//Counter for the person name role with little frequecy
  285.   bool bMatched=false;   
  286.   while(j<i)
  287.   {
  288. bMatched=false;   
  289. for(k=0;!bMatched&&nPatternLen[k]>0;k++)
  290. {
  291. if(strncmp(sPatterns[k],sPOS+j,nPatternLen[k])==0&&strcmp(m_sWords[j-1],"·")!=0&&strcmp(m_sWords[j+nPatternLen[k]],"·")!=0)
  292. {//Find the proper pattern k
  293. if(strcmp(sPatterns[k],"FB")==0&&(sPOS[j+2]=='E'||sPOS[j+2]=='C'||sPOS[j+2]=='G'))
  294. {//Rule 1 for exclusion:前缀+姓+名1(名2): 规则(前缀+姓)失效;
  295. continue;
  296. }
  297. /* if((strcmp(sPatterns[k],"BEE")==0||strcmp(sPatterns[k],"EE")==0)&&strcmp(m_sWords[j+nPatternLen[k]-1],m_sWords[j+nPatternLen[k]-2])!=0)
  298. {//Rule 2 for exclusion:姓+单名+单名:单名+单名 若EE对应的字不同,规则失效.如:韩磊磊
  299. continue;
  300. }
  301. if(strcmp(sPatterns[k],"B")==0&&m_nBestTag[j+1]!=12)
  302. {//Rule 3 for exclusion: 若姓后不是后缀,规则失效.如:江主席、刘大娘
  303. continue;
  304. }
  305. */ //Get the possible name
  306. nPos=j;//Record the person position in the tag sequence
  307. sPersonName[0]=0;
  308. nLittleFreqCount=0;//Record the number of role with little frequency
  309. while(nPos<j+nPatternLen[k])
  310. {//Get the possible person name
  311.  //
  312. if(m_nBestTag[nPos]<4&&personDict.GetFrequency(m_sWords[nPos],m_nBestTag[nPos])<LITTLE_FREQUENCY)
  313. nLittleFreqCount++;//The counter increase
  314. strcat(sPersonName,m_sWords[nPos]);
  315. nPos+=1;
  316. }
  317. if(IsAllForeign(sPersonName)&&personDict.GetFrequency(m_sWords[j],1)<LITTLE_FREQUENCY)
  318. {//Exclusion foreign name
  319.  //Rule 2 for exclusion:若均为外国人名用字 规则(名1+名2)失效
  320. j+=nPatternLen[k]-1;
  321. continue;
  322. }
  323. if(strcmp(sPatterns[k],"CDCD")==0)
  324. {//Rule for exclusion
  325.  //规则(名1+名2+名1+名2)本身是排除规则:女高音歌唱家迪里拜尔演唱
  326.    //Rule 3 for exclusion:含外国人名用字 规则适用
  327.  //否则,排除规则失效:黑妞白妞姐俩拔了头筹。
  328. if(GetForeignCharCount(sPersonName)>0)
  329. j+=nPatternLen[k]-1;
  330. continue;
  331. }
  332. if(strcmp(sPatterns[k],"CD")==0&&IsAllForeign(sPersonName))
  333. {//
  334. j+=nPatternLen[k]-1;
  335. continue;
  336. }
  337. if(nLittleFreqCount==nPatternLen[k]||nLittleFreqCount==3)
  338. //马哈蒂尔;小扎耶德与他的中国阿姨胡彩玲受华黎明大使之邀,
  339. //The all roles appear with two lower frequecy,we will ignore them
  340. continue;
  341. m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[j];
  342. m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[j+nPatternLen[k]];
  343. m_dWordsPossibility[m_nUnknownIndex]=log(dFactor[k])+ComputePossibility(j,nPatternLen[k],personDict);
  344. //Mutiply the factor 
  345. m_nUnknownIndex+=1;
  346. j+=nPatternLen[k];
  347. bMatched=true;
  348. }
  349. }
  350.     if(!bMatched)//Not matched, add j by 1
  351. j+=1;
  352.   }
  353.   return true;
  354. }
  355. int CSpan::GetFrom(PWORD_RESULT pWordItems,int nIndex,CDictionary &dictCore, CDictionary &dictUnknown)
  356. {
  357. int nCount,aPOS[MAX_POS_PER_WORD],aFreq[MAX_POS_PER_WORD];
  358. int nFreq=0,j,nRetPos=0,nWordsIndex=0;
  359. bool bSplit=false;//Need to split in Transliteration recognition 
  360.     int i=1;
  361. nWordsIndex=i+nIndex-1;
  362. for(;i<MAX_WORDS_PER_SENTENCE&&pWordItems[nWordsIndex].sWord[0]!=0;i++)
  363. {
  364. if(m_tagType==TT_NORMAL||!dictUnknown.IsExist(pWordItems[nWordsIndex].sWord,44))
  365.         {
  366. strcpy(m_sWords[i],pWordItems[nWordsIndex].sWord);//store current word
  367.         m_nWordPosition[i+1]=m_nWordPosition[i]+strlen(m_sWords[i]);
  368. }
  369. else
  370. {
  371. if(!bSplit)
  372. {
  373. strncpy(m_sWords[i],pWordItems[nWordsIndex].sWord,2);//store current word
  374. m_sWords[i][2]=0;
  375. bSplit=true;
  376. }
  377. else
  378. {
  379. unsigned int nLen=strlen(pWordItems[nWordsIndex].sWord+2);
  380. strncpy(m_sWords[i],pWordItems[nWordsIndex].sWord+2,nLen);//store current word
  381. m_sWords[i][nLen]=0;
  382. bSplit=false;
  383. }
  384.         m_nWordPosition[i+1]=m_nWordPosition[i]+strlen(m_sWords[i]);
  385. }
  386. //Record the position of current word
  387. m_nStartPos=m_nWordPosition[i+1];
  388. //Move the Start POS to the ending
  389. if(m_tagType!=TT_NORMAL)
  390. {
  391. //Get the POSs from the unknown recognition dictionary
  392. dictUnknown.GetHandle(m_sWords[i],&nCount,aPOS,aFreq);
  393. for(j=0;j<nCount;j++) 
  394. {//Get the POS set of sCurWord in the unknown dictionary
  395. m_nTags[i][j]=aPOS[j];
  396.     m_dFrequency[i][j]=-log((double)(1+aFreq[j]))+log((double)(m_context.GetFrequency(0,aPOS[j])+1));
  397. }
  398. //Get the POS set of sCurWord in the core dictionary
  399. //We ignore the POS in the core dictionary and recognize them as other (0).
  400. //We add their frequency to get the possibility as POS 0
  401. dictCore.GetHandle(m_sWords[i],&nCount,aPOS,aFreq);
  402. nFreq=0;
  403. for(int k=0;k<nCount;k++) 
  404. {
  405. nFreq+=aFreq[k];
  406. }
  407. if(nCount>0)
  408. {
  409. m_nTags[i][j]=0;
  410. //m_dFrequency[i][j]=(double)(1+nFreq)/(double)(m_context.GetFrequency(0,0)+1);
  411. m_dFrequency[i][j]=-log((double)(1+nFreq))+log((double)(m_context.GetFrequency(0,0)+1));
  412. j++;
  413. }
  414. }
  415. else//For normal POS tagging
  416. {
  417. j=0;
  418. //Get the POSs from the unknown recognition dictionary
  419. if(pWordItems[nWordsIndex].nHandle>0)
  420. {//The word has  is only one POS value
  421.  //We have record its POS and nFrequncy in the items.
  422. m_nTags[i][j]=pWordItems[nWordsIndex].nHandle;
  423. m_dFrequency[i][j]=pWordItems[nWordsIndex].dValue-log(MAX_FREQUENCE)+log((double)(m_context.GetFrequency(0,m_nTags[i][j])+1));
  424. if(m_dFrequency[i][j]<0)//Not permit the value less than 0
  425. m_dFrequency[i][j]=0;
  426. j++;
  427. }
  428. else
  429. {//The word has multiple POSs, we should retrieve the information from Core Dictionary 
  430. if(pWordItems[nWordsIndex].nHandle<0)
  431. {//The word has  is only one POS value
  432.  //We have record its POS and nFrequncy in the items.
  433. if(pWordItems[nWordsIndex].nHandle==-'t'*256-'t')//tt
  434. {
  435. char sWordOrg[100],sPostfix[10];
  436. double dRatio=0.6925;//The ratio which transliteration as a person name 
  437. PostfixSplit(pWordItems[nWordsIndex].sWord,sWordOrg,sPostfix);
  438. if(sPostfix[0]!=0)
  439. dRatio=0.01;
  440. m_nTags[i][j]='n'*256+'r';
  441. m_dFrequency[i][j]=-log(dRatio)+pWordItems[nWordsIndex].dValue;
  442. //m_dFrequency[i][j]=log(dRatio)+pWordItems[nWordsIndex].dValue-log(m_context.GetFrequency(0,m_nTags[i][j]))+log(MAX_FREQUENCE);
  443. //P(W|R)=P(WRT)/P(RT)=P(R)*P(W|T)/P(R|T)
  444. j++;
  445. m_nTags[i][j]='n'*256+'s';
  446. m_dFrequency[i][j]=-log(1-dRatio)+pWordItems[nWordsIndex].dValue;
  447. //m_dFrequency[i][j]=log(1-dRatio)+pWordItems[nWordsIndex].dValue-log(m_context.GetFrequency(0,m_nTags[i][j]))+log(MAX_FREQUENCE);
  448. j++;
  449. }
  450. else//Unknown words such as Chinese person name or place name
  451. {
  452. m_nTags[i][j]=-pWordItems[nWordsIndex].nHandle;
  453.     // m_dFrequency[i][j++]=(double)(1+pWordItems[nWordsIndex].nFrequency)/(double)(m_context.GetFrequency(0,aPOS[j])+1);
  454. m_dFrequency[i][j++]=pWordItems[nWordsIndex].dValue;
  455. }
  456. }
  457. dictCore.GetHandle(m_sWords[i],&nCount,aPOS,aFreq);
  458. for(;j<nCount;j++) 
  459. {//Get the POS set of sCurWord in the unknown dictionary
  460. m_nTags[i][j]=aPOS[j];
  461.     m_dFrequency[i][j]=-log(1+aFreq[j])+log(m_context.GetFrequency(0,m_nTags[i][j])+1);
  462. }
  463. }
  464. }
  465. if(j==0)
  466. {//We donot know the POS, so we have to guess them according lexical knowledge
  467. GuessPOS(i,&j);//Guess the POS of current word
  468. }
  469. m_nTags[i][j]=-1;//Set the ending POS 
  470. if(j==1)//No ambuguity
  471. {//No ambuguity, so we can break from the loop
  472. i++;
  473. m_sWords[i][0]=0;
  474. break;
  475. }
  476. if(!bSplit)
  477. nWordsIndex++;
  478. }
  479. if(pWordItems[nWordsIndex].sWord[0]==0)
  480. nRetPos=-1;//Reaching ending
  481. if(m_nTags[i-1][1]!=-1)//||m_sWords[i][0]==0
  482. {//Set end for words like "张/华/平"
  483. if(m_tagType!=TT_NORMAL)
  484.        m_nTags[i][0]=101;
  485. else
  486.        m_nTags[i][0]=1;
  487. m_dFrequency[i][0]=0;
  488.     m_sWords[i][0]=0;//Set virtual ending
  489. m_nTags[i++][1]=-1;
  490. }
  491. m_nCurLength=i;//The current word count
  492. if(nRetPos!=-1)
  493. return nWordsIndex+1;//Next start position
  494. return -1;//Reaching ending
  495. }
  496. //Set the tag type
  497. void CSpan::SetTagType(enum TAG_TYPE  nType)
  498. {
  499. m_tagType=nType;
  500. }
  501. //POS tagging with Hidden Markov Model
  502. bool CSpan::POSTagging(PWORD_RESULT pWordItems,CDictionary &dictCore,CDictionary &dictUnknown)
  503. {
  504. //pWordItems: Items; nItemCount: the count of items;core dictionary and unknown recognition dictionary
  505.     int i=0,j,nStartPos;
  506. Reset(false);
  507.     while(i>-1&&pWordItems[i].sWord[0]!=0)
  508. {
  509. nStartPos=i;//Start Position
  510. i=GetFrom(pWordItems,nStartPos,dictCore,dictUnknown);
  511. GetBestPOS();
  512. switch(m_tagType)
  513. {
  514. case TT_NORMAL://normal POS tagging
  515. j=1;
  516. while(m_nBestTag[j]!=-1&&j<m_nCurLength)
  517. {//Store the best POS tagging
  518. pWordItems[j+nStartPos-1].nHandle=m_nBestTag[j];
  519. //Let 。be 0
  520. if(pWordItems[j+nStartPos-1].dValue>0&&dictCore.IsExist(pWordItems[j+nStartPos-1].sWord,-1))//Exist and update its frequncy as a POS value
  521. pWordItems[j+nStartPos-1].dValue=log(MAX_FREQUENCE)-log(dictCore.GetFrequency(pWordItems[j+nStartPos-1].sWord,m_nBestTag[j])+1);
  522. j+=1;
  523. }
  524. break;
  525. case TT_PERSON://Person recognition
  526. /*clock_t lStart,lEnd;
  527.     lStart=clock();
  528. */
  529. SplitPersonPOS(dictUnknown);
  530. //lEnd=clock();
  531. //printf("SplitPersonPOS=%fn",(double)(lEnd-lStart)*1000/CLOCKS_PER_SEC);
  532. //Spit Persons POS
  533. //lStart=clock();
  534. PersonRecognize(dictUnknown);
  535. //lEnd=clock();
  536. //printf("PersonRecognize=%fn",(double)(lEnd-lStart)/CLOCKS_PER_SEC);
  537. //Person Recognition with the person recognition dictionary
  538. break;
  539. case TT_PLACE://Place name recognition
  540. PlaceRecognize(dictCore,dictUnknown);
  541. break;
  542. case TT_TRANS://Transliteration
  543. TransRecognize(dictCore,dictUnknown);
  544. break;
  545. default:
  546. break;
  547. }
  548. Reset();
  549. }
  550. return true;
  551. }
  552. //Guess the POS of No. nIndex word item
  553. bool CSpan::GuessPOS(int nIndex,int *pSubIndex)
  554. {
  555. int j=0,i=nIndex,nCharType;
  556. unsigned int nLen;
  557. switch(m_tagType)
  558. {
  559. case TT_NORMAL:
  560. break;
  561. case TT_PERSON:
  562. j=0;
  563. if(CC_Find("××",m_sWords[nIndex]))
  564. {
  565. m_nTags[i][j]=6;
  566. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,6)+1);
  567. }
  568. else
  569. {
  570. m_nTags[i][j]=0;
  571. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
  572. nLen=strlen(m_sWords[nIndex]);
  573. if(nLen>=4)
  574. {
  575. m_nTags[i][j]=0;
  576. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
  577. m_nTags[i][j]=11;
  578. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8);
  579. m_nTags[i][j]=12;
  580. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8);
  581. m_nTags[i][j]=13;
  582. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8);
  583. }
  584. else if(nLen==2)
  585. {
  586. m_nTags[i][j]=0;
  587. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
  588. nCharType=charType((unsigned char *)m_sWords[nIndex]);
  589. if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
  590. {
  591. m_nTags[i][j]=1;
  592. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)+1);
  593. m_nTags[i][j]=2;
  594. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)+1);
  595. m_nTags[i][j]=3;
  596. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)+1);
  597. m_nTags[i][j]=4;
  598. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,4)+1);
  599. }
  600. m_nTags[i][j]=11;
  601. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8);
  602. m_nTags[i][j]=12;
  603. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8);
  604. m_nTags[i][j]=13;
  605. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8);
  606. }
  607. }
  608. break;
  609. case TT_PLACE:
  610. j=0;
  611. m_nTags[i][j]=0;
  612. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
  613. nLen=strlen(m_sWords[nIndex]);
  614. if(nLen>=4)
  615. {
  616. m_nTags[i][j]=11;
  617. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8);
  618. m_nTags[i][j]=12;
  619. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8);
  620. m_nTags[i][j]=13;
  621. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8);
  622. }
  623. else if(nLen==2)
  624. {
  625. m_nTags[i][j]=0;
  626. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
  627. nCharType=charType((unsigned char *)m_sWords[nIndex]);
  628. if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
  629. {
  630. m_nTags[i][j]=1;
  631. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)+1);
  632. m_nTags[i][j]=2;
  633. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)+1);
  634. m_nTags[i][j]=3;
  635. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)+1);
  636. m_nTags[i][j]=4;
  637. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,4)+1);
  638. }
  639. m_nTags[i][j]=11;
  640. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8);
  641. m_nTags[i][j]=12;
  642. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8);
  643. m_nTags[i][j]=13;
  644. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8);
  645. }
  646. break;
  647. case TT_TRANS:
  648. j=0;
  649. nLen=strlen(m_sWords[nIndex]);
  650. m_nTags[i][j]=0;
  651. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
  652. if(!IsAllChinese((unsigned char *)m_sWords[nIndex]))
  653. {
  654. if(IsAllLetter((unsigned char *)m_sWords[nIndex]))
  655. {
  656. m_nTags[i][j]=1;
  657. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)+1);
  658. m_nTags[i][j]=11;
  659. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)+1);
  660. /* }
  661. if(IsAllNum((unsigned char *)m_sWords[nIndex])||IsAllLetter((unsigned char *)m_sWords[nIndex]))
  662. {
  663. */ m_nTags[i][j]=2;
  664. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)*2+1);
  665. m_nTags[i][j]=3;
  666. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)*2+1);
  667. m_nTags[i][j]=12;
  668. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*2+1);
  669. m_nTags[i][j]=13;
  670. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*2+1);
  671. }
  672. m_nTags[i][j]=41;
  673. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,41)*8);
  674. m_nTags[i][j]=42;
  675. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,42)*8);
  676. m_nTags[i][j]=43;
  677. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,43)*8);
  678. }
  679. else if(nLen>=4)
  680. {
  681. m_nTags[i][j]=41;
  682. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,41)*8);
  683. m_nTags[i][j]=42;
  684. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,42)*8);
  685. m_nTags[i][j]=43;
  686. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,43)*8);
  687. }
  688. else if(nLen==2)
  689. {
  690. nCharType=charType((unsigned char *)m_sWords[nIndex]);
  691. if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
  692. {
  693. m_nTags[i][j]=1;
  694. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)*2+1);
  695. m_nTags[i][j]=2;
  696. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)*2+1);
  697. m_nTags[i][j]=3;
  698. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)*2+1);
  699. m_nTags[i][j]=30;
  700. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,30)*8+1);
  701. m_nTags[i][j]=11;
  702. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*4+1);
  703. m_nTags[i][j]=12;
  704. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*4+1);
  705. m_nTags[i][j]=13;
  706. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*4+1);
  707. m_nTags[i][j]=21;
  708. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,21)*2+1);
  709. m_nTags[i][j]=22;
  710. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,22)*2+1);
  711. m_nTags[i][j]=23;
  712. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,23)*2+1);
  713. }
  714. m_nTags[i][j]=41;
  715. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,41)*8);
  716. m_nTags[i][j]=42;
  717. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,42)*8);
  718. m_nTags[i][j]=43;
  719. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,43)*8);
  720. }
  721. break;
  722. default:
  723. break;
  724. }
  725. *pSubIndex=j;
  726. return true;
  727. }
  728. ELEMENT_TYPE  CSpan::ComputePossibility(int nStartPos,int nLength,CDictionary &dict)
  729. {
  730. ELEMENT_TYPE dRetValue=0,dPOSPoss;
  731. //dPOSPoss: the possibility of a POS appears
  732. //dContextPoss: The possibility of context POS appears
  733. int nFreq;
  734. for(int i=nStartPos;i<nStartPos+nLength;i++)
  735. {
  736. nFreq=dict.GetFrequency(m_sWords[i],m_nBestTag[i]);
  737. //nFreq is word being the POS
  738. dPOSPoss=log((double)(m_context.GetFrequency(0,m_nBestTag[i])+1))-log((double)(nFreq+1));
  739. dRetValue+=dPOSPoss;
  740. /* if(i<nStartPos+nLength-1)
  741. {
  742. dContextPoss=log((double)(m_context.GetContextPossibility(0,m_nBestTag[i],m_nBestTag[i+1])+1));
  743. dRetValue+=dPOSPoss-dContextPoss;
  744. }
  745. */ }
  746. return dRetValue;
  747. }
  748. bool CSpan::TransRecognize(CDictionary &dictCore,CDictionary &transDict)
  749. {
  750.   char sPOS[MAX_WORDS_PER_SENTENCE]="Z";
  751.   int nStart=1,nEnd=1,i=1;
  752.   while(m_nBestTag[i]>-1)
  753.   {
  754.   if(m_nBestTag[i]==1||m_nBestTag[i]==11||m_nBestTag[i]==21)//1,11,21 Trigger the recognition
  755.   {
  756. nStart=i;
  757. nEnd=nStart+1;
  758. while(m_nBestTag[nEnd]==m_nBestTag[nStart])//1,11,21
  759. nEnd++;
  760. while(m_nBestTag[nEnd]==m_nBestTag[nStart]+1)//2,12,22
  761. nEnd++;
  762. while(m_nBestTag[nEnd]==m_nBestTag[nStart]+2)//3,13,23
  763. nEnd++;
  764. while(m_nBestTag[nEnd]==30)//3,13,23
  765. nEnd++;
  766.   }
  767.   else if(m_nBestTag[i]==2||m_nBestTag[i]==12||m_nBestTag[i]==22)//1,11,21 Trigger the recognition
  768.   {
  769. nStart=i;
  770. nEnd=nStart+1;
  771. while(m_nBestTag[nEnd]==m_nBestTag[nStart])//2,12,22
  772. nEnd++;
  773. while(m_nBestTag[nEnd]==m_nBestTag[nStart]+1)//2,12,22
  774. nEnd++;
  775. while(m_nBestTag[nEnd]==30)//3,13,23
  776. nEnd++;
  777.   }
  778.   if(nEnd>nStart&&!IsAllNum((unsigned char *)m_sWords[nStart])&&(nEnd>nStart+2||(nEnd==nStart+2&&(m_nBestTag[nEnd-1]!=30||strlen(m_sWords[nStart])>2))||(nEnd==nStart+1&&strlen(m_sWords[nStart])>2&&!dictCore.IsExist(m_sWords[nStart],-1))))
  779.   {
  780. m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[nStart];
  781. m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[nEnd];
  782. m_dWordsPossibility[m_nUnknownIndex++]=ComputePossibility(nStart,nEnd-nStart+1,transDict);
  783. nStart=nEnd;
  784.   }
  785.   if(i<nEnd)
  786.   i=nEnd;
  787.   else
  788.   i=i+1;
  789.   }
  790.   return true;
  791. }
  792. bool CSpan::PlaceRecognize(CDictionary &dictCore,CDictionary &placeDict)
  793. {
  794.   int nStart=1,nEnd=1,i=1;
  795.   while(m_nBestTag[i]>-1)
  796.   {
  797.   if(m_nBestTag[i]==1)//1 Trigger the recognition procession
  798.   {
  799. nStart=i;
  800. nEnd=nStart+1;
  801. while(m_nBestTag[nEnd]==1)//
  802. nEnd++;
  803. while(m_nBestTag[nEnd]==2)//2,12,22
  804. nEnd++;
  805. while(m_nBestTag[nEnd]==3)
  806. nEnd++;
  807. while(m_nBestTag[nEnd]==4)
  808. nEnd++;   
  809.   }
  810.   else if(m_nBestTag[i]==2)//1,11,21 Trigger the recognition
  811.   {
  812. nStart=i;
  813. nEnd=nStart+1;
  814. while(m_nBestTag[nEnd]==2)//2
  815. nEnd++;
  816. while(m_nBestTag[nEnd]==3)//2
  817. nEnd++;
  818. while(m_nBestTag[nEnd]==4)//2
  819. nEnd++;
  820.   }
  821.   if(nEnd>nStart)
  822.   {
  823. m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[nStart];
  824. m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[nEnd];
  825. m_dWordsPossibility[m_nUnknownIndex++]=ComputePossibility(nStart,nEnd-nStart+1,placeDict);
  826. nStart=nEnd;
  827.   }
  828.   if(i<nEnd)
  829.   i=nEnd;
  830.   else
  831.   i=i+1;
  832.   }
  833.   return true;
  834. }