Span.cpp
上传用户:sunyong76
上传日期:2021-10-03
资源大小:2236k
文件大小:29k
源码类别:

多国语言处理

开发平台:

Java

  1. //////////////////////////////////////////////////////////////////////
  2. //ICTCLAS简介:计算所汉语词法分析系统ICTCLAS(Institute of Computing Technology, Chinese Lexical Analysis System),
  3. //             功能有:中文分词;词性标注;未登录词识别。
  4. //             分词正确率高达97.58%(973专家评测结果),
  5. //             未登录词识别召回率均高于90%,其中中国人名的识别召回率接近98%;
  6. //             处理速度为31.5Kbytes/s。
  7. //著作权:  Copyright © 2002-2005 中科院计算所 职务著作权人:张华平 刘群
  8. //遵循协议:自然语言处理开放资源许可证1.0
  9. //Email: zhanghp@software.ict.ac.cn
  10. //Homepage:www.nlp.org.cn;mtgroup.ict.ac.cn
  11. /****************************************************************************
  12.  *
  13.  * Copyright (c) 2000, 2001 
  14.  *     Machine Group
  15.  *     Software Research Lab.
  16.  *     Institute of Computing Tech.
  17.  *     Chinese Academy of Sciences
  18.  *     All rights reserved.
  19.  *
  20.  * This file is the confidential and proprietary property of 
  21.  * Institute of Computing Tech. and the possession or use of this file requires 
  22.  * a written license from the author.
  23.  * Filename: Span.cpp
  24.  * Abstract:
  25.  *           implementation of the CSpan class.
  26.  * Author:   Kevin Zhang 
  27.  *          (zhanghp@software.ict.ac.cn)
  28.  * Date:     2002-4-23
  29.  *
  30.  * Notes:    Tagging with Hidden Markov Model
  31.  *                
  32.  ****************************************************************************/
  33. #include "stdafx.h"
  34. #include "Span.h"
  35. #include "..\Segment\Segment.h"
  36. #include "..\Utility\Utility.h"
  37. #include <math.h>
  38. #include <string.h>
  39. #include <stdio.h>
  40. #include <time.h>
  41. //////////////////////////////////////////////////////////////////////
  42. // Construction/Destruction
  43. //////////////////////////////////////////////////////////////////////
  44. CSpan::CSpan()
  45. {
  46. if(m_tagType!=TT_NORMAL)
  47.       m_nTags[0][0]=100;//Begin tag
  48. else
  49.       m_nTags[0][0]=0;//Begin tag
  50. m_nTags[0][1]=-1;
  51. m_dFrequency[0][0]=0;
  52. m_nCurLength=1;
  53. m_nUnknownIndex=0;
  54. m_nStartPos=0;
  55. m_nWordPosition[1]=0;
  56. m_sWords[0][0]=0;
  57. m_tagType=TT_NORMAL;//Default tagging type
  58. }
  59. CSpan::~CSpan()
  60. {
  61. }
  62. bool CSpan::Disamb()
  63. {
  64. int i,j,k,nMinCandidate;
  65. /* 
  66.      * ----- commented by huangjin@ict.ac.cn 2006-5-30 ------ 
  67.      * 
  68.  * double dMinFee,dTmp;
  69.  *
  70. */
  71. /*----Added By huangjin@ict.ac.cn 2006-5-30----*/
  72. double dMinFee=0.0, dTmp=0.0;
  73. /*---------------------------------------------*/
  74. for(i=1;i<m_nCurLength;i++)//For every word
  75. {
  76. for(j=0;m_nTags[i][j]>=0;j++)//For every word
  77. {
  78. nMinCandidate=MAX_POS_PER_WORD+1;
  79. /*----Added By huangjin@ict.ac.cn 2006-7-26----*/
  80. dMinFee=10000000.00;//Init
  81. /*---------------------------------------------*/
  82. for(k=0;m_nTags[i-1][k]>=0;k++)
  83. {
  84. //ConvertPOS(m_nTags[i-1][k],&nKey,&nPrevPOS);
  85. //ConvertPOS(m_nTags[i][j],&nKey,&nCurPOS);
  86. //dTmp=m_context.GetContextPossibility(nKey,nPrevPOS,nCurPOS);
  87. dTmp=-log(m_context.GetContextPossibility(0,m_nTags[i-1][k],m_nTags[i][j]));
  88. dTmp+=m_dFrequency[i-1][k];//Add the fees
  89. /* 
  90. * ----- commented by huangjin@ict.ac.cn 2006-7-26 ------ 
  91. *
  92. *  if(nMinCandidate>10||dTmp<dMinFee)//Get the minimum fee
  93. *
  94. */
  95. /*----Added By huangjin@ict.ac.cn 2006-7-26----*/
  96. if(nMinCandidate>MAX_POS_PER_WORD+1||dTmp<dMinFee)//Get the minimum fee
  97. /*---------------------------------------------*/
  98. {
  99. nMinCandidate=k;
  100. dMinFee=dTmp;
  101. }
  102. }
  103. m_nBestPrev[i][j]=nMinCandidate;//The best previous for j
  104. m_dFrequency[i][j]=m_dFrequency[i][j]+dMinFee;
  105. }
  106. }
  107. return true;
  108. }
  109. bool CSpan::Reset(bool bContinue)
  110. {
  111. if(!bContinue)
  112. {//||CC_Find("。!”〕〉》」〗】",m_sWords[m_nCurLength-1])
  113. if(m_tagType!=TT_NORMAL)//Get the last POS in the last sentence
  114.       m_nTags[0][0]=100;//Begin tag
  115. else
  116.       m_nTags[0][0]=0;//Begin tag
  117. m_nUnknownIndex=0;
  118. m_dFrequency[0][0]=0;
  119. m_nStartPos=0;
  120. }
  121. else
  122. {
  123. m_nTags[0][0]=m_nTags[m_nCurLength-1][0];//Get the last POS in the last sentence
  124. m_dFrequency[0][0]=m_dFrequency[m_nCurLength-1][0];
  125. }
  126.     m_nTags[0][1]=-1;//Get the last POS in the last sentence,set the -1 as end flag
  127. m_nCurLength=1;
  128. m_nWordPosition[1]=m_nStartPos;
  129. m_sWords[0][0]=0;
  130. return true;
  131. }
  132. bool CSpan::LoadContext(char *sFilename)
  133. {
  134. return m_context.Load(sFilename);
  135. }
  136. bool CSpan::GetBestPOS()
  137. {
  138.   Disamb();
  139.   for(int i=m_nCurLength-1,j=0;i>0;i--)//,j>=0
  140.   {
  141.  if(m_sWords[i][0])
  142.  {//Not virtual ending
  143.  m_nBestTag[i]=m_nTags[i][j];//Record the best POS and its possibility
  144.  }  
  145.  j=m_nBestPrev[i][j];
  146.  /*----Added By huangjin@ict.ac.cn 2006-9-12----*/
  147.  if ( j >= MAX_WORDS_PER_SENTENCE )
  148.  {
  149.  j = 0;
  150.  }
  151.  /*---------------------------------------------*/
  152.   }
  153.   int nEnd=m_nCurLength;//Set the end of POS tagging
  154. if(m_sWords[m_nCurLength-1][0]==0)
  155. nEnd=m_nCurLength-1;
  156.  
  157.   m_nBestTag[nEnd]=-1;
  158.   return true;
  159. }
  160. bool CSpan::PersonRecognize(CDictionary &personDict)
  161. {
  162.   char sPOS[MAX_WORDS_PER_SENTENCE]="z",sPersonName[100];
  163.                           //0     1    2    3    4   5   
  164.   char sPatterns[][5]={ "BBCD","BBC","BBE","BBZ","BCD","BEE","BE","BG",
  165.                     "BXD","BZ", "CDCD","CD","EE", "FB", "Y","XD",""};
  166.                     //BBCD        BBC       BBE     BBZ    BCD         BEE      BE         BG
  167.   double dFactor[]={0.003606,0.000021,0.001314,0.000315,0.656624, 0.000021,0.146116,0.009136,
  168.                // BXD      BZ   CDCD     CD      EE      FB       Y         XD  
  169. 0.000042,0.038971,0,0.090367,0.000273,0.009157,0.034324,0.009735,0
  170.   };
  171.   //About parameter:
  172. /*
  173. BBCD 343 0.003606
  174. BBC 2 0.000021
  175. BBE 125 0.001314
  176. BBZ 30 0.000315
  177. BCD 62460 0.656624
  178. BEE 0 0.000000
  179. BE 13899 0.146116
  180. BG 869 0.009136
  181. BXD 4 0.000042
  182. BZ 3707 0.038971
  183. CD 8596 0.090367
  184. EE 26 0.000273
  185. FB 871 0.009157
  186. Y 3265 0.034324
  187. XD 926 0.009735
  188.  */
  189.   //The person recognition patterns set
  190.   //BBCD:姓+姓+名1+名2;
  191.   //BBE: 姓+姓+单名;
  192.   //BBZ: 姓+姓+双名成词;
  193.   //BCD: 姓+名1+名2;
  194.   //BE:  姓+单名;
  195.   //BEE: 姓+单名+单名;韩磊磊
  196.   //BG:  姓+后缀
  197.   //BXD: 姓+姓双名首字成词+双名末字
  198.   //BZ:  姓+双名成词;
  199.   //B:  姓
  200.   //CD:  名1+名2;
  201.   //EE:  单名+单名;
  202.   //FB:  前缀+姓
  203.   //XD:  姓双名首字成词+双名末字
  204.   //Y:   姓单名成词
  205.   int nPatternLen[]={4,3,3,3,3,3,2,2,3,2,4,2,2,2,1,2,0};
  206.   /* 
  207. * ----- commented by huangjin@ict.ac.cn 2006-7-27 ------ 
  208. *
  209. * for(int i=1;m_nBestTag[i]>-1;i++)//Convert to string from POS
  210. *
  211. */
  212. /*----Added By huangjin@ict.ac.cn 2006-7-27----*/
  213.   int i=1;
  214.   for( i=1;m_nBestTag[i]>-1;i++)//Convert to string from POS
  215. /*---------------------------------------------*/  
  216. sPOS[i]=m_nBestTag[i]+'A';
  217.   sPOS[i]=0;
  218.   int j=1,k,nPos;//Find the proper pattern from the first POS
  219.   int nLittleFreqCount;//Counter for the person name role with little frequecy
  220.   bool bMatched=false;   
  221.   while(j<i)
  222.   {
  223. bMatched=false;   
  224. for(k=0;!bMatched&&nPatternLen[k]>0;k++)
  225. {
  226. if(strncmp(sPatterns[k],sPOS+j,nPatternLen[k])==0&&strcmp(m_sWords[j-1],"·")!=0&&strcmp(m_sWords[j+nPatternLen[k]],"·")!=0)
  227. {//Find the proper pattern k
  228. if(strcmp(sPatterns[k],"FB")==0&&(sPOS[j+2]=='E'||sPOS[j+2]=='C'||sPOS[j+2]=='G'))
  229. {//Rule 1 for exclusion:前缀+姓+名1(名2): 规则(前缀+姓)失效;
  230. continue;
  231. }
  232. /* if((strcmp(sPatterns[k],"BEE")==0||strcmp(sPatterns[k],"EE")==0)&&strcmp(m_sWords[j+nPatternLen[k]-1],m_sWords[j+nPatternLen[k]-2])!=0)
  233. {//Rule 2 for exclusion:姓+单名+单名:单名+单名 若EE对应的字不同,规则失效.如:韩磊磊
  234. continue;
  235. }
  236. if(strcmp(sPatterns[k],"B")==0&&m_nBestTag[j+1]!=12)
  237. {//Rule 3 for exclusion: 若姓后不是后缀,规则失效.如:江主席、刘大娘
  238. continue;
  239. }
  240. */ //Get the possible name
  241. nPos=j;//Record the person position in the tag sequence
  242. sPersonName[0]=0;
  243. nLittleFreqCount=0;//Record the number of role with little frequency
  244. while(nPos<j+nPatternLen[k])
  245. {//Get the possible person name
  246.  //
  247. if(m_nBestTag[nPos]<4&&personDict.GetFrequency(m_sWords[nPos],m_nBestTag[nPos])<LITTLE_FREQUENCY)
  248. nLittleFreqCount++;//The counter increase
  249. strcat(sPersonName,m_sWords[nPos]);
  250. nPos+=1;
  251. }
  252. /*
  253. if(IsAllForeign(sPersonName)&&personDict.GetFrequency(m_sWords[j],1)<LITTLE_FREQUENCY)
  254. {//Exclusion foreign name
  255.  //Rule 2 for exclusion:若均为外国人名用字 规则(名1+名2)失效
  256. j+=nPatternLen[k]-1;
  257. continue;
  258. }
  259. */ if(strcmp(sPatterns[k],"CDCD")==0)
  260. {//Rule for exclusion
  261.  //规则(名1+名2+名1+名2)本身是排除规则:女高音歌唱家迪里拜尔演唱
  262.    //Rule 3 for exclusion:含外国人名用字 规则适用
  263.  //否则,排除规则失效:黑妞白妞姐俩拔了头筹。
  264. if(GetForeignCharCount(sPersonName)>0)
  265. j+=nPatternLen[k]-1;
  266. continue;
  267. }
  268. /* if(strcmp(sPatterns[k],"CD")==0&&IsAllForeign(sPersonName))
  269. {//
  270. j+=nPatternLen[k]-1;
  271. continue;
  272. }
  273. if(nLittleFreqCount==nPatternLen[k]||nLittleFreqCount==3)
  274. //马哈蒂尔;小扎耶德与他的中国阿姨胡彩玲受华黎明大使之邀,
  275. //The all roles appear with two lower frequecy,we will ignore them
  276. continue;
  277. */ m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[j];
  278. m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[j+nPatternLen[k]];
  279. m_dWordsPossibility[m_nUnknownIndex]=-log(dFactor[k])+ComputePossibility(j,nPatternLen[k],personDict);
  280. //Mutiply the factor 
  281. m_nUnknownIndex+=1;
  282. j+=nPatternLen[k];
  283. bMatched=true;
  284. }
  285. }
  286.     if(!bMatched)//Not matched, add j by 1
  287. j+=1;
  288.   }
  289.   return true;
  290. }
  // Loads a run of consecutive words from pWordItems, starting at index nIndex,
  // into the span's slots 1, 2, ... and fills, per slot, the word text
  // (m_sWords), the running character position (m_nWordPosition), the candidate
  // tags (m_nTags, terminated by -1) and their costs (m_dFrequency).
  //
  // Loading stops when an item with an empty word is met, when the slot limit
  // (MAX_WORDS_PER_SENTENCE-1) is reached, or right after the first word that ends
  // up with exactly one candidate tag.  If the last loaded word still has more
  // than one candidate, a virtual ending slot with an empty word is appended.
  // m_nCurLength is set to the number of slots in use.
  //
  // pWordItems : word items; an item whose sWord is empty terminates the list.
  // nIndex     : index in pWordItems of the first word to load.
  // dictCore   : core dictionary, queried with GetHandle().
  // dictUnknown: unknown-word dictionary, queried with IsExist()/GetHandle() when
  //              the tagging mode is not TT_NORMAL.
  // Returns the index at which the next call should start, or -1 when the
  // terminating empty item was reached.
  291. int CSpan::GetFrom(PWORD_RESULT pWordItems,int nIndex,CDictionary &dictCore, CDictionary &dictUnknown)
  292. {
  293. int nCount,aPOS[MAX_POS_PER_WORD],aFreq[MAX_POS_PER_WORD];
  294. int nFreq=0,j,nRetPos=0,nWordsIndex=0;
  295. bool bSplit=false;//Need to split in Transliteration recognition 
  296.     int i=1,nPOSCount;
  297. char sCurWord[WORD_MAXLENGTH];//Current word
  // i is the slot being filled (starts at 1), nWordsIndex the matching item index
  298. nWordsIndex=i+nIndex-1;
  299. /* 
  300. * ----- commented by huangjin@ict.ac.cn 2006-9-12 ------ 
  301. *
  302. *  for(;i<MAX_WORDS_PER_SENTENCE&&pWordItems[nWordsIndex].sWord[0]!=0;i++)
  303. *
  304. */
  305. /*----Added By huangjin@ict.ac.cn 2006-9-12----*/
  306. for(;i<MAX_WORDS_PER_SENTENCE-1&&pWordItems[nWordsIndex].sWord[0]!=0;i++)
  307. /*---------------------------------------------*/
  308. {
  // In TT_NORMAL mode, or when the word is not in dictUnknown with handle 44,
  // the word is stored whole; otherwise it is stored in two steps (see below).
  309. if(m_tagType==TT_NORMAL||!dictUnknown.IsExist(pWordItems[nWordsIndex].sWord,44))
  310.         {
  311. strcpy(m_sWords[i],pWordItems[nWordsIndex].sWord);//store current word
  312.         m_nWordPosition[i+1]=m_nWordPosition[i]+strlen(m_sWords[i]);
  313. }
  314. else
  315. {
  // Split handling: the first pass stores the leading 2 bytes of the word, the
  // next pass (same nWordsIndex, because it is not advanced while bSplit is set)
  // stores the remainder.
  316. if(!bSplit)
  317. {
  318. strncpy(m_sWords[i],pWordItems[nWordsIndex].sWord,2);//store current word
  319. m_sWords[i][2]=0;
  320. bSplit=true;
  321. }
  322. else
  323. {
  324. unsigned int nLen=strlen(pWordItems[nWordsIndex].sWord+2);
  325. strncpy(m_sWords[i],pWordItems[nWordsIndex].sWord+2,nLen);//store current word
  326. m_sWords[i][nLen]=0;
  327. bSplit=false;
  328. }
  329.         m_nWordPosition[i+1]=m_nWordPosition[i]+strlen(m_sWords[i]);
  330. }
  331. //Record the position of current word
  332. m_nStartPos=m_nWordPosition[i+1];
  333. //Move the Start POS to the ending
  334. if(m_tagType!=TT_NORMAL)
  335. {
  336. //Get the POSs from the unknown recognition dictionary
  337. strcpy(sCurWord,m_sWords[i]);
  338. if(m_tagType==TT_TRANS_PERSON&&i>0&&charType((unsigned char*)m_sWords[i-1])==CT_CHINESE)
  339. {
  // NOTE(review): as written, both branches copy the same one-character string
  // that m_sWords[i] already holds -- confirm whether a different (e.g.
  // full-width) character was intended in the original encoding.
  340. if(m_sWords[i][0]=='.'&&m_sWords[i][1]==0)
  341. strcpy(sCurWord,".");
  342. else if(m_sWords[i][0]=='-'&&m_sWords[i][1]==0)
  343. strcpy(sCurWord,"-");
  344. }
  345. dictUnknown.GetHandle(sCurWord,&nCount,aPOS,aFreq);
  346. nPOSCount=nCount+1;
  347. for(j=0;j<nCount;j++) 
  348. {//Get the POS set of sCurWord in the unknown dictionary
  349. m_nTags[i][j]=aPOS[j];
  // Cost = -log(1+word frequency) + log(tag frequency + number of candidates)
  350.     m_dFrequency[i][j]=-log((double)(1+aFreq[j]))+log((double)(m_context.GetFrequency(0,aPOS[j])+nPOSCount));
  351. }
  352. //Get the POS set of sCurWord in the core dictionary
  353. //We ignore the POS in the core dictionary and recognize them as other (0).
  354. //We add their frequency to get the possibility as POS 0
  355. /* 
  356. * ----- commented by huangjin@ict.ac.cn 2006-6-8 ------ 
  357. *
  358. * if(strcmp(m_sWords[i],"始##始")==0)
  359. * {
  360. * m_nTags[i][j]=100;
  361. * m_dFrequency[i][j]=0;
  362. * j++;
  363. * }
  364. * else if(strcmp(m_sWords[i],"末##末")==0)
  365. * {
  366. * m_nTags[i][j]=101;
  367. * m_dFrequency[i][j]=0;
  368. * j++;
  369. * }
  370. *
  371. */
  372. /*----Added By huangjin@ict.ac.cn 2006-6-8----*/
  373. if(strcmp(m_sWords[i],SENTENCE_BEGIN)==0&&j==1)
  374. {//Sentence Begin
  // Overwrites the single candidate found above with tag 100 and cost 0
  375. m_nTags[i][j-1]=100;
  376. m_dFrequency[i][j-1]=0;
  377. }
  378. else if(strcmp(m_sWords[i],SENTENCE_END)==0&&j==1)
  379. {//Sentence Ending
  // NOTE(review): j is not advanced here, so the terminator written further down
  // (m_nTags[i][j]=-1) lands on this same slot -- confirm this is intended.
  380. m_nTags[i][j]=101;
  381. m_dFrequency[i][j]=0;
  382. }
  383. /*---------------------------------------------*/
  384. else
  385. {
  386. dictCore.GetHandle(m_sWords[i],&nCount,aPOS,aFreq);
  387. nFreq=0;
  388. for(int k=0;k<nCount;k++) 
  389. {
  390. nFreq+=aFreq[k];
  391. }
  392. if(nCount>0)
  393. {
  // All core-dictionary readings are merged into one extra candidate with tag 0
  394. m_nTags[i][j]=0;
  395. //m_dFrequency[i][j]=(double)(1+nFreq)/(double)(m_context.GetFrequency(0,0)+1);
  396. m_dFrequency[i][j]=-log((double)(1+nFreq))+log((double)(m_context.GetFrequency(0,0)+nPOSCount));
  397. j++;
  398. }
  399. }
  400. }
  401. else//For normal POS tagging
  402. {
  403. j=0;
  404. //Get the POSs from the unknown recognition dictionary
  405. if(pWordItems[nWordsIndex].nHandle>0)
  406. {//The word has only one POS value
  407.  //Its POS and frequency are already recorded in the item.
  408. m_nTags[i][j]=pWordItems[nWordsIndex].nHandle;
  409. /*----Added By huangjin@ict.ac.cn 2006-9-12----*/
  // Guard against log() of a non-positive value
  410. if ( pWordItems[nWordsIndex].dValue <= 0.0 )
  411. m_dFrequency[i][j] = 0.0;
  412. else
  413. /*---------------------------------------------*/
  414. m_dFrequency[i][j]=-log(pWordItems[nWordsIndex].dValue)+log((double)(m_context.GetFrequency(0,m_nTags[i][j])+1));
  415. if(m_dFrequency[i][j]<0)//Not permit the value less than 0
  416. m_dFrequency[i][j]=0;
  417. j++;
  418. }
  419. else
  420. {//The word has multiple POSs, we should retrieve the information from Core Dictionary 
  421. if(pWordItems[nWordsIndex].nHandle<0)
  422. {//A negative handle carries a POS value: it is negated and stored as the
  423.  //first candidate, with the item's dValue taken as its cost.
  424. /*
  425. if(pWordItems[nWordsIndex].nHandle==-'t'*256-'t')//tt
  426. {
  427. char sWordOrg[100],sPostfix[10];
  428. double dRatio=0.6925;//The ratio which transliteration as a person name 
  429. PostfixSplit(pWordItems[nWordsIndex].sWord,sWordOrg,sPostfix);
  430. if(sPostfix[0]!=0)
  431. dRatio=0.01;
  432. m_nTags[i][j]='n'*256+'r';
  433. m_dFrequency[i][j]=-log(dRatio)+pWordItems[nWordsIndex].dValue;
  434. //m_dFrequency[i][j]=log(dRatio)+pWordItems[nWordsIndex].dValue-log(m_context.GetFrequency(0,m_nTags[i][j]))+log(MAX_FREQUENCE);
  435. //P(W|R)=P(WRT)/P(RT)=P(R)*P(W|T)/P(R|T)
  436. j++;
  437. m_nTags[i][j]='n'*256+'s';
  438. m_dFrequency[i][j]=-log(1-dRatio)+pWordItems[nWordsIndex].dValue;
  439. //m_dFrequency[i][j]=log(1-dRatio)+pWordItems[nWordsIndex].dValue-log(m_context.GetFrequency(0,m_nTags[i][j]))+log(MAX_FREQUENCE);
  440. j++;
  441. }
  442. else//Unknown words such as Chinese person name or place name
  443. {
  444. */
  445. m_nTags[i][j]=-pWordItems[nWordsIndex].nHandle;
  446. m_dFrequency[i][j++]=pWordItems[nWordsIndex].dValue;
  447. //}
  448. }
  449. dictCore.GetHandle(m_sWords[i],&nCount,aPOS,aFreq);
  450. nPOSCount=nCount;
  // The loop continues from the current j: when a candidate was stored above
  // (j==1), aPOS[0] is not copied.
  451. for(;j<nCount;j++) 
  452. {//Get the POS set of the word in the core dictionary
  453. m_nTags[i][j]=aPOS[j];
  454. /* 
  455. * ----- commented by huangjin@ict.ac.cn 2006-5-29 ------ 
  456. *  m_dFrequency[i][j]=-log(1+aFreq[j])+log(m_context.GetFrequency(0,m_nTags[i][j])+nPOSCount);
  457. *
  458. */
  459. /*----Added By huangjin@ict.ac.cn 2006-5-29----*/
  460. m_dFrequency[i][j]=-log(double(1+aFreq[j]))+log(double(m_context.GetFrequency(0,m_nTags[i][j])+nPOSCount));
  461. /*---------------------------------------------*/
  462. }
  463. }
  464. }
  465. if(j==0)
  466. {//We do not know the POS, so we have to guess it according to lexical knowledge
  467. GuessPOS(i,&j);// the POS of current word
  468. }
  469. m_nTags[i][j]=-1;//Set the ending POS 
  // NOTE(review): m_nTags[i][j] was set to -1 on the previous line, so the second
  // operand compares -1 with CT_SENTENCE_BEGIN; possibly m_nTags[i][0] was
  // intended -- confirm.
  470. if(j==1&&m_nTags[i][j]!=CT_SENTENCE_BEGIN)//No ambiguity
  471. {//No ambiguity, so we can break from the loop
  472. i++;
  473. m_sWords[i][0]=0;
  474. break;
  475. }
  // While a word is being split, the same item is visited again for its remainder
  476. if(!bSplit)
  477. nWordsIndex++;
  478. }
  479. if(pWordItems[nWordsIndex].sWord[0]==0)
  480. nRetPos=-1;//Reaching ending
  // The last loaded word (slot i-1) still has more than one candidate: append a
  // virtual ending slot so that the path search has a single end point.
  481. if(m_nTags[i-1][1]!=-1)//||m_sWords[i][0]==0
  482. {//Set end for words like "张/华/平"
  483. if(m_tagType!=TT_NORMAL)
  484. m_nTags[i][0]=101;
  485. else
  486. m_nTags[i][0]=1;
  487. m_dFrequency[i][0]=0;
  488.     m_sWords[i][0]=0;//Set virtual ending
  489. /*----Added By huangjin@ict.ac.cn 2006-9-12----*/
  490. if ( i == MAX_WORDS_PER_SENTENCE - 1 )
  491. {//step back one word
  492. nWordsIndex--;
  493. }
  494. /*---------------------------------------------*/
  495.         m_nTags[i++][1]=-1;
  496. }
  497. m_nCurLength=i;//The current word count
  498. if(nRetPos!=-1)
  499. return nWordsIndex+1;//Next start position
  500. return -1;//Reaching ending
  501. }
  502. //Set the tag type
  503. void CSpan::SetTagType(enum TAG_TYPE  nType)
  504. {
  505. m_tagType=nType;
  506. }
  507. //POS tagging with Hidden Markov Model
  508. bool CSpan::POSTagging(PWORD_RESULT pWordItems,CDictionary &dictCore,CDictionary &dictUnknown)
  509. {
  510. //pWordItems: Items; nItemCount: the count of items;core dictionary and unknown recognition dictionary
  511.     int i=0,j,nStartPos;
  512. Reset(false);
  513.     while(i>-1&&pWordItems[i].sWord[0]!=0)
  514. {
  515. nStartPos=i;//Start Position
  516. i=GetFrom(pWordItems,nStartPos,dictCore,dictUnknown);
  517. GetBestPOS();
  518. switch(m_tagType)
  519. {
  520. case TT_NORMAL://normal POS tagging
  521. j=1;
  522. while(m_nBestTag[j]!=-1&&j<m_nCurLength)
  523. {//Store the best POS tagging
  524. pWordItems[j+nStartPos-1].nHandle=m_nBestTag[j];
  525. //Let 。be 0
  526. if(pWordItems[j+nStartPos-1].dValue>0&&dictCore.IsExist(pWordItems[j+nStartPos-1].sWord,-1))//Exist and update its frequncy as a POS value
  527. pWordItems[j+nStartPos-1].dValue=dictCore.GetFrequency(pWordItems[j+nStartPos-1].sWord,m_nBestTag[j]);
  528. j+=1;
  529. }
  530. break;
  531. case TT_PERSON://Person recognition
  532. PersonRecognize(dictUnknown);
  533. break;
  534. case TT_PLACE://Place name recognition
  535. case TT_TRANS_PERSON://Transliteration Person
  536. PlaceRecognize(dictCore,dictUnknown);
  537. break;
  538. default:
  539. break;
  540. }
  541. Reset();
  542. }
  543. return true;
  544. }
  545. //Guess the POS of No. nIndex word item
  546. bool CSpan::GuessPOS(int nIndex,int *pSubIndex)
  547. {
  548. int j=0,i=nIndex,nCharType;
  549. unsigned int nLen;
  550. switch(m_tagType)
  551. {
  552. case TT_NORMAL:
  553. /*----Added By huangjin@ict.ac.cn 2006-9-12----*/
  554. m_nTags[i][j]='x'*256;
  555. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,'x'*256)+1);
  556. /*---------------------------------------------*/
  557. break;
  558. case TT_PERSON:
  559. j=0;
  560. if(CC_Find("××",m_sWords[nIndex]))
  561. {
  562. m_nTags[i][j]=6;
  563. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,6)+1);
  564. }
  565. else
  566. {
  567. m_nTags[i][j]=0;
  568. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
  569. nLen=strlen(m_sWords[nIndex]);
  570. if(nLen>=4)
  571. {
  572. m_nTags[i][j]=0;
  573. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
  574. m_nTags[i][j]=11;
  575. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8);
  576. m_nTags[i][j]=12;
  577. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8);
  578. m_nTags[i][j]=13;
  579. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8);
  580. }
  581. else if(nLen==2)
  582. {
  583. m_nTags[i][j]=0;
  584. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
  585. nCharType=charType((unsigned char *)m_sWords[nIndex]);
  586. if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
  587. {
  588. m_nTags[i][j]=1;
  589. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)+1);
  590. m_nTags[i][j]=2;
  591. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)+1);
  592. m_nTags[i][j]=3;
  593. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)+1);
  594. m_nTags[i][j]=4;
  595. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,4)+1);
  596. }
  597. m_nTags[i][j]=11;
  598. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8);
  599. m_nTags[i][j]=12;
  600. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8);
  601. m_nTags[i][j]=13;
  602. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8);
  603. }
  604. }
  605. break;
  606. case TT_PLACE:
  607. j=0;
  608. m_nTags[i][j]=0;
  609. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
  610. nLen=strlen(m_sWords[nIndex]);
  611. if(nLen>=4)
  612. {
  613. m_nTags[i][j]=11;
  614. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8);
  615. m_nTags[i][j]=12;
  616. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8);
  617. m_nTags[i][j]=13;
  618. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8);
  619. }
  620. else if(nLen==2)
  621. {
  622. m_nTags[i][j]=0;
  623. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
  624. nCharType=charType((unsigned char *)m_sWords[nIndex]);
  625. if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
  626. {
  627. m_nTags[i][j]=1;
  628. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)+1);
  629. m_nTags[i][j]=2;
  630. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)+1);
  631. m_nTags[i][j]=3;
  632. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)+1);
  633. m_nTags[i][j]=4;
  634. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,4)+1);
  635. }
  636. m_nTags[i][j]=11;
  637. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8);
  638. m_nTags[i][j]=12;
  639. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8);
  640. m_nTags[i][j]=13;
  641. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8);
  642. }
  643. break;
  644. case TT_TRANS_PERSON:
  645. j=0;
  646. nLen=strlen(m_sWords[nIndex]);
  647. m_nTags[i][j]=0;
  648. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
  649. if(!IsAllChinese((unsigned char *)m_sWords[nIndex]))
  650. {
  651. if(IsAllLetter((unsigned char *)m_sWords[nIndex]))
  652. {
  653. m_nTags[i][j]=1;
  654. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)+1);
  655. m_nTags[i][j]=11;
  656. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)+1);
  657. m_nTags[i][j]=2;
  658. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)*2+1);
  659. m_nTags[i][j]=3;
  660. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)*2+1);
  661. m_nTags[i][j]=12;
  662. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*2+1);
  663. m_nTags[i][j]=13;
  664. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*2+1);
  665. }
  666. m_nTags[i][j]=41;
  667. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,41)*8);
  668. m_nTags[i][j]=42;
  669. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,42)*8);
  670. m_nTags[i][j]=43;
  671. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,43)*8);
  672. }
  673. else if(nLen>=4)
  674. {
  675. m_nTags[i][j]=41;
  676. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,41)*8);
  677. m_nTags[i][j]=42;
  678. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,42)*8);
  679. m_nTags[i][j]=43;
  680. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,43)*8);
  681. }
  682. else if(nLen==2)
  683. {
  684. nCharType=charType((unsigned char *)m_sWords[nIndex]);
  685. if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
  686. {
  687. m_nTags[i][j]=1;
  688. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)*2+1);
  689. m_nTags[i][j]=2;
  690. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)*2+1);
  691. m_nTags[i][j]=3;
  692. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)*2+1);
  693. m_nTags[i][j]=30;
  694. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,30)*8+1);
  695. m_nTags[i][j]=11;
  696. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*4+1);
  697. m_nTags[i][j]=12;
  698. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*4+1);
  699. m_nTags[i][j]=13;
  700. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*4+1);
  701. m_nTags[i][j]=21;
  702. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,21)*2+1);
  703. m_nTags[i][j]=22;
  704. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,22)*2+1);
  705. m_nTags[i][j]=23;
  706. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,23)*2+1);
  707. }
  708. m_nTags[i][j]=41;
  709. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,41)*8);
  710. m_nTags[i][j]=42;
  711. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,42)*8);
  712. m_nTags[i][j]=43;
  713. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,43)*8);
  714. }
  715. break;
  716. default:
  717. break;
  718. }
  719. *pSubIndex=j;
  720. return true;
  721. }
  722. ELEMENT_TYPE  CSpan::ComputePossibility(int nStartPos,int nLength,CDictionary &dict)
  723. {
  724. ELEMENT_TYPE dRetValue=0,dPOSPoss;
  725. //dPOSPoss: the possibility of a POS appears
  726. //dContextPoss: The possibility of context POS appears
  727. int nFreq;
  728. for(int i=nStartPos;i<nStartPos+nLength;i++)
  729. {
  730. nFreq=dict.GetFrequency(m_sWords[i],m_nBestTag[i]);
  731. //nFreq is word being the POS
  732. dPOSPoss=log((double)(m_context.GetFrequency(0,m_nBestTag[i])+1))-log((double)(nFreq+1));
  733. dRetValue+=dPOSPoss;
  734. /* if(i<nStartPos+nLength-1)
  735. {
  736. dContextPoss=log((double)(m_context.GetContextPossibility(0,m_nBestTag[i],m_nBestTag[i+1])+1));
  737. dRetValue+=dPOSPoss-dContextPoss;
  738. }
  739. */ }
  740. return dRetValue;
  741. }
  742. //DEL bool CSpan::TransRecognize(CDictionary &dictCore,CDictionary &transDict)
  743. //DEL {
  744. //DEL   char sPOS[MAX_WORDS_PER_SENTENCE]="Z";
  745. //DEL   int nStart=1,nEnd=1,i=1;
  746. //DEL   while(m_nBestTag[i]>-1)
  747. //DEL   {
  748. //DEL    if(m_nBestTag[i]==1||m_nBestTag[i]==11||m_nBestTag[i]==21)//1,11,21 Trigger the recognition
  749. //DEL    {
  750. //DEL  nStart=i;
  751. //DEL  nEnd=nStart+1;
  752. //DEL  while(m_nBestTag[nEnd]==m_nBestTag[nStart])//1,11,21
  753. //DEL  nEnd++;
  754. //DEL  while(m_nBestTag[nEnd]==m_nBestTag[nStart]+1)//2,12,22
  755. //DEL  nEnd++;
  756. //DEL  while(m_nBestTag[nEnd]==m_nBestTag[nStart]+2)//3,13,23
  757. //DEL  nEnd++;
  758. //DEL  while(m_nBestTag[nEnd]==30)//3,13,23
  759. //DEL  nEnd++;
  760. //DEL    }
  761. //DEL    else if(m_nBestTag[i]==2||m_nBestTag[i]==12||m_nBestTag[i]==22)//1,11,21 Trigger the recognition
  762. //DEL    {
  763. //DEL  nStart=i;
  764. //DEL  nEnd=nStart+1;
  765. //DEL  while(m_nBestTag[nEnd]==m_nBestTag[nStart])//2,12,22
  766. //DEL  nEnd++;
  767. //DEL  while(m_nBestTag[nEnd]==m_nBestTag[nStart]+1)//2,12,22
  768. //DEL  nEnd++;
  769. //DEL  while(m_nBestTag[nEnd]==30)//3,13,23
  770. //DEL  nEnd++;
  771. //DEL    }
  772. //DEL    if(nEnd>nStart&&!IsAllNum((unsigned char *)m_sWords[nStart])&&(nEnd>nStart+2||(nEnd==nStart+2&&(m_nBestTag[nEnd-1]!=30||strlen(m_sWords[nStart])>2))||(nEnd==nStart+1&&strlen(m_sWords[nStart])>2&&!dictCore.IsExist(m_sWords[nStart],-1))))
  773. //DEL    {
  774. //DEL  m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[nStart];
  775. //DEL  m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[nEnd];
  776. //DEL  m_dWordsPossibility[m_nUnknownIndex++]=ComputePossibility(nStart,nEnd-nStart+1,transDict);
  777. //DEL  nStart=nEnd;
  778. //DEL    }
  779. //DEL 
  780. //DEL    if(i<nEnd)
  781. //DEL    i=nEnd;
  782. //DEL    else
  783. //DEL    i=i+1;
  784. //DEL   }
  785. //DEL   return true;
  786. //DEL }
  787. bool CSpan::PlaceRecognize(CDictionary &dictCore,CDictionary &placeDict)
  788. {
  789.   int nStart=1,nEnd=1,i=1,nTemp;
  790.   double dPanelty=1.0;//Panelty value
  791.   while(m_nBestTag[i]>-1)
  792.   {
  793.   if(m_nBestTag[i]==1)//1 Trigger the recognition procession
  794.   {
  795. nStart=i;
  796. nEnd=nStart+1;
  797. while(m_nBestTag[nEnd]==1)//
  798. {
  799. if(nEnd>nStart+1)
  800. dPanelty+=1.0;
  801. nEnd++;
  802. }
  803. while(m_nBestTag[nEnd]==2)//2,12,22
  804. nEnd++;
  805. nTemp=nEnd;
  806. while(m_nBestTag[nEnd]==3)
  807. {
  808. if(nEnd>nTemp)
  809. dPanelty+=1.0;
  810. nEnd++;
  811. }
  812.   }
  813.   else if(m_nBestTag[i]==2)//1,11,21 Trigger the recognition
  814.   {
  815. dPanelty+=1.0;
  816. nStart=i;
  817. nEnd=nStart+1;
  818. while(m_nBestTag[nEnd]==2)//2
  819. nEnd++;
  820. nTemp=nEnd;
  821. while(m_nBestTag[nEnd]==3)//2
  822. {
  823. if(nEnd>nTemp)
  824. dPanelty+=1.0;
  825. nEnd++;
  826. }
  827.   }
  828.   if(nEnd>nStart)
  829.   {
  830. m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[nStart];
  831. m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[nEnd];
  832. m_dWordsPossibility[m_nUnknownIndex++]=ComputePossibility(nStart,nEnd-nStart+1,placeDict)+log(dPanelty);
  833. nStart=nEnd;
  834.   }
  835.   if(i<nEnd)
  836.   i=nEnd;
  837.   else
  838.   i=i+1;
  839.   }
  840.   return true;
  841. }
  842. //DEL bool CSpan::TransPersonRecognize(CDictionary &dictCore, CDictionary &transDict)
  843. //DEL {
  844. //DEL   int nStart=1,nEnd=1,i=1;
  845. //DEL   while(m_nBestTag[i]>-1)
  846. //DEL   {
  847. //DEL    if(m_nBestTag[i]==1)//1,11,21 Trigger the recognition
  848. //DEL    {
  849. //DEL  nStart=i;
  850. //DEL  nEnd=nStart+1;
  851. //DEL  while(m_nBestTag[nEnd]==m_nBestTag[nStart])//1,11,21
  852. //DEL  nEnd++;
  853. //DEL  while(m_nBestTag[nEnd]==m_nBestTag[nStart]+1)//2,12,22
  854. //DEL  nEnd++;
  855. //DEL  while(m_nBestTag[nEnd]==m_nBestTag[nStart]+2)//3,13,23
  856. //DEL  nEnd++;
  857. //DEL    }
  858. //DEL    else if(m_nBestTag[i]==2)//1,11,21 Trigger the recognition
  859. //DEL    {
  860. //DEL  nStart=i;
  861. //DEL  nEnd=nStart+1;
  862. //DEL  while(m_nBestTag[nEnd]==m_nBestTag[nStart])//2,12,22
  863. //DEL  nEnd++;
  864. //DEL  while(m_nBestTag[nEnd]==m_nBestTag[nStart]+1)//2,12,22
  865. //DEL  nEnd++;
  866. //DEL    }
  867. //DEL    if(nEnd>nStart&&!IsAllNum((unsigned char *)m_sWords[nStart])&&(nEnd>nStart+2||(nEnd==nStart+2&&(m_nBestTag[nEnd-1]!=30||strlen(m_sWords[nStart])>2))||(nEnd==nStart+1&&strlen(m_sWords[nStart])>2&&!dictCore.IsExist(m_sWords[nStart],-1))))
  868. //DEL    {
  869. //DEL  m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[nStart];
  870. //DEL  m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[nEnd];
  871. //DEL  m_dWordsPossibility[m_nUnknownIndex++]=ComputePossibility(nStart,nEnd-nStart+1,transDict);
  872. //DEL  nStart=nEnd;
  873. //DEL    }
  874. //DEL 
  875. //DEL    if(i<nEnd)
  876. //DEL    i=nEnd;
  877. //DEL    else
  878. //DEL    i=i+1;
  879. //DEL   }
  880. //DEL   return true;
  881. //DEL }