Span.cpp
上传用户:yxl0916
上传日期:2007-05-25
资源大小:2245k
文件大小:27k
源码类别:

多国语言处理

开发平台:

Visual C++

  1. //////////////////////////////////////////////////////////////////////
  2. //ICTCLAS简介:计算所汉语词法分析系统ICTCLAS(Institute of Computing Technology, Chinese Lexical Analysis System),
  3. //             功能有:中文分词;词性标注;未登录词识别。
  4. //             分词正确率高达97.58%(973专家评测结果),
  5. //             未登录词识别召回率均高于90%,其中中国人名的识别召回率接近98%;
  6. //             处理速度为31.5Kbytes/s。
  7. //著作权:  Copyright © 2002-2005 中科院计算所 职务著作权人:张华平 刘群
  8. //遵循协议:自然语言处理开放资源许可证1.0
  9. //Email: zhanghp@software.ict.ac.cn
  10. //Homepage:www.nlp.org.cn;mtgroup.ict.ac.cn
  11. /****************************************************************************
  12.  *
  13.  * Copyright (c) 2000, 2001 
  14.  *     Machine Group
  15.  *     Software Research Lab.
  16.  *     Institute of Computing Tech.
  17.  *     Chinese Academy of Sciences
  18.  *     All rights reserved.
  19.  *
  20.  * This file is the confidential and proprietary property of 
  21.  * Institute of Computing Tech. and the possession or use of this file requires 
  22.  * a written license from the author.
  23.  * Filename: Span.cpp
  24.  * Abstract:
  25.  *           implementation of the CSpan class.
  26.  * Author:   Kevin Zhang 
  27.  *          (zhanghp@software.ict.ac.cn)
  28.  * Date:     2002-4-23
  29.  *
  30.  * Notes:    Tagging with Hidden Markov Model
  31.  *                
  32.  ****************************************************************************/
  33. #include "stdafx.h"
  34. #include "Span.h"
  35. #include "..\Segment\Segment.h"
  36. #include "..\Utility\Utility.h"
  37. #include <math.h>
  38. #include <string.h>
  39. #include <stdio.h>
  40. #include <time.h>
  41. //////////////////////////////////////////////////////////////////////
  42. // Construction/Destruction
  43. //////////////////////////////////////////////////////////////////////
  44. CSpan::CSpan()
  45. {
  46. if(m_tagType!=TT_NORMAL)
  47.       m_nTags[0][0]=100;//Begin tag
  48. else
  49.       m_nTags[0][0]=0;//Begin tag
  50. m_nTags[0][1]=-1;
  51. m_dFrequency[0][0]=0;
  52. m_nCurLength=1;
  53. m_nUnknownIndex=0;
  54. m_nStartPos=0;
  55. m_nWordPosition[1]=0;
  56. m_sWords[0][0]=0;
  57. m_tagType=TT_NORMAL;//Default tagging type
  58. }
  59. CSpan::~CSpan()
  60. {
  61. }
  62. bool CSpan::Disamb()
  63. {
  64. int i,j,k,nMinCandidate;
  65. double dMinFee,dTmp;
  66. for(i=1;i<m_nCurLength;i++)//For every word
  67. {
  68. for(j=0;m_nTags[i][j]>=0;j++)//For every word
  69. {
  70. nMinCandidate=MAX_POS_PER_WORD+1;
  71. for(k=0;m_nTags[i-1][k]>=0;k++)
  72. {
  73. //ConvertPOS(m_nTags[i-1][k],&nKey,&nPrevPOS);
  74. //ConvertPOS(m_nTags[i][j],&nKey,&nCurPOS);
  75. //dTmp=m_context.GetContextPossibility(nKey,nPrevPOS,nCurPOS);
  76. dTmp=-log(m_context.GetContextPossibility(0,m_nTags[i-1][k],m_nTags[i][j]));
  77. dTmp+=m_dFrequency[i-1][k];//Add the fees
  78. if(nMinCandidate>10||dTmp<dMinFee)//Get the minimum fee
  79. {
  80. nMinCandidate=k;
  81. dMinFee=dTmp;
  82. }
  83. }
  84. m_nBestPrev[i][j]=nMinCandidate;//The best previous for j
  85. m_dFrequency[i][j]=m_dFrequency[i][j]+dMinFee;
  86. }
  87. }
  88. return true;
  89. }
  90. bool CSpan::Reset(bool bContinue)
  91. {
  92. if(!bContinue)
  93. {//||CC_Find("。!”〕〉》」〗】",m_sWords[m_nCurLength-1])
  94. if(m_tagType!=TT_NORMAL)//Get the last POS in the last sentence
  95.       m_nTags[0][0]=100;//Begin tag
  96. else
  97.       m_nTags[0][0]=0;//Begin tag
  98. m_nUnknownIndex=0;
  99. m_dFrequency[0][0]=0;
  100. m_nStartPos=0;
  101. }
  102. else
  103. {
  104. m_nTags[0][0]=m_nTags[m_nCurLength-1][0];//Get the last POS in the last sentence
  105. m_dFrequency[0][0]=m_dFrequency[m_nCurLength-1][0];
  106. }
  107.     m_nTags[0][1]=-1;//Get the last POS in the last sentence,set the -1 as end flag
  108. m_nCurLength=1;
  109. m_nWordPosition[1]=m_nStartPos;
  110. m_sWords[0][0]=0;
  111. return true;
  112. }
  113. bool CSpan::LoadContext(char *sFilename)
  114. {
  115. return m_context.Load(sFilename);
  116. }
  117. bool CSpan::GetBestPOS()
  118. {
  119.   Disamb();
  120.   for(int i=m_nCurLength-1,j=0;i>0;i--)//,j>=0
  121.   {
  122.  if(m_sWords[i][0])
  123.  {//Not virtual ending
  124.  m_nBestTag[i]=m_nTags[i][j];//Record the best POS and its possibility
  125.  }
  126.  j=m_nBestPrev[i][j];
  127.   }
  128.   int nEnd=m_nCurLength;//Set the end of POS tagging
  129.   if(m_sWords[m_nCurLength-1][0]==0)
  130.   nEnd=m_nCurLength-1;
  131.   m_nBestTag[nEnd]=-1;
  132.   return true;
  133. }
  134. bool CSpan::PersonRecognize(CDictionary &personDict)
  135. {
  136.   char sPOS[MAX_WORDS_PER_SENTENCE]="z",sPersonName[100];
  137.                           //0     1    2    3    4   5   
  138.   char sPatterns[][5]={ "BBCD","BBC","BBE","BBZ","BCD","BEE","BE","BG",
  139.                     "BXD","BZ", "CDCD","CD","EE", "FB", "Y","XD",""};
  140.                     //BBCD        BBC       BBE     BBZ    BCD         BEE      BE         BG
  141.   double dFactor[]={0.003606,0.000021,0.001314,0.000315,0.656624, 0.000021,0.146116,0.009136,
  142.                // BXD      BZ   CDCD     CD      EE      FB       Y         XD  
  143. 0.000042,0.038971,0,0.090367,0.000273,0.009157,0.034324,0.009735,0
  144.   };
  145.   //About parameter:
  146. /*
  147. BBCD 343 0.003606
  148. BBC 2 0.000021
  149. BBE 125 0.001314
  150. BBZ 30 0.000315
  151. BCD 62460 0.656624
  152. BEE 0 0.000000
  153. BE 13899 0.146116
  154. BG 869 0.009136
  155. BXD 4 0.000042
  156. BZ 3707 0.038971
  157. CD 8596 0.090367
  158. EE 26 0.000273
  159. FB 871 0.009157
  160. Y 3265 0.034324
  161. XD 926 0.009735
  162.  */
  163.   //The person recognition patterns set
  164.   //BBCD:姓+姓+名1+名2;
  165.   //BBE: 姓+姓+单名;
  166.   //BBZ: 姓+姓+双名成词;
  167.   //BCD: 姓+名1+名2;
  168.   //BE:  姓+单名;
  169.   //BEE: 姓+单名+单名;韩磊磊
  170.   //BG:  姓+后缀
  171.   //BXD: 姓+姓双名首字成词+双名末字
  172.   //BZ:  姓+双名成词;
  173.   //B:  姓
  174.   //CD:  名1+名2;
  175.   //EE:  单名+单名;
  176.   //FB:  前缀+姓
  177.   //XD:  姓双名首字成词+双名末字
  178.   //Y:   姓单名成词
  179.   int nPatternLen[]={4,3,3,3,3,3,2,2,3,2,4,2,2,2,1,2,0};
  180.   for(int i=1;m_nBestTag[i]>-1;i++)//Convert to string from POS
  181. sPOS[i]=m_nBestTag[i]+'A';
  182.   sPOS[i]=0;
  183.   int j=1,k,nPos;//Find the proper pattern from the first POS
  184.   int nLittleFreqCount;//Counter for the person name role with little frequecy
  185.   bool bMatched=false;   
  186.   while(j<i)
  187.   {
  188. bMatched=false;   
  189. for(k=0;!bMatched&&nPatternLen[k]>0;k++)
  190. {
  191. if(strncmp(sPatterns[k],sPOS+j,nPatternLen[k])==0&&strcmp(m_sWords[j-1],"·")!=0&&strcmp(m_sWords[j+nPatternLen[k]],"·")!=0)
  192. {//Find the proper pattern k
  193. if(strcmp(sPatterns[k],"FB")==0&&(sPOS[j+2]=='E'||sPOS[j+2]=='C'||sPOS[j+2]=='G'))
  194. {//Rule 1 for exclusion:前缀+姓+名1(名2): 规则(前缀+姓)失效;
  195. continue;
  196. }
  197. /* if((strcmp(sPatterns[k],"BEE")==0||strcmp(sPatterns[k],"EE")==0)&&strcmp(m_sWords[j+nPatternLen[k]-1],m_sWords[j+nPatternLen[k]-2])!=0)
  198. {//Rule 2 for exclusion:姓+单名+单名:单名+单名 若EE对应的字不同,规则失效.如:韩磊磊
  199. continue;
  200. }
  201. if(strcmp(sPatterns[k],"B")==0&&m_nBestTag[j+1]!=12)
  202. {//Rule 3 for exclusion: 若姓后不是后缀,规则失效.如:江主席、刘大娘
  203. continue;
  204. }
  205. */ //Get the possible name
  206. nPos=j;//Record the person position in the tag sequence
  207. sPersonName[0]=0;
  208. nLittleFreqCount=0;//Record the number of role with little frequency
  209. while(nPos<j+nPatternLen[k])
  210. {//Get the possible person name
  211.  //
  212. if(m_nBestTag[nPos]<4&&personDict.GetFrequency(m_sWords[nPos],m_nBestTag[nPos])<LITTLE_FREQUENCY)
  213. nLittleFreqCount++;//The counter increase
  214. strcat(sPersonName,m_sWords[nPos]);
  215. nPos+=1;
  216. }
  217. /*
  218. if(IsAllForeign(sPersonName)&&personDict.GetFrequency(m_sWords[j],1)<LITTLE_FREQUENCY)
  219. {//Exclusion foreign name
  220.  //Rule 2 for exclusion:若均为外国人名用字 规则(名1+名2)失效
  221. j+=nPatternLen[k]-1;
  222. continue;
  223. }
  224. */ if(strcmp(sPatterns[k],"CDCD")==0)
  225. {//Rule for exclusion
  226.  //规则(名1+名2+名1+名2)本身是排除规则:女高音歌唱家迪里拜尔演唱
  227.    //Rule 3 for exclusion:含外国人名用字 规则适用
  228.  //否则,排除规则失效:黑妞白妞姐俩拔了头筹。
  229. if(GetForeignCharCount(sPersonName)>0)
  230. j+=nPatternLen[k]-1;
  231. continue;
  232. }
  233. /* if(strcmp(sPatterns[k],"CD")==0&&IsAllForeign(sPersonName))
  234. {//
  235. j+=nPatternLen[k]-1;
  236. continue;
  237. }
  238. if(nLittleFreqCount==nPatternLen[k]||nLittleFreqCount==3)
  239. //马哈蒂尔;小扎耶德与他的中国阿姨胡彩玲受华黎明大使之邀,
  240. //The all roles appear with two lower frequecy,we will ignore them
  241. continue;
  242. */ m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[j];
  243. m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[j+nPatternLen[k]];
  244. m_dWordsPossibility[m_nUnknownIndex]=-log(dFactor[k])+ComputePossibility(j,nPatternLen[k],personDict);
  245. //Mutiply the factor 
  246. m_nUnknownIndex+=1;
  247. j+=nPatternLen[k];
  248. bMatched=true;
  249. }
  250. }
  251.     if(!bMatched)//Not matched, add j by 1
  252. j+=1;
  253.   }
  254.   return true;
  255. }
  // Fill the span buffers from pWordItems, starting at item nIndex.
  // For each word copied into m_sWords[i] (i starts at 1) this records the byte
  // position of the following word in m_nWordPosition[i+1], collects the
  // candidate tags in m_nTags[i][] (terminated by -1) and their costs in
  // m_dFrequency[i][]. The loop stops at the empty terminating item, at
  // MAX_WORDS_PER_SENTENCE, or right after a word that ended up with exactly
  // one candidate. m_nCurLength receives the number of slots used.
  // Returns nWordsIndex+1 (next start position) or -1 when the terminating
  // empty item of pWordItems was reached.
  256. int CSpan::GetFrom(PWORD_RESULT pWordItems,int nIndex,CDictionary &dictCore, CDictionary &dictUnknown)
  257. {
  258. int nCount,aPOS[MAX_POS_PER_WORD],aFreq[MAX_POS_PER_WORD];
  259. int nFreq=0,j,nRetPos=0,nWordsIndex=0;
  260. bool bSplit=false;//Need to split in Transliteration recognition 
  261.     int i=1,nPOSCount;
  262. char sCurWord[WORD_MAXLENGTH];//Current word
  263. nWordsIndex=i+nIndex-1;
  264. for(;i<MAX_WORDS_PER_SENTENCE&&pWordItems[nWordsIndex].sWord[0]!=0;i++)
  265. {
  // Outside normal tagging, an item for which dictUnknown.IsExist(word,44) is
  // true is consumed in two steps (bSplit toggles): first its first 2 bytes,
  // then everything from byte 2 on.
  266. if(m_tagType==TT_NORMAL||!dictUnknown.IsExist(pWordItems[nWordsIndex].sWord,44))
  267.         {
  268. strcpy(m_sWords[i],pWordItems[nWordsIndex].sWord);//store current word
  269.         m_nWordPosition[i+1]=m_nWordPosition[i]+strlen(m_sWords[i]);
  270. }
  271. else
  272. {
  273. if(!bSplit)
  274. {
  275. strncpy(m_sWords[i],pWordItems[nWordsIndex].sWord,2);//store the first 2 bytes of the item
  276. m_sWords[i][2]=0;
  277. bSplit=true;
  278. }
  279. else
  280. {
  281. unsigned int nLen=strlen(pWordItems[nWordsIndex].sWord+2);
  282. strncpy(m_sWords[i],pWordItems[nWordsIndex].sWord+2,nLen);//store the rest of the item
  283. m_sWords[i][nLen]=0;
  284. bSplit=false;
  285. }
  286.         m_nWordPosition[i+1]=m_nWordPosition[i]+strlen(m_sWords[i]);
  287. }
  288. //Record the position of current word
  289. m_nStartPos=m_nWordPosition[i+1];
  290. //Move the Start POS to the ending
  291. if(m_tagType!=TT_NORMAL)
  292. {
  293. //Get the POSs from the unknown recognition dictionary
  294. strcpy(sCurWord,m_sWords[i]);
  // NOTE(review): as written the two strcpy calls below copy "." over "." and
  // "-" over "-", i.e. they change nothing; possibly different (full-width)
  // characters were intended - confirm against the original source encoding.
  295. if(m_tagType==TT_TRANS_PERSON&&i>0&&charType((unsigned char*)m_sWords[i-1])==CT_CHINESE)
  296. {
  297. if(m_sWords[i][0]=='.'&&m_sWords[i][1]==0)
  298. strcpy(sCurWord,".");
  299. else if(m_sWords[i][0]=='-'&&m_sWords[i][1]==0)
  300. strcpy(sCurWord,"-");
  301. }
  302. dictUnknown.GetHandle(sCurWord,&nCount,aPOS,aFreq);
  303. nPOSCount=nCount+1;
  304. for(j=0;j<nCount;j++) 
  305. {//Get the POS set of sCurWord in the unknown dictionary
  306. m_nTags[i][j]=aPOS[j];
  307.     m_dFrequency[i][j]=-log((double)(1+aFreq[j]))+log((double)(m_context.GetFrequency(0,aPOS[j])+nPOSCount));
  308. }
  309. //Get the POS set of sCurWord in the core dictionary
  310. //We ignore the POS in the core dictionary and recognize them as other (0).
  311. //We add their frequency to get the possibility as POS 0
  // The sentence begin/end marker words get the fixed tags 100/101 with cost 0.
  312. if(strcmp(m_sWords[i],"始##始")==0)
  313. {
  314. m_nTags[i][j]=100;
  315.     m_dFrequency[i][j]=0;
  316. j++;
  317. }
  318. else if(strcmp(m_sWords[i],"末##末")==0)
  319. {
  320. m_nTags[i][j]=101;
  321.     m_dFrequency[i][j]=0;
  322. j++;
  323. }
  324. else
  325. {
  326. dictCore.GetHandle(m_sWords[i],&nCount,aPOS,aFreq);
  327. nFreq=0;
  328. for(int k=0;k<nCount;k++) 
  329. {
  330. nFreq+=aFreq[k];
  331. }
  332. if(nCount>0)
  333. {
  334. m_nTags[i][j]=0;
  335. //m_dFrequency[i][j]=(double)(1+nFreq)/(double)(m_context.GetFrequency(0,0)+1);
  336. m_dFrequency[i][j]=-log((double)(1+nFreq))+log((double)(m_context.GetFrequency(0,0)+nPOSCount));
  337. j++;
  338. }
  339. }
  340. }
  341. else//For normal POS tagging
  342. {
  343. j=0;
  344. //Get the POSs from the unknown recognition dictionary
  345. if(pWordItems[nWordsIndex].nHandle>0)
  346. {//The word has only one POS value
  347.  //We have recorded its POS and frequency in the items.
  348. m_nTags[i][j]=pWordItems[nWordsIndex].nHandle;
  349. m_dFrequency[i][j]=-log(pWordItems[nWordsIndex].dValue)+log((double)(m_context.GetFrequency(0,m_nTags[i][j])+1));
  350. if(m_dFrequency[i][j]<0)//Not permit the value less than 0
  351. m_dFrequency[i][j]=0;
  352. j++;
  353. }
  354. else
  355. {//The word has multiple POSs, we should retrieve the information from Core Dictionary 
  356. if(pWordItems[nWordsIndex].nHandle<0)
  357. {//Negative handle: its negation is stored as the tag and dValue is
  358.  //taken over unchanged as the cost.
  359. /*
  360. if(pWordItems[nWordsIndex].nHandle==-'t'*256-'t')//tt
  361. {
  362. char sWordOrg[100],sPostfix[10];
  363. double dRatio=0.6925;//The ratio which transliteration as a person name 
  364. PostfixSplit(pWordItems[nWordsIndex].sWord,sWordOrg,sPostfix);
  365. if(sPostfix[0]!=0)
  366. dRatio=0.01;
  367. m_nTags[i][j]='n'*256+'r';
  368. m_dFrequency[i][j]=-log(dRatio)+pWordItems[nWordsIndex].dValue;
  369. //m_dFrequency[i][j]=log(dRatio)+pWordItems[nWordsIndex].dValue-log(m_context.GetFrequency(0,m_nTags[i][j]))+log(MAX_FREQUENCE);
  370. //P(W|R)=P(WRT)/P(RT)=P(R)*P(W|T)/P(R|T)
  371. j++;
  372. m_nTags[i][j]='n'*256+'s';
  373. m_dFrequency[i][j]=-log(1-dRatio)+pWordItems[nWordsIndex].dValue;
  374. //m_dFrequency[i][j]=log(1-dRatio)+pWordItems[nWordsIndex].dValue-log(m_context.GetFrequency(0,m_nTags[i][j]))+log(MAX_FREQUENCE);
  375. j++;
  376. }
  377. else//Unknown words such as Chinese person name or place name
  378. {
  379. */
  380. m_nTags[i][j]=-pWordItems[nWordsIndex].nHandle;
  381. m_dFrequency[i][j++]=pWordItems[nWordsIndex].dValue;
  382. //}
  383. }
  384. dictCore.GetHandle(m_sWords[i],&nCount,aPOS,aFreq);
  385. nPOSCount=nCount;
  // NOTE(review): this loop starts at the current j, so when one candidate was
  // already stored above (j==1) aPOS[0]/aFreq[0] are never copied - confirm
  // whether skipping the first core-dictionary entry is intended.
  386. for(;j<nCount;j++) 
  387. {//Get the POS set of the word in the core dictionary
  388. m_nTags[i][j]=aPOS[j];
  389.     m_dFrequency[i][j]=-log(1+aFreq[j])+log(m_context.GetFrequency(0,m_nTags[i][j])+nPOSCount);
  390. }
  391. }
  392. }
  393. if(j==0)
  394. {//We donot know the POS, so we have to guess them according lexical knowledge
  395. GuessPOS(i,&j);//Guess the POS of current word
  396. }
  397. m_nTags[i][j]=-1;//Set the ending POS 
  // NOTE(review): m_nTags[i][j] was assigned -1 on the previous line, so the
  // second operand compares -1 with CT_SENTENCE_BEGIN; m_nTags[i][0] may have
  // been intended - confirm before changing, it decides where spans end.
  398. if(j==1&&m_nTags[i][j]!=CT_SENTENCE_BEGIN)//No ambiguity
  399. {//No ambiguity, so we can break from the loop
  400. i++;
  401. m_sWords[i][0]=0;
  402. break;
  403. }
  // While the second half of a split item is still pending, stay on the same item.
  404. if(!bSplit)
  405. nWordsIndex++;
  406. }
  407. if(pWordItems[nWordsIndex].sWord[0]==0)
  408. nRetPos=-1;//Reaching ending
  // If the last word has more than one candidate, append a virtual ending node
  // (empty text, tag 101 or 1, cost 0).
  409. if(m_nTags[i-1][1]!=-1)//||m_sWords[i][0]==0
  410. {//Set end for words like "张/华/平" (a name split into single characters)
  411. if(m_tagType!=TT_NORMAL)
  412.        m_nTags[i][0]=101;
  413. else
  414.        m_nTags[i][0]=1;
  415. m_dFrequency[i][0]=0;
  416.     m_sWords[i][0]=0;//Set virtual ending
  417. m_nTags[i++][1]=-1;
  418. }
  419. m_nCurLength=i;//The current word count
  420. if(nRetPos!=-1)
  421. return nWordsIndex+1;//Next start position
  422. return -1;//Reaching ending
  423. }
  424. //Set the tag type
  425. void CSpan::SetTagType(enum TAG_TYPE  nType)
  426. {
  427. m_tagType=nType;
  428. }
  429. //POS tagging with Hidden Markov Model
  430. bool CSpan::POSTagging(PWORD_RESULT pWordItems,CDictionary &dictCore,CDictionary &dictUnknown)
  431. {
  432. //pWordItems: Items; nItemCount: the count of items;core dictionary and unknown recognition dictionary
  433.     int i=0,j,nStartPos;
  434. Reset(false);
  435.     while(i>-1&&pWordItems[i].sWord[0]!=0)
  436. {
  437. nStartPos=i;//Start Position
  438. i=GetFrom(pWordItems,nStartPos,dictCore,dictUnknown);
  439. GetBestPOS();
  440. switch(m_tagType)
  441. {
  442. case TT_NORMAL://normal POS tagging
  443. j=1;
  444. while(m_nBestTag[j]!=-1&&j<m_nCurLength)
  445. {//Store the best POS tagging
  446. pWordItems[j+nStartPos-1].nHandle=m_nBestTag[j];
  447. //Let 。be 0
  448. if(pWordItems[j+nStartPos-1].dValue>0&&dictCore.IsExist(pWordItems[j+nStartPos-1].sWord,-1))//Exist and update its frequncy as a POS value
  449. pWordItems[j+nStartPos-1].dValue=dictCore.GetFrequency(pWordItems[j+nStartPos-1].sWord,m_nBestTag[j]);
  450. j+=1;
  451. }
  452. break;
  453. case TT_PERSON://Person recognition
  454. PersonRecognize(dictUnknown);
  455. break;
  456. case TT_PLACE://Place name recognition
  457. case TT_TRANS_PERSON://Transliteration Person
  458. PlaceRecognize(dictCore,dictUnknown);
  459. break;
  460. default:
  461. break;
  462. }
  463. Reset();
  464. }
  465. return true;
  466. }
  467. //Guess the POS of No. nIndex word item
  468. bool CSpan::GuessPOS(int nIndex,int *pSubIndex)
  469. {
  470. int j=0,i=nIndex,nCharType;
  471. unsigned int nLen;
  472. switch(m_tagType)
  473. {
  474. case TT_NORMAL:
  475. break;
  476. case TT_PERSON:
  477. j=0;
  478. if(CC_Find("××",m_sWords[nIndex]))
  479. {
  480. m_nTags[i][j]=6;
  481. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,6)+1);
  482. }
  483. else
  484. {
  485. m_nTags[i][j]=0;
  486. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
  487. nLen=strlen(m_sWords[nIndex]);
  488. if(nLen>=4)
  489. {
  490. m_nTags[i][j]=0;
  491. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
  492. m_nTags[i][j]=11;
  493. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8);
  494. m_nTags[i][j]=12;
  495. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8);
  496. m_nTags[i][j]=13;
  497. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8);
  498. }
  499. else if(nLen==2)
  500. {
  501. m_nTags[i][j]=0;
  502. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
  503. nCharType=charType((unsigned char *)m_sWords[nIndex]);
  504. if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
  505. {
  506. m_nTags[i][j]=1;
  507. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)+1);
  508. m_nTags[i][j]=2;
  509. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)+1);
  510. m_nTags[i][j]=3;
  511. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)+1);
  512. m_nTags[i][j]=4;
  513. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,4)+1);
  514. }
  515. m_nTags[i][j]=11;
  516. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8);
  517. m_nTags[i][j]=12;
  518. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8);
  519. m_nTags[i][j]=13;
  520. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8);
  521. }
  522. }
  523. break;
  524. case TT_PLACE:
  525. j=0;
  526. m_nTags[i][j]=0;
  527. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
  528. nLen=strlen(m_sWords[nIndex]);
  529. if(nLen>=4)
  530. {
  531. m_nTags[i][j]=11;
  532. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8);
  533. m_nTags[i][j]=12;
  534. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8);
  535. m_nTags[i][j]=13;
  536. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8);
  537. }
  538. else if(nLen==2)
  539. {
  540. m_nTags[i][j]=0;
  541. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
  542. nCharType=charType((unsigned char *)m_sWords[nIndex]);
  543. if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
  544. {
  545. m_nTags[i][j]=1;
  546. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)+1);
  547. m_nTags[i][j]=2;
  548. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)+1);
  549. m_nTags[i][j]=3;
  550. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)+1);
  551. m_nTags[i][j]=4;
  552. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,4)+1);
  553. }
  554. m_nTags[i][j]=11;
  555. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8);
  556. m_nTags[i][j]=12;
  557. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8);
  558. m_nTags[i][j]=13;
  559. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8);
  560. }
  561. break;
  562. case TT_TRANS_PERSON:
  563. j=0;
  564. nLen=strlen(m_sWords[nIndex]);
  565. m_nTags[i][j]=0;
  566. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
  567. if(!IsAllChinese((unsigned char *)m_sWords[nIndex]))
  568. {
  569. if(IsAllLetter((unsigned char *)m_sWords[nIndex]))
  570. {
  571. m_nTags[i][j]=1;
  572. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)+1);
  573. m_nTags[i][j]=11;
  574. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)+1);
  575. m_nTags[i][j]=2;
  576. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)*2+1);
  577. m_nTags[i][j]=3;
  578. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)*2+1);
  579. m_nTags[i][j]=12;
  580. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*2+1);
  581. m_nTags[i][j]=13;
  582. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*2+1);
  583. }
  584. m_nTags[i][j]=41;
  585. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,41)*8);
  586. m_nTags[i][j]=42;
  587. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,42)*8);
  588. m_nTags[i][j]=43;
  589. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,43)*8);
  590. }
  591. else if(nLen>=4)
  592. {
  593. m_nTags[i][j]=41;
  594. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,41)*8);
  595. m_nTags[i][j]=42;
  596. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,42)*8);
  597. m_nTags[i][j]=43;
  598. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,43)*8);
  599. }
  600. else if(nLen==2)
  601. {
  602. nCharType=charType((unsigned char *)m_sWords[nIndex]);
  603. if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
  604. {
  605. m_nTags[i][j]=1;
  606. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)*2+1);
  607. m_nTags[i][j]=2;
  608. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)*2+1);
  609. m_nTags[i][j]=3;
  610. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)*2+1);
  611. m_nTags[i][j]=30;
  612. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,30)*8+1);
  613. m_nTags[i][j]=11;
  614. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*4+1);
  615. m_nTags[i][j]=12;
  616. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*4+1);
  617. m_nTags[i][j]=13;
  618. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*4+1);
  619. m_nTags[i][j]=21;
  620. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,21)*2+1);
  621. m_nTags[i][j]=22;
  622. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,22)*2+1);
  623. m_nTags[i][j]=23;
  624. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,23)*2+1);
  625. }
  626. m_nTags[i][j]=41;
  627. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,41)*8);
  628. m_nTags[i][j]=42;
  629. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,42)*8);
  630. m_nTags[i][j]=43;
  631. m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,43)*8);
  632. }
  633. break;
  634. default:
  635. break;
  636. }
  637. *pSubIndex=j;
  638. return true;
  639. }
  640. ELEMENT_TYPE  CSpan::ComputePossibility(int nStartPos,int nLength,CDictionary &dict)
  641. {
  642. ELEMENT_TYPE dRetValue=0,dPOSPoss;
  643. //dPOSPoss: the possibility of a POS appears
  644. //dContextPoss: The possibility of context POS appears
  645. int nFreq;
  646. for(int i=nStartPos;i<nStartPos+nLength;i++)
  647. {
  648. nFreq=dict.GetFrequency(m_sWords[i],m_nBestTag[i]);
  649. //nFreq is word being the POS
  650. dPOSPoss=log((double)(m_context.GetFrequency(0,m_nBestTag[i])+1))-log((double)(nFreq+1));
  651. dRetValue+=dPOSPoss;
  652. /* if(i<nStartPos+nLength-1)
  653. {
  654. dContextPoss=log((double)(m_context.GetContextPossibility(0,m_nBestTag[i],m_nBestTag[i+1])+1));
  655. dRetValue+=dPOSPoss-dContextPoss;
  656. }
  657. */ }
  658. return dRetValue;
  659. }
  660. //DEL bool CSpan::TransRecognize(CDictionary &dictCore,CDictionary &transDict)
  661. //DEL {
  662. //DEL   char sPOS[MAX_WORDS_PER_SENTENCE]="Z";
  663. //DEL   int nStart=1,nEnd=1,i=1;
  664. //DEL   while(m_nBestTag[i]>-1)
  665. //DEL   {
  666. //DEL    if(m_nBestTag[i]==1||m_nBestTag[i]==11||m_nBestTag[i]==21)//1,11,21 Trigger the recognition
  667. //DEL    {
  668. //DEL  nStart=i;
  669. //DEL  nEnd=nStart+1;
  670. //DEL  while(m_nBestTag[nEnd]==m_nBestTag[nStart])//1,11,21
  671. //DEL  nEnd++;
  672. //DEL  while(m_nBestTag[nEnd]==m_nBestTag[nStart]+1)//2,12,22
  673. //DEL  nEnd++;
  674. //DEL  while(m_nBestTag[nEnd]==m_nBestTag[nStart]+2)//3,13,23
  675. //DEL  nEnd++;
  676. //DEL  while(m_nBestTag[nEnd]==30)//3,13,23
  677. //DEL  nEnd++;
  678. //DEL    }
  679. //DEL    else if(m_nBestTag[i]==2||m_nBestTag[i]==12||m_nBestTag[i]==22)//1,11,21 Trigger the recognition
  680. //DEL    {
  681. //DEL  nStart=i;
  682. //DEL  nEnd=nStart+1;
  683. //DEL  while(m_nBestTag[nEnd]==m_nBestTag[nStart])//2,12,22
  684. //DEL  nEnd++;
  685. //DEL  while(m_nBestTag[nEnd]==m_nBestTag[nStart]+1)//2,12,22
  686. //DEL  nEnd++;
  687. //DEL  while(m_nBestTag[nEnd]==30)//3,13,23
  688. //DEL  nEnd++;
  689. //DEL    }
  690. //DEL    if(nEnd>nStart&&!IsAllNum((unsigned char *)m_sWords[nStart])&&(nEnd>nStart+2||(nEnd==nStart+2&&(m_nBestTag[nEnd-1]!=30||strlen(m_sWords[nStart])>2))||(nEnd==nStart+1&&strlen(m_sWords[nStart])>2&&!dictCore.IsExist(m_sWords[nStart],-1))))
  691. //DEL    {
  692. //DEL  m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[nStart];
  693. //DEL  m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[nEnd];
  694. //DEL  m_dWordsPossibility[m_nUnknownIndex++]=ComputePossibility(nStart,nEnd-nStart+1,transDict);
  695. //DEL  nStart=nEnd;
  696. //DEL    }
  697. //DEL 
  698. //DEL    if(i<nEnd)
  699. //DEL    i=nEnd;
  700. //DEL    else
  701. //DEL    i=i+1;
  702. //DEL   }
  703. //DEL   return true;
  704. //DEL }
  705. bool CSpan::PlaceRecognize(CDictionary &dictCore,CDictionary &placeDict)
  706. {
  707.   int nStart=1,nEnd=1,i=1,nTemp;
  708.   double dPanelty=1.0;//Panelty value
  709.   while(m_nBestTag[i]>-1)
  710.   {
  711.   if(m_nBestTag[i]==1)//1 Trigger the recognition procession
  712.   {
  713. nStart=i;
  714. nEnd=nStart+1;
  715. while(m_nBestTag[nEnd]==1)//
  716. {
  717. if(nEnd>nStart+1)
  718. dPanelty+=1.0;
  719. nEnd++;
  720. }
  721. while(m_nBestTag[nEnd]==2)//2,12,22
  722. nEnd++;
  723. nTemp=nEnd;
  724. while(m_nBestTag[nEnd]==3)
  725. {
  726. if(nEnd>nTemp)
  727. dPanelty+=1.0;
  728. nEnd++;
  729. }
  730.   }
  731.   else if(m_nBestTag[i]==2)//1,11,21 Trigger the recognition
  732.   {
  733. dPanelty+=1.0;
  734. nStart=i;
  735. nEnd=nStart+1;
  736. while(m_nBestTag[nEnd]==2)//2
  737. nEnd++;
  738. nTemp=nEnd;
  739. while(m_nBestTag[nEnd]==3)//2
  740. {
  741. if(nEnd>nTemp)
  742. dPanelty+=1.0;
  743. nEnd++;
  744. }
  745.   }
  746.   if(nEnd>nStart)
  747.   {
  748. m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[nStart];
  749. m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[nEnd];
  750. m_dWordsPossibility[m_nUnknownIndex++]=ComputePossibility(nStart,nEnd-nStart+1,placeDict)+log(dPanelty);
  751. nStart=nEnd;
  752.   }
  753.   if(i<nEnd)
  754.   i=nEnd;
  755.   else
  756.   i=i+1;
  757.   }
  758.   return true;
  759. }
  760. //DEL bool CSpan::TransPersonRecognize(CDictionary &dictCore, CDictionary &transDict)
  761. //DEL {
  762. //DEL   int nStart=1,nEnd=1,i=1;
  763. //DEL   while(m_nBestTag[i]>-1)
  764. //DEL   {
  765. //DEL    if(m_nBestTag[i]==1)//1,11,21 Trigger the recognition
  766. //DEL    {
  767. //DEL  nStart=i;
  768. //DEL  nEnd=nStart+1;
  769. //DEL  while(m_nBestTag[nEnd]==m_nBestTag[nStart])//1,11,21
  770. //DEL  nEnd++;
  771. //DEL  while(m_nBestTag[nEnd]==m_nBestTag[nStart]+1)//2,12,22
  772. //DEL  nEnd++;
  773. //DEL  while(m_nBestTag[nEnd]==m_nBestTag[nStart]+2)//3,13,23
  774. //DEL  nEnd++;
  775. //DEL    }
  776. //DEL    else if(m_nBestTag[i]==2)//1,11,21 Trigger the recognition
  777. //DEL    {
  778. //DEL  nStart=i;
  779. //DEL  nEnd=nStart+1;
  780. //DEL  while(m_nBestTag[nEnd]==m_nBestTag[nStart])//2,12,22
  781. //DEL  nEnd++;
  782. //DEL  while(m_nBestTag[nEnd]==m_nBestTag[nStart]+1)//2,12,22
  783. //DEL  nEnd++;
  784. //DEL    }
  785. //DEL    if(nEnd>nStart&&!IsAllNum((unsigned char *)m_sWords[nStart])&&(nEnd>nStart+2||(nEnd==nStart+2&&(m_nBestTag[nEnd-1]!=30||strlen(m_sWords[nStart])>2))||(nEnd==nStart+1&&strlen(m_sWords[nStart])>2&&!dictCore.IsExist(m_sWords[nStart],-1))))
  786. //DEL    {
  787. //DEL  m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[nStart];
  788. //DEL  m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[nEnd];
  789. //DEL  m_dWordsPossibility[m_nUnknownIndex++]=ComputePossibility(nStart,nEnd-nStart+1,transDict);
  790. //DEL  nStart=nEnd;
  791. //DEL    }
  792. //DEL 
  793. //DEL    if(i<nEnd)
  794. //DEL    i=nEnd;
  795. //DEL    else
  796. //DEL    i=i+1;
  797. //DEL   }
  798. //DEL   return true;
  799. //DEL }