SegGraph.cpp
上传用户:sanxfzhen
上传日期:2014-12-28
资源大小:2324k
文件大小:8k
源码类别:

多国语言处理

开发平台:

Visual C++

  1. //////////////////////////////////////////////////////////////////////
  2. //ICTCLAS简介:计算所汉语词法分析系统ICTCLAS(Institute of Computing Technology, Chinese Lexical Analysis System),
  3. //             功能有:中文分词;词性标注;未登录词识别。
  4. //             分词正确率高达97.58%(973专家评测结果),
  5. //             未登录词识别召回率均高于90%,其中中国人名的识别召回率接近98%;
  6. //             处理速度为31.5Kbytes/s。
  7. //著作权:  Copyright © 2002-2005 中科院计算所 职务著作权人:张华平 刘群
  8. //遵循协议:自然语言处理开放资源许可证1.0
  9. //Email: zhanghp@software.ict.ac.cn
  10. //Homepage:www.nlp.org.cn;mtgroup.ict.ac.cn
  11. /****************************************************************************
  12.  *
  13.  * Copyright (c) 2000, 2001 
  14.  *     Machine Group
  15.  *     Software Research Lab.
  16.  *     Institute of Computing Tech.
  17.  *     Chinese Academy of Sciences
  18.  *     All rights reserved.
  19.  *
  20.  * This file is the confidential and proprietary property of 
  21.  * Institute of Computing Tech. and the possession or use of this file requires 
  22.  * a written license from the author.
  23.  * Filename: SegGraph.cpp
  24.  * Abstract:
  25.  *            implement for the Word Segmentation Directed Graph.
  26.  *
  27.  * Author:   Kevin Zhang 
  28.  *          (zhanghp@software.ict.ac.cn)
  29.  * Date:     2002-1-8
  30.  *
  31.  * Notes:
  32.  *                
  33.  * 
  34.  ****************************************************************************/
  35. // SegGraph.cpp: implementation of the CSegGraph class.
  36. //
  37. //////////////////////////////////////////////////////////////////////
  38. #include "stdafx.h"
  39. #include "SegGraph.h"
  40. #include "..\Utility\Utility.h"
  41. #include <string.h>
  42. #include <math.h>
  43. //////////////////////////////////////////////////////////////////////
  44. // Construction/Destruction
  45. //////////////////////////////////////////////////////////////////////
  46. CSegGraph::CSegGraph()
  47. {
  48. m_segGraph.SetRowFirst();
  49. //segGraph: The segmentation word graph
  50. //Row first array
  51. }
  52. CSegGraph::~CSegGraph()
  53. {
  54. }
  55. bool CSegGraph::GenerateWordNet(char *sSentence,CDictionary &dictCore,bool bOriginalFreq)
  56. {
  57. //Gernerate the word net from the sLine, that's list all the possible word
  58. unsigned int i=0,j,nLen=strlen(sSentence);
  59. char sWord[WORD_MAXLENGTH]="",sTempWord[WORD_MAXLENGTH]="",sWordMatch[WORD_MAXLENGTH];
  60. int nWordIndex=0,nHandleTemp,k,nPOS;
  61. int nMatchFreq[20],nMatchHandle[20],nTotalFreq,nMatchCount;
  62. double dValue=0;
  63. m_nAtomCount=0;
  64. m_segGraph.SetEmpty();//Set segmentation graph empty
  65. AtomSegment(sSentence);
  66. //Atomic Segmentation
  67.     for(i=0;i<m_nAtomCount;i++)//Init the cost array
  68.     {
  69. if(m_nAtomPOS[i]==CT_CHINESE)//The atom is a Chinese Char
  70. {
  71. if(!bOriginalFreq)//Not original frequency
  72. m_segGraph.SetElement(i,i+1,log(MAX_FREQUENCE),0);//init the link with the maximum value
  73. else
  74. m_segGraph.SetElement(i,i+1,0,0,m_sAtom[i]);//init the link with the maximum value
  75. }
  76. else//Other atom
  77. {
  78. strcpy(sWord,m_sAtom[i]);//init the word 
  79. dValue=MAX_FREQUENCE;
  80. switch(m_nAtomPOS[i])
  81. {
  82. case CT_INDEX:
  83. case CT_NUM:
  84. nPOS=-27904;//'m'*256
  85. strcpy(sWord,"未##数");
  86. dValue=0;
  87. break;
  88. case CT_DELIMITER:
  89. nPOS=30464;//'w'*256;
  90. break;
  91. case CT_LETTER:
  92. nPOS=-'n'*256-'x';//
  93. dValue=0;
  94. strcpy(sWord,"未##串");
  95. break;
  96. case CT_SINGLE://12021-2129-3121
  97. if(GetCharCount("+-1234567890",m_sAtom[i])==(int)strlen(m_sAtom[i]))
  98. {
  99. nPOS=-27904;//'m'*256
  100. strcpy(sWord,"未##数");
  101. }
  102. else
  103. {
  104. nPOS=-'n'*256-'x';//
  105. strcpy(sWord,"未##串");
  106. }
  107. dValue=0;
  108. break;
  109. default:
  110. nPOS=m_nAtomPOS[i];//'?'*256;
  111. break;
  112. }
  113. if(!bOriginalFreq)//Not original frequency
  114. m_segGraph.SetElement(i,i+1,0,nPOS);//init the link with minimum
  115. else
  116. m_segGraph.SetElement(i,i+1,dValue,nPOS,sWord);//init the link with minimum
  117. }
  118.     }
  119. i=0;
  120. while(i<m_nAtomCount)//All the word
  121. {
  122.   strcpy(sWord,m_sAtom[i]);//Get the current atom
  123.   j=i+1;
  124.   if(strcmp(sWord,"月")==0&&strcmp(m_sAtom[i+1],"份")==0)//Don't split 月份
  125.   j+=1;
  126.   while(j<=m_nAtomCount&&dictCore.GetMaxMatch(sWord,sWordMatch,&nHandleTemp))
  127.   {//Add a condition to control the end of string
  128.    //retrieve the dictionary with the word
  129.           if(strcmp(sWordMatch,sWord)==0)//find the current word
  130.   {
  131.   nTotalFreq=0;
  132.   dictCore.GetHandle(sWord,&nMatchCount,nMatchHandle,nMatchFreq);
  133.   for(k=0;k<nMatchCount;k++)//Add the frequency
  134.   {
  135.  nTotalFreq+=nMatchFreq[k];
  136.   }
  137.   //Adding a rule to exclude some words to be formed.
  138.   if(strlen(sWord)==4&&i>=1&&(IsAllNum((unsigned char *)m_sAtom[i-1])||IsAllChineseNum(m_sAtom[i-1]))&&(strncmp(sWord,"年",2)==0||strncmp(sWord,"月",2)==0))
  139.   {//1年内、1999年末
  140.      if(CC_Find("末内中底前间初",sWord+2))
  141.      break;
  142.   }
  143.   if(nMatchCount==1)//The possible word has only one POS, store it
  144.   {
  145. if(!bOriginalFreq)//Not original frequency
  146. m_segGraph.SetElement(i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),nMatchHandle[0]);
  147. else
  148. m_segGraph.SetElement(i,j,nTotalFreq,nMatchHandle[0],sWord);
  149.   }
  150.   else 
  151.   {
  152. if(!bOriginalFreq)//Not original frequency
  153. m_segGraph.SetElement(i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),0);
  154. else
  155. m_segGraph.SetElement(i,j,nTotalFreq,0,sWord);
  156.   }
  157.   }
  158.   strcat(sWord,m_sAtom[j++]);
  159.   }
  160.   i+=1;//Start from i++;
  161. }
  162. return true;
  163. }
  164. bool CSegGraph::AtomSegment(char *sSentence)
  165. {
  166.     unsigned int i=0,j=0,nCurType,nNextType;
  167. //i is the pointer of sentence string
  168. //j is the pointer of pAtoms
  169. char sChar[3];
  170. sChar[2]=0;//Set the char ending
  171. m_sAtom[j][0]=0;//Set the first word as null
  172. m_nAtomLength[j]=0;
  173. if(strncmp(sSentence,SENTENCE_BEGIN,strlen(SENTENCE_BEGIN))==0)
  174. {
  175. strcpy(m_sAtom[j],SENTENCE_BEGIN);//Set the first word as sentence begining
  176. m_nAtomLength[j]=strlen(SENTENCE_BEGIN);
  177. m_nAtomPOS[j]=CT_SENTENCE_BEGIN;//init
  178. i+=m_nAtomLength[j];
  179. j+=1;
  180. m_sAtom[j][0]=0;//Set the first word as null
  181. m_nAtomLength[j]=0;
  182. }
  183. while(i<strlen(sSentence))
  184. {
  185. if(strncmp(sSentence+i,SENTENCE_END,strlen(SENTENCE_END))==0)
  186. {
  187. strcpy(m_sAtom[j],SENTENCE_END);//Set the first word as null
  188. m_nAtomLength[j]=strlen(SENTENCE_END);
  189. m_nAtomPOS[j]=CT_SENTENCE_END;//init
  190. i+=m_nAtomLength[j];
  191. j+=1;
  192. m_sAtom[j][0]=0;//Set the first word as null
  193. m_nAtomLength[j]=0;
  194. continue;
  195. }
  196. sChar[0]=*(sSentence+i);//Get the char with first byte
  197. sChar[1]=0;//
  198. i+=1;
  199. if(sChar[0]<0)//Two byte char
  200. {
  201. sChar[1]=*(sSentence+i);//Get the char with second byte
  202. i+=1;//i increased by 1
  203. }
  204. strcat(m_sAtom[j],sChar);
  205. nCurType=charType((unsigned char *)sChar);
  206. if(sChar[0]=='.'&&(charType((unsigned char *)sSentence+i)==CT_NUM||(*(sSentence+i)>='0'&&*(sSentence+i)<='9')))
  207. nCurType=CT_NUM;//Digit after . indicate . as a point in the numeric
  208. m_nAtomPOS[j]=nCurType;
  209. //Record its property, just convience for continuous processing
  210. if(nCurType==CT_CHINESE||nCurType==CT_INDEX||nCurType==CT_DELIMITER||nCurType==CT_OTHER)
  211. {//Chinese char, index number,delimiter and other is treated as atom
  212. m_nAtomLength[j]=strlen(m_sAtom[j]);//Save its length
  213. j+=1;//Skip to next atom
  214. m_sAtom[j][0]=0;//init
  215. }
  216. else 
  217. {//Number,single char, letter
  218. nNextType=255;
  219. if(i<strlen(sSentence))
  220. nNextType=charType((unsigned char *)(sSentence+i));
  221. if(nNextType!=nCurType||i==strlen(sSentence))
  222. //Reaching end or next char type is different from current char
  223. {
  224. m_nAtomLength[j]=strlen(m_sAtom[j]);//Save its length
  225. j+=1;
  226. m_sAtom[j][0]=0;//init
  227. }
  228. }
  229. }
  230. m_nAtomCount=j;//The count of segmentation atoms
  231. return true;
  232. }