SegGraph.cpp
上传用户:chen_dj
上传日期:2013-04-22
资源大小:111k
文件大小:8k
源码类别:

多国语言处理

开发平台:

C/C++

  1. /****************************************************************************
  2.  *
  3.  * Copyright (c) 2000, 2001 
  4.  *     Machine Group
  5.  *     Software Research Lab.
  6.  *     Institute of Computing Tech.
  7.  *     Chinese Academy of Sciences
  8.  *     All rights reserved.
  9.  *
  10.  * This file is the confidential and proprietary property of 
  11.  * Institute of Computing Tech. and the possession or use of this file requires 
  12.  * a written license from the author.
  13.  * Filename: SegGraph.cpp
  14.  * Abstract:
  15.  *            implement for the Word Segmentation Directed Graph.
  16.  *
  17.  * Author:   Kevin Zhang 
  18.  *          (zhanghp@software.ict.ac.cn)
  19.  * Date:     2002-1-8
  20.  *
  21.  * Notes:
  22.  *                
  23.  * 
  24.  ****************************************************************************/
  25. // SegGraph.cpp: implementation of the CSegGraph class.
  26. //
  27. //////////////////////////////////////////////////////////////////////
  28. #include "stdafx.h"
  29. #include "SegGraph.h"
  30. #include "..\Utility\Utility.h"
  31. #include <string.h>
  32. #include <math.h>
  33. //////////////////////////////////////////////////////////////////////
  34. // Construction/Destruction
  35. //////////////////////////////////////////////////////////////////////
  36. CSegGraph::CSegGraph()
  37. {
  38. }
  39. CSegGraph::~CSegGraph()
  40. {
  41. }
  42. bool CSegGraph::GenerateWordNet(char *sSentence,CDictionary &dictCore)
  43. {
  44. //Gernerate the word net from the sLine, that's list all the possible word
  45. unsigned int i=0,j,nLen=strlen(sSentence);
  46. char sWord[WORD_MAXLENGTH]="",sTempWord[WORD_MAXLENGTH]="",sWordMatch[WORD_MAXLENGTH];
  47. int nWordIndex=0,nHandleTemp,k,nPOS;
  48. int nMatchFreq[20],nMatchHandle[20],nTotalFreq,nMatchCount;
  49. m_nAtomCount=0;
  50. m_segGraph.SetEmpty();//Set segmentation graph empty
  51. AtomSegment(sSentence);
  52. //Atomic Segmentation
  53.     for(i=0;i<m_nAtomCount;i++)//Init the cost array
  54.     {
  55. if(m_nAtomPOS[i]==CT_CHINESE)//The atom is a Chinese Char
  56. m_segGraph.SetElement(i,i+1,log(MAX_FREQUENCE),0,m_sAtom[i]);//init the link with the maximum value
  57. else//Other atom
  58. {
  59. switch(m_nAtomPOS[i])
  60. {
  61. case CT_INDEX:
  62. case CT_NUM:
  63. nPOS='m'*256;
  64. break;
  65. case CT_DELIMITER:
  66. nPOS='w'*256;
  67. break;
  68. case CT_LETTER:
  69. nPOS='s'*256+'s';
  70. break;
  71. case CT_SINGLE://12021-2129-3121
  72. if(m_sAtom[i][0]<='9'&&m_sAtom[i][0]>='0')
  73. nPOS='m'*256;
  74. else
  75. nPOS='s'*256+'s';
  76. break;
  77. default:
  78. nPOS=m_nAtomPOS[i];//'?'*256;
  79. break;
  80. }
  81. m_segGraph.SetElement(i,i+1,0,nPOS,m_sAtom[i]);//init the link with minimum
  82. }
  83.     }
  84. i=0;
  85. while(i<m_nAtomCount)//All the word
  86. {
  87.   strcpy(sWord,m_sAtom[i]);//Get the current atom
  88.   j=i+1;
  89.   while(j<=m_nAtomCount&&dictCore.GetMaxMatch(sWord,sWordMatch,&nHandleTemp))
  90.   {//Add a condition to control the end of string
  91.    //retrieve the dictionary with the word
  92.           if(strcmp(sWordMatch,sWord)==0)//find the current word
  93.   {
  94.   nTotalFreq=0;
  95.   dictCore.GetHandle(sWord,&nMatchCount,nMatchHandle,nMatchFreq);
  96.   for(k=0;k<nMatchCount;k++)//Add the frequency
  97.   {
  98.  nTotalFreq+=nMatchFreq[k];
  99.   }
  100.   //Adding a rule to exclude some words to be formed.
  101.   if(strlen(sWord)==4&&i>=1&&(IsAllNum((unsigned char *)m_sAtom[i-1])||IsAllChineseNum(m_sAtom[i-1]))&&(strncmp(sWord,"年",2)==0||strncmp(sWord,"月",2)==0))
  102.   {//1年内、1999年末
  103.      if(CC_Find("末内中底前间初",sWord+2))
  104.   break;
  105.   }
  106.   if(nMatchCount==1)//The possible word has only one POS, store it
  107.   m_segGraph.SetElement(i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),nMatchHandle[0],sWord);
  108.   else 
  109.   m_segGraph.SetElement(i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),0,sWord);
  110.   }
  111.   strcat(sWord,m_sAtom[j++]);
  112.   }
  113.   i+=1;//Start from i++;
  114. }
  115. return true;
  116. }
  117. bool CSegGraph::AtomSegment(char *sSentence)
  118. {
  119.     unsigned int i=0,j=0,nCurType,nNextType;
  120. //i is the pointer of sentence string
  121. //j is the pointer of pAtoms
  122. char sChar[3];
  123. sChar[2]=0;//Set the char ending
  124. m_sAtom[j][0]=0;//Set the first word as null
  125. m_nAtomLength[j]=0;
  126. if(strncmp(sSentence,SENTENCE_BEGIN,strlen(SENTENCE_BEGIN))==0)
  127. {
  128. strcpy(m_sAtom[j],SENTENCE_BEGIN);//Set the first word as sentence begining
  129. m_nAtomLength[j]=strlen(SENTENCE_BEGIN);
  130. m_nAtomPOS[j]=CT_SENTENCE_BEGIN;//init
  131. i+=m_nAtomLength[j];
  132. j+=1;
  133. m_sAtom[j][0]=0;//Set the first word as null
  134. m_nAtomLength[j]=0;
  135. }
  136. while(i<strlen(sSentence))
  137. {
  138. if(strncmp(sSentence+i,SENTENCE_END,strlen(SENTENCE_END))==0)
  139. {
  140. strcpy(m_sAtom[j],SENTENCE_END);//Set the first word as null
  141. m_nAtomLength[j]=strlen(SENTENCE_END);
  142. m_nAtomPOS[j]=CT_SENTENCE_END;//init
  143. i+=m_nAtomLength[j];
  144. j+=1;
  145. m_sAtom[j][0]=0;//Set the first word as null
  146. m_nAtomLength[j]=0;
  147. continue;
  148. }
  149. sChar[0]=*(sSentence+i);//Get the char with first byte
  150. sChar[1]=0;//
  151. i+=1;
  152. if(sChar[0]<0)//Two byte char
  153. {
  154. sChar[1]=*(sSentence+i);//Get the char with second byte
  155. i+=1;//i increased by 1
  156. }
  157. strcat(m_sAtom[j],sChar);
  158. nCurType=charType((unsigned char *)sChar);
  159. if(sChar[0]=='.'&&(charType((unsigned char *)sSentence+i)==CT_NUM||(*(sSentence+i)>='0'&&*(sSentence+i)<='9')))
  160. nCurType=CT_NUM;//Digit after . indicate . as a point in the numeric
  161. m_nAtomPOS[j]=nCurType;
  162. //Record its property, just convience for continuous processing
  163. if(nCurType==CT_CHINESE||nCurType==CT_INDEX||nCurType==CT_DELIMITER||nCurType==CT_OTHER)
  164. {//Chinese char, index number,delimiter and other is treated as atom
  165. m_nAtomLength[j]=strlen(m_sAtom[j]);//Save its length
  166. j+=1;//Skip to next atom
  167. m_sAtom[j][0]=0;//init
  168. }
  169. else 
  170. {//Number,single char, letter
  171. nNextType=255;
  172. if(i<strlen(sSentence))
  173. nNextType=charType((unsigned char *)(sSentence+i));
  174. // if(nNextType==CT_CHINESE||nCurType==CT_INDEX||nCurType==CT_DELIMITER||nCurType==CT_OTHER||i==strlen(sSentence))
  175. if(nNextType!=nCurType||i==strlen(sSentence))
  176. //Reaching end or next char type is different from current char
  177. {
  178. m_nAtomLength[j]=strlen(m_sAtom[j]);//Save its length
  179. j+=1;
  180. m_sAtom[j][0]=0;//init
  181. }
  182. }
  183. }
  184. m_nAtomCount=j;//The count of segmentation atoms
  185. return true;
  186. }
  187. bool CSegGraph::GenerateWordArray(char *sSentence, CDictionary &dictCore)
  188. {
  189. //Gernerate the word array from the sLine, that's list all the possible word
  190. unsigned int i=0,j,nLen=strlen(sSentence);
  191. char sWord[WORD_MAXLENGTH]="",sTempWord[WORD_MAXLENGTH]="",sWordMatch[WORD_MAXLENGTH];
  192. int nWordIndex=0,nHandleTemp,k,nPOS;
  193. int nMatchFreq[20],nMatchHandle[20],nTotalFreq,nMatchCount;
  194. m_nAtomCount=0;
  195. m_segGraph.SetEmpty();//Set segmentation graph empty
  196. AtomSegment(sSentence);
  197. //Atomic Segmentation
  198.     for(i=0;i<m_nAtomCount;i++)//Init the cost array
  199.     {
  200. if(m_nAtomPOS[i]==CT_CHINESE)//The atom is a Chinese Char
  201. m_segGraph.SetElement(i,i+1,0,0,m_sAtom[i]);//init the link with the maximum value
  202. else//Other atom
  203. {
  204. switch(m_nAtomPOS[i])
  205. {
  206. case CT_INDEX:
  207. case CT_NUM:
  208. nPOS='m'*256;
  209. break;
  210. case CT_DELIMITER:
  211. nPOS='w'*256;
  212. break;
  213. case CT_LETTER:
  214. nPOS='s'*256+'s';
  215. break;
  216. case CT_SINGLE://12021-2129-3121
  217. if(m_sAtom[i][0]<='9'&&m_sAtom[i][0]>='0')
  218. nPOS='m'*256;
  219. else
  220. nPOS='s'*256+'s';
  221. break;
  222. default:
  223. nPOS=m_nAtomPOS[i];//'?'*256;
  224. break;
  225. }
  226. m_segGraph.SetElement(i,i+1,MAX_FREQUENCE,nPOS,m_sAtom[i]);//init the link with minimum
  227. }
  228.     }
  229. i=0;
  230. while(i<m_nAtomCount)//All the word
  231. {
  232.   strcpy(sWord,m_sAtom[i]);//Get the current atom
  233.   j=i+1;
  234.   while(j<=m_nAtomCount&&dictCore.GetMaxMatch(sWord,sWordMatch,&nHandleTemp))
  235.   {//Add a condition to control the end of string
  236.    //retrieve the dictionary with the word
  237.           if(strcmp(sWordMatch,sWord)==0)//find the current word
  238.   {
  239.   nTotalFreq=0;
  240.   dictCore.GetHandle(sWord,&nMatchCount,nMatchHandle,nMatchFreq);
  241.   for(k=0;k<nMatchCount;k++)//Add the frequency
  242.   {
  243.  nTotalFreq+=nMatchFreq[k];
  244.   }
  245.   //Adding a rule to exclude some words to be formed.
  246.   if(strlen(sWord)==4&&i>=1&&(IsAllNum((unsigned char *)m_sAtom[i-1])||IsAllChineseNum(m_sAtom[i-1]))&&(strncmp(sWord,"年",2)==0||strncmp(sWord,"月",2)==0))
  247.   {//1年内、1999年末
  248.      if(CC_Find("末内中底前间初",sWord+2))
  249.   break;
  250.   }
  251.   if(nMatchCount==1)//The possible word has only one POS, store it
  252.   m_segGraph.SetElement(i,j,nTotalFreq,nMatchHandle[0],sWord);
  253.   else 
  254.   m_segGraph.SetElement(i,j,nTotalFreq,0,sWord);
  255.   }
  256.   strcat(sWord,m_sAtom[j++]);
  257.   }
  258.   i+=1;//Start from i++;
  259. }
  260. return true;
  261. }