// 多国语言处理
//
// 开发平台：
// C++
//
// SegGraph.cpp：源码内容
							//////////////////////////////////////////////////////////////////////
//ICTCLAS简介：计算所汉语词法分析系统ICTCLAS(Institute of Computing Technology, Chinese Lexical Analysis System)，
//             功能有：中文分词；词性标注；未登录词识别。
//             分词正确率高达97.58%(973专家评测结果)，
//             未登录词识别召回率均高于90%，其中中国人名的识别召回率接近98%;
//             处理速度为31.5Kbytes/s。
//著作权：  Copyright (c) 2002-2005 中科院计算所 职务著作权人：张华平 刘群
//遵循协议：自然语言处理开放资源许可证1.0
//Email: zhanghp@software.ict.ac.cn
//Homepage:www.nlp.org.cn;mtgroup.ict.ac.cn
/****************************************************************************
 *
 * Copyright (c) 2000, 2001 
 *     Machine Group
 *     Software Research Lab.
 *     Institute of Computing Tech.
 *     Chinese Academy of Sciences
 *     All rights reserved.
 *
 * This file is the confidential and proprietary property of 
 * Institute of Computing Tech. and the possession or use of this file requires 
 * a written license from the author.
 * Filename: SegGraph.cpp
 * Abstract:
 *            implement for the Word Segmentation Directed Graph.
 *
 * Author:   Kevin Zhang 
 *          (zhanghp@software.ict.ac.cn)
 * Date:     2002-1-8
 *
 * Notes:
 *                
 * 
 ****************************************************************************/
// SegGraph.cpp: implementation of the CSegGraph class.
//
//////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include "SegGraph.h"
#include "..\Utility\Utility.h"
#include <string.h>
#include <math.h>
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////
// Default constructor.
// m_segGraph holds the segmentation word graph; it is switched to its
// row-first array layout here, before any element is stored.
CSegGraph::CSegGraph()
{
	this->m_segGraph.SetRowFirst();
}
// Destructor: the body is intentionally empty; no explicit cleanup is performed here.
CSegGraph::~CSegGraph()
{
}
bool CSegGraph::GenerateWordNet(char *sSentence,CDictionary &dictCore,bool	bOriginalFreq)
{
//Gernerate the word net from the sLine, that's list all the possible word
	unsigned int i=0,j,nLen=strlen(sSentence);
	
	/* 
	* ----- commented by huangjin@ict.ac.cn 2006-6-8 ------ 
	*
	* 	char sWord[WORD_MAXLENGTH]="",sTempWord[WORD_MAXLENGTH]="",sWordMatch[WORD_MAXLENGTH];
	*
	*/			
	/*----Added By huangjin@ict.ac.cn 2006-6-8----*/
	char sWord[WORD_MAXLENGTH]="",sTempWord[WORD_MAXLENGTH]="",sWordMatch[WORD_MAXLENGTH]="";
	/*---------------------------------------------*/
	
	int nWordIndex=0,nHandleTemp,k,nPOS;
	int nMatchFreq[20],nMatchHandle[20],nTotalFreq,nMatchCount;
	double dValue=0;
	m_nAtomCount=0;
	m_segGraph.SetEmpty();//Set segmentation graph empty
	AtomSegment(sSentence);
	//Atomic Segmentation
    for(i=0;i<m_nAtomCount;i++)//Init the cost array
    {
		if(m_nAtomPOS[i]==CT_CHINESE)//The atom is a Chinese Char
		{
			if(!bOriginalFreq)//Not original frequency
			/* 
			* ----- commented by huangjin@ict.ac.cn 2006-5-30 ------ 
			* 
			* 	m_segGraph.SetElement(i,i+1,log(MAX_FREQUENCE),0);//init the link with the maximum value
			*
			*/
			/*----Added By huangjin@ict.ac.cn 2006-5-30----*/
				m_segGraph.SetElement(i,i+1,log(double(MAX_FREQUENCE)),0);//init the link with the maximum value
			/*--------------------------------------------*/
				
			else
				m_segGraph.SetElement(i,i+1,0,0,m_sAtom[i]);//init the link with the maximum value
		}
		else//Other atom
		{
			strcpy(sWord,m_sAtom[i]);//init the word 
			dValue=MAX_FREQUENCE;
			switch(m_nAtomPOS[i])
			{
			case CT_INDEX:
			case CT_NUM:
			/*----Added By huangjin@ict.ac.cn 2006-7-11----*/
			case CT_SINGLE_NUM:
			/*---------------------------------------------*/
				nPOS=-27904;//'m'*256
				strcpy(sWord,"未##数");
				dValue=0;
				break;
			case CT_DELIMITER:			
				nPOS=30464;//'w'*256;
				break;
			case CT_LETTER:
				nPOS=-'n'*256-'x';//
				dValue=0;
				strcpy(sWord,"未##串");
				break;
			/*----Added By huangjin@ict.ac.cn 2006-7-11----*/
			case CT_SINGLE_DELIMITER://12021-2129-3121
				if(GetCharCount("+-1234567890",m_sAtom[i])==(int)strlen(m_sAtom[i]))
				{
					nPOS=-27904;//'m'*256
					dValue=0;
					strcpy(sWord,"未##数");
				}
				else
				{
					nPOS=30464;	//'w'*256					
				}
				break;
			/*---------------------------------------------*/
			case CT_SINGLE://12021-2129-3121
				if(GetCharCount("+-1234567890",m_sAtom[i])==(int)strlen(m_sAtom[i]))
				{
					nPOS=-27904;//'m'*256
					strcpy(sWord,"未##数");
				}
				else
				{
					nPOS=-'n'*256-'x';//
					strcpy(sWord,"未##串");
				}
				dValue=0;
				break;
			default:
				nPOS=m_nAtomPOS[i];//'?'*256;
				break;
			}
			if(!bOriginalFreq)//Not original frequency
				m_segGraph.SetElement(i,i+1,0,nPOS);//init the link with minimum
			else
				m_segGraph.SetElement(i,i+1,dValue,nPOS,sWord);//init the link with minimum
		}
    }
	i=0;
	
	while(i<m_nAtomCount)//All the word
	{
	  strcpy(sWord,m_sAtom[i]);//Get the current atom
	  j=i+1;
	  /* 
	  * ----- commented by huangjin@ict.ac.cn 2006-5-31 ------ 
	  *
	  * 	if(strcmp(sWord,"月")==0&&strcmp(m_sAtom[i+1],"份")==0)//Don't split 月份
	  *
	  */			
	  /*----Added By huangjin@ict.ac.cn 2006-5-31----*/
	  //Add "i<m_nAtomCount-1" so that the i will not slop over when compare m_sAtom[i+1]
	  if(strcmp(sWord,"月")==0&&i<m_nAtomCount-1&&strcmp(m_sAtom[i+1],"份")==0)//Don't split 月份
	  /*---------------------------------------------*/
		  j+=1;
	  /*----Added By huangjin@ict.ac.cn 2006-7-17----*/
	  if((strcmp(sWord,"点")==0||strcmp(sWord,"刻")==0)&&
		  i<m_nAtomCount-1&&strcmp(m_sAtom[i+1],"钟")==0)//Don't split 点钟｜刻钟
		  j+=1;
	  /*---------------------------------------------*/
	  //while(j<=m_nAtomCount&&dictCore.GetMaxMatch(sWord,sWordMatch,&nHandleTemp))
	//=============================modify by Jiang Wenbin===================================
	  while(j<=m_nAtomCount)
	  {
		  
		  bool find=dictCore.GetMaxMatch(sWord,sWordMatch,&nHandleTemp);
		  if(find)
		  {
			  
				if(strcmp(sWordMatch,sWord)==0)//find the current word
				{
			  
					nTotalFreq=0;
					dictCore.GetHandle(sWord,&nMatchCount,nMatchHandle,nMatchFreq);
					for(k=0;k<nMatchCount;k++)//Add the frequency
					{
						nTotalFreq+=nMatchFreq[k];
					}
					//Adding a rule to exclude some words to be formed.
					if(strlen(sWord)==4&&i>=1&&(IsAllNum((unsigned char *)m_sAtom[i-1])||IsAllChineseNum(m_sAtom[i-1]))&&(strncmp(sWord,"年",2)==0||strncmp(sWord,"月",2)==0))
					{//1年内、1999年末
						if(CC_Find("末内中底前间初",sWord+2))
							break;
					 }
					if(nMatchCount==1)//The possible word has only one POS, store it
					{
						if(!bOriginalFreq)//Not original frequency
							/* 
							* ----- commented by huangjin@ict.ac.cn 2006-5-30 ------ 
							* 
							* 	m_segGraph.SetElement(i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),nMatchHandle[0]);
							*
							*/
							/*----Added By huangjin@ict.ac.cn 2006-5-30----*/
							m_segGraph.SetElement(i,j,-log(double(nTotalFreq+1))+log(double(MAX_FREQUENCE)),nMatchHandle[0]);
							/*---------------------------------------------*/					
						else
							m_segGraph.SetElement(i,j,nTotalFreq,nMatchHandle[0],sWord);
					}
					else 
					{
						if(!bOriginalFreq)//Not original frequency
							/* 
							* ----- commented by huangjin@ict.ac.cn 2006-5-30 ------ 
							* 
							* 	m_segGraph.SetElement(i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),0);
							*
							*/
							/*----Added By huangjin@ict.ac.cn 2006-5-30----*/
							m_segGraph.SetElement(i,j,-log(double(nTotalFreq+1))+log(double(MAX_FREQUENCE)),0);
							/*---------------------------------------------*/								
							else
								m_segGraph.SetElement(i,j,nTotalFreq,0,sWord);
					}
				}
		  }
		  else if(j-i>8)
		  {
				break;
		  }
		  //Add a condition to control the end of string
	   //retrieve the dictionary with the word
		  //cout<<"Word: "<<sWord<<endl;
  
		  strcat(sWord,m_sAtom[j++]);
	  }
	  i+=1;//Start from i++;
	}
	return true;
}
/* 
* ----- commented by huangjin@ict.ac.cn 2006-7-11 ------ 
*
* 	
//DELbool CSegGraph::AtomSegment(char *sSentence)
//DEL{
//DEL	unsigned int i=0,j=0,nCurType,nNextType;	
//DEL	//i is the pointer of sentence string
//DEL	//j is the pointer of pAtoms
//DEL	char sChar[3];
//DEL	sChar[2]=0;//Set the char ending
//DEL	m_sAtom[j][0]=0;//Set the first word as null
//DEL	m_nAtomLength[j]=0;
//DEL	if(strncmp(sSentence,SENTENCE_BEGIN,strlen(SENTENCE_BEGIN))==0)
//DEL	{
//DEL		strcpy(m_sAtom[j],SENTENCE_BEGIN);//Set the first word as sentence begining
//DEL		m_nAtomLength[j]=strlen(SENTENCE_BEGIN);
//DEL		m_nAtomPOS[j]=CT_SENTENCE_BEGIN;//init
//DEL		i+=m_nAtomLength[j];
//DEL		j+=1;
//DEL		m_sAtom[j][0]=0;//Set the first word as null
//DEL		m_nAtomLength[j]=0;
//DEL	}
//DEL	while(i<strlen(sSentence))
//DEL	{
//DEL		if(strncmp(sSentence+i,SENTENCE_END,strlen(SENTENCE_END))==0)
//DEL		{
//DEL			strcpy(m_sAtom[j],SENTENCE_END);//Set the first word as null
//DEL			m_nAtomLength[j]=strlen(SENTENCE_END);
//DEL			m_nAtomPOS[j]=CT_SENTENCE_END;//init
//DEL			i+=m_nAtomLength[j];
//DEL			j+=1;
//DEL			m_sAtom[j][0]=0;//Set the first word as null
//DEL			m_nAtomLength[j]=0;
//DEL			continue;
//DEL		}
//DEL		/*----Added By huangjin@ict.ac.cn 2006-6-8----*/
//DEL		//连续的三个点到六个点认为是省略号，不要切开
//DEL		char c=*(sSentence+i);
//DEL		if(c=='.')
//DEL		{
//DEL			char ellipsis[]= "......";
//DEL			bool bEllipsis = false;
//DEL			for( int tmpindex = 5; tmpindex>=2; tmpindex-- )
//DEL			{
//DEL				if( strncmp(sSentence+i,ellipsis,strlen(ellipsis))==0)
//DEL				{
//DEL					strcpy(m_sAtom[j],ellipsis);
//DEL					m_nAtomLength[j]=strlen(ellipsis);
//DEL					m_nAtomPOS[j]=CT_DELIMITER;//init
//DEL					i+=m_nAtomLength[j];
//DEL					j+=1;
//DEL					m_sAtom[j][0]=0;//Set the first word as null
//DEL					m_nAtomLength[j]=0;
//DEL					bEllipsis=true;
//DEL					break;
//DEL				}
//DEL				else
//DEL				{
//DEL					ellipsis[tmpindex]='';
//DEL				}
//DEL			}
//DEL			if( bEllipsis )
//DEL			{
//DEL				continue;
//DEL			}
//DEL		}
//DEL		else if(c=='-'&&i+1<strlen(sSentence))
//DEL		{
//DEL			c=*(sSentence+i+1);
//DEL			if(c=='-')
//DEL			{
//DEL				strcpy(m_sAtom[j],"--");
//DEL				m_nAtomLength[j]=strlen("--");
//DEL				m_nAtomPOS[j]=CT_DELIMITER;//init
//DEL				i+=m_nAtomLength[j];
//DEL				j+=1;
//DEL				m_sAtom[j][0]=0;//Set the first word as null
//DEL				m_nAtomLength[j]=0;
//DEL				continue;
//DEL			}
//DEL		}
//DEL		/*---------------------------------------------*/
//DEL
//DEL		sChar[0]=*(sSentence+i);//Get the char with first byte
//DEL		sChar[1]=0;//
//DEL		i+=1;
//DEL		if(sChar[0]<0)//Two byte char
//DEL		{
//DEL			sChar[1]=*(sSentence+i);//Get the char with second byte
//DEL			i+=1;//i increased by 1
//DEL		}
//DEL		strcat(m_sAtom[j],sChar);
//DEL		nCurType=charType((unsigned char *)sChar);
//DEL		if(sChar[0]=='.'&&(charType((unsigned char *)sSentence+i)==CT_NUM||(*(sSentence+i)>='0'&&*(sSentence+i)<='9')))
//DEL			nCurType=CT_NUM;//Digit after . indicate . as a point in the numeric	
//DEL		
//DEL		m_nAtomPOS[j]=nCurType;
//DEL		//Record its property, just convience for continuous processing
//DEL		
//DEL		if(nCurType==CT_CHINESE||nCurType==CT_INDEX||nCurType==CT_DELIMITER||nCurType==CT_OTHER)
//DEL		{//Chinese char, index number,delimiter and other is treated as atom
//DEL			m_nAtomLength[j]=strlen(m_sAtom[j]);//Save its length
//DEL			j+=1;//Skip to next atom
//DEL			m_sAtom[j][0]=0;//init
//DEL			/*----Added By huangjin@ict.ac.cn 2006-6-8----*/
//DEL			m_nAtomLength[j]=0;
//DEL			/*---------------------------------------------*/
//DEL		}	
//DEL		else
//DEL		{//Number,single char, letter
//DEL			nNextType=255;
//DEL			if(i<strlen(sSentence))
//DEL				nNextType=charType((unsigned char *)(sSentence+i));
//DEL			if(nNextType!=nCurType||i==strlen(sSentence))
//DEL				//Reaching end or next char type is different from current char
//DEL			{
//DEL				m_nAtomLength[j]=strlen(m_sAtom[j]);//Save its length	
//DEL				j+=1;
//DEL				m_sAtom[j][0]=0;//init
//DEL				/*----Added By huangjin@ict.ac.cn 2006-6-8----*/
//DEL		 		m_nAtomLength[j]=0;
//DEL		 		/*---------------------------------------------*/
//DEL		 	}
//DEL		 }
//DEL	}
//DEL	m_nAtomCount=j;//The count of segmentation atoms
//DEL	return true;
//DEL}
//DEL*
//DEL*/
/*----Added By huangjin@ict.ac.cn 2006-9-12----*/
bool CSegGraph::AtomSegment(char *sSentence)
{
	unsigned int i=0, j=0, nCurType, nNextType;
	//i is the pointer of sSentence string
	//j is the pointer of m_sAtom, m_nAtomLength an m_nAtomPOS
	char sChar[3]="";
	const unsigned int nLen = strlen(sSentence);	//store the length of sSentence
	//Set the first word as null
	UpdateAtoms(j,"",-1,false);
	if(!strncmp(sSentence,SENTENCE_BEGIN,strlen(SENTENCE_BEGIN)))	//sentence begin
	{
		i+=UpdateAtoms(j,SENTENCE_BEGIN,CT_SENTENCE_BEGIN,false);//Set the word as sentence beginning
		j++;
		UpdateAtoms(j,"",-1,false);		
	}
	while(i<nLen)
	{
		if(strncmp(sSentence+i,SENTENCE_END,strlen(SENTENCE_END))==0)
		{//Set the word as sentence ending
			i+=UpdateAtoms(j,SENTENCE_END,CT_SENTENCE_END,false);
			j++;
			UpdateAtoms(j,"",-1,false);					
			continue;
		}
		i+=GetChar(sSentence+i, sChar);	//Get current word
		nCurType=charType((unsigned char *)sChar);		
		UpdateAtoms(j,sChar,nCurType);
	
		if ( nCurType==CT_CHINESE )
		{//一万八千六百五十二			
			unsigned int k=i;
			char sNumCandidate[WORD_MAXLENGTH]="";
			if(k<nLen)
			{
				bool bNum=false;
				do
				{
					bNum=IsChineseNumCadidate(sChar);
					if(bNum)
					{
						strcat(sNumCandidate,sChar);
						k+=GetChar(sSentence+k,sChar);
					}
					else if(!strncmp(sSentence+k-2,"分之",4))
					{
						strcat(sNumCandidate,"分之");
						bNum=true;						
						k+=GetChar(sSentence+k+2,sChar)+2;
					}					
				}while(k<nLen&&bNum);
			}
			unsigned int h=ChineseNumRecognize(sNumCandidate, sSentence+k-strlen(sChar));
			if(h!=(unsigned int)-1)
			{
				sNumCandidate[h]=0;//截断
				UpdateAtoms(j,sNumCandidate,CT_NUM,false);				
				i+=strlen(sNumCandidate)-2;
			}			
		}
		else
		{
			nNextType=255;
			if(i<nLen)
				nNextType=charType((unsigned char*)(sSentence+i));
			//Numeric
			if(nCurType==CT_NUM||nCurType==CT_SINGLE_NUM||
				strchr("+-.",sChar[0])&&nNextType==CT_SINGLE_NUM||
				CC_Find("±－＋",sChar)&&nNextType==CT_NUM)			
			{//Numeric and Numeric Prefix
				unsigned int k=i;	//tmp index save i		
				bool bSBC = false;
				if(strchr("+-.",sChar[0])||nCurType==CT_SINGLE_NUM)
					bSBC = true;
				
				bool bPuncEnd=true;
				if(nCurType==CT_NUM||nCurType==CT_SINGLE_NUM)
					bPuncEnd=false;
				while(k<nLen)
				{
					k+=GetChar(sSentence+k,sChar);	
					nNextType=charType((unsigned char*)sChar);
					if(!bPuncEnd&&(strchr(".+-:",sChar[0])&&sChar[1]==0)||CC_Find("．·：∶／",sChar))
						bPuncEnd=true;
					else if(bSBC&&nNextType==CT_SINGLE_NUM||!bSBC&&nNextType==CT_NUM)
						bPuncEnd=false;					
					else
						break;
					strcat(m_sAtom[j],sChar);//Get the atom					
					i=k;									
				}
				if(bPuncEnd)//末尾是符号或者标点，要分开
				{				
					if(CC_Find("．·：∶／",m_sAtom[j]+strlen(m_sAtom[j])-2))					
					{
						m_sAtom[j][strlen(m_sAtom[j])-2]=0;
						i-=2;
					}
					else //.+-:
					{
						m_sAtom[j][strlen(m_sAtom[j])-1]=0;
						i-=1;
					}					
				}
				UpdateAtoms(j,"",CT_NUM);				
			}//end of Numeric
			else if(i<nLen-1&&strchr(".-",sChar[0])&&!sChar[1]&&*(sSentence+i)==sChar[0])
			{//for ...... and --
				if(sChar[0]=='.')
				{
					char ellipsis[]= ".....";
					bool bEllipsis = false;
					for( int tmpindex = 4; tmpindex>=1; tmpindex-- )
					{
						if( strncmp(sSentence+i,ellipsis,strlen(ellipsis))==0)
						{
							bEllipsis=true;
							break;							
						}
						else
						{
							ellipsis[tmpindex]='';
						}
					}
					if( bEllipsis )					
					{
						i+=UpdateAtoms(j,ellipsis,CT_DELIMITER);												
					}
				}
				else if(sChar[0]=='-')//the next character must be '-'
				{
					i+=UpdateAtoms(j,"-",CT_DELIMITER);					
				}
			}
			else if(nCurType==CT_LETTER||
				nCurType==CT_SINGLE&&sChar[0]!='+'&&sChar[0]!='-'||
				(strchr("+-",sChar[0])||CC_Find("±－＋",sChar))&&nNextType!=CT_CHINESE)						
			{//letters
				unsigned int k=i;
				bool bSBC = true;
				if(nCurType==CT_SINGLE||strchr("+-",sChar[0]))
					bSBC=false;
				while(k<nLen)
				{					
					i=k;
					k+=GetChar(sSentence+k,sChar);
					nNextType=charType((unsigned char *)sChar);//Get the type
					if(	bSBC&&(nNextType==CT_LETTER||nNextType==CT_NUM) ||
						!bSBC&&(nNextType==CT_SINGLE||nNextType==CT_SINGLE_NUM))
					{
						strcat(m_sAtom[j],sChar);
					}
					else
					{
						break;
					}				
				}			
				m_nAtomPOS[j]=CT_LETTER;
				m_nAtomLength[j]=strlen(m_sAtom[j]);
			}
			/*---------------------------------------------*/
			else if(!strcmp(sChar,"－") && !strncmp(sSentence+i,"－",2) )
			{
				i+=UpdateAtoms(j,"－",CT_DELIMITER);				
			}		
		}		
		j++;
		UpdateAtoms(j,"",-1,false);
	}
	m_nAtomCount=j;
	return true;
}
/*---------------------------------------------*/
/*----Added By huangjin@ict.ac.cn 2006-9-12----*/
int CSegGraph::UpdateAtoms( int j, const char* str, int nPOS, bool bApp )
{	
	if(!bApp)
	{
		m_sAtom[j][0]=0;
	}
	strcat(m_sAtom[j],str);
	m_nAtomLength[j]=strlen(m_sAtom[j]);
	m_nAtomPOS[j]=nPOS;
	return strlen(str);
}
/*---------------------------------------------*/