多国语言处理

开发平台：
Visual C++

SegGraph.cpp：源码内容
							//////////////////////////////////////////////////////////////////////
//ICTCLAS简介：计算所汉语词法分析系统ICTCLAS(Institute of Computing Technology, Chinese Lexical Analysis System)，
//             功能有：中文分词；词性标注；未登录词识别。
//             分词正确率高达97.58%(973专家评测结果)，
//             未登录词识别召回率均高于90%，其中中国人名的识别召回率接近98%;
//             处理速度为31.5Kbytes/s。
//著作权：  Copyright?2002-2005中科院计算所 职务著作权人：张华平 刘群
//遵循协议：自然语言处理开放资源许可证1.0
//Email: zhanghp@software.ict.ac.cn
//Homepage:www.nlp.org.cn;mtgroup.ict.ac.cn
/****************************************************************************
 *
 * Copyright (c) 2000, 2001 
 *     Machine Group
 *     Software Research Lab.
 *     Institute of Computing Tech.
 *     Chinese Academy of Sciences
 *     All rights reserved.
 *
 * This file is the confidential and proprietary property of 
 * Institute of Computing Tech. and the posession or use of this file requires 
 * a written license from the author.
 * Filename: SegGraph.cpp
 * Abstract:
 *            implement for the Word Segmentation Directed Graph.
 *
 * Author:   Kevin Zhang 
 *          (zhanghp@software.ict.ac.cn)
 * Date:     2002-1-8
 *
 * Notes:
 *                
 * 
 ****************************************************************************/
// SegGraph.cpp: implementation of the CSegGraph class.
//
//////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include "SegGraph.h"
#include "..\Utility\Utility.h"
#include <string.h>
#include <math.h>
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////
CSegGraph::CSegGraph()
{
	m_segGraph.SetRowFirst();	
	//segGraph: The segmentation word graph
	//Row first array
}
CSegGraph::~CSegGraph()
{
}
bool CSegGraph::GenerateWordNet(char *sSentence,CDictionary &dictCore,bool	bOriginalFreq)
{
//Gernerate the word net from the sLine, that's list all the possible word
	unsigned int i=0,j,nLen=strlen(sSentence);
	char sWord[WORD_MAXLENGTH]="",sTempWord[WORD_MAXLENGTH]="",sWordMatch[WORD_MAXLENGTH];
	int nWordIndex=0,nHandleTemp,k,nPOS;
	int nMatchFreq[20],nMatchHandle[20],nTotalFreq,nMatchCount;
	double dValue=0;
	m_nAtomCount=0;
	m_segGraph.SetEmpty();//Set segmentation graph empty
	AtomSegment(sSentence);
	//Atomic Segmentation
    for(i=0;i<m_nAtomCount;i++)//Init the cost array
    {
		if(m_nAtomPOS[i]==CT_CHINESE)//The atom is a Chinese Char
		{
			if(!bOriginalFreq)//Not original frequency
				m_segGraph.SetElement(i,i+1,log(MAX_FREQUENCE),0);//init the link with the maximum value
			else
				m_segGraph.SetElement(i,i+1,0,0,m_sAtom[i]);//init the link with the maximum value
		}
		else//Other atom
		{
			strcpy(sWord,m_sAtom[i]);//init the word 
			dValue=MAX_FREQUENCE;
			switch(m_nAtomPOS[i])
			{
			case CT_INDEX:
			case CT_NUM:
				nPOS=-27904;//'m'*256
				strcpy(sWord,"未##数");
				dValue=0;
				break;
			case CT_DELIMITER:
				nPOS=30464;//'w'*256;
				break;
			case CT_LETTER:
				nPOS=-'n'*256-'x';//
				dValue=0;
				strcpy(sWord,"未##串");
				break;
			case CT_SINGLE://12021-2129-3121
				if(GetCharCount("+-1234567890",m_sAtom[i])==(int)strlen(m_sAtom[i]))
				{
					nPOS=-27904;//'m'*256
					strcpy(sWord,"未##数");
				}
				else
				{
					nPOS=-'n'*256-'x';//
					strcpy(sWord,"未##串");
				}
				dValue=0;
				break;
			default:
				nPOS=m_nAtomPOS[i];//'?'*256;
				break;
			}
			if(!bOriginalFreq)//Not original frequency
				m_segGraph.SetElement(i,i+1,0,nPOS);//init the link with minimum
			else
				m_segGraph.SetElement(i,i+1,dValue,nPOS,sWord);//init the link with minimum
		}
    }
	i=0;
	while(i<m_nAtomCount)//All the word
	{
	  strcpy(sWord,m_sAtom[i]);//Get the current atom
	  j=i+1;
	  if(strcmp(sWord,"月")==0&&strcmp(m_sAtom[i+1],"份")==0)//Don't split 月份
		  j+=1;
	  while(j<=m_nAtomCount&&dictCore.GetMaxMatch(sWord,sWordMatch,&nHandleTemp))
	  {//Add a condition to control the end of string
	   //retrieve the dictionary with the word
          if(strcmp(sWordMatch,sWord)==0)//find the current word
		  {
			  nTotalFreq=0;
			  dictCore.GetHandle(sWord,&nMatchCount,nMatchHandle,nMatchFreq);
			  for(k=0;k<nMatchCount;k++)//Add the frequency
			  {
				 nTotalFreq+=nMatchFreq[k];
			  }
			  //Adding a rule to exclude some words to be formed.
			  if(strlen(sWord)==4&&i>=1&&(IsAllNum((unsigned char *)m_sAtom[i-1])||IsAllChineseNum(m_sAtom[i-1]))&&(strncmp(sWord,"年",2)==0||strncmp(sWord,"月",2)==0))
			  {//1年内、1999年末
			     if(CC_Find("末内中底前间初",sWord+2))
				     break;
			  }
			  if(nMatchCount==1)//The possible word has only one POS, store it
			  {
				if(!bOriginalFreq)//Not original frequency
					m_segGraph.SetElement(i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),nMatchHandle[0]);
				else
					m_segGraph.SetElement(i,j,nTotalFreq,nMatchHandle[0],sWord);
			  }
			  else 
			  {
					if(!bOriginalFreq)//Not original frequency
						m_segGraph.SetElement(i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),0);
					else
						m_segGraph.SetElement(i,j,nTotalFreq,0,sWord);
			  }
		  }
		  strcat(sWord,m_sAtom[j++]);
	  }
	  i+=1;//Start from i++;
	}
	return true;
}
bool CSegGraph::AtomSegment(char *sSentence)
{
    unsigned int i=0,j=0,nCurType,nNextType;	
	//i is the pointer of sentence string
	//j is the pointer of pAtoms
	char sChar[3];
	sChar[2]=0;//Set the char ending
	m_sAtom[j][0]=0;//Set the first word as null
	m_nAtomLength[j]=0;
	if(strncmp(sSentence,SENTENCE_BEGIN,strlen(SENTENCE_BEGIN))==0)
	{
		strcpy(m_sAtom[j],SENTENCE_BEGIN);//Set the first word as sentence begining
		m_nAtomLength[j]=strlen(SENTENCE_BEGIN);
		m_nAtomPOS[j]=CT_SENTENCE_BEGIN;//init
		i+=m_nAtomLength[j];
		j+=1;
		m_sAtom[j][0]=0;//Set the first word as null
		m_nAtomLength[j]=0;
	}
	while(i<strlen(sSentence))
	{
		if(strncmp(sSentence+i,SENTENCE_END,strlen(SENTENCE_END))==0)
		{
			strcpy(m_sAtom[j],SENTENCE_END);//Set the first word as null
			m_nAtomLength[j]=strlen(SENTENCE_END);
			m_nAtomPOS[j]=CT_SENTENCE_END;//init
			i+=m_nAtomLength[j];
			j+=1;
			m_sAtom[j][0]=0;//Set the first word as null
			m_nAtomLength[j]=0;
			continue;
		}
		sChar[0]=*(sSentence+i);//Get the char with first byte
		sChar[1]=0;//
		i+=1;
		if(sChar[0]<0)//Two byte char
		{
			sChar[1]=*(sSentence+i);//Get the char with second byte
			i+=1;//i increased by 1
		}
		strcat(m_sAtom[j],sChar);
		nCurType=charType((unsigned char *)sChar);
		if(sChar[0]=='.'&&(charType((unsigned char *)sSentence+i)==CT_NUM||(*(sSentence+i)>='0'&&*(sSentence+i)<='9')))
			nCurType=CT_NUM;//Digit after . indicate . as a point in the numeric
		m_nAtomPOS[j]=nCurType;
		//Record its property, just convience for continuous processing
		
		if(nCurType==CT_CHINESE||nCurType==CT_INDEX||nCurType==CT_DELIMITER||nCurType==CT_OTHER)
		{//Chinese char, index number,delimiter and other is treated as atom
			m_nAtomLength[j]=strlen(m_sAtom[j]);//Save its length
			j+=1;//Skip to next atom
			m_sAtom[j][0]=0;//init
		}
		else 
		{//Number,single char, letter
			nNextType=255;
			if(i<strlen(sSentence))
				nNextType=charType((unsigned char *)(sSentence+i));
			if(nNextType!=nCurType||i==strlen(sSentence))
			//Reaching end or next char type is different from current char
			{
				m_nAtomLength[j]=strlen(m_sAtom[j]);//Save its length	
				j+=1;
				m_sAtom[j][0]=0;//init
			}
		}
	}
	m_nAtomCount=j;//The count of segmentation atoms
	return true;
}