SegGraph.cpp
上传用户:sunyong76
上传日期:2021-10-03
资源大小:2236k
文件大小:17k
- //////////////////////////////////////////////////////////////////////
- //ICTCLAS简介:计算所汉语词法分析系统ICTCLAS(Institute of Computing Technology, Chinese Lexical Analysis System),
- // 功能有:中文分词;词性标注;未登录词识别。
- // 分词正确率高达97.58%(973专家评测结果),
- // 未登录词识别召回率均高于90%,其中中国人名的识别召回率接近98%;
- // 处理速度为31.5Kbytes/s。
- //著作权: Copyright © 2002-2005 中科院计算所 职务著作权人:张华平 刘群
- //遵循协议:自然语言处理开放资源许可证1.0
- //Email: zhanghp@software.ict.ac.cn
- //Homepage:www.nlp.org.cn;mtgroup.ict.ac.cn
- /****************************************************************************
- *
- * Copyright (c) 2000, 2001
- * Machine Group
- * Software Research Lab.
- * Institute of Computing Tech.
- * Chinese Academy of Sciences
- * All rights reserved.
- *
- * This file is the confidential and proprietary property of
- * Institute of Computing Tech. and the possession or use of this file requires
- * a written license from the author.
- * Filename: SegGraph.cpp
- * Abstract:
- * implement for the Word Segmentation Directed Graph.
- *
- * Author: Kevin Zhang
- * (zhanghp@software.ict.ac.cn)
- * Date: 2002-1-8
- *
- * Notes:
- *
- *
- ****************************************************************************/
- // SegGraph.cpp: implementation of the CSegGraph class.
- //
- //////////////////////////////////////////////////////////////////////
- #include "stdafx.h"
- #include "SegGraph.h"
- #include "..\Utility\Utility.h"
- #include <string.h>
- #include <math.h>
- //////////////////////////////////////////////////////////////////////
- // Construction/Destruction
- //////////////////////////////////////////////////////////////////////
- // Constructor: configures the segmentation word graph container.
- CSegGraph::CSegGraph()
- {
-     // m_segGraph is the segmentation word graph; it is used as a
-     // row-first array, so select that layout up front.
-     this->m_segGraph.SetRowFirst();
- }
- // Destructor: performs no explicit work.
- CSegGraph::~CSegGraph()
- {
-     // Intentionally empty.
- }
- /**
-  * Generate the word net (segmentation graph) for one sentence, i.e. list
-  * every candidate word as an edge (start atom, end atom) in m_segGraph.
-  *
-  * Steps:
-  *   1. Reset m_nAtomCount / m_segGraph and run AtomSegment(sSentence).
-  *   2. Add one edge (i,i+1) for every atom.
-  *   3. For every start atom i, concatenate the following atoms one by one
-  *      and add an edge (i,j) whenever the concatenation is an exact
-  *      dictionary entry according to dictCore.
-  *
-  * @param sSentence     Null-terminated sentence handed to AtomSegment().
-  * @param dictCore      Dictionary queried through GetMaxMatch()/GetHandle().
-  * @param bOriginalFreq false: edges carry a cost derived from
-  *                      log(MAX_FREQUENCE) and no word text;
-  *                      true: edges carry the raw frequency value together
-  *                      with the word text.
-  * @return true once the graph has been built; false if sSentence is NULL
-  *         (the graph is left empty in that case).
-  */
- bool CSegGraph::GenerateWordNet(char *sSentence,CDictionary &dictCore,bool bOriginalFreq)
- {
-     char sWord[WORD_MAXLENGTH]="",sWordMatch[WORD_MAXLENGTH]="";
-     unsigned int i=0,j;
-     int nHandleTemp,k,nPOS;
-     // NOTE(review): GetHandle() fills these arrays; the capacity of 20 is
-     // assumed to be enough for any dictionary entry - confirm against CDictionary.
-     int nMatchFreq[20],nMatchHandle[20],nTotalFreq,nMatchCount;
-     double dValue=0;
- 
-     m_nAtomCount=0;
-     m_segGraph.SetEmpty();//Set segmentation graph empty
-     if(sSentence==NULL)//Nothing to segment; avoid dereferencing a null pointer
-         return false;
-     AtomSegment(sSentence);//Atomic segmentation
- 
-     //Pass 1: one edge (i,i+1) per atom
-     for(i=0;i<m_nAtomCount;i++)
-     {
-         if(m_nAtomPOS[i]==CT_CHINESE)//The atom is a Chinese char
-         {
-             if(!bOriginalFreq)//Cost mode: start from the maximum cost log(MAX_FREQUENCE)
-                 m_segGraph.SetElement(i,i+1,log(double(MAX_FREQUENCE)),0);
-             else//Original frequency mode: value 0, keep the atom text
-                 m_segGraph.SetElement(i,i+1,0,0,m_sAtom[i]);
-         }
-         else//Other atom
-         {
-             strcpy(sWord,m_sAtom[i]);//Default word text is the atom itself
-             dValue=MAX_FREQUENCE;
-             switch(m_nAtomPOS[i])
-             {
-             case CT_INDEX:
-             case CT_NUM:
-             case CT_SINGLE_NUM:
-                 nPOS=-27904;//-('m'*256)
-                 strcpy(sWord,"未##数");
-                 dValue=0;
-                 break;
-             case CT_DELIMITER:
-                 nPOS=30464;//'w'*256
-                 break;
-             case CT_LETTER:
-                 nPOS=-'n'*256-'x';
-                 dValue=0;
-                 strcpy(sWord,"未##串");
-                 break;
-             case CT_SINGLE_DELIMITER://e.g. 12021-2129-3121
-                 //Atom made only of digits and +/- signs is treated as a number
-                 if(GetCharCount("+-1234567890",m_sAtom[i])==(int)strlen(m_sAtom[i]))
-                 {
-                     nPOS=-27904;//-('m'*256)
-                     dValue=0;
-                     strcpy(sWord,"未##数");
-                 }
-                 else
-                 {
-                     nPOS=30464;//'w'*256
-                 }
-                 break;
-             case CT_SINGLE://e.g. 12021-2129-3121
-                 //Atom made only of digits and +/- signs is treated as a number,
-                 //anything else as a generic string
-                 if(GetCharCount("+-1234567890",m_sAtom[i])==(int)strlen(m_sAtom[i]))
-                 {
-                     nPOS=-27904;//-('m'*256)
-                     strcpy(sWord,"未##数");
-                 }
-                 else
-                 {
-                     nPOS=-'n'*256-'x';
-                     strcpy(sWord,"未##串");
-                 }
-                 dValue=0;
-                 break;
-             default:
-                 nPOS=m_nAtomPOS[i];//Use the atom type itself as POS
-                 break;
-             }
-             if(!bOriginalFreq)//Cost mode: zero cost, no word text
-                 m_segGraph.SetElement(i,i+1,0,nPOS);
-             else//Original frequency mode: value and (possibly replaced) word text
-                 m_segGraph.SetElement(i,i+1,dValue,nPOS,sWord);
-         }
-     }
- 
-     //Pass 2: multi-atom words starting at every atom i
-     i=0;
-     while(i<m_nAtomCount)
-     {
-         strcpy(sWord,m_sAtom[i]);//Get the current atom
-         j=i+1;
-         //Don't split 月份: move the edge end one atom further.
-         //"i<m_nAtomCount-1" keeps the m_sAtom[i+1] access inside the atom list.
-         if(strcmp(sWord,"月")==0&&i<m_nAtomCount-1&&strcmp(m_sAtom[i+1],"份")==0)
-             j+=1;
-         //Don't split 点钟|刻钟: same treatment
-         if((strcmp(sWord,"点")==0||strcmp(sWord,"刻")==0)&&
-             i<m_nAtomCount-1&&strcmp(m_sAtom[i+1],"钟")==0)
-             j+=1;
- 
-         while(j<=m_nAtomCount)
-         {
-             bool find=dictCore.GetMaxMatch(sWord,sWordMatch,&nHandleTemp);
-             if(find)
-             {
-                 if(strcmp(sWordMatch,sWord)==0)//The current string is itself a dictionary word
-                 {
-                     nTotalFreq=0;
-                     dictCore.GetHandle(sWord,&nMatchCount,nMatchHandle,nMatchFreq);
-                     for(k=0;k<nMatchCount;k++)//Sum the frequency over all handles
-                     {
-                         nTotalFreq+=nMatchFreq[k];
-                     }
-                     //Rule excluding some words: a 4-byte word starting with 年/月,
-                     //preceded by a number atom and ending in one of 末内中底前间初
-                     //(e.g. 1年内, 1999年末) is not formed, and extension from i stops.
-                     if(strlen(sWord)==4&&i>=1&&(IsAllNum((unsigned char *)m_sAtom[i-1])||IsAllChineseNum(m_sAtom[i-1]))&&(strncmp(sWord,"年",2)==0||strncmp(sWord,"月",2)==0))
-                     {
-                         if(CC_Find("末内中底前间初",sWord+2))
-                             break;
-                     }
-                     //Store the handle only when the word has exactly one; otherwise 0
-                     int nHandle=(nMatchCount==1)?nMatchHandle[0]:0;
-                     if(!bOriginalFreq)//Cost mode
-                         m_segGraph.SetElement(i,j,-log(double(nTotalFreq+1))+log(double(MAX_FREQUENCE)),nHandle);
-                     else//Original frequency mode
-                         m_segGraph.SetElement(i,j,nTotalFreq,nHandle,sWord);
-                 }
-             }
-             else if(j-i>8)//No dictionary match and the candidate is already long: give up
-             {
-                 break;
-             }
- 
-             //m_sAtom[m_nAtomCount] lies past the last atom; stop instead of reading it.
-             if(j>=m_nAtomCount)
-                 break;
-             //Never let the concatenation outgrow the sWord buffer.
-             if(strlen(sWord)+strlen(m_sAtom[j])>=sizeof(sWord))
-                 break;
-             strcat(sWord,m_sAtom[j++]);//Extend the candidate with the next atom
-         }
-         i+=1;//Continue with the next start atom
-     }
-     return true;
- }
- /*
- * ----- commented by huangjin@ict.ac.cn 2006-7-11 ------
- *
- *
- //DELbool CSegGraph::AtomSegment(char *sSentence)
- //DEL{
- //DEL unsigned int i=0,j=0,nCurType,nNextType;
- //DEL //i is the pointer of sentence string
- //DEL //j is the pointer of pAtoms
- //DEL char sChar[3];
- //DEL sChar[2]=0;//Set the char ending
- //DEL m_sAtom[j][0]=0;//Set the first word as null
- //DEL m_nAtomLength[j]=0;
- //DEL if(strncmp(sSentence,SENTENCE_BEGIN,strlen(SENTENCE_BEGIN))==0)
- //DEL {
- //DEL strcpy(m_sAtom[j],SENTENCE_BEGIN);//Set the first word as sentence begining
- //DEL m_nAtomLength[j]=strlen(SENTENCE_BEGIN);
- //DEL m_nAtomPOS[j]=CT_SENTENCE_BEGIN;//init
- //DEL i+=m_nAtomLength[j];
- //DEL j+=1;
- //DEL m_sAtom[j][0]=0;//Set the first word as null
- //DEL m_nAtomLength[j]=0;
- //DEL }
- //DEL while(i<strlen(sSentence))
- //DEL {
- //DEL if(strncmp(sSentence+i,SENTENCE_END,strlen(SENTENCE_END))==0)
- //DEL {
- //DEL strcpy(m_sAtom[j],SENTENCE_END);//Set the first word as null
- //DEL m_nAtomLength[j]=strlen(SENTENCE_END);
- //DEL m_nAtomPOS[j]=CT_SENTENCE_END;//init
- //DEL i+=m_nAtomLength[j];
- //DEL j+=1;
- //DEL m_sAtom[j][0]=0;//Set the first word as null
- //DEL m_nAtomLength[j]=0;
- //DEL continue;
- //DEL }
- //DEL /*----Added By huangjin@ict.ac.cn 2006-6-8----*/
- //DEL //连续的三个点到六个点认为是省略号,不要切开
- //DEL char c=*(sSentence+i);
- //DEL if(c=='.')
- //DEL {
- //DEL char ellipsis[]= "......";
- //DEL bool bEllipsis = false;
- //DEL for( int tmpindex = 5; tmpindex>=2; tmpindex-- )
- //DEL {
- //DEL if( strncmp(sSentence+i,ellipsis,strlen(ellipsis))==0)
- //DEL {
- //DEL strcpy(m_sAtom[j],ellipsis);
- //DEL m_nAtomLength[j]=strlen(ellipsis);
- //DEL m_nAtomPOS[j]=CT_DELIMITER;//init
- //DEL i+=m_nAtomLength[j];
- //DEL j+=1;
- //DEL m_sAtom[j][0]=0;//Set the first word as null
- //DEL m_nAtomLength[j]=0;
- //DEL bEllipsis=true;
- //DEL break;
- //DEL }
- //DEL else
- //DEL {
- //DEL ellipsis[tmpindex]=' ';
- //DEL }
- //DEL }
- //DEL if( bEllipsis )
- //DEL {
- //DEL continue;
- //DEL }
- //DEL }
- //DEL else if(c=='-'&&i+1<strlen(sSentence))
- //DEL {
- //DEL c=*(sSentence+i+1);
- //DEL if(c=='-')
- //DEL {
- //DEL strcpy(m_sAtom[j],"--");
- //DEL m_nAtomLength[j]=strlen("--");
- //DEL m_nAtomPOS[j]=CT_DELIMITER;//init
- //DEL i+=m_nAtomLength[j];
- //DEL j+=1;
- //DEL m_sAtom[j][0]=0;//Set the first word as null
- //DEL m_nAtomLength[j]=0;
- //DEL continue;
- //DEL }
- //DEL }
- //DEL /*---------------------------------------------*/
- //DEL
- //DEL sChar[0]=*(sSentence+i);//Get the char with first byte
- //DEL sChar[1]=0;//
- //DEL i+=1;
- //DEL if(sChar[0]<0)//Two byte char
- //DEL {
- //DEL sChar[1]=*(sSentence+i);//Get the char with second byte
- //DEL i+=1;//i increased by 1
- //DEL }
- //DEL strcat(m_sAtom[j],sChar);
- //DEL nCurType=charType((unsigned char *)sChar);
- //DEL if(sChar[0]=='.'&&(charType((unsigned char *)sSentence+i)==CT_NUM||(*(sSentence+i)>='0'&&*(sSentence+i)<='9')))
- //DEL nCurType=CT_NUM;//Digit after . indicate . as a point in the numeric
- //DEL
- //DEL m_nAtomPOS[j]=nCurType;
- //DEL //Record its property, just convience for continuous processing
- //DEL
- //DEL if(nCurType==CT_CHINESE||nCurType==CT_INDEX||nCurType==CT_DELIMITER||nCurType==CT_OTHER)
- //DEL {//Chinese char, index number,delimiter and other is treated as atom
- //DEL m_nAtomLength[j]=strlen(m_sAtom[j]);//Save its length
- //DEL j+=1;//Skip to next atom
- //DEL m_sAtom[j][0]=0;//init
- //DEL /*----Added By huangjin@ict.ac.cn 2006-6-8----*/
- //DEL m_nAtomLength[j]=0;
- //DEL /*---------------------------------------------*/
- //DEL }
- //DEL else
- //DEL {//Number,single char, letter
- //DEL nNextType=255;
- //DEL if(i<strlen(sSentence))
- //DEL nNextType=charType((unsigned char *)(sSentence+i));
- //DEL if(nNextType!=nCurType||i==strlen(sSentence))
- //DEL //Reaching end or next char type is different from current char
- //DEL {
- //DEL m_nAtomLength[j]=strlen(m_sAtom[j]);//Save its length
- //DEL j+=1;
- //DEL m_sAtom[j][0]=0;//init
- //DEL /*----Added By huangjin@ict.ac.cn 2006-6-8----*/
- //DEL m_nAtomLength[j]=0;
- //DEL /*---------------------------------------------*/
- //DEL }
- //DEL }
- //DEL }
- //DEL m_nAtomCount=j;//The count of segmentation atoms
- //DEL return true;
- //DEL}
- //DEL*
- //DEL*/
- /*----Added By huangjin@ict.ac.cn 2006-9-12----*/
- bool CSegGraph::AtomSegment(char *sSentence)
- {
- unsigned int i=0, j=0, nCurType, nNextType;
- //i is the pointer of sSentence string
- //j is the pointer of m_sAtom, m_nAtomLength an m_nAtomPOS
- char sChar[3]="