Segment.cpp
上传用户:chen_dj
上传日期:2013-04-22
资源大小:111k
文件大小:9k
- /****************************************************************************
- *
- * Copyright (c) 2000, 2001
- * Machine Group
- * Software Research Lab.
- * Institute of Computing Tech.
- * Chinese Academy of Sciences
- * All rights reserved.
- *
- * This file is the confidential and proprietary property of
- * Institute of Computing Tech. and the possession or use of this file requires
- * a written license from the author.
- * Filename: Segment.cpp
- * Abstract:
- * implementation of the CSegment class.
- * Author: Kevin Zhang
- * (zhanghp@software.ict.ac.cn)
- * Date: 2002-4-23
- *
- * Notes: N-Shortest paths Word segmentation
- *
- ****************************************************************************/
- #include "stdafx.h"
- #include "Segment.h"
- #include "..\Utility\Dictionary.h"
- #include "..\Utility\Utility.h"
- #include "NShortPath.h"
- #include <string.h>
- #include <math.h>
- //////////////////////////////////////////////////////////////////////
- // Construction/Destruction
- //////////////////////////////////////////////////////////////////////
- CSegment::CSegment()
- {
- //malloc buffer
- m_pWordSeg=new PWORD_RESULT[MAX_SEGMENT_NUM];
- for(int i=0;i<MAX_SEGMENT_NUM;i++)
- {
- m_pWordSeg[i]=new WORD_RESULT[MAX_WORDS];
- }
- }
- CSegment::~CSegment()
- {
- //free buffer
- for(int i=0;i<MAX_SEGMENT_NUM;i++)
- {
- delete m_pWordSeg[i];
- }
- delete m_pWordSeg;
- }
- bool CSegment::Segment(char *sSentence,CDictionary &dictCore,int nResultCount)
- {
- int **nSegRoute;//The segmentation route
- nSegRoute=new int*[MAX_SEGMENT_NUM];
- for(int i=0;i<MAX_SEGMENT_NUM;i++)
- {
- nSegRoute[i]=new int[MAX_SENTENCE_LEN/2];
- memset(nSegRoute[i],0,MAX_SENTENCE_LEN/2*sizeof(int));
- }
- m_graphSeg.GenerateWordNet(sSentence,dictCore);
- CNShortPath sp(&m_graphSeg.m_segGraph,nResultCount);
- sp.ShortPath();
- sp.Output((int **)nSegRoute,false,&m_nSegmentCount);
- m_graphOptimum.SetEmpty();//Set graph optimum empty
- i=0;
- while(i<m_nSegmentCount)
- {
- GenerateWord(nSegRoute,i);
- //Gernerate word according the Segmentation route
- i++;
- }
-
- //free the memory
- for(i=0;i<MAX_SEGMENT_NUM;i++)
- {
- delete [] nSegRoute[i];//free the pointer memory
- }
- delete [] nSegRoute;//free the pointer array
-
- return true;
- }
- //Generate Word according the segmentation route
- bool CSegment::GenerateWord(int **nSegRoute, int nIndex)
- {
- unsigned int i=0,k=0;
- int j,nStartVertex,nEndVertex,nPOS;
- char sAtom[WORD_MAXLENGTH],sNumCandidate[100];
- ELEMENT_TYPE fValue;
- while(nSegRoute[nIndex][i]!=-1&&nSegRoute[nIndex][i+1]!=-1&&nSegRoute[nIndex][i]<nSegRoute[nIndex][i+1])
- {
- nStartVertex=nSegRoute[nIndex][i];
- j=nStartVertex;//Set the start vertex
- nEndVertex=nSegRoute[nIndex][i+1];//Set the end vertex
- nPOS=0;
- m_graphSeg.m_segGraph.GetElement(nStartVertex,nEndVertex,&fValue,&nPOS);
- sAtom[0]=0;
- while(j<nEndVertex)
- {//Generate the word according the segmentation route
- strcat(sAtom,m_graphSeg.m_sAtom[j]);
- j++;
- }
- m_pWordSeg[nIndex][k].sWord[0]=0;//Init the result ending
- strcpy(sNumCandidate,sAtom);
- while(sAtom[0]!=0&&(IsAllNum((unsigned char *)sNumCandidate)||IsAllChineseNum(sNumCandidate)))
- {//Merge all seperate continue num into one number
- //sAtom[0]!=0: add in 2002-5-9
- strcpy(m_pWordSeg[nIndex][k].sWord,sNumCandidate);
- //Save them in the result segmentation
- i++;//Skip to next atom now
- sAtom[0]=0;
-
- while(j<nSegRoute[nIndex][i+1])
- {//Generate the word according the segmentation route
- strcat(sAtom,m_graphSeg.m_sAtom[j]);
- j++;
- }
- strcat(sNumCandidate,sAtom);
- }
- if(m_pWordSeg[nIndex][k].sWord[0]==0)//Have never entering the while loop
- {
- strcpy(m_pWordSeg[nIndex][k].sWord,sAtom);
- //Save them in the result segmentation
- }
- else
- {//It is a num
- if(strcmp("--",m_pWordSeg[nIndex][k].sWord)==0||strcmp("—",m_pWordSeg[nIndex][k].sWord)==0||m_pWordSeg[nIndex][k].sWord[0]=='-')//The delimiter "--"
- {
- nPOS=30464;//'w'*256;Set the POS with 'w'
- i--;//Not num, back to previous word
- }
- else
- {//Adding time suffix
- char sInitChar[3];
- unsigned int nCharIndex=0;//Get first char
- sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex];
- if(sInitChar[nCharIndex]<0)
- {
- nCharIndex+=1;
- sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex];
- }
- nCharIndex+=1;
- sInitChar[nCharIndex]=' ';
- if(k>0&&m_pWordSeg[nIndex][k-1].nHandle==27904&&(strcmp(sInitChar,"—")==0||sInitChar[0]=='-')&&(strlen(m_pWordSeg[nIndex][k].sWord)>nCharIndex))
- {//3-4月 //27904='m'*256
- //Split the sInitChar from the original word
- strcpy(m_pWordSeg[nIndex][k+1].sWord,m_pWordSeg[nIndex][k].sWord+nCharIndex);
- m_pWordSeg[nIndex][k+1].dValue=m_pWordSeg[nIndex][k].dValue;
- m_pWordSeg[nIndex][k+1].nHandle=27904;
- m_pWordSeg[nIndex][k].sWord[nCharIndex]=0;
- m_pWordSeg[nIndex][k].dValue=0;
- m_pWordSeg[nIndex][k].nHandle=30464;//'w'*256;
- m_graphOptimum.SetElement(nStartVertex,nStartVertex+1,m_pWordSeg[nIndex][k].dValue,m_pWordSeg[nIndex][k].nHandle);
- nStartVertex+=1;
- k+=1;
- }
- unsigned int nLen=strlen(m_pWordSeg[nIndex][k].sWord);
- if((strlen(sAtom)==2&&CC_Find("月日时分秒",sAtom))||strcmp(sAtom,"月份")==0)
- {//2001年
- strcat(m_pWordSeg[nIndex][k].sWord,sAtom);
- nPOS=29696;//'t'*256;//Set the POS with 'm'
- }
- else if(strcmp(sAtom,"年")==0)
- {
- if(IsYearTime(m_pWordSeg[nIndex][k].sWord))//strncmp(sAtom,"年",2)==0&&
- {//1998年,
- strcat(m_pWordSeg[nIndex][k++].sWord,sAtom);
- nPOS='t'*256;//Set the POS with 'm'
- }
- else
- i--;//Can not be a time word
- }
- else
- {
- //早晨/t 五点/t
- if(strcmp(m_pWordSeg[nIndex][k].sWord+strlen(m_pWordSeg[nIndex][k].sWord)-2,"点")==0)
- {
- nPOS='t'*256;//Set the POS with 'm'
- }
- else
- {
- if(m_pWordSeg[nIndex][k].sWord[0]!='.')
- nPOS='m'*256;//Set the POS with 'm'
- if(nLen>1&&m_pWordSeg[nIndex][k].sWord[nLen-1]=='.')
- {//Get rid of . example 1.
- m_pWordSeg[nIndex][k].sWord[nLen-1]=0;
- i--;
- }
- }
- i--;//Not num, back to previous word
- }
- }
- fValue=0;
- nEndVertex=nSegRoute[nIndex][i+1];//Ending POS changed to latter
- }
- m_pWordSeg[nIndex][k].nHandle=nPOS;//Get the POS of current word
- m_pWordSeg[nIndex][k].dValue=fValue;//(int)(MAX_FREQUENCE*exp(-fValue));//Return the frequency of current word
- m_graphOptimum.SetElement(nStartVertex,nEndVertex,fValue,nPOS);
- //Generate optimum segmentation graph according the segmentation result
- i++;//Skip to next atom
- k++;//Accept next word
- }
- m_pWordSeg[nIndex][k].sWord[0]=0;
- m_pWordSeg[nIndex][k].nHandle=-1;//Set ending
- return true;
- }
- //DEL bool CSegment::GetSegmentResult(int nIndex,char *sResult)
- //DEL {
- //DEL int i=0;
- //DEL char sTempBuffer[WORD_MAXLENGTH];
- //DEL sResult[0]=0;
- //DEL if(nIndex<0||nIndex>=m_nSegmentCount)
- //DEL return false;
- //DEL while(m_WordSeg[nIndex][i].sWord[0]!=0)
- //DEL {
- //DEL sprintf(sTempBuffer,"%s/%c%c",m_WordSeg[nIndex][i].sWord,m_WordSeg[nIndex][i].nHandle/256,m_WordSeg[nIndex][i].nHandle%256);
- //DEL strcat(sResult,sTempBuffer);
- //DEL strcat(sResult," ");
- //DEL i++;
- //DEL }
- //DEL return true;
- //DEL }
- //Word Segmentation based on optimum segmentation graph
- //After unknown word recognition
- bool CSegment::OptimumSegmet(int nResultCount)
- {
- int **nSegRoute;//The segmentation route
- nSegRoute=new int*[MAX_SEGMENT_NUM];
- for(int i=0;i<MAX_SEGMENT_NUM;i++)
- {
- nSegRoute[i]=new int[MAX_SENTENCE_LEN/2];
- }
- CNShortPath sp(&m_graphOptimum,nResultCount);
- sp.ShortPath();
- sp.Output((int **)nSegRoute,false,&m_nSegmentCount);
- i=0;
- m_graphSeg.m_segGraph=m_graphOptimum;
- m_graphOptimum.SetEmpty();//Set graph optimum empty
- while(i<m_nSegmentCount)
- {
- GenerateWord(nSegRoute,i);
- //Gernerate word according the Segmentation route
- i++;
- }
-
- //free the memory
- for(i=0;i<MAX_SEGMENT_NUM;i++)
- {
- delete [] nSegRoute[i];//free the pointer memory
- }
- delete [] nSegRoute;//free the pointer array
- return true;
- }
- int CSegment::GetResultCount(PWORD_RESULT pItem)
- {
- int nCount=0;
- while(pItem[nCount].sWord[0]!=0)
- {
- nCount+=1;
- }
- return nCount;
- }
- bool CSegment::GetLastWord(PWORD_RESULT pItem, char *sWordRet)
- {
- int nCount=0;
- sWordRet[0]=0;
- while(pItem[nCount].sWord[0]!=0)
- {
- strcpy(sWordRet,pItem[nCount].sWord);
- nCount+=1;
- }
- return !sWordRet[0];
- }
- bool CSegment::IsYearTime(char *sNum)
- {//Judge whether the sNum is a num genearating year
- unsigned int nLen=strlen(sNum);
- char sTemp[3];
- strncpy(sTemp,sNum,2);
- sTemp[2]=0;
- if(IsAllSingleByte((unsigned char *)sNum)&&(nLen>=3||nLen==2&&sNum[0]>'4'))//1992年, 90年
- return true;
- if(IsAllNum((unsigned char *)sNum)&&(nLen>=6||nLen==4&&CC_Find("56789",sTemp)))
- return true;
- if(GetCharCount("零○一二三四五六七八九壹贰叁肆伍陆柒捌玖",sNum)==(int)nLen/2&&nLen>=3)
- return true;
- if(nLen==8&&GetCharCount("千仟零○",sNum)==2)//二仟零二年
- return true;
- return false;
- }