WordSegment.cpp
Uploaded by: sanxfzhen
Upload date: 2014-12-28
Package size: 2324k
File size: 9k
// WordSegment.cpp: implementation of the CWordSegment class.
//
//////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include "WordSegment.h"
#include ".\Utility\Utility.h"
const unsigned short CWordSegment::uTag = 0x0001;         // part-of-speech tagging flag
const unsigned short CWordSegment::uPlace = 0x0002;       // Chinese place-name recognition
const unsigned short CWordSegment::uPerson = 0x0004;      // Chinese person-name recognition
const unsigned short CWordSegment::uTransPerson = 0x0008; // transliterated (foreign) person-name recognition
char CWordSegment::m_pSentence[MAX_PATH*10]; // holds a sentence after spaces, line breaks, etc. have been stripped
CWordSegment g_wordSeg;
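// The four flags above are bit masks, so callers presumably combine them with bitwise OR
// and hand the result to SetSegSetting(). A minimal sketch, assuming the static flag
// members are publicly accessible (their declarations are not shown in this file):
//
//   unsigned short setting = CWordSegment::uTag | CWordSegment::uPerson
//                          | CWordSegment::uPlace | CWordSegment::uTransPerson;
//   g_wordSeg.SetSegSetting(setting);   // enable POS tagging and every name/place recognizer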
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////

// Hash overload for CString keys, used by the map-based stop-word list (m_lstStopWords).
AFX_INLINE UINT AFXAPI HashKey(CString key)
{
    return HashKey((LPCTSTR)key);
}
CWordSegment::CWordSegment()
{
    m_nInit=0;
    m_nOperateType=0;
    m_dSmoothingPara=0.1;   // smoothing parameter passed to BiSegment
}
CWordSegment::~CWordSegment()
{
    FreeWordSegment();
}
bool CWordSegment::InitWorgSegment(char *pPath, int nLanguage)
{
    if(!m_nInit)
    {
        char filename[MAX_PATH];
        if(nLanguage==0)
        {
            // load the Chinese dictionaries, POS context model and unknown-word models
            strcpy(filename,pPath);
            strcat(filename,"\\data\\coreDict.dct");
            if(!m_dictCore.Load(filename)) return false;
            strcpy(filename,pPath);
            strcat(filename,"\\data\\lexical.ctx");
            if(!m_POSTagger.LoadContext(filename)) return false;
            m_POSTagger.SetTagType();
            strcpy(filename,pPath);
            strcat(filename,"\\data\\nr");
            if(!m_uPerson.Configure(filename,TT_PERSON)) return false;
            strcpy(filename,pPath);
            strcat(filename,"\\data\\ns");
            if(!m_uPlace.Configure(filename,TT_PLACE)) return false;
            strcpy(filename,pPath);
            strcat(filename,"\\data\\tr");
            if(!m_uTransPerson.Configure(filename,TT_TRANS_PERSON)) return false;
            strcpy(filename,pPath);
            strcat(filename,"\\data\\BigramDict.dct");
            if(!m_dictBigram.Load(filename)) return false;
        }
        // load the stop-word list for the selected language
        FILE *stream;
        char word[MAX_PATH];
        strcpy(filename,pPath);
        if(nLanguage==0)
        {
            strcat(filename,"\\data\\stopwords.txt");
            m_nInit=1;
        }
        else
        {
            strcat(filename,"\\data\\engstopwords.txt");
            m_nInit=2;
        }
        if((stream = fopen( filename, "r" )) == NULL)
            return false;
        while(!feof(stream))
        {
            fscanf(stream,"%s\n",word);
            if(word[0]!=' ') m_lstStopWords.SetAt(word,1);
        }
        m_lstStopWords.SetAt("\r\n",1);
        m_lstStopWords.SetAt(" \r\n",1);
        m_lstStopWords.SetAt("\n",1);
        m_lstStopWords.SetAt(" \n",1);
        fclose(stream);
    }
    return true;
}
void CWordSegment::FreeWordSegment(void)
{
    if(m_nInit==1)
    {
        m_lstStopWords.RemoveAll();
        m_dictCore.ReleaseDict();
        m_dictBigram.ReleaseDict();
        m_POSTagger.ReleaseSpan();
        m_uPerson.ReleaseUnknowWord();
        m_uTransPerson.ReleaseUnknowWord();
        m_uPlace.ReleaseUnknowWord();
    }
    if(m_nInit==2)
    {
        m_lstStopWords.RemoveAll();
    }
    m_nInit=0;
}
void CWordSegment::SetSegSetting(unsigned short Setting)
{
    m_nOperateType=Setting;
}
unsigned short CWordSegment::GetSegSetting()
{
    return m_nOperateType;
}
bool CWordSegment::Segment(char* sentence)
{
    int nIndex,nResultCount;
    strcpy(m_pNewSentence,SENTENCE_BEGIN);
    strcat(m_pNewSentence,sentence);
    strcat(m_pNewSentence,SENTENCE_END);
    m_Seg.BiSegment(m_pNewSentence, m_dSmoothingPara,m_dictCore,m_dictBigram,1);
    nResultCount=m_Seg.m_nSegmentCount;
    for(nIndex=0;nIndex<m_Seg.m_nSegmentCount;nIndex++)
    {
        // run only the unknown-word recognizers selected through SetSegSetting()
        if(m_nOperateType&uPerson)
            m_uPerson.Recognition(m_Seg.m_pWordSeg[nIndex],m_Seg.m_graphOptimum,m_Seg.m_graphSeg,m_dictCore);
        if(m_nOperateType&uTransPerson)
            m_uTransPerson.Recognition(m_Seg.m_pWordSeg[nIndex],m_Seg.m_graphOptimum,m_Seg.m_graphSeg,m_dictCore);
        if(m_nOperateType&uPlace)
            m_uPlace.Recognition(m_Seg.m_pWordSeg[nIndex],m_Seg.m_graphOptimum,m_Seg.m_graphSeg,m_dictCore);
    }
    m_Seg.BiOptimumSegment(1,m_dSmoothingPara,m_dictBigram,m_dictCore);
    for(nIndex=0;nIndex<m_Seg.m_nSegmentCount;nIndex++)
    {
        if(m_nOperateType&uTag)
            m_POSTagger.POSTagging(m_Seg.m_pWordSeg[nIndex],m_dictCore,m_dictCore);
    }
    return true;
}
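// A minimal usage sketch for the segmentation pipeline above, not taken from the original
// project: it assumes the data files live in a "data" sub-directory under the path passed
// to InitWorgSegment(), the sample input buffer szGbkSentence is illustrative, and only the
// sWord field of WORD_RESULT that this file already relies on is used.
//
//   CWordSegment seg;
//   if(seg.InitWorgSegment(".",0))                          // 0 = load the Chinese resources
//   {
//       seg.SetSegSetting(CWordSegment::uTag|CWordSegment::uPerson|CWordSegment::uPlace);
//       seg.Segment(szGbkSentence);                         // szGbkSentence: a GBK-encoded input
//       for(int i=0;i<seg.GetSegmentCount();i++)
//       {
//           PWORD_RESULT pItem=seg.GetWordSeg(i);
//           for(int j=0;pItem[j].sWord[0]!=0;j++)
//               printf("%s ",pItem[j].sWord);               // print the tokens of result i
//       }
//       seg.FreeWordSegment();
//   }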
int CWordSegment::GetSegmentCount()
{
    return m_Seg.m_nSegmentCount;
}
PWORD_RESULT CWordSegment::GetWordSeg(int index)
{
    return m_Seg.m_pWordSeg[index];
}
bool CWordSegment::isInStopWords(char *pWord)
{
    int t;
    return(m_lstStopWords.Lookup(pWord,t)>0);
}
// bStopWord=true: remove stop words
// bTag=true: include part-of-speech tags in the result file
int CWordSegment::SegmentFile(const char *source, const char *target, BOOL bStopWord, BOOL bTag)
{
    CFile fin;
    FILE *fout;
    if(!fin.Open(source,CFile::modeRead))
        return -1;
    if((fout=fopen(target,"w+"))==NULL)
    {
        fin.Close();
        return -1;
    }
    // read the whole file into memory
    int num=0;
    unsigned int flen=fin.GetLength();
    char *buffer=new char[flen+1];
    flen=fin.ReadHuge(buffer,flen);
    buffer[flen]='\0';
    fin.Close();
    int i,j,sum;
    char *w;
    // realcnt is the number of words left in the document after stop-word removal
    // nStart is the start position of the current sentence within buffer
    int nStart=0,nNewStart=0;
    bool flag=true;
    int nSentenceLen=0;
    int realcnt=0;
    while(buffer[nStart]!='\0')
    {
        flag=true;
        nSentenceLen=ParseFile(buffer,nStart,nNewStart);
        nStart=nNewStart;
        if(nSentenceLen==0) continue;
        if(m_pSentence[0]>0) // an ASCII (English) token
        {
            // keep English tokens that are at least two characters long and do not start with a digit
            if((nSentenceLen>=2)&&((m_pSentence[0]<'0')||(m_pSentence[0]>'9')))
            {
                fprintf(fout,"%s\n",m_pSentence);
                realcnt++;
            }
        }
        else // a Chinese (double-byte) string
        {
            if(nSentenceLen%2!=0) continue;
            if(nSentenceLen==2) // a single Chinese character
            {
                fprintf(fout,"%s\n",m_pSentence);
                realcnt++;
            }
            else
            {
                g_wordSeg.Segment(m_pSentence);
                for(i=0;i<g_wordSeg.GetSegmentCount();i++)
                {
                    sum=0;
                    PWORD_RESULT pItem=g_wordSeg.GetWordSeg(i);
                    while(pItem[sum].sWord[0]!=0) sum++;

                    // skip the sentence-begin/end markers at positions 0 and sum-1
                    for(j=1;j<sum-1;j++)
                    {
                        w=pItem[j].sWord;
                        flag=true;
                        // skip blank results and, when bStopWord is set, stop words
                        if(w[0]==' '||(bStopWord&&g_wordSeg.isInStopWords(w))) flag=false;
                        if(flag)
                        {
                            fprintf(fout,"%s\n",w);
                            realcnt++;
                        }
                    }
                }
            }
        }
    }
    fclose(fout);
    delete[] buffer;
    return realcnt;
}
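// A hedged usage sketch for SegmentFile(): it writes one token per line to the target file
// and returns the number of tokens kept, or -1 when either file cannot be opened. The file
// names below are illustrative only, and g_wordSeg is assumed to have been initialised with
// InitWorgSegment() beforehand.
//
//   int nWords=g_wordSeg.SegmentFile("corpus.txt","corpus.seg",TRUE /*drop stop words*/,
//                                    FALSE /*no POS tags*/);
//   if(nWords<0)
//       printf("SegmentFile could not open the input or output file\n");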
// Scans pBuffer starting at nStart, finds the end position nEnd of one sentence,
// and copies the sentence, with spaces, line breaks, etc. stripped, into m_pSentence.
int CWordSegment::ParseFile(char *pBuffer, int nStart, int &nEnd)
{
    // nSum is the number of bytes in the extracted sentence
    int nCurrent,nSum=0;
    byte bChar[2];
    // whether this is a Chinese sentence
    bool bChinese=true;
    nCurrent=nStart;
    // skip full-width and half-width spaces at the start of the sentence and
    // decide whether the sentence starts with a Chinese or an ASCII character
    while(pBuffer[nCurrent]!='\0')
    {
        bChar[0]=pBuffer[nCurrent];
        if(bChar[0]==' '||bChar[0]=='\r'||bChar[0]=='\n')
            nCurrent++;
        else if(bChar[0]==0xA1)
        {
            bChar[1]=pBuffer[nCurrent+1];
            if(bChar[1]==0xA1)           // 0xA1A1 is the GBK full-width space
                nCurrent+=2;
            else
            {
                bChinese=true;
                break;
            }
        }
        else if(pBuffer[nCurrent]>0)     // ASCII byte
        {
            bChinese=false;
            break;
        }
        else                             // high byte: start of a GBK character
        {
            bChinese=true;
            break;
        }
    }
    while(pBuffer[nCurrent]!='\0')
    {
        bChar[0]=pBuffer[nCurrent];
        if(bChar[0]>127)                 // lead byte of a double-byte GBK character
        {
            if(!bChinese) break;
            nCurrent++;
            bChar[1]=pBuffer[nCurrent];
            // 0xA1A1 is the full-width space
            if((bChar[0]!=0xA1)||(bChar[1]!=0xA1))
            {
                // the character "的" (0xB5C4), a punctuation mark, or another full-width symbol
                // ends the sentence
                if(((bChar[0]==0xB5)&&(bChar[1]==0xC4))||
                    ((bChar[0]==0xA1)&&(bChar[1]>0xA1)&&(bChar[1]<=0xFE))||
                    ((bChar[0]==0xA2)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xFC))||
                    ((bChar[0]==0xA3)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xFE))||
                    ((bChar[0]==0xA4)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xF3))||
                    ((bChar[0]==0xA5)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xF6))||
                    ((bChar[0]==0xA6)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xF5))||
                    ((bChar[0]==0xA7)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xF1))||
                    ((bChar[0]==0xA8)&&(bChar[1]>=0x40)&&(bChar[1]<=0xE9))||
                    ((bChar[0]==0xA9)&&(bChar[1]>=0x40)&&(bChar[1]<=0xEF))||
                    ((bChar[0]==0xAA)&&(bChar[1]==0xA5)))
                {
                    nCurrent++;
                    break;
                }
                // a trail byte below 0x80 means a broken Chinese character, which is dropped
                else if(bChar[1]>=0x80)
                {
                    m_pSentence[nSum]=pBuffer[nCurrent-1];
                    nSum++;
                    m_pSentence[nSum]=pBuffer[nCurrent];
                    nSum++;
                }
            }
        }
        else
        {
            if(bChar[0]==' '||bChar[0]=='\r'||bChar[0]=='\n')
            {
                if(!bChinese)
                {
                    nCurrent++;
                    break;
                }
            }
            else
            {
                //if(bChar[0]=='!'||bChar[0]=='?'||bChar[0]==':'||
                //   bChar[0]==';'||bChar[0]=='.')
                // ASCII punctuation ends the sentence
                if((bChar[0]>32&&bChar[0]<=47)||(bChar[0]>=58&&bChar[0]<=64)||
                    (bChar[0]>=91&&bChar[0]<=96)||(bChar[0]>=123&&bChar[0]<=127))
                {
                    nCurrent++;
                    break;
                }
                else if(bChinese) break;
                else
                {
                    m_pSentence[nSum]=pBuffer[nCurrent];
                    nSum++;
                }
            }
        }
        nCurrent++;
    }
    m_pSentence[nSum]='\0';
    nEnd=nCurrent;
    return nSum;
}
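// A standalone sketch (not part of the original file) of the GBK byte test that ParseFile
// relies on; the helper name IsGbkDoubleByte is hypothetical and introduced only for
// illustration.
static inline bool IsGbkDoubleByte(unsigned char lead, unsigned char trail)
{
    // mirrors the bChar[0]>127 and bChar[1]>=0x80 checks in the loop above: a lead byte
    // above 0x7F starts a double-byte character, and a trail byte below 0x80 is treated
    // as a broken character and dropped
    return lead>127 && trail>=0x80;
}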