WordSegment.cpp
资源名称:Cfenci.rar [点击查看]
上传用户:mesing
上传日期:2022-05-05
资源大小:353k
文件大小:6k
源码类别:
多国语言处理
开发平台:
C/C++
- #include<iostream>
- #include<fstream>
- #include<vector>
- #include<string>
- #include<stack>
- using namespace std;
- const int s1 = 0XB0,s2 = 0XA1,e1 = 0XF8,e2 = 0XFF;
- const int maxwordlen = 50;
- struct Second
- {
- string key;
- Second *next;
- Second(string k = "",Second *n = 0):key(k),next(n){}
- };
- struct Head
- {
- int size;
- string key;
- vector<Second*> W;
- Head(string k = "",int s = 0):key(k),size(s){}
- };
- class Dictiory
- {
- vector<Head> H;
- ifstream fin;
- ifstream fcin;
- ofstream fout;
- int hash[e1 - s1][e2 - s2];
- int BinarySearch(string str,int k);
- int GetNum();
- void LoadDic();
- bool IsC(char c);
- bool IsEc(char c);
- void AddWord(string str,int k);
- void InsertWord(string str,int k);
- bool IsWord(string str,int k,int t);
- void SkipNotChinese(string &str,stack<string> &stk);
- public:
- Dictiory(string sfilename,string dfilename);
- void SegmentWord(string s);
- void PrintDic()
- {
- for(int i = 0; i < e1 - s1;i++)
- for(int j = 0; j < e2 - s2;j++)
- {
- if(hash[i][j] >= 0)
- {
- fout << H[hash[i][j]].key << endl;
- for(int k = 0; k < H[hash[i][j]].W.size() ;k++)
- {
- Second *t = H[hash[i][j]].W[k];
- while(t)
- {
- fout << H[hash[i][j]].key;
- fout << t->key << endl;
- t = t->next;
- }
- }
- }
- fout << endl;
- }
- }
- };
- Dictiory::Dictiory(string sfilename,string dfilename)
- {
- int i,j;
- for(i = 0; i < e1 - s1;i++)
- for(j = 0; j < e2 - s2;j++)
- hash[i][j] = -1;
- H.resize(6768);
- fin.open(sfilename.c_str());
- fout.open(dfilename.c_str());
- LoadDic();
- }
- int Dictiory::BinarySearch(string str,int k)
- {
- int len = str.length();
- int L = 0,R = H[k].W.size() - 1,M;
- while(L <= R)
- {
- M = (L + R)/2;
- if(H[k].W[M]->key.size() == len)
- return M;
- else if(H[k].W[M]->key.size() < len)
- L = M + 1;
- else R = M - 1;
- }
- return -1;
- }
- void Dictiory::AddWord(string str,int k)
- {
- if(str.length() > H[k].size)
- {
- H[k].size = str.length();
- Second *t = new Second(str);
- H[k].W.push_back(t);
- }
- else
- InsertWord(str,k);
- }
- void Dictiory::InsertWord(string str,int k)
- {
- int in = BinarySearch(str,k);
- if(in == -1)
- {
- int L = 0,R = H[k].W.size() - 1;
- int len = str.length();
- while(L <= R&&len > H[k].W[L]->key.size())
- L++;
- H[k].W.resize(H[k].W.size() + 1);
- for(int i = R + 1;i > L;i--)
- H[k].W[i] = H[k].W[i - 1];
- Second *t = new Second(str);
- H[k].W[L] = t;
- }
- else
- {
- Second *pre,*t = H[k].W[in];
- while(t)
- {
- pre = t;
- t = t->next;
- }
- pre->next = new Second(str);
- }
- }
- int Dictiory::GetNum()
- {
- char cstr[maxwordlen];
- fin.getline(cstr,maxwordlen);
- int n = 0,i;
- for(i = 0; i < strlen(cstr);i++)
- n = n * 10 + cstr[i] - '0';
- return n;
- }
- void Dictiory::LoadDic()
- {
- char cstr[maxwordlen];
- string str;
- int i,j,k = 0,wordnumber;
- while(fin.getline(cstr,maxwordlen))
- {
- i = (unsigned char)cstr[0] - s1;
- j = (unsigned char)cstr[1] - s2;
- hash[i][j] = k;
- H[k].key = cstr;
- wordnumber = GetNum();
- for(i = 0; i < wordnumber;i++)
- {
- fin.getline(cstr,maxwordlen);
- str = cstr;
- str = str.substr(2,str.length() - 2);
- AddWord(str,k);
- }
- k++;
- }
- }
- bool Dictiory::IsC(char c)
- {
- unsigned value = unsigned((unsigned char)c);
- return value >= s1&&value < e1;
- }
- bool Dictiory::IsEc(char c)
- {
- unsigned value = unsigned((unsigned char)c);
- return value <= 0X7F;
- }
- bool Dictiory::IsWord(string str,int k,int t)
- {
- Second *temp = H[k].W[t];
- while(temp)
- {
- if(temp->key == str)
- return true;
- temp = temp->next;
- }
- return false;
- }
- void Dictiory::SkipNotChinese(string &str,stack<string> &stk)
- {
- unsigned L = 0,R = str.length();
- while(L < R&&!IsC(str[L]))
- {
- if(!IsEc(str[L]))
- L++;
- L++;
- }
- if(L > 0)
- {
- stk.push(str.substr(0,L));
- str = str.substr(L,R - L);
- }
- }
- void Dictiory::SegmentWord(string s)
- {
- stack<string> stk;
- fcin.open(s.c_str());
- char cstr[maxwordlen];
- string str,sstr;
- int i,j,startpos,endpos;
- char c;
- while(fcin.read(&c,sizeof(char)))
- {
- if(!IsC(c))
- {
- if(!str.empty())
- {
- cout << str << " " << str.length() << endl;
- startpos = 0,endpos = str.length();
- while(startpos < endpos)
- {
- if(str.length() <= 2)
- {
- stk.push(str);
- if(!sstr.empty())
- {
- str = sstr;
- sstr.clear();
- }
- startpos += 2;
- }
- else
- {
- i = (unsigned char)str[0] - s1,j = (unsigned char)str[1] - s2;
- if(hash[i][j] >= 0)
- {
- string word = str.substr(2,str.length() - 2);
- int in = BinarySearch(word,hash[i][j]);
- if((in != -1)&&IsWord(word,hash[i][j],in))
- {
- stk.push(H[hash[i][j]].key + word);
- startpos += str.length();
- str = sstr;
- sstr.clear();
- }
- else
- {
- sstr = sstr + str.substr(0,2);
- str = str.substr(2,str.length() - 2);
- }
- }
- else
- {
- sstr = sstr + str.substr(0,2);
- str = str.substr(2,str.length() - 2);
- }
- }
- }
- while(!stk.empty())
- {
- fout << stk.top() << endl;
- stk.pop();
- }
- }
- str.clear();
- str += c;
- while(fcin.read(&c,sizeof(char))&&!IsC(c))
- str += c;
- fout << str << endl;
- cout << str << " " << str.length() << endl;
- str.clear();
- str += c;
- fcin.read(&c,sizeof(char));
- str += c;
- }
- else
- {
- str += c;
- fcin.read(&c,sizeof(char));
- str += c;
- }
- }
- }
- int main()
- {
- Dictiory D("dictiory.txt","data.txt");
- //D.PrintDic();
- D.SegmentWord("sou.txt");
- system("pause");
- return 0;
- }