TextQuery.cpp
资源名称:TextQuery.rar [点击查看]
上传用户:cydzxjc
上传日期:2021-11-14
资源大小:2668k
文件大小:8k
源码类别:
STL
开发平台:
Visual C++
- #include "StdAfx.h"
- #include "TextQuery.h"
- CTextQuery::CTextQuery(void)
- {
- memset(this, 0, sizeof(CTextQuery));
- }
- CTextQuery::~CTextQuery(void)
- {
- }
- void CTextQuery::filter_elements(string felems)
- {
- filt_elems = felems;
- }
- void CTextQuery::retrieve_text()
- {
- string file_name;
- cout<<"please enter file name: ";
- cin>>file_name;
- ifstream infile(file_name.c_str(), ios::in);
- if (!infile)
- {
- cerr<<"oops! unable to open file "
- <<file_name<<" -- bailing out!n";
- exit(-1);
- }
- else
- {
- cout<<"n";
- }
- lines_of_text = new vector<string>;
- string textline;
- while (getline(infile, textline, 'n'))
- {
- lines_of_text->push_back(textline);
- }
- }
- void CTextQuery::separate_words()
- {
- vector<string> *words = new vector<string>;
- vector<location> *locations = new vector<location>;
- for (short line_pos = 0; line_pos < lines_of_text->size(); line_pos++)
- {
- short word_pos = 0;
- string textline = (*lines_of_text)[line_pos];
- string::size_type eol = textline.length();
- string::size_type pos = 0, prev_pos = 0;
- while((pos = textline.find_first_of(' ', pos)) != string::npos)
- {
- words->push_back(textline.substr(prev_pos, pos - prev_pos));
- locations->push_back(make_pair(line_pos, word_pos));
- word_pos++;
- pos++;
- prev_pos = pos;
- }
- words->push_back(textline.substr(prev_pos, pos - prev_pos));
- locations->push_back(make_pair(line_pos, word_pos));
- }
- text_locations = new text_loc(words, locations);
- }
- void CTextQuery::filter_text()
- {
- if (filt_elems.empty())
- {
- return;
- }
- vector<string> *words = text_locations->first;
- vector<string>::iterator iter = words->begin();
- vector<string>::iterator iter_end = words->end();
- while (iter != iter_end)
- {
- string::size_type pos = 0;
- while ((pos = (*iter).find_first_of(filt_elems, pos)) != string::npos)
- {
- (*iter).erase(pos, 1);
- }
- ++iter;
- }
- }
- void CTextQuery::suffix_text()
- {
- vector<string> *words = text_locations->first;
- vector<string>::iterator iter = words->begin();
- vector<string>::iterator iter_end = words->end();
- while (iter != iter_end)
- {
- if ((*iter).size() <= 3)
- {
- iter++;
- continue;
- }
- if ((*iter)[(*iter).size() - 1] == 's')
- {
- suffix_s(*iter);
- }
- //其他的后缀处理放在这里
- iter++;
- }
- }
- void CTextQuery::suffix_s(string &word)
- {
- string::size_type spos = 0;
- string::size_type pos3 = word.size() - 3;
- //"ous", "ss", "is", "ius"
- string suffixes("oussisius");
- if (!word.compare(pos3, 3, suffixes, spos, 3) ||
- !word.compare(pos3, 3, suffixes, spos + 6, 3) ||
- !word.compare(pos3 + 1, 2, suffixes, spos + 2, 2) ||
- !word.compare(pos3 + 1, 2, suffixes, spos + 4, 2))
- {
- return;
- }
- string ies("ies");
- if (!word.compare(pos3, 3, ies))
- {
- word.replace(pos3, 3, 1, 'y');
- return;
- }
- string ses("ses");
- if (!word.compare(pos3, 3, ses))
- {
- word.erase(pos3 + 1, 2);
- return;
- }
- //去掉尾部的's'
- word.erase(pos3 + 2);
- //watch out for "'s"
- if (word[pos3 + 1] == ''')
- {
- word.erase(pos3 + 1);
- }
- }
- //大写转小写
- void CTextQuery::strip_caps()
- {
- vector<string> *words = text_locations->first;
- vector<string>::iterator iter = words->begin();
- vector<string>::iterator iter_end = words->end();
- string caps("ABCDEFGHIGKLMNOPQRSTUVWXYZ");
- while (iter != iter_end)
- {
- string::size_type pos = 0;
- while ((pos = (*iter).find_first_of(caps, pos)) != string::npos)
- {
- (*iter)[pos] = tolower((*iter)[pos]);
- }
- ++iter;
- }
- }
- void CTextQuery::build_word_map()
- {
- word_map = new map<string, loc*>;
- set<string> exclusion_set;
- /*string file_name = "exclusion_set.txt";*/
- /*basic_streambuf<char *, char_traits<char *>> *strBuf;
- basic_istream<char *, char_traits<char *>> infile(strBuf);*/
- ifstream infile("exclusion_set");
- if (!infile)
- {
- static string default_excluded_words[25] = {
- "the", "and", "but", "that", "then", "are", "been",
- "can", "can't", "cannot", "could", "did", "for",
- "had", "have", "him", "his", "her", "its", "into",
- "were", "which", "when", "with", "would"};
- cerr<<"warning! unable to open word exclusion file! --"
- <<"using default setn";
- copy(default_excluded_words, default_excluded_words + 25,
- inserter(exclusion_set, exclusion_set.begin()));
- }
- else
- {
- //istream_iterator<string, diff_type> input_set/*(infile)*/, eos;
- //copy(input_set, eos, inserter(exclusion_set, exclusion_set.begin()));
- }
- //遍历单词,输入键/值对
- vector<string> *text_words = text_locations->first;
- vector<location> *text_locs = text_locations->second;
- register int elem_cnt = text_words->size();
- for (int ix = 0; ix < elem_cnt; ++ix)
- {
- string textword = (*text_words)[ix];
- if (textword.size() < 3 || exclusion_set.count(textword))
- {
- continue;
- }
- if (!word_map->count((*text_words)[ix]))
- {
- //没有,添加
- loc *ploc = new vector<location>;
- ploc->push_back((*text_locs)[ix]);
- word_map->insert(val_Type((*text_words)[ix], ploc));
- }
- else
- {
- (*word_map)[(*text_words)[ix]]->push_back((*text_locs)[ix]);
- }
- }
- }
- void CTextQuery::query_text()
- {
- string que_text;
- do
- {
- cout<<"enter a word against which to search the text.n"
- <<"to quit, enter a single character ==> ";
- cin>>que_text;
- if (que_text.size() < 2)
- {
- break;
- }
- string caps("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
- string::size_type pos = 0;
- while ((pos = que_text.find_first_of(caps, pos)) != string::npos)
- {
- que_text[pos] = tolower(que_text[pos]);
- }
- //如果对map索引,输入que_text,如无
- //说明没有要找的词
- if (!word_map->count(que_text))
- {
- cout<<"]nSorry. There are no enteries for "
- <<que_text<<".nn";
- continue;
- }
- loc *ploc = (*word_map)[que_text];
- set<short> occurrence_lines;
- loc::iterator liter = ploc->begin(), liter_end = ploc->end();
- while (liter != liter_end)
- {
- occurrence_lines.insert(occurrence_lines.end(), (*liter).first);
- ++liter;
- }
- register int size = occurrence_lines.size();
- cout<<"n"<<que_text<<"noccurs"<<size<<(size == 1 ? " time: " : " times: ")<<"nn";
- set<short>::iterator it = occurrence_lines.begin();
- for (; it != occurrence_lines.end(); ++it)
- {
- int line = *it;
- //不要用从0开始有问本行把用户弄迷糊了
- cout<<"t( line"<<line + 1<<" ) "<<(*lines_of_text)[line]<<endl;
- }
- cout<<endl;
- }
- while (!que_text.empty());
- cout<<"Ok, bye!n";
- }
- void CTextQuery::display_map_text()
- {
- map_text::iterator iter = word_map->begin(), iter_end = word_map->end();
- while (iter != iter_end)
- {
- cout<<"word: "<<(*iter).first<<" (";
- int loc_cnt = 0;
- loc *text_locs = (*iter).second;
- loc::iterator liter = text_locs->begin(), liter_end = text_locs->end();
- while (liter != liter_end)
- {
- if (loc_cnt)
- {
- cout<<",";
- }
- else
- {
- ++loc_cnt;
- }
- cout<<"("<<(*liter).first<<","<<(*liter).second<<")";
- ++liter;
- }
- cout<<")n";
- ++iter;
- }
- cout<<endl;
- }
- void CTextQuery::display_text_locations()
- {
- vector<string> *text_words = text_locations->first;
- vector<location> *text_locs = text_locations->second;
- register int elem_cnt = text_words->size();
- if (elem_cnt != text_locs->size())
- {
- cerr<<"oops! internal error: word and position vectors "
- <<"are of unequal sizen"
- <<"words: "<<elem_cnt<<" "
- <<"locs: "<<text_locs->size()
- <<" -- bailing out!n";
- exit(-2);
- }
- for (int ix = 0; ix < elem_cnt; ix++)
- {
- cout<<"word: "<<(*text_words)[ix]<<"t"
- <<"location: ("
- <<(*text_locs)[ix].first<<","
- <<(*text_locs)[ix].second<<")"
- <<"n";
- }
- cout<<endl;
- }