word_db.cc
上传用户:psq1974
上传日期:2007-01-06
资源大小:1195k
文件大小:8k
- /* Copyright (C) 1998, 1999 State University of New York at Stony Brook
- Author: Andrew V. Shuvalov ( andrew@ecsl.cs.sunysb.edu )
- Software license is located in file "COPYING"
- */
- #include <stdio.h>
- #include <string>
- #include <queue>
- #include <algorithm>
- #include "word_db.h"
- const string WordDb::spaces(" tn.;,:!@-=/\(){}" );
- WordTreeNode *WordTreeNode::get_child( int ch )
- {
- if( childrens.find( ch ) == childrens.end() ) // create new
- {
- WordTreeNode *node = new WordTreeNode();
- childrens[ ch ] = node;
- return node;
- }
- return childrens[ ch ];
- }
- void WordTreeNode::register_word( int movieId, int strIdx, int posInLine )
- {
- WordOccurence wo( movieId, strIdx, posInLine );
- WordOccurences.push_back( wo );
- }
- // -----------------------------------------------------------------------
- const TextOfMovieT *WordDb::get_movie_text( int id )
- {
- if( allTextsMap.find( id ) == allTextsMap.end() )
- return 0;
- return allTextsMap[ id ];
- }
- const StringOfText *WordDb::get_string_of_text( int movieId,
- int linenum ) const
- {
- if( allTextsMap.find( movieId ) == allTextsMap.end() )
- return 0;
-
- AllTextsMapT::const_iterator it = allTextsMap.find( movieId );
- const TextOfMovieT *tom = (*it).second;
- if( tom->size() <= linenum || linenum < 0 )
- return 0;
- return &(*tom)[ linenum ];
- }
- void WordDb::add_new_movie_text( int id, TextOfMovieT *tom )
- {
- allTextsMap[id] = tom;
- // and put text into search tree
- for( int i = 0; i < tom->size(); i++ )
- insert_new_string_in_search_tree( (*tom)[i].get_text(), id, i );
- }
- void WordDb::insert_new_string_in_search_tree( const string &str,
- int movieId, int strIdx )
- {
- string::size_type pos = 0;
- str.c_str();
- int posInLine = 0;
- while( pos != str.npos )
- {
- pos = str.find_first_not_of( spaces, pos );
- if( pos == str.npos )
- break;
- int pos_end = str.find_first_of( spaces, pos );
- string word = str.substr( pos, pos_end != str.npos? pos_end - pos:
- str.npos );
- word.c_str();
- insert_new_word_in_search_tree( word, movieId, strIdx, posInLine );
- posInLine ++;
- pos = pos_end;
- }
- }
- void WordDb::insert_new_word_in_search_tree( const string &w, int movieId,
- int strIdx, int posInLine )
- {
- if( !w.length() )
- return;
- w.c_str();
- // remember, we need case-insensative
- WordTreeNode *node = &parentNode;
- for( int pos = 0; pos < w.length(); pos++ )
- {
- int ch = tolower( w[ pos ] );
- node = node->get_child( ch );
- // this is the last char or not?
- if( pos == w.length() - 1 )
- node->register_word( movieId, strIdx, posInLine );
- }
- }
- const WordOccurencesT &WordDb::find_word( const string &w ) const
- {
- static WordOccurencesT nothing;
- if( !w.length() )
- return nothing;
- w.c_str();
- // remember, we need case-insensative
- const WordTreeNode *node = &parentNode;
- for( int pos = 0; pos < w.length(); pos++ )
- {
- int ch = tolower( w[ pos ] );
- node = node->get_child( ch );
- // this is the last char or not?
- if( pos == w.length() - 1 )
- return node->get_word_occurences();
- }
- // nothing
- return nothing;
- }
- void WordDb::find_similar_words( const string &word, GList *&similar ) const
- {
- if( !word.length() )
- return;
- // where to store strings to avoid memory leaks
- static vector< string * > strings;
- for( int i = 0; i < strings.size(); i++ )
- delete strings[i];
- strings.clear();
- word.c_str();
- // remember, we need case-insensative
- const WordTreeNode *node = &parentNode;
- for( int pos = 0; pos < word.length(); pos++ )
- {
- int ch = tolower( word[ pos ] );
- node = node->get_child( ch );
- // this is the last char or not?
- if( pos == word.length() - 1 )
- {
- for( WordTreeNode::childrensT::const_iterator it =
- node->get_all_childrens().begin();
- it != node->get_all_childrens().end(); it++ )
- {
- if( (*it).second->get_word_occurences().size() > 0 )
- {
- string *nw = new string( word + (char)(*it).first );
- strings.push_back( nw );
- similar = g_list_append( similar, (void*)nw->c_str() );
- }
- }
- break;
- }
- }
- }
- void WordDb::find_hint_words( const string &word, GList *&similar ) const
- {
- if( !word.length() )
- return;
- // where to store strings to avoid memory leaks
- static vector< string * > strings;
- for( int i = 0; i < strings.size(); i++ )
- delete strings[i];
- strings.clear();
- word.c_str();
- // remember, we need case-insensative
- const WordTreeNode *node = &parentNode;
- for( int pos = 0; pos < word.length(); pos++ )
- {
- int ch = tolower( word[ pos ] );
- node = node->get_child( ch );
- // this is the last char or not?
- if( pos == word.length() - 1 )
- {
- const WordOccurencesT &wo = node->get_word_occurences();
- for( int i = 0; i < wo.size(); i++ )
- for( int j = -1; j <= 1; j++ )
- {
- const StringOfText *sot =
- get_string_of_text(wo[i].get_movie_id(),wo[i].get_text_line()+j);
- if( sot )
- {
- const string &text = sot->get_text();
- string::size_type pos = 0;
- while( pos != text.npos )
- {
- pos = text.find_first_not_of( spaces, pos );
- if( pos == text.npos )
- break;
- string::size_type pos_end = text.find_first_of(spaces,pos);
- string *hword = new string
- ( text.substr( pos, pos_end != text.npos?
- pos_end - pos: text.npos ) );
- hword->c_str();
- const WordOccurencesT &hwo = find_word( *hword );
- if( hwo.size() <= wo.size() && *hword != word )
- {
- strings.push_back( hword );
- similar = g_list_append(similar,(void*)hword->c_str());
- }
- pos = pos_end;
- }
- }
- }
- }
- }
- }
- WordOccurencesT WordDb::merge_and( const WordOccurencesT &left,
- const WordOccurencesT &right ) const
- {
- // put both data in the heap and merge head by head
- priority_queue< WordOccurence > lheap( left.begin(), left.end() );
- priority_queue< WordOccurence > rheap( right.begin(), right.end() );
- WordOccurencesT result;
- while( lheap.size() && rheap.size() )
- {
- WordOccurence l = lheap.top();
- lheap.pop();
- WordOccurence r = rheap.top();
- rheap.pop();
- if( l.is_near( r ))
- {
- result.push_back( l );
- continue;
- }
- // not near
- if( l > r )
- {
- while( lheap.size() && l > r )
- {
- l = lheap.top();
- lheap.pop();
- if( l.is_near( r ) )
- {
- result.push_back( l );
- break;
- }
- }
- continue;
- }
- if( r > l )
- {
- while( rheap.size() && r > l )
- {
- r = rheap.top();
- rheap.pop();
- if( l.is_near( r ) )
- {
- result.push_back( l );
- break;
- }
- }
- continue;
- }
- }
- return result;
- }
- WordOccurencesT WordDb::merge_and_not( const WordOccurencesT &left,
- const WordOccurencesT &right ) const
- {
- // put both data in the heap and merge head by head
- priority_queue< WordOccurence > lheap( left.begin(), left.end() );
- priority_queue< WordOccurence > rheap( right.begin(), right.end() );
- WordOccurencesT result;
- while( lheap.size() && rheap.size() )
- {
- WordOccurence l = lheap.top();
- lheap.pop();
- WordOccurence r = rheap.top();
- rheap.pop();
- if( l.is_near( r ))
- {
- // trash both
- continue;
- }
- // not near
- if( l > r )
- {
- while( lheap.size() && l > r )
- {
- // just what we need
- result.push_back( l );
- l = lheap.top();
- lheap.pop();
- if( l.is_near( r ) )
- {
- // trash both
- break;
- }
- }
- continue;
- }
- if( r > l )
- {
- while( rheap.size() && r > l )
- {
- // just what we need
- result.push_back( r );
-
- r = rheap.top();
- rheap.pop();
- if( l.is_near( r ) )
- {
- // trash both
- break;
- }
- }
- continue;
- }
- }
- return result;
- }
- WordOccurencesT WordDb::merge_or( const WordOccurencesT &left,
- const WordOccurencesT &right ) const
- {
- // true merge
- WordOccurencesT result;
- result.resize( left.size() + right.size() );
- merge( left.begin(), left.end(), right.begin(), right.end(),
- result.begin() );
- return result;
- }