mpeg/mp3

开发平台：
C/C++

word_db.cc：源码内容
							/* Copyright (C) 1998, 1999 State University of New York at Stony Brook
   Author: Andrew V. Shuvalov ( andrew@ecsl.cs.sunysb.edu )
   Software license is located in file "COPYING"
*/
#include <stdio.h>
#include <string>
#include <queue>
#include <algorithm>
#include "word_db.h"
const string WordDb::spaces(" tn.;,:!@-=/\(){}" );
WordTreeNode *WordTreeNode::get_child( int ch )
{
  if( childrens.find( ch ) == childrens.end() )  // create new
    {
      WordTreeNode *node = new WordTreeNode();
      childrens[ ch ] = node;
      return node;
    }
  return childrens[ ch ];
}
void WordTreeNode::register_word( int movieId, int strIdx, int posInLine )
{
  WordOccurence wo( movieId, strIdx, posInLine );
  WordOccurences.push_back( wo );
}
// -----------------------------------------------------------------------
const TextOfMovieT *WordDb::get_movie_text( int id )
{
  if( allTextsMap.find( id ) == allTextsMap.end() )
    return 0;
  return allTextsMap[ id ];
}
const StringOfText *WordDb::get_string_of_text( int movieId, 
						int linenum ) const
{
  if( allTextsMap.find( movieId ) == allTextsMap.end() )
    return 0;
  
  AllTextsMapT::const_iterator it = allTextsMap.find( movieId );
  const TextOfMovieT *tom = (*it).second;
  if( tom->size() <= linenum || linenum < 0 )
    return 0;
  return &(*tom)[ linenum ];
}
void WordDb::add_new_movie_text( int id, TextOfMovieT *tom )
{
  allTextsMap[id] = tom;
  // and put text into search tree
  for( int i = 0; i < tom->size(); i++ )
    insert_new_string_in_search_tree( (*tom)[i].get_text(), id, i );
}
void WordDb::insert_new_string_in_search_tree( const string &str, 
					       int movieId, int strIdx )
{
  string::size_type pos = 0;
  str.c_str();
  int posInLine = 0;
  while( pos != str.npos )
    {
      pos = str.find_first_not_of( spaces, pos );
      if( pos == str.npos )
	break;
      int pos_end = str.find_first_of( spaces, pos );
      string word = str.substr( pos, pos_end != str.npos? pos_end - pos:
				str.npos );
      word.c_str();
      insert_new_word_in_search_tree( word, movieId, strIdx, posInLine );
      posInLine ++;
      pos = pos_end;
    }
}
void WordDb::insert_new_word_in_search_tree( const string &w, int movieId, 
					     int strIdx, int posInLine )
{
  if( !w.length() )
    return;
  w.c_str();
  // remember, we need case-insensative 
  WordTreeNode *node = &parentNode;
  for( int pos = 0; pos < w.length(); pos++ )
    {
      int ch = tolower( w[ pos ] );
      node = node->get_child( ch );
      // this is the last char or not?
      if( pos == w.length() - 1 )
	node->register_word( movieId, strIdx, posInLine );
    }
}
const WordOccurencesT &WordDb::find_word( const string &w ) const
{
  static WordOccurencesT nothing;
  if( !w.length() )
    return nothing;
  w.c_str();
  // remember, we need case-insensative 
  const WordTreeNode *node = &parentNode;
  for( int pos = 0; pos < w.length(); pos++ )
    {
      int ch = tolower( w[ pos ] );
      node = node->get_child( ch );
      // this is the last char or not?
      if( pos == w.length() - 1 )
	return node->get_word_occurences();
    }
  // nothing
  return nothing;
}
void WordDb::find_similar_words( const string &word, GList *&similar ) const
{
  if( !word.length() )
    return;
  // where to store strings to avoid memory leaks
  static vector< string * > strings;
  for( int i = 0; i < strings.size(); i++ )
    delete strings[i];
  strings.clear();
  word.c_str();
  // remember, we need case-insensative 
  const WordTreeNode *node = &parentNode;
  for( int pos = 0; pos < word.length(); pos++ )
    {
      int ch = tolower( word[ pos ] );
      node = node->get_child( ch );
      // this is the last char or not?
      if( pos == word.length() - 1 )
	{
	  for( WordTreeNode::childrensT::const_iterator it = 
		 node->get_all_childrens().begin(); 
	       it != node->get_all_childrens().end(); it++ )
	    {
	      if( (*it).second->get_word_occurences().size() > 0 )
		{
		  string *nw = new string( word + (char)(*it).first );
		  strings.push_back( nw );
		  similar = g_list_append( similar, (void*)nw->c_str() );
		}
	    }
	  break;
	}
    }
}
void  WordDb::find_hint_words( const string &word, GList *&similar ) const
{
  if( !word.length() )
    return;
  // where to store strings to avoid memory leaks
  static vector< string * > strings;
  for( int i = 0; i < strings.size(); i++ )
    delete strings[i];
  strings.clear();
  word.c_str();
  // remember, we need case-insensative 
  const WordTreeNode *node = &parentNode;
  for( int pos = 0; pos < word.length(); pos++ )
    {
      int ch = tolower( word[ pos ] );
      node = node->get_child( ch );
      // this is the last char or not?
      if( pos == word.length() - 1 )
	{
	  const WordOccurencesT &wo = node->get_word_occurences();
	  for( int i = 0; i < wo.size(); i++ )
	    for( int j = -1; j <= 1; j++ )
	      {
		const StringOfText *sot = 
		  get_string_of_text(wo[i].get_movie_id(),wo[i].get_text_line()+j);
		if( sot )
		  {
		    const string &text = sot->get_text();
		    string::size_type pos = 0;
		    while( pos != text.npos )
		      {
			pos = text.find_first_not_of( spaces, pos );
			if( pos == text.npos )
			  break;
			string::size_type pos_end = text.find_first_of(spaces,pos);
			string *hword = new string
			  ( text.substr( pos, pos_end != text.npos? 
					 pos_end - pos: text.npos ) );
			hword->c_str();
			const WordOccurencesT &hwo = find_word( *hword );
			if( hwo.size() <= wo.size() && *hword != word )
			  {
			    strings.push_back( hword );
			    similar = g_list_append(similar,(void*)hword->c_str());
			  }
			pos = pos_end;
		      }
		  }
	      }
	}
    }
}
WordOccurencesT WordDb::merge_and( const WordOccurencesT &left, 
				   const WordOccurencesT &right ) const
{
  // put both data in the heap and merge head by head
  priority_queue< WordOccurence > lheap( left.begin(), left.end() );
  priority_queue< WordOccurence > rheap( right.begin(), right.end() );
  WordOccurencesT result;
  while( lheap.size() && rheap.size() )
    {
      WordOccurence l = lheap.top();
      lheap.pop();
      WordOccurence r = rheap.top();
      rheap.pop();
      if( l.is_near( r ))
	{
	  result.push_back( l );
	  continue;
	}
      // not near
      if( l > r )
	{
	  while( lheap.size() && l > r )
	    {
	      l = lheap.top();
	      lheap.pop();
	      if( l.is_near( r ) )
		{
		  result.push_back( l );
		  break;
		}
	    }
	  continue;
	}
      if( r > l )
	{
	  while( rheap.size() && r > l )
	    {
	      r = rheap.top();
	      rheap.pop();
	      if( l.is_near( r ) )
		{
		  result.push_back( l );
		  break;
		}
	    }
	  continue;
	}
    }      
  return result;
}
WordOccurencesT WordDb::merge_and_not( const WordOccurencesT &left, 
				       const WordOccurencesT &right ) const
{
  // put both data in the heap and merge head by head
  priority_queue< WordOccurence > lheap( left.begin(), left.end() );
  priority_queue< WordOccurence > rheap( right.begin(), right.end() );
  WordOccurencesT result;
  while( lheap.size() && rheap.size() )
    {
      WordOccurence l = lheap.top();
      lheap.pop();
      WordOccurence r = rheap.top();
      rheap.pop();
      if( l.is_near( r ))
	{
	  // trash both
	  continue;
	}
      // not near
      if( l > r )
	{
	  while( lheap.size() && l > r )
	    {
	      // just what we need 
	      result.push_back( l );
	      l = lheap.top();
	      lheap.pop();
	      if( l.is_near( r ) )
		{
		  // trash both
		  break;
		}
	    }
	  continue;
	}
      if( r > l )
	{
	  while( rheap.size() && r > l )
	    {
	      // just what we need 
	      result.push_back( r );
	      
	      r = rheap.top();
	      rheap.pop();
	      if( l.is_near( r ) )
		{
		  // trash both
		  break;
		}
	    }
	  continue;
	}
    }      
  return result;
}
WordOccurencesT WordDb::merge_or( const WordOccurencesT &left, 
				  const WordOccurencesT &right ) const
{
  // true merge
  WordOccurencesT result;
  result.resize( left.size() + right.size() );
  merge( left.begin(), left.end(), right.begin(), right.end(), 
	 result.begin() );
  return result;
}