strsearch.hpp
上传用户:yhdzpy8989
上传日期:2007-06-13
资源大小:13604k
文件大小:18k
- /*
- * ===========================================================================
- * PRODUCTION $Log: strsearch.hpp,v $
- * PRODUCTION Revision 1000.2 2004/06/01 19:39:02 gouriano
- * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.19
- * PRODUCTION
- * ===========================================================================
- */
- #ifndef UTIL___STRSEARCH__HPP
- #define UTIL___STRSEARCH__HPP
- /* $Id: strsearch.hpp,v 1000.2 2004/06/01 19:39:02 gouriano Exp $
- * ===========================================================================
- *
- * PUBLIC DOMAIN NOTICE
- * National Center for Biotechnology Information
- *
- * This software/database is a "United States Government Work" under the
- * terms of the United States Copyright Act. It was written as part of
- * the author's official duties as a United States Government employee and
- * thus cannot be copyrighted. This software/database is freely available
- * to the public for use. The National Library of Medicine and the U.S.
- * Government have not placed any restriction on its use or reproduction.
- *
- * Although all reasonable efforts have been taken to ensure the accuracy
- * and reliability of the software and data, the NLM and the U.S.
- * Government do not and cannot warrant the performance or results that
- * may be obtained by using this software or data. The NLM and the U.S.
- * Government disclaim all warranties, express or implied, including
- * warranties of performance, merchantability or fitness for any particular
- * purpose.
- *
- * Please cite the author in any work or product based on this material.
- *
- * ===========================================================================
- *
- * Author: Mati Shomrat
- * Anatoliy Kuznetsov
- *
- * File Description:
- * String search utilities.
- *
- */
- /// @file strsearch.hpp
- /// String search utilities.
- ///
- /// Currently there are two search utilities:
- /// 1. An implementation of the Boyer-Moore algorithm.
- /// 2. A finite state automata.
- #include <corelib/ncbistd.hpp>
- #include <corelib/ncbistr.hpp>
- #include <string>
- #include <vector>
- /** @addtogroup StringSearch
- *
- * @{
- */
- BEGIN_NCBI_SCOPE
- //============================================================================//
- // CBoyerMooreMatcher //
- //============================================================================//
- /// This implemetation uses the Boyer-Moore alg. in order to search a single
- /// pattern over varying texts.
- class NCBI_XUTIL_EXPORT CBoyerMooreMatcher
- {
- public:
- enum EWordMatch
- {
- eSubstrMatch = 0,
- ePrefixMatch = (1 << 0),
- eSuffixMatch = (1 << 1),
- eWholeWordMatch = (ePrefixMatch | eSuffixMatch)
- };
- public:
- /// Initialize a matcher with the pattern to be matched.
- ///
- /// @param pattern
- /// Pattern to be matched
- /// @param case_sensitive
- /// should the search be case sensitive (false by default).
- /// @param whole_word
- /// a match is found ony if the pattern was found to
- /// be between delimiting characters (whitespaces)
- CBoyerMooreMatcher(const string& pattern,
- NStr::ECase case_sensitive = NStr::eNocase,
- unsigned int whole_word = eSubstrMatch);
-
- /// Initialize a matcher with the pattern to be matched.
- ///
- /// @param pattern
- /// Pattern to be matched
- /// @param word_delimeters
- /// a match is found ony if the pattern was found to
- /// be between delimiting characters like whitespaces
- /// and punctuation marks
- /// @param case_sensitive
- /// should the search be case sensitive (false by default).
- /// @param invert_delimiters
- /// when TRUE means all characters NOT belonging to
- /// word_delimeters are to be used
- CBoyerMooreMatcher(const string& pattern,
- const string& word_delimeters,
- NStr::ECase case_sensitive = NStr::eNocase,
- bool invert_delimiters = false);
- /// Set word delimiting characters
- ///
- /// @param word_delimeters
- /// string of characters used as word delimiters
- /// @param invert_delimiters
- /// when TRUE means all characters NOT belonging to
- /// word_delimeters are to be used
- void SetWordDelimiters(const string& word_delimeters,
- bool invert_delimiters = false);
- /// Add new word delimiters
- ///
- /// @param word_delimeters
- /// string of characters used as word delimiters
- ///
- void AddDelimiters(const string& word_delimeters);
- /// Add new word delimiter charracter
- ///
- /// @param word_delimeters
- /// string of characters used as word delimiters
- ///
- void AddDelimiters(char ch);
- /// Init delimiters most common for the English language,
- /// (whitespaces, punctuations, etc)
- void InitCommonDelimiters();
-
- /// Set word matching mode
- ///
- /// @param whole_word
- /// word matching mode. Can be OR combination of EWordMatch values
- void SetWordMatching(unsigned int whole_word = eWholeWordMatch)
- {
- m_WholeWord = whole_word;
- }
- /// Search for the pattern over text starting at position pos.
- ///
- /// @param text
- /// Text to search in.
- /// @param pos
- /// Position shift in text
- /// @return
- /// the position at which the pattern was found, -1 otherwise.
- size_t Search(const string& text, size_t pos = 0) const
- {
- return Search(text.c_str(), pos, text.length());
- }
- /// Search for the pattern over text starting at position pos.
- ///
- /// @param text
- /// Text memory taken as NON ASCIIZ string.
- /// @param pos
- /// String offset position from the start
- /// @param text_len
- /// Length of text
- /// @return
- /// the position at which the pattern was found, -1 otherwise.
- SIZE_TYPE Search(const char* text,
- SIZE_TYPE pos,
- SIZE_TYPE text_len) const;
-
- private:
- // Constants
- static const int sm_AlphabetSize;
-
- // Member Functions:
-
- /// Check if the pattern at position pos in the text lies on a
- /// whole word boundry.
- bool IsWholeWord(const char* text,
- SIZE_TYPE pos,
- SIZE_TYPE text_len) const;
- void x_InitPattern(void);
- private:
- string m_Pattern;
- SIZE_TYPE m_PatLen;
- NStr::ECase m_CaseSensitive;
- unsigned int m_WholeWord;
- vector<size_t> m_LastOccurrence;
- vector<unsigned char> m_WordDelimiters;
-
- };
- //============================================================================//
- // Finite State Automata //
- //============================================================================//
- template <typename MatchType>
- class CTextFsm
- {
- public:
- // Constants (done as an enum to avoid link errors on Darwin)
- enum ESpecialStates {
- eFailState = -1
- };
-
- // Constructors and Destructors:
- CTextFsm(bool case_sensitive = false);
- ~CTextFsm(void);
-
- // Add a word to the Fsm. Store a match for later retreival.
- void AddWord(const string& word, const MatchType& match);
-
- // Prime the FSM.
- // After finishing adding all the words to the FSM it needs to be
- // primed to enable usage.
- bool IsPrimed(void) const { return m_Primed; }
- void Prime(void);
-
- // Retreive the FSM's initial state.
- int GetInitialState(void) const { return 0; }
-
- // Get the next state, based on the currect state and a transition
- // character.
- int GetNextState(int state, char letter) const;
-
- // Are there any matches stored in state?
- bool IsMatchFound(int state) const;
-
- // Retrieve the stored matches in state.
- const vector<MatchType>& GetMatches(int state) const;
-
- private:
- // Internal representation of states.
- class CState
- {
- public:
- // Ad-hoc definitions
- typedef map<char, int> TMapCharInt;
-
- // Constructors and Destructors
- CState(void) : m_OnFailure(0) {}
- ~CState(void) {};
-
- // Add a transition to the table.
- void AddTransition(char letter, int to) { m_Transitions[letter] = to; }
-
- // Retreive the transition state, give the transition character.
- int GetNextState(char letter) const {
- TMapCharInt::const_iterator it = m_Transitions.find(letter);
- return it != m_Transitions.end() ? it->second : eFailState;
- }
-
-
- // Are there any matches stored in this state?
- bool IsMatchFound(void) const { return !m_Matches.empty(); }
-
- // Retreive the stored matches
- vector<MatchType>& GetMatches(void) { return m_Matches; }
- const vector<MatchType>& GetMatches(void) const { return m_Matches; }
-
- // Store a match.
- void AddMatch(const MatchType& match) { m_Matches.push_back(match); }
-
- const TMapCharInt& GetTransitions(void) const { return m_Transitions; };
-
- // Getter and Setter for failure transition.
- void SetOnFailure(int state) { m_OnFailure = state; }
- int GetOnFailure(void) const { return m_OnFailure; }
-
- private:
-
- // Member Variables:
- TMapCharInt m_Transitions; // Transition table
- vector<MatchType> m_Matches; // Stored matches
- int m_OnFailure; // Transition state in failure.
-
- }; // end of class CState
-
- // Private Methods:
- CState *GetState(int state);
- int GetNextState(const CState& from, char letter) const;
-
- // Compute the transition to be performed on failure.
- void ComputeFail(void);
- void FindFail(int state, int new_state, char ch);
- void QueueAdd(vector<int>& in_queue, int qbeg, int val);
-
- // Member Variables
- bool m_Primed;
- vector< CState > m_States;
- bool m_CaseSensitive;
-
- }; // end of class CTextFsm
- // Convenience class when the MatchType is of string type (most cases)
- class NCBI_XUTIL_EXPORT CTextFsa : public CTextFsm<string>
- {
- public:
- CTextFsa(bool case_sensitive = false) :
- CTextFsm<string>(case_sensitive)
- {}
- void AddWord(const string& word) {
- CTextFsm<string>::AddWord(word, word);
- }
- };
- //============================================================================//
- // Finite State Automata Implementation //
- //============================================================================//
- // Public:
- // =======
- template <typename MatchType>
- CTextFsm<MatchType>::CTextFsm(bool case_sensitive) :
- m_Primed(false), m_CaseSensitive(case_sensitive)
- {
- CState initial;
- m_States.push_back(initial);
- }
- template <typename MatchType>
- void CTextFsm<MatchType>::AddWord(const string& word, const MatchType& match)
- {
- string temp = word;
- if ( !m_CaseSensitive ) {
- NStr::ToUpper(temp);
- }
- int next, i,
- state = 0,
- word_len = temp.length();
-
- // try to overlay beginning of word onto existing table
- for ( i = 0; i < word_len; ++i ) {
- next = m_States[state].GetNextState(word[i]);
- if ( next == eFailState ) break;
- state = next;
- }
-
- // now create new states for remaining characters in word
- for ( ; i < word_len; ++ i ) {
- CState new_state;
-
- m_States.push_back(new_state);
- m_States[state].AddTransition(temp[i], m_States.size() - 1);
- state = m_States[state].GetNextState(temp[i]);
- }
-
- // add match information
- m_States[state].AddMatch(match);
- }
- template <typename MatchType>
- void CTextFsm<MatchType>::Prime(void)
- {
- if ( m_Primed ) return;
-
- ComputeFail();
-
- m_Primed = true;
- }
- template <typename MatchType>
- typename CTextFsm<MatchType>::CState *CTextFsm<MatchType>::GetState(int state)
- {
- if ( (state < 0) || (state > m_States.size()) ) {
- return 0;
- }
- return &(m_States[state]);
- }
- template <typename MatchType>
- int CTextFsm<MatchType>::GetNextState(const CState& from, char letter) const {
- char ch = m_CaseSensitive ? letter : toupper(letter);
- return from.GetNextState(ch);
- }
- template <typename MatchType>
- int CTextFsm<MatchType>::GetNextState(int state, char letter) const
- {
- if ( state < 0 || size_t(state) >= m_States.size() ) {
- return eFailState;
- }
-
- int next;
- int initial = GetInitialState();
- while ( (next = GetNextState(m_States[state], letter)) == eFailState ) {
- if ( state == initial ) {
- next = initial;
- break;
- }
- state = m_States[state].GetOnFailure();
- }
-
- return next;
- }
- template <typename MatchType>
- void CTextFsm<MatchType>::QueueAdd(vector<int>& in_queue, int qbeg, int val)
- {
- int q;
-
- q = in_queue [qbeg];
- if (q == 0) {
- in_queue [qbeg] = val;
- } else {
- for ( ; in_queue [q] != 0; q = in_queue [q]) continue;
- in_queue [q] = val;
- }
- in_queue [val] = 0;
- }
- template <typename MatchType>
- void CTextFsm<MatchType>::ComputeFail(void)
- {
- int qbeg, r, s, state;
- vector<int> state_queue(m_States.size());
-
- qbeg = 0;
- state_queue [0] = 0;
-
- // queue up states reached directly from state 0 (depth 1)
-
- ITERATE ( typename CState::TMapCharInt,
- it,
- m_States[GetInitialState()].GetTransitions() ) {
- s = it->second;
- m_States[s].SetOnFailure(0);
- QueueAdd(state_queue, qbeg, s);
- }
-
- while (state_queue [qbeg] != 0) {
- r = state_queue [qbeg];
- qbeg = r;
-
- // depth 1 states beget depth 2 states, etc.
-
- ITERATE ( typename CState::TMapCharInt, it,
- m_States[r].GetTransitions() ) {
- s = it->second;
- QueueAdd(state_queue, qbeg, s);
-
- // State Substring Transitions Failure
- // 2 st a -> 3 6
- // 3 sta l -> 4
- // 6 t a -> 7 0
- // 7 ta p -> 8
- //
- // For example, r = 2 (st), if 'a' would go to s = 3 (sta).
- // From previous computation, 2 (st) fails to 6 (t).
- // Thus, check state 6 (t) for any transitions using 'a'.
- // Since 6 (t) 'a' -> 7 (ta), therefore set fail [3] -> 7.
-
- state = m_States[r].GetOnFailure();
- FindFail(state, s, it->first);
- }
- }
- }
- template <typename MatchType>
- void CTextFsm<MatchType>::FindFail(int state, int new_state, char ch)
- {
- int next;
-
- // traverse existing failure path
-
- while ( (next = GetNextState(state, ch)) == eFailState) {
- if ( state == 0 ) {
- next = 0; break;
- }
- state = m_States[state].GetOnFailure();
- }
-
- // add new failure state
-
- m_States[new_state].SetOnFailure(next);
-
- // add matches of substring at new state
-
- copy( m_States[next].GetMatches().begin(),
- m_States[next].GetMatches().end(),
- back_inserter(m_States[new_state].GetMatches()) );
- }
- template <typename MatchType>
- const vector<MatchType>& CTextFsm<MatchType>::GetMatches(int state) const {
- return m_States[state].GetMatches();
- }
- template <typename MatchType>
- bool CTextFsm<MatchType>::IsMatchFound(int state) const
- {
- return m_States[state].IsMatchFound();
- }
- template <typename MatchType>
- CTextFsm<MatchType>::~CTextFsm(void)
- {}
- END_NCBI_SCOPE
- /* @} */
- /*
- * ===========================================================================
- *
- * $Log: strsearch.hpp,v $
- * Revision 1000.2 2004/06/01 19:39:02 gouriano
- * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.19
- *
- * Revision 1.19 2004/05/27 13:41:16 kuznets
- * Fixed warnings (GCC 3.4)
- *
- * Revision 1.18 2004/04/26 14:52:34 ucko
- * Add "typename" as needed to accommodate GCC 3.4's stricter treatment of
- * templates.
- *
- * Revision 1.17 2004/04/01 14:14:01 lavr
- * Spell "occurred", "occurrence", and "occurring"
- *
- * Revision 1.16 2004/03/05 15:45:23 kuznets
- * fixed compilation warnings on 64-bit (Sun)
- *
- * Revision 1.15 2004/03/04 17:37:38 kuznets
- * CBoyerMooreMatcher added functions to work with different word delimiters
- *
- * Revision 1.14 2004/03/03 17:55:47 kuznets
- * Code cleane up (CBoyerMooreMatcher) to use enums instead of bools,
- * better coverage or different types of whole word matchers
- *
- * Revision 1.13 2004/03/02 19:59:57 kuznets
- * Changes in CBoyerMooreMatcher:
- * - added work with memory areas
- * - alternative word delimiters
- *
- * Revision 1.12 2003/05/23 13:33:55 dicuccio
- * Changed local variable name from 'queue' to avoid clash with queue<>
- *
- * Revision 1.11 2003/04/17 17:50:36 siyan
- * Added doxygen support
- *
- * Revision 1.10 2003/03/10 17:56:34 kuznets
- * iterate->ITERATE
- *
- * Revision 1.9 2003/02/04 20:15:16 shomrat
- * Change int to unsigned
- *
- * Revision 1.8 2003/01/22 20:03:56 vasilche
- * Removed compiler warning.
- *
- * Revision 1.7 2002/12/19 14:51:00 dicuccio
- * Added export specifier for Win32 DLL builds.
- *
- * Revision 1.6 2002/11/13 19:12:23 shomrat
- * Add prime checking
- *
- * Revision 1.5 2002/11/12 15:38:56 ucko
- * Since sm_FailState was constant anyway, made it into a (singleton)
- * enum element so it won't end up as an undefined symbol on Darwin.
- *
- * Revision 1.4 2002/11/06 15:08:03 ucko
- * Remove extraneous CState:: that caused trouble with MIPSpro.
- *
- * Revision 1.3 2002/11/05 22:58:57 shomrat
- * Coding style changes; Case sensetivity option added to finite state automata; Bug fix in GetNextState
- *
- * Revision 1.2 2002/11/03 21:58:49 kans
- * BoyerMoore takes caseSensitive, wholeWord parameters (MS)
- *
- * Revision 1.1 2002/10/29 16:32:52 kans
- * initial checkin - Boyer-Moore string search and templated text search finite state machine (MS)
- *
- *
- * ===========================================================================
- */
- #endif // UTIL___STRSEARCH__HPP