生物技术

开发平台：
C/C++

strsearch.cpp：源码内容
							/*
 * ===========================================================================
 * PRODUCTION $Log: strsearch.cpp,v $
 * PRODUCTION Revision 1000.3  2004/06/01 19:40:38  gouriano
 * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.17
 * PRODUCTION
 * ===========================================================================
 */
/*  $Id: strsearch.cpp,v 1000.3 2004/06/01 19:40:38 gouriano Exp $
* ===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
* Author:  Mati Shomrat
*
* File Description:
*   String search utilities.
*
*   Currently there are two search utilities:
*   1. An implementation of the Boyer-Moore algorithm.
*   2. A finite state automaton.
*
*/
#include <ncbi_pch.hpp>
#include <util/strsearch.hpp>
#include <algorithm>
#include <vector>
#include <ctype.h>
NCBI_USING_NAMESPACE_STD;
BEGIN_NCBI_SCOPE
//==============================================================================
//                            CBoyerMooreMatcher
//==============================================================================
// Public:
// =======
CBoyerMooreMatcher::CBoyerMooreMatcher(const string& pattern, 
                                       NStr::ECase   case_sensitive,
                                       unsigned int  whole_word)
: m_Pattern(pattern), 
  m_PatLen(pattern.length()), 
  m_CaseSensitive(case_sensitive), 
  m_WholeWord(whole_word),
  m_LastOccurrence(sm_AlphabetSize),
  m_WordDelimiters(sm_AlphabetSize)
{
    x_InitPattern();
    // Init the word deimiting alphabet
    if (m_WholeWord) {
        for (int i = 0; i < sm_AlphabetSize; ++i) {
            m_WordDelimiters[i] = (isspace(i) != 0);
        }
    }
}
CBoyerMooreMatcher::CBoyerMooreMatcher(const string& pattern,
                                       const string& word_delimeters,
                                       NStr::ECase   case_sensitive,
                                       bool          invert_delimiters)
: m_Pattern(pattern), 
  m_PatLen(pattern.length()), 
  m_CaseSensitive(case_sensitive), 
  m_WholeWord(true),
  m_LastOccurrence(sm_AlphabetSize),
  m_WordDelimiters(sm_AlphabetSize)
{
    x_InitPattern();
    SetWordDelimiters(word_delimeters, invert_delimiters);
}
void CBoyerMooreMatcher::SetWordDelimiters(const string& word_delimeters,
                                           bool          invert_delimiters)
{
    m_WholeWord = eWholeWordMatch;
    string word_d = word_delimeters;
    if (m_CaseSensitive == NStr::eNocase) {
        NStr::ToUpper(word_d);
    }
    // Init the word delimiting alphabet
    for (int i = 0; i < sm_AlphabetSize; ++i) {
        char ch = m_CaseSensitive ? i : toupper(i);
        string::size_type n = word_d.find_first_of(ch);
        m_WordDelimiters[i] = (!invert_delimiters) == (n != string::npos);
    }
}
void CBoyerMooreMatcher::AddDelimiters(const string& word_delimeters)
{
    if (m_WholeWord == 0) {
        m_WholeWord = eWholeWordMatch;
    }
    string word_d = word_delimeters;
    if (m_CaseSensitive == NStr::eNocase) {
        NStr::ToUpper(word_d);
    }
    for (int i = 0; i < sm_AlphabetSize; ++i) {
        char ch = m_CaseSensitive ? i : toupper(i);
        string::size_type n = word_d.find_first_of(ch);
        if (n != NPOS) {
            m_WordDelimiters[i] = true;
        }
    }
}
void CBoyerMooreMatcher::AddDelimiters(char ch)
{
    if (m_WholeWord == 0) {
        m_WholeWord = eWholeWordMatch;
    }
    m_WordDelimiters[ch] = true;
    if (m_CaseSensitive == NStr::eNocase) {
        ch = toupper(ch);
    }
    
    m_WordDelimiters[ch] = true;
}
void CBoyerMooreMatcher::InitCommonDelimiters()
{
    if (m_WholeWord == 0) {
        m_WholeWord = eWholeWordMatch;
    }
    for (int i = 0; i < sm_AlphabetSize; ++i) {
        char ch = m_CaseSensitive ? i : toupper(i);
        if ((ch >= 'A' && ch <= 'Z') ||
            (ch >= '0' && ch <= '9') ||
            (ch == '_')){
        } else {
            m_WordDelimiters[i] = true;
        }
    }
}
void CBoyerMooreMatcher::x_InitPattern(void)
{
    if ( m_CaseSensitive == NStr::eNocase) {
        NStr::ToUpper(m_Pattern);
    }
    
    // For each character in the alpahbet compute its last occurrence in 
    // the pattern.
    
    // Initilalize vector
    size_t size = m_LastOccurrence.size();
    for ( size_t i = 0;  i < size;  ++i ) {
        m_LastOccurrence[i] = m_PatLen;
    }
    
    // compute right-most occurrence
    for ( int j = 0;  j < (int)m_PatLen - 1;  ++j ) {
        /* int lo = */
		m_LastOccurrence[(int)m_Pattern[j]] = m_PatLen - j - 1;
   }
}
SIZE_TYPE CBoyerMooreMatcher::Search(const char*  text, 
                                     SIZE_TYPE shift,
                                     SIZE_TYPE text_len) const
{
    // Implementation note.
    // Case sensitivity check has been taken out of loop. 
    // Code size for performance optimization. (We generally choose speed).
    // (Anatoliy)
    if (m_CaseSensitive == NStr::eCase) {
        while (shift + m_PatLen <= text_len) {
            int j = (int)m_PatLen - 1;
            for(char text_char = text[shift + j];
                j >= 0 && m_Pattern[j]==(text_char=text[shift + j]);
                --j) {}
            if ( (j == -1)  &&  IsWholeWord(text, shift, text_len) ) {
                return  shift;
            } else {
                shift += (unsigned int)m_LastOccurrence[text[shift + j]];
            }
        }
    } else { // case insensitive NStr::eNocase
        while (shift + m_PatLen <= text_len) {
            int j = (int)m_PatLen - 1;
            for(char text_char = toupper(text[shift + j]);
                j >= 0 && m_Pattern[j]==(text_char=toupper(text[shift + j]));
                --j) {}
            if ( (j == -1)  &&  IsWholeWord(text, shift, text_len) ) {
                return  shift;
            } else {
                shift += 
                    (unsigned int)m_LastOccurrence[toupper(text[shift + j])];
            }
        }
    }
    return (SIZE_TYPE)-1;
}
// Private:
// ========
// Constants
const int CBoyerMooreMatcher::sm_AlphabetSize = 256;     // assuming ASCII
// Member Functions
bool CBoyerMooreMatcher::IsWholeWord(const char*  text, 
                                     SIZE_TYPE    pos,
                                     SIZE_TYPE    text_len) const
{
    SIZE_TYPE left, right;
    left = right = 1;
    // Words at the beginning and end of text are also considered "whole"
    // check on the left  
    if (m_WholeWord & ePrefixMatch) {
        left = (pos == 0) ||
               ((pos > 0) && m_WordDelimiters[text[pos - 1]]);
    }
    // check on the right
    if (m_WholeWord & eSuffixMatch) {
        pos += (unsigned int)m_PatLen;
        right = (pos == text_len) || 
                ((pos < text_len) && m_WordDelimiters[text[pos]]);
    }
    return (left && right);
}
END_NCBI_SCOPE
/*
* ===========================================================================
*
* $Log: strsearch.cpp,v $
* Revision 1000.3  2004/06/01 19:40:38  gouriano
* PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.17
*
* Revision 1.17  2004/05/27 13:40:58  kuznets
* Fixed warnings (GCC 3.4)
*
* Revision 1.16  2004/05/17 21:06:02  gorelenk
* Added include of PCH ncbi_pch.hpp
*
* Revision 1.15  2004/04/01 14:14:02  lavr
* Spell "occurred", "occurrence", and "occurring"
*
* Revision 1.14  2004/03/11 17:02:53  kuznets
* Fixed misspelling in comments
*
* Revision 1.13  2004/03/11 16:56:33  kuznets
* Minor reformatting
*
* Revision 1.12  2004/03/11 16:54:48  kuznets
* CBoyerMooreMatcher::Search fixed incorrect bad-character shift in case
* insensitive search...
*
* Revision 1.11  2004/03/05 15:46:02  kuznets
* fixed crash on 64-bit (sun)
*
* Revision 1.10  2004/03/04 18:25:20  ucko
* Fix compilation errors in previous revision.
*
* Revision 1.9  2004/03/04 17:37:48  kuznets
* CBoyerMooreMatcher added functions to work with different word delimiters
*
* Revision 1.8  2004/03/03 17:56:02  kuznets
* Code cleane up (CBoyerMooreMatcher) to use enums instead of bools,
* better coverage or different types of whole word matchers
*
* Revision 1.7  2004/03/03 14:37:14  kuznets
* bug fix: CBoyerMooreMatcher::IsWholeWord added case when
* the tested token is at the very beggining or end of the text.
*
* Revision 1.6  2004/03/02 20:00:45  kuznets
* Changes in CBoyerMooreMatcher:
*   - added work with memory areas
*   - alternative word delimiters
*   - performance optimizations
*
* Revision 1.5  2003/11/07 17:16:23  ivanov
* Fixed  warnings on 64-bit Workshop compiler
*
* Revision 1.4  2003/02/04 20:16:15  shomrat
* Change signed to unsigned to eliminate compilation warning
*
* Revision 1.3  2002/11/05 23:01:13  shomrat
* Coding style changes
*
* Revision 1.2  2002/11/03 21:59:14  kans
* BoyerMoore takes caseSensitive, wholeWord parameters (MS)
*
* Revision 1.1  2002/10/29 16:33:11  kans
* initial checkin - Boyer-Moore string search and templated text search finite state machine (MS)
*
*
* ===========================================================================
*/