regexp.cpp
上传用户:yhdzpy8989
上传日期:2007-06-13
资源大小:13604k
文件大小:11k
- /*
- * ===========================================================================
- * PRODUCTION $Log: regexp.cpp,v $
- * PRODUCTION Revision 1000.2 2004/06/01 19:40:23 gouriano
- * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.7
- * PRODUCTION
- * ===========================================================================
- */
- /* $Id: regexp.cpp,v 1000.2 2004/06/01 19:40:23 gouriano Exp $
- * ===========================================================================
- *
- * PUBLIC DOMAIN NOTICE
- * National Center for Biotechnology Information
- *
- * This software/database is a "United States Government Work" under the
- * terms of the United States Copyright Act. It was written as part of
- * the author's official duties as a United States Government employee and
- * thus cannot be copyrighted. This software/database is freely available
- * to the public for use. The National Library of Medicine and the U.S.
- * Government have not placed any restriction on its use or reproduction.
- *
- * Although all reasonable efforts have been taken to ensure the accuracy
- * and reliability of the software and data, the NLM and the U.S.
- * Government do not and cannot warrant the performance or results that
- * may be obtained by using this software or data. The NLM and the U.S.
- * Government disclaim all warranties, express or implied, including
- * warranties of performance, merchantability or fitness for any particular
- * purpose.
- *
- * Please cite the author in any work or product based on this material.
- *
- * ===========================================================================
- *
- * Author: Clifford Clausen
- * File Description:
- * C++ wrappers for Perl Compatible Regular Expression (pcre) library
- *
- * ===========================================================================
- */
- #include <ncbi_pch.hpp>
- #include <corelib/ncbi_limits.h>
- #include <util/regexp.hpp>
- #include <memory>
- #include <stdlib.h>
- BEGIN_NCBI_SCOPE
- //////////////////////////////////////////////////////////////////////////////
- //
- // CRegexpException
- //
- class CRegexpException : public CException
- {
- public:
- enum EErrCode {
- eCompile
- };
- virtual const char* GetErrCodeString(void) const {
- switch ( GetErrCode() ) {
- case eCompile: return "eCompile";
- default: return CException::GetErrCodeString();
- }
- }
- NCBI_EXCEPTION_DEFAULT(CRegexpException,CException);
- };
- //////////////////////////////////////////////////////////////////////////////
- //
- // CRegexp
- //
- CRegexp::CRegexp(const string& pattern, TCompile flags)
- : m_NumFound(0)
- {
- const char *err;
- int err_offset;
- m_PReg = pcre_compile(pattern.c_str(), flags, &err, &err_offset, NULL);
- if (m_PReg == NULL) {
- NCBI_THROW(CRegexpException, eCompile, "Compilation of the pattern '" +
- pattern + "' failed: " + err);
- }
- }
- CRegexp::~CRegexp()
- {
- (*pcre_free)(m_PReg);
- }
- void CRegexp::Set(const string& pattern, TCompile flags)
- {
- if (m_PReg != NULL) {
- (*pcre_free)(m_PReg);
- }
- const char *err;
- int err_offset;
- m_PReg = pcre_compile(pattern.c_str(), flags, &err, &err_offset, NULL);
- if (m_PReg == NULL) {
- NCBI_THROW(CRegexpException, eCompile, "Compilation of the pattern '" +
- pattern + "' failed: " + err);
- }
- }
- string CRegexp::GetSub(const string& str, size_t idx) const
- {
- int start = m_Results[2 * idx];
- int end = m_Results[2 * idx + 1];
- if ((int)idx >= m_NumFound || start == -1 || end == -1) {
- return kEmptyStr;
- }
- return str.substr(start, end - start);
- }
- string CRegexp::GetMatch(
- const string& str,
- TSeqPos offset,
- size_t idx,
- TMatch flags,
- bool noreturn)
- {
- m_NumFound = pcre_exec(m_PReg, NULL, str.c_str(), (int)str.length(),
- (int)offset, flags, m_Results,
- (int)(kRegexpMaxSubPatterns +1) * 3);
- if ( noreturn ) {
- return kEmptyStr;
- } else {
- return GetSub(str, idx);
- }
- }
- //////////////////////////////////////////////////////////////////////////////
- //
- // CRegexpUtil
- //
- CRegexpUtil::CRegexpUtil(const string& str)
- : m_Content(str), m_IsDivided(false),
- m_RangeStart(kEmptyStr), m_RangeEnd(kEmptyStr), m_Delimiter("n")
- {
- return;
- }
- void CRegexpUtil::SetRange(
- const string& addr_start,
- const string& addr_end,
- const string& delimiter)
- {
- m_RangeStart = addr_start;
- m_RangeEnd = addr_end;
- x_Divide(delimiter);
- }
- size_t CRegexpUtil::Replace(
- const string& search,
- const string& replace,
- CRegexp::TCompile compile_flags,
- CRegexp::TMatch match_flags,
- size_t max_replace)
- {
- if ( search.empty() ) {
- return 0;
- }
- size_t n_replace = 0;
- // Fill shure that string is not divided.
- x_Join();
- // Compile regular expression.
- CRegexp re(search, compile_flags);
- size_t start_pos = 0;
- for (size_t count = 0; !(max_replace && count >= max_replace); count++) {
- // Match pattern.
- re.GetMatch(m_Content.c_str(), (int)start_pos, 0, match_flags, true);
- int num_found = re.NumFound();
- if (num_found <= 0) {
- break;
- }
- // Substitute all subpatterns "$<digit>" to values in the "replace"
- // string.
- const int* result;
- string x_replace = replace;
- size_t pos = 0;
- for (;;) {
- // Find "$"
- pos = x_replace.find("$", pos);
- if (pos == NPOS) {
- break;
- }
- // Try to convert string after the "$" to number
- errno = 0;
- const char* startptr = x_replace.c_str() + pos + 1;
- char* endptr = 0;
- long value = strtol(startptr, &endptr, 10);
- if ( errno || endptr == startptr || !endptr ||
- value < kMin_Int || value > kMax_Int) {
- // Format error, skip single "$".
- pos++;
- continue;
- }
- int n = (int)value;
- // Get subpattern value
- string subpattern;
- if ( n > 0 && n < num_found ) {
- result = re.GetResults(n);
- if (result[0] >= 0 && result[1] >= 0) {
- subpattern = m_Content.substr(result[0],
- result[1] - result[0]);
- }
- }
- // Check braces {$...}
- size_t sp_start = pos;
- size_t sp_end = endptr - x_replace.c_str();
- if ( sp_start > 0 && x_replace[sp_start-1] == '{') {
- sp_start--;
- if ( sp_end < x_replace.length() &&
- x_replace[sp_end] == '}') {
- sp_end++;
- } else {
- // Format error -- missed closed brace.
- sp_start++;
- }
- }
- // Replace $n with subpattern value.
- x_replace.replace(sp_start, sp_end - sp_start, subpattern);
- pos += subpattern.length();
- }
- // Replace pattern with "x_replace".
- result = re.GetResults(0);
- m_Content.replace(result[0], result[1] - result[0], x_replace);
- n_replace++;
- start_pos = result[0] + x_replace.length();
- }
- return n_replace;
- }
- size_t CRegexpUtil::ReplaceRange(
- const string& search,
- const string& replace,
- CRegexp::TCompile compile_flags,
- CRegexp::TMatch match_flags,
- CRegexpUtil::ERange process_inside,
- size_t max_replace
- )
- {
- if ( search.empty() ) {
- return 0;
- }
- size_t n_replace = 0;
- // Split source string to parts by delimiter
- x_Divide();
- // Flag which denote that current line is inside "range"
- bool inside = m_RangeStart.empty();
- NON_CONST_ITERATE (list<string>, i, m_ContentList) {
- // Get new line
- string line = *i;
- // Check beginning of block [addr_re_start:addr_re_end]
- if ( !inside && !m_RangeStart.empty() ) {
- CRegexp re(m_RangeStart.c_str());
- re.GetMatch(line.c_str(), 0, 0, 0, true);
- inside = (re.NumFound() > 0);
- } else {
- inside = true;
- }
- // Process current line
- if ( (inside && process_inside == eInside) ||
- (!inside && process_inside == eOutside) ) {
- CRegexpUtil re(line);
- n_replace += re.Replace(search, replace,
- compile_flags, match_flags, max_replace);
- *i = re;
- }
- // Check ending of block [addr_re_start:addr_re_end]
- if ( inside && !m_RangeEnd.empty() ) {
- // Two addresses
- CRegexp re(m_RangeEnd.c_str());
- re.GetMatch(line.c_str(), 0, 0, 0, true);
- inside = (re.NumFound() <= 0);
- } else {
- // One address -- process one current string only
- inside = false;
- }
- }
- return n_replace;
- }
- void CRegexpUtil::x_Divide(const string& delimiter)
- {
- string x_delimiter = delimiter.empty() ? m_Delimiter : delimiter;
- if ( m_IsDivided ) {
- if ( x_delimiter == m_Delimiter ) {
- return;
- }
- x_Join();
- }
- m_ContentList.clear();
- // Split source string to parts by delimiter
- size_t pos;
- size_t start_pos = 0;
- for (;;) {
- pos = m_Content.find(x_delimiter, start_pos);
- if (pos == NPOS) {
- m_ContentList.push_back(m_Content.substr(start_pos));
- break;
- } else {
- m_ContentList.push_back(m_Content.substr(start_pos,
- pos - start_pos));
- start_pos = pos + x_delimiter.length();
- }
- }
- m_IsDivided = true;
- // Save delimiter for consecutive joining
- m_Delimiter = x_delimiter;
- }
- void CRegexpUtil::x_Join(void)
- {
- if ( m_IsDivided ) {
- m_Content = NStr::Join(m_ContentList, m_Delimiter);
- m_IsDivided = false;
- }
- }
- END_NCBI_SCOPE
- /*
- * ===========================================================================
- * $Log: regexp.cpp,v $
- * Revision 1000.2 2004/06/01 19:40:23 gouriano
- * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.7
- *
- * Revision 1.7 2004/05/17 21:06:02 gorelenk
- * Added include of PCH ncbi_pch.hpp
- *
- * Revision 1.6 2003/11/07 17:16:23 ivanov
- * Fixed warnings on 64-bit Workshop compiler
- *
- * Revision 1.5 2003/11/07 13:39:56 ivanov
- * Fixed lines wrapped at 79th columns
- *
- * Revision 1.4 2003/11/06 16:13:04 ivanov
- * Added CRegexpUtil class. Some formal code rearrangement.
- *
- * Revision 1.3 2003/07/16 19:13:50 clausen
- * Added TCompile and TMatch
- *
- * Revision 1.2 2003/06/20 18:26:37 clausen
- * Switched to native regexp interface
- *
- * Revision 1.1 2003/06/03 14:46:23 clausen
- * Initial version
- *
- * ===========================================================================
- */