Token.cpp
上传用户:zhuqijet
上传日期:2013-06-25
资源大小:10074k
文件大小:14k
- /*
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2001 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Xerces" and "Apache Software Foundation" must
- * not be used to endorse or promote products derived from this
- * software without prior written permission. For written
- * permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * nor may "Apache" appear in their name, without prior written
- * permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation, and was
- * originally based on software copyright (c) 2001, International
- * Business Machines, Inc., http://www.ibm.com . For more information
- * on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
- /*
- * $Log: Token.cpp,v $
- * Revision 1.5 2002/11/21 14:56:35 gareth
- * Fixed bug in Token::analyzeFirstCharacter so that . matches new line with head character optimisation enabled. As per discussion Jennifer Schachter had with Khaled.
- *
- * Revision 1.4 2002/11/04 15:17:00 tng
- * C++ Namespace Support.
- *
- * Revision 1.3 2002/10/15 18:11:02 knoaman
- * [Bug 13489]: missing 'return' in Token.cpp
- *
- * Revision 1.2 2002/03/18 19:29:53 knoaman
- * Change constant names to eliminate possible conflict with user defined ones.
- *
- * Revision 1.1.1.1 2002/02/01 22:22:31 peiyongz
- * sane_include
- *
- * Revision 1.3 2001/05/11 13:26:50 tng
- * Copyright update.
- *
- * Revision 1.2 2001/05/03 18:17:49 knoaman
- * Some design changes:
- * o Changed the TokenFactory from a single static instance, to a
- * normal class. Each RegularExpression object will have its own
- * instance of TokenFactory, and that instance will be passed to
- * other classes that need to use a TokenFactory to create Token
- * objects (with the exception of RangeTokenMap).
- * o Added a new class RangeTokenMap to map a the different ranges
- * in a given category to a specific RangeFactory object. In the old
- * design RangeFactory had dual functionality (act as a Map, and as
- * a factory for creating RangeToken(s)). The RangeTokenMap will
- * have its own copy of the TokenFactory. There will be only one
- * instance of the RangeTokenMap class, and that instance will be
- * lazily deleted when XPlatformUtils::Terminate is called.
- *
- * Revision 1.1 2001/03/02 19:22:58 knoaman
- * Schema: Regular expression handling part I
- *
- */
- // ---------------------------------------------------------------------------
- // Includes
- // ---------------------------------------------------------------------------
- #include <xercesc/util/regx/RangeToken.hpp>
- #include <xercesc/util/regx/ModifierToken.hpp>
- #include <xercesc/util/regx/RegularExpression.hpp>
- #include <xercesc/util/regx/RegxUtil.hpp>
- XERCES_CPP_NAMESPACE_BEGIN
- // ---------------------------------------------------------------------------
- // Static member data initialization
- // ---------------------------------------------------------------------------
- const XMLInt32 Token::UTF16_MAX = 0x10FFFF; // upper bound passed to addRange(0, UTF16_MAX) when the whole range is added in analyzeFirstCharacter
- const unsigned short Token::FC_CONTINUE = 0; // analyzeFirstCharacter result: returned e.g. for T_EMPTY/T_ANCHOR; a T_CONCAT keeps scanning its next child on this value
- const unsigned short Token::FC_TERMINAL = 1; // analyzeFirstCharacter result: returned after a concrete character/range was added to the RangeToken
- const unsigned short Token::FC_ANY = 2; // analyzeFirstCharacter result: returned e.g. for T_DOT and T_BACKREFERENCE
- // ---------------------------------------------------------------------------
- // Token: Constructors and Destructors
- // ---------------------------------------------------------------------------
- Token::Token(const unsigned short tokType) : fTokenType(tokType) {
- }
- Token::~Token() {
- }
- // ---------------------------------------------------------------------------
- // Token: Getter methods
- // ---------------------------------------------------------------------------
- int Token::getMinLength() const {
- switch (fTokenType) {
- case T_CONCAT:
- {
- int sum = 0;
- unsigned int childSize = size();
- for (unsigned int i=0; i<childSize; i++) {
- sum += getChild(i)->getMinLength();
- }
- return sum;
- }
- case T_CONDITION:
- case T_UNION:
- {
- unsigned int childSize = size();
- if (childSize == 0) {
- return 0;
- }
- int ret = getChild(0)->getMinLength();
- for (unsigned int i=1; i < childSize; i++) {
- int min = getChild(i)->getMinLength();
- if (min < ret)
- ret = min;
- }
- return ret;
- }
- case T_CLOSURE:
- case T_NONGREEDYCLOSURE:
- if (getMin() >= 0)
- return getMin() * getChild(0)->getMinLength();
- return 0;
- case T_EMPTY:
- case T_ANCHOR:
- return 0;
- case T_DOT:
- case T_CHAR:
- case T_RANGE:
- case T_NRANGE:
- return 1;
- case T_INDEPENDENT:
- case T_PAREN:
- case T_MODIFIERGROUP:
- return getChild(0)->getMinLength();
- case T_BACKREFERENCE:
- return 0; // ***** - REVISIT
- case T_STRING:
- return XMLString::stringLen(getString());
- case T_LOOKAHEAD:
- case T_NEGATIVELOOKAHEAD:
- case T_LOOKBEHIND:
- case T_NEGATIVELOOKBEHIND:
- return 0; // ***** - REVIST
- // default:
- // throw;
- }
- // We should not get here, but we have it to make some compilers happy
- return -1;
- }
- int Token::getMaxLength() const {
- switch (fTokenType) {
- case T_CONCAT:
- {
- int sum = 0;
- unsigned int childSize = size();
- for (unsigned int i=0; i<childSize; i++) {
- int val = getChild(i)->getMaxLength();
- if (val < 0){
- return -1;
- }
- sum += val;
- }
- return sum;
- }
- case T_CONDITION:
- case T_UNION:
- {
- unsigned int childSize = size();
- if (childSize == 0)
- return 0;
- int ret = getChild(0)->getMaxLength();
- for (unsigned i = 1; ret > 0 && i < childSize; i++) {
- int max = getChild(i)->getMaxLength();
- if (max < 0) {
- ret = -1;
- break;
- }
- if (max > ret)
- ret = max;
- }
- return ret;
- }
- case T_CLOSURE:
- case T_NONGREEDYCLOSURE:
- if (getMax() >= 0) {
- return getMax() * getChild(0)->getMaxLength();
- }
- return -1;
- case T_EMPTY:
- case T_ANCHOR:
- return 0;
- case T_CHAR:
- return 1;
- case T_DOT:
- case T_RANGE:
- case T_NRANGE:
- return 2;
- case T_INDEPENDENT:
- case T_PAREN:
- case T_MODIFIERGROUP:
- return getChild(0)->getMaxLength();
- case T_BACKREFERENCE:
- return -1; // REVISIT
- case T_STRING:
- return XMLString::stringLen(getString());
- case T_LOOKAHEAD:
- case T_NEGATIVELOOKAHEAD:
- case T_LOOKBEHIND:
- case T_NEGATIVELOOKBEHIND:
- return 0; // REVISIT
- // default:
- // throw; //ThrowXML(RuntimeException, ...)
- } // end switch
- return -1;
- }
- // ---------------------------------------------------------------------------
- // Token: Helper methods
- // ---------------------------------------------------------------------------
- int Token::analyzeFirstCharacter(RangeToken* const rangeTok,
- const int options,
- TokenFactory* const tokFactory)
- {
- switch(fTokenType) {
- case T_CONCAT:
- {
- int ret = FC_CONTINUE;
- for (int i=0; i<size(); i++) {
- Token* tok = getChild(i);
- if (tok
- && (ret=tok->analyzeFirstCharacter(rangeTok,
- options, tokFactory))!= FC_CONTINUE)
- break;
- }
- return ret;
- }
- case T_UNION:
- {
- unsigned int childSize = size();
- if (childSize == 0)
- return FC_CONTINUE;
- int ret = FC_CONTINUE;
- bool hasEmpty = false;
- for (unsigned int i=0; i < childSize; i++) {
- ret = getChild(i)->analyzeFirstCharacter(rangeTok, options, tokFactory);
- if (ret == FC_ANY)
- break;
- else
- hasEmpty = true;
- }
- return hasEmpty ? FC_CONTINUE : ret;
- }
- case T_CONDITION:
- {
- int ret1 = getChild(0)->analyzeFirstCharacter(rangeTok, options, tokFactory);
- if (size() == 1)
- return FC_CONTINUE;
- int ret2;
- if (ret1 != FC_ANY) {
- ret2 = getChild(1)->analyzeFirstCharacter(rangeTok, options, tokFactory);
- }
- if (ret1 == FC_ANY || ret2 == FC_ANY)
- return FC_ANY;
- if (ret1 == FC_CONTINUE || ret2 == FC_CONTINUE)
- return FC_CONTINUE;
- return FC_TERMINAL;
- }
- case T_CLOSURE:
- case T_NONGREEDYCLOSURE:
- {
- Token* tok = getChild(0);
- if (tok)
- tok->analyzeFirstCharacter(rangeTok, options, tokFactory);
- return FC_CONTINUE;
- }
- case T_DOT:
- return FC_ANY;
- case T_EMPTY:
- case T_ANCHOR:
- return FC_CONTINUE;
- case T_CHAR:
- {
- XMLInt32 ch = getChar();
- rangeTok->addRange(ch, ch);
- if (ch < 0x1000 && isSet(options,RegularExpression::IGNORE_CASE)) {
- //REVISIT
- }
- }
- return FC_TERMINAL;
- case T_RANGE:
- {
- if (isSet(options, RegularExpression::IGNORE_CASE)) {
- rangeTok->mergeRanges(((RangeToken*)
- this)->getCaseInsensitiveToken(tokFactory));
- }
- else {
- rangeTok->mergeRanges(this);
- }
- return FC_TERMINAL;
- }
- case T_NRANGE:
- {
- if (isSet(options, RegularExpression::IGNORE_CASE)) {
- RangeToken* caseITok = (((RangeToken*)
- this)->getCaseInsensitiveToken(tokFactory));
- rangeTok->mergeRanges(RangeToken::complementRanges(caseITok, tokFactory));
- }
- else {
- rangeTok->mergeRanges(
- RangeToken::complementRanges((RangeToken*) this, tokFactory));
- }
- }
- case T_INDEPENDENT:
- case T_PAREN:
- {
- Token* tok = getChild(0);
- if (tok)
- return tok->analyzeFirstCharacter(rangeTok,options, tokFactory);
- }
- case T_MODIFIERGROUP:
- case T_BACKREFERENCE:
- rangeTok->addRange(0, UTF16_MAX);
- return FC_ANY;
- case T_STRING:
- {
- const XMLCh* str = getString();
- XMLInt32 ch = str[0];
- if (RegxUtil::isHighSurrogate((XMLCh) ch)) {
- }
- rangeTok->addRange(ch, ch);
- if (ch<0x10000 && isSet(options,RegularExpression::IGNORE_CASE)) {
- //REVISIT
- }
- }
- return FC_TERMINAL;
- case T_LOOKAHEAD:
- case T_NEGATIVELOOKAHEAD:
- case T_LOOKBEHIND:
- case T_NEGATIVELOOKBEHIND:
- return FC_CONTINUE;
- // default:
- // throw;
- }
- return 0;
- }
- Token* Token::findFixedString(int options, int& outOptions) {
- switch(fTokenType) {
- case T_CHAR:
- return 0;
- case T_STRING:
- outOptions = options;
- return this;
- case T_UNION:
- case T_CLOSURE:
- case T_NONGREEDYCLOSURE:
- case T_EMPTY:
- case T_ANCHOR:
- case T_RANGE:
- case T_NRANGE:
- case T_DOT:
- case T_BACKREFERENCE:
- case T_LOOKAHEAD:
- case T_NEGATIVELOOKAHEAD:
- case T_LOOKBEHIND:
- case T_NEGATIVELOOKBEHIND:
- case T_CONDITION:
- return 0;
- case T_INDEPENDENT:
- case T_PAREN:
- return getChild(0)->findFixedString(options, outOptions);
- case T_CONCAT:
- {
- Token* prevTok = 0;
- int prevOptions = 0;
- for (int i=0; i<size(); i++) {
- Token* tok = getChild(i)->findFixedString(options, outOptions);
- if (prevTok == 0 || prevTok->isShorterThan(tok)) {
- prevTok = tok;
- prevOptions = outOptions;
- }
- }
- outOptions = prevOptions;
- return prevTok;
- }
- case T_MODIFIERGROUP:
- {
- options |= ((ModifierToken *) this)->getOptions();
- options &= ~((ModifierToken *) this)->getOptionsMask();
- return getChild(0)->findFixedString(options, outOptions);
- }
- } // end switch
- return 0;
- }
- bool Token::isShorterThan(Token* const tok) {
- if (tok == 0)
- return false;
- if (getTokenType() != T_STRING && tok->getTokenType() != T_STRING)
- return false; //Should we throw an exception?
- int length = XMLString::stringLen(getString());
- int tokLength = XMLString::stringLen(tok->getString());
- return length < tokLength;
- }
- XERCES_CPP_NAMESPACE_END
- /**
- * End of file Token.cpp
- */