WFXMLScanner.cpp
上传用户:zhuqijet
上传日期:2013-06-25
资源大小:10074k
文件大小:76k
- /*
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2002,2003 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Xerces" and "Apache Software Foundation" must
- * not be used to endorse or promote products derived from this
- * software without prior written permission. For written
- * permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * nor may "Apache" appear in their name, without prior written
- * permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation, and was
- * originally based on software copyright (c) 1999, International
- * Business Machines, Inc., http://www.ibm.com . For more information
- * on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
- /*
- * $Id: WFXMLScanner.cpp,v 1.12 2003/05/18 14:02:04 knoaman Exp $
- */
- // ---------------------------------------------------------------------------
- // Includes
- // ---------------------------------------------------------------------------
- #include <xercesc/internal/WFXMLScanner.hpp>
- #include <xercesc/util/Janitor.hpp>
- #include <xercesc/util/RuntimeException.hpp>
- #include <xercesc/util/UnexpectedEOFException.hpp>
- #include <xercesc/sax/InputSource.hpp>
- #include <xercesc/framework/XMLDocumentHandler.hpp>
- #include <xercesc/framework/XMLEntityHandler.hpp>
- #include <xercesc/framework/XMLPScanToken.hpp>
- #include <xercesc/framework/XMLValidityCodes.hpp>
- #include <xercesc/internal/EndOfEntityException.hpp>
- XERCES_CPP_NAMESPACE_BEGIN
- // ---------------------------------------------------------------------------
- // WFXMLScanner: Constructors and Destructor
- // ---------------------------------------------------------------------------
- WFXMLScanner::WFXMLScanner( XMLValidator* const valToAdopt
- , MemoryManager* const manager) :
- XMLScanner(valToAdopt, manager)
- , fElementIndex(0)
- , fElements(0)
- , fEntityTable(0)
- , fAttrNameHashList(0)
- , fAttrNSList(0)
- , fElementLookup(0)
- , fElemStack(manager)
- {
- try
- {
- commonInit();
- }
- catch(...)
- {
- cleanUp();
- throw;
- }
- }
- WFXMLScanner::WFXMLScanner( XMLDocumentHandler* const docHandler
- , DocTypeHandler* const docTypeHandler
- , XMLEntityHandler* const entityHandler
- , XMLErrorReporter* const errHandler
- , XMLValidator* const valToAdopt
- , MemoryManager* const manager) :
- XMLScanner(docHandler, docTypeHandler, entityHandler, errHandler, valToAdopt, manager)
- , fElementIndex(0)
- , fElements(0)
- , fEntityTable(0)
- , fAttrNameHashList(0)
- , fAttrNSList(0)
- , fElementLookup(0)
- , fElemStack(manager)
- {
- try
- {
- commonInit();
- }
- catch(...)
- {
- cleanUp();
- throw;
- }
- }
- WFXMLScanner::~WFXMLScanner()
- {
- cleanUp();
- }
- // ---------------------------------------------------------------------------
- // XMLScanner: Getter methods
- // ---------------------------------------------------------------------------
- NameIdPool<DTDEntityDecl>* WFXMLScanner::getEntityDeclPool()
- {
- return 0;
- }
- const NameIdPool<DTDEntityDecl>* WFXMLScanner::getEntityDeclPool() const
- {
- return 0;
- }
- // ---------------------------------------------------------------------------
- // WFXMLScanner: Main entry point to scan a document
- // ---------------------------------------------------------------------------
- void WFXMLScanner::scanDocument(const InputSource& src)
- {
- // Bump up the sequence id for this parser instance. This will invalidate
- // any previous progressive scan tokens.
- fSequenceId++;
- try
- {
- // Reset the scanner and its plugged in stuff for a new run. This
- // resets all the data structures, creates the initial reader and
- // pushes it on the stack, and sets up the base document path.
- scanReset(src);
- // If we have a document handler, then call the start document
- if (fDocHandler)
- fDocHandler->startDocument();
- // Scan the prolog part, which is everything before the root element
- // including the DTD subsets.
- scanProlog();
- // If we got to the end of input, then its not a valid XML file.
- // Else, go on to scan the content.
- if (fReaderMgr.atEOF())
- {
- emitError(XMLErrs::EmptyMainEntity);
- }
- else
- {
- // Scan content, and tell it its not an external entity
- if (scanContent(false))
- {
- // That went ok, so scan for any miscellaneous stuff
- if (!fReaderMgr.atEOF())
- scanMiscellaneous();
- }
- }
- // If we have a document handler, then call the end document
- if (fDocHandler)
- fDocHandler->endDocument();
- // Reset the reader manager to close all files, sockets, etc...
- fReaderMgr.reset();
- }
- // NOTE:
- //
- // In all of the error processing below, the emitError() call MUST come
- // before the flush of the reader mgr, or it will fail because it tries
- // to find out the position in the XML source of the error.
- catch(const XMLErrs::Codes)
- {
- // This is a 'first fatal error' type exit, so reset and fall through
- fReaderMgr.reset();
- }
- catch(const XMLValid::Codes)
- {
- // This is a 'first fatal error' type exit, so reset and fall through
- fReaderMgr.reset();
- }
- catch(const XMLException& excToCatch)
- {
- // Emit the error and catch any user exception thrown from here. Make
- // sure in all cases we flush the reader manager.
- fInException = true;
- try
- {
- if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
- emitError
- (
- XMLErrs::XMLException_Warning
- , excToCatch.getType()
- , excToCatch.getMessage()
- );
- else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
- emitError
- (
- XMLErrs::XMLException_Fatal
- , excToCatch.getType()
- , excToCatch.getMessage()
- );
- else
- emitError
- (
- XMLErrs::XMLException_Error
- , excToCatch.getType()
- , excToCatch.getMessage()
- );
- }
- catch(...)
- {
- // Flush the reader manager and rethrow user's error
- fReaderMgr.reset();
- throw;
- }
- // If it returned, then reset the reader manager and fall through
- fReaderMgr.reset();
- }
- catch(...)
- {
- // Reset and rethrow
- fReaderMgr.reset();
- throw;
- }
- }
- bool WFXMLScanner::scanNext(XMLPScanToken& token)
- {
- // Make sure this token is still legal
- if (!isLegalToken(token))
- ThrowXML(RuntimeException, XMLExcepts::Scan_BadPScanToken);
- // Find the next token and remember the reader id
- unsigned int orgReader;
- XMLTokens curToken;
- bool retVal = true;
- try
- {
- while (true)
- {
- // We have to handle any end of entity exceptions that happen here.
- // We could be at the end of X nested entities, each of which will
- // generate an end of entity exception as we try to move forward.
- try
- {
- curToken = senseNextToken(orgReader);
- break;
- }
- catch(const EndOfEntityException& toCatch)
- {
- // Send an end of entity reference event
- if (fDocHandler)
- fDocHandler->endEntityReference(toCatch.getEntity());
- }
- }
- if (curToken == Token_CharData)
- {
- scanCharData(fCDataBuf);
- }
- else if (curToken == Token_EOF)
- {
- if (!fElemStack.isEmpty())
- {
- const ElemStack::StackElem* topElem = fElemStack.popTop();
- emitError
- (
- XMLErrs::EndedWithTagsOnStack
- , topElem->fThisElement->getFullName()
- );
- }
- retVal = false;
- }
- else
- {
- // Its some sort of markup
- bool gotData = true;
- switch(curToken)
- {
- case Token_CData :
- // Make sure we are within content
- if (fElemStack.isEmpty())
- emitError(XMLErrs::CDATAOutsideOfContent);
- scanCDSection();
- break;
- case Token_Comment :
- scanComment();
- break;
- case Token_EndTag :
- scanEndTag(gotData);
- break;
- case Token_PI :
- scanPI();
- break;
- case Token_StartTag :
- if (fDoNamespaces)
- scanStartTagNS(gotData);
- else
- scanStartTag(gotData);
- break;
- default :
- fReaderMgr.skipToChar(chOpenAngle);
- break;
- }
- if (orgReader != fReaderMgr.getCurrentReaderNum())
- emitError(XMLErrs::PartialMarkupInEntity);
- // If we hit the end, then do the miscellaneous part
- if (!gotData)
- {
- // That went ok, so scan for any miscellaneous stuff
- scanMiscellaneous();
- if (fDocHandler)
- fDocHandler->endDocument();
- }
- }
- }
- // NOTE:
- //
- // In all of the error processing below, the emitError() call MUST come
- // before the flush of the reader mgr, or it will fail because it tries
- // to find out the position in the XML source of the error.
- catch(const XMLErrs::Codes)
- {
- // This is a 'first failure' exception, so reset and return failure
- fReaderMgr.reset();
- return false;
- }
- catch(const XMLValid::Codes)
- {
- // This is a 'first fatal error' type exit, so reset and reuturn failure
- fReaderMgr.reset();
- return false;
- }
- catch(const XMLException& excToCatch)
- {
- // Emit the error and catch any user exception thrown from here. Make
- // sure in all cases we flush the reader manager.
- fInException = true;
- try
- {
- if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
- emitError
- (
- XMLErrs::XMLException_Warning
- , excToCatch.getType()
- , excToCatch.getMessage()
- );
- else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
- emitError
- (
- XMLErrs::XMLException_Fatal
- , excToCatch.getType()
- , excToCatch.getMessage()
- );
- else
- emitError
- (
- XMLErrs::XMLException_Error
- , excToCatch.getType()
- , excToCatch.getMessage()
- );
- }
- catch(...)
- {
- // Reset and rethrow user error
- fReaderMgr.reset();
- throw;
- }
- // Reset and return failure
- fReaderMgr.reset();
- return false;
- }
- catch(...)
- {
- // Reset and rethrow original error
- fReaderMgr.reset();
- throw;
- }
- // If we hit the end, then flush the reader manager
- if (!retVal)
- fReaderMgr.reset();
- return retVal;
- }
- // ---------------------------------------------------------------------------
- // WFXMLScanner: Private helper methods.
- // ---------------------------------------------------------------------------
- // This method handles the common initialization, to avoid having to do
- // it redundantly in multiple constructors.
- void WFXMLScanner::commonInit()
- {
- fEntityTable = new (fMemoryManager) ValueHashTableOf<XMLCh>(11, fMemoryManager);
- fAttrNameHashList = new (fMemoryManager)ValueVectorOf<unsigned int>(16, fMemoryManager);
- fAttrNSList = new (fMemoryManager) ValueVectorOf<XMLAttr*>(8, fMemoryManager);
- fElements = new (fMemoryManager) RefVectorOf<XMLElementDecl>(32, true, fMemoryManager);
- fElementLookup = new (fMemoryManager) RefHashTableOf<XMLElementDecl>(109, false, fMemoryManager);
- // Add the default entity entries for the character refs that must always
- // be present.
- fEntityTable->put((void*) XMLUni::fgAmp, chAmpersand);
- fEntityTable->put((void*) XMLUni::fgLT, chOpenAngle);
- fEntityTable->put((void*) XMLUni::fgGT, chCloseAngle);
- fEntityTable->put((void*) XMLUni::fgQuot, chDoubleQuote);
- fEntityTable->put((void*) XMLUni::fgApos, chSingleQuote);
- }
- void WFXMLScanner::cleanUp()
- {
- delete fEntityTable;
- delete fAttrNameHashList;
- delete fAttrNSList;
- delete fElementLookup;
- delete fElements;
- }
- unsigned int
- WFXMLScanner::resolvePrefix(const XMLCh* const prefix
- , const ElemStack::MapModes mode)
- {
- // Watch for the special namespace prefixes. We always map these to
- // special URIs. 'xml' gets mapped to the official URI that its defined
- // to map to by the NS spec. xmlns gets mapped to a special place holder
- // URI that we define (so that it maps to something checkable.)
- if (XMLString::equals(prefix, XMLUni::fgXMLNSString))
- return fXMLNSNamespaceId;
- else if (XMLString::equals(prefix, XMLUni::fgXMLString))
- return fXMLNamespaceId;
- // Ask the element stack to search up itself for a mapping for the
- // passed prefix.
- bool unknown;
- unsigned int uriId = fElemStack.mapPrefixToURI(prefix, mode, unknown);
- // If it was unknown, then the URI was faked in but we have to issue an error
- if (unknown)
- emitError(XMLErrs::UnknownPrefix, prefix);
- return uriId;
- }
- // This method will reset the scanner data structures, and related plugged
- // in stuff, for a new scan session. We get the input source for the primary
- // XML entity, create the reader for it, and push it on the stack so that
- // upon successful return from here we are ready to go.
- void WFXMLScanner::scanReset(const InputSource& src)
- {
- // For all installed handlers, send reset events. This gives them
- // a chance to flush any cached data.
- if (fDocHandler)
- fDocHandler->resetDocument();
- if (fEntityHandler)
- fEntityHandler->resetEntities();
- if (fErrorReporter)
- fErrorReporter->resetErrors();
- // Reset the element stack, and give it the latest ids for the special
- // URIs it has to know about.
- fElemStack.reset
- (
- fEmptyNamespaceId
- , fUnknownNamespaceId
- , fXMLNamespaceId
- , fXMLNSNamespaceId
- );
- // Reset some status flags
- fInException = false;
- fStandalone = false;
- fErrorCount = 0;
- fHasNoDTD = true;
- fElementIndex = 0;
- // Reset elements lookup table
- fElementLookup->removeAll();
- // Handle the creation of the XML reader object for this input source.
- // This will provide us with transcoding and basic lexing services.
- XMLReader* newReader = fReaderMgr.createReader
- (
- src
- , true
- , XMLReader::RefFrom_NonLiteral
- , XMLReader::Type_General
- , XMLReader::Source_External
- , fCalculateSrcOfs
- );
- if (!newReader) {
- if (src.getIssueFatalErrorIfNotFound())
- ThrowXML1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource, src.getSystemId());
- else
- ThrowXML1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource_Warning, src.getSystemId());
- }
- // Push this read onto the reader manager
- fReaderMgr.pushReader(newReader, 0);
- // and reset security-related things if necessary:
- if(fSecurityManager != 0)
- {
- fEntityExpansionLimit = fSecurityManager->getEntityExpansionLimit();
- fEntityExpansionCount = 0;
- }
- }
- // This method is called between markup in content. It scans for character
- // data that is sent to the document handler. It watches for any markup
- // characters that would indicate that the character data has ended. It also
- // handles expansion of general and character entities.
- //
- // sendData() is a local static helper for this method which handles some
- // code that must be done in three different places here.
- void WFXMLScanner::sendCharData(XMLBuffer& toSend)
- {
- // If no data in the buffer, then nothing to do
- if (toSend.isEmpty())
- return;
- // Always assume its just char data if not validating
- if (fDocHandler)
- fDocHandler->docCharacters(toSend.getRawBuffer(), toSend.getLen(), false);
- // Reset buffer
- toSend.reset();
- }
- // ---------------------------------------------------------------------------
- // WFXMLScanner: Private scanning methods
- // ---------------------------------------------------------------------------
- // This method will kick off the scanning of the primary content of the
- // document, i.e. the elements.
- bool WFXMLScanner::scanContent(const bool extEntity)
- {
- // Go into a loop until we hit the end of the root element, or we fall
- // out because there is no root element.
- //
- // We have to do kind of a deeply nested double loop here in order to
- // avoid doing the setup/teardown of the exception handler on each
- // round. Doing it this way we only do it when an exception actually
- // occurs.
- bool gotData = true;
- bool inMarkup = false;
- while (gotData)
- {
- try
- {
- while (gotData)
- {
- // Sense what the next top level token is. According to what
- // this tells us, we will call something to handle that kind
- // of thing.
- unsigned int orgReader;
- const XMLTokens curToken = senseNextToken(orgReader);
- // Handle character data and end of file specially. Char data
- // is not markup so we don't want to handle it in the loop
- // below.
- if (curToken == Token_CharData)
- {
- // Scan the character data and call appropriate events. Let
- // him use our local character data buffer for efficiency.
- scanCharData(fCDataBuf);
- continue;
- }
- else if (curToken == Token_EOF)
- {
- // The element stack better be empty at this point or we
- // ended prematurely before all elements were closed.
- if (!fElemStack.isEmpty())
- {
- const ElemStack::StackElem* topElem = fElemStack.popTop();
- emitError
- (
- XMLErrs::EndedWithTagsOnStack
- , topElem->fThisElement->getFullName()
- );
- }
- // Its the end of file, so clear the got data flag
- gotData = false;
- continue;
- }
- // We are in some sort of markup now
- inMarkup = true;
- // According to the token we got, call the appropriate
- // scanning method.
- switch(curToken)
- {
- case Token_CData :
- // Make sure we are within content
- if (fElemStack.isEmpty())
- emitError(XMLErrs::CDATAOutsideOfContent);
- scanCDSection();
- break;
- case Token_Comment :
- scanComment();
- break;
- case Token_EndTag :
- scanEndTag(gotData);
- break;
- case Token_PI :
- scanPI();
- break;
- case Token_StartTag :
- if (fDoNamespaces)
- scanStartTagNS(gotData);
- else
- scanStartTag(gotData);
- break;
- default :
- fReaderMgr.skipToChar(chOpenAngle);
- break;
- }
- if (orgReader != fReaderMgr.getCurrentReaderNum())
- emitError(XMLErrs::PartialMarkupInEntity);
- // And we are back out of markup again
- inMarkup = false;
- }
- }
- catch(const EndOfEntityException& toCatch)
- {
- // If we were in some markup when this happened, then its a
- // partial markup error.
- if (inMarkup)
- emitError(XMLErrs::PartialMarkupInEntity);
- // Send an end of entity reference event
- if (fDocHandler)
- fDocHandler->endEntityReference(toCatch.getEntity());
- inMarkup = false;
- }
- }
- // It went ok, so return success
- return true;
- }
- void WFXMLScanner::scanEndTag(bool& gotData)
- {
- // Assume we will still have data until proven otherwise. It will only
- // ever be false if this is the end of the root element.
- gotData = true;
- // Check if the element stack is empty. If so, then this is an unbalanced
- // element (i.e. more ends than starts, perhaps because of bad text
- // causing one to be skipped.)
- if (fElemStack.isEmpty())
- {
- emitError(XMLErrs::MoreEndThanStartTags);
- fReaderMgr.skipPastChar(chCloseAngle);
- ThrowXML(RuntimeException, XMLExcepts::Scan_UnbalancedStartEnd);
- }
- // Pop the stack of the element we are supposed to be ending. Remember
- // that we don't own this. The stack just keeps them and reuses them.
- unsigned int uriId = (fDoNamespaces)
- ? fElemStack.getCurrentURI() : fEmptyNamespaceId;
- const ElemStack::StackElem* topElem = fElemStack.popTop();
- // See if it was the root element, to avoid multiple calls below
- const bool isRoot = fElemStack.isEmpty();
- // Make sure that its the end of the element that we expect
- if (!fReaderMgr.skippedString(topElem->fThisElement->getFullName()))
- {
- emitError
- (
- XMLErrs::ExpectedEndOfTagX
- , topElem->fThisElement->getFullName()
- );
- fReaderMgr.skipPastChar(chCloseAngle);
- return;
- }
- // Make sure we are back on the same reader as where we started
- if (topElem->fReaderNum != fReaderMgr.getCurrentReaderNum())
- emitError(XMLErrs::PartialTagMarkupError);
- // Skip optional whitespace
- fReaderMgr.skipPastSpaces();
- // Make sure we find the closing bracket
- if (!fReaderMgr.skippedChar(chCloseAngle))
- {
- emitError
- (
- XMLErrs::UnterminatedEndTag
- , topElem->fThisElement->getFullName()
- );
- }
- // If we have a doc handler, tell it about the end tag
- if (fDocHandler)
- {
- fDocHandler->endElement
- (
- *topElem->fThisElement
- , uriId
- , isRoot
- , topElem->fThisElement->getElementName()->getPrefix()
- );
- }
- // If this was the root, then done with content
- gotData = !isRoot;
- }
- void WFXMLScanner::scanDocTypeDecl()
- {
- // Just skips over it
- // REVISIT: Should we issue a warning
- static const XMLCh doctypeIE[] =
- {
- chOpenSquare, chCloseAngle, chNull
- };
- XMLCh nextCh = fReaderMgr.skipUntilIn(doctypeIE);
- if (nextCh == chOpenSquare)
- fReaderMgr.skipPastChar(chCloseSquare);
- fReaderMgr.skipPastChar(chCloseAngle);
- }
- bool WFXMLScanner::scanStartTag(bool& gotData)
- {
- // Assume we will still have data until proven otherwise. It will only
- // ever be false if this is the root and its empty.
- gotData = true;
- // Get the QName. In this case, we are not doing namespaces, so we just
- // use it as is and don't have to break it into parts.
- if (!fReaderMgr.getName(fQNameBuf))
- {
- emitError(XMLErrs::ExpectedElementName);
- fReaderMgr.skipToChar(chOpenAngle);
- return false;
- }
- // Assume it won't be an empty tag
- bool isEmpty = false;
- // See if its the root element
- const bool isRoot = fElemStack.isEmpty();
- // Lets try to look up the element
- const XMLCh* qnameRawBuf = fQNameBuf.getRawBuffer();
- XMLElementDecl* elemDecl = fElementLookup->get(qnameRawBuf);
- if (!elemDecl) {
- if (fElementIndex < fElements->size()) {
- elemDecl = fElements->elementAt(fElementIndex);
- }
- else {
- elemDecl = new (fMemoryManager) DTDElementDecl
- (
- fMemoryManager
- );
- fElements->addElement(elemDecl);
- }
- elemDecl->setElementName(XMLUni::fgZeroLenString, qnameRawBuf, fEmptyNamespaceId);
- fElementLookup->put((void*)elemDecl->getFullName(), elemDecl);
- fElementIndex++;
- }
- // Expand the element stack and add the new element
- fElemStack.addLevel(elemDecl, fReaderMgr.getCurrentReaderNum());
- // Skip any whitespace after the name
- fReaderMgr.skipPastSpaces();
- // We loop until we either see a /> or >, handling attribute/value
- // pairs until we get there.
- unsigned int attCount = 0;
- unsigned int curAttListSize = fAttrList->size();
- while (true)
- {
- // And get the next non-space character
- XMLCh nextCh = fReaderMgr.peekNextChar();
- // If the next character is not a slash or closed angle bracket,
- // then it must be whitespace, since whitespace is required
- // between the end of the last attribute and the name of the next
- // one.
- if (attCount)
- {
- if ((nextCh != chForwardSlash) && (nextCh != chCloseAngle))
- {
- if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh))
- {
- // Ok, skip by them and peek another char
- fReaderMgr.skipPastSpaces();
- nextCh = fReaderMgr.peekNextChar();
- }
- else
- {
- // Emit the error but keep on going
- emitError(XMLErrs::ExpectedWhitespace);
- }
- }
- }
- // Ok, here we first check for any of the special case characters.
- // If its not one, then we do the normal case processing, which
- // assumes that we've hit an attribute value, Otherwise, we do all
- // the special case checks.
- if (!fReaderMgr.getCurrentReader()->isSpecialStartTagChar(nextCh))
- {
- // Assume its going to be an attribute, so get a name from
- // the input.
- if (!fReaderMgr.getName(fAttNameBuf))
- {
- emitError(XMLErrs::ExpectedAttrName);
- fReaderMgr.skipPastChar(chCloseAngle);
- return false;
- }
- // And next must be an equal sign
- if (!scanEq())
- {
- static const XMLCh tmpList[] =
- {
- chSingleQuote, chDoubleQuote, chCloseAngle
- , chOpenAngle, chForwardSlash, chNull
- };
- emitError(XMLErrs::ExpectedEqSign);
- // Try to sync back up by skipping forward until we either
- // hit something meaningful.
- const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);
- if ((chFound == chCloseAngle) || (chFound == chForwardSlash))
- {
- // Jump back to top for normal processing of these
- continue;
- }
- else if ((chFound == chSingleQuote)
- || (chFound == chDoubleQuote)
- || fReaderMgr.getCurrentReader()->isWhitespace(chFound))
- {
- // Just fall through assuming that the value is to follow
- }
- else if (chFound == chOpenAngle)
- {
- // Assume a malformed tag and that new one is starting
- emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
- return false;
- }
- else
- {
- // Something went really wrong
- return false;
- }
- }
- // See if this attribute is declared more than one for this element.
- const XMLCh* attNameRawBuf = fAttNameBuf.getRawBuffer();
- unsigned int attNameHash = XMLString::hash(attNameRawBuf, 109);
- if (attCount) {
- for (unsigned int k=0; k < attCount; k++) {
- if (fAttrNameHashList->elementAt(k) == attNameHash) {
- if (
- XMLString::equals
- (
- fAttrList->elementAt(k)->getName()
- , attNameRawBuf
- )
- )
- {
- emitError
- (
- XMLErrs::AttrAlreadyUsedInSTag
- , attNameRawBuf
- , qnameRawBuf
- );
- break;
- }
- }
- }
- }
- // Skip any whitespace before the value and then scan the att
- // value. This will come back normalized with entity refs and
- // char refs expanded.
- fReaderMgr.skipPastSpaces();
- if (!scanAttValue(attNameRawBuf, fAttValueBuf))
- {
- static const XMLCh tmpList[] =
- {
- chCloseAngle, chOpenAngle, chForwardSlash, chNull
- };
- emitError(XMLErrs::ExpectedAttrValue);
- // It failed, so lets try to get synced back up. We skip
- // forward until we find some whitespace or one of the
- // chars in our list.
- const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);
- if ((chFound == chCloseAngle)
- || (chFound == chForwardSlash)
- || fReaderMgr.getCurrentReader()->isWhitespace(chFound))
- {
- // Just fall through and process this attribute, though
- // the value will be "".
- }
- else if (chFound == chOpenAngle)
- {
- // Assume a malformed tag and that new one is starting
- emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
- return false;
- }
- else
- {
- // Something went really wrong
- return false;
- }
- }
- // Add this attribute to the attribute list that we use to
- // pass them to the handler. We reuse its existing elements
- // but expand it as required.
- XMLAttr* curAtt;
- if (attCount >= curAttListSize)
- {
- curAtt = new (fMemoryManager) XMLAttr
- (
- -1
- , attNameRawBuf
- , XMLUni::fgZeroLenString
- , fAttValueBuf.getRawBuffer()
- , XMLAttDef::CData
- , true
- , fMemoryManager
- );
- fAttrList->addElement(curAtt);
- fAttrNameHashList->addElement(attNameHash);
- }
- else
- {
- curAtt = fAttrList->elementAt(attCount);
- curAtt->set
- (
- -1
- , attNameRawBuf
- , XMLUni::fgZeroLenString
- , fAttValueBuf.getRawBuffer()
- );
- curAtt->setSpecified(true);
- fAttrNameHashList->setElementAt(attNameHash, attCount);
- }
- attCount++;
- // And jump back to the top of the loop
- continue;
- }
- // It was some special case character so do all of the checks and
- // deal with it.
- if (!nextCh)
- ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
- if (nextCh == chForwardSlash)
- {
- fReaderMgr.getNextChar();
- isEmpty = true;
- if (!fReaderMgr.skippedChar(chCloseAngle))
- emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
- break;
- }
- else if (nextCh == chCloseAngle)
- {
- fReaderMgr.getNextChar();
- break;
- }
- else if (nextCh == chOpenAngle)
- {
- // Check for this one specially, since its going to be common
- // and it is kind of auto-recovering since we've already hit the
- // next open bracket, which is what we would have seeked to (and
- // skipped this whole tag.)
- emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
- break;
- }
- else if ((nextCh == chSingleQuote) || (nextCh == chDoubleQuote))
- {
- // Check for this one specially, which is probably a missing
- // attribute name, e.g. ="value". Just issue expected name
- // error and eat the quoted string, then jump back to the
- // top again.
- emitError(XMLErrs::ExpectedAttrName);
- fReaderMgr.getNextChar();
- fReaderMgr.skipQuotedString(nextCh);
- fReaderMgr.skipPastSpaces();
- continue;
- }
- }
- // If empty, validate content right now if we are validating and then
- // pop the element stack top. Else, we have to update the current stack
- // top's namespace mapping elements.
- if (isEmpty)
- {
- // Pop the element stack back off since it'll never be used now
- fElemStack.popTop();
- // If the elem stack is empty, then it was an empty root
- if (isRoot)
- gotData = false;
- }
- // If we have a document handler, then tell it about this start tag. We
- // don't have any URI id to send along, so send fEmptyNamespaceId. We also do not send
- // any prefix since its just one big name if we are not doing namespaces.
- if (fDocHandler)
- {
- fDocHandler->startElement
- (
- *elemDecl
- , fEmptyNamespaceId
- , 0
- , *fAttrList
- , attCount
- , isEmpty
- , isRoot
- );
- }
- return true;
- }
- // This method is called to scan a start tag when we are processing
- // namespaces. There are two different versions of this method, one for
- // namespace aware processing and one for non-namespace aware processing.
- //
- // This method is called after we've scanned the < of a start tag. So we
- // have to get the element name, then scan the attributes, after which
- // we are either going to see >, />, or attributes followed by one of those
- // sequences.
- bool WFXMLScanner::scanStartTagNS(bool& gotData)
- {
- // Scans one namespace-aware start tag whose '<' was already consumed: element name, then attributes, then '>' or '/>'.
- // Returns false when scanning bails out early on malformed input, true otherwise. gotData is cleared only for an empty root element.
- gotData = true;
- // The current position is just after the open bracket, so read the
- // element's qualified name into fQNameBuf.
- if (!fReaderMgr.getName(fQNameBuf))
- {
- emitError(XMLErrs::ExpectedElementName);
- fReaderMgr.skipToChar(chOpenAngle); // try to resynchronize on the next '<'
- return false;
- }
- // An empty element stack means this tag is the root element
- const bool isRoot = fElemStack.isEmpty();
- // Becomes true below only when the tag is closed with '/>'
- bool isEmpty = false;
- // Skip any whitespace after the name
- fReaderMgr.skipPastSpaces();
- // Look the element up by its raw qualified name; a declaration is set up on demand if none is registered yet
- const XMLCh* qnameRawBuf = fQNameBuf.getRawBuffer();
- XMLElementDecl* elemDecl = fElementLookup->get(qnameRawBuf);
- if (!elemDecl) {
- if (!XMLString::compareNString(qnameRawBuf, XMLUni::fgXMLNSColonString, 6)) // compareNString gave 0 for the first 6 chars against fgXMLNSColonString
- emitError(XMLErrs::NoXMLNSAsElementPrefix, qnameRawBuf);
- if (fElementIndex < fElements->size()) { // an already allocated declaration is available at this index: reuse it
- elemDecl = fElements->elementAt(fElementIndex);
- }
- else {
- elemDecl = new (fMemoryManager) DTDElementDecl
- (
- fMemoryManager
- );
- fElements->addElement(elemDecl); // keep the new declaration in fElements
- }
- elemDecl->setElementName(qnameRawBuf, fEmptyNamespaceId); // URI starts as fEmptyNamespaceId; the real one is resolved after the attribute loop
- fElementLookup->put((void*)elemDecl->getFullName(), elemDecl);
- fElementIndex++;
- }
- // Push a new element stack level for this element, tagged with the current reader number
- fElemStack.addLevel(elemDecl, fReaderMgr.getCurrentReaderNum());
- // Forget prefixed attributes deferred by a previous start tag
- fAttrNSList->removeAllElements();
- // Loop over attribute name/value pairs until the tag is closed by
- // '>' or '/>' (or until error recovery gives up).
- unsigned int attCount = 0; // attributes accepted so far for this tag
- unsigned int curAttListSize = fAttrList->size(); // size of fAttrList on entry; slots below it are reused instead of allocated
- while (true)
- {
- // Peek at the next character without consuming it
- XMLCh nextCh = fReaderMgr.peekNextChar();
- // Once at least one attribute has been scanned, anything other
- // than '/' or '>' must be preceded by whitespace, because
- // whitespace is required between the end of one attribute and the
- // name of the next one.
- if (attCount)
- {
- if ((nextCh != chForwardSlash) && (nextCh != chCloseAngle))
- {
- if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh))
- {
- // Ok, skip the whitespace and peek at the character after it
- fReaderMgr.skipPastSpaces();
- nextCh = fReaderMgr.peekNextChar();
- }
- else
- {
- // Emit the error but keep on going
- emitError(XMLErrs::ExpectedWhitespace);
- }
- }
- }
- // First check whether the reader classifies this as one of the
- // special start tag characters. If it does not, do the normal case
- // processing, which assumes an attribute name follows. Otherwise
- // fall to the special case checks at the bottom of the loop.
- if (!fReaderMgr.getCurrentReader()->isSpecialStartTagChar(nextCh))
- {
- // Assume its going to be an attribute, so get its name from
- // the input into fAttNameBuf.
- if (!fReaderMgr.getName(fAttNameBuf))
- {
- emitError(XMLErrs::ExpectedAttrName);
- fReaderMgr.skipPastChar(chCloseAngle); // give up on this tag and skip beyond its '>'
- return false;
- }
- // The name must be followed by an equal sign
- if (!scanEq())
- {
- static const XMLCh tmpList[] =
- {
- chSingleQuote, chDoubleQuote, chCloseAngle
- , chOpenAngle, chForwardSlash, chNull
- };
- emitError(XMLErrs::ExpectedEqSign);
- // Try to sync back up by skipping forward until we hit
- // whitespace or one of the characters in tmpList.
- const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);
- if ((chFound == chCloseAngle) || (chFound == chForwardSlash))
- {
- // Jump back to the top so the tag end is handled normally; this attribute is dropped
- continue;
- }
- else if ((chFound == chSingleQuote)
- || (chFound == chDoubleQuote)
- || fReaderMgr.getCurrentReader()->isWhitespace(chFound))
- {
- // Just fall through assuming that the value is to follow
- }
- else if (chFound == chOpenAngle)
- {
- // Assume a malformed tag and that a new one is starting
- emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
- return false;
- }
- else
- {
- // None of the expected characters was found: give up
- return false;
- }
- }
- // Report the attribute if its name was already used in this start tag; stored hashes are compared first, full names only on a hash match.
- const XMLCh* attNameRawBuf = fAttNameBuf.getRawBuffer();
- unsigned int attNameHash = XMLString::hash(attNameRawBuf, 109); // 109: presumably the hash modulus - TODO confirm
- if (attCount) {
- for (unsigned int k=0; k < attCount; k++) {
- if (fAttrNameHashList->elementAt(k) == attNameHash) {
- if (XMLString::equals(
- fAttrList->elementAt(k)->getQName()
- , attNameRawBuf))
- {
- emitError
- (
- XMLErrs::AttrAlreadyUsedInSTag
- , attNameRawBuf
- , qnameRawBuf
- );
- break; // one report is enough; the duplicate is still processed below
- }
- }
- }
- }
- // Skip any whitespace before the value and then scan the value
- // into fAttValueBuf. scanAttValue returns false when the value
- // could not be scanned, in which case recovery is attempted.
- fReaderMgr.skipPastSpaces();
- if (!scanAttValue(attNameRawBuf, fAttValueBuf))
- {
- static const XMLCh tmpList[] =
- {
- chCloseAngle, chOpenAngle, chForwardSlash, chNull
- };
- emitError(XMLErrs::ExpectedAttrValue);
- // It failed, so try to get synced back up. Skip
- // forward until we find some whitespace or one of the
- // chars in tmpList.
- const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);
- if ((chFound == chCloseAngle)
- || (chFound == chForwardSlash)
- || fReaderMgr.getCurrentReader()->isWhitespace(chFound))
- {
- // Just fall through and process this attribute with
- // whatever scanAttValue left in fAttValueBuf.
- }
- else if (chFound == chOpenAngle)
- {
- // Assume a malformed tag and that a new one is starting
- emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
- return false;
- }
- else
- {
- // None of the expected characters was found: give up
- return false;
- }
- }
- // Store this attribute in fAttrList, the list that is later
- // handed to the document handler. Existing XMLAttr objects are
- // reused; new ones are allocated only beyond the entry size.
- const XMLCh* attValueRawBuf = fAttValueBuf.getRawBuffer();
- XMLAttr* curAtt = 0;
- if (attCount >= curAttListSize)
- {
- curAtt = new (fMemoryManager) XMLAttr
- (
- fEmptyNamespaceId
- , attNameRawBuf
- , attValueRawBuf
- , XMLAttDef::CData
- , true
- , fMemoryManager
- );
- fAttrList->addElement(curAtt);
- fAttrNameHashList->addElement(attNameHash); // hash list is kept parallel to fAttrList
- }
- else
- {
- curAtt = fAttrList->elementAt(attCount);
- curAtt->set
- (
- fEmptyNamespaceId
- , attNameRawBuf
- , attValueRawBuf
- );
- curAtt->setSpecified(true);
- fAttrNameHashList->setElementAt(attNameHash, attCount);
- }
- // Make sure that the name is well formed under the namespace
- // rules: it either has no colon, or it has exactly one which
- // is neither the first nor the last character.
- const int colonFirst = XMLString::indexOf(attNameRawBuf, chColon);
- if (colonFirst != -1)
- {
- const int colonLast = XMLString::lastIndexOf(attNameRawBuf, chColon);
- if (colonFirst != colonLast)
- {
- emitError(XMLErrs::TooManyColonsInName);
- continue; // NOTE(review): skips attCount++ although the slot above was already filled/appended - confirm intended
- }
- else if ((colonFirst == 0)
- || (colonLast == (int)fAttNameBuf.getLen() - 1))
- {
- emitError(XMLErrs::InvalidColonPos);
- continue; // NOTE(review): same as above, the attribute is not counted
- }
- }
- // Namespace handling: register xmlns declarations on the element stack, tag xml-prefixed attributes, and defer every other prefixed attribute
- const XMLCh* attPrefix = curAtt->getPrefix();
- const XMLCh* attLocalName = curAtt->getName();
- const XMLCh* namespaceURI = fAttValueBuf.getRawBuffer(); // the attribute value, read as a URI when this is a namespace declaration
- if (attPrefix && *attPrefix) {
- if (XMLString::equals(attPrefix, XMLUni::fgXMLString)) {
- curAtt->setURIId(fXMLNamespaceId);
- }
- else if (XMLString::equals(attPrefix, XMLUni::fgXMLNSString)) {
- if (XMLString::equals(attLocalName, XMLUni::fgXMLNSString))
- emitError(XMLErrs::NoUseOfxmlnsAsPrefix);
- else if (XMLString::equals(attLocalName, XMLUni::fgXMLString)) {
- if (!XMLString::equals(namespaceURI, XMLUni::fgXMLURIName))
- emitError(XMLErrs::PrefixXMLNotMatchXMLURI);
- }
- if (!namespaceURI) // NOTE(review): namespaceURI comes from fAttValueBuf.getRawBuffer(); confirm it can ever be null
- emitError(XMLErrs::NoEmptyStrNamespace, attNameRawBuf);
- else if(!*namespaceURI && fXMLVersion == XMLReader::XMLV1_0) // an empty URI for a prefixed declaration is reported only for XML 1.0
- emitError(XMLErrs::NoEmptyStrNamespace, attNameRawBuf);
- fElemStack.addPrefix
- (
- attLocalName
- , fURIStringPool->addOrFind(namespaceURI)
- );
- curAtt->setURIId(fXMLNSNamespaceId);
- }
- else {
- fAttrNSList->addElement(curAtt); // any other prefix: resolved after the loop
- }
- }
- else {
- if (XMLString::equals(XMLUni::fgXMLNSString, attLocalName)) { // unprefixed attribute named fgXMLNSString: default namespace declaration
- if (XMLString::equals(namespaceURI, XMLUni::fgXMLNSURIName))
- emitError(XMLErrs::NoUseOfxmlnsURI);
- else if (XMLString::equals(namespaceURI, XMLUni::fgXMLURIName))
- emitError(XMLErrs::XMLURINotMatchXMLPrefix);
- fElemStack.addPrefix
- (
- XMLUni::fgZeroLenString
- , fURIStringPool->addOrFind(namespaceURI)
- );
- }
- }
- // The attribute is accepted: count it
- attCount++;
- // And jump back to the top of the loop
- continue;
- }
- // It was one of the special start tag characters, so work out
- // which one and deal with it.
- if (!nextCh)
- ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
- if (nextCh == chForwardSlash)
- {
- fReaderMgr.getNextChar(); // consume the '/'
- isEmpty = true;
- if (!fReaderMgr.skippedChar(chCloseAngle))
- emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
- break;
- }
- else if (nextCh == chCloseAngle)
- {
- fReaderMgr.getNextChar(); // consume the '>'
- break;
- }
- else if (nextCh == chOpenAngle)
- {
- // Check for this one specially, since its going to be common
- // and it is kind of auto-recovering: the '<' is only peeked,
- // not consumed, so scanning can resume at the next tag once
- // this one has been reported as unterminated.
- emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
- break;
- }
- else if ((nextCh == chSingleQuote) || (nextCh == chDoubleQuote))
- {
- // Check for this one specially, which is probably a missing
- // attribute name, e.g. ="value". Just issue the expected name
- // error, consume the quote and skip the quoted string, then
- // jump back to the top again.
- emitError(XMLErrs::ExpectedAttrName);
- fReaderMgr.getNextChar();
- fReaderMgr.skipQuotedString(nextCh);
- fReaderMgr.skipPastSpaces();
- continue;
- }
- }
- // Resolve the prefixes of the attributes deferred in fAttrNSList, now that every xmlns declaration of this tag has been registered
- for (unsigned int i=0; i < fAttrNSList->size(); i++) {
- XMLAttr* providedAttr = fAttrNSList->elementAt(i);
- providedAttr->setURIId
- (
- resolvePrefix
- (
- providedAttr->getPrefix(),
- ElemStack::Mode_Attribute
- )
- );
- }
- // Resolve the element's own prefix to a URI id.
- unsigned int uriId = resolvePrefix
- (
- elemDecl->getElementName()->getPrefix()
- , ElemStack::Mode_Element
- );
- // Record the resolved URI on the current element stack level
- fElemStack.setCurrentURI(uriId);
- // Tell the document handler about this start tag; only the first attCount entries of fAttrList belong to it
- if (fDocHandler)
- {
- fDocHandler->startElement
- (
- *elemDecl
- , uriId
- , elemDecl->getElementName()->getPrefix()
- , *fAttrList
- , attCount
- , false
- , isRoot
- );
- }
- // If the tag was closed with '/>', the element has no content and no
- // separate end tag: pop its stack level and report the end of the
- // element right away.
- if (isEmpty)
- {
- // Pop the element stack back off since it'll never be used now
- fElemStack.popTop();
- // If we have a doc handler, tell it about the end of the element
- if (fDocHandler)
- {
- fDocHandler->endElement
- (
- *elemDecl
- , uriId
- , isRoot
- , elemDecl->getElementName()->getPrefix()
- );
- }
- // An empty root element means there is no further data to scan
- if (isRoot)
- gotData = false;
- }
- return true;
- }
- // Splits a qualified name at its first colon and maps the prefix part
- // to a namespace URI id.
- //
- //  qName           The qualified name to resolve.
- //  prefixBuf       Receives the prefix; emptied when there is no colon.
- //  mode            An ElemStack::MapModes value, passed as a short.
- //  prefixColonPos  Receives the index of the first colon, or -1.
- //
- // Returns the URI id for the prefix. An unprefixed name is resolved by
- // mapping the empty prefix through the element stack.
- unsigned int
- WFXMLScanner::resolveQName(const XMLCh* const qName
- , XMLBuffer& prefixBuf
- , const short mode
- , int& prefixColonPos)
- {
-     prefixColonPos = XMLString::indexOf(qName, chColon);
-
-     // No colon: the whole thing is a local name. Hand back an empty
-     // prefix and let the element stack decide what the empty prefix
-     // (the default namespace) currently maps to.
-     if (prefixColonPos == -1)
-     {
-         bool notMapped = false;
-         prefixBuf.reset();
-         return fElemStack.mapPrefixToURI
-         (
-             XMLUni::fgZeroLenString
-             , (ElemStack::MapModes) mode
-             , notMapped
-         );
-     }
-
-     // Everything up to, but not including, the colon is the prefix.
-     prefixBuf.set(qName, prefixColonPos);
-     const XMLCh* rawPrefix = prefixBuf.getRawBuffer();
-
-     // The two reserved prefixes never go through the element stack:
-     // each one has a fixed URI id of its own.
-     if (XMLString::equals(rawPrefix, XMLUni::fgXMLNSString))
-     {
-         // xmlns is not allowed as the prefix of an element name
-         if (mode == ElemStack::Mode_Element)
-             emitError(XMLErrs::NoXMLNSAsElementPrefix, qName);
-         return fXMLNSNamespaceId;
-     }
-     if (XMLString::equals(rawPrefix, XMLUni::fgXMLString))
-         return fXMLNamespaceId;
-
-     // Any other prefix must have been declared; report it if the
-     // element stack does not know it, but still return what we got.
-     bool notMapped = false;
-     const unsigned int mappedId = fElemStack.mapPrefixToURI
-     (
-         rawPrefix
-         , (ElemStack::MapModes) mode
-         , notMapped
-     );
-     if (notMapped)
-         emitError(XMLErrs::UnknownPrefix, rawPrefix);
-     return mappedId;
- }
- // ---------------------------------------------------------------------------
- // WFXMLScanner: Private parsing methods
- // ---------------------------------------------------------------------------
- // Scans a quoted attribute value into toFill.
- //
- //  attrName  Name of the attribute; used only in error messages.
- //  toFill    Reset on entry, then filled with the value. Characters
- //            delivered by scanEntityRef() are stored as returned;
- //            every other whitespace character is stored as a single
- //            space.
- //
- // Returns true when the closing quote is found in the same reader that
- // held the opening quote. Returns false when there is no opening quote,
- // or when the closing quote turns up in a lower-numbered reader (an
- // error is emitted in that case). Throws UnexpectedEOFException if the
- // input ends inside the value.
- bool WFXMLScanner::scanAttValue(const XMLCh* const attrName
- , XMLBuffer& toFill)
- {
-     // Reset the target buffer
-     toFill.reset();
-
-     // The value must open with a single or double quote
-     XMLCh quoteCh;
-     if (!fReaderMgr.skipIfQuote(quoteCh))
-         return false;
-
-     // Remember the reader that held the opening quote: a matching quote
-     // character only terminates the value when it comes from this reader.
-     const unsigned int curReader = fReaderMgr.getCurrentReaderNum();
-
-     // Loop until we get the attribute value. Note that we use a double
-     // loop here to avoid the setup/teardown overhead of the exception
-     // handler on every round.
-     XMLCh   nextCh;
-     XMLCh   secondCh = 0;
-     bool    gotLeadingSurrogate = false;
-     bool    escaped = false;
-     while (true)
-     {
-         try
-         {
-             while(true)
-             {
-                 nextCh = fReaderMgr.getNextChar();
-                 if (!nextCh)
-                     ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
-
-                 // Check for our ending quote in the same entity
-                 if (nextCh == quoteCh)
-                 {
-                     if (curReader == fReaderMgr.getCurrentReaderNum())
-                         return true;
-
-                     // Watch for spillover into a previous entity
-                     if (curReader > fReaderMgr.getCurrentReaderNum())
-                     {
-                         emitError(XMLErrs::PartialMarkupInEntity);
-                         return false;
-                     }
-                 }
-
-                 // Per-character state. secondCh has to be cleared here:
-                 // only scanEntityRef() ever sets it, so without this a
-                 // trailing surrogate produced by an earlier character
-                 // reference would be appended again after every later
-                 // ordinary character.
-                 escaped = false;
-                 secondCh = 0;
-
-                 // Check for a reference now, before it can affect the
-                 // whitespace normalization logic below.
-                 if (nextCh == chAmpersand)
-                 {
-                     // Nothing was returned directly: go get the next char
-                     if (scanEntityRef(true, nextCh, secondCh, escaped) != EntityExp_Returned)
-                     {
-                         gotLeadingSurrogate = false;
-                         continue;
-                     }
-                 }
-                 else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
-                 {
-                     // Its a leading surrogate. If we already got one, then
-                     // issue an error, else set the leading flag to make
-                     // sure that we look for a trailing one next time.
-                     if (gotLeadingSurrogate)
-                     {
-                         emitError(XMLErrs::Expected2ndSurrogateChar);
-                     }
-                     else
-                         gotLeadingSurrogate = true;
-                 }
-                 else
-                 {
-                     // If its a trailing surrogate, make sure that we are
-                     // prepared for that. Else, its just a regular char so
-                     // make sure that we were not expecting a trailing
-                     // surrogate.
-                     if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
-                     {
-                         // Its trailing, so make sure we were expecting it
-                         if (!gotLeadingSurrogate)
-                             emitError(XMLErrs::Unexpected2ndSurrogateChar);
-                     }
-                     else
-                     {
-                         // Its just a char, so make sure we were not
-                         // expecting a trailing surrogate.
-                         if (gotLeadingSurrogate) {
-                             emitError(XMLErrs::Expected2ndSurrogateChar);
-                         }
-                         // Its got to at least be a valid XML character
-                         else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh))
-                         {
-                             // Report the offending code unit in hex
-                             XMLCh tmpBuf[9];
-                             XMLString::binToText
-                             (
-                                 nextCh
-                                 , tmpBuf
-                                 , 8
-                                 , 16
-                             );
-                             emitError(XMLErrs::InvalidCharacterInAttrValue, attrName, tmpBuf);
-                         }
-                     }
-                     gotLeadingSurrogate = false;
-                 }
-
-                 // If its not escaped, then make sure its not a < character,
-                 // which is not allowed in attribute values, and normalize
-                 // any whitespace character to a plain space.
-                 if (!escaped) {
-                     if (nextCh == chOpenAngle)
-                         emitError(XMLErrs::BracketInAttrValue, attrName);
-                     else if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh))
-                         nextCh = chSpace;
-                 }
-
-                 // Add it to the buffer, followed by the second code unit
-                 // if scanEntityRef() delivered one for this character.
-                 toFill.append(nextCh);
-                 if (secondCh)
-                     toFill.append(secondCh);
-             }
-         }
-         catch(const EndOfEntityException&)
-         {
-             // Just eat it and continue.
-             gotLeadingSurrogate = false;
-             escaped = false;
-         }
-     }
-     // Not reached: the loops above only exit by return or exception.
-     return true;
- }
- // This method scans a CDATA section. It collects the characters into
- // one of the temp buffers and, once the closing ]]> has been seen,
- // passes them to the document handler, if any, in a single
- // docCharacters() call. It assumes that the <![CDATA string has been
- // scanned before this call, so the opening [ is expected next.
- //
- // Markup characters get no special treatment inside the section, but
- // surrogate pairing and XML character validity are still checked, up to
- // the first invalid character reported. Throws UnexpectedEOFException
- // if the input ends before the closing sequence.
- void WFXMLScanner::scanCDSection()
- {
-     // What must follow a ']' for it to close the section: together they
-     // form the ]]> terminator.
-     static const XMLCh CDataClose[] =
-     {
-             chCloseSquare, chCloseAngle, chNull
-     };
-
-     // The next character should be the opening square bracket. If not
-     // issue an error, but then try to recover by skipping any whitespace
-     // and checking again.
-     if (!fReaderMgr.skippedChar(chOpenSquare))
-     {
-         emitError(XMLErrs::ExpectedOpenSquareBracket);
-         fReaderMgr.skipPastSpaces();
-
-         // If we still don't find it, then give up, else keep going
-         if (!fReaderMgr.skippedChar(chOpenSquare))
-             return;
-     }
-
-     // Get a buffer for this
-     XMLBufBid bbCData(&fBufMgr);
-
-     // We just scan forward until we hit the end of CDATA section sequence.
-     // CDATA is effectively a big escape mechanism so we don't treat markup
-     // characters specially here.
-     bool emittedError = false;
-     bool gotLeadingSurrogate = false;
-     while (true)
-     {
-         const XMLCh nextCh = fReaderMgr.getNextChar();
-
-         // Watch for unexpected end of file
-         if (!nextCh)
-         {
-             emitError(XMLErrs::UnterminatedCDATASection);
-             ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
-         }
-
-         // If this is a close square bracket it could be our closing
-         // sequence.
-         if (nextCh == chCloseSquare && fReaderMgr.skippedString(CDataClose))
-         {
-             // make sure we were not expecting a trailing surrogate.
-             if (gotLeadingSurrogate)
-                 emitError(XMLErrs::Expected2ndSurrogateChar);
-
-             // If we have a doc handler, call it (the third argument is
-             // presumably the "this is CDATA" flag)
-             if (fDocHandler)
-             {
-                 fDocHandler->docCharacters
-                 (
-                     bbCData.getRawBuffer()
-                     , bbCData.getLen()
-                     , true
-                 );
-             }
-
-             // And we are done
-             break;
-         }
-
-         // Make sure its a valid character. But if we've emitted an error
-         // already, don't bother with the overhead since we've already told
-         // them about it.
-         if (!emittedError)
-         {
-             // Deal with surrogate pairs
-             if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
-             {
-                 // Its a leading surrogate. If we already got one, then
-                 // issue an error, else set leading flag to make sure that
-                 // we look for a trailing next time.
-                 if (gotLeadingSurrogate)
-                     emitError(XMLErrs::Expected2ndSurrogateChar);
-                 else
-                     gotLeadingSurrogate = true;
-             }
-             else
-             {
-                 // If its a trailing surrogate, make sure that we are
-                 // prepared for that. Else, its just a regular char so make
-                 // sure that we were not expecting a trailing surrogate.
-                 if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
-                 {
-                     // Its trailing, so make sure we were expecting it
-                     if (!gotLeadingSurrogate)
-                         emitError(XMLErrs::Unexpected2ndSurrogateChar);
-                 }
-                 else
-                 {
-                     // Its just a char, so make sure we were not expecting a
-                     // trailing surrogate.
-                     if (gotLeadingSurrogate)
-                         emitError(XMLErrs::Expected2ndSurrogateChar);
-
-                     // Its got to at least be a valid XML character
-                     else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh))
-                     {
-                         // Report the offending code unit in hex, once
-                         XMLCh tmpBuf[9];
-                         XMLString::binToText
-                         (
-                             nextCh
-                             , tmpBuf
-                             , 8
-                             , 16
-                         );
-                         emitError(XMLErrs::InvalidCharacter, tmpBuf);
-                         emittedError = true;
-                     }
-                 }
-                 gotLeadingSurrogate = false;
-             }
-         }
-
-         // Add it to the buffer
-         bbCData.append(nextCh);
-     }
- }
- // Scans character data up to the next '<' and passes it on through
- // sendCharData().
- //
- //  toUse  Working buffer; reset on entry and used to accumulate the
- //         characters between flushes.
- //
- // Accumulated characters are flushed before each reference is scanned,
- // whenever an entity ends (the document handler, if any, then also gets
- // endEntityReference()), and once more at the end. The ]]> sequence is
- // reported as an error unless one of its characters came from a
- // reference.
- void WFXMLScanner::scanCharData(XMLBuffer& toUse)
- {
-     // We have to watch for the stupid ]]> sequence, which is illegal in
-     // character data. So this is a little state machine that handles that.
-     enum States
-     {
-         State_Waiting
-         , State_GotOne
-         , State_GotTwo
-     };
-
-     // Reset the buffer before we start
-     toUse.reset();
-
-     // Turn on the 'throw at end' flag of the reader manager
-     ThrowEOEJanitor jan(&fReaderMgr, true);
-
-     // In order to be more efficient we have to use kind of a deeply nested
-     // set of blocks here. The outer block puts on a try and catches end of
-     // entity exceptions. The inner loop is the per-character loop. If we
-     // put the try inside the inner loop, it would work but would require
-     // the exception handling code setup/teardown code to be invoked for
-     // each character.
-     XMLCh   nextCh;
-     XMLCh   secondCh = 0;
-     States  curState = State_Waiting;
-     bool    escaped = false;
-     bool    gotLeadingSurrogate = false;
-     bool    notDone = true;
-     while (notDone)
-     {
-         try
-         {
-             while (true)
-             {
-                 // Eat through as many plain content characters as possible
-                 // without needing special handling. Moving most content
-                 // characters here, in this one call, rather than running
-                 // the overall loop once per content character, is a speed
-                 // optimization.
-                 if (curState == State_Waiting && !gotLeadingSurrogate)
-                 {
-                     fReaderMgr.movePlainContentChars(toUse);
-                 }
-
-                 // Try to get another char from the source. The code from
-                 // here on down covers all contingencies.
-                 if (!fReaderMgr.getNextCharIfNot(chOpenAngle, nextCh))
-                 {
-                     // If we were waiting for a trailing surrogate, its an error
-                     if (gotLeadingSurrogate)
-                         emitError(XMLErrs::Expected2ndSurrogateChar);
-
-                     notDone = false;
-                     break;
-                 }
-
-                 // Per-character state. secondCh has to be cleared here:
-                 // only scanEntityRef() ever sets it, so without this a
-                 // trailing surrogate produced by an earlier character
-                 // reference would be appended again after every later
-                 // character handled by this loop.
-                 escaped = false;
-                 secondCh = 0;
-
-                 // Watch for a reference.
-                 if (nextCh == chAmpersand)
-                 {
-                     // Flush what we have so far before scanning the reference
-                     sendCharData(toUse);
-
-                     // Turn off the throwing at the end of entity during
-                     // this. Named differently from the outer janitor so
-                     // that it does not shadow it.
-                     ThrowEOEJanitor refJan(&fReaderMgr, false);
-
-                     // Nothing was returned directly: go get the next char
-                     if (scanEntityRef(false, nextCh, secondCh, escaped) != EntityExp_Returned)
-                     {
-                         gotLeadingSurrogate = false;
-                         continue;
-                     }
-                 }
-                 else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
-                 {
-                     // Its a leading surrogate. If we already got one, then
-                     // issue an error, else set the leading flag to make
-                     // sure that we look for a trailing one next time.
-                     if (gotLeadingSurrogate)
-                     {
-                         emitError(XMLErrs::Expected2ndSurrogateChar);
-                     }
-                     else
-                         gotLeadingSurrogate = true;
-                 }
-                 else
-                 {
-                     // If its a trailing surrogate, make sure that we are
-                     // prepared for that. Else, its just a regular char so
-                     // make sure that we were not expecting a trailing
-                     // surrogate.
-                     if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
-                     {
-                         // Its trailing, so make sure we were expecting it
-                         if (!gotLeadingSurrogate)
-                             emitError(XMLErrs::Unexpected2ndSurrogateChar);
-                     }
-                     else
-                     {
-                         // Its just a char, so make sure we were not
-                         // expecting a trailing surrogate.
-                         if (gotLeadingSurrogate) {
-                             emitError(XMLErrs::Expected2ndSurrogateChar);
-                         }
-                         // Its got to at least be a valid XML character
-                         else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh))
-                         {
-                             // Report the offending code unit in hex
-                             XMLCh tmpBuf[9];
-                             XMLString::binToText
-                             (
-                                 nextCh
-                                 , tmpBuf
-                                 , 8
-                                 , 16
-                             );
-                             emitError(XMLErrs::InvalidCharacter, tmpBuf);
-                         }
-                     }
-                     gotLeadingSurrogate = false;
-                 }
-
-                 // Keep the ]]> state machine up to date. Characters that
-                 // came from a reference never take part in the sequence.
-                 if (!escaped)
-                 {
-                     if (nextCh == chCloseSquare)
-                     {
-                         if (curState == State_Waiting)
-                             curState = State_GotOne;
-                         else if (curState == State_GotOne)
-                             curState = State_GotTwo;
-                     }
-                     else if (nextCh == chCloseAngle)
-                     {
-                         if (curState == State_GotTwo)
-                             emitError(XMLErrs::BadSequenceInCharData);
-                         curState = State_Waiting;
-                     }
-                     else
-                     {
-                         curState = State_Waiting;
-                     }
-                 }
-                 else
-                 {
-                     curState = State_Waiting;
-                 }
-
-                 // Add this char to the buffer, followed by the second code
-                 // unit if scanEntityRef() delivered one for it.
-                 toUse.append(nextCh);
-                 if (secondCh)
-                     toUse.append(secondCh);
-             }
-         }
-         catch(const EndOfEntityException& toCatch)
-         {
-             // Some entity ended, so we have to send any accumulated
-             // chars and send an end of entity event.
-             sendCharData(toUse);
-             gotLeadingSurrogate = false;
-
-             if (fDocHandler)
-                 fDocHandler->endEntityReference(toCatch.getEntity());
-         }
-     }
-
-     // Send any char data that we accumulated into the buffer
-     sendCharData(toUse);
- }
- // Scans a character reference or a general entity reference. Both kinds
- // are expanded on the spot: a character reference through scanCharRef(),
- // a named reference through a lookup in fEntityTable.
- //
- //  inAttVal  Not consulted by this implementation.
- //  firstCh   Receives the expanded character (or its first code unit).
- //  secondCh  Cleared on entry; only scanCharRef() may store a second
- //            code unit here.
- //  escaped   Cleared on entry; set to true whenever a value is returned.
- //
- // Returns EntityExp_Returned when firstCh/secondCh hold the expansion,
- // EntityExp_Failed otherwise.
- XMLScanner::EntityExpRes
- WFXMLScanner::scanEntityRef(const bool inAttVal
- , XMLCh& firstCh
- , XMLCh& secondCh
- , bool& escaped)
- {
-     // Start from a clean slate
-     escaped = false;
-     secondCh = 0;
-
-     // The whole reference has to come from the reader we start on
-     const unsigned int startReaderNum = fReaderMgr.getCurrentReaderNum();
-
-     // A pound sign right away makes this a character reference
-     if (fReaderMgr.skippedChar(chPound))
-     {
-         const bool parsed = scanCharRef(firstCh, secondCh);
-         if (parsed)
-         {
-             escaped = true;
-             if (fReaderMgr.getCurrentReaderNum() != startReaderNum)
-                 emitError(XMLErrs::PartialMarkupInEntity);
-             return EntityExp_Returned;
-         }
-         return EntityExp_Failed;
-     }
-
-     // Otherwise a name has to follow
-     XMLBufBid nameBid(&fBufMgr);
-     if (!fReaderMgr.getName(nameBid.getBuffer()))
-     {
-         emitError(XMLErrs::ExpectedEntityRefName);
-         return EntityExp_Failed;
-     }
-     const XMLCh* const entityName = nameBid.getRawBuffer();
-
-     // A missing semi-colon is reported, but we carry on regardless
-     if (!fReaderMgr.skippedChar(chSemiColon))
-         emitError(XMLErrs::UnterminatedEntityRef, entityName);
-
-     // Did the reference end in a different reader than the one it began in?
-     if (fReaderMgr.getCurrentReaderNum() != startReaderNum)
-         emitError(XMLErrs::PartialMarkupInEntity);
-
-     const bool knownEntity = fEntityTable->containsKey(entityName);
-     if (!knownEntity)
-     {
-         // XML 1.0 Section 4.1
-         // Well-formedness Constraint for entity not found:
-         // in a document without any DTD, a document with only an internal
-         // DTD subset which contains no parameter entity references, or a
-         // document with "standalone='yes'", for an entity reference that
-         // does not occur within the external subset or a parameter entity.
-         // In every other situation the failure is silent.
-         if (fStandalone || fHasNoDTD)
-             emitError(XMLErrs::EntityNotFound, entityName);
-         return EntityExp_Failed;
-     }
-
-     // Expansions are only counted when a SecurityManager is installed
-     if (fSecurityManager != 0)
-     {
-         if (++fEntityExpansionCount > fEntityExpansionLimit)
-         {
-             XMLCh limitText[16];
-             XMLString::binToText(fEntityExpansionLimit, limitText, 15, 10);
-             emitError
-             (
-                 XMLErrs::EntityExpansionLimitExceeded
-                 , limitText
-             );
-
-             // Nothing better to do than to start counting from scratch
-             fEntityExpansionCount = 0;
-         }
-     }
-
-     // Hand back the character stored for this entity
-     firstCh = fEntityTable->get(entityName);
-     escaped = true;
-     return EntityExp_Returned;
- }
- // ---------------------------------------------------------------------------
- // WFXMLScanner: Grammar preparsing
- // ---------------------------------------------------------------------------
- // Stub: all three arguments are ignored and no grammar is ever
- // produced, so the result is always a null pointer.
- Grammar* WFXMLScanner::loadGrammar(const InputSource&, const short, const bool)
- {
-     // REVISIT: emit a warning or throw an exception
-     return 0;
- }
- XERCES_CPP_NAMESPACE_END