DTDScanner.cpp
上传用户:zhuqijet
上传日期:2013-06-25
资源大小:10074k
文件大小:134k
- /*
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 1999-2001 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Xerces" and "Apache Software Foundation" must
- * not be used to endorse or promote products derived from this
- * software without prior written permission. For written
- * permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * nor may "Apache" appear in their name, without prior written
- * permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation, and was
- * originally based on software copyright (c) 1999, International
- * Business Machines, Inc., http://www.ibm.com . For more information
- * on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
- /*
- * $Log: DTDScanner.cpp,v $
- * Revision 1.27 2003/05/18 14:02:06 knoaman
- * Memory manager implementation: pass per instance manager.
- *
- * Revision 1.26 2003/05/16 21:43:19 knoaman
- * Memory manager implementation: Modify constructors to pass in the memory manager.
- *
- * Revision 1.25 2003/05/15 18:54:50 knoaman
- * Partial implementation of the configurable memory manager.
- *
- * Revision 1.24 2003/03/10 15:28:07 tng
- * XML1.0 Errata E38
- *
- * Revision 1.23 2003/02/05 22:07:09 tng
- * [Bug 3111] Problem with LexicalHandler::startDTD() and LexicalHandler::endDTD().
- *
- * Revision 1.22 2003/01/20 22:01:38 tng
- * Need to check text decl when expanding PE
- *
- * Revision 1.21 2003/01/16 21:30:14 tng
- * [Bug 16151] Memory leak in DTDScanner with ill-formed DTD declaration. Fix by David Bertoni.
- *
- * Revision 1.20 2002/12/24 16:12:19 tng
- * For performance reason, move the character check to scancharref.
- *
- * Revision 1.19 2002/12/20 22:10:47 tng
- * XML 1.1
- *
- * Revision 1.18 2002/12/18 14:17:55 gareth
- * Fix to bug #13438. When you want a vector that calls delete[] on its members you should use RefArrayVectorOf.
- *
- * Revision 1.17 2002/12/04 02:47:25 knoaman
- * scanner re-organization.
- *
- * Revision 1.16 2002/11/14 22:34:11 tng
- * [Bug 14265] Access violation with Null systemId/publicId in DTDScanner
- *
- * Revision 1.15 2002/11/05 21:40:36 tng
- * Oasis test fix:
- * 1. Should check if content model allow character for CDataSection case
- * 2. Should check partial markup in entity for INCLUDE and IGNORE scenario
- * 3. If standalone is yes, reference to entity where its declaration is external is a well-formedness fatal error (XML 1.0 Section 4.1)
- * If standalone is yes, reference to parameter entity where its declaration is external is a validity constraint (XML 1.0 Section 2.9)
- * 4. XML 1.0 Section 2.8 Partial markup in parameter entity reference.
- * If it is a complete declaration, partial markup is a fatal error.
- *
- * Revision 1.14 2002/11/04 14:50:40 tng
- * C++ Namespace Support.
- *
- * Revision 1.13 2002/09/24 20:10:30 tng
- * Performance: use XMLString::equals instead of XMLString::compareString
- *
- * Revision 1.12 2002/08/22 21:05:29 tng
- * [Bug 7475] Xerces-C++ reports validation error with Docbook.
- *
- * Revision 1.11 2002/08/22 20:26:01 tng
- * [Bug 7512] Wrong error message created .
- *
- * Revision 1.10 2002/08/22 19:29:13 tng
- * [Bug 11448] DomCount has problems with XHTML1.1 DTD.
- *
- * Revision 1.9 2002/08/19 14:40:31 tng
- * Fix: public id / system id in entity decl should be null if empty
- *
- * Revision 1.8 2002/07/26 13:33:44 knoaman
- * Public/System id for notations should be stored as NULL if missing.
- *
- * Revision 1.7 2002/07/11 18:39:48 knoaman
- * Access entities through the DTDGrammar instead of the scanner.
- *
- * Revision 1.6 2002/06/06 20:36:33 tng
- * Fix: Valid encoding name is not checked in scanning Text Decl
- *
- * Revision 1.5 2002/05/30 16:17:19 tng
- * Add feature to optionally ignore external DTD.
- *
- * Revision 1.4 2002/05/03 14:51:16 peiyongz
- * Bug#8769: UMR detected by memory tool - patch from Kenneth Palsson
- *
- * Revision 1.3 2002/02/28 22:34:36 peiyongz
- * Bug#2717: patch to Unterminated INCLUDE section causes infinite loop with setExitOnFirstFatalError(false)
- *
- * Revision 1.2 2002/02/26 21:06:53 knoaman
- * Create ZeroOrOne node only if needed.
- *
- * Revision 1.1.1.1 2002/02/01 22:22:44 peiyongz
- * sane_include
- *
- * Revision 1.25 2002/01/24 16:30:50 tng
- * [Bug 3111] Problem with LexicalHandler::startDTD() and LexicalHandler::endDTD() .
- *
- * Revision 1.24 2001/12/17 15:39:14 knoaman
- * Fix for surrogate pair support.
- *
- * Revision 1.23 2001/12/14 20:21:37 knoaman
- * Add surrogate support to comments and processing instructions.
- *
- * Revision 1.22 2001/12/06 17:51:18 tng
- * Performance Enhancement. The ContentSpecNode constructor always copied the QName
- * that was passed to it. Added a second constructor that allows the QName to be just assigned, not copied.
- * That was because there are some cases in which a temporary QName was constructed, passed to ContentSpecNode, and then deleted.
- * There were examples of that in TraverseSchema and DTDScanner.
- * By Henry Zongaro.
- *
- * Revision 1.21 2001/11/13 13:27:28 tng
- * Move root element check to XMLScanner.
- *
- * Revision 1.20 2001/09/05 20:49:10 knoaman
- * Fix for complexTypes with mixed content model.
- *
- * Revision 1.19 2001/08/02 16:54:39 tng
- * Reset some Scanner flags in scanReset().
- *
- * Revision 1.18 2001/07/13 16:57:11 tng
- * ScanId fix.
- *
- * Revision 1.17 2001/07/12 20:10:18 tng
- * Partial Markup in Parameter Entity is validity constraint and thus should be just error, not fatal error.
- *
- * Revision 1.16 2001/07/10 21:09:39 tng
- * Give proper error message when scanning external id.
- *
- * Revision 1.15 2001/07/10 20:56:17 tng
- * Should check the first char of PI Target Name.
- *
- * Revision 1.14 2001/07/09 13:42:20 tng
- * Partial Markup in Parameter Entity is validity constraint and thus should be just error, not fatal error.
- *
- * Revision 1.13 2001/07/05 14:05:29 tng
- * Encoding String must present for external entity text decl.
- *
- * Revision 1.12 2001/07/05 13:12:19 tng
- * Standalone checking is validity constraint and thus should be just error, not fatal error:
- *
- * Revision 1.11 2001/06/25 14:39:54 knoaman
- * Fix bug #965 - submitted by Matt Lovett
- *
- * Revision 1.10 2001/06/22 12:42:33 tng
- * [Bug 2257] 1.5 thinks a <?xml-stylesheet ...> tag is a <?xml ...> tag
- *
- * Revision 1.9 2001/06/21 14:25:53 knoaman
- * Fix for bug 1946
- *
- * Revision 1.8 2001/06/04 13:25:50 tng
- * the start tag "<?xml" could be followed by (#x20 | #x9 | #xD | #xA)+. Fixed by Pei Yong Zhang.
- *
- * Revision 1.7 2001/05/28 20:54:06 tng
- * Schema: allocate a fDTDValidator, fSchemaValidator explicitly to avoid wrong cast
- *
- * Revision 1.6 2001/05/11 13:27:09 tng
- * Copyright update.
- *
- * Revision 1.5 2001/05/03 20:34:36 tng
- * Schema: SchemaValidator update
- *
- * Revision 1.4 2001/04/23 18:54:35 tng
- * Reuse grammar should allow users to use any stored element decl as root. Fixed by Erik Rydgren.
- *
- * Revision 1.3 2001/04/19 18:17:21 tng
- * Schema: SchemaValidator update, and use QName in Content Model
- *
- * Revision 1.2 2001/03/30 16:35:17 tng
- * Schema: Whitespace normalization.
- *
- * Revision 1.1 2001/03/21 21:56:20 tng
- * Schema: Add Schema Grammar, Schema Validator, and split the DTDValidator into DTDValidator, DTDScanner, and DTDGrammar.
- *
- */
- // ---------------------------------------------------------------------------
- // Includes
- // ---------------------------------------------------------------------------
- #include <xercesc/util/BinMemInputStream.hpp>
- #include <xercesc/util/FlagJanitor.hpp>
- #include <xercesc/util/Janitor.hpp>
- #include <xercesc/util/XMLUniDefs.hpp>
- #include <xercesc/util/UnexpectedEOFException.hpp>
- #include <xercesc/sax/InputSource.hpp>
- #include <xercesc/framework/XMLDocumentHandler.hpp>
- #include <xercesc/framework/XMLEntityHandler.hpp>
- #include <xercesc/framework/XMLValidator.hpp>
- #include <xercesc/internal/EndOfEntityException.hpp>
- #include <xercesc/internal/XMLScanner.hpp>
- #include <xercesc/validators/common/ContentSpecNode.hpp>
- #include <xercesc/validators/common/MixedContentModel.hpp>
- #include <xercesc/validators/DTD/DTDEntityDecl.hpp>
- #include <xercesc/validators/DTD/DocTypeHandler.hpp>
- #include <xercesc/validators/DTD/DTDScanner.hpp>
- XERCES_CPP_NAMESPACE_BEGIN
- // ---------------------------------------------------------------------------
- // Local methods
- // ---------------------------------------------------------------------------
- //
- // This method automates the grunt work of looking at a char and see if its
- // a repetition suffix. If so, it creates a new correct rep node and wraps
- // the pass node in it. Otherwise, it returns the previous node.
- //
- static ContentSpecNode* makeRepNode(const XMLCh testCh,
- ContentSpecNode* const prevNode,
- MemoryManager* const manager)
- {
- if (testCh == chQuestion)
- {
- return new (manager) ContentSpecNode
- (
- ContentSpecNode::ZeroOrOne
- , prevNode
- , 0
- , true
- , true
- , manager
- );
- }
- else if (testCh == chPlus)
- {
- return new (manager) ContentSpecNode
- (
- ContentSpecNode::OneOrMore
- , prevNode
- , 0
- , true
- , true
- , manager
- );
- }
- else if (testCh == chAsterisk)
- {
- return new (manager) ContentSpecNode
- (
- ContentSpecNode::ZeroOrMore
- , prevNode
- , 0
- , true
- , true
- , manager
- );
- }
- // Just return the incoming node
- return prevNode;
- }
- // ---------------------------------------------------------------------------
- // DTDScanner: Constructors and Destructor
- // ---------------------------------------------------------------------------
- DTDScanner::DTDScanner( DTDGrammar* dtdGrammar
- , DocTypeHandler* const docTypeHandler
- , MemoryManager* const manager) :
- fMemoryManager(manager)
- , fDocTypeHandler(docTypeHandler)
- , fDumAttDef(0)
- , fDumElemDecl(0)
- , fDumEntityDecl(0)
- , fInternalSubset(false)
- , fNextAttrId(1)
- , fDTDGrammar(dtdGrammar)
- , fPEntityDeclPool(0)
- , fDocTypeReaderId(0)
- {
- fPEntityDeclPool = new (fMemoryManager) NameIdPool<DTDEntityDecl>(109, 128, fMemoryManager);
- }
- DTDScanner::~DTDScanner()
- {
- delete fDumAttDef;
- delete fDumElemDecl;
- delete fDumEntityDecl;
- delete fPEntityDeclPool;
- }
- // -----------------------------------------------------------------------
- // Setter methods
- // -----------------------------------------------------------------------
- void DTDScanner::setScannerInfo(XMLScanner* const owningScanner
- , ReaderMgr* const readerMgr
- , XMLBufferMgr* const bufMgr)
- {
- // We don't own any of these, we just reference them
- fScanner = owningScanner;
- fReaderMgr = readerMgr;
- fBufMgr = bufMgr;
- if (fScanner->getDoNamespaces())
- fEmptyNamespaceId = fScanner->getEmptyNamespaceId();
- else
- fEmptyNamespaceId = 0;
- fDocTypeReaderId = fReaderMgr->getCurrentReaderNum();
- }
- // ---------------------------------------------------------------------------
- // DTDScanner: Private scanning methods
- // ---------------------------------------------------------------------------
- bool DTDScanner::checkForPERef(const bool spaceRequired
- , const bool inLiteral
- , const bool inMarkup
- , const bool throwAtEndExt)
- {
- bool gotSpace = false;
- //
- // See if we have any spaces up front. If so, then skip them and set
- // the gotSpaces flag.
- //
- if (fReaderMgr->skippedSpace())
- {
- fReaderMgr->skipPastSpaces();
- gotSpace = true;
- }
- // If the next char is a percent, then expand the PERef
- if (!fReaderMgr->skippedChar(chPercent))
- return gotSpace;
- while (true)
- {
- if (!expandPERef(false, inLiteral, inMarkup, throwAtEndExt))
- fScanner->emitError(XMLErrs::ExpectedEntityRefName);
- // And skip any more spaces in the expanded value
- if (fReaderMgr->skippedSpace())
- {
- fReaderMgr->skipPastSpaces();
- gotSpace = true;
- }
- if (!fReaderMgr->skippedChar(chPercent))
- break;
- }
- return gotSpace;
- }
- bool DTDScanner::expandPERef( const bool scanExternal
- , const bool inLiteral
- , const bool inMarkup
- , const bool throwEndOfExt)
- {
- fScanner->setHasNoDTD(false);
- XMLBufBid bbName(fBufMgr);
- //
- // If we are in the internal subset and in markup, then this is
- // an error but we go ahead and do it anyway.
- //
- if (fInternalSubset && inMarkup)
- fScanner->emitError(XMLErrs::PERefInMarkupInIntSubset);
- if (!fReaderMgr->getName(bbName.getBuffer()))
- {
- fScanner->emitError(XMLErrs::ExpectedPEName);
- // Skip the semicolon if that's what we ended up on
- fReaderMgr->skippedChar(chSemiColon);
- return false;
- }
- // If no terminating semicolon, emit an error but try to keep going
- if (!fReaderMgr->skippedChar(chSemiColon))
- fScanner->emitError(XMLErrs::UnterminatedEntityRef, bbName.getRawBuffer());
- //
- // Look it up in the PE decl pool and see if it exists. If not, just
- // emit an error and continue.
- //
- XMLEntityDecl* decl = fPEntityDeclPool->getByKey(bbName.getRawBuffer());
- if (!decl)
- {
- // XML 1.0 Section 4.1
- if (fScanner->getStandalone()) {
- // no need to check fScanner->fHasNoDTD which is for sure false
- // since we are in expandPERef already
- fScanner->emitError(XMLErrs::EntityNotFound, bbName.getRawBuffer());
- }
- else {
- if (fScanner->getDoValidation())
- fScanner->getValidator()->emitError(XMLValid::VC_EntityNotFound, bbName.getRawBuffer());
- }
- return false;
- }
- //
- // XML 1.0 Section 2.9
- // If we are a standalone document, then it has to have been declared
- // in the internal subset. Keep going though.
- //
- if (fScanner->getDoValidation() && fScanner->getStandalone() && !decl->getDeclaredInIntSubset())
- fScanner->getValidator()->emitError(XMLValid::VC_IllegalRefInStandalone, bbName.getRawBuffer());
- //
- // Okee dokee, we found it. So create either a memory stream with
- // the entity value contents, or a file stream if its an external
- // entity.
- //
- if (decl->isExternal())
- {
- // And now create a reader to read this entity
- InputSource* srcUsed;
- XMLReader* reader = fReaderMgr->createReader
- (
- decl->getBaseURI()
- , decl->getSystemId()
- , decl->getPublicId()
- , false
- , inLiteral ? XMLReader::RefFrom_Literal : XMLReader::RefFrom_NonLiteral
- , XMLReader::Type_PE
- , XMLReader::Source_External
- , srcUsed
- );
- // Put a janitor on the source so its cleaned up on exit
- Janitor<InputSource> janSrc(srcUsed);
- // If the creation failed then throw an exception
- if (!reader)
- ThrowXML1(RuntimeException, XMLExcepts::Gen_CouldNotOpenExtEntity, srcUsed->getSystemId());
- // Set the 'throw at end' flag, to the one we were given
- reader->setThrowAtEnd(throwEndOfExt);
- //
- // Push the reader. If its a recursive expansion, then emit an error
- // and return an failure.
- //
- if (!fReaderMgr->pushReader(reader, decl))
- {
- fScanner->emitError(XMLErrs::RecursiveEntity, decl->getName());
- return false;
- }
- //
- // If the caller wants us to scan the external entity, then lets
- // do that now.
- //
- if (scanExternal)
- {
- XMLEntityHandler* entHandler = fScanner->getEntityHandler();
- // If we have an entity handler, tell it we are starting this entity
- if (entHandler)
- entHandler->startInputSource(*srcUsed);
- //
- // Scan the external entity now. The parameter tells it that
- // it is not in an include section. Get the current reader
- // level so we can catch partial markup errors and be sure
- // to get back to here if we get an exception out of the
- // ext subset scan.
- //
- const unsigned int readerNum = fReaderMgr->getCurrentReaderNum();
- try
- {
- scanExtSubsetDecl(false, false);
- }
- catch(...)
- {
- // Pop the reader back to the original level
- fReaderMgr->cleanStackBackTo(readerNum);
- // End the input source, even though its not happy
- if (entHandler)
- entHandler->endInputSource(*srcUsed);
- throw;
- }
- // If we have an entity handler, tell it we are ending this entity
- if (entHandler)
- entHandler->endInputSource(*srcUsed);
- }
- else {
- // If it starts with the XML string, then parse a text decl
- if (fScanner->checkXMLDecl(true))
- scanTextDecl();
- }
- }
- else
- {
- // Create a reader over a memory stream over the entity value
- XMLReader* valueReader = fReaderMgr->createIntEntReader
- (
- decl->getName()
- , inLiteral ? XMLReader::RefFrom_Literal : XMLReader::RefFrom_NonLiteral
- , XMLReader::Type_PE
- , decl->getValue()
- , decl->getValueLen()
- , false
- );
- //
- // Trt to push the entity reader onto the reader manager stack,
- // where it will become the subsequent input. If it fails, that
- // means the entity is recursive, so issue an error. The reader
- // will have just been discarded, but we just keep going.
- //
- if (!fReaderMgr->pushReader(valueReader, decl))
- fScanner->emitError(XMLErrs::RecursiveEntity, decl->getName());
- }
- return true;
- }
- bool DTDScanner::getQuotedString(XMLBuffer& toFill)
- {
- // Reset the target buffer
- toFill.reset();
- // Get the next char which must be a single or double quote
- XMLCh quoteCh;
- if (!fReaderMgr->skipIfQuote(quoteCh))
- return false;
- while (true)
- {
- // Get another char
- const XMLCh nextCh = fReaderMgr->getNextChar();
- // See if it matches the starting quote char
- if (nextCh == quoteCh)
- break;
- //
- // We should never get either an end of file null char here. If we
- // do, just fail. It will be handled more gracefully in the higher
- // level code that called us.
- //
- if (!nextCh)
- return false;
- // Else add it to the buffer
- toFill.append(nextCh);
- }
- return true;
- }
- XMLAttDef*
- DTDScanner::scanAttDef(DTDElementDecl& parentElem, XMLBuffer& bufToUse)
- {
- // Check for PE ref or optional whitespace
- checkForPERef(false, false, true);
- // Get the name of the attribute
- if (!fReaderMgr->getName(bufToUse))
- {
- fScanner->emitError(XMLErrs::ExpectedAttrName);
- return 0;
- }
- //
- // Look up this attribute in the parent element's attribute list. If
- // it already exists, then use the dummy.
- //
- DTDAttDef* decl = parentElem.getAttDef(bufToUse.getRawBuffer());
- if (decl)
- {
- // It already exists, so put out a warning
- fScanner->emitError
- (
- XMLErrs::AttListAlreadyExists
- , bufToUse.getRawBuffer()
- , parentElem.getFullName()
- );
- // Use the dummy decl to parse into and set its name to the name we got
- if (!fDumAttDef)
- {
- fDumAttDef = new (fMemoryManager) DTDAttDef(fMemoryManager);
- fDumAttDef->setId(fNextAttrId++);
- }
- fDumAttDef->setName(bufToUse.getRawBuffer());
- decl = fDumAttDef;
- }
- else
- {
- //
- // It does not already exist so create a new one, give it the next
- // available unique id, and add it
- //
- decl = new (fMemoryManager) DTDAttDef
- (
- bufToUse.getRawBuffer()
- , XMLAttDef::CData
- , XMLAttDef::Implied
- , fMemoryManager
- );
- decl->setId(fNextAttrId++);
- decl->setExternalAttDeclaration(isReadingExternalEntity());
- parentElem.addAttDef(decl);
- }
- // Set a flag to indicate whether we are doing a dummy parse
- const bool isIgnored = (decl == fDumAttDef);
- // Space is required here, so check for PE ref, and require space
- if (!checkForPERef(true, false, true))
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- //
- // Next has to be one of the attribute type strings. This tells us what
- // is to follow.
- //
- if (fReaderMgr->skippedString(XMLUni::fgCDATAString))
- {
- decl->setType(XMLAttDef::CData);
- }
- else if (fReaderMgr->skippedString(XMLUni::fgIDString))
- {
- if (!fReaderMgr->skippedString(XMLUni::fgRefString))
- decl->setType(XMLAttDef::ID);
- else if (!fReaderMgr->skippedChar(chLatin_S))
- decl->setType(XMLAttDef::IDRef);
- else
- decl->setType(XMLAttDef::IDRefs);
- }
- else if (fReaderMgr->skippedString(XMLUni::fgEntitString))
- {
- if (fReaderMgr->skippedChar(chLatin_Y))
- {
- decl->setType(XMLAttDef::Entity);
- }
- else if (fReaderMgr->skippedString(XMLUni::fgIESString))
- {
- decl->setType(XMLAttDef::Entities);
- }
- else
- {
- fScanner->emitError
- (
- XMLErrs::ExpectedAttributeType
- , decl->getFullName()
- , parentElem.getFullName()
- );
- return 0;
- }
- }
- else if (fReaderMgr->skippedString(XMLUni::fgNmTokenString))
- {
- if (fReaderMgr->skippedChar(chLatin_S))
- decl->setType(XMLAttDef::NmTokens);
- else
- decl->setType(XMLAttDef::NmToken);
- }
- else if (fReaderMgr->skippedString(XMLUni::fgNotationString))
- {
- // Check for PE ref and require space
- if (!checkForPERef(true, false, true))
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- decl->setType(XMLAttDef::Notation);
- if (!scanEnumeration(*decl, bufToUse, true))
- return 0;
- // Set the value as the enumeration for this decl
- decl->setEnumeration(bufToUse.getRawBuffer());
- }
- else if (fReaderMgr->skippedChar(chOpenParen))
- {
- decl->setType(XMLAttDef::Enumeration);
- if (!scanEnumeration(*decl, bufToUse, false))
- return 0;
- // Set the value as the enumeration for this decl
- decl->setEnumeration(bufToUse.getRawBuffer());
- }
- else
- {
- fScanner->emitError
- (
- XMLErrs::ExpectedAttributeType
- , decl->getFullName()
- , parentElem.getFullName()
- );
- return 0;
- }
- // Space is required here, so check for PE ref, and require space
- if (!checkForPERef(true, false, true))
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- // And then scan for the optional default value declaration
- scanDefaultDecl(*decl);
- // If validating, then do a couple of validation constraints
- if (fScanner->getDoValidation())
- {
- if (decl->getType() == XMLAttDef::ID)
- {
- if ((decl->getDefaultType() != XMLAttDef::Implied)
- && (decl->getDefaultType() != XMLAttDef::Required))
- {
- fScanner->getValidator()->emitError(XMLValid::BadIDAttrDefType, decl->getFullName());
- }
- }
- // if attdef is xml:space, check correct enumeration (default|preserve)
- const XMLCh fgXMLSpace[] = { chLatin_x, chLatin_m, chLatin_l, chColon, chLatin_s, chLatin_p, chLatin_a, chLatin_c, chLatin_e, chNull };
- if (XMLString::equals(decl->getFullName(),fgXMLSpace)) {
- const XMLCh fgPreserve[] = { chLatin_p, chLatin_r, chLatin_e, chLatin_s, chLatin_e, chLatin_r, chLatin_v, chLatin_e, chNull };
- const XMLCh fgDefault[] = { chLatin_d, chLatin_e, chLatin_f, chLatin_a, chLatin_u, chLatin_l, chLatin_t, chNull };
- bool ok = false;
- if (decl->getType() == XMLAttDef::Enumeration) {
- BaseRefVectorOf<XMLCh>* enumVector = XMLString::tokenizeString(decl->getEnumeration());
- int size = enumVector->size();
- ok = (size == 1 &&
- (XMLString::equals(enumVector->elementAt(0), fgDefault) ||
- XMLString::equals(enumVector->elementAt(0), fgPreserve))) ||
- (size == 2 &&
- (XMLString::equals(enumVector->elementAt(0), fgDefault) &&
- XMLString::equals(enumVector->elementAt(1), fgPreserve))) ||
- (size == 2 &&
- (XMLString::equals(enumVector->elementAt(1), fgDefault) &&
- XMLString::equals(enumVector->elementAt(0), fgPreserve)));
- delete enumVector;
- }
- if (!ok)
- fScanner->getValidator()->emitError(XMLValid::IllegalXMLSpace);
- }
- }
- // If we have a doc type handler, tell it about this attdef.
- if (fDocTypeHandler)
- fDocTypeHandler->attDef(parentElem, *decl, isIgnored);
- return decl;
- }
- void DTDScanner::scanAttListDecl()
- {
- // Space is required here, so check for a PE ref
- if (!checkForPERef(true, false, true))
- {
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- //
- // Next should be the name of the element it belongs to, so get a buffer
- // and get the name into it.
- //
- XMLBufBid bbName(fBufMgr);
- if (!fReaderMgr->getName(bbName.getBuffer()))
- {
- fScanner->emitError(XMLErrs::ExpectedElementName);
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- //
- // Find this element's declaration. If it has not been declared yet,
- // we will force one into the list, but not mark it as declared.
- //
- DTDElementDecl* elemDecl = (DTDElementDecl*) fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bbName.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
- if (!elemDecl)
- {
- //
- // Lets fault in a declaration and add it to the pool. We mark
- // it having been created because of an attlist. Later, if its
- // declared, this will be updated.
- //
- elemDecl = new (fMemoryManager) DTDElementDecl
- (
- bbName.getRawBuffer()
- , fEmptyNamespaceId
- , DTDElementDecl::Any
- , fMemoryManager
- );
- elemDecl->setCreateReason(XMLElementDecl::AttList);
- elemDecl->setExternalElemDeclaration(isReadingExternalEntity());
- fDTDGrammar->putElemDecl((XMLElementDecl*) elemDecl);
- }
- // If we have a doc type handler, tell it the att list is starting
- if (fDocTypeHandler)
- fDocTypeHandler->startAttList(*elemDecl);
- //
- // Now we loop until we are done with all of the attributes in this
- // list. We need a buffer to use for local processing.
- //
- XMLBufBid bbTmp(fBufMgr);
- XMLBuffer& tmpBuf = bbTmp.getBuffer();
- bool seenAnId = false;
- while (true)
- {
- // Get the next char out and see what it tells us to do
- const XMLCh nextCh = fReaderMgr->peekNextChar();
- // Watch for EOF
- if (!nextCh)
- ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
- if (nextCh == chCloseAngle)
- {
- // We are done with this attribute list
- fReaderMgr->getNextChar();
- break;
- }
- else if (fReaderMgr->getCurrentReader()->isWhitespace(nextCh))
- {
- //
- // If advanced callbacks are enabled and we have a doc
- // type handler, then gather up the white space and call
- // back on the doctype handler. Otherwise, just skip
- // whitespace.
- //
- if (fDocTypeHandler)
- {
- fReaderMgr->getSpaces(tmpBuf);
- fDocTypeHandler->doctypeWhitespace
- (
- tmpBuf.getRawBuffer()
- , tmpBuf.getLen()
- );
- }
- else
- {
- fReaderMgr->skipPastSpaces();
- }
- }
- else if (nextCh == chPercent)
- {
- // Eat the percent and expand the ref
- fReaderMgr->getNextChar();
- expandPERef(false, false, true);
- }
- else
- {
- //
- // It must be an attribute name, so scan it. We let
- // it use our local buffer for its name scanning.
- //
- XMLAttDef* attDef = scanAttDef(*elemDecl, tmpBuf);
- if (!attDef)
- {
- fReaderMgr->skipPastChar(chCloseAngle);
- break;
- }
- //
- // If we are validating and its an ID type, then we have to
- // make sure that we have not seen an id attribute yet. Set
- // the flag to say that we've seen one now also.
- //
- if (fScanner->getDoValidation())
- {
- if (attDef->getType() == XMLAttDef::ID)
- {
- if (seenAnId)
- fScanner->getValidator()->emitError(XMLValid::MultipleIdAttrs, elemDecl->getFullName());
- seenAnId = true;
- }
- }
- }
- }
- // If we have a doc type handler, tell it the att list is ending
- if (fDocTypeHandler)
- fDocTypeHandler->endAttList(*elemDecl);
- }
- //
- // This method is called to scan the value of an attribute in content. This
- // involves some normalization and replacement of general entity and
- // character references.
- //
- // End of entity's must be dealt with here. During DTD scan, they can come
- // from external entities. During content, they can come from any entity.
- // We just eat the end of entity and continue with our scan until we come
- // to the closing quote. If an unterminated value causes us to go through
- // subsequent entities, that will cause errors back in the calling code,
- // but there's little we can do about it here.
- //
- bool DTDScanner::scanAttValue(const XMLCh* const attrName
- , XMLBuffer& toFill
- , const XMLAttDef::AttTypes type)
- {
- enum States
- {
- InWhitespace
- , InContent
- };
- // Reset the target buffer
- toFill.reset();
- // Get the next char which must be a single or double quote
- XMLCh quoteCh;
- if (!fReaderMgr->skipIfQuote(quoteCh))
- return false;
- //
- // We have to get the current reader because we have to ignore closing
- // quotes until we hit the same reader again.
- //
- const unsigned int curReader = fReaderMgr->getCurrentReaderNum();
- //
- // Loop until we get the attribute value. Note that we use a double
- // loop here to avoid the setup/teardown overhead of the exception
- // handler on every round.
- //
- XMLCh nextCh;
- XMLCh secondCh = 0;
- States curState = InContent;
- bool firstNonWS = false;
- bool gotLeadingSurrogate = false;
- bool escaped;
- while (true)
- {
- try
- {
- while(true)
- {
- nextCh = fReaderMgr->getNextChar();
- if (!nextCh)
- ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
- // Check for our ending quote in the same entity
- if (nextCh == quoteCh)
- {
- if (curReader == fReaderMgr->getCurrentReaderNum())
- return true;
- // Watch for spillover into a previous entity
- if (curReader > fReaderMgr->getCurrentReaderNum())
- {
- fScanner->emitError(XMLErrs::PartialMarkupInEntity);
- return false;
- }
- }
- //
- // Check for an entity ref now, before we let it affect our
- // whitespace normalization logic below. We ignore the empty flag
- // in this one.
- //
- escaped = false;
- if (nextCh == chAmpersand)
- {
- if (scanEntityRef(nextCh, secondCh, escaped) != EntityExp_Returned)
- {
- gotLeadingSurrogate = false;
- continue;
- }
- }
- else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
- {
- // Check for correct surrogate pairs
- if (gotLeadingSurrogate)
- fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
- else
- gotLeadingSurrogate = true;
- }
- else
- {
- if (gotLeadingSurrogate)
- {
- if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
- fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
- }
- // Its got to at least be a valid XML character
- else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh))
- {
- XMLCh tmpBuf[9];
- XMLString::binToText
- (
- nextCh
- , tmpBuf
- , 8
- , 16
- );
- fScanner->emitError
- (
- XMLErrs::InvalidCharacterInAttrValue
- , attrName
- , tmpBuf
- );
- }
- gotLeadingSurrogate = false;
- }
- //
- // If its not escaped, then make sure its not a < character, which
- // is not allowed in attribute values.
- //
- if (!escaped && (nextCh == chOpenAngle))
- fScanner->emitError(XMLErrs::BracketInAttrValue, attrName);
- //
- // If the attribute is a CDATA type we do simple replacement of
- // tabs and new lines with spaces, if the character is not escaped
- // by way of a char ref.
- //
- // Otherwise, we do the standard non-CDATA normalization of
- // compressing whitespace to single spaces and getting rid of
- // leading and trailing whitespace.
- //
- if (type == XMLAttDef::CData)
- {
- if (!escaped)
- {
- if ((nextCh == 0x09) || (nextCh == 0x0A) || (nextCh == 0x0D))
- nextCh = chSpace;
- }
- }
- else
- {
- if (curState == InWhitespace)
- {
- if (!fReaderMgr->getCurrentReader()->isWhitespace(nextCh))
- {
- if (firstNonWS)
- toFill.append(chSpace);
- curState = InContent;
- firstNonWS = true;
- }
- else
- {
- continue;
- }
- }
- else if (curState == InContent)
- {
- if (fReaderMgr->getCurrentReader()->isWhitespace(nextCh))
- {
- curState = InWhitespace;
- continue;
- }
- firstNonWS = true;
- }
- }
- // Else add it to the buffer
- toFill.append(nextCh);
- if (secondCh)
- toFill.append(secondCh);
- }
- }
- catch(const EndOfEntityException&)
- {
- // Just eat it and continue.
- gotLeadingSurrogate = false;
- escaped = false;
- }
- }
- return true;
- }
- bool DTDScanner::scanCharRef(XMLCh& first, XMLCh& second)
- {
- bool gotOne = false;
- unsigned int value = 0;
- //
- // Set the radix. Its supposed to be a lower case x if hex. But, in
- // order to recover well, we check for an upper and put out an error
- // for that.
- //
- unsigned int radix = 10;
- if (fReaderMgr->skippedChar(chLatin_x))
- {
- radix = 16;
- }
- else if (fReaderMgr->skippedChar(chLatin_X))
- {
- fScanner->emitError(XMLErrs::HexRadixMustBeLowerCase);
- radix = 16;
- }
- while (true)
- {
- const XMLCh nextCh = fReaderMgr->peekNextChar();
- // Watch for EOF
- if (!nextCh)
- ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
- // Break out on the terminating semicolon
- if (nextCh == chSemiColon)
- {
- fReaderMgr->getNextChar();
- break;
- }
- //
- // Convert this char to a binary value, or bail out if its not
- // one.
- //
- unsigned int nextVal;
- if ((nextCh >= chDigit_0) && (nextCh <= chDigit_9))
- nextVal = (unsigned int)(nextCh - chDigit_0);
- else if ((nextCh >= chLatin_A) && (nextCh <= chLatin_F))
- nextVal= (unsigned int)(10 + (nextCh - chLatin_A));
- else if ((nextCh >= chLatin_a) && (nextCh <= chLatin_f))
- nextVal = (unsigned int)(10 + (nextCh - chLatin_a));
- else
- {
- //
- // If we got at least a sigit, then do an unterminated ref
- // error. Else, do an expected a numerical ref thing.
- //
- if (gotOne)
- fScanner->emitError(XMLErrs::UnterminatedCharRef);
- else
- fScanner->emitError(XMLErrs::ExpectedNumericalCharRef);
- return false;
- }
- //
- // Make sure its valid for the radix. If not, then just eat the
- // digit and go on after issueing an error. Else, update the
- // running value with this new digit.
- //
- if (nextVal >= radix)
- {
- XMLCh tmpStr[2];
- tmpStr[0] = nextCh;
- tmpStr[1] = chNull;
- fScanner->emitError(XMLErrs::BadDigitForRadix, tmpStr);
- }
- else
- {
- value = (value * radix) + nextVal;
- }
- // Indicate that we got at least one good digit
- gotOne = true;
- // Eat the char we just processed
- fReaderMgr->getNextChar();
- }
- // Return the char (or chars)
- // And check if the character expanded is valid or not
- if (value >= 0x10000 && value <= 0x10FFFF)
- {
- value -= 0x10000;
- first = XMLCh((value >> 10) + 0xD800);
- second = XMLCh((value & 0x3FF) + 0xDC00);
- }
- else if (value <= 0xFFFD)
- {
- first = XMLCh(value);
- second = 0;
- if (!fReaderMgr->getCurrentReader()->isXMLChar(first) && !fReaderMgr->getCurrentReader()->isControlChar(first)) {
- // Character reference was not in the valid range
- fScanner->emitError(XMLErrs::InvalidCharacterRef);
- return false;
- }
- }
- else {
- // Character reference was not in the valid range
- fScanner->emitError(XMLErrs::InvalidCharacterRef);
- return false;
- }
- return true;
- }
- ContentSpecNode*
- DTDScanner::scanChildren(const DTDElementDecl& elemDecl, XMLBuffer& bufToUse)
- {
- // Check for a PE ref here, but don't require spaces
- checkForPERef(false, false, true);
- // We have to check entity nesting here
- unsigned int curReader;
- //
- // We know that the caller just saw an opening parenthesis, so we need
- // to parse until we hit the end of it, recursing for other nested
- // parentheses we see.
- //
- // We have to check for one up front, since it could be something like
- // (((a)*)) etc...
- //
- ContentSpecNode* curNode = 0;
- if (fReaderMgr->skippedChar(chOpenParen))
- {
- curReader = fReaderMgr->getCurrentReaderNum();
- // Lets call ourself and get back the resulting node
- curNode = scanChildren(elemDecl, bufToUse);
- // If that failed, no need to go further, return failure
- if (!curNode)
- return 0;
- if (curReader != fReaderMgr->getCurrentReaderNum() && fScanner->getDoValidation())
- fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
- }
- else
- {
- // Not a nested paren, so it must be a leaf node
- if (!fReaderMgr->getName(bufToUse))
- {
- fScanner->emitError(XMLErrs::ExpectedElementName);
- return 0;
- }
- //
- // Create a leaf node for it. If we can find the element id for
- // this element, then use it. Else, we have to fault in an element
- // decl, marked as created because of being in a content model.
- //
- XMLElementDecl* decl = fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bufToUse.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
- if (!decl)
- {
- decl = new (fMemoryManager) DTDElementDecl
- (
- bufToUse.getRawBuffer()
- , fEmptyNamespaceId
- , DTDElementDecl::Any
- , fMemoryManager
- );
- decl->setCreateReason(XMLElementDecl::InContentModel);
- decl->setExternalElemDeclaration(isReadingExternalEntity());
- fDTDGrammar->putElemDecl(decl);
- }
- curNode = new (fMemoryManager) ContentSpecNode
- (
- decl->getElementName()
- , fMemoryManager
- );
- // Check for a PE ref here, but don't require spaces
- const bool gotSpaces = checkForPERef(false, false, true);
- // Check for a repetition character after the leaf
- const XMLCh repCh = fReaderMgr->peekNextChar();
- ContentSpecNode* tmpNode = makeRepNode(repCh, curNode, fMemoryManager);
- if (tmpNode != curNode)
- {
- if (gotSpaces)
- fScanner->emitError(XMLErrs::UnexpectedWhitespace);
- fReaderMgr->getNextChar();
- curNode = tmpNode;
- }
- }
- // Check for a PE ref here, but don't require spaces
- checkForPERef(false, false, true);
- //
- // Ok, the next character tells us what kind of content this particular
- // model this particular parentesized section is. Its either a choice if
- // we see ',', a sequence if we see '|', or a single leaf node if we see
- // a closing paren.
- //
- const XMLCh opCh = fReaderMgr->peekNextChar();
- if ((opCh != chComma)
- && (opCh != chPipe)
- && (opCh != chCloseParen))
- {
- // Not a legal char, so delete our node and return failure
- delete curNode;
- fScanner->emitError(XMLErrs::ExpectedSeqChoiceLeaf);
- return 0;
- }
- //
- // Create the head node of the correct type. We need this to remember
- // the top of the local tree. If it was a single subexpr, then just
- // set the head node to the current node. For the others, we'll build
- // the tree off the second child as we move across.
- //
- ContentSpecNode* headNode = 0;
- ContentSpecNode::NodeTypes curType = ContentSpecNode::UnknownType;
- if (opCh == chComma)
- {
- curType = ContentSpecNode::Sequence;
- headNode = new (fMemoryManager) ContentSpecNode
- (
- curType
- , curNode
- , 0
- , true
- , true
- , fMemoryManager
- );
- curNode = headNode;
- }
- else if (opCh == chPipe)
- {
- curType = ContentSpecNode::Choice;
- headNode = new (fMemoryManager) ContentSpecNode
- (
- curType
- , curNode
- , 0
- , true
- , true
- , fMemoryManager
- );
- curNode = headNode;
- }
- else
- {
- headNode = curNode;
- fReaderMgr->getNextChar();
- }
- //
- // If it was a sequence or choice, we just loop until we get to the
- // end of our section, adding each new leaf or sub expression to the
- // right child of the current node, and making that new node the current
- // node.
- //
- if ((opCh == chComma) || (opCh == chPipe))
- {
- ContentSpecNode* lastNode = 0;
- while (true)
- {
- //
- // The next thing must either be another | or , character followed
- // by another leaf or subexpression, or a closing parenthesis, or a
- // PE ref.
- //
- if (fReaderMgr->lookingAtChar(chPercent))
- {
- checkForPERef(false, false, true);
- }
- else if (fReaderMgr->skippedSpace())
- {
- // Just skip whitespace
- fReaderMgr->skipPastSpaces();
- }
- else if (fReaderMgr->skippedChar(chCloseParen))
- {
- //
- // We've hit the end of this section, so break out. But, we
- // need to see if we left a partial sequence of choice node
- // without a second node. If so, we have to undo that and
- // put its left child into the right node of the previous
- // node.
- //
- if ((curNode->getType() == ContentSpecNode::Choice)
- || (curNode->getType() == ContentSpecNode::Sequence))
- {
- if (!curNode->getSecond())
- {
- ContentSpecNode* saveFirst = curNode->orphanFirst();
- lastNode->setSecond(saveFirst);
- curNode = lastNode;
- }
- }
- break;
- }
- else if (fReaderMgr->skippedChar(opCh))
- {
- // Check for a PE ref here, but don't require spaces
- checkForPERef(false, false, true);
- if (fReaderMgr->skippedChar(chOpenParen))
- {
- curReader = fReaderMgr->getCurrentReaderNum();
- // Recurse to handle this new guy
- ContentSpecNode* subNode = scanChildren(elemDecl, bufToUse);
- // If it failed, we are done, clean up here and return failure
- if (!subNode)
- {
- delete headNode;
- return 0;
- }
- if (curReader != fReaderMgr->getCurrentReaderNum() && fScanner->getDoValidation())
- fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
- // Else patch it in and make it the new current
- ContentSpecNode* newCur = new (fMemoryManager) ContentSpecNode
- (
- curType
- , subNode
- , 0
- , true
- , true
- , fMemoryManager
- );
- curNode->setSecond(newCur);
- lastNode = curNode;
- curNode = newCur;
- }
- else
- {
- //
- // Got to be a leaf node, so get a name. If we cannot get
- // one, then clean up and get outa here.
- //
- if (!fReaderMgr->getName(bufToUse))
- {
- delete headNode;
- fScanner->emitError(XMLErrs::ExpectedElementName);
- return 0;
- }
- //
- // Create a leaf node for it. If we can find the element
- // id for this element, then use it. Else, we have to
- // fault in an element decl, marked as created because
- // of being in a content model.
- //
- XMLElementDecl* decl = fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bufToUse.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
- if (!decl)
- {
- decl = new (fMemoryManager) DTDElementDecl
- (
- bufToUse.getRawBuffer()
- , fEmptyNamespaceId
- , DTDElementDecl::Any
- , fMemoryManager
- );
- decl->setCreateReason(XMLElementDecl::InContentModel);
- decl->setExternalElemDeclaration(isReadingExternalEntity());
- fDTDGrammar->putElemDecl(decl);
- }
- ContentSpecNode* tmpLeaf = new (fMemoryManager) ContentSpecNode
- (
- decl->getElementName()
- , fMemoryManager
- );
- // Check for a repetition character after the leaf
- const XMLCh repCh = fReaderMgr->peekNextChar();
- ContentSpecNode* tmpLeaf2 = makeRepNode(repCh, tmpLeaf, fMemoryManager);
- if (tmpLeaf != tmpLeaf2)
- fReaderMgr->getNextChar();
- //
- // Create a new sequence or choice node, with the leaf
- // (or rep surrounding it) we just got as its first node.
- // Make the new node the second node of the current node,
- // and then make it the current node.
- //
- ContentSpecNode* newCur = new (fMemoryManager) ContentSpecNode
- (
- curType
- , tmpLeaf2
- , 0
- , true
- , true
- , fMemoryManager
- );
- curNode->setSecond(newCur);
- lastNode = curNode;
- curNode = newCur;
- }
- }
- else
- {
- // Cannot be valid
- if (opCh == chComma)
- {
- fScanner->emitError(XMLErrs::ExpectedChoiceOrCloseParen);
- }
- else
- {
- fScanner->emitError
- (
- XMLErrs::ExpectedSeqOrCloseParen
- , elemDecl.getFullName()
- );
- }
- delete headNode;
- return 0;
- }
- }
- }
- //
- // We saw the terminating parenthesis so lets check for any repetition
- // character, and create a node for that, making the head node the child
- // of it.
- //
- XMLCh repCh = fReaderMgr->peekNextChar();
- ContentSpecNode* retNode = makeRepNode(repCh, headNode, fMemoryManager);
- if (retNode != headNode)
- fReaderMgr->getNextChar();
- return retNode;
- }
- //
- // We get here after the '<!--' part of the comment. We scan past the
- // terminating '-->' It will calls the appropriate handler with the comment
- // text, if one is provided. A comment can be in either the document or
- // the DTD, so the fInDocument flag is used to know which handler to send
- // it to.
- //
- void DTDScanner::scanComment()
- {
- enum States
- {
- InText
- , OneDash
- , TwoDashes
- };
- // Get a buffer for this
- XMLBufBid bbComment(fBufMgr);
- //
- // Get the comment text into a temp buffer. Be sure to use temp buffer
- // two here, since its to be used for stuff that is potentially longer
- // than just a name.
- //
- bool gotLeadingSurrogate = false;
- States curState = InText;
- while (true)
- {
- // Get the next character
- const XMLCh nextCh = fReaderMgr->getNextChar();
- // Watch for an end of file
- if (!nextCh)
- {
- fScanner->emitError(XMLErrs::UnterminatedComment);
- ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
- }
- // Check for correct surrogate pairs
- if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
- {
- if (gotLeadingSurrogate)
- fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
- else
- gotLeadingSurrogate = true;
- }
- else
- {
- if (gotLeadingSurrogate)
- {
- if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
- fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
- }
- // Its got to at least be a valid XML character
- else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh)) {
- XMLCh tmpBuf[9];
- XMLString::binToText
- (
- nextCh
- , tmpBuf
- , 8
- , 16
- );
- fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
- }
- gotLeadingSurrogate = false;
- }
- if (curState == InText)
- {
- // If its a dash, go to OneDash state. Otherwise take as text
- if (nextCh == chDash)
- curState = OneDash;
- else
- bbComment.append(nextCh);
- }
- else if (curState == OneDash)
- {
- //
- // If its another dash, then we change to the two dashes states.
- // Otherwise, we have to put in the deficit dash and the new
- // character and go back to InText.
- //
- if (nextCh == chDash)
- {
- curState = TwoDashes;
- }
- else
- {
- bbComment.append(chDash);
- bbComment.append(nextCh);
- curState = InText;
- }
- }
- else if (curState == TwoDashes)
- {
- // The next character must be the closing bracket
- if (nextCh != chCloseAngle)
- {
- fScanner->emitError(XMLErrs::IllegalSequenceInComment);
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- break;
- }
- }
- // If there is a doc type handler, then pass on the comment stuff
- if (fDocTypeHandler)
- fDocTypeHandler->doctypeComment(bbComment.getRawBuffer());
- }
- bool DTDScanner::scanContentSpec(DTDElementDecl& toFill)
- {
- //
- // Check for for a couple of the predefined content type strings. If
- // its not one of these, its got to be a parenthesized reg ex type
- // expression.
- //
- if (fReaderMgr->skippedString(XMLUni::fgEmptyString))
- {
- toFill.setModelType(DTDElementDecl::Empty);
- return true;
- }
- if (fReaderMgr->skippedString(XMLUni::fgAnyString))
- {
- toFill.setModelType(DTDElementDecl::Any);
- return true;
- }
- // Its got to be a parenthesized regular expression
- if (!fReaderMgr->skippedChar(chOpenParen))
- {
- fScanner->emitError
- (
- XMLErrs::ExpectedContentSpecExpr
- , toFill.getFullName()
- );
- return false;
- }
- // Get the current reader id, so we can test for partial markup
- const unsigned int curReader = fReaderMgr->getCurrentReaderNum();
- // We could have a PE ref here, but don't require space
- checkForPERef(false, false, true);
- //
- // Now we look for a PCDATA string. If its PCDATA, then it must be a
- // MIXED model. Otherwise, it must be a regular list of children in
- // a regular expression perhaps.
- //
- bool status;
- if (fReaderMgr->skippedString(XMLUni::fgPCDATAString))
- {
- // Set the model to mixed
- toFill.setModelType(DTDElementDecl::Mixed_Simple);
- status = scanMixed(toFill);
- //
- // If we are validating we have to check that there are no multiple
- // uses of any child elements.
- //
- if (fScanner->getDoValidation())
- {
- if (((const MixedContentModel*)toFill.getContentModel())->hasDups())
- fScanner->getValidator()->emitError(XMLValid::RepElemInMixed);
- }
- }
- else
- {
- //
- // We have to do a recursive scan of the content model. Create a
- // buffer for it to use, for efficiency. It returns the top ofthe
- // content spec node tree, which we set if successful.
- //
- toFill.setModelType(DTDElementDecl::Children);
- XMLBufBid bbTmp(fBufMgr);
- ContentSpecNode* resNode = scanChildren(toFill, bbTmp.getBuffer());
- status = (resNode != 0);
- if (status)
- toFill.setContentSpec(resNode);
- }
- // Make sure we are on the same reader as where we started
- if (curReader != fReaderMgr->getCurrentReaderNum() && fScanner->getDoValidation())
- fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
- return status;
- }
- void DTDScanner::scanDefaultDecl(DTDAttDef& toFill)
- {
- if (fReaderMgr->skippedString(XMLUni::fgRequiredString))
- {
- toFill.setDefaultType(XMLAttDef::Required);
- return;
- }
- if (fReaderMgr->skippedString(XMLUni::fgImpliedString))
- {
- toFill.setDefaultType(XMLAttDef::Implied);
- return;
- }
- if (fReaderMgr->skippedString(XMLUni::fgFixedString))
- {
- //
- // There must be space before the fixed value. If there is not, then
- // emit an error but keep going.
- //
- if (!fReaderMgr->skippedSpace())
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- else
- fReaderMgr->skipPastSpaces();
- toFill.setDefaultType(XMLAttDef::Fixed);
- }
- else
- {
- toFill.setDefaultType(XMLAttDef::Default);
- }
- //
- // If we got here, its fixed or default, so we need to get a value.
- // If we don't, then emit an error but just set the default value to
- // an empty string and try to keep going.
- //
- // Check for PE ref or optional whitespace
- checkForPERef(false, false, true);
- XMLBufBid bbValue(fBufMgr);
- if (!scanAttValue(toFill.getFullName(), bbValue.getBuffer(), toFill.getType()))
- fScanner->emitError(XMLErrs::ExpectedDefAttrDecl);
- toFill.setValue(bbValue.getRawBuffer());
- }
- //
- // This is called after seeing '<!ELEMENT' which indicates that an element
- // markup is starting. This guy scans the rest of it and adds it to the
- // element decl pool if it has not already been declared.
- //
- void DTDScanner::scanElementDecl()
- {
- //
- // Space is legal (required actually) here so check for a PE ref. If
- // we don't get our whitespace, then issue and error, but try to keep
- // going.
- //
- if (!checkForPERef(true, false, true))
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- // Get a buffer for the element name and scan in the name
- XMLBufBid bbName(fBufMgr);
- if (!fReaderMgr->getName(bbName.getBuffer()))
- {
- fScanner->emitError(XMLErrs::ExpectedElementName);
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- // Look this guy up in the element decl pool
- DTDElementDecl* decl = (DTDElementDecl*) fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bbName.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
- //
- // If it does not exist, then we need to create it. If it does and
- // its marked as declared, then that's an error, but we still need to
- // scan over the content model so use the dummy declaration that the
- // parsing code can fill in.
- //
- if (decl)
- {
- if (decl->isDeclared())
- {
- if (fScanner->getDoValidation())
- fScanner->getValidator()->emitError(XMLValid::ElementAlreadyExists, bbName.getRawBuffer());
- if (!fDumElemDecl)
- fDumElemDecl = new (fMemoryManager) DTDElementDecl
- (
- bbName.getRawBuffer()
- , fEmptyNamespaceId
- , DTDElementDecl::Any
- , fMemoryManager
- );
- else
- fDumElemDecl->setElementName(bbName.getRawBuffer(),fEmptyNamespaceId);
- }
- }
- else
- {
- //
- // Create the new empty declaration to fill in and put it into
- // the decl pool.
- //
- decl = new (fMemoryManager) DTDElementDecl
- (
- bbName.getRawBuffer()
- , fEmptyNamespaceId
- , DTDElementDecl::Any
- , fMemoryManager
- );
- fDTDGrammar->putElemDecl(decl);
- }
- // Set a flag for whether we will ignore this one
- const bool isIgnored = (decl == fDumElemDecl);
- // Mark this one if being externally declared
- decl->setExternalElemDeclaration(isReadingExternalEntity());
- // Mark this one as being declared
- decl->setCreateReason(XMLElementDecl::Declared);
- // Another check for a PE ref, with at least required whitespace
- if (!checkForPERef(true, false, true))
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- // And now scan the content model for this guy.
- if (!scanContentSpec(*decl))
- {
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- // Another check for a PE ref, but we don't require whitespace here
- checkForPERef(false, false, true);
- // And we should have the ending angle bracket
- if (!fReaderMgr->skippedChar(chCloseAngle))
- {
- fScanner->emitError(XMLErrs::UnterminatedElementDecl, bbName.getRawBuffer());
- fReaderMgr->skipPastChar(chCloseAngle);
- }
- //
- // If we have a DTD handler tell it about the new element decl. We
- // tell it if its one that can be ignored, cause its an override of a
- // previously existing decl. If it is being ignored, only call back
- // if advanced callbacks are enabled.
- //
- if (fDocTypeHandler)
- fDocTypeHandler->elementDecl(*decl, isIgnored);
- }
- //
- // This method will process a general or parameter entity declaration. The
- // entity name and entity text will be stored in the entity pool. The value
- // of the entity will be scanned for any other parameter entity or char
- // references which will be expanded. So the stored value can only have
- // general entity references when done.
- //
- void DTDScanner::scanEntityDecl()
- {
- //
- // Space is required here, but we cannot check for a PE Ref since
- // there could be a legal (no-ref) percent sign here. Since any
- // entity that ended here would be illegal, we just skip spaces
- // and then check for a percent.
- //
- if (!fReaderMgr->lookingAtSpace())
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- else
- fReaderMgr->skipPastSpaces();
- const bool isPEDecl = fReaderMgr->skippedChar(chPercent);
- //
- // If a PE decl, then eat the percent and check for spaces or a
- // PE ref on the other side of it. At least spaces are required.
- //
- if (isPEDecl)
- {
- if (!checkForPERef(true, false, true))
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- }
- //
- // Now lets get a name, which should be the name of the entity. We
- // have to get a buffer for this.
- //
- XMLBufBid bbName(fBufMgr);
- if (!fReaderMgr->getName(bbName.getBuffer()))
- {
- fScanner->emitError(XMLErrs::ExpectedPEName);
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- // If namespaces are enabled, then no colons allowed
- if (fScanner->getDoNamespaces())
- {
- if (XMLString::indexOf(bbName.getRawBuffer(), chColon) != -1)
- fScanner->emitError(XMLErrs::ColonNotLegalWithNS);
- }
- //
- // See if this entity already exists. If so, then the existing one
- // takes precedence. So we use the local dummy decl to parse into
- // and just ignore the results.
- //
- DTDEntityDecl* entityDecl;
- if (isPEDecl)
- entityDecl = fPEntityDeclPool->getByKey(bbName.getRawBuffer());
- else
- entityDecl = fDTDGrammar->getEntityDecl(bbName.getRawBuffer());
- if (entityDecl)
- {
- if (!fDumEntityDecl)
- fDumEntityDecl = new (fMemoryManager) DTDEntityDecl(fMemoryManager);
- fDumEntityDecl->setName(bbName.getRawBuffer());
- entityDecl = fDumEntityDecl;
- }
- else
- {
- // Its not in existence already, then create an entity decl for it
- entityDecl = new (fMemoryManager) DTDEntityDecl(bbName.getRawBuffer(), false, fMemoryManager);
- //
- // Set the declaration location. The parameter indicates whether its
- // declared in the content/internal subset, so we know whether or not
- // its in the external subset.
- //
- entityDecl->setDeclaredInIntSubset(fInternalSubset);
- // Add it to the appropriate entity decl pool
- if (isPEDecl)
- fPEntityDeclPool->put(entityDecl);
- else
- fDTDGrammar->putEntityDecl(entityDecl);
- }
- // Set a flag that indicates whether we are ignoring this one
- const bool isIgnored = (entityDecl == fDumEntityDecl);
- // Set the PE flag on it