DTDScanner.cpp
上传用户:huihehuasu
上传日期:2007-01-10
资源大小:6948k
文件大小:124k
源码类别:
xml/soap/webservice
开发平台:
C/C++
- /*
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 1999-2001 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Xerces" and "Apache Software Foundation" must
- * not be used to endorse or promote products derived from this
- * software without prior written permission. For written
- * permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * nor may "Apache" appear in their name, without prior written
- * permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation, and was
- * originally based on software copyright (c) 1999, International
- * Business Machines, Inc., http://www.ibm.com . For more information
- * on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
- /*
- * $Log: DTDScanner.cpp,v $
- * Revision 1.22 2001/12/06 17:51:18 tng
- * Performance Enhancement. The ContentSpecNode constructor always copied the QName
- * that was passed to it. Added a second constructor that allows the QName to be just assigned, not copied.
- * That was because there are some cases in which a temporary QName was constructed, passed to ContentSpecNode, and then deleted.
- * There were examples of that in TraverseSchema and DTDScanner.
- * By Henry Zongaro.
- *
- * Revision 1.21 2001/11/13 13:27:28 tng
- * Move root element check to XMLScanner.
- *
- * Revision 1.20 2001/09/05 20:49:10 knoaman
- * Fix for complexTypes with mixed content model.
- *
- * Revision 1.19 2001/08/02 16:54:39 tng
- * Reset some Scanner flags in scanReset().
- *
- * Revision 1.18 2001/07/13 16:57:11 tng
- * ScanId fix.
- *
- * Revision 1.17 2001/07/12 20:10:18 tng
- * Partial Markup in Parameter Entity is validity constraint and thus should be just error, not fatal error.
- *
- * Revision 1.16 2001/07/10 21:09:39 tng
- * Give proper error message when scanning external id.
- *
- * Revision 1.15 2001/07/10 20:56:17 tng
- * Should check the first char of PI Target Name.
- *
- * Revision 1.14 2001/07/09 13:42:20 tng
- * Partial Markup in Parameter Entity is validity constraint and thus should be just error, not fatal error.
- *
- * Revision 1.13 2001/07/05 14:05:29 tng
- * Encoding String must present for external entity text decl.
- *
- * Revision 1.12 2001/07/05 13:12:19 tng
- * Standalone checking is validity constraint and thus should be just error, not fatal error:
- *
- * Revision 1.11 2001/06/25 14:39:54 knoaman
- * Fix bug #965 - submitted by Matt Lovett
- *
- * Revision 1.10 2001/06/22 12:42:33 tng
- * [Bug 2257] 1.5 thinks a <?xml-stylesheet ...> tag is a <?xml ...> tag
- *
- * Revision 1.9 2001/06/21 14:25:53 knoaman
- * Fix for bug 1946
- *
- * Revision 1.8 2001/06/04 13:25:50 tng
- * the start tag "<?xml" could be followed by (#x20 | #x9 | #xD | #xA)+. Fixed by Pei Yong Zhang.
- *
- * Revision 1.7 2001/05/28 20:54:06 tng
- * Schema: allocate a fDTDValidator, fSchemaValidator explicitly to avoid wrong cast
- *
- * Revision 1.6 2001/05/11 13:27:09 tng
- * Copyright update.
- *
- * Revision 1.5 2001/05/03 20:34:36 tng
- * Schema: SchemaValidator update
- *
- * Revision 1.4 2001/04/23 18:54:35 tng
- * Reuse grammar should allow users to use any stored element decl as root. Fixed by Erik Rydgren.
- *
- * Revision 1.3 2001/04/19 18:17:21 tng
- * Schema: SchemaValidator update, and use QName in Content Model
- *
- * Revision 1.2 2001/03/30 16:35:17 tng
- * Schema: Whitespace normalization.
- *
- * Revision 1.1 2001/03/21 21:56:20 tng
- * Schema: Add Schema Grammar, Schema Validator, and split the DTDValidator into DTDValidator, DTDScanner, and DTDGrammar.
- *
- */
- // ---------------------------------------------------------------------------
- // Includes
- // ---------------------------------------------------------------------------
- #include <util/BinMemInputStream.hpp>
- #include <util/FlagJanitor.hpp>
- #include <util/Janitor.hpp>
- #include <util/XMLUniDefs.hpp>
- #include <util/UnexpectedEOFException.hpp>
- #include <sax/InputSource.hpp>
- #include <framework/XMLDocumentHandler.hpp>
- #include <framework/XMLEntityHandler.hpp>
- #include <internal/EndOfEntityException.hpp>
- #include <internal/XMLScanner.hpp>
- #include <validators/common/ContentSpecNode.hpp>
- #include <validators/common/MixedContentModel.hpp>
- #include <validators/DTD/DTDEntityDecl.hpp>
- #include <validators/DTD/DocTypeHandler.hpp>
- #include <validators/DTD/DTDScanner.hpp>
- // ---------------------------------------------------------------------------
- // Local methods
- // ---------------------------------------------------------------------------
- //
- // This method automates the grunt work of looking at a char and see if its
- // a repetition suffix. If so, it creates a new correct rep node and wraps
- // the pass node in it. Otherwise, it returns the previous node.
- //
- static ContentSpecNode*
- makeRepNode(const XMLCh testCh, ContentSpecNode* const prevNode)
- {
- if (testCh == chQuestion)
- {
- return new ContentSpecNode
- (
- ContentSpecNode::ZeroOrOne
- , prevNode
- , 0
- );
- }
- else if (testCh == chPlus)
- {
- return new ContentSpecNode
- (
- ContentSpecNode::OneOrMore
- , prevNode
- , 0
- );
- }
- else if (testCh == chAsterisk)
- {
- return new ContentSpecNode
- (
- ContentSpecNode::ZeroOrMore
- , prevNode
- , 0
- );
- }
- // Just return the incoming node
- return prevNode;
- }
- // ---------------------------------------------------------------------------
- // DTDScanner: Constructors and Destructor
- // ---------------------------------------------------------------------------
- DTDScanner::DTDScanner(DTDGrammar* dtdGrammar, NameIdPool<DTDEntityDecl>* entityDeclPool, DocTypeHandler* const docTypeHandler) :
- fDocTypeHandler(docTypeHandler)
- , fDumAttDef(0)
- , fDumElemDecl(0)
- , fDumEntityDecl(0)
- , fInternalSubset(false)
- , fNextAttrId(1)
- , fDTDGrammar(dtdGrammar)
- , fPEntityDeclPool(0)
- , fEntityDeclPool(entityDeclPool)
- , fDocTypeReaderId(0)
- {
- fPEntityDeclPool = new NameIdPool<DTDEntityDecl>(109);
- }
- DTDScanner::~DTDScanner()
- {
- delete fDumAttDef;
- delete fDumElemDecl;
- delete fDumEntityDecl;
- delete fPEntityDeclPool;
- }
- // ---------------------------------------------------------------------------
- // DTDScanner: Private scanning methods
- // ---------------------------------------------------------------------------
- bool DTDScanner::checkForPERef(const bool spaceRequired
- , const bool inLiteral
- , const bool inMarkup
- , const bool throwAtEndExt)
- {
- bool gotSpace = false;
- //
- // See if we have any spaces up front. If so, then skip them and set
- // the gotSpaces flag.
- //
- if (fReaderMgr->skippedSpace())
- {
- fReaderMgr->skipPastSpaces();
- gotSpace = true;
- }
- // If the next char is a percent, then expand the PERef
- if (!fReaderMgr->skippedChar(chPercent))
- return gotSpace;
- while (true)
- {
- if (!expandPERef(false, inLiteral, inMarkup, throwAtEndExt))
- fScanner->emitError(XMLErrs::ExpectedEntityRefName);
- // And skip any more spaces in the expanded value
- if (fReaderMgr->skippedSpace())
- {
- fReaderMgr->skipPastSpaces();
- gotSpace = true;
- }
- if (!fReaderMgr->skippedChar(chPercent))
- break;
- }
- return gotSpace;
- }
- bool DTDScanner::expandPERef( const bool scanExternal
- , const bool inLiteral
- , const bool inMarkup
- , const bool throwEndOfExt)
- {
- fScanner->setHasNoDTD(false);
- XMLBufBid bbName(fBufMgr);
- //
- // If we are in the internal subset and in markup, then this is
- // an error but we go ahead and do it anyway.
- //
- if (fInternalSubset && inMarkup)
- fScanner->emitError(XMLErrs::PERefInMarkupInIntSubset);
- if (!fReaderMgr->getName(bbName.getBuffer()))
- {
- fScanner->emitError(XMLErrs::ExpectedPEName);
- // Skip the semicolon if that's what we ended up on
- fReaderMgr->skippedChar(chSemiColon);
- return false;
- }
- // If no terminating semicolon, emit an error but try to keep going
- if (!fReaderMgr->skippedChar(chSemiColon))
- fScanner->emitError(XMLErrs::UnterminatedEntityRef, bbName.getRawBuffer());
- //
- // Look it up in the PE decl pool and see if it exists. If not, just
- // emit an error and continue.
- //
- XMLEntityDecl* decl = fPEntityDeclPool->getByKey(bbName.getRawBuffer());
- if (!decl)
- {
- // XML 1.0 Section 4.1
- if (fScanner->getStandalone()) {
- // no need to check fScanner->fHasNoDTD which is for sure false
- // since we are in expandPERef already
- fScanner->emitError(XMLErrs::EntityNotFound, bbName.getRawBuffer());
- }
- else {
- if (fScanner->getDoValidation())
- fScanner->getValidator()->emitError(XMLValid::VC_EntityNotFound, bbName.getRawBuffer());
- }
- return false;
- }
- //
- // If we are a standalone document, then it has to have been declared
- // in the internal subset. Keep going though.
- //
- if (fScanner->getDoValidation() && fScanner->getStandalone() && !decl->getDeclaredInIntSubset())
- fScanner->getValidator()->emitError(XMLValid::IllegalRefInStandalone, bbName.getRawBuffer());
- //
- // Okee dokee, we found it. So create either a memory stream with
- // the entity value contents, or a file stream if its an external
- // entity.
- //
- if (decl->isExternal())
- {
- // And now create a reader to read this entity
- InputSource* srcUsed;
- XMLReader* reader = fReaderMgr->createReader
- (
- decl->getSystemId()
- , decl->getPublicId()
- , false
- , inLiteral ? XMLReader::RefFrom_Literal : XMLReader::RefFrom_NonLiteral
- , XMLReader::Type_PE
- , XMLReader::Source_External
- , srcUsed
- );
- // Put a janitor on the source so its cleaned up on exit
- Janitor<InputSource> janSrc(srcUsed);
- // If the creation failed then throw an exception
- if (!reader)
- ThrowXML1(RuntimeException, XMLExcepts::Gen_CouldNotOpenExtEntity, srcUsed->getSystemId());
- // Set the 'throw at end' flag, to the one we were given
- reader->setThrowAtEnd(throwEndOfExt);
- //
- // Push the reader. If its a recursive expansion, then emit an error
- // and return an failure.
- //
- if (!fReaderMgr->pushReader(reader, decl))
- {
- fScanner->emitError(XMLErrs::RecursiveEntity, decl->getName());
- return false;
- }
- //
- // If the caller wants us to scan the external entity, then lets
- // do that now.
- //
- if (scanExternal)
- {
- XMLEntityHandler* entHandler = fScanner->getEntityHandler();
- // If we have an entity handler, tell it we are starting this entity
- if (entHandler)
- entHandler->startInputSource(*srcUsed);
- //
- // Scan the external entity now. The parameter tells it that
- // it is not in an include section. Get the current reader
- // level so we can catch partial markup errors and be sure
- // to get back to here if we get an exception out of the
- // ext subset scan.
- //
- const unsigned int readerNum = fReaderMgr->getCurrentReaderNum();
- try
- {
- scanExtSubsetDecl(false);
- }
- catch(...)
- {
- // Pop the reader back to the original level
- fReaderMgr->cleanStackBackTo(readerNum);
- // End the input source, even though its not happy
- if (entHandler)
- entHandler->endInputSource(*srcUsed);
- throw;
- }
- // If we have an entity handler, tell it we are ending this entity
- if (entHandler)
- entHandler->endInputSource(*srcUsed);
- }
- }
- else
- {
- // Create a reader over a memory stream over the entity value
- XMLReader* valueReader = fReaderMgr->createIntEntReader
- (
- decl->getName()
- , inLiteral ? XMLReader::RefFrom_Literal : XMLReader::RefFrom_NonLiteral
- , XMLReader::Type_PE
- , decl->getValue()
- , decl->getValueLen()
- , false
- );
- //
- // Trt to push the entity reader onto the reader manager stack,
- // where it will become the subsequent input. If it fails, that
- // means the entity is recursive, so issue an error. The reader
- // will have just been discarded, but we just keep going.
- //
- if (!fReaderMgr->pushReader(valueReader, decl))
- fScanner->emitError(XMLErrs::RecursiveEntity, decl->getName());
- }
- return true;
- }
- bool DTDScanner::getQuotedString(XMLBuffer& toFill)
- {
- // Reset the target buffer
- toFill.reset();
- // Get the next char which must be a single or double quote
- XMLCh quoteCh;
- if (!fReaderMgr->skipIfQuote(quoteCh))
- return false;
- while (true)
- {
- // Get another char
- const XMLCh nextCh = fReaderMgr->getNextChar();
- // See if it matches the starting quote char
- if (nextCh == quoteCh)
- break;
- //
- // We should never get either an end of file null char here. If we
- // do, just fail. It will be handled more gracefully in the higher
- // level code that called us.
- //
- if (!nextCh)
- return false;
- // Else add it to the buffer
- toFill.append(nextCh);
- }
- return true;
- }
- XMLAttDef*
- DTDScanner::scanAttDef(DTDElementDecl& parentElem, XMLBuffer& bufToUse)
- {
- // Check for PE ref or optional whitespace
- checkForPERef(false, false, true);
- // Get the name of the attribute
- if (!fReaderMgr->getName(bufToUse))
- {
- fScanner->emitError(XMLErrs::ExpectedAttrName);
- return 0;
- }
- //
- // Look up this attribute in the parent element's attribute list. If
- // it already exists, then use the dummy.
- //
- DTDAttDef* decl = parentElem.getAttDef(bufToUse.getRawBuffer());
- if (decl)
- {
- // It already exists, so put out a warning
- fScanner->emitError
- (
- XMLErrs::AttListAlreadyExists
- , bufToUse.getRawBuffer()
- , parentElem.getFullName()
- );
- // Use the dummy decl to parse into and set its name to the name we got
- if (!fDumAttDef)
- {
- fDumAttDef = new DTDAttDef;
- fDumAttDef->setId(fNextAttrId++);
- }
- fDumAttDef->setName(bufToUse.getRawBuffer());
- decl = fDumAttDef;
- }
- else
- {
- //
- // It does not already exist so create a new one, give it the next
- // available unique id, and add it
- //
- decl = new DTDAttDef(bufToUse.getRawBuffer());
- decl->setId(fNextAttrId++);
- decl->setExternalAttDeclaration(isReadingExternalEntity());
- parentElem.addAttDef(decl);
- }
- // Set a flag to indicate whether we are doing a dummy parse
- const bool isIgnored = (decl == fDumAttDef);
- // Space is required here, so check for PE ref, and require space
- if (!checkForPERef(true, false, true))
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- //
- // Next has to be one of the attribute type strings. This tells us what
- // is to follow.
- //
- if (fReaderMgr->skippedString(XMLUni::fgCDATAString))
- {
- decl->setType(XMLAttDef::CData);
- }
- else if (fReaderMgr->skippedString(XMLUni::fgIDString))
- {
- if (!fReaderMgr->skippedString(XMLUni::fgRefString))
- decl->setType(XMLAttDef::ID);
- else if (!fReaderMgr->skippedChar(chLatin_S))
- decl->setType(XMLAttDef::IDRef);
- else
- decl->setType(XMLAttDef::IDRefs);
- }
- else if (fReaderMgr->skippedString(XMLUni::fgEntitString))
- {
- if (fReaderMgr->skippedChar(chLatin_Y))
- {
- decl->setType(XMLAttDef::Entity);
- }
- else if (fReaderMgr->skippedString(XMLUni::fgIESString))
- {
- decl->setType(XMLAttDef::Entities);
- }
- else
- {
- fScanner->emitError
- (
- XMLErrs::ExpectedAttributeType
- , decl->getFullName()
- , parentElem.getFullName()
- );
- return 0;
- }
- }
- else if (fReaderMgr->skippedString(XMLUni::fgNmTokenString))
- {
- if (fReaderMgr->skippedChar(chLatin_S))
- decl->setType(XMLAttDef::NmTokens);
- else
- decl->setType(XMLAttDef::NmToken);
- }
- else if (fReaderMgr->skippedString(XMLUni::fgNotationString))
- {
- // Check for PE ref and require space
- if (!checkForPERef(true, false, true))
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- decl->setType(XMLAttDef::Notation);
- if (!scanEnumeration(*decl, bufToUse, true))
- return 0;
- // Set the value as the enumeration for this decl
- decl->setEnumeration(bufToUse.getRawBuffer());
- }
- else if (fReaderMgr->skippedChar(chOpenParen))
- {
- decl->setType(XMLAttDef::Enumeration);
- if (!scanEnumeration(*decl, bufToUse, false))
- return 0;
- // Set the value as the enumeration for this decl
- decl->setEnumeration(bufToUse.getRawBuffer());
- }
- else
- {
- fScanner->emitError
- (
- XMLErrs::ExpectedAttributeType
- , decl->getFullName()
- , parentElem.getFullName()
- );
- return 0;
- }
- // Space is required here, so check for PE ref, and require space
- if (!checkForPERef(true, false, true))
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- // And then scan for the optional default value declaration
- scanDefaultDecl(*decl);
- // If validating, then do a couple of validation constraints
- if (fScanner->getDoValidation())
- {
- if (decl->getType() == XMLAttDef::ID)
- {
- if ((decl->getDefaultType() != XMLAttDef::Implied)
- && (decl->getDefaultType() != XMLAttDef::Required))
- {
- fScanner->getValidator()->emitError(XMLValid::BadIDAttrDefType, decl->getFullName());
- }
- }
- // if attdef is xml:space, check correct enumeration (default|preserve)
- const XMLCh fgXMLSpace[] = { chLatin_x, chLatin_m, chLatin_l, chColon, chLatin_s, chLatin_p, chLatin_a, chLatin_c, chLatin_e, chNull };
- if (!XMLString::compareString(decl->getFullName(),fgXMLSpace)) {
- const XMLCh fgPreserve[] = { chLatin_p, chLatin_r, chLatin_e, chLatin_s, chLatin_e, chLatin_r, chLatin_v, chLatin_e, chNull };
- const XMLCh fgDefault[] = { chLatin_d, chLatin_e, chLatin_f, chLatin_a, chLatin_u, chLatin_l, chLatin_t, chNull };
- bool ok = false;
- if (decl->getType() == XMLAttDef::Enumeration) {
- RefVectorOf<XMLCh>* enumVector = XMLString::tokenizeString(decl->getEnumeration());
- int size = enumVector->size();
- ok = (size == 1 &&
- (!XMLString::compareString(enumVector->elementAt(0), fgDefault) ||
- !XMLString::compareString(enumVector->elementAt(0), fgPreserve))) ||
- (size == 2 &&
- (!XMLString::compareString(enumVector->elementAt(0), fgDefault) &&
- !XMLString::compareString(enumVector->elementAt(1), fgPreserve))) ||
- (size == 2 &&
- (!XMLString::compareString(enumVector->elementAt(1), fgDefault) &&
- !XMLString::compareString(enumVector->elementAt(0), fgPreserve)));
- delete enumVector;
- }
- if (!ok)
- fScanner->getValidator()->emitError(XMLValid::IllegalXMLSpace);
- }
- }
- // If we have a doc type handler, tell it about this attdef.
- if (fDocTypeHandler)
- fDocTypeHandler->attDef(parentElem, *decl, isIgnored);
- return decl;
- }
- void DTDScanner::scanAttListDecl()
- {
- // Space is required here, so check for a PE ref
- if (!checkForPERef(true, false, true))
- {
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- //
- // Next should be the name of the element it belongs to, so get a buffer
- // and get the name into it.
- //
- XMLBufBid bbName(fBufMgr);
- if (!fReaderMgr->getName(bbName.getBuffer()))
- {
- fScanner->emitError(XMLErrs::ExpectedElementName);
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- //
- // Find this element's declaration. If it has not been declared yet,
- // we will force one into the list, but not mark it as declared.
- //
- DTDElementDecl* elemDecl = (DTDElementDecl*) fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bbName.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
- if (!elemDecl)
- {
- //
- // Lets fault in a declaration and add it to the pool. We mark
- // it having been created because of an attlist. Later, if its
- // declared, this will be updated.
- //
- elemDecl = new DTDElementDecl(bbName.getRawBuffer(), fEmptyNamespaceId);
- elemDecl->setCreateReason(XMLElementDecl::AttList);
- elemDecl->setExternalElemDeclaration(isReadingExternalEntity());
- fDTDGrammar->putElemDecl((XMLElementDecl*) elemDecl);
- }
- // If we have a doc type handler, tell it the att list is starting
- if (fDocTypeHandler)
- fDocTypeHandler->startAttList(*elemDecl);
- //
- // Now we loop until we are done with all of the attributes in this
- // list. We need a buffer to use for local processing.
- //
- XMLBufBid bbTmp(fBufMgr);
- XMLBuffer& tmpBuf = bbTmp.getBuffer();
- bool seenAnId = false;
- while (true)
- {
- // Get the next char out and see what it tells us to do
- const XMLCh nextCh = fReaderMgr->peekNextChar();
- // Watch for EOF
- if (!nextCh)
- ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
- if (nextCh == chCloseAngle)
- {
- // We are done with this attribute list
- fReaderMgr->getNextChar();
- break;
- }
- else if (XMLReader::isWhitespace(nextCh))
- {
- //
- // If advanced callbacks are enabled and we have a doc
- // type handler, then gather up the white space and call
- // back on the doctype handler. Otherwise, just skip
- // whitespace.
- //
- if (fDocTypeHandler)
- {
- fReaderMgr->getSpaces(tmpBuf);
- fDocTypeHandler->doctypeWhitespace
- (
- tmpBuf.getRawBuffer()
- , tmpBuf.getLen()
- );
- }
- else
- {
- fReaderMgr->skipPastSpaces();
- }
- }
- else if (nextCh == chPercent)
- {
- // Eat the percent and expand the ref
- fReaderMgr->getNextChar();
- expandPERef(false, false, true);
- }
- else
- {
- //
- // It must be an attribute name, so scan it. We let
- // it use our local buffer for its name scanning.
- //
- XMLAttDef* attDef = scanAttDef(*elemDecl, tmpBuf);
- if (!attDef)
- {
- fReaderMgr->skipPastChar(chCloseAngle);
- break;
- }
- //
- // If we are validating and its an ID type, then we have to
- // make sure that we have not seen an id attribute yet. Set
- // the flag to say that we've seen one now also.
- //
- if (fScanner->getDoValidation())
- {
- if (attDef->getType() == XMLAttDef::ID)
- {
- if (seenAnId)
- fScanner->getValidator()->emitError(XMLValid::MultipleIdAttrs, elemDecl->getFullName());
- seenAnId = true;
- }
- }
- }
- }
- // If we have a doc type handler, tell it the att list is ending
- if (fDocTypeHandler)
- fDocTypeHandler->endAttList(*elemDecl);
- }
- //
- // This method is called to scan the value of an attribute in content. This
- // involves some normalization and replacement of general entity and
- // character references.
- //
- // End of entity's must be dealt with here. During DTD scan, they can come
- // from external entities. During content, they can come from any entity.
- // We just eat the end of entity and continue with our scan until we come
- // to the closing quote. If an unterminated value causes us to go through
- // subsequent entities, that will cause errors back in the calling code,
- // but there's little we can do about it here.
- //
- bool DTDScanner::scanAttValue(const XMLCh* const attrName
- , XMLBuffer& toFill
- , const XMLAttDef::AttTypes type)
- {
- enum States
- {
- InWhitespace
- , InContent
- };
- // Reset the target buffer
- toFill.reset();
- // Get the next char which must be a single or double quote
- XMLCh quoteCh;
- if (!fReaderMgr->skipIfQuote(quoteCh))
- return false;
- //
- // We have to get the current reader because we have to ignore closing
- // quotes until we hit the same reader again.
- //
- const unsigned int curReader = fReaderMgr->getCurrentReaderNum();
- //
- // Loop until we get the attribute value. Note that we use a double
- // loop here to avoid the setup/teardown overhead of the exception
- // handler on every round.
- //
- XMLCh nextCh;
- XMLCh secondCh = 0;
- States curState = InContent;
- bool firstNonWS = false;
- bool gotLeadingSurrogate = false;
- bool escaped;
- while (true)
- {
- try
- {
- while(true)
- {
- // Get another char. Use second char from prevous is its there
- if (secondCh)
- {
- nextCh = secondCh;
- secondCh = 0;
- }
- else
- {
- nextCh = fReaderMgr->getNextChar();
- }
- if (!nextCh)
- ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
- // Check for our ending quote in the same entity
- if (nextCh == quoteCh)
- {
- if (curReader == fReaderMgr->getCurrentReaderNum())
- return true;
- // Watch for spillover into a previous entity
- if (curReader > fReaderMgr->getCurrentReaderNum())
- {
- fScanner->emitError(XMLErrs::PartialMarkupInEntity);
- return false;
- }
- }
- //
- // Check for an entity ref now, before we let it affect our
- // whitespace normalization logic below. We ignore the empty flag
- // in this one.
- //
- escaped = false;
- if (nextCh == chAmpersand)
- {
- if (scanEntityRef(nextCh, secondCh, escaped) != EntityExp_Returned)
- {
- gotLeadingSurrogate = false;
- continue;
- }
- }
- // Check for correct surrogate pairs
- if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
- {
- if (gotLeadingSurrogate)
- fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
- else
- gotLeadingSurrogate = true;
- }
- else
- {
- if (gotLeadingSurrogate)
- {
- if ((nextCh < 0xDC00) && (nextCh > 0xDFFF))
- fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
- }
- gotLeadingSurrogate = false;
- // Its got to at least be a valid XML character
- if (!XMLReader::isXMLChar(nextCh))
- {
- XMLCh tmpBuf[9];
- XMLString::binToText
- (
- nextCh
- , tmpBuf
- , 8
- , 16
- );
- fScanner->emitError
- (
- XMLErrs::InvalidCharacter
- , attrName
- , tmpBuf
- );
- }
- }
- //
- // If its not escaped, then make sure its not a < character, which
- // is not allowed in attribute values.
- //
- if (!escaped && (nextCh == chOpenAngle))
- fScanner->emitError(XMLErrs::BracketInAttrValue, attrName);
- //
- // If the attribute is a CDATA type we do simple replacement of
- // tabs and new lines with spaces, if the character is not escaped
- // by way of a char ref.
- //
- // Otherwise, we do the standard non-CDATA normalization of
- // compressing whitespace to single spaces and getting rid of
- // leading and trailing whitespace.
- //
- if (type == XMLAttDef::CData)
- {
- if (!escaped)
- {
- if ((nextCh == 0x09) || (nextCh == 0x0A) || (nextCh == 0x0D))
- nextCh = chSpace;
- }
- }
- else
- {
- if (curState == InWhitespace)
- {
- if (!XMLReader::isWhitespace(nextCh))
- {
- if (firstNonWS)
- toFill.append(chSpace);
- curState = InContent;
- firstNonWS = true;
- }
- else
- {
- continue;
- }
- }
- else if (curState == InContent)
- {
- if (XMLReader::isWhitespace(nextCh))
- {
- curState = InWhitespace;
- continue;
- }
- firstNonWS = true;
- }
- }
- // Else add it to the buffer
- toFill.append(nextCh);
- }
- }
- catch(const EndOfEntityException&)
- {
- // Just eat it and continue.
- gotLeadingSurrogate = false;
- escaped = false;
- }
- }
- return true;
- }
- bool DTDScanner::scanCharRef(XMLCh& first, XMLCh& second)
- {
- bool gotOne = false;
- unsigned int value = 0;
- //
- // Set the radix. Its supposed to be a lower case x if hex. But, in
- // order to recover well, we check for an upper and put out an error
- // for that.
- //
- unsigned int radix = 10;
- if (fReaderMgr->skippedChar(chLatin_x))
- {
- radix = 16;
- }
- else if (fReaderMgr->skippedChar(chLatin_X))
- {
- fScanner->emitError(XMLErrs::HexRadixMustBeLowerCase);
- radix = 16;
- }
- while (true)
- {
- const XMLCh nextCh = fReaderMgr->peekNextChar();
- // Watch for EOF
- if (!nextCh)
- ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
- // Break out on the terminating semicolon
- if (nextCh == chSemiColon)
- {
- fReaderMgr->getNextChar();
- break;
- }
- //
- // Convert this char to a binary value, or bail out if its not
- // one.
- //
- unsigned int nextVal;
- if ((nextCh >= chDigit_0) && (nextCh <= chDigit_9))
- nextVal = (unsigned int)(nextCh - chDigit_0);
- else if ((nextCh >= chLatin_A) && (nextCh <= chLatin_F))
- nextVal= (unsigned int)(10 + (nextCh - chLatin_A));
- else if ((nextCh >= chLatin_a) && (nextCh <= chLatin_f))
- nextVal = (unsigned int)(10 + (nextCh - chLatin_a));
- else
- {
- //
- // If we got at least a sigit, then do an unterminated ref
- // error. Else, do an expected a numerical ref thing.
- //
- if (gotOne)
- fScanner->emitError(XMLErrs::UnterminatedCharRef);
- else
- fScanner->emitError(XMLErrs::ExpectedNumericalCharRef);
- return false;
- }
- //
- // Make sure its valid for the radix. If not, then just eat the
- // digit and go on after issueing an error. Else, update the
- // running value with this new digit.
- //
- if (nextVal >= radix)
- {
- XMLCh tmpStr[2];
- tmpStr[0] = nextCh;
- tmpStr[1] = chNull;
- fScanner->emitError(XMLErrs::BadDigitForRadix, tmpStr);
- }
- else
- {
- value = (value * radix) + nextVal;
- }
- // Indicate that we got at least one good digit
- gotOne = true;
- // Eat the char we just processed
- fReaderMgr->getNextChar();
- }
- // Return the char (or chars)
- if (value >= 0x10000)
- {
- value -= 0x10000;
- first = XMLCh((value >> 10) + 0xD800);
- second = XMLCh((value & 0x3FF) + 0xDC00);
- }
- else
- {
- first = XMLCh(value);
- second = 0;
- }
- return true;
- }
- //
- //  Scans a 'children' content model and returns the root of the
- //  ContentSpecNode tree built for it, or 0 on failure. The caller must
- //  already have consumed the opening parenthesis of the group; nested
- //  groups are handled by recursing.
- //
- //  elemDecl is only used to name the element in an error message, and
- //  bufToUse is scratch space for the child element names that are read.
- //
- //  On every failure path of the loop the partially built tree is released
- //  *before* the error is reported, so that nothing is leaked should
- //  emitError() not return normally (NOTE(review): emitError is defined
- //  outside this file section - confirm whether it can throw).
- //
- ContentSpecNode*
- DTDScanner::scanChildren(const DTDElementDecl& elemDecl, XMLBuffer& bufToUse)
- {
-     // Check for a PE ref here, but don't require spaces
-     checkForPERef(false, false, true);
-     // Reader number captured when a nested group is opened, so that we can
-     // tell whether the group was closed while a different reader was active
-     unsigned int curReader;
-     //
-     //  We know that the caller just saw an opening parenthesis, so we need
-     //  to parse until we hit the end of it, recursing for other nested
-     //  parentheses we see.
-     //
-     //  We have to check for one up front, since it could be something like
-     //  (((a)*)) etc...
-     //
-     ContentSpecNode* curNode = 0;
-     if (fReaderMgr->skippedChar(chOpenParen))
-     {
-         curReader = fReaderMgr->getCurrentReaderNum();
-         // Lets call ourself and get back the resulting node
-         curNode = scanChildren(elemDecl, bufToUse);
-         // If that failed, no need to go further, return failure
-         if (!curNode)
-             return 0;
-         if (curReader != fReaderMgr->getCurrentReaderNum() && fScanner->getDoValidation())
-             fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
-     }
-     else
-     {
-         // Not a nested paren, so it must be a leaf node
-         if (!fReaderMgr->getName(bufToUse))
-         {
-             fScanner->emitError(XMLErrs::ExpectedElementName);
-             return 0;
-         }
-         //
-         //  Create a leaf node for it. If we can find the element decl for
-         //  this element, then use it. Else, we have to fault in an element
-         //  decl, marked as created because of being in a content model.
-         //
-         XMLElementDecl* decl = fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bufToUse.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
-         if (!decl)
-         {
-             decl = new DTDElementDecl(bufToUse.getRawBuffer(), fEmptyNamespaceId);
-             decl->setCreateReason(XMLElementDecl::InContentModel);
-             decl->setExternalElemDeclaration(isReadingExternalEntity());
-             fDTDGrammar->putElemDecl(decl);
-         }
-         curNode = new ContentSpecNode(decl->getElementName());
-         // Check for a PE ref here, but don't require spaces
-         const bool gotSpaces = checkForPERef(false, false, true);
-         //
-         //  Check for a repetition character after the leaf. Getting back a
-         //  node other than the leaf means the peeked character was used,
-         //  so it is consumed; whitespace in front of it is reported.
-         //
-         const XMLCh repCh = fReaderMgr->peekNextChar();
-         ContentSpecNode* tmpNode = makeRepNode(repCh, curNode);
-         if (tmpNode != curNode)
-         {
-             if (gotSpaces)
-                 fScanner->emitError(XMLErrs::UnexpectedWhitespace);
-             fReaderMgr->getNextChar();
-             curNode = tmpNode;
-         }
-     }
-     // Check for a PE ref here, but don't require spaces
-     checkForPERef(false, false, true);
-     //
-     //  Ok, the next character tells us what kind of content model this
-     //  particular parenthesized section is. Its either a sequence if we
-     //  see ',', a choice if we see '|', or a single leaf node if we see
-     //  a closing paren.
-     //
-     const XMLCh opCh = fReaderMgr->peekNextChar();
-     if ((opCh != chComma)
-     &&  (opCh != chPipe)
-     &&  (opCh != chCloseParen))
-     {
-         // Not a legal char, so delete our node, report it, and return failure
-         delete curNode;
-         fScanner->emitError(XMLErrs::ExpectedSeqChoiceLeaf);
-         return 0;
-     }
-     //
-     //  Create the head node of the correct type. We need this to remember
-     //  the top of the local tree. If it was a single subexpr, then just
-     //  set the head node to the current node. For the others, we'll build
-     //  the tree off the second child as we move across.
-     //
-     //  Note that the ',' or '|' is only peeked here; it is consumed by the
-     //  first pass through the loop below. curType is only assigned, and
-     //  only used, for the sequence and choice cases.
-     //
-     ContentSpecNode* headNode = 0;
-     ContentSpecNode::NodeTypes curType;
-     if (opCh == chComma)
-     {
-         curType = ContentSpecNode::Sequence;
-         headNode = new ContentSpecNode(curType, curNode, 0);
-         curNode = headNode;
-     }
-     else if (opCh == chPipe)
-     {
-         curType = ContentSpecNode::Choice;
-         headNode = new ContentSpecNode(curType, curNode, 0);
-         curNode = headNode;
-     }
-     else
-     {
-         // Single subexpression: eat the closing paren
-         headNode = curNode;
-         fReaderMgr->getNextChar();
-     }
-     //
-     //  If it was a sequence or choice, we just loop until we get to the
-     //  end of our section, adding each new leaf or sub expression to the
-     //  right child of the current node, and making that new node the current
-     //  node.
-     //
-     if ((opCh == chComma) || (opCh == chPipe))
-     {
-         // The node whose second child is curNode, once there is one
-         ContentSpecNode* lastNode = 0;
-         while (true)
-         {
-             //
-             //  The next thing must either be another | or , character followed
-             //  by another leaf or subexpression, or a closing parenthesis, or a
-             //  PE ref.
-             //
-             if (fReaderMgr->lookingAtChar(chPercent))
-             {
-                 checkForPERef(false, false, true);
-             }
-             else if (fReaderMgr->skippedSpace())
-             {
-                 // Just skip whitespace
-                 fReaderMgr->skipPastSpaces();
-             }
-             else if (fReaderMgr->skippedChar(chCloseParen))
-             {
-                 //
-                 //  We've hit the end of this section, so break out. But, we
-                 //  need to see if we left a partial sequence or choice node
-                 //  without a second node. If so, we have to undo that and
-                 //  put its left child into the right node of the previous
-                 //  node.
-                 //
-                 if ((curNode->getType() == ContentSpecNode::Choice)
-                 ||  (curNode->getType() == ContentSpecNode::Sequence))
-                 {
-                     if (!curNode->getSecond())
-                     {
-                         ContentSpecNode* saveFirst = curNode->orphanFirst();
-                         lastNode->setSecond(saveFirst);
-                         curNode = lastNode;
-                     }
-                 }
-                 break;
-             }
-             else if (fReaderMgr->skippedChar(opCh))
-             {
-                 // Check for a PE ref here, but don't require spaces
-                 checkForPERef(false, false, true);
-                 if (fReaderMgr->skippedChar(chOpenParen))
-                 {
-                     curReader = fReaderMgr->getCurrentReaderNum();
-                     // Recurse to handle this new guy
-                     ContentSpecNode* subNode = scanChildren(elemDecl, bufToUse);
-                     // If it failed, we are done, clean up here and return failure
-                     if (!subNode)
-                     {
-                         delete headNode;
-                         return 0;
-                     }
-                     if (curReader != fReaderMgr->getCurrentReaderNum() && fScanner->getDoValidation())
-                         fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
-                     // Else patch it in and make it the new current
-                     ContentSpecNode* newCur = new ContentSpecNode
-                     (
-                         curType
-                         , subNode
-                         , 0
-                     );
-                     curNode->setSecond(newCur);
-                     lastNode = curNode;
-                     curNode = newCur;
-                 }
-                 else
-                 {
-                     //
-                     //  Got to be a leaf node, so get a name. If we cannot get
-                     //  one, then clean up and get outa here.
-                     //
-                     if (!fReaderMgr->getName(bufToUse))
-                     {
-                         delete headNode;
-                         fScanner->emitError(XMLErrs::ExpectedElementName);
-                         return 0;
-                     }
-                     //
-                     //  Create a leaf node for it. If we can find the element
-                     //  decl for this element, then use it. Else, we have to
-                     //  fault in an element decl, marked as created because
-                     //  of being in a content model.
-                     //
-                     XMLElementDecl* decl = fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bufToUse.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
-                     if (!decl)
-                     {
-                         decl = new DTDElementDecl(bufToUse.getRawBuffer(), fEmptyNamespaceId);
-                         decl->setCreateReason(XMLElementDecl::InContentModel);
-                         decl->setExternalElemDeclaration(isReadingExternalEntity());
-                         fDTDGrammar->putElemDecl(decl);
-                     }
-                     ContentSpecNode* tmpLeaf = new ContentSpecNode(decl->getElementName());
-                     // Check for a repetition character after the leaf
-                     const XMLCh repCh = fReaderMgr->peekNextChar();
-                     ContentSpecNode* tmpLeaf2 = makeRepNode(repCh, tmpLeaf);
-                     if (tmpLeaf != tmpLeaf2)
-                         fReaderMgr->getNextChar();
-                     //
-                     //  Create a new sequence or choice node, with the leaf
-                     //  (or rep surrounding it) we just got as its first node.
-                     //  Make the new node the second node of the current node,
-                     //  and then make it the current node.
-                     //
-                     ContentSpecNode* newCur = new ContentSpecNode
-                     (
-                         curType
-                         , tmpLeaf2
-                         , 0
-                     );
-                     curNode->setSecond(newCur);
-                     lastNode = curNode;
-                     curNode = newCur;
-                 }
-             }
-             else
-             {
-                 //
-                 //  Cannot be valid. Release the tree first and report
-                 //  afterwards; the error arguments do not depend on it.
-                 //
-                 delete headNode;
-                 if (opCh == chComma)
-                 {
-                     fScanner->emitError(XMLErrs::ExpectedChoiceOrCloseParen);
-                 }
-                 else
-                 {
-                     fScanner->emitError
-                     (
-                         XMLErrs::ExpectedSeqOrCloseParen
-                         , elemDecl.getFullName()
-                     );
-                 }
-                 return 0;
-             }
-         }
-     }
-     //
-     //  We saw the terminating parenthesis so lets check for any repetition
-     //  character, and create a node for that, making the head node the child
-     //  of it.
-     //
-     XMLCh repCh = fReaderMgr->peekNextChar();
-     ContentSpecNode* retNode = makeRepNode(repCh, headNode);
-     if (retNode != headNode)
-         fReaderMgr->getNextChar();
-     return retNode;
- }
- //
- //  Called once the '<!--' that opens a comment has been consumed. Collects
- //  the comment text up to the terminating '-->' and, if a doc type handler
- //  is installed, hands the text to its doctypeComment() callback.
- //
- //  A '--' that is not followed by '>' is an error: it is reported, input is
- //  skipped past the next '>', and no callback is made. Hitting the end of
- //  input inside the comment is reported and then raised as an
- //  UnexpectedEOFException.
- //
- void DTDScanner::scanComment()
- {
-     // How many of the dashes of a possible '-->' have been seen so far
-     enum States
-     {
-         InText
-         , OneDash
-         , TwoDashes
-     };
-     // Buffer that accumulates the text of the comment
-     XMLBufBid bbComment(fBufMgr);
-     States curState = InText;
-     bool sawTerminator = false;
-     while (!sawTerminator)
-     {
-         const XMLCh nextCh = fReaderMgr->getNextChar();
-         // A zero character means we ran out of input
-         if (!nextCh)
-         {
-             fScanner->emitError(XMLErrs::UnterminatedComment);
-             ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
-         }
-         // Report characters that are not legal XML, giving the value in hex
-         if (!XMLReader::isXMLChar(nextCh))
-         {
-             XMLCh tmpBuf[9];
-             XMLString::binToText
-             (
-                 nextCh
-                 , tmpBuf
-                 , 8
-                 , 16
-             );
-             fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
-         }
-         switch (curState)
-         {
-             case InText :
-                 // A dash may start the terminator; anything else is text
-                 if (nextCh == chDash)
-                     curState = OneDash;
-                 else
-                     bbComment.append(nextCh);
-                 break;
-             case OneDash :
-                 //
-                 //  A second dash moves us on. Otherwise the dash we held
-                 //  back was ordinary text, so store it along with the new
-                 //  character and start over.
-                 //
-                 if (nextCh == chDash)
-                 {
-                     curState = TwoDashes;
-                 }
-                 else
-                 {
-                     bbComment.append(chDash);
-                     bbComment.append(nextCh);
-                     curState = InText;
-                 }
-                 break;
-             case TwoDashes :
-                 // After two dashes only the closing bracket is allowed
-                 if (nextCh != chCloseAngle)
-                 {
-                     fScanner->emitError(XMLErrs::IllegalSequenceInComment);
-                     fReaderMgr->skipPastChar(chCloseAngle);
-                     return;
-                 }
-                 sawTerminator = true;
-                 break;
-         }
-     }
-     // Pass the collected text on if anybody is listening
-     if (fDocTypeHandler)
-         fDocTypeHandler->doctypeComment(bbComment.getRawBuffer());
- }
- //
- //  Scans the content specification of an element declaration and stores
- //  the result in toFill. The spec is one of the two predefined content
- //  type strings (empty / any), a mixed model introduced by the PCDATA
- //  string, or a parenthesized children expression.
- //
- //  Returns true if the spec was scanned successfully, else false.
- //
- bool DTDScanner::scanContentSpec(DTDElementDecl& toFill)
- {
-     // The predefined forms need nothing beyond setting the model type
-     if (fReaderMgr->skippedString(XMLUni::fgEmptyString))
-     {
-         toFill.setModelType(DTDElementDecl::Empty);
-         return true;
-     }
-     if (fReaderMgr->skippedString(XMLUni::fgAnyString))
-     {
-         toFill.setModelType(DTDElementDecl::Any);
-         return true;
-     }
-     // Everything else has to open with a parenthesis
-     const bool sawOpenParen = fReaderMgr->skippedChar(chOpenParen);
-     if (!sawOpenParen)
-     {
-         fScanner->emitError
-         (
-             XMLErrs::ExpectedContentSpecExpr
-             , toFill.getFullName()
-         );
-         return false;
-     }
-     // Remember which reader we started on, to detect partial markup
-     const unsigned int startReader = fReaderMgr->getCurrentReaderNum();
-     // A PE ref is allowed here, spaces are not required
-     checkForPERef(false, false, true);
-     //
-     //  The PCDATA string marks a mixed model; without it this is a regular
-     //  children expression.
-     //
-     bool succeeded;
-     const bool isMixed = fReaderMgr->skippedString(XMLUni::fgPCDATAString);
-     if (!isMixed)
-     {
-         //
-         //  Scan the children expression recursively, lending it a buffer
-         //  to read names into. On success the decl is given the top of
-         //  the resulting content spec node tree.
-         //
-         toFill.setModelType(DTDElementDecl::Children);
-         XMLBufBid bbNames(fBufMgr);
-         ContentSpecNode* const specRoot = scanChildren(toFill, bbNames.getBuffer());
-         succeeded = (specRoot != 0);
-         if (specRoot)
-             toFill.setContentSpec(specRoot);
-     }
-     else
-     {
-         toFill.setModelType(DTDElementDecl::Mixed_Simple);
-         succeeded = scanMixed(toFill);
-         // When validating, a child may not be listed more than once
-         if (fScanner->getDoValidation()
-         &&  ((const MixedContentModel*)toFill.getContentModel())->hasDups())
-         {
-             fScanner->getValidator()->emitError(XMLValid::RepElemInMixed);
-         }
-     }
-     // The spec must end in the reader it started in
-     const bool changedReader = (startReader != fReaderMgr->getCurrentReaderNum());
-     if (changedReader && fScanner->getDoValidation())
-         fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
-     return succeeded;
- }
- //
- //  Scans the default declaration of an attribute definition and records
- //  the outcome in toFill. The required and implied forms only set the
- //  default type. The fixed form and the plain default form additionally
- //  scan an attribute value, which is stored as the value of toFill.
- //
- void DTDScanner::scanDefaultDecl(DTDAttDef& toFill)
- {
-     // These two forms carry no value, so there is nothing more to scan
-     if (fReaderMgr->skippedString(XMLUni::fgRequiredString))
-     {
-         toFill.setDefaultType(XMLAttDef::Required);
-         return;
-     }
-     if (fReaderMgr->skippedString(XMLUni::fgImpliedString))
-     {
-         toFill.setDefaultType(XMLAttDef::Implied);
-         return;
-     }
-     const bool isFixed = fReaderMgr->skippedString(XMLUni::fgFixedString);
-     if (isFixed)
-     {
-         //
-         //  Space is required between the fixed keyword and the value.
-         //  Complain when it is missing, but carry on regardless.
-         //
-         if (fReaderMgr->skippedSpace())
-             fReaderMgr->skipPastSpaces();
-         else
-             fScanner->emitError(XMLErrs::ExpectedWhitespace);
-     }
-     toFill.setDefaultType(isFixed ? XMLAttDef::Fixed : XMLAttDef::Default);
-     //
-     //  Either way a value has to follow. If it cannot be scanned an error
-     //  is emitted, and whatever ended up in the buffer is stored anyway so
-     //  that scanning can continue.
-     //
-     XMLBufBid bbDefault(fBufMgr);
-     const bool gotValue = scanAttValue(toFill.getFullName(), bbDefault.getBuffer(), toFill.getType());
-     if (!gotValue)
-         fScanner->emitError(XMLErrs::ExpectedDefAttrDecl);
-     toFill.setValue(bbDefault.getRawBuffer());
- }
- //
- // This method handles the high level logic of scanning the DOCType
- // declaration. This kicks off both the scanning of the internal subset and
- // the scanning of the external subset, if any.
- //
- // When we get here the '<!DOCTYPE' part has already been scanned, which is
- // what told us that we had a doc type decl to parse.
- //
- // If reuseGrammar is true, the root element is looked up in the existing
- // grammar before a new decl is created, an internal subset causes a
- // RuntimeException to be thrown, and the external subset is not scanned.
- //
- void DTDScanner::scanDocTypeDecl(const bool reuseGrammar)
- {
- // There must be some space after DOCTYPE
- if (!fReaderMgr->skipPastSpaces())
- {
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- // Just skip the Doctype declaration and return
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- // Get a buffer for the root element
- XMLBufBid bbRootName(fBufMgr);
- //
- // Get a name from the input, which should be the name of the root
- // element of the upcoming content.
- //
- fReaderMgr->getName(bbRootName.getBuffer());
- if (bbRootName.isEmpty())
- {
- fScanner->emitError(XMLErrs::NoRootElemInDOCTYPE);
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- //
- // Store the root element name for later check
- //
- fScanner->setRootElemName(bbRootName.getRawBuffer());
- //
- // This element obviously is not going to exist in the element decl
- // pool yet, but we need to call docTypeDecl. So force it into
- // the element decl pool, marked as being there because it was in
- // the DOCTYPE. Later, when its declared, the status will be updated.
- //
- // Only do this unconditionally if we are not reusing the grammar! If we
- // are reusing, then look it up first and only create it when it is not
- // already there.
- //
- DTDElementDecl* rootDecl;
- // Only takes charge of rootDecl in the one case below where the new decl
- // is not put into the grammar
- Janitor<DTDElementDecl> janSrc(0);
- if (reuseGrammar)
- {
- // Local alias of fDTDGrammar through the Grammar base type (despite
- // the 'f' prefix this is not a member)
- Grammar* fGrammar = fDTDGrammar;
- if (fGrammar->getGrammarType() == Grammar::DTDGrammarType) {
- rootDecl = (DTDElementDecl*) fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bbRootName.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
- if (rootDecl)
- fDTDGrammar->setRootElemId(rootDecl->getId());
- else {
- rootDecl = new DTDElementDecl(bbRootName.getRawBuffer(), fEmptyNamespaceId);
- rootDecl->setCreateReason(DTDElementDecl::AsRootElem);
- rootDecl->setExternalElemDeclaration(isReadingExternalEntity());
- fDTDGrammar->setRootElemId(fDTDGrammar->putElemDecl(rootDecl));
- }
- }
- else {
- // Not a DTD grammar: the decl is not added to the grammar, so hand
- // it to the janitor instead
- rootDecl = new DTDElementDecl(bbRootName.getRawBuffer(), fEmptyNamespaceId);
- rootDecl->setCreateReason(DTDElementDecl::AsRootElem);
- rootDecl->setExternalElemDeclaration(isReadingExternalEntity());
- janSrc.reset(rootDecl);
- }
- }
- else
- {
- rootDecl = new DTDElementDecl(bbRootName.getRawBuffer(), fEmptyNamespaceId);
- rootDecl->setCreateReason(DTDElementDecl::AsRootElem);
- rootDecl->setExternalElemDeclaration(isReadingExternalEntity());
- fDTDGrammar->setRootElemId(fDTDGrammar->putElemDecl(rootDecl));
- }
- // Skip any spaces after the name
- fReaderMgr->skipPastSpaces();
- //
- // And now if we are looking at a >, then we are done. It is not
- // required to have an internal or external subset, though why you
- // would not escapes me.
- //
- if (fReaderMgr->skippedChar(chCloseAngle)) {
- //
- // If we have a doc type handler, call the doctype event. There are
- // no ids and no internal subset in this case.
- //
- if (fDocTypeHandler)
- fDocTypeHandler->doctypeDecl(*rootDecl, 0, 0, false);
- return;
- }
- //
- // There is either an internal or an external subset. Unless the grammar
- // is being reused, the automatic validation scheme turns validation on
- // at this point.
- //
- if(!reuseGrammar) {
- if (fScanner->getValidationScheme() == XMLScanner::Val_Auto)
- fScanner->setDoValidation(true, false);
- }
- bool hasIntSubset = false;
- bool hasExtSubset = false;
- XMLCh* sysId = 0;
- XMLCh* pubId = 0;
- //
- // If the next character is '[' then we have no external subset cause
- // there is no system id, just the opening character of the internal
- // subset. Else, has to be an id.
- //
- // Just look at the next char, don't eat it.
- if (fReaderMgr->peekNextChar() == chOpenSquare)
- {
- hasIntSubset = true;
- }
- else
- {
- // Indicate we have an external subset
- hasExtSubset = true;
- fScanner->setHasNoDTD(false);
- // Get buffers for the ids
- XMLBufBid bbPubId(fBufMgr);
- XMLBufBid bbSysId(fBufMgr);
- // Get the external subset id
- if (!scanId(bbPubId.getBuffer(), bbSysId.getBuffer(), IDType_External))
- {
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- //
- // Get copies of the ids we got, since the buffers go away at the end
- // of this block
- //
- pubId = XMLString::replicate(bbPubId.getRawBuffer());
- sysId = XMLString::replicate(bbSysId.getRawBuffer());
- // Skip spaces and check again for the opening of an internal subset
- fReaderMgr->skipPastSpaces();
- // Just look at the next char, don't eat it.
- if (fReaderMgr->peekNextChar() == chOpenSquare) {
- hasIntSubset = true;
- }
- }
- // Ensure that the ids get cleaned up, if they got allocated
- ArrayJanitor<XMLCh> janSysId(sysId);
- ArrayJanitor<XMLCh> janPubId(pubId);
- //
- // If we have a doc type handler, call the doctype event. The ids are
- // null when there is no external subset.
- //
- if (fDocTypeHandler)
- fDocTypeHandler->doctypeDecl(*rootDecl, pubId, sysId, hasIntSubset);
- //
- // Ok, if we had an internal subset, we are looking at the [ character
- // (it was only peeked above) and need to parse that subset first.
- //
- if (hasIntSubset)
- {
- // Eat the opening square bracket
- fReaderMgr->getNextChar();
- // We can't have any internal subset if we are reusing the grammar
- if (reuseGrammar)
- ThrowXML(RuntimeException, XMLExcepts::Val_CantHaveIntSS);
- // Indicate we are in the internal subset now
- FlagJanitor<bool> janContentFlag(&fInternalSubset, true);
- //
- // And try to scan the internal subset. If we fail, try to recover
- // by skipping forward to the close angle and returning.
- //
- if (!scanInternalSubset())
- {
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- //
- // Do a sanity check that some expanded PE did not propagate out of
- // the doctype. This could happen if it was terminated early by bad
- // syntax.
- //
- if (fReaderMgr->getReaderDepth() > 1)
- {
- fScanner->emitError(XMLErrs::PEPropogated);
- // Ask the reader manager to pop back down to the main level
- fReaderMgr->cleanStackBackTo(1);
- }
- fReaderMgr->skipPastSpaces();
- }
- // And that should leave us at the closing > of the DOCTYPE line
- if (!fReaderMgr->skippedChar(chCloseAngle))
- {
- //
- // Do a special check for the common scenario of an extra ] char at
- // the end. This is easy to recover from.
- //
- if (fReaderMgr->skippedChar(chCloseSquare)
- && fReaderMgr->skippedChar(chCloseAngle))
- {
- fScanner->emitError(XMLErrs::ExtraCloseSquare);
- }
- else
- {
- fScanner->emitError(XMLErrs::UnterminatedDOCTYPE);
- fReaderMgr->skipPastChar(chCloseAngle);
- }
- }
- //
- // If we had an external subset, then we need to deal with that one
- // next. If we are reusing the grammar, then don't scan it.
- //
- if (hasExtSubset && !reuseGrammar)
- {
- // Indicate we are in the external subset now
- FlagJanitor<bool> janContentFlag(&fInternalSubset, false);
- // And now create a reader to read this entity
- InputSource* srcUsed;
- XMLReader* reader = fReaderMgr->createReader
- (
- sysId
- , pubId
- , false
- , XMLReader::RefFrom_NonLiteral
- , XMLReader::Type_General
- , XMLReader::Source_External
- , srcUsed
- );
- //
- // Put a janitor on the input source. Note that this janSrc is a new
- // object local to this block, hiding the root decl janitor declared
- // at the top of the method.
- //
- Janitor<InputSource> janSrc(srcUsed);
- //
- // If it failed then throw an exception
- //
- if (!reader)
- ThrowXML1(RuntimeException, XMLExcepts::Gen_CouldNotOpenDTD, srcUsed->getSystemId());
- //
- // In order to make the processing work consistently, we have to
- // make this look like an external entity. So create an entity
- // decl and fill it in and push it with the reader, as happens
- // with an external entity. Put a janitor on it to ensure it gets
- // cleaned up. The reader manager does not adopt them.
- //
- const XMLCh gDTDStr[] = { chLatin_D, chLatin_T, chLatin_D , chNull };
- DTDEntityDecl* declDTD = new DTDEntityDecl(gDTDStr);
- declDTD->setSystemId(sysId);
- Janitor<DTDEntityDecl> janDecl(declDTD);
- // Mark this one as a throw at end
- reader->setThrowAtEnd(true);
- // And push it onto the stack, with its pseudo name
- fReaderMgr->pushReader(reader, declDTD);
- // Tell it its not in an include section
- scanExtSubsetDecl(false);
- }
- }
- //
- //  This is called after seeing '<!ELEMENT' which indicates that an element
- //  markup is starting. This guy scans the rest of it and adds it to the
- //  element decl pool if it has not already been declared.
- //
- //  If the element has already been declared, the first declaration is
- //  kept. The new one is still parsed, but into the shared dummy decl
- //  (fDumElemDecl), and it is reported to the doc type handler as ignored.
- //
- void DTDScanner::scanElementDecl()
- {
-     //
-     //  Space is legal (required actually) here so check for a PE ref. If
-     //  we don't get our whitespace, then issue an error, but try to keep
-     //  going.
-     //
-     if (!checkForPERef(true, false, true))
-         fScanner->emitError(XMLErrs::ExpectedWhitespace);
-     // Get a buffer for the element name and scan in the name
-     XMLBufBid bbName(fBufMgr);
-     if (!fReaderMgr->getName(bbName.getBuffer()))
-     {
-         fScanner->emitError(XMLErrs::ExpectedElementName);
-         fReaderMgr->skipPastChar(chCloseAngle);
-         return;
-     }
-     // Look this guy up in the element decl pool
-     DTDElementDecl* decl = (DTDElementDecl*) fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bbName.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
-     //
-     //  If it does not exist, then we need to create it. If it does and
-     //  its marked as declared, then that's an error, but we still need to
-     //  scan over the content model so use the dummy declaration that the
-     //  parsing code can fill in. If it exists but is not yet declared (it
-     //  was faulted in earlier), we fill in the existing one.
-     //
-     if (decl)
-     {
-         if (decl->isDeclared())
-         {
-             if (fScanner->getDoValidation())
-                 fScanner->getValidator()->emitError(XMLValid::ElementAlreadyExists, bbName.getRawBuffer());
-             // Create the dummy on first use, else just rename it
-             if (!fDumElemDecl)
-                 fDumElemDecl = new DTDElementDecl(bbName.getRawBuffer(), fEmptyNamespaceId);
-             else
-                 fDumElemDecl->setElementName(bbName.getRawBuffer(),fEmptyNamespaceId);
-             //
-             //  Parse into the dummy from here on, so that the duplicate
-             //  does not overwrite the declaration that is already in the
-             //  pool and so that it is flagged as ignored below.
-             //
-             decl = fDumElemDecl;
-         }
-     }
-     else
-     {
-         //
-         //  Create the new empty declaration to fill in and put it into
-         //  the decl pool.
-         //
-         decl = new DTDElementDecl(bbName.getRawBuffer(), fEmptyNamespaceId);
-         fDTDGrammar->putElemDecl(decl);
-     }
-     // Set a flag for whether we will ignore this one
-     const bool isIgnored = (decl == fDumElemDecl);
-     // Mark this one if being externally declared
-     decl->setExternalElemDeclaration(isReadingExternalEntity());
-     // Mark this one as being declared
-     decl->setCreateReason(XMLElementDecl::Declared);
-     // Another check for a PE ref, with at least required whitespace
-     if (!checkForPERef(true, false, true))
-         fScanner->emitError(XMLErrs::ExpectedWhitespace);
-     // And now scan the content model for this guy.
-     if (!scanContentSpec(*decl))
-     {
-         fReaderMgr->skipPastChar(chCloseAngle);
-         return;
-     }
-     // Another check for a PE ref, but we don't require whitespace here
-     checkForPERef(false, false, true);
-     // And we should have the ending angle bracket
-     if (!fReaderMgr->skippedChar(chCloseAngle))
-     {
-         fScanner->emitError(XMLErrs::UnterminatedElementDecl, bbName.getRawBuffer());
-         fReaderMgr->skipPastChar(chCloseAngle);
-     }
-     //
-     //  If we have a doc type handler tell it about the new element decl,
-     //  along with whether its one that is being ignored because the
-     //  element had already been declared.
-     //
-     if (fDocTypeHandler)
-         fDocTypeHandler->elementDecl(*decl, isIgnored);
- }
- //
- //  This method scans a general or parameter entity declaration; the
- //  '<!ENTITY' part is expected to have been seen by the caller already.
- //  A new entity is added to the general or the parameter entity pool as
- //  appropriate, and its definition is scanned by scanEntityDef().
- //
- //  If an entity of that name is already in the pool, the existing one
- //  takes precedence: the new declaration is parsed into the shared dummy
- //  decl (fDumEntityDecl) and reported to the doc type handler as ignored.
- //
- void DTDScanner::scanEntityDecl()
- {
-     //
-     //  Space is required here, but we cannot check for a PE Ref since
-     //  there could be a legal (no-ref) percent sign here. Since any
-     //  entity that ended here would be illegal, we just skip spaces
-     //  and then check for a percent.
-     //
-     if (!fReaderMgr->lookingAtSpace())
-         fScanner->emitError(XMLErrs::ExpectedWhitespace);
-     else
-         fReaderMgr->skipPastSpaces();
-     // A percent sign here (which this eats) makes it a PE declaration
-     const bool isPEDecl = fReaderMgr->skippedChar(chPercent);
-     //
-     //  If a PE decl, then check for spaces or a PE ref on the other side
-     //  of the percent. At least spaces are required.
-     //
-     if (isPEDecl)
-     {
-         if (!checkForPERef(true, false, true))
-             fScanner->emitError(XMLErrs::ExpectedWhitespace);
-     }
-     //
-     //  Now lets get a name, which should be the name of the entity. We
-     //  have to get a buffer for this.
-     //
-     XMLBufBid bbName(fBufMgr);
-     if (!fReaderMgr->getName(bbName.getBuffer()))
-     {
-         fScanner->emitError(XMLErrs::ExpectedPEName);
-         fReaderMgr->skipPastChar(chCloseAngle);
-         return;
-     }
-     // If namespaces are enabled, then no colons allowed
-     if (fScanner->getDoNamespaces())
-     {
-         if (XMLString::indexOf(bbName.getRawBuffer(), chColon) != -1)
-             fScanner->emitError(XMLErrs::ColonNotLegalWithNS);
-     }
-     //
-     //  See if this entity already exists. If so, then the existing one
-     //  takes precedence. So we use the local dummy decl to parse into
-     //  and just ignore the results.
-     //
-     DTDEntityDecl* entityDecl;
-     if (isPEDecl)
-         entityDecl = fPEntityDeclPool->getByKey(bbName.getRawBuffer());
-     else
-         entityDecl = fEntityDeclPool->getByKey(bbName.getRawBuffer());
-     if (entityDecl)
-     {
-         if (!fDumEntityDecl)
-             fDumEntityDecl = new DTDEntityDecl;
-         fDumEntityDecl->setName(bbName.getRawBuffer());
-         entityDecl = fDumEntityDecl;
-     }
-     else
-     {
-         // Its not in existence already, then create an entity decl for it
-         entityDecl = new DTDEntityDecl(bbName.getRawBuffer());
-         //
-         //  Set the declaration location. The parameter indicates whether its
-         //  declared in the content/internal subset, so we know whether or not
-         //  its in the external subset.
-         //
-         entityDecl->setDeclaredInIntSubset(fInternalSubset);
-         // Add it to the appropriate entity decl pool
-         if (isPEDecl)
-             fPEntityDeclPool->put(entityDecl);
-         else
-             fEntityDeclPool->put(entityDecl);
-     }
-     // Set a flag that indicates whether we are ignoring this one
-     const bool isIgnored = (entityDecl == fDumEntityDecl);
-     // Set the PE flag on it
-     entityDecl->setIsParameter(isPEDecl);
-     //
-     //  Space is legal (required actually) here so check for a PE ref. If
-     //  we don't get our whitespace, then issue an error, but try to keep
-     //  going.
-     //
-     if (!checkForPERef(true, false, true))
-         fScanner->emitError(XMLErrs::ExpectedWhitespace);
-     //
-     //  Save the hasNoDTD status for Entity Constraint Checking. It is
-     //  cleared while the definition of a PE is scanned and put back to
-     //  what it was afterwards.
-     //
-     bool hasNoDTD = fScanner->getHasNoDTD();
-     if (hasNoDTD && isPEDecl)
-         fScanner->setHasNoDTD(false);
-     // According to the type call the value scanning method
-     if (!scanEntityDef(*entityDecl, isPEDecl))
-     {
-         fReaderMgr->skipPastChar(chCloseAngle);
-         //
-         //  Put the flag back the same way the success path does. Setting
-         //  it unconditionally would wrongly turn it on when it was off
-         //  before this declaration was scanned.
-         //
-         if (hasNoDTD)
-             fScanner->setHasNoDTD(true);
-         fScanner->emitError(XMLErrs::ExpectedEntityValue);
-         return;
-     }
-     if (hasNoDTD)
-         fScanner->setHasNoDTD(true);
-     // Space is legal (but not required) here so check for a PE ref
-     checkForPERef(false, false, true);
-     // And then we have to have the closing angle bracket
-     if (!fReaderMgr->skippedChar(chCloseAngle))
-     {
-         fScanner->emitError(XMLErrs::UnterminatedEntityDecl, entityDecl->getName());
-         fReaderMgr->skipPastChar(chCloseAngle);
-     }
-     //
-     //  If we have a doc type handler, then call it, telling it whether
-     //  this is a PE and whether the declaration is being ignored.
-     //
-     if (fDocTypeHandler)
-         fDocTypeHandler->entityDecl(*entityDecl, isPEDecl, isIgnored);
- }
- //
- //  Scans a character reference or a general entity reference. A character
- //  reference is expanded and handed back directly through firstCh (and
- //  secondCh, which is zero unless scanCharRef() sets it). For a general
- //  entity a reader over its content is pushed onto the reader stack.
- //
- //  The return value tells the caller which of these happened, or that the
- //  scan failed. The escaped flag is set when the returned character came
- //  from a character reference or from a special character entity; it only
- //  means something when the value was returned directly.
- //
- //  NOTE: This is only called when scanning attribute values, so general
- //  entities are always expanded.
- //
- DTDScanner::EntityExpRes
- DTDScanner::scanEntityRef(XMLCh& firstCh, XMLCh& secondCh, bool& escaped)
- {
-     // Start out with nothing escaped and no second character
-     escaped = false;
-     secondCh = 0;
-     // The whole reference has to come from one reader, so note where we are
-     const unsigned int startReader = fReaderMgr->getCurrentReaderNum();
-     // A pound sign makes this a character reference, which we always expand
-     if (fReaderMgr->skippedChar(chPound))
-     {
-         // Get the character(s) it stands for, giving up if that fails
-         if (!scanCharRef(firstCh, secondCh))
-             return EntityExp_Failed;
-         if (startReader != fReaderMgr->getCurrentReaderNum())
-             fScanner->emitError(XMLErrs::PartialMarkupInEntity);
-         // Coming from a char ref, the result counts as escaped
-         escaped = true;
-         return EntityExp_Returned;
-     }
-     // Otherwise the name of a general entity has to follow
-     XMLBufBid bbName(fBufMgr);
-     if (!fReaderMgr->getName(bbName.getBuffer()))
-     {
-         fScanner->emitError(XMLErrs::ExpectedEntityRefName);
-         return EntityExp_Failed;
-     }
-     const XMLCh* const entName = bbName.getRawBuffer();
-     // A missing semi-colon is reported, but we carry on anyway
-     if (!fReaderMgr->skippedChar(chSemiColon))
-         fScanner->emitError(XMLErrs::UnterminatedEntityRef, entName);
-     // Check that the reference did not span readers
-     if (startReader != fReaderMgr->getCurrentReaderNum())
-         fScanner->emitError(XMLErrs::PartialMarkupInEntity);
-     // Find the entity in the general entity pool
-     XMLEntityDecl* decl = fEntityDeclPool->getByKey(entName);
-     if (!decl)
-     {
-         //
-         //  Not declared. Per XML 1.0 Section 4.1 this goes through the
-         //  scanner's error path for a standalone document or one with no
-         //  DTD; otherwise it is reported through the validator, and only
-         //  if validation is on.
-         //
-         if (fScanner->getStandalone() || fScanner->getHasNoDTD())
-             fScanner->emitError(XMLErrs::EntityNotFound, entName);
-         else if (fScanner->getDoValidation())
-             fScanner->getValidator()->emitError(XMLValid::VC_EntityNotFound, entName);
-         return EntityExp_Failed;
-     }
-     //
-     //  In a standalone document the entity has to be declared in the
-     //  internal subset. This is reported when validating, but does not stop us.
-     //
-     if (fScanner->getDoValidation() && fScanner->getStandalone() && !decl->getDeclaredInIntSubset())
-         fScanner->getValidator()->emitError(XMLValid::IllegalRefInStandalone, entName);
-     // Special character entities are returned directly, as escaped
-     if (decl->getIsSpecialChar())
-     {
-         firstCh = decl->getValue()[0];
-         escaped = true;
-         return EntityExp_Returned;
-     }
-     if (!decl->isExternal())
-     {
-         //
-         //  Internal entity: create a reader over a memory stream over the
-         //  entity value. Per the original notes, an encoding is forced here
-         //  (UTF-16) so that the reader does not try to predecode a first
-         //  line looking for an XML/TextDecl.
-         //
-         XMLReader* valueReader = fReaderMgr->createIntEntReader
-         (
-             decl->getName()
-             , XMLReader::RefFrom_NonLiteral
-             , XMLReader::Type_General
-             , decl->getValue()
-             , decl->getValueLen()
-             , false
-         );
-         //
-         //  Pushing the reader makes it the subsequent input. A failure to
-         //  push means the entity is recursive, which is reported. The
-         //  reader will have been discarded in that case, and we report
-         //  the entity as pushed all the same so that scanning goes on.
-         //
-         if (!fReaderMgr->pushReader(valueReader, decl))
-             fScanner->emitError(XMLErrs::RecursiveEntity, decl->getName());
-         return EntityExp_Pushed;
-     }
-     //
-     //  External entity from here on.
-     //  XML 1.0 Section 4.4.4: a reference to an unparsed entity is forbidden.
-     //
-     if (decl->isUnparsed())
-     {
-         fScanner->emitError(XMLErrs::NoUnparsedEntityRefs, entName);
-         return EntityExp_Failed;
-     }
-     //
-     //  XML 1.0 Section 4.4.4: a reference to an external entity in an
-     //  attribute value is forbidden as well, and that is where we are.
-     //  Report it, but still go on to read the entity.
-     //
-     fScanner->emitError(XMLErrs::NoExtRefsInAttValue);
-     InputSource* srcUsed;
-     XMLReader* reader = fReaderMgr->createReader
-     (
-         decl->getSystemId()
-         , decl->getPublicId()
-         , false
-         , XMLReader::RefFrom_NonLiteral
-         , XMLReader::Type_General
-         , XMLReader::Source_External
-         , srcUsed
-     );
-     // The janitor sees to it that the input source is cleaned up on exit
-     Janitor<InputSource> janSrc(srcUsed);
-     // Not getting a reader is raised as an exception
-     if (!reader)
-         ThrowXML1(RuntimeException, XMLExcepts::Gen_CouldNotOpenExtEntity, srcUsed->getSystemId());
-     // A failed push means a recursive expansion, which is an error
-     if (!fReaderMgr->pushReader(reader, decl))
-     {
-         fScanner->emitError(XMLErrs::RecursiveEntity, decl->getName());
-         return EntityExp_Failed;
-     }
-     // If the entity starts with the XML string, a text decl is parsed first
-     if (fScanner->checkXMLDecl(true))
-         scanTextDecl();
-     return EntityExp_Pushed;
- }
- //
- // This method will scan a quoted literal of an entity value. It has to
- // deal with replacement of PE references; however, since this is a DTD
- // scanner, all such entity literals are in entity decls and therefore
- // general entities are not expanded.
- //
- bool DTDScanner::scanEntityLiteral(XMLBuffer& toFill, const bool isPE)
- {
- toFill.reset();
- // Get the next char which must be a single or double quote
- XMLCh quoteCh;
- if (!fReaderMgr->skipIfQuote(quoteCh))
- return false;
- // Get a buffer for pulling in entity names when we see GE refs
- XMLBufBid bbName(fBufMgr);
- XMLBuffer& nameBuf = bbName.getBuffer();
- // Remember the current reader
- const unsigned int orgReader = fReaderMgr->getCurrentReaderNum();
- //
- // Loop until we see the ending quote character, handling any references
- // in the process.
- //
- XMLCh nextCh;
- XMLCh secondCh = 0;
- bool gotLeadingSurrogate = false;
- while (true)
- {
- // Get the second char if we have one, else get another
- if (secondCh)
- {
- nextCh = secondCh;
- secondCh = 0;
- }
- else
- {
- nextCh = fReaderMgr->getNextChar();
- }
- //
- // Watch specifically for EOF and issue a more meaningful error
- // if that occurs (since an unterminated quoted char can cause
- // this easily.)
- //
- if (!nextCh)
- {
- fScanner->emitError(XMLErrs::UnterminatedEntityLiteral);
- ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
- }
- //
- // Break out on our terminating quote char when we are back in the
- // same reader. Otherwise, we might trigger on a nested quote char
- // in an expanded entity.
- //
- if ((nextCh == quoteCh)
- && (fReaderMgr->getCurrentReaderNum() == orgReader))
- {
- break;
- }
- if (nextCh == chPercent)
- {
- //
- // Put the PE's value on the reader stack and then jump back
- // to the top to start processing it. The parameter indicates
- // that it should not scan the reference's content as an external
- // subset.
- //
- expandPERef(false, true, true);
- continue;
- }
- //
- // Ok, now that all the other special stuff is checked, we can
- // look for a general entity. In here, we cannot have a naked &
- // and will only expand numerical char refs or the intrinsic char
- // refs. Others will be left alone.
- //
- if (nextCh == chAmpersand)
- {
- //
- // Here, we only expand numeric char refs, but not any general
- // entities. However, the stupid XML spec requires that we check
- // and make sure it does refer to a general entity if its not
- // a char ref (i.e. no naked '&' chars.)
- //
- if (fReaderMgr->skippedChar(chPound))
- {
- // If it failed, then just jump back to the top and try to pick up
- if (!scanCharRef(nextCh, secondCh))
- {
- gotLeadingSurrogate = false;
- continue;
- }
- }
- else
- {
- if (!fReaderMgr->getName(nameBuf))
- {
- fScanner->emitError(XMLErrs::ExpectedEntityRefName);
- }
- else
- {
- //
- // Since we are not expanding any of this, we have to
- // put the amp and name into the target buffer as data.
- //
- toFill.append(chAmpersand);
- toFill.append(nameBuf.getRawBuffer());
- // Make sure we skipped a trailing semicolon
- if (!fReaderMgr->skippedChar(chSemiColon))
- {
- fScanner->emitError
- (
- XMLErrs::UnterminatedEntityRef
- , nameBuf.getRawBuffer()
- );
- }
- // And make the new character the semicolon
- nextCh = chSemiColon;
- }
- // Either way here we reset the surrogate flag
- gotLeadingSurrogate = false;
- }
- }
- if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
- {
- if (gotLeadingSurrogate)
- fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
- else
- gotLeadingSurrogate = true;
- }
- else
- {
- if (gotLeadingSurrogate)
- {
- if ((nextCh < 0xDC00) && (nextCh > 0xDFFF))
- fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
- }
- else if (!XMLReader::isXMLChar(nextCh))
- {
- XMLCh tmpBuf[9];
- XMLString::binToText
- (
- nextCh
- , tmpBuf
- , 8
- , 16
- );
- fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
- fReaderMgr->skipPastChar(quoteCh);
- return false;
- }
- gotLeadingSurrogate = false;
- }
- // Looks ok, so add it to the literal
- toFill.append(nextCh);
- }
- //
- // If we got here and did not get back to the original reader level,
- // then we propogated some entity out of the literal, so issue an
- // error, but don't fail.
- //
- if (fReaderMgr->getCurrentReaderNum() != orgReader && fScanner->getDoValidation())
- fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
- return true;
- }
- //
- // This method is called after the entity name has been scanned, and any
- // PE referenced following the name is handled. The passed decl will be
- // filled in with the info scanned.
- //
- bool DTDScanner::scanEntityDef(DTDEntityDecl& decl, const bool isPEDecl)
- {
- // Its got to be an entity literal
- if (fReaderMgr->lookingAtChar(chSingleQuote)
- || fReaderMgr->lookingAtChar(chDoubleQuote))
- {
- // Get a buffer for the literal
- XMLBufBid bbValue(fBufMgr);
- if (!scanEntityLiteral(bbValue.getBuffer(), isPEDecl))
- return false;
- // Set it on the entity decl
- decl.setValue(bbValue.getRawBuffer());
- return true;
- }
- //
- // Its got to be an external entity, so there must be an external id.
- // Get buffers for them and scan an external id into them.
- //
- XMLBufBid bbPubId(fBufMgr);
- XMLBufBid bbSysId(fBufMgr);
- if (!scanId(bbPubId.getBuffer(), bbSysId.getBuffer(), IDType_External))
- return false;
- // Fill in the id fields of the decl with the info we got
- decl.setPublicId(bbPubId.getRawBuffer());
- decl.setSystemId(bbSysId.getRawBuffer());
- // If its a PE decl, we are done
- bool gotSpaces = checkForPERef(false, false, true);
- if (isPEDecl)
- {
- //
- // Check for a common error here. NDATA is not allowed for PEs
- // so check for the NDATA string. If found give a nice meaningful
- // error and continue parsing to eat the NDATA text.
- //
- if (gotSpaces)
- {
- if (fReaderMgr->skippedString(XMLUni::fgNDATAString))
- fScanner->emitError(XMLErrs::NDATANotValidForPE);
- }
- else
- {
- return true;
- }
- }
- // If looking at close angle now, we are done
- if (fReaderMgr->lookingAtChar(chCloseAngle))
- return true;
- // Else we had to have seem the whitespace
- if (!gotSpaces)
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- // We now have to see a notation data string
- if (!fReaderMgr->skippedString(XMLUni::fgNDATAString))
- fScanner->emitError(XMLErrs::ExpectedNDATA);
- // Space is required here, but try to go on if not
- if (!checkForPERef(false, false, true))
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- // Get a name
- XMLBufBid bbName(fBufMgr);
- if (!fReaderMgr->getName(bbName.getBuffer()))
- {
- fScanner->emitError(XMLErrs::ExpectedNotationName);
- return false;
- }
- // Set the decl's notation name
- decl.setNotationName(bbName.getRawBuffer());
- return true;
- }
- //
- // This method is called after an attribute decl name or a notation decl has
- // been scanned and then an opening parenthesis was see, indicating the list
- // of values. It scans the enumeration values and creates a single string
- // which has a single space between each value.
- //
- // The terminating close paren ends this scan.
- //
- bool DTDScanner::scanEnumeration( const DTDAttDef& attDef
- , XMLBuffer& toFill
- , const bool notation)
- {
- // Reset the passed buffer
- toFill.reset();
- // Check for PE ref but don't require space
- checkForPERef(false, false, true);
- // If this is a notation, we need an opening paren
- if (notation)
- {
- if (!fReaderMgr->skippedChar(chOpenParen))
- fScanner->emitError(XMLErrs::ExpectedOpenParen);
- }
- // We need a local buffer to use as well
- XMLBufBid bbTmp(fBufMgr);
- while (true)
- {
- // Space is allowed here for either type so check for PE ref
- checkForPERef(false, false, true);
- // And then get either a name or a name token
- bool success;
- if (notation)
- success = fReaderMgr->getName(bbTmp.getBuffer());
- else
- success = fReaderMgr->getNameToken(bbTmp.getBuffer());
- if (!success)
- {
- fScanner->emitError
- (
- XMLErrs::ExpectedEnumValue
- , attDef.getFullName()
- );
- return false;
- }
- // Append this value to the target value
- toFill.append(bbTmp.getRawBuffer(), bbTmp.getLen());
- // Space is allowed here for either type so check for PE ref
- checkForPERef(false, false, true);
- // Check for the terminating paren
- if (fReaderMgr->skippedChar(chCloseParen))
- break;
- // And append a space separator
- toFill.append(chSpace);
- // Check for the pipe character separator
- if (!fReaderMgr->skippedChar(chPipe))
- {
- fScanner->emitError(XMLErrs::ExpectedEnumSepOrParen);
- return false;
- }
- }
- return true;
- }
- bool DTDScanner::scanEq()
- {
- fReaderMgr->skipPastSpaces();
- if (fReaderMgr->skippedChar(chEqual))
- {
- fReaderMgr->skipPastSpaces();
- return true;
- }
- return false;
- }
- //
- // This method is called when an external entity reference is seen in the
- // DTD or an external DTD subset is encountered, and their contents pushed
- // onto the reader stack. This method will scan that contents.
- //
- void DTDScanner::scanExtSubsetDecl(const bool inIncludeSect)
- {
- bool bAcceptDecl = !inIncludeSect;
- // Get a buffer for whitespace
- XMLBufBid bbSpace(fBufMgr);
- //
- // If we have a doc type handler and we are not being called recursively
- // to handle an include section, tell it the ext subset starts
- //
- if (fDocTypeHandler && !inIncludeSect)
- fDocTypeHandler->startExtSubset();
- //
- // We have to play a trick here if the current entity we are parsing
- // is a PE. Because the spooling code will put out a whitespace before
- // and after an expanded PE if its being scanned outside the context of
- // a literal entity, this will confuse this external subset code.
- //
- // So, we see if that is what is happening and, if so, eat the single
- // space, a check for the <?xml string. If we find it, we parse that
- // markup right now and put the space back.
- //
- if (fReaderMgr->isScanningPERefOutOfLiteral())
- {
- if (fReaderMgr->skippedSpace())
- {
- if (fScanner->checkXMLDecl(true))
- {
- scanTextDecl();
- bAcceptDecl = false;
- // <TBD> Figure out how to do this
- // fReaderMgr->unGet(chSpace);
- }
- }
- }
- // Get the current reader number
- const unsigned int orgReader = fReaderMgr->getCurrentReaderNum();
- //
- // Loop until we hit the end of the external subset entity. Note that
- // we use a double loop here in order to avoid the overhead of doing
- // the exception setup/teardown work on every loop.
- //
- bool inMarkup = false;
- bool inCharData = false;
- while (true)
- {
- try
- {
- while (true)
- {
- const XMLCh nextCh = fReaderMgr->peekNextChar();
- if (nextCh == chOpenAngle)
- {
- // Get the reader we started this on
- const unsigned int orgReader = fReaderMgr->getCurrentReaderNum();
- //
- // Now scan the markup. Set the flag so that we will know that
- // we were in markup if an end of entity exception occurs.
- //
- fReaderMgr->getNextChar();
- inMarkup = true;
- scanMarkupDecl(bAcceptDecl);
- inMarkup = false;
- //
- // And see if we got back to the same level. If not, then its
- // a partial markup error.
- //
- if (fReaderMgr->getCurrentReaderNum() != orgReader && fScanner->getDoValidation())
- fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
- }
- else if (XMLReader::isWhitespace(nextCh))
- {
- //
- // If we have a doc type handler, and advanced callbacks are
- // enabled, then gather up whitespace and call back. Otherwise
- // just skip whitespaces.
- //
- if (fDocTypeHandler)
- {
- inCharData = true;
- fReaderMgr->getSpaces(bbSpace.getBuffer());
- inCharData = false;
- fDocTypeHandler->doctypeWhitespace
- (
- bbSpace.getRawBuffer()
- , bbSpace.getLen()
- );
- }
- else
- {
- //
- // If we hit an end of entity in the middle of white
- // space, that's fine. We'll just come back in here
- // again on the next round and skip some more.
- //
- fReaderMgr->skipPastSpaces();
- }
- }
- else if (nextCh == chPercent)
- {
- //
- // Expand (and scan if external) the reference value. Tell
- // it to throw an end of entity exception at the end of the
- // entity.
- //
- fReaderMgr->getNextChar();
- expandPERef(true, false, false, true);
- }
- else if (inIncludeSect && (nextCh == chCloseSquare))
- {
- //
- // Its the end of a conditional include section. So scan it and
- // decrement the include depth counter.
- //
- fReaderMgr->getNextChar();
- if (!fReaderMgr->skippedChar(chCloseSquare))
- {
- fScanner->emitError(XMLErrs::ExpectedEndOfConditional);
- fReaderMgr->skipPastChar(chCloseAngle);
- }
- else if (!fReaderMgr->skippedChar(chCloseAngle))
- {
- fScanner->emitError(XMLErrs::ExpectedEndOfConditional);
- fReaderMgr->skipPastChar(chCloseAngle);
- }
- return;
- }
- else
- {
- fReaderMgr->getNextChar();
- if (!XMLReader::isXMLChar(nextCh))
- {
- XMLCh tmpBuf[9];
- XMLString::binToText
- (
- nextCh
- , tmpBuf
- , 8
- , 16
- );
- fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
- }
- else
- {
- fScanner->emitError(XMLErrs::InvalidDocumentStructure);
- }
- // Try to get realigned
- static const XMLCh toSkip[] =
- {
- chPercent, chCloseSquare, chOpenAngle, chNull
- };
- fReaderMgr->skipUntilInOrWS(toSkip);
- }
- bAcceptDecl = false;
- }
- }
- catch(const EndOfEntityException& toCatch)
- {
- //
- // If the external entity ended while we were in markup, then that's
- // a partial markup error.
- //
- if (inMarkup)
- {
- fScanner->emitError(XMLErrs::PartialMarkupInEntity);
- inMarkup = false;
- }
- // If we were in char data, then send what we got
- if (inCharData)
- {
- // Send what we got, then rethrow
- if (fDocTypeHandler)
- {
- fDocTypeHandler->doctypeWhitespace
- (
- bbSpace.getRawBuffer()
- , bbSpace.getLen()
- );
- }
- inCharData = false;
- }
- //
- // If the entity that just ended was the entity that we started
- // on, then this is the end of the external subset.
- //
- if (orgReader == toCatch.getReaderNum())
- break;
- }
- }
- // If we have a doc type handler, tell it the ext subset ends
- if (fDocTypeHandler)
- fDocTypeHandler->endExtSubset();
- }
- //
- // This method will scan for an id, either public or external.
- //
- //
- // [75] ExternalID ::= 'SYSTEM' S SystemLiteral
- // | 'PUBLIC' S PubidLiteral S SystemLiteral
- // [83] PublicID ::= 'PUBLIC' S PubidLiteral
- //
- bool DTDScanner::scanId( XMLBuffer& pubIdToFill
- , XMLBuffer& sysIdToFill
- , const IDTypes whatKind)
- {
- // Clean out both return buffers
- pubIdToFill.reset();
- sysIdToFill.reset();
- //
- // Check first for the system id first. If we find it, and system id
- // is one of the legal values, then lets try to scan it.
- //
- // 'SYSTEM' S SystemLiteral
- if (fReaderMgr->skippedString(XMLUni::fgSysIDString))
- {
- // If they were looking for a public id, then we failed
- if (whatKind == IDType_Public)
- {
- fScanner->emitError(XMLErrs::ExpectedPublicId);
- return false;
- }
- // We must skip spaces
- if (!fReaderMgr->skipPastSpaces())
- {
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- return false;
- }
- // Get the system literal value
- return scanSystemLiteral(sysIdToFill);
- }
- // Now scan for public id
- // 'PUBLIC' S PubidLiteral S SystemLiteral
- // or
- // 'PUBLIC' S PubidLiteral
- // If we don't have any public id string => Error
- if (!fReaderMgr->skippedString(XMLUni::fgPubIDString)) {
- fScanner->emitError(XMLErrs::ExpectedSystemOrPublicId);
- return false;
- }
- //
- // So following this we must have whitespace, a public literal, whitespace,
- // and a system literal.
- //
- if (!fReaderMgr->skipPastSpaces())
- {
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- //
- // Just in case, if they just forgot the whitespace but the next char
- // is a single or double quote, then keep going.
- //
- const XMLCh chPeek = fReaderMgr->peekNextChar();
- if ((chPeek != chDoubleQuote) && (chPeek != chSingleQuote))
- return false;
- }
- if (!scanPublicLiteral(pubIdToFill))
- return false;
- // If they wanted a public id, then this is all
- if (whatKind == IDType_Public)
- return true;
- // check if there is any space follows
- bool hasSpace = fReaderMgr->skipPastSpaces();
- //
- // In order to recover best here we need to see if
- // the next thing is a quote or not
- //
- const XMLCh chPeek = fReaderMgr->peekNextChar();
- const bool bIsQuote = ((chPeek == chDoubleQuote)
- || (chPeek == chSingleQuote));
- if (!hasSpace)
- {
- if (whatKind == IDType_External)
- {
- //
- // If its an external Id, then we need to see the system id.
- // So, emit the error. But, if the next char is a quote, don't
- // give up since its probably going to work. The user just
- // missed the separating space. Otherwise, fail.
- //
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- if (!bIsQuote)
- return false;
- }
- else
- {
- //
- // We can legally return here. But, if the next char is a quote,
- // then that's probably not what was desired, since its probably
- // just that space was forgotten and there really is a system
- // id to follow.
- //
- // So treat it like missing whitespace if so and keep going.
- // Else, just return success.
- //
- if (bIsQuote)
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- else
- return true;
- }
- }
- if (bIsQuote) {
- // there is a quote coming, scan the system literal
- if (!scanSystemLiteral(sysIdToFill))
- return false;
- }
- else {
- // no quote, if expecting exteral id, this is an error
- if (whatKind == IDType_External)
- fScanner->emitError(XMLErrs::ExpectedQuotedString);
- }
- return true;
- }
- //
- // This method will scan the contents of an ignored section. It assumes that
- // we already are in the body, i.e. we've seen <![IGNORE[ at this point. So
- // we have to just scan until we see a matching ]]> closing markup.
- //
- void DTDScanner::scanIgnoredSection()
- {
- //
- // Depth starts at one because we are already in one section and want
- // to parse until we hit its end.
- //
- unsigned long depth = 1;
- while (true)
- {
- const XMLCh nextCh = fReaderMgr->getNextChar();
- if (!nextCh)
- ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
- if (nextCh == chOpenAngle)
- {
- if (fReaderMgr->skippedChar(chBang)
- && fReaderMgr->skippedChar(chOpenSquare))
- {
- depth++;
- }
- }
- else if (nextCh == chCloseSquare)
- {
- if (fReaderMgr->skippedChar(chCloseSquare))
- {
- while (fReaderMgr->skippedChar(chCloseSquare))
- {
- // Do nothing, just skip them
- }
- if (fReaderMgr->skippedChar(chCloseAngle))
- {
- depth--;
- if (!depth)
- break;
- }
- }
- }
- else if (!XMLReader::isXMLChar(nextCh))
- {
- XMLCh tmpBuf[9];
- XMLString::binToText
- (
- nextCh
- , tmpBuf
- , 8
- , 16
- );
- fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
- }
- }
- }
- //
- // This method scans the entire internal subset. All we can have here is
- // decl markup, and PE references. The expanded PE references must contain
- // whole markup, so we don't have to worry about their content at this
- // level. We just scan them, expand them, push them, and parse their content
- // right there, via the expandERef() method.
- //
- bool DTDScanner::scanInternalSubset()
- {
- // If we have a doc type handler, tell it the internal subset starts
- if (fDocTypeHandler)
- fDocTypeHandler->startIntSubset();
- // Get a buffer for whitespace
- XMLBufBid bbSpace(fBufMgr);
- bool noErrors = true;
- while (true)
- {
- const XMLCh nextCh = fReaderMgr->peekNextChar();
- //
- // If we get an end of file marker, just unget it and return a
- // failure status. The caller will then see the end of file and
- // faill out correctly.
- //
- if (!nextCh)
- return false;
- // Watch for the end of internal subset marker
- if (nextCh == chCloseSquare)
- {
- fReaderMgr->getNextChar();
- break;
- }
- if (nextCh == chPercent)
- {
- //
- // Expand (and scan if external) the reference value. Tell
- // it to set the reader to cause an end of entity exception
- // when this reader dies, which is what the scanExtSubset
- // method wants (who is called to scan this.)
- //
- fReaderMgr->getNextChar();
- expandPERef(true, false, false, true);
- }
- else if (nextCh == chOpenAngle)
- {
- // Remember this reader before we start the scan
- const unsigned int orgReader = fReaderMgr->getCurrentReaderNum();
- // And scan this markup
- fReaderMgr->getNextChar();
- scanMarkupDecl(false);
- // If we did not get back to entry level, then partial markup
- if (fReaderMgr->getCurrentReaderNum() != orgReader && fScanner->getDoValidation())
- fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
- }
- else if (XMLReader::isWhitespace(nextCh))
- {
- //
- // IF we are doing advanced callbacks and have a doc type
- // handler, then get the whitespace and call the doc type
- // handler with it. Otherwise, just skip whitespace.
- //
- if (fDocTypeHandler)
- {
- fReaderMgr->getSpaces(bbSpace.getBuffer());
- fDocTypeHandler->doctypeWhitespace
- (
- bbSpace.getRawBuffer()
- , bbSpace.getLen()
- );
- }
- else
- {
- fReaderMgr->skipPastSpaces();
- }
- }
- else
- {
- // Not valid, so emit an error
- XMLCh tmpBuf[9];
- XMLString::binToText
- (
- fReaderMgr->getNextChar()
- , tmpBuf
- , 8
- , 16
- );
- fScanner->emitError
- (
- XMLErrs::InvalidCharacterInIntSubset
- , tmpBuf
- );
- //
- // If an '>', then probably an abnormally terminated
- // internal subset so just return.
- //
- if (nextCh == chCloseAngle)
- {
- noErrors = false;
- break;
- }
- //
- // Otherwise, try to sync back up by scanning forward for
- // a reasonable start character.
- //
- static const XMLCh toSkip[] =
- {
- chPercent, chCloseSquare, chOpenAngle, chNull
- };
- fReaderMgr->skipUntilInOrWS(toSkip);
- }
- }
- // If we have a doc type handler, tell it the internal subset ends
- if (fDocTypeHandler)
- fDocTypeHandler->endIntSubset();
- return noErrors;
- }
- //
- // This method is called once we see a < in the input of an int/ext subset,
- // which indicates the start of some sort of markup.
- //
- void DTDScanner::scanMarkupDecl(const bool parseTextDecl)
- {
- //
- // We only have two valid first characters here. One is a ! which opens
- // some markup decl. The other is a ?, which could begin either a PI
- // or a text decl. If parseTextDecl is false, we cannot accept a text
- // decl.
- //
- const XMLCh nextCh = fReaderMgr->getNextChar();
- if (nextCh == chBang)
- {
- if (fReaderMgr->skippedChar(chDash))
- {
- if (fReaderMgr->skippedChar(chDash))
- {
- scanComment();
- }
- else
- {
- fScanner->emitError(XMLErrs::CommentsMustStartWith);
- fReaderMgr->skipPastChar(chCloseAngle);
- }
- }
- else if (fReaderMgr->skippedChar(chOpenSquare))
- {
- //
- // Its a conditional section. This is only valid in the external
- // subset, so issue an error if we aren't there.
- //
- if (fInternalSubset)
- {
- fScanner->emitError(XMLErrs::ConditionalSectInIntSubset);
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- // A PE ref can happen here, but space is not required
- checkForPERef(false, false, true);
- if (fReaderMgr->skippedString(XMLUni::fgIncludeString))
- {
- checkForPERef(false, false, true);
- // Check for the following open square bracket
- if (!fReaderMgr->skippedChar(chOpenSquare))
- fScanner->emitError(XMLErrs::ExpectedINCLUDEBracket);
- checkForPERef(false, false, true);
- //
- // Recurse back to the ext subset call again, telling it its
- // in an include section.
- //
- scanExtSubsetDecl(true);
- }
- else if (fReaderMgr->skippedString(XMLUni::fgIgnoreString))
- {
- checkForPERef(false, false, true);
- // Check for the following open square bracket
- if (!fReaderMgr->skippedChar(chOpenSquare))
- fScanner->emitError(XMLErrs::ExpectedINCLUDEBracket);
- // And scan over the ignored part
- scanIgnoredSection();
- }
- else
- {
- fScanner->emitError(XMLErrs::ExpectedIncOrIgn);
- fReaderMgr->skipPastChar(chCloseAngle);
- }
- }
- else if (fReaderMgr->skippedString(XMLUni::fgAttListString))
- {
- scanAttListDecl();
- }
- else if (fReaderMgr->skippedString(XMLUni::fgElemString))
- {
- scanElementDecl();
- }
- else if (fReaderMgr->skippedString(XMLUni::fgEntityString))
- {
- scanEntityDecl();
- }
- else if (fReaderMgr->skippedString(XMLUni::fgNotationString))
- {
- scanNotationDecl();
- }
- else
- {
- fScanner->emitError(XMLErrs::ExpectedMarkupDecl);
- fReaderMgr->skipPastChar(chCloseAngle);
- }
- }
- else if (nextCh == chQuestion)
- {
- // It could be a PI or the XML declaration. Check for Decl
- if (fScanner->checkXMLDecl(false))
- {
- // If we are not accepting text decls, its an error
- if (parseTextDecl)
- {
- scanTextDecl();
- }
- else
- {
- // Emit the error and skip past this markup
- fScanner->emitError(XMLErrs::TextDeclNotLegalHere);
- fReaderMgr->skipPastChar(chCloseAngle);
- }
- }
- else
- {
- // It has to be a PI
- scanPI();
- }
- }
- else
- {
- // Can't be valid so emit error and try to skip past end of this decl
- fScanner->emitError(XMLErrs::ExpectedMarkupDecl);
- fReaderMgr->skipPastChar(chCloseAngle);
- }
- }
- //
- // This method is called for a mixed model element's content mode. We've
- // already scanned past the '(PCDATA' part by the time we get here. So
- // everything else is element names separated by | characters until we
- // hit the end. The passed element decl's content model is filled in with
- // the information found.
- //
- bool DTDScanner::scanMixed(DTDElementDecl& toFill)
- {
- //
- // The terminating star is only required if there is something more
- // than (PCDATA).
- //
- bool starRequired = false;
- // Get a buffer to be used below to get element names
- XMLBufBid bbName(fBufMgr);
- XMLBuffer& nameBuf = bbName.getBuffer();
- //
- // Create an initial content spec node. Its just a leaf node with a
- // PCDATA element id. This current node pointer will be pushed down the
- // tree as we go.
- //
- ContentSpecNode* curNode =
- new ContentSpecNode(new QName(XMLUni::fgZeroLenString,
- XMLUni::fgZeroLenString,
- XMLElementDecl::fgPCDataElemId),
- false);
- //
- // Set the initial leaf as the temporary head. If we hit the first choice
- // node, it will be set up here. When done, this is the node that's set
- // as the content spec for the element.
- //
- ContentSpecNode* headNode = curNode;
- // Remember the original node so we can sense the first choice node
- ContentSpecNode* orgNode = curNode;
- //
- // We just loop around, getting the | character at the top and then
- // looking for the next element name. We keep up with the last node
- // and add each new one to its right node.
- //
- while (true)
- {
- //
- // First of all we check for some grunt work details of skipping
- // whitespace, expand PE refs, and catching invalid reps.
- //
- if (fReaderMgr->lookingAtChar(chPercent))
- {
- // Expand it and continue
- checkForPERef(false, false, true);
- }
- else if (fReaderMgr->skippedChar(chAsterisk))
- {
- //
- // Tell them they can't have reps in mixed model, but eat
- // it and keep going if we are allowed to.
- //
- fScanner->emitError(XMLErrs::NoRepInMixed);
- }
- else if (fReaderMgr->skippedSpace())
- {
- // Spaces are ok at this point, just eat them and continue
- fReaderMgr->skipPastSpaces();
- }
- else
- {
- if (!fReaderMgr->skippedChar(chPipe))
- {
- // Has to be the closing paren now.
- if (!fReaderMgr->skippedChar(chCloseParen))
- {
- fScanner->emitError(XMLErrs::UnterminatedContentModel);
- delete headNode;
- return false;
- }
- if (!fReaderMgr->skippedChar(chAsterisk) && starRequired)
- fScanner->emitError(XMLErrs::ExpectedAsterisk);
- //
- // Create a zero or more node and make the original head
- // node its first child.
- //
- headNode = new ContentSpecNode
- (
- ContentSpecNode::ZeroOrMore
- , headNode
- , 0
- );
- // Store the head node as the content spec of the element.
- toFill.setContentSpec(headNode);
- break;
- }
- // Its more than just a PCDATA, so an ending star will be required now
- starRequired = true;
- // Space is legal here so check for a PE ref, but don't require space
- checkForPERef(false, false, true);
- // Get a name token
- if (!fReaderMgr->getName(nameBuf))
- {
- fScanner->emitError(XMLErrs::ExpectedElementName);
- delete headNode;
- return false;
- }
- //
- // Create a leaf node for it. If we can find the element id for
- // this element, then use it. Else, we have to fault in an element
- // decl, marked as created because of being in a content model.
- //
- XMLElementDecl* decl = fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, nameBuf.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
- if (!decl)
- {
- decl = new DTDElementDecl(nameBuf.getRawBuffer(), fEmptyNamespaceId);
- decl->setCreateReason(XMLElementDecl::InContentModel);
- decl->setExternalElemDeclaration(isReadingExternalEntity());
- fDTDGrammar->putElemDecl(decl);
- }
- //
- // If the current node is the original node, this is the first choice
- // node, so create an initial choice node with the current node and
- // the new element id. Store this as the head node.
- //
- // Otherwise, we have to steal the right node of the previous choice
- // and weave in another choice node there, which has the old choice
- // as its left and the new leaf as its right.
- //
- if (curNode == orgNode)
- {
- curNode = new ContentSpecNode
- (
- ContentSpecNode::Choice
- , curNode
- , new ContentSpecNode(decl->getElementName())
- );
- // Remember the top node
- headNode = curNode;
- }
- else
- {
- ContentSpecNode* oldRight = curNode->orphanSecond();
- curNode->setSecond
- (
- new ContentSpecNode
- (
- ContentSpecNode::Choice
- , oldRight
- , new ContentSpecNode(decl->getElementName())
- )
- );
- // Make the new right node the current node
- curNode = curNode->getSecond();
- }
- }
- }
- return true;
- }
- //
- // This method is called when we see a '<!NOTATION' string while scanning
- // markup decl. It parses out the notation and its id and stores a new
- // notation decl object in the notation decl pool.
- //
- void DTDScanner::scanNotationDecl()
- {
- // Space is required here so check for a PE ref, and require space
- if (!checkForPERef(true, false, true))
- {
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- //
- // And now we get a name, which is the name of the notation. Get a
- // buffer for the name.
- //
- XMLBufBid bbName(fBufMgr);
- if (!fReaderMgr->getName(bbName.getBuffer()))
- {
- fScanner->emitError(XMLErrs::ExpectedNotationName);
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- // If namespaces are enabled, then no colons allowed
- if (fScanner->getDoNamespaces())
- {
- if (XMLString::indexOf(bbName.getRawBuffer(), chColon) != -1)
- fScanner->emitError(XMLErrs::ColonNotLegalWithNS);
- }
- // Space is required here so check for a PE ref, and require space
- if (!checkForPERef(true, false, true))
- {
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- //
- // And scan an external or public id. We need buffers to use for both
- // of these.
- //
- XMLBufBid bbPubId(fBufMgr);
- XMLBufBid bbSysId(fBufMgr);
- if (!scanId(bbPubId.getBuffer(), bbSysId.getBuffer(), IDType_Either))
- {
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- // We can have an optional space or PE ref here
- checkForPERef(false, false, true);
- //
- // See if it already exists. If so, add it to the notatino decl pool.
- // Otherwise, if advanced callbacks are on, create a temp one and
- // call out for that one.
- //
- XMLNotationDecl* decl = fDTDGrammar->getNotationDecl(bbName.getRawBuffer());
- bool isIgnoring = (decl != 0);
- if (isIgnoring)
- {
- fScanner->emitError(XMLErrs::NotationAlreadyExists, bbName.getRawBuffer());
- }
- else
- {
- // Fill in a new notation declaration and add it to the pool
- decl = new XMLNotationDecl
- (
- bbName.getRawBuffer()
- , bbPubId.getRawBuffer()
- , bbSysId.getRawBuffer()
- );
- fDTDGrammar->putNotationDecl(decl);
- }
- //
- // If we have a document type handler, then tell it about this. If we
- // are ignoring it, only call out if advanced callbacks are enabled.
- //
- if (fDocTypeHandler)
- {
- fDocTypeHandler->notationDecl
- (
- *decl
- , isIgnoring
- );
- }
- // And one more optional space or PE ref
- checkForPERef(false, false, true);
- // And skip the terminating bracket
- if (!fReaderMgr->skippedChar(chCloseAngle))
- fScanner->emitError(XMLErrs::UnterminatedNotationDecl);
- }
- //
- //  Scans a processing instruction found while scanning the DTD and, when a
- //  document type handler is installed, reports it through doctypePI().
- //
- //  On entry the "<?" has already been consumed, so the PI name is expected
- //  next. Whatever text follows the name, up to the closing "?>", is
- //  collected and handed to the handler together with the name.
- //
- void DTDScanner::scanPI()
- {
-     //
-     //  Whitespace between "<?" and the name is an error. Report it and
-     //  step over the spaces so that the name can still be scanned.
-     //
-     if (fReaderMgr->lookingAtSpace())
-     {
-         fScanner->emitError(XMLErrs::PINameExpected);
-         fReaderMgr->skipPastSpaces();
-     }
-     // Scan the name into a buffer obtained through fBufMgr
-     XMLBufBid nameBid(fBufMgr);
-     if (!fReaderMgr->getName(nameBid.getBuffer()))
-     {
-         fScanner->emitError(XMLErrs::PINameExpected);
-         fReaderMgr->skipPastChar(chCloseAngle);
-         return;
-     }
-     const XMLCh* const piName = nameBid.getRawBuffer();
-     // A name that compareIString() matches against fgXMLString is reported
-     if (XMLString::compareIString(piName, XMLUni::fgXMLString) == 0)
-         fScanner->emitError(XMLErrs::NoPIStartsWithXML);
-     // With namespace processing enabled, a colon in the name is an error
-     if (fScanner->getDoNamespaces() && (XMLString::indexOf(piName, chColon) != -1))
-         fScanner->emitError(XMLErrs::ColonNotLegalWithNS);
-     //
-     //  Buffer for the text that follows the name. It stays empty when the
-     //  name is immediately followed by the terminator.
-     //
-     XMLBufBid dataBid(fBufMgr);
-     if (!fReaderMgr->skippedSpace())
-     {
-         //
-         //  No space after the name, so the only legal continuation is
-         //  "?>". The '>' is only tested once the '?' has been consumed.
-         //
-         if (!fReaderMgr->skippedChar(chQuestion)
-         ||  !fReaderMgr->skippedChar(chCloseAngle))
-         {
-             fScanner->emitError(XMLErrs::UnterminatedPI);
-             fReaderMgr->skipPastChar(chCloseAngle);
-             return;
-         }
-     }
-     else
-     {
-         // Drop the remaining leading spaces, then gather text up to "?>"
-         fReaderMgr->skipPastSpaces();
-         for (;;)
-         {
-             const XMLCh curCh = fReaderMgr->getNextChar();
-             // A zero character here means the input ended inside the PI
-             if (curCh == 0)
-             {
-                 fScanner->emitError(XMLErrs::UnterminatedPI);
-                 ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
-             }
-             //
-             //  A '?' directly followed by '>' ends the PI. A '?' that is
-             //  not followed by '>' falls through and is kept as text.
-             //
-             if ((curCh == chQuestion) && fReaderMgr->skippedChar(chCloseAngle))
-                 break;
-             // Report illegal characters (as hex text) but keep going
-             if (!XMLReader::isXMLChar(curCh))
-             {
-                 XMLCh hexText[9];
-                 XMLString::binToText(curCh, hexText, 8, 16);
-                 fScanner->emitError(XMLErrs::InvalidCharacter, hexText);
-             }
-             dataBid.append(curCh);
-         }
-     }
-     // Fetch the collected text only now, after all appends are done
-     const XMLCh* const piData = dataBid.getRawBuffer();
-     // Report the PI if anybody is listening
-     if (fDocTypeHandler)
-         fDocTypeHandler->doctypePI(piName, piData);
- }
- //
- //  Scans a quoted public id literal into toFill. The surrounding quotes
- //  are consumed but not stored. Characters that fail isPublicIdChar() are
- //  reported and still stored, so that scanning can carry on.
- //
- //  Returns false (after emitting ExpectedQuotedString) if the input does
- //  not start with a quote, and true once the closing quote has been eaten.
- //  Running out of input before the closing quote throws
- //  UnexpectedEOFException.
- //
- bool DTDScanner::scanPublicLiteral(XMLBuffer& toFill)
- {
-     toFill.reset();
-     // The literal must open with a single or double quote
-     XMLCh quoteCh;
-     if (!fReaderMgr->skipIfQuote(quoteCh))
-     {
-         fScanner->emitError(XMLErrs::ExpectedQuotedString);
-         return false;
-     }
-     for (;;)
-     {
-         const XMLCh curCh = fReaderMgr->getNextChar();
-         // A zero character means the input ended inside the literal
-         if (curCh == 0)
-         {
-             ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
-         }
-         // The matching quote closes the literal
-         if (curCh == quoteCh)
-             return true;
-         //
-         //  Complain about characters that are not legal in a public id,
-         //  passing the offending value as hex text, but store them anyway.
-         //
-         if (!XMLReader::isPublicIdChar(curCh))
-         {
-             XMLCh hexText[9];
-             XMLString::binToText(curCh, hexText, 8, 16);
-             fScanner->emitError(XMLErrs::InvalidPublicIdChar, hexText);
-         }
-         toFill.append(curCh);
-     }
- }
- //
- //  Scans a quoted system literal into toFill. Scanning starts at the
- //  opening quote and stops after the matching closing quote has been
- //  consumed; the quotes themselves are not stored and the content is
- //  copied without any validation.
- //
- //  Returns false (after emitting ExpectedQuotedString) if the input does
- //  not start with a quote, true otherwise. Running out of input before
- //  the closing quote throws UnexpectedEOFException.
- //
- bool DTDScanner::scanSystemLiteral(XMLBuffer& toFill)
- {
-     toFill.reset();
-     // The literal must open with a single or double quote
-     XMLCh quoteCh;
-     if (!fReaderMgr->skipIfQuote(quoteCh))
-     {
-         fScanner->emitError(XMLErrs::ExpectedQuotedString);
-         return false;
-     }
-     for (;;)
-     {
-         const XMLCh curCh = fReaderMgr->getNextChar();
-         // A zero character means the input ended inside the literal
-         if (curCh == 0)
-         {
-             ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
-         }
-         // The matching quote closes the literal
-         if (curCh == quoteCh)
-             return true;
-         toFill.append(curCh);
-     }
- }
- //
- //  Scans a text declaration, which may be the first line of an external
- //  entity or of the external subset.
- //
- //  On entry the leading "<?xml" has been consumed. What remains is an
- //  optional version, a mandatory encoding, and the closing "?>":
- //
- //  [77] TextDecl::= '<?xml' VersionInfo? EncodingDecl S? '?>'
- //
- //  Any failure before the terminator check emits an error, skips past the
- //  next '>' and returns. A bad terminator is reported, but the handler
- //  callback and the reader encoding update still take place.
- //
- void DTDScanner::scanTextDecl()
- {
-     // Step over whitespace in front of the (optional) version info
-     fReaderMgr->skipPastSpaces();
-     XMLBufBid versionBid(fBufMgr);
-     if (fReaderMgr->skippedString(XMLUni::fgVersionString))
-     {
-         // The version keyword must be followed by an equal sign ...
-         if (!scanEq())
-         {
-             fScanner->emitError(XMLErrs::ExpectedEqSign);
-             fReaderMgr->skipPastChar(chCloseAngle);
-             return;
-         }
-         // ... and then by the quoted version value
-         if (!getQuotedString(versionBid.getBuffer()))
-         {
-             fScanner->emitError(XMLErrs::BadXMLVersion);
-             fReaderMgr->skipPastChar(chCloseAngle);
-             return;
-         }
-         // An unsupported version is reported, but scanning continues
-         if (XMLString::compareString(versionBid.getRawBuffer(), XMLUni::fgSupportedVersion) != 0)
-             fScanner->emitError(XMLErrs::UnsupportedXMLVersion, versionBid.getRawBuffer());
-     }
-     //
-     //  The encoding declaration is not optional: when a text declaration
-     //  is present it has to name an encoding.
-     //
-     XMLBufBid encodingBid(fBufMgr);
-     fReaderMgr->skipPastSpaces();
-     if (!fReaderMgr->skippedString(XMLUni::fgEncodingString))
-     {
-         fScanner->emitError(XMLErrs::EncodingRequired);
-         fReaderMgr->skipPastChar(chCloseAngle);
-         return;
-     }
-     // The encoding keyword must be followed by an equal sign
-     if (!scanEq())
-     {
-         fScanner->emitError(XMLErrs::ExpectedEqSign);
-         fReaderMgr->skipPastChar(chCloseAngle);
-         return;
-     }
-     //
-     //  Then comes the quoted encoding name. The return value is not
-     //  consulted; an empty buffer is what counts as a failure here.
-     //
-     getQuotedString(encodingBid.getBuffer());
-     if (encodingBid.isEmpty())
-     {
-         fScanner->emitError(XMLErrs::BadXMLEncoding, encodingBid.getRawBuffer());
-         fReaderMgr->skipPastChar(chCloseAngle);
-         return;
-     }
-     //
-     //  Optional whitespace and then "?>". The '>' is only tested once the
-     //  '?' has been consumed. A bad terminator is reported and skipped
-     //  past, after which processing carries on below.
-     //
-     fReaderMgr->skipPastSpaces();
-     if (!fReaderMgr->skippedChar(chQuestion)
-     ||  !fReaderMgr->skippedChar(chCloseAngle))
-     {
-         fScanner->emitError(XMLErrs::UnterminatedXMLDecl);
-         fReaderMgr->skipPastChar(chCloseAngle);
-     }
-     // Pass the version and encoding on to the doc type handler, if any
-     if (fDocTypeHandler)
-         fDocTypeHandler->TextDecl(versionBid.getRawBuffer(), encodingBid.getRawBuffer());
-     //
-     //  Tell the current reader which encoding was declared. If the reader
-     //  refuses it, report the contradiction.
-     //
-     if (!encodingBid.isEmpty()
-     &&  !fReaderMgr->getCurrentReader()->setEncoding(encodingBid.getRawBuffer()))
-     {
-         fScanner->emitError(XMLErrs::ContradictoryEncoding, encodingBid.getRawBuffer());
-     }
- }