词法分析

开发平台：

Visual C++

DTDScanner.cpp：源码内容

/*
* The Apache Software License, Version 1.1
*
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Xerces" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation, and was
* Business Machines, Inc., http://www.ibm.com . For more information
* on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/*
* $Log: DTDScanner.cpp,v $
* Revision 1.27 2003/05/18 14:02:06 knoaman
* Memory manager implementation: pass per instance manager.
*
* Revision 1.26 2003/05/16 21:43:19 knoaman
* Memory manager implementation: Modify constructors to pass in the memory manager.
*
* Revision 1.25 2003/05/15 18:54:50 knoaman
* Partial implementation of the configurable memory manager.
*
* Revision 1.24 2003/03/10 15:28:07 tng
* XML1.0 Errata E38
*
* Revision 1.23 2003/02/05 22:07:09 tng
* [Bug 3111] Problem with LexicalHandler::startDTD() and LexicalHandler::endDTD().
*
* Revision 1.22 2003/01/20 22:01:38 tng
* Need to check text decl when expanding PE
*
* Revision 1.21 2003/01/16 21:30:14 tng
* [Bug 16151] Memory leak in DTDScanner with ill-formed DTD declaration. Fix by David Bertoni.
*
* Revision 1.20 2002/12/24 16:12:19 tng
* For performance reason, move the character check to scancharref.
*
* Revision 1.19 2002/12/20 22:10:47 tng
* XML 1.1
*
* Revision 1.18 2002/12/18 14:17:55 gareth
* Fix to bug #13438. When you eant a vector that calls delete[] on its members you should use RefArrayVectorOf.
*
* Revision 1.17 2002/12/04 02:47:25 knoaman
* scanner re-organization.
*
* Revision 1.16 2002/11/14 22:34:11 tng
* [Bug 14265] Access violation with Null systemId/publicId in DTDScanner
*
* Revision 1.15 2002/11/05 21:40:36 tng
* Oasis test fix:
* 1. Should check if content model allow character for CDataSection case
* 2. Should check partial markup in entity for INCLUDE and IGNORE scenario
* 3. If standalone is yes, reference to entity where its declaration is external is a well-formness fatal error (XML 1.0 Section 4.1)
* If standalone is yes, reference to parameter entity where is declaration is external is a validity constraint (XML 1.0 Section 2.9)
* 4. XML 1.0 Section 2.8 Partial markup in parameter entity reference.
* If it is a complete declaration, partial markup is a fatal error.
*
* Revision 1.14 2002/11/04 14:50:40 tng
* C++ Namespace Support.
*
* Revision 1.13 2002/09/24 20:10:30 tng
* Performance: use XMLString::equals instead of XMLString::compareString
*
* Revision 1.12 2002/08/22 21:05:29 tng
* [Bug 7475] Xerces-C++ reports validation error with Docbook.
*
* Revision 1.11 2002/08/22 20:26:01 tng
* [Bug 7512] Wrong error message created .
*
* Revision 1.10 2002/08/22 19:29:13 tng
* [Bug 11448] DomCount has problems with XHTML1.1 DTD.
*
* Revision 1.9 2002/08/19 14:40:31 tng
* Fix: public id / system id in entity decl should be null if empty
*
* Revision 1.8 2002/07/26 13:33:44 knoaman
* Public/System id for notations should be stored as NULL if missing.
*
* Revision 1.7 2002/07/11 18:39:48 knoaman
* Access entities through the DTDGrammar instead of the scanner.
*
* Revision 1.6 2002/06/06 20:36:33 tng
* Fix: Valid encoding name is not checked in scanning Text Decl
*
* Revision 1.5 2002/05/30 16:17:19 tng
* Add feature to optionally ignore external DTD.
*
* Revision 1.4 2002/05/03 14:51:16 peiyongz
* Bug#8769: UMR detected by memory tool - patch from Kenneth Palsson
*
* Revision 1.3 2002/02/28 22:34:36 peiyongz
* Bug#2717: patch to Unterminated INCLUDE section causes infinite loop with setExitOnFirstFatalError(false)
*
* Revision 1.2 2002/02/26 21:06:53 knoaman
* Create ZeroOrOne node only if needed.
*
* Revision 1.1.1.1 2002/02/01 22:22:44 peiyongz
* sane_include
*
* Revision 1.25 2002/01/24 16:30:50 tng
* [Bug 3111] Problem with LexicalHandler::startDTD() and LexicalHandler::endDTD() .
*
* Revision 1.24 2001/12/17 15:39:14 knoaman
* Fix for surrogate pair support.
*
* Revision 1.23 2001/12/14 20:21:37 knoaman
* Add surrogate support to comments and processing instrunctions.
*
* Revision 1.22 2001/12/06 17:51:18 tng
* Performance Enhancement. The ContentSpecNode constructor always copied the QName
* that was passed to it. Added a second constructor that allows the QName to be just assigned, not copied.
* That was because there are some cases in which a temporary QName was constructed, passed to ContentSpecNode, and then deleted.
* There were examples of that in TraverseSchema and DTDScanner.
* By Henry Zongaro.
*
* Revision 1.21 2001/11/13 13:27:28 tng
* Move root element check to XMLScanner.
*
* Revision 1.20 2001/09/05 20:49:10 knoaman
* Fix for complexTypes with mixed content model.
*
* Revision 1.19 2001/08/02 16:54:39 tng
* Reset some Scanner flags in scanReset().
*
* Revision 1.18 2001/07/13 16:57:11 tng
* ScanId fix.
*
* Revision 1.17 2001/07/12 20:10:18 tng
* Partial Markup in Parameter Entity is validity constraint and thus should be just error, not fatal error.
*
* Revision 1.16 2001/07/10 21:09:39 tng
* Give proper error messsage when scanning external id.
*
* Revision 1.15 2001/07/10 20:56:17 tng
* Should check the first char of PI Target Name.
*
* Revision 1.14 2001/07/09 13:42:20 tng
* Partial Markup in Parameter Entity is validity constraint and thus should be just error, not fatal error.
*
* Revision 1.13 2001/07/05 14:05:29 tng
* Encoding String must present for external entity text decl.
*
* Revision 1.12 2001/07/05 13:12:19 tng
* Standalone checking is validity constraint and thus should be just error, not fatal error:
*
* Revision 1.11 2001/06/25 14:39:54 knoaman
* Fix bug #965 - submitted by Matt Lovett
*
* Revision 1.10 2001/06/22 12:42:33 tng
* [Bug 2257] 1.5 thinks a <?xml-stylesheet ...> tag is a <?xml ...> tag
*
* Revision 1.9 2001/06/21 14:25:53 knoaman
* Fix for bug 1946
*
* Revision 1.8 2001/06/04 13:25:50 tng
* the start tag "<?xml" could be followed by (#x20 | #x9 | #xD | #xA)+. Fixed by Pei Yong Zhang.
*
* Revision 1.7 2001/05/28 20:54:06 tng
* Schema: allocate a fDTDValidator, fSchemaValidator explicitly to avoid wrong cast
*
* Revision 1.6 2001/05/11 13:27:09 tng
* Copyright update.
*
* Revision 1.5 2001/05/03 20:34:36 tng
* Schema: SchemaValidator update
*
* Revision 1.4 2001/04/23 18:54:35 tng
* Reuse grammar should allow users to use any stored element decl as root. Fixed by Erik Rydgren.
*
* Revision 1.3 2001/04/19 18:17:21 tng
* Schema: SchemaValidator update, and use QName in Content Model
*
* Revision 1.2 2001/03/30 16:35:17 tng
* Schema: Whitespace normalization.
*
* Revision 1.1 2001/03/21 21:56:20 tng
* Schema: Add Schema Grammar, Schema Validator, and split the DTDValidator into DTDValidator, DTDScanner, and DTDGrammar.
*
*/
// ---------------------------------------------------------------------------
// Includes
// ---------------------------------------------------------------------------
#include <xercesc/util/BinMemInputStream.hpp>
#include <xercesc/util/FlagJanitor.hpp>
#include <xercesc/util/Janitor.hpp>
#include <xercesc/util/XMLUniDefs.hpp>
#include <xercesc/util/UnexpectedEOFException.hpp>
#include <xercesc/sax/InputSource.hpp>
#include <xercesc/framework/XMLDocumentHandler.hpp>
#include <xercesc/framework/XMLEntityHandler.hpp>
#include <xercesc/framework/XMLValidator.hpp>
#include <xercesc/internal/EndOfEntityException.hpp>
#include <xercesc/internal/XMLScanner.hpp>
#include <xercesc/validators/common/ContentSpecNode.hpp>
#include <xercesc/validators/common/MixedContentModel.hpp>
#include <xercesc/validators/DTD/DTDEntityDecl.hpp>
#include <xercesc/validators/DTD/DocTypeHandler.hpp>
#include <xercesc/validators/DTD/DTDScanner.hpp>
XERCES_CPP_NAMESPACE_BEGIN
// ---------------------------------------------------------------------------
// Local methods
// ---------------------------------------------------------------------------
//
// This method automates the grunt work of looking at a char and seeing if it's
// a repetition suffix. If so, it creates a new correct rep node and wraps
// the passed node in it. Otherwise, it returns the previous node.
//
static ContentSpecNode* makeRepNode(const XMLCh testCh,
                                    ContentSpecNode* const prevNode,
                                    MemoryManager* const manager)
{
    // Classify the candidate suffix character once.
    const bool isZeroOrOne  = (testCh == chQuestion);
    const bool isOneOrMore  = (testCh == chPlus);
    const bool isZeroOrMore = (testCh == chAsterisk);

    // Not a repetition suffix at all: hand the incoming node back untouched.
    if (!isZeroOrOne && !isOneOrMore && !isZeroOrMore)
        return prevNode;

    // Wrap the incoming node in a single new node of the matching
    // repetition type, allocated from the supplied memory manager.
    return new (manager) ContentSpecNode
    (
        isZeroOrOne ? ContentSpecNode::ZeroOrOne
                    : (isOneOrMore ? ContentSpecNode::OneOrMore
                                   : ContentSpecNode::ZeroOrMore)
        , prevNode
        , 0
        , true
        , true
        , manager
    );
}
// ---------------------------------------------------------------------------
// DTDScanner: Constructors and Destructor
// ---------------------------------------------------------------------------
//
//  Builds a scanner that parses into the given DTD grammar.
//
//  dtdGrammar      The grammar that scanned declarations are stored into.
//  docTypeHandler  Optional callback target for DTD events; may be null
//                  (every use in this file is guarded by a null check).
//  manager         Memory manager used for every allocation made here and
//                  later by the scanner.
//
//  The owning scanner, reader manager and buffer manager are NOT known yet;
//  they are supplied afterwards through setScannerInfo().
//
DTDScanner::DTDScanner( DTDGrammar* dtdGrammar
                      , DocTypeHandler* const docTypeHandler
                      , MemoryManager* const manager) :
    fMemoryManager(manager)
    , fDocTypeHandler(docTypeHandler)
    , fDumAttDef(0)
    , fDumElemDecl(0)
    , fDumEntityDecl(0)
    , fInternalSubset(false)
    , fNextAttrId(1)
    , fDTDGrammar(dtdGrammar)
    , fPEntityDeclPool(0)
    , fDocTypeReaderId(0)
{
    //
    //  These members are only assigned by setScannerInfo(). Give them a
    //  defined value here so that they are never left indeterminate between
    //  construction and that call. They are assigned in the body rather than
    //  in the initializer list because the declaration order of the members
    //  is not visible from this file.
    //
    fScanner = 0;
    fReaderMgr = 0;
    fBufMgr = 0;
    fEmptyNamespaceId = 0;

    // Pool holding the parameter entity declarations seen while scanning
    fPEntityDeclPool = new (fMemoryManager) NameIdPool<DTDEntityDecl>(109, 128, fMemoryManager);
}
//
//  Destroys the scanner and deletes the four objects below. The owning
//  scanner, reader manager and buffer manager handed to setScannerInfo()
//  are only referenced and are deliberately not deleted here.
//
DTDScanner::~DTDScanner()
{
// Dummy attribute definition, created on demand in scanAttDef(); still 0
// if no duplicate attribute was ever seen.
delete fDumAttDef;
// Both are initialised to 0 by the constructor; where they get allocated
// is not visible in this part of the file.
delete fDumElemDecl;
delete fDumEntityDecl;
// Parameter entity declaration pool allocated by the constructor
delete fPEntityDeclPool;
}
// -----------------------------------------------------------------------
// Setter methods
// -----------------------------------------------------------------------
//
//  Wires the DTD scanner up to the objects it works with.
//
//  owningScanner   The scanner driving this DTD scan.
//  readerMgr       Reader manager that all input is pulled from.
//  bufMgr          Buffer manager used to obtain scratch buffers.
//
//  Also caches the empty namespace id (0 when namespaces are off) and the
//  number of the reader that is current at the time of the call.
//
void DTDScanner::setScannerInfo(XMLScanner* const owningScanner
                              , ReaderMgr* const readerMgr
                              , XMLBufferMgr* const bufMgr)
{
    // All three are borrowed; this class never deletes them
    fBufMgr = bufMgr;
    fReaderMgr = readerMgr;
    fScanner = owningScanner;

    // Only ask the scanner for its empty namespace id when namespace
    // processing is actually enabled; otherwise the id stays at zero.
    fEmptyNamespaceId = 0;
    if (fScanner->getDoNamespaces())
        fEmptyNamespaceId = fScanner->getEmptyNamespaceId();

    // Remember which reader was active when we were hooked up
    fDocTypeReaderId = fReaderMgr->getCurrentReaderNum();
}
// ---------------------------------------------------------------------------
// DTDScanner: Private scanning methods
// ---------------------------------------------------------------------------
bool DTDScanner::checkForPERef(const bool spaceRequired
, const bool inLiteral
, const bool inMarkup
, const bool throwAtEndExt)
{
bool gotSpace = false;
//
// See if we have any spaces up front. If so, then skip them and set
// the gotSpaces flag.
//
if (fReaderMgr->skippedSpace())
{
fReaderMgr->skipPastSpaces();
gotSpace = true;
}
// If the next char is a percent, then expand the PERef
if (!fReaderMgr->skippedChar(chPercent))
return gotSpace;
while (true)
{
if (!expandPERef(false, inLiteral, inMarkup, throwAtEndExt))
fScanner->emitError(XMLErrs::ExpectedEntityRefName);
// And skip any more spaces in the expanded value
if (fReaderMgr->skippedSpace())
{
fReaderMgr->skipPastSpaces();
gotSpace = true;
}
if (!fReaderMgr->skippedChar(chPercent))
break;
}
return gotSpace;
}
bool DTDScanner::expandPERef( const bool scanExternal
, const bool inLiteral
, const bool inMarkup
, const bool throwEndOfExt)
{
fScanner->setHasNoDTD(false);
XMLBufBid bbName(fBufMgr);
//
// If we are in the internal subset and in markup, then this is
// an error but we go ahead and do it anyway.
//
if (fInternalSubset && inMarkup)
fScanner->emitError(XMLErrs::PERefInMarkupInIntSubset);
if (!fReaderMgr->getName(bbName.getBuffer()))
{
fScanner->emitError(XMLErrs::ExpectedPEName);
// Skip the semicolon if that's what we ended up on
fReaderMgr->skippedChar(chSemiColon);
return false;
}
// If no terminating semicolon, emit an error but try to keep going
if (!fReaderMgr->skippedChar(chSemiColon))
fScanner->emitError(XMLErrs::UnterminatedEntityRef, bbName.getRawBuffer());
//
// Look it up in the PE decl pool and see if it exists. If not, just
// emit an error and continue.
//
XMLEntityDecl* decl = fPEntityDeclPool->getByKey(bbName.getRawBuffer());
if (!decl)
{
// XML 1.0 Section 4.1
if (fScanner->getStandalone()) {
// no need to check fScanner->fHasNoDTD which is for sure false
// since we are in expandPERef already
fScanner->emitError(XMLErrs::EntityNotFound, bbName.getRawBuffer());
}
else {
if (fScanner->getDoValidation())
fScanner->getValidator()->emitError(XMLValid::VC_EntityNotFound, bbName.getRawBuffer());
}
return false;
}
//
// XML 1.0 Section 2.9
// If we are a standalone document, then it has to have been declared
// in the internal subset. Keep going though.
//
if (fScanner->getDoValidation() && fScanner->getStandalone() && !decl->getDeclaredInIntSubset())
fScanner->getValidator()->emitError(XMLValid::VC_IllegalRefInStandalone, bbName.getRawBuffer());
//
// Okee dokee, we found it. So create either a memory stream with
// the entity value contents, or a file stream if its an external
// entity.
//
if (decl->isExternal())
{
// And now create a reader to read this entity
InputSource* srcUsed;
XMLReader* reader = fReaderMgr->createReader
(
decl->getBaseURI()
, decl->getSystemId()
, decl->getPublicId()
, false
, inLiteral ? XMLReader::RefFrom_Literal : XMLReader::RefFrom_NonLiteral
, XMLReader::Type_PE
, XMLReader::Source_External
, srcUsed
);
// Put a janitor on the source so its cleaned up on exit
Janitor<InputSource> janSrc(srcUsed);
// If the creation failed then throw an exception
if (!reader)
ThrowXML1(RuntimeException, XMLExcepts::Gen_CouldNotOpenExtEntity, srcUsed->getSystemId());
// Set the 'throw at end' flag, to the one we were given
reader->setThrowAtEnd(throwEndOfExt);
//
// Push the reader. If its a recursive expansion, then emit an error
// and return an failure.
//
if (!fReaderMgr->pushReader(reader, decl))
{
fScanner->emitError(XMLErrs::RecursiveEntity, decl->getName());
return false;
}
//
// If the caller wants us to scan the external entity, then lets
// do that now.
//
if (scanExternal)
{
XMLEntityHandler* entHandler = fScanner->getEntityHandler();
// If we have an entity handler, tell it we are starting this entity
if (entHandler)
entHandler->startInputSource(*srcUsed);
//
// Scan the external entity now. The parameter tells it that
// it is not in an include section. Get the current reader
// level so we can catch partial markup errors and be sure
// to get back to here if we get an exception out of the
// ext subset scan.
//
const unsigned int readerNum = fReaderMgr->getCurrentReaderNum();
try
{
scanExtSubsetDecl(false, false);
}
catch(...)
{
// Pop the reader back to the original level
fReaderMgr->cleanStackBackTo(readerNum);
// End the input source, even though its not happy
if (entHandler)
entHandler->endInputSource(*srcUsed);
throw;
}
// If we have an entity handler, tell it we are ending this entity
if (entHandler)
entHandler->endInputSource(*srcUsed);
}
else {
// If it starts with the XML string, then parse a text decl
if (fScanner->checkXMLDecl(true))
scanTextDecl();
}
}
else
{
// Create a reader over a memory stream over the entity value
XMLReader* valueReader = fReaderMgr->createIntEntReader
(
decl->getName()
, inLiteral ? XMLReader::RefFrom_Literal : XMLReader::RefFrom_NonLiteral
, XMLReader::Type_PE
, decl->getValue()
, decl->getValueLen()
, false
);
//
// Trt to push the entity reader onto the reader manager stack,
// where it will become the subsequent input. If it fails, that
// means the entity is recursive, so issue an error. The reader
// will have just been discarded, but we just keep going.
//
if (!fReaderMgr->pushReader(valueReader, decl))
fScanner->emitError(XMLErrs::RecursiveEntity, decl->getName());
}
return true;
}
//
//  Reads a quoted string from the current position into toFill, without
//  the surrounding quotes. The buffer is emptied first in every case.
//
//  Returns false if the next character is not a quote, or if a null
//  character is read before the matching closing quote (toFill then holds
//  whatever was collected so far); true once the closing quote is consumed.
//
bool DTDScanner::getQuotedString(XMLBuffer& toFill)
{
    // Start from an empty buffer
    toFill.reset();

    // The string has to open with a single or double quote
    XMLCh openQuote;
    if (!fReaderMgr->skipIfQuote(openQuote))
        return false;

    // Collect characters until the same quote character shows up again
    XMLCh curCh = fReaderMgr->getNextChar();
    while (curCh != openQuote)
    {
        //
        //  A null char means the input ran out. Just report failure; the
        //  calling code deals with it more gracefully.
        //
        if (!curCh)
            return false;

        toFill.append(curCh);
        curCh = fReaderMgr->getNextChar();
    }
    return true;
}
//
//  Scans a single attribute definition of an attribute list: the attribute
//  name, its type (plus the enumeration for NOTATION / enumerated types) and
//  its default declaration.
//
//  parentElem  The element declaration the attribute belongs to. A newly
//              seen attribute is added to it.
//  bufToUse    Scratch buffer supplied by the caller; it is used for the
//              name and is passed on to scanEnumeration().
//
//  Returns the definition that was parsed into, or 0 if the name or the
//  type could not be scanned. If the attribute was already declared for the
//  element, the result is the shared dummy definition (fDumAttDef) and the
//  doc type handler is told that the definition is ignored.
//
XMLAttDef*
DTDScanner::scanAttDef(DTDElementDecl& parentElem, XMLBuffer& bufToUse)
{
// Check for PE ref or optional whitespace
checkForPERef(false, false, true);
// Get the name of the attribute
if (!fReaderMgr->getName(bufToUse))
{
fScanner->emitError(XMLErrs::ExpectedAttrName);
return 0;
}
//
// Look up this attribute in the parent element's attribute list. If
// it already exists, then use the dummy.
//
DTDAttDef* decl = parentElem.getAttDef(bufToUse.getRawBuffer());
if (decl)
{
// It already exists, so report it through the scanner's emitError
fScanner->emitError
(
XMLErrs::AttListAlreadyExists
, bufToUse.getRawBuffer()
, parentElem.getFullName()
);
// Use the dummy decl to parse into and set its name to the name we got.
// The dummy is created on first use and then reused for every duplicate.
if (!fDumAttDef)
{
fDumAttDef = new (fMemoryManager) DTDAttDef(fMemoryManager);
fDumAttDef->setId(fNextAttrId++);
}
fDumAttDef->setName(bufToUse.getRawBuffer());
decl = fDumAttDef;
}
else
{
//
// It does not already exist so create a new one, give it the next
// available unique id, and add it
//
decl = new (fMemoryManager) DTDAttDef
(
bufToUse.getRawBuffer()
, XMLAttDef::CData
, XMLAttDef::Implied
, fMemoryManager
);
decl->setId(fNextAttrId++);
decl->setExternalAttDeclaration(isReadingExternalEntity());
parentElem.addAttDef(decl);
}
// Set a flag to indicate whether we are doing a dummy parse
const bool isIgnored = (decl == fDumAttDef);
// Space is required here, so check for PE ref, and require space
if (!checkForPERef(true, false, true))
fScanner->emitError(XMLErrs::ExpectedWhitespace);
//
// Next has to be one of the attribute type strings. This tells us what
// is to follow. Each skippedString/skippedChar call consumes the text it
// matches, so the order of the tests below matters.
//
if (fReaderMgr->skippedString(XMLUni::fgCDATAString))
{
decl->setType(XMLAttDef::CData);
}
else if (fReaderMgr->skippedString(XMLUni::fgIDString))
{
// The ID keyword alone, or followed by the ref string, or by the ref
// string plus a further 'S', selects ID, IDRef or IDRefs respectively
if (!fReaderMgr->skippedString(XMLUni::fgRefString))
decl->setType(XMLAttDef::ID);
else if (!fReaderMgr->skippedChar(chLatin_S))
decl->setType(XMLAttDef::IDRef);
else
decl->setType(XMLAttDef::IDRefs);
}
else if (fReaderMgr->skippedString(XMLUni::fgEntitString))
{
// Common prefix matched; the tail decides between Entity and Entities
if (fReaderMgr->skippedChar(chLatin_Y))
{
decl->setType(XMLAttDef::Entity);
}
else if (fReaderMgr->skippedString(XMLUni::fgIESString))
{
decl->setType(XMLAttDef::Entities);
}
else
{
fScanner->emitError
(
XMLErrs::ExpectedAttributeType
, decl->getFullName()
, parentElem.getFullName()
);
return 0;
}
}
else if (fReaderMgr->skippedString(XMLUni::fgNmTokenString))
{
// A trailing 'S' makes it the plural token type
if (fReaderMgr->skippedChar(chLatin_S))
decl->setType(XMLAttDef::NmTokens);
else
decl->setType(XMLAttDef::NmToken);
}
else if (fReaderMgr->skippedString(XMLUni::fgNotationString))
{
// Check for PE ref and require space
if (!checkForPERef(true, false, true))
fScanner->emitError(XMLErrs::ExpectedWhitespace);
decl->setType(XMLAttDef::Notation);
if (!scanEnumeration(*decl, bufToUse, true))
return 0;
// Set the value as the enumeration for this decl
decl->setEnumeration(bufToUse.getRawBuffer());
}
else if (fReaderMgr->skippedChar(chOpenParen))
{
// An opening parenthesis with no keyword starts a plain enumeration
decl->setType(XMLAttDef::Enumeration);
if (!scanEnumeration(*decl, bufToUse, false))
return 0;
// Set the value as the enumeration for this decl
decl->setEnumeration(bufToUse.getRawBuffer());
}
else
{
fScanner->emitError
(
XMLErrs::ExpectedAttributeType
, decl->getFullName()
, parentElem.getFullName()
);
return 0;
}
// Space is required here, so check for PE ref, and require space
if (!checkForPERef(true, false, true))
fScanner->emitError(XMLErrs::ExpectedWhitespace);
// And then scan for the optional default value declaration
scanDefaultDecl(*decl);
// If validating, then do a couple of validation constraints
if (fScanner->getDoValidation())
{
// An ID attribute may only have an Implied or Required default type
if (decl->getType() == XMLAttDef::ID)
{
if ((decl->getDefaultType() != XMLAttDef::Implied)
&& (decl->getDefaultType() != XMLAttDef::Required))
{
fScanner->getValidator()->emitError(XMLValid::BadIDAttrDefType, decl->getFullName());
}
}
// if attdef is xml:space, check correct enumeration (default|preserve)
const XMLCh fgXMLSpace[] = { chLatin_x, chLatin_m, chLatin_l, chColon, chLatin_s, chLatin_p, chLatin_a, chLatin_c, chLatin_e, chNull };
if (XMLString::equals(decl->getFullName(),fgXMLSpace)) {
const XMLCh fgPreserve[] = { chLatin_p, chLatin_r, chLatin_e, chLatin_s, chLatin_e, chLatin_r, chLatin_v, chLatin_e, chNull };
const XMLCh fgDefault[] = { chLatin_d, chLatin_e, chLatin_f, chLatin_a, chLatin_u, chLatin_l, chLatin_t, chNull };
// Stays false for any non-enumerated type, which is then reported below
bool ok = false;
if (decl->getType() == XMLAttDef::Enumeration) {
BaseRefVectorOf<XMLCh>* enumVector = XMLString::tokenizeString(decl->getEnumeration());
int size = enumVector->size();
// Accepted: exactly one of the two values, or both of them in either order
ok = (size == 1 &&
(XMLString::equals(enumVector->elementAt(0), fgDefault) ||
XMLString::equals(enumVector->elementAt(0), fgPreserve))) ||
(size == 2 &&
(XMLString::equals(enumVector->elementAt(0), fgDefault) &&
XMLString::equals(enumVector->elementAt(1), fgPreserve))) ||
(size == 2 &&
(XMLString::equals(enumVector->elementAt(1), fgDefault) &&
XMLString::equals(enumVector->elementAt(0), fgPreserve)));
// The token vector was only needed for the check above
delete enumVector;
}
if (!ok)
fScanner->getValidator()->emitError(XMLValid::IllegalXMLSpace);
}
}
// If we have a doc type handler, tell it about this attdef.
if (fDocTypeHandler)
fDocTypeHandler->attDef(parentElem, *decl, isIgnored);
return decl;
}
void DTDScanner::scanAttListDecl()
{
// Space is required here, so check for a PE ref
if (!checkForPERef(true, false, true))
{
fScanner->emitError(XMLErrs::ExpectedWhitespace);
fReaderMgr->skipPastChar(chCloseAngle);
return;
}
//
// Next should be the name of the element it belongs to, so get a buffer
// and get the name into it.
//
XMLBufBid bbName(fBufMgr);
if (!fReaderMgr->getName(bbName.getBuffer()))
{
fScanner->emitError(XMLErrs::ExpectedElementName);
fReaderMgr->skipPastChar(chCloseAngle);
return;
}
//
// Find this element's declaration. If it has not been declared yet,
// we will force one into the list, but not mark it as declared.
//
DTDElementDecl* elemDecl = (DTDElementDecl*) fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bbName.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
if (!elemDecl)
{
//
// Lets fault in a declaration and add it to the pool. We mark
// it having been created because of an attlist. Later, if its
// declared, this will be updated.
//
elemDecl = new (fMemoryManager) DTDElementDecl
(
bbName.getRawBuffer()
, fEmptyNamespaceId
, DTDElementDecl::Any
, fMemoryManager
);
elemDecl->setCreateReason(XMLElementDecl::AttList);
elemDecl->setExternalElemDeclaration(isReadingExternalEntity());
fDTDGrammar->putElemDecl((XMLElementDecl*) elemDecl);
}
// If we have a doc type handler, tell it the att list is starting
if (fDocTypeHandler)
fDocTypeHandler->startAttList(*elemDecl);
//
// Now we loop until we are done with all of the attributes in this
// list. We need a buffer to use for local processing.
//
XMLBufBid bbTmp(fBufMgr);
XMLBuffer& tmpBuf = bbTmp.getBuffer();
bool seenAnId = false;
while (true)
{
// Get the next char out and see what it tells us to do
const XMLCh nextCh = fReaderMgr->peekNextChar();
// Watch for EOF
if (!nextCh)
ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
if (nextCh == chCloseAngle)
{
// We are done with this attribute list
fReaderMgr->getNextChar();
break;
}
else if (fReaderMgr->getCurrentReader()->isWhitespace(nextCh))
{
//
// If advanced callbacks are enabled and we have a doc
// type handler, then gather up the white space and call
// back on the doctype handler. Otherwise, just skip
// whitespace.
//
if (fDocTypeHandler)
{
fReaderMgr->getSpaces(tmpBuf);
fDocTypeHandler->doctypeWhitespace
(
tmpBuf.getRawBuffer()
, tmpBuf.getLen()
);
}
else
{
fReaderMgr->skipPastSpaces();
}
}
else if (nextCh == chPercent)
{
// Eat the percent and expand the ref
fReaderMgr->getNextChar();
expandPERef(false, false, true);
}
else
{
//
// It must be an attribute name, so scan it. We let
// it use our local buffer for its name scanning.
//
XMLAttDef* attDef = scanAttDef(*elemDecl, tmpBuf);
if (!attDef)
{
fReaderMgr->skipPastChar(chCloseAngle);
break;
}
//
// If we are validating and its an ID type, then we have to
// make sure that we have not seen an id attribute yet. Set
// the flag to say that we've seen one now also.
//
if (fScanner->getDoValidation())
{
if (attDef->getType() == XMLAttDef::ID)
{
if (seenAnId)
fScanner->getValidator()->emitError(XMLValid::MultipleIdAttrs, elemDecl->getFullName());
seenAnId = true;
}
}
}
}
// If we have a doc type handler, tell it the att list is ending
if (fDocTypeHandler)
fDocTypeHandler->endAttList(*elemDecl);
}
//
// This method is called to scan the value of an attribute in content. This
// involves some normalization and replacement of general entity and
// character references.
//
// Ends of entities must be dealt with here. During a DTD scan, they can come
// from external entities. During content, they can come from any entity.
// We just eat the end of entity and continue with our scan until we come
// to the closing quote. If an unterminated value causes us to go through
// subsequent entities, that will cause errors back in the calling code,
// but there's little we can do about it here.
//
//
//  Scans a quoted attribute value into toFill, expanding entity and
//  character references and normalizing whitespace on the way.
//
//  attrName    Name of the attribute, used only in error messages.
//  toFill      Receives the normalized value; it is reset first.
//  type        Attribute type. CData values get tab / LF / CR replaced by a
//              space (unless produced by a char ref); every other type gets
//              runs of whitespace collapsed to one space and leading and
//              trailing whitespace dropped.
//
//  Returns true when the closing quote is found in the reader the value
//  started in. Returns false if there is no opening quote, or if the
//  closing quote only shows up after falling back into an earlier reader
//  (partial markup). Throws UnexpectedEOFException if the input runs out.
//
bool DTDScanner::scanAttValue(const XMLCh* const attrName
                            , XMLBuffer& toFill
                            , const XMLAttDef::AttTypes type)
{
    enum States
    {
        InWhitespace
        , InContent
    };

    // Reset the target buffer
    toFill.reset();

    // Get the next char which must be a single or double quote
    XMLCh quoteCh;
    if (!fReaderMgr->skipIfQuote(quoteCh))
        return false;

    //
    //  We have to get the current reader because we have to ignore closing
    //  quotes until we hit the same reader again.
    //
    const unsigned int curReader = fReaderMgr->getCurrentReaderNum();

    //
    //  Loop until we get the attribute value. Note that we use a double
    //  loop here to avoid the setup/teardown overhead of the exception
    //  handler on every round.
    //
    XMLCh nextCh;
    XMLCh secondCh = 0;
    States curState = InContent;
    bool firstNonWS = false;
    bool gotLeadingSurrogate = false;
    bool escaped;
    while (true)
    {
        try
        {
            while(true)
            {
                nextCh = fReaderMgr->getNextChar();
                if (!nextCh)
                    ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);

                // Check for our ending quote in the same entity
                if (nextCh == quoteCh)
                {
                    if (curReader == fReaderMgr->getCurrentReaderNum())
                        return true;

                    // Watch for spillover into a previous entity
                    if (curReader > fReaderMgr->getCurrentReaderNum())
                    {
                        fScanner->emitError(XMLErrs::PartialMarkupInEntity);
                        return false;
                    }
                }

                //
                //  Per-character state starts out clean. In particular the
                //  trailing surrogate has to be cleared here: only a char
                //  ref scanned below may set it, and without the reset a
                //  value left over from an earlier char ref would be
                //  appended again after every following character.
                //
                escaped = false;
                secondCh = 0;

                //
                //  Check for an entity ref now, before we let it affect our
                //  whitespace normalization logic below. We ignore the empty
                //  flag in this one.
                //
                if (nextCh == chAmpersand)
                {
                    if (scanEntityRef(nextCh, secondCh, escaped) != EntityExp_Returned)
                    {
                        gotLeadingSurrogate = false;
                        continue;
                    }
                }
                else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
                {
                    // Check for correct surrogate pairs
                    if (gotLeadingSurrogate)
                        fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
                    else
                        gotLeadingSurrogate = true;
                }
                else
                {
                    if (gotLeadingSurrogate)
                    {
                        if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
                            fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
                    }
                    // It's got to at least be a valid XML character
                    else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh))
                    {
                        XMLCh tmpBuf[9];
                        XMLString::binToText
                        (
                            nextCh
                            , tmpBuf
                            , 8
                            , 16
                        );
                        fScanner->emitError
                        (
                            XMLErrs::InvalidCharacterInAttrValue
                            , attrName
                            , tmpBuf
                        );
                    }
                    gotLeadingSurrogate = false;
                }

                //
                //  If it's not escaped, then make sure it's not a < character,
                //  which is not allowed in attribute values.
                //
                if (!escaped && (nextCh == chOpenAngle))
                    fScanner->emitError(XMLErrs::BracketInAttrValue, attrName);

                //
                //  If the attribute is a CDATA type we do simple replacement
                //  of tabs and new lines with spaces, if the character is not
                //  escaped by way of a char ref.
                //
                //  Otherwise, we do the standard non-CDATA normalization of
                //  compressing whitespace to single spaces and getting rid of
                //  leading and trailing whitespace.
                //
                if (type == XMLAttDef::CData)
                {
                    if (!escaped)
                    {
                        if ((nextCh == 0x09) || (nextCh == 0x0A) || (nextCh == 0x0D))
                            nextCh = chSpace;
                    }
                }
                else
                {
                    if (curState == InWhitespace)
                    {
                        if (!fReaderMgr->getCurrentReader()->isWhitespace(nextCh))
                        {
                            // One space stands in for the whole run, but only
                            // between content, never in front of it
                            if (firstNonWS)
                                toFill.append(chSpace);
                            curState = InContent;
                            firstNonWS = true;
                        }
                        else
                        {
                            continue;
                        }
                    }
                    else if (curState == InContent)
                    {
                        if (fReaderMgr->getCurrentReader()->isWhitespace(nextCh))
                        {
                            curState = InWhitespace;
                            continue;
                        }
                        firstNonWS = true;
                    }
                }

                // Else add it to the buffer, along with the trailing
                // surrogate if the char ref expanded to a pair
                toFill.append(nextCh);
                if (secondCh)
                    toFill.append(secondCh);
            }
        }
        catch(const EndOfEntityException&)
        {
            // Just eat it and continue.
            gotLeadingSurrogate = false;
            escaped = false;
        }
    }
    return true;
}
//
//  Scans the numeric part of a character reference, starting at the
//  optional radix marker ('x') and running through the terminating
//  semicolon, and converts it to one or two UTF-16 code units.
//  (Presumably the caller has already consumed the "&#" introducer; that is
//  not visible in this function.)
//
//  first   Receives the character, or the leading surrogate for a value
//          above 0xFFFF.
//  second  Receives the trailing surrogate, or 0 if there is none.
//
//  Returns true if a legal character was produced. Returns false, after
//  emitting an error, if a non-digit is met before the semicolon or if the
//  value is not a legal character; first and second are then not
//  necessarily set. Throws UnexpectedEOFException if the input runs out.
//
bool DTDScanner::scanCharRef(XMLCh& first, XMLCh& second)
{
    bool gotOne = false;
    unsigned int value = 0;

    //
    //  Set the radix. It's supposed to be a lower case x if hex. But, in
    //  order to recover well, we check for an upper and put out an error
    //  for that.
    //
    unsigned int radix = 10;
    if (fReaderMgr->skippedChar(chLatin_x))
    {
        radix = 16;
    }
    else if (fReaderMgr->skippedChar(chLatin_X))
    {
        fScanner->emitError(XMLErrs::HexRadixMustBeLowerCase);
        radix = 16;
    }

    while (true)
    {
        const XMLCh nextCh = fReaderMgr->peekNextChar();

        // Watch for EOF
        if (!nextCh)
            ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);

        // Break out on the terminating semicolon
        if (nextCh == chSemiColon)
        {
            fReaderMgr->getNextChar();
            break;
        }

        //
        //  Convert this char to a binary value, or bail out if it's not
        //  one.
        //
        unsigned int nextVal;
        if ((nextCh >= chDigit_0) && (nextCh <= chDigit_9))
            nextVal = (unsigned int)(nextCh - chDigit_0);
        else if ((nextCh >= chLatin_A) && (nextCh <= chLatin_F))
            nextVal= (unsigned int)(10 + (nextCh - chLatin_A));
        else if ((nextCh >= chLatin_a) && (nextCh <= chLatin_f))
            nextVal = (unsigned int)(10 + (nextCh - chLatin_a));
        else
        {
            //
            //  If we got at least a digit, then do an unterminated ref
            //  error. Else, do an expected a numerical ref thing.
            //
            if (gotOne)
                fScanner->emitError(XMLErrs::UnterminatedCharRef);
            else
                fScanner->emitError(XMLErrs::ExpectedNumericalCharRef);
            return false;
        }

        //
        //  Make sure it's valid for the radix. If not, then just eat the
        //  digit and go on after issuing an error. Else, update the
        //  running value with this new digit.
        //
        if (nextVal >= radix)
        {
            XMLCh tmpStr[2];
            tmpStr[0] = nextCh;
            tmpStr[1] = chNull;
            fScanner->emitError(XMLErrs::BadDigitForRadix, tmpStr);
        }
        else
        {
            value = (value * radix) + nextVal;

            //
            //  Pin the value just above the last legal code point once it
            //  gets out of range. Without this an over-long digit string
            //  wraps the unsigned accumulator around and can come out as
            //  some unrelated, legal looking character. Pinned, it can no
            //  longer overflow (0x110000 * 16 + 15 fits easily) and it is
            //  rejected by the range check after the loop, once the rest
            //  of the reference has been consumed.
            //
            if (value > 0x10FFFF)
                value = 0x110000;
        }

        // Indicate that we got at least one good digit
        gotOne = true;

        // Eat the char we just processed
        fReaderMgr->getNextChar();
    }

    // Return the char (or chars)
    // And check if the character expanded is valid or not
    if (value >= 0x10000 && value <= 0x10FFFF)
    {
        // Outside the BMP: split into a surrogate pair
        value -= 0x10000;
        first = XMLCh((value >> 10) + 0xD800);
        second = XMLCh((value & 0x3FF) + 0xDC00);
    }
    else if (value <= 0xFFFD)
    {
        first = XMLCh(value);
        second = 0;
        if (!fReaderMgr->getCurrentReader()->isXMLChar(first) && !fReaderMgr->getCurrentReader()->isControlChar(first)) {
            // Character reference was not in the valid range
            fScanner->emitError(XMLErrs::InvalidCharacterRef);
            return false;
        }
    }
    else {
        // Character reference was not in the valid range
        fScanner->emitError(XMLErrs::InvalidCharacterRef);
        return false;
    }
    return true;
}
//
//  Recursively scans one parenthesized section of a "children" content
//  model. It is entered just after the opening parenthesis of the section
//  and returns after the matching closing parenthesis and any repetition
//  character ('?', '*', '+' as recognised by makeRepNode) that follows it.
//
//  elemDecl  The element whose content model is being scanned. It is only
//            used here for error text and is handed on to recursive calls.
//  bufToUse  Scratch buffer that element names are scanned into.
//
//  Returns the top node of the tree built for this section, or 0 after an
//  error has been emitted. On failure any partial tree built by this call
//  is deleted before returning.
//
ContentSpecNode*
DTDScanner::scanChildren(const DTDElementDecl& elemDecl, XMLBuffer& bufToUse)
{
    // Check for a PE ref here, but don't require spaces
    checkForPERef(false, false, true);

    // We have to check entity nesting here
    unsigned int curReader;

    //
    //  We know that the caller just saw an opening parenthesis, so we need
    //  to parse until we hit the end of it, recursing for other nested
    //  parentheses we see.
    //
    //  We have to check for one up front, since it could be something like
    //  (((a)*)) etc...
    //
    ContentSpecNode* curNode = 0;
    if (fReaderMgr->skippedChar(chOpenParen))
    {
        curReader = fReaderMgr->getCurrentReaderNum();

        // Lets call ourself and get back the resulting node
        curNode = scanChildren(elemDecl, bufToUse);

        // If that failed, no need to go further, return failure
        if (!curNode)
            return 0;

        // The nested section must end in the entity it started in
        if (curReader != fReaderMgr->getCurrentReaderNum() && fScanner->getDoValidation())
            fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
    }
    else
    {
        // Not a nested paren, so it must be a leaf node
        if (!fReaderMgr->getName(bufToUse))
        {
            fScanner->emitError(XMLErrs::ExpectedElementName);
            return 0;
        }

        //
        //  Create a leaf node for it. If we can find the element id for
        //  this element, then use it. Else, we have to fault in an element
        //  decl, marked as created because of being in a content model.
        //
        XMLElementDecl* decl = fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bufToUse.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
        if (!decl)
        {
            decl = new (fMemoryManager) DTDElementDecl
            (
                bufToUse.getRawBuffer()
                , fEmptyNamespaceId
                , DTDElementDecl::Any
                , fMemoryManager
            );
            decl->setCreateReason(XMLElementDecl::InContentModel);
            decl->setExternalElemDeclaration(isReadingExternalEntity());
            fDTDGrammar->putElemDecl(decl);
        }
        curNode = new (fMemoryManager) ContentSpecNode
        (
            decl->getElementName()
            , fMemoryManager
        );

        // Check for a PE ref here, but don't require spaces
        const bool gotSpaces = checkForPERef(false, false, true);

        // Check for a repetition character after the leaf
        const XMLCh repCh = fReaderMgr->peekNextChar();
        ContentSpecNode* tmpNode = makeRepNode(repCh, curNode, fMemoryManager);
        if (tmpNode != curNode)
        {
            // A repetition char must follow the name directly
            if (gotSpaces)
                fScanner->emitError(XMLErrs::UnexpectedWhitespace);
            fReaderMgr->getNextChar();
            curNode = tmpNode;
        }
    }

    // Check for a PE ref here, but don't require spaces
    checkForPERef(false, false, true);

    //
    //  Ok, the next character tells us what kind of content model this
    //  particular parenthesized section is. It is either a sequence if we
    //  see ',', a choice if we see '|', or a single leaf node if we see
    //  a closing paren.
    //
    const XMLCh opCh = fReaderMgr->peekNextChar();
    if ((opCh != chComma)
    &&  (opCh != chPipe)
    &&  (opCh != chCloseParen))
    {
        // Not a legal char, so delete our node and return failure
        delete curNode;
        fScanner->emitError(XMLErrs::ExpectedSeqChoiceLeaf);
        return 0;
    }

    //
    //  Create the head node of the correct type. We need this to remember
    //  the top of the local tree. If it was a single subexpr, then just
    //  set the head node to the current node. For the others, we'll build
    //  the tree off the second child as we move across.
    //
    ContentSpecNode* headNode = 0;
    ContentSpecNode::NodeTypes curType = ContentSpecNode::UnknownType;
    if (opCh == chComma)
    {
        curType = ContentSpecNode::Sequence;
        headNode = new (fMemoryManager) ContentSpecNode
        (
            curType
            , curNode
            , 0
            , true
            , true
            , fMemoryManager
        );
        curNode = headNode;
    }
    else if (opCh == chPipe)
    {
        curType = ContentSpecNode::Choice;
        headNode = new (fMemoryManager) ContentSpecNode
        (
            curType
            , curNode
            , 0
            , true
            , true
            , fMemoryManager
        );
        curNode = headNode;
    }
    else
    {
        // Single item section: eat the closing paren we peeked at
        headNode = curNode;
        fReaderMgr->getNextChar();
    }

    //
    //  If it was a sequence or choice, we just loop until we get to the
    //  end of our section, adding each new leaf or sub expression to the
    //  right child of the current node, and making that new node the current
    //  node.
    //
    if ((opCh == chComma) || (opCh == chPipe))
    {
        ContentSpecNode* lastNode = 0;
        while (true)
        {
            //
            //  The next thing must either be another | or , character followed
            //  by another leaf or subexpression, or a closing parenthesis, or a
            //  PE ref.
            //
            if (fReaderMgr->lookingAtChar(chPercent))
            {
                checkForPERef(false, false, true);
            }
            else if (fReaderMgr->skippedSpace())
            {
                // Just skip whitespace
                fReaderMgr->skipPastSpaces();
            }
            else if (fReaderMgr->skippedChar(chCloseParen))
            {
                //
                //  We've hit the end of this section, so break out. But, we
                //  need to see if we left a partial sequence or choice node
                //  without a second node. If so, we have to undo that and
                //  put its left child into the right node of the previous
                //  node.
                //
                if ((curNode->getType() == ContentSpecNode::Choice)
                ||  (curNode->getType() == ContentSpecNode::Sequence))
                {
                    if (!curNode->getSecond())
                    {
                        // NOTE(review): presumably setSecond() disposes of the
                        // node it replaces (the now empty curNode) - confirm.
                        ContentSpecNode* saveFirst = curNode->orphanFirst();
                        lastNode->setSecond(saveFirst);
                        curNode = lastNode;
                    }
                }
                break;
            }
            else if (fReaderMgr->skippedChar(opCh))
            {
                // Check for a PE ref here, but don't require spaces
                checkForPERef(false, false, true);

                if (fReaderMgr->skippedChar(chOpenParen))
                {
                    curReader = fReaderMgr->getCurrentReaderNum();

                    // Recurse to handle this new guy
                    ContentSpecNode* subNode = scanChildren(elemDecl, bufToUse);

                    // If it failed, we are done, clean up here and return failure
                    if (!subNode)
                    {
                        delete headNode;
                        return 0;
                    }

                    if (curReader != fReaderMgr->getCurrentReaderNum() && fScanner->getDoValidation())
                        fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);

                    // Else patch it in and make it the new current
                    ContentSpecNode* newCur = new (fMemoryManager) ContentSpecNode
                    (
                        curType
                        , subNode
                        , 0
                        , true
                        , true
                        , fMemoryManager
                    );
                    curNode->setSecond(newCur);
                    lastNode = curNode;
                    curNode = newCur;
                }
                else
                {
                    //
                    //  Got to be a leaf node, so get a name. If we cannot get
                    //  one, then clean up and get outa here.
                    //
                    if (!fReaderMgr->getName(bufToUse))
                    {
                        delete headNode;
                        fScanner->emitError(XMLErrs::ExpectedElementName);
                        return 0;
                    }

                    //
                    //  Create a leaf node for it. If we can find the element
                    //  id for this element, then use it. Else, we have to
                    //  fault in an element decl, marked as created because
                    //  of being in a content model.
                    //
                    XMLElementDecl* decl = fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bufToUse.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
                    if (!decl)
                    {
                        decl = new (fMemoryManager) DTDElementDecl
                        (
                            bufToUse.getRawBuffer()
                            , fEmptyNamespaceId
                            , DTDElementDecl::Any
                            , fMemoryManager
                        );
                        decl->setCreateReason(XMLElementDecl::InContentModel);
                        decl->setExternalElemDeclaration(isReadingExternalEntity());
                        fDTDGrammar->putElemDecl(decl);
                    }

                    ContentSpecNode* tmpLeaf = new (fMemoryManager) ContentSpecNode
                    (
                        decl->getElementName()
                        , fMemoryManager
                    );

                    // Check for a repetition character after the leaf
                    const XMLCh repCh = fReaderMgr->peekNextChar();
                    ContentSpecNode* tmpLeaf2 = makeRepNode(repCh, tmpLeaf, fMemoryManager);
                    if (tmpLeaf != tmpLeaf2)
                        fReaderMgr->getNextChar();

                    //
                    //  Create a new sequence or choice node, with the leaf
                    //  (or rep surrounding it) we just got as its first node.
                    //  Make the new node the second node of the current node,
                    //  and then make it the current node.
                    //
                    ContentSpecNode* newCur = new (fMemoryManager) ContentSpecNode
                    (
                        curType
                        , tmpLeaf2
                        , 0
                        , true
                        , true
                        , fMemoryManager
                    );
                    curNode->setSecond(newCur);
                    lastNode = curNode;
                    curNode = newCur;
                }
            }
            else
            {
                //
                //  Cannot be valid. Release the partial tree before the
                //  error is reported, as the other failure paths above do,
                //  so that it is not leaked should emitError not return
                //  normally.
                //
                delete headNode;
                if (opCh == chComma)
                {
                    fScanner->emitError(XMLErrs::ExpectedChoiceOrCloseParen);
                }
                else
                {
                    fScanner->emitError
                    (
                        XMLErrs::ExpectedSeqOrCloseParen
                        , elemDecl.getFullName()
                    );
                }
                return 0;
            }
        }
    }

    //
    //  We saw the terminating parenthesis so lets check for any repetition
    //  character, and create a node for that, making the head node the child
    //  of it.
    //
    XMLCh repCh = fReaderMgr->peekNextChar();
    ContentSpecNode* retNode = makeRepNode(repCh, headNode, fMemoryManager);
    if (retNode != headNode)
        fReaderMgr->getNextChar();

    return retNode;
}
//
// We get here after the '<!--' part of the comment. We scan past the
// terminating '-->'. It calls the appropriate handler with the comment
// text, if one is provided. A comment can be in either the document or
// the DTD, so the fInDocument flag is used to know which handler to send
// it to.
//
void DTDScanner::scanComment()
{
enum States
{
InText
, OneDash
, TwoDashes
};
// Get a buffer for this
XMLBufBid bbComment(fBufMgr);
//
// Get the comment text into a temp buffer. Be sure to use temp buffer
// two here, since its to be used for stuff that is potentially longer
// than just a name.
//
bool gotLeadingSurrogate = false;
States curState = InText;
while (true)
{
// Get the next character
const XMLCh nextCh = fReaderMgr->getNextChar();
// Watch for an end of file
if (!nextCh)
{
fScanner->emitError(XMLErrs::UnterminatedComment);
ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
}
// Check for correct surrogate pairs
if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
{
if (gotLeadingSurrogate)
fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
else
gotLeadingSurrogate = true;
}
else
{
if (gotLeadingSurrogate)
{
if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
}
// Its got to at least be a valid XML character
else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh)) {
XMLCh tmpBuf[9];
XMLString::binToText
(
nextCh
, tmpBuf
, 8
, 16
);
fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
}
gotLeadingSurrogate = false;
}
if (curState == InText)
{
// If its a dash, go to OneDash state. Otherwise take as text
if (nextCh == chDash)
curState = OneDash;
else
bbComment.append(nextCh);
}
else if (curState == OneDash)
{
//
// If its another dash, then we change to the two dashes states.
// Otherwise, we have to put in the deficit dash and the new
// character and go back to InText.
//
if (nextCh == chDash)
{
curState = TwoDashes;
}
else
{
bbComment.append(chDash);
bbComment.append(nextCh);
curState = InText;
}
}
else if (curState == TwoDashes)
{
// The next character must be the closing bracket
if (nextCh != chCloseAngle)
{
fScanner->emitError(XMLErrs::IllegalSequenceInComment);
fReaderMgr->skipPastChar(chCloseAngle);
return;
}
break;
}
}
// If there is a doc type handler, then pass on the comment stuff
if (fDocTypeHandler)
fDocTypeHandler->doctypeComment(bbComment.getRawBuffer());
}
//
//  Scans the content specification of an element declaration and records
//  the result in toFill: the EMPTY or ANY keywords, a mixed model that
//  starts with the PCDATA keyword, or a parenthesized children model.
//
//  Returns true when a content spec was scanned, false after an error has
//  been emitted (or when the mixed/children scan reports failure).
//
bool DTDScanner::scanContentSpec(DTDElementDecl& toFill)
{
    // The two keyword forms need no further scanning
    if (fReaderMgr->skippedString(XMLUni::fgEmptyString))
    {
        toFill.setModelType(DTDElementDecl::Empty);
        return true;
    }

    if (fReaderMgr->skippedString(XMLUni::fgAnyString))
    {
        toFill.setModelType(DTDElementDecl::Any);
        return true;
    }

    // Anything else has to open with a parenthesis
    if (!fReaderMgr->skippedChar(chOpenParen))
    {
        fScanner->emitError
        (
            XMLErrs::ExpectedContentSpecExpr
            , toFill.getFullName()
        );
        return false;
    }

    // Remember which reader we started on, to detect partial markup
    const unsigned int startReader = fReaderMgr->getCurrentReaderNum();

    // A PE ref is allowed here, whitespace is not required
    checkForPERef(false, false, true);

    bool ok;
    if (!fReaderMgr->skippedString(XMLUni::fgPCDATAString))
    {
        //
        //  Not mixed, so it is a children model. Scan it recursively,
        //  lending it a temp buffer for names, and adopt the resulting
        //  content spec tree if we got one.
        //
        toFill.setModelType(DTDElementDecl::Children);
        XMLBufBid bbTmp(fBufMgr);
        ContentSpecNode* topNode = scanChildren(toFill, bbTmp.getBuffer());
        ok = (topNode != 0);
        if (ok)
            toFill.setContentSpec(topNode);
    }
    else
    {
        // Saw the PCDATA keyword, so this is a mixed model
        toFill.setModelType(DTDElementDecl::Mixed_Simple);
        ok = scanMixed(toFill);

        //
        //  When validating, a child element must not be listed more than
        //  once in a mixed model.
        //
        if (fScanner->getDoValidation())
        {
            if (((const MixedContentModel*)toFill.getContentModel())->hasDups())
                fScanner->getValidator()->emitError(XMLValid::RepElemInMixed);
        }
    }

    // The spec has to end on the reader it began on
    if (startReader != fReaderMgr->getCurrentReaderNum() && fScanner->getDoValidation())
        fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);

    return ok;
}
//
//  Scans the default declaration part of an attribute definition and
//  stores the default type and, for fixed or plain defaults, the default
//  value in toFill.
//
void DTDScanner::scanDefaultDecl(DTDAttDef& toFill)
{
    // The required and implied keywords carry no value, so we are done
    if (fReaderMgr->skippedString(XMLUni::fgRequiredString))
    {
        toFill.setDefaultType(XMLAttDef::Required);
        return;
    }

    if (fReaderMgr->skippedString(XMLUni::fgImpliedString))
    {
        toFill.setDefaultType(XMLAttDef::Implied);
        return;
    }

    const bool isFixed = fReaderMgr->skippedString(XMLUni::fgFixedString);
    if (isFixed)
    {
        //
        //  Whitespace has to separate the keyword from the value. If it
        //  is missing, complain but carry on.
        //
        if (fReaderMgr->skippedSpace())
            fReaderMgr->skipPastSpaces();
        else
            fScanner->emitError(XMLErrs::ExpectedWhitespace);
    }
    toFill.setDefaultType(isFixed ? XMLAttDef::Fixed : XMLAttDef::Default);

    //
    //  Fixed or default, either way a value must follow. If it cannot be
    //  scanned, emit an error and store whatever ended up in the buffer
    //  so that we can keep going.
    //

    // Check for PE ref or optional whitespace
    checkForPERef(false, false, true);

    XMLBufBid bbValue(fBufMgr);
    const bool gotValue = scanAttValue(toFill.getFullName(), bbValue.getBuffer(), toFill.getType());
    if (!gotValue)
        fScanner->emitError(XMLErrs::ExpectedDefAttrDecl);

    toFill.setValue(bbValue.getRawBuffer());
}
//
// This is called after seeing '<!ELEMENT' which indicates that an element
// markup is starting. This guy scans the rest of it and adds it to the
// element decl pool if it has not already been declared.
//
void DTDScanner::scanElementDecl()
{
    //
    //  Space is legal (required actually) here so check for a PE ref. If
    //  we don't get our whitespace, then issue an error, but try to keep
    //  going.
    //
    if (!checkForPERef(true, false, true))
        fScanner->emitError(XMLErrs::ExpectedWhitespace);

    // Get a buffer for the element name and scan in the name
    XMLBufBid bbName(fBufMgr);
    if (!fReaderMgr->getName(bbName.getBuffer()))
    {
        fScanner->emitError(XMLErrs::ExpectedElementName);
        fReaderMgr->skipPastChar(chCloseAngle);
        return;
    }

    // Look this guy up in the element decl pool
    DTDElementDecl* decl = (DTDElementDecl*) fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bbName.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);

    //
    //  If it does not exist, then we need to create it. If it does and
    //  it is marked as declared, then that's an error, but we still need to
    //  scan over the content model so use the dummy declaration that the
    //  parsing code can fill in. An existing decl that is not yet marked
    //  as declared is simply filled in.
    //
    if (decl)
    {
        if (decl->isDeclared())
        {
            if (fScanner->getDoValidation())
                fScanner->getValidator()->emitError(XMLValid::ElementAlreadyExists, bbName.getRawBuffer());

            // Create the dummy on first use, otherwise just rename it
            if (!fDumElemDecl)
                fDumElemDecl = new (fMemoryManager) DTDElementDecl
                (
                    bbName.getRawBuffer()
                    , fEmptyNamespaceId
                    , DTDElementDecl::Any
                    , fMemoryManager
                );
            else
                fDumElemDecl->setElementName(bbName.getRawBuffer(),fEmptyNamespaceId);

            //
            //  Parse into the dummy from here on, so that the existing
            //  declaration is left untouched and the ignored flag below
            //  actually gets set for this redeclaration.
            //
            decl = fDumElemDecl;
        }
    }
    else
    {
        //
        //  Create the new empty declaration to fill in and put it into
        //  the decl pool.
        //
        decl = new (fMemoryManager) DTDElementDecl
        (
            bbName.getRawBuffer()
            , fEmptyNamespaceId
            , DTDElementDecl::Any
            , fMemoryManager
        );
        fDTDGrammar->putElemDecl(decl);
    }

    // Set a flag for whether we will ignore this one
    const bool isIgnored = (decl == fDumElemDecl);

    // Mark this one if being externally declared
    decl->setExternalElemDeclaration(isReadingExternalEntity());

    // Mark this one as being declared
    decl->setCreateReason(XMLElementDecl::Declared);

    // Another check for a PE ref, with at least required whitespace
    if (!checkForPERef(true, false, true))
        fScanner->emitError(XMLErrs::ExpectedWhitespace);

    // And now scan the content model for this guy.
    if (!scanContentSpec(*decl))
    {
        // Could not make sense of it, so resync past the end of the markup
        fReaderMgr->skipPastChar(chCloseAngle);
        return;
    }

    // Another check for a PE ref, but we don't require whitespace here
    checkForPERef(false, false, true);

    // And we should have the ending angle bracket
    if (!fReaderMgr->skippedChar(chCloseAngle))
    {
        fScanner->emitError(XMLErrs::UnterminatedElementDecl, bbName.getRawBuffer());
        fReaderMgr->skipPastChar(chCloseAngle);
    }

    //
    //  If we have a DTD handler tell it about the new element decl. We
    //  tell it if it is one that can be ignored, because it is an override
    //  of a previously existing decl.
    //
    if (fDocTypeHandler)
        fDocTypeHandler->elementDecl(*decl, isIgnored);
}
//
// This method will process a general or parameter entity declaration. The
// entity name and entity text will be stored in the entity pool. The value
// of the entity will be scanned for any other parameter entity or char
// references which will be expanded. So the stored value can only have
// general entity references when done.
//
void DTDScanner::scanEntityDecl()
{
//
// Space is required here, but we cannot check for a PE Ref since
// there could be a legal (no-ref) percent sign here. Since any
// entity that ended here would be illegal, we just skip spaces
// and then check for a percent.
//
if (!fReaderMgr->lookingAtSpace())
fScanner->emitError(XMLErrs::ExpectedWhitespace);
else
fReaderMgr->skipPastSpaces();
const bool isPEDecl = fReaderMgr->skippedChar(chPercent);
//
// If a PE decl, then eat the percent and check for spaces or a
// PE ref on the other side of it. At least spaces are required.
//
if (isPEDecl)
{
if (!checkForPERef(true, false, true))
fScanner->emitError(XMLErrs::ExpectedWhitespace);
}
//
// Now lets get a name, which should be the name of the entity. We
// have to get a buffer for this.
//
XMLBufBid bbName(fBufMgr);
if (!fReaderMgr->getName(bbName.getBuffer()))
{
fScanner->emitError(XMLErrs::ExpectedPEName);
fReaderMgr->skipPastChar(chCloseAngle);
return;
}
// If namespaces are enabled, then no colons allowed
if (fScanner->getDoNamespaces())
{
if (XMLString::indexOf(bbName.getRawBuffer(), chColon) != -1)
fScanner->emitError(XMLErrs::ColonNotLegalWithNS);
}
//
// See if this entity already exists. If so, then the existing one
// takes precendence. So we use the local dummy decl to parse into
// and just ignore the results.
//
DTDEntityDecl* entityDecl;
if (isPEDecl)
entityDecl = fPEntityDeclPool->getByKey(bbName.getRawBuffer());
else
entityDecl = fDTDGrammar->getEntityDecl(bbName.getRawBuffer());
if (entityDecl)
{
if (!fDumEntityDecl)
fDumEntityDecl = new (fMemoryManager) DTDEntityDecl(fMemoryManager);
fDumEntityDecl->setName(bbName.getRawBuffer());
entityDecl = fDumEntityDecl;
}
else
{
// Its not in existence already, then create an entity decl for it
entityDecl = new (fMemoryManager) DTDEntityDecl(bbName.getRawBuffer(), false, fMemoryManager);
//
// Set the declaration location. The parameter indicates whether its
// declared in the content/internal subset, so we know whether or not
// its in the external subset.
//
entityDecl->setDeclaredInIntSubset(fInternalSubset);
// Add it to the appropriate entity decl pool
if (isPEDecl)
fPEntityDeclPool->put(entityDecl);
else
fDTDGrammar->putEntityDecl(entityDecl);
}
// Set a flag that indicates whether we are ignoring this one
const bool isIgnored = (entityDecl == fDumEntityDecl);
// Set the PE flag on it