XMLScanner.cpp
上传用户:zhuqijet
上传日期:2013-06-25
资源大小:10074k
文件大小:72k
- /*
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 1999-2002 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Xerces" and "Apache Software Foundation" must
- * not be used to endorse or promote products derived from this
- * software without prior written permission. For written
- * permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * nor may "Apache" appear in their name, without prior written
- * permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation, and was
- * originally based on software copyright (c) 1999, International
- * Business Machines, Inc., http://www.ibm.com . For more information
- * on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
- /*
- * $Id: XMLScanner.cpp,v 1.45 2003/05/18 14:02:04 knoaman Exp $
- */
- // ---------------------------------------------------------------------------
- // Includes
- // ---------------------------------------------------------------------------
- #include <xercesc/internal/XMLScanner.hpp>
- #include <xercesc/util/Janitor.hpp>
- #include <xercesc/util/Mutexes.hpp>
- #include <xercesc/util/RuntimeException.hpp>
- #include <xercesc/util/UnexpectedEOFException.hpp>
- #include <xercesc/util/XMLMsgLoader.hpp>
- #include <xercesc/util/XMLRegisterCleanup.hpp>
- #include <xercesc/framework/LocalFileInputSource.hpp>
- #include <xercesc/framework/URLInputSource.hpp>
- #include <xercesc/framework/XMLDocumentHandler.hpp>
- #include <xercesc/framework/XMLEntityHandler.hpp>
- #include <xercesc/framework/XMLPScanToken.hpp>
- #include <xercesc/framework/XMLValidator.hpp>
- #include <xercesc/internal/EndOfEntityException.hpp>
- #include <xercesc/validators/DTD/DocTypeHandler.hpp>
- #include <xercesc/validators/common/GrammarResolver.hpp>
- XERCES_CPP_NAMESPACE_BEGIN
- // ---------------------------------------------------------------------------
- // Local static data
- // ---------------------------------------------------------------------------
- static XMLUInt32 gScannerId; // Last scanner id handed out; commonInit() pre-increments it under the scanner mutex
- static bool sRegistered = false; // Set once gScannerMutex() has registered reinitScannerMutex for cleanup
- static XMLMutex* sScannerMutex = 0; // Created on first use by gScannerMutex()
- static XMLRegisterCleanup scannerMutexCleanup; // Registration record used for reinitScannerMutex
- static XMLMsgLoader* gMsgLoader = 0; // Created on first use by gScannerMsgLoader()
- static XMLRegisterCleanup cleanupMsgLoader; // Registration record used for reinitMsgLoader
- // ---------------------------------------------------------------------------
- // Local, static functions
- // ---------------------------------------------------------------------------
- // Cleanup hook for the message loader (registered by gScannerMsgLoader()).
- // Destroys the cached loader, if there is one, and clears the pointer so
- // that the next gScannerMsgLoader() call loads it again.
- void XMLScanner::reinitMsgLoader()
- {
-     if (gMsgLoader != 0)
-     {
-         delete gMsgLoader;
-         gMsgLoader = 0;
-     }
- }
- // Cleanup hook for the scanner mutex (registered by gScannerMutex()).
- // Destroys the mutex, if there is one, and clears both the pointer and the
- // registration flag so that the mutex can be faulted in again later.
- void XMLScanner::reinitScannerMutex()
- {
-     if (sScannerMutex != 0)
-     {
-         delete sScannerMutex;
-         sScannerMutex = 0;
-     }
-     sRegistered = false;
- }
- //
- // Returns the mutex that protects this file's static data, creating it on
- // first use. The mutex is itself the synchronization primitive, so its
- // creation cannot be protected by a lock and is settled with a compare and
- // swap instead.
- //
- static XMLMutex& gScannerMutex()
- {
-     // Already faulted in, so there is nothing left to do
-     if (sScannerMutex)
-         return *sScannerMutex;
-     // Build a candidate and try to install it. A non-zero result from the
-     // swap means someone else installed theirs first, so ours is discarded.
-     XMLMutex* const candidate = new XMLMutex;
-     if (XMLPlatformUtils::compareAndSwap((void**)&sScannerMutex, candidate, 0))
-         delete candidate;
-     {
-         // With the installed mutex held, register its cleanup if nobody
-         // has done so yet, and remember that it has been done.
-         XMLMutexLock lock(sScannerMutex);
-         if (!sRegistered)
-         {
-             scannerMutexCleanup.registerCleanup(XMLScanner::reinitScannerMutex);
-             sRegistered = true;
-         }
-     }
-     return *sScannerMutex;
- }
- static XMLMsgLoader& gScannerMsgLoader()
- {
- XMLMutexLock lockInit(&gScannerMutex());
- // If we haven't loaded our message yet, then do that
- if (!gMsgLoader)
- {
- gMsgLoader = XMLPlatformUtils::loadMsgSet(XMLUni::fgXMLErrDomain);
- if (!gMsgLoader)
- XMLPlatformUtils::panic(PanicHandler::Panic_CantLoadMsgDomain);
- // Register this object to be cleaned up at termination
- cleanupMsgLoader.registerCleanup(XMLScanner::reinitMsgLoader);
- }
- return *gMsgLoader;
- }
- // ---------------------------------------------------------------------------
- // XMLScanner: Constructors and Destructor
- // ---------------------------------------------------------------------------
- // Creates a scanner with no handlers installed. Every option starts at its
- // default, the reader manager, buffer manager and scratch buffers are
- // constructed with 'manager', and 'valToAdopt' (which may be null) becomes
- // the validator. A non-null validator is flagged as user supplied and is
- // connected to this scanner through initValidator().
- XMLScanner::XMLScanner(XMLValidator* const valToAdopt,
-                        MemoryManager* const manager)
-     // Option flags
-     : fCalculateSrcOfs(false),
-       fDoNamespaces(false),
-       fExitOnFirstFatal(true),
-       fValidationConstraintFatal(false),
-       fInException(false),
-       fStandalone(false),
-       fHasNoDTD(true),
-       fValidate(false),
-       fValidatorFromUser(false),
-       fDoSchema(false),
-       fSchemaFullChecking(false),
-       fToCacheGrammar(false),
-       fUseCachedGrammar(false),
-       fLoadExternalDTD(true),
-       fNormalizeData(true),
-       // Counters and ids
-       fErrorCount(0),
-       fEmptyNamespaceId(0),
-       fUnknownNamespaceId(0),
-       fXMLNamespaceId(0),
-       fXMLNSNamespaceId(0),
-       fSchemaNamespaceId(0),
-       fScannerId(0),
-       fSequenceId(0),
-       // Handlers and containers (the containers are created in commonInit)
-       fAttrList(0),
-       fDocHandler(0),
-       fDocTypeHandler(0),
-       fEntityHandler(0),
-       fErrorReporter(0),
-       fErrorHandler(0),
-       fIDRefList(0),
-       fReaderMgr(manager),
-       fValidator(valToAdopt),
-       fValScheme(Val_Never),
-       // Grammar related state
-       fGrammarResolver(0),
-       fGrammar(0),
-       fRootGrammar(0),
-       fURIStringPool(0),
-       fRootElemName(0),
-       fExternalSchemaLocation(0),
-       fExternalNoNamespaceSchemaLocation(0),
-       fStandardUriConformant(false),
-       fSecurityManager(0),
-       fXMLVersion(XMLReader::XMLV1_0),
-       // Memory manager and the buffers constructed with it
-       fMemoryManager(manager),
-       fBufMgr(manager),
-       fAttNameBuf(1023, manager),
-       fAttValueBuf(1023, manager),
-       fCDataBuf(1023, manager),
-       fQNameBuf(1023, manager),
-       fPrefixBuf(1023, manager),
-       fURIBuf(1023, manager)
- {
-     commonInit();
-     // Without a validator there is nothing left to connect
-     if (!fValidator)
-         return;
-     fValidatorFromUser = true;
-     initValidator(fValidator);
- }
- XMLScanner::XMLScanner( XMLDocumentHandler* const docHandler
- , DocTypeHandler* const docTypeHandler
- , XMLEntityHandler* const entityHandler
- , XMLErrorReporter* const errHandler
- , XMLValidator* const valToAdopt
- , MemoryManager* const manager) :
- fCalculateSrcOfs(false)
- , fDoNamespaces(false)
- , fExitOnFirstFatal(true)
- , fValidationConstraintFatal(false)
- , fInException(false)
- , fStandalone(false)
- , fHasNoDTD(true)
- , fValidate(false)
- , fValidatorFromUser(false)
- , fDoSchema(false)
- , fSchemaFullChecking(false)
- , fToCacheGrammar(false)
- , fUseCachedGrammar(false)
- , fLoadExternalDTD(true)
- , fNormalizeData(true)
- , fErrorCount(0)
- , fEmptyNamespaceId(0)
- , fUnknownNamespaceId(0)
- , fXMLNamespaceId(0)
- , fXMLNSNamespaceId(0)
- , fSchemaNamespaceId(0)
- , fScannerId(0)
- , fSequenceId(0)
- , fAttrList(0)
- , fDocHandler(docHandler)
- , fDocTypeHandler(docTypeHandler)
- , fEntityHandler(entityHandler)
- , fErrorReporter(errHandler)
- , fErrorHandler(0)
- , fIDRefList(0)
- , fReaderMgr(manager)
- , fValidator(valToAdopt)
- , fValScheme(Val_Never)
- , fGrammarResolver(0)
- , fGrammar(0)
- , fRootGrammar(0)
- , fURIStringPool(0)
- , fRootElemName(0)
- , fExternalSchemaLocation(0)
- , fExternalNoNamespaceSchemaLocation(0)
- , fStandardUriConformant(false)
- , fSecurityManager(0)
- , fXMLVersion(XMLReader::XMLV1_0)
- , fMemoryManager(manager)
- , fBufMgr(manager)
- , fAttNameBuf(1023, manager)
- , fAttValueBuf(1023, manager)
- , fCDataBuf(1023, manager)
- , fQNameBuf(1023, manager)
- , fPrefixBuf(1023, manager)
- , fURIBuf(1023, manager)
- {
- commonInit();
- if (valToAdopt){
- fValidatorFromUser = true;
- initValidator(fValidator);
- }
- }
- // Deletes the attribute list and the ID ref list, and hands the root
- // element name and the two external schema location buffers back to the
- // memory manager. The validator is not deleted here.
- XMLScanner::~XMLScanner()
- {
-     // The two containers created in commonInit()
-     delete fAttrList;
-     delete fIDRefList;
-     // Buffers released through the memory manager rather than delete []
-     fMemoryManager->deallocate(fRootElemName);
-     fMemoryManager->deallocate(fExternalSchemaLocation);
-     fMemoryManager->deallocate(fExternalNoNamespaceSchemaLocation);
- }
- // Installs 'valToAdopt' as the validator and flags it as user supplied. A
- // validator that was itself user supplied is deleted first. The new
- // validator is handed to initValidator(), which dereferences it, so a null
- // pointer must not be passed.
- void XMLScanner::setValidator(XMLValidator* const valToAdopt)
- {
-     // Only a user supplied predecessor is deleted here
-     if (fValidatorFromUser)
-     {
-         delete fValidator;
-     }
-     fValidatorFromUser = true;
-     fValidator = valToAdopt;
-     initValidator(valToAdopt);
- }
- // ---------------------------------------------------------------------------
- // XMLScanner: Main entry point to scan a document
- // ---------------------------------------------------------------------------
- // Scans the complete document named by 'systemId'. The id is tried as a
- // URL first. A relative or malformed URL is retried as a local file name,
- // unless fStandardUriConformant is set, in which case a fatal error is
- // emitted and the method returns without scanning anything. Any other
- // XMLException raised while the input source is set up is reported through
- // emitError() and also ends the call.
- void XMLScanner::scanDocument(const XMLCh* const systemId)
- {
-     InputSource* srcToUse = 0;
-     try
-     {
-         // The primary document needs a fully qualified URL, so a relative
-         // one most likely means that a file name was passed in.
-         XMLURL tmpURL(systemId, fMemoryManager);
-         const bool relativeURL = tmpURL.isRelative();
-         if (relativeURL && !fStandardUriConformant)
-         {
-             srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
-         }
-         else if (relativeURL)
-         {
-             // This is the top of the try/catch, so ThrowXML cannot be
-             // used; the error is emitted directly instead.
-             MalformedURLException noProtocol(__FILE__, __LINE__, XMLExcepts::URL_NoProtocolPresent);
-             fInException = true;
-             emitError(XMLErrs::XMLException_Fatal, noProtocol.getType(), noProtocol.getMessage());
-             return;
-         }
-         else if (fStandardUriConformant && tmpURL.hasInvalidChar())
-         {
-             MalformedURLException badURL(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL);
-             fInException = true;
-             emitError(XMLErrs::XMLException_Fatal, badURL.getType(), badURL.getMessage());
-             return;
-         }
-         else
-         {
-             srcToUse = new (fMemoryManager) URLInputSource(tmpURL, fMemoryManager);
-         }
-     }
-     catch(const MalformedURLException& urlError)
-     {
-         if (fStandardUriConformant)
-         {
-             // Same situation as above: no ThrowXML here, emit directly.
-             // Every MalformedURLException is treated as fatal, so its
-             // error type is not looked at.
-             fInException = true;
-             emitError(XMLErrs::XMLException_Fatal, urlError.getType(), urlError.getMessage());
-             return;
-         }
-         // Not a usable URL, so treat the id as a file name
-         srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
-     }
-     catch(const XMLException& excToCatch)
-     {
-         // Any other XMLException is reported with the code that matches
-         // its error type; an exception thrown by the report propagates.
-         fInException = true;
-         const XMLErrs::Codes reportAs =
-             (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning) ? XMLErrs::XMLException_Warning
-             : (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal) ? XMLErrs::XMLException_Fatal
-             : XMLErrs::XMLException_Error;
-         emitError(reportAs, excToCatch.getType(), excToCatch.getMessage());
-         return;
-     }
-     catch(...)
-     {
-         // Not ours to handle, pass it on unchanged
-         throw;
-     }
-     // The janitor deletes the source once the scan is over
-     Janitor<InputSource> janSrc(srcToUse);
-     scanDocument(*srcToUse);
- }
- // Local code page flavor of scanDocument(): transcodes 'systemId' and
- // forwards to the XMLCh version. The transcoded copy is released by the
- // janitor when this method returns.
- void XMLScanner::scanDocument(const char* const systemId)
- {
-     XMLCh* const wideSystemId = XMLString::transcode(systemId, fMemoryManager);
-     ArrayJanitor<XMLCh> janWideId(wideSystemId, fMemoryManager);
-     scanDocument(wideSystemId);
- }
- // Starts a progressive parse of the document named by 'systemId'. The id
- // is turned into an input source the same way scanDocument() does it: as a
- // URL first, then as a local file name unless fStandardUriConformant is
- // set. The actual work is done by the InputSource version, which scans the
- // prolog and fills 'toFill' with a token for subsequent scanNext() calls.
- //
- // Returns true if the token is legal and ready for further use. Returns
- // false if no input source could be set up or the prolog scan failed, in
- // which case the token is not going to work on scanNext() calls.
- bool XMLScanner::scanFirst(const XMLCh* const systemId,
-                            XMLPScanToken& toFill)
- {
-     InputSource* srcToUse = 0;
-     try
-     {
-         // The primary document needs a fully qualified URL, so a relative
-         // one most likely means that a file name was passed in.
-         XMLURL tmpURL(systemId, fMemoryManager);
-         const bool relativeURL = tmpURL.isRelative();
-         if (relativeURL && !fStandardUriConformant)
-         {
-             srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
-         }
-         else if (relativeURL)
-         {
-             // This is the top of the try/catch, so ThrowXML cannot be
-             // used; the error is emitted directly instead.
-             MalformedURLException noProtocol(__FILE__, __LINE__, XMLExcepts::URL_NoProtocolPresent);
-             fInException = true;
-             emitError(XMLErrs::XMLException_Fatal, noProtocol.getType(), noProtocol.getMessage());
-             return false;
-         }
-         else if (fStandardUriConformant && tmpURL.hasInvalidChar())
-         {
-             MalformedURLException badURL(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL);
-             fInException = true;
-             emitError(XMLErrs::XMLException_Fatal, badURL.getType(), badURL.getMessage());
-             return false;
-         }
-         else
-         {
-             srcToUse = new (fMemoryManager) URLInputSource(tmpURL, fMemoryManager);
-         }
-     }
-     catch(const MalformedURLException& urlError)
-     {
-         if (fStandardUriConformant)
-         {
-             // Same situation as above: no ThrowXML here, emit directly.
-             // Every MalformedURLException is treated as fatal, so its
-             // error type is not looked at.
-             fInException = true;
-             emitError(XMLErrs::XMLException_Fatal, urlError.getType(), urlError.getMessage());
-             return false;
-         }
-         // Not a usable URL, so treat the id as a file name
-         srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
-     }
-     catch(const XMLException& excToCatch)
-     {
-         // Any other XMLException is reported with the code that matches
-         // its error type; an exception thrown by the report propagates.
-         fInException = true;
-         const XMLErrs::Codes reportAs =
-             (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning) ? XMLErrs::XMLException_Warning
-             : (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal) ? XMLErrs::XMLException_Fatal
-             : XMLErrs::XMLException_Error;
-         emitError(reportAs, excToCatch.getType(), excToCatch.getMessage());
-         return false;
-     }
-     catch(...)
-     {
-         // Not ours to handle, pass it on unchanged
-         throw;
-     }
-     // The janitor deletes the source once the prolog scan is over
-     Janitor<InputSource> janSrc(srcToUse);
-     return scanFirst(*srcToUse, toFill);
- }
- // Local code page flavor of scanFirst(): transcodes 'systemId' and
- // forwards to the XMLCh version, whose result is returned as is. The
- // transcoded copy is released by the janitor when this method returns.
- bool XMLScanner::scanFirst(const char* const systemId,
-                            XMLPScanToken& toFill)
- {
-     XMLCh* const wideSystemId = XMLString::transcode(systemId, fMemoryManager);
-     ArrayJanitor<XMLCh> janWideId(wideSystemId, fMemoryManager);
-     return scanFirst(wideSystemId, toFill);
- }
- // Starts a progressive parse on 'src': resets the scanner for a new run,
- // signals the start of the document and scans the prolog. On success the
- // caller's token is filled in with this scanner's id and the new sequence
- // id and true is returned. When the prolog scan fails the reader manager
- // is reset and false is returned; exceptions that are not scanner errors
- // are passed on after the reader manager has been reset.
- bool XMLScanner::scanFirst(const InputSource& src,
-                            XMLPScanToken& toFill)
- {
-     // A new scan cycle gets a new sequence id, which invalidates every
-     // token handed out before.
-     ++fSequenceId;
-     // Get the scanner and whatever is plugged into it ready for a new run.
-     // This resets the data structures, creates the initial reader, pushes
-     // it on the stack and sets up the base document path.
-     scanReset(src);
-     // Let the document handler, if any, know that a document starts
-     if (fDocHandler)
-     {
-         fDocHandler->startDocument();
-     }
-     try
-     {
-         // The first scan covers the prolog only, i.e. everything in front
-         // of the root element, DTD subsets included.
-         scanProlog();
-         // Hitting the end of input already means this is not a valid XML file
-         if (fReaderMgr.atEOF())
-             emitError(XMLErrs::EmptyMainEntity);
-     }
-     // NOTE:
-     //
-     // In the handlers below every emitError() call MUST happen before the
-     // reader manager is reset. Emitting an error asks the reader manager
-     // for the position in the XML source, which fails after a reset.
-     catch(const XMLErrs::Codes)
-     {
-         // A 'first failure' exit: reset and report failure
-         fReaderMgr.reset();
-         return false;
-     }
-     catch(const XMLValid::Codes)
-     {
-         // A 'first fatal error' exit: reset and report failure
-         fReaderMgr.reset();
-         return false;
-     }
-     catch(const XMLException& excToCatch)
-     {
-         // Report the exception with the code that matches its error type.
-         // The reader manager gets reset whether or not the report itself
-         // throws.
-         fInException = true;
-         try
-         {
-             const XMLErrs::Codes reportAs =
-                 (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning) ? XMLErrs::XMLException_Warning
-                 : (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal) ? XMLErrs::XMLException_Fatal
-                 : XMLErrs::XMLException_Error;
-             emitError(reportAs, excToCatch.getType(), excToCatch.getMessage());
-         }
-         catch(...)
-         {
-             // The report threw: reset, then pass that error on
-             fReaderMgr.reset();
-             throw;
-         }
-         // Reported without trouble: reset and report failure
-         fReaderMgr.reset();
-         return false;
-     }
-     catch(...)
-     {
-         // Anything else: reset, then pass the original error on
-         fReaderMgr.reset();
-         throw;
-     }
-     // The prolog went through, so make the caller's token legal
-     toFill.set(fScannerId, fSequenceId);
-     return true;
- }
- // Abandons the progressive parse that 'token' belongs to. Throws a
- // RuntimeException (Scan_BadPScanToken) if the token is no longer legal.
- // Otherwise the reader manager is reset, the sequence id is bumped so that
- // all outstanding tokens become invalid, and the error count is cleared.
- void XMLScanner::scanReset(XMLPScanToken& token)
- {
-     // Refuse to work with a stale token
-     if (!isLegalToken(token))
-     {
-         ThrowXML(RuntimeException, XMLExcepts::Scan_BadPScanToken);
-     }
-     // Drop the current readers
-     fReaderMgr.reset();
-     // A new sequence number invalidates every token handed out so far
-     ++fSequenceId;
-     // Start counting errors from scratch
-     fErrorCount = 0;
- }
- // Copies the handlers and parse options of 'refScanner' into this scanner
- // by reading each getter and passing the result to the matching setter.
- // The calls are made in a fixed order, handlers first.
- void XMLScanner::setParseSettings(XMLScanner* const refScanner)
- {
-     XMLScanner& source = *refScanner;
-     // Handlers
-     setDocHandler(source.getDocHandler());
-     setDocTypeHandler(source.getDocTypeHandler());
-     setErrorHandler(source.getErrorHandler());
-     setErrorReporter(source.getErrorReporter());
-     setEntityHandler(source.getEntityHandler());
-     // Boolean options
-     setDoNamespaces(source.getDoNamespaces());
-     setDoSchema(source.getDoSchema());
-     setCalculateSrcOfs(source.getCalculateSrcOfs());
-     setStandardUriConformant(source.getStandardUriConformant());
-     setExitOnFirstFatal(source.getExitOnFirstFatal());
-     setValidationConstraintFatal(source.getValidationConstraintFatal());
-     setValidationSchemaFullChecking(source.getValidationSchemaFullChecking());
-     cacheGrammarFromParse(source.isCachingGrammarFromParse());
-     useCachedGrammarInParse(source.isUsingCachedGrammarInParse());
-     setLoadExternalDTD(source.getLoadExternalDTD());
-     setNormalizeData(source.getNormalizeData());
-     // Schema locations and validation scheme
-     setExternalSchemaLocation(source.getExternalSchemaLocation());
-     setExternalNoNamespaceSchemaLocation(source.getExternalNoNamespaceSchemaLocation());
-     setValidationScheme(source.getValidationScheme());
- }
- // ---------------------------------------------------------------------------
- // XMLScanner: Private helper methods.
- // ---------------------------------------------------------------------------
- // Initialization shared by all constructors: assigns this scanner its id
- // and creates the attribute list and the ID ref list.
- void XMLScanner::commonInit()
- {
-     // The id comes from a counter that is static to this file, so the
-     // scanner mutex is held while the counter is bumped and read.
-     {
-         XMLMutexLock lockInit(&gScannerMutex());
-         ++gScannerId;
-         fScannerId = gScannerId;
-     }
-     // Attribute values are stored in this list during start tag
-     // processing. The initial size should do for most documents and the
-     // list grows as required.
-     fAttrList = new (fMemoryManager) RefVectorOf<XMLAttr>(32, true, fMemoryManager);
-     // The ID ref list is what XML 1.0 ID ref semantics are enforced with,
-     // i.e. every ID ref has to refer to an element that exists.
-     fIDRefList = new (fMemoryManager) RefHashTableOf<XMLRefInfo>(109, fMemoryManager);
-     // No GrammarResolver is created here; that step is disabled:
-     //fGrammarResolver = new GrammarResolver();
- }
- // Connects 'theValidator' to this scanner: it is given the scanner itself,
- // the reader manager and the buffer manager, followed by the current error
- // reporter. The pointer is dereferenced and so must not be null.
- void XMLScanner::initValidator(XMLValidator* theValidator)
- {
-     // What the validator needs to know in order to do its work
-     theValidator->setScannerInfo(this, &fReaderMgr, &fBufMgr);
-     // Where the validator is to send its errors
-     theValidator->setErrorReporter(fErrorReporter);
- }
- // ---------------------------------------------------------------------------
- // XMLScanner: Error emitting methods
- // ---------------------------------------------------------------------------
- // The emitError() methods are called whenever the scanner wants to emit an
- // error. They load the message text, do token replacement where
- // replacement text is given, and call the error reporter if one is
- // installed.
- //
- // This overload emits 'toEmit' without any replacement text. Every code
- // that is not a warning bumps the error count. If the code is fatal,
- // fExitOnFirstFatal is set and fInException is not, 'toEmit' itself is
- // thrown after the report has been made.
- void XMLScanner::emitError(const XMLErrs::Codes toEmit)
- {
-     // Bump the error count if it is not a warning
-     if (XMLErrs::errorType(toEmit) != XMLErrorReporter::ErrType_Warning)
-         incrementErrorCount();
-     if (fErrorReporter)
-     {
-         // Load the message into a local for display
-         const unsigned int msgSize = 1023;
-         XMLCh errText[msgSize + 1];
-         if (!gScannerMsgLoader().loadMsg(toEmit, errText, msgSize))
-         {
-             // The local buffer starts out uninitialized, so after a failed
-             // load its content cannot be relied on. Report an empty text
-             // instead of handing indeterminate data to the reporter.
-             // TBD: Probably should load a default msg here
-             errText[0] = 0;
-         }
-         // Create a LastExtEntityInfo structure and get the reader manager
-         // to fill it in for us. This will give us the information about
-         // the last reader on the stack that was an external entity of some
-         // sort (i.e. it will ignore internal entities).
-         ReaderMgr::LastExtEntityInfo lastInfo;
-         fReaderMgr.getLastExtEntityInfo(lastInfo);
-         fErrorReporter->error
-         (
-             toEmit
-             , XMLUni::fgXMLErrDomain
-             , XMLErrs::errorType(toEmit)
-             , errText
-             , lastInfo.systemId
-             , lastInfo.publicId
-             , lastInfo.lineNumber
-             , lastInfo.colNumber
-         );
-     }
-     // Bail out if it is fatal and we are to give up on the first fatal error
-     if (XMLErrs::isFatal(toEmit) && fExitOnFirstFatal && !fInException)
-         throw toEmit;
- }
- // Emits 'toEmit' with up to four XMLCh replacement texts, which are passed
- // to the message loader together with the code. Every code that is not a
- // warning bumps the error count, and the error reporter is called if one
- // is installed. If the code is fatal, fExitOnFirstFatal is set and
- // fInException is not, 'toEmit' itself is thrown after the report has been
- // made.
- void XMLScanner::emitError( const XMLErrs::Codes toEmit
-                           , const XMLCh* const text1
-                           , const XMLCh* const text2
-                           , const XMLCh* const text3
-                           , const XMLCh* const text4)
- {
-     // Bump the error count if it is not a warning
-     if (XMLErrs::errorType(toEmit) != XMLErrorReporter::ErrType_Warning)
-         incrementErrorCount();
-     if (fErrorReporter)
-     {
-         // Load the message into a local and replace any tokens found in
-         // the text.
-         const unsigned int maxChars = 2047;
-         XMLCh errText[maxChars + 1];
-         if (!gScannerMsgLoader().loadMsg(toEmit, errText, maxChars, text1, text2, text3, text4))
-         {
-             // The local buffer starts out uninitialized, so after a failed
-             // load its content cannot be relied on. Report an empty text
-             // instead of handing indeterminate data to the reporter.
-             // TBD: Should probably load a default message here
-             errText[0] = 0;
-         }
-         // Create a LastExtEntityInfo structure and get the reader manager
-         // to fill it in for us. This will give us the information about
-         // the last reader on the stack that was an external entity of some
-         // sort (i.e. it will ignore internal entities).
-         ReaderMgr::LastExtEntityInfo lastInfo;
-         fReaderMgr.getLastExtEntityInfo(lastInfo);
-         fErrorReporter->error
-         (
-             toEmit
-             , XMLUni::fgXMLErrDomain
-             , XMLErrs::errorType(toEmit)
-             , errText
-             , lastInfo.systemId
-             , lastInfo.publicId
-             , lastInfo.lineNumber
-             , lastInfo.colNumber
-         );
-     }
-     // Bail out if it is fatal and we are to give up on the first fatal error
-     if (XMLErrs::isFatal(toEmit) && fExitOnFirstFatal && !fInException)
-         throw toEmit;
- }
- // Emits 'toEmit' with up to four char replacement texts, which are passed
- // to the message loader together with the code. Every code that is not a
- // warning bumps the error count, and the error reporter is called if one
- // is installed. If the code is fatal, fExitOnFirstFatal is set and
- // fInException is not, 'toEmit' itself is thrown after the report has been
- // made.
- void XMLScanner::emitError( const XMLErrs::Codes toEmit
-                           , const char* const text1
-                           , const char* const text2
-                           , const char* const text3
-                           , const char* const text4)
- {
-     // Bump the error count if it is not a warning
-     if (XMLErrs::errorType(toEmit) != XMLErrorReporter::ErrType_Warning)
-         incrementErrorCount();
-     if (fErrorReporter)
-     {
-         // Load the message into a local and replace any tokens found in
-         // the text.
-         const unsigned int maxChars = 2047;
-         XMLCh errText[maxChars + 1];
-         if (!gScannerMsgLoader().loadMsg(toEmit, errText, maxChars, text1, text2, text3, text4))
-         {
-             // The local buffer starts out uninitialized, so after a failed
-             // load its content cannot be relied on. Report an empty text
-             // instead of handing indeterminate data to the reporter.
-             // TBD: Should probably load a default message here
-             errText[0] = 0;
-         }
-         // Create a LastExtEntityInfo structure and get the reader manager
-         // to fill it in for us. This will give us the information about
-         // the last reader on the stack that was an external entity of some
-         // sort (i.e. it will ignore internal entities).
-         ReaderMgr::LastExtEntityInfo lastInfo;
-         fReaderMgr.getLastExtEntityInfo(lastInfo);
-         fErrorReporter->error
-         (
-             toEmit
-             , XMLUni::fgXMLErrDomain
-             , XMLErrs::errorType(toEmit)
-             , errText
-             , lastInfo.systemId
-             , lastInfo.publicId
-             , lastInfo.lineNumber
-             , lastInfo.colNumber
-         );
-     }
-     // Bail out if it is fatal and we are to give up on the first fatal error
-     if (XMLErrs::isFatal(toEmit) && fExitOnFirstFatal && !fInException)
-         throw toEmit;
- }
- // ---------------------------------------------------------------------------
- // XMLScanner: Getter methods
- // ---------------------------------------------------------------------------
- // This method allows the caller to query the current location of the scanner.
- // It will return the sys/public ids of the current entity, and the line/col
- // position within it.
- //
- // NOTE: This API returns the location within the last external file. So if it's
- // currently scanning an entity, the position returned will be the end of
- // the entity reference in the file that had the reference.
- //
- /*bool
- XMLScanner::getLastExtLocation( XMLCh* const sysIdToFill
- , const unsigned int maxSysIdChars
- , XMLCh* const pubIdToFill
- , const unsigned int maxPubIdChars
- , XMLSSize_t& lineToFill
- , XMLSSize_t& colToFill) const
- {
- // Create a local info object and get it filled in by the reader manager
- ReaderMgr::LastExtEntityInfo lastInfo;
- fReaderMgr.getLastExtEntityInfo(lastInfo);
- // Fill in the line and column number
- lineToFill = lastInfo.lineNumber;
- colToFill = lastInfo.colNumber;
- // And copy over as much of the ids as will fit
- sysIdToFill[0] = 0;
- if (lastInfo.systemId)
- {
- if (XMLString::stringLen(lastInfo.systemId) > maxSysIdChars)
- return false;
- XMLString::copyString(sysIdToFill, lastInfo.systemId);
- }
- pubIdToFill[0] = 0;
- if (lastInfo.publicId)
- {
- if (XMLString::stringLen(lastInfo.publicId) > maxPubIdChars)
- return false;
- XMLString::copyString(pubIdToFill, lastInfo.publicId);
- }
- return true;
- }*/
- // ---------------------------------------------------------------------------
- // XMLScanner: Private scanning methods
- // ---------------------------------------------------------------------------
- // Called after the end of the root element to deal with whatever trails
- // it. PIs and comments are scanned, whitespace is either passed to the
- // document handler as ignorable whitespace or skipped, and anything else
- // (an XML decl included) is reported as an error and skipped up to the
- // next '>'. The loop ends when the end of input is reached.
- void XMLScanner::scanMiscellaneous()
- {
-     // Scratch buffer used to collect whitespace
-     XMLBufBid bbCData(&fBufMgr);
-     for (;;)
-     {
-         try
-         {
-             const XMLCh nextCh = fReaderMgr.peekNextChar();
-             // A zero character means end of input, so we are done
-             if (!nextCh)
-                 break;
-             if (nextCh == chOpenAngle)
-             {
-                 if (checkXMLDecl(true))
-                 {
-                     // An XML decl is not allowed after the content
-                     emitError(XMLErrs::NotValidAfterContent);
-                     fReaderMgr.skipPastChar(chCloseAngle);
-                 }
-                 else if (fReaderMgr.skippedString(XMLUni::fgPIString))
-                 {
-                     scanPI();
-                 }
-                 else if (fReaderMgr.skippedString(XMLUni::fgCommentString))
-                 {
-                     scanComment();
-                 }
-                 else
-                 {
-                     // No other markup may show up here, so give up on it
-                     emitError(XMLErrs::ExpectedCommentOrPI);
-                     fReaderMgr.skipPastChar(chCloseAngle);
-                 }
-                 continue;
-             }
-             if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh))
-             {
-                 // Nobody to tell about the spaces, so just skip them
-                 if (!fDocHandler)
-                 {
-                     fReaderMgr.skipPastSpaces();
-                     continue;
-                 }
-                 // Collect the spaces and call the document handler back
-                 fReaderMgr.getSpaces(bbCData.getBuffer());
-                 fDocHandler->ignorableWhitespace(bbCData.getRawBuffer(), bbCData.getLen(), false);
-                 continue;
-             }
-             // Neither markup nor whitespace
-             emitError(XMLErrs::ExpectedCommentOrPI);
-             fReaderMgr.skipPastChar(chCloseAngle);
-         }
-         catch(const EndOfEntityException&)
-         {
-             // An entity leaked out of the content part of the document.
-             // Report it and carry on.
-             emitError(XMLErrs::EntityPropogated);
-         }
-     }
- }
- // Scans a processing instruction and passes it to the document handler, if
- // there is one. On entry the '<?' has just been scanned, so the PI target
- // name comes next. A PI whose name cannot be scanned, or that has no data
- // and is not closed by '?>', is reported, skipped up to the next '>' and
- // not passed on. Hitting the end of input inside the PI data is reported
- // and then raised as an UnexpectedEOFException.
- void XMLScanner::scanPI()
- {
-     // Spaces are not allowed here. Report them and skip over them, so that
-     // the scan can simply carry on when not in 'first error' mode.
-     if (fReaderMgr.lookingAtSpace())
-     {
-         emitError(XMLErrs::PINameExpected);
-         fReaderMgr.skipPastSpaces();
-     }
-     // Scan the PI name into a buffer of its own
-     XMLBufBid bbName(&fBufMgr);
-     if (!fReaderMgr.getName(bbName.getBuffer()))
-     {
-         emitError(XMLErrs::PINameExpected);
-         fReaderMgr.skipPastChar(chCloseAngle);
-         return;
-     }
-     const XMLCh* const piName = bbName.getRawBuffer();
-     // Any spelling of 'xml' as the name gets reported
-     if (!XMLString::compareIString(piName, XMLUni::fgXMLString))
-         emitError(XMLErrs::NoPIStartsWithXML);
-     // With namespaces enabled the name may not contain a colon
-     if (fDoNamespaces && (XMLString::indexOf(piName, chColon) != -1))
-         emitError(XMLErrs::ColonNotLegalWithNS);
-     // The data part, if any, goes into a second buffer
-     XMLBufBid bbTarget(&fBufMgr);
-     if (!fReaderMgr.skippedSpace())
-     {
-         // No space after the name means no data, so '?>' has to follow
-         if (!fReaderMgr.skippedChar(chQuestion) || !fReaderMgr.skippedChar(chCloseAngle))
-         {
-             emitError(XMLErrs::UnterminatedPI);
-             fReaderMgr.skipPastChar(chCloseAngle);
-             return;
-         }
-     }
-     else
-     {
-         // There is data. Leading spaces are not part of it.
-         fReaderMgr.skipPastSpaces();
-         bool gotLeadingSurrogate = false;
-         for (;;)
-         {
-             const XMLCh nextCh = fReaderMgr.getNextChar();
-             // The end of input is always an error in here
-             if (!nextCh)
-             {
-                 emitError(XMLErrs::UnterminatedPI);
-                 ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
-             }
-             // A '?' ends the data only if a '>' follows it directly
-             if ((nextCh == chQuestion) && fReaderMgr.skippedChar(chCloseAngle))
-                 break;
-             // Surrogates have to come in pairs, leading one first
-             const bool isLeadingSurrogate = (nextCh >= 0xD800) && (nextCh <= 0xDBFF);
-             if (isLeadingSurrogate)
-             {
-                 // Two leading surrogates in a row are an error
-                 if (gotLeadingSurrogate)
-                     emitError(XMLErrs::Expected2ndSurrogateChar);
-                 gotLeadingSurrogate = true;
-             }
-             else if (gotLeadingSurrogate)
-             {
-                 // A leading surrogate must be followed by a trailing one
-                 const bool isTrailingSurrogate = (nextCh >= 0xDC00) && (nextCh <= 0xDFFF);
-                 if (!isTrailingSurrogate)
-                     emitError(XMLErrs::Expected2ndSurrogateChar);
-                 gotLeadingSurrogate = false;
-             }
-             else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh))
-             {
-                 // Not even a valid XML character; report it in hex
-                 XMLCh hexText[9];
-                 XMLString::binToText(nextCh, hexText, 8, 16);
-                 emitError(XMLErrs::InvalidCharacter, hexText);
-             }
-             // Reported or not, the character becomes part of the data
-             bbTarget.append(nextCh);
-         }
-     }
-     // The raw pointer is taken only now that the buffer is complete
-     const XMLCh* const piData = bbTarget.getRawBuffer();
-     // Pass the PI on if there is a handler for it
-     if (fDocHandler)
-         fDocHandler->docPI(piName, piData);
- }
- // Walks the prolog, i.e. everything from the start of the input up to, but
- // not including, the root element. The prolog may be empty, although an
- // XMLDecl is usually present.
- //
- // On return we are either at the end of the input or positioned on the
- // opening '<' of what is assumed to be the root element.
- void XMLScanner::scanProlog()
- {
-     // Scratch buffer used to gather whitespace runs for the handler
-     XMLBufBid wsBuf(&fBufMgr);
-     try
-     {
-         // With no content at all, this can run to the end of the input
-         for (;;)
-         {
-             const XMLCh peeked = fReaderMgr.peekNextChar();
-             if (peeked != chOpenAngle)
-             {
-                 if (fReaderMgr.getCurrentReader()->isWhitespace(peeked))
-                 {
-                     // Without a document handler nobody wants the spaces,
-                     // so they are simply discarded.
-                     if (!fDocHandler)
-                     {
-                         fReaderMgr.skipPastSpaces();
-                         continue;
-                     }
-                     fReaderMgr.getSpaces(wsBuf.getBuffer());
-                     fDocHandler->ignorableWhitespace
-                     (
-                         wsBuf.getRawBuffer()
-                         , wsBuf.getLen()
-                         , false
-                     );
-                     continue;
-                 }
-                 // Neither markup nor whitespace; that is not legal here
-                 emitError(XMLErrs::InvalidDocumentStructure);
-                 // At the end of the input there is nothing left to scan
-                 if (!peeked)
-                     break;
-                 fReaderMgr.skipPastChar(chCloseAngle);
-                 continue;
-             }
-             // We are looking at '<'. It could open the XML decl, a PI, a
-             // comment, the doc type line, or the root element.
-             if (checkXMLDecl(true))
-             {
-                 // checkXMLDecl() has consumed '<?xml' plus one whitespace
-                 // character. If that does not leave the reader at line 1,
-                 // column 7, the decl was not the first text in the input,
-                 // so it is invalid.
-                 const XMLReader* const rdr = fReaderMgr.getCurrentReader();
-                 const bool atDocStart = (rdr->getLineNumber() == 1)
-                                      && (rdr->getColumnNumber() == 7);
-                 if (!atDocStart)
-                     emitError(XMLErrs::XMLDeclMustBeFirst);
-                 scanXMLDecl(Decl_XML);
-             }
-             else if (fReaderMgr.skippedString(XMLUni::fgPIString))
-             {
-                 scanPI();
-             }
-             else if (fReaderMgr.skippedString(XMLUni::fgCommentString))
-             {
-                 scanComment();
-             }
-             else if (fReaderMgr.skippedString(XMLUni::fgDocTypeString))
-             {
-                 scanDocTypeDecl();
-                 // When a grammar is being reused it was already validated
-                 // during the first scan, so skip that work here.
-                 const bool needsValidation = fValidate && !fGrammar->getValidated();
-                 if (needsValidation)
-                     fValidator->preContentValidation(fUseCachedGrammar, true);
-             }
-             else
-             {
-                 // Assume it is the start of the root element
-                 return;
-             }
-         }
-     }
-     catch(const EndOfEntityException&)
-     {
-         // An end of entity should never surface here. It should only occur
-         // inside the doc type scanning method and be handled there.
-         emitError
-         (
-             XMLErrs::UnexpectedEOE
-             , "in prolog"
-         );
-     }
- }
- // Scans the remainder of an '<?xml ... ?>' declaration. The pseudo-attributes
- // are read in a simple loop rather than a state machine, and scanning ends
- // just past the closing '>'. For Decl_XML the document handler's XMLDecl
- // callback is invoked (if a handler is set); for Decl_Text the doc type
- // handler's TextDecl callback is invoked instead (again, if one is set).
- // On entry, the <?xml has been scanned, and we pick it up from there.
- //
- // NOTE: In order to provide good recovery from bad XML here, we try to be
- // very flexible. Whatever order the strings appear in, we keep going and
- // just issue errors; only a value that cannot be read as a quoted string aborts the scan early.
- //
- // The parameter tells us which type of decl we should expect, Text or XML.
- // [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
- // [77] TextDecl::= '<?xml' VersionInfo? EncodingDecl S? '?>'
- void XMLScanner::scanXMLDecl(const DeclTypes type)
- {
- // Get pooled buffers: one per value, a dummy for unknown names, and one for the name being scanned
- XMLBufBid bbVersion(&fBufMgr);
- XMLBufBid bbEncoding(&fBufMgr);
- XMLBufBid bbStand(&fBufMgr);
- XMLBufBid bbDummy(&fBufMgr);
- XMLBufBid bbName(&fBufMgr);
- // We use this little enum and array to keep track of what we found
- // and what order we found them in. This lets us accept them free form
- // without too much overhead, but still detect that they were in the
- // wrong order.
- enum Strings
- {
- VersionString
- , EncodingString
- , StandaloneString
- , UnknownString
- , StringCount
- };
- int flags[StringCount] = { -1, -1, -1, -1 }; // -1 = not seen; otherwise the 1-based position of first appearance
- // Also set up a list of buffers, indexed by the Strings enum, so that we
- // know where to put each value.
- XMLBuffer* buffers[StringCount] ;
- buffers[0] = &bbVersion.getBuffer();
- buffers[1] = &bbEncoding.getBuffer();
- buffers[2] = &bbStand.getBuffer();
- buffers[3] = &bbDummy.getBuffer();
- int curCount = 0; // number of distinct known strings seen so far
- Strings curString; // assigned on every pass through the loop before it is read
- XMLBuffer& nameBuf = bbName.getBuffer();
- while (true)
- {
- // Skip any spaces, remembering whether there were any
- const unsigned int spaceCount = fReaderMgr.skipPastSpaces();
- // If we are looking at a question mark, the strings are done, so break out
- if (fReaderMgr.lookingAtChar(chQuestion))
- break;
- // If this is not the first string, then whitespace before it is required
- if (!spaceCount && curCount)
- emitError(XMLErrs::ExpectedWhitespace);
- // Get characters up to the next whitespace or equals sign.
- if (!scanUpToWSOr(nameBuf, chEqual))
- emitError(XMLErrs::ExpectedDeclString);
- // See if it matches any of our expected strings
- if (XMLString::equals(nameBuf.getRawBuffer(), XMLUni::fgVersionString))
- curString = VersionString;
- else if (XMLString::equals(nameBuf.getRawBuffer(), XMLUni::fgEncodingString))
- curString = EncodingString;
- else if (XMLString::equals(nameBuf.getRawBuffer(), XMLUni::fgStandaloneString))
- curString = StandaloneString;
- else
- curString = UnknownString;
- // If it is an unknown string, then give that error. Else check to
- // see if this one has been seen already and give that error instead.
- if (curString == UnknownString)
- emitError(XMLErrs::ExpectedDeclString, nameBuf.getRawBuffer());
- else if (flags[curString] != -1)
- emitError(XMLErrs::DeclStringRep, nameBuf.getRawBuffer());
- else if (flags[curString] == -1) // always true here; a first sighting records its position
- flags[curString] = ++curCount;
- // Scan for an equals sign. If we don't find it, issue an error
- // but keep trying to go on.
- if (!scanEq())
- emitError(XMLErrs::ExpectedEqSign);
- // Get a quoted string into the buffer for the string that we are
- // currently working on (the dummy buffer for an unknown string).
- if (!getQuotedString(*buffers[curString]))
- {
- emitError(XMLErrs::ExpectedQuotedString);
- fReaderMgr.skipPastChar(chCloseAngle);
- return; // give up on the decl: no handler callback and no encoding update happen
- }
- // And validate the value according to which one it was
- const XMLCh* rawValue = buffers[curString]->getRawBuffer();
- if (curString == VersionString)
- {
- if (XMLString::equals(rawValue, XMLUni::fgVersion1_1)) {
- if (type == Decl_XML) {
- fXMLVersion = XMLReader::XMLV1_1;
- fReaderMgr.setXMLVersion(XMLReader::XMLV1_1);
- }
- else { // not an XML decl: version 1.1 is only accepted when fXMLVersion is already 1.1
- if (fXMLVersion != XMLReader::XMLV1_1)
- emitError(XMLErrs::UnsupportedXMLVersion, rawValue);
- }
- }
- else if (XMLString::equals(rawValue, XMLUni::fgVersion1_0)) {
- if (type == Decl_XML) {
- fXMLVersion = XMLReader::XMLV1_0;
- fReaderMgr.setXMLVersion(XMLReader::XMLV1_0);
- }
- }
- else
- emitError(XMLErrs::UnsupportedXMLVersion, rawValue);
- }
- else if (curString == EncodingString)
- {
- if (!XMLString::isValidEncName(rawValue))
- emitError(XMLErrs::BadXMLEncoding, rawValue);
- }
- else if (curString == StandaloneString)
- {
- if (XMLString::equals(rawValue, XMLUni::fgYesString))
- fStandalone = true;
- else if (XMLString::equals(rawValue, XMLUni::fgNoString))
- fStandalone = false;
- else
- {
- emitError(XMLErrs::BadStandalone);
- if (!XMLString::compareIString(rawValue, XMLUni::fgYesString)) // recovery: still honor a case-insensitive match
- fStandalone = true;
- else if (!XMLString::compareIString(rawValue, XMLUni::fgNoString))
- fStandalone = false;
- }
- }
- }
- // Make sure that the strings present are in order. We don't care about
- // which ones are present at this point, just that any that are there are
- // in the right order.
- int curTop = 0;
- for (int index = VersionString; index < StandaloneString; index++) // StandaloneString itself is not examined by this loop
- {
- if (flags[index] != -1)
- {
- if (flags[index] != curTop + 1)
- {
- emitError(XMLErrs::DeclStringsInWrongOrder);
- break;
- }
- curTop = flags[index];
- }
- }
- // If it is an XML decl, the version must be present.
- // If it is a Text decl, then encoding must be present AND standalone must not be present.
- if ((type == Decl_XML) && (flags[VersionString] == -1))
- emitError(XMLErrs::XMLVersionRequired);
- else if (type == Decl_Text) {
- if (flags[StandaloneString] != -1)
- emitError(XMLErrs::StandaloneNotLegal);
- if (flags[EncodingString] == -1)
- emitError(XMLErrs::EncodingRequired);
- }
- if (!fReaderMgr.skippedChar(chQuestion)) // expect the closing '?>'; on failure skip past the next '>'
- {
- emitError(XMLErrs::UnterminatedXMLDecl);
- fReaderMgr.skipPastChar(chCloseAngle);
- }
- else if (!fReaderMgr.skippedChar(chCloseAngle))
- {
- emitError(XMLErrs::UnterminatedXMLDecl);
- fReaderMgr.skipPastChar(chCloseAngle);
- }
- // Do this before we possibly update the reader with the
- // declared encoding string. Otherwise, we would pass the wrong thing
- // for the last parameter of the XMLDecl callback!
- const XMLCh* actualEnc = fReaderMgr.getCurrentEncodingStr();
- // Ok, we've now seen the real encoding string, if there was one, so
- // let's call back on the current reader and tell it what the real
- // encoding string was. If it fails, that's because it represents some
- // sort of contradiction with the autosensed format, and it keeps the
- // original encoding.
- //
- // NOTE: This can fail for a number of reasons, such as a bogus encoding
- // name or because it is in flagrant contradiction of the auto-sensed
- // format.
- if (flags[EncodingString] != -1)
- {
- if (!fReaderMgr.getCurrentReader()->setEncoding(bbEncoding.getRawBuffer()))
- emitError(XMLErrs::ContradictoryEncoding, bbEncoding.getRawBuffer());
- else
- actualEnc = bbEncoding.getRawBuffer();
- }
- // Call the XMLDecl or TextDecl callback, according to the decl type, if the matching handler is set.
- if (type == Decl_XML)
- {
- if (fDocHandler)
- fDocHandler->XMLDecl
- (
- bbVersion.getRawBuffer()
- , bbEncoding.getRawBuffer()
- , bbStand.getRawBuffer()
- , actualEnc
- );
- }
- else if (type == Decl_Text)
- {
- if (fDocTypeHandler)
- fDocTypeHandler->TextDecl
- (
- bbVersion.getRawBuffer()
- , bbEncoding.getRawBuffer()
- );
- }
- }
- // Returns the URI text stored in the URI string pool under uriId. An id
- // that the pool does not know, or one with no stored text, yields the
- // zero-length string rather than a null pointer.
- const XMLCh* XMLScanner::getURIText(const unsigned int uriId) const
- {
-     if (!fURIStringPool->exists(uriId))
-         return XMLUni::fgZeroLenString;
-     // The id is known, so fetch the text it maps to
-     const XMLCh* const uriText = fURIStringPool->getValueForId(uriId);
-     return uriText ? uriText : XMLUni::fgZeroLenString;
- }
- // Copies the URI text stored in the URI string pool under uriId into
- // uriBufToFill. Returns true if the text was found and copied. Returns
- // false, leaving the buffer untouched, if the pool does not know the id
- // or has no text for it.
- bool XMLScanner::getURIText( const unsigned int uriId
-                            , XMLBuffer& uriBufToFill) const
- {
-     if (!fURIStringPool->exists(uriId))
-         return false;
-     // The id is known, so fetch the text it maps to
-     const XMLCh* const uriText = fURIStringPool->getValueForId(uriId);
-     if (uriText)
-     {
-         uriBufToFill.set(uriText);
-         return true;
-     }
-     return false;
- }
- // Checks whether the input continues with the start of an XML/Text decl
- // followed by one whitespace character, and consumes it if so.
- //
- // [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
- // [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
- //
- // [3] S ::= (#x20 | #x9 | #xD | #xA)+
- //
- // startWithAngle selects which family of XMLUni strings is tried: the
- // fgXMLDeclString* set when true, the fgXMLString* set when false.
- // Returns true if one of them was skipped. An upper case variant is
- // accepted as well, but an XMLDeclMustBeLowerCase error is emitted for it.
- bool XMLScanner::checkXMLDecl(bool startWithAngle) {
-     if (startWithAngle) {
-         // Cheap look-ahead first; nothing is consumed unless it matches
-         if (!fReaderMgr.peekString(XMLUni::fgXMLDeclString))
-             return false;
-         const bool lowerMatched =
-                fReaderMgr.skippedString(XMLUni::fgXMLDeclStringSpace)
-             || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringHTab)
-             || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringLF)
-             || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringCR);
-         if (lowerMatched)
-             return true;
-         // Just in case, check for upper case. If found, issue an error
-         // but keep going.
-         const bool upperMatched =
-                fReaderMgr.skippedString(XMLUni::fgXMLDeclStringSpaceU)
-             || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringHTabU)
-             || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringLFU)
-             || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringCRU);
-         if (upperMatched) {
-             emitError(XMLErrs::XMLDeclMustBeLowerCase);
-             return true;
-         }
-         return false;
-     }
-     // Same procedure using the fgXMLString* family
-     if (!fReaderMgr.peekString(XMLUni::fgXMLString))
-         return false;
-     const bool lowerMatched =
-            fReaderMgr.skippedString(XMLUni::fgXMLStringSpace)
-         || fReaderMgr.skippedString(XMLUni::fgXMLStringHTab)
-         || fReaderMgr.skippedString(XMLUni::fgXMLStringLF)
-         || fReaderMgr.skippedString(XMLUni::fgXMLStringCR);
-     if (lowerMatched)
-         return true;
-     // Just in case, check for upper case. If found, issue an error but
-     // keep going.
-     const bool upperMatched =
-            fReaderMgr.skippedString(XMLUni::fgXMLStringSpaceU)
-         || fReaderMgr.skippedString(XMLUni::fgXMLStringHTabU)
-         || fReaderMgr.skippedString(XMLUni::fgXMLStringLFU)
-         || fReaderMgr.skippedString(XMLUni::fgXMLStringCRU);
-     if (upperMatched) {
-         emitError(XMLErrs::XMLDeclMustBeLowerCase);
-         return true;
-     }
-     return false;
- }
- // ---------------------------------------------------------------------------
- // XMLScanner: Grammar preparsing
- // ---------------------------------------------------------------------------
- // Builds an input source for the given system id and hands it to the
- // InputSource overload of loadGrammar().
- //
- // An installed entity handler is asked first. If it supplies nothing, the
- // system id is tried as a URL, falling back to a local file unless
- // fStandardUriConformant forbids that. Returns whatever the InputSource
- // overload returns, or 0 if an error was emitted while creating the source.
- Grammar* XMLScanner::loadGrammar(const XMLCh* const systemId
-                                , const short grammarType
-                                , const bool toCache)
- {
-     InputSource* grammarSrc = 0;
-     if (fEntityHandler)
-         grammarSrc = fEntityHandler->resolveEntity(XMLUni::fgZeroLenString, systemId);
-     // No source from the handler: first try to parse the id as a URL. If
-     // that fails, assume it is a file and try it that way.
-     if (!grammarSrc) {
-         try
-         {
-             // Create a temporary URL. Since this is the primary document,
-             // it has to be fully qualified. If not, then assume we are just
-             // mistaking a file for a URL.
-             XMLURL parsedURL(systemId, fMemoryManager);
-             if (!parsedURL.isRelative())
-             {
-                 if (fStandardUriConformant && parsedURL.hasInvalidChar()) {
-                     MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL);
-                     fInException = true;
-                     emitError
-                     (
-                         XMLErrs::XMLException_Fatal
-                         , e.getType()
-                         , e.getMessage()
-                     );
-                     return 0;
-                 }
-                 grammarSrc = new (fMemoryManager) URLInputSource(parsedURL, fMemoryManager);
-             }
-             else if (fStandardUriConformant)
-             {
-                 // Since this is the top of the try/catch, ThrowXML cannot
-                 // be used; emit the error directly.
-                 MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_NoProtocolPresent);
-                 fInException = true;
-                 emitError
-                 (
-                     XMLErrs::XMLException_Fatal
-                     , e.getType()
-                     , e.getMessage()
-                 );
-                 return 0;
-             }
-             else
-             {
-                 grammarSrc = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
-             }
-         }
-         catch(const MalformedURLException& urlExc)
-         {
-             if (fStandardUriConformant) {
-                 // Since this is the top of the try/catch, ThrowXML cannot
-                 // be used; emit the error directly. Every
-                 // MalformedURLException is treated as fatal, so its error
-                 // type is not checked.
-                 fInException = true;
-                 emitError
-                 (
-                     XMLErrs::XMLException_Fatal
-                     , urlExc.getType()
-                     , urlExc.getMessage()
-                 );
-                 return 0;
-             }
-             grammarSrc = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
-         }
-         catch(const XMLException& xmlExc)
-         {
-             // For any other XMLException, emit an error whose code matches
-             // the severity carried by the exception.
-             fInException = true;
-             if (xmlExc.getErrorType() == XMLErrorReporter::ErrType_Warning)
-             {
-                 emitError
-                 (
-                     XMLErrs::XMLException_Warning
-                     , xmlExc.getType()
-                     , xmlExc.getMessage()
-                 );
-             }
-             else if (xmlExc.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
-             {
-                 emitError
-                 (
-                     XMLErrs::XMLException_Fatal
-                     , xmlExc.getType()
-                     , xmlExc.getMessage()
-                 );
-             }
-             else
-             {
-                 emitError
-                 (
-                     XMLErrs::XMLException_Error
-                     , xmlExc.getType()
-                     , xmlExc.getMessage()
-                 );
-             }
-             return 0;
-         }
-         catch(...)
-         {
-             // Not our problem, so just pass it along
-             throw;
-         }
-     }
-     // The janitor takes charge of the source for the rest of this scope
-     Janitor<InputSource> srcGuard(grammarSrc);
-     return loadGrammar(*grammarSrc, grammarType, toCache);
- }
- // Narrow-character convenience overload: transcodes the system id and
- // forwards to the XMLCh overload.
- Grammar* XMLScanner::loadGrammar(const char* const systemId
-                                , const short grammarType
-                                , const bool toCache)
- {
-     XMLCh* const wideSysId = XMLString::transcode(systemId, fMemoryManager);
-     // The janitor takes charge of the transcoded copy for this scope
-     ArrayJanitor<XMLCh> wideSysIdGuard(wideSysId, fMemoryManager);
-     Grammar* const loaded = loadGrammar(wideSysId, grammarType, toCache);
-     return loaded;
- }
- // ---------------------------------------------------------------------------
- // XMLScanner: Setter methods
- // ---------------------------------------------------------------------------
- // Installs the URI string pool and caches the pool ids of the well-known
- // namespace URIs (empty, unknown, xml and xmlns).
- void XMLScanner::setURIStringPool(XMLStringPool* const stringPool)
- {
-     fURIStringPool = stringPool;
-     // Keep this call order; entries not yet in the pool are added in
-     // exactly this sequence.
-     fEmptyNamespaceId   = stringPool->addOrFind(XMLUni::fgZeroLenString);
-     fUnknownNamespaceId = stringPool->addOrFind(XMLUni::fgUnknownURIName);
-     fXMLNamespaceId     = stringPool->addOrFind(XMLUni::fgXMLURIName);
-     fXMLNSNamespaceId   = stringPool->addOrFind(XMLUni::fgXMLNSURIName);
- }
- // ---------------------------------------------------------------------------
- // XMLScanner: Private helper methods
- // ---------------------------------------------------------------------------
- // Called after the content scan to ensure that the ID/IDREF attributes
- // match up, i.e. that every IDREF refers to a declared ID. This is an
- // XML 1.0 rule, so it can be checked here in the core.
- void XMLScanner::checkIDRefs()
- {
-     // Walk the id ref list looking for entries that were referenced but
-     // never declared.
-     RefHashTableOfEnumerator<XMLRefInfo> idRefIter(fIDRefList);
-     for (; idRefIter.hasMoreElements(); )
-     {
-         const XMLRefInfo& refInfo = idRefIter.nextElement();
-         // Declared entries and never-used entries are fine
-         if (refInfo.getDeclared() || !refInfo.getUsed())
-             continue;
-         // The problem is only reported when validation is enabled
-         if (fValidate)
-             fValidator->emitError(XMLValid::IDNotDeclared, refInfo.getRefName());
-     }
- }
- // Reports whether the passed progressive scan token belongs to this
- // scanner: both its scanner id and its sequence id must match ours.
- bool XMLScanner::isLegalToken(const XMLPScanToken& toCheck)
- {
-     if (fScannerId != toCheck.fScannerId)
-         return false;
-     return fSequenceId == toCheck.fSequenceId;
- }
- // Works out what the next top level token in the input stream is and
- // returns the matching XMLTokens value, consuming as many characters as
- // are needed to make that decision.
- //
- // orgReader is only written when a '<' was seen; it then receives the
- // number of the current reader as reported right after that '<' was read.
- XMLScanner::XMLTokens XMLScanner::senseNextToken(unsigned int& orgReader)
- {
-     // Markup introducers that can follow '<'
-     static const XMLCh gCDATAStr[] =
-     {
-         chBang, chOpenSquare, chLatin_C, chLatin_D, chLatin_A
-         , chLatin_T, chLatin_A, chNull
-     };
-     static const XMLCh gCommentString[] =
-     {
-         chBang, chDash, chDash, chNull
-     };
-     // Peek at the next character to guess what the next token will be.
-     // End of entity exceptions are turned on just for this peek, to catch
-     // the case where the current entity ended at the '>' of some markup.
-     XMLCh peeked;
-     {
-         ThrowEOEJanitor janMgr(&fReaderMgr, true);
-         peeked = fReaderMgr.peekNextChar();
-     }
-     // End of file is legal here at top level
-     if (!peeked)
-         return Token_EOF;
-     // Anything other than '<' means we are in content. That includes
-     // entity references ('&'), which must be character data because that
-     // is the only place a reference can occur in content.
-     if (peeked != chOpenAngle)
-         return Token_CharData;
-     // It was a '<'. Take it out of the reader and pass the number of the
-     // reader where we saw it back to the caller.
-     fReaderMgr.getNextChar();
-     orgReader = fReaderMgr.getCurrentReaderNum();
-     // The character after the '<' tells us which kind of markup this is
-     switch (fReaderMgr.peekNextChar())
-     {
-     case chForwardSlash:
-         fReaderMgr.getNextChar();
-         return Token_EndTag;
-     case chBang:
-         if (fReaderMgr.skippedString(gCDATAStr))
-             return Token_CData;
-         if (fReaderMgr.skippedString(gCommentString))
-             return Token_Comment;
-         emitError(XMLErrs::ExpectedCommentOrCDATA);
-         return Token_Unknown;
-     case chQuestion:
-         // It must be a PI
-         fReaderMgr.getNextChar();
-         return Token_PI;
-     default:
-         break;
-     }
-     // Assume it is an element name and report a start tag. If that turns
-     // out to be wrong, the start tag scan will fail to get a valid tag.
-     return Token_StartTag;
- }
- // ---------------------------------------------------------------------------
- // XMLScanner: Private parsing methods
- // ---------------------------------------------------------------------------
- // Scans out a single or double quoted string of characters into toFill.
- // No judgement is passed on the contents, other than that the quote
- // character which opened the string also ends it.
- //
- // Returns false if the input does not start with a quote or if a null
- // (end of file) character is hit before the closing quote.
- //
- // NOTE: This is for simple things like the strings in the XMLDecl, which
- // cannot contain entities, so no end of entity handling is done here.
- bool XMLScanner::getQuotedString(XMLBuffer& toFill)
- {
-     // Start from an empty target buffer
-     toFill.reset();
-     // The next char must be a single or double quote
-     XMLCh openQuote;
-     if (!fReaderMgr.skipIfQuote(openQuote))
-         return false;
-     // Gather characters until the same quote character shows up again
-     for (XMLCh ch = fReaderMgr.getNextChar(); ch != openQuote; ch = fReaderMgr.getNextChar())
-     {
-         // An end of file null char should never occur here. If it does,
-         // just fail; the calling code deals with it more gracefully.
-         if (!ch)
-             return false;
-         toFill.append(ch);
-     }
-     return true;
- }
- // Scans a numeric character reference and returns the character that it
- // refers to. The '&#' prefix must already have been scanned; this picks
- // up at the optional radix marker and consumes through the closing ';'.
- //
- // toFill  - receives the character or, for a value above 0xFFFF, the
- //           leading surrogate of the pair that represents it.
- // second  - receives the trailing surrogate of such a pair, otherwise 0.
- //
- // Returns true if a valid character was produced. Returns false, after
- // emitting an error, if the reference is malformed or names a character
- // outside the valid range; the output parameters should not be relied on
- // in that case. Throws UnexpectedEOFException if the input ends inside
- // the reference.
- bool XMLScanner::scanCharRef(XMLCh& toFill, XMLCh& second)
- {
-     // Largest legal code point. The running value is never allowed to
-     // grow beyond one past this, so it cannot wrap around.
-     const unsigned int maxCodePoint = 0x10FFFF;
-     bool gotOne = false;
-     unsigned int value = 0;
-     // Set the radix. It is supposed to be a lower case x if hex. But, in
-     // order to recover well, we check for an upper case one and put out
-     // an error for that.
-     unsigned int radix = 10;
-     if (fReaderMgr.skippedChar(chLatin_x))
-     {
-         radix = 16;
-     }
-     else if (fReaderMgr.skippedChar(chLatin_X))
-     {
-         emitError(XMLErrs::HexRadixMustBeLowerCase);
-         radix = 16;
-     }
-     while (true)
-     {
-         const XMLCh nextCh = fReaderMgr.peekNextChar();
-         // Watch for EOF
-         if (!nextCh)
-             ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
-         // Break out on the terminating semicolon
-         if (nextCh == chSemiColon)
-         {
-             fReaderMgr.getNextChar();
-             break;
-         }
-         // Convert this char to a binary value, or bail out if it is not
-         // a digit of any kind.
-         unsigned int nextVal;
-         if ((nextCh >= chDigit_0) && (nextCh <= chDigit_9))
-             nextVal = (unsigned int)(nextCh - chDigit_0);
-         else if ((nextCh >= chLatin_A) && (nextCh <= chLatin_F))
-             nextVal = (unsigned int)(10 + (nextCh - chLatin_A));
-         else if ((nextCh >= chLatin_a) && (nextCh <= chLatin_f))
-             nextVal = (unsigned int)(10 + (nextCh - chLatin_a));
-         else
-         {
-             // Hand back zeros so the caller never sees stale values
-             toFill = 0;
-             second = 0;
-             // If we got at least one digit, then report an unterminated
-             // ref. Else, report that a numerical ref was expected.
-             if (gotOne)
-                 emitError(XMLErrs::UnterminatedCharRef);
-             else
-                 emitError(XMLErrs::ExpectedNumericalCharRef);
-             // Return failure. The offending char is left in the reader.
-             return false;
-         }
-         // Make sure it is valid for the radix. If not, then just eat the
-         // digit and go on after issuing an error. Else, update the
-         // running value with this new digit.
-         if (nextVal >= radix)
-         {
-             XMLCh tmpStr[2];
-             tmpStr[0] = nextCh;
-             tmpStr[1] = chNull;
-             emitError(XMLErrs::BadDigitForRadix, tmpStr);
-         }
-         else
-         {
-             value = (value * radix) + nextVal;
-             // Saturate once the value is out of range. Without this, a
-             // long run of digits wraps the unsigned accumulator and an
-             // illegal reference can come out as a legal character. We
-             // keep scanning so the reference is still consumed through
-             // its ';' and rejected by the range check below.
-             if (value > maxCodePoint)
-                 value = maxCodePoint + 1;
-         }
-         // Indicate that we got at least one digit
-         gotOne = true;
-         // And eat the char we peeked at
-         fReaderMgr.getNextChar();
-     }
-     // Return the char (or chars), checking that the expanded character
-     // is valid.
-     if (value >= 0x10000 && value <= maxCodePoint)
-     {
-         // Outside the BMP: split into a surrogate pair
-         value -= 0x10000;
-         toFill = XMLCh((value >> 10) + 0xD800);
-         second = XMLCh((value & 0x3FF) + 0xDC00);
-     }
-     else if (value <= 0xFFFD)
-     {
-         toFill = XMLCh(value);
-         second = 0;
-         if (!fReaderMgr.getCurrentReader()->isXMLChar(toFill) && !fReaderMgr.getCurrentReader()->isControlChar(toFill)) {
-             // Character reference was not in the valid range
-             emitError(XMLErrs::InvalidCharacterRef);
-             return false;
-         }
-     }
-     else {
-         // 0xFFFE, 0xFFFF, or beyond the last code point (including a
-         // saturated value): not in the valid range
-         emitError(XMLErrs::InvalidCharacterRef);
-         return false;
-     }
-     return true;
- }
- // Entered after the '<!--' part of a comment has been scanned. Scans
- // through the terminating '-->' and, if a document handler is set, passes
- // the comment text to its docComment callback.
- //
- // A '--' inside the comment that is not followed by '>' is an error; in
- // that case input is skipped past the next '>' and no callback is made.
- // Throws UnexpectedEOFException if the input ends inside the comment.
- void XMLScanner::scanComment()
- {
-     enum States
-     {
-         InText
-         , OneDash
-         , TwoDashes
-     };
-     // Buffer that accumulates the comment text
-     XMLBufBid bbComment(&fBufMgr);
-     States state = InText;
-     bool pendingLead = false;
-     bool closed = false;
-     while (!closed)
-     {
-         const XMLCh ch = fReaderMgr.getNextChar();
-         // Watch for an end of file
-         if (!ch)
-         {
-             emitError(XMLErrs::UnterminatedComment);
-             ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
-         }
-         // Check for correct surrogate pairs
-         const bool isLead = (ch >= 0xD800) && (ch <= 0xDBFF);
-         if (isLead)
-         {
-             // Two leading surrogates in a row are not allowed
-             if (pendingLead)
-                 emitError(XMLErrs::Expected2ndSurrogateChar);
-             pendingLead = true;
-         }
-         else
-         {
-             if (pendingLead)
-             {
-                 const bool isTrail = (ch >= 0xDC00) && (ch <= 0xDFFF);
-                 if (!isTrail)
-                     emitError(XMLErrs::Expected2ndSurrogateChar);
-             }
-             // It has to at least be a valid XML character
-             else if (!fReaderMgr.getCurrentReader()->isXMLChar(ch)) {
-                 XMLCh tmpBuf[9];
-                 XMLString::binToText
-                 (
-                     ch
-                     , tmpBuf
-                     , 8
-                     , 16
-                 );
-                 emitError(XMLErrs::InvalidCharacter, tmpBuf);
-             }
-             pendingLead = false;
-         }
-         switch (state)
-         {
-         case InText:
-             // A dash may start the terminator; anything else is text
-             if (ch == chDash)
-                 state = OneDash;
-             else
-                 bbComment.append(ch);
-             break;
-         case OneDash:
-             // A second dash moves us on. Otherwise the dash we held back
-             // and the new character are both text.
-             if (ch == chDash)
-             {
-                 state = TwoDashes;
-             }
-             else
-             {
-                 bbComment.append(chDash);
-                 bbComment.append(ch);
-                 state = InText;
-             }
-             break;
-         case TwoDashes:
-             // The next character must be the closing bracket
-             if (ch != chCloseAngle)
-             {
-                 emitError(XMLErrs::IllegalSequenceInComment);
-                 fReaderMgr.skipPastChar(chCloseAngle);
-                 return;
-             }
-             closed = true;
-             break;
-         }
-     }
-     // If we have an available handler, call back with the comment
-     if (fDocHandler)
-     {
-         fDocHandler->docComment
-         (
-             bbComment.getRawBuffer()
-         );
-     }
- }
- // Scans an equals sign together with any whitespace around it, which
- // keeps the calling code cleaner. Returns true if the '=' was found. On
- // false, any leading whitespace has still been skipped.
- bool XMLScanner::scanEq()
- {
-     fReaderMgr.skipPastSpaces();
-     if (!fReaderMgr.skippedChar(chEqual))
-         return false;
-     // Eat whatever whitespace follows the '='
-     fReaderMgr.skipPastSpaces();
-     return true;
- }
- // Has the reader manager fill toFill via getUpToCharOrWS(), passing
- // chEndChar as the end character, and returns the resulting length of
- // toFill.
- unsigned int
- XMLScanner::scanUpToWSOr(XMLBuffer& toFill, const XMLCh chEndChar)
- {
-     fReaderMgr.getUpToCharOrWS(toFill, chEndChar);
-     const unsigned int filledLen = toFill.getLen();
-     return filledLen;
- }
- XERCES_CPP_NAMESPACE_END