DTDScanner.cpp
上传用户:zhuqijet
上传日期:2013-06-25
资源大小:10074k
文件大小:134k
- entityDecl->setIsParameter(isPEDecl);
- //
- // Space is legal (required actually) here so check for a PE ref. If
- // we don't get our whitespace, then issue an error, but try to keep
- // going.
- //
- if (!checkForPERef(true, false, true))
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- // save the hasNoDTD status for Entity Constraint Checking
- bool hasNoDTD = fScanner->getHasNoDTD();
- if (hasNoDTD && isPEDecl)
- fScanner->setHasNoDTD(false);
- // According to the type call the value scanning method
- if (!scanEntityDef(*entityDecl, isPEDecl))
- {
- fReaderMgr->skipPastChar(chCloseAngle);
- fScanner->setHasNoDTD(true);
- fScanner->emitError(XMLErrs::ExpectedEntityValue);
- return;
- }
- if (hasNoDTD)
- fScanner->setHasNoDTD(true);
- // Space is legal (but not required) here so check for a PE ref
- checkForPERef(false, false, true);
- // And then we have to have the closing angle bracket
- if (!fReaderMgr->skippedChar(chCloseAngle))
- {
- fScanner->emitError(XMLErrs::UnterminatedEntityDecl, entityDecl->getName());
- fReaderMgr->skipPastChar(chCloseAngle);
- }
- //
- // If we have a doc type handler, then call it. But only call it for
- // ignored elements if advanced callbacks are enabled.
- //
- if (fDocTypeHandler)
- fDocTypeHandler->entityDecl(*entityDecl, isPEDecl, isIgnored);
- }
- //
- // This method will scan a general/character entity ref. It will either
- // expand a char ref and return the value directly, or it will expand
- // a general entity and a reader for it onto the reader stack.
- //
- // The return value indicates whether the value was returned directly or
- // pushed as a reader or it failed.
- //
- // The escaped flag tells the caller whether the returnd parameter resulted
- // from a character reference, which escapes the character in some cases. It
- // only makes any difference if the return indicates the value was returned
- // directly.
- //
- // NOTE: This is only called when scanning attribute values, so we always
- // expand general entities.
- //
- DTDScanner::EntityExpRes
- DTDScanner::scanEntityRef(XMLCh& firstCh, XMLCh& secondCh, bool& escaped)
- {
- // Assume no escape and no second char
- escaped = false;
- secondCh = 0;
- // We have to insure its all done in a single entity
- const unsigned int curReader = fReaderMgr->getCurrentReaderNum();
- //
- // If the next char is a pound, then its a character reference and we
- // need to expand it always.
- //
- if (fReaderMgr->skippedChar(chPound))
- {
- //
- // Its a character reference, so scan it and get back the numeric
- // value it represents. If it fails, just return immediately.
- //
- if (!scanCharRef(firstCh, secondCh))
- return EntityExp_Failed;
- if (curReader != fReaderMgr->getCurrentReaderNum())
- fScanner->emitError(XMLErrs::PartialMarkupInEntity);
- // Its now escaped since it was a char ref
- escaped = true;
- return EntityExp_Returned;
- }
- // Get the name of the general entity
- XMLBufBid bbName(fBufMgr);
- if (!fReaderMgr->getName(bbName.getBuffer()))
- {
- fScanner->emitError(XMLErrs::ExpectedEntityRefName);
- return EntityExp_Failed;
- }
- //
- // Next char must be a semi-colon. But if its not, just emit
- // an error and try to continue.
- //
- if (!fReaderMgr->skippedChar(chSemiColon))
- fScanner->emitError(XMLErrs::UnterminatedEntityRef, bbName.getRawBuffer());
- // Make sure it was all in one entity reader
- if (curReader != fReaderMgr->getCurrentReaderNum())
- fScanner->emitError(XMLErrs::PartialMarkupInEntity);
- // Look it up the name the general entity pool
- XMLEntityDecl* decl = fDTDGrammar->getEntityDecl(bbName.getRawBuffer());
- // If it does not exist, then obviously an error
- if (!decl)
- {
- // XML 1.0 Section 4.1
- if (fScanner->getStandalone() || fScanner->getHasNoDTD()) {
- fScanner->emitError(XMLErrs::EntityNotFound, bbName.getRawBuffer());
- }
- else {
- if (fScanner->getDoValidation())
- fScanner->getValidator()->emitError(XMLValid::VC_EntityNotFound, bbName.getRawBuffer());
- }
- return EntityExp_Failed;
- }
- //
- // XML 1.0 Section 4.1
- // If we are a standalone document, then it has to have been declared
- // in the internal subset.
- //
- if (fScanner->getStandalone() && !decl->getDeclaredInIntSubset())
- fScanner->emitError(XMLErrs::IllegalRefInStandalone, bbName.getRawBuffer());
- //
- // If its a special char reference, then its escaped and we can return
- // it directly.
- //
- if (decl->getIsSpecialChar())
- {
- firstCh = decl->getValue()[0];
- escaped = true;
- return EntityExp_Returned;
- }
- if (decl->isExternal())
- {
- // If its unparsed, then its not valid here
- // XML 1.0 Section 4.4.4 the appearance of a reference to an unparsed entity is forbidden.
- if (decl->isUnparsed())
- {
- fScanner->emitError(XMLErrs::NoUnparsedEntityRefs, bbName.getRawBuffer());
- return EntityExp_Failed;
- }
- // We are in an attribute value, so not valid.
- // XML 1.0 Section 4.4.4 a reference to an external entity in an attribute value is forbidden.
- fScanner->emitError(XMLErrs::NoExtRefsInAttValue);
- // And now create a reader to read this entity
- InputSource* srcUsed;
- XMLReader* reader = fReaderMgr->createReader
- (
- decl->getBaseURI()
- , decl->getSystemId()
- , decl->getPublicId()
- , false
- , XMLReader::RefFrom_NonLiteral
- , XMLReader::Type_General
- , XMLReader::Source_External
- , srcUsed
- );
- // Put a janitor on the source so it gets cleaned up on exit
- Janitor<InputSource> janSrc(srcUsed);
- //
- // If the creation failed then throw an exception
- //
- if (!reader)
- ThrowXML1(RuntimeException, XMLExcepts::Gen_CouldNotOpenExtEntity, srcUsed->getSystemId());
- //
- // Push the reader. If its a recursive expansion, then emit an error
- // and return an failure.
- //
- if (!fReaderMgr->pushReader(reader, decl))
- {
- fScanner->emitError(XMLErrs::RecursiveEntity, decl->getName());
- return EntityExp_Failed;
- }
- // If it starts with the XML string, then parse a text decl
- if (fScanner->checkXMLDecl(true))
- scanTextDecl();
- }
- else
- {
- //
- // Create a reader over a memory stream over the entity value
- // We force it to assume UTF-16 by passing in an encoding
- // string. This way it won't both trying to predecode the
- // first line, looking for an XML/TextDecl.
- //
- XMLReader* valueReader = fReaderMgr->createIntEntReader
- (
- decl->getName()
- , XMLReader::RefFrom_NonLiteral
- , XMLReader::Type_General
- , decl->getValue()
- , decl->getValueLen()
- , false
- );
- //
- // Trt to push the entity reader onto the reader manager stack,
- // where it will become the subsequent input. If it fails, that
- // means the entity is recursive, so issue an error. The reader
- // will have just been discarded, but we just keep going.
- //
- if (!fReaderMgr->pushReader(valueReader, decl))
- fScanner->emitError(XMLErrs::RecursiveEntity, decl->getName());
- }
- return EntityExp_Pushed;
- }
- //
- // This method will scan a quoted literal of an entity value. It has to
- // deal with replacement of PE references; however, since this is a DTD
- // scanner, all such entity literals are in entity decls and therefore
- // general entities are not expanded.
- //
- bool DTDScanner::scanEntityLiteral(XMLBuffer& toFill, const bool isPE)
- {
- toFill.reset();
- // Get the next char which must be a single or double quote
- XMLCh quoteCh;
- if (!fReaderMgr->skipIfQuote(quoteCh))
- return false;
- // Get a buffer for pulling in entity names when we see GE refs
- XMLBufBid bbName(fBufMgr);
- XMLBuffer& nameBuf = bbName.getBuffer();
- // Remember the current reader
- const unsigned int orgReader = fReaderMgr->getCurrentReaderNum();
- //
- // Loop until we see the ending quote character, handling any references
- // in the process.
- //
- XMLCh nextCh;
- XMLCh secondCh = 0;
- bool gotLeadingSurrogate = false;
- while (true)
- {
- nextCh = fReaderMgr->getNextChar();
- //
- // Watch specifically for EOF and issue a more meaningful error
- // if that occurs (since an unterminated quoted char can cause
- // this easily.)
- //
- if (!nextCh)
- {
- fScanner->emitError(XMLErrs::UnterminatedEntityLiteral);
- ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
- }
- //
- // Break out on our terminating quote char when we are back in the
- // same reader. Otherwise, we might trigger on a nested quote char
- // in an expanded entity.
- //
- if ((nextCh == quoteCh)
- && (fReaderMgr->getCurrentReaderNum() == orgReader))
- {
- break;
- }
- if (nextCh == chPercent)
- {
- //
- // Put the PE's value on the reader stack and then jump back
- // to the top to start processing it. The parameter indicates
- // that it should not scan the reference's content as an external
- // subset.
- //
- expandPERef(false, true, true);
- continue;
- }
- //
- // Ok, now that all the other special stuff is checked, we can
- // look for a general entity. In here, we cannot have a naked &
- // and will only expand numerical char refs or the intrinsic char
- // refs. Others will be left alone.
- //
- if (nextCh == chAmpersand)
- {
- //
- // Here, we only expand numeric char refs, but not any general
- // entities. However, the stupid XML spec requires that we check
- // and make sure it does refer to a general entity if its not
- // a char ref (i.e. no naked '&' chars.)
- //
- if (fReaderMgr->skippedChar(chPound))
- {
- // If it failed, then just jump back to the top and try to pick up
- if (!scanCharRef(nextCh, secondCh))
- {
- gotLeadingSurrogate = false;
- continue;
- }
- }
- else
- {
- if (!fReaderMgr->getName(nameBuf))
- {
- fScanner->emitError(XMLErrs::ExpectedEntityRefName);
- }
- else
- {
- //
- // Since we are not expanding any of this, we have to
- // put the amp and name into the target buffer as data.
- //
- toFill.append(chAmpersand);
- toFill.append(nameBuf.getRawBuffer());
- // Make sure we skipped a trailing semicolon
- if (!fReaderMgr->skippedChar(chSemiColon))
- {
- fScanner->emitError
- (
- XMLErrs::UnterminatedEntityRef
- , nameBuf.getRawBuffer()
- );
- }
- // And make the new character the semicolon
- nextCh = chSemiColon;
- }
- // Either way here we reset the surrogate flag
- gotLeadingSurrogate = false;
- }
- }
- else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
- {
- if (gotLeadingSurrogate)
- fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
- else
- gotLeadingSurrogate = true;
- }
- else
- {
- if (gotLeadingSurrogate)
- {
- if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
- fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
- }
- else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh))
- {
- XMLCh tmpBuf[9];
- XMLString::binToText
- (
- nextCh
- , tmpBuf
- , 8
- , 16
- );
- fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
- fReaderMgr->skipPastChar(quoteCh);
- return false;
- }
- gotLeadingSurrogate = false;
- }
- // Looks ok, so add it to the literal
- toFill.append(nextCh);
- if (secondCh)
- toFill.append(secondCh);
- }
- //
- // If we got here and did not get back to the original reader level,
- // then we propogated some entity out of the literal, so issue an
- // error, but don't fail.
- //
- if (fReaderMgr->getCurrentReaderNum() != orgReader && fScanner->getDoValidation())
- fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
- return true;
- }
- //
- // This method is called after the entity name has been scanned, and any
- // PE referenced following the name is handled. The passed decl will be
- // filled in with the info scanned.
- //
- bool DTDScanner::scanEntityDef(DTDEntityDecl& decl, const bool isPEDecl)
- {
- // Its got to be an entity literal
- if (fReaderMgr->lookingAtChar(chSingleQuote)
- || fReaderMgr->lookingAtChar(chDoubleQuote))
- {
- // Get a buffer for the literal
- XMLBufBid bbValue(fBufMgr);
- if (!scanEntityLiteral(bbValue.getBuffer(), isPEDecl))
- return false;
- // Set it on the entity decl
- decl.setValue(bbValue.getRawBuffer());
- return true;
- }
- //
- // Its got to be an external entity, so there must be an external id.
- // Get buffers for them and scan an external id into them.
- //
- XMLBufBid bbPubId(fBufMgr);
- XMLBufBid bbSysId(fBufMgr);
- if (!scanId(bbPubId.getBuffer(), bbSysId.getBuffer(), IDType_External))
- return false;
- ReaderMgr::LastExtEntityInfo lastInfo;
- fReaderMgr->getLastExtEntityInfo(lastInfo);
- // Fill in the id fields of the decl with the info we got
- const XMLCh* publicId = bbPubId.getRawBuffer();
- const XMLCh* systemId = bbSysId.getRawBuffer();
- decl.setPublicId((publicId && *publicId) ? publicId : 0);
- decl.setSystemId((systemId && *systemId) ? systemId : 0);
- decl.setBaseURI((lastInfo.systemId && *lastInfo.systemId) ? lastInfo.systemId : 0);
- // If its a PE decl, we are done
- bool gotSpaces = checkForPERef(false, false, true);
- if (isPEDecl)
- {
- //
- // Check for a common error here. NDATA is not allowed for PEs
- // so check for the NDATA string. If found give a nice meaningful
- // error and continue parsing to eat the NDATA text.
- //
- if (gotSpaces)
- {
- if (fReaderMgr->skippedString(XMLUni::fgNDATAString))
- fScanner->emitError(XMLErrs::NDATANotValidForPE);
- }
- else
- {
- return true;
- }
- }
- // If looking at close angle now, we are done
- if (fReaderMgr->lookingAtChar(chCloseAngle))
- return true;
- // Else we had to have seem the whitespace
- if (!gotSpaces)
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- // We now have to see a notation data string
- if (!fReaderMgr->skippedString(XMLUni::fgNDATAString))
- fScanner->emitError(XMLErrs::ExpectedNDATA);
- // Space is required here, but try to go on if not
- if (!checkForPERef(false, false, true))
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- // Get a name
- XMLBufBid bbName(fBufMgr);
- if (!fReaderMgr->getName(bbName.getBuffer()))
- {
- fScanner->emitError(XMLErrs::ExpectedNotationName);
- return false;
- }
- // Set the decl's notation name
- decl.setNotationName(bbName.getRawBuffer());
- return true;
- }
- //
- // This method is called after an attribute decl name or a notation decl has
- // been scanned and then an opening parenthesis was see, indicating the list
- // of values. It scans the enumeration values and creates a single string
- // which has a single space between each value.
- //
- // The terminating close paren ends this scan.
- //
- bool DTDScanner::scanEnumeration( const DTDAttDef& attDef
- , XMLBuffer& toFill
- , const bool notation)
- {
- // Reset the passed buffer
- toFill.reset();
- // Check for PE ref but don't require space
- checkForPERef(false, false, true);
- // If this is a notation, we need an opening paren
- if (notation)
- {
- if (!fReaderMgr->skippedChar(chOpenParen))
- fScanner->emitError(XMLErrs::ExpectedOpenParen);
- }
- // We need a local buffer to use as well
- XMLBufBid bbTmp(fBufMgr);
- while (true)
- {
- // Space is allowed here for either type so check for PE ref
- checkForPERef(false, false, true);
- // And then get either a name or a name token
- bool success;
- if (notation)
- success = fReaderMgr->getName(bbTmp.getBuffer());
- else
- success = fReaderMgr->getNameToken(bbTmp.getBuffer());
- if (!success)
- {
- fScanner->emitError
- (
- XMLErrs::ExpectedEnumValue
- , attDef.getFullName()
- );
- return false;
- }
- // Append this value to the target value
- toFill.append(bbTmp.getRawBuffer(), bbTmp.getLen());
- // Space is allowed here for either type so check for PE ref
- checkForPERef(false, false, true);
- // Check for the terminating paren
- if (fReaderMgr->skippedChar(chCloseParen))
- break;
- // And append a space separator
- toFill.append(chSpace);
- // Check for the pipe character separator
- if (!fReaderMgr->skippedChar(chPipe))
- {
- fScanner->emitError(XMLErrs::ExpectedEnumSepOrParen);
- return false;
- }
- }
- return true;
- }
- bool DTDScanner::scanEq()
- {
- fReaderMgr->skipPastSpaces();
- if (fReaderMgr->skippedChar(chEqual))
- {
- fReaderMgr->skipPastSpaces();
- return true;
- }
- return false;
- }
- //
- // This method is called when an external entity reference is seen in the
- // DTD or an external DTD subset is encountered, and their contents pushed
- // onto the reader stack. This method will scan that contents.
- //
- void DTDScanner::scanExtSubsetDecl(const bool inIncludeSect, const bool isDTD)
- {
- // Indicate we are in the external subset now
- FlagJanitor<bool> janContentFlag(&fInternalSubset, false);
- bool bAcceptDecl = !inIncludeSect;
- // Get a buffer for whitespace
- XMLBufBid bbSpace(fBufMgr);
- //
- // If we have a doc type handler and we are not being called recursively
- // to handle an include section, tell it the ext subset starts
- //
- if (fDocTypeHandler && !inIncludeSect)
- fDocTypeHandler->startExtSubset();
- //
- // We have to play a trick here if the current entity we are parsing
- // is a PE. Because the spooling code will put out a whitespace before
- // and after an expanded PE if its being scanned outside the context of
- // a literal entity, this will confuse this external subset code.
- //
- // So, we see if that is what is happening and, if so, eat the single
- // space, a check for the <?xml string. If we find it, we parse that
- // markup right now and put the space back.
- //
- if (fReaderMgr->isScanningPERefOutOfLiteral())
- {
- if (fReaderMgr->skippedSpace())
- {
- if (fScanner->checkXMLDecl(true))
- {
- scanTextDecl();
- bAcceptDecl = false;
- // <TBD> Figure out how to do this
- // fReaderMgr->unGet(chSpace);
- }
- }
- }
- // Get the current reader number
- const unsigned int orgReader = fReaderMgr->getCurrentReaderNum();
- //
- // Loop until we hit the end of the external subset entity. Note that
- // we use a double loop here in order to avoid the overhead of doing
- // the exception setup/teardown work on every loop.
- //
- bool inMarkup = false;
- bool inCharData = false;
- while (true)
- {
- try
- {
- while (true)
- {
- const XMLCh nextCh = fReaderMgr->peekNextChar();
- if (nextCh == chOpenAngle)
- {
- // Get the reader we started this on
- // XML 1.0 P28a Well-formedness constraint: PE Between Declarations
- const unsigned int orgReader = fReaderMgr->getCurrentReaderNum();
- bool wasInPE = (fReaderMgr->getCurrentReader()->getType() == XMLReader::Type_PE);
- //
- // Now scan the markup. Set the flag so that we will know that
- // we were in markup if an end of entity exception occurs.
- //
- fReaderMgr->getNextChar();
- inMarkup = true;
- scanMarkupDecl(bAcceptDecl);
- inMarkup = false;
- //
- // And see if we got back to the same level. If not, then its
- // a partial markup error.
- //
- if (fReaderMgr->getCurrentReaderNum() != orgReader){
- if (wasInPE)
- fScanner->emitError(XMLErrs::PEBetweenDecl);
- else if (fScanner->getDoValidation())
- fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
- }
- }
- else if (fReaderMgr->getCurrentReader()->isWhitespace(nextCh))
- {
- //
- // If we have a doc type handler, and advanced callbacks are
- // enabled, then gather up whitespace and call back. Otherwise
- // just skip whitespaces.
- //
- if (fDocTypeHandler)
- {
- inCharData = true;
- fReaderMgr->getSpaces(bbSpace.getBuffer());
- inCharData = false;
- fDocTypeHandler->doctypeWhitespace
- (
- bbSpace.getRawBuffer()
- , bbSpace.getLen()
- );
- }
- else
- {
- //
- // If we hit an end of entity in the middle of white
- // space, that's fine. We'll just come back in here
- // again on the next round and skip some more.
- //
- fReaderMgr->skipPastSpaces();
- }
- }
- else if (nextCh == chPercent)
- {
- //
- // Expand (and scan if external) the reference value. Tell
- // it to throw an end of entity exception at the end of the
- // entity.
- //
- fReaderMgr->getNextChar();
- expandPERef(true, false, false, true);
- }
- else if (inIncludeSect && (nextCh == chCloseSquare))
- {
- //
- // Its the end of a conditional include section. So scan it and
- // decrement the include depth counter.
- //
- fReaderMgr->getNextChar();
- if (!fReaderMgr->skippedChar(chCloseSquare))
- {
- fScanner->emitError(XMLErrs::ExpectedEndOfConditional);
- fReaderMgr->skipPastChar(chCloseAngle);
- }
- else if (!fReaderMgr->skippedChar(chCloseAngle))
- {
- fScanner->emitError(XMLErrs::ExpectedEndOfConditional);
- fReaderMgr->skipPastChar(chCloseAngle);
- }
- return;
- }
- else if (!nextCh)
- {
- return; // nothing left
- }
- else
- {
- fReaderMgr->getNextChar();
- if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh))
- {
- XMLCh tmpBuf[9];
- XMLString::binToText
- (
- nextCh
- , tmpBuf
- , 8
- , 16
- );
- fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
- }
- else
- {
- fScanner->emitError(XMLErrs::InvalidDocumentStructure);
- }
- // Try to get realigned
- static const XMLCh toSkip[] =
- {
- chPercent, chCloseSquare, chOpenAngle, chNull
- };
- fReaderMgr->skipUntilInOrWS(toSkip);
- }
- bAcceptDecl = false;
- }
- }
- catch(const EndOfEntityException& toCatch)
- {
- //
- // If the external entity ended while we were in markup, then that's
- // a partial markup error.
- //
- if (inMarkup)
- {
- fScanner->emitError(XMLErrs::PartialMarkupInEntity);
- inMarkup = false;
- }
- // If we were in char data, then send what we got
- if (inCharData)
- {
- // Send what we got, then rethrow
- if (fDocTypeHandler)
- {
- fDocTypeHandler->doctypeWhitespace
- (
- bbSpace.getRawBuffer()
- , bbSpace.getLen()
- );
- }
- inCharData = false;
- }
- //
- // If the entity that just ended was the entity that we started
- // on, then this is the end of the external subset.
- //
- if (orgReader == toCatch.getReaderNum())
- break;
- }
- }
- // If we have a doc type handler, tell it the ext subset ends
- if (fDocTypeHandler && isDTD)
- fDocTypeHandler->endExtSubset();
- }
- //
- // This method will scan for an id, either public or external.
- //
- //
- // [75] ExternalID ::= 'SYSTEM' S SystemLiteral
- // | 'PUBLIC' S PubidLiteral S SystemLiteral
- // [83] PublicID ::= 'PUBLIC' S PubidLiteral
- //
- bool DTDScanner::scanId( XMLBuffer& pubIdToFill
- , XMLBuffer& sysIdToFill
- , const IDTypes whatKind)
- {
- // Clean out both return buffers
- pubIdToFill.reset();
- sysIdToFill.reset();
- //
- // Check first for the system id first. If we find it, and system id
- // is one of the legal values, then lets try to scan it.
- //
- // 'SYSTEM' S SystemLiteral
- if (fReaderMgr->skippedString(XMLUni::fgSysIDString))
- {
- // If they were looking for a public id, then we failed
- if (whatKind == IDType_Public)
- {
- fScanner->emitError(XMLErrs::ExpectedPublicId);
- return false;
- }
- // We must skip spaces
- if (!fReaderMgr->skipPastSpaces())
- {
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- return false;
- }
- // Get the system literal value
- return scanSystemLiteral(sysIdToFill);
- }
- // Now scan for public id
- // 'PUBLIC' S PubidLiteral S SystemLiteral
- // or
- // 'PUBLIC' S PubidLiteral
- // If we don't have any public id string => Error
- if (!fReaderMgr->skippedString(XMLUni::fgPubIDString)) {
- fScanner->emitError(XMLErrs::ExpectedSystemOrPublicId);
- return false;
- }
- //
- // So following this we must have whitespace, a public literal, whitespace,
- // and a system literal.
- //
- if (!fReaderMgr->skipPastSpaces())
- {
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- //
- // Just in case, if they just forgot the whitespace but the next char
- // is a single or double quote, then keep going.
- //
- const XMLCh chPeek = fReaderMgr->peekNextChar();
- if ((chPeek != chDoubleQuote) && (chPeek != chSingleQuote))
- return false;
- }
- if (!scanPublicLiteral(pubIdToFill))
- return false;
- // If they wanted a public id, then this is all
- if (whatKind == IDType_Public)
- return true;
- // check if there is any space follows
- bool hasSpace = fReaderMgr->skipPastSpaces();
- //
- // In order to recover best here we need to see if
- // the next thing is a quote or not
- //
- const XMLCh chPeek = fReaderMgr->peekNextChar();
- const bool bIsQuote = ((chPeek == chDoubleQuote)
- || (chPeek == chSingleQuote));
- if (!hasSpace)
- {
- if (whatKind == IDType_External)
- {
- //
- // If its an external Id, then we need to see the system id.
- // So, emit the error. But, if the next char is a quote, don't
- // give up since its probably going to work. The user just
- // missed the separating space. Otherwise, fail.
- //
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- if (!bIsQuote)
- return false;
- }
- else
- {
- //
- // We can legally return here. But, if the next char is a quote,
- // then that's probably not what was desired, since its probably
- // just that space was forgotten and there really is a system
- // id to follow.
- //
- // So treat it like missing whitespace if so and keep going.
- // Else, just return success.
- //
- if (bIsQuote)
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- else
- return true;
- }
- }
- if (bIsQuote) {
- // there is a quote coming, scan the system literal
- if (!scanSystemLiteral(sysIdToFill))
- return false;
- }
- else {
- // no quote, if expecting exteral id, this is an error
- if (whatKind == IDType_External)
- fScanner->emitError(XMLErrs::ExpectedQuotedString);
- }
- return true;
- }
- //
- // This method will scan the contents of an ignored section. It assumes that
- // we already are in the body, i.e. we've seen <![IGNORE[ at this point. So
- // we have to just scan until we see a matching ]]> closing markup.
- //
- void DTDScanner::scanIgnoredSection()
- {
- //
- // Depth starts at one because we are already in one section and want
- // to parse until we hit its end.
- //
- unsigned long depth = 1;
- bool gotLeadingSurrogate = false;
- while (true)
- {
- const XMLCh nextCh = fReaderMgr->getNextChar();
- if (!nextCh)
- ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
- if (nextCh == chOpenAngle)
- {
- if (fReaderMgr->skippedChar(chBang)
- && fReaderMgr->skippedChar(chOpenSquare))
- {
- depth++;
- }
- }
- else if (nextCh == chCloseSquare)
- {
- if (fReaderMgr->skippedChar(chCloseSquare))
- {
- while (fReaderMgr->skippedChar(chCloseSquare))
- {
- // Do nothing, just skip them
- }
- if (fReaderMgr->skippedChar(chCloseAngle))
- {
- depth--;
- if (!depth)
- break;
- }
- }
- }
- // Deal with surrogate pairs
- else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
- {
- // Its a leading surrogate. If we already got one, then
- // issue an error, else set leading flag to make sure that
- // we look for a trailing next time.
- if (gotLeadingSurrogate)
- fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
- else
- gotLeadingSurrogate = true;
- }
- else
- {
- // If its a trailing surrogate, make sure that we are
- // prepared for that. Else, its just a regular char so make
- // sure that we were not expected a trailing surrogate.
- if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
- {
- // Its trailing, so make sure we were expecting it
- if (!gotLeadingSurrogate)
- fScanner->emitError(XMLErrs::Unexpected2ndSurrogateChar);
- }
- else
- {
- // Its just a char, so make sure we were not expecting a
- // trailing surrogate.
- if (gotLeadingSurrogate)
- fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
- // Its got to at least be a valid XML character
- else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh))
- {
- XMLCh tmpBuf[9];
- XMLString::binToText
- (
- nextCh
- , tmpBuf
- , 8
- , 16
- );
- fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
- }
- }
- gotLeadingSurrogate = false;
- }
- }
- }
- //
- // This method scans the entire internal subset. All we can have here is
- // decl markup, and PE references. The expanded PE references must contain
- // whole markup, so we don't have to worry about their content at this
- // level. We just scan them, expand them, push them, and parse their content
- // right there, via the expandERef() method.
- //
- bool DTDScanner::scanInternalSubset()
- {
- // Indicate we are in the internal subset now
- FlagJanitor<bool> janContentFlag(&fInternalSubset, true);
- // If we have a doc type handler, tell it the internal subset starts
- if (fDocTypeHandler)
- fDocTypeHandler->startIntSubset();
- // Get a buffer for whitespace
- XMLBufBid bbSpace(fBufMgr);
- bool noErrors = true;
- while (true)
- {
- const XMLCh nextCh = fReaderMgr->peekNextChar();
- //
- // If we get an end of file marker, just unget it and return a
- // failure status. The caller will then see the end of file and
- // faill out correctly.
- //
- if (!nextCh)
- return false;
- // Watch for the end of internal subset marker
- if (nextCh == chCloseSquare)
- {
- fReaderMgr->getNextChar();
- break;
- }
- if (nextCh == chPercent)
- {
- //
- // Expand (and scan if external) the reference value. Tell
- // it to set the reader to cause an end of entity exception
- // when this reader dies, which is what the scanExtSubset
- // method wants (who is called to scan this.)
- //
- fReaderMgr->getNextChar();
- expandPERef(true, false, false, true);
- }
- else if (nextCh == chOpenAngle)
- {
- // Remember this reader before we start the scan, for checking
- // XML 1.0 P28a Well-formedness constraint: PE Between Declarations
- const unsigned int orgReader = fReaderMgr->getCurrentReaderNum();
- bool wasInPE = (fReaderMgr->getCurrentReader()->getType() == XMLReader::Type_PE);
- // And scan this markup
- fReaderMgr->getNextChar();
- scanMarkupDecl(false);
- // If we did not get back to entry level, then partial markup
- if (fReaderMgr->getCurrentReaderNum() != orgReader) {
- if (wasInPE)
- fScanner->emitError(XMLErrs::PEBetweenDecl);
- else if (fScanner->getDoValidation())
- fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
- }
- }
- else if (fReaderMgr->getCurrentReader()->isWhitespace(nextCh))
- {
- //
- // IF we are doing advanced callbacks and have a doc type
- // handler, then get the whitespace and call the doc type
- // handler with it. Otherwise, just skip whitespace.
- //
- if (fDocTypeHandler)
- {
- fReaderMgr->getSpaces(bbSpace.getBuffer());
- fDocTypeHandler->doctypeWhitespace
- (
- bbSpace.getRawBuffer()
- , bbSpace.getLen()
- );
- }
- else
- {
- fReaderMgr->skipPastSpaces();
- }
- }
- else
- {
- // Not valid, so emit an error
- XMLCh tmpBuf[9];
- XMLString::binToText
- (
- fReaderMgr->getNextChar()
- , tmpBuf
- , 8
- , 16
- );
- fScanner->emitError
- (
- XMLErrs::InvalidCharacterInIntSubset
- , tmpBuf
- );
- //
- // If an '>', then probably an abnormally terminated
- // internal subset so just return.
- //
- if (nextCh == chCloseAngle)
- {
- noErrors = false;
- break;
- }
- //
- // Otherwise, try to sync back up by scanning forward for
- // a reasonable start character.
- //
- static const XMLCh toSkip[] =
- {
- chPercent, chCloseSquare, chOpenAngle, chNull
- };
- fReaderMgr->skipUntilInOrWS(toSkip);
- }
- }
- // If we have a doc type handler, tell it the internal subset ends
- if (fDocTypeHandler)
- fDocTypeHandler->endIntSubset();
- return noErrors;
- }
- //
- // This method is called once we see a < in the input of an int/ext subset,
- // which indicates the start of some sort of markup.
- //
- void DTDScanner::scanMarkupDecl(const bool parseTextDecl)
- {
- //
- // We only have two valid first characters here. One is a ! which opens
- // some markup decl. The other is a ?, which could begin either a PI
- // or a text decl. If parseTextDecl is false, we cannot accept a text
- // decl.
- //
- const XMLCh nextCh = fReaderMgr->getNextChar();
- if (nextCh == chBang)
- {
- if (fReaderMgr->skippedChar(chDash))
- {
- if (fReaderMgr->skippedChar(chDash))
- {
- scanComment();
- }
- else
- {
- fScanner->emitError(XMLErrs::CommentsMustStartWith);
- fReaderMgr->skipPastChar(chCloseAngle);
- }
- }
- else if (fReaderMgr->skippedChar(chOpenSquare))
- {
- //
- // Its a conditional section. This is only valid in the external
- // subset, so issue an error if we aren't there.
- //
- if (fInternalSubset)
- {
- fScanner->emitError(XMLErrs::ConditionalSectInIntSubset);
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- // A PE ref can happen here, but space is not required
- checkForPERef(false, false, true);
- if (fReaderMgr->skippedString(XMLUni::fgIncludeString))
- {
- checkForPERef(false, false, true);
- // Check for the following open square bracket
- if (!fReaderMgr->skippedChar(chOpenSquare))
- fScanner->emitError(XMLErrs::ExpectedINCLUDEBracket);
- // Get the reader we started this on
- const unsigned int orgReader = fReaderMgr->getCurrentReaderNum();
- checkForPERef(false, false, true);
- //
- // Recurse back to the ext subset call again, telling it its
- // in an include section.
- //
- scanExtSubsetDecl(true, false);
- //
- // And see if we got back to the same level. If not, then its
- // a partial markup error.
- //
- if (fReaderMgr->getCurrentReaderNum() != orgReader && fScanner->getDoValidation())
- fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
- }
- else if (fReaderMgr->skippedString(XMLUni::fgIgnoreString))
- {
- checkForPERef(false, false, true);
- // Check for the following open square bracket
- if (!fReaderMgr->skippedChar(chOpenSquare))
- fScanner->emitError(XMLErrs::ExpectedINCLUDEBracket);
- // Get the reader we started this on
- const unsigned int orgReader = fReaderMgr->getCurrentReaderNum();
- // And scan over the ignored part
- scanIgnoredSection();
- //
- // And see if we got back to the same level. If not, then its
- // a partial markup error.
- //
- if (fReaderMgr->getCurrentReaderNum() != orgReader && fScanner->getDoValidation())
- fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
- }
- else
- {
- fScanner->emitError(XMLErrs::ExpectedIncOrIgn);
- fReaderMgr->skipPastChar(chCloseAngle);
- }
- }
- else if (fReaderMgr->skippedString(XMLUni::fgAttListString))
- {
- scanAttListDecl();
- }
- else if (fReaderMgr->skippedString(XMLUni::fgElemString))
- {
- scanElementDecl();
- }
- else if (fReaderMgr->skippedString(XMLUni::fgEntityString))
- {
- scanEntityDecl();
- }
- else if (fReaderMgr->skippedString(XMLUni::fgNotationString))
- {
- scanNotationDecl();
- }
- else
- {
- fScanner->emitError(XMLErrs::ExpectedMarkupDecl);
- fReaderMgr->skipPastChar(chCloseAngle);
- }
- }
- else if (nextCh == chQuestion)
- {
- // It could be a PI or the XML declaration. Check for Decl
- if (fScanner->checkXMLDecl(false))
- {
- // If we are not accepting text decls, its an error
- if (parseTextDecl)
- {
- scanTextDecl();
- }
- else
- {
- // Emit the error and skip past this markup
- fScanner->emitError(XMLErrs::TextDeclNotLegalHere);
- fReaderMgr->skipPastChar(chCloseAngle);
- }
- }
- else
- {
- // It has to be a PI
- scanPI();
- }
- }
- else
- {
- // Can't be valid so emit error and try to skip past end of this decl
- fScanner->emitError(XMLErrs::ExpectedMarkupDecl);
- fReaderMgr->skipPastChar(chCloseAngle);
- }
- }
- //
- // This method is called for a mixed model element's content mode. We've
- // already scanned past the '(PCDATA' part by the time we get here. So
- // everything else is element names separated by | characters until we
- // hit the end. The passed element decl's content model is filled in with
- // the information found.
- //
- bool DTDScanner::scanMixed(DTDElementDecl& toFill)
- {
- //
- // The terminating star is only required if there is something more
- // than (PCDATA).
- //
- bool starRequired = false;
- // Get a buffer to be used below to get element names
- XMLBufBid bbName(fBufMgr);
- XMLBuffer& nameBuf = bbName.getBuffer();
- //
- // Create an initial content spec node. Its just a leaf node with a
- // PCDATA element id. This current node pointer will be pushed down the
- // tree as we go.
- //
- ContentSpecNode* curNode = new (fMemoryManager) ContentSpecNode
- (
- new (fMemoryManager) QName
- (
- XMLUni::fgZeroLenString
- , XMLUni::fgZeroLenString
- , XMLElementDecl::fgPCDataElemId
- , fMemoryManager
- )
- , false
- , fMemoryManager
- );
- //
- // Set the initial leaf as the temporary head. If we hit the first choice
- // node, it will be set up here. When done, this is the node that's set
- // as the content spec for the element.
- //
- ContentSpecNode* headNode = curNode;
- // Remember the original node so we can sense the first choice node
- ContentSpecNode* orgNode = curNode;
- //
- // We just loop around, getting the | character at the top and then
- // looking for the next element name. We keep up with the last node
- // and add each new one to its right node.
- //
- while (true)
- {
- //
- // First of all we check for some grunt work details of skipping
- // whitespace, expand PE refs, and catching invalid reps.
- //
- if (fReaderMgr->lookingAtChar(chPercent))
- {
- // Expand it and continue
- checkForPERef(false, false, true);
- }
- else if (fReaderMgr->skippedChar(chAsterisk))
- {
- //
- // Tell them they can't have reps in mixed model, but eat
- // it and keep going if we are allowed to.
- //
- fScanner->emitError(XMLErrs::NoRepInMixed);
- }
- else if (fReaderMgr->skippedSpace())
- {
- // Spaces are ok at this point, just eat them and continue
- fReaderMgr->skipPastSpaces();
- }
- else
- {
- if (!fReaderMgr->skippedChar(chPipe))
- {
- // Has to be the closing paren now.
- if (!fReaderMgr->skippedChar(chCloseParen))
- {
- delete headNode;
- fScanner->emitError(XMLErrs::UnterminatedContentModel);
- return false;
- }
- bool starSkipped = true;
- if (!fReaderMgr->skippedChar(chAsterisk)) {
- starSkipped = false;
- if (starRequired)
- fScanner->emitError(XMLErrs::ExpectedAsterisk);
- }
- //
- // Create a zero or more node and make the original head
- // node its first child.
- //
- if (starRequired || starSkipped) {
- headNode = new (fMemoryManager) ContentSpecNode
- (
- ContentSpecNode::ZeroOrMore
- , headNode
- , 0
- , true
- , true
- , fMemoryManager
- );
- }
- // Store the head node as the content spec of the element.
- toFill.setContentSpec(headNode);
- break;
- }
- // Its more than just a PCDATA, so an ending star will be required now
- starRequired = true;
- // Space is legal here so check for a PE ref, but don't require space
- checkForPERef(false, false, true);
- // Get a name token
- if (!fReaderMgr->getName(nameBuf))
- {
- delete headNode;
- fScanner->emitError(XMLErrs::ExpectedElementName);
- return false;
- }
- //
- // Create a leaf node for it. If we can find the element id for
- // this element, then use it. Else, we have to fault in an element
- // decl, marked as created because of being in a content model.
- //
- XMLElementDecl* decl = fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, nameBuf.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
- if (!decl)
- {
- decl = new (fMemoryManager) DTDElementDecl
- (
- nameBuf.getRawBuffer()
- , fEmptyNamespaceId
- , DTDElementDecl::Any
- , fMemoryManager
- );
- decl->setCreateReason(XMLElementDecl::InContentModel);
- decl->setExternalElemDeclaration(isReadingExternalEntity());
- fDTDGrammar->putElemDecl(decl);
- }
- //
- // If the current node is the original node, this is the first choice
- // node, so create an initial choice node with the current node and
- // the new element id. Store this as the head node.
- //
- // Otherwise, we have to steal the right node of the previous choice
- // and weave in another choice node there, which has the old choice
- // as its left and the new leaf as its right.
- //
- if (curNode == orgNode)
- {
- curNode = new (fMemoryManager) ContentSpecNode
- (
- ContentSpecNode::Choice
- , curNode
- , new (fMemoryManager) ContentSpecNode
- (
- decl->getElementName()
- , fMemoryManager
- )
- , true
- , true
- , fMemoryManager
- );
- // Remember the top node
- headNode = curNode;
- }
- else
- {
- ContentSpecNode* oldRight = curNode->orphanSecond();
- curNode->setSecond
- (
- new (fMemoryManager) ContentSpecNode
- (
- ContentSpecNode::Choice
- , oldRight
- , new (fMemoryManager) ContentSpecNode
- (
- decl->getElementName()
- , fMemoryManager
- )
- , true
- , true
- , fMemoryManager
- )
- );
- // Make the new right node the current node
- curNode = curNode->getSecond();
- }
- }
- }
- return true;
- }
- //
- // This method is called when we see a '<!NOTATION' string while scanning
- // markup decl. It parses out the notation and its id and stores a new
- // notation decl object in the notation decl pool.
- //
- void DTDScanner::scanNotationDecl()
- {
- // Space is required here so check for a PE ref, and require space
- if (!checkForPERef(true, false, true))
- {
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- //
- // And now we get a name, which is the name of the notation. Get a
- // buffer for the name.
- //
- XMLBufBid bbName(fBufMgr);
- if (!fReaderMgr->getName(bbName.getBuffer()))
- {
- fScanner->emitError(XMLErrs::ExpectedNotationName);
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- // If namespaces are enabled, then no colons allowed
- if (fScanner->getDoNamespaces())
- {
- if (XMLString::indexOf(bbName.getRawBuffer(), chColon) != -1)
- fScanner->emitError(XMLErrs::ColonNotLegalWithNS);
- }
- // Space is required here so check for a PE ref, and require space
- if (!checkForPERef(true, false, true))
- {
- fScanner->emitError(XMLErrs::ExpectedWhitespace);
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- //
- // And scan an external or public id. We need buffers to use for both
- // of these.
- //
- XMLBufBid bbPubId(fBufMgr);
- XMLBufBid bbSysId(fBufMgr);
- if (!scanId(bbPubId.getBuffer(), bbSysId.getBuffer(), IDType_Either))
- {
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- // We can have an optional space or PE ref here
- checkForPERef(false, false, true);
- //
- // See if it already exists. If so, add it to the notatino decl pool.
- // Otherwise, if advanced callbacks are on, create a temp one and
- // call out for that one.
- //
- XMLNotationDecl* decl = fDTDGrammar->getNotationDecl(bbName.getRawBuffer());
- bool isIgnoring = (decl != 0);
- if (isIgnoring)
- {
- fScanner->emitError(XMLErrs::NotationAlreadyExists, bbName.getRawBuffer());
- }
- else
- {
- // Fill in a new notation declaration and add it to the pool
- const XMLCh* publicId = bbPubId.getRawBuffer();
- const XMLCh* systemId = bbSysId.getRawBuffer();
- ReaderMgr::LastExtEntityInfo lastInfo;
- fReaderMgr->getLastExtEntityInfo(lastInfo);
- decl = new (fMemoryManager) XMLNotationDecl
- (
- bbName.getRawBuffer()
- , (publicId && *publicId) ? publicId : 0
- , (systemId && *systemId) ? systemId : 0
- , (lastInfo.systemId && *lastInfo.systemId) ? lastInfo.systemId : 0
- , fMemoryManager
- );
- fDTDGrammar->putNotationDecl(decl);
- }
- //
- // If we have a document type handler, then tell it about this. If we
- // are ignoring it, only call out if advanced callbacks are enabled.
- //
- if (fDocTypeHandler)
- {
- fDocTypeHandler->notationDecl
- (
- *decl
- , isIgnoring
- );
- }
- // And one more optional space or PE ref
- checkForPERef(false, false, true);
- // And skip the terminating bracket
- if (!fReaderMgr->skippedChar(chCloseAngle))
- fScanner->emitError(XMLErrs::UnterminatedNotationDecl);
- }
- //
- // Scans a PI and calls the appropriate callbacks. A PI can happen in either
- // the document or the DTD, so it calls the appropriate handler according
- // to the fInDocument flag.
- //
- // At entry we have just scanned the <? part, and need to now start on the
- // PI target name.
- //
- void DTDScanner::scanPI()
- {
- const XMLCh* namePtr = 0;
- const XMLCh* targetPtr = 0;
- //
- // If there are any spaces here, then warn about it. If we aren't in
- // 'first error' mode, then we'll come back and can easily pick up
- // again by just skipping them.
- //
- if (fReaderMgr->lookingAtSpace())
- {
- fScanner->emitError(XMLErrs::PINameExpected);
- fReaderMgr->skipPastSpaces();
- }
- // Get a buffer for the PI name and scan it in
- XMLBufBid bbName(fBufMgr);
- if (!fReaderMgr->getName(bbName.getBuffer()))
- {
- fScanner->emitError(XMLErrs::PINameExpected);
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- // Point the name pointer at the raw data
- namePtr = bbName.getRawBuffer();
- // See if it issome form of 'xml' and emit a warning
- if (!XMLString::compareIString(namePtr, XMLUni::fgXMLString))
- fScanner->emitError(XMLErrs::NoPIStartsWithXML);
- // If namespaces are enabled, then no colons allowed
- if (fScanner->getDoNamespaces())
- {
- if (XMLString::indexOf(namePtr, chColon) != -1)
- fScanner->emitError(XMLErrs::ColonNotLegalWithNS);
- }
- //
- // If we don't hit a space next, then the PI has no target. If we do
- // then get out the target. Get a buffer for it as well
- //
- XMLBufBid bbTarget(fBufMgr);
- if (fReaderMgr->skippedSpace())
- {
- // Skip any leading spaces
- fReaderMgr->skipPastSpaces();
- bool gotLeadingSurrogate = false;
- // It does have a target, so lets move on to deal with that.
- while (1)
- {
- const XMLCh nextCh = fReaderMgr->getNextChar();
- // Watch for an end of file, which is always bad here
- if (!nextCh)
- {
- fScanner->emitError(XMLErrs::UnterminatedPI);
- ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
- }
- // Watch for potential terminating character
- if (nextCh == chQuestion)
- {
- // It must be followed by '>' to be a termination of the target
- if (fReaderMgr->skippedChar(chCloseAngle))
- break;
- }
- // Check for correct surrogate pairs
- if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
- {
- if (gotLeadingSurrogate)
- fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
- else
- gotLeadingSurrogate = true;
- }
- else
- {
- if (gotLeadingSurrogate)
- {
- if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
- fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
- }
- // Its got to at least be a valid XML character
- else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh)) {
- XMLCh tmpBuf[9];
- XMLString::binToText
- (
- nextCh
- , tmpBuf
- , 8
- , 16
- );
- fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
- }
- gotLeadingSurrogate = false;
- }
- bbTarget.append(nextCh);
- }
- }
- else
- {
- // No target, but make sure its terminated ok
- if (!fReaderMgr->skippedChar(chQuestion))
- {
- fScanner->emitError(XMLErrs::UnterminatedPI);
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- if (!fReaderMgr->skippedChar(chCloseAngle))
- {
- fScanner->emitError(XMLErrs::UnterminatedPI);
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- }
- // Point the target pointer at the raw data
- targetPtr = bbTarget.getRawBuffer();
- //
- // If we have a handler, then call it.
- //
- if (fDocTypeHandler)
- {
- fDocTypeHandler->doctypePI
- (
- namePtr
- , targetPtr
- );
- }
- }
- //
- // This method scans a public literal. It must be quoted and all of its
- // characters must be valid public id characters. The quotes are discarded
- // and the results are returned.
- //
- bool DTDScanner::scanPublicLiteral(XMLBuffer& toFill)
- {
- toFill.reset();
- // Get the next char which must be a single or double quote
- XMLCh quoteCh;
- if (!fReaderMgr->skipIfQuote(quoteCh)) {
- fScanner->emitError(XMLErrs::ExpectedQuotedString);
- return false;
- }
- while (true)
- {
- const XMLCh nextCh = fReaderMgr->getNextChar();
- // Watch for EOF
- if (!nextCh)
- ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
- if (nextCh == quoteCh)
- break;
- //
- // If its not a valid public id char, then report it but keep going
- // since that's the best recovery scheme.
- //
- if (!fReaderMgr->getCurrentReader()->isPublicIdChar(nextCh))
- {
- XMLCh tmpBuf[9];
- XMLString::binToText
- (
- nextCh
- , tmpBuf
- , 8
- , 16
- );
- fScanner->emitError(XMLErrs::InvalidPublicIdChar, tmpBuf);
- }
- toFill.append(nextCh);
- }
- return true;
- }
- //
- // This method handles scanning in a quoted system literal. It expects to
- // start on the open quote and returns after eating the ending quote. There
- // are not really any restrictions on the contents of system literals.
- //
- bool DTDScanner::scanSystemLiteral(XMLBuffer& toFill)
- {
- toFill.reset();
- // Get the next char which must be a single or double quote
- XMLCh quoteCh;
- if (!fReaderMgr->skipIfQuote(quoteCh)) {
- fScanner->emitError(XMLErrs::ExpectedQuotedString);
- return false;
- }
- while (true)
- {
- const XMLCh nextCh = fReaderMgr->getNextChar();
- // Watch for EOF
- if (!nextCh)
- ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
- // Break out on terminating quote
- if (nextCh == quoteCh)
- break;
- toFill.append(nextCh);
- }
- return true;
- }
- //
- // This method is called to scan a text decl line, which can be the first
- // line in an external entity or external subset.
- //
- // On entry the <? has been scanned, and next should be 'xml' followed by
- // some whitespace, version string, etc...
- // [77] TextDecl::= '<?xml' VersionInfo? EncodingDecl S? '?>'
- //
- void DTDScanner::scanTextDecl()
- {
- // Skip any subsequent whitespace before the version string
- fReaderMgr->skipPastSpaces();
- // Next should be the version string
- XMLBufBid bbVersion(fBufMgr);
- if (fReaderMgr->skippedString(XMLUni::fgVersionString))
- {
- if (!scanEq())
- {
- fScanner->emitError(XMLErrs::ExpectedEqSign);
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- //
- // Followed by a single or double quoted version. Get a buffer for
- // the string.
- //
- if (!getQuotedString(bbVersion.getBuffer()))
- {
- fScanner->emitError(XMLErrs::BadXMLVersion);
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- // If its not our supported version, issue an error but continue
- if (XMLString::equals(bbVersion.getRawBuffer(), XMLUni::fgVersion1_1)) {
- if (fScanner->getXMLVersion() != XMLReader::XMLV1_1)
- fScanner->emitError(XMLErrs::UnsupportedXMLVersion, bbVersion.getRawBuffer());
- }
- else if (!XMLString::equals(bbVersion.getRawBuffer(), XMLUni::fgVersion1_0))
- fScanner->emitError(XMLErrs::UnsupportedXMLVersion, bbVersion.getRawBuffer());
- }
- // Ok, now we must have an encoding string
- XMLBufBid bbEncoding(fBufMgr);
- fReaderMgr->skipPastSpaces();
- bool gotEncoding = false;
- if (fReaderMgr->skippedString(XMLUni::fgEncodingString))
- {
- // There must be a equal sign next
- if (!scanEq())
- {
- fScanner->emitError(XMLErrs::ExpectedEqSign);
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- // Followed by a single or double quoted version string
- getQuotedString(bbEncoding.getBuffer());
- if (bbEncoding.isEmpty() || !XMLString::isValidEncName(bbEncoding.getRawBuffer()))
- {
- fScanner->emitError(XMLErrs::BadXMLEncoding, bbEncoding.getRawBuffer());
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- // Indicate that we got an encoding
- gotEncoding = true;
- }
- //
- // Encoding declarations are required in the external entity
- // if there is a text declaration present
- //
- if (!gotEncoding)
- {
- fScanner->emitError(XMLErrs::EncodingRequired);
- fReaderMgr->skipPastChar(chCloseAngle);
- return;
- }
- fReaderMgr->skipPastSpaces();
- if (!fReaderMgr->skippedChar(chQuestion))
- {
- fScanner->emitError(XMLErrs::UnterminatedXMLDecl);
- fReaderMgr->skipPastChar(chCloseAngle);
- }
- else if (!fReaderMgr->skippedChar(chCloseAngle))
- {
- fScanner->emitError(XMLErrs::UnterminatedXMLDecl);
- fReaderMgr->skipPastChar(chCloseAngle);
- }
- //
- // If we have a document type handler and advanced callbacks are on,
- // then call the TextDecl callback
- //
- if (fDocTypeHandler)
- {
- fDocTypeHandler->TextDecl
- (
- bbVersion.getRawBuffer()
- , bbEncoding.getRawBuffer()
- );
- }
- //
- // If we got an encoding string, then we have to call back on the reader
- // to tell it what the encoding is.
- //
- if (!bbEncoding.isEmpty())
- {
- if (!fReaderMgr->getCurrentReader()->setEncoding(bbEncoding.getRawBuffer()))
- fScanner->emitError(XMLErrs::ContradictoryEncoding, bbEncoding.getRawBuffer());
- }
- }
- XERCES_CPP_NAMESPACE_END