xml/soap/webservice

开发平台：

C/C++

XMLReader.cpp：源码内容

/*
* The Apache Software License, Version 1.1
*
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Xerces" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation, and was
* originally based on software copyright (c) 1999, International
* Business Machines, Inc., http://www.ibm.com . For more information
* on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/*
* $Id: XMLReader.cpp,v 1.30 2001/12/06 17:47:04 tng Exp $
*/
// ---------------------------------------------------------------------------
// Includes
// ---------------------------------------------------------------------------
#include <util/BitOps.hpp>
#include <util/BinInputStream.hpp>
#include <util/PlatformUtils.hpp>
#include <util/RuntimeException.hpp>
#include <util/TranscodingException.hpp>
#include <util/TransService.hpp>
#include <util/UTFDataFormatException.hpp>
#include <util/XMLEBCDICTranscoder.hpp>
#include <util/XMLString.hpp>
#include <util/XMLUni.hpp>
#include <sax/InputSource.hpp>
#include <framework/XMLBuffer.hpp>
#include <internal/CharTypeTables.hpp>
#include <internal/XMLReader.hpp>
#include <internal/XMLScanner.hpp>
#include <string.h>
// ---------------------------------------------------------------------------
// XMLReader: static data initialization
// ---------------------------------------------------------------------------
// Static flag shared by all readers. It starts out false and is set to true
// by enableNELWS(); the scanning methods only treat chNEL as a line break
// while it is true.
bool XMLReader::fNEL = false;
// ---------------------------------------------------------------------------
// XMLReader: Public, static methods
// ---------------------------------------------------------------------------
bool XMLReader::isFirstNameChar(const XMLCh toCheck)
{
static const XMLByte ourMask = gBaseCharMask | gLetterCharMask;
if ((fgCharCharsTable[toCheck] & ourMask) != 0)
return true;
// Check the two special case name start chars
if ((toCheck == chUnderscore) || (toCheck == chColon))
return true;
return false;
}
//
//  Reports whether every one of the first 'count' chars of the passed
//  buffer is whitespace. Gives up on the first non-whitespace char, and
//  reports true for a count of zero.
//
bool XMLReader::isAllSpaces(const   XMLCh* const    toCheck
                            , const unsigned int    count)
{
    for (unsigned int index = 0; index < count; index++)
    {
        if ((fgCharCharsTable[toCheck[index]] & gWhitespaceCharMask) == 0)
            return false;
    }
    return true;
}
//
//  Reports whether at least one of the first 'count' chars of the passed
//  buffer is whitespace. Stops at the first whitespace char found, and
//  reports false for a count of zero.
//
bool XMLReader::containsWhiteSpace(const   XMLCh* const    toCheck
                                   , const unsigned int    count)
{
    for (unsigned int index = 0; index < count; index++)
    {
        if ((fgCharCharsTable[toCheck[index]] & gWhitespaceCharMask) != 0)
            return true;
    }
    return false;
}
//
//  Public id chars are looked up in the gPublicIdChars table via
//  checkTable() instead of through a flag bit in the character
//  characteristics table. This check is not called terribly often, so it is
//  done manually in order to give up more bits in that table for the more
//  often used characteristics.
//
bool XMLReader::isPublicIdChar(const XMLCh toCheck)
{
    const bool inTable = checkTable(gPublicIdChars, toCheck);
    return inTable;
}
//
//  Turns on the option to treat NEL like a line feed. The first call sets
//  the fNEL flag and copies the character table entry of LF over the entry
//  of NEL; any later call does nothing.
//
void XMLReader::enableNELWS()
{
    // Already on, so the table entry has been copied before
    if (fNEL)
        return;

    fNEL = true;

    // When option is on, treat NEL same as LF
    fgCharCharsTable[chNEL] = fgCharCharsTable[chLF];
}
// ---------------------------------------------------------------------------
// XMLReader: Constructors and Destructor
// ---------------------------------------------------------------------------
//
//  Constructs a reader whose encoding is not forced. The public and system
//  ids are replicated, the stream is stored in fStream (which the destructor
//  deletes), and the remaining parameters are stored in their matching
//  members. The body then loads the first raw bytes, auto-senses the
//  encoding family from them and decodes a leading XMLDecl, if any.
//
//  Any exception raised by the body is passed on to the caller after the
//  strings replicated here have been released.
//
XMLReader::XMLReader(const  XMLCh* const            pubId
                    , const XMLCh* const            sysId
                    ,       BinInputStream* const   streamToAdopt
                    , const RefFrom                 from
                    , const Types                   type
                    , const Sources                 source
                    , const bool                    throwAtEnd) :
    fCharIndex(0)
    , fCharsAvail(0)
    , fCurCol(1)
    , fCurLine(1)
    // Same starting value as the forced-encoding constructor uses, so the
    // member is never left indeterminate; the probe below replaces it.
    , fEncoding(XMLRecognizer::UTF_8)
    , fEncodingStr(0)
    , fForcedEncoding(false)
    , fNoMore(false)
    , fPublicId(XMLString::replicate(pubId))
    , fRawBufIndex(0)
    , fRawBytesAvail(0)
    , fReaderNum(0xFFFFFFFF)
    , fRefFrom(from)
    , fSentTrailingSpace(false)
    , fSource(source)
    , fSpareCh(0)
    , fSrcOfsBase(0)
    , fSrcOfsSupported(false)
    , fStream(streamToAdopt)
    , fSystemId(XMLString::replicate(sysId))
    , fSwapped(false)
    , fThrowAtEnd(throwAtEnd)
    , fTranscoder(0)
    , fType(type)
{
    try
    {
        // Do an initial load of raw bytes
        refreshRawBuffer();

        // Ask the transcoding service if it supports src offset info
        fSrcOfsSupported = XMLPlatformUtils::fgTransService->supportsSrcOfs();

        //
        //  Use the recognizer class to get a basic sense of what family of
        //  encodings this file is in. We'll start off with a reader of that
        //  type, and update it later if needed when we read the XMLDecl line.
        //
        fEncoding = XMLRecognizer::basicEncodingProbe(fRawByteBuf, fRawBytesAvail);

        #if defined(XERCES_DEBUG)
        if ((fEncoding < XMLRecognizer::Encodings_Min)
        ||  (fEncoding > XMLRecognizer::Encodings_Max))
        {
            ThrowXML(RuntimeException, XMLExcepts::Reader_BadAutoEncoding);
        }
        #endif

        fEncodingStr = XMLString::replicate(XMLRecognizer::nameForEncoding(fEncoding));

        // Check whether the fSwapped flag should be set or not
        checkForSwapped();

        //
        //  This will check to see if the first line is an XMLDecl and, if
        //  so, decode that first line manually one character at a time. This
        //  leaves enough characters in the buffer that the high level code
        //  can get through the Decl and call us back with the real encoding.
        //
        doInitDecode();

        //
        //  NOTE: We won't create a transcoder until we either get a call to
        //  setEncoding() or we get a call to refreshCharBuffer() and no
        //  transcoder has been set yet.
        //
    }
    catch(...)
    {
        //
        //  The destructor is not run for an object whose constructor
        //  throws, so the strings replicated so far have to be released
        //  here or they leak.
        //
        //  The stream is deliberately left alone, exactly as before.
        //  NOTE(review): confirm that the caller disposes of the stream
        //  when construction fails.
        //  NOTE(review): this relies on the exception in flight not holding
        //  on to the raw fSystemId pointer handed to ThrowXML1 - confirm.
        //
        delete [] fEncodingStr;
        delete [] fPublicId;
        delete [] fSystemId;
        throw;
    }
}
//
//  Constructs a reader whose encoding is forced to 'encodingStr'. The
//  public id, system id and encoding name are replicated, the stream is
//  stored in fStream (which the destructor deletes), and the remaining
//  parameters are stored in their matching members. A transcoder for the
//  forced encoding is created right away.
//
//  Throws a TranscodingException if no transcoder can be created for the
//  encoding. Any exception raised by the body is passed on to the caller
//  after the strings replicated here have been released.
//
XMLReader::XMLReader(const  XMLCh* const            pubId
                    , const XMLCh* const            sysId
                    ,       BinInputStream* const   streamToAdopt
                    , const XMLCh* const            encodingStr
                    , const RefFrom                 from
                    , const Types                   type
                    , const Sources                 source
                    , const bool                    throwAtEnd) :
    fCharIndex(0)
    , fCharsAvail(0)
    , fCurCol(1)
    , fCurLine(1)
    , fEncoding(XMLRecognizer::UTF_8)
    , fEncodingStr(0)
    , fForcedEncoding(true)
    , fNoMore(false)
    , fPublicId(XMLString::replicate(pubId))
    , fRawBufIndex(0)
    , fRawBytesAvail(0)
    , fReaderNum(0xFFFFFFFF)
    , fRefFrom(from)
    , fSentTrailingSpace(false)
    , fSource(source)
    , fSpareCh(0)
    , fSrcOfsBase(0)
    , fSrcOfsSupported(false)
    , fStream(streamToAdopt)
    , fSystemId(XMLString::replicate(sysId))
    , fSwapped(false)
    , fThrowAtEnd(throwAtEnd)
    , fTranscoder(0)
    , fType(type)
{
    try
    {
        // Do an initial load of raw bytes
        refreshRawBuffer();

        // Copy the encoding string to our member
        fEncodingStr = XMLString::replicate(encodingStr);

        // Ask the transcoding service if it supports src offset info
        fSrcOfsSupported = XMLPlatformUtils::fgTransService->supportsSrcOfs();

        //
        //  Map the passed encoding name to one of our enums. If it does not
        //  match one of the intrinsic encodings, it will come back 'other',
        //  which tells us to create a transcoder based reader.
        //
        fEncoding = XMLRecognizer::encodingForName(fEncodingStr);

        // Check whether the fSwapped flag should be set or not
        checkForSwapped();

        //
        //  Create a transcoder for the encoding. Since the encoding has been
        //  forced, this will be the one we will use, period.
        //
        XMLTransService::Codes failReason;
        fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
        (
            fEncodingStr
            , failReason
            , kCharBufSize
        );

        if (!fTranscoder)
        {
            ThrowXML1
            (
                TranscodingException
                , XMLExcepts::Trans_CantCreateCvtrFor
                , fEncodingStr
            );
        }

        //
        //  Note that, unlike the auto-sensing constructor, we do not do an
        //  initial decode of the first line. We take the caller's word that
        //  the encoding is correct and just assume that the first bulk
        //  decode (kicked off by the first get of a character) will work.
        //
        //  So we do here the slipping in of the leading space if required.
        //
        if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral))
        {
            // This represents no data from the source
            fCharSizeBuf[fCharsAvail] = 0;
            fCharBuf[fCharsAvail++] = chSpace;
        }
    }
    catch(...)
    {
        //
        //  The destructor is not run for an object whose constructor
        //  throws, so the strings replicated so far have to be released
        //  here or they leak.
        //
        //  The stream is deliberately left alone, exactly as before.
        //  NOTE(review): confirm that the caller disposes of the stream
        //  when construction fails.
        //  NOTE(review): this relies on the exception in flight not holding
        //  on to the raw fEncodingStr pointer handed to ThrowXML1 - confirm.
        //
        delete [] fEncodingStr;
        delete [] fPublicId;
        delete [] fSystemId;
        throw;
    }
}
//
//  Releases everything the reader holds on to: the three replicated strings
//  (encoding name, public id, system id), the stream that was passed to the
//  constructor, and the transcoder, if one was ever created.
//
XMLReader::~XMLReader()
{
// These three were allocated as arrays, hence the array form of delete
delete [] fEncodingStr;
delete [] fPublicId;
delete [] fSystemId;
// The stream handed to the constructor is destroyed along with the reader
delete fStream;
// Still null (and so a no-op) if no transcoder was ever created
delete fTranscoder;
}
// ---------------------------------------------------------------------------
// XMLReader: Character buffer management methods
// ---------------------------------------------------------------------------
//
//  Returns the source offset of the current read position: the base offset
//  accumulated so far plus the recorded sizes of the chars of the current
//  buffer that have already been consumed.
//
//  Throws a RuntimeException if the transcoding service reported that it
//  does not support source offsets.
//
unsigned int XMLReader::getSrcOffset() const
{
    if (!fSrcOfsSupported)
        ThrowXML(RuntimeException, XMLExcepts::Reader_SrcOfsNotSupported);

    // Start at the base and add what each consumed char took from the source
    unsigned int total = fSrcOfsBase;
    unsigned int consumed = 0;
    while (consumed < fCharIndex)
    {
        total += fCharSizeBuf[consumed];
        consumed++;
    }
    return total;
}
//
//  Tops up the character buffer. Unconsumed chars are moved down to the
//  bottom of the buffer and newly transcoded chars are added after them.
//  A transcoder is created on first use if none exists yet.
//
//  Returns true if the buffer holds at least one char afterwards. Returns
//  false, without touching anything, when the reader already ran dry or the
//  buffer is completely full of unconsumed chars.
//
//  Throws a RuntimeException if EBCDIC was auto-sensed but no encoding
//  string was ever supplied, and a TranscodingException if no transcoder
//  can be created for the encoding.
//
bool XMLReader::refreshCharBuffer()
{
    // If the no more flag is set, then don't bother doing anything
    if (fNoMore)
        return false;

    unsigned int startInd;

    // See if we have any existing chars.
    const unsigned int spareChars = fCharsAvail - fCharIndex;

    // If we are full, then don't do anything.
    if (spareChars == kCharBufSize)
        return false;

    //
    //  If no transcoder has been created yet, then we never saw any
    //  encoding="" string and the encoding was not forced, so let's
    //  create one now. We know that it won't change now.
    //
    //  However, note that if we autosensed EBCDIC, then we have to
    //  consider it an error if we never got an encoding since we don't
    //  know what variant of EBCDIC it is.
    //
    if (!fTranscoder)
    {
        if (fEncoding == XMLRecognizer::EBCDIC)
            ThrowXML(RuntimeException, XMLExcepts::Reader_EncodingStrRequired);

        // Ask the transcoding service to make us a transcoder
        XMLTransService::Codes failReason;
        fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
        (
            fEncodingStr
            , failReason
            , kCharBufSize
        );

        if (!fTranscoder)
        {
            ThrowXML1
            (
                TranscodingException
                , XMLExcepts::Trans_CantCreateCvtrFor
                , fEncodingStr
            );
        }
    }

    //
    //  Add the number of source bytes eaten so far to the base src
    //  offset member.
    //
    for (startInd = 0; startInd < fCharIndex; startInd++)
        fSrcOfsBase += fCharSizeBuf[startInd];

    //
    //  If there are spare chars, then move them down to the bottom. We
    //  have to move the char sizes down also.
    //
    startInd = 0;
    if (spareChars)
    {
        for (unsigned int index = fCharIndex; index < fCharsAvail; index++)
        {
            fCharBuf[startInd] = fCharBuf[index];
            fCharSizeBuf[startInd] = fCharSizeBuf[index];
            startInd++;
        }
    }

    //
    //  And then get more chars, starting after any spare chars that were
    //  left over from the last time.
    //
    fCharsAvail = xcodeMoreChars
    (
        &fCharBuf[startInd]
        , &fCharSizeBuf[startInd]
        , kCharBufSize - spareChars
    );

    // Add back in the spare chars
    fCharsAvail += spareChars;

    // Reset the buffer index to zero, so we start from the 0th char again
    fCharIndex = 0;

    //
    //  If no chars available, then we have to check for one last thing. If
    //  this is reader for a PE and its not being expanded inside a literal,
    //  then unget a trailing space. We use a boolean to avoid triggering
    //  this more than once.
    //
    if (!fCharsAvail
    &&  (fType == Type_PE)
    &&  (fRefFrom == RefFrom_NonLiteral)
    &&  !fSentTrailingSpace)
    {
        //
        //  The space is made up and represents no data from the source, so
        //  its size has to be zero, just like the leading space slipped in
        //  by the constructor. Otherwise whatever size was left in this
        //  slot by an earlier block would be counted into the source
        //  offset once the space is consumed.
        //
        fCharSizeBuf[0] = 0;
        fCharBuf[0] = chSpace;
        fCharsAvail = 1;
        fSentTrailingSpace = true;
    }

    //
    //  If we are on our first block of chars and the encoding is one of the
    //  UTF-16 formats, then check the first char for the BOM and skip over
    //  it manually.
    //
    if (fCharsAvail)
    {
        if ((fCurLine == 1) && (fCurCol == 1))
        {
            if (((fEncoding == XMLRecognizer::UTF_16L)
            ||   (fEncoding == XMLRecognizer::UTF_16B))
            &&  !startInd)
            {
                if ((fCharBuf[startInd] == chUnicodeMarker)
                ||  (fCharBuf[startInd] == chSwappedUnicodeMarker))
                {
                    // Consume the marker through the normal get path
                    XMLCh chTmp;
                    getNextChar(chTmp);
                }
            }
        }
    }

    //
    //  If we get here with no more chars, then set the fNoMore flag which
    //  lets us optimize and know without checking that no more chars are
    //  available.
    //
    if (!fCharsAvail)
        fNoMore = true;

    return (fCharsAvail != 0);
}
// ---------------------------------------------------------------------------
// XMLReader: Scanning methods
// ---------------------------------------------------------------------------
//
//  Appends a name (or, when 'token' is true, a name token) found at the
//  current position to 'toFill' and moves the position and column past it.
//
//  For a name the first char has to pass isFirstNameChar(); for a name
//  token that check is skipped. After that, chars are taken for as long as
//  they pass isNameChar(), reloading the buffer as needed.
//
//  Returns false if the reader has no more chars or the first char cannot
//  start a name; otherwise returns whether 'toFill' is non-empty.
//
bool XMLReader::getName(XMLBuffer& toFill, const bool token)
{
    // Nothing buffered and nothing more to load means there is no name
    if ((fCharIndex == fCharsAvail) && !refreshCharBuffer())
        return false;

    // Only a real name, not a name token, restricts its first char
    if (!token)
    {
        const XMLCh firstCh = fCharBuf[fCharIndex];
        if (!XMLReader::isFirstNameChar(firstCh))
            return false;

        // Looks ok, so eat it and store it. Update the column also!
        fCharIndex++;
        toFill.append(firstCh);
        fCurCol++;
    }

    //
    //  Take name chars block by block. The scan works on local copies of
    //  the position and column and writes them back once per block.
    //
    for (;;)
    {
        unsigned int scanCol = fCurCol;
        unsigned int scanInd = fCharIndex;
        const unsigned int scanEnd = fCharsAvail;

        bool hitNonName = false;
        while (scanInd < scanEnd)
        {
            const XMLCh nextCh = fCharBuf[scanInd];
            if (!XMLReader::isNameChar(nextCh))
            {
                hitNonName = true;
                break;
            }

            toFill.append(nextCh);
            scanCol++;
            scanInd++;
        }

        fCharIndex = scanInd;
        fCurCol = scanCol;

        // Done on the first non-name char, or when nothing more can be loaded
        if (hitNonName || !refreshCharBuffer())
            break;
    }
    return !toFill.isEmpty();
}
//
//  Consumes the next char and hands it back through 'chGotten', keeping
//  the line and column members up to date. Returns false, leaving
//  'chGotten' untouched, when no char is available.
//
//  For external content a CR, together with an LF (or NEL when enabled)
//  directly after it, is returned as a single LF. A lone LF, or NEL when
//  enabled, is returned as LF regardless of the source.
//
bool XMLReader::getNextChar(XMLCh& chGotten)
{
    //
    //  See if there is at least a char in the buffer. Else, do the buffer
    //  reload logic.
    //
    if (fCharIndex >= fCharsAvail)
    {
        // Once the reader ran dry there is nothing else to give
        if (fNoMore)
            return false;

        // The buffer is used up, so try to refresh it
        if (fCharIndex == fCharsAvail)
        {
            refreshCharBuffer();

            // Still nothing in there, so give up
            if (fCharIndex == fCharsAvail)
                return false;
        }
    }

    chGotten = fCharBuf[fCharIndex++];

    if (chGotten == chCR)
    {
        //
        //  Content that is already internalized keeps its CR. External
        //  content gets an LF instead, and an LF (or enabled NEL) that
        //  directly follows the CR is swallowed.
        //
        if (fSource == Source_External)
        {
            if ((fCharIndex < fCharsAvail) || refreshCharBuffer())
            {
                const XMLCh follower = fCharBuf[fCharIndex];
                if ((follower == chLF) || ((follower == chNEL) && fNEL))
                    fCharIndex++;
            }
            chGotten = chLF;
        }

        // Either way a new line starts here
        fCurCol = 1;
        fCurLine++;
        return true;
    }

    if ((chGotten == chLF) || ((chGotten == chNEL) && fNEL))
    {
        chGotten = chLF;
        fCurLine++;
        fCurCol = 1;
        return true;
    }

    //
    //  Null chars are not part of the real content. They are just marker
    //  characters inserted into the stream, so they take up no column.
    //
    if (chGotten)
        fCurCol++;

    return true;
}
//
//  Appends the run of whitespace at the current position to 'toFill' and
//  moves past it, reloading the buffer as needed and keeping the line and
//  column members up to date.
//
//  Returns true if the run was ended by a non-whitespace char (which is
//  left unconsumed) and false if it was ended by running out of chars.
//
//  Line breaks are normalized on the way: for external content a CR,
//  together with an LF (or enabled NEL) directly after it, is stored as a
//  single LF; a lone LF or enabled NEL is stored as LF.
//
bool XMLReader::getSpaces(XMLBuffer& toFill)
{
    do
    {
        // Work through whatever is in the buffer right now
        while (fCharIndex < fCharsAvail)
        {
            XMLCh spaceCh = fCharBuf[fCharIndex];

            // A non-whitespace char ends the run and stays in the buffer
            if (!XMLReader::isWhitespace(spaceCh))
                return true;

            // Eat this char
            fCharIndex++;

            if (spaceCh == chCR)
            {
                fCurCol = 1;
                fCurLine++;

                //
                //  If not already internalized, then convert it to an LF
                //  and eat any directly following LF (or enabled NEL).
                //
                if (fSource == Source_External)
                {
                    if ((fCharIndex < fCharsAvail) || refreshCharBuffer())
                    {
                        const XMLCh follower = fCharBuf[fCharIndex];
                        if ((follower == chLF) || ((follower == chNEL) && fNEL))
                            fCharIndex++;
                    }
                    spaceCh = chLF;
                }
            }
            else if ((spaceCh == chLF) || ((spaceCh == chNEL) && fNEL))
            {
                spaceCh = chLF;
                fCurCol = 1;
                fCurLine++;
            }
            else
            {
                fCurCol++;
            }

            // Store the (possibly normalized) whitespace char
            toFill.append(spaceCh);
        }

        //
        //  The current buffer is eaten up, so try to reload it and go
        //  around again if that brought in anything new.
        //
    } while (refreshCharBuffer());

    return false;
}
//
//  Appends chars from the current position to 'toFill' until a whitespace
//  char or the char 'toCheck' is found, reloading the buffer as needed.
//  The char that stops the scan is left unconsumed.
//
//  Returns true if the scan was stopped by whitespace or by 'toCheck', and
//  false if the reader ran out of chars first.
//
bool XMLReader::getUpToCharOrWS(XMLBuffer& toFill, const XMLCh toCheck)
{
while (true)
{
// Loop through the current chars in the buffer
while (fCharIndex < fCharsAvail)
{
// Get the current char out of the buffer
XMLCh curCh = fCharBuf[fCharIndex];
//
// If it is neither whitespace nor our target char, then process it.
// Else, we need to return.
//
if (!XMLReader::isWhitespace(curCh) && (curCh != toCheck))
{
// Eat this char
fCharIndex++;
//
// Store the char, normalizing line breaks and updating the line
// and column info along the way.
//
// NOTE(review): the char has already failed the isWhitespace()
// test above, so the CR and LF branches below look unreachable if
// isWhitespace() reports CR and LF as whitespace - confirm before
// relying on them.
//
if (curCh == chCR)
{
fCurCol = 1;
fCurLine++;
//
// If not already internalized, then convert it to an
// LF and eat any following LF (or NEL, when enabled).
//
if (fSource == Source_External)
{
if ((fCharIndex < fCharsAvail) || refreshCharBuffer())
{
if (fCharBuf[fCharIndex] == chLF
|| ((fCharBuf[fCharIndex] == chNEL) && fNEL))
fCharIndex++;
}
curCh = chLF;
}
}
else if (curCh == chLF
|| ((curCh == chNEL) && fNEL))
{
curCh = chLF;
fCurCol = 1;
fCurLine++;
}
else
{
fCurCol++;
}
// Add it to our buffer
toFill.append(curCh);
}
else
{
// Stopped by whitespace or the target char, which stays in the buffer
return true;
}
}
//
// We've eaten up the current buffer, so lets try to reload it. If
// we don't get anything new, then break out. If we do, then we go
// back to the top to keep collecting chars.
//
if (!refreshCharBuffer())
break;
}
// We ate up the whole reader without hitting whitespace or the target char
return false;
}
//
//  Hands back the next char through 'chGotten' without consuming it. If no
//  char is available, 'chGotten' is set to chNull and false is returned.
//
//  For external content a CR, or a NEL when NEL handling is enabled, is
//  reported as LF so that a peek agrees with the line break normalization
//  done when the char is actually read.
//
bool XMLReader::peekNextChar(XMLCh& chGotten)
{
    // Reload if the buffer is used up; bail out if that yields nothing
    if ((fCharIndex >= fCharsAvail) && !refreshCharBuffer())
    {
        chGotten = chNull;
        return false;
    }

    chGotten = fCharBuf[fCharIndex];

    if (fSource == Source_External)
    {
        if ((chGotten == chCR) || ((chGotten == chNEL) && fNEL))
            chGotten = chLF;
    }
    return true;
}
//
//  If the next char is a single or double quote, consumes it, bumps the
//  column, stores the quote char in 'chGotten' and returns true. Otherwise
//  nothing is consumed, 'chGotten' is left alone and false is returned.
//
bool XMLReader::skipIfQuote(XMLCh& chGotten)
{
    // Reload if the buffer is used up; no chars means no quote
    if ((fCharIndex == fCharsAvail) && !refreshCharBuffer())
        return false;

    const XMLCh candidate = fCharBuf[fCharIndex];
    if ((candidate != chDoubleQuote) && (candidate != chSingleQuote))
        return false;

    chGotten = candidate;
    fCharIndex++;
    fCurCol++;
    return true;
}
bool XMLReader::skipSpaces(bool& skippedSomething)
{
// Remember the current line and column
unsigned int orgLine = fCurLine;
unsigned int orgCol = fCurCol;
//
// We enter a loop where we skip over spaces until we hit the end of
// this reader or a non-space value. The return indicates whether we
// hit the non-space (true) or the end (false).
//
while (true)
{
// Loop through the current chars in the buffer
while (fCharIndex < fCharsAvail)
{
// Get the current char out of the buffer
XMLCh curCh = fCharBuf[fCharIndex];
//
// See if its a white space char. If so, then process it. Else
// we've hit a non-space and need to return.
//
if (XMLReader::isWhitespace(curCh))
{
// Eat this char
fCharIndex++;
//
// Ok, we've got some whitespace here. So we have to store
// it. But we have to normalize it and update the line and
// column info along the way.
//
if (curCh == chCR)
{
fCurCol = 1;
fCurLine++;
//
// If not already internalized, then convert it to an
// LF and eat any following LF.
//
if (fSource == Source_External)
{
if ((fCharIndex < fCharsAvail) || refreshCharBuffer())
{
if (fCharBuf[fCharIndex] == chLF
|| ((fCharBuf[fCharIndex] == chNEL) && fNEL))
fCharIndex++;
}
curCh = chLF;
}
}
else if (curCh == chLF
|| ((curCh == chNEL) && fNEL))
{
curCh = chLF;
fCurCol = 1;
fCurLine++;
}
else
{
fCurCol++;
}
}
else
{
skippedSomething = (orgLine != fCurLine) || (orgCol != fCurCol);
return true;
}
}
//
// We've eaten up the current buffer, so lets try to reload it. If
// we don't get anything new, then break out. If we do, then we go
// back to the top to keep getting spaces.
//
if (!refreshCharBuffer())
break;
}
// We never hit any non-space and ate up the whole reader
skippedSomething = (orgLine != fCurLine) || (orgCol != fCurCol);
return false;
}
//
//  If the next char equals 'toSkip', consumes it, bumps the column and
//  returns true. Returns false, consuming nothing, if the next char is a
//  different one or if no char is available.
//
bool XMLReader::skippedChar(const XMLCh toSkip)
{
    // Reload if the buffer is used up; no chars means no match
    if ((fCharIndex == fCharsAvail) && !refreshCharBuffer())
        return false;

    if (fCharBuf[fCharIndex] != toSkip)
        return false;

    // It is the one we want, so eat it
    fCharIndex++;
    fCurCol++;
    return true;
}
bool XMLReader::skippedSpace()
{
//
// If the buffer is empty, then try to reload it. If we still get
// nothing, then return false.
//
if (fCharIndex == fCharsAvail)
{
if (!refreshCharBuffer())
return false;
}
//
// See if the current char is a whitespace. If so, then we need to eat
// it and return true.
//
const XMLCh curCh = fCharBuf[fCharIndex];
if (XMLReader::isWhitespace(curCh))
{
// Eat the character
fCharIndex++;
if (curCh == chCR)
{
fCurLine++;
fCurCol = 1;
if (fSource == Source_External)
{
if ((fCharIndex < fCharsAvail) || refreshCharBuffer())
{
if (fCharBuf[fCharIndex] == chLF
|| ((fCharBuf[fCharIndex] == chNEL) && fNEL))
fCharIndex++;
}
}
}
else if (curCh == chLF
|| ((curCh == chNEL) && fNEL))
{
fCurLine++;
fCurCol = 1;
}
else
{
fCurCol++;
}
return true;
}
return false;
}
bool XMLReader::skippedString(const XMLCh* const toSkip)
{
// Get the length of the string to skip
const unsigned int srcLen = XMLString::stringLen(toSkip);
//
// See if the current reader has enough chars to test against this
// string. If not, then ask it to reload its buffer. If that does not
// get us enough, then it cannot match.
//
// NOTE: This works because strings never have to cross a reader! And
// a string to skip will never have a new line in it, so we will never
// miss adjusting the current line.
//
unsigned int charsLeft = charsLeftInBuffer();
while (charsLeft < srcLen)
{
refreshCharBuffer();
unsigned int t = charsLeftInBuffer();
if (t == charsLeft) // if the refreshCharBuf() did not add anything new
return false; // give up and return.
charsLeft = t;
}
//
// Ok, now we now that the current reader has enough chars in its
// buffer and that its index is back at zero. So we can do a quick and
// dirty comparison straight to its buffer with no requirement to unget
// if it fails.
//
if (XMLString::compareNString(&fCharBuf[fCharIndex], toSkip, srcLen))
return false;
// Add the source length to the current column to get it back right
fCurCol += srcLen;
//
// And get the character buffer index back right by just adding the
// source len to it.
//
fCharIndex += srcLen;
return true;
}
//
//  Reports whether the upcoming chars match the string 'toPeek'. This is
//  the look-ahead half of skippedString(): neither fCharIndex nor fCurCol
//  is moved past the string.
//
bool XMLReader::peekString(const XMLCh* const toPeek)
{
    const unsigned int peekLen = XMLString::stringLen(toPeek);

    //
    //  Make sure the buffer holds enough chars to test against the string.
    //  Keep reloading for as long as that adds something; once a reload
    //  adds nothing new the string cannot match.
    //
    for (unsigned int avail = charsLeftInBuffer(); avail < peekLen; )
    {
        refreshCharBuffer();

        const unsigned int nowAvail = charsLeftInBuffer();
        if (nowAvail == avail)
            return false;
        avail = nowAvail;
    }

    //
    //  Enough chars are buffered, so compare straight against the buffer.
    //  Nothing was consumed, so there is nothing to unget on a mismatch.
    //
    return XMLString::compareNString(&fCharBuf[fCharIndex], toPeek, peekLen) == 0;
}
// ---------------------------------------------------------------------------
// XMLReader: Setter methods (most are inlined)
// ---------------------------------------------------------------------------
bool XMLReader::setEncoding(const XMLCh* const newEncoding)
{
//
// If the encoding was forced, then we ignore the new value and just
// return with success. If it was forced, then we are to use that
// encoding without question. Note that, if we are forced, we created
// a transcoder up front so there is no need to do one here in that
// case.
//
if (fForcedEncoding)
return true;
// Clean up the old encoding string
// Do not delete until we know we have a good encoding
// if (fEncodingStr)
// {
// delete [] fEncodingStr;
// fEncodingStr = 0;
// }
//
// Try to map the string to one of our standard encodings. If its not
// one of them, then it has to be one of the non-intrinsic encodings,
// in which case we have to delete our intrinsic encoder and create a
// new one.
//
XMLRecognizer::Encodings newBaseEncoding = XMLRecognizer::encodingForName
(
newEncoding
);
//
// If it does not come back as one of the auto-sensed encodings, then we
// have to possibly replace it and at least check a few things.
//
if (newBaseEncoding == XMLRecognizer::OtherEncoding)
{
//
// Check for non-endian specific UTF-16 or UCS-4. If so, and if we
// are already in one of the endian versions of those encodings,
// then just keep it and go on. Otherwise, its not valid.
//
if (!XMLString::compareIString(newEncoding, XMLUni::fgUTF16EncodingString)
|| !XMLString::compareIString(newEncoding, XMLUni::fgUTF16EncodingString2)
|| !XMLString::compareIString(newEncoding, XMLUni::fgUTF16EncodingString3)
|| !XMLString::compareIString(newEncoding, XMLUni::fgUTF16EncodingString4))
{
if ((fEncoding != XMLRecognizer::UTF_16L)
&& (fEncoding != XMLRecognizer::UTF_16B))
{
return false;
}
// Override with the original endian specific encoding
newBaseEncoding = fEncoding;
if (fEncoding == XMLRecognizer::UTF_16L) {
delete [] fEncodingStr;
fEncodingStr = XMLString::replicate(XMLUni::fgUTF16LEncodingString);
}
else {
delete [] fEncodingStr;
fEncodingStr = XMLString::replicate(XMLUni::fgUTF16BEncodingString);
}
}
else if (!XMLString::compareIString(newEncoding, XMLUni::fgUCS4EncodingString)
|| !XMLString::compareIString(newEncoding, XMLUni::fgUCS4EncodingString2)
|| !XMLString::compareIString(newEncoding, XMLUni::fgUCS4EncodingString3))
{
if ((fEncoding != XMLRecognizer::UCS_4L)
&& (fEncoding != XMLRecognizer::UCS_4B))
{
return false;
}
// Override with the original endian specific encoding
newBaseEncoding = fEncoding;
if (fEncoding == XMLRecognizer::UCS_4L) {
delete [] fEncodingStr;
fEncodingStr = XMLString::replicate(XMLUni::fgUCS4LEncodingString);
}
else {
delete [] fEncodingStr;
fEncodingStr = XMLString::replicate(XMLUni::fgUCS4BEncodingString);
}
}
else
{
// None of those special cases, so just replicate the new name
delete [] fEncodingStr;
fEncodingStr = XMLString::replicate(newEncoding);
}
}
else
{
// Store the new encoding string since it is just an intrinsic
delete [] fEncodingStr;
fEncodingStr = XMLString::replicate(newEncoding);
}
//
// Now we can create a transcoder using the transcoding service. We
// might get back a transcoder for an intrinsically supported encoding,
// or we might get one from the underlying transcoding service.
//
XMLTransService::Codes failReason;
fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
(
fEncodingStr
, failReason
, kCharBufSize
);
if (!fTranscoder)
ThrowXML1(TranscodingException, XMLExcepts::Trans_CantCreateCvtrFor, fEncodingStr);
// Update the base encoding member with the new base encoding found
fEncoding = newBaseEncoding;
// Looks ok to us
return true;
}
// ---------------------------------------------------------------------------
// XMLReader: Private static init methods
// ---------------------------------------------------------------------------
//
//  Reports whether 'toCheck' is listed in the passed table.
//
//  The table is read as two sections, each ended by a zero entry: first a
//  list of (low, high) range pairs, then a list of single chars. The char
//  matches if it falls inside one of the ranges (bounds included) or
//  equals one of the singles.
//
bool XMLReader::checkTable( const   XMLCh* const    theTable
                            , const XMLCh           toCheck)
{
    const XMLCh* curTable = theTable;

    // Check the ranges
    while (*curTable)
    {
        //
        //  If the test char is less than the low end of this range, stop
        //  testing ranges. That leaves the pointer on the high end of the
        //  range.
        //
        if (toCheck < *curTable++)
            break;

        // If its less than or equal to the top of the range, then a match
        if (toCheck <= *curTable++)
            return true;
    }

    //
    //  Run the pointer up past the range terminator. This has to happen on
    //  both ways out of the loop above: after a break the pointer is still
    //  inside the range list, and after a natural exit it sits right on
    //  the terminator. Without stepping over it in the latter case the
    //  singles list would look empty and never be tested.
    //
    while (*curTable++)
    {
        // Purposefully empty, we are just running up the pointer
    }

    // And now test against singles
    while (*curTable)
    {
        if (toCheck == *curTable++)
            return true;
    }
    return false;
}
// ---------------------------------------------------------------------------
// XMLReader: Private helper methods
// ---------------------------------------------------------------------------
//
//  Sets the fSwapped flag from the current encoding and the endian mode
//  this code was built for. It is called once fEncoding has been set.
//
//  The flag is true when the encoding is the UTF-16 or UCS-4 flavor of the
//  opposite endianness to the build, and false otherwise (including when
//  neither endian mode macro is defined).
//
void XMLReader::checkForSwapped()
{
#if defined(ENDIANMODE_LITTLE)
    fSwapped = (fEncoding == XMLRecognizer::UTF_16B)
            || (fEncoding == XMLRecognizer::UCS_4B);
#elif defined(ENDIANMODE_BIG)
    fSwapped = (fEncoding == XMLRecognizer::UTF_16L)
            || (fEncoding == XMLRecognizer::UCS_4L);
#else
    fSwapped = false;
#endif
}
//
// This is called from the constructor when the encoding is not forced.
// We assume that the encoding has been auto-sensed at this point and that
// fSwapped is set correctly.
//
// In the case of UCS-4 and EBCDIC, we don't have to check for a decl.
// The fact that we got here, means that there is one, because that's the
// only way we can autosense those.
//
void XMLReader::doInitDecode()
{
switch(fEncoding)
{
case XMLRecognizer::UCS_4B :
case XMLRecognizer::UCS_4L :
{
// Look at the raw buffer as UCS4 chars
const UCS4Ch* asUCS = (const UCS4Ch*)fRawByteBuf;
while (fRawBufIndex < fRawBytesAvail)
{
// Get out the current 4 byte value and inc our raw buf index
UCS4Ch curVal = *asUCS++;
fRawBufIndex += sizeof(UCS4Ch);
// Swap if that is required for this machine
if (fSwapped)
curVal = BitOps::swapBytes(curVal);
// Make sure its at least semi legal. If not, undo and throw
if (curVal > 0xFFFF)
{
fCharsAvail = 0;
fRawBufIndex = 0;
ThrowXML1
(
TranscodingException
, XMLExcepts::Reader_CouldNotDecodeFirstLine
, fSystemId
);
}
// Convert the value to an XML char and store it
fCharSizeBuf[fCharsAvail] = 4;
fCharBuf[fCharsAvail++] = XMLCh(curVal);
// Break out on the > character
if (curVal == chCloseAngle)
break;
}
break;
}
case XMLRecognizer::UTF_8 :
{
// If there's a utf-8 BOM (0xEF 0xBB 0xBF), skip past it.
// Don't move to char buf - no one wants to see it.
// Note: this causes any encoding= declaration to override
// the BOM's attempt to say that the encoding is utf-8.
// Look at the raw buffer as short chars
const char* asChars = (const char*)fRawByteBuf;
if (fRawBytesAvail > XMLRecognizer::fgUTF8BOMLen &&
XMLString::compareNString( asChars
, XMLRecognizer::fgUTF8BOM
, XMLRecognizer::fgUTF8BOMLen) == 0)
{
fRawBufIndex += XMLRecognizer::fgUTF8BOMLen;
asChars += XMLRecognizer::fgUTF8BOMLen;
}
//
// First check that there are enough bytes to even see the
// decl indentifier. If not, get out now with no action since
// there is no decl.
//
if (fRawBytesAvail < XMLRecognizer::fgASCIIPreLen)
break;
// Check for the opening sequence. If not, then no decl
if (XMLString::compareNString( asChars
, XMLRecognizer::fgASCIIPre
, XMLRecognizer::fgASCIIPreLen))
{
break;
}
while (fRawBufIndex < fRawBytesAvail)
{
const char curCh = *asChars++;
fRawBufIndex++;
// Looks ok, so store it
fCharSizeBuf[fCharsAvail] = 1;
fCharBuf[fCharsAvail++] = XMLCh(curCh);
// Break out on a > character
if (curCh == chCloseAngle)
break;
//
// A char greater than 0x7F is not allowed in this case. If
// so, undo and throw.
//
if (curCh & 0x80)
{
fCharsAvail = 0;
fRawBufIndex = 0;
ThrowXML1
(
TranscodingException
, XMLExcepts::Reader_CouldNotDecodeFirstLine
, fSystemId
);
}
}
break;
}
case XMLRecognizer::UTF_16B :
case XMLRecognizer::UTF_16L :
{
//
// If there is a decl here, we just truncate back the characters
// as we go. No surrogate creation would be allowed here in legal
// XML, so we consider it a transoding error if we find one.
//
if (fRawBytesAvail < 2)
break;
const UTF16Ch* asUTF16 = (const UTF16Ch*)&fRawByteBuf[fRawBufIndex];
if ((*asUTF16 == chUnicodeMarker) || (*asUTF16 == chSwappedUnicodeMarker))
{
fRawBufIndex += sizeof(UTF16Ch);
asUTF16++;
}
// First check that there are enough raw bytes for there to even
// be a decl indentifier. If not, then nothing to do.
//
if (fRawBytesAvail - fRawBufIndex < XMLRecognizer::fgUTF16PreLen)
{
fRawBufIndex = 0;
break;
}
//
// See we get a match on the prefix. If not, then reset and
// break out.
//