XMLReader.cpp
上传用户:huihehuasu
上传日期:2007-01-10
资源大小:6948k
文件大小:460k
源码类别:
xml/soap/webservice
开发平台:
C/C++
- /*
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 1999-2001 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Xerces" and "Apache Software Foundation" must
- * not be used to endorse or promote products derived from this
- * software without prior written permission. For written
- * permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * nor may "Apache" appear in their name, without prior written
- * permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation, and was
- * originally based on software copyright (c) 1999, International
- * Business Machines, Inc., http://www.ibm.com . For more information
- * on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
- /*
- * $Id: XMLReader.cpp,v 1.30 2001/12/06 17:47:04 tng Exp $
- */
- // ---------------------------------------------------------------------------
- // Includes
- // ---------------------------------------------------------------------------
- #include <util/BitOps.hpp>
- #include <util/BinInputStream.hpp>
- #include <util/PlatformUtils.hpp>
- #include <util/RuntimeException.hpp>
- #include <util/TranscodingException.hpp>
- #include <util/TransService.hpp>
- #include <util/UTFDataFormatException.hpp>
- #include <util/XMLEBCDICTranscoder.hpp>
- #include <util/XMLString.hpp>
- #include <util/XMLUni.hpp>
- #include <sax/InputSource.hpp>
- #include <framework/XMLBuffer.hpp>
- #include <internal/CharTypeTables.hpp>
- #include <internal/XMLReader.hpp>
- #include <internal/XMLScanner.hpp>
- #include <string.h>
- // ---------------------------------------------------------------------------
- // XMLReader: static data initialization
- // ---------------------------------------------------------------------------
- // Static flag shared by all readers. It starts out false and is switched on
- // by enableNELWS(); once set, the scanning methods in this file handle chNEL
- // the same way they handle chLF.
- bool XMLReader::fNEL = false;
- // ---------------------------------------------------------------------------
- // XMLReader: Public, static methods
- // ---------------------------------------------------------------------------
- //
- //  Returns true if the passed character may start a name: either its entry
- //  in the character table carries the base-char or letter bit, or it is one
- //  of the two special cases, underscore and colon.
- //
- bool XMLReader::isFirstNameChar(const XMLCh toCheck)
- {
-     static const XMLByte nameStartMask = gBaseCharMask | gLetterCharMask;
-
-     const bool flaggedInTable = (fgCharCharsTable[toCheck] & nameStartMask) != 0;
-     const bool isSpecialCase  = (toCheck == chUnderscore) || (toCheck == chColon);
-
-     return flaggedInTable || isSpecialCase;
- }
- //
- //  Returns true only if every one of the first 'count' characters of the
- //  passed buffer is whitespace according to the character table. Scanning
- //  stops at the first non-whitespace character. A count of zero yields true.
- //
- bool XMLReader::isAllSpaces(const XMLCh* const toCheck
-                           , const unsigned int count)
- {
-     for (unsigned int pos = 0; pos < count; pos++)
-     {
-         if ((fgCharCharsTable[toCheck[pos]] & gWhitespaceCharMask) == 0)
-             return false;
-     }
-     return true;
- }
- //
- //  Returns true if at least one of the first 'count' characters of the
- //  passed buffer is whitespace according to the character table. Scanning
- //  stops at the first whitespace character. A count of zero yields false.
- //
- bool XMLReader::containsWhiteSpace(const XMLCh* const toCheck
-                                  , const unsigned int count)
- {
-     for (unsigned int pos = 0; pos < count; pos++)
-     {
-         if ((fgCharCharsTable[toCheck[pos]] & gWhitespaceCharMask) != 0)
-             return true;
-     }
-     return false;
- }
- //
- //  Returns true if the passed character appears in the public id character
- //  table. This check is not called terribly often, so it is done by walking
- //  a table rather than spending a bit of the character characteristics
- //  table on it; those bits are kept for more frequently used checks.
- //
- bool XMLReader::isPublicIdChar(const XMLCh toCheck)
- {
-     const bool listed = checkTable(gPublicIdChars, toCheck);
-     return listed;
- }
- //
- //  Turns on the option that makes NEL behave like LF. The first call sets
- //  the static fNEL flag and copies the character table entry of chLF over
- //  the entry of chNEL; later calls do nothing.
- //
- void XMLReader::enableNELWS()
- {
-     // Already enabled, so the table has been patched before
-     if (fNEL)
-         return;
-
-     fNEL = true;
-
-     // From now on NEL carries exactly the same characteristics as LF
-     fgCharCharsTable[chNEL] = fgCharCharsTable[chLF];
- }
- // ---------------------------------------------------------------------------
- // XMLReader: Constructors and Destructor
- // ---------------------------------------------------------------------------
- //
- //  Constructs a reader whose encoding is not forced. The public and system
- //  ids are replicated, the passed stream is stored in fStream (the destructor
- //  deletes it), and the encoding is auto-sensed from the first block of raw
- //  bytes. No transcoder is created here.
- //
- //  In builds with XERCES_DEBUG defined, a RuntimeException is thrown if the
- //  auto-sensed encoding falls outside the recognizer's valid range.
- //
- //  NOTE(review): an exception thrown from this body bypasses the destructor;
- //  the strings replicated in the initializer list do not appear to be
- //  released on that path -- confirm.
- //
- XMLReader::XMLReader(const XMLCh* const pubId
- , const XMLCh* const sysId
- , BinInputStream* const streamToAdopt
- , const RefFrom from
- , const Types type
- , const Sources source
- , const bool throwAtEnd) :
- fCharIndex(0)
- , fCharsAvail(0)
- , fCurCol(1)
- , fCurLine(1)
- , fEncodingStr(0)
- , fForcedEncoding(false)
- , fNoMore(false)
- , fPublicId(XMLString::replicate(pubId))
- , fRawBufIndex(0)
- , fRawBytesAvail(0)
- , fReaderNum(0xFFFFFFFF)
- , fRefFrom(from)
- , fSentTrailingSpace(false)
- , fSource(source)
- , fSpareCh(0)
- , fSrcOfsBase(0)
- , fSrcOfsSupported(false)
- , fStream(streamToAdopt)
- , fSystemId(XMLString::replicate(sysId))
- , fSwapped(false)
- , fThrowAtEnd(throwAtEnd)
- , fTranscoder(0)
- , fType(type)
- {
- // Do an initial load of raw bytes
- refreshRawBuffer();
- // Ask the transcoding service if it supports src offset info
- fSrcOfsSupported = XMLPlatformUtils::fgTransService->supportsSrcOfs();
- //
- // Use the recognizer class to get a basic sense of what family of
- // encodings this file is in. We'll start off with a reader of that
- // type, and update it later if needed when we read the XMLDecl line.
- //
- fEncoding = XMLRecognizer::basicEncodingProbe(fRawByteBuf, fRawBytesAvail);
- #if defined(XERCES_DEBUG)
- // Debug-only sanity check on the value handed back by the probe
- if ((fEncoding < XMLRecognizer::Encodings_Min)
- || (fEncoding > XMLRecognizer::Encodings_Max))
- {
- ThrowXML(RuntimeException, XMLExcepts::Reader_BadAutoEncoding);
- }
- #endif
- // Keep our own copy of the name of the auto-sensed encoding
- fEncodingStr = XMLString::replicate(XMLRecognizer::nameForEncoding(fEncoding));
- // Check whether the fSwapped flag should be set or not
- checkForSwapped();
- //
- // This will check to see if the first line is an XMLDecl and, if
- // so, decode that first line manually one character at a time. This
- // leaves enough characters in the buffer that the high level code
- // can get through the Decl and call us back with the real encoding.
- //
- doInitDecode();
- //
- // NOTE: We won't create a transcoder until we either get a call to
- // setEncoding() or we get a call to refreshCharBuffer() and no
- // transcoder has been set yet.
- //
- }
- //
- //  Constructs a reader whose encoding is forced by the caller. The public
- //  id, system id and encoding name are replicated, the passed stream is
- //  stored in fStream (the destructor deletes it), and a transcoder for the
- //  forced encoding is created immediately.
- //
- //  Throws a TranscodingException if no transcoder can be created for
- //  'encodingStr'. Because the destructor does not run when a constructor
- //  throws, the strings replicated so far are released here before the
- //  exception is passed on.
- //
- XMLReader::XMLReader(const XMLCh* const pubId
- , const XMLCh* const sysId
- , BinInputStream* const streamToAdopt
- , const XMLCh* const encodingStr
- , const RefFrom from
- , const Types type
- , const Sources source
- , const bool throwAtEnd) :
- fCharIndex(0)
- , fCharsAvail(0)
- , fCurCol(1)
- , fCurLine(1)
- , fEncoding(XMLRecognizer::UTF_8)
- , fEncodingStr(0)
- , fForcedEncoding(true)
- , fNoMore(false)
- , fPublicId(XMLString::replicate(pubId))
- , fRawBufIndex(0)
- , fRawBytesAvail(0)
- , fReaderNum(0xFFFFFFFF)
- , fRefFrom(from)
- , fSentTrailingSpace(false)
- , fSource(source)
- , fSpareCh(0)
- , fSrcOfsBase(0)
- , fSrcOfsSupported(false)
- , fStream(streamToAdopt)
- , fSystemId(XMLString::replicate(sysId))
- , fSwapped(false)
- , fThrowAtEnd(throwAtEnd)
- , fTranscoder(0)
- , fType(type)
- {
-     try
-     {
-         // Do an initial load of raw bytes
-         refreshRawBuffer();
-
-         // Copy the encoding string to our member
-         fEncodingStr = XMLString::replicate(encodingStr);
-
-         // Ask the transcoding service if it supports src offset info
-         fSrcOfsSupported = XMLPlatformUtils::fgTransService->supportsSrcOfs();
-
-         //
-         //  Map the passed encoding name to one of our enums. If it does not
-         //  match one of the intrinsic encodings, it will come back 'other',
-         //  which tells us to create a transcoder based reader.
-         //
-         fEncoding = XMLRecognizer::encodingForName(fEncodingStr);
-
-         // Check whether the fSwapped flag should be set or not
-         checkForSwapped();
-
-         //
-         //  Create a transcoder for the encoding. Since the encoding has been
-         //  forced, this will be the one we will use, period.
-         //
-         XMLTransService::Codes failReason;
-         fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
-         (
-             fEncodingStr
-             , failReason
-             , kCharBufSize
-         );
-
-         if (!fTranscoder)
-         {
-             ThrowXML1
-             (
-                 TranscodingException
-                 , XMLExcepts::Trans_CantCreateCvtrFor
-                 , fEncodingStr
-             );
-         }
-
-         //
-         //  Note that, unlike the non-forced constructor, we do not do an
-         //  initial decode of the first line. We take the caller's word that
-         //  the encoding is correct and just assume that the first bulk
-         //  decode (kicked off by the first get of a character) will work.
-         //
-         //  So we do here the slipping in of the leading space if required.
-         //
-         if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral))
-         {
-             // This represents no data from the source
-             fCharSizeBuf[fCharsAvail] = 0;
-             fCharBuf[fCharsAvail++] = chSpace;
-         }
-     }
-     catch (...)
-     {
-         //
-         //  The destructor will not be called for a partially constructed
-         //  object, so free the strings we replicated. The encoding string
-         //  is only released here, after the exception object has been
-         //  built from it.
-         //
-         //  NOTE(review): fStream is deliberately left untouched; whether
-         //  the caller reclaims the stream when construction fails is not
-         //  visible from this file -- confirm before deleting it here.
-         //
-         delete [] fEncodingStr;
-         delete [] fPublicId;
-         delete [] fSystemId;
-         throw;
-     }
- }
- //
- //  Releases everything the reader holds: the three strings replicated by
- //  the constructors (and by setEncoding()), the stream that was passed in
- //  for adoption, and the transcoder if one was ever created.
- //
- XMLReader::~XMLReader()
- {
- // Array form, since these were obtained from XMLString::replicate()
- delete [] fEncodingStr;
- delete [] fPublicId;
- delete [] fSystemId;
- // Single objects; fTranscoder may still be zero here
- delete fStream;
- delete fTranscoder;
- }
- // ---------------------------------------------------------------------------
- // XMLReader: Character buffer management methods
- // ---------------------------------------------------------------------------
- //
- //  Returns the current offset into the source: the base offset recorded at
- //  the last buffer refresh plus the per-character source sizes of every
- //  character already consumed from the current buffer.
- //
- //  Throws a RuntimeException if source offsets are not supported.
- //
- unsigned int XMLReader::getSrcOffset() const
- {
-     if (!fSrcOfsSupported)
-         ThrowXML(RuntimeException, XMLExcepts::Reader_SrcOfsNotSupported);
-
-     unsigned int total = fSrcOfsBase;
-     unsigned int pos = 0;
-     while (pos < fCharIndex)
-         total += fCharSizeBuf[pos++];
-
-     return total;
- }
- //
- //  Refills the character buffer by transcoding more source data. Any
- //  characters that have not been consumed yet are first moved to the bottom
- //  of the buffer, together with their source sizes, and the buffer index is
- //  reset to zero.
- //
- //  Returns true if at least one character is available afterwards. Returns
- //  false if the reader was already known to be exhausted, if the buffer is
- //  completely full of unconsumed characters, or if nothing more could be
- //  obtained.
- //
- //  Throws a RuntimeException if EBCDIC was auto-sensed but no encoding was
- //  ever set, and a TranscodingException if a transcoder cannot be created.
- //
- bool XMLReader::refreshCharBuffer()
- {
-     // If the no more flag is set, then don't bother doing anything
-     if (fNoMore)
-         return false;
-
-     unsigned int startInd;
-
-     // See if we have any existing chars.
-     const unsigned int spareChars = fCharsAvail - fCharIndex;
-
-     // If we are full, then don't do anything.
-     if (spareChars == kCharBufSize)
-         return false;
-
-     //
-     //  If no transcoder has been created yet, then we never saw any
-     //  encoding="" string and the encoding was not forced, so lets create
-     //  one now. We know that it won't change now.
-     //
-     //  However, note that if we autosensed EBCDIC, then we have to
-     //  consider it an error if we never got an encoding since we don't
-     //  know what variant of EBCDIC it is.
-     //
-     if (!fTranscoder)
-     {
-         if (fEncoding == XMLRecognizer::EBCDIC)
-             ThrowXML(RuntimeException, XMLExcepts::Reader_EncodingStrRequired);
-
-         // Ask the transcoding service to make us a transcoder
-         XMLTransService::Codes failReason;
-         fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
-         (
-             fEncodingStr
-             , failReason
-             , kCharBufSize
-         );
-
-         if (!fTranscoder)
-         {
-             ThrowXML1
-             (
-                 TranscodingException
-                 , XMLExcepts::Trans_CantCreateCvtrFor
-                 , fEncodingStr
-             );
-         }
-     }
-
-     //
-     //  Add the number of source bytes eaten so far to the base src
-     //  offset member.
-     //
-     for (startInd = 0; startInd < fCharIndex; startInd++)
-         fSrcOfsBase += fCharSizeBuf[startInd];
-
-     //
-     //  If there are spare chars, then move them down to the bottom. We
-     //  have to move the char sizes down also.
-     //
-     startInd = 0;
-     if (spareChars)
-     {
-         for (unsigned int index = fCharIndex; index < fCharsAvail; index++)
-         {
-             fCharBuf[startInd] = fCharBuf[index];
-             fCharSizeBuf[startInd] = fCharSizeBuf[index];
-             startInd++;
-         }
-     }
-
-     //
-     //  And then get more chars, starting after any spare chars that were
-     //  left over from the last time.
-     //
-     fCharsAvail = xcodeMoreChars
-     (
-         &fCharBuf[startInd]
-         , &fCharSizeBuf[startInd]
-         , kCharBufSize - spareChars
-     );
-
-     // Add back in the spare chars
-     fCharsAvail += spareChars;
-
-     // Reset the buffer index to zero, so we start from the 0th char again
-     fCharIndex = 0;
-
-     //
-     //  If no chars available, then we have to check for one last thing. If
-     //  this is reader for a PE and its not being expanded inside a literal,
-     //  then unget a trailing space. We use a boolean to avoid triggering
-     //  this more than once.
-     //
-     if (!fCharsAvail
-     &&  (fType == Type_PE)
-     &&  (fRefFrom == RefFrom_NonLiteral)
-     &&  !fSentTrailingSpace)
-     {
-         //
-         //  The synthetic space represents no data from the source, so its
-         //  size entry must be zero. Without this, whatever size was left in
-         //  slot 0 by an earlier fill would be added to the source offset
-         //  once the space has been consumed.
-         //
-         fCharSizeBuf[0] = 0;
-         fCharBuf[0] = chSpace;
-         fCharsAvail = 1;
-         fSentTrailingSpace = true;
-     }
-
-     //
-     //  If we are on our first block of chars and the encoding is one of the
-     //  UTF-16 formats, then check the first char for the BOM and skip over
-     //  it manually.
-     //
-     if (fCharsAvail)
-     {
-         if ((fCurLine == 1) && (fCurCol == 1))
-         {
-             if (((fEncoding == XMLRecognizer::UTF_16L)
-             ||   (fEncoding == XMLRecognizer::UTF_16B))
-             &&  !startInd)
-             {
-                 if ((fCharBuf[startInd] == chUnicodeMarker)
-                 ||  (fCharBuf[startInd] == chSwappedUnicodeMarker))
-                 {
-                     // Consume the marker through the normal get path
-                     XMLCh chTmp;
-                     getNextChar(chTmp);
-                 }
-             }
-         }
-     }
-
-     //
-     //  If we get here with no more chars, then set the fNoMore flag which
-     //  lets us optimize and know without checking that no more chars are
-     //  available.
-     //
-     if (!fCharsAvail)
-         fNoMore = true;
-
-     return (fCharsAvail != 0);
- }
- // ---------------------------------------------------------------------------
- // XMLReader: Scanning methods
- // ---------------------------------------------------------------------------
- //
- //  Appends a name (or, when 'token' is true, a name token) taken from the
- //  current position to 'toFill'. For a name the first character must pass
- //  isFirstNameChar(); after that, characters are taken for as long as they
- //  pass isNameChar(), reloading the buffer as needed. The column counter is
- //  advanced once per character taken.
- //
- //  Returns false if no data is available, or if a name was requested and
- //  the first character cannot start one. Otherwise returns whether
- //  'toFill' is non-empty when scanning stops.
- //
- bool XMLReader::getName(XMLBuffer& toFill, const bool token)
- {
-     // Make sure there is something in the buffer to look at
-     if ((fCharIndex == fCharsAvail) && !refreshCharBuffer())
-         return false;
-
-     // A name, unlike a name token, is picky about its first character
-     if (!token)
-     {
-         const XMLCh firstCh = fCharBuf[fCharIndex];
-         if (!XMLReader::isFirstNameChar(firstCh))
-             return false;
-
-         // Eat it, store it, and account for it in the column
-         fCharIndex++;
-         toFill.append(firstCh);
-         fCurCol++;
-     }
-
-     // Take name chars until a non-name char shows up or the data runs out
-     for (;;)
-     {
-         // Work on local copies and write them back when leaving the scan
-         unsigned int col = fCurCol;
-         unsigned int pos = fCharIndex;
-         const unsigned int limit = fCharsAvail;
-
-         for (; pos < limit; pos++, col++)
-         {
-             const XMLCh nextCh = fCharBuf[pos];
-             if (!XMLReader::isNameChar(nextCh))
-             {
-                 fCharIndex = pos;
-                 fCurCol = col;
-                 return !toFill.isEmpty();
-             }
-             toFill.append(nextCh);
-         }
-
-         fCharIndex = pos;
-         fCurCol = col;
-
-         // Buffer used up; stop if nothing more can be loaded
-         if (!refreshCharBuffer())
-             break;
-     }
-     return !toFill.isEmpty();
- }
- //
- // Consumes the next character and returns it through 'chGotten'. Returns
- // false, leaving 'chGotten' untouched, when no character can be obtained.
- //
- // Line and column members are maintained here. For external sources a CR,
- // or a CR followed by LF (or by NEL when fNEL is set), is returned as a
- // single LF. A lone LF, or NEL when fNEL is set, is returned as LF.
- //
- bool XMLReader::getNextChar(XMLCh& chGotten)
- {
- //
- // See if there is at least a char in the buffer. Else, do the buffer
- // reload logic.
- //
- if (fCharIndex >= fCharsAvail)
- {
- // If fNoMore is set, then we have nothing else to give
- if (fNoMore)
- return false;
- // If the buffer is empty, then try to refresh
- if (fCharIndex == fCharsAvail)
- {
- refreshCharBuffer();
- // If still empty, then return false
- if (fCharIndex == fCharsAvail)
- return false;
- }
- }
- chGotten = fCharBuf[fCharIndex++];
- // Handle end of line normalization and line/col member maintenance.
- if (chGotten == chCR)
- {
- //
- // Do the normalization. We return chLF regardless of which was
- // found. We also eat a chCR followed by an chLF.
- //
- // We only do this if the content being spooled is not already
- // internalized.
- //
- if (fSource == Source_External)
- {
- //
- // See if we have another char left. If not, don't bother.
- // Else, see if its an chLF to eat. If it is, bump the
- // index again.
- //
- if ((fCharIndex < fCharsAvail) || refreshCharBuffer())
- {
- if (fCharBuf[fCharIndex] == chLF
- || ((fCharBuf[fCharIndex] == chNEL) && fNEL))
- fCharIndex++;
- }
- // And return just an chLF
- chGotten = chLF;
- }
- // And handle the line/col stuff
- fCurCol = 1;
- fCurLine++;
- }
- else if (chGotten == chLF
- || ((chGotten == chNEL) && fNEL))
- {
- // A bare line feed (or NEL, when enabled) also starts a new line
- chGotten = chLF;
- fCurLine++;
- fCurCol = 1;
- }
- else if (chGotten)
- {
- //
- // Only do this if it is not a null char. Null chars are not part of
- // the real content. They are just marker characters inserted into
- // the stream.
- //
- fCurCol++;
- }
- return true;
- }
- //
- // Consumes consecutive whitespace characters and appends them to 'toFill'.
- // Returns true if scanning stopped at a non-whitespace character (which is
- // left unconsumed) and false if the reader ran out of characters.
- //
- bool XMLReader::getSpaces(XMLBuffer& toFill)
- {
- //
- // We just loop until we either hit a non-space or the end of this
- // entity. We return true if we returned because of a non-space and
- // false if because of end of entity.
- //
- // NOTE: We have to maintain line/col info here and we have to do
- // whitespace normalization if we are not already internalized.
- //
- while (true)
- {
- // Loop through the current chars in the buffer
- while (fCharIndex < fCharsAvail)
- {
- // Get the current char out of the buffer
- XMLCh curCh = fCharBuf[fCharIndex];
- //
- // See if its a white space char. If so, then process it. Else
- // we've hit a non-space and need to return.
- //
- if (XMLReader::isWhitespace(curCh))
- {
- // Eat this char
- fCharIndex++;
- //
- // Ok, we've got some whitespace here. So we have to store
- // it. But we have to normalize it and update the line and
- // column info along the way.
- //
- if (curCh == chCR)
- {
- fCurCol = 1;
- fCurLine++;
- //
- // If not already internalized, then convert it to an
- // LF and eat any following LF (or NEL, when enabled).
- //
- if (fSource == Source_External)
- {
- if ((fCharIndex < fCharsAvail) || refreshCharBuffer())
- {
- if (fCharBuf[fCharIndex] == chLF
- || ((fCharBuf[fCharIndex] == chNEL) && fNEL))
- fCharIndex++;
- }
- curCh = chLF;
- }
- }
- else if (curCh == chLF
- || ((curCh == chNEL) && fNEL))
- {
- // A bare LF (or NEL, when enabled) is stored as an LF
- curCh = chLF;
- fCurCol = 1;
- fCurLine++;
- }
- else
- {
- // Any other whitespace just moves the column along
- fCurCol++;
- }
- // Ok we can add this guy to our buffer
- toFill.append(curCh);
- }
- else
- {
- // Return true to indicate we stopped at a non-whitespace char
- return true;
- }
- }
- //
- // We've eaten up the current buffer, so lets try to reload it. If
- // we don't get anything new, then break out. If we do, then we go
- // back to the top to keep getting spaces.
- //
- if (!refreshCharBuffer())
- break;
- }
- // Ran out of characters without meeting a non-whitespace char
- return false;
- }
- //
- // Consumes characters and appends them to 'toFill' until a whitespace
- // character or 'toCheck' is met. Returns true if scanning stopped at such a
- // character (which is left unconsumed) and false if the reader ran out of
- // characters first.
- //
- // NOTE(review): the CR/LF/NEL handling below sits inside the branch taken
- // only for characters that isWhitespace() rejects. If isWhitespace() reports
- // those characters as whitespace, that handling can never run -- confirm
- // against the definition of isWhitespace().
- //
- bool XMLReader::getUpToCharOrWS(XMLBuffer& toFill, const XMLCh toCheck)
- {
- while (true)
- {
- // Loop through the current chars in the buffer
- while (fCharIndex < fCharsAvail)
- {
- // Get the current char out of the buffer
- XMLCh curCh = fCharBuf[fCharIndex];
- //
- // See if its not a white space or our target char, then process
- // it. Else, we need to return.
- //
- if (!XMLReader::isWhitespace(curCh) && (curCh != toCheck))
- {
- // Eat this char
- fCharIndex++;
- //
- // We are keeping this char, so we have to store it. Line
- // ends are normalized and the line and column info is
- // updated along the way.
- //
- if (curCh == chCR)
- {
- fCurCol = 1;
- fCurLine++;
- //
- // If not already internalized, then convert it to an
- // LF and eat any following LF (or NEL, when enabled).
- //
- if (fSource == Source_External)
- {
- if ((fCharIndex < fCharsAvail) || refreshCharBuffer())
- {
- if (fCharBuf[fCharIndex] == chLF
- || ((fCharBuf[fCharIndex] == chNEL) && fNEL))
- fCharIndex++;
- }
- curCh = chLF;
- }
- }
- else if (curCh == chLF
- || ((curCh == chNEL) && fNEL))
- {
- curCh = chLF;
- fCurCol = 1;
- fCurLine++;
- }
- else
- {
- fCurCol++;
- }
- // Add it to our buffer
- toFill.append(curCh);
- }
- else
- {
- // Stopped at a whitespace char or at the target char
- return true;
- }
- }
- //
- // We've eaten up the current buffer, so lets try to reload it. If
- // we don't get anything new, then break out. If we do, then we go
- // back to the top to keep collecting characters.
- //
- if (!refreshCharBuffer())
- break;
- }
- // We never hit whitespace or the target char and ate up the whole reader
- return false;
- }
- //
- //  Reports the next character through 'chGotten' without consuming it.
- //  Returns false, with 'chGotten' set to chNull, if no character is
- //  available even after trying to reload the buffer.
- //
- //  For external sources a CR, or a NEL when fNEL is set, is reported as LF
- //  so that peeking agrees with what the normal get method would return.
- //
- bool XMLReader::peekNextChar(XMLCh& chGotten)
- {
-     // Nothing buffered and nothing more to load
-     if ((fCharIndex >= fCharsAvail) && !refreshCharBuffer())
-     {
-         chGotten = chNull;
-         return false;
-     }
-
-     chGotten = fCharBuf[fCharIndex];
-
-     // Same newline normalization as the get method, minus the bookkeeping
-     if (fSource == Source_External)
-     {
-         const bool isLineBreak = (chGotten == chCR)
-                                  || ((chGotten == chNEL) && fNEL);
-         if (isLineBreak)
-             chGotten = chLF;
-     }
-     return true;
- }
- //
- //  If the next character is a single or double quote, consumes it, bumps
- //  the column, stores it in 'chGotten' and returns true. Otherwise nothing
- //  is consumed, 'chGotten' is left alone and false is returned. Also
- //  returns false if the buffer is empty and cannot be reloaded.
- //
- bool XMLReader::skipIfQuote(XMLCh& chGotten)
- {
-     if ((fCharIndex == fCharsAvail) && !refreshCharBuffer())
-         return false;
-
-     const XMLCh candidate = fCharBuf[fCharIndex];
-     if ((candidate != chDoubleQuote) && (candidate != chSingleQuote))
-         return false;
-
-     chGotten = candidate;
-     fCharIndex++;
-     fCurCol++;
-     return true;
- }
- //
- // Consumes consecutive whitespace characters without storing them. Returns
- // true if scanning stopped at a non-whitespace character (which is left
- // unconsumed) and false if the reader ran out of characters.
- //
- // 'skippedSomething' is set on both exits; it is true when the line or the
- // column differs from what it was on entry.
- //
- bool XMLReader::skipSpaces(bool& skippedSomething)
- {
- // Remember the current line and column
- unsigned int orgLine = fCurLine;
- unsigned int orgCol = fCurCol;
- //
- // We enter a loop where we skip over spaces until we hit the end of
- // this reader or a non-space value. The return indicates whether we
- // hit the non-space (true) or the end (false).
- //
- while (true)
- {
- // Loop through the current chars in the buffer
- while (fCharIndex < fCharsAvail)
- {
- // Get the current char out of the buffer
- XMLCh curCh = fCharBuf[fCharIndex];
- //
- // See if its a white space char. If so, then process it. Else
- // we've hit a non-space and need to return.
- //
- if (XMLReader::isWhitespace(curCh))
- {
- // Eat this char
- fCharIndex++;
- //
- // Ok, we've got some whitespace here. Nothing is stored,
- // but the line and column info still has to be updated
- // along the way.
- //
- if (curCh == chCR)
- {
- fCurCol = 1;
- fCurLine++;
- //
- // If not already internalized, then also eat any
- // following LF (or NEL, when enabled).
- //
- if (fSource == Source_External)
- {
- if ((fCharIndex < fCharsAvail) || refreshCharBuffer())
- {
- if (fCharBuf[fCharIndex] == chLF
- || ((fCharBuf[fCharIndex] == chNEL) && fNEL))
- fCharIndex++;
- }
- // The value assigned here is not read again in this method
- curCh = chLF;
- }
- }
- else if (curCh == chLF
- || ((curCh == chNEL) && fNEL))
- {
- // The value assigned here is not read again in this method
- curCh = chLF;
- fCurCol = 1;
- fCurLine++;
- }
- else
- {
- fCurCol++;
- }
- }
- else
- {
- // Hit a non-whitespace char; report whether the position moved
- skippedSomething = (orgLine != fCurLine) || (orgCol != fCurCol);
- return true;
- }
- }
- //
- // We've eaten up the current buffer, so lets try to reload it. If
- // we don't get anything new, then break out. If we do, then we go
- // back to the top to keep skipping spaces.
- //
- if (!refreshCharBuffer())
- break;
- }
- // We never hit any non-space and ate up the whole reader
- skippedSomething = (orgLine != fCurLine) || (orgCol != fCurCol);
- return false;
- }
- //
- //  If the next character equals 'toSkip', consumes it, bumps the column and
- //  returns true. Otherwise nothing is consumed and false is returned. Also
- //  returns false if the buffer is empty and cannot be reloaded.
- //
- bool XMLReader::skippedChar(const XMLCh toSkip)
- {
-     // Need at least one char to look at
-     if ((fCharIndex == fCharsAvail) && !refreshCharBuffer())
-         return false;
-
-     // Not the one the caller is after, so leave it where it is
-     if (fCharBuf[fCharIndex] != toSkip)
-         return false;
-
-     fCharIndex++;
-     fCurCol++;
-     return true;
- }
- //
- //  If the next character is whitespace, consumes it and returns true;
- //  otherwise consumes nothing and returns false. Also returns false if the
- //  buffer is empty and cannot be reloaded.
- //
- //  Line and column are maintained: a CR, a LF, or a NEL when fNEL is set,
- //  starts a new line, and for external sources a LF (or enabled NEL) that
- //  directly follows a CR is consumed along with it.
- //
- bool XMLReader::skippedSpace()
- {
-     // Need at least one char to look at
-     if ((fCharIndex == fCharsAvail) && !refreshCharBuffer())
-         return false;
-
-     const XMLCh spaceCh = fCharBuf[fCharIndex];
-     if (!XMLReader::isWhitespace(spaceCh))
-         return false;
-
-     // Eat the character
-     fCharIndex++;
-
-     if (spaceCh == chCR)
-     {
-         fCurLine++;
-         fCurCol = 1;
-
-         // For external content, a CR plus a following line feed is one break
-         if (fSource == Source_External)
-         {
-             if ((fCharIndex < fCharsAvail) || refreshCharBuffer())
-             {
-                 const XMLCh followCh = fCharBuf[fCharIndex];
-                 if ((followCh == chLF) || ((followCh == chNEL) && fNEL))
-                     fCharIndex++;
-             }
-         }
-     }
-     else if ((spaceCh == chLF) || ((spaceCh == chNEL) && fNEL))
-     {
-         fCurLine++;
-         fCurCol = 1;
-     }
-     else
-     {
-         fCurCol++;
-     }
-     return true;
- }
- //
- //  If the upcoming characters match 'toSkip' exactly, consumes them,
- //  advances the column by the string's length and returns true. Otherwise
- //  consumes nothing and returns false.
- //
- //  The buffer is reloaded until it holds at least as many characters as
- //  the string is long; if a reload adds nothing, there cannot be a match.
- //  Only the column is adjusted, never the line, so this relies on the
- //  string to skip not containing a new line.
- //
- bool XMLReader::skippedString(const XMLCh* const toSkip)
- {
-     const unsigned int wanted = XMLString::stringLen(toSkip);
-
-     // Top the buffer up until there is enough to compare against
-     unsigned int have = charsLeftInBuffer();
-     for (; have < wanted; )
-     {
-         refreshCharBuffer();
-         const unsigned int nowHave = charsLeftInBuffer();
-
-         // The refresh added nothing new, so give up
-         if (nowHave == have)
-             return false;
-         have = nowHave;
-     }
-
-     // Enough chars are buffered, so compare straight against the buffer
-     if (XMLString::compareNString(&fCharBuf[fCharIndex], toSkip, wanted) != 0)
-         return false;
-
-     // Matched: step the column and the buffer index over the string
-     fCurCol += wanted;
-     fCharIndex += wanted;
-     return true;
- }
- //
- //  Reports whether the upcoming characters match 'toPeek' exactly. This
- //  works like skippedString(), except that the buffer index and the
- //  current column are not advanced on a match.
- //
- //  The buffer is reloaded until it holds at least as many characters as
- //  the string is long; if a reload adds nothing, there cannot be a match.
- //
- bool XMLReader::peekString(const XMLCh* const toPeek)
- {
-     const unsigned int wanted = XMLString::stringLen(toPeek);
-
-     // Top the buffer up until there is enough to compare against
-     unsigned int have = charsLeftInBuffer();
-     for (; have < wanted; )
-     {
-         refreshCharBuffer();
-         const unsigned int nowHave = charsLeftInBuffer();
-
-         // The refresh added nothing new, so give up
-         if (nowHave == have)
-             return false;
-         have = nowHave;
-     }
-
-     // Enough chars are buffered, so compare straight against the buffer
-     return XMLString::compareNString(&fCharBuf[fCharIndex], toPeek, wanted) == 0;
- }
- // ---------------------------------------------------------------------------
- // XMLReader: Setter methods (most are inlined)
- // ---------------------------------------------------------------------------
- //
- //  Sets the encoding of this reader from an encoding name and creates a
- //  transcoder for it.
- //
- //  Returns true on success, and also when the encoding was forced at
- //  construction (the new name is then ignored). Returns false, changing
- //  nothing, when a non-endian-specific UTF-16 or UCS-4 name is passed but
- //  the reader is not currently in an endian-specific form of that encoding.
- //
- //  Throws a TranscodingException if no transcoder can be created for the
- //  resulting encoding name.
- //
- bool XMLReader::setEncoding(const XMLCh* const newEncoding)
- {
-     //
-     //  If the encoding was forced, then we ignore the new value and just
-     //  return with success. If it was forced, then we are to use that
-     //  encoding without question. Note that, if we are forced, we created
-     //  a transcoder up front so there is no need to do one here in that
-     //  case.
-     //
-     if (fForcedEncoding)
-         return true;
-
-     //
-     //  Try to map the string to one of our standard encodings. If its not
-     //  one of them, then it has to be one of the non-intrinsic encodings.
-     //
-     XMLRecognizer::Encodings newBaseEncoding = XMLRecognizer::encodingForName
-     (
-         newEncoding
-     );
-
-     //
-     //  If it does not come back as one of the auto-sensed encodings, then we
-     //  have to possibly replace it and at least check a few things.
-     //
-     if (newBaseEncoding == XMLRecognizer::OtherEncoding)
-     {
-         //
-         //  Check for non-endian specific UTF-16 or UCS-4. If so, and if we
-         //  are already in one of the endian versions of those encodings,
-         //  then just keep it and go on. Otherwise, its not valid.
-         //
-         if (!XMLString::compareIString(newEncoding, XMLUni::fgUTF16EncodingString)
-         ||  !XMLString::compareIString(newEncoding, XMLUni::fgUTF16EncodingString2)
-         ||  !XMLString::compareIString(newEncoding, XMLUni::fgUTF16EncodingString3)
-         ||  !XMLString::compareIString(newEncoding, XMLUni::fgUTF16EncodingString4))
-         {
-             if ((fEncoding != XMLRecognizer::UTF_16L)
-             &&  (fEncoding != XMLRecognizer::UTF_16B))
-             {
-                 return false;
-             }
-
-             // Override with the original endian specific encoding
-             newBaseEncoding = fEncoding;
-             if (fEncoding == XMLRecognizer::UTF_16L) {
-                 delete [] fEncodingStr;
-                 fEncodingStr = XMLString::replicate(XMLUni::fgUTF16LEncodingString);
-             }
-             else {
-                 delete [] fEncodingStr;
-                 fEncodingStr = XMLString::replicate(XMLUni::fgUTF16BEncodingString);
-             }
-         }
-         else if (!XMLString::compareIString(newEncoding, XMLUni::fgUCS4EncodingString)
-              ||  !XMLString::compareIString(newEncoding, XMLUni::fgUCS4EncodingString2)
-              ||  !XMLString::compareIString(newEncoding, XMLUni::fgUCS4EncodingString3))
-         {
-             if ((fEncoding != XMLRecognizer::UCS_4L)
-             &&  (fEncoding != XMLRecognizer::UCS_4B))
-             {
-                 return false;
-             }
-
-             // Override with the original endian specific encoding
-             newBaseEncoding = fEncoding;
-             if (fEncoding == XMLRecognizer::UCS_4L) {
-                 delete [] fEncodingStr;
-                 fEncodingStr = XMLString::replicate(XMLUni::fgUCS4LEncodingString);
-             }
-             else {
-                 delete [] fEncodingStr;
-                 fEncodingStr = XMLString::replicate(XMLUni::fgUCS4BEncodingString);
-             }
-         }
-         else
-         {
-             // None of those special cases, so just replicate the new name
-             delete [] fEncodingStr;
-             fEncodingStr = XMLString::replicate(newEncoding);
-         }
-     }
-     else
-     {
-         // Store the new encoding string since it is just an intrinsic
-         delete [] fEncodingStr;
-         fEncodingStr = XMLString::replicate(newEncoding);
-     }
-
-     //
-     //  A transcoder may already exist at this point, for instance one made
-     //  by refreshCharBuffer() or by an earlier call to this method. Release
-     //  it before it is replaced so that it is not leaked. The member is
-     //  zeroed first so it never dangles if the creation below throws.
-     //
-     delete fTranscoder;
-     fTranscoder = 0;
-
-     //
-     //  Now we can create a transcoder using the transcoding service. We
-     //  might get back a transcoder for an intrinsically supported encoding,
-     //  or we might get one from the underlying transcoding service.
-     //
-     XMLTransService::Codes failReason;
-     fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
-     (
-         fEncodingStr
-         , failReason
-         , kCharBufSize
-     );
-
-     if (!fTranscoder)
-         ThrowXML1(TranscodingException, XMLExcepts::Trans_CantCreateCvtrFor, fEncodingStr);
-
-     // Update the base encoding member with the new base encoding found
-     fEncoding = newBaseEncoding;
-
-     // Looks ok to us
-     return true;
- }
- // ---------------------------------------------------------------------------
- // XMLReader: Private static init methods
- // ---------------------------------------------------------------------------
- //
- // Tests whether a character is a member of a character class table.
- //
- // The table holds two zero terminated lists laid out back to back. The
- // first is a list of inclusive low/high range pairs, and the second is a
- // list of single characters. The early out in the range scan relies on
- // the range pairs being stored in ascending order.
- //
- // theTable The table to search.
- // toCheck The character to look for.
- //
- // Returns true if toCheck falls inside one of the ranges or is equal to
- // one of the singles, else false.
- //
- bool XMLReader::checkTable( const XMLCh* const theTable
- , const XMLCh toCheck)
- {
- const XMLCh* curTable = theTable;
- // Check the ranges
- while (*curTable)
- {
- //
- // If the test char is less than the low end of this range, then
- // it's never going to match any later range, so stop looking at
- // ranges. Run the pointer up to the range terminator (without
- // passing it) so that both ways out of this loop leave the pointer
- // in the same place.
- //
- if (toCheck < *curTable++)
- {
- while (*curTable)
- curTable++;
- break;
- }
- // If it's less than or equal to the top of the range, then a match
- if (toCheck <= *curTable++)
- return true;
- }
- //
- // We are sitting on the range terminator at this point, whether we
- // broke out early or ran off the end of the ranges. Step over it to
- // get to the first single. Previously this was only done on the early
- // out path, so a char above the top of every range never got tested
- // against the singles at all.
- //
- curTable++;
- // And now test against singles
- while (*curTable)
- {
- if (toCheck == *curTable++)
- return true;
- }
- return false;
- }
- // ---------------------------------------------------------------------------
- // XMLReader: Private helper methods
- // ---------------------------------------------------------------------------
- //
- // Called once the encoding is known, to work out whether the source
- // data is in the opposite byte order from this machine. The only thing
- // it does is store the answer in fSwapped.
- //
- // On a little endian build the big endian flavors of UTF-16 and UCS-4
- // are the swapped ones, and on a big endian build it is the little
- // endian flavors. If neither endian mode is defined, fSwapped is just
- // set to false.
- //
- void XMLReader::checkForSwapped()
- {
- #if defined(ENDIANMODE_LITTLE)
- fSwapped = (fEncoding == XMLRecognizer::UTF_16B)
- || (fEncoding == XMLRecognizer::UCS_4B);
- #elif defined(ENDIANMODE_BIG)
- fSwapped = (fEncoding == XMLRecognizer::UTF_16L)
- || (fEncoding == XMLRecognizer::UCS_4L);
- #else
- // No endian mode known, so assume not swapped
- fSwapped = false;
- #endif
- }
- //
- // This is called from the constructor when the encoding is not forced.
- // We assume that the encoding has been auto-sensed at this point and that
- // fSwapped is set correctly.
- //
- // In the case of UCS-4 and EBCDIC, we don't have to check for a decl.
- // The fact that we got here, means that there is one, because that's the
- // only way we can autosense those.
- //
- void XMLReader::doInitDecode()
- {
- switch(fEncoding)
- {
- case XMLRecognizer::UCS_4B :
- case XMLRecognizer::UCS_4L :
- {
- // Look at the raw buffer as UCS4 chars
- const UCS4Ch* asUCS = (const UCS4Ch*)fRawByteBuf;
- while (fRawBufIndex < fRawBytesAvail)
- {
- // Get out the current 4 byte value and inc our raw buf index
- UCS4Ch curVal = *asUCS++;
- fRawBufIndex += sizeof(UCS4Ch);
- // Swap if that is required for this machine
- if (fSwapped)
- curVal = BitOps::swapBytes(curVal);
- // Make sure its at least semi legal. If not, undo and throw
- if (curVal > 0xFFFF)
- {
- fCharsAvail = 0;
- fRawBufIndex = 0;
- ThrowXML1
- (
- TranscodingException
- , XMLExcepts::Reader_CouldNotDecodeFirstLine
- , fSystemId
- );
- }
- // Convert the value to an XML char and store it
- fCharSizeBuf[fCharsAvail] = 4;
- fCharBuf[fCharsAvail++] = XMLCh(curVal);
- // Break out on the > character
- if (curVal == chCloseAngle)
- break;
- }
- break;
- }
- case XMLRecognizer::UTF_8 :
- {
- // If there's a utf-8 BOM (0xEF 0xBB 0xBF), skip past it.
- // Don't move to char buf - no one wants to see it.
- // Note: this causes any encoding= declaration to override
- // the BOM's attempt to say that the encoding is utf-8.
- // Look at the raw buffer as short chars
- const char* asChars = (const char*)fRawByteBuf;
- if (fRawBytesAvail > XMLRecognizer::fgUTF8BOMLen &&
- XMLString::compareNString( asChars
- , XMLRecognizer::fgUTF8BOM
- , XMLRecognizer::fgUTF8BOMLen) == 0)
- {
- fRawBufIndex += XMLRecognizer::fgUTF8BOMLen;
- asChars += XMLRecognizer::fgUTF8BOMLen;
- }
- //
- // First check that there are enough bytes to even see the
- // decl indentifier. If not, get out now with no action since
- // there is no decl.
- //
- if (fRawBytesAvail < XMLRecognizer::fgASCIIPreLen)
- break;
- // Check for the opening sequence. If not, then no decl
- if (XMLString::compareNString( asChars
- , XMLRecognizer::fgASCIIPre
- , XMLRecognizer::fgASCIIPreLen))
- {
- break;
- }
- while (fRawBufIndex < fRawBytesAvail)
- {
- const char curCh = *asChars++;
- fRawBufIndex++;
- // Looks ok, so store it
- fCharSizeBuf[fCharsAvail] = 1;
- fCharBuf[fCharsAvail++] = XMLCh(curCh);
- // Break out on a > character
- if (curCh == chCloseAngle)
- break;
- //
- // A char greater than 0x7F is not allowed in this case. If
- // so, undo and throw.
- //
- if (curCh & 0x80)
- {
- fCharsAvail = 0;
- fRawBufIndex = 0;
- ThrowXML1
- (
- TranscodingException
- , XMLExcepts::Reader_CouldNotDecodeFirstLine
- , fSystemId
- );
- }
- }
- break;
- }
- case XMLRecognizer::UTF_16B :
- case XMLRecognizer::UTF_16L :
- {
- //
- // If there is a decl here, we just truncate back the characters
- // as we go. No surrogate creation would be allowed here in legal
- // XML, so we consider it a transoding error if we find one.
- //
- if (fRawBytesAvail < 2)
- break;
- const UTF16Ch* asUTF16 = (const UTF16Ch*)&fRawByteBuf[fRawBufIndex];
- if ((*asUTF16 == chUnicodeMarker) || (*asUTF16 == chSwappedUnicodeMarker))
- {
- fRawBufIndex += sizeof(UTF16Ch);
- asUTF16++;
- }
- // First check that there are enough raw bytes for there to even
- // be a decl indentifier. If not, then nothing to do.
- //
- if (fRawBytesAvail - fRawBufIndex < XMLRecognizer::fgUTF16PreLen)
- {
- fRawBufIndex = 0;
- break;
- }
- //
- // See we get a match on the prefix. If not, then reset and
- // break out.
- //