XMLRecognizer.cpp
上传用户:zhuqijet
上传日期:2013-06-25
资源大小:10074k
文件大小:12k
- /*
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 1999-2003 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Xerces" and "Apache Software Foundation" must
- * not be used to endorse or promote products derived from this
- * software without prior written permission. For written
- * permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * nor may "Apache" appear in their name, without prior written
- * permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation, and was
- * originally based on software copyright (c) 1999, International
- * Business Machines, Inc., http://www.ibm.com . For more information
- * on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
- /**
- * $Id: XMLRecognizer.cpp,v 1.8 2003/05/15 18:26:07 knoaman Exp $
- */
- // ---------------------------------------------------------------------------
- // Includes
- // ---------------------------------------------------------------------------
- #include <xercesc/framework/XMLRecognizer.hpp>
- #include <xercesc/util/RuntimeException.hpp>
- #include <xercesc/util/XMLString.hpp>
- XERCES_CPP_NAMESPACE_BEGIN
- // ---------------------------------------------------------------------------
- //  Local data
- //
- //  gEncodingNameMap
- //      Lookup table that turns an XMLRecognizer::Encodings value into the
- //      canonical name of that encoding. The table is indexed directly by
- //      the enum value and is sized by Encodings_Count, so its entries must
- //      stay in step with the enum: add, remove or reorder them together.
- // ---------------------------------------------------------------------------
- static const XMLCh* gEncodingNameMap[XMLRecognizer::Encodings_Count] =
- {
-     XMLUni::fgEBCDICEncodingString,
-     XMLUni::fgUCS4BEncodingString,
-     XMLUni::fgUCS4LEncodingString,
-     XMLUni::fgUSASCIIEncodingString,
-     XMLUni::fgUTF8EncodingString,
-     XMLUni::fgUTF16BEncodingString,
-     XMLUni::fgUTF16LEncodingString,
-     XMLUni::fgXMLChEncodingString
- };
- // ---------------------------------------------------------------------------
- //  XMLRecognizer: Public, const static data
- //
- //  fgXXXPre
- //  fgXXXPreLen
- //      The leading byte sequences of every encoding that can be auto
- //      sensed, each paired with its length in bytes. The ASCII sequence is
- //      the six characters '<?xml ' (3C 3F 78 6D 6C 20); the UTF-16 and
- //      UCS-4 sequences hold those same six code points widened to two and
- //      four bytes in big and little endian order. The two UTF-16 arrays
- //      share one length constant, as do the two UCS-4 arrays.
- //
- //  fgUTF8BOM
- //  fgUTF8BOMLen
- //      The three byte UTF-8 byte order mark (EF BB BF) and its length.
- // ---------------------------------------------------------------------------
-
- // Single byte encodings
- const char XMLRecognizer::fgASCIIPre[] =
- {
-     0x3C, 0x3F, 0x78, 0x6D, 0x6C, 0x20
- };
- const unsigned int XMLRecognizer::fgASCIIPreLen = 6;
-
- const XMLByte XMLRecognizer::fgEBCDICPre[] =
- {
-     0x4C, 0x6F, 0xA7, 0x94, 0x93, 0x40
- };
- const unsigned int XMLRecognizer::fgEBCDICPreLen = 6;
-
- // UTF-16: two bytes per character
- const XMLByte XMLRecognizer::fgUTF16BPre[] =
- {
-     0x00, 0x3C, 0x00, 0x3F, 0x00, 0x78,
-     0x00, 0x6D, 0x00, 0x6C, 0x00, 0x20
- };
- const XMLByte XMLRecognizer::fgUTF16LPre[] =
- {
-     0x3C, 0x00, 0x3F, 0x00, 0x78, 0x00,
-     0x6D, 0x00, 0x6C, 0x00, 0x20, 0x00
- };
- const unsigned int XMLRecognizer::fgUTF16PreLen = 12;
-
- // UCS-4: four bytes per character
- const XMLByte XMLRecognizer::fgUCS4BPre[] =
- {
-     0x00, 0x00, 0x00, 0x3C,
-     0x00, 0x00, 0x00, 0x3F,
-     0x00, 0x00, 0x00, 0x78,
-     0x00, 0x00, 0x00, 0x6D,
-     0x00, 0x00, 0x00, 0x6C,
-     0x00, 0x00, 0x00, 0x20
- };
- const XMLByte XMLRecognizer::fgUCS4LPre[] =
- {
-     0x3C, 0x00, 0x00, 0x00,
-     0x3F, 0x00, 0x00, 0x00,
-     0x78, 0x00, 0x00, 0x00,
-     0x6D, 0x00, 0x00, 0x00,
-     0x6C, 0x00, 0x00, 0x00,
-     0x20, 0x00, 0x00, 0x00
- };
- const unsigned int XMLRecognizer::fgUCS4PreLen = 24;
-
- // UTF-8 byte order mark. The values exceed the range of a signed char, so
- // each one is converted explicitly.
- const char XMLRecognizer::fgUTF8BOM[] =
- {
-     static_cast<char>(0xEF), static_cast<char>(0xBB), static_cast<char>(0xBF)
- };
- const unsigned int XMLRecognizer::fgUTF8BOMLen = 3;
- // ---------------------------------------------------------------------------
- //  XMLRecognizer: Encoding recognition methods
- // ---------------------------------------------------------------------------
- //
- //  basicEncodingProbe
- //      Makes a first guess at the encoding of an entity from the bytes at
- //      the very start of it, by looking for a byte order mark or for the
- //      characters '<?xml ' expressed in one of the encodings handled here.
- //
- //      rawBuffer       The first bytes of the entity. No more than
- //                      rawByteCount bytes of it are ever read.
- //      rawByteCount    The number of valid bytes in rawBuffer.
- //
- //      Returns EBCDIC, UCS_4B, UCS_4L, UTF_16B or UTF_16L when the start
- //      of the buffer identifies one of them, and UTF_8 in every other case.
- //      UTF_8 is therefore both a positive match for the ASCII '<?xml '
- //      prefix and the fallback for "could not tell".
- //
- XMLRecognizer::Encodings
- XMLRecognizer::basicEncodingProbe(const XMLByte* const rawBuffer,
-                                   const unsigned int   rawByteCount)
- {
-     //
-     //  As an optimization for the 90% case, look first for the ASCII
-     //  sequence '<?xml '. That means the data is US-ASCII, UTF-8, or some
-     //  other encoding that is not handled manually but shares the US-ASCII
-     //  code points for these characters. Returning UTF-8 is enough to get
-     //  through the first line.
-     //
-     if ((rawByteCount >= fgASCIIPreLen)
-     &&  !memcmp(rawBuffer, fgASCIIPre, fgASCIIPreLen))
-     {
-         return UTF_8;
-     }
-
-     //
-     //  With fewer than two bytes there is nothing that could be
-     //  recognized, so fall back to UTF-8.
-     //
-     if (rawByteCount < 2)
-         return UTF_8;
-
-     //
-     //  With two or three bytes only a UTF-16 byte order mark can be
-     //  checked. Anything else falls back to UTF-8.
-     //
-     if (rawByteCount < 4)
-     {
-         if ((rawBuffer[0] == 0xFE) && (rawBuffer[1] == 0xFF))
-             return UTF_16B;
-         if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE))
-             return UTF_16L;
-         return UTF_8;
-     }
-
-     //
-     //  From the XML specification, "F.1 Detection Without External
-     //  Encoding Information". Because each XML entity not accompanied by
-     //  external encoding information and not in UTF-8 or UTF-16 must begin
-     //  with an XML encoding declaration, whose first characters must be
-     //  '<?xml', a processor can tell after two to four octets which case
-     //  applies. ## denotes any byte value, except that two consecutive ##s
-     //  cannot both be 00.
-     //
-     //  With a Byte Order Mark:
-     //
-     //      00 00 FE FF     UCS-4, big-endian machine (1234 order)
-     //      FF FE 00 00     UCS-4, little-endian machine (4321 order)
-     //      00 00 FF FE     UCS-4, unusual octet order (2143)
-     //      FE FF 00 00     UCS-4, unusual octet order (3412)
-     //      FE FF ## ##     UTF-16, big-endian
-     //      FF FE ## ##     UTF-16, little-endian
-     //      EF BB BF        UTF-8
-     //
-     //  At least four bytes are available here, so the UCS-4BE and UCS-4LE
-     //  marks can be tested as well as the UTF-16 ones. The UCS-4LE mark
-     //  starts with the UTF-16LE mark, so it has to be tested first.
-     //
-     if ((rawBuffer[0] == 0x00) && (rawBuffer[1] == 0x00)
-     &&  (rawBuffer[2] == 0xFE) && (rawBuffer[3] == 0xFF))
-     {
-         return UCS_4B;
-     }
-     if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE)
-     &&  (rawBuffer[2] == 0x00) && (rawBuffer[3] == 0x00))
-     {
-         return UCS_4L;
-     }
-     if ((rawBuffer[0] == 0xFE) && (rawBuffer[1] == 0xFF))
-         return UTF_16B;
-     if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE))
-         return UTF_16L;
-
-     //
-     //  No byte order mark. Look for '<?xml ' in its UCS-4 and UTF-16
-     //  forms. All four of those prefixes start with either 0x00 or 0x3C,
-     //  so the first byte is used to skip the comparisons cheaply. Each
-     //  comparison is guarded by the length of the prefix it reads.
-     //
-     if ((rawBuffer[0] == 0x00) || (rawBuffer[0] == 0x3C))
-     {
-         if (rawByteCount >= fgUCS4PreLen)
-         {
-             if (!memcmp(rawBuffer, fgUCS4BPre, fgUCS4PreLen))
-                 return UCS_4B;
-             if (!memcmp(rawBuffer, fgUCS4LPre, fgUCS4PreLen))
-                 return UCS_4L;
-         }
-
-         if (rawByteCount >= fgUTF16PreLen)
-         {
-             if (!memcmp(rawBuffer, fgUTF16BPre, fgUTF16PreLen))
-                 return UTF_16B;
-             if (!memcmp(rawBuffer, fgUTF16LPre, fgUTF16PreLen))
-                 return UTF_16L;
-         }
-     }
-
-     //
-     //  Try the EBCDIC prefix if there are enough bytes to hold it. The
-     //  comparison reads exactly fgEBCDICPreLen bytes, so a buffer of
-     //  exactly that length qualifies, just as it does for the ASCII
-     //  prefix at the top of this method.
-     //
-     if ((rawByteCount >= fgEBCDICPreLen)
-     &&  !memcmp(rawBuffer, fgEBCDICPre, fgEBCDICPreLen))
-     {
-         return EBCDIC;
-     }
-
-     //
-     //  Nothing recognizable, so go with UTF-8 to get at least through the
-     //  first line and see what it really is.
-     //
-     return UTF_8;
- }
- //
- //  encodingForName
- //      Maps an encoding name onto the Encodings value that stands for it.
- //
- //      encName     The encoding name to look up. It is compared as is, on
- //                  the assumption that the caller has already uppercased
- //                  it.
- //
- //      Returns the matching Encodings value, or OtherEncoding when the
- //      name is not one of the variations recognized here.
- //
- //      !!NOTE: EBCDIC is deliberately not handled here because that
- //      encoding is not one handled by this code itself. It is allowed to
- //      fall into 'other'.
- //
- XMLRecognizer::Encodings
- XMLRecognizer::encodingForName(const XMLCh* const encName)
- {
-     // The internal XMLCh encoding: try pointer identity before comparing
-     // the characters.
-     if (encName == XMLUni::fgXMLChEncodingString)
-         return XMLRecognizer::XERCES_XMLCH;
-     if (XMLString::compareString(encName, XMLUni::fgXMLChEncodingString) == 0)
-         return XMLRecognizer::XERCES_XMLCH;
-
-     // UTF-8
-     if (XMLString::compareString(encName, XMLUni::fgUTF8EncodingString) == 0)
-         return XMLRecognizer::UTF_8;
-     if (XMLString::compareString(encName, XMLUni::fgUTF8EncodingString2) == 0)
-         return XMLRecognizer::UTF_8;
-
-     // US-ASCII
-     if (XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString) == 0)
-         return XMLRecognizer::US_ASCII;
-     if (XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString2) == 0)
-         return XMLRecognizer::US_ASCII;
-     if (XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString3) == 0)
-         return XMLRecognizer::US_ASCII;
-     if (XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString4) == 0)
-         return XMLRecognizer::US_ASCII;
-
-     // UTF-16, little endian then big endian
-     if (XMLString::compareString(encName, XMLUni::fgUTF16LEncodingString) == 0)
-         return XMLRecognizer::UTF_16L;
-     if (XMLString::compareString(encName, XMLUni::fgUTF16LEncodingString2) == 0)
-         return XMLRecognizer::UTF_16L;
-     if (XMLString::compareString(encName, XMLUni::fgUTF16BEncodingString) == 0)
-         return XMLRecognizer::UTF_16B;
-     if (XMLString::compareString(encName, XMLUni::fgUTF16BEncodingString2) == 0)
-         return XMLRecognizer::UTF_16B;
-
-     // UCS-4, little endian then big endian
-     if (XMLString::compareString(encName, XMLUni::fgUCS4LEncodingString) == 0)
-         return XMLRecognizer::UCS_4L;
-     if (XMLString::compareString(encName, XMLUni::fgUCS4LEncodingString2) == 0)
-         return XMLRecognizer::UCS_4L;
-     if (XMLString::compareString(encName, XMLUni::fgUCS4BEncodingString) == 0)
-         return XMLRecognizer::UCS_4B;
-     if (XMLString::compareString(encName, XMLUni::fgUCS4BEncodingString2) == 0)
-         return XMLRecognizer::UCS_4B;
-
-     // Return 'other' since the name is not one we recognize
-     return XMLRecognizer::OtherEncoding;
- }
- //
- //  nameForEncoding
- //      Returns the canonical name of an encoding by looking it up in
- //      gEncodingNameMap.
- //
- //      theEncoding     The encoding whose name is wanted.
- //
- //      Throws a RuntimeException (XMLExcepts::XMLRec_UnknownEncoding) when
- //      theEncoding is Encodings_Count or greater, since the table has no
- //      entry for such a value.
- //
- const XMLCh*
- XMLRecognizer::nameForEncoding(const XMLRecognizer::Encodings theEncoding)
- {
-     // gEncodingNameMap holds Encodings_Count entries, so the last valid
-     // index is Encodings_Count - 1. Encodings_Count itself must be rejected
-     // too, or the lookup below would read one element past the end.
-     if (theEncoding >= Encodings_Count)
-         ThrowXML(RuntimeException, XMLExcepts::XMLRec_UnknownEncoding);
-
-     return gEncodingNameMap[theEncoding];
- }
- XERCES_CPP_NAMESPACE_END