// Category: xml/soap/webservice
//
// Development platform:
// C/C++
//
// ICUData.cpp: source listing
							/*
 * The Apache Software License, Version 1.1
 * 
 * Copyright (c) 1999-2000 The Apache Software Foundation.  All rights
 * reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer. 
 * 
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:  
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 * 
 * 4. The names "Xerces" and "Apache Software Foundation" must
 *    not be used to endorse or promote products derived from this
 *    software without prior written permission. For written 
 *    permission, please contact apache@apache.org.
 * 
 * 5. Products derived from this software may not be called "Apache",
 *    nor may "Apache" appear in their name, without prior written
 *    permission of the Apache Software Foundation.
 * 
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 * 
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation, and was
 * originally based on software copyright (c) 1999, International
 * Business Machines, Inc., http://www.ibm.com .  For more information
 * on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */
/*
 * $Log: ICUData.cpp,v $
 * Revision 1.1  2000/03/17 23:58:00  roddey
 * New utility for munging ICU UCM files and spitting out tables for
 * our intrinsic encoders.
 *
 */
// ---------------------------------------------------------------------------
//  This program is designed to parse a standard ICU .UCM file and spit out
//  a C++ code fragment that represents the tables required by the intrinsic
//  XML parser transcoders.
//
//  The file format is pretty simple and this program is not intended to be
//  industrial strength by any means. Its use by anyone but the author is
//  at the user's own risk.
//
//  The code looks for the min/max bytes per character to know what kind of
//  table to spit out, but for now only handles single char sets.
// ---------------------------------------------------------------------------
// ---------------------------------------------------------------------------
//  Includes
// ---------------------------------------------------------------------------
#include    <ctype.h>
#include    <stdio.h>
#include    <stdlib.h>
#include    <iostream.h>
#include    <string.h>
// ---------------------------------------------------------------------------
//  Const data
// ---------------------------------------------------------------------------
// Capacity, in records, of the gMainTable array that holds the parsed mappings.
static const unsigned int   gMaxInRecs = 1024;
// ---------------------------------------------------------------------------
//  Local data types
// ---------------------------------------------------------------------------
// One mapping record read from the CHARMAP section of the input file: a
// Unicode value paired with the single byte code page value it maps to.
struct XlatRec
{
    // Unicode value; the loader rejects anything above 0xFFFF before storing
    unsigned short  uniVal;
    // Code page value; the loader rejects anything above 0xFF before storing
    unsigned char   cpVal;
};
// ---------------------------------------------------------------------------
//  Local data
//
//  gInFile
//  gOutFile
//      These are the file streams for the input UCM file and the output
//      file that we write the C++ code to.
//
//  fLineNum
//      Used to track the current line number in the source file, for error
//      reporting.
//
//  gMainTable
//  gMainTableSz
//      This is the table that is filled in from the original source document.
//      We don't know how big it will be, but it's not likely to be much more
//      than 300 entries or so (256 output code points with some multiply
//      mapped Unicode code points.) So we make it extra large and watch for
//      possible overflow.
//
//      The size value is bumped up as we load entries into it during the
//      parse of the file.
//
//  gMaxChar
//  gMinChar
//      The min/max chars that are used to represent a character. These are
//      read from the header of the input file.
//
//  gRepChar
//      The replacement character to be used. This is read from the header of
//      the input file.
// ---------------------------------------------------------------------------
// Input UCM stream and generated-code output stream; both are opened in main()
static FILE*            gInFile;
static FILE*            gOutFile;
// Count of physical input lines read so far; bumped by getLine()
static unsigned int     fLineNum;
// Parsed mapping records and the number of slots currently in use
static XlatRec          gMainTable[gMaxInRecs];
static unsigned int     gMainTableSz = 0;
// <mb_cur_max> / <mb_cur_min> header values, stored by loadTable()
static unsigned int     gMaxChar;
static unsigned int     gMinChar;
// <subchar> header value, stored by loadTable(); stays 1 if the header has none
static unsigned char    gRepChar = 1;
// ---------------------------------------------------------------------------
//  Local functions
// ---------------------------------------------------------------------------
static unsigned int getLine(        char* const     toFill
                            , const unsigned int    maxChars
                            , const bool            eofOk = false)
{
    while (true)
    {
        if (!fgets(toFill, maxChars, gInFile))
        {
            if (feof(gInFile))
            {
                if (eofOk)
                    return ~0UL;
                else
                    cout << "Unexpected end of input at line: " << fLineNum << endl;
            }
             else
            {
                cout << "Error processing input at line: " << fLineNum << endl;
                exit(1);
            }
        }
        fLineNum++;
        //
        //  If its not a comment, then break out
        //
        if (toFill[0] != '#')
            break;
    }
    //
    //  There could be a trailing comment on this line, so lets get rid
    //  of it. Search for a # char and put a null there.
    //
    char* endPtr = toFill;
    while (*endPtr && (*endPtr != '#'))
        endPtr++;
    if (*endPtr == '#')
        *endPtr = 0;
    // Strip trailing whitespace
    endPtr = toFill + (strlen(toFill) - 1);
    while (isspace(*endPtr))
        endPtr--;
    *(endPtr + 1) = 0;
    // And return the count of chars we got
    return strlen(toFill);
}
static unsigned int extractVal(char* const srcStr)
{
    char* srcPtr = srcStr;
    // Run forward to the first non-space
    while (isspace(*srcPtr))
        srcPtr++;
    if (!*srcPtr)
    {
        cout << "Invalid numeric value on line: " << fLineNum << endl;
        exit(1);
    }
    //
    //  If it starts with , then its a hex value in the form xXX. Else its
    //  just a decimal value.
    //
    unsigned int retVal;
    char* endPtr;
    if (*srcPtr == '\')
    {
        // Skip the \x and interpret as a hex value
        srcPtr += 2;
        retVal = (unsigned int)strtoul(srcPtr, &endPtr, 16);
    }
     else
    {
        retVal = (unsigned int)strtoul(srcPtr, &endPtr, 10);
    }
    // We should have translated up to the end of the string
    if (*endPtr)
    {
        cout << "Invalid numeric value on line: " << fLineNum << endl;
        exit(1);
    }
    return retVal;
}
static void loadTable()
{
    //
    //  Just loop, reading lines at a time, until we either find the start
    //  of the character table or hit the end of the file. Along the way, we
    //  should see a few header values that we store away.
    //
    const unsigned int  tmpBufSz = 2048;
    char                tmpBuf[tmpBufSz - 1];
    while (getLine(tmpBuf, tmpBufSz))
    {
        //
        //  Check for one of the special values we are intersted int. If
        //  its CHARMAP, then we fall out of this loop.
        //
        if (!strcmp(tmpBuf, "CHARMAP"))
            break;
        if (!strncmp(tmpBuf, "<mb_cur_max>", 12))
        {
            gMaxChar = extractVal(&tmpBuf[12]);
        }
         else if (!strncmp(tmpBuf, "<mb_cur_min>", 12))
        {
            gMinChar = extractVal(&tmpBuf[12]);
        }
         else if (!strncmp(tmpBuf, "<subchar>", 9))
        {
            gRepChar = (char)extractVal(&tmpBuf[9]);
        }
    }
    //
    //  Ok, now we just run till we hit the "END CHARMAP" line. Each entry
    //  will be in the form:
    //
    //      <UXXXX>     xXX
    //
    //  Where X is a hex number.
    //
    char* endPtr;
    while (getLine(tmpBuf, tmpBufSz))
    {
        // Watch for the end of table
        if (!strcmp(tmpBuf, "END CHARMAP"))
            break;
        // The absolute minium it could be is 12 chars
        if (strlen(tmpBuf) < 12)
        {
            cout << "Line " << fLineNum << " is too short to hold a valid entry"
                 << endl;
            exit(1);
        }
        // Make sure the first token meets the criteria
        if ((tmpBuf[0] != '<')
        ||  (tmpBuf[1] != 'U')
        ||  (tmpBuf[6] != '>'))
        {
            cout << "Line " << fLineNum << " has a badly formed Unicode value"
                 << endl;
            exit(1);
        }
        //
        //  Looks reasonable so lets try to convert it. We can play tricks
        //  with this buffer, so put a null over the > char.
        //
        tmpBuf[6] = 0;
        const unsigned int uniVal = strtoul(&tmpBuf[2], &endPtr, 16);
        if (*endPtr)
        {
            cout << "Invalid Unicode value on line " << fLineNum << endl;
            exit(1);
        }
        //
        //  Ok, lets search over to the second token. We have to find a \
        //  character.
        //
        char* srcPtr = &tmpBuf[7];
        while (*srcPtr && (*srcPtr != '\'))
            srcPtr++;
        // If we never found it, its in error
        if (!*srcPtr)
        {
            cout << "Never found second token on line " << fLineNum << endl;
            exit(1);
        }
        // Try to translate it
        srcPtr += 2;
        const unsigned int cpVal = strtoul(srcPtr, &endPtr, 16);
        if (*endPtr)
        {
            cout << "Invalid code page value on line " << fLineNum << endl;
            exit(1);
        }
        // Make sure that the values are within range
        if (uniVal > 0xFFFF)
        {
            cout << "Unicode value is too big on line " << fLineNum << endl;
            exit(1);
        }
        if (cpVal > 0xFF)
        {
            cout << "Code page value is too big on line " << fLineNum << endl;
            exit(1);
        }
        // Looks reasonable, so add a new entry to the global table
        gMainTable[gMainTableSz].uniVal = (unsigned short)uniVal;
        gMainTable[gMainTableSz].cpVal = (unsigned char)cpVal;
        gMainTableSz++;
    }
}
int compFuncTo(const void* p1, const void* p2)
{
    const XlatRec* rec1 = (const XlatRec*)p1;
    const XlatRec* rec2 = (const XlatRec*)p2;
    return (int)rec1->uniVal - (int)rec2->uniVal;
}
int compFuncFrom(const void* p1, const void* p2)
{
    const XlatRec* rec1 = (const XlatRec*)p1;
    const XlatRec* rec2 = (const XlatRec*)p2;
    //
    //  Since there can be multiple Unicode chars that map to a single
    //  code page char, we have to handle the situationw here they are
    //  equal specially. If the code page vals are equal, then the one
    //  with the smaller Unicode code point is considered smaller.
    //
    if (rec1->cpVal == rec2->cpVal)
        return (int)rec1->uniVal - (int)rec2->uniVal;
    // Else use the code page value for sorting
    return (int)rec1->cpVal - (int)rec2->cpVal;
}
static void formatSBTables()
{
    // For now, only handle single byte char sets
    if ((gMinChar != 1) || (gMaxChar != 1))
    {
        cout << "formatSBTables can only handle single byte encodings"
             << endl;
        exit(1);
    }
    //
    //  First, we want to sort the table by the code page value field. This
    //  is the order required for the 'from' table to convert from the code
    //  page to the internal Unicode format.
    //
    qsort(gMainTable, gMainTableSz, sizeof(gMainTable[0]), compFuncFrom);
    //
    //  Now spit out the header for the table. This is the same for all
    //  of them, since they are static to the file and can just all have
    //  the same name.
    //
    fprintf
    (
        gOutFile
        , "static const XMLCh gFromTable[256] =n{n    "
    );
    //
    //  Now for each unique entry in the cp value field, we want to put out
    //  the Unicode value for that entry. Since we sorted them such that
    //  dups have the one with the smaller Unicode value in the lower index,
    //  we always hit the desired value first, and then can just skip over
    //  a duplicate.
    //
    unsigned int curValue = 0;
    unsigned int index;
    for (index = 0; index < gMainTableSz; index++)
    {
        if (curValue)
        {
            if (!(curValue % 8))
                fprintf(gOutFile, "n  , ");
            else
                fprintf(gOutFile, ", ");
        }
        if (curValue == gMainTable[index].cpVal)
        {
            fprintf(gOutFile, "0x%04X", (unsigned int)gMainTable[index].uniVal);
            // If there is a dump, then skip it
            if (index < gMainTableSz)
            {
                if (gMainTable[index + 1].cpVal == curValue)
                    index++;
            }
        }
         else if (curValue < gMainTable[index].cpVal)
        {
            fprintf(gOutFile, "0xFFFF");
        }
         else
        {
            // Screwed up
            cout << "Current value got above target valuen" << endl;
            exit(1);
        }
        curValue++;
        // If the current value goes over 256, we are in trouble
        if (curValue > 256)
        {
            cout << "The code page value cannot be > 256 in SB moden" << endl;
            exit(1);
        }
    }
    // And print the trailer for this table
    fprintf(gOutFile, "n};nn");
    //
    //  Now lets sort by the Unicode value field. This sort is used for
    //  the 'to' table. The Unicode value is found by binary search and
    //  used to map to the right output encoding value.
    //
    qsort(gMainTable, gMainTableSz, sizeof(gMainTable[0]), compFuncTo);
    // Output the table ehader for this one
    fprintf
    (
        gOutFile
        , "static const XMLTransService::TransRec gToTable[] =n{n    "
    );
    for (index = 0; index < gMainTableSz; index++)
    {
        if (index)
        {
            if (!(index % 4))
                fprintf(gOutFile, "n  , ");
            else
                fprintf(gOutFile, ", ");
        }
        fprintf
        (
            gOutFile
            , "{ 0x%04X, 0x%02X }"
            , (unsigned int)gMainTable[index].uniVal
            , (unsigned int)gMainTable[index].cpVal
        );
    }
    // Print the trailer for this table
    fprintf(gOutFile, "n};n");
    // And print out the table size constant
    fprintf(gOutFile, "static const unsigned int gToTableSz = %d;n", gMainTableSz);
}
static void showUsage()
{
    cout << "ICUData inputUCMfile outputfilen" << endl;
}
// ---------------------------------------------------------------------------
//  The parameters are:
//
//  argV[1] = The source UCM file
//  argV[2] = The path to the output file
// ---------------------------------------------------------------------------
int main(int argC, char** argV)
{
    // We have to have 3 parameters
    if (argC != 3)
    {
        showUsage();
        return 1;
    }
    // Try to open the first file for input
    gInFile = fopen(argV[1], "rt");
    if (!gInFile)
    {
        cout << "Could not find input file: " << argV[1] << endl;
        return 1;
    }
    // Try to open the second file for output (truncated)
    gOutFile = fopen(argV[2], "wt+");
    if (!gOutFile)
    {
        cout << "Could not create output file: " << argV[1] << endl;
        return 1;
    }
    //
    //  This will parse the file and load the table. It will also look for
    //  a couple of key fields in the file header and store that data into
    //  globals.
    //
    loadTable();
    // If we didn't get any table entries, then give up
    if (!gMainTableSz)
    {
        cout << "No translation table entries were found in the file" << endl;
        return 1;
    }
    //
    //  Ok, we got the data loaded. Now lets output the tables. This method
    //  spit out both tables to the output file, in a format ready to be
    //  incorporated directly into the source code.
    //
    formatSBTables();
    // Close our files
    fclose(gInFile);
    fclose(gOutFile);
    return 0;
}