ICUData.cpp
上传用户:zhuqijet
上传日期:2013-06-25
资源大小:10074k
文件大小:18k
- /*
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 1999-2000 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Xerces" and "Apache Software Foundation" must
- * not be used to endorse or promote products derived from this
- * software without prior written permission. For written
- * permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * nor may "Apache" appear in their name, without prior written
- * permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation, and was
- * originally based on software copyright (c) 1999, International
- * Business Machines, Inc., http://www.ibm.com . For more information
- * on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
- /*
- * $Log: ICUData.cpp,v $
- * Revision 1.1 2000/03/17 23:58:00 roddey
- * New utility for munging ICU UCM files and spitting out tables for
- * our intrinsic encoders.
- *
- */
- // ---------------------------------------------------------------------------
- // This program is designed to parse a standard ICU .UCM file and spit out
- // a C++ code fragment that represents the tables required by the intrinsic
- // XML parser transcoders.
- //
- // The file format is pretty simple and this program is not intended to be
- // industrial strength by any means. Its use by anyone but the author is
- // at the user's own risk.
- //
- // The code looks for the min/max bytes per character to know what kind of
- // table to spit out, but for now only handles single char sets.
- // ---------------------------------------------------------------------------
- // ---------------------------------------------------------------------------
- // Includes
- // ---------------------------------------------------------------------------
- #include <ctype.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <iostream.h>
- #include <string.h>
- // ---------------------------------------------------------------------------
- // Const data
- // ---------------------------------------------------------------------------
// Capacity, in records, of the in-memory translation table that is filled
// from the input file.
static const unsigned int gMaxInRecs = 1024U;
- // ---------------------------------------------------------------------------
- // Local data types
- // ---------------------------------------------------------------------------
// One mapping read from the character table of the input file: a Unicode
// code point paired with the single byte code page value it maps to.
struct XlatRec
{
    unsigned short  uniVal;     // Unicode code point (16 bit)
    unsigned char   cpVal;      // Code page byte value
};
- // ---------------------------------------------------------------------------
- // Local data
- //
- // gInFile
- // gOutFile
- // These are the file stream for the input UCM file and the output file
- // that we write the C++ code to.
- //
- // fLineNum
- // Used to track the current line number in the source file, for error
- // reporting.
- //
- // gMainTable
- // gMainTableSz
- // This is the table that is filled in from the original source document.
- // We don't know how big it will be, but it is not likely to be much more
- // than 300 entries or so (256 output code points with some multiply
- // mapped Unicode code points.) So we make it extra large and watch for
- // possible overflow.
- //
- // The size value is bumped up as we load entries into it during the
- // parse of the file.
- //
- // gMaxChar
- // gMinChar
- // The min/max chars that are used to represent a character. These are
- // read from the header of the input file.
- //
- // gRepChar
- // The replacement character to be used. This is read from the header of
- // the input file.
- // ---------------------------------------------------------------------------
- static FILE* gInFile;
- static FILE* gOutFile;
- static unsigned int fLineNum;
- static XlatRec gMainTable[gMaxInRecs];
- static unsigned int gMainTableSz = 0;
- static unsigned int gMaxChar;
- static unsigned int gMinChar;
- static unsigned char gRepChar = 1;
- // ---------------------------------------------------------------------------
- // Local functions
- // ---------------------------------------------------------------------------
- static unsigned int getLine( char* const toFill
- , const unsigned int maxChars
- , const bool eofOk = false)
- {
- while (true)
- {
- if (!fgets(toFill, maxChars, gInFile))
- {
- if (feof(gInFile))
- {
- if (eofOk)
- return ~0UL;
- else
- cout << "Unexpected end of input at line: " << fLineNum << endl;
- }
- else
- {
- cout << "Error processing input at line: " << fLineNum << endl;
- exit(1);
- }
- }
- fLineNum++;
- //
- // If its not a comment, then break out
- //
- if (toFill[0] != '#')
- break;
- }
- //
- // There could be a trailing comment on this line, so lets get rid
- // of it. Search for a # char and put a null there.
- //
- char* endPtr = toFill;
- while (*endPtr && (*endPtr != '#'))
- endPtr++;
- if (*endPtr == '#')
- *endPtr = 0;
- // Strip trailing whitespace
- endPtr = toFill + (strlen(toFill) - 1);
- while (isspace(*endPtr))
- endPtr--;
- *(endPtr + 1) = 0;
- // And return the count of chars we got
- return strlen(toFill);
- }
- static unsigned int extractVal(char* const srcStr)
- {
- char* srcPtr = srcStr;
- // Run forward to the first non-space
- while (isspace(*srcPtr))
- srcPtr++;
- if (!*srcPtr)
- {
- cout << "Invalid numeric value on line: " << fLineNum << endl;
- exit(1);
- }
- //
- // If it starts with , then its a hex value in the form xXX. Else its
- // just a decimal value.
- //
- unsigned int retVal;
- char* endPtr;
- if (*srcPtr == '\')
- {
- // Skip the \x and interpret as a hex value
- srcPtr += 2;
- retVal = (unsigned int)strtoul(srcPtr, &endPtr, 16);
- }
- else
- {
- retVal = (unsigned int)strtoul(srcPtr, &endPtr, 10);
- }
- // We should have translated up to the end of the string
- if (*endPtr)
- {
- cout << "Invalid numeric value on line: " << fLineNum << endl;
- exit(1);
- }
- return retVal;
- }
- static void loadTable()
- {
- //
- // Just loop, reading lines at a time, until we either find the start
- // of the character table or hit the end of the file. Along the way, we
- // should see a few header values that we store away.
- //
- const unsigned int tmpBufSz = 2048;
- char tmpBuf[tmpBufSz - 1];
- while (getLine(tmpBuf, tmpBufSz))
- {
- //
- // Check for one of the special values we are intersted int. If
- // its CHARMAP, then we fall out of this loop.
- //
- if (!strcmp(tmpBuf, "CHARMAP"))
- break;
- if (!strncmp(tmpBuf, "<mb_cur_max>", 12))
- {
- gMaxChar = extractVal(&tmpBuf[12]);
- }
- else if (!strncmp(tmpBuf, "<mb_cur_min>", 12))
- {
- gMinChar = extractVal(&tmpBuf[12]);
- }
- else if (!strncmp(tmpBuf, "<subchar>", 9))
- {
- gRepChar = (char)extractVal(&tmpBuf[9]);
- }
- }
- //
- // Ok, now we just run till we hit the "END CHARMAP" line. Each entry
- // will be in the form:
- //
- // <UXXXX> xXX
- //
- // Where X is a hex number.
- //
- char* endPtr;
- while (getLine(tmpBuf, tmpBufSz))
- {
- // Watch for the end of table
- if (!strcmp(tmpBuf, "END CHARMAP"))
- break;
- // The absolute minium it could be is 12 chars
- if (strlen(tmpBuf) < 12)
- {
- cout << "Line " << fLineNum << " is too short to hold a valid entry"
- << endl;
- exit(1);
- }
- // Make sure the first token meets the criteria
- if ((tmpBuf[0] != '<')
- || (tmpBuf[1] != 'U')
- || (tmpBuf[6] != '>'))
- {
- cout << "Line " << fLineNum << " has a badly formed Unicode value"
- << endl;
- exit(1);
- }
- //
- // Looks reasonable so lets try to convert it. We can play tricks
- // with this buffer, so put a null over the > char.
- //
- tmpBuf[6] = 0;
- const unsigned int uniVal = strtoul(&tmpBuf[2], &endPtr, 16);
- if (*endPtr)
- {
- cout << "Invalid Unicode value on line " << fLineNum << endl;
- exit(1);
- }
- //
- // Ok, lets search over to the second token. We have to find a \
- // character.
- //
- char* srcPtr = &tmpBuf[7];
- while (*srcPtr && (*srcPtr != '\'))
- srcPtr++;
- // If we never found it, its in error
- if (!*srcPtr)
- {
- cout << "Never found second token on line " << fLineNum << endl;
- exit(1);
- }
- // Try to translate it
- srcPtr += 2;
- const unsigned int cpVal = strtoul(srcPtr, &endPtr, 16);
- if (*endPtr)
- {
- cout << "Invalid code page value on line " << fLineNum << endl;
- exit(1);
- }
- // Make sure that the values are within range
- if (uniVal > 0xFFFF)
- {
- cout << "Unicode value is too big on line " << fLineNum << endl;
- exit(1);
- }
- if (cpVal > 0xFF)
- {
- cout << "Code page value is too big on line " << fLineNum << endl;
- exit(1);
- }
- // Looks reasonable, so add a new entry to the global table
- gMainTable[gMainTableSz].uniVal = (unsigned short)uniVal;
- gMainTable[gMainTableSz].cpVal = (unsigned char)cpVal;
- gMainTableSz++;
- }
- }
- int compFuncTo(const void* p1, const void* p2)
- {
- const XlatRec* rec1 = (const XlatRec*)p1;
- const XlatRec* rec2 = (const XlatRec*)p2;
- return (int)rec1->uniVal - (int)rec2->uniVal;
- }
- int compFuncFrom(const void* p1, const void* p2)
- {
- const XlatRec* rec1 = (const XlatRec*)p1;
- const XlatRec* rec2 = (const XlatRec*)p2;
- //
- // Since there can be multiple Unicode chars that map to a single
- // code page char, we have to handle the situationw here they are
- // equal specially. If the code page vals are equal, then the one
- // with the smaller Unicode code point is considered smaller.
- //
- if (rec1->cpVal == rec2->cpVal)
- return (int)rec1->uniVal - (int)rec2->uniVal;
- // Else use the code page value for sorting
- return (int)rec1->cpVal - (int)rec2->cpVal;
- }
- static void formatSBTables()
- {
- // For now, only handle single byte char sets
- if ((gMinChar != 1) || (gMaxChar != 1))
- {
- cout << "formatSBTables can only handle single byte encodings"
- << endl;
- exit(1);
- }
- //
- // First, we want to sort the table by the code page value field. This
- // is the order required for the 'from' table to convert from the code
- // page to the internal Unicode format.
- //
- qsort(gMainTable, gMainTableSz, sizeof(gMainTable[0]), compFuncFrom);
- //
- // Now spit out the header for the table. This is the same for all
- // of them, since they are static to the file and can just all have
- // the same name.
- //
- fprintf
- (
- gOutFile
- , "static const XMLCh gFromTable[256] =n{n "
- );
- //
- // Now for each unique entry in the cp value field, we want to put out
- // the Unicode value for that entry. Since we sorted them such that
- // dups have the one with the smaller Unicode value in the lower index,
- // we always hit the desired value first, and then can just skip over
- // a duplicate.
- //
- unsigned int curValue = 0;
- unsigned int index;
- for (index = 0; index < gMainTableSz; index++)
- {
- if (curValue)
- {
- if (!(curValue % 8))
- fprintf(gOutFile, "n , ");
- else
- fprintf(gOutFile, ", ");
- }
- if (curValue == gMainTable[index].cpVal)
- {
- fprintf(gOutFile, "0x%04X", (unsigned int)gMainTable[index].uniVal);
- // If there is a dump, then skip it
- if (index < gMainTableSz)
- {
- if (gMainTable[index + 1].cpVal == curValue)
- index++;
- }
- }
- else if (curValue < gMainTable[index].cpVal)
- {
- fprintf(gOutFile, "0xFFFF");
- }
- else
- {
- // Screwed up
- cout << "Current value got above target valuen" << endl;
- exit(1);
- }
- curValue++;
- // If the current value goes over 256, we are in trouble
- if (curValue > 256)
- {
- cout << "The code page value cannot be > 256 in SB moden" << endl;
- exit(1);
- }
- }
- // And print the trailer for this table
- fprintf(gOutFile, "n};nn");
- //
- // Now lets sort by the Unicode value field. This sort is used for
- // the 'to' table. The Unicode value is found by binary search and
- // used to map to the right output encoding value.
- //
- qsort(gMainTable, gMainTableSz, sizeof(gMainTable[0]), compFuncTo);
- // Output the table ehader for this one
- fprintf
- (
- gOutFile
- , "static const XMLTransService::TransRec gToTable[] =n{n "
- );
- for (index = 0; index < gMainTableSz; index++)
- {
- if (index)
- {
- if (!(index % 4))
- fprintf(gOutFile, "n , ");
- else
- fprintf(gOutFile, ", ");
- }
- fprintf
- (
- gOutFile
- , "{ 0x%04X, 0x%02X }"
- , (unsigned int)gMainTable[index].uniVal
- , (unsigned int)gMainTable[index].cpVal
- );
- }
- // Print the trailer for this table
- fprintf(gOutFile, "n};n");
- // And print out the table size constant
- fprintf(gOutFile, "static const unsigned int gToTableSz = %d;n", gMainTableSz);
- }
// ---------------------------------------------------------------------------
//  showUsage
//
//  Prints the expected command line to standard output, followed by a
//  blank line.
// ---------------------------------------------------------------------------
static void showUsage()
{
    cout << "ICUData inputUCMfile outputfile\n" << endl;
}
- // ---------------------------------------------------------------------------
- // The parameters are:
- //
- // argV[1] = The source UCM file
- // argV[2] = The path to the output file
- // ---------------------------------------------------------------------------
- int main(int argC, char** argV)
- {
- // We have to have 3 parameters
- if (argC != 3)
- {
- showUsage();
- return 1;
- }
- // Try to open the first file for input
- gInFile = fopen(argV[1], "rt");
- if (!gInFile)
- {
- cout << "Could not find input file: " << argV[1] << endl;
- return 1;
- }
- // Try to open the second file for output (truncated)
- gOutFile = fopen(argV[2], "wt+");
- if (!gOutFile)
- {
- cout << "Could not create output file: " << argV[1] << endl;
- return 1;
- }
- //
- // This will parse the file and load the table. It will also look for
- // a couple of key fields in the file header and store that data into
- // globals.
- //
- loadTable();
- // If we didn't get any table entries, then give up
- if (!gMainTableSz)
- {
- cout << "No translation table entries were found in the file" << endl;
- return 1;
- }
- //
- // Ok, we got the data loaded. Now lets output the tables. This method
- // spit out both tables to the output file, in a format ready to be
- // incorporated directly into the source code.
- //
- formatSBTables();
- // Close our files
- fclose(gInFile);
- fclose(gOutFile);
- return 0;
- }