tclUtf.c
上传用户:rrhhcc
上传日期:2015-12-11
资源大小:54129k
文件大小:46k
- /*
- * tclUtf.c --
- *
- * Routines for manipulating UTF-8 strings.
- *
- * Copyright (c) 1997-1998 Sun Microsystems, Inc.
- *
- * See the file "license.terms" for information on usage and redistribution
- * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
- *
- * RCS: @(#) $Id: tclUtf.c,v 1.30.2.3 2005/09/07 14:35:56 dgp Exp $
- */
- #include "tclInt.h"
- /*
- * Include the static character classification tables and macros.
- */
- #include "tclUniData.c"
- /*
- * The following macros are used for fast character category tests. The
- * x_BITS values are shifted right by the category value to determine whether
- * the given category is included in the set.
- */
/* Letter categories: upper, lower, title case, modifier and other letters. */
#define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
    | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER))

/* Decimal digits only. */
#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)

/* Space, line and paragraph separators. */
#define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
    | (1 << PARAGRAPH_SEPARATOR))

/* Connector punctuation only. */
#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)

/*
 * Letters, digits, separators, marks, the remaining number categories,
 * all punctuation categories and all symbol categories.
 */
#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \
    (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
    (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
    (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \
    (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
    (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
    (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \
    (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
    (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))

/* All punctuation categories. */
#define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
    (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
    (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
    (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))

/*
 * Unicode characters less than this value are represented by themselves
 * in UTF-8 strings.
 */

#define UNICODE_SELF 0x80
- /*
- * The following structures are used when mapping between Unicode (UCS-2)
- * and UTF-8.
- */
- static CONST unsigned char totalBytes[256] = {
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
- #if TCL_UTF_MAX > 3
- 4,4,4,4,4,4,4,4,
- #else
- 1,1,1,1,1,1,1,1,
- #endif
- #if TCL_UTF_MAX > 4
- 5,5,5,5,
- #else
- 1,1,1,1,
- #endif
- #if TCL_UTF_MAX > 5
- 6,6,6,6
- #else
- 1,1,1,1
- #endif
- };
- /*
- * Procedures used only in this module.
- */
- static int UtfCount _ANSI_ARGS_((int ch));
- /*
- *---------------------------------------------------------------------------
- *
- * UtfCount --
- *
- * Find the number of bytes in the Utf character "ch".
- *
- * Results:
- * The return values is the number of bytes in the Utf character "ch".
- *
- * Side effects:
- * None.
- *
- *---------------------------------------------------------------------------
- */
-
- INLINE static int
- UtfCount(ch)
- int ch; /* The Tcl_UniChar whose size is returned. */
- {
- if ((ch > 0) && (ch < UNICODE_SELF)) {
- return 1;
- }
- if (ch <= 0x7FF) {
- return 2;
- }
- if (ch <= 0xFFFF) {
- return 3;
- }
- #if TCL_UTF_MAX > 3
- if (ch <= 0x1FFFFF) {
- return 4;
- }
- if (ch <= 0x3FFFFFF) {
- return 5;
- }
- if (ch <= 0x7FFFFFFF) {
- return 6;
- }
- #endif
- return 3;
- }
- /*
- *---------------------------------------------------------------------------
- *
- * Tcl_UniCharToUtf --
- *
- * Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
- * provided buffer. Equivalent to Plan 9 runetochar().
- *
- * Results:
- * The return values is the number of bytes in the buffer that
- * were consumed.
- *
- * Side effects:
- * None.
- *
- *---------------------------------------------------------------------------
- */
-
- INLINE int
- Tcl_UniCharToUtf(ch, str)
- int ch; /* The Tcl_UniChar to be stored in the
- * buffer. */
- char *str; /* Buffer in which the UTF-8 representation
- * of the Tcl_UniChar is stored. Buffer must
- * be large enough to hold the UTF-8 character
- * (at most TCL_UTF_MAX bytes). */
- {
- if ((ch > 0) && (ch < UNICODE_SELF)) {
- str[0] = (char) ch;
- return 1;
- }
- if (ch >= 0) {
- if (ch <= 0x7FF) {
- str[1] = (char) ((ch | 0x80) & 0xBF);
- str[0] = (char) ((ch >> 6) | 0xC0);
- return 2;
- }
- if (ch <= 0xFFFF) {
- three:
- str[2] = (char) ((ch | 0x80) & 0xBF);
- str[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
- str[0] = (char) ((ch >> 12) | 0xE0);
- return 3;
- }
- #if TCL_UTF_MAX > 3
- if (ch <= 0x1FFFFF) {
- str[3] = (char) ((ch | 0x80) & 0xBF);
- str[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
- str[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
- str[0] = (char) ((ch >> 18) | 0xF0);
- return 4;
- }
- if (ch <= 0x3FFFFFF) {
- str[4] = (char) ((ch | 0x80) & 0xBF);
- str[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
- str[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
- str[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
- str[0] = (char) ((ch >> 24) | 0xF8);
- return 5;
- }
- if (ch <= 0x7FFFFFFF) {
- str[5] = (char) ((ch | 0x80) & 0xBF);
- str[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
- str[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
- str[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
- str[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
- str[0] = (char) ((ch >> 30) | 0xFC);
- return 6;
- }
- #endif
- }
- ch = 0xFFFD;
- goto three;
- }
- /*
- *---------------------------------------------------------------------------
- *
- * Tcl_UniCharToUtfDString --
- *
- * Convert the given Unicode string to UTF-8.
- *
- * Results:
- * The return value is a pointer to the UTF-8 representation of the
- * Unicode string. Storage for the return value is appended to the
- * end of dsPtr.
- *
- * Side effects:
- * None.
- *
- *---------------------------------------------------------------------------
- */
-
- char *
- Tcl_UniCharToUtfDString(wString, numChars, dsPtr)
- CONST Tcl_UniChar *wString; /* Unicode string to convert to UTF-8. */
- int numChars; /* Length of Unicode string in Tcl_UniChars
- * (must be >= 0). */
- Tcl_DString *dsPtr; /* UTF-8 representation of string is
- * appended to this previously initialized
- * DString. */
- {
- CONST Tcl_UniChar *w, *wEnd;
- char *p, *string;
- int oldLength;
- /*
- * UTF-8 string length in bytes will be <= Unicode string length *
- * TCL_UTF_MAX.
- */
- oldLength = Tcl_DStringLength(dsPtr);
- Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX);
- string = Tcl_DStringValue(dsPtr) + oldLength;
- p = string;
- wEnd = wString + numChars;
- for (w = wString; w < wEnd; ) {
- p += Tcl_UniCharToUtf(*w, p);
- w++;
- }
- Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
- return string;
- }
- /*
- *---------------------------------------------------------------------------
- *
- * Tcl_UtfToUniChar --
- *
- * Extract the Tcl_UniChar represented by the UTF-8 string. Bad
- * UTF-8 sequences are converted to valid Tcl_UniChars and processing
- * continues. Equivalent to Plan 9 chartorune().
- *
- * The caller must ensure that the source buffer is long enough that
- * this routine does not run off the end and dereference non-existent
- * memory looking for trail bytes. If the source buffer is known to
- * be '\0' terminated, this cannot happen. Otherwise, the caller
- * should call Tcl_UtfCharComplete() before calling this routine to
- * ensure that enough bytes remain in the string.
- *
- * Results:
- * *chPtr is filled with the Tcl_UniChar, and the return value is the
- * number of bytes from the UTF-8 string that were consumed.
- *
- * Side effects:
- * None.
- *
- *---------------------------------------------------------------------------
- */
-
- int
- Tcl_UtfToUniChar(str, chPtr)
- register CONST char *str; /* The UTF-8 string. */
- register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented
- * by the UTF-8 string. */
- {
- register int byte;
-
- /*
- * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
- */
- byte = *((unsigned char *) str);
- if (byte < 0xC0) {
- /*
- * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
- * Also treats