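/*
 * wsutf8.c
 * Resource name: gateway-1.2.1 [click to view]
 * Uploaded by: gzpyjq
 * Upload date: 2013-01-31
 * Resource size: 1852k
 * File size: 8k
 * Source category: Mobile WAP programming
 * Development platform: WINDOWS
 */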
/*
 *
 * wsutf8.c
 *
 * Author: Markku Rossi <mtr@iki.fi>
 *
 * Copyright (c) 1999-2000 WAPIT OY LTD.
 *               All rights reserved.
 *
 * Functions to manipulate UTF-8 encoded strings.
 *
 * Specification: RFC-2279
 *
 */
#include "wsint.h"
/********************* Types and definitions ****************************/

/* Masks to determine the UTF-8 encoding of an ISO 10646 character.
   Each mask selects the bits that must all be zero for the character
   to fit into the corresponding encoding length.

   The masks are built by complementing an `unsigned long' value so
   that they cover every high-order bit of the argument, whatever the
   width of `unsigned long' is.  A plain 32-bit hexadecimal constant
   would be zero-extended when combined with a wider `unsigned long'
   and values above 0xffffffff could then be classified as encodable. */
#define WS_UTF8_ENC_1_M (~0x7fUL)        /*  7 data bits */
#define WS_UTF8_ENC_2_M (~0x7ffUL)       /* 11 data bits */
#define WS_UTF8_ENC_3_M (~0xffffUL)      /* 16 data bits */
#define WS_UTF8_ENC_4_M (~0x1fffffUL)    /* 21 data bits */
#define WS_UTF8_ENC_5_M (~0x3ffffffUL)   /* 26 data bits */
#define WS_UTF8_ENC_6_M (~0x7fffffffUL)  /* 31 data bits */

/* The high-order bits.  This array can be indexed with the number of
   bytes in the encoding to get the initialization mask for the
   high-order bits of the first byte.  The table is read-only. */
static const unsigned char utf8_hibits[7] =
{
    0x00,       /* unused */
    0x00,       /* 1 byte */
    0xc0,       /* 2 bytes */
    0xe0,       /* 3 bytes */
    0xf0,       /* 4 bytes */
    0xf8,       /* 5 bytes */
    0xfc,       /* 6 bytes */
};

/* The high-order bits for continuation bytes (10xxxxxx). */
#define WS_UTF8_ENC_C_BITS 0x80

/* Mask to get the continuation bytes from the character (00111111). */
#define WS_UTF8_CONT_DATA_MASK 0x3f

/* Determine the encoding type of the ISO 10646 character `ch'.  The
   argument `ch' must be given as `unsigned long'.  The macro returns
   0 if the value `ch' can not be encoded as UTF-8 and the number of
   bytes in the encoded value otherwise.

   Note that `ch' is evaluated several times; pass an expression
   without side effects. */
#define WS_UTF8_ENC_TYPE(ch)                            \
    (((ch) & WS_UTF8_ENC_1_M) == 0                      \
     ? 1                                                \
     : (((ch) & WS_UTF8_ENC_2_M) == 0                   \
        ? 2                                             \
        : (((ch) & WS_UTF8_ENC_3_M) == 0                \
           ? 3                                          \
           : (((ch) & WS_UTF8_ENC_4_M) == 0             \
              ? 4                                       \
              : (((ch) & WS_UTF8_ENC_5_M) == 0          \
                 ? 5                                    \
                 : (((ch) & WS_UTF8_ENC_6_M) == 0       \
                    ? 6                                 \
                    : 0))))))
/* Masks and values to determine the length of an UTF-8 encoded
   character.  A first byte `b' starts an N byte encoding when
   (b & WS_UTF8_DEC_N_M) == WS_UTF8_DEC_N_V. */
#define WS_UTF8_DEC_1_M 0x80
#define WS_UTF8_DEC_2_M 0xe0
#define WS_UTF8_DEC_3_M 0xf0
#define WS_UTF8_DEC_4_M 0xf8
#define WS_UTF8_DEC_5_M 0xfc
#define WS_UTF8_DEC_6_M 0xfe

#define WS_UTF8_DEC_1_V 0x00
#define WS_UTF8_DEC_2_V 0xc0
#define WS_UTF8_DEC_3_V 0xe0
#define WS_UTF8_DEC_4_V 0xf0
#define WS_UTF8_DEC_5_V 0xf8
#define WS_UTF8_DEC_6_V 0xfc

/* Masks to get the data bits from the first byte of an UTF-8 encoded
   character.  This array can be indexed with the number of bytes in
   the encoding.  The table is read-only. */
static const unsigned char utf8_hidata_masks[7] =
{
    0x00,       /* unused */
    0x7f,       /* 1 byte */
    0x1f,       /* 2 bytes */
    0x0f,       /* 3 bytes */
    0x07,       /* 4 bytes */
    0x03,       /* 5 bytes */
    0x01,       /* 6 bytes */
};

/* The mask and the value of the continuation bytes. */
#define WS_UTF8_DEC_C_M 0xc0
#define WS_UTF8_DEC_C_V 0x80

/* Determine how many bytes the UTF-8 encoding uses by investigating
   the first byte `b'.  The argument `b' must be given as `unsigned
   char'.  The macro returns 0 if the byte `b' is not a valid UTF-8
   first byte (this includes continuation bytes, 10xxxxxx, and the
   values 0xfe and 0xff).

   Note that `b' is evaluated several times; pass an expression
   without side effects. */
#define WS_UTF8_DEC_TYPE(b)                                     \
    (((b) & WS_UTF8_DEC_1_M) == WS_UTF8_DEC_1_V                 \
     ? 1                                                        \
     : (((b) & WS_UTF8_DEC_2_M) == WS_UTF8_DEC_2_V              \
        ? 2                                                     \
        : (((b) & WS_UTF8_DEC_3_M) == WS_UTF8_DEC_3_V           \
           ? 3                                                  \
           : (((b) & WS_UTF8_DEC_4_M) == WS_UTF8_DEC_4_V        \
              ? 4                                               \
              : (((b) & WS_UTF8_DEC_5_M) == WS_UTF8_DEC_5_V     \
                 ? 5                                            \
                 : (((b) & WS_UTF8_DEC_6_M) == WS_UTF8_DEC_6_V  \
                    ? 6                                         \
                    : 0))))))

/* Predicate to check whether the `unsigned char' byte `b' is a
   continuation byte. */
#define WS_UTF8_DEC_C_P(b) (((b) & WS_UTF8_DEC_C_M) == WS_UTF8_DEC_C_V)
- /********************* Global functions *********************************/
- WsUtf8String *ws_utf8_alloc()
- {
- return ws_calloc(1, sizeof(WsUtf8String));
- }
- void ws_utf8_free(WsUtf8String *string)
- {
- if (string == NULL)
- return;
- ws_free(string->data);
- ws_free(string);
- }
- int ws_utf8_append_char(WsUtf8String *string, unsigned long ch)
- {
- unsigned char *d;
- unsigned int num_bytes = WS_UTF8_ENC_TYPE(ch);
- unsigned int len, i;
- if (num_bytes == 0)
- ws_fatal("ws_utf8_append_char(): 0x%lx is not a valid UTF-8 character",
- ch);
- d = ws_realloc(string->data, string->len + num_bytes);
- if (d == NULL)
- return 0;
- len = string->len;
- /* Encode the continuation bytes (n > 1). */
- for (i = num_bytes - 1; i > 0; i--) {
- d[len + i] = WS_UTF8_ENC_C_BITS;
- d[len + i] |= ch & WS_UTF8_CONT_DATA_MASK;
- ch >>= 6;
- }
- /* And continue the first byte. */
- d[len] = utf8_hibits[num_bytes];
- d[len] |= ch;
- string->data = d;
- string->len += num_bytes;
- string->num_chars++;
- return 1;
- }
/* Check that the `len' bytes at `data' are structurally valid UTF-8:
   every character must start with a valid first byte, must not run
   past the end of the buffer, and must be followed by the right
   number of continuation bytes.

   Returns 1 if the data is valid and 0 otherwise.  On success, and if
   `strlen_return' is not NULL, the number of characters found is
   stored in `*strlen_return'. */
int ws_utf8_verify(const unsigned char *data, size_t len,
                   size_t *strlen_return)
{
    const unsigned char *cursor = data;
    size_t remaining = len;
    size_t count = 0;

    for (; remaining > 0; count++) {
        unsigned int seq_len = WS_UTF8_DEC_TYPE(*cursor);
        unsigned int k;

        /* Reject an invalid first byte and a character that would
           extend past the end of the data. */
        if (seq_len == 0 || remaining < seq_len)
            return 0;

        /* All bytes after the first one must be continuation bytes. */
        for (k = 1; k < seq_len; k++) {
            if (!WS_UTF8_DEC_C_P(cursor[k]))
                return 0;
        }

        cursor += seq_len;
        remaining -= seq_len;
    }

    if (strlen_return != NULL)
        *strlen_return = count;

    return 1;
}
- int ws_utf8_set_data(WsUtf8String *string, const unsigned char *data,
- size_t len)
- {
- size_t num_chars;
- if (!ws_utf8_verify(data, len, &num_chars))
- /* Malformed data. */
- return 0;
- /* Init `string' to empty. */
- ws_free(string->data);
- string->data = NULL;
- string->len = 0;
- string->num_chars = 0;
- /* Set the new data. */
- string->data = ws_memdup(data, len);
- if (string->data == NULL)
- return 0;
- string->len = len;
- string->num_chars = num_chars;
- return 1;
- }
- int ws_utf8_get_char(const WsUtf8String *string, unsigned long *ch_return,
- size_t *posp)
- {
- size_t pos = *posp;
- unsigned int num_bytes, i;
- unsigned char *data;
- unsigned long ch;
- if (pos < 0 || pos >= string->len)
- /* Index out range. */
- return 0;
- data = string->data + pos;
- num_bytes = WS_UTF8_DEC_TYPE(*data);
- if (num_bytes == 0)
- /* Invalid position. */
- return 0;
- if (pos + num_bytes > string->len)
- /* Truncated data. */
- return 0;
- /* Get the first byte. */
- ch = data[0] & utf8_hidata_masks[num_bytes];
- /* Add the continuation bytes. */
- for (i = 1; i < num_bytes; i++) {
- ch <<= 6;
- ch |= data[i] & WS_UTF8_CONT_DATA_MASK;
- }
- *ch_return = ch;
- *posp = pos + num_bytes;
- return 1;
- }
- unsigned char *ws_utf8_to_latin1(const WsUtf8String *string,
- unsigned char unknown_char,
- size_t *len_return)
- {
- unsigned char *cstr;
- size_t i;
- size_t pos = 0;
- if (string == NULL)
- return NULL;
- cstr = ws_malloc(string->num_chars + 1);
- if (cstr == NULL)
- return NULL;
- for (i = 0; i < string->num_chars; i++) {
- unsigned long ch;
- if (!ws_utf8_get_char(string, &ch, &pos))
- ws_fatal("ws_utf8_to_latin1_cstr(): internal inconsistency");
- if (ch > 0xff)
- cstr[i] = unknown_char;
- else
- cstr[i] = (unsigned char) ch;
- }
- cstr[i] = ' ';
- if (len_return)
- *len_return = string->num_chars;
- return cstr;
- }
/* Release the buffer `data' with ws_free().  Passing NULL is allowed
   and does nothing. */
void ws_utf8_free_data(unsigned char *data)
{
    if (data == NULL)
        return;

    ws_free(data);
}