encoding.c
上传用户:sy_wanhua
上传日期:2013-07-25
资源大小:3048k
文件大小:26k
/*
 * encoding.c : implements the encoding conversion functions needed for XML
 *
 * Related specs:
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
 * [ISO-8859-1]   ISO Latin-1 characters codes.
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
 *                Worldwide Character Encoding -- Version 1.0", Addison-
 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
 *                described in Unicode Technical Report #4.
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
 *                Information Interchange, ANSI X3.4-1986.
 *
 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
 *
 * See Copyright for the status of this software.
 *
 * Daniel.Veillard@w3.org
 */
- #ifdef WIN32
- #include "win32config.h"
- #else
- #include "config.h"
- #endif
- #include <stdio.h>
- #include <string.h>
- #ifdef HAVE_CTYPE_H
- #include <ctype.h>
- #endif
- #ifdef HAVE_STDLIB_H
- #include <stdlib.h>
- #endif
- #include <libxml/encoding.h>
- #include <libxml/xmlmemory.h>
- xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
- xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
/*
 * From rfc2044: encoding of the Unicode values on UTF-8:
 *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
 * 0000 0000-0000 007F   0xxxxxxx
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * I hope we won't use values > 0xFFFF anytime soon !
 */
/**
 * xmlCheckUTF8: Check utf-8 string for legality.
 * @utf: Pointer to putative utf-8 encoded string.
 *
 * Checks @utf for being valid utf-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer utf-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * A NULL @utf is reported as invalid, and so is a continuation byte
 * (10xxxxxx) found where a lead byte is expected.
 *
 * Return value: true (1) if @utf is valid, 0 otherwise.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    const unsigned char *cur = utf;
    unsigned char c;

    if (utf == NULL)
        return(0);

    while ((c = *cur) != 0) {
        if ((c & 0x80) == 0x00) {
            /* 1-byte code: 0xxxxxxx */
            cur += 1;
        } else if ((c & 0xe0) == 0xc0) {
            /* 2-byte code: 110xxxxx 10xxxxxx */
            if ((cur[1] & 0xc0) != 0x80)
                return(0);
            cur += 2;
        } else if ((c & 0xf0) == 0xe0) {
            /*
             * 3-byte code: 1110xxxx 10xxxxxx 10xxxxxx
             * The tests short-circuit, so a byte is only read once the
             * previous one is known to be a (non-NUL) continuation byte.
             */
            if (((cur[1] & 0xc0) != 0x80) ||
                ((cur[2] & 0xc0) != 0x80))
                return(0);
            cur += 3;
        } else if ((c & 0xf8) == 0xf0) {
            /* 4-byte code: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
            if (((cur[1] & 0xc0) != 0x80) ||
                ((cur[2] & 0xc0) != 0x80) ||
                ((cur[3] & 0xc0) != 0x80))
                return(0);
            cur += 4;
        } else {
            /* stray continuation byte, or a lead byte for 5+ byte forms */
            return(0);
        }
    }
    return(1);
}
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out in bytes
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  pointer to the length of @in in bytes
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out.  Bytes below 0x80 are copied unchanged, the others
 * become a two-byte sequence.  *@inlen is only read, never updated.
 *
 * Returns the number of bytes written, or -1 by lack of space or when a
 *     pointer argument is NULL.  A two-byte sequence is never written
 *     partially: if it does not fit, nothing of it is stored.
 */
int
isolat1ToUTF8(unsigned char* out, int outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    unsigned char* outend;
    const unsigned char* inend;
    unsigned char c;

    if ((out == NULL) || (in == NULL) || (inlen == NULL))
        return(-1);

    outend = out + outlen;
    inend = in + *inlen;

    while (in < inend) {
        c = *in++;
        if (c < 0x80) {
            if (out >= outend)
                return(-1);
            *out++ = c;
        } else {
            /* both bytes must fit before either one is stored */
            if (outend - out < 2)
                return(-1);
            *out++ = (unsigned char) (0xC0 | (c >> 6));
            *out++ = (unsigned char) (0x80 | (0x3F & c));
        }
    }
    return((int) (out - outstart));
}
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out in bytes
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  pointer to the length of @in in bytes
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out.  Only one-byte sequences and two-byte sequences
 * whose lead byte is 0xC0..0xC3 can be converted.
 * TODO: UTF8Toisolat1 need a fallback mechanism ...
 *
 * Returns the number of bytes written, or -1 by lack of space (or when a
 *     pointer argument is NULL), or -2 if the transcoding fails (for *in
 *     is not valid utf8 string or the result of transformation can't fit
 *     into the encoding we want).
 *     The value of *@inlen after return is the number of octets consumed
 *     when the return value is not negative, else unpredictable.
 */
int
UTF8Toisolat1(unsigned char* out, int outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    unsigned char* outend;
    const unsigned char* inend;
    unsigned char c;

    if ((out == NULL) || (in == NULL) || (inlen == NULL))
        return(-1);

    outend = out + outlen;
    inend = in + *inlen;

    while (in < inend) {
        c = *in++;
        if (c < 0x80) {
            if (out >= outend)
                return(-1);
            *out++ = c;
        } else if (in == inend) {
            /* a non-ASCII byte ends the block: leave it for the next call */
            *inlen -= 1;
            break;
        } else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) {
            /* a two byte utf-8 sequence that can be encoded as isolat1 */
            if (out >= outend)
                return(-1);
            *out++ = (unsigned char) (((c & 0x03) << 6) | (*in++ & 0x3F));
        } else {
            /* TODO : some should be represented as "&#x____;" */
            return(-2);
        }
    }
    return((int) (out - outstart));
}
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out in bytes
 * @inb:  a pointer to an array of UTF-16LE passed as a byte array
 * @inlenb:  pointer to the length of @inb in bytes
 *
 * Take a block of UTF-16LE code units in and try to convert it to an UTF-8
 * block of chars out.  The input is decoded byte by byte (low byte first),
 * so the result depends neither on the byte order of the host nor on the
 * alignment of @inb.
 *
 * A trailing odd byte, or a high surrogate whose partner is not in the
 * block yet, is left unconsumed and *@inlenb is reduced accordingly.
 *
 * Returns the number of bytes written, or -1 by lack of space (or when a
 *     pointer argument is NULL), or -2 if the transcoding fails (a high
 *     surrogate is not followed by a low surrogate).
 *     A UTF-8 sequence is never written partially.
 *     The value of *@inlenb after return is the number of octets consumed
 *     when the return value is not negative, else unpredictable.
 */
int
UTF16LEToUTF8(unsigned char* out, int outlen,
              const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int need, shift;

    if ((out == NULL) || (inb == NULL) || (inlenb == NULL))
        return(-1);

    outend = out + outlen;
    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = in + *inlenb;

    while (in < inend) {
        c = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {
                /* the low surrogate has not arrived yet */
                (*inlenb) -= 2;
                break;
            }
            d = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
            in += 2;
            if ((d & 0xFC00) != 0xDC00)
                return(-2);
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* c is now a single UCS-4 value; size its UTF-8 form first */
        if (c < 0x80)
            need = 1;
        else if (c < 0x800)
            need = 2;
        else if (c < 0x10000)
            need = 3;
        else
            need = 4;
        if (outend - out < need)
            return(-1);

        if (need == 1)
            *out++ = (unsigned char) c;
        else if (need == 2)
            *out++ = (unsigned char) (0xC0 | ((c >> 6) & 0x1F));
        else if (need == 3)
            *out++ = (unsigned char) (0xE0 | ((c >> 12) & 0x0F));
        else
            *out++ = (unsigned char) (0xF0 | ((c >> 18) & 0x07));

        /* continuation bytes, most significant 6-bit group first */
        for (shift = (need - 2) * 6; shift >= 0; shift -= 6)
            *out++ = (unsigned char) (0x80 | ((c >> shift) & 0x3F));
    }
    return((int) (out - outstart));
}
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb in bytes
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  pointer to the length of @in in bytes
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out.  The output is stored byte by byte (low byte
 * first), so it depends neither on the byte order of the host nor on the
 * alignment of @outb.
 * TODO: UTF8ToUTF16LE need a fallback mechanism ...
 *
 * When the block ends in the middle of a multi-byte sequence, the
 * conversion stops before that sequence and *@inlen is set to the number
 * of bytes actually consumed; otherwise *@inlen is left untouched.
 *
 * Returns the number of bytes written (two per 16-bit unit), or -1 by
 *     lack of space (or when a pointer argument is NULL), or -2 if the
 *     transcoding failed (malformed UTF-8 or a value above 0x10FFFF).
 */
int
UTF8ToUTF16LE(unsigned char* outb, int outlen,
              const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* inend;
    const unsigned char* seq;        /* first byte of the current sequence */
    unsigned int c, d, hi, lo;
    int trailing;

    if ((outb == NULL) || (in == NULL) || (inlen == NULL))
        return(-1);

    /* only whole 16-bit units can be stored */
    outend = outb + (outlen / 2) * 2;
    inend = in + *inlen;

    while (in < inend) {
        seq = in;
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0)
            return(-2);    /* trailing byte in leading position */
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else
            return(-2);    /* no chance for this in UTF-16 */

        if (inend - in < trailing) {
            /* incomplete sequence: give the lead byte back as well */
            *inlen = (int) (seq - instart);
            break;
        }

        for ( ; trailing > 0; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                return(-2);    /* malformed input, not a space problem */
            c = (c << 6) | (d & 0x3F);
        }

        /* c is now a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                return(-1);
            *out++ = (unsigned char) (c & 0xFF);
            *out++ = (unsigned char) ((c >> 8) & 0xFF);
        } else if (c < 0x110000) {
            if (outend - out < 4)
                return(-1);
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) (hi & 0xFF);
            *out++ = (unsigned char) ((hi >> 8) & 0xFF);
            *out++ = (unsigned char) (lo & 0xFF);
            *out++ = (unsigned char) ((lo >> 8) & 0xFF);
        } else {
            return(-2);    /* beyond the range UTF-16 can express */
        }
    }
    return((int) (out - outb));
}
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out in bytes
 * @inb:  a pointer to an array of UTF-16BE passed as a byte array
 * @inlenb:  pointer to the length of @inb in bytes
 *
 * Take a block of UTF-16BE code units in and try to convert it to an UTF-8
 * block of chars out.  The input is decoded byte by byte (high byte
 * first), so the result depends neither on the byte order of the host nor
 * on the alignment of @inb.
 *
 * A trailing odd byte, or a high surrogate whose partner is not in the
 * block yet, is left unconsumed and *@inlenb is reduced accordingly.
 *
 * Returns the number of bytes written, or -1 by lack of space (or when a
 *     pointer argument is NULL), or -2 if the transcoding fails (a high
 *     surrogate is not followed by a low surrogate).
 *     A UTF-8 sequence is never written partially.
 *     The value of *@inlenb after return is the number of octets consumed
 *     when the return value is not negative, else unpredictable.
 */
int
UTF16BEToUTF8(unsigned char* out, int outlen,
              const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int need, shift;

    if ((out == NULL) || (inb == NULL) || (inlenb == NULL))
        return(-1);

    outend = out + outlen;
    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = in + *inlenb;

    while (in < inend) {
        c = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {
                /* the low surrogate has not arrived yet */
                (*inlenb) -= 2;
                break;
            }
            d = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
            in += 2;
            if ((d & 0xFC00) != 0xDC00)
                return(-2);
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* c is now a single UCS-4 value; size its UTF-8 form first */
        if (c < 0x80)
            need = 1;
        else if (c < 0x800)
            need = 2;
        else if (c < 0x10000)
            need = 3;
        else
            need = 4;
        if (outend - out < need)
            return(-1);

        if (need == 1)
            *out++ = (unsigned char) c;
        else if (need == 2)
            *out++ = (unsigned char) (0xC0 | ((c >> 6) & 0x1F));
        else if (need == 3)
            *out++ = (unsigned char) (0xE0 | ((c >> 12) & 0x0F));
        else
            *out++ = (unsigned char) (0xF0 | ((c >> 18) & 0x07));

        /* continuation bytes, most significant 6-bit group first */
        for (shift = (need - 2) * 6; shift >= 0; shift -= 6)
            *out++ = (unsigned char) (0x80 | ((c >> shift) & 0x3F));
    }
    return((int) (out - outstart));
}
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb in bytes
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  pointer to the length of @in in bytes
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out.  The output is stored byte by byte (high byte
 * first), so it depends neither on the byte order of the host nor on the
 * alignment of @outb.
 * TODO: UTF8ToUTF16BE need a fallback mechanism ...
 *
 * When the block ends in the middle of a multi-byte sequence, the
 * conversion stops before that sequence and *@inlen is set to the number
 * of bytes actually consumed; otherwise *@inlen is left untouched.
 *
 * Returns the number of bytes written (two per 16-bit unit), or -1 by
 *     lack of space (or when a pointer argument is NULL), or -2 if the
 *     transcoding failed (malformed UTF-8 or a value above 0x10FFFF).
 */
int
UTF8ToUTF16BE(unsigned char* outb, int outlen,
              const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* inend;
    const unsigned char* seq;        /* first byte of the current sequence */
    unsigned int c, d, hi, lo;
    int trailing;

    if ((outb == NULL) || (in == NULL) || (inlen == NULL))
        return(-1);

    /* only whole 16-bit units can be stored */
    outend = outb + (outlen / 2) * 2;
    inend = in + *inlen;

    while (in < inend) {
        seq = in;
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0)
            return(-2);    /* trailing byte in leading position */
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else
            return(-2);    /* no chance for this in UTF-16 */

        if (inend - in < trailing) {
            /* incomplete sequence: give the lead byte back as well */
            *inlen = (int) (seq - instart);
            break;
        }

        for ( ; trailing > 0; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                return(-2);    /* malformed input, not a space problem */
            c = (c << 6) | (d & 0x3F);
        }

        /* c is now a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                return(-1);
            *out++ = (unsigned char) ((c >> 8) & 0xFF);
            *out++ = (unsigned char) (c & 0xFF);
        } else if (c < 0x110000) {
            if (outend - out < 4)
                return(-1);
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) ((hi >> 8) & 0xFF);
            *out++ = (unsigned char) (hi & 0xFF);
            *out++ = (unsigned char) ((lo >> 8) & 0xFF);
            *out++ = (unsigned char) (lo & 0xFF);
        } else {
            return(-2);    /* beyond the range UTF-16 can express */
        }
    }
    return((int) (out - outb));
}
- /**
- * xmlDetectCharEncoding:
- * @in: a pointer to the first bytes of the XML entity, must be at least
- * 4 bytes long.
- * @len: pointer to the length of the buffer
- *
- * Guess the encoding of the entity using the first bytes of the entity content
- * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
- *
- * Returns one of the XML_CHAR_ENCODING_... values.
- */
- xmlCharEncoding
- xmlDetectCharEncoding(const unsigned char* in, int len)
- {
- if (len >= 4) {
- if ((in[0] == 0x00) && (in[1] == 0x00) &&
- (in[2] == 0x00) && (in[3] == 0x3C))
- return(XML_CHAR_ENCODING_UCS4BE);
- if ((in[0] == 0x3C) && (in[1] == 0x00) &&
- (in[2] == 0x00) && (in[3] == 0x00))
- return(XML_CHAR_ENCODING_UCS4LE);
- if ((in[0] == 0x00) && (in[1] == 0x00) &&
- (in[2] == 0x3C) && (in[3] == 0x00))
- return(XML_CHAR_ENCODING_UCS4_2143);
- if ((in[0] == 0x00) && (in[1] == 0x3C) &&
- (in[2] == 0x00) && (in[3] == 0x00))
- return(XML_CHAR_ENCODING_UCS4_3412);
- if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
- (in[2] == 0xA7) && (in[3] == 0x94))
- return(XML_CHAR_ENCODING_EBCDIC);
- if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
- (in[2] == 0x78) && (in[3] == 0x6D))
- return(XML_CHAR_ENCODING_UTF8);
- }
- if (len >= 2) {
- if ((in[0] == 0xFE) && (in[1] == 0xFF))
- return(XML_CHAR_ENCODING_UTF16BE);
- if ((in[0] == 0xFF) && (in[1] == 0xFE))
- return(XML_CHAR_ENCODING_UTF16LE);
- }
- return(XML_CHAR_ENCODING_NONE);
- }
- /**
- * xmlParseCharEncoding:
- * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
- *
- * Conpare the string to the known encoding schemes already known. Note
- * that the comparison is case insensitive accordingly to the section
- * [XML] 4.3.3 Character Encoding in Entities.
- *
- * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
- * if not recognized.
- */
- xmlCharEncoding
- xmlParseCharEncoding(const char* name)
- {
- char upper[500];
- int i;
- for (i = 0;i < 499;i++) {
- upper[i] = toupper(name[i]);
- if (upper[i] == 0) break;
- }
- upper[i] = 0;
- if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
- if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
- if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
- /*
- * NOTE: if we were able to parse this, the endianness of UTF16 is
- * already found and in use
- */
- if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
- if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
-
- if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
- if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
- if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
- /*
- * NOTE: if we were able to parse this, the endianness of UCS4 is
- * already found and in use
- */
- if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
- if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
- if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
-
- if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
- if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
- if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
- if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
- if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
- if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
- if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
- if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
- if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
- if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
- if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
- if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
- if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
- if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
- if (!strcmp(upper, "Shift_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
- if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
- return(XML_CHAR_ENCODING_ERROR);
- }
- /****************************************************************
- * *
- * Char encoding handlers *
- * *
- ****************************************************************/
- /* the size should be growable, but it's not a big deal ... */
- #define MAX_ENCODING_HANDLERS 50
- static xmlCharEncodingHandlerPtr *handlers = NULL;
- static int nbCharEncodingHandler = 0;
- /*
- * The default is UTF-8 for XML, that's also the default used for the
- * parser internals, so the default encoding handler is NULL
- */
- static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
- /**
- * xmlNewCharEncodingHandler:
- * @name: the encoding name, in UTF-8 format (ASCII actually)
- * @input: the xmlCharEncodingInputFunc to read that encoding
- * @output: the xmlCharEncodingOutputFunc to write that encoding
- *
- * Create and registers an xmlCharEncodingHandler.
- * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
- */
- xmlCharEncodingHandlerPtr
- xmlNewCharEncodingHandler(const char *name,
- xmlCharEncodingInputFunc input,
- xmlCharEncodingOutputFunc output) {
- xmlCharEncodingHandlerPtr handler;
- char upper[500];
- int i;
- char *up = 0;
- /*
- * Keep only the uppercase version of the encoding.
- */
- if (name == NULL) {
- fprintf(stderr, "xmlNewCharEncodingHandler : no name !n");
- return(NULL);
- }
- for (i = 0;i < 499;i++) {
- upper[i] = toupper(name[i]);
- if (upper[i] == 0) break;
- }
- upper[i] = 0;
- up = xmlMemStrdup(upper);
- if (up == NULL) {
- fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !n");
- return(NULL);
- }
- /*
- * allocate and fill-up an handler block.
- */
- handler = (xmlCharEncodingHandlerPtr)
- xmlMalloc(sizeof(xmlCharEncodingHandler));
- if (handler == NULL) {
- fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !n");
- return(NULL);
- }
- handler->input = input;
- handler->output = output;
- handler->name = up;
- /*
- * registers and returns the handler.
- */
- xmlRegisterCharEncodingHandler(handler);
- return(handler);
- }
- /**
- * xmlInitCharEncodingHandlers:
- *
- * Initialize the char encoding support, it registers the default
- * encoding supported.
- * NOTE: while public, this function usually doesn't need to be called
- * in normal processing.
- */
- void
- xmlInitCharEncodingHandlers(void) {
- if (handlers != NULL) return;
- handlers = (xmlCharEncodingHandlerPtr *)
- xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
- if (handlers == NULL) {
- fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !n");
- return;
- }
- xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
- xmlUTF16LEHandler =
- xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
- xmlUTF16BEHandler =
- xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
- xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
- }
- /**
- * xmlCleanupCharEncodingHandlers:
- *
- * Cleanup the memory allocated for the char encoding support, it
- * unregisters all the encoding handlers.
- */
- void
- xmlCleanupCharEncodingHandlers(void) {
- if (handlers == NULL) return;
- for (;nbCharEncodingHandler > 0;) {
- nbCharEncodingHandler--;
- if (handlers[nbCharEncodingHandler] != NULL) {
- xmlFree(handlers[nbCharEncodingHandler]->name);
- xmlFree(handlers[nbCharEncodingHandler]);
- }
- }
- xmlFree(handlers);
- handlers = NULL;
- nbCharEncodingHandler = 0;
- xmlDefaultCharEncodingHandler = NULL;
- }
- /**
- * xmlRegisterCharEncodingHandler:
- * @handler: the xmlCharEncodingHandlerPtr handler block
- *
- * Register the char encoding handler, surprizing, isn't it ?
- */
- void
- xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
- if (handlers == NULL) xmlInitCharEncodingHandlers();
- if (handler == NULL) {
- fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !n");
- return;
- }
- if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
- fprintf(stderr,
- "xmlRegisterCharEncodingHandler: Too many handler registeredn");
- fprintf(stderr, "tincrease MAX_ENCODING_HANDLERS : %sn", __FILE__);
- return;
- }
- handlers[nbCharEncodingHandler++] = handler;
- }
- /**
- * xmlGetCharEncodingHandler:
- * @enc: an xmlCharEncoding value.
- *
- * Search in the registrered set the handler able to read/write that encoding.
- *
- * Returns the handler or NULL if not found
- */
- xmlCharEncodingHandlerPtr
- xmlGetCharEncodingHandler(xmlCharEncoding enc) {
- if (handlers == NULL) xmlInitCharEncodingHandlers();
- switch (enc) {
- case XML_CHAR_ENCODING_ERROR:
- return(NULL);
- case XML_CHAR_ENCODING_NONE:
- return(NULL);
- case XML_CHAR_ENCODING_UTF8:
- return(NULL);
- case XML_CHAR_ENCODING_UTF16LE:
- return(xmlUTF16LEHandler);
- case XML_CHAR_ENCODING_UTF16BE:
- return(xmlUTF16BEHandler);
- case XML_CHAR_ENCODING_EBCDIC:
- return(NULL);
- case XML_CHAR_ENCODING_UCS4LE:
- return(NULL);
- case XML_CHAR_ENCODING_UCS4BE:
- return(NULL);
- case XML_CHAR_ENCODING_UCS4_2143:
- return(NULL);
- case XML_CHAR_ENCODING_UCS4_3412:
- return(NULL);
- case XML_CHAR_ENCODING_UCS2:
- return(NULL);
- case XML_CHAR_ENCODING_8859_1:
- return(NULL);
- case XML_CHAR_ENCODING_8859_2:
- return(NULL);
- case XML_CHAR_ENCODING_8859_3:
- return(NULL);
- case XML_CHAR_ENCODING_8859_4:
- return(NULL);
- case XML_CHAR_ENCODING_8859_5:
- return(NULL);
- case XML_CHAR_ENCODING_8859_6:
- return(NULL);
- case XML_CHAR_ENCODING_8859_7:
- return(NULL);
- case XML_CHAR_ENCODING_8859_8:
- return(NULL);
- case XML_CHAR_ENCODING_8859_9:
- return(NULL);
- case XML_CHAR_ENCODING_2022_JP:
- case XML_CHAR_ENCODING_SHIFT_JIS:
- case XML_CHAR_ENCODING_EUC_JP:
- return(NULL);
- }
- return(NULL);
- }
- /**
- * xmlGetCharEncodingHandler:
- * @enc: a string describing the char encoding.
- *
- * Search in the registrered set the handler able to read/write that encoding.
- *
- * Returns the handler or NULL if not found
- */
- xmlCharEncodingHandlerPtr
- xmlFindCharEncodingHandler(const char *name) {
- char upper[500];
- int i;
- if (handlers == NULL) xmlInitCharEncodingHandlers();
- if (name == NULL) return(xmlDefaultCharEncodingHandler);
- if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
- for (i = 0;i < 499;i++) {
- upper[i] = toupper(name[i]);
- if (upper[i] == 0) break;
- }
- upper[i] = 0;
- for (i = 0;i < nbCharEncodingHandler; i++)
- if (!strcmp(name, handlers[i]->name))
- return(handlers[i]);
- return(NULL);
- }