unicode.cpp
上传用户:yhdzpy8989
上传日期:2007-06-13
资源大小:13604k
文件大小:11k
- /*
- * ===========================================================================
- * PRODUCTION $Log: unicode.cpp,v $
- * PRODUCTION Revision 1000.0 2004/06/01 19:43:29 gouriano
- * PRODUCTION PRODUCTION: IMPORTED [GCC34_MSVC7] Dev-tree R1.3
- * PRODUCTION
- * ===========================================================================
- */
- /* $Id: unicode.cpp,v 1000.0 2004/06/01 19:43:29 gouriano Exp $
- * ==========================================================================
- *
- * PUBLIC DOMAIN NOTICE
- * National Center for Biotechnology Information
- *
- * This software/database is a "United States Government Work" under the
- * terms of the United States Copyright Act. It was written as part of
- * the author's official duties as a United States Government employee and
- * thus cannot be copyrighted. This software/database is freely available
- * to the public for use. The National Library of Medicine and the U.S.
- * Government have not placed any restriction on its use or reproduction.
- *
- * Although all reasonable efforts have been taken to ensure the accuracy
- * and reliability of the software and data, the NLM and the U.S.
- * Government do not and cannot warrant the performance or results that
- * may be obtained by using this software or data. The NLM and the U.S.
- * Government disclaim all warranties, express or implied, including
- * warranties of performance, merchantability or fitness for any particular
- * purpose.
- *
- * Please cite the author in any work or product based on this material.
- *
- * ==========================================================================
- *
- * Author: Aleksey Vinokurov
- *
- * File Description:
- * Unicode transformation library
- *
- */
- #include <ncbi_pch.hpp>
- #include <util/unicode.hpp>
- BEGIN_NCBI_SCOPE
- BEGIN_SCOPE(utf8)
- #include "unicode_plans/plan00.inc"
- #include "unicode_plans/plan01.inc"
- #include "unicode_plans/plan02.inc"
- #include "unicode_plans/plan03.inc"
- #include "unicode_plans/plan04.inc"
- #include "unicode_plans/plan1e.inc"
- #include "unicode_plans/plan20.inc"
- #include "unicode_plans/plan21.inc"
- #include "unicode_plans/plan22.inc"
- #include "unicode_plans/plan23.inc"
- #include "unicode_plans/plan24.inc"
- #include "unicode_plans/plan25.inc"
- #include "unicode_plans/plan26.inc"
- #include "unicode_plans/plan27.inc"
- #include "unicode_plans/plan30.inc"
- #include "unicode_plans/plane0.inc"
- #include "unicode_plans/plane2.inc"
- #include "unicode_plans/plane3.inc"
- #include "unicode_plans/plane4.inc"
- #include "unicode_plans/plane5.inc"
- #include "unicode_plans/plane6.inc"
- #include "unicode_plans/plane7.inc"
- #include "unicode_plans/plane8.inc"
- #include "unicode_plans/planea.inc"
- #include "unicode_plans/planeb.inc"
- #include "unicode_plans/planfb.inc"
- #include "unicode_plans/planfe.inc"
- static TUnicodeTable g_DefaultUnicodeTable =
- {
- &s_Plan_00h, &s_Plan_01h, &s_Plan_02h, &s_Plan_03h, &s_Plan_04h, 0, 0, 0, // Plan 00 - 07
- 0, 0, 0, 0, 0, 0, 0, 0, // Plan 08 - 0F
- 0, 0, 0, 0, 0, 0, 0, 0, // Plan 10 - 17
- 0, 0, 0, 0, 0, 0, &s_Plan_1Eh, 0, // Plan 18 - 1F
- &s_Plan_20h, &s_Plan_21h, &s_Plan_22h, &s_Plan_23h, &s_Plan_24h, &s_Plan_25h, &s_Plan_26h, &s_Plan_27h, // Plan 20 - 27
- 0, 0, 0, 0, 0, 0, 0, 0, // Plan 28 - 2F
- &s_Plan_30h, 0, 0, 0, 0, 0, 0, 0, // Plan 30 - 37
- 0, 0, 0, 0, 0, 0, 0, 0, // Plan 38 - 3F
- 0, 0, 0, 0, 0, 0, 0, 0, // Plan 40 - 47
- 0, 0, 0, 0, 0, 0, 0, 0, // Plan 48 - 4F
- 0, 0, 0, 0, 0, 0, 0, 0, // Plan 50 - 57
- 0, 0, 0, 0, 0, 0, 0, 0, // Plan 58 - 5F
- 0, 0, 0, 0, 0, 0, 0, 0, // Plan 60 - 67
- 0, 0, 0, 0, 0, 0, 0, 0, // Plan 68 - 6F
- 0, 0, 0, 0, 0, 0, 0, 0, // Plan 70 - 77
- 0, 0, 0, 0, 0, 0, 0, 0, // Plan 78 - 7F
- 0, 0, 0, 0, 0, 0, 0, 0, // Plan 80 - 87
- 0, 0, 0, 0, 0, 0, 0, 0, // Plan 88 - 8F
- 0, 0, 0, 0, 0, 0, 0, 0, // Plan 90 - 97
- 0, 0, 0, 0, 0, 0, 0, 0, // Plan 98 - 9F
- 0, 0, 0, 0, 0, 0, 0, 0, // Plan A0 - A7
- 0, 0, 0, 0, 0, 0, 0, 0, // Plan A8 - AF
- 0, 0, 0, 0, 0, 0, 0, 0, // Plan B0 - B7
- 0, 0, 0, 0, 0, 0, 0, 0, // Plan B8 - BF
- 0, 0, 0, 0, 0, 0, 0, 0, // Plan C0 - C7
- 0, 0, 0, 0, 0, 0, 0, 0, // Plan C8 - CF
- 0, 0, 0, 0, 0, 0, 0, 0, // Plan D0 - D7
- 0, 0, 0, 0, 0, 0, 0, 0, // Plan D8 - DF
- &s_Plan_E0h, 0, &s_Plan_E2h, &s_Plan_E3h, &s_Plan_E4h, &s_Plan_E5h, &s_Plan_E6h, &s_Plan_E7h, // Plan E0 - E7
- &s_Plan_E8h, 0, &s_Plan_EAh, &s_Plan_EBh, 0, 0, 0, 0, // Plan E8 - EF
- 0, 0, 0, 0, 0, 0, 0, 0, // Plan F0 - F7
- 0, 0, 0, &s_Plan_FBh, 0, 0, &s_Plan_FEh, 0 // Plan F8 - FF
- };
- const SUnicodeTranslation*
- UnicodeToAscii(TUnicode character, const TUnicodeTable* table)
- {
- if (!table) {
- table = &g_DefaultUnicodeTable;
- }
- unsigned int thePlanNo = (character & 0xFF00) >> 8;
- unsigned int theOffset = character & 0xFF;
- const TUnicodePlan* thePlan = (*table)[thePlanNo];
- if ( !thePlan ) return 0;
- return &((*thePlan)[theOffset]);
- }
- TUnicode UTF8ToUnicode( const char* theUTF )
- {
- int seq_len;
- const char *p = theUTF;
- char counter = *p++;
- unsigned char c;
- if ( ((*theUTF) & 0xC0) != 0xC0 )
- {
- TUnicode RC = 0;
- RC |= (unsigned char)theUTF[0];
- return RC;
- }
- TUnicode acc = counter & 037;
- while((counter <<= 1) < 0) {
- c = *p++;
- if((c & ~077) != 0200) { // Broken UTF-8 chain
- seq_len = p - theUTF;
- return ~0;
- }
- acc = (acc << 6) | (c & 077);
- }
- return acc;
- }
- int UTF8ToUnicode( const char* theUTF, TUnicode* theUnicode )
- {
- int seq_len;
- const char *p = theUTF;
- char counter = *p++;
- unsigned char c;
- if ( (unsigned char )theUTF[0] < 0x80 ) {
- // This is one character UTF8. I.e. regular character.
- *theUnicode = *theUTF;
- return 1;
- }
- if ( ((*theUTF) & 0xC0) != 0xC0 ) {
- // This is not a unicode
- return 0;
- }
- TUnicode acc = counter & 037;
- while((counter <<= 1) < 0) {
- c = *p++;
- if((c & ~077) != 0200) { // Broken UTF-8 chain
- seq_len = p - theUTF;
- return ~0;
- }
- acc = (acc << 6) | (c & 077);
- } // while
- seq_len = p - theUTF;
- *theUnicode = acc;
- return seq_len;
- }
- string UnicodeToUTF8( TUnicode theUnicode )
- {
- char theBuffer[10];
- int theLength = UnicodeToUTF8( theUnicode, theBuffer, 10 );
- return string( theBuffer, theLength );
- }
- int UnicodeToUTF8( TUnicode theUnicode, char *theBuffer, int theBufLength )
- {
- int Length = 0;
- if (theUnicode < 0x80) {
- Length = 1;
- if ( Length > theBufLength ) return 0;
- theBuffer[0] = char(theUnicode);
- }
- else if (theUnicode < 0x800) {
- Length = 2;
- if ( Length > theBufLength ) return 0;
- theBuffer[0] = char(0xC0 | theUnicode>>6);
- theBuffer[1] = char(0x80 | theUnicode & 0x3F);
- }
- else if (theUnicode < 0x10000) {
- Length = 3;
- if ( Length > theBufLength ) return 0;
- theBuffer[0] = char(0xE0 | theUnicode>>12);
- theBuffer[1] = char(0x80 | theUnicode>>6 & 0x3F);
- theBuffer[2] = char(0x80 | theUnicode & 0x3F);
- }
- else if (theUnicode < 0x200000) {
- Length = 4;
- if ( Length > theBufLength ) return 0;
- theBuffer[0] = char(0xF0 | theUnicode>>18);
- theBuffer[1] = char(0x80 | theUnicode>>12 & 0x3F);
- theBuffer[2] = char(0x80 | theUnicode>>6 & 0x3F);
- theBuffer[3] = char(0x80 | theUnicode & 0x3F);
- }
- return Length;
- }
- int UTF8ToAscii( const char* src, char* dst,
- int dstLen, const TUnicodeTable* table)
- {
- if ( !src || !dst || dstLen == 0 ) return 0;
- long srcPos = 0;
- long dstPos = 0;
- long srcLen = strlen( src );
- for ( srcPos = 0; srcPos < srcLen; )
- {
- // Assign quck pointers
- char* pDst = &(dst[dstPos]);
- const char* pSrc = &(src[srcPos]);
- TUnicode theUnicode;
- int utfLen = UTF8ToUnicode( pSrc, &theUnicode );
- if ( (utfLen == 0) || (utfLen == -1) ) {
- // Skip the error.
- srcPos++;
- continue;
- }
- srcPos += utfLen;
- // Find the correct substitution.
- const SUnicodeTranslation*
- pSubst = UnicodeToAscii( theUnicode, table );
- // Check if the unicode has a translation
- if ( !pSubst ) {
- continue;
- }
- // Check if type is eSkip or substituting string is empty.
- if ( (pSubst->Type == eSkip) ||
- !(pSubst->Subst) ) {
- continue;
- }
- // Check if type is eAsIs
- if (pSubst->Type == eAsIs) {
- memcpy( pDst, pSrc, utfLen );
- dstPos += utfLen;
- continue;
- }
- // Check the remaining length and put the result in there.
- int substLen = strlen( pSubst->Subst );
- if ( (dstPos + substLen) > dstLen ) {
- return -1; // Unsufficient space;
- }
- // Copy the substituting value into the destignation string
- memcpy( pDst, pSubst->Subst, substLen );
- dstPos += substLen;
- }
- return dstPos;
- }
- string UTF8ToAsciiString( const char* src, const TUnicodeTable* table)
- {
- if ( !src ) return 0;
- string dst;
- long srcPos = 0;
- long srcLen = strlen( src );
- for ( srcPos = 0; srcPos < srcLen; )
- {
- // Assign quck pointers
- const char* pSrc = &(src[srcPos]);
- TUnicode theUnicode;
- int utfLen = UTF8ToUnicode( pSrc, &theUnicode );
- if ( (utfLen == 0) || (utfLen == -1) ) {
- // Skip the error.
- srcPos++;
- continue;
- }
- srcPos += utfLen;
- // Find the correct substitution.
- const SUnicodeTranslation*
- pSubst = UnicodeToAscii( theUnicode, table );
- // Check if the unicode has a translation
- if ( !pSubst ) {
- srcPos += utfLen;
- continue;
- }
- // Check if type is eSkip or substituting string is empty.
- if ( (pSubst->Type == eSkip) ||
- !(pSubst->Subst) ) {
- srcPos += utfLen;
- continue;
- }
- // Check if type is eAsIs
- if (pSubst->Type == eAsIs) {
- dst += string( pSrc, utfLen );
- srcPos += utfLen;
- continue;
- }
- // Copy the substituting value into the destignation string
- dst += pSubst->Subst;
- }
- return dst;
- }
- END_SCOPE(utf8)
- END_NCBI_SCOPE
- /*
- * ==========================================================================
- * $Log: unicode.cpp,v $
- * Revision 1000.0 2004/06/01 19:43:29 gouriano
- * PRODUCTION: IMPORTED [GCC34_MSVC7] Dev-tree R1.3
- *
- * Revision 1.3 2004/05/17 21:06:02 gorelenk
- * Added include of PCH ncbi_pch.hpp
- *
- * Revision 1.2 2004/05/13 21:18:12 ucko
- * Respect constness in UnicodeToAscii.
- *
- * Revision 1.1 2004/05/06 18:15:29 gouriano
- * Imported from pubmed/xmldb
- *
- * ==========================================================================
- */