// Category: Biotechnology
//
// Development platform:
// C/C++
//
// utf8.cpp: source listing
							/*
 * ===========================================================================
 * PRODUCTION $Log: utf8.cpp,v $
 * PRODUCTION Revision 1000.2  2004/06/01 19:40:43  gouriano
 * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.6
 * PRODUCTION
 * ===========================================================================
 */
/*  $Id: utf8.cpp,v 1000.2 2004/06/01 19:40:43 gouriano Exp $
 * ===========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 * Author: Aleksey Vinokurov, Vladimir Ivanov
 *
 * File Description:  UTF8 converter functions
 *
 */
#include <ncbi_pch.hpp>
#include <util/utf8.hpp>
BEGIN_NCBI_SCOPE
BEGIN_SCOPE(utf8)
// Translation tables.
//
// tblTrans maps the symbol codes 0x0080..0x02FF to graphically similar
// ASCII-7 characters; it is indexed with (code - 0x80).  Every entry is
// either a 7-bit character or one of two special values:
//     0x00 - the symbol cannot be translated;
//     0xFF - the symbol should be skipped.
// The label at the end of each row is the symbol code of the row's first
// entry with the last hex digit dropped (e.g. "0C" covers 0x00C0..0x00CF).
// The table is only ever read, hence "const".
static const unsigned char tblTrans[] =
{
    // Latin Base
 // 0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0 , // 08
    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0 , // 09
    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  'a',  0,  '"',  0,   0,  '-', // 0A
   0xFF, 0,  '2', '3','\'', 0,   0,  '.',  0,  '1', 'o',  0,  '"',  0,   0,   0 , // 0B
   'A', 'A', 'A', 'A', 'A', 'A', 'A', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I', // 0C
   'D', 'N', 'O', 'O', 'O', 'O', 'O', 'x', 'O', 'U', 'U', 'U', 'U', 'Y',  0,  'B', // 0D 
   'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i', // 0E 
   'o', 'n', 'o', 'o', 'o', 'o', 'o', '-', 'o', 'u', 'u', 'u', 'u', 'y',  0,  'y', // 0F 
    // Latin A
 // 0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
   'A', 'a', 'A', 'a', 'A', 'a', 'C', 'c', 'C', 'c', 'C', 'c', 'C', 'c', 'D', 'd', // 10 
   'D', 'd', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'G', 'g', 'G', 'g', // 11 
   'G', 'g', 'G', 'g', 'H', 'h', 'H', 'h', 'I', 'i', 'I', 'i', 'I', 'i', 'I', 'i', // 12 
   'I', 'i', 'J', 'j', 'J', 'j', 'K', 'k', 'k', 'L', 'l', 'L', 'l', 'L', 'l', 'L', // 13 
   'l', 'L', 'l', 'N', 'n', 'N', 'n', 'N', 'n', 'n', 'N', 'n', 'O', 'o', 'O', 'o', // 14 
   'O', 'o', 'O', 'o', 'R', 'r', 'R', 'r', 'R', 'r', 'S', 's', 'S', 's', 'S', 's', // 15 
   'S', 's', 'T', 't', 'T', 't', 'T', 't', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', // 16 
   'U', 'u', 'U', 'u', 'W', 'w', 'Y', 'y', 'Y', 'Z', 'z', 'Z', 'z', 'Z', 'z',  0 , // 17 
    // Latin B
 // 0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
   'b', 'B',  0 ,  0 ,  0 ,  0 ,  0 , 'C', 'c', 'D', 'D',  0 ,  0 ,  0 ,  0 ,  0 , // 18 
   'E', 'F', 'f', 'G',  0 ,  0 ,  0 , 'I', 'K', 'k',  0 ,  0 ,  0 , 'N', 'n',  0 , // 19 
   'O', 'o',  0 ,  0 , 'P', 'p', 'R',  0 ,  0 ,  0 ,  0 , 't', 'T', 't', 'T', 'U', // 1A 
   'u',  0 ,  0 , 'Y', 'y', 'Z', 'z', 'Z',  0 ,  0 , 'z',  0 ,  0 ,  0 ,  0 ,  0 , // 1B 
    0 ,  0 ,  0 , '!', 'D', 'd', 'd', 'L', 'L', 'l', 'N', 'N', 'n', 'A', 'a', 'I', // 1C 
   'i', 'O', 'o', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u',  0 , 'A', 'a', // 1D 
   'A', 'a', 'A', 'a', 'G', 'g', 'G', 'g', 'K', 'k', 'O', 'o', 'O', 'o', 'Z', 'z', // 1E
   'j', 'D', 'D', 'd', 'G', 'g',  0 ,  0 , 'N', 'n', 'A', 'a',  0,   0 , 'O', 'o', // 1F 
   'A', 'a', 'A', 'a', 'E', 'e', 'E', 'e', 'I', 'i', 'I', 'i', 'O', 'o', 'O', 'o', // 20 
   'R', 'r', 'R', 'r', 'U', 'u', 'U', 'u', 'S', 's', 'T', 't',  0 ,  0 , 'H', 'h', // 21 
    0 ,  0 ,  0 ,  0 , 'Z', 'z', 'A', 'a', 'E', 'e', 'O', 'o', 'O', 'o', 'O', 'o', // 22 
   'O', 'o', 'Y', 'y',  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 , // 23
    0 ,  0,   0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 , // 24
    // IPA Extensions
 // 0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
    0 , 'a',  0 ,  0 ,  0 ,  0 , 'd', 'd',  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 , // 25
   'g', 'g', 'G',  0 ,  0 ,  0 , 'h' ,'h', 'i', 'i', 'I',  0 ,  0 ,  0 ,  0 ,  0 , // 26
    0,  'm',  0,  'n', 'N',  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0 , // 27
   'R',  0,  's',  0,   0,   0,   0,   0,  't', 'u',  0,   0,   0,   0,   0,  'Y', // 28
   'Z', 'Z', 'z',  'z', 0,   0,   0,   0,  'O', 'B',  0,  'G', 'H', 'j',  0,  'L', // 29
   'q',  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0 , // 2A
    // Spacing Modifiers
 // 0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
   'h', 'h', 'j', 'r',  0 ,  0 ,  0 , 'w', 'y','\'', '"','\'','\'','\'','\'','\'', // 2B
   '?', '?', '<', '>', '^', 'v', '^', 'v','\'', '-','\'', '`','\'', '_','\'', '`', // 2C
    0,   0, '\'','\'',  0 ,  0 , '+', '-', '~', '.', '.',  0,  '~', '"' , 0 , 'x', // 2D
    0 ,  0,   0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 , // 2E
    0 , 'l', 's', 'x',  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 , 'v' ,'=', '"',  0   // 2F
};
// tblTransA maps the symbol codes 0x1E00..0x1EFF (the Unicode
// "Latin Extended Additional" block) to graphically similar ASCII-7
// characters; it is indexed with (code - 0x1E00).  A zero entry means the
// symbol cannot be translated.  The label at the end of each row is the
// symbol code of the row's first entry with the last hex digit dropped.
// The table is only ever read, hence "const".
static const unsigned char tblTransA[] =
{
    // Latin Extended Additional
 // 0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
   'A', 'a', 'B', 'b', 'B', 'b', 'B', 'b', 'C', 'c', 'D', 'd', 'D', 'd', 'D', 'd', // 1E0
   'D', 'd', 'D', 'd', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'F', 'f', // 1E1
   'G', 'g', 'H', 'h', 'H', 'h', 'H', 'h', 'H', 'h', 'H', 'h', 'I', 'i', 'I', 'i', // 1E2
   'K', 'k', 'K', 'k', 'K', 'k', 'L', 'l', 'L', 'l', 'L', 'l', 'L', 'l', 'M', 'm', // 1E3
   'M', 'm', 'M', 'm', 'N', 'n', 'N', 'n', 'N', 'n', 'N', 'n', 'O', 'o', 'O', 'o', // 1E4
   'O', 'o', 'O', 'o', 'P', 'p', 'P', 'p', 'R', 'r', 'R', 'r', 'R', 'r', 'R', 'r', // 1E5
   'S', 's', 'S', 's', 'S', 's', 'S', 's', 'S', 's', 'T', 't', 'T', 't', 'T', 't', // 1E6
   'T', 't', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'V', 'v', 'V', 'v', // 1E7
   'W', 'w', 'W', 'w', 'W', 'w', 'W', 'w', 'W', 'w', 'X', 'x', 'X', 'x', 'Y', 'y', // 1E8
   'Z', 'z', 'Z', 'z', 'Z', 'z', 'h', 't', 'w', 'y', 'a', 'f',  0 ,  0 ,  0 ,  0 , // 1E9
   'A', 'a', 'A', 'a', 'A', 'a', 'A', 'a', 'A', 'a', 'A', 'a', 'A', 'a', 'A', 'a', // 1EA
   'A', 'a', 'A', 'a', 'A', 'a', 'A', 'a', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', // 1EB
   'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'I', 'i', 'I', 'i', 'O', 'o', 'O', 'o', // 1EC
   'O', 'o', 'O', 'o', 'O', 'o', 'O', 'o', 'O', 'o', 'O', 'o', 'O', 'o', 'O', 'o', // 1ED
   'O', 'o', 'O', 'o', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', // 1EE
   'U', 'u', 'Y', 'y', 'Y', 'y', 'Y', 'y', 'Y', 'y',  0 ,  0 ,  0 ,  0,   0,   0   // 1EF
};
// Return the character "ch" from the enclosing function after storing the
// conversion status "res" into "*status" (when "status" is not NULL).
// For use in functions that report the status of their work through a
// local "status" pointer.
//
// NOTE:  The expansion is deliberately a plain brace block rather than
//        "do { ... } while (0)": call sites in this file invoke the macro
//        without a trailing semicolon, including directly before "else".
//        Every line of the body must end with a backslash so that it stays
//        part of the macro definition.
//
#define RETURN_S(ch,res) \
{ \
    if (status) *status = (res); \
    return (ch); \
}
// Return the character "ch" from the enclosing function after storing the
// sequence length "len" into "*seq_len" and the conversion status "res"
// into "*status" (each only when the corresponding pointer is not NULL).
// For use in functions that report both the status and the consumed length
// through local "seq_len" and "status" pointers.
//
// NOTE:  The expansion is deliberately a plain brace block rather than
//        "do { ... } while (0)": call sites in this file invoke the macro
//        without a trailing semicolon.  Every line of the body must end
//        with a backslash so that it stays part of the macro definition.
//
#define RETURN_LS(ch,len,res) \
{ \
    if (seq_len) *seq_len = (len); \
    if (status) *status = (res); \
    return (ch); \
}
// Convert first UTF-8 symbol of "src" into ASCII-7 character.
// "ascii_table" specifies whether to use ASCII-7 translation tables.
// Length of the retrieved UTF-8 symbol is returned in "*seq_len"
// (if "seq_len" is not NULL).
// Return resulting ASCII-7 character.
// NOTE:  If the UTF-8 symbol has no ASCII-7 equivalent, then return
//        kOutrangeChar or hSkipChar.
//
char StringToChar(const string&      src,
                  size_t*            seq_len,
                  bool               ascii_table,
                  EConversionStatus* status)
{
    long              dst_code;  // UTF-code symbol code
    unsigned char     dst_char;  // Result character
    EConversionStatus stat;      // Temporary status     
    // Process one UTF character
    dst_code = StringToCode(src, seq_len, &stat);
    if (status) *status = stat;
    // If it was happily
    if (stat == eSuccess) {
        // Conversion
        if (ascii_table) {
            // Convert into appropriate 7-bit character via conversion table 
            dst_char = CodeToChar(dst_code, status);
            return dst_char;
        }    
        else
        {
            // if character greater than 127 (0x7F) than substitute it 
            // with kOutrangeChar, else leave it as is.
            if (dst_code > 0x7F) {
                RETURN_S (kOutrangeChar, eOutrange);
            }
        }
    }
    // Was error translate char
    return (char)dst_code;
}
// Convert UTF-8 string "src" into the ASCII-7 string with
// graphically similar characters -- using StringToChar().
// Return resulting ASCII-7 string.
//
string StringToAscii(const string& src, bool ascii_table)
{
    string  dst;      // String to result 
    char    ch;       // Temporary UTF symbol code
    size_t  utf_len;  // Length of UTF symbol
    size_t  src_len;  // Length source string
    src_len = src.size();
    for (size_t i = 0; i < src_len; )
    {
        // Process one UTF character
        ch = StringToChar(src.data() + i, &utf_len, ascii_table);
        // Add character to the result vector
        if ( ch != kSkipChar ) dst += ch;
        i += utf_len;
    }
    return dst;
}
// Convert first UTF-8 symbol of "src" into a Unicode symbol code.
// Length of the retrieved UTF-8 symbol is returned in "*seq_len"
// (if "seq_len" is not NULL).
// Return resulting Unicode symbol code.
// NOTE:  If the UTF-8 symbol has no Unicode equivalent, then return
//        kOutrangeChar or hSkipChar.
//
long StringToCode(const string&      src,
                  size_t*            seq_len,
                  EConversionStatus* status)
{
    unsigned char ch = src.data()[0];
    size_t utf_len = 0;
    long dst_code = 0;
        
    // If character less then 0x80 we put it as is
    if (ch < 0x80)
    {
        RETURN_LS (ch, 1, eSuccess)
    } 
    else
    {
        // Determine the length of the UTF-8 symbol in bytes
        if      ((ch & 0xFC) == 0xFC) utf_len = 6; // 6 bytes length
        else if ((ch & 0xF8) == 0xF8) utf_len = 5; // 5 bytes length
        else if ((ch & 0xF0) == 0xF0) utf_len = 4; // 4 bytes length
        else if ((ch & 0xE0) == 0xE0) utf_len = 3; // 3 bytes length
        else if ((ch & 0xC0) == 0xC0) utf_len = 2; // 2 bytes length
        else
        {
            // Bad character. Save it as kOutrangeChar
            RETURN_LS (kOutrangeChar, 1, eOutrange)
        }
    }
    // Broken unicode sequence
    if (utf_len > src.size()) {
        RETURN_LS ((long)kSkipChar, 1, eSkip);
    }
        
    unsigned char mask = 0xFF;
    mask = mask >> utf_len; 
    dst_code = ch & mask;
    for (size_t j = 1; j < utf_len; j++)
    {
        dst_code = dst_code << 6;
        ch = src.data()[j];
        ch &= 0x3F;
        dst_code = dst_code | ch;
    }
    // Return result
    RETURN_LS (dst_code, utf_len, eSuccess)
}
// Convert UTF-8 string "src" into the vector of Unicode symbol codes
// using StringToCode().
// Return resulting vector.
//
vector<long> StringToVector (const string& src)
{
    vector<long> dst;      // String to result 
    long         ch;       // Unicode symbol code
    size_t       utf_len;  // Length of Unicode symbol
    size_t       src_len;  // Length of source string
    src_len = src.size();
    for (size_t i = 0; i < src_len; )
    {
        // Process one UTF character
        ch = StringToCode(src.data()+i, &utf_len);
        // Add character to the result vector
        dst.push_back(ch);
        i += utf_len;
    }
    return dst;
}
// Translate Unicode symbol code "src" into graphically similar ASCII-7
// character.
// Return resulting ASCII-7 character.
// NOTE:  If the Unicode symbol has no ASCII-7 equivalent, then return
//        kOutrangeChar or hSkipChar.
//
char CodeToChar(const long src, EConversionStatus* status)
{
    unsigned char ch;
    if (src < 0x80) RETURN_S ((char)src, eSuccess);
    if ((src >= 0x0300) && (src <= 0x036F)) RETURN_S (kSkipChar, eSkip);
    if ((src >= 0x1E00) && (src <= 0x1EFF))
    {
      ch = tblTransA[src-0x1E00];
      if (!ch) RETURN_S (kOutrangeChar, eOutrange)
      else     RETURN_S ((char)ch, eSuccess);
    }
    if ((src >= 0xFE20) && (src <= 0xFE2F)) RETURN_S (kSkipChar, eSkip);
    if (src > 0x2FF) RETURN_S (kOutrangeChar, eOutrange);
    ch = tblTrans[src-0x80];
    if (!ch) RETURN_S (kOutrangeChar, eOutrange);
    RETURN_S ((char)ch, eSuccess);
}
END_SCOPE(utf8)
END_NCBI_SCOPE
/*
 * ===========================================================================
 * $Log: utf8.cpp,v $
 * Revision 1000.2  2004/06/01 19:40:43  gouriano
 * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.6
 *
 * Revision 1.6  2004/05/17 21:06:02  gorelenk
 * Added include of PCH ncbi_pch.hpp
 *
 * Revision 1.5  2002/01/24 20:10:21  vinokuro
 * Skip characters processing has been fixed in StringToAscii function.
 *
 * Revision 1.4  2002/01/18 19:24:13  ivanov
 * Changed result char's upper limit from 0xFF to 0x7F in StringToChar()
 *
 * Revision 1.3  2001/05/17 15:07:15  lavr
 * Typos corrected
 *
 * Revision 1.2  2001/04/18 16:31:59  ivanov
 * Change types TUnicodeChar, TUnicodeString to simple types.
 * TUnicode char to long, TUnicodeString to vector<long>.
 *
 * Revision 1.1  2001/04/06 19:14:37  ivanov
 * Initial revision
 * ===========================================================================
 */