ncbistr.cpp
上传用户:yhdzpy8989
上传日期:2007-06-13
资源大小:13604k
文件大小:54k
- /*
- * ===========================================================================
- * PRODUCTION $Log: ncbistr.cpp,v $
- * PRODUCTION Revision 1000.6 2004/06/01 19:09:21 gouriano
- * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.109
- * PRODUCTION
- * ===========================================================================
- */
- /* $Id: ncbistr.cpp,v 1000.6 2004/06/01 19:09:21 gouriano Exp $
- * ===========================================================================
- *
- * PUBLIC DOMAIN NOTICE
- * National Center for Biotechnology Information
- *
- * This software/database is a "United States Government Work" under the
- * terms of the United States Copyright Act. It was written as part of
- * the author's official duties as a United States Government employee and
- * thus cannot be copyrighted. This software/database is freely available
- * to the public for use. The National Library of Medicine and the U.S.
- * Government have not placed any restriction on its use or reproduction.
- *
- * Although all reasonable efforts have been taken to ensure the accuracy
- * and reliability of the software and data, the NLM and the U.S.
- * Government do not and cannot warrant the performance or results that
- * may be obtained by using this software or data. The NLM and the U.S.
- * Government disclaim all warranties, express or implied, including
- * warranties of performance, merchantability or fitness for any particular
- * purpose.
- *
- * Please cite the author in any work or product based on this material.
- *
- * ===========================================================================
- *
- * Authors: Eugene Vasilchenko, Denis Vakatov
- *
- * File Description:
- * Some helper functions
- *
- */
- #include <ncbi_pch.hpp>
- #include <corelib/ncbistd.hpp>
- #include <corelib/ncbi_limits.h>
- #include <memory>
- #include <algorithm>
- #include <ctype.h>
- #include <errno.h>
- #include <stdio.h>
- BEGIN_NCBI_SCOPE
// Return the number of characters between "start" and "end".
// A null "end" (no position available) yields 0.
inline
std::string::size_type s_DiffPtr(const char* end, const char* start)
{
    return end ? static_cast<std::string::size_type>(end - start) : 0;
}
- const char *const kEmptyCStr = "";
- #ifndef NCBI_OS_MSWIN
- const string* CNcbiEmptyString::m_Str = 0;
- const string& CNcbiEmptyString::FirstGet(void) {
- static const string s_Str = "";
- m_Str = &s_Str;
- return s_Str;
- }
- #endif // NCBI_OS_MSWIN
- int NStr::CompareCase(const string& str, SIZE_TYPE pos, SIZE_TYPE n,
- const char* pattern)
- {
- if (pos == NPOS || !n || str.length() <= pos) {
- return *pattern ? -1 : 0;
- }
- if ( !*pattern ) {
- return 1;
- }
- if (n == NPOS || n > str.length() - pos) {
- n = str.length() - pos;
- }
- const char* s = str.data() + pos;
- while (n && *pattern && *s == *pattern) {
- s++; pattern++; n--;
- }
- if (n == 0) {
- return *pattern ? -1 : 0;
- }
- return *s - *pattern;
- }
- int NStr::CompareNocase(const string& str, SIZE_TYPE pos, SIZE_TYPE n,
- const char* pattern)
- {
- if (pos == NPOS || !n || str.length() <= pos) {
- return *pattern ? -1 : 0;
- }
- if ( !*pattern ) {
- return 1;
- }
- if (n == NPOS || n > str.length() - pos) {
- n = str.length() - pos;
- }
- const char* s = str.data() + pos;
- while (n && *pattern && toupper(*s) == toupper(*pattern)) {
- s++; pattern++; n--;
- }
- if (n == 0) {
- return *pattern ? -1 : 0;
- }
- return toupper(*s) - toupper(*pattern);
- }
- int NStr::CompareCase(const string& str, SIZE_TYPE pos, SIZE_TYPE n,
- const string& pattern)
- {
- if (pos == NPOS || !n || str.length() <= pos) {
- return pattern.empty() ? 0 : -1;
- }
- if ( pattern.empty() ) {
- return 1;
- }
- if (n == NPOS || n > str.length() - pos) {
- n = str.length() - pos;
- }
- SIZE_TYPE n_cmp = n;
- if (n_cmp > pattern.length()) {
- n_cmp = pattern.length();
- }
- const char* s = str.data() + pos;
- const char* p = pattern.data();
- while (n_cmp && *s == *p) {
- s++; p++; n_cmp--;
- }
- if (n_cmp == 0) {
- if (n == pattern.length())
- return 0;
- return n > pattern.length() ? 1 : -1;
- }
- return *s - *p;
- }
- int NStr::CompareNocase(const string& str, SIZE_TYPE pos, SIZE_TYPE n,
- const string& pattern)
- {
- if (pos == NPOS || !n || str.length() <= pos) {
- return pattern.empty() ? 0 : -1;
- }
- if ( pattern.empty() ) {
- return 1;
- }
- if (n == NPOS || n > str.length() - pos) {
- n = str.length() - pos;
- }
- SIZE_TYPE n_cmp = n;
- if (n_cmp > pattern.length()) {
- n_cmp = pattern.length();
- }
- const char* s = str.data() + pos;
- const char* p = pattern.data();
- while (n_cmp && toupper(*s) == toupper(*p)) {
- s++; p++; n_cmp--;
- }
- if (n_cmp == 0) {
- if (n == pattern.length())
- return 0;
- return n > pattern.length() ? 1 : -1;
- }
- return toupper(*s) - toupper(*p);
- }
- // NOTE: This code is used also in the CDirEntry::MatchesMask.
- bool NStr::MatchesMask(const char* str, const char* mask)
- {
- char c;
- bool infinite = true;
- while (infinite) {
- // Analyze symbol in mask
- switch ( c = *mask++ ) {
-
- case ' ':
- return *str == ' ';
- case '?':
- if ( *str == ' ' ) {
- return false;
- }
- ++str;
- break;
-
- case '*':
- c = *mask;
- // Collapse multiple stars
- while ( c == '*' ) {
- c = *++mask;
- }
- if (c == ' ') {
- return true;
- }
- // General case, use recursion
- while ( *str ) {
- if ( MatchesMask(str, mask) ) {
- return true;
- }
- ++str;
- }
- return false;
-
- default:
- // Compare nonpattern character in mask and name
- if ( c != *str++ ) {
- return false;
- }
- break;
- }
- }
- return false;
- }
- char* NStr::ToLower(char* str)
- {
- char* s;
- for (s = str; *str; str++) {
- *str = tolower(*str);
- }
- return s;
- }
- string& NStr::ToLower(string& str)
- {
- NON_CONST_ITERATE (string, it, str) {
- *it = tolower(*it);
- }
- return str;
- }
- char* NStr::ToUpper(char* str)
- {
- char* s;
- for (s = str; *str; str++) {
- *str = toupper(*str);
- }
- return s;
- }
- string& NStr::ToUpper(string& str)
- {
- NON_CONST_ITERATE (string, it, str) {
- *it = toupper(*it);
- }
- return str;
- }
- int NStr::StringToNumeric(const string& str)
- {
- if (str.empty() || !isdigit(*str.begin())) {
- return -1;
- }
- errno = 0;
- char* endptr = 0;
- unsigned long value = strtoul(str.c_str(), &endptr, 10);
- if (errno || !endptr || value > (unsigned long) kMax_Int ||
- *endptr != ' ' || endptr == str.c_str()) {
- return -1;
- }
- return (int) value;
- }
// Throw CStringException(eBadArgs) if strict checking was requested
// (check_endptr == eCheck_Need) and the conversion stopped before the end
// of the string.  Expects "check_endptr", "endptr" and "str" to be in scope
// where the macro is used; "conv" is the target type name spliced into the
// message.
# define CHECK_ENDPTR(conv) \
    if (check_endptr == eCheck_Need  &&  *endptr != '\0') { \
        NCBI_THROW2(CStringException, eBadArgs, \
            "String cannot be converted to " conv " - trailing junk", \
            s_DiffPtr(endptr, str.c_str())); \
    }
- int NStr::StringToInt(const string& str, int base /* = 10 */,
- ECheckEndPtr check_endptr /* = eCheck_Need */ )
- {
- errno = 0;
- char* endptr = 0;
- long value = strtol(str.c_str(), &endptr, base);
- if (errno || !endptr || endptr == str.c_str() ||
- value < kMin_Int || value > kMax_Int) {
- NCBI_THROW2(CStringException, eConvert,
- "String cannot be converted to int",
- s_DiffPtr(endptr, str.c_str()));
- }
- CHECK_ENDPTR("int");
- return (int) value;
- }
- unsigned int NStr::StringToUInt(const string& str, int base /* =10 */,
- ECheckEndPtr check_endptr /* =eCheck_Need */)
- {
- errno = 0;
- char* endptr = 0;
- unsigned long value = strtoul(str.c_str(), &endptr, base);
- if (errno || !endptr || endptr == str.c_str() || value > kMax_UInt) {
- NCBI_THROW2(CStringException, eConvert,
- "String cannot be converted unsigned int",
- s_DiffPtr(endptr, str.c_str()));
- }
- CHECK_ENDPTR("unsigned int");
- return (unsigned int) value;
- }
- long NStr::StringToLong(const string& str, int base /* = 10 */,
- ECheckEndPtr check_endptr /* = eCheck_Need */ )
- {
- errno = 0;
- char* endptr = 0;
- long value = strtol(str.c_str(), &endptr, base);
- if (errno || !endptr || endptr == str.c_str()) {
- NCBI_THROW2(CStringException, eConvert,
- "String cannot be converted to long",
- s_DiffPtr(endptr, str.c_str()));
- }
- CHECK_ENDPTR("long");
- return value;
- }
- unsigned long NStr::StringToULong(const string& str, int base /*=10 */,
- ECheckEndPtr check_endptr /*=eCheck_Need*/)
- {
- errno = 0;
- char* endptr = 0;
- unsigned long value = strtoul(str.c_str(), &endptr, base);
- if (errno || !endptr || endptr == str.c_str()) {
- NCBI_THROW2(CStringException, eConvert,
- "String cannot be converted to unsigned long",
- s_DiffPtr(endptr, str.c_str()));
- }
- CHECK_ENDPTR("unsigned long");
- return value;
- }
- double NStr::StringToDouble(const string& str,
- ECheckEndPtr check_endptr /* = eCheck_Need */ )
- {
- errno = 0;
- char* endptr = 0;
- double value = strtod(str.c_str(), &endptr);
- if (errno || !endptr || endptr == str.c_str()) {
- NCBI_THROW2(CStringException, eConvert,
- "String cannot be converted to double",
- s_DiffPtr(endptr, str.c_str()));
- }
- if (*(endptr - 1) != '.' && *endptr == '.')
- endptr++;
- CHECK_ENDPTR("double");
- return value;
- }
- string NStr::IntToString(long value, bool sign /* = false */ )
- {
- char buffer[64];
- ::sprintf(buffer, sign ? "%+ld" : "%ld", value);
- return buffer;
- }
- void NStr::IntToString(string& out_str, long value, bool sign)
- {
- char buffer[64];
- ::sprintf(buffer, sign ? "%+ld" : "%ld", value);
- out_str = buffer;
- }
- string NStr::UIntToString(unsigned long value)
- {
- char buffer[64];
- ::sprintf(buffer, "%lu", value);
- return buffer;
- }
- void NStr::UIntToString(string& out_str, unsigned long value)
- {
- char buffer[64];
- ::sprintf(buffer, "%lu", value);
- out_str = buffer;
- }
- string NStr::Int8ToString(Int8 value, bool sign /* = false */ )
- {
- string ret;
- NStr::Int8ToString(ret, value, sign);
- return ret;
- }
- void NStr::Int8ToString(string& out_str, Int8 value, bool sign)
- {
- const size_t kBufSize = (sizeof(value) * CHAR_BIT) / 3 + 2;
- char buffer[kBufSize];
- char* pos = buffer + kBufSize;
- if (value == 0) {
- *--pos = '0';
- }
- else {
- bool is_negative = value < 0;
- if ( is_negative )
- value = -value;
- do {
- *--pos = char('0' + (value % 10));
- value /= 10;
- } while ( value );
- if ( is_negative )
- *--pos = '-';
- else if ( sign )
- *--pos = '+';
- }
- out_str.resize(0);
- out_str.append(pos, buffer + kBufSize - pos);
- }
- Int8 NStr::StringToInt8(const string& str)
- {
- bool sign = false;
- const char* pc = str.c_str();
- switch (*pc) {
- case '-':
- sign = true;
- /*FALLTHRU*/
- case '+':
- ++pc;
- /*FALLTHRU*/
- default:
- break;
- }
- Int8 n = 0;
- Int8 limdiv = kMax_I8 / 10;
- Int8 limoff = kMax_I8 % 10;
- do {
- if (!isdigit(*pc)) {
- NCBI_THROW2(CStringException, eConvert,
- "String cannot be converted to Int8 - bad digit",
- s_DiffPtr(pc, str.c_str()));
- }
- int delta = *pc - '0';
- n *= 10;
- // Overflow checking
- if (n > limdiv || (n == limdiv && delta > limoff)) {
- NCBI_THROW2(CStringException, eConvert,
- "String cannot be converted Int8 - overflow",
- s_DiffPtr(pc, str.c_str()));
- }
- n += delta;
- } while (*++pc);
- return sign ? -n : n;
- }
- Uint8 NStr::StringToUInt8(const string& str, int base /* = 10 */)
- {
- const char* pc = str.c_str();
- if (*pc == '+')
- ++pc;
-
- Uint8 n = 0;
- Uint8 limdiv = kMax_UI8 / base;
- int limoff = int(kMax_UI8 % base);
- do {
- // Do a sanity check for common radixes
- int ch = *pc;
- if (base == 10 && !isdigit(ch) ||
- base == 16 && !isxdigit(ch) ||
- base == 8 && (ch < '0' || ch > '7') ||
- base == 2 && (ch < '0' || ch > '1')) {
- NCBI_THROW2(CStringException, eConvert,
- "String cannot be converted to UInt8 - bad digit",
- s_DiffPtr(pc, str.c_str()));
- }
- int delta; // corresponding numeric value of *pc
- if (isdigit(ch)) {
- delta = ch - '0';
- } else {
- ch = tolower(ch);
- // Got to be 'a' to 'f' because of previous sanity checks
- delta = ch - 'a' + 10;
- }
-
- n *= base;
- // Overflow checking
- if (n > limdiv || (n == limdiv && delta > limoff)) {
- NCBI_THROW2(CStringException, eConvert,
- "String cannot be converted to UInt8 - overflow",
- s_DiffPtr(pc, str.c_str()));
- }
- n += delta;
- } while (*++pc);
- return n;
- }
- void NStr::UInt8ToString(string& out_str, Uint8 value)
- {
- const size_t kBufSize = (sizeof(value) * CHAR_BIT) / 3 + 2;
- char buffer[kBufSize];
- char* pos = buffer + kBufSize;
- if ( value == 0 ) {
- *--pos = '0';
- }
- else {
- do {
- *--pos = char('0' + (value % 10));
- value /= 10;
- } while ( value );
- }
- out_str.resize(0);
- out_str.append(pos, buffer + kBufSize - pos);
- }
- string NStr::UInt8ToString(Uint8 value)
- {
- string ret;
- NStr::UInt8ToString(ret, value);
- return ret;
- }
// A maximal double precision used in the double to string conversion
#if defined(NCBI_OS_MSWIN)
    const unsigned int kMaxDoublePrecision = 200;
#else
    const unsigned int kMaxDoublePrecision = 308;
#endif

// A maximal size of a double value in a string form:
// integer part (the largest finite double has 309 decimal digits)
// + sign + dot + ending '\0' + max.precision
const unsigned int kMaxDoubleStringSize = 309 + 3 + kMaxDoublePrecision;
- string NStr::DoubleToString(double value)
- {
- char buffer[kMaxDoubleStringSize];
- ::sprintf(buffer, "%g", value);
- return buffer;
- }
- void NStr::DoubleToString(string& out_str, double value)
- {
- char buffer[kMaxDoubleStringSize];
- ::sprintf(buffer, "%g", value);
- out_str = buffer;
- }
- string NStr::DoubleToString(double value, unsigned int precision)
- {
- char buffer[kMaxDoubleStringSize];
- SIZE_TYPE n = DoubleToString(value, precision, buffer,
- kMaxDoubleStringSize);
- buffer[n] = ' ';
- return buffer;
- }
- SIZE_TYPE NStr::DoubleToString(double value, unsigned int precision,
- char* buf, SIZE_TYPE buf_size)
- {
- char buffer[kMaxDoubleStringSize];
- if (precision > kMaxDoublePrecision) {
- precision = kMaxDoublePrecision;
- }
- int n = ::sprintf(buffer, "%.*f", (int) precision, value);
- SIZE_TYPE n_copy = min((SIZE_TYPE) n, buf_size);
- memcpy(buf, buffer, n_copy);
- return n_copy;
- }
- string NStr::PtrToString(const void* value)
- {
- char buffer[64];
- ::sprintf(buffer, "%p", value);
- return buffer;
- }
- void NStr::PtrToString(string& out_str, const void* value)
- {
- char buffer[64];
- ::sprintf(buffer, "%p", value);
- out_str = buffer;
- }
- const void* NStr::StringToPtr(const string& str)
- {
- void *ptr = NULL;
- ::sscanf(str.c_str(), "%p", &ptr);
- return ptr;
- }
- static const string s_kTrueString = "true";
- static const string s_kFalseString = "false";
- static const string s_kTString = "t";
- static const string s_kFString = "f";
- static const string s_kYesString = "yes";
- static const string s_kNoString = "no";
- static const string s_kYString = "y";
- static const string s_kNString = "n";
- const string& NStr::BoolToString(bool value)
- {
- return value ? s_kTrueString : s_kFalseString;
- }
- bool NStr::StringToBool(const string& str)
- {
- if ( AStrEquiv(str, s_kTrueString, PNocase()) ||
- AStrEquiv(str, s_kTString, PNocase()) ||
- AStrEquiv(str, s_kYesString, PNocase()) ||
- AStrEquiv(str, s_kYString, PNocase()) )
- return true;
- if ( AStrEquiv(str, s_kFalseString, PNocase()) ||
- AStrEquiv(str, s_kFString, PNocase()) ||
- AStrEquiv(str, s_kNoString, PNocase()) ||
- AStrEquiv(str, s_kNString, PNocase()) )
- return false;
- NCBI_THROW2(CStringException, eConvert,
- "String cannot be converted to bool", 0);
- }
- string NStr::FormatVarargs(const char* format, va_list args)
- {
- #ifdef HAVE_VASPRINTF
- char* s;
- int n = vasprintf(&s, format, args);
- if (n >= 0) {
- string str(s, n);
- free(s);
- return str;
- } else {
- return kEmptyStr;
- }
- #elif defined(NCBI_COMPILER_GCC) && defined(NO_PUBSYNC)
- CNcbiOstrstream oss;
- oss.vform(format, args);
- return CNcbiOstrstreamToString(oss);
- #elif defined(HAVE_VSNPRINTF)
- // deal with implementation quirks
- size_t size = 1024;
- AutoPtr<char, ArrayDeleter<char> > buf(new char[size]);
- buf.get()[size-1] = buf.get()[size-2] = 0;
- size_t n = vsnprintf(buf.get(), size, format, args);
- while (n >= size || buf.get()[size-2]) {
- if (buf.get()[size-1]) {
- ERR_POST(Warning << "Buffer overrun by buggy vsnprintf");
- }
- size = max(size << 1, n);
- buf.reset(new char[size]);
- buf.get()[size-1] = buf.get()[size-2] = 0;
- n = vsnprintf(buf.get(), size, format, args);
- }
- return (n > 0) ? string(buf.get(), n) : kEmptyStr;
- #elif defined(HAVE_VPRINTF)
- char buf[1024];
- buf[sizeof(buf) - 1] = 0;
- vsprintf(buf, format, args);
- if (buf[sizeof(buf) - 1]) {
- ERR_POST(Warning << "Buffer overrun by vsprintf");
- }
- return buf;
- #else
- # error Please port this code to your system.
- #endif
- }
- SIZE_TYPE NStr::FindNoCase(const string& str, const string& pattern,
- SIZE_TYPE start, SIZE_TYPE end, EOccurrence where)
- {
- string pat(pattern, 0, 1);
- SIZE_TYPE l = pattern.size();
- if (isupper(pat[0])) {
- pat += (char)tolower(pat[0]);
- } else if (islower(pat[0])) {
- pat += (char)toupper(pat[0]);
- }
- if (where == eFirst) {
- SIZE_TYPE pos = str.find_first_of(pat, start);
- while (pos != NPOS && pos <= end
- && CompareNocase(str, pos, l, pattern) != 0) {
- pos = str.find_first_of(pat, pos + 1);
- }
- return pos > end ? NPOS : pos;
- } else { // eLast
- SIZE_TYPE pos = str.find_last_of(pat, end);
- while (pos != NPOS && pos >= start
- && CompareNocase(str, pos, l, pattern) != 0) {
- if (pos == 0) {
- return NPOS;
- }
- pos = str.find_last_of(pat, pos - 1);
- }
- return pos < start ? NPOS : pos;
- }
- }
- string NStr::TruncateSpaces(const string& str, ETrunc where)
- {
- SIZE_TYPE beg = 0;
- if (where == eTrunc_Begin || where == eTrunc_Both) {
- while (beg < str.length() && isspace(str[beg]))
- beg++;
- if (beg == str.length())
- return kEmptyStr;
- }
- SIZE_TYPE end = str.length() - 1;
- if (where == eTrunc_End || where == eTrunc_Both) {
- while ( isspace(str[end]) )
- end--;
- }
- _ASSERT( beg <= end );
- return str.substr(beg, end - beg + 1);
- }
- string& NStr::Replace(const string& src,
- const string& search, const string& replace,
- string& dst, SIZE_TYPE start_pos, size_t max_replace)
- {
- // source and destination should not be the same
- if (&src == &dst) {
- NCBI_THROW2(CStringException, eBadArgs,
- "String replace called with source == destination", 0);
- }
- dst = src;
- if( start_pos + search.size() > src.size() ||
- search == replace)
- return dst;
- for(size_t count = 0; !(max_replace && count >= max_replace); count++) {
- start_pos = dst.find(search, start_pos);
- if(start_pos == NPOS)
- break;
- dst.replace(start_pos, search.size(), replace);
- start_pos += replace.size();
- }
- return dst;
- }
- string NStr::Replace(const string& src,
- const string& search, const string& replace,
- SIZE_TYPE start_pos, size_t max_replace)
- {
- string dst;
- return Replace(src, search, replace, dst, start_pos, max_replace);
- }
- list<string>& NStr::Split(const string& str, const string& delim,
- list<string>& arr, EMergeDelims merge)
- {
- for (size_t pos = 0; ; ) {
- size_t prev_pos = (merge == eMergeDelims
- ? str.find_first_not_of(delim, pos)
- : pos);
- if (prev_pos == NPOS) {
- break;
- }
- pos = str.find_first_of(delim, prev_pos);
- if (pos == NPOS) {
- arr.push_back(str.substr(prev_pos));
- break;
- } else {
- arr.push_back(str.substr(prev_pos, pos - prev_pos));
- ++pos;
- }
- }
- return arr;
- }
- vector<string>& NStr::Tokenize(const string& str, const string& delim,
- vector<string>& arr, EMergeDelims merge)
- {
- if (delim.empty()) {
- arr.push_back(str);
- return arr;
- }
- size_t pos, prev_pos;
- // Count number of tokens to determine the array size
- size_t tokens = 0;
- for (pos = prev_pos = 0; pos < str.length(); ++pos) {
- char c = str[pos];
- size_t dpos = delim.find(c);
- if (dpos != string::npos) ++tokens;
- }
- arr.reserve(arr.size() + tokens + 1);
- // Tokenization
- for (pos = 0; ; ) {
- prev_pos = (merge == eMergeDelims ? str.find_first_not_of(delim, pos)
- : pos);
- if (prev_pos == NPOS) {
- break;
- }
- pos = str.find_first_of(delim, prev_pos);
- if (pos == NPOS) {
- arr.push_back(str.substr(prev_pos));
- break;
- } else {
- arr.push_back(str.substr(prev_pos, pos - prev_pos));
- ++pos;
- }
- }
- return arr;
- }
- bool NStr::SplitInTwo(const string& str, const string& delim,
- string& str1, string& str2)
- {
- SIZE_TYPE delim_pos = str.find_first_of(delim);
- if (NPOS == delim_pos) { // only one piece.
- str1 = str;
- str2 = kEmptyStr;
- return false;
- }
- str1 = str.substr(0, delim_pos);
- str2 = str.substr(delim_pos + 1); // skip only one delimiter character.
- return true;
- }
- template <typename T>
- string s_NStr_Join(const T& arr, const string& delim)
- {
- if (arr.empty()) {
- return kEmptyStr;
- }
- string result = arr.front();
- typename T::const_iterator it = arr.begin();
- SIZE_TYPE needed = result.size();
- while (++it != arr.end()) {
- needed += delim.size() + it->size();
- }
- result.reserve(needed);
- it = arr.begin();
- while (++it != arr.end()) {
- result += delim;
- result += *it;
- }
- return result;
- }
- string NStr::Join(const list<string>& arr, const string& delim)
- {
- return s_NStr_Join(arr, delim);
- }
- string NStr::Join(const vector<string>& arr, const string& delim)
- {
- return s_NStr_Join(arr, delim);
- }
- string NStr::PrintableString(const string& str,
- NStr::ENewLineMode nl_mode)
- {
- static const char s_Hex[] = "0123456789ABCDEF";
- ITERATE ( string, it, str ) {
- if ( !isprint(*it) || *it == '"' || *it == '\' ) {
- // bad character - convert via CNcbiOstrstream
- CNcbiOstrstream out;
- // write first good characters in one chunk
- out.write(str.data(), it-str.begin());
- // convert all other characters one by one
- do {
- if ( isprint(*it) ) {
- // escape '"' and '\' anyway
- if ( *it == '"' || *it == '\' )
- out.put('\');
- out.put(*it);
- }
- else if (*it == 'n') {
- // newline needs special processing
- if (nl_mode == eNewLine_Quote) {
- out.write("\n", 2);
- }
- else {
- out.put('n');
- }
- } else {
- // all other non-printable characters need to be escaped
- out.put('\');
- if (*it == 't') {
- out.put('t');
- } else if (*it == 'r') {
- out.put('r');
- } else if (*it == 'v') {
- out.put('v');
- } else {
- // hex string for non-standard codes
- out.put('x');
- out.put(s_Hex[(unsigned char) *it >> 4]);
- out.put(s_Hex[(unsigned char) *it & 15]);
- }
- }
- } while (++it < it_end); // it_end is from ITERATE macro
- return CNcbiOstrstreamToString(out);
- }
- }
- // all characters are good - return orignal string
- return str;
- }
- string NStr::ParseEscapes(const string& str)
- {
- string out;
- out.reserve(str.size()); // can only be smaller
- SIZE_TYPE pos = 0;
- while (pos < str.size()) {
- SIZE_TYPE pos2 = str.find('\', pos);
- if (pos2 == NPOS) {
- out += str.substr(pos);
- break;
- }
- out += str.substr(pos, pos2 - pos);
- if (++pos2 == str.size()) {
- NCBI_THROW2(CStringException, eFormat,
- "Unterminated escape sequence", pos2);
- }
- switch (str[pos2]) {
- case 'a': out += 'a'; break;
- case 'b': out += 'b'; break;
- case 'f': out += 'f'; break;
- case 'n': out += 'n'; break;
- case 'r': out += 'r'; break;
- case 't': out += 't'; break;
- case 'v': out += 'v'; break;
- case 'x':
- {
- pos = pos2 + 1;
- while (pos2 <= pos && pos2 + 1 < str.size()
- && isxdigit(str[pos2 + 1])) {
- ++pos2;
- }
- if (pos2 >= pos) {
- out += static_cast<char>
- (StringToUInt(str.substr(pos, pos2 - pos + 1), 16));
- } else {
- NCBI_THROW2(CStringException, eFormat,
- "\x used with no following digits", pos);
- }
- break;
- }
- case '0': case '1': case '2': case '3':
- case '4': case '5': case '6': case '7':
- {
- pos = pos2;
- unsigned char c = str[pos2] - '0';
- while (pos2 < pos + 3 && pos2 + 1 < str.size()
- && str[pos2 + 1] >= '0' && str[pos2 + 1] <= '7') {
- c = (c << 3) | (str[++pos2] - '0');
- }
- out += c;
- }
- default:
- out += str[pos2];
- }
- pos = pos2 + 1;
- }
- return out;
- }
- // Determines the end of an HTML <...> tag, accounting for attributes
- // and comments (the latter allowed only within <!...>).
- static SIZE_TYPE s_EndOfTag(const string& str, SIZE_TYPE start)
- {
- _ASSERT(start < str.size() && str[start] == '<');
- bool comments_ok = (start + 1 < str.size() && str[start + 1] == '!');
- for (SIZE_TYPE pos = start + 1; pos < str.size(); ++pos) {
- switch (str[pos]) {
- case '>': // found the end
- return pos;
- case '"': // start of "string"; advance to end
- pos = str.find('"', pos + 1);
- if (pos == NPOS) {
- NCBI_THROW2(CStringException, eFormat,
- "Unclosed string in HTML tag", start);
- // return pos;
- }
- break;
- case '-': // possible start of -- comment --; advance to end
- if (comments_ok && pos + 1 < str.size()
- && str[pos + 1] == '-') {
- pos = str.find("--", pos + 2);
- if (pos == NPOS) {
- NCBI_THROW2(CStringException, eFormat,
- "Unclosed comment in HTML tag", start);
- // return pos;
- } else {
- ++pos;
- }
- }
- }
- }
- NCBI_THROW2(CStringException, eFormat, "Unclosed HTML tag", start);
- // return NPOS;
- }
- // Determines the end of an HTML &foo; character/entity reference
- // (which might not actually end with a semicolon :-/)
- static SIZE_TYPE s_EndOfReference(const string& str, SIZE_TYPE start)
- {
- _ASSERT(start < str.size() && str[start] == '&');
- #ifdef NCBI_STRICT_HTML_REFS
- return str.find(';', start + 1);
- #else
- SIZE_TYPE pos = str.find_first_not_of
- ("#0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
- start + 1);
- if (pos == NPOS || str[pos] == ';') {
- return pos;
- } else {
- return pos - 1;
- }
- #endif
- }
- static SIZE_TYPE s_VisibleWidth(const string& str, bool is_html)
- {
- if (is_html) {
- SIZE_TYPE width = 0, pos = 0;
- for (;;) {
- SIZE_TYPE pos2 = str.find_first_of("<&", pos);
- if (pos2 == NPOS) {
- width += str.size() - pos;
- break;
- } else {
- width += pos2 - pos;
- if (str[pos2] == '&') {
- ++width;
- pos = s_EndOfReference(str, pos);
- } else {
- pos = s_EndOfTag(str, pos);
- }
- if (pos == NPOS) {
- break;
- } else {
- ++pos;
- }
- }
- }
- return width;
- } else {
- return str.size();
- }
- }
- list<string>& NStr::Wrap(const string& str, SIZE_TYPE width,
- list<string>& arr, NStr::TWrapFlags flags,
- const string* prefix, const string* prefix1)
- {
- if (prefix == 0) {
- prefix = &kEmptyStr;
- }
- const string* pfx = prefix1 ? prefix1 : prefix;
- SIZE_TYPE pos = 0, len = str.size();
- string hyphen; // "-" or empty
- bool is_html = flags & fWrap_HTMLPre ? true : false;
- enum EScore { // worst to best
- eForced,
- ePunct,
- eSpace,
- eNewline
- };
- while (pos < len) {
- SIZE_TYPE column = s_VisibleWidth(*pfx, is_html);
- SIZE_TYPE column0 = column;
- // the next line will start at best_pos
- SIZE_TYPE best_pos = NPOS;
- EScore best_score = eForced;
- for (SIZE_TYPE pos2 = pos; pos2 < len && column <= width;
- ++pos2, ++column) {
- EScore score = eForced;
- SIZE_TYPE score_pos = pos2;
- char c = str[pos2];
- if (c == 'n') {
- best_pos = pos2;
- best_score = eNewline;
- break;
- } else if (isspace(c)) {
- if (pos2 > 0 && isspace(str[pos2 - 1])) {
- continue; // take the first space of a group
- }
- score = eSpace;
- } else if (is_html && c == '<') {
- // treat tags as zero-width...
- pos2 = s_EndOfTag(str, pos2);
- --column;
- } else if (is_html && c == '&') {
- // ...and references as single characters
- pos2 = s_EndOfReference(str, pos2);
- } else if (ispunct(c)) {
- if (c == '(' || c == '[' || c == '{' || c == '<'
- || c == '`') { // opening element
- score = ePunct;
- } else if (score_pos < len - 1) {
- // Prefer breaking *after* most types of punctuation.
- score = ePunct;
- ++score_pos;
- }
- }
- if (score >= best_score) {
- best_pos = score_pos;
- best_score = score;
- }
- while (pos2 < len - 1 && str[pos2 + 1] == 'b') {
- // Account for backspaces
- ++pos2;
- if (column > column0) {
- --column;
- }
- }
- }
- if (best_score != eNewline && column <= width) {
- // If the whole remaining text can fit, don't split it...
- best_pos = len;
- } else if (best_score == eForced && (flags & fWrap_Hyphenate)) {
- hyphen = "-";
- --best_pos;
- }
- arr.push_back(*pfx);
- {{ // eat backspaces and the characters (if any) that precede them
- string line(str, pos, best_pos - pos);
- SIZE_TYPE bs = 0;
- while ((bs = line.find('b', bs)) != NPOS) {
- if (bs > 0) {
- line.erase(bs - 1, 2);
- } else {
- line.erase(0, 1);
- }
- }
- arr.back() += line;
- }}
- arr.back() += hyphen;
- pos = best_pos;
- pfx = prefix;
- hyphen = kEmptyStr;
- if (best_score == eSpace) {
- // If breaking at a group of spaces, skip over the whole group
- while (pos < len && isspace(str[pos]) && str[pos] != 'n') {
- ++pos;
- }
- } else if (best_score == eNewline) {
- ++pos;
- }
- while (pos < len && str[pos] == 'b') {
- ++pos;
- }
- }
- return arr;
- }
- list<string>& NStr::WrapList(const list<string>& l, SIZE_TYPE width,
- const string& delim, list<string>& arr,
- NStr::TWrapFlags flags, const string* prefix,
- const string* prefix1)
- {
- if (l.empty()) {
- return arr;
- }
- const string* pfx = prefix1 ? prefix1 : prefix;
- string s = *pfx;
- bool is_html = flags & fWrap_HTMLPre ? true : false;
- SIZE_TYPE column = s_VisibleWidth(s, is_html);
- SIZE_TYPE delwidth = s_VisibleWidth(delim, is_html);
- bool at_start = true;
- ITERATE (list<string>, it, l) {
- SIZE_TYPE term_width = s_VisibleWidth(*it, is_html);
- if (at_start) {
- if (column + term_width <= width) {
- s += *it;
- column += term_width;
- at_start = false;
- } else {
- // Can't fit, even on its own line; break separately.
- Wrap(*it, width, arr, flags, prefix, pfx);
- pfx = prefix;
- s = *prefix;
- column = s_VisibleWidth(s, is_html);
- at_start = true;
- }
- } else if (column + delwidth + term_width <= width) {
- s += delim;
- s += *it;
- column += delwidth + term_width;
- at_start = false;
- } else {
- // Can't fit on this line; break here and try again.
- arr.push_back(s);
- pfx = prefix;
- s = *prefix;
- column = s_VisibleWidth(s, is_html);
- at_start = true;
- --it;
- }
- }
- arr.push_back(s);
- return arr;
- }
#if !defined(HAVE_STRDUP)
// Fallback for platforms lacking strdup(): return a malloc()-ed copy of
// "str", or 0 if "str" is null or the allocation fails.
extern char* strdup(const char* str)
{
    if ( !str ) {
        return 0;
    }
    const size_t size = strlen(str) + 1;   // include the terminator
    void* const  copy = malloc(size);
    if ( !copy ) {
        return 0;
    }
    return (char*) memcpy(copy, str, size);
}
#endif
- /////////////////////////////////////////////////////////////////////////////
- // CStringUTF8
- void CStringUTF8::x_Append(const char* src)
- {
- const char* srcBuf;
- size_t needed = 0;
- for (srcBuf = src; *srcBuf; ++srcBuf) {
- Uint1 ch = *srcBuf;
- if (ch < 0x80) {
- ++needed;
- } else {
- needed += 2;
- }
- }
- if ( !needed )
- return;
- reserve(length()+needed+1);
- for (srcBuf = src; *srcBuf; ++srcBuf) {
- Uint1 ch = *srcBuf;
- if (ch < 0x80) {
- append(1, ch);
- } else {
- append(1, Uint1((ch >> 6) | 0xC0));
- append(1, Uint1((ch & 0x3F) | 0x80));
- }
- }
- }
#if defined(HAVE_WSTRING)
// Append the NUL-terminated wide string "src".  Each element is taken as a
// 16-bit character code and written as a one-, two- or three-byte UTF-8
// sequence.
void CStringUTF8::x_Append(const wchar_t* src)
{
    // First pass: compute how many bytes will be added
    size_t needed = 0;
    for (const wchar_t* p = src;  *p;  ++p) {
        const Uint2 ch = *p;
        needed += (ch < 0x80) ? 1 : ((ch < 0x800) ? 2 : 3);
    }
    if (needed == 0) {
        return;
    }
    reserve(length() + needed + 1);

    // Second pass: encode
    for (const wchar_t* p = src;  *p;  ++p) {
        const Uint2 ch = *p;
        if (ch < 0x80) {
            append(1, ch);
            continue;
        }
        if (ch < 0x800) {
            append(1, Uint2((ch >> 6) | 0xC0));
        } else {
            append(1, Uint2((ch >> 12) | 0xE0));
            append(1, Uint2(((ch >> 6) & 0x3F) | 0x80));
        }
        // The last continuation byte is the same for both multi-byte forms
        append(1, Uint2((ch & 0x3F) | 0x80));
    }
}
#endif // HAVE_WSTRING
- string CStringUTF8::AsAscii(void) const
- {
- string result;
- const char* srcBuf;
- size_t needed = 0;
- bool bad = false;
- bool enough = true;
- for (srcBuf = c_str(); *srcBuf; ++srcBuf) {
- Uint1 ch = *srcBuf;
- if ((ch & 0x80) == 0) {
- ++needed;
- } else if ((ch & 0xE0) == 0xC0) {
- enough = (ch & 0x1F) <= 0x03;
- if (enough) {
- ++needed;
- ch = *(++srcBuf);
- bad = (ch & 0xC0) != 0x80;
- }
- } else if ((ch & 0xF0) == 0xE0) {
- enough = false;
- } else {
- bad = true;
- }
- if (!enough) {
- NCBI_THROW2(CStringException, eConvert,
- "Cannot convert UTF8 string to single-byte string",
- s_DiffPtr(srcBuf,c_str()));
- }
- if (bad) {
- NCBI_THROW2(CStringException, eFormat,
- "Wrong UTF8 format",
- s_DiffPtr(srcBuf,c_str()));
- }
- }
- result.reserve( needed+1);
- for (srcBuf = c_str(); *srcBuf; ++srcBuf) {
- Uint1 chRes;
- size_t more;
- Uint1 ch = *srcBuf;
- if ((ch & 0x80) == 0) {
- chRes = ch;
- more = 0;
- } else {
- chRes = (ch & 0x1F);
- more = 1;
- }
- while (more--) {
- ch = *(++srcBuf);
- chRes = (chRes << 6) | (ch & 0x3F);
- }
- result += chRes;
- }
- return result;
- }
#if defined(HAVE_WSTRING)
// Convert the UTF-8 content to a wide string, decoding one-, two- and
// three-byte sequences.
// Throws CStringException(eFormat) if the content is not valid UTF-8 of
// that kind.
wstring CStringUTF8::AsUnicode(void) const
{
    const char* p;

    // First pass: validate and count the resulting characters
    size_t needed = 0;
    for (p = c_str();  *p;  ++p) {
        const Uint1 lead = *p;

        // Number of continuation bytes announced by the lead byte
        size_t trail;
        if ((lead & 0x80) == 0) {
            trail = 0;
        } else if ((lead & 0xE0) == 0xC0) {
            trail = 1;
        } else if ((lead & 0xF0) == 0xE0) {
            trail = 2;
        } else {
            NCBI_THROW2(CStringException, eFormat,
                        "Wrong UTF8 format",
                        s_DiffPtr(p,c_str()));
        }
        ++needed;

        for ( ;  trail;  --trail) {
            ++p;
            if ((Uint1(*p) & 0xC0) != 0x80) {
                NCBI_THROW2(CStringException, eFormat,
                            "Wrong UTF8 format",
                            s_DiffPtr(p,c_str()));
            }
        }
    }

    // Second pass: decode (the input is known to be valid at this point)
    wstring result;
    result.reserve(needed + 1);
    for (p = c_str();  *p;  ++p) {
        Uint1  ch = *p;
        Uint2  chRes;
        size_t more;
        if ((ch & 0x80) == 0) {
            chRes = ch;
            more  = 0;
        } else if ((ch & 0xE0) == 0xC0) {
            chRes = (ch & 0x1F);
            more  = 1;
        } else {
            chRes = (ch & 0x0F);
            more  = 2;
        }
        for ( ;  more;  --more) {
            ch    = *(++p);
            chRes = (chRes << 6) | (ch & 0x3F);
        }
        result += chRes;
    }
    return result;
}
#endif // HAVE_WSTRING
- END_NCBI_SCOPE
- /*
- * ===========================================================================
- * $Log: ncbistr.cpp,v $
- * Revision 1000.6 2004/06/01 19:09:21 gouriano
- * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.109
- *
- * Revision 1.109 2004/05/26 19:21:25 ucko
- * FindNoCase: avoid looping in eLastMode when there aren't any full
- * matches but the first character of the string matches the first
- * character of the pattern.
- *
- * Revision 1.108 2004/05/14 13:59:27 gorelenk
- * Added include of ncbi_pch.hpp
- *
- * Revision 1.107 2004/03/11 18:49:48 gorelenk
- * Removed (conditionally) implementation of class CNcbiEmptyString.
- *
- * Revision 1.106 2004/03/05 12:26:43 ivanov
- * Moved CDirEntry::MatchesMask() to NStr class.
- *
- * Revision 1.105 2004/03/04 13:38:57 kuznets
- * + set of ToString conversion functions taking output string as a parameter,
- * not a return value (should give a performance advantage in some cases)
- *
- * Revision 1.104 2004/02/19 16:44:55 vasilche
- * WorkShop compiler doesn't support static templates.
- *
- * Revision 1.103 2004/02/18 20:54:47 shomrat
- * bug fix (pos -> pos2)
- *
- * Revision 1.102 2003/12/12 20:06:44 rsmith
- * Take out un-needed include of stdarg.h (included in ncbistr.hpp).
- *
- * Revision 1.101 2003/12/12 20:04:24 rsmith
- * make sure stdarg.h is included to define va_list.
- *
- * Revision 1.100 2003/12/12 17:26:54 ucko
- * +FormatVarargs
- *
- * Revision 1.99 2003/12/01 20:45:47 ucko
- * Extend Join to handle vectors as well as lists (common code templatized).
- * Add ParseEscapes (inverse of PrintableString).
- *
- * Revision 1.98 2003/10/31 13:15:20 lavr
- * Fix typos in the log of the previous commit :-)
- *
- * Revision 1.97 2003/10/31 12:59:46 lavr
- * Better diagnostics messages from exceptions; some other cosmetic changes
- *
- * Revision 1.96 2003/10/03 15:16:02 ucko
- * NStr::Join: preallocate as much space as we need for result.
- *
- * Revision 1.95 2003/09/17 15:18:29 vasilche
- * Reduce memory allocations in NStr::PrintableString()
- *
- * Revision 1.94 2003/08/19 15:17:20 rsmith
- * Add NStr::SplitInTwo() function.
- *
- * Revision 1.93 2003/06/16 15:19:03 ucko
- * FindNoCase: always honor both start and end (oops).
- *
- * Revision 1.92 2003/05/22 20:09:29 gouriano
- * added UTF8 strings
- *
- * Revision 1.91 2003/05/14 21:52:09 ucko
- * Move FindNoCase out of line and reimplement it to avoid making
- * lowercase copies of both strings.
- *
- * Revision 1.90 2003/03/25 22:15:40 lavr
- * NStr::PrintableString():: Print NUL char as x00 instead of