/* OpenLieroX UTF8/Unicode conversions code under LGPL created 01-05-2007 by Albert Zeyer and Dark Charlie */ #ifndef __UNICODE_H__ #define __UNICODE_H__ #include // for Uint32 #include typedef Uint32 UnicodeChar; typedef std::basic_string UnicodeString; typedef Uint16 Utf16Char; typedef std::basic_string Utf16String; struct ConversionItem { UnicodeChar Unicode; unsigned char Utf8[4]; char Ascii; }; #define UNKNOWN_CHARACTER ' ' // Characters not in conversion table extern ConversionItem tConversionTable[]; /////////////////////// // Moves the iterator to next unicode character in the string, returns number of bytes skipped template inline size_t IncUtf8StringIterator(_Iterator1& it, const _Iterator2& last) { if(it == last) return 0; unsigned char c; size_t res = 1; for(++it; last != it; ++it, ++res) { c = *it; if(!(c&0x80) || ((c&0xC0) == 0xC0)) break; } return res; } template inline size_t MultIncUtf8StringIterator(_Iterator& it, const _Iterator& last, size_t count) { size_t res = 0; for(size_t i = 0; i < count; i++) { if(it == last) break; res += IncUtf8StringIterator(it, last); } return res; } /////////////////// // The iterator points at first byte of the UTF8 encoded character, returns number of bytes skipped template inline size_t DecUtf8StringIterator(_Iterator1& it, const _Iterator2& first) { if(it == first) return 0; size_t res = 1; unsigned char c; --it; for(; first != it; --it, ++res) { c = *it; if(!(c&0x80) || ((c&0xC0) == 0xC0)) break; } return res; } template inline _Iterator GetMultIncUtf8StringIterator(_Iterator it, const _Iterator& last, size_t count) { MultIncUtf8StringIterator(it, last, count); return it; } inline std::string::const_iterator Utf8PositionToIterator(const std::string& str, size_t pos) { std::string::const_iterator res = str.begin(); MultIncUtf8StringIterator(res, str.end(), pos); return res; } inline std::string::iterator Utf8PositionToIterator(std::string& str, size_t pos) { std::string::iterator res = str.begin(); MultIncUtf8StringIterator(res, str.end(), pos); return res; } //////////////////////// // Reads next unicode character from a UTF8 encoded string // the iterator shows at the next character after this operation UnicodeChar GetNextUnicodeFromUtf8(std::string::const_iterator &it, const std::string::const_iterator& last, size_t& num_skipped); inline UnicodeChar GetNextUnicodeFromUtf8(std::string::const_iterator& it, const std::string::const_iterator& last) { size_t tmp; return GetNextUnicodeFromUtf8(it, last, tmp); } inline UnicodeChar GetUnicodeFromUtf8(const std::string& str, size_t pos) { std::string::const_iterator it = Utf8PositionToIterator(str, pos); return GetNextUnicodeFromUtf8(it, str.end()); } //////////////////// // Gets the UTF8 representation of the unicode character (can be more bytes) std::string GetUtf8FromUnicode(UnicodeChar ch); inline size_t Utf8StringSize(const std::string& str) { size_t res = 0; std::string::const_iterator it = str.begin(); for(; it != str.end(); IncUtf8StringIterator(it, str.end())) res++; return res; } inline std::string Utf8SubStr(const std::string& str, size_t start, size_t n = (size_t)-1) { if (n == (size_t)-1) return std::string(Utf8PositionToIterator(str, start), str.end()); else return std::string( Utf8PositionToIterator(str, start), Utf8PositionToIterator(str, start + n)); } inline void Utf8Erase(std::string& str, size_t start, size_t n = (size_t)-1) { std::string::iterator it = Utf8PositionToIterator(str, start); str.erase(it, GetMultIncUtf8StringIterator(it, str.end(), n)); } inline void Utf8Insert(std::string& str, size_t start, const std::string& s) { str.insert(Utf8PositionToIterator(str, start), s.begin(), s.end()); } inline void InsertUnicodeChar(std::string& str, size_t pos, UnicodeChar ch) { std::string tmp = GetUtf8FromUnicode(ch); Utf8Insert(str, pos, tmp); } // Uppercase/lowercase handling UnicodeChar UnicodeToLower(UnicodeChar c); UnicodeChar UnicodeToUpper(UnicodeChar c); // Conversion functions int FindTableIndex(UnicodeChar c); char UnicodeCharToAsciiChar(UnicodeChar c); std::string RemoveSpecialChars(const std::string &Utf8String); std::string Utf16ToUtf8(const Utf16String& str); Utf16String Utf8ToUtf16(const std::string& str); std::string UnicodeToUtf8(const UnicodeString& str); UnicodeString Utf8ToUnicode(const std::string& str); #ifdef WIN32 std::string Utf8ToSystemNative(const std::string& utf8str); std::string SystemNativeToUtf8(const std::string& natstr); #else // Other platforms use natively utf8 (at least we suppose so) inline std::string Utf8ToSystemNative(const std::string& utf8str) { return utf8str; } inline std::string SystemNativeToUtf8(const std::string& natstr) { return natstr; } #endif size_t TransformRawToUtf8Pos(const std::string& text, size_t pos); size_t TransformUtf8PosToRaw(const std::string& text, size_t pos); inline size_t TransformRawToUtf8ToRaw(const std::string& src, size_t srcpos, const std::string& dest) { return TransformUtf8PosToRaw(dest, TransformRawToUtf8Pos(src, srcpos)); } #endif