diff --git a/nel/include/nel/misc/ucstring.h b/nel/include/nel/misc/ucstring.h index 8859a21d9..28c83ce2f 100644 --- a/nel/include/nel/misc/ucstring.h +++ b/nel/include/nel/misc/ucstring.h @@ -46,6 +46,11 @@ public: { } + ucstring(const ucchar *begin, const ucchar *end) + : ucstringbase(begin, end) + { + } + ucstring(const std::string &str) : ucstringbase() { diff --git a/nel/include/nel/misc/utf_string_view.h b/nel/include/nel/misc/utf_string_view.h index 78df8c141..503fc3737 100644 --- a/nel/include/nel/misc/utf_string_view.h +++ b/nel/include/nel/misc/utf_string_view.h @@ -19,7 +19,6 @@ #include #include -#include #include namespace NLMISC { @@ -27,7 +26,7 @@ namespace NLMISC { /// String view for UTF-8 and UTF-32 iteration as 32-bit codepoints. /// This string view keeps the string as a reference, it does not make a copy. /// Only use this for iterating a string's codepoints. -/// Strings are not necessarily NUL-terminated. +/// Strings are not required to be NUL-terminated, but must have at least one character extra. class CUtfStringView { public: @@ -41,12 +40,13 @@ public: inline CUtfStringView(const ucstring &utf16Str) : m_Str(utf16Str.c_str()), m_Size(utf16Str.size() << 1), m_Iterator(utf16Iterator) {} inline CUtfStringView(const u32string &utf32Str) : m_Str(utf32Str.c_str()), m_Size(utf32Str.size() << 2), m_Iterator(utf32Iterator) {} - std::string toUtf8(); // Makes a copy - u32string toUtf32(); // Makes a copy + std::string toUtf8(bool reEncode = false) const; // Makes a copy + ucstring toUtf16(bool reEncode = false) const; // Makes a copy + u32string toUtf32() const; // Makes a copy - inline bool isUtf8() { return m_Iterator == utf8Iterator; } - inline bool isUtf16() { return m_Iterator == utf16Iterator; } - inline bool isUtf32() { return m_Iterator == utf32Iterator; } + inline bool isUtf8() const { return m_Iterator == utf8Iterator; } + inline bool isUtf16() const { return m_Iterator == utf16Iterator; } + inline bool isUtf32() const { return m_Iterator == utf32Iterator; } struct const_iterator { @@ -82,8 +82,8 @@ private: static u32char utf16Iterator(const void **addr); static u32char utf32Iterator(const void **addr); - const void *const m_Str; - const size_t m_Size; + const void *const m_Str; // String + const size_t m_Size; // Size in bytes const TIterator m_Iterator; }; /* class CUtfStringView */ diff --git a/nel/src/misc/ucstring.cpp b/nel/src/misc/ucstring.cpp index 5fad643ce..79cf78f7e 100644 --- a/nel/src/misc/ucstring.cpp +++ b/nel/src/misc/ucstring.cpp @@ -16,6 +16,7 @@ #include "stdmisc.h" #include "nel/misc/ucstring.h" +#include "nel/misc/utf_string_view.h" void ucstring::toString(std::string &str) const { @@ -31,138 +32,12 @@ void ucstring::toString(std::string &str) const std::string ucstring::toUtf8() const { -#if defined(NL_OS_WINDOWS) - // Use OS implementation - nlctassert(sizeof(wchar_t) == sizeof(ucchar)); - nlctassert(sizeof(wchar_t) == sizeof(uint16)); - return NLMISC::wideToUtf8(static_cast(*this)); -#else - std::string res; - ucstring::const_iterator first(begin()), last(end()); - for (; first != last; ++first) - { - //ucchar c = *first; - uint nbLoop = 0; - if (*first < 0x80) - res += char(*first); - else if (*first < 0x800) - { - ucchar c = *first; - c = c >> 6; - c = c & 0x1F; - res += char(c) | 0xC0; - nbLoop = 1; - } - else /*if (*first < 0x10000)*/ - { - ucchar c = *first; - c = c >> 12; - c = c & 0x0F; - res += char(c) | 0xE0; - nbLoop = 2; - } - - for (uint i = 0; i < nbLoop; ++i) - { - ucchar c = *first; - c = c >> ((nbLoop - i - 1) * 6); - c = c & 0x3F; - res += char(c) | 0x80; - } - } - return res; -#endif + return NLMISC::CUtfStringView(*this).toUtf8(); } void ucstring::fromUtf8(const std::string &stringUtf8) { -#if defined(NL_OS_WINDOWS) - // Use OS implementation - nlctassert(sizeof(wchar_t) == sizeof(ucchar)); - nlctassert(sizeof(wchar_t) == sizeof(uint16)); - nlctassert(sizeof(std::wstring) == sizeof(ucstring)); // These can be swapped on Windows - static_cast(*this) = nlmove(NLMISC::utf8ToWide(stringUtf8)); - if (stringUtf8.size() && !size()) - rawCopy(stringUtf8); -#else - // clear the string - erase(); - - uint8 c; - ucchar code; - sint iterations = 0; - - std::string::const_iterator first(stringUtf8.begin()), last(stringUtf8.end()); - for (; first != last;) - { - c = *first++; - code = c; - - if ((code & 0xFE) == 0xFC) - { - code &= 0x01; - iterations = 5; - } - else if ((code & 0xFC) == 0xF8) - { - code &= 0x03; - iterations = 4; - } - else if ((code & 0xF8) == 0xF0) - { - code &= 0x07; - iterations = 3; - } - else if ((code & 0xF0) == 0xE0) - { - code &= 0x0F; - iterations = 2; - } - else if ((code & 0xE0) == 0xC0) - { - code &= 0x1F; - iterations = 1; - } - else if ((code & 0x80) == 0x80) - { - // If it's not a valid UTF8 string, just copy the line without utf8 conversion - rawCopy(stringUtf8); - return; - } - else - { - push_back(code); - iterations = 0; - } - - if (iterations) - { - for (sint i = 0; i < iterations; i++) - { - if (first == last) - { - // If it's not a valid UTF8 string, just copy the line without utf8 conversion - rawCopy(stringUtf8); - return; - } - - uint8 ch; - ch = *first++; - - if ((ch & 0xC0) != 0x80) - { - // If it's not a valid UTF8 string, just copy the line without utf8 conversion - rawCopy(stringUtf8); - return; - } - - code <<= 6; - code |= (ucchar)(ch & 0x3F); - } - push_back(code); - } - } -#endif + *this = NLMISC::CUtfStringView(stringUtf8).toUtf16(); } void ucstring::rawCopy(const std::string &str) diff --git a/nel/src/misc/utf_string_view.cpp b/nel/src/misc/utf_string_view.cpp index 57dc5e226..ee5fdf089 100644 --- a/nel/src/misc/utf_string_view.cpp +++ b/nel/src/misc/utf_string_view.cpp @@ -19,52 +19,178 @@ // Project includes #include +// References: +// - https://twiserandom.com/unicode/unicode-encoding-utf-8-utf-16-utf-32/ +// - https://www.compart.com/en/unicode/U+1F30D +// - 0xF0 0x9F 0x8C 0x8D +// - 0xD83C 0xDF0D +// - 0x0001F30D + namespace NLMISC { -std::string CUtfStringView::toUtf8() +std::string CUtfStringView::toUtf8(bool reEncode) const { - if (m_Iterator == utf8Iterator) + // Decode UTF and encode UTF-8 + // This implementation makes no attempt at fixing invalid codepoints + if (m_Iterator == utf8Iterator && !reEncode) return std::string((const char *)m_Str, (const char *)((ptrdiff_t)m_Str + m_Size)); std::string res; - res.reserve((m_Size << 1) + 1); + res.reserve(m_Size); + for (iterator it(begin()), end(end()); it != end; ++it) + { + u32char c = *it; + if (c < 0x80) + { + // Encode as 1 byte + res += (char)c; + } + else if (c < 0x0800) + { + // Encode as 2 bytes + res += (char)((c & 0x07C0) >> 6) | 0xC0; + res += (char)(c & 0x3F) | 0x80; + } + else if (c < 0x010000) + { + // Encode as 3 bytes + res += (char)((c & 0xF000) >> 12) | 0xE0; + res += (char)((c & 0x0FC0) >> 6) | 0x80; + res += (char)(c & 0x3F) | 0x80; + } + else + { + // Encode as 4 bytes + res += (char)((c & 0x1C0000) >> 18) | 0xF0; + res += (char)((c & 0x03F000) >> 12) | 0x80; + res += (char)((c & 0x0FC0) >> 6) | 0x80; + res += (char)(c & 0x3F) | 0x80; + } + } + return res; +} + +ucstring CUtfStringView::toUtf16(bool reEncode) const +{ + if (m_Iterator == utf16Iterator && !reEncode) + return ucstring((const ucchar *)m_Str, (const ucchar *)((ptrdiff_t)m_Str + m_Size)); + ucstring res; + res.reserve(m_Size << 1); for (iterator it(begin()), end(end()); it != end; ++it) { u32char c = *it; - res += (char)c; /* TODO: Encode UTF-8 */ + if (c < 0x10000) + { + res += c; + } + else + { + c -= 0x10000; + res += (c >> 10) | 0xD800; + res += (c & 0x3FF) | 0xDC00; + } } + return res; } -u32string CUtfStringView::toUtf32() +u32string CUtfStringView::toUtf32() const { + // Decode any UTF + // This implementation makes no attempt at fixing bad encoding if (m_Iterator == utf32Iterator) return u32string((const u32char *)m_Str, (const u32char *)((ptrdiff_t)m_Str + m_Size)); u32string res; - res.reserve(m_Size + 1); + res.reserve(m_Size << 2); for (iterator it(begin()), end(end()); it != end; ++it) res += *it; + return res; } u32char CUtfStringView::utf8Iterator(const void **addr) { - /* TODO: Decode UTF-8 */ - const ucchar **pp = reinterpret_cast(addr); - ucchar c = **pp; + // Decode UTF-8 + // This implementation makes no attempt at fixing bad encoding, except for bad UTF-16 surrogate pairs + const uint8 **pp = reinterpret_cast(addr); + u32char c0 = **pp; ++(*pp); - return c; + if ((c0 & 0xC0) == 0xC0) + { + uint8 cx = **pp; + if ((cx & 0xC0) == 0x80) + { + ++(*pp); + c0 &= 0x3F; // Drop first two bits + c0 <<= 6; + c0 |= (cx & 0x3F); // 12 bits now (6 + 6), 2-byte encoding + if (c0 & 0x800) + { + cx = **pp; + if ((cx & 0xC0) == 0x80) + { + ++(*pp); + c0 &= 0x07FF; // Drop first bit + c0 <<= 6; + c0 |= (cx & 0x3F); // 17 bits now (12 - 1 + 6), 3-byte encoding + if (c0 & 0x10000) + { + cx = **pp; + if ((cx & 0xC0) == 0x80) + { + ++(*pp); + c0 &= 0xFFFF; // Drop first bit + c0 <<= 6; + c0 |= (cx & 0x3F); // 22 bits now (17 - 1 + 6), 3-byte encoding + } + } + else if ((c0 & 0xFC00) == 0xD800) // Higher bits of nutcase UTF-16 encoded as UTF-8 + { + uint8 cy; + if ((*pp)[0] == 0xED && ((cx = (*pp)[1]) & 0xF0) == 0xB0 && ((cy = (*pp)[2]) & 0xC0) == 0x80) + { + // Lower bits of nutcase UTF-16 encoded as UTF-8 + (*pp) += 3; + uint16 c1 = (cx & 0x0F); + c1 <<= 6; + c1 |= (cy & 0x3F); + c0 &= 0x03FF; + c0 <<= 10; + c0 |= (c1 & 0x03FF); + c0 += 0x10000; + } + } + } + } + } + } + return c0; } u32char CUtfStringView::utf16Iterator(const void **addr) { - /* TODO: Decode UTF-16 */ - const ucchar **pp = reinterpret_cast(addr); - ucchar c = **pp; + // Decode UTF-16 + // This implementation makes no attempt at fixing bad encoding + const uint16 **pp = reinterpret_cast(addr); + u32char c0 = **pp; ++(*pp); - return c; + if ((c0 & 0xFC00) == 0xD800) // Higher bits + { + uint16 c1 = **pp; + if ((c1 & 0xFC00) == 0xDC00) // Lower bits + { + ++(*pp); + c0 &= 0x03FF; + c0 <<= 10; + c0 |= (c1 & 0x03FF); + c0 += 0x10000; + } + } + return c0; } u32char CUtfStringView::utf32Iterator(const void **addr) { + // UTF-32 + // This implementation makes no attempt at fixing bad encoding const u32char **pp = reinterpret_cast(addr); u32char c = **pp; ++(*pp);