Proper UTF encoding, ryzom/ryzomcore#335

4 years ago · 81143cdf43
parent 973531f461
commit 81143cdf43
4 changed files with 157 additions and 151 deletions
--- a/nel/include/nel/misc/ucstring.h
+++ b/nel/include/nel/misc/ucstring.h
@ -46,6 +46,11 @@ public:
 	{
 	}

+	ucstring(const ucchar *begin, const ucchar *end) // Forwards a pointer pair of ucchar to ucstringbase; presumably a half-open [begin, end) range as with std::basic_string - confirm against ucstringbase
+		: ucstringbase(begin, end)
+	{
+	}
+
 	ucstring(const std::string &str)
 	    : ucstringbase()
 	{
--- a/nel/include/nel/misc/utf_string_view.h
+++ b/nel/include/nel/misc/utf_string_view.h
@ -19,7 +19,6 @@

 #include <nel/misc/types_nl.h>
 #include <nel/misc/ucstring.h>
-#include <nel/misc/utf32_string.h>
 #include <string>

 namespace NLMISC {
@ -27,7 +26,7 @@ namespace NLMISC {
 /// String view for UTF-8 and UTF-32 iteration as 32-bit codepoints.
 /// This string view keeps the string as a reference, it does not make a copy.
 /// Only use this for iterating a string's codepoints.
-/// Strings are not necessarily NUL-terminated.
+/// Strings are not required to be NUL-terminated, but at least one extra character must be readable past the end (the decoders look ahead without a bounds check).
 class CUtfStringView
 {
 public:
@ -41,12 +40,13 @@ public:
 	inline CUtfStringView(const ucstring &utf16Str) : m_Str(utf16Str.c_str()), m_Size(utf16Str.size() << 1), m_Iterator(utf16Iterator) {}
 	inline CUtfStringView(const u32string &utf32Str) : m_Str(utf32Str.c_str()), m_Size(utf32Str.size() << 2), m_Iterator(utf32Iterator) {}

-	std::string toUtf8(); // Makes a copy
-	u32string toUtf32(); // Makes a copy
+	std::string toUtf8(bool reEncode = false) const; // Makes a copy
+	ucstring toUtf16(bool reEncode = false) const; // Makes a copy
+	u32string toUtf32() const; // Makes a copy

-	inline bool isUtf8() { return m_Iterator == utf8Iterator; }
-	inline bool isUtf16() { return m_Iterator == utf16Iterator; }
-	inline bool isUtf32() { return m_Iterator == utf32Iterator; }
+	inline bool isUtf8() const { return m_Iterator == utf8Iterator; }
+	inline bool isUtf16() const { return m_Iterator == utf16Iterator; }
+	inline bool isUtf32() const { return m_Iterator == utf32Iterator; }

 	struct const_iterator
 	{
@ -82,8 +82,8 @@ private:
 	static u32char utf16Iterator(const void **addr);
 	static u32char utf32Iterator(const void **addr);

-	const void *const m_Str;
-	const size_t m_Size;
+	const void *const m_Str; // String
+	const size_t m_Size; // Size in bytes
 	const TIterator m_Iterator;
 	
 }; /* class CUtfStringView */
--- a/nel/src/misc/ucstring.cpp
+++ b/nel/src/misc/ucstring.cpp
@ -16,6 +16,7 @@

 #include "stdmisc.h"
 #include "nel/misc/ucstring.h"
+#include "nel/misc/utf_string_view.h"

 void ucstring::toString(std::string &str) const
 {
@ -31,138 +32,12 @@ void ucstring::toString(std::string &str) const

 std::string ucstring::toUtf8() const
 {
-#if defined(NL_OS_WINDOWS)
-	// Use OS implementation
-	nlctassert(sizeof(wchar_t) == sizeof(ucchar));
-	nlctassert(sizeof(wchar_t) == sizeof(uint16));
-	return NLMISC::wideToUtf8(static_cast<const std::wstring &>(*this));
-#else
-	std::string res;
-	ucstring::const_iterator first(begin()), last(end());
-	for (; first != last; ++first)
-	{
-		//ucchar	c = *first;
-		uint nbLoop = 0;
-		if (*first < 0x80)
-			res += char(*first);
-		else if (*first < 0x800)
-		{
-			ucchar c = *first;
-			c = c >> 6;
-			c = c & 0x1F;
-			res += char(c) | 0xC0;
-			nbLoop = 1;
-		}
-		else /*if (*first < 0x10000)*/
-		{
-			ucchar c = *first;
-			c = c >> 12;
-			c = c & 0x0F;
-			res += char(c) | 0xE0;
-			nbLoop = 2;
-		}
-
-		for (uint i = 0; i < nbLoop; ++i)
-		{
-			ucchar c = *first;
-			c = c >> ((nbLoop - i - 1) * 6);
-			c = c & 0x3F;
-			res += char(c) | 0x80;
-		}
-	}
-	return res;
-#endif
+	return NLMISC::CUtfStringView(*this).toUtf8(); // View this string as UTF-16, decode it to codepoints and encode those as UTF-8 (a valid surrogate pair becomes one 4-byte sequence)
 }

 void ucstring::fromUtf8(const std::string &stringUtf8)
 {
-#if defined(NL_OS_WINDOWS)
-	// Use OS implementation
-	nlctassert(sizeof(wchar_t) == sizeof(ucchar));
-	nlctassert(sizeof(wchar_t) == sizeof(uint16));
-	nlctassert(sizeof(std::wstring) == sizeof(ucstring)); // These can be swapped on Windows
-	static_cast<std::wstring &>(*this) = nlmove(NLMISC::utf8ToWide(stringUtf8));
-	if (stringUtf8.size() && !size())
-		rawCopy(stringUtf8);
-#else
-	// clear the string
-	erase();
-
-	uint8 c;
-	ucchar code;
-	sint iterations = 0;
-
-	std::string::const_iterator first(stringUtf8.begin()), last(stringUtf8.end());
-	for (; first != last;)
-	{
-		c = *first++;
-		code = c;
-
-		if ((code & 0xFE) == 0xFC)
-		{
-			code &= 0x01;
-			iterations = 5;
-		}
-		else if ((code & 0xFC) == 0xF8)
-		{
-			code &= 0x03;
-			iterations = 4;
-		}
-		else if ((code & 0xF8) == 0xF0)
-		{
-			code &= 0x07;
-			iterations = 3;
-		}
-		else if ((code & 0xF0) == 0xE0)
-		{
-			code &= 0x0F;
-			iterations = 2;
-		}
-		else if ((code & 0xE0) == 0xC0)
-		{
-			code &= 0x1F;
-			iterations = 1;
-		}
-		else if ((code & 0x80) == 0x80)
-		{
-			// If it's not a valid UTF8 string, just copy the line without utf8 conversion
-			rawCopy(stringUtf8);
-			return;
-		}
-		else
-		{
-			push_back(code);
-			iterations = 0;
-		}
-
-		if (iterations)
-		{
-			for (sint i = 0; i < iterations; i++)
-			{
-				if (first == last)
-				{
-					// If it's not a valid UTF8 string, just copy the line without utf8 conversion
-					rawCopy(stringUtf8);
-					return;
-				}
-
-				uint8 ch;
-				ch = *first++;
-
-				if ((ch & 0xC0) != 0x80)
-				{
-					// If it's not a valid UTF8 string, just copy the line without utf8 conversion
-					rawCopy(stringUtf8);
-					return;
-				}
-
-				code <<= 6;
-				code |= (ucchar)(ch & 0x3F);
-			}
-			push_back(code);
-		}
-	}
-#endif
+	*this = NLMISC::CUtfStringView(stringUtf8).toUtf16(); // Decode the UTF-8 bytes to codepoints and store them as UTF-16; malformed bytes are passed through by the decoder, rawCopy is no longer used as a fallback here
 }

 void ucstring::rawCopy(const std::string &str)
--- a/nel/src/misc/utf_string_view.cpp
+++ b/nel/src/misc/utf_string_view.cpp
@ -19,52 +19,178 @@
 // Project includes
 #include <nel/misc/utf_string_view.h>

+// References:
+// - https://twiserandom.com/unicode/unicode-encoding-utf-8-utf-16-utf-32/
+// - https://www.compart.com/en/unicode/U+1F30D
+//   - 0xF0 0x9F 0x8C 0x8D
+//   - 0xD83C 0xDF0D
+//   - 0x0001F30D
+
 namespace NLMISC
 {

-std::string CUtfStringView::toUtf8()
+std::string CUtfStringView::toUtf8(bool reEncode) const
 {
-	if (m_Iterator == utf8Iterator)
+	// Decode the viewed string codepoint by codepoint and encode the result as UTF-8
+	// This implementation makes no attempt at fixing invalid codepoints (lone surrogates are written as 3 bytes, values above 0x1FFFFF lose their upper bits in the 4-byte branch)
+	if (m_Iterator == utf8Iterator && !reEncode) // Already UTF-8 and no re-encode requested: copy the m_Size bytes verbatim
 		return std::string((const char *)m_Str, (const char *)((ptrdiff_t)m_Str + m_Size));
 	std::string res;
-	res.reserve((m_Size << 1) + 1);
+	res.reserve(m_Size); // m_Size is a byte count; only a starting estimate, the output can be longer (a 2-byte UTF-16 unit may need 3 bytes)
+	for (iterator it(begin()), end(end()); it != end; ++it)
+	{
+		u32char c = *it;
+		if (c < 0x80)
+		{
+			// Encode as 1 byte
+			res += (char)c;
+		}
+		else if (c < 0x0800)
+		{
+			// Encode as 2 bytes
+			res += (char)((c & 0x07C0) >> 6) | 0xC0; // The cast applies before the |, so the OR happens in int and += narrows the result back to char
+			res += (char)(c & 0x3F) | 0x80;
+		}
+		else if (c < 0x010000)
+		{
+			// Encode as 3 bytes
+			res += (char)((c & 0xF000) >> 12) | 0xE0;
+			res += (char)((c & 0x0FC0) >> 6) | 0x80;
+			res += (char)(c & 0x3F) | 0x80;
+		}
+		else
+		{
+			// Encode as 4 bytes
+			res += (char)((c & 0x1C0000) >> 18) | 0xF0;
+			res += (char)((c & 0x03F000) >> 12) | 0x80;
+			res += (char)((c & 0x0FC0) >> 6) | 0x80;
+			res += (char)(c & 0x3F) | 0x80;
+		}
+	}
+	return res;
+}
+
+ucstring CUtfStringView::toUtf16(bool reEncode) const
+{
+	if (m_Iterator == utf16Iterator && !reEncode) // Already UTF-16 and no re-encode requested: copy the code units verbatim (m_Size is in bytes, hence the byte-wise end pointer)
+		return ucstring((const ucchar *)m_Str, (const ucchar *)((ptrdiff_t)m_Str + m_Size));
+	ucstring res;
+	res.reserve(m_Size << 1); // NOTE(review): m_Size is a byte count, so this reserves two code units per input byte, well above what the loop can produce
 	for (iterator it(begin()), end(end()); it != end; ++it)
 	{
 		u32char c = *it;
-		res += (char)c; /* TODO: Encode UTF-8 */
+		if (c < 0x10000) // Fits in a single UTF-16 code unit; surrogate values are passed through unchanged
+		{
+			res += c;
+		}
+		else
+		{
+			c -= 0x10000; // Encode as a surrogate pair: the remaining offset is split into two 10-bit halves (no check that c is within 0x10FFFF)
+			res += (c >> 10) | 0xD800; // High surrogate carries the upper bits
+			res += (c & 0x3FF) | 0xDC00; // Low surrogate carries the lower 10 bits
+		}
 	}
+	return res;
 }

-u32string CUtfStringView::toUtf32()
+u32string CUtfStringView::toUtf32() const
 {
+	// Decode any UTF into a string of 32-bit codepoints (a UTF-32 view is copied verbatim)
+	// This implementation makes no attempt at fixing bad encoding
 	if (m_Iterator == utf32Iterator)
 		return u32string((const u32char *)m_Str, (const u32char *)((ptrdiff_t)m_Str + m_Size));
 	u32string res;
-	res.reserve(m_Size + 1);
+	res.reserve(m_Size << 2); // NOTE(review): m_Size is a byte count, so this reserves four codepoints per input byte; the decoders yield at most one codepoint per byte consumed
 	for (iterator it(begin()), end(end()); it != end; ++it)
 		res += *it;
+	return res;
 }

 u32char CUtfStringView::utf8Iterator(const void **addr)
 {
-	/* TODO: Decode UTF-8 */
-	const ucchar **pp = reinterpret_cast<const ucchar **>(addr);
-	ucchar c = **pp;
+	// Decode one codepoint from UTF-8 at *addr and advance *addr past the bytes consumed
+	// This implementation makes no attempt at fixing bad encoding, except for bad UTF-16 surrogate pairs
+	const uint8 **pp = reinterpret_cast<const uint8 **>(addr);
+	u32char c0 = **pp;
 	++(*pp);
-	return c;
+	if ((c0 & 0xC0) == 0xC0) // Lead byte of a multi-byte sequence (11xxxxxx); any other byte is returned as-is
+	{
+		uint8 cx = **pp; // Looks at the next byte without a bounds check, the caller must keep it readable
+		if ((cx & 0xC0) == 0x80)
+		{
+			++(*pp);
+			c0 &= 0x3F; // Drop first two bits; bit 5 stays set for 3- and 4-byte leads and acts as the "longer sequence" marker below
+			c0 <<= 6;
+			c0 |= (cx & 0x3F); // 12 bits now (6 + 6), 2-byte encoding
+			if (c0 & 0x800) // Marker bit from a 111xxxxx lead: a third byte is expected
+			{
+				cx = **pp;
+				if ((cx & 0xC0) == 0x80)
+				{
+					++(*pp);
+					c0 &= 0x07FF; // Drop first bit (the marker at bit 11)
+					c0 <<= 6;
+					c0 |= (cx & 0x3F); // 17 bits now (12 - 1 + 6), 3-byte encoding
+					if (c0 & 0x10000) // Marker bit from a 1111xxxx lead: a fourth byte is expected
+					{
+						cx = **pp;
+						if ((cx & 0xC0) == 0x80)
+						{
+							++(*pp);
+							c0 &= 0xFFFF; // Drop first bit (the marker at bit 16)
+							c0 <<= 6;
+							c0 |= (cx & 0x3F); // 22 bits now (17 - 1 + 6), 4-byte encoding
+						}
+					}
+					else if ((c0 & 0xFC00) == 0xD800) // High surrogate of UTF-16 that was itself encoded as a 3-byte UTF-8 sequence
+					{
+						uint8 cy;
+						if ((*pp)[0] == 0xED && ((cx = (*pp)[1]) & 0xF0) == 0xB0 && ((cy = (*pp)[2]) & 0xC0) == 0x80) // Look ahead for 0xED 0xBx 0x80..0xBF, i.e. a low surrogate (0xDC00..0xDFFF) encoded the same way
+						{
+							// Low surrogate of UTF-16 encoded as UTF-8: consume it and combine both halves into one codepoint
+							(*pp) += 3;
+							uint16 c1 = (cx & 0x0F);
+							c1 <<= 6;
+							c1 |= (cy & 0x3F);
+							c0 &= 0x03FF;
+							c0 <<= 10;
+							c0 |= (c1 & 0x03FF);
+							c0 += 0x10000;
+						}
+					}
+				}
+			}
+		}
+	}
+	return c0;
 }

 u32char CUtfStringView::utf16Iterator(const void **addr)
 {
-	/* TODO: Decode UTF-16 */
-	const ucchar **pp = reinterpret_cast<const ucchar **>(addr);
-	ucchar c = **pp;
+	// Decode one codepoint from UTF-16 at *addr and advance *addr past the code units consumed
+	// This implementation makes no attempt at fixing bad encoding
+	const uint16 **pp = reinterpret_cast<const uint16 **>(addr);
+	u32char c0 = **pp;
 	++(*pp);
-	return c;
+	if ((c0 & 0xFC00) == 0xD800) // High surrogate (0xD800..0xDBFF), carries the higher bits
+	{
+		uint16 c1 = **pp; // Looks at the next code unit without a bounds check, the caller must keep it readable
+		if ((c1 & 0xFC00) == 0xDC00) // Low surrogate (0xDC00..0xDFFF), carries the lower bits; otherwise the lone high surrogate is returned unchanged
+		{
+			++(*pp);
+			c0 &= 0x03FF;
+			c0 <<= 10;
+			c0 |= (c1 & 0x03FF);
+			c0 += 0x10000; // Surrogate pairs encode codepoints starting at 0x10000
+		}
+	}
+	return c0;
 }

 u32char CUtfStringView::utf32Iterator(const void **addr)
 {
+	// UTF-32
+	// This implementation makes no attempt at fixing bad encoding
 	const u32char **pp = reinterpret_cast<const u32char **>(addr);
 	u32char c = **pp;
 	++(*pp);