Stricter UTF-8

develop
kaetemi 4 years ago
parent 182a917338
commit 0bd43913e1

@ -49,7 +49,7 @@ NL_FORCE_INLINE void appendUtf8(std::string &str, u32char c)
str += (char)((c & 0x0FC0) >> 6) | 0x80;
str += (char)(c & 0x3F) | 0x80;
}
else
else if (c < 0x110000)
{
// Encode as 4 bytes
str += (char)((c & 0x1C0000) >> 18) | 0xF0;
@ -57,6 +57,11 @@ NL_FORCE_INLINE void appendUtf8(std::string &str, u32char c)
str += (char)((c & 0x0FC0) >> 6) | 0x80;
str += (char)(c & 0x3F) | 0x80;
}
else
{
// Replacement character (U+FFFD)
str += "\xEF\xB\xBD";
}
}
void CUtfStringView::append(std::string &str, u32char c)
@ -175,6 +180,7 @@ u32char CUtfStringView::utf8Iterator(const void **addr)
{
// Decode UTF-8
// This implementation makes no attempt at fixing bad encoding, except for bad UTF-16 surrogate pairs
// Invalid characters are replaced with the replacement character
const uint8 **pp = reinterpret_cast<const uint8 **>(addr);
u32char c0 = **pp;
++(*pp);
@ -209,7 +215,12 @@ u32char CUtfStringView::utf8Iterator(const void **addr)
++(*pp);
c0 &= 0xFFFF; // Drop first bit
c0 <<= 6;
c0 |= (cx & 0x3F); // 22 bits now (17 - 1 + 6), 3-byte encoding
c0 |= (cx & 0x3F); // 22 bits now (17 - 1 + 6), 4-byte encoding
if (c0 >= 0x110000)
{
// Replacement character (U+FFFD)
return 0xFFFD;
}
}
else
{

Loading…
Cancel
Save