Stricter UTF-8

develop
kaetemi 4 years ago
parent 182a917338
commit 0bd43913e1

@ -49,7 +49,7 @@ NL_FORCE_INLINE void appendUtf8(std::string &str, u32char c)
str += (char)((c & 0x0FC0) >> 6) | 0x80; str += (char)((c & 0x0FC0) >> 6) | 0x80;
str += (char)(c & 0x3F) | 0x80; str += (char)(c & 0x3F) | 0x80;
} }
else else if (c < 0x110000)
{ {
// Encode as 4 bytes // Encode as 4 bytes
str += (char)((c & 0x1C0000) >> 18) | 0xF0; str += (char)((c & 0x1C0000) >> 18) | 0xF0;
@ -57,6 +57,11 @@ NL_FORCE_INLINE void appendUtf8(std::string &str, u32char c)
str += (char)((c & 0x0FC0) >> 6) | 0x80; str += (char)((c & 0x0FC0) >> 6) | 0x80;
str += (char)(c & 0x3F) | 0x80; str += (char)(c & 0x3F) | 0x80;
} }
else
{
// Replacement character (U+FFFD)
str += "\xEF\xBF\xBD";
}
} }
void CUtfStringView::append(std::string &str, u32char c) void CUtfStringView::append(std::string &str, u32char c)
@ -175,6 +180,7 @@ u32char CUtfStringView::utf8Iterator(const void **addr)
{ {
// Decode UTF-8 // Decode UTF-8
// This implementation makes no attempt at fixing bad encoding, except for bad UTF-16 surrogate pairs // This implementation makes no attempt at fixing bad encoding, except for bad UTF-16 surrogate pairs
// Invalid characters are replaced with the replacement character
const uint8 **pp = reinterpret_cast<const uint8 **>(addr); const uint8 **pp = reinterpret_cast<const uint8 **>(addr);
u32char c0 = **pp; u32char c0 = **pp;
++(*pp); ++(*pp);
@ -209,7 +215,12 @@ u32char CUtfStringView::utf8Iterator(const void **addr)
++(*pp); ++(*pp);
c0 &= 0xFFFF; // Drop first bit c0 &= 0xFFFF; // Drop first bit
c0 <<= 6; c0 <<= 6;
c0 |= (cx & 0x3F); // 22 bits now (17 - 1 + 6), 3-byte encoding c0 |= (cx & 0x3F); // 22 bits now (17 - 1 + 6), 4-byte encoding
if (c0 >= 0x110000)
{
// Replacement character (U+FFFD)
return 0xFFFD;
}
} }
else else
{ {

Loading…
Cancel
Save