|
|
|
@ -19,52 +19,178 @@
|
|
|
|
|
// Project includes
|
|
|
|
|
#include <nel/misc/utf_string_view.h>
|
|
|
|
|
|
|
|
|
|
// References:
|
|
|
|
|
// - https://twiserandom.com/unicode/unicode-encoding-utf-8-utf-16-utf-32/
|
|
|
|
|
// - https://www.compart.com/en/unicode/U+1F30D
|
|
|
|
|
// - UTF-8: 0xF0 0x9F 0x8C 0x8D
|
|
|
|
|
// - UTF-16: 0xD83C 0xDF0D
|
|
|
|
|
// - UTF-32: 0x0001F30D
|
|
|
|
|
|
|
|
|
|
namespace NLMISC
|
|
|
|
|
{
|
|
|
|
|
|
|
|
|
|
std::string CUtfStringView::toUtf8()
|
|
|
|
|
std::string CUtfStringView::toUtf8(bool reEncode) const
|
|
|
|
|
{
|
|
|
|
|
if (m_Iterator == utf8Iterator)
|
|
|
|
|
// Decode UTF and encode UTF-8
|
|
|
|
|
// This implementation makes no attempt at fixing invalid codepoints
|
|
|
|
|
if (m_Iterator == utf8Iterator && !reEncode)
|
|
|
|
|
return std::string((const char *)m_Str, (const char *)((ptrdiff_t)m_Str + m_Size));
|
|
|
|
|
std::string res;
|
|
|
|
|
res.reserve((m_Size << 1) + 1);
|
|
|
|
|
res.reserve(m_Size);
|
|
|
|
|
for (iterator it(begin()), end(end()); it != end; ++it)
|
|
|
|
|
{
|
|
|
|
|
u32char c = *it;
|
|
|
|
|
if (c < 0x80)
|
|
|
|
|
{
|
|
|
|
|
// Encode as 1 byte
|
|
|
|
|
res += (char)c;
|
|
|
|
|
}
|
|
|
|
|
else if (c < 0x0800)
|
|
|
|
|
{
|
|
|
|
|
// Encode as 2 bytes
|
|
|
|
|
res += (char)((c & 0x07C0) >> 6) | 0xC0;
|
|
|
|
|
res += (char)(c & 0x3F) | 0x80;
|
|
|
|
|
}
|
|
|
|
|
else if (c < 0x010000)
|
|
|
|
|
{
|
|
|
|
|
// Encode as 3 bytes
|
|
|
|
|
res += (char)((c & 0xF000) >> 12) | 0xE0;
|
|
|
|
|
res += (char)((c & 0x0FC0) >> 6) | 0x80;
|
|
|
|
|
res += (char)(c & 0x3F) | 0x80;
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
// Encode as 4 bytes
|
|
|
|
|
res += (char)((c & 0x1C0000) >> 18) | 0xF0;
|
|
|
|
|
res += (char)((c & 0x03F000) >> 12) | 0x80;
|
|
|
|
|
res += (char)((c & 0x0FC0) >> 6) | 0x80;
|
|
|
|
|
res += (char)(c & 0x3F) | 0x80;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return res;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Converts the viewed string to UTF-16.
/// \param reEncode When false and the view already iterates UTF-16, the source
///        units are copied verbatim. Otherwise every code point is read
///        through the view's iterator and encoded again.
/// \return The UTF-16 encoded string.
ucstring CUtfStringView::toUtf16(bool reEncode) const
{
	if (m_Iterator == utf16Iterator && !reEncode)
		return ucstring((const ucchar *)m_Str, (const ucchar *)((ptrdiff_t)m_Str + m_Size));

	ucstring res;
	res.reserve(m_Size << 1); // Size hint only

	// The end iterator is deliberately not named `end`: such a local would
	// hide the member function inside its own initializer.
	for (iterator it(begin()), last(end()); it != last; ++it)
	{
		const u32char c = *it;
		if (c < 0x10000)
		{
			// Fits in a single 16-bit unit
			res += static_cast<ucchar>(c);
		}
		else
		{
			// Split into a surrogate pair: 10 high bits, then 10 low bits
			const u32char v = c - 0x10000;
			res += static_cast<ucchar>((v >> 10) | 0xD800);
			res += static_cast<ucchar>((v & 0x3FF) | 0xDC00);
		}
	}
	return res;
}
|
|
|
|
|
|
|
|
|
|
/// Converts the viewed string to UTF-32.
/// When the view already iterates UTF-32 the source is copied verbatim;
/// otherwise every code point is read through the view's iterator.
/// \return The string as a sequence of 32-bit code points.
u32string CUtfStringView::toUtf32() const
{
	// Decode any UTF
	// This implementation makes no attempt at fixing bad encoding
	if (m_Iterator == utf32Iterator)
		return u32string((const u32char *)m_Str, (const u32char *)((ptrdiff_t)m_Str + m_Size));

	u32string res;
	// reserve() counts elements, not bytes. m_Size is the source length in
	// bytes, and the UTF-8 and UTF-16 iterators consume at least one byte per
	// code point, so m_Size is already an upper bound for those sources.
	res.reserve(m_Size);

	// The end iterator is deliberately not named `end`: such a local would
	// hide the member function inside its own initializer.
	for (iterator it(begin()), last(end()); it != last; ++it)
		res += *it;
	return res;
}
|
|
|
|
|
|
|
|
|
|
/// Reads one code point from UTF-8 data and advances the read position.
/// \param addr Address of the read pointer. On return it points past the
///        bytes that were consumed (always at least one).
/// \return The decoded code point. When an expected continuation byte is
///         missing, decoding stops and the partially assembled value is
///         returned as-is.
/// NOTE(review): there is no end-of-buffer argument, so follow-up bytes are
/// read without a bounds check; presumably the caller guarantees a terminator
/// or enough trailing data - confirm against the iterator class.
u32char CUtfStringView::utf8Iterator(const void **addr)
{
	// Decode UTF-8
	// This implementation makes no attempt at fixing bad encoding, except for bad UTF-16 surrogate pairs
	const uint8 **pp = reinterpret_cast<const uint8 **>(addr);
	u32char c0 = **pp;
	++(*pp);
	if ((c0 & 0xC0) == 0xC0) // Lead byte of a multi-byte sequence
	{
		uint8 cx = **pp;
		if ((cx & 0xC0) == 0x80)
		{
			++(*pp);
			c0 &= 0x3F; // Drop first two bits
			c0 <<= 6;
			c0 |= (cx & 0x3F); // 12 bits now (6 + 6), 2-byte encoding
			if (c0 & 0x800) // Third lead bit was set: at least 3 bytes
			{
				cx = **pp;
				if ((cx & 0xC0) == 0x80)
				{
					++(*pp);
					c0 &= 0x07FF; // Drop first bit
					c0 <<= 6;
					c0 |= (cx & 0x3F); // 17 bits now (12 - 1 + 6), 3-byte encoding
					if (c0 & 0x10000) // Fourth lead bit was set: 4 bytes
					{
						cx = **pp;
						if ((cx & 0xC0) == 0x80)
						{
							++(*pp);
							c0 &= 0xFFFF; // Drop first bit
							c0 <<= 6;
							c0 |= (cx & 0x3F); // 22 bits now (17 - 1 + 6), 4-byte encoding
						}
					}
					else if ((c0 & 0xFC00) == 0xD800) // Higher bits of nutcase UTF-16 encoded as UTF-8
					{
						uint8 cy;
						// Short-circuit evaluation: each further byte is only
						// read once the previous one matched.
						if ((*pp)[0] == 0xED && ((cx = (*pp)[1]) & 0xF0) == 0xB0 && ((cy = (*pp)[2]) & 0xC0) == 0x80)
						{
							// Lower bits of nutcase UTF-16 encoded as UTF-8
							(*pp) += 3;
							uint16 c1 = (cx & 0x0F);
							c1 <<= 6;
							c1 |= (cy & 0x3F);
							// Combine both halves exactly like a UTF-16 surrogate pair
							c0 &= 0x03FF;
							c0 <<= 10;
							c0 |= (c1 & 0x03FF);
							c0 += 0x10000;
						}
					}
				}
			}
		}
	}
	return c0;
}
|
|
|
|
|
|
|
|
|
|
/// Reads one code point from UTF-16 data and advances the read position.
/// \param addr Address of the read pointer. On return it points past the one
///        or two 16-bit units that were consumed.
/// \return The decoded code point. A high surrogate that is not followed by a
///         low surrogate is returned unchanged.
/// NOTE(review): there is no end-of-buffer argument, so the unit after a high
/// surrogate is read without a bounds check - confirm the caller guarantees it.
u32char CUtfStringView::utf16Iterator(const void **addr)
{
	// Decode UTF-16
	// This implementation makes no attempt at fixing bad encoding
	const uint16 **pp = reinterpret_cast<const uint16 **>(addr);
	u32char c0 = **pp;
	++(*pp);
	if ((c0 & 0xFC00) == 0xD800) // Higher bits
	{
		uint16 c1 = **pp;
		if ((c1 & 0xFC00) == 0xDC00) // Lower bits
		{
			++(*pp);
			// 10 bits from each surrogate, offset into the supplementary planes
			c0 &= 0x03FF;
			c0 <<= 10;
			c0 |= (c1 & 0x03FF);
			c0 += 0x10000;
		}
	}
	return c0;
}
|
|
|
|
|
|
|
|
|
|
u32char CUtfStringView::utf32Iterator(const void **addr)
|
|
|
|
|
{
|
|
|
|
|
// UTF-32
|
|
|
|
|
// This implementation makes no attempt at fixing bad encoding
|
|
|
|
|
const u32char **pp = reinterpret_cast<const u32char **>(addr);
|
|
|
|
|
u32char c = **pp;
|
|
|
|
|
++(*pp);
|
|
|
|
|