develop
kaetemi 4 years ago
parent 973531f461
commit 81143cdf43

@ -46,6 +46,11 @@ public:
{
}
/// Construct from a pair of ucchar pointers.
/// Both pointers are forwarded unchanged to the matching ucstringbase constructor.
ucstring(const ucchar *begin, const ucchar *end) : ucstringbase(begin, end) { }
ucstring(const std::string &str)
: ucstringbase()
{

@ -19,7 +19,6 @@
#include <nel/misc/types_nl.h>
#include <nel/misc/ucstring.h>
#include <nel/misc/utf32_string.h>
#include <string>
namespace NLMISC {
@ -27,7 +26,7 @@ namespace NLMISC {
/// String view for UTF-8 and UTF-32 iteration as 32-bit codepoints.
/// This string view keeps the string as a reference, it does not make a copy.
/// Only use this for iterating a string's codepoints.
/// Strings are not required to be NUL-terminated, but the underlying buffer must remain readable for at least one extra character past the end of the string.
class CUtfStringView
{
public:
@ -41,12 +40,13 @@ public:
/// View over a UTF-16 string. The size is stored in bytes (number of code units << 1).
/// The view keeps the pointer returned by c_str(); it does not copy the string.
inline CUtfStringView(const ucstring &utf16Str)
	: m_Str(utf16Str.c_str())
	, m_Size(utf16Str.size() << 1)
	, m_Iterator(utf16Iterator)
{
}

/// View over a UTF-32 string. The size is stored in bytes (number of code units << 2).
/// The view keeps the pointer returned by c_str(); it does not copy the string.
inline CUtfStringView(const u32string &utf32Str)
	: m_Str(utf32Str.c_str())
	, m_Size(utf32Str.size() << 2)
	, m_Iterator(utf32Iterator)
{
}
/// Returns the viewed string encoded as UTF-8. Always makes a copy.
/// When the view is already UTF-8 the bytes are copied verbatim, unless
/// reEncode is true, in which case the string is decoded and encoded again.
std::string toUtf8(bool reEncode = false) const; // Makes a copy

/// Returns the viewed string encoded as UTF-16. Always makes a copy.
/// When the view is already UTF-16 the code units are copied verbatim, unless
/// reEncode is true, in which case the string is decoded and encoded again.
ucstring toUtf16(bool reEncode = false) const; // Makes a copy

/// Returns the viewed string as UTF-32 codepoints. Always makes a copy.
u32string toUtf32() const; // Makes a copy
/// True when this view decodes its buffer with the UTF-8 iterator.
inline bool isUtf8() const { return m_Iterator == utf8Iterator; }
/// True when this view decodes its buffer with the UTF-16 iterator.
inline bool isUtf16() const { return m_Iterator == utf16Iterator; }
/// True when this view decodes its buffer with the UTF-32 iterator.
inline bool isUtf32() const { return m_Iterator == utf32Iterator; }
struct const_iterator
{
@ -82,8 +82,8 @@ private:
/// Decodes one codepoint from the UTF-16 data at *addr and advances *addr past it.
static u32char utf16Iterator(const void **addr);
/// Reads one codepoint from the UTF-32 data at *addr and advances *addr past it.
static u32char utf32Iterator(const void **addr);

const void *const m_Str; // String (referenced, not owned)
const size_t m_Size; // Size in bytes
const TIterator m_Iterator; // Decoder matching the encoding of m_Str, chosen by the constructor
}; /* class CUtfStringView */

@ -16,6 +16,7 @@
#include "stdmisc.h"
#include "nel/misc/ucstring.h"
#include "nel/misc/utf_string_view.h"
void ucstring::toString(std::string &str) const
{
@ -31,138 +32,12 @@ void ucstring::toString(std::string &str) const
/// Returns this UTF-16 string encoded as UTF-8.
///
/// The conversion is delegated to CUtfStringView so that every platform uses
/// the same codepoint-based encoder. The view decodes surrogate pairs into a
/// single codepoint before encoding, instead of emitting each 16-bit unit as
/// its own UTF-8 sequence.
std::string ucstring::toUtf8() const
{
	return NLMISC::CUtfStringView(*this).toUtf8();
}
/// Replaces the contents of this string with the UTF-16 encoding of stringUtf8.
///
/// Decoding is delegated to CUtfStringView, which produces surrogate pairs for
/// codepoints above 0xFFFF rather than truncating them to 16 bits.
///
/// NOTE(review): malformed UTF-8 is passed through the view's decoder as-is;
/// there is no fallback to rawCopy() here. Confirm that no caller relies on a
/// raw byte copy for invalid input.
void ucstring::fromUtf8(const std::string &stringUtf8)
{
	*this = NLMISC::CUtfStringView(stringUtf8).toUtf16();
}
void ucstring::rawCopy(const std::string &str)

@ -19,52 +19,178 @@
// Project includes
#include <nel/misc/utf_string_view.h>
// References:
// - https://twiserandom.com/unicode/unicode-encoding-utf-8-utf-16-utf-32/
// - https://www.compart.com/en/unicode/U+1F30D
// - 0xF0 0x9F 0x8C 0x8D
// - 0xD83C 0xDF0D
// - 0x0001F30D
namespace NLMISC
{
std::string CUtfStringView::toUtf8()
std::string CUtfStringView::toUtf8(bool reEncode) const
{
if (m_Iterator == utf8Iterator)
// Decode UTF and encode UTF-8
// This implementation makes no attempt at fixing invalid codepoints
if (m_Iterator == utf8Iterator && !reEncode)
return std::string((const char *)m_Str, (const char *)((ptrdiff_t)m_Str + m_Size));
std::string res;
res.reserve((m_Size << 1) + 1);
res.reserve(m_Size);
for (iterator it(begin()), end(end()); it != end; ++it)
{
u32char c = *it;
if (c < 0x80)
{
// Encode as 1 byte
res += (char)c;
}
else if (c < 0x0800)
{
// Encode as 2 bytes
res += (char)((c & 0x07C0) >> 6) | 0xC0;
res += (char)(c & 0x3F) | 0x80;
}
else if (c < 0x010000)
{
// Encode as 3 bytes
res += (char)((c & 0xF000) >> 12) | 0xE0;
res += (char)((c & 0x0FC0) >> 6) | 0x80;
res += (char)(c & 0x3F) | 0x80;
}
else
{
// Encode as 4 bytes
res += (char)((c & 0x1C0000) >> 18) | 0xF0;
res += (char)((c & 0x03F000) >> 12) | 0x80;
res += (char)((c & 0x0FC0) >> 6) | 0x80;
res += (char)(c & 0x3F) | 0x80;
}
}
return res;
}
/// Returns the viewed string encoded as UTF-16.
///
/// \param reEncode When false and the view is already UTF-16, the code units
///                 are copied verbatim. When true, the string is always
///                 decoded to codepoints and encoded again.
/// \return A new ucstring; the view itself is not modified.
///
/// Codepoints below 0x10000 are stored as a single unit; all others are
/// written as a high/low surrogate pair. No validation is performed.
ucstring CUtfStringView::toUtf16(bool reEncode) const
{
	if (m_Iterator == utf16Iterator && !reEncode)
	{
		// Already UTF-16: plain copy, m_Size is in bytes so halve it for units
		const ucchar *first = static_cast<const ucchar *>(m_Str);
		return ucstring(first, first + (m_Size >> 1));
	}

	ucstring res;
	// Upper bound: UTF-8 yields at most one unit per byte, UTF-32 at most two units per four bytes
	res.reserve(m_Size);
	// this->end() is spelled out on purpose: a local named `end` initialised
	// with `end()` would refer to itself inside its own initializer.
	for (iterator it(begin()), last(this->end()); it != last; ++it)
	{
		u32char c = *it;
		if (c < 0x10000)
		{
			res += static_cast<ucchar>(c);
		}
		else
		{
			c -= 0x10000;
			res += static_cast<ucchar>((c >> 10) | 0xD800); // High surrogate
			res += static_cast<ucchar>((c & 0x3FF) | 0xDC00); // Low surrogate
		}
	}
	return res;
}
/// Returns the viewed string as a sequence of UTF-32 codepoints.
///
/// When the view is already UTF-32 the codepoints are copied verbatim;
/// otherwise the string is decoded one codepoint at a time.
/// This implementation makes no attempt at fixing bad encoding.
u32string CUtfStringView::toUtf32() const
{
	if (m_Iterator == utf32Iterator)
	{
		// Already UTF-32: plain copy, m_Size is in bytes so divide by four for codepoints
		const u32char *first = static_cast<const u32char *>(m_Str);
		return u32string(first, first + (m_Size >> 2));
	}

	u32string res;
	// Upper bound: every decoded codepoint consumes at least one source byte
	res.reserve(m_Size);
	// this->end() is spelled out on purpose: a local named `end` initialised
	// with `end()` would refer to itself inside its own initializer.
	for (iterator it(begin()), last(this->end()); it != last; ++it)
		res += *it;
	return res;
}
/// Decodes one codepoint from the UTF-8 bytes at *addr and advances *addr
/// past every byte that was consumed.
///
/// The lead byte is always consumed. Continuation bytes (10xxxxxx) are only
/// consumed when present; as soon as an expected continuation byte is
/// missing, decoding stops and the partially assembled value is returned.
/// A UTF-16 surrogate pair that was itself encoded as two 3-byte UTF-8
/// sequences is merged into a single codepoint.
///
/// This implementation makes no attempt at fixing bad encoding, except for
/// bad UTF-16 surrogate pairs. It reads the byte following the one it
/// consumed without any bounds check, so the buffer must stay readable past
/// the end of the string.
u32char CUtfStringView::utf8Iterator(const void **addr)
{
	const uint8 **pp = reinterpret_cast<const uint8 **>(addr);
	u32char c0 = **pp;
	++(*pp);
	if ((c0 & 0xC0) == 0xC0) // Lead byte of a multi-byte sequence
	{
		uint8 cx = **pp;
		if ((cx & 0xC0) == 0x80)
		{
			++(*pp);
			c0 &= 0x3F; // Drop first two bits
			c0 <<= 6;
			c0 |= (cx & 0x3F); // 12 bits now (6 + 6), 2-byte encoding
			if (c0 & 0x800) // Third lead bit was set: at least a 3-byte sequence
			{
				cx = **pp;
				if ((cx & 0xC0) == 0x80)
				{
					++(*pp);
					c0 &= 0x07FF; // Drop first bit
					c0 <<= 6;
					c0 |= (cx & 0x3F); // 17 bits now (12 - 1 + 6), 3-byte encoding
					if (c0 & 0x10000) // Fourth lead bit was set: 4-byte sequence
					{
						cx = **pp;
						if ((cx & 0xC0) == 0x80)
						{
							++(*pp);
							c0 &= 0xFFFF; // Drop first bit
							c0 <<= 6;
							c0 |= (cx & 0x3F); // 22 bits now (17 - 1 + 6), 4-byte encoding
						}
					}
					else if ((c0 & 0xFC00) == 0xD800) // Higher bits of nutcase UTF-16 encoded as UTF-8
					{
						uint8 cy;
						// 0xED 0xB? 0x?? is the 3-byte form of a low surrogate (0xDC00-0xDFFF)
						if ((*pp)[0] == 0xED && ((cx = (*pp)[1]) & 0xF0) == 0xB0 && ((cy = (*pp)[2]) & 0xC0) == 0x80)
						{
							// Lower bits of nutcase UTF-16 encoded as UTF-8
							(*pp) += 3;
							uint16 c1 = (cx & 0x0F);
							c1 <<= 6;
							c1 |= (cy & 0x3F);
							c0 &= 0x03FF;
							c0 <<= 10;
							c0 |= (c1 & 0x03FF);
							c0 += 0x10000;
						}
					}
				}
			}
		}
	}
	return c0;
}
/// Decodes one codepoint from the UTF-16 code units at *addr and advances
/// *addr past every unit that was consumed.
///
/// A high surrogate followed by a low surrogate is combined into a single
/// codepoint at or above 0x10000. Any other unit, including an unpaired
/// surrogate, is returned unchanged.
///
/// This implementation makes no attempt at fixing bad encoding. After a high
/// surrogate the next unit is read without any bounds check, so the buffer
/// must stay readable for one unit past the end of the string.
u32char CUtfStringView::utf16Iterator(const void **addr)
{
	const uint16 **pp = reinterpret_cast<const uint16 **>(addr);
	u32char c0 = **pp;
	++(*pp);
	if ((c0 & 0xFC00) == 0xD800) // Higher bits
	{
		uint16 c1 = **pp;
		if ((c1 & 0xFC00) == 0xDC00) // Lower bits
		{
			++(*pp);
			c0 &= 0x03FF;
			c0 <<= 10;
			c0 |= (c1 & 0x03FF);
			c0 += 0x10000;
		}
	}
	return c0;
}
u32char CUtfStringView::utf32Iterator(const void **addr)
{
// UTF-32
// This implementation makes no attempt at fixing bad encoding
const u32char **pp = reinterpret_cast<const u32char **>(addr);
u32char c = **pp;
++(*pp);

Loading…
Cancel
Save