Merge branch 'develop' into ryzomclassic-develop
commit
2b2b0a7d48
@ -0,0 +1,150 @@
|
||||
// NeL - MMORPG Framework <https://wiki.ryzom.dev/>
|
||||
// Copyright (C) 2020 Jan BOON (Kaetemi) <jan.boon@kaetemi.be>
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as
|
||||
// published by the Free Software Foundation, either version 3 of the
|
||||
// License, or (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
#ifndef NLMISC_UTF_STRING_VIEW_H
|
||||
#define NLMISC_UTF_STRING_VIEW_H
|
||||
|
||||
#include <nel/misc/types_nl.h>
|
||||
#include <nel/misc/ucstring.h>
|
||||
#include <string>
|
||||
|
||||
namespace NLMISC {
|
||||
|
||||
/// String view for UTF-8, UTF-16 and UTF-32 iteration as 32-bit codepoints.
|
||||
/// This string view keeps the string as a reference, it does not make a copy.
|
||||
/// Only use this for iterating a string's codepoints.
|
||||
/// Strings are not required to be NUL-terminated, but the underlying buffer must stay readable for at least one character past the end of the view.
|
||||
class CUtfStringView
|
||||
{
|
||||
public:
|
||||
inline CUtfStringView() : m_Str(NULL), m_Size(0), m_Iterator(utf32Iterator) {}
|
||||
|
||||
inline CUtfStringView(const char *utf8Str) : m_Str(utf8Str), m_Size(strlen(utf8Str)), m_Iterator(utf8Iterator) {}
|
||||
inline CUtfStringView(const char *utf8Str, size_t len): m_Str(utf8Str), m_Size(len), m_Iterator(utf8Iterator)
|
||||
{
|
||||
nlassert(len <= strlen(utf8Str));
|
||||
}
|
||||
#if defined(NL_OS_WINDOWS)
|
||||
inline CUtfStringView(const wchar_t *utf16Str) : m_Str(utf16Str), m_Size(wcslen(utf16Str)), m_Iterator(utf16Iterator) {}
|
||||
inline CUtfStringView(const wchar_t *utf16Str, size_t len): m_Str(utf16Str), m_Size(len), m_Iterator(utf16Iterator)
|
||||
{
|
||||
nlassert(len <= wcslen(utf16Str));
|
||||
}
|
||||
#else
|
||||
inline CUtfStringView(const wchar_t *utf32Str) : m_Str(utf32Str), m_Size(wcslen(utf32Str)), m_Iterator(utf32Iterator) {}
|
||||
inline CUtfStringView(const wchar_t *utf32Str, size_t len): m_Str(utf32Str), m_Size(len), m_Iterator(utf32Iterator)
|
||||
{
|
||||
nlassert(len <= wcslen(utf32Str));
|
||||
}
|
||||
#endif
|
||||
|
||||
inline CUtfStringView(const std::string &utf8Str) : m_Str(utf8Str.c_str()), m_Size(utf8Str.size()), m_Iterator(utf8Iterator) {}
|
||||
inline CUtfStringView(const ucstring &utf16Str) : m_Str(utf16Str.c_str()), m_Size(utf16Str.size() << 1), m_Iterator(utf16Iterator) {}
|
||||
inline CUtfStringView(const u32string &utf32Str) : m_Str(utf32Str.c_str()), m_Size(utf32Str.size() << 2), m_Iterator(utf32Iterator) {}
|
||||
|
||||
std::string toUtf8(bool reEncode = false) const; // Makes a copy
|
||||
ucstring toUtf16(bool reEncode = false) const; // Makes a copy
|
||||
u32string toUtf32() const; // Makes a copy
|
||||
|
||||
std::wstring toWide() const; // Platform dependent, UTF-16 or UTF-32. Makes a copy.
|
||||
|
||||
inline bool isUtf8() const { return m_Iterator == utf8Iterator; }
|
||||
inline bool isUtf16() const { return m_Iterator == utf16Iterator; }
|
||||
inline bool isUtf32() const { return m_Iterator == utf32Iterator; }
|
||||
|
||||
struct const_iterator
|
||||
{
|
||||
public:
|
||||
inline void operator++()
|
||||
{
|
||||
m_Char = m_View.m_Iterator(&m_Addr);
|
||||
if ((ptrdiff_t)m_Addr > ((ptrdiff_t)m_View.m_Str + m_View.m_Size))
|
||||
{
|
||||
m_Addr = 0;
|
||||
m_Char = 0;
|
||||
}
|
||||
}
|
||||
inline void operator+=(ptrdiff_t a)
|
||||
{
|
||||
while (m_Addr)
|
||||
{
|
||||
++(*this);
|
||||
}
|
||||
}
|
||||
inline bool operator!=(const const_iterator &o) const { return m_Addr != o.m_Addr; }
|
||||
inline bool operator==(const const_iterator &o) const { return m_Addr == o.m_Addr; }
|
||||
inline const u32char &operator*() const { return m_Char; }
|
||||
const_iterator() : m_View(*(CUtfStringView *)NULL), m_Addr(NULL), m_Char(0) { }
|
||||
|
||||
const_iterator &operator=(const const_iterator &other)
|
||||
{
|
||||
if(this == &other) return *this;
|
||||
this->~const_iterator();
|
||||
return *new(this) const_iterator(other);
|
||||
}
|
||||
private:
|
||||
friend class CUtfStringView;
|
||||
inline const_iterator(const CUtfStringView &view, const void *addr) : m_View(view), m_Addr(addr), m_Char(addr ? view.m_Iterator(&m_Addr) : 0) { }
|
||||
const CUtfStringView &m_View;
|
||||
const void *m_Addr; // Next address
|
||||
u32char m_Char;
|
||||
};
|
||||
|
||||
typedef const_iterator iterator;
|
||||
|
||||
iterator begin() const { return iterator(*this, m_Str); }
|
||||
inline iterator end() const { return iterator(*this, NULL); }
|
||||
|
||||
/// Largest possible number of characters in this string.
|
||||
/// Number of actual characters may be less or equal.
|
||||
inline size_t largestSize() const { return m_Size; }
|
||||
|
||||
inline bool empty() const { return !m_Size; }
|
||||
const void *ptr() const { return m_Str; }
|
||||
|
||||
inline CUtfStringView substr(const iterator &begin, const iterator &end) const
|
||||
{
|
||||
return CUtfStringView(begin.m_Addr, (ptrdiff_t)end.m_Addr - (ptrdiff_t)begin.m_Addr, m_Iterator);
|
||||
}
|
||||
|
||||
inline bool endsWith(char c) { nlassert(c < 0x80); return *((char *)m_Str + m_Size - 1) == c; }
|
||||
|
||||
CUtfStringView &operator=(const CUtfStringView &other)
|
||||
{
|
||||
if(this == &other) return *this;
|
||||
this->~CUtfStringView();
|
||||
return *new(this) CUtfStringView(other);
|
||||
}
|
||||
|
||||
private:
|
||||
typedef u32char (*TIterator)(const void **addr);
|
||||
static u32char utf8Iterator(const void **addr);
|
||||
static u32char utf16Iterator(const void **addr);
|
||||
static u32char utf32Iterator(const void **addr);
|
||||
|
||||
inline CUtfStringView(const void *str, size_t len, TIterator it) : m_Str(str), m_Size(len), m_Iterator(it) { }
|
||||
|
||||
const void *const m_Str; // String
|
||||
const size_t m_Size; // Size in bytes
|
||||
const TIterator m_Iterator;
|
||||
|
||||
}; /* class CUtfStringView */
|
||||
|
||||
} /* namespace NLMISC */
|
||||
|
||||
#endif /* #ifndef NLMISC_UTF_STRING_VIEW_H */
|
||||
|
||||
/* end of file */
|
@ -0,0 +1,235 @@
|
||||
// NeL - MMORPG Framework <https://wiki.ryzom.dev/>
|
||||
// Copyright (C) 2020 Jan BOON (Kaetemi) <jan.boon@kaetemi.be>
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as
|
||||
// published by the Free Software Foundation, either version 3 of the
|
||||
// License, or (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
#include "stdmisc.h"
|
||||
|
||||
// Project includes
|
||||
#include <nel/misc/utf_string_view.h>
|
||||
|
||||
// References:
|
||||
// - https://twiserandom.com/unicode/unicode-encoding-utf-8-utf-16-utf-32/
|
||||
// - https://www.compart.com/en/unicode/U+1F30D
|
||||
// - 0xF0 0x9F 0x8C 0x8D
|
||||
// - 0xD83C 0xDF0D
|
||||
// - 0x0001F30D
|
||||
|
||||
namespace NLMISC
|
||||
{
|
||||
|
||||
std::string CUtfStringView::toUtf8(bool reEncode) const
|
||||
{
|
||||
// Decode UTF and encode UTF-8
|
||||
// This implementation makes no attempt at fixing invalid codepoints
|
||||
if (m_Iterator == utf8Iterator && !reEncode)
|
||||
return std::string((const char *)m_Str, (const char *)((ptrdiff_t)m_Str + m_Size));
|
||||
std::string res;
|
||||
res.reserve(m_Size);
|
||||
for (iterator it(begin()), end(end()); it != end; ++it)
|
||||
{
|
||||
u32char c = *it;
|
||||
if (c < 0x80)
|
||||
{
|
||||
// Encode as 1 byte
|
||||
res += (char)c;
|
||||
}
|
||||
else if (c < 0x0800)
|
||||
{
|
||||
// Encode as 2 bytes
|
||||
res += (char)((c & 0x07C0) >> 6) | 0xC0;
|
||||
res += (char)(c & 0x3F) | 0x80;
|
||||
}
|
||||
else if (c < 0x010000)
|
||||
{
|
||||
// Encode as 3 bytes
|
||||
res += (char)((c & 0xF000) >> 12) | 0xE0;
|
||||
res += (char)((c & 0x0FC0) >> 6) | 0x80;
|
||||
res += (char)(c & 0x3F) | 0x80;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Encode as 4 bytes
|
||||
res += (char)((c & 0x1C0000) >> 18) | 0xF0;
|
||||
res += (char)((c & 0x03F000) >> 12) | 0x80;
|
||||
res += (char)((c & 0x0FC0) >> 6) | 0x80;
|
||||
res += (char)(c & 0x3F) | 0x80;
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
/// Return a UTF-16 copy of the viewed string.
/// A UTF-16 source is copied unit-for-unit unless \c reEncode is set; otherwise each
/// codepoint is decoded and written as one unit or as a surrogate pair.
ucstring CUtfStringView::toUtf16(bool reEncode) const
{
	if (m_Iterator == utf16Iterator && !reEncode)
		return ucstring((const ucchar *)m_Str, (const ucchar *)((ptrdiff_t)m_Str + m_Size));
	ucstring res;
	// With m_Size being the source size in bytes, no source encoding can
	// produce more UTF-16 units than it has bytes.
	res.reserve(m_Size);
	// Note: the end iterator must be fetched through this->, a local named `end`
	// initialised with `end()` would refer to itself.
	for (iterator it(begin()), last(this->end()); it != last; ++it)
	{
		u32char c = *it;
		if (c < 0x10000)
		{
			// Fits a single UTF-16 unit
			res += (ucchar)c;
		}
		else
		{
			// Split into a high and a low surrogate
			c -= 0x10000;
			res += (ucchar)((c >> 10) | 0xD800);
			res += (ucchar)((c & 0x3FF) | 0xDC00);
		}
	}
	return res;
}
|
||||
|
||||
/// Return a UTF-32 copy of the viewed string, one element per decoded codepoint.
/// A UTF-32 source is copied directly.
u32string CUtfStringView::toUtf32() const
{
	// Decode any UTF
	// This implementation makes no attempt at fixing bad encoding
	if (m_Iterator == utf32Iterator)
		return u32string((const u32char *)m_Str, (const u32char *)((ptrdiff_t)m_Str + m_Size));
	u32string res;
	// Every codepoint takes at least one source byte, so with m_Size being
	// the source size in bytes it is already an upper bound.
	res.reserve(m_Size);
	// Note: the end iterator must be fetched through this->, a local named `end`
	// initialised with `end()` would refer to itself.
	for (iterator it(begin()), last(this->end()); it != last; ++it)
		res += *it;
	return res;
}
|
||||
|
||||
std::wstring CUtfStringView::toWide() const
|
||||
{
|
||||
#ifdef NL_OS_WINDOWS
|
||||
if (m_Iterator == utf16Iterator)
|
||||
return std::wstring((const wchar_t *)m_Str, (const wchar_t *)((ptrdiff_t)m_Str + m_Size));
|
||||
std::wstring res;
|
||||
res.reserve(m_Size << 1);
|
||||
for (iterator it(begin()), end(end()); it != end; ++it)
|
||||
{
|
||||
u32char c = *it;
|
||||
if (c < 0x10000)
|
||||
{
|
||||
res += c;
|
||||
}
|
||||
else
|
||||
{
|
||||
c -= 0x10000;
|
||||
res += (c >> 10) | 0xD800;
|
||||
res += (c & 0x3FF) | 0xDC00;
|
||||
}
|
||||
}
|
||||
return res;
|
||||
#else
|
||||
if (m_Iterator == utf32Iterator)
|
||||
return std::wstring((const wchar_t *)m_Str, (const wchar_t *)((ptrdiff_t)m_Str + m_Size));
|
||||
std::wstring res;
|
||||
res.reserve(m_Size << 2);
|
||||
for (iterator it(begin()), end(end()); it != end; ++it)
|
||||
res += *it;
|
||||
return res;
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Decode one codepoint from the UTF-8 bytes at *addr and move *addr past the bytes consumed.
/// Malformed input is not repaired: decoding simply stops at the first byte that is not a
/// continuation byte and whatever has been accumulated so far is returned. The one exception
/// is a UTF-16 surrogate pair stored as two 3-byte sequences, which is merged into one codepoint.
u32char CUtfStringView::utf8Iterator(const void **addr)
{
	const uint8 **cursor = reinterpret_cast<const uint8 **>(addr);

	// First byte is always consumed
	u32char code = **cursor;
	++(*cursor);
	if ((code & 0xC0) != 0xC0)
		return code; // Not a lead byte, pass it through unchanged

	// Second byte
	uint8 next = **cursor;
	if ((next & 0xC0) != 0x80)
		return code;
	++(*cursor);
	code = ((code & 0x3F) << 6) | (next & 0x3F); // 12 bits now (6 + 6), 2-byte encoding
	if (!(code & 0x800))
		return code; // Marker bit of a longer sequence is clear, done

	// Third byte
	next = **cursor;
	if ((next & 0xC0) != 0x80)
		return code;
	++(*cursor);
	code = ((code & 0x07FF) << 6) | (next & 0x3F); // 17 bits now (12 - 1 + 6), 3-byte encoding

	if (code & 0x10000)
	{
		// Fourth byte
		next = **cursor;
		if ((next & 0xC0) == 0x80)
		{
			++(*cursor);
			code = ((code & 0xFFFF) << 6) | (next & 0x3F); // 22 bits now (17 - 1 + 6), 4-byte encoding
		}
		return code;
	}

	if ((code & 0xFC00) != 0xD800)
		return code; // Ordinary 3-byte codepoint

	// High surrogate encoded as UTF-8: look for the matching low surrogate (0xED 0xBx 0x8x..0xBx).
	// Bytes are inspected one at a time so nothing past the first mismatch is read.
	const uint8 *tail = *cursor;
	if (tail[0] != 0xED)
		return code;
	const uint8 lowA = tail[1];
	if ((lowA & 0xF0) != 0xB0)
		return code;
	const uint8 lowB = tail[2];
	if ((lowB & 0xC0) != 0x80)
		return code;

	// Consume the low surrogate and combine both halves
	(*cursor) += 3;
	const uint16 lowBits = ((lowA & 0x0F) << 6) | (lowB & 0x3F);
	code &= 0x03FF;
	code <<= 10;
	code |= (lowBits & 0x03FF);
	code += 0x10000;
	return code;
}
|
||||
|
||||
/// Decode one codepoint from the UTF-16 units at *addr and move *addr past the units consumed.
/// A high surrogate that is not followed by a low surrogate is returned as-is;
/// no other attempt is made at fixing bad encoding.
u32char CUtfStringView::utf16Iterator(const void **addr)
{
	const uint16 **cursor = reinterpret_cast<const uint16 **>(addr);

	// First unit is always consumed
	const u32char high = **cursor;
	++(*cursor);
	if ((high & 0xFC00) != 0xD800)
		return high; // Not a high surrogate

	const uint16 low = **cursor;
	if ((low & 0xFC00) != 0xDC00)
		return high; // Unpaired high surrogate, leave the next unit alone

	// Consume the low surrogate and combine 10 + 10 bits above the BMP
	++(*cursor);
	return (((high & 0x03FF) << 10) | (low & 0x03FF)) + 0x10000;
}
|
||||
|
||||
/// Read one UTF-32 element at *addr and move *addr to the next element.
/// The value is returned unchanged, no validation is performed.
u32char CUtfStringView::utf32Iterator(const void **addr)
{
	const u32char **cursor = reinterpret_cast<const u32char **>(addr);
	const u32char *current = *cursor;
	*cursor = current + 1;
	return *current;
}
|
||||
|
||||
} /* namespace NLMISC */
|
||||
|
||||
/* end of file */
|
Loading…
Reference in New Issue