From 973531f461d46ded593c4615c6575a96cb09c1a7 Mon Sep 17 00:00:00 2001
From: kaetemi
Date: Mon, 26 Oct 2020 03:55:36 +0800
Subject: [PATCH] UTF string view base, ryzom/ryzomcore#335

---
 nel/include/nel/misc/types_nl.h        |  8 +++
 nel/include/nel/misc/utf_string_view.h | 95 ++++++++++++++++++++++++++
 nel/src/misc/utf_string_view.cpp       | 76 +++++++++++++++++++++
 3 files changed, 179 insertions(+)
 create mode 100644 nel/include/nel/misc/utf_string_view.h
 create mode 100644 nel/src/misc/utf_string_view.cpp

diff --git a/nel/include/nel/misc/types_nl.h b/nel/include/nel/misc/types_nl.h
index 73c05b3a5..b8fe8270a 100644
--- a/nel/include/nel/misc/types_nl.h
+++ b/nel/include/nel/misc/types_nl.h
@@ -552,6 +552,14 @@ typedef wchar_t ucchar;
 typedef uint16 ucchar;
 #endif
 
+#ifdef NL_CPP14
+typedef char32_t u32char; // one 32-bit code unit, native character type
+typedef std::u32string u32string; // string of u32char
+#else
+typedef uint32 u32char; // fallback: plain 32-bit unsigned integer
+typedef std::basic_string<uint32> u32string; // element type must match u32char above
+#endif
+
 #ifndef NL_OVERRIDE
 #define NL_OVERRIDE override
 #endif
diff --git a/nel/include/nel/misc/utf_string_view.h b/nel/include/nel/misc/utf_string_view.h
new file mode 100644
index 000000000..78df8c141
--- /dev/null
+++ b/nel/include/nel/misc/utf_string_view.h
@@ -0,0 +1,95 @@
+// NeL - MMORPG Framework
+// Copyright (C) 2020 Jan BOON (Kaetemi)
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef NLMISC_UTF_STRING_VIEW_H
+#define NLMISC_UTF_STRING_VIEW_H
+
+// NOTE(review): the four include targets were missing from the patch text as
+// received. The ones below are reconstructed from the names this header uses
+// (u32char/u32string, ucstring, std::string, strlen) - confirm against the
+// original commit. nlassert is assumed to arrive transitively; verify.
+#include <nel/misc/types_nl.h>
+#include <nel/misc/ucstring.h>
+#include <string>
+#include <cstring>
+
+namespace NLMISC {
+
+/// String view for iterating UTF-8, UTF-16 and UTF-32 strings as 32-bit codepoints.
+/// This string view keeps the string as a reference, it does not make a copy,
+/// so the viewed string must outlive the view and every iterator made from it.
+/// Only use this for iterating a string's codepoints.
+/// The view does not necessarily end at a NUL character.
+class CUtfStringView
+{
+public:
+	/// View a NUL-terminated UTF-8 string; the length is taken with strlen.
+	inline CUtfStringView(const char *utf8Str) : m_Str(utf8Str), m_Size(strlen(utf8Str)), m_Iterator(utf8Iterator) {}
+
+	/// View the first \c len bytes of a UTF-8 string.
+	/// The assert requires \c len to be at most strlen(utf8Str), so \c utf8Str
+	/// itself must still be NUL-terminated at or after \c len.
+	inline CUtfStringView(const char *utf8Str, size_t len): m_Str(utf8Str), m_Size(len), m_Iterator(utf8Iterator)
+	{
+		nlassert(len <= strlen(utf8Str));
+	}
+
+	/// View a UTF-8 std::string.
+	inline CUtfStringView(const std::string &utf8Str) : m_Str(utf8Str.c_str()), m_Size(utf8Str.size()), m_Iterator(utf8Iterator) {}
+	/// View a UTF-16 string. m_Size is kept in bytes, hence two per code unit.
+	inline CUtfStringView(const ucstring &utf16Str) : m_Str(utf16Str.c_str()), m_Size(utf16Str.size() << 1), m_Iterator(utf16Iterator) {}
+	/// View a UTF-32 string. m_Size is kept in bytes, hence four per code unit.
+	inline CUtfStringView(const u32string &utf32Str) : m_Str(utf32Str.c_str()), m_Size(utf32Str.size() << 2), m_Iterator(utf32Iterator) {}
+
+	// NOTE(review): these two are not const-qualified although the class has only
+	// const members; qualifying them means changing the out-of-line definitions too.
+	std::string toUtf8(); // Makes a copy
+	u32string toUtf32(); // Makes a copy
+
+	/// The encoding is identified by which decoder function the view was built with.
+	inline bool isUtf8() const { return m_Iterator == utf8Iterator; }
+	inline bool isUtf16() const { return m_Iterator == utf16Iterator; }
+	inline bool isUtf32() const { return m_Iterator == utf32Iterator; }
+
+	/// Forward iterator over the codepoints of the view.
+	/// It holds the codepoint it currently points at plus the address of the
+	/// next undecoded unit. The end iterator is the one whose address is NULL.
+	struct const_iterator
+	{
+	public:
+		/// Step to the next codepoint, or become the end iterator.
+		/// Returns *this so the usual pre-increment expressions work.
+		inline const_iterator &operator++()
+		{
+			decodeNext();
+			return *this;
+		}
+		inline bool operator!=(const const_iterator &o) const { return m_Addr != o.m_Addr; }
+		inline bool operator==(const const_iterator &o) const { return m_Addr == o.m_Addr; }
+		inline const u32char &operator*() const { return m_Char; }
+	private:
+		friend class CUtfStringView;
+		/// \c addr is the first unit to decode, or NULL to build the end iterator.
+		inline const_iterator(const CUtfStringView &view, const void *addr) : m_View(view), m_Addr(addr), m_Char(0)
+		{
+			decodeNext();
+		}
+		/// Decode the unit(s) at m_Addr into m_Char and advance m_Addr past them.
+		/// The bounds check comes before the decode so that an empty view gives
+		/// begin() == end() and nothing at or past the end of the view is
+		/// decoded as a codepoint. A decode that runs past the end of the view
+		/// also turns the iterator into the end iterator.
+		inline void decodeNext()
+		{
+			const char *const last = static_cast<const char *>(m_View.m_Str) + m_View.m_Size;
+			if (m_Addr && static_cast<const char *>(m_Addr) < last)
+			{
+				m_Char = m_View.m_Iterator(&m_Addr);
+				if (static_cast<const char *>(m_Addr) <= last)
+					return;
+			}
+			m_Addr = NULL;
+			m_Char = 0;
+		}
+		const CUtfStringView &m_View;
+		const void *m_Addr; // Next address
+		u32char m_Char;
+	};
+
+	typedef const_iterator iterator;
+
+	iterator begin() const { return iterator(*this, m_Str); }
+	inline iterator end() const { return iterator(*this, NULL); }
+
+private:
+	/// Decoder: reads one codepoint at *addr and advances *addr past it.
+	typedef u32char (*TIterator)(const void **addr);
+	static u32char utf8Iterator(const void **addr);
+	static u32char utf16Iterator(const void **addr);
+	static u32char utf32Iterator(const void **addr);
+
+	const void *const m_Str; // First unit of the viewed string (not owned)
+	const size_t m_Size; // Size of the view in bytes, whatever the encoding
+	const TIterator m_Iterator; // Decoder matching the encoding of m_Str
+
+}; /* class CUtfStringView */
+
+} /* namespace NLMISC */
+
+#endif /* #ifndef NLMISC_UTF_STRING_VIEW_H */
+
+/* end of file */
diff --git a/nel/src/misc/utf_string_view.cpp b/nel/src/misc/utf_string_view.cpp
new file mode 100644
index 000000000..57dc5e226
--- /dev/null
+++ b/nel/src/misc/utf_string_view.cpp
@@ -0,0 +1,76 @@
+// NeL - MMORPG Framework
+// Copyright (C) 2020 Jan BOON (Kaetemi)
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#include "stdmisc.h"
+
+// Project includes
+#include <nel/misc/utf_string_view.h>
+
+namespace NLMISC
+{
+
+namespace
+{
+
+/// Codepoint returned or written in place of malformed input (U+FFFD).
+const u32char ReplacementChar = 0xFFFD;
+
+/// Append the UTF-8 encoding of codepoint \c c to \c out.
+/// Surrogates and values above U+10FFFF cannot be encoded and are written as U+FFFD.
+void appendUtf8(std::string &out, u32char c)
+{
+	if (c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF))
+		c = ReplacementChar;
+
+	if (c < 0x80)
+	{
+		out += static_cast<char>(c);
+	}
+	else if (c < 0x800)
+	{
+		out += static_cast<char>(0xC0 | (c >> 6));
+		out += static_cast<char>(0x80 | (c & 0x3F));
+	}
+	else if (c < 0x10000)
+	{
+		out += static_cast<char>(0xE0 | (c >> 12));
+		out += static_cast<char>(0x80 | ((c >> 6) & 0x3F));
+		out += static_cast<char>(0x80 | (c & 0x3F));
+	}
+	else
+	{
+		out += static_cast<char>(0xF0 | (c >> 18));
+		out += static_cast<char>(0x80 | ((c >> 12) & 0x3F));
+		out += static_cast<char>(0x80 | ((c >> 6) & 0x3F));
+		out += static_cast<char>(0x80 | (c & 0x3F));
+	}
+}
+
+} /* anonymous namespace */
+
+/// Return a UTF-8 copy of the viewed string.
+/// A UTF-8 view is copied byte for byte; any other view is decoded codepoint
+/// by codepoint and re-encoded.
+std::string CUtfStringView::toUtf8()
+{
+	if (m_Iterator == utf8Iterator)
+	{
+		const char *const first = static_cast<const char *>(m_Str);
+		return std::string(first, first + m_Size);
+	}
+	std::string res;
+	res.reserve((m_Size << 1) + 1);
+	// The end iterator is not named 'end': a local of that name would hide the
+	// member function inside its own initializer.
+	const iterator last = end();
+	for (iterator it = begin(); it != last; ++it)
+		appendUtf8(res, *it);
+	return res;
+}
+
+/// Return a UTF-32 copy of the viewed string.
+/// A UTF-32 view is copied unit for unit; any other view is decoded codepoint
+/// by codepoint.
+u32string CUtfStringView::toUtf32()
+{
+	if (m_Iterator == utf32Iterator)
+	{
+		// m_Size is in bytes, four per UTF-32 unit
+		const u32char *const first = static_cast<const u32char *>(m_Str);
+		return u32string(first, first + (m_Size >> 2));
+	}
+	u32string res;
+	res.reserve(m_Size + 1);
+	const iterator last = end();
+	for (iterator it = begin(); it != last; ++it)
+		res += *it;
+	return res;
+}
+
+/// Decode one UTF-8 sequence at *addr and advance *addr past the bytes used.
+/// Returns U+FFFD for an invalid lead byte, a missing continuation byte, an
+/// overlong form, a surrogate, or a value above U+10FFFF.
+/// There is no end-of-buffer argument: a cut-off sequence is only noticed
+/// because the following byte (for instance the terminating NUL) is not a
+/// continuation byte. That byte is left unconsumed.
+u32char CUtfStringView::utf8Iterator(const void **addr)
+{
+	// Bytes are read as unsigned so values of 0x80 and above do not sign-extend
+	const unsigned char **pp = reinterpret_cast<const unsigned char **>(addr);
+	const unsigned char *p = *pp;
+	u32char c = *p++;
+	int pending = 0; // continuation bytes still expected
+	u32char lowest = 0; // smallest value this sequence length may encode
+	if (c < 0x80)
+	{
+		// single byte, nothing more to read
+	}
+	else if ((c & 0xE0) == 0xC0)
+	{
+		c &= 0x1F;
+		pending = 1;
+		lowest = 0x80;
+	}
+	else if ((c & 0xF0) == 0xE0)
+	{
+		c &= 0x0F;
+		pending = 2;
+		lowest = 0x800;
+	}
+	else if ((c & 0xF8) == 0xF0)
+	{
+		c &= 0x07;
+		pending = 3;
+		lowest = 0x10000;
+	}
+	else
+	{
+		// stray continuation byte or invalid lead byte: skip just this byte
+		*pp = p;
+		return ReplacementChar;
+	}
+	for (; pending > 0; --pending)
+	{
+		if ((*p & 0xC0) != 0x80)
+		{
+			*pp = p;
+			return ReplacementChar;
+		}
+		c = static_cast<u32char>((c << 6) | (*p++ & 0x3F));
+	}
+	*pp = p;
+	if (c < lowest || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF))
+		return ReplacementChar;
+	return c;
+}
+
+/// Decode one UTF-16 unit or surrogate pair at *addr and advance *addr past it.
+/// Returns U+FFFD for a surrogate that is not part of a valid pair. After a
+/// high surrogate the following unit is inspected, and consumed only if it is
+/// a low surrogate.
+u32char CUtfStringView::utf16Iterator(const void **addr)
+{
+	const ucchar **pp = reinterpret_cast<const ucchar **>(addr);
+	const ucchar *p = *pp;
+	u32char c = static_cast<u32char>(*p++) & 0xFFFF;
+	if (c >= 0xD800 && c <= 0xDBFF)
+	{
+		const u32char low = static_cast<u32char>(*p) & 0xFFFF;
+		if (low >= 0xDC00 && low <= 0xDFFF)
+		{
+			++p;
+			c = 0x10000 + ((c - 0xD800) << 10) + (low - 0xDC00);
+		}
+		else
+		{
+			c = ReplacementChar;
+		}
+	}
+	else if (c >= 0xDC00 && c <= 0xDFFF)
+	{
+		// low surrogate without a preceding high surrogate
+		c = ReplacementChar;
+	}
+	*pp = p;
+	return c;
+}
+
+/// Read one UTF-32 unit at *addr, unchanged, and advance *addr past it.
+u32char CUtfStringView::utf32Iterator(const void **addr)
+{
+	const u32char **pp = reinterpret_cast<const u32char **>(addr);
+	u32char c = **pp;
+	++(*pp);
+	return c;
+}
+
+} /* namespace NLMISC */
+
+/* end of file */