// ***************************************************************************
// Origin: commit c6535ee2fde33634b8355fb4a6eb500600b3f0d8 (Nimetu, 2015-04-18, branch develop)
//   "Ensure that html is valid utf8 and that tags come in correct(ish) order"
//   file: code/nel/src/gui/group_html_parser.cpp (functions live inside namespace NLGUI)
//
// NOTE(review): the text this was recovered from had every "<...>" span
// stripped. The missing pieces (loop header, first branches, tag literals)
// were rebuilt from the surviving fragments and from the Stack Overflow
// answer referenced below - confirm against the repository copy.
// ***************************************************************************

// ***************************************************************************
// http://stackoverflow.com/a/18335183
//
// Returns a copy of 'str' in which every byte sequence is well-formed enough
// for a UTF-8 parser:
//  - ASCII control chars are dropped, except \t \n \r which are kept
//  - printable ASCII (32..126) is copied as is
//  - bytes 127..159 are dropped; 128 becomes the euro sign and 133 becomes
//    "\n\r" (as in the referenced answer)
//  - bytes 160..193 are converted as ISO-8859-1 characters
//  - 2/3/4 byte sequences whose trail bytes are all in 128..191 are copied,
//    except 2 byte sequences encoding a control char (194, <160) which are dropped
//  - any other byte (bad/truncated sequence, lead byte >= 245) is converted
//    as an ISO-8859-1 character
static std::string correct_non_utf_8(const std::string &str)
{
	typedef std::string::size_type size_type;

	const size_type f_size = str.size();
	std::string to;
	to.reserve(f_size);

	for (size_type i = 0; i < f_size; ++i)
	{
		const unsigned char c = static_cast<unsigned char>(str[i]);

		if (c < 32)
		{
			// control char, allow only tab, line feed and carriage return
			if (c == 9 || c == 10 || c == 13)
				to.append(1, static_cast<char>(c));
			continue;
		}

		if (c < 127)
		{
			// normal ASCII
			to.append(1, static_cast<char>(c));
			continue;
		}

		if (c < 160)
		{
			// nothing printable is defined here in ASCII, ISO-8859-1 or UTF-8.
			// The test is done on 'c': the referenced answer tested 'c2',
			// which has no value yet at this point.
			if (c == 128)
				to.append("\xE2\x82\xAC"); // windows-1252 euro sign
			else if (c == 133)
				to.append("\n\r"); // NEL (next line)
			continue;
		}

		if (c < 192)
		{
			// invalid as UTF-8 lead byte, convert as ISO-8859-1
			to.append(1, static_cast<char>(194));
			to.append(1, static_cast<char>(c));
			continue;
		}

		if (c < 194)
		{
			// invalid as UTF-8 lead byte, convert as ISO-8859-1
			to.append(1, static_cast<char>(195));
			to.append(1, static_cast<char>(c - 64));
			continue;
		}

		// number of trail bytes announced by the lead byte (0 = not a lead byte)
		size_type extra = 0;
		if (c < 224)
			extra = 1;
		else if (c < 240)
			extra = 2;
		else if (c < 245)
			extra = 3;

		if (extra != 0 && i + extra < f_size)
		{
			bool wellFormed = true;
			for (size_type k = 1; k <= extra; ++k)
			{
				const unsigned char trail = static_cast<unsigned char>(str[i + k]);
				if (trail < 128 || trail > 191)
				{
					wellFormed = false;
					break;
				}
			}

			if (wellFormed)
			{
				// 194 followed by a byte below 160 encodes a control char, skip it
				const bool isControl = (c == 194 && static_cast<unsigned char>(str[i + 1]) < 160);
				if (!isControl)
					to.append(str, i, extra + 1);

				i += extra;
				continue;
			}
		}

		// invalid UTF8, converting ASCII (c>245 || string too short for multi-byte))
		to.append(1, static_cast<char>(195));
		to.append(1, static_cast<char>(c - 64));
	}

	return to;
}

// ***************************************************************************
// Rewrites 'htmlString' in place so that the html parser sees the tags in
// an order it accepts:
//  1. a leading UTF-8 byte order mark is removed
//  2. an element placed before <html> is moved right after the <body ...> tag
//  3. </html> is moved to the very end of the document
//  4. invalid UTF-8 is repaired with correct_non_utf_8()
static void patchHtmlQuirks(std::string &htmlString)
{
	typedef std::string::size_type size_type;
	const size_type npos = std::string::npos;

	// get rid of BOM (some ingame help files do not show up otherwise)
	if (htmlString.compare(0, 3, "\xEF\xBB\xBF") == 0)
	{
		htmlString.erase(0, 3);
	}

	// if any element is before <html>, then parser adds <html><body>
	// and original tags are ignored (their attributes not processed)
	//
	// only fix situation when there is <body> tag with attributes
	//
	// tags are considered to be lowercase
	const size_type bodyPos = htmlString.find("<body ");
	if (bodyPos != npos)
	{
		size_type start = htmlString.find('<');
		// skip <!doctype html>
		if (start != npos && htmlString.compare(start, 2, "<!") == 0)
			start = htmlString.find('<', start + 1);

		// if there is no html tag, then abort
		const size_type end = htmlString.find("<html>");
		if (start != npos && end != npos && start < end && end < bodyPos)
		{
			// body tag end position
			const size_type insert = htmlString.find('>', bodyPos);
			if (insert != npos)
			{
				// insert first, then erase: 'start' is in front of the
				// insertion point, so it is still valid for the erase
				const std::string str = htmlString.substr(start, end - start);
				htmlString.insert(insert + 1, str);
				htmlString.erase(start, str.size());
			}
		}
	}

	// make sure </html> (if present) is last in document or tags coming after it are ignored
	const std::string closingTag("</html>");
	const size_type closePos = htmlString.find(closingTag);
	if (closePos != npos && closePos + closingTag.size() != htmlString.size())
	{
		htmlString.erase(closePos, closingTag.size());
		htmlString += closingTag;
	}

	// if there are invalid utf-8 chars, then libxml will break everything after the first one it finds.
	htmlString = correct_non_utf_8(htmlString);
}

// ***************************************************************************
// Remaining part of the patch: bool CGroupHTML::parseHtml(std::string htmlString)
// Only a partial hunk of that function is available here, so it is recorded
// as a note instead of code:
//  - the push parser is now created with XML_CHAR_ENCODING_UTF8 instead of
//    XML_CHAR_ENCODING_NONE:
//      htmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL, XML_CHAR_ENCODING_UTF8);
//  - after htmlCtxtUseOptions(...) and before the first htmlParseChunk(...) call:
//      // parser is little strict on tag order, so fix whats needed
//      patchHtmlQuirks(htmlString);
// ***************************************************************************