From 5f0d3a1edcad43a4342cfd1390bd6596859aae35 Mon Sep 17 00:00:00 2001 From: bbgan <2893129936@qq.com> Date: Wed, 31 Jan 2024 22:08:44 +0800 Subject: [PATCH] support escape and unescape[xml] --- iguana/util.hpp | 88 +++++++++++++++++++-------------- iguana/xml_reader.hpp | 7 ++- iguana/xml_util.hpp | 111 ++++++++++++++++++++++++++++++++++++++++++ iguana/xml_writer.hpp | 31 +++++++++++- test/test_xml.cpp | 29 +++++++++++ 5 files changed, 228 insertions(+), 38 deletions(-) /*
 * Review notes. Annotation only: placed between the diffstat and the first
 * diff header so that no hunk content is touched.
 *
 * Purpose of the patch: escape XML-special characters when string values are
 * written, and unescape named entities and numeric character references when
 * string values are read.
 *
 * NOTE(review): in this copy the template parameter lists and the type
 * arguments of several static_cast and trait expressions are missing, and the
 * string literals appended in render_string_with_escape_xml show raw
 * characters rather than entity text. This looks like extraction damage;
 * compare with the upstream commit before applying.
 *
 * util.hpp
 *   write_unicode_to_string(it, ss)
 *     Decodes one UTF-8 sequence at it with decode_utf8 and throws
 *     std::runtime_error (illegal unicode character) when that fails.
 *     Writes a backslash-u prefix, or &#x when is_xml_serialization is set
 *     (its declaration is not visible here; presumably a template parameter),
 *     followed by four uppercase hex digits.
 *     Code points 0x10000..0x10FFFF are written as a lead and a trail
 *     surrogate, each with its own prefix. Any other value that is not in
 *     0..0xD7FF or 0xE000..0xFFFF throws std::runtime_error (illegal codepoint).
 *     In XML mode every reference is closed with a semicolon.
 *     NOTE(review): as far as I read XML 1.0 (Char production and the
 *     character reference rule), references into the surrogate range are not
 *     well-formed; writing the code point itself would avoid that. Also
 *     confirm how the reader recombines them, since parse_escape_xml hands
 *     each reference to encode_utf8 separately.
 *   write_string_with_escape
 *     The inline UTF-8 branch is replaced by a call to write_unicode_to_string
 *     and the function is now marked IGUANA_INLINE instead of inline.
 *
 * xml_reader.hpp
 *   parse_value
 *     Targets selected by string_view_v are still built from the raw pointer
 *     and length of the input range, so no unescaping happens for them.
 *     Other string containers are filled by parse_escape_xml.
 *
 * xml_util.hpp
 *   is_match(it, end)
 *     Returns false when fewer than sizeof...(C) characters remain or when
 *     any character differs from the pack C; it advances its iterator
 *     argument while comparing (the callers below pass the temporary it + 2).
 *   parse_escape_xml(value, it, end)
 *     Appends to value; it never clears value first.
 *     Handles the five named entities (amp, apos, quot, gt, lt) and both
 *     hexadecimal and decimal numeric references. Any other ampersand is
 *     copied through unchanged, which is the loose policy named in the
 *     source comment.
 *     NOTE(review): *(it + 1) and *(it + 2) are read without a check against
 *     end. Both digit loops stop only on a non-digit and never test end.
 *     The decimal branch uses the same lookup table as the hex branch, so
 *     A-F and a-f are accepted there as digits 10..15. codepoint is not
 *     range-checked before encode_utf8. The match helper called after the
 *     digits is defined elsewhere.
 *
 * xml_writer.hpp
 *   render_string_with_escape_xml(it, length, ss)
 *     Bytes >= 0x80 are handed to write_unicode_to_string, which takes it by
 *     reference and presumably advances it, hence the continue that skips
 *     the trailing ++it. The push_back placed after that continue is
 *     unreachable. Apostrophe, double quote, ampersand, > and < are replaced
 *     through ss.append; every other byte is copied as is.
 *   render_value
 *     String containers are now written through the escaping function.
 *
 * test_xml.cpp
 *   Round trip: parse the document, validate, serialise, parse the output
 *   again and validate once more.
 */ diff --git a/iguana/util.hpp b/iguana/util.hpp index 5d622712..f4ecd675 100644 --- a/iguana/util.hpp +++ b/iguana/util.hpp @@ -215,10 +215,59 @@ inline constexpr auto has_qoute = [](uint64_t chunk) IGUANA__INLINE_LAMBDA { 0b0010001000100010001000100010001000100010001000100010001000100010); }; +template +IGUANA_INLINE void write_unicode_to_string(Ch& it, Stream& ss) { + static const char hexDigits[16] = {'0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}; + unsigned codepoint = 0; + if (!decode_utf8(it, codepoint)) + IGUANA_UNLIKELY { throw std::runtime_error("illegal unicode character"); } + if constexpr (is_xml_serialization) { + ss.append("&#x"); + } + else { + ss.push_back('\\'); + ss.push_back('u'); + } + + if (codepoint <= 0xD7FF || (codepoint >= 0xE000 && codepoint <= 0xFFFF)) { + ss.push_back(hexDigits[(codepoint >> 12) & 15]); + ss.push_back(hexDigits[(codepoint >> 8) & 15]); + ss.push_back(hexDigits[(codepoint >> 4) & 15]); + ss.push_back(hexDigits[(codepoint)&15]); + } + else { + if (codepoint < 0x010000 || codepoint > 0x10FFFF) + IGUANA_UNLIKELY { throw std::runtime_error("illegal codepoint"); } + // Surrogate pair + unsigned s = codepoint - 0x010000; + unsigned lead = (s >> 10) + 0xD800; + unsigned trail = (s & 0x3FF) + 0xDC00; + ss.push_back(hexDigits[(lead >> 12) & 15]); + ss.push_back(hexDigits[(lead >> 8) & 15]); + ss.push_back(hexDigits[(lead >> 4) & 15]); + ss.push_back(hexDigits[(lead)&15]); + if constexpr (is_xml_serialization) { + 
ss.append(";&#x"); + } + else { + ss.push_back('\\'); + ss.push_back('u'); + } + ss.push_back(hexDigits[(trail >> 12) & 15]); + ss.push_back(hexDigits[(trail >> 8) & 15]); + ss.push_back(hexDigits[(trail >> 4) & 15]); + ss.push_back(hexDigits[(trail)&15]); + } + if constexpr (is_xml_serialization) { + ss.push_back(';'); + } +} + // https://github.com/Tencent/rapidjson/blob/master/include/rapidjson/writer.h template -inline void write_string_with_escape(const Ch* it, SizeType length, - Stream& ss) { +IGUANA_INLINE void write_string_with_escape(const Ch* it, SizeType length, + Stream& ss) { static const char hexDigits[16] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}; static const char escape[256] = { @@ -241,40 +290,7 @@ inline void write_string_with_escape(const Ch* it, SizeType length, std::advance(end, length); while (it < end) { if (static_cast(*it) >= 0x80) - IGUANA_UNLIKELY { - unsigned codepoint = 0; - if (!decode_utf8(it, codepoint)) - IGUANA_UNLIKELY { - throw std::runtime_error("illegal unicode character"); - } - ss.push_back('\\'); - ss.push_back('u'); - if (codepoint <= 0xD7FF || - (codepoint >= 0xE000 && codepoint <= 0xFFFF)) { - ss.push_back(hexDigits[(codepoint >> 12) & 15]); - ss.push_back(hexDigits[(codepoint >> 8) & 15]); - ss.push_back(hexDigits[(codepoint >> 4) & 15]); - ss.push_back(hexDigits[(codepoint)&15]); - } - else { - if (codepoint < 0x010000 || codepoint > 0x10FFFF) - IGUANA_UNLIKELY { throw std::runtime_error("illegal codepoint"); } - // Surrogate pair - unsigned s = codepoint - 0x010000; - unsigned lead = (s >> 10) + 0xD800; - unsigned trail = (s & 0x3FF) + 0xDC00; - ss.push_back(hexDigits[(lead >> 12) & 15]); - ss.push_back(hexDigits[(lead >> 8) & 15]); - ss.push_back(hexDigits[(lead >> 4) & 15]); - ss.push_back(hexDigits[(lead)&15]); - ss.push_back('\\'); - ss.push_back('u'); - ss.push_back(hexDigits[(trail >> 12) & 15]); - ss.push_back(hexDigits[(trail >> 8) & 15]); - 
ss.push_back(hexDigits[(trail >> 4) & 15]); - ss.push_back(hexDigits[(trail)&15]); - } - } + IGUANA_UNLIKELY { write_unicode_to_string(it, ss); } else if (escape[static_cast(*it)]) IGUANA_UNLIKELY { ss.push_back('\\'); diff --git a/iguana/xml_reader.hpp b/iguana/xml_reader.hpp index 829a2705..6da30fab 100644 --- a/iguana/xml_reader.hpp +++ b/iguana/xml_reader.hpp @@ -28,7 +28,12 @@ template , int> = 0> IGUANA_INLINE void parse_value(U &&value, It &&begin, It &&end) { using T = std::decay_t; if constexpr (string_container_v) { - value = T(&*begin, static_cast(std::distance(begin, end))); + if constexpr (string_view_v) { + value = T(&*begin, static_cast(std::distance(begin, end))); + } + else { + parse_escape_xml(value, begin, end); + } } else if constexpr (num_v) { auto size = std::distance(begin, end); diff --git a/iguana/xml_util.hpp b/iguana/xml_util.hpp index cd2df709..fb1dff6a 100644 --- a/iguana/xml_util.hpp +++ b/iguana/xml_util.hpp @@ -186,4 +186,115 @@ IGUANA_INLINE auto skip_pass(It &&it, It &&end) { return res + 1; } +template +IGUANA_INLINE bool is_match(It &&it, const It &end) { + const auto n = static_cast(std::distance(it, end)); + if ((n < sizeof...(C)) || (... 
|| (*it++ != C))) { + return false; + } + return true; +} + +// loose policy: allow '&' +template , int> = 0> +IGUANA_INLINE void parse_escape_xml(U &value, It &&it, It &&end) { + static const unsigned char lookup_digits[256] = { + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, + 255, 255, 255, 255, 255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 10, 11, 12, 13, 14, 15, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255}; + while (it < end) { + if (*it == '&') + IGUANA_UNLIKELY { + switch (*(it + 1)) { + // & ' + case 'a': + if (is_match<'m', 'p', ';'>(it + 2, end)) { + value.push_back('&'); + it += 5; + continue; + } + if (is_match<'p', 'o', 's', ';'>(it + 2, end)) { + value.push_back('\''); + it += 6; + continue; + } + break; + // " + case 'q': + if (is_match<'u', 'o', 't', ';'>(it + 2, end)) { + value.push_back('\"'); + it += 6; + continue; + } + break; + // > + case 'g': + if (is_match<'t', 
';'>(it + 2, end)) { + value.push_back('>'); + it += 4; + continue; + } + break; + // < + case 'l': + if (is_match<'t', ';'>(it + 2, end)) { + value.push_back('<'); + it += 4; + continue; + } + break; + case '#': + if (*(it + 2) == 'x') { + // &#x + unsigned long codepoint = 0; + it += 3; + while (true) { + auto digit = lookup_digits[static_cast(*it)]; + if (digit == 0xFF) + break; + codepoint = codepoint * 16 + digit; + ++it; + } + encode_utf8(value, codepoint); + } + else { + unsigned long codepoint = 0; + it += 2; + while (true) { + auto digit = lookup_digits[static_cast(*it)]; + if (digit == 0xFF) + break; + codepoint = codepoint * 10 + digit; + ++it; + } + encode_utf8(value, codepoint); + } + match<';'>(it, end); + continue; + default: + break; + } + value.push_back(*(it++)); + } + else { + value.push_back(*(it++)); + } + } +} + } // namespace iguana diff --git a/iguana/xml_writer.hpp b/iguana/xml_writer.hpp index b83a2c8b..21d2ed41 100644 --- a/iguana/xml_writer.hpp +++ b/iguana/xml_writer.hpp @@ -6,6 +6,35 @@ namespace iguana { +template +IGUANA_INLINE void render_string_with_escape_xml(const Ch *it, SizeType length, + Stream &ss) { + auto end = it; + std::advance(end, length); + while (it < end) { + if (static_cast(*it) >= 0x80) + IGUANA_UNLIKELY { + write_unicode_to_string(it, ss); + continue; + ss.push_back(*it); + } + else if (*it == '\'') + IGUANA_UNLIKELY { ss.append("'"); } + else if (*it == '"') + IGUANA_UNLIKELY { ss.append("""); } + else if (*it == '&') + IGUANA_UNLIKELY { ss.append("&"); } + else if (*it == '>') + IGUANA_UNLIKELY { ss.append(">"); } + else if (*it == '<') + IGUANA_UNLIKELY { ss.append("<"); } + else { + ss.push_back(*it); + } + ++it; + } +} + template , int> = 0> IGUANA_INLINE void render_xml_value(Stream &ss, const T &value, @@ -42,7 +71,7 @@ IGUANA_INLINE void render_head(Stream &ss, std::string_view str) { template , int> = 0> IGUANA_INLINE void render_value(Stream &ss, const T &value) { if constexpr (string_container_v) { - 
ss.append(value.data(), value.size()); + render_string_with_escape_xml(value.data(), value.size(), ss); } else if constexpr (num_v) { char temp[65]; diff --git a/test/test_xml.cpp b/test/test_xml.cpp index a7796bc9..f6697088 100644 --- a/test/test_xml.cpp +++ b/test/test_xml.cpp @@ -18,6 +18,35 @@ struct Owner_t { } }; REFLECTION(Owner_t, ID, DisplayName); +TEST_CASE("test escape") { + { + std::string str = R"( + + '&"<> + 小强 + + )"; + using Owner_attr_t = + iguana::xml_attr_t>; + auto validator = [](const Owner_attr_t &Owner) { + auto Ow = Owner.value(); + auto attr = Owner.attr(); + CHECK(attr["description"] == "<小强>"); + CHECK(Ow.ID == R"('&"<>)"); + CHECK(Ow.DisplayName == "小强"); + }; + Owner_attr_t Owner; + iguana::from_xml(Owner, str); + validator(Owner); + + std::string ss; + iguana::to_xml(Owner, ss); + std::cout << ss << std::endl; + Owner_attr_t Owner1; + iguana::from_xml(Owner1, ss); + validator(Owner1); + } +} struct Contents { std::string Key;