Skip to content

Commit

Permalink
Merge pull request #260 from bbbgan/xml_escape
Browse files Browse the repository at this point in the history
Xml escape
  • Loading branch information
bbbgan authored Apr 29, 2024
2 parents fb6e6d5 + ce845aa commit d38797a
Show file tree
Hide file tree
Showing 7 changed files with 408 additions and 48 deletions.
35 changes: 35 additions & 0 deletions example/xml_example.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -280,12 +280,47 @@ void province_example() {
std::cout << ss1;
}

// Example payload used by escape_example(): an <ID> child that carries both
// text and attributes, and a plain-text <DisplayName> child.
struct text_t {
// A std::string value paired with an attribute map (attribute name -> attribute
// text); escape_example() reads them back through .value() and .attr().
using escape_attr_t =
iguana::xml_attr_t<std::string, std::map<std::string_view, std::string>>;
// Text content and attributes of the <ID> element.
escape_attr_t ID;
// Text content of the <DisplayName> element.
std::string DisplayName;
};
// NOTE(review): REFLECTION is defined elsewhere; presumably it registers the
// listed members so from_xml/to_xml can map them to child elements - confirm.
REFLECTION(text_t, ID, DisplayName);
void escape_example() {
{
std::string str = R"(
<text_t description="&quot;&lt;'&#x5c0f;&#24378;'&gt;&quot;">
<ID ID'msg='{"msg&apos;reply": "it&apos;s ok"}'>&amp;&lt;&gt;</ID>
<DisplayName>&#x5c0f;&#24378;</DisplayName>
</text_t>
)";
using text_attr_t =
iguana::xml_attr_t<text_t, std::map<std::string_view, std::string>>;
auto validator = [](const text_attr_t& text) {
auto v = text.value();
auto attr = text.attr();
assert(attr["description"] == R"("<'小强'>")");
assert(v.ID.value() == R"(&<>)");
assert(v.ID.attr()["ID'msg"] == R"({"msg'reply": "it's ok"})");
assert(v.DisplayName == "小强");
};
text_attr_t text;
iguana::from_xml(text, str);
validator(text);
std::string ss;
iguana::to_xml<true>(text, ss);
std::cout << ss << std::endl;
}
}

// Runs every XML example in sequence; the escape example is the newest entry.
int main() {
  some_type_example();
  lib_example();
  package_example();
  derived_object();
  cdata_example();
  province_example();
  escape_example();
  return 0;
}
88 changes: 52 additions & 36 deletions iguana/util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -215,10 +215,59 @@ inline constexpr auto has_qoute = [](uint64_t chunk) IGUANA__INLINE_LAMBDA {
0b0010001000100010001000100010001000100010001000100010001000100010);
};

// Decodes one UTF-8 encoded code point starting at `it` and appends its escaped
// form to `ss`.
//
//  * JSON mode (default): "\uXXXX" for the Basic Multilingual Plane, or a
//    UTF-16 surrogate pair "\uXXXX\uXXXX" for U+10000..U+10FFFF.
//  * XML mode (is_xml_serialization == true): one numeric character reference
//    "&#xH...;" holding the code point itself. Surrogate halves are never
//    written because they are not legal XML characters, so "&#xD83D;&#xDE00;"
//    would be ill-formed; "&#x1F600;" is emitted instead.
//
// `it` is handed to decode_utf8 (defined elsewhere) by reference; presumably it
// is advanced past the consumed bytes - confirm against decode_utf8.
//
// Throws std::runtime_error when decode_utf8 reports failure or when the
// decoded value is a surrogate or lies above U+10FFFF. Validation happens
// before anything is appended, so `ss` is untouched when an exception is thrown.
template <bool is_xml_serialization = false, typename Stream, typename Ch>
IGUANA_INLINE void write_unicode_to_string(Ch& it, Stream& ss) {
  static const char hexDigits[16] = {'0', '1', '2', '3', '4', '5', '6', '7',
                                     '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};
  unsigned codepoint = 0;
  if (!decode_utf8(it, codepoint))
    IGUANA_UNLIKELY { throw std::runtime_error("illegal unicode character"); }

  // BMP scalar values, i.e. everything up to U+FFFF except the surrogate range.
  const bool is_bmp =
      codepoint <= 0xD7FF || (codepoint >= 0xE000 && codepoint <= 0xFFFF);
  if (!is_bmp && (codepoint < 0x010000 || codepoint > 0x10FFFF))
    IGUANA_UNLIKELY { throw std::runtime_error("illegal codepoint"); }

  // Appends the low 16 bits of `v` as four upper-case hex digits.
  const auto put_hex4 = [&ss](unsigned v) {
    ss.push_back(hexDigits[(v >> 12) & 15]);
    ss.push_back(hexDigits[(v >> 8) & 15]);
    ss.push_back(hexDigits[(v >> 4) & 15]);
    ss.push_back(hexDigits[v & 15]);
  };

  if constexpr (is_xml_serialization) {
    ss.append("&#x");
    if (!is_bmp) {
      // Supplementary planes need five or six hex digits; emit the leading
      // one or two here and let put_hex4 write the remaining four.
      if (codepoint > 0xFFFFF) {
        ss.push_back(hexDigits[(codepoint >> 20) & 15]);
      }
      ss.push_back(hexDigits[(codepoint >> 16) & 15]);
    }
    put_hex4(codepoint);
    ss.push_back(';');
  }
  else {
    ss.push_back('\\');
    ss.push_back('u');
    if (is_bmp) {
      put_hex4(codepoint);
    }
    else {
      // Surrogate pair
      const unsigned s = codepoint - 0x010000;
      const unsigned lead = (s >> 10) + 0xD800;
      const unsigned trail = (s & 0x3FF) + 0xDC00;
      put_hex4(lead);
      ss.push_back('\\');
      ss.push_back('u');
      put_hex4(trail);
    }
  }
}

// https://github.com/Tencent/rapidjson/blob/master/include/rapidjson/writer.h
template <typename Ch, typename SizeType, typename Stream>
inline void write_string_with_escape(const Ch* it, SizeType length,
Stream& ss) {
IGUANA_INLINE void write_string_with_escape(const Ch* it, SizeType length,
Stream& ss) {
static const char hexDigits[16] = {'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};
static const char escape[256] = {
Expand All @@ -241,40 +290,7 @@ inline void write_string_with_escape(const Ch* it, SizeType length,
std::advance(end, length);
while (it < end) {
if (static_cast<unsigned>(*it) >= 0x80)
IGUANA_UNLIKELY {
unsigned codepoint = 0;
if (!decode_utf8(it, codepoint))
IGUANA_UNLIKELY {
throw std::runtime_error("illegal unicode character");
}
ss.push_back('\\');
ss.push_back('u');
if (codepoint <= 0xD7FF ||
(codepoint >= 0xE000 && codepoint <= 0xFFFF)) {
ss.push_back(hexDigits[(codepoint >> 12) & 15]);
ss.push_back(hexDigits[(codepoint >> 8) & 15]);
ss.push_back(hexDigits[(codepoint >> 4) & 15]);
ss.push_back(hexDigits[(codepoint)&15]);
}
else {
if (codepoint < 0x010000 || codepoint > 0x10FFFF)
IGUANA_UNLIKELY { throw std::runtime_error("illegal codepoint"); }
// Surrogate pair
unsigned s = codepoint - 0x010000;
unsigned lead = (s >> 10) + 0xD800;
unsigned trail = (s & 0x3FF) + 0xDC00;
ss.push_back(hexDigits[(lead >> 12) & 15]);
ss.push_back(hexDigits[(lead >> 8) & 15]);
ss.push_back(hexDigits[(lead >> 4) & 15]);
ss.push_back(hexDigits[(lead)&15]);
ss.push_back('\\');
ss.push_back('u');
ss.push_back(hexDigits[(trail >> 12) & 15]);
ss.push_back(hexDigits[(trail >> 8) & 15]);
ss.push_back(hexDigits[(trail >> 4) & 15]);
ss.push_back(hexDigits[(trail)&15]);
}
}
IGUANA_UNLIKELY { write_unicode_to_string(it, ss); }
else if (escape[static_cast<unsigned char>(*it)])
IGUANA_UNLIKELY {
ss.push_back('\\');
Expand Down
32 changes: 28 additions & 4 deletions iguana/xml_reader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,21 @@ template <typename U, typename It, std::enable_if_t<plain_v<U>, int> = 0>
IGUANA_INLINE void parse_value(U &&value, It &&begin, It &&end) {
using T = std::decay_t<U>;
if constexpr (string_container_v<T>) {
value = T(&*begin, static_cast<size_t>(std::distance(begin, end)));
if constexpr (string_view_v<T>) {
value = T(&*begin, static_cast<size_t>(std::distance(begin, end)));
}
else {
// TODO: When the value being parsed is not an attribute value, it is not
// necessary to unescape ' and ".
value.clear();
auto pre = begin;
while (advance_until_character<'&'>(begin, end)) {
value.append(T(&*pre, static_cast<size_t>(std::distance(pre, begin))));
parse_escape_xml(value, begin, end);
pre = begin;
}
value.append(T(&*pre, static_cast<size_t>(std::distance(pre, begin))));
}
}
else if constexpr (num_v<T>) {
auto size = std::distance(begin, end);
Expand Down Expand Up @@ -87,9 +101,19 @@ IGUANA_INLINE void parse_attr(U &&value, It &&it, It &&end) {
parse_value(key, key_begin, key_end);

skip_sapces_and_newline(it, end);
match<'"'>(it, end);
auto value_begin = it;
auto value_end = skip_pass<'"'>(it, end);
auto value_begin = it + 1;
auto value_end = value_begin;
if (*it == '"')
IGUANA_LIKELY {
++it;
value_end = skip_pass<'"'>(it, end);
}
else if (*it == '\'') {
++it;
value_end = skip_pass<'\''>(it, end);
}
else
IGUANA_UNLIKELY { throw std::runtime_error("expected quote or apos"); }
value_type v;
parse_value(v, value_begin, value_end);
value.emplace(std::move(key), std::move(v));
Expand Down
143 changes: 143 additions & 0 deletions iguana/xml_util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,24 @@ inline constexpr auto has_square_bracket =
0b0101110101011101010111010101110101011101010111010101110101011101);
};

// Byte-parallel probe for '&': XORs `chunk` with 0x26 ('&') replicated into all
// eight bytes, so every byte equal to '&' becomes zero, then forwards the
// result to has_zero (defined elsewhere).
inline constexpr auto has_and = [](uint64_t chunk) IGUANA__INLINE_LAMBDA {
  constexpr uint64_t ampersand_lanes = 0x2626262626262626;
  return has_zero(chunk ^ ampersand_lanes);
};

// Byte-parallel probe for '=': XORs `chunk` with 0x3D ('=') replicated into all
// eight bytes, so every byte equal to '=' becomes zero, then forwards the
// result to has_zero (defined elsewhere).
inline constexpr auto has_equal = [](uint64_t chunk) IGUANA__INLINE_LAMBDA {
  constexpr uint64_t equals_lanes = 0x3D3D3D3D3D3D3D3D;
  return has_zero(chunk ^ equals_lanes);
};

// Byte-parallel probe for '\'': XORs `chunk` with 0x27 (apostrophe) replicated
// into all eight bytes, so every byte equal to '\'' becomes zero, then forwards
// the result to has_zero (defined elsewhere).
inline constexpr auto has_apos = [](uint64_t chunk) IGUANA__INLINE_LAMBDA {
  constexpr uint64_t apostrophe_lanes = 0x2727272727272727;
  return has_zero(chunk ^ apostrophe_lanes);
};

template <typename It>
IGUANA_INLINE void skip_sapces_and_newline(It &&it, It &&end) {
while (it != end && (static_cast<uint8_t>(*it) < 33)) {
Expand All @@ -104,6 +116,35 @@ IGUANA_INLINE void match_close_tag(It &&it, It &&end, std::string_view key) {
// ++it;
}

// Moves `it` forward to the first occurrence of `c` in [it, end).
// Returns true with `it` positioned on that character, or false with
// `it == end` when the character does not occur. Only c == '&' is supported;
// any other character is rejected at compile time.
template <char c, typename It>
IGUANA_INLINE bool advance_until_character(It &&it, It &&end) {
  static_assert(contiguous_iterator<std::decay_t<It>>);
  if (std::distance(it, end) >= 7)
    IGUANA_LIKELY {
      // Scan eight bytes per step while a whole 8-byte word is still in range.
      const auto last_word_start = end - 7;
      while (it < last_word_start) {
        const auto word = *reinterpret_cast<const uint64_t *>(&*it);
        uint64_t hit;
        if constexpr (c == '&')
          hit = has_and(word);
        else
          static_assert(!c, "not support this character");
        if (hit != 0) {
          // Trailing zero bits / 8 is the offset of the first matching byte
          // (this relies on the lowest-addressed byte being least significant).
          it += (countr_zero(hit) >> 3);
          return true;
        }
        it += 8;
      }
    }
  // Fewer than eight bytes left: finish one character at a time.
  for (; it < end; ++it) {
    if (*it == c)
      return true;
  }
  return false;
}

template <char c, typename It>
IGUANA_INLINE void skip_till(It &&it, It &&end) {
static_assert(contiguous_iterator<std::decay_t<It>>);
Expand All @@ -126,6 +167,8 @@ IGUANA_INLINE void skip_till(It &&it, It &&end) {
test = has_square_bracket(chunk);
else if constexpr (c == '=')
test = has_equal(chunk);
else if constexpr (c == '\'')
test = has_apos(chunk);
else
static_assert(!c, "not support this character");
if (test != 0) {
Expand Down Expand Up @@ -186,4 +229,104 @@ IGUANA_INLINE auto skip_pass(It &&it, It &&end) {
return res + 1;
}

// Reports whether the characters starting at `it` are exactly the pack C..., in
// order. Returns false when fewer than sizeof...(C) characters remain before
// `end`. `it` is advanced while comparing (one step past the first mismatch, or
// past the whole pack on success); callers in this file pass a temporary such
// as `it + 2`, so their own iterator is unaffected.
template <char... C, typename It>
IGUANA_INLINE bool is_match(It &&it, const It &end) {
  const auto available = static_cast<size_t>(std::distance(it, end));
  if (available < sizeof...(C)) {
    return false;
  }
  // Left fold: compares left to right and stops at the first mismatch.
  return (... && (*it++ == C));
}

// Decodes the XML reference that starts at `it` (which must point at '&' and
// must not equal `end`) and appends the decoded text to `value`.
//
// Recognised forms:
//   &amp; &apos; &quot; &gt; &lt;   -> the corresponding single character
//   &#xHHHH;  (hexadecimal)         -> the code point, via encode_utf8
//   &#DDDD;   (decimal)             -> the code point, via encode_utf8
// Anything else follows the loose policy: the '&' is copied through literally
// and `it` moves one character forward.
//
// `it` ALWAYS advances by at least one character, so a caller that loops on
// "find next '&', then call parse_escape_xml" is guaranteed to terminate even
// for input such as "&abc" or "&gt" without a semicolon.
//
// Throws std::runtime_error for a numeric reference without digits or with a
// value above U+10FFFF. A numeric reference that is not closed by ';' is
// rejected through match<';'> (defined elsewhere; presumably it throws -
// confirm). `value` is only modified after the reference has been validated.
template <typename U, typename It, std::enable_if_t<string_v<U>, int> = 0>
IGUANA_INLINE void parse_escape_xml(U &value, It &&it, It &&end) {
  // Maps a character to its digit value ('0'-'9' -> 0..9, 'A'-'F'/'a'-'f' ->
  // 10..15); every other character maps to 255.
  static const unsigned char lookup_digits[256] = {
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   255, 255,
      255, 255, 255, 255, 255, 10,  11,  12,  13,  14,  15,  255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 10,  11,  12,  13,  14,  15,  255, 255,
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
      255};

  // Characters available from the '&' onwards; guards every look-ahead below.
  const auto remaining = std::distance(it, end);
  if (remaining >= 2) {
    switch (*(it + 1)) {
      // &amp; &apos;
      case 'a':
        if (is_match<'m', 'p', ';'>(it + 2, end)) {
          value.push_back('&');
          it += 5;
          return;
        }
        if (is_match<'p', 'o', 's', ';'>(it + 2, end)) {
          value.push_back('\'');
          it += 6;
          return;
        }
        break;
      // &quot;
      case 'q':
        if (is_match<'u', 'o', 't', ';'>(it + 2, end)) {
          value.push_back('\"');
          it += 6;
          return;
        }
        break;
      // &gt;
      case 'g':
        if (is_match<'t', ';'>(it + 2, end)) {
          value.push_back('>');
          it += 4;
          return;
        }
        break;
      // &lt;
      case 'l':
        if (is_match<'t', ';'>(it + 2, end)) {
          value.push_back('<');
          it += 4;
          return;
        }
        break;
      // &#x...; (hexadecimal) or &#...; (decimal)
      case '#': {
        const bool is_hex = remaining >= 3 && *(it + 2) == 'x';
        const unsigned base = is_hex ? 16u : 10u;
        it += is_hex ? 3 : 2;
        unsigned long codepoint = 0;
        bool has_digit = false;
        while (it != end) {
          const unsigned digit = lookup_digits[static_cast<unsigned char>(*it)];
          // Stops on a non-digit (255) and, for decimal references, on the hex
          // letters that the shared table maps to 10..15.
          if (digit >= base)
            break;
          codepoint = codepoint * base + digit;
          // Checking on every step also keeps the accumulator from overflowing.
          if (codepoint > 0x10FFFF)
            IGUANA_UNLIKELY { throw std::runtime_error("illegal codepoint"); }
          has_digit = true;
          ++it;
        }
        if (!has_digit)
          IGUANA_UNLIKELY {
            throw std::runtime_error("illegal character reference");
          }
        // Require the terminating ';' before touching `value`.
        match<';'>(it, end);
        encode_utf8(value, codepoint);
        return;
      }
      default:
        break;
    }
  }
  // loose policy: allow a bare '&' (or an unrecognised entity name) by copying
  // the '&' itself and stepping past it.
  value.push_back(*(it++));
}

} // namespace iguana
Loading

0 comments on commit d38797a

Please sign in to comment.