Skip to content

Commit

Permalink
add fast path to parse escape[xml]
Browse files Browse the repository at this point in the history
  • Loading branch information
bbbgan committed Apr 28, 2024
1 parent 7cb6455 commit f665e9f
Show file tree
Hide file tree
Showing 4 changed files with 117 additions and 87 deletions.
9 changes: 8 additions & 1 deletion iguana/xml_reader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,14 @@ IGUANA_INLINE void parse_value(U &&value, It &&begin, It &&end) {
value = T(&*begin, static_cast<size_t>(std::distance(begin, end)));
}
else {
parse_escape_xml(value, begin, end);
value.clear();
auto pre = begin;
while (advance_until_character<'&'>(begin, end)) {
value.append(T(&*pre, static_cast<size_t>(std::distance(pre, begin))));
parse_escape_xml(value, begin, end);
pre = begin;
}
value.append(T(&*pre, static_cast<size_t>(std::distance(pre, begin))));
}
}
else if constexpr (num_v<T>) {
Expand Down
169 changes: 96 additions & 73 deletions iguana/xml_util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,12 @@ inline constexpr auto has_square_bracket =
0b0101110101011101010111010101110101011101010111010101110101011101);
};

inline constexpr auto has_and = [](uint64_t chunk) IGUANA__INLINE_LAMBDA {
return has_zero(
chunk ^
0b0010011000100110001001100010011000100110001001100010011000100110);
};

inline constexpr auto has_equal = [](uint64_t chunk) IGUANA__INLINE_LAMBDA {
return has_zero(
chunk ^
Expand Down Expand Up @@ -104,6 +110,35 @@ IGUANA_INLINE void match_close_tag(It &&it, It &&end, std::string_view key) {
// ++it;
}

// returns true if the specified character 'c' is found, false otherwise.
template <char c, typename It>
IGUANA_INLINE bool advance_until_character(It &&it, It &&end) {
static_assert(contiguous_iterator<std::decay_t<It>>);
if (std::distance(it, end) >= 7)
IGUANA_LIKELY {
const auto end_m7 = end - 7;
for (; it < end_m7; it += 8) {
const auto chunk = *reinterpret_cast<const uint64_t *>(&*it);
uint64_t test;
if constexpr (c == '&')
test = has_and(chunk);
else
static_assert(!c, "not support this character");
if (test != 0) {
it += (countr_zero(test) >> 3);
return true;
}
}
}
// Tail end of buffer. Should be rare we even get here
while (it < end) {
if (*it == c)
return true;
++it;
}
return false;
}

template <char c, typename It>
IGUANA_INLINE void skip_till(It &&it, It &&end) {
static_assert(contiguous_iterator<std::decay_t<It>>);
Expand Down Expand Up @@ -195,7 +230,6 @@ IGUANA_INLINE bool is_match(It &&it, const It &end) {
return true;
}

// loose policy: allow '&'
template <typename U, typename It, std::enable_if_t<string_v<U>, int> = 0>
IGUANA_INLINE void parse_escape_xml(U &value, It &&it, It &&end) {
static const unsigned char lookup_digits[256] = {
Expand All @@ -217,83 +251,72 @@ IGUANA_INLINE void parse_escape_xml(U &value, It &&it, It &&end) {
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255};
while (it < end) {
if (*it == '&')
IGUANA_UNLIKELY {
switch (*(it + 1)) {
// &amp; &apos;
case 'a':
if (is_match<'m', 'p', ';'>(it + 2, end)) {
value.push_back('&');
it += 5;
continue;
}
if (is_match<'p', 'o', 's', ';'>(it + 2, end)) {
value.push_back('\'');
it += 6;
continue;
}
break;
// &quot;
case 'q':
if (is_match<'u', 'o', 't', ';'>(it + 2, end)) {
value.push_back('\"');
it += 6;
continue;
}
break;
// &gt;
case 'g':
if (is_match<'t', ';'>(it + 2, end)) {
value.push_back('>');
it += 4;
continue;
}
break;
// &lt;
case 'l':
if (is_match<'t', ';'>(it + 2, end)) {
value.push_back('<');
it += 4;
continue;
}
switch (*(it + 1)) {
// &amp; &apos;
case 'a':
if (is_match<'m', 'p', ';'>(it + 2, end)) {
value.push_back('&');
it += 5;
}
if (is_match<'p', 'o', 's', ';'>(it + 2, end)) {
value.push_back('\'');
it += 6;
}
break;
// &quot;
case 'q':
if (is_match<'u', 'o', 't', ';'>(it + 2, end)) {
value.push_back('\"');
it += 6;
}
break;
// &gt;
case 'g':
if (is_match<'t', ';'>(it + 2, end)) {
value.push_back('>');
it += 4;
}
break;
// &lt;
case 'l':
if (is_match<'t', ';'>(it + 2, end)) {
value.push_back('<');
it += 4;
}
break;
case '#':
if (*(it + 2) == 'x') {
// &#x
unsigned long codepoint = 0;
it += 3;
while (true) {
auto digit = lookup_digits[static_cast<unsigned char>(*it)];
if (digit == 0xFF)
break;
case '#':
if (*(it + 2) == 'x') {
// &#x
unsigned long codepoint = 0;
it += 3;
while (true) {
auto digit = lookup_digits[static_cast<unsigned char>(*it)];
if (digit == 0xFF)
break;
codepoint = codepoint * 16 + digit;
++it;
}
encode_utf8(value, codepoint);
}
else {
unsigned long codepoint = 0;
it += 2;
while (true) {
auto digit = lookup_digits[static_cast<unsigned char>(*it)];
if (digit == 0xFF)
break;
codepoint = codepoint * 10 + digit;
++it;
}
encode_utf8(value, codepoint);
}
match<';'>(it, end);
continue;
default:
codepoint = codepoint * 16 + digit;
++it;
}
encode_utf8(value, codepoint);
}
else {
unsigned long codepoint = 0;
it += 2;
while (true) {
auto digit = lookup_digits[static_cast<unsigned char>(*it)];
if (digit == 0xFF)
break;
codepoint = codepoint * 10 + digit;
++it;
}
value.push_back(*(it++));
encode_utf8(value, codepoint);
}
else {
match<';'>(it, end);
break;
default:
// skip '&'
// loose policy: allow '&'
value.push_back(*(it++));
}
break;
}
}

Expand Down
10 changes: 5 additions & 5 deletions iguana/xml_writer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@ IGUANA_INLINE void render_string_with_escape_xml(const Ch *it, SizeType length,
continue;
}
#endif
if (*it == '\'')
IGUANA_UNLIKELY { ss.append("&apos;"); }
else if (*it == '"')
IGUANA_UNLIKELY { ss.append("&quot;"); }
else if (*it == '&')
// if (*it == '\'')
// IGUANA_UNLIKELY { ss.append("&apos;"); }
// else if (*it == '"')
// IGUANA_UNLIKELY { ss.append("&quot;"); }
if (*it == '&')
IGUANA_UNLIKELY { ss.append("&amp;"); }
else if (*it == '>')
IGUANA_UNLIKELY { ss.append("&gt;"); }
Expand Down
16 changes: 8 additions & 8 deletions test/test_xml.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ TEST_CASE("test escape") {
{
std::string str = R"(
<Owner_t description="&lt;&#x5c0f;&#24378;&gt;">
<ID>&apos;&amp;&quot;&lt;&gt;</ID>
<ID>&amp;&lt;&gt;</ID>
<DisplayName>&#x5c0f;&#24378;</DisplayName>
</Owner_t>
)";
Expand All @@ -32,19 +32,19 @@ TEST_CASE("test escape") {
auto Ow = Owner.value();
auto attr = Owner.attr();
CHECK(attr["description"] == "<小强>");
CHECK(Ow.ID == R"('&"<>)");
CHECK(Ow.ID == R"(&<>)");
CHECK(Ow.DisplayName == "小强");
};
Owner_attr_t Owner;
iguana::from_xml(Owner, str);
validator(Owner);

std::string ss;
iguana::to_xml(Owner, ss);
std::cout << ss << std::endl;
Owner_attr_t Owner1;
iguana::from_xml(Owner1, ss);
validator(Owner1);
// std::string ss;
// iguana::to_xml(Owner, ss);
// std::cout << ss << std::endl;
// Owner_attr_t Owner1;
// iguana::from_xml(Owner1, ss);
// validator(Owner1);
}
}

Expand Down

0 comments on commit f665e9f

Please sign in to comment.