From 43055e2379b4796d146a3825df5b314681041a2b Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Fri, 18 Jun 2021 20:34:31 +0100 Subject: [PATCH] [Project] Html: Allow decode entities function to normalise spaces + unit tests --- src/libserver/html/html_entities.cxx | 90 ++++++++++++++++++++++++---- src/libserver/html/html_entities.hxx | 2 +- 2 files changed, 79 insertions(+), 13 deletions(-) diff --git a/src/libserver/html/html_entities.cxx b/src/libserver/html/html_entities.cxx index 9be8c67aa..554730fc8 100644 --- a/src/libserver/html/html_entities.cxx +++ b/src/libserver/html/html_entities.cxx @@ -18,10 +18,15 @@ #include "html_entities.hxx" #include +#include +#include #include #include #include "libutil/cxx/util.hxx" +#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL +#include "doctest/doctest.h" + namespace rspamd::html { struct html_entity_def { @@ -2196,9 +2201,14 @@ public: static const html_entities_storage html_entities_defs; std::size_t -decode_html_entitles_inplace(char *s, std::size_t len) +decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces) { long l, rep_len; + /* + * t - tortoise (destination ptr) + * h - hare (source ptr) + * e - begin of entity + */ char *t = s, *h = s, *e = s, *end_ptr, old_c; const gchar *end; const gchar *entity; @@ -2208,7 +2218,12 @@ decode_html_entitles_inplace(char *s, std::size_t len) do_digits_only, do_mixed, } seen_digit_only; - int state = 0, base; + enum class parser_state { + normal_content, + ampersand, + skip_multi_spaces, + } state = parser_state::normal_content; + int base; UChar32 uc; if (len == 0) { @@ -2222,10 +2237,9 @@ decode_html_entitles_inplace(char *s, std::size_t len) while (h - s < l && t <= h) { switch (state) { - /* Out of entity */ - case 0: + case parser_state::normal_content: if (*h == '&') { - state = 1; + state = parser_state::ampersand; seen_hash = false; seen_hex = false; seen_digit_only = do_undefined; @@ -2234,12 +2248,17 @@ decode_html_entitles_inplace(char *s, std::size_t len) continue; } else { - *t = *h; - h++; - t++; + if (norm_spaces && g_ascii_isspace(*h)) { + *t++ = ' '; + state = parser_state::skip_multi_spaces; + h++; + } + else { + *t++ = *h++; + } } break; - case 1: + case parser_state::ampersand: if (*h == ';' && h > e) { decode_entity: old_c = *h; @@ -2340,11 +2359,11 @@ decode_entity: } } - state = 0; + state = parser_state::normal_content; } else if (*h == '&') { /* Previous `&` was bogus */ - state = 1; + state = parser_state::ampersand; if (end - t > h - e) { memmove(t, e, h - e); @@ -2378,12 +2397,20 @@ decode_entity: h++; + break; + case parser_state::skip_multi_spaces: + if (g_ascii_isspace(*h)) { + h ++; + } + else { + state = parser_state::normal_content; + } break; } } /* Leftover */ - if (state == 1 && h > e) { + if (state == parser_state::ampersand && h > e) { /* Unfinished entity, copy as is */ if (end - t >= h - e) { memmove(t, e, h - e); @@ -2394,4 +2421,43 @@ decode_entity: return (t - s); } +TEST_SUITE("html") { + + TEST_CASE("html entities") { + std::vector> cases{ + {"", ""}, + {"abc", "abc"}, + {"abc def", "abc def"}, + {"abc def", "abc def"}, + {"abc\ndef", "abc def"}, + {"abc\n \tdef", "abc def"}, + {" abc def ", " abc def "}, + {"FOO>BAR", "FOO>BAR"}, + {"FOO>BAR", "FOO>BAR"}, + {"FOO> BAR", "FOO>BAR"}, + {"FOO>;;BAR", "FOO>;;BAR"}, + {"I'm ¬it; ", "I'm ¬it; "}, + {"I'm ∉ ", "I'm ∉ "}, + {"FOO& BAR", "FOO& BAR"}, + {"FOO&&&>BAR", "FOO&&&>BAR"}, + {"FOO)BAR", "FOO)BAR"}, + {"FOOABAR", "FOOABAR"}, + {"FOOABAR", "FOOABAR"}, + {"FOO&#BAR", "FOO&#BAR"}, + {"FOO&#ZOO", "FOO&#ZOO"}, + {"FOOºR", "FOOºR"}, + {"FOO䆺R", "FOO䆺R"}, + {"FOO�ZOO", "FOO�ZOO"}, + }; + + for (const auto &c : cases) { + auto *cpy = new char[c.first.size()]; + memcpy(cpy, c.first.data(), c.first.size()); + auto nlen = decode_html_entitles_inplace(cpy, c.first.size(), true); + CHECK(std::string{cpy,nlen} == c.second); + delete[] cpy; + } + } +} + } // namespace rspamd::html \ No newline at end of file diff --git a/src/libserver/html/html_entities.hxx b/src/libserver/html/html_entities.hxx index 9e48c20a0..d59674906 100644 --- a/src/libserver/html/html_entities.hxx +++ b/src/libserver/html/html_entities.hxx @@ -22,7 +22,7 @@ namespace rspamd::html { -std::size_t decode_html_entitles_inplace (char *s, std::size_t len); +std::size_t decode_html_entitles_inplace (char *s, std::size_t len, bool norm_spaces = false); } -- 2.39.5