#include "html_entities.hxx"
#include <string>
+#include <utility>
+#include <vector>
#include <contrib/robin-hood/robin_hood.h>
#include <unicode/utf8.h>
#include "libutil/cxx/util.hxx"
+#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
+#include "doctest/doctest.h"
+
namespace rspamd::html {
struct html_entity_def {
static const html_entities_storage html_entities_defs;
/*
 * Decodes HTML entities in the buffer `s` (`len` bytes) in place.
 *
 * The buffer is scanned with a source pointer and rewritten through a
 * destination pointer over the same memory. When `norm_spaces` is true,
 * an ASCII whitespace character met in normal content is written as a
 * single ' ' and the whitespace characters that directly follow it are
 * dropped.
 *
 * Returns the distance between the destination pointer and `s` when the
 * end of the function is reached.
 *
 * NOTE(review): only fragments of the body are visible here (the
 * `len == 0` branch and most of the entity handling are elided), so the
 * comments below cover the visible statements only.
 */
std::size_t
-decode_html_entitles_inplace(char *s, std::size_t len)
+decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
{
long l, rep_len;
+ /*
+ * t - tortoise (destination ptr)
+ * h - hare (source ptr)
+ * e - begin of entity
+ */
char *t = s, *h = s, *e = s, *end_ptr, old_c;
const gchar *end;
const gchar *entity;
do_digits_only,
do_mixed,
} seen_digit_only;
- int state = 0, base;
+ enum class parser_state {
+ normal_content,
+ ampersand,
+ skip_multi_spaces,
+ } state = parser_state::normal_content;
+ int base;
UChar32 uc;
if (len == 0) {
while (h - s < l && t <= h) {
switch (state) {
- /* Out of entity */
- case 0:
+ case parser_state::normal_content:
if (*h == '&') {
- state = 1;
+ state = parser_state::ampersand;
seen_hash = false;
seen_hex = false;
seen_digit_only = do_undefined;
continue;
}
else {
- *t = *h;
- h++;
- t++;
/* Whitespace normalisation: write one ' ' and let skip_multi_spaces drop the rest of the run */
+ if (norm_spaces && g_ascii_isspace(*h)) {
+ *t++ = ' ';
+ state = parser_state::skip_multi_spaces;
+ h++;
+ }
+ else {
+ *t++ = *h++;
+ }
}
break;
- case 1:
+ case parser_state::ampersand:
if (*h == ';' && h > e) {
decode_entity:
old_c = *h;
}
}
- state = 0;
+ state = parser_state::normal_content;
}
else if (*h == '&') {
/* Previous `&` was bogus */
- state = 1;
+ state = parser_state::ampersand;
if (end - t > h - e) {
memmove(t, e, h - e);
h++;
+ break;
+ case parser_state::skip_multi_spaces:
/*
 * Whitespace that follows an already written ' ' is consumed without
 * output; any other character is handed back to normal_content with
 * `h` left unchanged.
 */
+ if (g_ascii_isspace(*h)) {
+ h ++;
+ }
+ else {
+ state = parser_state::normal_content;
+ }
break;
}
}
/* Leftover */
- if (state == 1 && h > e) {
+ if (state == parser_state::ampersand && h > e) {
/* Unfinished entity, copy as is */
if (end - t >= h - e) {
memmove(t, e, h - e);
return (t - s);
}
+TEST_SUITE("html") {
+
+	/*
+	 * Table-driven check of decode_html_entitles_inplace() with space
+	 * normalisation switched on. Each pair is {input, expected output}.
+	 *
+	 * NOTE(review): many pairs below have identical input and expected
+	 * text and contain no `&name;` / `&#NN;` sequence at all, so the entity
+	 * markup may have been lost from the inputs - confirm against the
+	 * original test data. The pair {"FOO> BAR", "FOO>BAR"} also expects the
+	 * space to disappear, whereas the normal-content branch of the decoder
+	 * writes a ' ' for whitespace - verify the intended expectation.
+	 */
+	TEST_CASE("html entities") {
+		const std::vector<std::pair<std::string, std::string>> cases{
+			{"", ""},
+			{"abc", "abc"},
+			{"abc def", "abc def"},
+			{"abc def", "abc def"},
+			{"abc\ndef", "abc def"},
+			{"abc\n \tdef", "abc def"},
+			{" abc def ", " abc def "},
+			{"FOO>BAR", "FOO>BAR"},
+			{"FOO>BAR", "FOO>BAR"},
+			{"FOO> BAR", "FOO>BAR"},
+			{"FOO>;;BAR", "FOO>;;BAR"},
+			{"I'm ¬it; ", "I'm ¬it; "},
+			{"I'm ∉ ", "I'm ∉ "},
+			{"FOO& BAR", "FOO& BAR"},
+			{"FOO&&&>BAR", "FOO&&&>BAR"},
+			{"FOO)BAR", "FOO)BAR"},
+			{"FOOABAR", "FOOABAR"},
+			{"FOOABAR", "FOOABAR"},
+			{"FOO&#BAR", "FOO&#BAR"},
+			{"FOO&#ZOO", "FOO&#ZOO"},
+			{"FOOºR", "FOOºR"},
+			{"FOO䆺R", "FOO䆺R"},
+			{"FOO�ZOO", "FOO�ZOO"},
+		};
+
+		for (const auto &[input, expected] : cases) {
+			/*
+			 * The decoder rewrites its argument, so every case runs on a
+			 * private scratch copy. The copy holds exactly input.size()
+			 * bytes with no terminator, and is released automatically when
+			 * the iteration ends.
+			 */
+			std::vector<char> scratch(input.begin(), input.end());
+			const auto nlen = decode_html_entitles_inplace(scratch.data(), scratch.size(), true);
+
+			/* Keep only the bytes the decoder reported as written */
+			scratch.resize(nlen);
+			CHECK(std::string(scratch.begin(), scratch.end()) == expected);
+		}
+	}
+}
+
+
} // namespace rspamd::html
\ No newline at end of file