From b9801960d0ebb388dc2a4e93071f19868fe44bc0 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 21 Jun 2021 19:59:35 +0100 Subject: [PATCH] [Rework] Html: Start html text extraction rework --- src/libserver/html/html.cxx | 263 +++++++++++++----------------------- 1 file changed, 96 insertions(+), 167 deletions(-) diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index 073b733a2..00dcebad6 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -1037,14 +1037,16 @@ html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag, } } -using tags_vector = std::vector>; - -static auto -tags_vector_ptr_dtor(void *ptr) +static inline auto +html_append_content(struct html_content *hc, std::string_view data) -> auto { - auto *ptags = (tags_vector *)ptr; + auto cur_offset = hc->parsed.size(); + hc->parsed.append(data); + auto nlen = decode_html_entitles_inplace(hc->parsed.data() + cur_offset, + hc->parsed.size() - cur_offset, true); + hc->parsed.resize(nlen + cur_offset); - delete ptags; + return nlen; } static auto @@ -1055,9 +1057,9 @@ html_process_input(rspamd_mempool_t *pool, GPtrArray *part_urls, bool allow_css) -> html_content * { - const gchar *p, *c, *end; + const gchar *p, *c, *end, *start; guchar t; - gboolean closing = FALSE, need_decode = FALSE, save_space = FALSE; + gboolean closing = FALSE; guint obrace = 0, ebrace = 0; struct rspamd_url *url = NULL; gint len, href_offset = -1; @@ -1067,6 +1069,7 @@ html_process_input(rspamd_mempool_t *pool, enum { parse_start = 0, + content_before_start, tag_begin, sgml_tag, xml_tag, @@ -1076,11 +1079,9 @@ html_process_input(rspamd_mempool_t *pool, sgml_content, tag_content, tag_end, + html_text_content, xml_tag_end, - content_ignore, - content_write, content_style, - content_ignore_sp } state = parse_start; g_assert (in != NULL); @@ -1092,6 +1093,7 @@ html_process_input(rspamd_mempool_t *pool, p = (const char *)in->data; c = p; end = p + in->len; + start = c; while (p < end) { t = *p; @@ -1104,9 +1106,17 @@ html_process_input(rspamd_mempool_t *pool, else { /* We have no starting tag, so assume that it's content */ hc->flags |= RSPAMD_HTML_FLAG_BAD_START; - state = content_write; + state = content_before_start; + } + break; + case content_before_start: + if (t == '<') { + html_append_content(hc, {c, std::size_t(p - c)}); + state = tag_begin; + } + else { + p ++; } - break; case tag_begin: switch (t) { @@ -1248,7 +1258,7 @@ html_process_input(rspamd_mempool_t *pool, p ++; break; - case content_ignore: + case html_text_content: if (t != '<') { p ++; } @@ -1257,126 +1267,6 @@ html_process_input(rspamd_mempool_t *pool, } break; - case content_write: - - if (t != '<') { - if (t == '&') { - need_decode = TRUE; - } - else if (g_ascii_isspace (t)) { - save_space = TRUE; - - if (p > c) { - if (need_decode) { - goffset old_offset = hc->parsed.size(); - - if (content_tag) { - if (content_tag->content_length == 0) { - content_tag->content_offset = old_offset; - } - } - - hc->parsed.append(c, p - c); - - len = decode_html_entitles_inplace( - hc->parsed.data() + old_offset, - (std::size_t)(p - c)); - hc->parsed.resize(hc->parsed.size() + len - (p - c)); - - if (content_tag) { - content_tag->content_length += len; - } - } - else { - len = p - c; - - if (content_tag) { - if (content_tag->content_length == 0) { - content_tag->content_offset = hc->parsed.size(); - } - - content_tag->content_length += len; - } - - hc->parsed.append(c, len); - } - } - - c = p; - state = content_ignore_sp; - } - else { - if (save_space) { - /* Append one space if needed */ - if (!hc->parsed.empty() && - !g_ascii_isspace (hc->parsed.back())) { - hc->parsed += " "; - - if (content_tag) { - if (content_tag->content_length == 0) { - /* - * Special case - * we have a space at the beginning but - * we have no set content_offset - * so we need to do it here - */ - content_tag->content_offset = hc->parsed.size(); - } - else { - content_tag->content_length++; - } - } - } - save_space = FALSE; - } - } - } - else { - if (c != p) { - - if (need_decode) { - goffset old_offset = hc->parsed.size(); - - if (content_tag) { - if (content_tag->content_length == 0) { - content_tag->content_offset = hc->parsed.size(); - } - } - - hc->parsed.append(c, p - c); - len = decode_html_entitles_inplace( - hc->parsed.data() + old_offset, - (std::size_t)(p - c)); - hc->parsed.resize(hc->parsed.size() + len - (p - c)); - - if (content_tag) { - content_tag->content_length += len; - } - } - else { - len = p - c; - - if (content_tag) { - if (content_tag->content_length == 0) { - content_tag->content_offset = hc->parsed.size(); - } - - content_tag->content_length += len; - } - - hc->parsed.append(c, len); - } - } - - content_tag = NULL; - - state = tag_begin; - continue; - } - - p ++; - break; - case content_style: { /* @@ -1387,7 +1277,7 @@ html_process_input(rspamd_mempool_t *pool, "') { @@ -1457,18 +1336,13 @@ html_process_input(rspamd_mempool_t *pool, content_parser_env.reset(); if (cur_tag != nullptr) { + state = html_text_content; - if (html_process_tag(pool, hc, cur_tag, tags_stack)) { - state = content_write; - need_decode = FALSE; - } - else { + cur_tag->content_offset = p - start; + if (!html_process_tag(pool, hc, cur_tag, tags_stack)) { if (cur_tag->id == Tag_STYLE) { state = content_style; } - else { - state = content_ignore; - } } if (cur_tag->id != -1 && cur_tag->id < N_TAGS) { @@ -1507,7 +1381,6 @@ html_process_input(rspamd_mempool_t *pool, } } } - save_space = FALSE; } if ((cur_tag->id == Tag_P || @@ -1533,7 +1406,6 @@ html_process_input(rspamd_mempool_t *pool, } } } - save_space = FALSE; } /* XXX: uncomment when styles parsing is not so broken */ @@ -1637,11 +1509,8 @@ html_process_input(rspamd_mempool_t *pool, } } } - else { - state = content_write; - } - + state = html_text_content; p++; c = p; cur_tag = NULL; @@ -1740,6 +1609,19 @@ html_process_input(rspamd_mempool_t *pool, return true; }, html_content::traverse_type::PRE_ORDER); + /* Leftover */ + switch (state) { + case html_text_content: + case content_before_start: + if (p > c) { + html_append_content(hc, {c, std::size_t(p - c)}); + } + break; + default: + /* Do nothing */ + break; + } + return hc; } @@ -1816,17 +1698,64 @@ TEST_CASE("html parsing") "html", 0); for (const auto &c : cases) { - GByteArray *tmp = g_byte_array_sized_new(c.first.size()); - g_byte_array_append(tmp, (const guint8 *) c.first.data(), c.first.size()); - auto *hc = html_process_input(pool, tmp, nullptr, nullptr, nullptr, true); - CHECK(hc != nullptr); - auto dump = html_debug_structure(*hc); - CHECK(c.second == dump); - g_byte_array_free(tmp, TRUE); + SUBCASE((std::string("extract tags from: ") + c.first).c_str()) { + GByteArray *tmp = g_byte_array_sized_new(c.first.size()); + g_byte_array_append(tmp, (const guint8 *) c.first.data(), c.first.size()); + auto *hc = html_process_input(pool, tmp, nullptr, nullptr, nullptr, true); + CHECK(hc != nullptr); + auto dump = html_debug_structure(*hc); + CHECK(c.second == dump); + g_byte_array_free(tmp, TRUE); + } } rspamd_mempool_delete(pool); } + +TEST_CASE("html text extraction") +{ + + const std::vector> cases{ + {"test", "test"}, + {"test ", "test "}, + {"test foo, bar", "test foo, bar"}, + {"

text

", "text"}, + {"olo

text

lolo", "olo\ntext\nlolo"}, + {"foobarbaz", "foobarbaz"}, + {"foobarbaz", "foobarbaz"}, + {"foo
baz", "foo\nbaz"}, + {"
foo
bar
", "foo\nbar"}, + }; + + rspamd_url_init(NULL); + auto *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), + "html", 0); + + auto replace_newlines = [](std::string &str) { + auto start_pos = 0; + while((start_pos = str.find("\n", start_pos, 1)) != std::string::npos) { + str.replace(start_pos, 1, "\\n", 2); + start_pos += 2; + } + }; + + for (const auto &c : cases) { + SUBCASE((std::string("extract text from: ") + c.first).c_str()) { + GByteArray *tmp = g_byte_array_sized_new(c.first.size()); + g_byte_array_append(tmp, (const guint8 *) c.first.data(), c.first.size()); + auto *hc = html_process_input(pool, tmp, nullptr, nullptr, nullptr, true); + CHECK(hc != nullptr); + replace_newlines(hc->parsed); + auto expected = c.second; + replace_newlines(expected); + CHECK(hc->parsed == expected); + g_byte_array_free(tmp, TRUE); + } + } + + rspamd_mempool_delete(pool); +} + } } -- 2.39.5