diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2021-07-01 17:46:31 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2021-07-01 17:46:31 +0100 |
commit | f2f16de4ab5f5c2ad58d67704ff040ed96058823 (patch) | |
tree | 34f10a4d1d0db3815202c73ed114c65bb24b833c | |
parent | 78d955e5735077953966863c1d8c0ade38c27e73 (diff) | |
download | rspamd-f2f16de4ab5f5c2ad58d67704ff040ed96058823.tar.gz rspamd-f2f16de4ab5f5c2ad58d67704ff040ed96058823.zip |
[Project] Html: Add rows display type support
-rw-r--r-- | src/libserver/css/css_value.cxx | 19 | ||||
-rw-r--r-- | src/libserver/css/css_value.hxx | 1 | ||||
-rw-r--r-- | src/libserver/html/html.cxx | 63 |
3 files changed, 61 insertions, 22 deletions
diff --git a/src/libserver/css/css_value.cxx b/src/libserver/css/css_value.cxx index 6988ea993..ec44b86a6 100644 --- a/src/libserver/css/css_value.cxx +++ b/src/libserver/css/css_value.cxx @@ -310,14 +310,14 @@ constexpr const auto display_names_map = frozen::make_unordered_map<frozen::stri {"list-item", css_display_value::DISPLAY_BLOCK}, {"run-in", css_display_value::DISPLAY_INLINE}, {"table", css_display_value::DISPLAY_BLOCK}, - {"table-caption", css_display_value::DISPLAY_BLOCK}, - {"table-column-group", css_display_value::DISPLAY_BLOCK}, - {"table-header-group", css_display_value::DISPLAY_BLOCK}, - {"table-footer-group", css_display_value::DISPLAY_BLOCK}, - {"table-row-group", css_display_value::DISPLAY_BLOCK}, - {"table-cell", css_display_value::DISPLAY_BLOCK}, - {"table-column", css_display_value::DISPLAY_BLOCK}, - {"table-row", css_display_value::DISPLAY_BLOCK}, + {"table-caption", css_display_value::DISPLAY_TABLE_ROW}, + {"table-column-group", css_display_value::DISPLAY_TABLE_ROW}, + {"table-header-group", css_display_value::DISPLAY_TABLE_ROW}, + {"table-footer-group", css_display_value::DISPLAY_TABLE_ROW}, + {"table-row-group", css_display_value::DISPLAY_TABLE_ROW}, + {"table-cell", css_display_value::DISPLAY_TABLE_ROW}, + {"table-column", css_display_value::DISPLAY_TABLE_ROW}, + {"table-row", css_display_value::DISPLAY_TABLE_ROW}, {"initial", css_display_value::DISPLAY_INLINE}, }); @@ -364,6 +364,9 @@ auto css_value::debug_str() const -> std::string { case css_display_value::DISPLAY_INLINE: ret += "inline"; break; + case css_display_value::DISPLAY_TABLE_ROW: + ret += "table_row"; + break; } } else if constexpr (std::is_integral_v<T>) { diff --git a/src/libserver/css/css_value.hxx b/src/libserver/css/css_value.hxx index d3d06a544..8dcfa63da 100644 --- a/src/libserver/css/css_value.hxx +++ b/src/libserver/css/css_value.hxx @@ -75,6 +75,7 @@ struct css_dimension { enum class css_display_value : std::uint8_t { DISPLAY_INLINE, DISPLAY_BLOCK, + DISPLAY_TABLE_ROW, DISPLAY_HIDDEN }; diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index 088202286..ae73b7413 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -1041,10 +1041,26 @@ html_append_tag_content(rspamd_mempool_t *pool, GList **exceptions, khash_t (rspamd_url_hash) *url_set) -> goffset { - auto is_visible = true, is_block = false; + auto is_visible = true, is_block = false, is_spaces = false; goffset next_tag_offset = tag->closing.end, initial_dest_offset = hc->parsed.size(); + auto append_margin = [&](char c) -> void { + if (is_visible) { + if (!hc->parsed.empty() && hc->parsed.back() != c && hc->parsed.back() != '\n') { + if (hc->parsed.back() == ' ') { + /* We also strip extra spaces at the end */ + hc->parsed.erase(std::find_if(hc->parsed.rbegin(), hc->parsed.rend(), + [](auto ch) -> auto { + return ch != ' '; + }).base(), + hc->parsed.end()); + } + hc->parsed.push_back(c); + } + } + }; + if (tag->id == Tag_BR || tag->id == Tag_HR) { hc->parsed.append("\n"); @@ -1064,16 +1080,21 @@ html_append_tag_content(rspamd_mempool_t *pool, else if (!tag->block->is_visible()) { is_visible = false; } - else { - is_block = tag->block->has_display() && - tag->block->display == css::css_display_value::DISPLAY_BLOCK; + else if (tag->block->has_display()) { + if (tag->block->display == css::css_display_value::DISPLAY_BLOCK) { + is_block = true; + } + else if (tag->block->display == css::css_display_value::DISPLAY_TABLE_ROW) { + is_spaces = true; + } } } if (is_block) { - if (!hc->parsed.empty() && hc->parsed.back() != '\n') { - hc->parsed.append("\n"); - } + append_margin('\n'); + } + else if (is_spaces) { + append_margin(' '); } goffset cur_offset = tag->content_offset; @@ -1104,11 +1125,11 @@ html_append_tag_content(rspamd_mempool_t *pool, std::size_t(final_part_len)}); } } - - if (is_block && is_visible) { - if (!hc->parsed.empty() && hc->parsed.back() != '\n') { - hc->parsed.append("\n"); - } + if (is_block) { + append_margin('\n'); + } + else if (is_spaces) { + append_margin(' '); } if (is_visible) { @@ -1707,12 +1728,15 @@ html_process_input(rspamd_mempool_t *pool, if (tag->block) { if (!tag->block->has_display()) { /* If we have no display field, we can check it by tag */ - if (tag->flags & CM_BLOCK) { + if (tag->flags & (CM_BLOCK|CM_TABLE)) { tag->block->set_display(css::css_display_value::DISPLAY_BLOCK); } else if (tag->flags & CM_HEAD) { tag->block->set_display(css::css_display_value::DISPLAY_HIDDEN); } + else if (tag->flags & CM_ROW) { + tag->block->set_display(css::css_display_value::DISPLAY_TABLE_ROW); + } else { tag->block->set_display(css::css_display_value::DISPLAY_INLINE); } @@ -1892,6 +1916,17 @@ TEST_CASE("html text extraction") { const std::vector<std::pair<std::string, std::string>> cases{ + /* Tables */ + {"<table>\n" + " <tr>\n" + " <th>heada</th>\n" + " <th>headb</th>\n" + " </tr>\n" + " <tr>\n" + " <td>data1</td>\n" + " <td>data2</td>\n" + " </tr>\n" + " </table>", "heada headb\ndata1 data2\n"}, /* XML tags */ {"<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n" " <!DOCTYPE html\n" @@ -1938,7 +1973,7 @@ TEST_CASE("html text extraction") " </P>\n" " <b>stuff</p>?\n" " </body>\n" - "</html>", "Hello, world! test\ndata<> \nstuff?"}, + "</html>", "Hello, world! test\ndata<>\nstuff?"}, {"<p><!--comment-->test</br></hr><br>", "test\n"}, }; |