source.dussan.org Git - rspamd.git/commitdiff
[Rework] Html: Further rework of the tags content extraction
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 21 Jun 2021 23:21:24 +0000 (00:21 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 21 Jun 2021 23:21:24 +0000 (00:21 +0100)
src/libserver/html/html.cxx
src/libserver/html/html_block.hxx
src/libstat/tokenizers/tokenizers.c
src/libutil/util.h

index 925735f41283278a1955ed2ed5277ef0cba2405d..f82bd03595a2c7d877882f12cfe9f3457b997213 100644 (file)
@@ -1069,6 +1069,54 @@ html_append_content(struct html_content *hc, std::string_view data) -> auto
        return nlen;
 }
 
+/**
+ * Recursively appends the textual content of `tag` and of its children to
+ * `hc->parsed`.
+ *
+ * @param start beginning of the buffer that the tag offsets are relative to
+ * @param len length of that buffer
+ * @param hc html content structure that accumulates the extracted text
+ * @param tag tag whose content is appended
+ *
+ * BR and HR tags produce a single newline. A tag without a style block is
+ * skipped together with its children. A tag whose display is DISPLAY_BLOCK
+ * is preceded by a newline. Own text of the tag is appended only when its
+ * block is visible, whilst children are visited in any case.
+ */
+static auto
+html_append_tag_content(const gchar *start, gsize len,
+                                               struct html_content *hc, const struct html_tag *tag) -> void
+{
+       auto cur_offset = tag->content_offset;
+       auto total_len = tag->content_length;
+
+       /*
+        * Compare using subtraction: `total_len + cur_offset` might wrap around
+        * and let an out-of-range span pass this check
+        */
+       if (cur_offset > len || total_len > len - cur_offset) {
+               RSPAMD_UNREACHABLE;
+       }
+
+       if (tag->id == Tag_BR || tag->id == Tag_HR) {
+               hc->parsed.append("\n");
+               return;
+       }
+
+       if (!tag->block) {
+               return; /* XXX: is it always true? */
+       }
+
+       if (tag->block->has_display() && tag->block->display == css::css_display_value::DISPLAY_BLOCK) {
+               hc->parsed.append("\n");
+       }
+
+       for (const auto &cld_tag : tag->children) {
+               if (cld_tag->tag_start > cur_offset) {
+                       if (tag->block->is_visible()) {
+                               /* Own text placed between the current position and the child tag */
+                               html_append_content(hc, {start + cur_offset,
+                                                                                cld_tag->tag_start - cur_offset});
+                       }
+               }
+               html_append_tag_content(start, len, hc, cld_tag);
+               auto old_offset = cur_offset;
+               cur_offset = cld_tag->content_offset + cld_tag->content_length;
+
+               /*
+                * Check the backward move explicitly instead of relying on the
+                * wrap around of `cur_offset - old_offset`
+                */
+               if (cur_offset < old_offset || total_len < cur_offset - old_offset) {
+                       /* Child tag ends before the current position or spans over parent (e.g. wrong nesting) */
+                       total_len = 0;
+                       break;
+               }
+               total_len -= cur_offset - old_offset;
+       }
+
+       /* Remaining own text after the last child (or the whole content if no children) */
+       if (total_len > 0 && tag->block->is_visible()) {
+               html_append_content(hc, {start + cur_offset, total_len});
+       }
+}
+
 static auto
 html_process_input(rspamd_mempool_t *pool,
                                        GByteArray *in,
@@ -1490,17 +1538,8 @@ html_process_input(rspamd_mempool_t *pool,
                }
        }
 
-       /* Summarize content length from children */
-       hc->traverse_block_tags([](const html_tag *tag) -> bool {
-
-               for (const auto *cld_tag : tag->children) {
-                       tag->content_length += cld_tag->content_length;
-               }
-               return true;
-       }, html_content::traverse_type::POST_ORDER);
-
        /* Propagate styles */
-       hc->traverse_block_tags([&hc, &exceptions,&pool](const html_tag *tag) -> bool {
+       hc->traverse_block_tags([&hc](const html_tag *tag) -> bool {
                if (hc->css_style) {
                        auto *css_block = hc->css_style->check_tag_block(tag);
 
@@ -1514,62 +1553,18 @@ html_process_input(rspamd_mempool_t *pool,
                        }
                }
                if (tag->block) {
-                       tag->block->compute_visibility();
-
-                       if (exceptions) {
-                               if (!tag->block->is_visible()) {
-                                       if (tag->parent == nullptr || (tag->parent->block && tag->parent->block->is_visible())) {
-                                               /* Add exception for an invisible element */
-                                               auto * ex = rspamd_mempool_alloc_type (pool,struct rspamd_process_exception);
-                                               ex->pos = tag->content_offset;
-                                               ex->len = tag->content_length;
-                                               ex->type = RSPAMD_EXCEPTION_INVISIBLE;
-                                               ex->ptr = (void *)tag;
-
-                                               *exceptions = g_list_prepend(*exceptions, ex);
-                                       }
+                       if (!tag->block->has_display()) {
+                               /* If we have no display field, we can check it by tag */
+                               if (tag->flags & CM_BLOCK) {
+                                       tag->block->set_display(css::css_display_value::DISPLAY_BLOCK);
                                }
-                               else if (*exceptions && tag->parent) {
-                                       /* Current block is visible, check if parent is invisible */
-                                       auto *ex = (struct rspamd_process_exception*)g_list_first(*exceptions)->data;
-
-                                       /*
-                                        * TODO: we need to handle the following cases:
-                                        * <inv><vis><inv> -< insert one more exception
-                                        * <vis><inv> -< increase content_offset decrease length
-                                        * <inv><vis> -< decrease length
-                                        */
-                                       if (ex && ex->type == RSPAMD_EXCEPTION_INVISIBLE &&
-                                               ex->ptr == (void *)tag->parent) {
-                                               auto *parent = tag->parent;
-
-                                               if (tag->content_offset + tag->content_length ==
-                                                       parent->content_offset + parent->content_length) {
-                                                       /* <inv><vis> */
-                                                       ex->len -= tag->content_length;
-                                               }
-                                               else if (tag->content_offset == parent->content_offset) {
-                                                       /* <vis><inv> */
-                                                       ex->len -= tag->content_length;
-                                                       ex->pos += tag->content_length;
-                                               }
-                                               else if (tag->content_offset > ex->pos) {
-                                                       auto *nex = rspamd_mempool_alloc_type (pool,
-                                                                       struct rspamd_process_exception);
-                                                       auto start_len = tag->content_offset - ex->pos;
-                                                       auto end_len = ex->len - tag->content_length - tag->content_length;
-                                                       nex->pos = tag->content_offset + tag->content_length;
-                                                       nex->len = end_len;
-                                                       nex->type = RSPAMD_EXCEPTION_INVISIBLE;
-                                                       nex->ptr = (void *)parent; /* ! */
-                                                       ex->len = start_len;
-                                                       *exceptions = g_list_prepend(*exceptions, ex);
-                                               }
-
-                                       }
+                               else {
+                                       tag->block->set_display(css::css_display_value::DISPLAY_INLINE);
                                }
                        }
 
+                       tag->block->compute_visibility();
+
                        for (const auto *cld_tag : tag->children) {
                                if (cld_tag->block) {
                                        cld_tag->block->propagate_block(*tag->block);
@@ -1582,6 +1577,10 @@ html_process_input(rspamd_mempool_t *pool,
                return true;
        }, html_content::traverse_type::PRE_ORDER);
 
+       if (hc->root_tag) {
+               html_append_tag_content(start, end - start, hc, hc->root_tag);
+       }
+
        /* Leftover */
        switch (state) {
        case html_text_content:
index 51f3dbb9d776983fd60204c0484b49e163eae20a..f2bbf1d64db2e26bfe4acb8d63aa4560cc04b68f 100644 (file)
@@ -219,6 +219,10 @@ struct html_block {
                return (mask & transparent_flag) != 0;
        }
 
+       /**
+        * Checks whether any bit of `display_mask` is set in `mask`
+        * @return true if at least one of those bits is set
+        */
+       constexpr auto has_display(void) const -> bool {
+               return static_cast<bool>(mask & display_mask);
+       }
+
        /**
         * Returns a default html block for root HTML element
         * @return
@@ -227,7 +231,7 @@ struct html_block {
                return html_block{rspamd::css::css_color::black(),
                                                  rspamd::css::css_color::white(),
                                                  0, 0,
-                                                 (fg_color_mask|bg_color_mask|display_mask|font_size_mask),
+                                                 (fg_color_mask|bg_color_mask|font_size_mask),
                                                  rspamd::css::css_display_value::DISPLAY_INLINE,
                                                  12};
        }
index 2dd4a6f5a1061e4af0ec72027145a076a8325b9c..b4f8ac75cbfde64d557a4daa1b47232941b8774c 100644 (file)
@@ -275,14 +275,6 @@ rspamd_tokenize_exception (struct rspamd_process_exception *ex, GArray *res)
                g_array_append_val (res, token);
                token.flags = 0;
        }
-       else if (ex->type == RSPAMD_EXCEPTION_INVISIBLE) {
-               token.original.begin = "!!INV!!";
-               token.original.len = sizeof ("!!INV!!") - 1;
-               token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
-
-               g_array_append_val (res, token);
-               token.flags = 0;
-       }
 }
 
 
index 9ee8a09ae9eeb9b27e6ef79c4b3e62aa99d1b199..d993fcbdfc44cb2af3eb4fe106bf8abd70a21f6d 100644 (file)
@@ -25,7 +25,6 @@ enum rspamd_exception_type {
        RSPAMD_EXCEPTION_NEWLINE = 0,
        RSPAMD_EXCEPTION_URL,
        RSPAMD_EXCEPTION_GENERIC,
-       RSPAMD_EXCEPTION_INVISIBLE,
 };
 /**
  * Structure to point exception in text from processing