source.dussan.org Git - rspamd.git/commitdiff
[Project] Html: Implement logic for tags pairing
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 26 Jun 2021 13:30:19 +0000 (14:30 +0100)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 26 Jun 2021 13:46:51 +0000 (14:46 +0100)
src/libserver/html/html.cxx
src/libserver/html/html_tag.hxx
src/libserver/html/html_tags.h

index 894b1ee45c987a351fc1e8c65c0e887744b4afec..8d312b73351ebd357a9d2926617a92ff20267a82 100644 (file)
@@ -76,51 +76,136 @@ auto html_components_map = frozen::make_unordered_map<frozen::string, html_compo
 INIT_LOG_MODULE(html)
 
 static auto
-html_check_balance(struct html_tag *tag,
+html_check_balance(struct html_content *hc,
+                                  struct html_tag *tag,
                                   struct html_tag *parent,
                                   std::vector<html_tag *> &tags_stack,
                                   goffset tag_start_offset,
                                   goffset tag_end_offset) -> bool
 {
 
-       if (tag->flags & FL_CLOSING) {
-               /* Find the opening pair if any and check if it is correctly placed */
-               auto found_opening = std::find_if(tags_stack.rbegin(), tags_stack.rend(),
-                               [&](const html_tag *t) {
-                                       return (t->flags & FL_CLOSED) == 0 && t->id == tag->id;
-                               });
+       auto calculate_content_length = [tag_start_offset](html_tag *t) {
+               auto opening_content_offset = t->content_offset;
 
-               if (found_opening != tags_stack.rend()) {
-                       auto *opening_tag = (*found_opening);
-                       opening_tag->flags |= FL_CLOSED;
+               if (opening_content_offset <= tag_start_offset) {
+                       t->content_length = tag_start_offset - opening_content_offset;
+               }
+               else {
+                       t->content_length = 0;
+               }
+       };
 
-                       /* Adjust size */
-                       auto opening_content_offset = opening_tag->content_offset;
+       auto balance_tag = [&]() -> void {
+               auto it = tags_stack.rbegin();
 
-                       if (opening_content_offset <= tag_start_offset) {
-                               opening_tag->content_length =
-                                               tag_start_offset - opening_content_offset;
-                       }
-                       else {
-                               opening_tag->content_length = 0;
+               for (auto end_it = tags_stack.rend(); it != end_it; ++it) {
+                       if ((*it)->id == tag->id && !((*it)->flags & FL_CLOSING)) {
+                               break;
                        }
+                       /* Insert a virtual closing tag for all tags that are not closed */
+                       auto &&vtag = std::make_unique<html_tag>();
+                       vtag->id = (*it)->id;
+                       vtag->flags = FL_VIRTUAL|FL_CLOSING;
+                       vtag->tag_start = tag->tag_start;
+                       vtag->content_offset = tag->content_offset;
+                       vtag->content_length = 0;
+                       vtag->parent = (*it)->parent;
+                       calculate_content_length(*it);
+                       (*it)->flags |= FL_CLOSED;
+                       hc->all_tags.emplace_back(std::move(vtag));
+               }
+
+               /* Remove tags */
+               tags_stack.erase(it.base(), std::end(tags_stack));
+       };
 
-                       if (found_opening == tags_stack.rbegin()) {
+       if (tag->flags & FL_CLOSING) {
+               if (!tags_stack.empty()) {
+                       auto *last_tag = tags_stack.back();
+
+                       if (last_tag->id == tag->id && !(last_tag->flags & FL_CLOSED)) {
+                               last_tag->flags |= FL_CLOSED;
+
+                               calculate_content_length(last_tag);
                                tags_stack.pop_back();
                                /* All good */
                                return true;
                        }
                        else {
-                               /* Move to front */
-                               std::iter_swap(found_opening, tags_stack.rbegin());
-                               tags_stack.pop_back();
+                               balance_tag();
+
                                return false;
                        }
                }
                else {
-                       /* We have unpaired tag */
-                       return false;
+                       /*
+                        * We have no opening tags in the stack, so we need to assume that there
+                        * is an opening tag at the beginning of the document.
+                        * There are two possibilities:
+                        *
+                        * 1) We have some block tag in hc->all_tags;
+                        * 2) We have no tags
+                        */
+
+                       if (hc->all_tags.empty()) {
+                               auto &&vtag = std::make_unique<html_tag>();
+                               vtag->id = tag->id;
+                               vtag->flags = FL_VIRTUAL|FL_CLOSED;
+                               vtag->tag_start = 0;
+                               vtag->content_offset = 0;
+                               calculate_content_length(vtag.get());
+
+
+                               if (!hc->root_tag) {
+                                       hc->root_tag = vtag.get();
+                               }
+                               else {
+                                       vtag->parent = hc->root_tag;
+                               }
+                               hc->all_tags.emplace_back(std::move(vtag));
+                       }
+                       else {
+                               auto found_closing = std::find_if(hc->all_tags.rbegin(),
+                                               hc->all_tags.rend(),
+                                               [&](const auto &t) {
+                                                       constexpr const auto expect_flags = FL_BLOCK|FL_CLOSING;
+                                                       return (t->flags & expect_flags) == (expect_flags) &&
+                                                                       t.get() != tag &&
+                                                                       t->parent != nullptr;
+                                               });
+
+                               if (found_closing != hc->all_tags.rend()) {
+                                       auto *closing_tag = (*found_closing).get();
+                                       auto &&vtag = std::make_unique<html_tag>();
+                                       vtag->id = tag->id;
+                                       vtag->flags = FL_VIRTUAL|FL_CLOSED;
+                                       vtag->tag_start = closing_tag->content_offset - 1;
+                                       vtag->content_offset = vtag->tag_start + 1;
+                                       vtag->parent = closing_tag->parent;
+                                       vtag->content_length = tag->tag_start - vtag->content_offset;
+                                       hc->all_tags.emplace_back(std::move(vtag));
+                               }
+                               else {
+                                       auto &&vtag = std::make_unique<html_tag>();
+                                       vtag->id = tag->id;
+                                       vtag->flags = FL_VIRTUAL|FL_CLOSED;
+                                       vtag->tag_start = 0;
+                                       vtag->content_offset = 0;
+                                       calculate_content_length(vtag.get());
+
+
+                                       if (!hc->root_tag) {
+                                               hc->root_tag = vtag.get();
+                                       }
+                                       else {
+                                               vtag->parent = hc->root_tag;
+                                       }
+                                       hc->all_tags.emplace_back(std::move(vtag));
+                               }
+                       }
                }
+
+               return false;
        }
 
        /* Misuse */
@@ -166,7 +251,7 @@ html_process_tag(rspamd_mempool_t *pool,
                                return false;
                        }
 
-                       if (!html_check_balance(tag, parent, tags_stack,
+                       if (!html_check_balance(hc, tag, parent, tags_stack,
                                        tag_start_offset, tag_end_offset)) {
                                msg_debug_html (
                                                "mark part as unbalanced as it has not pairable closing tags");
index 9091b9060abc0a45413eeb60b5258ae0ffa60dc1..36110c8c79a9a26e4ac3a50458760db8dbfa5caf 100644 (file)
@@ -43,16 +43,17 @@ enum class html_component_type : std::uint8_t {
 
 /* Public tags flags */
 /* XML tag */
-#define FL_XML          (1 << 23)
+#define FL_XML          (1 << 22)
 /* Closing tag */
-#define FL_CLOSING      (1 << 24)
+#define FL_CLOSING      (1 << 23)
 /* Fully closed tag (e.g. <a attrs />) */
-#define FL_CLOSED       (1 << 25)
-#define FL_BROKEN       (1 << 26)
-#define FL_IGNORE       (1 << 27)
-#define FL_BLOCK        (1 << 28)
-#define FL_HREF         (1 << 29)
+#define FL_CLOSED       (1 << 24)
+#define FL_BROKEN       (1 << 25)
+#define FL_IGNORE       (1 << 26)
+#define FL_BLOCK        (1 << 27)
+#define FL_HREF         (1 << 28)
 #define FL_COMMENT      (1 << 29)
+#define FL_VIRTUAL      (1 << 30)
 
 /**
  * Returns component type from a string
index 8f430204e593332fba0a00ea6d9242b5590c299f..3f209c08ee9d7f6da07541cc1377a6cc2e1fdc08 100644 (file)
@@ -191,13 +191,10 @@ typedef enum {
 #define CM_NO_INDENT    (1 << 18)
 /* Elements that are obsolete (such as "dir", "menu"). */
 #define CM_OBSOLETE     (1 << 19)
-/* User defined elements. Used to determine how attributes without value
-   should be printed. */
-#define CM_NEW          (1 << 20)
 /* Elements that cannot be omitted. */
-#define CM_OMITST       (1 << 21)
+#define CM_OMITST       (1 << 20)
 /* Unique elements */
-#define CM_UNIQUE       (1 << 22)
+#define CM_UNIQUE       (1 << 21)
 
 #ifdef  __cplusplus
 }