[Fix] Try to fix parsing of the unencoded `>` characters in html attributes

author Vsevolod Stakhov <vsevolod@rspamd.com>

Tue, 25 Oct 2022 06:01:40 +0000 (07:01 +0100)

committer Vsevolod Stakhov <vsevolod@rspamd.com>

Tue, 25 Oct 2022 06:01:40 +0000 (07:01 +0100)
author Vsevolod Stakhov <vsevolod@rspamd.com>
Tue, 25 Oct 2022 06:01:40 +0000 (07:01 +0100)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Tue, 25 Oct 2022 06:01:40 +0000 (07:01 +0100)
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx

index 265a31f13e8a5886c26766a6a5f286de2ab9fadd..e2f48480449046b9a177be0ff909cf8fd2c601d6 100644 (file)
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -212,14 +212,34 @@ html_component_from_string(const std::string_view &st) -> std::optional<html_com
         }
  }
  
+enum tag_parser_state {
+       parse_start = 0,
+       parse_name,
+       parse_attr_name,
+       parse_equal,
+       parse_start_dquote,
+       parse_dqvalue,
+       parse_end_dquote,
+       parse_start_squote,
+       parse_sqvalue,
+       parse_end_squote,
+       parse_value,
+       spaces_before_eq,
+       spaces_after_eq,
+       spaces_after_param,
+       ignore_bad_tag,
+       tag_end,
+       slash_after_value,
+       slash_in_unquoted_value,
+};
  struct tag_content_parser_state {
-       int cur_state = 0;
+       tag_parser_state cur_state = parse_start;
         std::string buf;
         std::optional<html_component_type> cur_component;
  
         void reset()
         {
-               cur_state = 0;
+               cur_state = parse_start;
                 buf.clear();
                 cur_component = std::nullopt;
         }
@@ -232,28 +252,7 @@ html_parse_tag_content(rspamd_mempool_t *pool,
                                            const char *in,
                                            struct tag_content_parser_state &parser_env)
  {
-       enum tag_parser_state {
-               parse_start = 0,
-               parse_name,
-               parse_attr_name,
-               parse_equal,
-               parse_start_dquote,
-               parse_dqvalue,
-               parse_end_dquote,
-               parse_start_squote,
-               parse_sqvalue,
-               parse_end_squote,
-               parse_value,
-               spaces_before_eq,
-               spaces_after_eq,
-               spaces_after_param,
-               ignore_bad_tag,
-               tag_end,
-               slash_after_value,
-               slash_in_unquoted_value,
-       } state;
-
-       state = static_cast<enum tag_parser_state>(parser_env.cur_state);
+       auto state = parser_env.cur_state;
  
         /*
          * Stores tag component if it doesn't exist, performing copy of the
@@ -1855,19 +1854,54 @@ html_process_input(rspamd_mempool_t *pool,
                         html_parse_tag_content(pool, hc, cur_tag, p, content_parser_env);
  
                         if (t == '>') {
-                               if (closing) {
-                                       cur_tag->closing.start = c - start;
-                                       cur_tag->closing.end = p - start + 1;
-
-                                       closing = FALSE;
-                                       state = tag_end_closing;
+                               if (content_parser_env.cur_state != parse_dqvalue && content_parser_env.cur_state != parse_sqvalue) {
+                                       /* We have a closing element */
+                                       if (closing) {
+                                               cur_tag->closing.start = c - start;
+                                               cur_tag->closing.end = p - start + 1;
+
+                                               closing = FALSE;
+                                               state = tag_end_closing;
+                                       }
+                                       else {
+                                               cur_tag->content_offset = p - start + 1;
+                                               state = tag_end_opening;
+                                       }
                                 }
                                 else {
-                                       cur_tag->content_offset = p - start + 1;
-                                       state = tag_end_opening;
-                               }
-
+                                       /*
+                                        * We are in the parse_quoted value state but got
+                                        * an unescaped `>` character.
+                                        * HTML is written for monkeys, so there are two possibilities:
+                                        * 1) We have missing ending quote
+                                        * 2) We have unescaped `>` character
+                                        * How to distinguish between those possibilities?
+                                        * Well, the idea is to do some lookahead and try to find a
+                                        * quote. If we can find a quote, we just pretend as we have
+                                        * not seen `>` character. Otherwise, we pretend that it is an
+                                        * unquoted stuff. This logic is quite fragile but I really
+                                        * don't know any better options...
+                                        */
+                                       auto end_quote = content_parser_env.cur_state == parse_sqvalue ? '\'' : '"';
+                                       if (memchr(p, end_quote, end - p) != nullptr) {
+                                               /* Unencoded `>` */
+                                               p++;
+                                               continue;
+                                       }
+                                       else {
+                                               if (closing) {
+                                                       cur_tag->closing.start = c - start;
+                                                       cur_tag->closing.end = p - start + 1;
  
+                                                       closing = FALSE;
+                                                       state = tag_end_closing;
+                                               }
+                                               else {
+                                                       cur_tag->content_offset = p - start + 1;
+                                                       state = tag_end_opening;
+                                               }
+                                       }
+                               }
                                 continue;
                         }
                         p++;
author	Vsevolod Stakhov <vsevolod@rspamd.com>
	Tue, 25 Oct 2022 06:01:40 +0000 (07:01 +0100)
committer	Vsevolod Stakhov <vsevolod@rspamd.com>
	Tue, 25 Oct 2022 06:01:40 +0000 (07:01 +0100)