source.dussan.org Git - rspamd.git/commitdiff
[Project] Html/CSS: Switch styles parsing to css parser
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 10 Jun 2021 16:21:09 +0000 (17:21 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 11 Jun 2021 14:09:10 +0000 (15:09 +0100)
src/libserver/css/css_parser.cxx
src/libserver/css/css_parser.hxx
src/libserver/html/html.cxx
src/libserver/html/html.hxx
src/libserver/html/html_block.hxx

index 774a65cfcd3956e9e432334902db983e21e3de97..34d65aadc3d3a8ed2642b86427d518f487c98bf0 100644 (file)
@@ -801,6 +801,20 @@ auto parse_css(rspamd_mempool_t *pool, const std::string_view &st,
                                                                                           "cannot parse input"});
 }
 
+/*
+ * Feeds `st` through the declaration-token processor and compiles whatever
+ * comes out into an html block; yields nullptr when the processor produces
+ * nothing.
+ */
+auto
+parse_css_declaration(rspamd_mempool_t *pool, const std::string_view &st)
+       -> rspamd::html::html_block *
+{
+       auto &&declarations = process_declaration_tokens(pool,
+                       get_rules_parser_functor(pool, st));
+
+       /* No declarations were produced: report that to the caller */
+       if (!declarations) {
+               return nullptr;
+       }
+
+       return declarations->compile_to_block(pool);
+}
+
 TEST_SUITE("css parser") {
        TEST_CASE("parse colors") {
                const std::vector<const char *> cases{
index ec6d5159a267c5194b8189e093dd4d7e710af39b..1e0762d78296dec03349a1df5858300ec868bbc8 100644 (file)
 #include "contrib/expected/expected.hpp"
 #include "logger.h"
 
+/*
+ * Forward declaration: this header refers to html_block only through a
+ * pointer (the return type of parse_css_declaration).
+ */
+namespace rspamd::html {
+struct html_block;
+}
 
 namespace rspamd::css {
 
@@ -205,6 +209,15 @@ auto get_selectors_parser_functor(rspamd_mempool_t *pool,
 auto get_rules_parser_functor(rspamd_mempool_t *pool,
                                                          const std::string_view &st) -> blocks_gen_functor;
 
+/**
+ * Parses a css declaration (e.g. embedded css) and returns a completed html block
+ * @param pool memory pool passed to the parser and to the block compilation
+ * @param st text of the css declaration to parse
+ * @return pointer to the compiled html block, or nullptr if processing the
+ * declaration produced no result
+ */
+auto parse_css_declaration(rspamd_mempool_t *pool, const std::string_view &st)
+       -> rspamd::html::html_block *;
+
 }
 
 #endif //RSPAMD_CSS_PARSER_HXX
index e867cce6dc7c5016ba0a6cdc8a5a5691227d016f..1d13c24660060e88ab959551ead33ac3e410652e 100644 (file)
@@ -990,610 +990,36 @@ html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag,
        }
 }
 
-static void
-html_process_color(std::string_view input, struct html_color *cl)
-{
-       const gchar *p = input.data(), *end = input.data() + input.size();
-       char hexbuf[7];
-
-       memset(cl, 0, sizeof(*cl));
-
-       if (*p == '#') {
-               /* HEX color */
-               p++;
-               rspamd_strlcpy(hexbuf, p, MIN ((gint) sizeof(hexbuf), end - p + 1));
-               cl->d.val = strtoul(hexbuf, NULL, 16);
-               cl->d.comp.alpha = 255;
-               cl->valid = TRUE;
-       }
-       else if (input.size() > 4 && rspamd_lc_cmp(p, "rgb", 3) == 0) {
-               /* We have something like rgba(x,x,x,x) or rgb(x,x,x) */
-               enum {
-                       obrace,
-                       num1,
-                       num2,
-                       num3,
-                       num4,
-                       skip_spaces
-               } state = skip_spaces, next_state = obrace;
-               gulong r = 0, g = 0, b = 0, opacity = 255;
-               const gchar *c;
-               gboolean valid = FALSE;
-
-               p += 3;
-
-               if (*p == 'a') {
-                       p++;
-               }
-
-               c = p;
-
-               while (p < end) {
-                       switch (state) {
-                       case obrace:
-                               if (*p == '(') {
-                                       p++;
-                                       state = skip_spaces;
-                                       next_state = num1;
-                               }
-                               else if (g_ascii_isspace (*p)) {
-                                       state = skip_spaces;
-                                       next_state = obrace;
-                               }
-                               else {
-                                       goto stop;
-                               }
-                               break;
-                       case num1:
-                               if (*p == ',') {
-                                       if (!rspamd_strtoul(c, p - c, &r)) {
-                                               goto stop;
-                                       }
-
-                                       p++;
-                                       state = skip_spaces;
-                                       next_state = num2;
-                               }
-                               else if (!g_ascii_isdigit (*p)) {
-                                       goto stop;
-                               }
-                               else {
-                                       p++;
-                               }
-                               break;
-                       case num2:
-                               if (*p == ',') {
-                                       if (!rspamd_strtoul(c, p - c, &g)) {
-                                               goto stop;
-                                       }
-
-                                       p++;
-                                       state = skip_spaces;
-                                       next_state = num3;
-                               }
-                               else if (!g_ascii_isdigit (*p)) {
-                                       goto stop;
-                               }
-                               else {
-                                       p++;
-                               }
-                               break;
-                       case num3:
-                               if (*p == ',') {
-                                       if (!rspamd_strtoul(c, p - c, &b)) {
-                                               goto stop;
-                                       }
-
-                                       valid = TRUE;
-                                       p++;
-                                       state = skip_spaces;
-                                       next_state = num4;
-                               }
-                               else if (*p == ')') {
-                                       if (!rspamd_strtoul(c, p - c, &b)) {
-                                               goto stop;
-                                       }
-
-                                       valid = TRUE;
-                                       goto stop;
-                               }
-                               else if (!g_ascii_isdigit (*p)) {
-                                       goto stop;
-                               }
-                               else {
-                                       p++;
-                               }
-                               break;
-                       case num4:
-                               if (*p == ',') {
-                                       if (!rspamd_strtoul(c, p - c, &opacity)) {
-                                               goto stop;
-                                       }
-
-                                       valid = TRUE;
-                                       goto stop;
-                               }
-                               else if (*p == ')') {
-                                       if (!rspamd_strtoul(c, p - c, &opacity)) {
-                                               goto stop;
-                                       }
-
-                                       valid = TRUE;
-                                       goto stop;
-                               }
-                               else if (!g_ascii_isdigit (*p)) {
-                                       goto stop;
-                               }
-                               else {
-                                       p++;
-                               }
-                               break;
-                       case skip_spaces:
-                               if (!g_ascii_isspace (*p)) {
-                                       c = p;
-                                       state = next_state;
-                               }
-                               else {
-                                       p++;
-                               }
-                               break;
-                       }
-               }
-
-stop:
-
-               if (valid) {
-                       cl->d.comp.r = r;
-                       cl->d.comp.g = g;
-                       cl->d.comp.b = b;
-                       cl->d.comp.alpha = opacity;
-                       cl->valid = TRUE;
-               }
-       }
-       else {
-               auto maybe_color_value =
-                               rspamd::css::css_value::maybe_color_from_string(input);
-
-               if (maybe_color_value.has_value()) {
-                       auto color = maybe_color_value->to_color().value();
-                       cl->d.val = color.to_number();
-                       cl->d.comp.alpha = 255; /* Non transparent */
-               }
-       }
-}
-
-/*
- * Target is used for in and out if this function returns TRUE
- */
-static auto
-html_process_css_size(const gchar *suffix, gsize len,
-                                                        double &tgt)  -> bool
-{
-       gdouble sz = tgt;
-       gboolean ret = FALSE;
-
-       if (len >= 2) {
-               if (memcmp(suffix, "px", 2) == 0) {
-                       sz = (guint) sz; /* Round to number */
-                       ret = TRUE;
-               }
-               else if (memcmp(suffix, "em", 2) == 0) {
-                       /* EM is 16 px, so multiply and round */
-                       sz = (guint) (sz * 16.0);
-                       ret = TRUE;
-               }
-               else if (len >= 3 && memcmp(suffix, "rem", 3) == 0) {
-                       /* equal to EM in our case */
-                       sz = (guint) (sz * 16.0);
-                       ret = TRUE;
-               }
-               else if (memcmp(suffix, "ex", 2) == 0) {
-                       /*
-                        * Represents the x-height of the element's font.
-                        * On fonts with the "x" letter, this is generally the height
-                        * of lowercase letters in the font; 1ex = 0.5em in many fonts.
-                        */
-                       sz = (guint) (sz * 8.0);
-                       ret = TRUE;
-               }
-               else if (memcmp(suffix, "vw", 2) == 0) {
-                       /*
-                        * Vewport width in percentages:
-                        * we assume 1% of viewport width as 8px
-                        */
-                       sz = (guint) (sz * 8.0);
-                       ret = TRUE;
-               }
-               else if (memcmp(suffix, "vh", 2) == 0) {
-                       /*
-                        * Vewport height in percentages
-                        * we assume 1% of viewport width as 6px
-                        */
-                       sz = (guint) (sz * 6.0);
-                       ret = TRUE;
-               }
-               else if (len >= 4 && memcmp(suffix, "vmax", 4) == 0) {
-                       /*
-                        * Vewport width in percentages
-                        * we assume 1% of viewport width as 6px
-                        */
-                       sz = (guint) (sz * 8.0);
-                       ret = TRUE;
-               }
-               else if (len >= 4 && memcmp(suffix, "vmin", 4) == 0) {
-                       /*
-                        * Vewport height in percentages
-                        * we assume 1% of viewport width as 6px
-                        */
-                       sz = (guint) (sz * 6.0);
-                       ret = TRUE;
-               }
-               else if (memcmp(suffix, "pt", 2) == 0) {
-                       sz = (guint) (sz * 96.0 / 72.0); /* One point. 1pt = 1/72nd of 1in */
-                       ret = TRUE;
-               }
-               else if (memcmp(suffix, "cm", 2) == 0) {
-                       sz = (guint) (sz * 96.0 / 2.54); /* 96px/2.54 */
-                       ret = TRUE;
-               }
-               else if (memcmp(suffix, "mm", 2) == 0) {
-                       sz = (guint) (sz * 9.6 / 2.54); /* 9.6px/2.54 */
-                       ret = TRUE;
-               }
-               else if (memcmp(suffix, "in", 2) == 0) {
-                       sz = (guint) (sz * 96.0); /* 96px */
-                       ret = TRUE;
-               }
-               else if (memcmp(suffix, "pc", 2) == 0) {
-                       sz = (guint) (sz * 96.0 / 6.0); /* 1pc = 12pt = 1/6th of 1in. */
-                       ret = TRUE;
-               }
-       }
-       else if (suffix[0] == '%') {
-               /* Percentages from 16 px */
-               sz = (guint) (sz / 100.0 * 16.0);
-               ret = TRUE;
-       }
-
-       if (ret) {
-               tgt = sz;
-       }
-
-       return ret;
-}
-
-static auto
-html_process_font_size(const gchar *line, guint len, guint &fs,
-                                                         gboolean is_css) -> void
-{
-       const gchar *p = line, *end = line + len;
-       gchar *err = NULL, numbuf[64];
-       gdouble sz = 0;
-       gboolean failsafe = FALSE;
-
-       while (p < end && g_ascii_isspace (*p)) {
-               p++;
-               len--;
-       }
-
-       if (g_ascii_isdigit (*p)) {
-               rspamd_strlcpy(numbuf, p, MIN (sizeof(numbuf), len + 1));
-               sz = strtod(numbuf, &err);
-
-               /* Now check leftover */
-               if (sz < 0) {
-                       sz = 0;
-               }
-       }
-       else {
-               /* Ignore the rest */
-               failsafe = TRUE;
-               sz = is_css ? 16 : 1;
-               /* TODO: add textual fonts descriptions */
-       }
-
-       if (err && *err != '\0') {
-               const gchar *e = err;
-               gsize slen;
-
-               /* Skip spaces */
-               while (*e && g_ascii_isspace (*e)) {
-                       e++;
-               }
-
-               /* Lowercase */
-               slen = strlen(e);
-               rspamd_str_lc((gchar *) e, slen);
-
-               if (!html_process_css_size(e, slen, sz)) {
-                       failsafe = TRUE;
-               }
-       }
-       else {
-               /* Failsafe naked number */
-               failsafe = TRUE;
-       }
-
-       if (failsafe) {
-               if (is_css) {
-                       /*
-                        * In css mode we usually ignore sizes, but let's treat
-                        * small sizes specially
-                        */
-                       if (sz < 1) {
-                               sz = 0;
-                       }
-                       else {
-                               sz = 16; /* Ignore */
-                       }
-               }
-               else {
-                       /* In non-css mode we have to check legacy size */
-                       sz = sz >= 1 ? sz * 16 : 16;
-               }
-       }
-
-       if (sz > 32) {
-               sz = 32;
-       }
-
-       fs = sz;
-}
-
-static void
-html_process_style(rspamd_mempool_t *pool, struct html_block *bl,
-                                  struct html_content *hc,
-                                  std::string_view style)
-{
-       const gchar *p, *c, *end, *key = NULL;
-       enum {
-               read_key,
-               read_colon,
-               read_value,
-               skip_spaces,
-       } state = skip_spaces, next_state = read_key;
-       guint klen = 0;
-       gdouble opacity = 1.0;
-
-       p = style.data();
-       c = p;
-       end = p + style.size();
-
-       while (p <= end) {
-               switch (state) {
-               case read_key:
-                       if (p == end || *p == ':') {
-                               key = c;
-                               klen = p - c;
-                               state = skip_spaces;
-                               next_state = read_value;
-                       }
-                       else if (g_ascii_isspace (*p)) {
-                               key = c;
-                               klen = p - c;
-                               state = skip_spaces;
-                               next_state = read_colon;
-                       }
-
-                       p++;
-                       break;
-
-               case read_colon:
-                       if (p == end || *p == ':') {
-                               state = skip_spaces;
-                               next_state = read_value;
-                       }
-
-                       p++;
-                       break;
-
-               case read_value:
-                       if (p == end || *p == ';') {
-                               if (key && klen && p - c > 0) {
-                                       if ((klen == 5 && g_ascii_strncasecmp(key, "color", 5) == 0)
-                                               || (klen == 10 && g_ascii_strncasecmp(key, "font-color", 10) == 0)) {
-
-                                               html_process_color({c, (std::size_t)(p - c)}, &bl->font_color);
-                                               msg_debug_html ("got color: %xd", bl->font_color.d.val);
-                                       }
-                                       else if ((klen == 16 && g_ascii_strncasecmp(key,
-                                                       "background-color", 16) == 0) ||
-                                                        (klen == 10 && g_ascii_strncasecmp(key,
-                                                                        "background", 10) == 0)) {
-
-                                               html_process_color({c, (std::size_t)(p - c)}, &bl->background_color);
-                                               msg_debug_html ("got bgcolor: %xd", bl->background_color.d.val);
-                                       }
-                                       else if (klen == 7 && g_ascii_strncasecmp(key, "display", 7) == 0) {
-                                               if (p - c >= 4 && rspamd_substring_search_caseless(c, p - c,
-                                                               "none", 4) != -1) {
-                                                       bl->visible = FALSE;
-                                                       msg_debug_html ("tag is not visible");
-                                               }
-                                       }
-                                       else if (klen == 9 &&
-                                                        g_ascii_strncasecmp(key, "font-size", 9) == 0) {
-                                               html_process_font_size(c, p - c,
-                                                               bl->font_size, TRUE);
-                                               msg_debug_html ("got font size: %ud", bl->font_size);
-                                       }
-                                       else if (klen == 7 &&
-                                                        g_ascii_strncasecmp(key, "opacity", 7) == 0) {
-                                               gchar numbuf[64];
-
-                                               rspamd_strlcpy(numbuf, c,
-                                                               MIN (sizeof(numbuf), p - c + 1));
-                                               opacity = strtod(numbuf, NULL);
-
-                                               if (opacity > 1) {
-                                                       opacity = 1;
-                                               }
-                                               else if (opacity < 0) {
-                                                       opacity = 0;
-                                               }
-
-                                               bl->font_color.d.comp.alpha = (guint8) (opacity * 255.0);
-                                       }
-                                       else if (klen == 10 &&
-                                                        g_ascii_strncasecmp(key, "visibility", 10) == 0) {
-                                               if (p - c >= 6 && rspamd_substring_search_caseless(c,
-                                                               p - c,
-                                                               "hidden", 6) != -1) {
-                                                       bl->visible = FALSE;
-                                                       msg_debug_html ("tag is not visible");
-                                               }
-                                       }
-                               }
-
-                               key = NULL;
-                               klen = 0;
-                               state = skip_spaces;
-                               next_state = read_key;
-                       }
-
-                       p++;
-                       break;
-
-               case skip_spaces:
-                       if (p < end && !g_ascii_isspace (*p)) {
-                               c = p;
-                               state = next_state;
-                       }
-                       else {
-                               p++;
-                       }
-
-                       break;
-               }
-       }
-}
-
 static auto
 html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag,
                                           struct html_content *hc) -> void
 {
-       auto *bl = rspamd_mempool_alloc0_type (pool, struct html_block);
-       bl->tag = tag;
-       bl->visible = TRUE;
-       bl->font_size = (guint) -1;
-       bl->font_color.d.comp.alpha = 255;
+       std::optional<css::css_value> maybe_fgcolor, maybe_bgcolor;
 
        for (const auto &param : tag->parameters) {
                if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_COLOR) {
-                       html_process_color(param.value, &bl->font_color);
-                       msg_debug_html ("tag %*s; got color: %xd",
-                                       (int) tag->name.size(), tag->name.data(),
-                                       bl->font_color.d.val);
+                       maybe_fgcolor = css::css_value::maybe_color_from_string(param.value);
                }
 
                if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR) {
-                       html_process_color(param.value, &bl->background_color);
-                       msg_debug_html ("tag %*s; got bgcolor: %xd",
-                                       (int) tag->name.size(), tag->name.data(),
-                                       bl->background_color.d.val);
-                       if (tag->id == Tag_BODY) {
-                               /* Set global background color */
-                               memcpy(&hc->bgcolor, &bl->background_color,
-                                               sizeof(hc->bgcolor));
-                       }
+                       maybe_bgcolor = css::css_value::maybe_color_from_string(param.value);
                }
 
                if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) {
-                       html_process_style(pool, bl, hc, param.value);
-                       msg_debug_html ("tag: %*s; got style: %*s",
-                                       (int) tag->name.size(), tag->name.data(),
-                                       (int) bl->style.len, bl->style.begin);
-               }
-
-               if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_CLASS) {
-                       rspamd_ftok_t fstr;
-                       fstr.begin = param.value.data();
-                       fstr.len = param.value.size();
-                       bl->html_class = rspamd_mempool_ftokdup (pool, &fstr);
-                       msg_debug_html ("tag: %*s; got class: %s",
-                                       (int) tag->name.size(), tag->name.data(), bl->html_class);
+                       tag->block = rspamd::css::parse_css_declaration(pool, param.value);
                }
        }
 
-       hc->blocks.push_back(bl);
-       tag->block = bl;
-}
-
-static auto
-html_propagate_style(struct html_content *hc,
-                                                       struct html_tag *tag,
-                                                       struct html_block *bl,
-                                                       std::vector<struct html_block *> &blocks) -> void
-{
-       gboolean push_block = FALSE;
-
-       if (blocks.empty()) {
-               /* No blocks to propagate */
-               return;
-       }
-       /* Propagate from the parent if needed */
-       auto *bl_parent = blocks.back();
-
-       if (!bl->background_color.valid) {
-               /* Try to propagate background color from parent nodes */
-               if (bl_parent->background_color.valid) {
-                       memcpy(&bl->background_color, &bl_parent->background_color,
-                                       sizeof(bl->background_color));
-               }
-       }
-       else {
-               push_block = TRUE;
+       if (!tag->block) {
+               tag->block = html_block::undefined_html_block_pool(pool);
        }
 
-       if (!bl->font_color.valid) {
-               /* Try to propagate background color from parent nodes */
-               if (bl_parent->font_color.valid) {
-                       memcpy(&bl->font_color, &bl_parent->font_color,
-                                       sizeof(bl->font_color));
-               }
-       }
-       else {
-               push_block = TRUE;
-       }
-
-       /* Propagate font size */
-       if (bl->font_size == (guint) -1) {
-               if (bl_parent->font_size != (guint) -1) {
-                       bl->font_size = bl_parent->font_size;
-               }
-       }
-       else {
-               push_block = TRUE;
+       if (maybe_fgcolor) {
+               tag->block->set_fgcolor(maybe_fgcolor->to_color().value());
        }
 
-       /* Set bgcolor to the html bgcolor and font color to black as a last resort */
-       if (!bl->font_color.valid) {
-               /* Don't touch opacity as it can be set separately */
-               bl->font_color.d.comp.r = 0;
-               bl->font_color.d.comp.g = 0;
-               bl->font_color.d.comp.b = 0;
-               bl->font_color.valid = TRUE;
-       }
-       else {
-               push_block = TRUE;
-       }
-
-       if (!bl->background_color.valid) {
-               memcpy(&bl->background_color, &hc->bgcolor, sizeof(hc->bgcolor));
-       }
-       else {
-               push_block = TRUE;
-       }
-
-       if (bl->font_size == (guint) -1) {
-               bl->font_size = 16; /* Default for browsers */
-       }
-       else {
-               push_block = TRUE;
-       }
-
-       if (push_block && !(tag->flags & FL_CLOSED)) {
-               blocks.push_back(bl);
+       if (maybe_bgcolor) {
+               /*
+                * Apply the colour parsed from the bgcolor attribute. Reading
+                * maybe_fgcolor here would set the wrong colour and would
+                * dereference an empty optional when only bgcolor is present.
+                */
+               tag->block->set_bgcolor(maybe_bgcolor->to_color().value());
        }
 }
 
@@ -2186,7 +1612,8 @@ html_process_input(rspamd_mempool_t *pool,
                                        html_process_link_tag(pool, cur_tag, hc, url_set,
                                                        part_urls);
                                }
-                               else if (cur_tag->flags & FL_BLOCK) {
+
+                               if (cur_tag->flags & FL_BLOCK) {
                                        struct html_block *bl;
 
                                        if (cur_tag->flags & FL_CLOSING) {
@@ -2197,27 +1624,6 @@ html_process_input(rspamd_mempool_t *pool,
                                        }
                                        else {
                                                html_process_block_tag(pool, cur_tag, hc);
-                                               bl = cur_tag->block;
-
-                                               if (bl) {
-                                                       html_propagate_style(hc, cur_tag,
-                                                                       bl, blocks_stack);
-
-                                                       /* Check visibility */
-                                                       if (bl->font_size < 3 ||
-                                                               bl->font_color.d.comp.alpha < 10) {
-
-                                                               bl->visible = FALSE;
-                                                               msg_debug_html ("tag is not visible: font size: "
-                                                                                               "%d, alpha: %d",
-                                                                               (int)bl->font_size,
-                                                                               (int)bl->font_color.d.comp.alpha);
-                                                       }
-
-                                                       if (!bl->visible) {
-                                                               state = content_ignore;
-                                                       }
-                                               }
                                        }
                                }
                        }
index fc1dda141a12de9c95795372cf9c1c21cb0a14fd..c75d84ea4855760162209f210e8c4c3106782834 100644 (file)
@@ -40,7 +40,6 @@ struct html_content {
        guint total_tags = 0;
        std::vector<bool> tags_seen;
        std::vector<html_image *> images;
-       std::vector<html_block *> blocks;
        std::vector<std::unique_ptr<struct html_tag>> all_tags;
        std::string parsed;
        void *css_style;
@@ -48,7 +47,6 @@ struct html_content {
        /* Preallocate and reserve all internal structures */
        html_content() {
                tags_seen.resize(N_TAGS, false);
-               blocks.reserve(128);
                all_tags.reserve(128);
                parsed.reserve(256);
        }
index 3978bcf1ed97f9cd4d0166883ed53fa53f513bcf..91348086411a1e2ef9e172557ed77f4a23c049c9 100644 (file)
@@ -171,6 +171,17 @@ struct html_block {
                                                  rspamd::css::css_display_value::DISPLAY_NORMAL,
                                                  12};
        }
+       /**
+        * Allocates an html block from the given pool and clears its mask,
+        * i.e. produces a block with no defined values
+        * @param pool memory pool the block is allocated from
+        * @return pointer to the newly allocated block
+        */
+       static auto undefined_html_block_pool(rspamd_mempool_t *pool) -> html_block* {
+               auto *blank = rspamd_mempool_alloc_type(pool, html_block);
+               /*
+                * Only the mask is assigned here.
+                * NOTE(review): presumably a zero mask marks every other field
+                * as unset, so they are left untouched - confirm.
+                */
+               blank->mask = 0;
+
+               return blank;
+       }
 };
 
 }