about | summary | refs | log | tree | commit | diff | stats
path: root/src/libserver
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru>, 2021-05-25 10:42:57 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>, 2021-05-27 15:05:21 +0100
commit: 2bf1323550564fc5d99d9dec52abf99ce110ca91 (patch)
tree: 459f1578b9085b793d7d0a6d2472c73f7c326d98 /src/libserver
parent: c9a5719fb6187b59229465be631972ace9f32896 (diff)
download: rspamd-2bf1323550564fc5d99d9dec52abf99ce110ca91.tar.gz
download: rspamd-2bf1323550564fc5d99d9dec52abf99ce110ca91.zip
[Rework] Html: Start refactoring of the html tags handling
Diffstat (limited to 'src/libserver')
-rw-r--r-- src/libserver/html/html.cxx 570
-rw-r--r-- src/libserver/html/html.h 50
-rw-r--r-- src/libserver/html/html_tag.hxx 56
-rw-r--r-- src/libserver/html/html_tag_defs.hxx 2
4 files changed, 214 insertions, 464 deletions
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index fcb6ba8f7..c167b004f 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -29,8 +29,11 @@
#include "html_tag_defs.hxx"
#include "html_entities.hxx"
+#include "html_tag.hxx"
#include <vector>
+#include <frozen/unordered_map.h>
+#include <frozen/string.h>
#include <unicode/uversion.h>
#include <unicode/ucnv.h>
@@ -44,8 +47,22 @@ static const guint max_tags = 8192; /* Ignore tags if this maximum is reached */
static const html_tags_storage html_tags_defs;
-
-}
+auto html_components_map = frozen::make_unordered_map<frozen::string, html_component_type>(
+ {
+ {"name", html_component_type::RSPAMD_HTML_COMPONENT_NAME},
+ {"href", html_component_type::RSPAMD_HTML_COMPONENT_HREF},
+ {"src", html_component_type::RSPAMD_HTML_COMPONENT_HREF},
+ {"action", html_component_type::RSPAMD_HTML_COMPONENT_HREF},
+ {"color", html_component_type::RSPAMD_HTML_COMPONENT_COLOR},
+ {"bgcolor", html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR},
+ {"style", html_component_type::RSPAMD_HTML_COMPONENT_STYLE},
+ {"class", html_component_type::RSPAMD_HTML_COMPONENT_CLASS},
+ {"width", html_component_type::RSPAMD_HTML_COMPONENT_WIDTH},
+ {"height", html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT},
+ {"size", html_component_type::RSPAMD_HTML_COMPONENT_SIZE},
+ {"rel", html_component_type::RSPAMD_HTML_COMPONENT_REL},
+ {"alt", html_component_type::RSPAMD_HTML_COMPONENT_ALT},
+ });
#define msg_debug_html(...) rspamd_conditional_debug_fast (NULL, NULL, \
rspamd_html_log_id, "html", pool->tag.uid, \
@@ -54,13 +71,6 @@ static const html_tags_storage html_tags_defs;
INIT_LOG_MODULE(html)
-/* Unconverted C part */
-
-static struct rspamd_url *rspamd_html_process_url(rspamd_mempool_t *pool,
- const gchar *start, guint len,
- struct html_tag_component *comp);
-
-
static gboolean
rspamd_html_check_balance(GNode *node, GNode **cur_level)
{
@@ -92,249 +102,12 @@ rspamd_html_check_balance(GNode *node, GNode **cur_level)
}
static gboolean
-rspamd_url_is_subdomain(rspamd_ftok_t *t1, rspamd_ftok_t *t2)
-{
- const gchar *p1, *p2;
-
- p1 = t1->begin + t1->len - 1;
- p2 = t2->begin + t2->len - 1;
-
- /* Skip trailing dots */
- while (p1 > t1->begin) {
- if (*p1 != '.') {
- break;
- }
-
- p1--;
- }
-
- while (p2 > t2->begin) {
- if (*p2 != '.') {
- break;
- }
-
- p2--;
- }
-
- while (p1 > t1->begin && p2 > t2->begin) {
- if (*p1 != *p2) {
- break;
- }
-
- p1--;
- p2--;
- }
-
- if (p2 == t2->begin) {
- /* p2 can be subdomain of p1 if *p1 is '.' */
- if (p1 != t1->begin && *(p1 - 1) == '.') {
- return TRUE;
- }
- }
- else if (p1 == t1->begin) {
- if (p2 != t2->begin && *(p2 - 1) == '.') {
- return TRUE;
- }
- }
-
- return FALSE;
-}
-
-static void
-rspamd_html_url_is_phished(rspamd_mempool_t *pool,
- struct rspamd_url *href_url,
- const guchar *url_text,
- gsize len,
- gboolean *url_found,
- struct rspamd_url **ptext_url)
+rspamd_html_process_tag(rspamd_mempool_t *pool,
+ struct html_content *hc,
+ struct html_tag *tag,
+ GNode **cur_level,
+ gboolean *balanced)
{
- struct rspamd_url *text_url;
- rspamd_ftok_t disp_tok, href_tok;
- gint rc;
- goffset url_pos;
- gchar *url_str = NULL, *idn_hbuf;
- const guchar *end = url_text + len, *p;
-#if U_ICU_VERSION_MAJOR_NUM >= 46
- static UIDNA *udn;
- UErrorCode uc_err = U_ZERO_ERROR;
- UIDNAInfo uinfo = UIDNA_INFO_INITIALIZER;
-#endif
-
- *url_found = FALSE;
-#if U_ICU_VERSION_MAJOR_NUM >= 46
- if (udn == NULL) {
- udn = uidna_openUTS46(UIDNA_DEFAULT, &uc_err);
-
- if (uc_err != U_ZERO_ERROR) {
- msg_err_pool ("cannot init idna converter: %s", u_errorName(uc_err));
- }
- }
-#endif
-
- while (url_text < end && g_ascii_isspace (*url_text)) {
- url_text++;
- }
-
- if (end > url_text + 4 &&
- rspamd_url_find(pool, (const gchar *)url_text, (gsize)(end - url_text), &url_str,
- RSPAMD_URL_FIND_ALL,
- &url_pos, NULL) &&
- url_str != NULL) {
- if (url_pos > 0) {
- /*
- * We have some url at some offset, so we need to check what is
- * at the start of the text
- */
- p = url_text;
-
- while (p < url_text + url_pos) {
- if (!g_ascii_isspace (*p)) {
- *url_found = FALSE;
- return;
- }
-
- p++;
- }
- }
-
- text_url = rspamd_mempool_alloc0_type (pool, struct rspamd_url);
- rc = rspamd_url_parse(text_url, url_str, strlen(url_str), pool,
- RSPAMD_URL_PARSE_TEXT);
-
- if (rc == URI_ERRNO_OK) {
- disp_tok.len = text_url->hostlen;
- disp_tok.begin = rspamd_url_host_unsafe (text_url);
-#if U_ICU_VERSION_MAJOR_NUM >= 46
- if (rspamd_substring_search_caseless(rspamd_url_host_unsafe (text_url),
- text_url->hostlen, "xn--", 4) != -1) {
- idn_hbuf = (char *)rspamd_mempool_alloc (pool, text_url->hostlen * 2 + 1);
- /* We need to convert it to the normal value first */
- disp_tok.len = uidna_nameToUnicodeUTF8(udn,
- rspamd_url_host_unsafe (text_url), text_url->hostlen,
- idn_hbuf, text_url->hostlen * 2 + 1, &uinfo, &uc_err);
-
- if (uc_err != U_ZERO_ERROR) {
- msg_err_pool ("cannot convert to IDN: %s",
- u_errorName(uc_err));
- disp_tok.len = text_url->hostlen;
- }
- else {
- disp_tok.begin = idn_hbuf;
- }
- }
-#endif
- href_tok.len = href_url->hostlen;
- href_tok.begin = rspamd_url_host_unsafe (href_url);
-#if U_ICU_VERSION_MAJOR_NUM >= 46
- if (rspamd_substring_search_caseless(rspamd_url_host_unsafe (href_url),
- href_url->hostlen, "xn--", 4) != -1) {
- idn_hbuf = (char *)rspamd_mempool_alloc (pool, href_url->hostlen * 2 + 1);
- /* We need to convert it to the normal value first */
- href_tok.len = uidna_nameToUnicodeUTF8(udn,
- rspamd_url_host_unsafe (href_url), href_url->hostlen,
- idn_hbuf, href_url->hostlen * 2 + 1, &uinfo, &uc_err);
-
- if (uc_err != U_ZERO_ERROR) {
- msg_err_pool ("cannot convert to IDN: %s",
- u_errorName(uc_err));
- href_tok.len = href_url->hostlen;
- }
- else {
- href_tok.begin = idn_hbuf;
- }
- }
-#endif
- if (rspamd_ftok_casecmp(&disp_tok, &href_tok) != 0 &&
- text_url->tldlen > 0 && href_url->tldlen > 0) {
-
- /* Apply the same logic for TLD */
- disp_tok.len = text_url->tldlen;
- disp_tok.begin = rspamd_url_tld_unsafe (text_url);
-#if U_ICU_VERSION_MAJOR_NUM >= 46
- if (rspamd_substring_search_caseless(rspamd_url_tld_unsafe (text_url),
- text_url->tldlen, "xn--", 4) != -1) {
- idn_hbuf = (char *)rspamd_mempool_alloc (pool, text_url->tldlen * 2 + 1);
- /* We need to convert it to the normal value first */
- disp_tok.len = uidna_nameToUnicodeUTF8(udn,
- rspamd_url_tld_unsafe (text_url), text_url->tldlen,
- idn_hbuf, text_url->tldlen * 2 + 1, &uinfo, &uc_err);
-
- if (uc_err != U_ZERO_ERROR) {
- msg_err_pool ("cannot convert to IDN: %s",
- u_errorName(uc_err));
- disp_tok.len = text_url->tldlen;
- }
- else {
- disp_tok.begin = idn_hbuf;
- }
- }
-#endif
- href_tok.len = href_url->tldlen;
- href_tok.begin = rspamd_url_tld_unsafe (href_url);
-#if U_ICU_VERSION_MAJOR_NUM >= 46
- if (rspamd_substring_search_caseless(rspamd_url_tld_unsafe (href_url),
- href_url->tldlen, "xn--", 4) != -1) {
- idn_hbuf = (char*)rspamd_mempool_alloc (pool, href_url->tldlen * 2 + 1);
- /* We need to convert it to the normal value first */
- href_tok.len = uidna_nameToUnicodeUTF8(udn,
- rspamd_url_tld_unsafe (href_url), href_url->tldlen,
- idn_hbuf, href_url->tldlen * 2 + 1, &uinfo, &uc_err);
-
- if (uc_err != U_ZERO_ERROR) {
- msg_err_pool ("cannot convert to IDN: %s",
- u_errorName(uc_err));
- href_tok.len = href_url->tldlen;
- }
- else {
- href_tok.begin = idn_hbuf;
- }
- }
-#endif
- if (rspamd_ftok_casecmp(&disp_tok, &href_tok) != 0) {
- /* Check if one url is a subdomain for another */
-
- if (!rspamd_url_is_subdomain(&disp_tok, &href_tok)) {
- href_url->flags |= RSPAMD_URL_FLAG_PHISHED;
- href_url->linked_url = text_url;
- text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
- }
- }
- }
-
- *ptext_url = text_url;
- *url_found = TRUE;
- }
- else {
- /*
- * We have found something that looks like an url but it was
- * not parsed correctly.
- * Sometimes it means an obfuscation attempt, so we have to check
- * what's inside of the text
- */
- gboolean obfuscation_found = FALSE;
-
- if (len > 4 && g_ascii_strncasecmp((char *)url_text, "http", 4) == 0 &&
- rspamd_substring_search((char *)url_text, len, "://", 3) != -1) {
- /* Clearly an obfuscation attempt */
- obfuscation_found = TRUE;
- }
-
- msg_info_pool ("extract of url '%s' failed: %s; obfuscation detected: %s",
- url_str,
- rspamd_url_strerror(rc),
- obfuscation_found ? "yes" : "no");
-
- if (obfuscation_found) {
- href_url->flags |= RSPAMD_URL_FLAG_PHISHED | RSPAMD_URL_FLAG_OBSCURED;
- }
- }
- }
-
-}
-
-static gboolean
-rspamd_html_process_tag(rspamd_mempool_t *pool, struct html_content *hc,
- struct html_tag *tag, GNode **cur_level, gboolean *balanced) {
GNode *nnode;
struct html_tag *parent;
@@ -454,115 +227,49 @@ rspamd_html_process_tag(rspamd_mempool_t *pool, struct html_content *hc,
return TRUE;
}
-#define NEW_COMPONENT(comp_type) do { \
- comp = (decltype(comp))rspamd_mempool_alloc (pool, sizeof (*comp)); \
- comp->type = (comp_type); \
- comp->start = NULL; \
- comp->len = 0; \
- g_queue_push_tail (tag->params, comp); \
- ret = TRUE; \
-} while(0)
-
-static gboolean
-rspamd_html_parse_tag_component(rspamd_mempool_t *pool,
- const guchar *begin, const guchar *end,
- struct html_tag *tag) {
- struct html_tag_component *comp;
- gint len;
- gboolean ret = FALSE;
- gchar *p;
+static auto
+find_tag_component_name(rspamd_mempool_t *pool,
+ const gchar *begin,
+ const gchar *end) -> std::optional<html_component_type>
+{
if (end <= begin) {
- return FALSE;
+ return std::nullopt;
}
- p = (char *)rspamd_mempool_alloc (pool, end - begin);
+ auto *p = rspamd_mempool_alloc_buffer(pool, end - begin);
memcpy(p, begin, end - begin);
- len = rspamd_html_decode_entitles_inplace(p, end - begin);
+ auto len = decode_html_entitles_inplace(p, end - begin);
+ auto known_component_it = html_components_map.find({p, len});
- if (len == 3) {
- if (g_ascii_strncasecmp(p, "src", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_HREF);
- }
- else if (g_ascii_strncasecmp(p, "rel", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_REL);
- }
- else if (g_ascii_strncasecmp(p, "alt", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_ALT);
- }
+ if (known_component_it != html_components_map.end()) {
+ return known_component_it->second;
}
- else if (len == 4) {
- if (g_ascii_strncasecmp(p, "href", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_HREF);
- }
- }
- else if (len == 6) {
- if (g_ascii_strncasecmp(p, "action", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_HREF);
- }
+ else {
+ return std::nullopt;
}
+}
- if (tag->id == Tag_IMG) {
- /* Check width and height if presented */
- if (len == 5 && g_ascii_strncasecmp(p, "width", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_WIDTH);
- }
- else if (len == 6 && g_ascii_strncasecmp(p, "height", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_HEIGHT);
- }
- else if (g_ascii_strncasecmp(p, "style", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_STYLE);
- }
- }
- else if (tag->id == Tag_FONT) {
- if (len == 5) {
- if (g_ascii_strncasecmp(p, "color", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_COLOR);
- }
- else if (g_ascii_strncasecmp(p, "style", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_STYLE);
- }
- else if (g_ascii_strncasecmp(p, "class", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_CLASS);
- }
- }
- else if (len == 7) {
- if (g_ascii_strncasecmp(p, "bgcolor", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_BGCOLOR);
- }
- }
- else if (len == 4) {
- if (g_ascii_strncasecmp(p, "size", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_SIZE);
- }
- }
- }
- else if (tag->flags & FL_BLOCK) {
- if (len == 5) {
- if (g_ascii_strncasecmp(p, "color", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_COLOR);
- }
- else if (g_ascii_strncasecmp(p, "style", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_STYLE);
- }
- else if (g_ascii_strncasecmp(p, "class", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_CLASS);
- }
- }
- else if (len == 7) {
- if (g_ascii_strncasecmp(p, "bgcolor", len) == 0) {
- NEW_COMPONENT (RSPAMD_HTML_COMPONENT_BGCOLOR);
- }
- }
- }
+struct tag_content_parser_state {
+ int cur_state = 0;
+ const char *saved_p = nullptr;
+ std::optional<html_component_type> cur_component;
- return ret;
-}
+ void reset()
+ {
+ cur_state = 0;
+ saved_p = nullptr;
+ cur_component = std::nullopt;
+ }
+};
static inline void
-rspamd_html_parse_tag_content(rspamd_mempool_t *pool,
- struct html_content *hc, struct html_tag *tag, const guchar *in,
- gint *statep, guchar const **savep) {
+parse_tag_content(rspamd_mempool_t *pool,
+ struct html_content *hc,
+ struct html_tag *tag,
+ const char *in,
+ struct tag_content_parser_state parser_env)
+{
enum tag_parser_state {
parse_start = 0,
parse_name,
@@ -581,11 +288,36 @@ rspamd_html_parse_tag_content(rspamd_mempool_t *pool,
spaces_after_param,
ignore_bad_tag
} state;
-
gboolean store = FALSE;
- struct html_tag_component *comp;
- state = static_cast<enum tag_parser_state>(*statep);
+ state = static_cast<enum tag_parser_state>(parser_env.cur_state);
+
+ /*
+ * Stores tag component if it doesn't exist, performing copy of the
+ * value + decoding of the entities
+ * Parser env is set to clear the current html attribute fields (saved_p and
+ * cur_component)
+ */
+ auto store_tag_component = [&]() -> void {
+ if (parser_env.saved_p != nullptr && parser_env.cur_component &&
+ in > parser_env.saved_p) {
+
+ /* We ignore repeated attributes */
+ auto found_it = tag->parameters.find(parser_env.cur_component.value());
+
+ if (found_it == tag->parameters.end()) {
+ auto sz = (std::size_t)(in - parser_env.saved_p);
+ auto *s = rspamd_mempool_alloc_buffer(pool, sz);
+ memcpy(s, parser_env.saved_p, sz);
+ sz = rspamd_html_decode_entitles_inplace(s, in - parser_env.saved_p);
+ tag->parameters.emplace(parser_env.cur_component.value(),
+ std::string_view{s, sz});
+ }
+ }
+
+ parser_env.saved_p = nullptr;
+ parser_env.cur_component = std::nullopt;
+ };
switch (state) {
case parse_start:
@@ -597,40 +329,39 @@ rspamd_html_parse_tag_content(rspamd_mempool_t *pool,
}
else if (g_ascii_isalpha (*in)) {
state = parse_name;
- tag->name.start = in;
+ tag->name = std::string_view{in, 0};
}
break;
case parse_name:
if (g_ascii_isspace (*in) || *in == '>' || *in == '/') {
- g_assert (in >= tag->name.start);
+ const auto *start = tag->name.begin();
+ g_assert (in >= start);
if (*in == '/') {
tag->flags |= FL_CLOSED;
}
- tag->name.len = in - tag->name.start;
+ tag->name = std::string_view{start, (std::size_t)(in - start)};
- if (tag->name.len == 0) {
+ if (tag->name.empty()) {
hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
tag->id = -1;
tag->flags |= FL_BROKEN;
state = ignore_bad_tag;
}
else {
- gchar *s;
- /* We CANNOT safely modify tag's name here, as it is already parsed */
-
- s = (char *)rspamd_mempool_alloc (pool, tag->name.len + 1);
- memcpy(s, tag->name.start, tag->name.len);
- tag->name.len = rspamd_html_decode_entitles_inplace(s,
- tag->name.len);
- tag->name.start = (guchar *)s;
- tag->name.len = rspamd_str_lc_utf8(s, tag->name.len);
- s[tag->name.len] = '\0';
+ /*
+ * Copy tag name to the temporary buffer for modifications
+ */
+ auto *s = rspamd_mempool_alloc_buffer(pool, tag->name.size() + 1);
+ rspamd_strlcpy(s, tag->name.data(), tag->name.size());
+ auto nsize = rspamd_html_decode_entitles_inplace(s,
+ tag->name.size());
+ nsize = rspamd_str_lc_utf8(s, nsize);
+ tag->name = std::string_view{s, nsize};
- const auto *tag_def = rspamd::html::html_tags_defs.by_name({
- (const char *)tag->name.start, tag->name.len});
+ const auto *tag_def = rspamd::html::html_tags_defs.by_name(tag->name);
if (tag_def == nullptr) {
hc->flags |= RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS;
@@ -647,11 +378,11 @@ rspamd_html_parse_tag_content(rspamd_mempool_t *pool,
break;
case parse_attr_name:
- if (*savep == NULL) {
+ if (parser_env.saved_p == nullptr) {
state = ignore_bad_tag;
}
else {
- const guchar *attr_name_end = in;
+ const auto *attr_name_end = in;
if (*in == '=') {
state = parse_equal;
@@ -661,7 +392,7 @@ rspamd_html_parse_tag_content(rspamd_mempool_t *pool,
state = parse_start_dquote;
attr_name_end = in - 1;
- while (attr_name_end > *savep) {
+ while (attr_name_end > parser_env.saved_p) {
if (!g_ascii_isalnum (*attr_name_end)) {
attr_name_end--;
}
@@ -683,7 +414,7 @@ rspamd_html_parse_tag_content(rspamd_mempool_t *pool,
state = parse_value;
attr_name_end = in - 1;
- while (attr_name_end > *savep) {
+ while (attr_name_end > parser_env.saved_p) {
if (!g_ascii_isalnum (*attr_name_end)) {
attr_name_end--;
}
@@ -699,12 +430,16 @@ rspamd_html_parse_tag_content(rspamd_mempool_t *pool,
return;
}
- if (!rspamd_html_parse_tag_component(pool, *savep, attr_name_end, tag)) {
+ parser_env.cur_component = find_tag_component_name(pool,
+ parser_env.saved_p,
+ attr_name_end);
+
+ if (!parser_env.cur_component) {
/* Ignore unknown params */
- *savep = NULL;
+ parser_env.saved_p = nullptr;
}
else if (state == parse_value) {
- *savep = in + 1;
+ parser_env.saved_p = in + 1;
}
}
@@ -712,7 +447,8 @@ rspamd_html_parse_tag_content(rspamd_mempool_t *pool,
case spaces_after_name:
if (!g_ascii_isspace (*in)) {
- *savep = in;
+ parser_env.saved_p = in;
+
if (*in == '/') {
tag->flags |= FL_CLOSED;
}
@@ -754,7 +490,7 @@ rspamd_html_parse_tag_content(rspamd_mempool_t *pool,
* now. We don't use them in fact...
*/
state = parse_attr_name;
- *savep = in;
+ parser_env.saved_p = in;
}
}
break;
@@ -767,9 +503,9 @@ rspamd_html_parse_tag_content(rspamd_mempool_t *pool,
state = parse_start_squote;
}
else if (!g_ascii_isspace (*in)) {
- if (*savep != NULL) {
+ if (parser_env.saved_p != nullptr) {
/* We need to save this param */
- *savep = in;
+ parser_env.saved_p = in;
}
state = parse_value;
}
@@ -786,9 +522,9 @@ rspamd_html_parse_tag_content(rspamd_mempool_t *pool,
state = parse_start_squote;
}
else {
- if (*savep != NULL) {
+ if (parser_env.saved_p != nullptr) {
/* We need to save this param */
- *savep = in;
+ parser_env.saved_p = in;
}
state = parse_value;
}
@@ -796,16 +532,16 @@ rspamd_html_parse_tag_content(rspamd_mempool_t *pool,
case parse_start_dquote:
if (*in == '"') {
- if (*savep != NULL) {
+ if (parser_env.saved_p != nullptr) {
/* We have an empty attribute value */
- savep = NULL;
+ parser_env.saved_p = nullptr;
}
state = spaces_after_param;
}
else {
- if (*savep != NULL) {
+ if (parser_env.saved_p != nullptr) {
/* We need to save this param */
- *savep = in;
+ parser_env.saved_p = in;
}
state = parse_dqvalue;
}
@@ -813,16 +549,16 @@ rspamd_html_parse_tag_content(rspamd_mempool_t *pool,
case parse_start_squote:
if (*in == '\'') {
- if (*savep != NULL) {
+ if (parser_env.saved_p != nullptr) {
/* We have an empty attribute value */
- savep = NULL;
+ parser_env.saved_p = nullptr;
}
state = spaces_after_param;
}
else {
- if (*savep != NULL) {
+ if (parser_env.saved_p != nullptr) {
/* We need to save this param */
- *savep = in;
+ parser_env.saved_p = in;
}
state = parse_sqvalue;
}
@@ -835,19 +571,7 @@ rspamd_html_parse_tag_content(rspamd_mempool_t *pool,
}
if (store) {
- if (*savep != NULL) {
- gchar *s;
-
- g_assert (tag->params != NULL);
- comp = (struct html_tag_component *)g_queue_peek_tail(tag->params);
- g_assert (comp != NULL);
- comp->len = in - *savep;
- s = (char *)rspamd_mempool_alloc (pool, comp->len);
- memcpy(s, *savep, comp->len);
- comp->len = rspamd_html_decode_entitles_inplace(s, comp->len);
- comp->start = (unsigned char *)s;
- *savep = NULL;
- }
+ store_tag_component();
}
break;
@@ -857,19 +581,7 @@ rspamd_html_parse_tag_content(rspamd_mempool_t *pool,
state = parse_end_squote;
}
if (store) {
- if (*savep != NULL) {
- gchar *s;
-
- g_assert (tag->params != NULL);
- comp = (struct html_tag_component *)g_queue_peek_tail(tag->params);
- g_assert (comp != NULL);
- comp->len = in - *savep;
- s = (char *)rspamd_mempool_alloc (pool, comp->len);
- memcpy(s, *savep, comp->len);
- comp->len = rspamd_html_decode_entitles_inplace(s, comp->len);
- comp->start = (unsigned char *)s;
- *savep = NULL;
- }
+ store_tag_component();
}
break;
@@ -884,19 +596,7 @@ rspamd_html_parse_tag_content(rspamd_mempool_t *pool,
}
if (store) {
- if (*savep != NULL) {
- gchar *s;
-
- g_assert (tag->params != NULL);
- comp = (struct html_tag_component *)g_queue_peek_tail(tag->params);
- g_assert (comp != NULL);
- comp->len = in - *savep;
- s = (char *)rspamd_mempool_alloc (pool, comp->len);
- memcpy(s, *savep, comp->len);
- comp->len = rspamd_html_decode_entitles_inplace(s, comp->len);
- comp->start = (unsigned char *)s;
- *savep = NULL;
- }
+ store_tag_component();
}
break;
@@ -911,7 +611,7 @@ rspamd_html_parse_tag_content(rspamd_mempool_t *pool,
else {
/* No space, proceed immediately to the attribute name */
state = parse_attr_name;
- *savep = in;
+ parser_env.saved_p = in;
}
break;
@@ -922,7 +622,7 @@ rspamd_html_parse_tag_content(rspamd_mempool_t *pool,
}
state = parse_attr_name;
- *savep = in;
+ parser_env.saved_p = in;
}
break;
@@ -930,9 +630,19 @@ rspamd_html_parse_tag_content(rspamd_mempool_t *pool,
break;
}
- *statep = state;
+ parser_env.cur_state = state;
+}
+
}
+/* Unconverted C part */
+
+static struct rspamd_url *rspamd_html_process_url(rspamd_mempool_t *pool,
+ const gchar *start, guint len,
+ struct html_tag_component *comp);
+
+
+
struct rspamd_url *
rspamd_html_process_url(rspamd_mempool_t *pool, const gchar *start, guint len,
diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h
index 83a2a7ee7..14217b2c9 100644
--- a/src/libserver/html/html.h
+++ b/src/libserver/html/html.h
@@ -1,5 +1,17 @@
-/*
- * Functions for simple html parsing
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
*/
#ifndef RSPAMD_HTML_H
@@ -32,28 +44,9 @@ extern "C" {
#define RSPAMD_HTML_FLAG_IMAGE_EXTERNAL (1 << 1)
#define RSPAMD_HTML_FLAG_IMAGE_DATA (1 << 2)
-enum html_component_type {
- RSPAMD_HTML_COMPONENT_NAME = 0,
- RSPAMD_HTML_COMPONENT_HREF,
- RSPAMD_HTML_COMPONENT_COLOR,
- RSPAMD_HTML_COMPONENT_BGCOLOR,
- RSPAMD_HTML_COMPONENT_STYLE,
- RSPAMD_HTML_COMPONENT_CLASS,
- RSPAMD_HTML_COMPONENT_WIDTH,
- RSPAMD_HTML_COMPONENT_HEIGHT,
- RSPAMD_HTML_COMPONENT_SIZE,
- RSPAMD_HTML_COMPONENT_REL,
- RSPAMD_HTML_COMPONENT_ALT,
-};
-
-struct html_tag_component {
- enum html_component_type type;
- guint len;
- const guchar *start;
-};
-
struct rspamd_image;
+struct html_tag;
struct html_image {
guint height;
@@ -89,7 +82,7 @@ struct html_block {
struct html_tag *tag;
struct html_color font_color;
struct html_color background_color;
- struct html_tag_component style;
+ //struct html_tag_component style;
guint font_size;
gboolean visible;
gchar *html_class;
@@ -108,16 +101,7 @@ struct html_block {
#define FL_HREF (1 << 29)
#define FL_IMAGE (1 << 30)
-struct html_tag {
- gint id;
- gint flags;
- struct html_tag_component name;
- guint content_length;
- goffset content_offset;
- GQueue *params;
- gpointer extra; /** Additional data associated with tag (e.g. image) */
- GNode *parent;
-};
+
/* Forwarded declaration */
struct rspamd_task;
diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx
new file mode 100644
index 000000000..01fce796a
--- /dev/null
+++ b/src/libserver/html/html_tag.hxx
@@ -0,0 +1,56 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_HTML_TAG_HXX
+#define RSPAMD_HTML_TAG_HXX
+#pragma once
+
+#include <utility>
+#include <string_view>
+#include <contrib/robin-hood/robin_hood.h>
+
+namespace rspamd::html {
+
+enum class html_component_type : std::uint8_t {
+ RSPAMD_HTML_COMPONENT_NAME = 0,
+ RSPAMD_HTML_COMPONENT_HREF,
+ RSPAMD_HTML_COMPONENT_COLOR,
+ RSPAMD_HTML_COMPONENT_BGCOLOR,
+ RSPAMD_HTML_COMPONENT_STYLE,
+ RSPAMD_HTML_COMPONENT_CLASS,
+ RSPAMD_HTML_COMPONENT_WIDTH,
+ RSPAMD_HTML_COMPONENT_HEIGHT,
+ RSPAMD_HTML_COMPONENT_SIZE,
+ RSPAMD_HTML_COMPONENT_REL,
+ RSPAMD_HTML_COMPONENT_ALT,
+};
+
+struct html_tag {
+ gint id;
+ gint flags;
+ guint content_length;
+ goffset content_offset;
+
+ std::string_view name;
+ robin_hood::unordered_flat_map<html_component_type, std::string_view> parameters;
+
+ gpointer extra; /* TODO: convert to variant */
+ GNode *parent;
+};
+
+}
+
+#endif //RSPAMD_HTML_TAG_HXX
diff --git a/src/libserver/html/html_tag_defs.hxx b/src/libserver/html/html_tag_defs.hxx
index 5a552066d..36d3ba4ed 100644
--- a/src/libserver/html/html_tag_defs.hxx
+++ b/src/libserver/html/html_tag_defs.hxx
@@ -1,5 +1,5 @@
/*-
- * Copyright 2016 Vsevolod Stakhov
+ * Copyright 2021 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.